def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    # clf.fit(X_train, Y_train)
    # y_pred = clf.predict(X_test)
    # plt.plot(X_test, y_pred, linestyle='-', color='red')
    # Alternative regressors kept for experimentation:
    # clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    # clf = DecisionTreeRegressor(max_depth=25)
    # clf = ExtraTreesRegressor(n_estimators=2000, max_depth=14)
    # clf = xgb.XGBRegressor(n_estimators=2000, max_depth=25)
    # clf = RandomForestRegressor(n_estimators=1000, max_depth=26, n_jobs=7)

    # Walk-forward prediction: refit on a sliding window, predict one step ahead.
    predict_list = []
    for i in range(TEST_SIZE):
        X_window = [[x] for x in range(i, TRAIN_SIZE + i)]
        clf.fit(X_window, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict([[TRAIN_SIZE + 1 + i]])  # predict() expects a 2D array
        predict_list.append(y_pred[0])

    print("mean_squared_error:%s" % mean_squared_error(Y_test, predict_list))
    print("sqrt of mean_squared_error:%s" % np.sqrt(mean_squared_error(Y_test, predict_list)))
    print("origin data:%s" % Y_test)

    plt.plot([x for x in range(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)],
             predict_list, linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
class SVMLearner(object):
    def __init__(self, kernel="linear", C=1e3, gamma=0.1, degree=2, verbose=False):
        self.name = "{} Support Vector Machine Learner".format(kernel.capitalize())
        self.kernel = kernel
        if kernel == "linear":
            self.svr = SVR(kernel=kernel, C=C)
        elif kernel == "rbf":
            self.svr = SVR(kernel=kernel, C=C, gamma=gamma)
        elif kernel == "poly":
            self.svr = SVR(kernel=kernel, C=C, degree=degree)

    def addEvidence(self, dataX, dataY):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # build and save the model
        self.svr.fit(dataX, dataY)

    def query(self, points):
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
        return self.svr.predict(points)
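# --- A brief usage sketch for SVMLearner above. The toy data below is
# --- hypothetical (made up for illustration); it only assumes numpy arrays
# --- shaped as the docstrings describe.
import numpy as np

rng = np.random.default_rng(0)
dataX = rng.random((50, 3))                 # 50 samples, 3 features
dataY = dataX @ np.array([0.5, -1.0, 2.0])  # synthetic linear target

learner = SVMLearner(kernel="rbf", C=1e3, gamma=0.1)
learner.addEvidence(dataX, dataY)
print(learner.name)
print(learner.query(dataX[:5]))             # estimates for the first 5 rows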
def test_check_is_fitted():
    # Check that ValueError is raised when a non-estimator instance is passed
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError
    try:
        check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as e:
        assert_equal(str(e), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert_equal(str(e), "Another message SVR, SVR")

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())
    assert_equal(None, check_is_fitted(ard, "coef_"))
    assert_equal(None, check_is_fitted(svr, "support_"))
class HotTweets:
    '''Train and get tweet hotness'''

    def __init__(self, kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1, n_comp=100):
        '''Prepare support vector regression'''
        self.svr = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon, verbose=True)
        #self.svr = LogisticRegression(random_state=42, verbose=0)
        self.n_comp = n_comp

    def fit_scaler(self, dev, i_dev):
        '''Train normalizers for features and importances'''
        # importance scaler
        self.std_scaler_i = sklearn.preprocessing.StandardScaler()
        self.std_scaler_i.fit(i_dev)
        # feature scaler (first n_comp columns only)
        self.norm = sklearn.preprocessing.StandardScaler()
        self.norm.fit(dev[:, 0:self.n_comp])

    def train(self, features, importances):
        '''Train regression'''
        importances = self.std_scaler_i.transform(importances)
        features = self.norm.transform(features[:, 0:self.n_comp])
        self.svr.fit(features, importances)

    def predict(self, features):
        '''Predict importances'''
        features = self.norm.transform(features[:, 0:self.n_comp])
        results = self.svr.predict(features)
        # print(results[0:100:5])
        # map predictions back to the original importance scale
        results = self.std_scaler_i.inverse_transform(results)
        # print(results[0:100:5])
        return results
def svm(self):
    """
    Commented-out grid search over C and gamma:

    C_range = np.logspace(-2, 10, 2)
    print(C_range)
    gamma_range = np.logspace(-9, 3, 2)
    print(gamma_range)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = ShuffleSplit(len(self.search_inputs.y_train), n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVR(verbose=True), param_grid=param_grid, cv=cv)
    grid.fit(self.search_inputs.X_train, self.search_inputs.y_train)
    print("The best parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))
    self.svm_preds = grid.predict(self.search_inputs.X_test)
    """
    regression = SVR(kernel='rbf', C=1e3, gamma=0.1, verbose=True)
    regress_fit = regression.fit(self.search_inputs.X_train, self.search_inputs.y_train)
    self.svm_preds = regress_fit.predict(self.search_inputs.X_test)

    # Clip predictions to the valid relevance range [1, 3].
    for i in range(len(self.svm_preds)):
        if self.svm_preds[i] < 1:
            self.svm_preds[i] = 1.00
        elif self.svm_preds[i] > 3:
            self.svm_preds[i] = 3.00

    self.search_inputs.fin_df['relevance'] = np.array(self.svm_preds)  # easy swap in / out
    self.search_inputs.fin_df.to_csv(self.fin_file_name + '_svm.csv',
                                     float_format='%.5f', index=False)
def getError1(signal, normedDay, period, phase):
    '''
    Gets the error for a list of points across a normed day, given a sklearn
    model, the period, and the phase of the fitted signal. Here the Euclidean
    distance is used as the error measurement. This requires a little more
    computation due to the need to fit an inverse model, but provides better
    fits. Returns the squared Euclidean error.
    '''
    if rank(normedDay.index[0]) > 0:
        t0 = round((array(normedDay.index.get_level_values(0)) - phase) % period, 3)
    else:
        t0 = round((array(normedDay.index, dtype=float) - phase) % period, 3)
    nD = Series(normedDay, index=t0)

    # Fit an inverse model: predict time from signal value
    tUp = array([arange(0, period + .1, .1)]).T
    invSignal = SVR(kernel='rbf', C=signal.C, gamma=signal.gamma, epsilon=signal.epsilon)
    invSignal.fit(array([signal.predict(tUp)]).T, tUp.flatten())

    xDiff = nD - signal.predict(array([array(nD)]).T)
    yDiff = nD - signal.predict(array([nD.index]).T)
    error = sum(pow(xDiff / period, 2) + pow(yDiff / 2, 2))
    return error
def fit(self, start_date, end_date):
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)

    params_svr = [{
        'kernel': ['rbf', 'sigmoid', 'linear'],
        'C': [0.01, 0.1, 1, 10, 100],
        'epsilon': [0.0000001, 0.000001, 0.00001]
    }]
    params = ParameterGrid(params_svr)

    # Find the split for training and CV
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        # X_train = self.pca.fit_transform(X_train.values)
        X_train = X_train.values
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        # X_cv = self.pca.transform(X_cv.values)
        X_cv = X_cv.values

        # Keep the parameter combination with the lowest CV error
        lowest_mse = np.inf
        for i, param in enumerate(params):
            svr = SVR(**param)
            # ada = AdaBoostRegressor(svr)
            svr.fit(X_train, y_train.values)
            mse = mean_squared_error(y_cv, svr.predict(X_cv))
            if mse <= lowest_mse:
                lowest_mse = mse
                self.models[ticker] = svr
    return self
def RunSVRScikit():
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    # Use the last row of the training set as the responses.
    X, y = SplitTrainData(self.dataset)

    # Get all the parameters.
    opts = {}
    if "c" in options:
        opts["C"] = float(options.pop("c"))
    if "epsilon" in options:
        opts["epsilon"] = float(options.pop("epsilon"))
    if "gamma" in options:
        opts["gamma"] = float(options.pop("gamma"))
    opts["kernel"] = "rbf"

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            # Perform SVR.
            model = SSVR(**opts)
            model.fit(X, y)
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()
def train(self, x, y, param_names, random_search=100, kernel_cache_size=2000, **kwargs):
    if self._debug:
        print("First training sample\n", x[0])
    start = time.time()

    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print("Shape of training data: ", scaled_x.shape)
        print("Param names: ", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode: ", self._encode)

    # Do a random search
    c, gamma = self._random_search(random_iter=random_search, x=scaled_x, y=y,
                                   kernel_cache_size=kernel_cache_size)

    # Now train the model
    try:
        svr = SVR(gamma=gamma, C=c, random_state=self._rng,
                  cache_size=kernel_cache_size)
        svr.fit(scaled_x, y)
        self._model = svr
    except Exception as e:
        print("Training failed", e)
        svr = None
def predict_device_byday_SVR():
    X, Y_unique, Y_all, X_raw = load_device_counter_byday()

    from sklearn.svm import SVR
    model = SVR()
    # model = SVR(kernel='linear')

    training_size = 160
    # model.fit(X[:training_size], Y_unique[:training_size])
    model.fit(X[:training_size], Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])
    print(X_to_predict)

    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print(X_raw[start_index:end_index])

    y_predicted = model.predict(X_to_predict)
    y_predicted = np.array(y_predicted).astype(int)
    print(y_predicted)
    print(Y_real)
    # print(y_predicted - np.array(Y_real))

    # plt.subplot(111)
    # plt.scatter(X_to_predict, Y_real, c='r')
    plt.scatter(X_to_predict, y_predicted)
    # plt.plot(X_to_predict, y_predicted)
    plt.show()
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')

    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    X = np.asarray(X)
    y = np.asarray(y)

    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0  # replace missing features with zero

    svr.fit(X, y)
    pred = svr.predict(test_X)

    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print('MAE: ', mean_absolute_error(test_y, pred))
        print('RMSE: ', sqrt(mean_squared_error(test_y, pred)))
        print('corrpearson: ', sp.stats.pearsonr(test_y, pred))
        print('r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2)
        print(mquantiles(test_y, prob=[0.10, 0.90]))
        print(mquantiles(pred, prob=[0.10, 0.90]))

    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print(p, file=output)
    return
def train_learning_model_svm(df):
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)
    regressor = SVR()
    regressor.fit(X_train, y_train)
    calculate_results(regressor, X_train, X_test, y_train, y_test)
def getCharacteristicSignal(normedDays, phase, period, plotAxis=False):
    series = pandas.Series()
    for day in normedDays:
        series = series.append(day)

    '''Shift the times to give relative time of day'''
    t0 = array(series.index, dtype=float)
    t0 = (t0 - phase) % period
    t0 = array([t0]).T

    '''Shift the array to fit the edges'''
    tExt = array([array([t0 - period, t0, t0 + period]).flatten()]).T
    seriesExt = numpy.array([array(series), array(series), array(series)]).flatten()

    '''Fit the model'''
    svr_rbf = SVR(kernel='rbf', C=1e4, gamma=.03, epsilon=.01)
    y_rbf = svr_rbf.fit(tExt, seriesExt)

    '''Predict a new characteristic signal'''
    t1 = array([arange(0, period, period / 100.)]).T
    signal = y_rbf.predict(t1)

    if plotAxis:
        plotAxis.plot(t1, signal)
        colors = ['b', 'g', 'r', 'c']
        for i, day in enumerate(normedDays):
            timesAdjusted = array(normedDays[i].index, dtype=float)
            timesAdjusted = (timesAdjusted - phase) % period
            plotAxis.plot(timesAdjusted, day, 'o', label=str(i), color=colors[i])
        plotAxis.set_title('Characteristic Signal')
        plotAxis.legend(loc='best')
        plotAxis.set_xbound(0, period)
        plotAxis.set_ybound(-1.1, 1.1)
    return signal
def svr(self, X, y):
    """
    Train a support vector regression model.

    Parameters
    ----------
    X : numpy ndarray with numeric values
        Array containing input parameters for the model. The model will try
        to learn the output y[i] in terms of the inputs X[i].
    y : columnar numpy array with numeric values
        Array containing a single column of output values. The entry at y[i]
        corresponds to the value of the underlying experiment for input
        parameters X[i].

    Returns
    -------
    result : model
        Model learnt from the incoming inputs and outputs.
    """
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(X, y)
    return clf
def svm_regression(y, x=None, ker='rbf', opt=0.1, show=False):
    """
    Pass an array, with or without x-axis values, and this returns the SVM
    regression. A kernel (ker) can also be specified: 'rbf', 'linear', 'poly'.
    """
    from sklearn.svm import SVR

    if x is None:  # Assume linearly spaced points
        x = np.arange(0, len(y))
    x = np.asarray(x).reshape(-1, 1)  # sklearn expects a 2D feature array

    # Fit the regression model
    if ker == 'linear':
        svr = SVR(kernel=ker, C=1e3)
    elif ker == 'poly':
        if type(opt) is not int:
            print('Need a degree for a polynomial fit, not ' + str(opt))
            return None
        svr = SVR(kernel=ker, C=1e3, degree=opt)
    else:
        svr = SVR(kernel='rbf', C=1e3, gamma=opt)  # default is radial basis func

    y_svr = svr.fit(x, y).predict(x)  # Fit

    # And plot if requested
    if show:
        plt.scatter(x, y, c='k', label='data')
        plt.plot(x, y_svr, c='b', label='SVR model')
        plt.xlabel('data')
        plt.ylabel('target')
        plt.title('Support Vector Regression')
        plt.legend()
        plt.show()
    return y_svr
def rollingMeanScale(series, period, plotAxis=False):
    svr_rbf = SVR(kernel='rbf', C=1e4, gamma=.01, epsilon=.01)

    '''Fit Model to Data Series'''
    tS = numpy.array([series.index]).T
    y_rbf = svr_rbf.fit(tS, list(series))

    '''Up-sample to get rid of bias'''
    fFit = arange(series.index[0], series.index[-1] + .1, .25)
    trend = y_rbf.predict(numpy.array([fFit]).T)

    '''Take rolling mean over 1-day window'''
    shift = int(round(period / .5))
    # pandas.rolling_mean is pre-0.20 API; in newer pandas use
    # pandas.Series(trend).rolling(shift * 2).mean()
    rMean = pandas.rolling_mean(trend, shift * 2)
    rMean = numpy.roll(rMean, -shift)
    rMean[:shift] = rMean[shift]
    rMean[-(shift + 1):] = rMean[-(shift + 1)]
    rMean = pandas.Series(rMean, index=fFit)

    '''Adjust Data Series by subtracting out trend'''
    series = series - array(rMean[array(series.index, dtype=float)])
    series = scaleMe(series) - .5

    if plotAxis:
        plotAxis.plot(fFit, trend, label='Series Trend')
        plotAxis.plot(fFit, rMean, label='Rolling Mean')
        plotAxis.set_title('Detrend the Data')
        plotAxis.legend(loc='lower left')
    return series
def machinelearning(csv_file):
    # parse CSV
    d = {}
    d['date'] = []
    d['radiation'] = []
    d['humidity'] = []
    d['temperature'] = []
    d['wind'] = []
    d['demand'] = []
    dictreader = csv.DictReader(
        csv_file,
        fieldnames=['date', 'radiation', 'humidity', 'temperature', 'wind', 'demand'],
        delimiter=',')
    next(dictreader)  # skip the header row
    for row in dictreader:
        for key in row:
            d[key].append(row[key])

    # interpolate weather data
    interpolate(d['radiation'])
    interpolate(d['humidity'])
    interpolate(d['temperature'])
    interpolate(d['wind'])

    # train machine learning algorithm
    # (zip returns an iterator in Python 3, so materialize it with list() before slicing)
    training_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[:32])
    training_y = np.array(d['demand'][:32])
    poly_svr = SVR(kernel='poly', degree=2)
    poly_svr.fit(training_x, training_y)

    prediction_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[32:])
    demand_predictions = poly_svr.predict(prediction_x)
    return demand_predictions
def train_SVR(viper):
    from sklearn.svm import SVR
    model = SVR(C=10, kernel='rbf', shrinking=False, verbose=True)
    model.fit(viper.train_feat, viper.train_y)
    return model
def Sand_SVR(X_train, Y_train, X_test, Y_test, cv_iterator):
    #===========================================================================
    # param_grid = {'C': [100, 500, 1000, 5000, 10000, 100000],
    #               'epsilon': [0.075, 0.1, 0.125]}
    # svr = SVR(cache_size=1000, random_state=42)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", cv=cv_iterator)
    # search.fit(X_train, Y_train["Sand"])
    # search.grid_scores_
    # svr = search.best_estimator_
    # test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'),
    #                        scoring="mean_squared_error", cv=cv_iterator)
    #===========================================================================
    svr = SVR(C=10000)
    svr.fit(X_train, Y_train["Sand"])
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Sand"], yhat_svr))
    return svr, test_error
def CaSVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    # param_grid = {'C': [10000],
    #               'epsilon': [0.001, 0.01, 0.05, 0.1, 0.15, 1]}
    # svr = SVR(random_state=42, cache_size=1000, verbose=2)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error",
    #                       n_jobs=1, iid=True, cv=cv_iterator)
    # search.fit(X_train, Y_train["Ca"])
    # model = search.best_estimator_
    # scaler = StandardScaler()
    model = SVR(C=10000, epsilon=0.01, cache_size=1000)
    model.fit(X_train, Y_train["Ca"])
    yhat_svr = model.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_svr))
    return model, test_error
def test_regression():
    X, y = make_regression(n_samples=1000, n_features=5, n_informative=2,
                           n_targets=1, random_state=123, shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
def learn(X, y):
    # do PCA
    pca = PCA(n_components=6)
    pca_6 = pca.fit(X)
    print('variance ratio')
    print(pca_6.explained_variance_ratio_)
    X = pca.fit_transform(X)
    # X = np.concatenate((X_pca[:, 0].reshape(X.shape[0], 1),
    #                     X_pca[:, 5].reshape(X.shape[0], 1)), axis=1)

    # do SVR
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_rbf.fit(X, y)
    # print(model_rbf)
    y_rbf = svr_rbf.predict(X)
    print(y_rbf)
    print(y)

    # see difference
    y_rbf = np.transpose(y_rbf)
    deviation(y, y_rbf)

    # pickle the models
    with open('rbfmodel.pkl', 'wb') as f:
        pickle.dump(svr_rbf, f)
    with open('pcamodel.pkl', 'wb') as f:
        pickle.dump(pca_6, f)
def train_SVM(X, Y, kernel='rbf', shrinking=True, tol=0.001, cache_size=1500,
              verbose=True, max_iter=-1):
    """Assumes all irrelevant features have been removed from X and Y.
    Learns several hundred SVMs."""
    clf = SVR(kernel=kernel, tol=tol, cache_size=cache_size, verbose=verbose,
              max_iter=max_iter)
    # zip() returns an iterator in Python 3, so wrap it in list() for Pipeline
    pipeline = Pipeline(list(zip(["imputate", "vart", "scale", "svm"],
                                 [Imputer(), VarianceThreshold(), StandardScaler(), clf])))
    param_grid = dict(svm__C=[0.1, 1, 10, 100, 1000],
                      svm__gamma=[0.001, 0.01, 1, 10])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3)

    results = []
    for i in range(Y[0].shape[1]):
        Y_new = np.fromiter((x[:, i][0, 0] for x in Y), np.double)
        X_new = np.array([np.matrix(x.data).flatten().tolist() for x in X], np.double)
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
            X_new, Y_new, test_size=0.2)
        X_train = flatten(X_train)
        X_test = flatten(X_test)
        grid_search.fit(X_train, Y_train)
        # score the fitted best estimator, not the unfitted clf
        score = grid_search.score(X_test, Y_test)
        results.append((grid_search.best_estimator_, score))
        print("Best estimators (C): {0}, Score: {1}".format(
            grid_search.best_estimator_, score))
    return results
def train_model(train, test, labels):
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print("Good!")

    predictions = clf.predict(test)
    print(predictions.shape)
    predictions = pd.DataFrame(predictions, columns=['relevance'])
    print("Good again!")
    print("Predictions head -------")
    print(predictions.head())
    print(predictions.shape)
    print("TEST head -------")
    print(test.head())
    print(test.shape)

    test['id'].to_csv("TEST_TEST.csv", index=False)
    predictions.to_csv("PREDICTIONS.csv", index=False)

    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'], predictions], axis=1, verify_integrity=False)
    print(predictions)
    return predictions
class SVRegression:
    def __init__(self, kernel_value, c_value, iter_value):
        self.kernel = kernel_value
        self.c = c_value
        self.iter = iter_value
        self.svr_lin = None

    def fit_predict(self, x_train, y_train, x_test):
        self.svr_lin = SVR(kernel=self.kernel, C=self.c, max_iter=self.iter)
        y_lin = self.svr_lin.fit(x_train, y_train).predict(x_test)
        return y_lin

    def computeC(self, x_train):
        print("ARRAY ", type(x_train))
        print(x_train)
        array = x_train.todense()
        print("ARRAY ", type(array))
        print(array)
        # C = (mean of the squared row sums)^-1
        result = array.sum(axis=1, dtype='float')
        result = pow(result, 2)
        total = result.sum(axis=0, dtype='float')
        rows, columns = x_train.shape
        total = float(total) / float(rows)
        total = pow(total, -1)
        print("C", total)
        self.c = total

    def computeAccuracy(self, x, y):
        return self.svr_lin.score(x, y)
def compute_mse(regressor, horizon):
    # get wind park and corresponding target
    windpark = NREL().get_windpark(NREL.park_id['tehachapi'], 3, 2004, 2005)
    target = windpark.get_target()

    # use power mapping for pattern-label mapping
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    y = mapping.get_labels_turbine(target, feature_window, horizon)

    # train roughly for the year 2004, test for 2005
    train_to = int(math.floor(len(X) * 0.5))
    test_to = len(X)
    train_step, test_step = 25, 25
    X_train = X[:train_to:train_step]
    y_train = y[:train_to:train_step]
    X_test = X[train_to:test_to:test_step]
    y_test = y[train_to:test_to:test_step]

    if regressor == 'svr':
        reg = SVR(kernel='rbf', epsilon=0.1, C=100.0,
                  gamma=0.0001).fit(X_train, y_train)
        mse = mean_squared_error(reg.predict(X_test), y_test)
    elif regressor == 'knn':
        reg = KNeighborsRegressor(10, 'uniform').fit(X_train, y_train)
        mse = mean_squared_error(reg.predict(X_test), y_test)
    return mse
def RunSVRScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    # Use the last row of the training set as the responses.
    X, y = SplitTrainData(self.dataset)

    # Parse all of the parameters from the options string.
    c = re.search(r"-c (\d+\.\d+)", options)
    e = re.search(r"-e (\d+\.\d+)", options)
    g = re.search(r"-g (\d+\.\d+)", options)
    C = 1.0 if not c else float(c.group(1))
    epsilon = 1.0 if not e else float(e.group(1))
    gamma = 0.1 if not g else float(g.group(1))

    try:
        with totalTimer:
            # Perform SVR.
            model = SSVR(kernel='rbf', C=C, epsilon=epsilon, gamma=gamma)
            model.fit(X, y)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def train_svm(train_file, avg={}):
    test_X, test_Y, weight = load_data(train_file, avg)
    svr = SVR(kernel='rbf', C=100, gamma=1, verbose=True, cache_size=1024)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
def train_svm(data):
    test_X, test_Y = load_data(data)
    svr = SVR(kernel='rbf', C=100, gamma=1)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
# This method is useful because you can use it on any model and on out-of-sample data.
vi2 = permutation_vi(mod_rf, test_X, test_y)
(ggplot(vi2.melt(), aes(y="value", x='variable')) +
 geom_boxplot() +
 coord_flip() +
 ylim(0, 10))

# %% -----------------------------------------
# Let's use a completely different class of model, and the method still works.
# This is the model "agnostic" bit.
from sklearn.svm import SVR

mod_svr = SVR()
mod_svr.fit(X, y)

vi3 = permutation_vi(mod_svr, X, y)
(ggplot(vi3.melt(), aes(y="value", x='variable')) +
 geom_boxplot() +
 coord_flip() +
 ylim(0, 10))

# %% -----------------------------------------
# Permutation importance is problematic when features are highly correlated.

# Set seed
np.random.seed(123)

# Generate correlated predictors
# Fitting SVR to the dataset
'''
The Gaussian RBF (Radial Basis Function) is another popular kernel used in
SVM models. The RBF kernel is a function whose value depends on the distance
from the origin or from some reference point.
'''
# Feature scaling is necessary for the SVR model, because the SVR class does
# not scale the data automatically.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_Y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_Y.fit_transform(y)

from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result: scale the input with the already-fitted sc_X
# (transform, not fit_transform), then map the prediction back to the
# original salary scale.
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_Y.inverse_transform(y_pred)

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
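# --- The manual scale/inverse-scale dance above is easy to get wrong.
# --- A minimal alternative sketch (assumes scikit-learn >= 0.20): wrap the SVR
# --- in a Pipeline for the features and a TransformedTargetRegressor for the
# --- target, so scaling and un-scaling happen automatically. The toy X/y below
# --- are hypothetical stand-ins for the Position_Salaries data used above.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

X = np.arange(1, 11).reshape(-1, 1).astype(float)  # position levels 1..10
y = np.array([45, 50, 60, 80, 110, 150, 200, 300, 500, 1000], dtype=float)  # salaries (k)

model = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(kernel='rbf')),
    transformer=StandardScaler())
model.fit(X, y)
print(model.predict([[6.5]]))  # prediction already on the original salary scale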
# Predicting a new result (SLR and MLR)
reg.predict(X_test)

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
poly_reg.fit(X_poly, y)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Fitting a Decision Tree to the dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X, y)
regressor.predict(X_test)

# Classification metrics (only meaningful for classifiers, not regressors)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
def main():
    # Load data and run a brief analysis on it
    raw_data = load_data('train.csv')
    quick_analysis(raw_data)
    plt.hist(raw_data['SalePrice'])
    plt.show()

    # View all unique values of categorical features
    non_numeric_cols = raw_data.loc[:, raw_data.dtypes == object]
    for col in non_numeric_cols.columns:
        print(non_numeric_cols[col].value_counts())

    # Analyze correlations between features and the label
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Feature engineering the following:
    # Grade = OverallQual / OverallCond
    # Age = YrSold - YearBuilt
    # RemodAge = YrSold - YearRemodAdd
    # TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF
    raw_data['Grade'] = raw_data['OverallQual'] / raw_data['OverallCond']
    raw_data['Age'] = raw_data['YrSold'] - raw_data['YearBuilt']
    raw_data['RemodAge'] = raw_data['YrSold'] - raw_data['YearRemodAdd']
    raw_data['TotalSF'] = raw_data['TotalBsmtSF'] + raw_data['1stFlrSF'] + raw_data['2ndFlrSF']

    # Correlation matrix for the new features
    corr_matrix = raw_data.corr()
    sale_correl = corr_matrix['SalePrice'].sort_values(ascending=False)
    print(sale_correl)

    # Check correlation of new features with their respective components
    age_correl = corr_matrix['Age'].sort_values(ascending=False)
    print('Age correlations:', age_correl, '\n')
    remod_age_correl = corr_matrix['RemodAge'].sort_values(ascending=False)
    print('RemodAge correlations:', remod_age_correl, '\n')
    grade_correl = corr_matrix['Grade'].sort_values(ascending=False)
    print('Grade correlations:', grade_correl, '\n')
    totalsf_correl = corr_matrix['TotalSF'].sort_values(ascending=False)
    print('TotalSF correlations:', totalsf_correl, '\n')

    # Correlation matrix visualization
    corr_plot(raw_data, 'SalePrice', fig_size=(4, 4))
    corr_plot(raw_data, 'SalePrice', plot_type='hist', fig_size=(4, 4))

    # Change the type of some columns to reflect their nature.
    # Concretely, change the YrSold, MoSold, MSZoning and OverallCond
    # features to categorical ones.
    raw_data['YrSold_C'] = raw_data['YrSold'].copy().astype(str)
    raw_data['MoSold'] = raw_data['MoSold'].astype(str)
    raw_data['MSZoning'] = raw_data['MSZoning'].astype(str)
    raw_data['OverallCond_C'] = raw_data['OverallCond'].copy().astype(str)

    num_cols = [
        'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GarageCars', 'GarageArea',
        'FullBath', 'YrSold',
    ]
    cat_cols = [
        'MSZoning', 'Street', 'Utilities', 'Neighborhood', 'ExterQual',
        'ExterCond', 'BsmtQual', 'BsmtCond', 'Heating', 'CentralAir',
        'PavedDrive', 'SaleType', 'SaleCondition', 'YrSold_C', 'MoSold',
        'OverallCond_C',
    ]

    # Create a list of all values that the categorical features can take
    cat_cols_categs = [raw_data[col].unique() for col in cat_cols]
    print(cat_cols_categs)

    # Create the pipeline to process data
    num_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(num_cols, True)),
        ('Grade', FeatureCreator(['OverallCond', 'OverallQual'], lambda x, y: x / y,
                                 as_dataframe=True, feat_name='Grade')),
        ('Age', FeatureCreator(['YrSold', 'YearBuilt'], lambda x, y: x - y,
                               as_dataframe=True, feat_name='Age')),
        ('RemodAge', FeatureCreator(['YrSold', 'YearRemodAdd'], lambda x, y: x - y,
                                    as_dataframe=True, feat_name='RemodAge')),
        ('TotalSF', FeatureCreator(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], lambda x, y: x + y,
                                   as_dataframe=True, feat_name='TotalSF')),
        ('drop_cat_feat', FeatureDropper(['YrSold', 'OverallCond'], as_dataframe=True)),
        ('imputer_mean', Imputer(strategy='mean')),
        ('std_scaler', RobustScaler())
    ])
    cat_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(cat_cols, True)),
        ('imputer_most_frequent', CategoricalImputer()),
        ('encode', OneHotEncoder(categories=cat_cols_categs, sparse=False)),
    ])
    feat_union = FeatureUnion(transformer_list=[
        ('num_features', num_pipeline),
        ('cat_features', cat_pipeline),
    ])

    # Create the train data and labels
    train_labels = raw_data['SalePrice'].copy()
    train_feat = feat_union.fit_transform(raw_data)

    # Check the linear regression model
    lin_reg = LinearRegression()
    print('Linear regression best hyperparameters:')
    final_lr_model = find_best_estimator(lin_reg, [{}], train_feat, train_labels)

    # Check the decision tree model
    hyperparams_vals = [
        {'max_features': [6, 10, 12, 16, 18, 20, 24]},
    ]
    dt_reg = DecisionTreeRegressor(random_state=42)
    print('Decision tree best hyperparameters:')
    final_dt_model = find_best_estimator(dt_reg, hyperparams_vals, train_feat, train_labels)

    # Check the random forest model
    hyperparams_vals = [
        {'n_estimators': [200, 225, 250], 'max_features': [16, 24, 30]},
        {'bootstrap': [False], 'n_estimators': [220, 225], 'max_features': [24, 28]},
    ]
    forest_reg = RandomForestRegressor(n_jobs=-1, random_state=42)
    print('Random forest best hyperparameters:')
    final_rf_model = find_best_estimator(forest_reg, hyperparams_vals, train_feat, train_labels)

    # Check the XGBoost model
    hyperparams_vals = [
        {'n_estimators': [450, 500, 400], 'max_features': [2, 4, 8], 'max_depth': [3, 4, None]},
    ]
    xgbr_reg = XGBRegressor(learning_rate=0.05, n_threads=-1, random_state=42)
    print('XGBoost regressor best hyperparameters:')
    final_xgb_model = find_best_estimator(xgbr_reg, hyperparams_vals, train_feat, train_labels)

    # Check the SVM model
    hyperparams_vals = [
        {'kernel': ['linear', 'sigmoid', 'rbf'], 'gamma': ['auto', 'scale']},
        {'kernel': ['poly'], 'gamma': ['auto', 'scale'], 'degree': [3, 4, 5]},
    ]
    svm_reg = SVR()
    print('Support vector machine best hyperparameters:')
    final_svm_model = find_best_estimator(svm_reg, hyperparams_vals, train_feat, train_labels)

    # Check the ElasticNet model
    hyperparams_vals = [
        {'alpha': [0.0005, 0.005, 0.05, 0.2], 'l1_ratio': [0.1, 0.25, 0.75, 0.9]},
    ]
    enet_reg = ElasticNet(max_iter=100000000, tol=0.001)
    print('ElasticNet best hyperparameters:')
    final_enet_model = find_best_estimator(enet_reg, hyperparams_vals, train_feat, train_labels)

    # Check the feature importances for both random-forest-style models
    rf_feat_imp = final_rf_model.feature_importances_
    xgb_feat_imp = final_xgb_model.feature_importances_
    other_feat = ['Grade', 'RemodAge', 'TotalSF']
    all_features = num_cols.copy()
    print(num_cols)
    for cat_values in cat_cols_categs.copy():
        all_features.extend(cat_values)
    all_features.extend(other_feat.copy())
    print('Random forest feature importances:')
    for feat in sorted(zip(rf_feat_imp, all_features), reverse=True):
        print(feat)
    print('\nXGBoost feature importances:')
    for feat in zip(xgb_feat_imp, all_features):
        print(feat)

    # Load and process test data
    test_data = load_data('test.csv')
    test_data['YrSold_C'] = test_data['YrSold'].copy().astype(str).replace('nan', None)
    test_data['MoSold'] = test_data['MoSold'].astype(str).replace('nan', None)
    test_data['MSZoning'] = test_data['MSZoning'].astype(str).replace('nan', None)
    test_data['OverallCond_C'] = test_data['OverallCond'].copy().astype(str).replace('nan', None)
    test_feat = feat_union.transform(test_data)

    # Predict using a weighted combination of Random Forest and XGBoost
    rf_predictions = final_rf_model.predict(test_feat)
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = rf_predictions * 0.35 + xgb_predictions * 0.65

    # Save the resulting predictions
    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()
    print(pred_df)
    pred_df.to_csv('submission_rf_xgb.csv', index=False)

    # Predict using only the XGBoost model
    xgb_predictions = final_xgb_model.predict(test_feat)
    predictions = xgb_predictions.copy()
    pred_df = pd.DataFrame()
    pred_df['Id'] = test_data['Id']
    pred_df['SalePrice'] = predictions.flatten()
    print(pred_df)
    pred_df.to_csv('submission_xgb.csv', index=False)
import os

import joblib  # sklearn.externals.joblib is deprecated; import joblib directly
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor  # public path, not sklearn.ensemble.gradient_boosting

default_model_dir = 'models'

_models = {
    'bayesian_ridge': BayesianRidge(),
    'linear_regression': LinearRegression(),
    'elastic_net': ElasticNet(),
    'lasso': Lasso(),
    'svr': SVR(kernel='linear'),
    'gbr': GradientBoostingRegressor(n_estimators=300, max_depth=5)
}


def get_model_names():
    """
    Get supported model names.
    :return:
    """
    return list(_models.keys())


def get_models(model_name):
    """
    Get models.
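# --- A short, hypothetical usage sketch for the registry above. Since the
# --- body of get_models() is truncated in the source, the lookup shown
# --- assumes it simply returns the entry from _models; the toy data is
# --- made up for illustration.
import numpy as np

X = np.random.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(100)

print(get_model_names())  # e.g. ['bayesian_ridge', 'linear_regression', ...]
model = _models['svr']    # pick the linear-kernel SVR from the registry
model.fit(X, y)
os.makedirs(default_model_dir, exist_ok=True)
joblib.dump(model, os.path.join(default_model_dir, 'svr.pkl'))  # persist under models/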
# Add noise to targets
# y[::5] selects every 5th index of the 1-D array, so the random noise is
# added only to those elements
y[::5] += 3 * (0.5 - np.random.rand(8))
print(y)
# [ 0.04361009  0.17796574  0.22978773  0.24928643  0.32014619  0.13695542
#   0.70427365  0.72169941  0.78309245  0.80656999  0.3792032   0.9218538
#   0.96352582  0.99939807  0.9366527   1.22007951  0.86145011  0.78439525
#   0.72848344  0.65509942 -0.91410799  0.37470255  0.27513696  0.24822033
#   0.09237645  0.42416063  0.01079613 -0.06667189 -0.07494893 -0.22322095
#  -0.86223429 -0.54825618 -0.59995522 -0.85384305 -0.98249348 -2.24695741
#  -0.99667893 -0.99887435 -0.99247294 -0.96955196]

# #############################################################################
# Fit support vector regression models with three different kernels
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=4)
y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)

# #############################################################################
# Look at the results
lw = 2
plt.scatter(X, y, color='darkorange', label='data')
plt.plot(X, y_rbf, color='navy', lw=lw, label='RBF model')
plt.plot(X, y_lin, color='c', lw=lw, label='Linear model')
plt.plot(X, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model')
plt.xlabel('data')
def ml():
    """Function tasked with running the machine learning algorithm."""
    import numpy
    from sklearn import linear_model
    from sklearn.metrics import mean_squared_error, r2_score
    import statsmodels.api as sm
    from sklearn.model_selection import train_test_split

    resp = []  # define response list

    def convert():
        # Convert the dataset into a final dataset file by removing
        # the lines containing empty ratings.
        out = open("data/new_dataoutput.txt", "w")
        f = open("data/testoutput.txt")
        for line in f:
            li = line.split("|")
            if li[2] == '' or li[0] == '':
                continue
            else:
                out.write(line)            # write new dataset
                resp.append(float(li[2]))  # import response into list
        f.close()
        out.close()

    convert()

    # Export features from the new (final) dataset file (columns 3..40)
    data = numpy.loadtxt("data/new_dataoutput.txt", delimiter="|",
                         usecols=tuple(range(3, 41)))

    # Random split using the built-in function
    train_X, test_X, train_Y, test_Y = train_test_split(
        data, resp, test_size=0.5, random_state=42)

    # Ordinary Least Squares
    reg = linear_model.LinearRegression()
    reg.fit(train_X, train_Y)     # fit the model
    pred_Y = reg.predict(test_X)  # make predictions
    print("\nOrdinary Least Squares prediction:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Analysis of the significance of the linear coefficients (least squares)
    X2 = sm.add_constant(train_X)
    est = sm.OLS(train_Y, X2)  # define the model
    est2 = est.fit()           # fit the model
    print(est2.summary())

    # Ridge regression
    reg = linear_model.Ridge(alpha=.5)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nRidge Regression prediction:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Ridge regression with generalized cross-validation
    reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nRidge Regression with Generalized Cross-Validation prediction:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Lasso
    reg = linear_model.Lasso(alpha=0.1)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nLasso Model:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Least Angle Regression (LARS)
    reg = linear_model.Lars(n_nonzero_coefs=1)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nLeast Angle Regression (LARS) Model:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # LARS Lasso
    reg = linear_model.LassoLars(alpha=.1)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nLARS Lasso Model:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Bayesian Ridge
    reg = linear_model.BayesianRidge()
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nBayesian Ridge Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # SVR
    from sklearn.svm import SVR
    reg = SVR(kernel='rbf', C=1e3, gamma=0.1)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nSupport Vector Machine Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Decision tree
    from sklearn import tree
    reg = tree.DecisionTreeRegressor()
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nDecision Tree Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # AdaBoost with decision tree base estimators
    from sklearn.ensemble import AdaBoostRegressor
    from sklearn.tree import DecisionTreeRegressor
    rng = numpy.random.RandomState(1)
    reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                            n_estimators=300, random_state=rng)
    reg.fit(train_X, train_Y)
    pred_Y = reg.predict(test_X)
    print("\nAda Boost Regressor predictions:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Dimensionality reduction with PCA, then linear regression
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(data)  # PCA dimensionality reduction
    # Random split using the built-in function
    pca_train_X, pca_test_X, train_Y, test_Y = train_test_split(
        pca_data, resp, test_size=0.5, random_state=42)
    reg = linear_model.LinearRegression()  # least squares
    reg.fit(pca_train_X, train_Y)
    pred_Y = reg.predict(pca_test_X)
    print("\nLeast squares with features compressed into 2 principal components:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))

    # Add polynomial features
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(degree=3)
    a = poly.fit_transform(train_X)  # add polynomial features to the training set
    b = poly.transform(test_X)       # transform (not refit) the testing set
    reg = linear_model.LinearRegression()  # least squares
    reg.fit(a, train_Y)
    pred_Y = reg.predict(b)
    print("\nLeast squares with polynomial features:")
    print("Mean squared error: %.2f" % mean_squared_error(test_Y, pred_Y))
    print('Variance score: %.2f' % r2_score(test_Y, pred_Y))
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1].values
y = dataset.iloc[:, 2].values

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x.reshape(len(x), 1))
y = sc_y.fit_transform(y.reshape(len(y), 1))

from sklearn.svm import SVR
regressor = SVR(kernel='rbf', degree=3)
regressor.fit(x, y)

# Scale the query point with sc_x (the feature scaler), predict, then map the
# prediction back to the original salary scale with sc_y.
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_x.transform(np.array([[6.5]]))))
print(y_pred)

plt.scatter(x, y, color='red')
#plt.plot(x, y)
#plt.plot(x, y, marker='o', markersize=3, color="red")
#plt.plot(x, y_pred, color='black')
plt.plot(x, regressor.predict(x), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position')
plt.ylabel('Salary')
for i in range(len(df_total)):
    Y11.append(abs(Y1[i][0]))

# getting rid of 0's (keep only values strictly between 1 and 121)
for i in range(len(Y11)):
    if Y11[i] > 1 and Y11[i] < 121:
        a = Y11[i]
        Y.append(a)

for i in range(len(Y)):
    X.append([i])
print(X)

# initialize the model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

X_predicted = []
Y_predicted = []

# total length to iterate over; since i is iterated through the total length
# and (i + 24) is used, it must stay inside the data
total = len(X) - TrainingWindow - 2

# iterations of i for every n, where n is the window size (2 in this case)
PredictionWindowArray = []
MAPE = []
TrueError = []
PredictionWindow = 2
i = 0
X_train, X_test, y_train, y_test = train_test_split(train_X_reduced, train_y, test_size=0.2)

##################
np.set_printoptions(precision=4)
pd.set_option('precision', 4)

model_ridge = Ridge(alpha=12.0, random_state=seed)
model_KRR = KernelRidge(alpha=0.2, kernel='polynomial', degree=2, coef0=2.0, gamma=0.0032)
model_svr = SVR(C=44.73, epsilon=0.0774, gamma=0.0004, kernel='rbf')
model_byr = BayesianRidge()
model_ENet = ElasticNet(alpha=0.0001, l1_ratio=0.551, random_state=seed, max_iter=10000)
model_lasso = Lasso(alpha=0.0004, random_state=seed)
model_lsvr = LinearSVR(C=0.525, epsilon=0.04, random_state=seed)
model_lasso_lars = LassoLars(alpha=1.22e-05)
model_rforest = RandomForestRegressor(n_estimators=300, max_features=0.4,
                                      min_samples_split=4, random_state=seed)
model_GBoost = GradientBoostingRegressor(n_estimators=2000,
# Initialize models
clf_line = LinearRegression()
clf_ridg = Ridge(alpha=300, tol=1e-05, solver='sparse_cg', max_iter=5000)
clf_laso = Lasso(alpha=0.1, tol=1e-05, max_iter=5000)
clf_lala = LassoLars(alpha=0.001, max_iter=5000)
clf_enet = ElasticNet(alpha=0.1, tol=0.001, l1_ratio=0.2, max_iter=5000)
clf_xgbr = xgb.XGBRegressor()    # not tuned yet
clf_xgrf = xgb.XGBRFRegressor()  # not tuned yet
# clf_rf = RandomForestRegressor(criterion='mae', max_features='sqrt', n_estimators=200, max_depth=10)
clf_tree = ExtraTreesRegressor(criterion='mae', max_features='sqrt', n_estimators=200, max_depth=10)
clf_ada = AdaBoostRegressor(n_estimators=3, loss='linear')
# clf_grad = GradientBoostingRegressor()  # not tuned yet
clf_svr = SVR(kernel='rbf', C=0.1)

# original 5 base models:
# base_model_name = ['RandomForest', 'ExtraTree', 'AdaBoost', 'GradientBoosting', 'SVR']
# base_model_list = [clf_rf, clf_tree, clf_ada, clf_grad, clf_svr]
# new 5 base models:
base_model_name = ['Ridge', 'SVR', 'XgbReg', 'ExtraTree', 'AdaBoost']
base_model_list = [clf_ridg, clf_svr, clf_xgbr, clf_tree, clf_ada]
# base_model_name = ['LinearReg', 'Ridge', 'Lasso', 'LassoLars', 'ElasticNet', 'XgbReg', 'XgbRf', 'ExtraTree', 'AdaBoost', 'SVR']
# base_model_list = [clf_line, clf_ridg, clf_laso, clf_lala, clf_enet, clf_xgbr, clf_xgrf, clf_tree, clf_ada, clf_svr]
# base_model_name = ['LinearReg', 'Ridge', 'Lasso', 'LassoLars', 'ElasticNet', 'Xgb', 'RandomForest', 'ExtraTree', 'AdaBoost', 'GradientBoosting', 'SVR']
# base_model_list = [clf_line, clf_ridg, clf_laso, clf_lala, clf_enet, clf_bxgb, clf_rf, clf_tree, clf_ada, clf_grad, clf_svr]
import warnings

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR

n = 15  # number of iterations of the training and testing process
scaler = StandardScaler()
linear_regression = LinearRegression()
forest = RandomForestRegressor()
boosting = GradientBoostingRegressor(random_state=60)
# Available kernels are rbf, linear and polynomial; of these, the polynomial
# kernel gives the highest (worst) MAE and MSE.
regressor = SVR(kernel='linear')

warnings.filterwarnings("ignore")
pd.set_option('display.width', 10000000)
pd.set_option('display.max_columns', 10000000)
# pd.set_option('display.max_rows', 10000000)

DataSet = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")
# Wii Sports is not a game but a bundle of games that sold around 82.53 million
# copies, far more than any other game in the dataset. It would be a huge
# outlier and hence hurt the accuracy of any model, so we remove it.
DataSet.drop(index=[0], inplace=True)
DataSet.drop('Developer', axis=1, inplace=True
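# --- A minimal sketch of the kernel comparison the comment above refers to.
# --- X_scaled and y are assumptions here: they stand in for the prepared
# --- feature matrix and sales target from this script.
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

for kern in ('rbf', 'linear', 'poly'):
    # negate neg_mean_absolute_error to report a plain MAE per kernel
    mae = -cross_val_score(SVR(kernel=kern), X_scaled, y,
                           scoring='neg_mean_absolute_error', cv=5).mean()
    print(kern, round(mae, 3))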
dataset = pd.read_csv("Position_Salaries.csv") X = dataset.iloc[:, 1:-1].values y = dataset.iloc[:, -1].values #feature scalling from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() sc_y = StandardScaler() X = sc_X.fit_transform(X) y = sc_y.fit_transform(y.reshape(-1,1).astype(float)) #fitting the svr model to the dataset #create regressor from sklearn.svm import SVR regressor = SVR(kernel = "rbf") regressor.fit(X,y) #predicting a new result using svr y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]])))) #visualing the svr result X_grid = np.arange(min(X), max(X),0.1) X_grid = X_grid.reshape((len(X_grid),1)) plt.scatter(X, y, color='red') plt.plot(X_grid, regressor.predict(X_grid),color="blue") plt.title("regression model")
# Commented out: a train/test split is not used for this small dataset.
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()  # object that scales X
sc_Y = StandardScaler()  # object that scales Y
X = X.reshape(-1, 1)
Y = Y.reshape(-1, 1)
X = sc_X.fit_transform(X)  # fit and transform X to the standardized scale
Y = sc_Y.fit_transform(Y)

# Fitting the SVR to the dataset: create our regressor
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, Y)

# Predicting the result with the SVR model. The query must be transformed
# with the sc_X object, because X and Y were feature-scaled before fitting
# and a raw 6.5 is not on that scale; np.array makes it a 2D array. The
# scaled prediction is then inverse-transformed with sc_Y to recover a
# salary on the original scale.
Y_pred = sc_Y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]]))))

# visualising the SVR results
plt.scatter(X, Y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title("Truth or Bluff (SVR)")
# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result: the query point must be scaled with sc_X because
# the model was trained on scaled features
y_pred = regressor.predict(sc_X.transform([[6.5]]))
y_pred = sc_y.inverse_transform(y_pred)

# Visualising the SVR results
plt.scatter(X, y, color='purple')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
features = ['High', 'Low', 'Open']
X = preprocessing.scale(df_svr[features])
y = df_svr.Price

# Take the first 90% as the train data
n_split = int(len(df_svr) * 0.9)

# Define training and testing sets
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]
test_date = df_svr.Date[n_split:]

time_Start = time.time()

# Regressor
clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
clf = clf.fit(X_train, y_train)

# Prediction
prediction = clf.predict(X_test)
petur.print_evaluation(y_test, prediction, "SVR")

time_End = time.time()
print("Seconds to run:", time_End - time_Start)

# In[37]:
#==============================================================================
# Plots
#==============================================================================
# Define time series data
real_price = pd.Series(y_test)
from sklearn.preprocessing import PolynomialFeatures
# for degree in range(2, 6):
#     model = make_pipeline(PolynomialFeatures(degree=degree), linear_model.Ridge())
#     scores = cross_val_score(model, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
#     print("MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
model = make_pipeline(PolynomialFeatures(degree=3), linear_model.Ridge())
if verbose:
    print('polyl2::Cross validating')
scores = cross_val_score(model, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
scores_map['PolyRidge'] = scores

if alg == 'svr':
    from sklearn.svm import SVR
    from sklearn.model_selection import GridSearchCV
    if verbose:
        print('SVR::Initiating SVR')
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    # grid_sv = GridSearchCV(svr_rbf, cv=kf,
    #                        param_grid={"C": [1e0, 1e1, 1e2, 1e3],
    #                                    "gamma": np.logspace(-2, 2, 5)},
    #                        scoring='neg_mean_squared_error')
    # grid_sv.fit(x_scaled, y)
    # print("Best classifier :", grid_sv.best_estimator_)
    if verbose:
        print('SVR::Cross validating')
    scores = cross_val_score(svr_rbf, x_scaled, y, cv=kf, scoring='neg_mean_squared_error')
    scores_map['SVR'] = scores

if alg == 'tree':
    from sklearn.tree import DecisionTreeRegressor
    desc_tr = DecisionTreeRegressor(max_depth=5)
    # grid_sv = GridSearchCV(desc_tr, cv=kf,
    #                        param_grid={"max_depth": [1, 2, 3, 4, 5, 6, 7]},
    #                        scoring='neg_mean_squared_error')
    # grid_sv.fit(x_scaled, y)
    # print("Best classifier :", grid_sv.best_estimator_)
import numpy as np

dfTraining = pd.read_csv("DataSetTraining.csv")
dfTesting = pd.read_csv("DataSetTesting.csv")

X_train = dfTraining[["anio", "mes", "dia"]]
y_train = dfTraining.AvgMedicion
X_testing = dfTesting[["anio", "mes", "dia"]]
y_testing = dfTesting.AvgMedicion

print("-------------------- Normal SVM -------------------------")
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X_train, y_train)
# fit() returns the estimator with its full parameter list, e.g.:
# SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
#     max_iter=-1, shrinking=True, tol=0.001, verbose=False)
scores = cross_val_score(clf, X_train, y_train, cv=10)

res1 = clf.predict(X_testing)
print("---------------------------------------------")
index = 0
for element in res1:
    # absolute percentage error per prediction
    error = (abs(element - y_testing[index]) / y_testing[index]) * 100
    print('Predicted Value: ', element, ' Real value: ', y_testing[index],
          " % Error: ", error)
    index = index + 1
        all_columns].rename(columns={
            "fd_num_" + str(i): "scaled_x",
            "norm_cells_" + str(i): "norm_y"
        })
    ], axis=0, ignore_index=True)

X_columns = ["scaled_x"] + ["MAX_CONC"] + X_PubChem_properties + X_targets \
    + X_target_pathway + X_cancer_cell_lines

scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
y_train_drug = train_drug["norm_y"].values

print("Linear SVR")
param_tested_C = [0.01, 0.1, 1, 5, 10, 100, 500]
param_tested_epsilon = [0.001, 0.01, 0.1, 1]
param_grid = dict(C=param_tested_C, epsilon=param_tested_epsilon)

splitter_loo = LeaveOneOut()
grid = GridSearchCV(SVR(kernel="linear"), param_grid=param_grid,
                    cv=splitter_loo, scoring="neg_mean_absolute_error")
grid.fit(Xtrain_drug, y_train_drug)
print("Dataset:4, best C:", grid.best_params_["C"])
print("Dataset:4, best_epsilon", grid.best_params_["epsilon"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

X_train, X_mean, X_std = normalize(X_train)
X_test = normalize_test(X_test, X_mean, X_std)
y_train, y_mean, y_std = normalize(y_train)
# y_test = normalize_test(y_test, y_mean, y_std)

# ==============
# MODEL CREATION
# ==============
svr_model = SVR()
rf_model = RandomForestRegressor(n_estimators=100)
adb_model = AdaBoostRegressor(n_estimators=100)
xgb_model = XGBRegressor()

svr_model.fit(X_train, y_train)
joblib.dump(svr_model,
            path + 'models/' + str(data_interval) + 'min/svr_' + stock + '.pkl')
# svr_model = joblib.load(path + 'models/' + str(data_interval) + 'min/svr_' + stock + '.pkl')

rf_model.fit(X_train, y_train)
joblib.dump(rf_model,
            path + 'models/' + str(data_interval) + 'min/rf_' + stock + '.pkl')
# rf_model = joblib.load(path + 'models/' + str(data_interval) + 'min/rf_' + stock + '.pkl')
def save_model(path, aaindex_r2_list, learning_set, validation_set, threshold=5,
               regressor='pls', no_fft=False, train_on_all=False):
    """
    Function save_model saves the best -s THRESHOLD models as 'Pickle' files
    (pickle.dump), which can be loaded again for doing predictions. Also
    included in save_model is the def cross_validation-based computing of the
    k-fold CV performance of the n_component-optimized model on all data
    (learning + validation set); by default k is 5 (n_samples = 5). Plots of
    the CV performance for the t best models are stored inside the folder
    CV_performance.
    """
    regressor = regressor.lower()
    try:
        os.mkdir('CV_performance')
    except FileExistsError:
        pass
    try:
        os.mkdir('Pickles')
    except FileExistsError:
        pass
    try:
        os.remove('CV_performance/_CV_Results.txt')
    except FileNotFoundError:
        pass

    file = open('CV_performance/_CV_Results.txt', 'w')
    file.write('5-fold cross-validated performance of top models for validation set across all data.\n\n')
    if no_fft:
        file.write("No FFT used in this model construction, performance represents"
                   " model accuracies on raw encoded sequence data.\n\n")
    file.close()

    for t in range(threshold):
        try:
            idx = aaindex_r2_list[t][0]
            parameter = aaindex_r2_list[t][7]

            # Estimating the CV performance of the n_component-fitted model on all data
            xy_learn = XY(full_path(idx), learning_set)
            xy_test = XY(full_path(idx), validation_set)
            if no_fft is False:
                x_test, y_test, _ = xy_test.get_x_and_y()
                x_learn, y_learn, _ = xy_learn.get_x_and_y()
            else:
                _, y_test, x_test = xy_test.get_x_and_y()
                _, y_learn, x_learn = xy_learn.get_x_and_y()

            x = np.concatenate([x_learn, x_test])
            y = np.concatenate([y_learn, y_test])

            if regressor == 'pls' or regressor == 'pls_cv':
                # n_components according to lowest MSE for validation set
                regressor_ = PLSRegression(n_components=parameter.get('n_components'))
            elif regressor == 'rf':
                regressor_ = RandomForestRegressor(
                    random_state=parameter.get('random_state'),
                    n_estimators=parameter.get('n_estimators'),
                    max_features=parameter.get('max_features')
                )
            elif regressor == 'svr':
                regressor_ = SVR(C=parameter.get('C'), gamma=parameter.get('gamma'))
            elif regressor == 'mlp':
                regressor_ = MLPRegressor(
                    hidden_layer_sizes=parameter.get('hidden_layer_sizes'),
                    activation=parameter.get('activation'),
                    solver=parameter.get('solver'),
                    learning_rate=parameter.get('learning_rate'),
                    learning_rate_init=parameter.get('learning_rate_init'),
                    max_iter=parameter.get('max_iter'),
                    random_state=parameter.get('random_state')
                )
            else:
                raise SystemError("Did not find specified regression model as valid option. "
" "See '--help' for valid regression model options.") # perform 5-fold cross-validation on all data (on X and Y) n_samples = 5 y_test_total, y_predicted_total = cross_validation(x, y, regressor_, n_samples) r_squared = r2_score(y_test_total, y_predicted_total) rmse = np.sqrt(mean_squared_error(y_test_total, y_predicted_total)) stddev = np.std(y_test_total, ddof=1) nrmse = rmse / stddev pearson_r = np.corrcoef(y_test_total, y_predicted_total)[0][1] # ranks for Spearman correlation y_test_total_rank = np.array(y_test_total).argsort().argsort() y_predicted_total_rank = np.array(y_predicted_total).argsort().argsort() spearman_rho = np.corrcoef(y_test_total_rank, y_predicted_total_rank)[0][1] with open('CV_performance/_CV_Results.txt', 'a') as f: f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format( regressor.upper(), parameter, idx[:-4])) f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};' ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho)) figure, ax = plt.subplots() ax.scatter(y_test_total, y_predicted_total, marker='o', s=20, linewidths=0.5, edgecolor='black') ax.plot([min(y_test_total) - 1, max(y_test_total) + 1], [min(y_predicted_total) - 1, max(y_predicted_total) + 1], 'k', lw=2) ax.legend([ '$R^2$ = {}\nRMSE = {}\nNRMSE = {}\nPearson\'s $r$ = {}\nSpearman\'s '.format( round(r_squared, 3), round(rmse, 3), round(nrmse, 3), round(pearson_r, 3)) + r'$\rho$ = {}'.format(str(round(spearman_rho, 3))) ]) ax.set_xlabel('Measured') ax.set_ylabel('Predicted') plt.savefig('CV_performance/' + idx[:-4] + '_' + str(n_samples) + '-fold-CV.png', dpi=250) plt.close('all') if train_on_all: # fit on all available data (learning + validation set; FFT or noFFT is defined already above) regressor_.fit(x, y) else: # fit (only) on full learning set (FFT or noFFT is defined already above) regressor_.fit(x_learn, y_learn) file = open(os.path.join(path, 'Pickles/'+idx[:-4]), 'wb') pickle.dump(regressor_, file) file.close() except IndexError: break return ()
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2:3].values

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())  # ravel() avoids the column-vector DataConversionWarning

# reshape(-1, 1): newer scikit-learn versions expect 2D input to inverse_transform
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))
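# Hedged sketch: to visualize the fit in the original units, the scaled inputs
# and predictions can be inverse-transformed (uses only the variables above).
plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color='red')
plt.plot(sc_X.inverse_transform(X),
         sc_y.inverse_transform(regressor.predict(X).reshape(-1, 1)), color='blue')
plt.title('SVR fit (original scale)')
plt.show()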
df_y = df.iloc[:, 4].values

# remove NA values
# categorical to continuous
# add new columns
# remove columns
df_x = df_x[:, 3:4]

# split the data into train and test
# (sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, train_size=0.8)

# perform the logic
from sklearn.svm import SVR
regressor = SVR()
regressor = regressor.fit(x_train, y_train)
result_re = regressor.predict(x_test)

# threshold the regression output into binary labels
result_final = []
# result_final.append(0)
for i in range(0, len(result_re)):
    if result_re[i] > 0.4:
        result_final.append(1)
    else:
        result_final.append(0)
result_final = np.asarray(result_final)
# print(result_re)

# consume the result
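# Hedged usage sketch: once the regression output is thresholded into 0/1
# labels, standard classification metrics apply (assumes y_test holds binary labels).
from sklearn.metrics import accuracy_score
print("accuracy:", accuracy_score(y_test, result_final))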
def get_r2(x_learn, x_valid, y_learn, y_valid, regressor='pls'):
    """
    The function get_r2 takes features and labels from the learning and
    validation set.

    When using 'pls' as regressor, the MSE is calculated for all LOOCV sets
    for predicted vs true labels (mse = mean_squared_error(y_test_loo, y_pred_loo))
    for a fixed number of components for PLS regression. In the next iteration,
    the number of components is increased by 1 (number_of_components += 1) and
    the MSE is calculated for this regressor. The loop breaks if i > 9. Finally,
    the model of the single AAindex model with the lowest MSE is chosen.

    When using other regressors the parameters are tuned using GridSearchCV.

    This function returns performance (R2, (N)RMSE, Pearson's r) and model parameters.
    """
    regressor = regressor.lower()
    mean_squared_error_list = []
    if regressor == 'pls':
        # PLS regression with LOOCV n_components tuning as described by Cadet et al.
        # https://doi.org/10.1186/s12859-018-2407-8
        # https://doi.org/10.1038/s41598-018-35033-y
        # Hyperparameter (n_components) tuning of the PLS regressor
        for n_comp in range(1, 10):  # n_comp = 1, 2, ..., 9
            pls = PLSRegression(n_components=n_comp)
            loo = LeaveOneOut()
            y_pred_loo = []
            y_test_loo = []
            for train, test in loo.split(x_learn):
                x_learn_loo = []
                y_learn_loo = []
                x_test_loo = []
                for j in train:
                    x_learn_loo.append(x_learn[j])
                    y_learn_loo.append(y_learn[j])
                for k in test:
                    x_test_loo.append(x_learn[k])
                    y_test_loo.append(y_learn[k])
                pls.fit(x_learn_loo, y_learn_loo)
                y_pred_loo.append(pls.predict(x_test_loo)[0][0])
            mse = mean_squared_error(y_test_loo, y_pred_loo)
            mean_squared_error_list.append(mse)
        mean_squared_error_list = np.array(mean_squared_error_list)
        # np.where(...) finds the best number of components (lowest MSE)
        idx = np.where(mean_squared_error_list == np.min(mean_squared_error_list))[0][0] + 1
        # Model is fitted with best n_components (lowest MSE)
        best_params = {'n_components': idx}
        regressor_ = PLSRegression(n_components=best_params.get('n_components'))
    # other regression options (CV tuning)
    elif regressor == 'pls_cv':
        params = {'n_components': list(np.arange(1, 10))}  # n_comp = 1, 2, ..., 9
        regressor_ = GridSearchCV(PLSRegression(), param_grid=params, iid=False, cv=5)
        # iid is redundant in future scikit-learn versions
    elif regressor == 'rf':
        params = {
            # similar parameter grid as Xu et al., https://doi.org/10.1021/acs.jcim.0c00073
            'random_state': [42],  # state determined
            'n_estimators': [100, 250, 500, 1000],  # number of individual decision trees in the forest
            'max_features': ['auto', 'sqrt', 'log2']  # "auto" -> max_features=n_features,
            # "sqrt" -> max_features=sqrt(n_features), "log2" -> max_features=log2(n_features)
        }
        regressor_ = GridSearchCV(RandomForestRegressor(), param_grid=params, iid=False, cv=5)
    elif regressor == 'svr':
        params = {
            # similar parameter grid as Xu et al.
            'C': [2 ** 0, 2 ** 2, 2 ** 4, 2 ** 6, 2 ** 8, 2 ** 10, 2 ** 12],  # regularization parameter
            'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]  # often 1 / n_features or 1 / (n_features * X.var())
        }
        regressor_ = GridSearchCV(SVR(), param_grid=params, iid=False, cv=5)
    elif regressor == 'mlp':
        params = {
            # feedforward network trained via backpropagation - here only using a single hidden layer
            'hidden_layer_sizes': [i for i in range(1, 12)],  # hidden layer sizes 1, 2, ..., 11
            'activation': ['relu'],  # rectified linear unit
            'solver': ['adam', 'lbfgs'],  # ADAM: A Method for Stochastic Optimization, or limited-memory BFGS
            'learning_rate': ['constant'],  # learning rate given by 'learning_rate_init'
            'learning_rate_init': [0.001, 0.01, 0.1],  # only used when solver='sgd' or 'adam'
            'max_iter': [1000, 200],  # for stochastic solvers ('sgd', 'adam') determines epochs
            'random_state': [42]
        }
        regressor_ = GridSearchCV(MLPRegressor(), param_grid=params, iid=False, cv=5)
    else:
        raise SystemError("Did not find specified regression model as valid option. See '--help' for valid "
                          "regression model options.")

    regressor_.fit(x_learn, y_learn)  # fit model
    if regressor != 'pls':
        # take best parameters for the regressor and the AAindex
        best_params = regressor_.best_params_
    y_pred = []
    for y_p in regressor_.predict(x_valid):  # predict validation entries with fitted model
        y_pred.append(float(y_p))
    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    nrmse = rmse / np.std(y_valid, ddof=1)
    # ranks for Spearman's rank correlation
    y_val_rank = np.array(y_valid).argsort().argsort()
    y_pred_rank = np.array(y_pred).argsort().argsort()
    with warnings.catch_warnings():
        # catching RuntimeWarning when there is no variance in an array,
        # e.g. [2, 2, 2, 2], which would mean divide by zero
        warnings.simplefilter("ignore")
        pearson_r = np.corrcoef(y_valid, y_pred)[0][1]
        spearman_rho = np.corrcoef(y_val_rank, y_pred_rank)[0][1]
    return r2, rmse, nrmse, pearson_r, spearman_rho, regressor, best_params
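# Hedged compatibility note: the iid argument used in the grids above was
# deprecated in scikit-learn 0.22 and removed in 0.24; on current versions the
# same searches are written without it, e.g.:
regressor_ = GridSearchCV(SVR(), param_grid=params, cv=5)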
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Using linear regression model
lr_model = LinearRegression(n_jobs=-1)
lr_model.fit(X_train, y_train)

# Using polynomial ridge regression with degree-2 and degree-3 features
quad1_model = make_pipeline(PolynomialFeatures(2), Ridge())
quad1_model.fit(X_train, y_train)
quad2_model = make_pipeline(PolynomialFeatures(3), Ridge())
quad2_model.fit(X_train, y_train)

# Using SVM radial basis function (RBF) model
rbf_model = SVR(kernel='rbf', C=1e3, gamma=0.1)
rbf_model.fit(X_train, y_train)

# KNN Regression
knn_model = KNeighborsRegressor(n_neighbors=2)
knn_model.fit(X_train, y_train)

# Get confidence (R^2) scores for each model
lr_confidence = lr_model.score(X_test, y_test)
quad1_confidence = quad1_model.score(X_test, y_test)
quad2_confidence = quad2_model.score(X_test, y_test)
rbf_confidence = rbf_model.score(X_test, y_test)
knn_confidence = knn_model.score(X_test, y_test)

# print confidence scores for each model -- quad2 performs best
print("lr confidence: ", lr_confidence)
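# Hedged sketch: a single train/test split can be noisy; cross_val_score gives
# a more stable comparison of the same models (5-fold R^2, using X and y above).
from sklearn.model_selection import cross_val_score

for name, model in [("lr", lr_model), ("quad1", quad1_model),
                    ("quad2", quad2_model), ("rbf", rbf_model), ("knn", knn_model)]:
    scores = cross_val_score(model, X, y, cv=5)
    print(name, "mean R^2:", scores.mean())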
                   random_state=rand_st)
    scores = cross_validate(estimator=rgr, X=data_np, y=target_np, scoring=scorers, cv=5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])  # turns negative MSE into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("Neural Network RMSE: %0.2f (+/- %0.2f)" % (scores_RMSE.mean(), scores_RMSE.std() * 2))
    print("Neural Network Expl Var: %0.2f (+/- %0.2f)" % (scores_Expl_Var.mean(), scores_Expl_Var.std() * 2))
    print("CV Runtime:", time.time() - start_ts)

if norm_features == 1:
    # SciKit SVM - Cross Val
    start_ts = time.time()
    rgr = SVR(kernel='linear', gamma=0.1, C=1.0)  # gamma is ignored by the linear kernel
    scores = cross_validate(rgr, data_np, target_np, scoring=scorers, cv=5)
    scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])  # turns negative MSE into RMSE
    scores_Expl_Var = scores['test_expl_var']
    print("SVM RMSE: %0.2f (+/- %0.2f)" % (scores_RMSE.mean(), scores_RMSE.std() * 2))
    print("SVM Expl Var: %0.2f (+/- %0.2f)" % (scores_Expl_Var.mean(), scores_Expl_Var.std() * 2))
    print("CV Runtime:", time.time() - start_ts)
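# Hedged sketch: scorers is defined elsewhere in this script; given the result
# keys used above ('test_Neg_MSE', 'test_expl_var'), it is presumably something like:
scorers = {'Neg_MSE': 'neg_mean_squared_error', 'expl_var': 'explained_variance'}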
def BOP():
    big_score = 0
    big_name = ""
    big_y_pred = 0
    big_error = 0
    # Note: score() expects (X, y), i.e. features and true labels; the original
    # calls passed (y2, y_pred) or (y_pred, y2), which is incorrect. 'is not 1'
    # compared identity rather than value and is replaced with '!= 1'.
    models = [
        (LinearRegression(), "Linear Regression"),
        (KNeighborsRegressor(), "KNN Regression"),
        (DecisionTreeRegressor(), "Decision Tree"),
        (SVR(), "SVR"),
        (RandomForestRegressor(), "Random Forest"),
        (GradientBoostingRegressor(), "Gradient Booster"),
        (ExtraTreesRegressor(), "Extra Trees Regressor"),
        (AdaBoostRegressor(), "AdaBoost Regressor"),
    ]
    for model, name in models:
        model.fit(x1, y1)
        y_pred = model.predict(x2)
        error = mean_squared_error(y2, y_pred)
        score = model.score(x2, y2)  # R^2 on the test set
        if score > big_score and score != 1:
            big_score = score
            big_name = name
            big_y_pred = y_pred
            big_error = error
    plotgraph(big_y_pred, big_name, big_error, big_score)
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle

# Load housing data
data = datasets.load_boston()

# Shuffle the data
X, y = shuffle(data.data, data.target, random_state=7)

# Split the data into training and testing datasets
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Create Support Vector Regression model
sv_regressor = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Train Support Vector Regressor
sv_regressor.fit(X_train, y_train)

# Evaluate performance of Support Vector Regressor
y_test_pred = sv_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
evs = explained_variance_score(y_test, y_test_pred)
print("\n#### Performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Test the regressor on test datapoint
test_data = [3.7, 0, 18.4, 1, 0.87, 5.95, 91, 2.5052, 26, 666, 20.2, 351.34, 15.27]
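# Hedged completion: the snippet defines test_data but stops before using it;
# the natural next step would presumably be a single prediction like:
print("Predicted price:", sv_regressor.predict([test_data])[0])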
def compare1():
    f1, axes = subplots(2, 3)
    # Note: the original titles mislabeled the values ('error' is the MSE and
    # 'score' is the R^2), and score() expects (X, y), i.e. features and true
    # labels, not (y_pred, y2) / (y2, y_pred).
    models = [
        (LinearRegression(), "Linear Regression", (0, 0)),
        (joblib.load('lasso.pkl'), "Lasso", (0, 1)),
        (KNeighborsRegressor(), "KNN Regression", (0, 2)),
        (DecisionTreeRegressor(), "Decision Tree", (1, 0)),
        (SVR(), "SVR", (1, 1)),
        (ElasticNet(), "Elastic Net", (1, 2)),
    ]
    for model, name, (row, col) in models:
        if name != "Lasso":  # the Lasso model is pre-trained and loaded from disk
            model.fit(x1, y1)
        y_pred = model.predict(x2)
        error = str(round(mean_squared_error(y2, y_pred), 6))
        score = str(round(model.score(x2, y2), 6))
        axes[row, col].plot(y_pred)
        axes[row, col].set_title(name + ": MSE = " + error + " R2 = " + score)
    f1.show()
import matplotlib.pyplot as plt

dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
y = np.reshape(y, (-1, 1))

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
# we cannot apply StandardScaler to a 1D array, but we can use scale
# from sklearn.preprocessing import scale
# y = scale(y)
y = sc_y.fit_transform(y)

plt.scatter(x, y, color='red')
plt.plot(x, y, color='red')
plt.show()

from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(x, y.ravel())  # ravel() flattens the column vector for fit

plt.scatter(x, y, color='red')
plt.plot(x, regressor.predict(x), color='blue')
plt.show()
def analyze(data, label, num_folds):
    # Partition data into folds
    n = len(data) // num_folds
    data_folds = [data[i:i + n] for i in range(0, len(data), n)]
    label_folds = [label[i:i + n] for i in range(0, len(label), n)]
    lin_reg_error = 0
    cs = [4 ** c for c in range(-10, 0, 1)]
    svm_error = [0] * len(cs)
    svm_std = [0] * len(cs)
    # for i in range(0, num_folds):
    #     test_data = data_folds[i]
    #     test_label = label_folds[i]
    #     train_data = []
    #     train_label = []
    #     for j in range(num_folds):
    #         if i != j:
    #             train_data += data_folds[j]
    #             train_label += label_folds[j]
    #     model = linear_model.LinearRegression()
    #     model.fit(train_data, train_label)  # originally fit on all data, a bug
    #     lin_reg_error += np.mean(abs(model.predict(test_data) - test_label))
    #
    #     for i2 in range(len(cs)):
    #         svm_classifier = SVR(gamma=cs[i2])
    #         svm_classifier.fit(train_data, train_label)
    #         svm_error[i2] += np.mean(abs(svm_classifier.predict(test_data) - test_label))
    #         svm_std[i2] += np.std(abs(svm_classifier.predict(test_data) - test_label))
    svm_c = SVR(gamma=4 ** -7)
    svm_c.fit(data, label)
    return svm_c
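# Hedged sketch: the commented-out fold loop above amounts to manual k-fold
# selection of gamma; GridSearchCV does the same in a few lines. The names
# cs, data, label, and num_folds are taken from the surrounding function.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(SVR(), {'gamma': cs}, cv=num_folds,
                    scoring='neg_mean_absolute_error')
grid.fit(data, label)
print("best gamma:", grid.best_params_['gamma'])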