def svm_regressor(features,target,test_size_percent=0.2,cv_split=5): scale=preprocessing.MinMaxScaler() X_array = scale.fit_transform(features) y_array = scale.fit_transform(target) X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4) svr = SVR(kernel='rbf',C=10,gamma=1) svr.fit(X_train,y_train.ravel()) test_prediction = svr.predict(X_test) tscv = TimeSeriesSplit(cv_split) training_score = cross_val_score(svr,X_train,y_train,cv=tscv.n_splits) testing_score = cross_val_score(svr,X_test,y_test,cv=tscv.n_splits) print"Cross-val Training score:", training_score.mean() # print"Cross-val Testing score:", testing_score.mean() training_predictions = cross_val_predict(svr,X_train,y_train,cv=tscv.n_splits) testing_predictions = cross_val_predict(svr,X_test,y_test,cv=tscv.n_splits) training_accuracy = metrics.r2_score(y_train,training_predictions) # test_accuracy_model = metrics.r2_score(y_test,test_prediction_model) test_accuracy = metrics.r2_score(y_test,testing_predictions) # print"Cross-val predicted accuracy:", training_accuracy print"Test-predictions accuracy:",test_accuracy return svr
def Random_forest(features,target,test_size_percent=0.2,cv_split=3): X_array = features.as_matrix() y_array = target.as_matrix() model_rdf = RandomForestRegressor() X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4) model_rdf.fit(X_train,y_train) test_prediction = model_rdf.predict(X_test) tscv = TimeSeriesSplit(cv_split) training_score = cross_val_score(model_rdf,X_train,y_train,cv=tscv.n_splits) testing_score = cross_val_score(model_rdf,X_test,y_test,cv=tscv.n_splits) print"Cross-val Training score:", training_score.mean() # print"Cross-val Testing score:", testing_score.mean() training_predictions = cross_val_predict(model_rdf,X_train,y_train,cv=tscv.n_splits) testing_predictions = cross_val_predict(model_rdf,X_test,y_test,cv=tscv.n_splits) training_accuracy = metrics.r2_score(y_train,training_predictions) # test_accuracy_model = metrics.r2_score(y_test,test_prediction_model) test_accuracy = metrics.r2_score(y_test,testing_predictions) # print"Cross-val predicted accuracy:", training_accuracy print"Test-predictions accuracy:",test_accuracy plot_model(target,y_train,y_test,training_predictions,testing_predictions) return model_rdf
def neural_net(features,target,test_size_percent=0.2,cv_split=3,n_iter=100,learning_rate=0.01): '''Features -> Pandas Dataframe with attributes as columns target -> Pandas Dataframe with target column for prediction Test_size_percent -> Percentage of data point to be used for testing''' scale=preprocessing.MinMaxScaler() X_array = scale.fit_transform(features) y_array = scale.fit_transform(target) mlp = Regressor(layers=[Layer("Rectifier",units=5), # Hidden Layer1 Layer("Rectifier",units=3) # Hidden Layer2 ,Layer("Linear")], # Output Layer n_iter = n_iter, learning_rate=0.01) X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4) mlp.fit(X_train,y_train) test_prediction = mlp.predict(X_test) tscv = TimeSeriesSplit(cv_split) training_score = cross_val_score(mlp,X_train,y_train,cv=tscv.n_splits) testing_score = cross_val_score(mlp,X_test,y_test,cv=tscv.n_splits) print"Cross-val Training score:", training_score.mean() # print"Cross-val Testing score:", testing_score.mean() training_predictions = cross_val_predict(mlp,X_train,y_train,cv=tscv.n_splits) testing_predictions = cross_val_predict(mlp,X_test,y_test,cv=tscv.n_splits) training_accuracy = metrics.r2_score(y_train,training_predictions) # test_accuracy_model = metrics.r2_score(y_test,test_prediction_model) test_accuracy = metrics.r2_score(y_test,testing_predictions) # print"Cross-val predicted accuracy:", training_accuracy print"Test-predictions accuracy:",test_accuracy plot_model(target,y_train,y_test,training_predictions,testing_predictions) return mlp
def fit_on_increasing_size(model):
    """Refit *model* on synthetic data of growing dimensionality.

    For each feature count in 10, 30, ..., 790, a fresh dataset of
    2 * n_samples points is drawn (10% informative features, unit noise),
    split in half into train/test, and the model is refitted.

    Returns (n_features_, r2_train, r2_test, snr) as numpy arrays.
    """
    n_samples = 100
    n_features_ = np.arange(10, 800, 20)
    r2_train = []
    r2_test = []
    snr = []
    for n_features in n_features_:
        # 10% of the features carry signal (coefficient 1), the rest are 0.
        n_informative = int(n_features / 10)
        np.random.seed(42)  # reproducible draw at every dimensionality
        X = np.random.randn(n_samples * 2, n_features)
        beta = np.zeros(n_features)
        beta[:n_informative] = 1
        signal = np.dot(X, beta)
        noise = np.random.randn(n_samples * 2)
        y = signal + noise
        # First half trains, second half tests.
        Xtrain = X[:n_samples, :]
        Xtest = X[n_samples:, :]
        ytrain = y[:n_samples]
        ytest = y[n_samples:]
        fitted = model.fit(Xtrain, ytrain)
        snr.append(signal.std() / noise.std())
        r2_train.append(metrics.r2_score(ytrain, fitted.predict(Xtrain)))
        r2_test.append(metrics.r2_score(ytest, fitted.predict(Xtest)))
    return n_features_, np.array(r2_train), np.array(r2_test), np.array(snr)
def decision_tree(train_features, train_labels, test_features, test_labels, feature_names): regressor = tree.DecisionTreeRegressor() regressor.fit(train_features, train_labels) test_results = cap_results(regressor.predict(test_features)) train_results = cap_results(regressor.predict(train_features)) print "test result", metrics.mean_squared_error(test_labels, test_results) print "test r2", metrics.r2_score(test_labels, test_results) print "train result", metrics.mean_squared_error(train_labels, train_results) print "train r2", metrics.r2_score(train_labels, train_results) # print "importances" # temp = [] # for index, val in enumerate(regressor.feature_importances_): # if val > 0.001: # temp.append((index, val)) # print sorted(temp, key=lambda x: x[1]) '''graph stuff''' dot_data = StringIO() tree.export_graphviz(regressor, out_file=dot_data, special_characters=True, class_names=regressor.classes_, impurity=False, feature_names=feature_names) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph.write_pdf("tree.pdf") return (test_results, train_results)
def fit_predict(self, X, y, T):
    """Stage-1 out-of-fold stacking.

    For each base model: fit on K-1 folds of (X, y), predict the held-out
    fold into S_train, and average the K predictions on T into S_test.

    X, y -> training features/target; T -> features to predict for.
    Returns (S_train, S_test), one column per base model.
    Uses self.n_splits and self.base_models.
    """
    X = np.array(X)
    y = np.array(y)
    T = np.array(T)
    # Same shuffled folds reused for every base model (fixed seed 2016).
    folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))
    S_train = np.zeros((X.shape[0], len(self.base_models)))
    S_test = np.zeros((T.shape[0], len(self.base_models)))
    for i, clf in enumerate(self.base_models):
        # Per-fold predictions on T, averaged after the fold loop.
        S_test_i = np.zeros((T.shape[0], self.n_splits))
        for j, (train_idx, test_idx) in enumerate(folds):
            X_train = X[train_idx]
            y_train = y[train_idx]
            X_holdout = X[test_idx]
            y_holdout = y[test_idx]
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_holdout)[:]
            print ("Model %d fold %d score %f" % (i, j, r2_score(y_holdout, y_pred)))
            # Each sample appears in exactly one test fold, so S_train is
            # filled with genuine out-of-fold predictions.
            S_train[test_idx, i] = y_pred
            S_test_i[:, j] = clf.predict(T)[:]
        S_test[:, i] = S_test_i.mean(axis=1)
        oof_score = r2_score(y, S_train[:, i])
        print 'Final Out-of-Fold Score %f'%oof_score
    return S_train, S_test
def metrics(y_test, clf_pred): print 'R^2 Score' print r2_score(y_test, clf_pred) print 'Mean Squared Error' print mean_squared_error(y_test, clf_pred) print 'Root Mean Squared Error' print np.sqrt(mean_squared_error(y_test, clf_pred))
def make_huber_train():
    """Fit the robust (Huber-style convex) regression on the 4-hour-filtered
    Landsat/Polaris data, plot predicted vs. measured SPM for the train and
    test splits, and save the figure.

    Relies on a module-level ``seed`` for the k-fold split -- confirm it is
    defined before this runs.
    """
    x, y = regression.get_data(
        filenames=['/Users/Nathan/Dropbox/SedimentLearning/data/landsat_polaris_filtered/filtered_4hr.csv'])
    alpha = 8  # regularisation weight handed to the convex solver
    model = mycvx.kfolds_convex(x, y, alpha, random_seed=seed)
    # Train/test targets and predictions as returned by the solver.
    y_test = model['data']['y_test']
    y_pred = model['data']['y_pred']
    y_train = model['data']['y_train']
    y_train_pred = model['data']['y_train_pred']
    r2 = np.round(r2_score(y_test, y_pred), 3)
    r2train = np.round(r2_score(y_train, y_train_pred), 3)
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(y_train_pred, y_train, '.b')  # training points (blue)
    ax.plot(y_pred, y_test, '.r')  # test points (red)
    # y = x reference line spanning 1.2x the largest test value
    ax.plot(np.arange(0, 1.2 * np.max(y_test), .1), np.arange(0, 1.2 * np.max(y_test), .1), '-k')
    fig.suptitle('Reconstruction Ability of Robust Regression Model')
    ax.set_xlabel('Remotely Sensed SPM (mg/L)')
    ax.set_ylabel('In situ measure SPM (mg/L)')
    # print (max(np.max(y_pred), np.max(y_test))- np.min(np.min(y_pred), 0))*5./6. - np.min(np.min(y_pred), 0)
    # Annotation placed ~5/6 along the x-range.  NOTE(review): it reports the
    # *training* R^2 (r2train); r2 (test) is computed but never shown.
    ax.text((max(np.max(y_pred), np.max(y_test)) - min(np.min(y_pred), 0)) * 5. / 6. - min(np.min(y_pred), 0),
            np.max(y_test) / 7., r'$R^2=%s$' % (r2train), fontsize=15)
    plt.savefig('../figures/huber_training')
    # plt.show()
    print 'r2: ', r2train
def linear_regression(features,target,test_size_percent=0.2,cv_split=5): ''' Features -> Pandas Dataframe with attributes as columns target -> Pandas Dataframe with target column for prediction Test_size_percent -> Percentage of data point to be used for testing''' X_array = features.as_matrix() y_array = target.as_matrix() ols = linear_model.LinearRegression() X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4) # model = ols.fit(X_train, y_train) ols.fit(X_train, y_train) # test_prediction_model = ols.predict(X_test) tscv = TimeSeriesSplit(cv_split) training_score = cross_val_score(ols,X_train,y_train,cv=tscv.n_splits) testing_score = cross_val_score(ols,X_test,y_test,cv=tscv.n_splits) print"Cross-val Training score:", training_score.mean() # print"Cross-val Testing score:", testing_score.mean() training_predictions = cross_val_predict(ols,X_train,y_train,cv=tscv.n_splits) testing_predictions = cross_val_predict(ols,X_test,y_test,cv=tscv.n_splits) training_accuracy = metrics.r2_score(y_train,training_predictions) # test_accuracy_model = metrics.r2_score(y_test,test_prediction_model) test_accuracy = metrics.r2_score(y_test,testing_predictions) # print"Cross-val predicted accuracy:", training_accuracy print"Test-predictions accuracy:",test_accuracy plot_model(target,y_train,y_test,training_predictions,testing_predictions) return ols
def r_square_score(target, fitted):
    """Weighted R^2 computed piecewise over fixed index segments.

    Each (start, stop, weight) segment contributes
    r2_score(target[start:stop], fitted[start:stop]) * weight / 80,
    and the five contributions are summed.
    """
    segments = ((0, 370, 25),
                (415, 450, 25),
                (450, 520, 12),
                (520, 550, 12),
                (550, 650, 6))
    total = 0
    for start, stop, weight in segments:
        total = total + r2_score(target[start:stop], fitted[start:stop]) * weight / 80
    return total
def r2_excoeff_vs_time_cutoff(times):
    """For each time cutoff in ``times``, fit the robust regression on the
    matching filtered extinction-coefficient CSV and record the training
    R^2 and the number of samples.

    Returns (r2s, num_data), arrays aligned with ``times``.
    Relies on a module-level ``seed`` -- confirm it is defined.
    """
    r2s = np.zeros_like(times, dtype='float64')
    num_data = np.zeros_like(times, dtype='int32')
    for index, time in enumerate(times):
        # get appropriate features
        x, y = regression.get_data(filenames=[
            '/Users/Nathan/Dropbox/SedimentLearning/data/landsat_polaris_filtered/filtered_excoeff_{}hr.csv'.format(
                time)])
        x = regression.Kau_MB_BR_features(x)
        # create the huber fit model
        alpha = 8
        model = mycvx.kfolds_convex(x, y, alpha, random_seed=seed)
        y_test = model['data']['y_test']
        y_pred = model['data']['y_pred']
        y_train = model['data']['y_train']
        y_train_pred = model['data']['y_train_pred']
        r2_test = np.round(r2_score(y_test, y_pred), 3)
        r2_train = np.round(r2_score(y_train, y_train_pred), 3)
        # NOTE(review): only the *training* R^2 is recorded; r2_test is
        # computed but unused -- confirm this is intentional.
        r2s[index] = r2_train
        num_data[index] = x.shape[0]
    print r2s, num_data
    return r2s, num_data
def fit_all_commodities(df, commodities_list, model_name):
    """
    INPUT: df (dataframe), \
           commodity_list (list of respective commodities \
           for which one whishes to build regressio models)
    OUTPUT: print result; write pickled models to file path
    PURPOSE: fit all models at once
    """
    # Model families routed to the sklearn helper vs. the statsmodels helper.
    sklearn_models = {
        'RandomForestRegressor',
        'ExtraTreesRegressor',
        'GradientBoostingRegressor'
    }
    sm_models = {"Linear Regression"}
    for commodity in commodities_list:
        if model_name in sklearn_models:
            model, X_train, X_test, y_train, y_test = \
                fit_model_sklearn(df, commodity, model_name)
            predict = model.predict(X_test)
            print "***********************"
            print "{}'s adjusted r^2 score with {} is:".format(
                commodity, model_name
            )
            print r2_score(y_test, predict)
        elif model_name in sm_models:
            model, results, X_train, X_test, y_train, y_test = \
                fit_model_sm(df, commodity, model_name)
        # pickle model:
        # NOTE(review): if model_name is in neither set, ``model`` is
        # undefined here (NameError on the first commodity) -- confirm
        # callers only pass known model names.
        joblib.dump(model, '{}_with_{}.pkl'.format(
            commodity, model_name)
        )
def main():
    """Train the trainable model on synthetic data drawn via the
    ground-truth model and print staged NLL / R^2 diagnostics for both the
    train and test sets.

    Uses module-level SEED, NUM_TRAIN, NUM_TEST, NUM_FEATURES and the
    get_trainable_model / get_groundtruth_model / get_data helpers.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    trainable_model = get_trainable_model()
    groundtruth_model = get_groundtruth_model()
    my_loss = loss.HeteroscedasticNormalLossFunction()
    train_X, train_y = get_data(NUM_TRAIN, NUM_FEATURES, groundtruth_model)
    test_X, test_y = get_data(NUM_TEST, NUM_FEATURES, groundtruth_model)
    trainable_model.fit(train_X, train_y)
    print 'train results'
    expected_mu, expected_std = groundtruth_model.predict(train_X)
    # Every 10th boosting stage is reported.  pred[:, 0] is scored against
    # the ground-truth mean and pred[:, 1] against its std -- assumed column
    # layout, confirm against the model implementation.
    for i, pred in enumerate(trainable_model.staged_predict(train_X)):
        if i % 10 != 0:
            continue
        print 'stage %d: NLL = %.3f, R2 on mu(X) = %.3f, R2 on std(X) = %.3f' \
            % (i, my_loss(train_y, pred), r2_score(expected_mu, pred[:, 0]),
               r2_score(expected_std, pred[:, 1]))
    print 'test results'
    expected_mu, expected_std = groundtruth_model.predict(test_X)
    for i, pred in enumerate(trainable_model.staged_predict(test_X)):
        if i % 10 != 0:
            continue
        print 'stage %d: NLL = %.3f, R2 on mu(X) = %.3f, R2 on std(X) = %.3f' \
            % (i, my_loss(test_y, pred), r2_score(expected_mu, pred[:, 0]),
               r2_score(expected_std, pred[:, 1]))
def create_model():
    """Train the robust regression on log-SPM from the 2-hour-filtered data
    and return the fitted parameter vector ``theta``.

    R^2 is reported in the original SPM space (exp undoes the log).
    """
    print('Training robust regression')
    x, y = regression.get_data(
        filenames=['/Users/Nathan/Dropbox/SedimentLearning/data/landsat_polaris_filtered/filtered_2hr.csv'],
        spm_cutoff=None)  # 2hr data
    # Get top 5 correlated band ratios and add to feature array
    x = regression.Kau_MB_BR_features(x)  # Shape of x is (75,11)
    # log spm regression
    logy = np.log(y)
    alpha = 8  # regularisation weight for the convex solver
    seed = 4  # local seed (other functions in this file use a module-level one)
    model = mycvx.kfolds_convex(x, logy, alpha, random_seed=seed)
    theta = model['theta']
    y_test = model['data']['y_test']
    y_pred = model['data']['y_pred']
    y_train = model['data']['y_train']
    y_train_pred = model['data']['y_train_pred']
    # Scores computed back in SPM space; r2_test is computed but not printed.
    r2_test = np.round(r2_score(np.exp(y_test), np.exp(y_pred)), 3)
    r2_train = np.round(r2_score(np.exp(y_train), np.exp(y_train_pred)), 3)
    print(
        'Done training robust regression. R2 of actual spm vs predicted spm on training set = {}. \n'.format(r2_train))
    return theta
def meanDecreaseAccuracyOnWeibo(): X, y, names = loadData() X = [dict(enumerate(sample)) for sample in X] vect = feature_extraction.DictVectorizer(sparse=False) X = vect.fit_transform(X) rf = RandomForestClassifier(n_estimators=500) scores = defaultdict(list) iter = 0 while(iter <= 100): iter += 1 X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3) rf = rf.fit(X_train, Y_train) acc = r2_score(Y_test, rf.predict(X_test)) for i in range(X.shape[1]): X_t = X_test.copy() np.random.shuffle(X_t[:, i]) shuff_acc = r2_score(Y_test, rf.predict(X_t)) scores[names[i]].append((acc-shuff_acc)/acc) print "Features sorted by their score:" result = sorted([(round(np.mean(score), 4), feat) for feat, score in scores.items()], reverse=True) importance = [] featurename = [] for score, name in result: importance.append(score) featurename.append(name) featureRanking(importance, featurename)
def hyperopt_obj(self, param, train_X, train_y):
    """Hyperopt objective: cross-validated MSE of an xgboost model under
    ``param``.

    Mutates ``param`` in place (casts int-valued entries), prints training
    and mean CV R^2, and returns {'loss': mean CV MSE, 'status': STATUS_OK}.
    """
    # 5-fold crossvalidation error
    # NOTE(review): comment says 5-fold but KFold(n_splits=3) is used below.
    #ret = xgb.cv(param,dtrain,num_boost_round=param['num_round'])
    kf = KFold(n_splits = 3)
    errors = []
    r2 = []
    # hyperopt samples these as floats; xgboost needs ints.
    int_params = ['max_depth','num_round']
    for item in int_params:
        param[item] = int(param[item])
    for train_ind,test_ind in kf.split(train_X):
        train_valid_x,train_valid_y = train_X[train_ind],train_y[train_ind]
        test_valid_x,test_valid_y = train_X[test_ind],train_y[test_ind]
        dtrain = xgb.DMatrix(train_valid_x,label = train_valid_y)
        dtest = xgb.DMatrix(test_valid_x)
        pred_model = xgb.train(param,dtrain,num_boost_round=int(param['num_round']))
        pred_test = pred_model.predict(dtest)
        errors.append(mean_squared_error(test_valid_y,pred_test))
        r2.append(r2_score(test_valid_y,pred_test))
    # Refit on the full training set purely to report the in-sample score.
    all_dtrain = xgb.DMatrix(train_X,label = train_y)
    print('training score:')
    pred_model = xgb.train(param,all_dtrain,num_boost_round= int(param['num_round']))
    all_dtest = xgb.DMatrix(train_X)
    pred_train = pred_model.predict(all_dtest)
    print(str(r2_score(train_y,pred_train)))
    print(np.mean(r2))
    print('\n')
    return {'loss':np.mean(errors),'status': STATUS_OK}
def multi_regression():
    '''Multivariate linear regression on the module-level housing frame
    ``df``: fit on a 70/30 split, report MSE and R^2 for both splits, and
    show a residual plot.'''
    from sklearn.cross_validation import train_test_split
    X = df.iloc[:, :-1].values
    y = df['MEDV'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    model = LinearRegression()
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    # Mean Squared Error; the train/test gap (19.958 vs 27.196) hints at
    # overfitting.
    print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test)))
    # R^2: equals 1 for a perfect fit (MSE = 0).
    print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, pred_train),
                                           r2_score(y_test, pred_test)))
    # Residuals vs. predictions for both splits.
    plt.scatter(pred_train, pred_train - y_train, c='blue', marker='o', label='Training data')
    plt.scatter(pred_test, pred_test - y_test, c='lightgreen', marker='s', label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
    plt.xlim([-10, 50])
    plt.show()
def housing_polynomial_regression():
    '''Single-feature (LSTAT) regression on the housing data with linear,
    quadratic and cubic fits, plotted together with their R^2 values.

    Uses the module-level DataFrame ``df``.'''
    X = df[['LSTAT']].values
    y = df['MEDV'].values
    regr = LinearRegression()
    # create polynomial features
    quadratic = PolynomialFeatures(degree=2)
    cubic = PolynomialFeatures(degree=3)
    X_quad = quadratic.fit_transform(X)
    X_cubic = cubic.fit_transform(X)
    # linear fit; X_fit is a dense grid for drawing smooth curves
    X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]
    regr = regr.fit(X, y)
    y_lin_fit = regr.predict(X_fit)
    linear_r2 = r2_score(y, regr.predict(X))
    # quadratic fit (the single LinearRegression object is refitted)
    regr = regr.fit(X_quad, y)
    y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
    quadratic_r2 = r2_score(y, regr.predict(X_quad))
    # cubic fit
    regr = regr.fit(X_cubic, y)
    y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
    cubic_r2 = r2_score(y, regr.predict(X_cubic))
    # plot results
    plt.scatter(X, y, label='training points', color='lightgray')
    plt.plot(X_fit, y_lin_fit, label='linear (d=1), $R^2=%.2f$' % linear_r2, color='blue', lw=2, linestyle=':')
    plt.plot(X_fit, y_quad_fit, label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2, color='red', lw=2, linestyle='-')
    plt.plot(X_fit, y_cubic_fit, label='cubic (d=3), $R^2=%.2f$' % cubic_r2, color='green', lw=2, linestyle='--')
    plt.xlabel('% lower status of the population [LSTAT]')
    plt.ylabel('Price in $1000\'s [MEDV]')
    plt.legend(loc='upper right')
    plt.show()
def get_cv_r2(labels, features, model):
    """r2scores = get_cv_r2(labels, features, model)
    Calculate cross-validated R2 score for a model.

    Inputs:
        labels   = Labels for the data set
        features = Features for the data set
        model    = the model
    Outputs:
        r2scores = (Nfolds, 2) array; column 0 holds the training-fold R2,
                   column 1 the held-out-fold R2.
    """
    Nfolds = 5
    kf = cross_validation.KFold(features.shape[0], n_folds=Nfolds,
                                shuffle=True, random_state=47)
    r2scores = np.zeros((Nfolds, 2), dtype=np.float64)
    for ik, (itrain, icross) in enumerate(kf):
        # Refit on this fold's training rows, score both partitions.
        model.fit(features[itrain, :], labels[itrain])
        r2scores[ik, 0] = metrics.r2_score(labels[itrain],
                                           model.predict(features[itrain, :]))
        r2scores[ik, 1] = metrics.r2_score(labels[icross],
                                           model.predict(features[icross, :]))
    return r2scores
def model_years(df, model, start, end, categoricals=None):
    '''
    Run model over years from start to end
    IN
        df: dataframe with features and label
        model: initialized sklearn model
        start: int, start year
        end: int, end year
        categoricals: optional categorical-column spec forwarded to BorderData
    OUT
        trained: dict mapping year -> (BorderData, fitted GridSearchCV)
    '''
    trained = {}
    for year in range(start, end + 1):
        # Train only on rows strictly before Jan 1 of the following year.
        dfin = df.copy()[df.date < dt.date(year + 1, 1, 1)]
        print "Training... ", year
        data = BorderData(dfin, categoricals=categoricals)
        # Empty param grid: GridSearchCV is used only for its CV refit here.
        params = {}
        grid = GridSearchCV(model, params, cv=data.cv_train)
        grid.fit(data.X_train, data.y_train)
        data.predict(grid)
        data.predict_ensemble()
        print "Baseline : ", r2_score(data.y_test, data.baseline)
        print 'Model : ', r2_score(data.y_test, data.yhat)
        print "Ensemble : ", r2_score(data.y_test, data.ensemble)
        trained[year] = (data, grid)
    return trained
def otherOutcomeModel(clean_data_path, X_train, X_cross, X_test, X_predict, X_eval, outcome, name='default'):
    #Fit logit models for now, might want to do others later
    """Add an intermediate-model score column ``name`` for ``outcome`` to all
    five feature frames and return them.

    Two models are fitted on standardized features: one on the training
    split (used for train/cross/test) and one on the prediction split (used
    for predict/eval).  Logistic regression is used when the target is
    strictly 0/1, ridge regression otherwise.
    """
    print "Running intermediate model on " + outcome
    Y_predict = getOutcome(clean_data_path, 'prediction train',outcome)
    Y_train = getOutcome(clean_data_path,'two year train',outcome)
    Y_cross = getOutcome(clean_data_path,'cross validation data',outcome)
    # Missing outcome values are treated as 0.
    Y_cross.fillna(0, inplace=True)
    Y_predict.fillna(0, inplace=True)
    Y_train.fillna(0,inplace=True)
    # Scaler fitted on the training split, applied to cross/test.
    standardize = preprocessing.StandardScaler()
    X_train_predict = standardize.fit_transform(X_train)
    X_cross_predict = standardize.transform(X_cross)
    X_test_predict = standardize.transform(X_test)
    # Separate scaler for the prediction/eval splits.
    standardize_predict = preprocessing.StandardScaler()
    X_predict_predict = standardize_predict.fit_transform(X_predict)
    X_eval_predict = standardize_predict.transform(X_eval)
    if ((Y_train == 1) | (Y_train == 0)).all():
        #Binary variable
        logit = LogisticRegression(penalty='l2',dual=False,tol=1,fit_intercept=True,
                                   C=.0004325, intercept_scaling=1, class_weight='auto',
                                   random_state=423)
        logit.fit(X_train_predict,Y_train)
        logit2 = LogisticRegression(penalty='l2',dual=False,tol=1,fit_intercept=True,
                                    C=.0004325, intercept_scaling=1, class_weight='auto',
                                    random_state=423)
        logit2.fit(X_predict_predict,Y_predict)
        inScore = roc_auc_score(Y_cross,logit.predict_proba(X_cross_predict)[:,1])
        print "Cross Logistic: Area under auc curve is %f" % (inScore)
        # New column = predicted probability of the positive class.
        X_train[name] = logit.predict_proba(X_train_predict)[:,1]
        X_cross[name] = logit.predict_proba(X_cross_predict)[:,1]
        X_test[name] = logit.predict_proba(X_test_predict)[:,1]
        X_predict[name] = logit2.predict_proba(X_predict_predict)[:,1]
        X_eval[name] = logit2.predict_proba(X_eval_predict)[:,1]
    else:
        #Continuous variable
        ridge = Ridge(alpha=.001)
        ridge.fit(X_train_predict,Y_train)
        ridge2 = Ridge(alpha=0.001)
        ridge2.fit(X_predict_predict, Y_predict)
        inScore = r2_score(Y_train,ridge.predict(X_train_predict))
        print "Train Ridge: r2 score is %f" % (inScore)
        inScore = r2_score(Y_cross,ridge.predict(X_cross_predict))
        print "Cross Ridge: r2 score is %f" % (inScore)
        X_train[name] = ridge.predict(X_train_predict)
        X_cross[name] = ridge.predict(X_cross_predict)
        X_test[name] = ridge.predict(X_test_predict)
        X_predict[name] = ridge2.predict(X_predict_predict)
        X_eval[name] = ridge2.predict(X_eval_predict)
    return X_train,X_cross,X_test,X_predict,X_eval
def fit_predict(self, X, y, T):
    """Two-stage out-of-fold stacking.

    Stage 1: for each base model, build out-of-fold predictions on X
    (S_train) and fold-averaged predictions on T (S_test).
    Stage 2: fit self.stacker on [X | S_train] with the same folds and
    return its fold-averaged predictions on [T | S_test].

    Uses self.n_splits, self.base_models and self.stacker.
    """
    X = np.array(X)
    y = np.array(y)
    T = np.array(T)
    # One shared shuffled fold assignment for both stages (seed 2016).
    folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))
    S_train = np.zeros((X.shape[0], len(self.base_models)))
    S_test = np.zeros((T.shape[0], len(self.base_models)))
    for i, clf in enumerate(self.base_models):
        S_test_i = np.zeros((T.shape[0], self.n_splits))
        for j, (train_idx, test_idx) in enumerate(folds):
            X_train = X[train_idx]
            y_train = y[train_idx]
            X_holdout = X[test_idx]
            y_holdout = y[test_idx]
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_holdout)[:]
            print ("Model %d fold %d score %f" % (i, j, r2_score(y_holdout, y_pred)))
            S_train[test_idx, i] = y_pred
            S_test_i[:, j] = clf.predict(T)[:]
        S_test[:, i] = S_test_i.mean(axis=1)
        oof_score = r2_score(y, S_train[:, i])
        print 'Final Out-of-Fold Score %f'%oof_score
    # results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
    # print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
    # exit()
    # Stage 2: single-column outputs for the stacker; ``i`` stays 0 so the
    # same print/score pattern can be reused.
    f_train = np.zeros((X.shape[0], 1))
    f_test = np.zeros((T.shape[0], 1))
    f_test_i = np.zeros((T.shape[0], self.n_splits))
    i = 0
    # Stacker sees the original features alongside the base-model columns.
    total_train = np.hstack((X, S_train))
    total_test = np.hstack((T, S_test))
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = total_train[train_idx]
        y_train = y[train_idx]
        X_holdout = total_train[test_idx]
        y_holdout = y[test_idx]
        self.stacker.fit(X_train, y_train)
        y_pred = self.stacker.predict(X_holdout)[:]
        print ("Model %d fold %d score %f" % (i, j, r2_score(y_holdout, y_pred)))
        f_train[test_idx, i] = y_pred
        f_test_i[:, j] = self.stacker.predict(total_test)[:]
    f_test[:, i] = f_test_i.mean(axis=1)
    oof_score = r2_score(y, f_train[:, i])
    print 'Final Out-of-Fold Score %f'%oof_score
    return f_test
def permutationImportance(X, y, rf):
    """Permutation feature importance for an already-fitted model ``rf``.

    For each column, shuffle that column of X and measure the relative drop
    in R^2 against the unshuffled baseline.  Returns one mean drop per
    feature as a numpy array.
    """
    baseline = r2_score(y, rf.predict(X))
    drops = []
    for col in range(X.shape[1]):
        shuffled = X.copy()
        # Break the column's relation to y while keeping its distribution.
        np.random.shuffle(shuffled[:, col])
        permuted = r2_score(y, rf.predict(shuffled))
        drops.append((baseline - permuted) / baseline)
    return np.array(drops)
def square_score_func1(x, y, a, c, d, weight_list):
    """R^2 of func1(x; a, c, d) against y, optionally sample-weighted.

    NOTE(review): whether weights are applied is controlled by the
    module-level flag ``weight_if`` (not by ``weight_list`` itself) --
    confirm that is intentional.
    """
    y_predict = [func1(x_value, a, c, d) for x_value in x]
    #print weight_if
    if weight_if == True:
        return r2_score(y, y_predict, sample_weight=weight_list)
    return r2_score(y, y_predict)
def score(self, x, y):
    """Model quality on (x, y): R^2 for the "mse" loss (averaged over
    outputs when multi-output), accuracy otherwise."""
    yhat = self.predict(x)
    if self.loss_func != "mse":
        return accuracy_score(y, yhat)
    if self.output_dim == 1:
        return r2_score(y, yhat[:, 0])
    per_output = [r2_score(y[:, k], yhat[:, k])
                  for k in range(self.output_dim)]
    return np.mean(per_output)
def test_losses():
    """Test loss functions"""
    # Fixed binary prediction fixture; the expected values below (13 errors,
    # the R2/EV constants) are tied to this fixture.
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))
    # Classification
    # --------------
    # Legacy positional form of catch_warnings(record=True); the deprecated
    # zero_one/zero_one_score APIs emit warnings that are captured here.
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)
    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)
    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)
    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))
    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))
    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)
    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)
    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)
    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    # Degenerate cases: constant truth matched exactly scores 1, mismatched 0.
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def bagofwords(X_train, X_cross, X_test, X_predict, X_eval, Y_train, Y_cross, Y_predict,variable = 'test', typeModel='binary',name='test'): X_train_text = X_train[variable] X_cross_text = X_cross[variable] X_test_text = X_test[variable] X_predict_text = X_predict[variable] X_eval_text = X_eval[variable] train_vec = getFeatures(X_train_text) X_train_text = train_vec.transform(X_train_text) X_cross_text = train_vec.transform(X_cross_text) X_test_text = train_vec.transform(X_test_text) predict_vec = getFeatures(X_predict_text) X_predict_text = predict_vec.transform(X_predict_text) X_eval_text = predict_vec.transform(X_eval_text) if typeModel == 'continuous': bowModel = Ridge(alpha = 0.001) bowModel2 = Ridge(alpha = 0.001) bowModel.fit(X_train_text,Y_train) bowModel2.fit(X_predict_text,Y_predict) inScore = r2_score(Y_train,bowModel.predict(X_train_text)) print "Train Ridge: r2 score is %f" % (inScore) inScore = r2_score(Y_cross,bowModel.predict(X_cross_text)) print "Cross Ridge: r2 score is %f" % (inScore) X_train[name] = bowModel.predict(X_train_text) X_cross[name] = bowModel.predict(X_cross_text) X_test[name] = bowModel.predict(X_test_text) X_predict[name] = bowModel2.predict(X_test_text) X_eval[name] = bowModel2.predict(X_eval_text) else: bowModel = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=0.0005, intercept_scaling=1, class_weight=None, random_state=423) bowModel2 = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=.0005, intercept_scaling=1, class_weight=None, random_state=423) bowModel.fit(X_train_text,Y_train) bowModel2.fit(X_predict_text,Y_predict) inScore = roc_auc_score(Y_train,bowModel.predict_proba(X_train_text)[:,1]) print "Train Logistic: Area under auc curve is %f" % (inScore) inScore = roc_auc_score(Y_cross,bowModel.predict_proba(X_cross_text)[:,1]) print "Cross Logistic: Area under auc curve is %f" % (inScore) X_train[name] = bowModel.predict_proba(X_train_text)[:,1] X_cross[name] = 
bowModel.predict_proba(X_cross_text)[:,1] X_test[name] = bowModel.predict_proba(X_test_text)[:,1] X_predict[name] = bowModel2.predict_proba(X_predict_text)[:,1] X_eval[name] = bowModel2.predict_proba(X_eval_text)[:,1] return X_train, X_cross, X_test, X_predict, X_eval
def calc_new_model(hf, pos): ranks = hf[hf['Points'] > 0][hf['Pos'] == pos]['Avg Rank'] scores = hf[hf['Points'] > 0][hf['Pos'] == pos]['Points'] if pos in PROJECTION_TYPE[FP_QB] or pos in PROJECTION_TYPE[FP_DST]: crazy_fit = np.poly1d(np.polyfit(ranks, scores, 5)) print "r2 score is %f" % (r2_score(scores, map(crazy_fit, ranks))) return crazy_fit elif pos in PROJECTION_TYPE[FP_FLEX]: crazy_fit = np.poly1d(np.polyfit(ranks, scores, 5)) print "r2 score is %f" % (r2_score(scores, map(crazy_fit, ranks))) return crazy_fit
def run_methods(train_points, train_targets, test_points, test_targets,
                model_parameters, m_list, file_name, title, show=False, full=True, vi=True):
    """Compare sparse-GP variants over inducing-point counts in ``m_list``.

    For each m: pick inducing inputs by k-means, fit the 'means' method, and
    optionally the 'vi' method and the full ('brute') GP, recording test R^2
    for each.  Results are plotted and saved to ../Plots/inducing_inputs/.

    NOTE(review): points appear to be stored column-wise (the code transposes
    for KMeans and uses cluster_centers_.T) -- confirm the data layout.
    """
    method = 'means'
    optimizer = 'L-BFGS-B'
    max_iter = 50
    options = {'maxiter': max_iter, 'disp': False, 'mydisp': True}
    means_r2 = []
    vi_r2 = []
    for m in m_list:
        print('m:', m)
        print('Finding means...')
        # Inducing inputs = k-means centers of the training points.
        means = KMeans(n_clusters=m, n_init=1, max_iter=20)
        means.fit(train_points.T)
        inputs = means.cluster_centers_.T
        print('...found')
        # Fresh covariance object per fit so hyperparameters do not leak
        # between methods.
        model_covariance_obj = SquaredExponential(np.copy(model_parameters))
        new_gp = GPR(model_covariance_obj, method='means', optimizer=optimizer)
        res = new_gp.fit(train_points, train_targets, num_inputs=m,
                         optimizer_options=options, inputs=inputs)
        predicted_y_test, _, _ = new_gp.predict(test_points)
        means_r2.append(r2_score(test_targets, predicted_y_test))
        if vi:
            model_covariance_obj = SquaredExponential(np.copy(model_parameters))
            new_gp = GPR(model_covariance_obj, method='vi', optimizer=optimizer)
            res = new_gp.fit(train_points, train_targets, num_inputs=m,
                             optimizer_options=options, inputs=inputs)
            predicted_y_test, _, _ = new_gp.predict(test_points)
            vi_r2.append(r2_score(test_targets, predicted_y_test))
    if full:
        # Full GP baseline, independent of m.
        model_covariance_obj = SquaredExponential(np.copy(model_parameters))
        new_gp = GPR(model_covariance_obj, method='brute')
        res = new_gp.fit(train_points, train_targets, max_iter=max_iter)
        predicted_y_test, _, _ = new_gp.predict(test_points, train_points, train_targets)
        brute_r2 = r2_score(test_targets, predicted_y_test)
    plt.plot(range(len(m_list)), means_r2, '-kx', label='vi-means')
    if vi:
        plt.plot(range(len(m_list)), vi_r2, '-rx', label='vi')
    if full:
        # Constant line: the full-GP score does not depend on m.
        plt.plot(range(len(m_list)), len(m_list) * [brute_r2], '--g', label='full GP')
    plt.xticks(range(len(m_list)), m_list)
    plt.xlabel('m')
    plt.ylabel('$R^2$-score on test data')
    # plt.ylim(0.5, 1)
    plt.legend(loc=4)
    plt.title(title)
    plt.savefig('../Plots/inducing_inputs/'+file_name + '.pgf')
    if show:
        plt.show()
def estimator_metrics(true_values, estimates):
    """Print MSE, median absolute error and R^2 for a set of estimates.

    Args:
        true_values: ground-truth target values.
        estimates: predicted values, aligned with true_values.

    Returns:
        None — this is a reporting helper only.
    """
    # BUG FIX: the original used Python-2-only `print x` statements, which are
    # a SyntaxError under Python 3. The parenthesized single-argument form
    # prints identically on Python 2 and is valid Python 3.
    print("---------------------------------------")
    print("MSE: ")
    print(mean_squared_error(true_values, estimates))
    print("MAE: ")
    print(median_absolute_error(true_values, estimates))
    print("R-squared: ")
    print(r2_score(true_values, estimates))
    print("---------------------------------------")
    return
# Select the feature column (x) and the target column(s) (y) from `veriler`.
x = veriler.iloc[:, 1:2]
y = veriler.iloc[:, 2:]
X = x.values  # feature matrix as ndarray
Y = y.values  # target as ndarray

#linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, Y)
# Scatter of the data with the fitted line on top.
plt.scatter(X, Y, color='red')
plt.plot(x, lin_reg.predict(X), color='blue')
plt.show()
# "degeri" = "value" (Turkish); string kept verbatim as it is runtime output.
print("Linear R2 degeri:")
print(r2_score(Y, lin_reg.predict(X)))

#polynomial regression
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial basis expansion of X.
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(X)
print(x_poly)
lin_reg2 = LinearRegression()
lin_reg2.fit(x_poly, y)
plt.scatter(X, Y, color='red')
plt.plot(X, lin_reg2.predict(poly_reg.fit_transform(X)), color='blue')
plt.show()

# Degree-4 expansion (re-import is redundant but harmless).
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(X)
print(x_poly)
    # NOTE(review): this loop and the `return` below are the tail of a
    # function defined above this chunk (its `def` line is not visible here).
    for (i,train_day,test_day) in [(i, dp.split(train,nsplits=7)[i], dp.split(test,nsplits=7)[i]) for i in dp.split(train,nsplits=7)]: # for each day
        test_day_pred=ets_v(train_day,test_day,hor=1,batch=batch,freq=freq) # predict for all hours of the respective day
        test_pred.iloc[i::7]=test_day_pred # fill corresponding rows with out of sample predictions
    return test_pred

np.random.seed(0) # fix seed for reprodicibility
path='C:/Users/SABA/Google Drive/mtsg/data/household_power_consumption.csv' # data path
load_raw=dp.load(path) # load data
load_raw=dp.cut(load_raw) # remove leading & trailing Nans
# Daily means; any NaN in a day propagates so incomplete days become NaN.
targets=load_raw.apply(axis=1,func=(lambda x: np.nan if (x.isnull().sum()>0) else x.mean())).unstack() # custom sum function where any Nan in arguments gives Nan as result
targets.fillna(method='bfill',inplace=True)
train,test=dp.split_train_test(data=targets, test_size=0.25, base=7)

# vertical
test_pred=ets_v(train,test,batch=7,freq=7)
# NOTE(review): these r2_score results are discarded (not printed/stored).
r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average')
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_v.csv')
# vertical week
test_pred=ets_vw(train,test,batch=7,freq=52)
r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average')
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_vw.csv')
# horizontal
test_pred=ets(train,test,hor=24,batch=7,freq=24)
r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average')
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_h.csv')
# horizontal week
test_pred=ets_hw(train,test,batch=7,freq=52)
r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average')
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_hw.csv')
def r2d2(y_train, train_yhat):
    """Thin wrapper around sklearn's r2_score for training-set predictions."""
    # Delegate directly; no intermediate local needed.
    return r2_score(y_train, train_yhat)
df_ml = df_ml.fillna(0) X = df_ml.drop(['like'], axis = 1).values Y = df_ml['like'].values X = StandardScaler().fit_transform(X) X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.30, random_state = 101) randomforest = RandomForestRegressor(n_estimators=500,min_samples_split=10) randomforest.fit(X_Train,Y_Train) p_train = randomforest.predict(X_Train) p_test = randomforest.predict(X_Test) train_acc = r2_score(Y_Train, p_train) test_acc = r2_score(Y_Test, p_test) app.layout = html.Div([html.H1("Facebook Data Analysis", style={"textAlign": "center"}), dcc.Markdown(''' Welcome to my Plotly (Dash) Data Science interactive dashboard. In order to create this dashboard have been used two different datasets. The first one is the [Huge Stock Market Dataset by Boris Marjanovic](https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs) and the second one is the [Facebook metrics Data Set by Moro, S., Rita, P., & Vala, B](https://archive.ics.uci.edu/ml/datasets/Facebook+metrics). This dashboard is divided in 3 main tabs. In the first one you can choose whith which other companies to compare Facebook Stock Prices to anaylise main trends. Using the second tab, you can analyse the distributions each of the Facebook Metrics Data Set features. Particular interest is on how paying to advertise posts can boost posts visibility. Finally, in the third tab a Machine Learning analysis of the considered datasets is proposed. All the data displayed in this dashboard is fetched, processed and updated using Python (eg. ML models are trained in real time!). ''') , dcc.Tabs(id="tabs", children=[ dcc.Tab(label='Stock Prices', children=[ html.Div([html.H1("Dataset Introduction", style={'textAlign': 'center'}), dash_table.DataTable( id='table', columns=[{"name": i, "id": i} for i in df.columns], data=df.iloc[0:5,:].to_dict("rows"),
def reg_s_lightGBM(merge_data, outnameimp, outname):
    """Train a LightGBM regressor on merge_data ("target" column as y),
    report RMSE / R^2, save predictions and feature importances to CSV, and
    repeat the evaluation after rounding predictions to the classes {0, 1, 2}.

    Args:
        merge_data: DataFrame with a "target" column plus feature columns.
        outnameimp: basename for the feature-importance CSV.
        outname: basename for the prediction/metrics CSV.
    """
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import mean_squared_error  # model evaluation (MSE)
    from sklearn.metrics import r2_score  # model evaluation (R^2)

    # Separate the target variable from the features.
    X = merge_data.drop("target", axis=1).values
    y = merge_data["target"].values
    columns_name = merge_data.drop("target", axis=1).columns

    # Single stratified 80/20 split, reused for both splits below.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

    def data_split(X, y):
        # n_splits=1, so this loop runs exactly once.
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
        X_train = pd.DataFrame(X_train, columns=columns_name)
        X_test = pd.DataFrame(X_test, columns=columns_name)
        return X_train, y_train, X_test, y_test

    # train / test split, then carve a validation set out of train.
    X_train, y_train, X_test, y_test = data_split(X, y)
    X_train, y_train, X_val, y_val = data_split(X_train.values, y_train)

    # Shape checks.
    print("train shape", X_train.shape)
    print("test shape", X_test.shape)
    print("validation shape", X_val.shape)
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)
    y_test_df = pd.DataFrame(y_test)
    print("y_test describe", y_test_df.describe())
    print("not_ y_test describe", (~y_test_df.duplicated()).sum())
    print("y_test_df.duplicated().sum()", y_test_df.duplicated().sum())

    # Target distribution of each split.
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 3, 1)
    plt.hist(y_train)
    plt.subplot(1, 3, 2)
    plt.hist(y_test)
    plt.subplot(1, 3, 3)
    plt.hist(y_val)

    import lightgbm as lgb
    # Build the LightGBM datasets.
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_val, label=y_val)
    params = {'task': 'train',            # training (as opposed to predict)
              'boosting_type': 'gbdt',    # gradient boosting
              'objective': 'regression',  # regression objective
              'metric': 'rmse',           # evaluation metric
              'learning_rate': 0.1}       # learning rate (default 0.1)
    # Train with early stopping on the validation set.
    model = lgb.train(params, train, valid_sets=valid, num_boost_round=5000, early_stopping_rounds=500)

    # Predict with the best iteration found by early stopping.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # True vs predicted values.
    df_pred = pd.DataFrame({'regression_y_test': y_test, 'regression_y_pred': y_pred})
    display(df_pred)

    # Scatter plot (truth vs prediction); y=x is the perfect-fit line.
    plt.plot(y_test, y_test, color='red', label='x=y')
    plt.scatter(y_test, y_pred)
    plt.xlabel('y')
    plt.ylabel('y_test')
    plt.title('y vs y_pred')

    # Metrics on the raw predictions.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE :', rmse)
    r2 = r2_score(y_test, y_pred)
    print('R2 :', r2)
    df_Df = pd.DataFrame({'regression_y_test': y_test, 'regression_y_pred': y_pred, 'RMSE': rmse, 'R2': r2})
    df_Df.to_csv(r"" + "./output/" + 'DPC_g/' + outname + '.csv', encoding='shift-jis')

    # Feature importances.
    importance = pd.DataFrame(model.feature_importance(), columns=['importance'])
    display(importance)
    C_you = merge_data.drop(["target"], axis=1)
    importance["columns"] = list(C_you.columns)
    importance.to_csv(r"" + "./output/" + 'DPC_g/' + outnameimp + '.csv', encoding='shift-jis')

    # Re-evaluate after rounding to the integer classes {0, 1, 2}.
    # BUG FIX: the original did `y_pred2 = y_pred` / `y_test2 = y_test` and
    # then mutated the arrays in place, silently clobbering y_pred / y_test.
    y_pred2 = np.asarray(y_pred).copy()
    y_test2 = np.asarray(y_test).copy()
    for i in range(len(y_pred2)):
        if y_pred2[i] >= 1.51:
            y_pred2[i] = 2
        elif y_pred2[i] >= 0.51:
            y_pred2[i] = 1
        else:
            y_pred2[i] = 0
    print(y_pred2)
    for i in range(len(y_test2)):
        if y_test2[i] >= 1.51:
            y_test2[i] = 2
        elif y_test2[i] >= 0.51:
            y_test2[i] = 1
        else:
            y_test2[i] = 0
    print(y_test2)

    # Metrics on the discretised values.
    df_pred2 = pd.DataFrame({'regression_y_test2': y_test2, 'regression_y_pred2': y_pred2})
    display(df_pred2)
    mse = mean_squared_error(y_test2, y_pred2)
    rmse = np.sqrt(mse)
    print('RMSE :', rmse)
    r2 = r2_score(y_test2, y_pred2)
    print('R2 :', r2)
    df_Df = pd.DataFrame({'regression_y_test2': y_test2, 'regression_y_pred2': y_pred2, 'RMSE': rmse, 'R2': r2})
    df_Df.to_csv(r"" + "./output/" + 'DPC_g/' + "int" + outname + '.csv', encoding='shift-jis')
model.add(Embedding(vocab_size+1, vector_length,weights=[embedding_matrix], input_length=embedding_vecor_length,trainable=False)) # input length is the length of words in review model.add(Conv1D(nb_filter=50,filter_length=5,border_mode="valid",activation="relu",subsample_length=1)) model.add(MaxPooling1D(pool_length=1)) model.add(Flatten()) model.add(Dense(100,activation='relu')) model.add(Dense(10,activation='relu')) model.add(Dense(1)) # compile the model #loss ke liye mse(mean squared error) aur metrics me bhi # summarize the model model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse']) print(model.summary()) history1 = model.fit(padded_docs[train], labels[train],validation_data=(padded_docs[test], labels[test]),epochs=number_of_epoch,batch_size=100,verbose=verbose_value,callbacks=[history]) prediction1 = model.predict(padded_docs[test]) print(r2_score(labels[test],prediction1)) a=history1.history['mean_squared_error'] va=history1.history['val_mean_squared_error'] l = history1.history['loss'] vl=history1.history['val_loss'] #for i in range(0,number_of_epoch): # accuracy_single_list=accuracy_single_list.append(a[i]) # loss_single_list=loss_single_list.append(l[i]) # val_accuracy_single_list=val_accuracy_single_list.append(va[i]) # val_loss_single_list=val_loss_single_list.append(vl[i]) accuracy.append(a) loss.append(l) val_accuracy.append(va) val_loss.append(vl) #plot_model(model, to_file='model_plot.png',show_shapes=True, show_layer_names=True) #predict =np.asarray( model.predict(X_test))
def plot_results(trainData, testData, target_term, show):
    """Scatter-plot predicted vs target values for train and validation sets,
    annotate both with MAE and r^2, and save the figure as a PNG.

    Args:
        trainData/testData: DataFrames with 'Preds' and 'Target' columns.
        target_term: property name used for the title and file name.
        show: keep the figure open unless this equals 0.

    Returns:
        (trainData, testData) unchanged.
    """
    # Sanitise the term so it is safe as a file name.
    target_term = target_term.replace(" ", "_").replace("/", "")
    preds = trainData["Preds"]
    target = trainData["Target"]
    print("Datapoints in the training set =", len(preds))
    #plt.show()
    plt.rcParams["figure.figsize"] = (10,8)
    plt.scatter(target,preds)
    y_train = target.to_numpy()
    y_pred = preds.to_numpy()
    plt.xlim()  # NOTE(review): no-op call (no arguments)
    # Square axis limits covering both真 values and predictions, with margin.
    limits = [(min(np.min(y_train), np.min(y_pred))) - 0.2, 0.2 + max(0, (np.max(y_train)), (np.max(y_pred)))]
    plt.xlim(limits)
    plt.ylim(limits)
    infotext = "MAE = {:.3f}\n".format(mean_absolute_error(y_train, y_pred)) + r"$r^2$ = {:.3f}".format(r2_score(y_train, y_pred))
    plt.text(limits[0], limits[1], infotext, bbox={"facecolor": "lightblue", "pad": 5})
    # for test
    preds = testData["Preds"]
    target = testData["Target"]
    print("Datapoints in the validation set =", len(preds))
    plt.rcParams["figure.figsize"] = (10,8)
    # Overlaid on the same axes as the training scatter.
    plt.scatter(target,preds)
    y_test = target.to_numpy()
    y_pred = preds.to_numpy()
    plt.suptitle(target_term, fontsize=30)
    plt.xlabel("%s_DFT"%target_term, fontsize=18)
    plt.ylabel("%s_GNN"%target_term, fontsize=18)
    infotext2 = "MAE = {:.3f}\n".format(mean_absolute_error(y_test, y_pred)) + r"$r^2$ = {:.3f}".format(r2_score(y_test, y_pred))
    #plt.text(-6, -6, infotext2)
    # Place the validation annotation below the training one.
    plt.text(limits[0], 0.8*limits[1], infotext2, bbox={"facecolor": "orange", "pad": 5})
    plt.savefig('../plots/python/%s.png'%target_term)
    if (0 == show):
        plt.close()
    #plt.show()
    #plt.close()
    return(trainData, testData)
# Report the fitted logistic-growth parameters.
print(" beta_1 = %f, beta_2 = %f" % (popt[0], popt[1]))

# Plot the fitted curve over the normalised year axis.
x = np.linspace(1960, 2015, 55)
x = x/max(x)
plt.figure(figsize=(8,5))
y = sigmoid(x, *popt)
plt.plot(xdata, ydata, 'ro', label='data')
plt.plot(x,y, linewidth=3.0, label='fit')
plt.legend(loc='best')
plt.ylabel('GDP')
plt.xlabel('Year')
plt.show()

# split data into train/test (~80/20 random mask)
msk = np.random.rand(len(df)) < 0.8
train_x = xdata[msk]
test_x = xdata[~msk]
train_y = ydata[msk]
test_y = ydata[~msk]

# build the model using train set
popt, pcov = curve_fit(sigmoid, train_x, train_y)

# predict using test set
y_hat = sigmoid(test_x, *popt)

# evaluation
print("Mean absolute error: %.2f" % np.mean(np.absolute(y_hat - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((y_hat - test_y) ** 2))
# BUG FIX: r2_score expects (y_true, y_pred); the arguments were swapped,
# which changes the result because R^2 is not symmetric in its arguments.
print("R2-score: %.2f" % r2_score(test_y, y_hat))
from sklearn import linear_model

# Fit a simple linear regression: CO2 emissions ~ engine size.
regr = linear_model.LinearRegression()
train_x = np.asanyarray(train[['ENGINESIZE']])
train_y = np.asanyarray(train[['CO2EMISSIONS']])
regr.fit(train_x, train_y)
# The coefficients
print('Coefficients: ', regr.coef_)
print('Intercept: ', regr.intercept_)

# In[28]:

# Training data with the fitted regression line.
plt.scatter(train.ENGINESIZE, train.CO2EMISSIONS, color='blue')
plt.plot(train_x, regr.coef_[0][0] * train_x + regr.intercept_[0], '-r')
plt.xlabel("Engine size")
plt.ylabel("Emission")
plt.show()

# In[29]:

from sklearn.metrics import r2_score

# Evaluate on the held-out test set.
test_x = np.asanyarray(test[['ENGINESIZE']])
test_y = np.asanyarray(test[['CO2EMISSIONS']])
test_y_ = regr.predict(test_x)

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y)**2))
# BUG FIX: r2_score expects (y_true, y_pred); the original passed the
# prediction first, which gives a different (wrong) score since R^2 is
# not symmetric.
print("R2-score: %.2f" % r2_score(test_y, test_y_))

# In[ ]:
data.drop_duplicates(inplace=True)
# Label-encode every object (string) column in place.
# NOTE(review): variable named `onehot` but this is a LabelEncoder.
onehot = LabelEncoder()
for i in data.columns:
    if data[i].dtype == 'object':
        data[i] = onehot.fit_transform(data[i])
# Last column is the target; the rest are features.
x = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.6,random_state=1)

# Ordinary least squares baseline.
clf1 = LinearRegression()
clf1.fit(X_train,y_train)
'''R2方法是将预测值跟只使用均值的情况下相比,看能好多少。其区间通常在(0,1)之间。
0表示还不如什么都不预测,直接取均值的情况,而1表示所有预测跟真实结果完美匹配的情况。
与均值相比的优秀程度,介于[0~1]。0表示不如均值。1表示完美预测
'''
# (Above, in Chinese: R^2 measures improvement over predicting the mean;
# 0 means no better than the mean, 1 means a perfect fit.)
print('均方误差',metrics.mean_squared_error(y_test,clf1.predict(X_test)))
# R^2 is rounded to 2 decimals and reported as a percentage.
print('R^2 ',int(round(metrics.r2_score(y_test,clf1.predict(X_test)),2)*100),'%')

#岭回归  (ridge regression)
ridge = linear_model.Ridge(alpha=10)
ridge.fit(X_train,y_train)
print('均方误差',metrics.mean_squared_error(y_test,ridge.predict(X_test)))
print('R^2 ',int(round(metrics.r2_score(y_test,ridge.predict(X_test)),2)*100),'%')

#LASSO回归  (lasso regression)
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print('均方误差',metrics.mean_squared_error(y_test,lasso.predict(X_test)))
print('R^2 ',int(round(metrics.r2_score(y_test,lasso.predict(X_test)),2)*100),'%')

# 网格搜索  (grid search over the ridge alpha)
print('-------------岭回归--------------------')
gs1 = GridSearchCV(ridge,param_grid={'alpha':[0.01,0.1,1,10]},scoring='r2')
gs1.fit(X_train,y_train)
print(gs1.best_score_)
# train model with your data model.fit(feature_train, target_train) # Score your model score = model.score(feature_train, target_train) print("Score:\n", score) # predict data using your model target_prediction = model.predict(feature_test) print("Prediction:\n", target_prediction) plt.plot(len(feature_train), target_prediction, 'rx') # calculate the amount of error in your prediction MSE = mean_squared_error(target_test, target_prediction) R2 = r2_score(target_test, target_prediction) print("MSE:\n", MSE) print("R2:\n", R2) # print statistics for your model intercept = model.intercept_ coeff = model.coef_ print("Intercept:\n", intercept) print("Coeff:\n", coeff) # plot the graph based on the intercept and coefficient points = [intercept + coeff[0] * eachitem[0] for eachitem in feature_train] print(points) plt.plot(points, 'r--') # Save model
include_estimators=[ "gaussian_process", ], exclude_estimators=None, resampling_strategy_arguments={'folds': 5}) start = time.time() #X_train = X_train.astype('float') # when? automl.fit(X_train, y_train, dataset_name='boston_housing') #change dataset name accordingly automl.refit(X_train.copy(), y_train.copy()) print( '[INFO] Elapsed time finding best model: {} seconds.'.format(time.time() - start)) predictions = automl.predict(X_test) #print('--- CLASSIFICATION REPORT: ---') #not for regression #print(classification_report(y_test, predictions, digits=5)) print('\n\n--- MODELS: ---') print(automl.show_models()) print('\n\n--- STATISTICS: ---') print(automl.sprint_statistics()) #-----CLASSIFIER----- #print('\n\n--- SCORE: ---') #print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions)) #-----REGRESSION----- print('\n\n--- SCORE: ---') print("R2 score", r2_score(y_test, predictions))
validation_split=0.2, verbose=1) #4. 평가예측 loss, mae = model.evaluate(x_test, y_test, batch_size=10) print('loss, mae : ', loss, mae) y_predict=model.predict(x_test) #RMSE from sklearn.metrics import mean_squared_error def RMSE(y_test, y_predict) : return np.sqrt(mean_squared_error(y_test, y_predict)) print("RMSE : ", RMSE(y_test, y_predict)) #R2 from sklearn.metrics import r2_score r2 = r2_score(y_test, y_predict) print('R2 :', r2) ''' 파라미터는 그대로 이다. dropout은 layer가 아니므로 적용안한다 실질적으로 train 할 때만 dropout적용 test시에는 그대로 레이어 다쓴다. Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 10)] 0 _________________________________________________________________ dense (Dense) (None, 15) 165 _________________________________________________________________ dropout (Dropout) (None, 15) 0
# 3. Train
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
model.fit(x_train, y_train, epochs=100, batch_size=1, validation_split=0.25, verbose=1)

# 4. Evaluate / predict
loss, mse = model.evaluate(x_test, y_test, batch_size=1)
print(f"loss : {loss}, mse : {mse}")

y_pred = model.predict(x_test)
print(f"y_predict : {y_pred}")

# RMSE
from sklearn.metrics import mean_squared_error

def RMSE(y_true, y_predict):
    """Root mean squared error of y_predict against y_true.

    BUG FIX: the original body ignored its parameters and closed over the
    globals y_test / y_pred, so RMSE(a, b) silently scored the wrong arrays.
    """
    return np.sqrt(mean_squared_error(y_true, y_predict))

print(f"RMSE : {RMSE(y_test, y_pred)}")

# R^2 (coefficient of determination)
from sklearn.metrics import r2_score
r2_y_pred = r2_score(y_test, y_pred)
print(f"R2: {r2_y_pred}")
def calc_ToP_result(alg_hyp_set_combo, scaler_class, dependent_variable, first_train_idx, test_idx, random_state=None):
    """Train-on-past evaluation: fit on datasets [first_train_idx, test_idx)
    and return the R^2 score on dataset `test_idx`.

    Args:
        alg_hyp_set_combo: sklearn BaseEstimator; cloned before fitting.
        scaler_class: StandardScaler, RobustScaler or MinMaxScaler (the class
            itself, not an instance).
        dependent_variable: 'Log(Rmax)' or 'Log(Efficiency)'.
        first_train_idx: first training dataset index, 1..17.
        test_idx: test dataset index, 2..18; must exceed first_train_idx.
        random_state: optional seed for random / numpy / tensorflow RNGs.

    Returns:
        float: r2_score of the model's predictions on the test dataset.

    Raises:
        TypeError / ValueError: on invalid argument types or ranges.
    """
    if not isinstance(alg_hyp_set_combo, BaseEstimator):
        # BUG FIX: this message and the scaler_class one below were missing
        # the f-prefix, so the {placeholder} text was printed literally.
        raise TypeError(
            f"alg_hyp_set_combo must be an instance of BaseEstimator, but was a {type(alg_hyp_set_combo).__name__}"
        )
    if scaler_class not in [StandardScaler, RobustScaler, MinMaxScaler]:
        raise ValueError(
            f"scaler_class must be either the StandardScaler, RobustScaler, or MinMaxScaler classes, but was {scaler_class}"
        )
    if not isinstance(dependent_variable, str):
        raise TypeError(
            f"dependent_variable must be a str, but was a {type(dependent_variable).__name__}"
        )
    elif dependent_variable not in ["Log(Rmax)", "Log(Efficiency)"]:
        raise ValueError(
            f"dependent_variable must be either 'Log(Rmax)' or 'Log(Efficiency)', but was '{dependent_variable}'"
        )
    if not isinstance(first_train_idx, int):
        raise TypeError(
            f"first_train_idx must be an int, but was a {type(first_train_idx).__name__}"
        )
    elif first_train_idx not in range(1, 18):
        raise ValueError(
            f"first_train_idx must be between 1 and 17 inclusive, but was {first_train_idx}"
        )
    if not isinstance(test_idx, int):
        raise TypeError(
            f"test_idx must be an int, but was a {type(test_idx).__name__}")
    elif test_idx not in range(2, 19):
        raise ValueError(
            f"test_idx must be between 2 and 18 inclusive, but was {test_idx}")
    if test_idx <= first_train_idx:
        raise ValueError(
            f"test_idx must be greater than first_train_idx, but test_idx was {test_idx} and first_train_idx was {first_train_idx}"
        )
    if random_state is not None:
        # Seed all three RNG sources for reproducibility.
        random.seed(random_state)
        np.random.seed(random_state)
        tensorflow.random.set_seed(random_state)
    # Build cleaned train/test sets for the requested target; duplicates of
    # training rows are dropped from the test set either way.
    if dependent_variable == "Log(Rmax)":
        clean_train, raw_test = get_clean_datasets_rmax(
            all_datasets, test_idx, range(first_train_idx, test_idx))
        clean_test = no_dupes(clean_train, raw_test)
    else:
        clean_train, raw_test = get_clean_datasets_efficiency(
            all_datasets, test_idx, range(first_train_idx, test_idx))
        clean_test = no_dupes(clean_train, raw_test)
    # NOTE(review): this fixed seed overrides any random_state applied above;
    # preserved from the original — confirm it is intentional.
    random.seed(10)
    train_x, train_y, test_x, test_y = normalize_and_split(
        clean_train, clean_test, normalizer=scaler_class)
    model = clone(
        alg_hyp_set_combo)  #clone() in case the model was already trained
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    return r2_score(test_y, pred_y)
# Stack of fully-connected layers with ReLU activations and dropout 0.5.
hLayer = Dense(hLayer.relu(), 128, initialize, 'w1')
hLayer = hLayer.dropout(0.5)
hLayer = Dense(hLayer.relu(), 128, initialize, 'w2')
hLayer = hLayer.dropout(0.5)
hLayer = Dense(hLayer.relu(), 256, initialize, 'w3')
hLayer = hLayer.dropout(0.5)
# Output layer; final ReLU constrains predictions to be non-negative.
hLayer = Dense(hLayer.relu(), output_dim, initialize, 'w_out')
hLayer = hLayer.relu()
hypothesis = hLayer.Y
# Mean-squared-error loss, plain SGD.
cost = tf.reduce_mean(tf.square(hypothesis - Y))
train = tf.train.GradientDescentOptimizer(learning_rate=1e-5).minimize(cost)

# Launch the graph in Session (TF1-style static graph execution)
from sklearn.metrics import r2_score, mean_squared_error
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(5001):
        cost_val, _ = sess.run([cost, train], feed_dict={ X: x_train, Y: y_train })
        if step % 20 == 0:
            print('>>', step, 'cost:', cost_val)
    # Evaluate on the test split.
    h = sess.run(hypothesis, feed_dict={X: x_test})
    r2Score = r2_score(y_test, h)  # higher is better
    rmseScore = np.sqrt(mean_squared_error(y_test, h))  # lower is better
    print('R2:', r2Score, 'RMSE:', rmseScore)
def r2_score(y_true, y_pred):
    """Per-output R^2 (coefficient of determination), one value per target.

    BUG FIX: the original body called `r2_score` — but this def shadows the
    sklearn import of the same name, so every call recursed until
    RecursionError. Call the library implementation explicitly instead.
    """
    import sklearn.metrics
    return sklearn.metrics.r2_score(y_true, y_pred, multioutput="raw_values")
# CV による成分数の最適化 components = [] # 空の list の変数を作成して、成分数をこの変数に追加していきます同じく成分数をこの変数に追加 r2_in_cv_all = [] # 空の list の変数を作成して、成分数ごとのクロスバリデーション後の r2 をこの変数に追加 for component in range( 1, min(np.linalg.matrix_rank(autoscaled_x), max_number_of_principal_components) + 1): # PLS model = PLSRegression(n_components=component) # PLS モデルの宣言 estimated_y_in_cv = pd.DataFrame( cross_val_predict( model, autoscaled_x, autoscaled_y, cv=fold_number)) # クロスバリデーション推定値の計算し、DataFrame型に変換 estimated_y_in_cv = estimated_y_in_cv * y.std() + y.mean( ) # スケールをもとに戻す r2_in_cv = metrics.r2_score(y, estimated_y_in_cv) # r2 を計算 print(component, r2_in_cv) # 成分数と r2 を表示 r2_in_cv_all.append(r2_in_cv) # r2 を追加 components.append(component) # 成分数を追加 optimal_component_number = components[r2_in_cv_all.index( max(r2_in_cv_all))] print('\nCV で最適化された成分数 :', optimal_component_number) # PLS model = PLSRegression(n_components=optimal_component_number) # モデルの宣言 elif method_name == 'svr': # グラム行列の分散を最大化することによる γ の最適化 variance_of_gram_matrix = list() for index, ocsvm_gamma in enumerate(svr_gammas): print(index + 1, '/', len(svr_gammas)) gram_matrix = np.exp( -ocsvm_gamma *
# Scatter + fitted line produced above for the RM feature.
plt.title('Price ~ RM')
plt.xlabel('RM')
plt.ylabel('Price')
plt.show()

# MSE (mean squared error) calculation:
# error = y - y_hat, error^2 = (y - y_hat)^2, MSE = sum(error^2) / n
mse = mean_squared_error(y_test, y_pred_rm)
# RMSE (square root of MSE)
rmse = np.sqrt(mse)
print('Price ~ RM: RMSE =', rmse)

# R^2 (coefficient of determination), computed two equivalent ways:
# via the fitted model's score() and via sklearn.metrics.r2_score.
r2_1 = lin_reg.score(X_test_rm, y_test)
r2_2 = r2_score(y_test, y_pred_rm)
print(f'Price ~ RM: R^2 = {r2_1}, {r2_2}')

# Price ~ LSTAT simple linear regression: price = b0 + b1 * lstat
X_train_lstat = X_train[:, np.newaxis, 12]  # training set (column 12 = LSTAT)
X_test_lstat = X_test[:, np.newaxis, 12]    # test set
lin_reg.fit(X_train_lstat, y_train)         # fit on the training data
print(f'Price ~ LSTAT: intercept: {lin_reg.intercept_}, coefficients: {lin_reg.coef_}')
y_pred_lstat = lin_reg.predict(X_test_lstat)  # predict on the test data
plt.scatter(X_test_lstat, y_test)  # scatter of actual values
plt.plot(X_test_lstat, y_pred_lstat, 'r')  # fitted line
plt.title('Price ~ LSTAT')
plt.xlabel('LSTAT')
def train(df, target_column="price", initial_features=None, method='rf', test_size=0.3, random_state_split=66, **kwargs):
    """ train model
    return model object and print model R square
    Args:
        df: dataframe including additional features
        target_column: target column name (dependent variable)
        initial_features: features for training model
        method: Type of model to train ('logistic')
        test_size: Test set size for training model
        random_state: Seed for spliting train and test
        **kwargs: Keyword arguments for sklearn.ensemble.RandomForestRegressor. Please see sklearn documentation
        for all possible options:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    Returns:
        model: model object
        df_metric: metric saved in a dataframe
    """
    # Guard clauses: missing inputs yield the documented empty result.
    if df is None:
        logger.warning("Input dataframe is empty. Empty frame returned")
        return None, None
    if initial_features is None:
        logger.warning("No features to train the model. Empty frame returned")
        return None, None
    if not isinstance(df, pd.DataFrame):
        # Preserved original behavior: log the type problem but continue.
        logger.error("Parameter %s is not a DataFrame object.", df)
    logger.info('Training a %s model', method)
    # Generate features
    X = df[initial_features]
    Y = df[target_column]
    try:
        # Split data into test and train
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state_split)
        # Instantiate and train the requested model type.
        model = methods[method](**kwargs)
        model.fit(X_train, Y_train)
    except Exception as err:
        logger.error("Error occurred while training the model: %s", err)
        # BUG FIX: the original fell through after logging and then crashed
        # with NameError (`model` never bound), masking the real error.
        # Return the documented empty result instead.
        return None, None
    # Use the model's predict method on the held-out test data.
    y_hat = model.predict(X_test)
    df_metric = pd.DataFrame(data={'R square': [r2_score(Y_test, y_hat)]})
    return model, df_metric
diabetes_X_test = diabetes_X[-20:] # Split the targets into training/testing sets diabetes_y_train = diabetes.target[:-20] diabetes_y_test = diabetes.target[-20:] # Create linear regression object regr = linear_model.LinearRegression() # Train the model using the training sets regr.fit(diabetes_X_train, diabetes_y_train) # Make predictions using the testing set diabetes_y_pred = regr.predict(diabetes_X_test) # The coefficients print('Coefficients: \n', regr.coef_) # The mean squared error print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, diabetes_y_pred)) # Explained variance score: 1 is perfect prediction print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred)) # Plot outputs plt.scatter(diabetes_X_test, diabetes_y_test, color='black') plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3) plt.xticks(()) plt.yticks(()) plt.show()
def validation(fit, outcome , time, continuous=False):
    """Model-validation dashboard: prints nothing, draws a 2x2 figure with a
    summary metrics table, a time-series of mean outcome vs mean fit, a
    histogram of the fitted values, and a decile calibration curve.

    Args:
        fit: fitted/predicted values (array-like or Series).
        outcome: observed outcomes (binary unless continuous=True).
        time: time index used for grouping the time-series panel.
        continuous: if True, outcome is dichotomised at its mean for the
            probability-based metrics (log loss, AUC, binomial tests).
    """
    plt.rcParams['figure.dpi']= 300
    plt.rcParams['figure.figsize'] = (16, 9)
    plt.rcParams.update({'font.size': 16})
    # DataFrame copies for alignment; raw numpy values kept for the metrics.
    fitP=pd.DataFrame(data=fit)
    outcomeP=pd.DataFrame(data=outcome)
    timeP=pd.DataFrame(data=time)
    if isinstance(fit, pd.Series):
        fit=fit.values
    if isinstance(outcome, pd.Series):
        outcome=outcome.values
    if isinstance(time, pd.Series):
        time=time.values
    data_in = pd.concat([fitP, outcomeP, timeP], axis=1)
    data_in.columns = ['fit', 'outcome', 'time']
    # Per-period means feed the time-series panel.
    means = data_in.groupby('time')[['fit', 'outcome']].mean().reset_index(drop=False)
    # outcomeD: dichotomised outcome (split at the mean when continuous).
    data_in['outcomeD']=data_in.loc[:,'outcome']
    if continuous==True:
        data_in.loc[data_in['outcome'] >= data_in.outcome.mean(), 'outcomeD'] = 1
        data_in.loc[data_in['outcome'] < data_in.outcome.mean(), 'outcomeD'] = 0
    outcomeD=data_in.loc[:,'outcomeD'].values
    # Probability-only metrics default to NaN unless fit is in [0, 1].
    lr_log_loss = np.nan
    roc_auc = np.nan
    brier = np.nan  # NOTE(review): assigned but never updated/used below
    binom_p = np.nan
    Jeffreys_p = np.nan
    max_outcome_fit=np.maximum(max(outcome), max(fit))
    min_outcome_fit=np.minimum(min(outcome), min(fit))
    if min_outcome_fit>=0 and max_outcome_fit<=1:
        # Values look like probabilities: compute classification metrics.
        lr_log_loss = log_loss(outcomeD, fit).round(4)
        roc_auc = roc_auc_score(outcomeD, fit).round(4)
        binom_p = binom_test(sum(outcomeD), n=len(outcomeD), p= np.mean(fit), alternative='greater').round(decimals=4)
        Jeffreys_p = beta.cdf(np.mean(fit), sum(outcomeD)+0.5, len(outcomeD)-sum(outcomeD)+0.5).round(decimals=4)
    # Squared Pearson correlation as an OLS-style R^2.
    corr,_=pearsonr(fit,outcome)
    r2_OLS=corr**2
    the_table = [['Counts', len(outcome)], ['Mean outcome', (sum(outcome)/len(outcome)).round(4)], ['Mean fit', np.mean(fit).round(4)], ['AUC ', roc_auc], ['R-squared (OLS)', round(r2_OLS,4)], ['R-squared', r2_score(outcome, fit).round(decimals=4)], ['RMSE/ SQR(Brier score)', round(np.sqrt(((outcome-fit).dot(outcome-fit))/len(outcome)),4)], ['Log loss', lr_log_loss], ['Binomial p-value', binom_p], ['Jeffreys p-value', Jeffreys_p]]
    the_table=pd.DataFrame(data=the_table)
    the_table.columns = ['Metric', 'Value']
    plt.subplots_adjust(hspace=0.4, wspace=0.4)
    # Panel 1: metrics table rendered as a matplotlib table.
    plt.subplot(221)
    plt.title('Summary')
    plt.axis('off')
    plt.axis('tight')
    test=plt.table(cellText=the_table.values, colLabels=the_table.columns, loc='center', cellLoc='center', colWidths=[0.34, 0.2])
    test.auto_set_font_size(False)
    test.set_fontsize(16)
    test.scale(2, 1.5)
    # Panel 2: mean outcome vs mean fit over time.
    plt.subplot(222)
    plt.title('Time-Series Real-Fit')
    plt.plot(means['time'],means['outcome'])
    plt.plot(means['time'],means['fit'], color='red', ls='dashed')
    plt.xlabel('Time', fontsize=15)
    plt.ylabel('Mean', fontsize=15)
    plt.tick_params(axis='both', labelsize=13)
    plt.legend(('Outcome','Fit'), loc='best', fontsize=15)
    # Panel 3: distribution of the fitted values.
    plt.subplot(223)
    plt.title('Fit Histogram')
    plt.hist(fit, bins=20, histtype='bar', density=True)
    plt.xlabel('Fit', fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    plt.tick_params(axis='both', labelsize=13)
    # Panel 4: calibration by deciles of fitted value.
    data_in['cat'] = pd.qcut(data_in.fit, 10, labels=False, duplicates='drop')
    real_fit = data_in.groupby('cat')[['fit', 'outcome']].mean()
    mpv=real_fit.fit.values       # mean predicted value per decile
    fop=real_fit.outcome.values   # fraction/mean of observed outcomes per decile
    maximum=np.maximum(max(fop), max(mpv))
    maximum=np.ceil(maximum*100)/100
    minimum=np.minimum(min(fop), min(mpv))
    minimum=np.floor(minimum*100)/100
    plt.subplot(224)
    plt.title('Calibration Curve')
    plt.plot(mpv, fop, marker='.', linestyle='', markersize=18)
    plt.plot([minimum,maximum],[minimum,maximum], linestyle='--', color='gray')
    plt.xlim((minimum,maximum))
    plt.ylim((minimum,maximum))
    plt.xlabel('Mean fit', fontsize=15)
    plt.ylabel('Mean outcome', fontsize=15)
    plt.tick_params(axis='both', labelsize=13)
    plt.show()
# activation: {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default ‘relu’ # others regressors #reg = RandomForestRegressor(max_depth=2, random_state=9) # uncomment if wanna see all the parameters of the model print(reg) # fit model with trin data model = reg.fit(Xtrain, ytrain) # prediciton for test set preds = model.predict(Xtest) # sklearn regression scores print('r2 (Pearson) score: ', r2_score(ytest, preds)) print('explained_variance_score: ', explained_variance_score(ytest, preds)) print('mean_absolute_error ', mean_absolute_error(ytest, preds)) print('mean_squared_error ', mean_squared_error(ytest, preds)) #print('mean_squared_log_error ', mean_squared_log_error(ytest,preds)) print('median_absolute_error ', median_absolute_error(ytest, preds)) # visualization of predections vs target # --- hm = len(ytest) ytestsortind = sorted(range(len(ytest)), key=lambda x: ytest[x]) ytestsort = ytest[ytestsortind[:hm]] predssort = preds[ytestsortind[:hm]] plt.title('Ordered test set target vs regression') plt.xlabel('label')
c='limegreen', marker='s', edgecolor='white', label='Test Data') plt.xlabel('predicted value') plt.ylabel('residual') plt.legend(loc='upper left') plt.hlines(y=0, xmin=-3, xmax=3, color='black', lw=2) plt.xlim([-3, 3]) plt.show() print('\n') print('The MSE of prediction is:') print(mean_squared_error(y_test, y_pred)) print('\n') print('The R^2 score is:') print(r2_score(y_test, y_pred)) for i in [0.5, 1, 5, 10]: ridge = Ridge(alpha=i) ridge.fit(X_train, y_train) print('\nfor alpha =', i, '\n') print('Slope:') print(ridge.coef_) print('Intercept:') print(ridge.intercept_) y_r_pred = ridge.predict(X_test) y_r_tpred = ridge.predict(X_train) plt.scatter(y_r_tpred, y_r_tpred - y_train, c='steelblue', marker='o',
# Baseline ordinary-least-squares fit on a train/test split defined earlier
# in the file (X_train/X_test/y_train/y_test not visible here).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Code starts here
# Instantiate linear regression model
regressor = LinearRegression()
# fit the model
regressor.fit(X_train, y_train)
# predict the result
y_pred = regressor.predict(X_test)
# Calculate r2_score on the held-out split
r2 = r2_score(y_test, y_pred)
# print r2
print(r2)
# Code ends here
# --------------
# L1-regularized variant for comparison with the OLS baseline above.
from sklearn.linear_model import Lasso
# Code starts here
# instantiate lasso model (default alpha=1.0)
lasso = Lasso()
# fit and predict
if iter % 100 == 0: print("iteration: %s, loss: %s" % (iter, loss.item())) # Save model save_filename = 'checkpoints/LSTM_FC.pth' torch.save(model, save_filename) print('Saved as %s' % save_filename) # Start evaluating model model.eval() y_pred_dep_ = model(X_test_dep_std).detach().numpy() y_pred_dep = ss_y_dep.inverse_transform(y_pred_dep_[0, 144:]) print('the value of R-squared of Evaporation is ', r2_score(Outputs[144:], y_pred_dep)) print('the value of Root mean squared error of Evaporation is ', rmse(Outputs[144:], y_pred_dep)) f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(6, 4)) ax1.plot(Outputs[144:], color="blue", linestyle="-", linewidth=1.5, label="Measurements") ax1.plot(y_pred_dep, color="green", linestyle="--", linewidth=1.5, label="Proposed model")
def polynomial_regression():
    """Flask view for the polynomial-regression model page.

    GET renders the empty page. POST either (a) uploads a dataset, or
    (b) runs the model: for every column other than the chosen feature,
    the best polynomial degree in 1..10 is selected by held-out RMSE, the
    model is refit at that degree, and the mean of train/test R^2 (as a
    percentage) is collected for display.

    Returns:
        Rendered 'regression/pol_reg.html' for logged-in users, otherwise
        a redirect to '404'.
    """
    # Check if user is loggedin
    if ('loggedin' in session):
        # Init template placeholders (None is safe to repeat: immutable)
        (data_to_html, feature, graph_title,
         msg_suc, msg_err, msg_warn) = (None,) * 6
        # BUG FIX: the original `(list(), ) * 3` bound all three names to
        # the SAME list object (sequence repetition copies the reference),
        # so an append to one leaked into the others. Use independent lists.
        columns, res_list, score_list = [], [], []
        # Get session details
        username = session['username']
        lang = session['lang']
        # Define tag category + model
        cat_tag = 'REG'
        mod_tag = 'PR'
        # Connect to database
        cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
        # Get categories of navbar
        navbar_cat = datas_cat_nav(cursor, lang)
        navbar_cat_name = navbar_cat[0]
        navbar_cat_tag = navbar_cat[1]
        navbar_cat_icon = navbar_cat[2]
        navbar_cat_link = navbar_cat[3]
        # Get models of navbar
        navbar_models = datas_mod_nav(cursor, lang, navbar_cat_tag)
        # Get settings of navbar
        navbar_settings = datas_set_nav(cursor, lang)
        navbar_set_name = navbar_settings[0]
        navbar_set_icon = navbar_settings[1]
        navbar_set_link = navbar_settings[2]
        # Get category details for breadcrumb
        cat_details = cards_categories(cursor, lang, cat_tag)
        cat_name = cat_details[0]
        cat_link = cat_details[3]
        # Get model details for breadcrumb
        model_details = datas_model(cursor, lang, mod_tag)
        model_name = model_details[0]
        model_link = model_details[1]
        # Break connection
        cursor.close()
        if (request.method == 'POST'):
            # Upload file
            if (request.form['submit_btn'] == 'Upload Now'
                    or request.form['submit_btn'] == 'Envoyer maintenant'):
                # All fields were completed
                if (bool(request.files['file']) == 1
                        and bool(request.form['sep_select']) == 1):
                    get_upload_datas = upload_file(lang, False)
                    msg_err = get_upload_datas[0]
                    msg_suc = get_upload_datas[1]
                    msg_warn = get_upload_datas[2]
                    global new_tmp_path
                    new_tmp_path = get_upload_datas[3]
                    global colname_list
                    colname_list = get_upload_datas[4]
                    columns = colname_list
                    data_to_html = get_upload_datas[5]
                    global df
                    df = get_upload_datas[6]
                else:
                    if (lang == 'en'):
                        # Submit without upload file
                        msg_err = (
                            'Please upload your data and select a separator.'
                        )
                    else:
                        msg_err = (
                            'Veuillez télécharger vos données et '
                            'choisir un séparateur.'
                        )
            # Model compute
            if (request.form['submit_btn'] == 'Launch the model'
                    or request.form['submit_btn'] == 'Lancer le modèle'):
                feature = request.form['feature']
                # BUG FIX: copy before mutating — `columns.remove(feature)`
                # used to mutate the module-level `colname_list` in place,
                # breaking subsequent launches on the same upload.
                columns = list(colname_list)
                # Show uploaded data
                data_to_html = df_html_show(df)
                # Delete feature from columns
                columns.remove(feature)
                # The feature vector is loop-invariant: build it once.
                x_feat = df[feature].values.reshape(-1, 1)
                for i in columns:
                    y_targ = df[i].values.reshape(-1, 1)
                    # Train / test split
                    X_train, X_test, y_train, y_test = train_test_split(
                        x_feat, y_targ, test_size=0.33, random_state=42
                    )
                    score_rmse = list()
                    min_rmse, min_deg = (math.inf,) * 2
                    # Select the degree (1..10) with the lowest test RMSE.
                    for deg in range(1, 11):
                        # Train features
                        poly_features = PolynomialFeatures(
                            degree=deg, include_bias=False
                        )
                        x_poly_train = poly_features.fit_transform(X_train)
                        # Linear regression on the expanded features
                        poly_reg = LinearRegression().fit(x_poly_train, y_train)
                        # BUG FIX: use transform (not fit_transform) on the
                        # test split — the expansion must reuse the train fit.
                        x_poly_test = poly_features.transform(X_test)
                        poly_predict = poly_reg.predict(x_poly_test)
                        poly_rmse = np.sqrt(
                            mean_squared_error(y_test, poly_predict)
                        )
                        score_rmse.append(poly_rmse)
                        # Keep the best degree seen so far
                        if (min_rmse > poly_rmse):
                            min_rmse = poly_rmse
                            min_deg = deg
                    # Refit at the winning degree
                    polynomial = PolynomialFeatures(degree=min_deg)
                    X_train = polynomial.fit_transform(X_train)
                    # BUG FIX: transform, not fit_transform (see above).
                    X_test = polynomial.transform(X_test)
                    # Create linear model and fit
                    regressor = linear_model.LinearRegression().fit(
                        X_train, y_train
                    )
                    # Predicting test set results
                    y_test_pred = regressor.predict(X_test)
                    # Train-set prediction
                    y_pred = regressor.predict(X_train)
                    y_pred = y_pred.tolist()
                    # Accuracy: mean of train/test R^2, in percent
                    r2_test = r2_score(y_test, y_test_pred) * 100
                    r2_train = r2_score(y_train, y_pred) * 100
                    res = [i, round(statistics.mean([r2_test, r2_train]), 2)]
                    res_list.append(res)
                # Save scoring
                score_list = [score[1] for score in res_list]
                if (lang == 'en'):
                    # Add graph title
                    graph_title = (
                        'Comparison of the correlation between '
                        + feature + ' and the columns :'
                    )
                    # Success
                    msg_suc = (
                        'The model was successfully calculated. '
                        'Your data was automatically deleted.'
                    )
                else:
                    graph_title = (
                        'Comparaison de la corrélation entre '
                        + feature + ' et les colonnes :'
                    )
                    msg_suc = (
                        'Le modèle a été calculé avec succès. '
                        'Vos données ont été automatiquement supprimées.'
                    )
                # Delete file
                file_remove(new_tmp_path)
        return render_template(
            'regression/pol_reg.html',
            title=model_name,
            username=username,
            lang=lang,
            nav_cat_name=navbar_cat_name,
            nav_cat_tag=navbar_cat_tag,
            nav_cat_icon=navbar_cat_icon,
            nav_cat_lnk=navbar_cat_link,
            nav_models=navbar_models,
            nav_set_name=navbar_set_name,
            nav_set_icon=navbar_set_icon,
            nav_set_lnk=navbar_set_link,
            cat_name=cat_name,
            cat_tag=cat_tag,
            cat_link=cat_link,
            model_name=model_name,
            model_link=model_link,
            msg_err=msg_err,
            msg_suc=msg_suc,
            msg_warn=msg_warn,
            data_show=data_to_html,
            df_columns=columns,
            feature=feature,
            score_list=score_list,
            graph_title=graph_title
        )
    else:
        return redirect('404')
# NOTE(review): fragment — presumably the body of a loop over importance
# thresholds (`thresh`, `selection`, x/y splits are defined outside this
# view); confirm against the enclosing code.
# Reduce both splits to the features kept by the current selector.
select_x_train = selection.transform(x_train)
select_x_test = selection.transform(x_test)
selection_model = XGBRegressor(n_jobs=-1)
# Fit with early stopping (20 rounds) monitored on train+test eval sets.
selection_model.fit(select_x_train, y_train, eval_metric=["logloss", "rmse", "mae"], eval_set=[(select_x_train, y_train), (select_x_test, y_test)], early_stopping_rounds=20, verbose=0)
y_pred = selection_model.predict(select_x_test)
score = r2_score(y_test, y_pred)
# print("R2 : ", r2_score)
results = selection_model.evals_result()
# Report threshold, surviving feature count, and R^2 for this iteration.
print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0))
# NOTE(review): the triple-quoted results log below opens here and is
# closed beyond this view.
'''
Thresh=0.002, n=13, R2: 85.98%
Thresh=0.004, n=12, R2: 85.98%
Thresh=0.008, n=11, R2: 85.99%
Thresh=0.009, n=10, R2: 85.67%
Thresh=0.009, n=9, R2: 85.79%
Thresh=0.013, n=8, R2: 85.70%
Thresh=0.016, n=7, R2: 86.26%
Thresh=0.032, n=6, R2: 82.53%
def multiple_linear_regression():
    """Flask view for the multiple-linear-regression model page.

    GET renders the empty page. POST either (a) uploads a dataset, or
    (b) runs the model: for each column other than the chosen X/Y columns,
    a two-feature linear regression [X_col, column] -> Y_col is fit and the
    mean of train/test R^2 (as a percentage) is collected for display.

    Returns:
        Rendered 'regression/mul_lin_reg.html' for logged-in users,
        otherwise a redirect to '404'.
    """
    # Check if user is loggedin
    if ('loggedin' in session):
        # Init template placeholders (None is safe to repeat: immutable)
        (data_to_html, X_col, Y_col, graph_title,
         msg_suc, msg_err, msg_warn) = (None,) * 7
        # BUG FIX: the original `(list(), ) * 2` bound both names to the
        # SAME list object (sequence repetition copies the reference).
        # Use independent lists.
        columns, score_list = [], []
        # Get session details
        username = session['username']
        lang = session['lang']
        # Define tag category + model
        cat_tag = 'REG'
        mod_tag = 'MLR'
        # Connect to database
        cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)
        # Get categories of navbar
        navbar_cat = datas_cat_nav(cursor, lang)
        navbar_cat_name = navbar_cat[0]
        navbar_cat_tag = navbar_cat[1]
        navbar_cat_icon = navbar_cat[2]
        navbar_cat_link = navbar_cat[3]
        # Get models of navbar
        navbar_models = datas_mod_nav(cursor, lang, navbar_cat_tag)
        # Get settings of navbar
        navbar_settings = datas_set_nav(cursor, lang)
        navbar_set_name = navbar_settings[0]
        navbar_set_icon = navbar_settings[1]
        navbar_set_link = navbar_settings[2]
        # Get category details for breadcrumb
        cat_details = cards_categories(cursor, lang, cat_tag)
        cat_name = cat_details[0]
        cat_link = cat_details[3]
        # Get model details for breadcrumb
        model_details = datas_model(cursor, lang, mod_tag)
        model_name = model_details[0]
        model_link = model_details[1]
        # Break connection
        cursor.close()
        if (request.method == 'POST'):
            # Upload file
            if (request.form['submit_btn'] == 'Upload Now'
                    or request.form['submit_btn'] == 'Envoyer maintenant'):
                # All fields were completed
                if (bool(request.files['file']) == 1
                        and bool(request.form['sep_select']) == 1):
                    get_upload_datas = upload_file(lang, False)
                    msg_err = get_upload_datas[0]
                    msg_suc = get_upload_datas[1]
                    msg_warn = get_upload_datas[2]
                    global new_tmp_path
                    new_tmp_path = get_upload_datas[3]
                    global colname_list
                    colname_list = get_upload_datas[4]
                    columns = colname_list
                    data_to_html = get_upload_datas[5]
                    global df
                    df = get_upload_datas[6]
                else:
                    if (lang == 'en'):
                        # Submit without upload file
                        msg_err = (
                            'Please upload your data and select a separator.'
                        )
                    else:
                        msg_err = (
                            'Veuillez télécharger vos données et '
                            'choisir un séparateur.'
                        )
            # Model compute
            if (request.form['submit_btn'] == 'Launch the model'
                    or request.form['submit_btn'] == 'Lancer le modèle'):
                X_col = request.form['X_col']
                Y_col = request.form['Y_col']
                # Show uploaded data
                data_to_html = df_html_show(df)
                # BUG FIX: copy before mutating — `columns.remove(...)`
                # used to mutate the module-level `colname_list` in place,
                # breaking subsequent launches on the same upload.
                columns = list(colname_list)
                # Delete the chosen feature/target from the candidates
                columns.remove(X_col)
                columns.remove(Y_col)
                for i in columns:
                    # Two-feature design matrix: chosen X plus candidate i
                    x_feat = df[[X_col, i]].values
                    y_targ = df[Y_col].values
                    # Train / test split
                    X_train, X_test, y_train, y_test = train_test_split(
                        x_feat, y_targ, test_size=0.33, random_state=42
                    )
                    # Create model and fit
                    regressor = LinearRegression().fit(X_train, y_train)
                    # Predicting test set results
                    y_test_pred = regressor.predict(X_test)
                    # Train-set prediction
                    y_pred = regressor.predict(X_train)
                    y_pred = y_pred.tolist()
                    # Accuracy: mean of train/test R^2, in percent
                    r2_test = r2_score(y_test, y_test_pred) * 100
                    r2_train = r2_score(y_train, y_pred) * 100
                    score_list.append(
                        round(statistics.mean([r2_test, r2_train]), 2)
                    )
                # Display labels: "X_col + candidate" for each pairing
                columns = [X_col + ' + ' + c for c in columns]
                if (lang == 'en'):
                    # Add graph title
                    graph_title = (
                        'Comparison of the correlation between '
                        + Y_col + ' and the columns :'
                    )
                    # Success
                    msg_suc = (
                        'The model was successfully calculated. '
                        'Your data was automatically deleted.'
                    )
                else:
                    graph_title = (
                        'Comparaison de la corrélation entre '
                        + Y_col + ' et les colonnes :'
                    )
                    msg_suc = (
                        'Le modèle a été calculé avec succès. '
                        'Vos données ont été automatiquement supprimées.'
                    )
                # Delete file
                file_remove(new_tmp_path)
        return render_template(
            'regression/mul_lin_reg.html',
            title=model_name,
            username=username,
            lang=lang,
            nav_cat_name=navbar_cat_name,
            nav_cat_tag=navbar_cat_tag,
            nav_cat_icon=navbar_cat_icon,
            nav_cat_lnk=navbar_cat_link,
            nav_models=navbar_models,
            nav_set_name=navbar_set_name,
            nav_set_icon=navbar_set_icon,
            nav_set_lnk=navbar_set_link,
            cat_name=cat_name,
            cat_tag=cat_tag,
            cat_link=cat_link,
            model_name=model_name,
            model_link=model_link,
            msg_err=msg_err,
            msg_suc=msg_suc,
            msg_warn=msg_warn,
            data_show=data_to_html,
            X_col=X_col,
            Y_col=Y_col,
            df_columns=columns,
            score_list=score_list,
            graph_title=graph_title
        )
    else:
        return redirect('404')
# Notebook-export cells: fit OLS on pre-expanded polynomial features
# (`x_poly`, `x`, `y` come from earlier cells not visible here).
# NOTE(review): the bare expressions below (y_poly_pred[:20], the metric
# calls) only display output in a notebook — as a script they are no-ops.
model = LinearRegression()
model.fit(x_poly, y)
y_poly_pred = model.predict(x_poly)
# In[74]:
y_poly_pred[:20]
# In[75]:
# RMSE of the fit
math.sqrt(mean_squared_error(y, y_poly_pred))
# In[76]:
r2_score(y, y_poly_pred)
# In[49]:
##visualize(degree = 2)
plt.scatter(x, y)
plt.plot(x, y_poly_pred, color='m')
plt.show()
# In[77]:
##visualize(degree = 1)
# NOTE(review): replots the SAME y_poly_pred as the cell above despite the
# "degree = 1" label — presumably a copy-paste artifact; verify intent.
plt.scatter(x, y)
plt.plot(x, y_poly_pred, color='m')
plt.show()