def estimator_metrics(true_values, estimates):
    print "---------------------------------------"
    print "MSE: "
    print mean_squared_error(true_values, estimates)
    # median_absolute_error is the median AE, not the mean AE, so label it accordingly
    print "Median AE: "
    print median_absolute_error(true_values, estimates)
    print "R-squared: "
    print r2_score(true_values, estimates)
    print "---------------------------------------"
    return
def performance_metric(label, prediction): """Calculate and return the appropriate error performance metric.""" ################################### ### Step 3. YOUR CODE GOES HERE ### ################################### return m.median_absolute_error(label, prediction)
def performance_metric(label, prediction): """Calculate and return the appropriate error performance metric.""" ################################### ### Step 3. YOUR CODE GOES HERE ### ################################### # The following page has a table of scoring functions in sklearn: # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics # In order to study all of the different performance metrics, I will simply # calculate them all and return a dictionary with all of the results l, p = label, prediction output = collections.OrderedDict() output["explained variance score"] = skmetrics.explained_variance_score( l, p) output["mean absolute error"] = skmetrics.mean_absolute_error(l, p) output["mean squared error"] = skmetrics.mean_squared_error(l, p) output["root mean squared error"] = np.sqrt( skmetrics.mean_squared_error(l, p)) output["median absolute error"] = skmetrics.median_absolute_error(l, p) output["r2 score"] = skmetrics.r2_score(l, p) return output
def learning( self): X = self.X y = self.y print( "Shape of X and y are", X.shape, y.shape) X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42) X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2, random_state=42) val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val, early_stopping_rounds=200) model = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000) model.fit(X_train, y_train, val_monitor) yP = model.predict(X_test) score_r2 = metrics.r2_score(y_test, yP) score_MedAE = metrics.median_absolute_error(y_test, yP) print('Accuracy') print('--------') print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE)) if self.graph: kutil.regress_show4( y_test, yP)
def cv_LinearRegression_ci( xM, yV, n_folds = 5, scoring = 'median_absolute_error', disp = False):
    """
    metrics.explained_variance_score(y_true, y_pred)   Explained variance regression score function
    metrics.mean_absolute_error(y_true, y_pred)        Mean absolute error regression loss
    metrics.mean_squared_error(y_true, y_pred[, ...])  Mean squared error regression loss
    metrics.median_absolute_error(y_true, y_pred)      Median absolute error regression loss
    metrics.r2_score(y_true, y_pred[, ...])            R^2 (coefficient of determination) regression score function.
    """
    if disp:
        print(xM.shape, yV.shape)

    clf = linear_model.LinearRegression()
    kf5 = cross_validation.KFold( xM.shape[0], n_folds=n_folds, shuffle=True)
    cv_score_l = list()
    ci_l = list()
    for train, test in kf5:
        # yV is a vector, not a matrix, here; hence it is indexed as a vector
        clf.fit( xM[train,:], yV[train])
        yVp_test = clf.predict( xM[test,:])

        # Additionally, coef_ and intercept_ are stored.
        ci_l.append( (clf.coef_, clf.intercept_))
        if scoring == 'median_absolute_error':
            cv_score_l.append( metrics.median_absolute_error(yV[test], yVp_test))
        else:
            raise ValueError( "{} scoring is not supported.".format( scoring))
    if disp: # Only when this flag is on, the output is displayed.
        print('{}: mean, std -->'.format( scoring), np.mean( cv_score_l), np.std( cv_score_l))

    return cv_score_l, ci_l
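# A hedged usage sketch for cv_LinearRegression_ci on synthetic data (not part of the
# original module). It assumes an sklearn version old enough to still ship the
# sklearn.cross_validation API that the function relies on.
import numpy as np

xM_demo = np.random.rand(100, 5)
yV_demo = xM_demo.dot(np.array([1.0, 2.0, 0.5, -1.0, 0.0])) + 0.1 * np.random.randn(100)

cv_scores, coef_intercepts = cv_LinearRegression_ci(xM_demo, yV_demo, n_folds=5, disp=True)
print(np.mean(cv_scores))           # mean median-absolute-error across the 5 folds
print(coef_intercepts[0][0].shape)  # coefficients fitted on the first fold: (5,)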
def kfold_cv_rand(self, n_folds = 3): """ Takes in: number of folds Prints out RMSE score and stores the results in self.results """ cv = KFold(n = self.X_train.shape[0], n_folds = n_folds) gbr = RandomForestRegressor(**self.params) self.med_error = [] self.rmse_cv = [] self.pct_error=[] self.r2=[] self.results = {'pred': [], 'real': []} for train, test in cv: print "Starting fit" gbr.fit(self.X_train[train], self.y_train[train]) pred = gbr.predict(self.X_train[test]) predExp=np.power(pred, 10) testExp=np.power(self.y_train[test], 10) medError=median_absolute_error(predExp, testExp) percentError=np.median([np.fabs(p-t)/t for p,t in zip(predExp, testExp)]) error = mean_squared_error(np.power(pred, 10), np.power(self.y_train[test], 10))**0.5 self.results['pred'] += list(pred) self.results['real'] += list(self.y_train[test]) self.rmse_cv += [error] self.med_error+=[medError] self.pct_error+=[percentError] self.r2+=[r2_score(self.y_train[test], pred)] print 'Abs Median Error:', np.mean(self.med_error) print 'Abs Percent Error:', np.mean(self.pct_error) print 'Mean RMSE:', np.mean(self.rmse_cv) print "R2",np.mean(self.r2)
def cross_val_cols(self, n_folds = 3):
    """
    Takes in: number of folds

    Prints out RMSE score and stores the results in self.results
    """
    cv = KFold(n = self.X_train.shape[0], n_folds = n_folds)
    gbr = GradientBoostingRegressor(**self.params)
    self.med_error = []
    self.rmse_cv = []
    self.pct_error=[]
    self.results = {'pred': [], 'real': []}
    dfFeatures = []
    for train, test in cv:
        gbr.fit(self.X_train[train], self.y_train[train])
        dfFeatures+=[unencode(pd.DataFrame(columns=final_cols[:-1], data=self.X_train[test]))]
        pred = gbr.predict(self.X_train[test])
        # back-transform predictions and targets the same way kfold_cv_rand does
        # before computing the absolute errors
        predExp=np.power(pred, 10)
        testExp=np.power(self.y_train[test], 10)
        medError=median_absolute_error(predExp, testExp)
        percentError=np.median([np.fabs(p-t)/t for p,t in zip(predExp, testExp)])
        error = mean_squared_error(np.power(pred, 10), np.power(self.y_train[test], 10))**0.5
        self.inFeatures=(self.X_train[test])
        self.results['pred'] += list(predExp)
        self.results['real'] += list(testExp)
        self.rmse_cv += [error]
        self.med_error+=[medError]
        self.pct_error+=[percentError]
    print 'Abs Median Error:', np.mean(self.med_error)
    print 'Abs Percent Error:', np.mean(self.pct_error)
    print 'Mean RMSE:', np.mean(self.rmse_cv)
    self.valDf = pd.concat(dfFeatures)
    self.valDf = self.valDf.reset_index().drop('index', axis = 1)
    self.valDf['pred']=self.results['pred']
    self.valDf['real']=self.results['real']
    return self.valDf
def collect_metrics(true_values, predicted_values, store_metrics=False, file_prefix=""): """ Returns a list of regression quality metrics. :param true_values: The list containing the true values. :param predicted_values: The list containing the predicted values. :return: List of metrics. """ mse = mean_squared_error(true_values, predicted_values) rmse = np.sqrt(mse) mar = mean_absolute_error(true_values, predicted_values) medar = median_absolute_error(true_values, predicted_values) mmre = mean_magnitude_relative_error(true_values, predicted_values, balanced=False) bmmre = mean_magnitude_relative_error(true_values, predicted_values, balanced=True) mdmre = median_magnitude_relative_error(true_values, predicted_values) r_squared = r2_score(true_values, predicted_values) if store_metrics: mre_values = get_mre_values(true_values, predicted_values) metrics_dataframe = pd.DataFrame({'true_values': true_values, 'predicted_values': predicted_values, 'mre_values': mre_values}) metrics_dataframe.to_csv("csv/pred_metrics_" + file_prefix + ".csv", index=False) return mse, rmse, mar, medar, mmre, bmmre, mdmre, r_squared
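# The snippet above relies on mean_magnitude_relative_error, median_magnitude_relative_error
# and get_mre_values, which are project helpers not shown here. A minimal sketch of what they
# plausibly compute (MRE = |actual - predicted| / actual, with a "balanced" variant that
# divides by min(actual, predicted)) follows; the names and details are assumptions, not the
# original implementation.
import numpy as np

def get_mre_values(true_values, predicted_values):
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)
    # magnitude of relative error for each observation
    return np.abs(true_values - predicted_values) / np.abs(true_values)

def mean_magnitude_relative_error(true_values, predicted_values, balanced=False):
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)
    if balanced:
        # the balanced variant divides by the smaller of actual and predicted,
        # penalising over- and under-estimates symmetrically
        denominator = np.minimum(np.abs(true_values), np.abs(predicted_values))
    else:
        denominator = np.abs(true_values)
    return float(np.mean(np.abs(true_values - predicted_values) / denominator))

def median_magnitude_relative_error(true_values, predicted_values):
    return float(np.median(get_mre_values(true_values, predicted_values)))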
def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) y_pred = y_true + 1 assert_almost_equal(mean_squared_error(y_true, y_pred), 1.) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.)
def getPredQuality(y_true, y_pred, prefix): result = {} result[prefix + '_ExpVariance'] = metrics.explained_variance_score(y_true, y_pred) result[prefix + '_MAE'] = metrics.mean_absolute_error(y_true, y_pred) result[prefix + '_MSE'] = metrics.mean_squared_error(y_true, y_pred) result[prefix + '_MedAE'] = metrics.median_absolute_error(y_true, y_pred) result[prefix + '_Rtwo'] = metrics.r2_score(y_true, y_pred) result[prefix + '_MAPEmod'] = HelperFunctions.MAPE_mod(np.ravel(y_true), y_pred) return result
def test_regression_metrics_at_limits(): assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2) assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2) assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2) assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2) assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2) assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2) assert_raises_regex(ValueError, "Mean Squared Logarithmic Error cannot be " "used when targets contain negative values.", mean_squared_log_error, [-1.], [-1.])
def predict(regr, X_train, Y_train, X_test, Y_test): X_train, Y_train, scaler = preprocess(X_train, Y_train) regr.fit(X_train, Y_train) X_test = scaler.transform(X_test) Y_pred = regr.predict(X_test) print "R2-score: ", metrics.r2_score(Y_test, Y_pred) print "Mean Squared Error: ", metrics.mean_squared_error(Y_test, Y_pred) print "Median Absolute Error: ", metrics.median_absolute_error(Y_test, Y_pred) print "Explained Variance Error: ", metrics.explained_variance_score(Y_test, Y_pred) return Y_pred
def Eval(self): LOG.info('Eval ...') y_pred = self.Predict(self._x_test) return { 'median_absolute_error': median_absolute_error(self._y_test, y_pred), 'mean_squared_error': mean_squared_error(self._y_test, y_pred), 'explained_variance_score': explained_variance_score(self._y_test, y_pred), }
def estimate_accuracy4(yEv, yEv_calc, disp = False): r_sqr = metrics.r2_score( yEv, yEv_calc) RMSE = np.sqrt( metrics.mean_squared_error( yEv, yEv_calc)) MAE = metrics.mean_absolute_error( yEv, yEv_calc) DAE = metrics.median_absolute_error( yEv, yEv_calc) if disp: print("r^2={0:.2e}, RMSE={1:.2e}, MAE={2:.2e}, DAE={3:.2e}".format( r_sqr, RMSE, MAE, DAE)) return r_sqr, RMSE, MAE, DAE
def classification_level_prediction(classifications_DF): X = classifications_DF.iloc[:,3:89] #assign the target (session length) to y and convert to int y_actual = classifications_DF.iloc[:,2:3] #scaling the data for feature selection X_scaled = preprocessing.scale(X) #feature selection: #featureSelector = SelectKBest(score_func=f_regression,k=15) #featureSelector.fit(X_norm,y_actual['session_length'].values) #create a list of selected features (columns) #selected_features = [X.columns[zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))] #print "Those are the 15 selected features:"+str(selected_features) #create a data frame with only the selected features #X_selected = X[selected_features] #X_selected_norm=preprocessing.normalize(X_selected,norm='l2') #do not perform any feature selection #split the data set for train and test sets X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0) # Set the parameters by cross-validation - for CLASSIFICATION LEVEL predictions tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e+1,1e+0,1e-1,1e-2,1e-3], 'C': [50, 100, 250, 500, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 500, 1000]}] print "y_actual_train shape:"+str(y_actual_train.shape) print "y_actual_test shape:"+str(y_actual_test.shape) scores = ['mean_squared_error','median_absolute_error'] for score in scores: #using SVR to predict session length print(str(datetime.datetime.now())+": Predicting session length - Tuning hyper-parameters for %s" % score) # must be called with SVR instance as first argument clf = GridSearchCV(SVR(C=1), tuned_parameters, cv=5, scoring=score) clf.fit(X_scaled_train, y_actual_train['session_length'].values) print("Best parameters set found...") print(clf.best_estimator_) print("Grid scores:") for params, mean_score, scores in clf.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) print("Detailed classification report:") print("The model is trained on the full training set.") print("The scores are computed on the full testing set.") y_true, y_pred = y_actual_test['session_length'].values, clf.predict(X_scaled_test) print pd.DataFrame(X_scaled_test,y_true, y_pred).to_csv(str(score)+"pred_true.csv") print "Mean squared error:"+str(mean_squared_error(y_true,y_pred)) print "Median absolute error:"+str(median_absolute_error(y_true,y_pred)) print "Done:"+str(datetime.datetime.now())
def performance_metric(label, prediction):
    """Calculate and return the appropriate error performance metric."""
    ###################################
    ### Step 2. YOUR CODE GOES HERE ###
    ###################################

    # I'm going to use the median absolute error as the error metric. MSE is the more
    # standard choice, but it is sensitive to outliers, which is why it is not used here.
    # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
    error = median_absolute_error(label, prediction)
    #print "Error: {0}".format(error)
    return (error)
def performance_metric(label, prediction): '''Calculate and return the appropriate performance metric.''' # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics # median_absolute_error vs mean_absolute_error vs mean_squared_error # median_absolute_error appears fairly robust against # outliers of [target - prediction] as compared to mean_absolute_error # mean_squared_error behaves similar to mean_absolute_error but it seems # amplify the error, which could hurt the scoring in grid search error_loss = median_absolute_error(label, prediction) return error_loss
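# A quick numeric illustration of the robustness argument above (not part of the original
# project code): a single grossly wrong prediction leaves the median absolute error
# untouched, while the mean absolute error and mean squared error are pulled up sharply.
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error

demo_label = [10, 12, 11, 13, 12, 11]
demo_prediction = [10, 12, 11, 13, 12, 60]  # last prediction is a gross outlier

print(median_absolute_error(demo_label, demo_prediction))  # 0.0   -- unaffected by the outlier
print(mean_absolute_error(demo_label, demo_prediction))    # ~8.17 -- pulled up by the outlier
print(mean_squared_error(demo_label, demo_prediction))     # ~400  -- amplified even more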
def performance_metric(label, prediction): """Calculate and return the appropriate error performance metric.""" ################################### ### Step 2. YOUR CODE GOES HERE ### ################################### metric = metrics.median_absolute_error(label, prediction) #metric = metrics.mean_absolute_error(label, prediction) #metric = metrics.r2_score(label, prediction) # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics return metric
def eval_score( model, X_test, y_test, string = "Test", graph = False): print() print( "Evaluation of", string) print('--------') yP = model.predict(X_test) score_r2 = metrics.r2_score(y_test, yP) score_MedAE = metrics.median_absolute_error(y_test, yP) print('Accuracy') print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE)) print() if graph: kutil.regress_show4( y_test, yP)
def evaluatePrediction(y_true, y_pred, print_details=False):
    '''
    Returns an array of the form [explained_variance_score, mean_absolute_error, mean_squared_error,
    median_absolute_error, r2_score, MAPE_mod]. If print_details is true, results are also printed
    (esp. for notebooks)

    :param y_true: observed y values
    :param y_pred: predicted y values of the same period
    :param print_details: bool, if true print results
    :return:
    '''
    evaluation = [metrics.explained_variance_score(y_true, y_pred),
                  metrics.mean_absolute_error(y_true, y_pred),
                  metrics.mean_squared_error(y_true, y_pred),
                  metrics.median_absolute_error(y_true, y_pred),
                  metrics.r2_score(y_true, y_pred),
                  MAPE_mod(np.ravel(y_true), y_pred)]

    if print_details:
        print('Explained Variance: {}'.format(metrics.explained_variance_score(y_true, y_pred)))
        print('MAE: {}'.format(metrics.mean_absolute_error(y_true, y_pred)))
        print('MSE: {}'.format(metrics.mean_squared_error(y_true, y_pred)))
        print('Median Absolute Error: {}'.format(metrics.median_absolute_error(y_true, y_pred)))
        print('R-Square Score: {}'.format(metrics.r2_score(y_true, y_pred)))
        print('MAPE_mod: {}'.format(MAPE_mod(np.ravel(y_true), y_pred)))

    return evaluation
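# MAPE_mod is a project-specific helper that is not shown in this snippet. A plausible
# minimal sketch (an assumption, not the original implementation) is a modified MAPE
# that skips observations whose true value is zero, so the percentage error stays defined:
import numpy as np

def MAPE_mod(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0  # ignore zero actuals to avoid division by zero
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100.0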
def extractMetrics(pred,test_y):
    '''It extracts three different metrics: mean absolute error, median absolute error, mean squared error'''
    try:
        meanae=mean_absolute_error(test_y,pred)
    except ValueError:
        # sometimes the moving average filter on the output reduces its dimensionality,
        # so some values of the prediction are dropped; trim the targets to the prediction length
        test_y=test_y[:len(pred)]
        meanae=mean_absolute_error(test_y,pred)
    mae=median_absolute_error(test_y,pred)
    mse=mean_squared_error(test_y,pred)
    return meanae,mae,mse
def performance_metric(label, prediction):
    """Calculate and return the appropriate error performance metric."""
    ###################################
    ### Step 3. YOUR CODE GOES HERE ###
    ###################################
    # The following page has a table of scoring functions in sklearn:
    # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
    # Since this is a regression model, we use a regression metric.
    # Mean squared error is the most common choice, but the median absolute error
    # is returned here because it is less sensitive to outliers.
    # return mean_squared_error(label, prediction)
    return median_absolute_error(label, prediction)
def estimate_accuracy4(yEv, yEv_calc, disp = False): """ It was originally located in jchem. However now it is allocated here since the functionality is more inline with jutil than jchem. """ r_sqr = metrics.r2_score( yEv, yEv_calc) RMSE = np.sqrt( metrics.mean_squared_error( yEv, yEv_calc)) MAE = metrics.mean_absolute_error( yEv, yEv_calc) DAE = metrics.median_absolute_error( yEv, yEv_calc) if disp: print("r^2={0:.2e}, RMSE={1:.2e}, MAE={2:.2e}, DAE={3:.2e}".format( r_sqr, RMSE, MAE, DAE)) return r_sqr, RMSE, MAE, DAE
def _reportPredictionResults(y_true, y_pred):
    '''
    ### Deprecated
    Private function to calculate the error measures. New error functions have to be added here.
    :param y_true: observation
    :param y_pred: prediction
    :return: Nothing
    '''
    print('Explained Variance: {}'.format(metrics.explained_variance_score(y_true, y_pred)))
    print('MAE: {}'.format(metrics.mean_absolute_error(y_true, y_pred)))
    print('MSE: {}'.format(metrics.mean_squared_error(y_true, y_pred)))
    print('Median Absolute Error: {}'.format(metrics.median_absolute_error(y_true, y_pred)))
    print('R-Square Score: {}'.format(metrics.r2_score(y_true, y_pred)))
    print('MAPE_mod: {}'.format(HelperFunctions.MAPE_mod(np.ravel(y_true), y_pred)))
def predictAndExport(models, X_train, X_test, y_train, y_test):
    results = []
    fieldnames = [
        "name",
        "mean_squared_error",
        "mean_absolute_error",
        "median_absolute_error",
        "r2",
        "explained_variance_score",
    ]

    fout = open("result.csv", "w")
    writer = csv.DictWriter(fout, fieldnames=fieldnames)
    writer.writeheader()

    for model in models:
        print "\n\n=== Result of ", model["name"], " ==="
        result = {}
        result["name"] = model["name"]

        clf = model["model"]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        y_pred_sample = scaler.inverse_transform(y_pred[:20])
        y_test_sample = scaler.inverse_transform(y_test[:20])
        print "\npredicted result, true result"
        for i in range(len(y_test_sample)):
            print y_pred_sample[i], "\t", y_test_sample[i]

        # ground truth goes first: r2_score and explained_variance_score are not symmetric
        result["mean_squared_error"] = metrics.mean_squared_error(y_test, y_pred)
        print "mean_squared_error:", result["mean_squared_error"]
        result["mean_absolute_error"] = metrics.mean_absolute_error(y_test, y_pred)
        print "mean_absolute_error:", result["mean_absolute_error"]
        result["median_absolute_error"] = metrics.median_absolute_error(y_test, y_pred)
        print "median_absolute_error:", result["median_absolute_error"]
        result["r2"] = metrics.r2_score(y_test, y_pred)
        print "r2_score:", result["r2"]
        result["explained_variance_score"] = metrics.explained_variance_score(y_test, y_pred)
        print "explained_variance_score:", result["explained_variance_score"]

        writer.writerow(result)
        # keep the per-model scores so the returned list is not empty
        results.append(result)

    return results
def calculateScores(y_pred, y_test, result):
    # sklearn's regression metrics expect the ground truth first; r2_score and
    # explained_variance_score are not symmetric, so the argument order matters.
    result["mean_squared_error"] = metrics.mean_squared_error(y_test, y_pred)
    print "mean_squared_error:", result["mean_squared_error"]
    result["mean_absolute_error"] = metrics.mean_absolute_error(y_test, y_pred)
    print "mean_absolute_error:", result["mean_absolute_error"]
    result["median_absolute_error"] = metrics.median_absolute_error(y_test, y_pred)
    print "median_absolute_error:", result["median_absolute_error"]
    result["r2"] = metrics.r2_score(y_test, y_pred)
    print "r2_score:", result["r2"]
    result["explained_variance_score"] = metrics.explained_variance_score(y_test, y_pred)
    print "explained_variance_score:", result["explained_variance_score"]
    return result
def plot_model_performance(y_pred, y_test, model_name, zoom=False): """Save a scatter plot of the predicted vs actuals.""" """zoom: Zoom in on the part of the distribution where most data lie.""" if (zoom is True): axes_limit = 0.2 * 1e7 path_suffix = "_zoom" else: axes_limit = y_pred.max()*1.1 path_suffix = "" fig, ax = plt.subplots() ax.scatter(y_test, y_pred, alpha=0.1) line = mlines.Line2D([0, 1], [0, 1], color="red") transform = ax.transAxes line.set_transform(transform) ax.add_line(line) subplot_title = "{} \n Median AE: {:.0f}, Median APE: {:.3f}".format( model_name, median_absolute_error(y_test, y_pred), median_absolute_percentage_error(y_test, y_pred) ) ax.set(title=subplot_title, xlabel="Actual selling price in $", ylabel="Predicted selling price in $", xlim=(0, axes_limit), ylim=(0, axes_limit)) ax.xaxis.set_major_locator(plt.MaxNLocator(5)) ax.yaxis.set_major_locator(plt.MaxNLocator(5)) ax.xaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ","))) ax.yaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ","))) fig.savefig( "./figures/model_performance_{}{}.png".format( model_name, path_suffix), dpi=1000, bbox_inches="tight" ) plt.close(fig)
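# median_absolute_percentage_error is not a scikit-learn metric; the plot above assumes a
# project helper. A hedged sketch of what it likely computes (the median of |error| / actual,
# mirroring median_absolute_error but on a relative scale) is:
import numpy as np

def median_absolute_percentage_error(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.median(np.abs((y_true - y_pred) / y_true)))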
def predict(X, Y): # X = df[list(df.columns)[:-1]] # Y = df["quality"] # print X # print Y X_train,X_test,Y_train,Y_test = train_test_split(X,Y) # print X_train # print Y_train regressor = LinearRegression() regressor.fit(X_train,Y_train) scores = cross_val_score(regressor,X,Y,cv = 3) print scores.mean(),scores y_prediction = regressor.predict(X_test) print "r squared=",regressor.score(X_test,Y_test) print "abs error",median_absolute_error(Y_test,y_prediction) print "mean squared error",mean_squared_error(Y_test,y_prediction)
def evaluate(truth, preds, true_lats, lats, time): """ Measure regression performance :return: list of error measures and corresponding names """ names = list() errs = list() errs.append(mean_absolute_error(truth, preds)) names.append('Mean Absolute Error') errs.append(mean_squared_error(truth, preds)) names.append('Mean Squared Error') errs.append(np.sqrt(mean_squared_error(truth, preds))) names.append('Root Mean Squared Error') errs.append(median_absolute_error(truth, preds)) names.append('Median Absolute Error') errs.append(r2_score(truth, preds)) names.append('R2 Score') errs.append(adjusted_rand_score(true_lats, lats)) names.append('Adjusted Rand Score') errs.append(time) names.append('Runtime') return np.array(errs), names
ElasticNet(), LinearSVR(verbose=0), AdaBoostRegressor(), BaggingRegressor(n_jobs=-1), GradientBoostingRegressor(verbose=0), RandomForestRegressor(n_jobs=-1, verbose=0), ExtraTreesRegressor(n_jobs=-1, verbose=0), MLPRegressor(), KNeighborsRegressor() ] regressorNames = [ "Linear Regression", "Ridge Regressor", "SVR", "Lasso", "ElasticNet", "Linear SVR", "AdaBoost", "Bagging", "XGBoost", "Random Forest Regressor", "Extra Trees Regressor", "MLP Regressor", "KNN Regressor" ] assert len(regressors) == len(regressorNames) numRegressors = len(regressors) metrics = [ mean_absolute_error, lambda y_true, y_pred: 10.0**mean_absolute_error(y_true, y_pred), median_absolute_error, lambda y_true, y_pred: 10.0**median_absolute_error(y_true, y_pred), r2_score ] metricNames = [ "Mean Absolute Error", "10^Mean AE", "Median Absolute Error", "10^Median AE", "$r^2$" ] assert len(metrics) == len(metricNames) numMetrics = len(metrics)
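# The 10**MAE and 10**MedAE entries above only make sense if the regression targets were
# log10-transformed upstream (that preprocessing is assumed here, not shown in the snippet).
# Exponentiating an absolute error measured in log10 units turns it into a typical
# multiplicative error factor on the original scale, as this small check illustrates:
import numpy as np
from sklearn.metrics import mean_absolute_error

y_true_log10 = np.log10([100.0, 1000.0, 10000.0])
y_pred_log10 = np.log10([200.0, 500.0, 20000.0])  # every prediction off by a factor of 2

mae_log = mean_absolute_error(y_true_log10, y_pred_log10)
print(mae_log)          # ~0.301, i.e. log10(2)
print(10.0 ** mae_log)  # ~2.0, the typical factor by which predictions miss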
train_data, test_data, train_output, test_output = cross_validation.train_test_split( data.values, output_variables.values, test_size=0.3) rf = RandomForestRegressor(n_estimators=101) ada = AdaBoostRegressor(n_estimators=101) bagging = BaggingRegressor(n_estimators=101) gradBoost = GradientBoostingRegressor(n_estimators=101) bayes = BayesianRidge() regressors = [rf, ada, bagging, gradBoost, bayes] regressor_names = [ "Random Forests", "Adaboost", "Bagging", "Gradient Boosting", "Bayesian Ridge" ] for regressor, regressor_name in zip(regressors, regressor_names): regressor.fit(train_data, train_output) predicted_values = regressor.predict(test_data) print "--------------------------------\n" print "Mean Absolute Error for ", regressor_name, " : ", metrics.mean_absolute_error( test_output, predicted_values) print "Median Absolute Error for ", regressor_name, " : ", metrics.median_absolute_error( test_output, predicted_values) print "Mean Squared Error for ", regressor_name, " : ", metrics.mean_squared_error( test_output, predicted_values) print "R2 score for ", regressor_name, " : ", metrics.r2_score( test_output, predicted_values) print "--------------------------------\n"
if search_tree_root == None: print ("Cannot find any trace that is compliant with formula given current beam size") break output = [] if search_tree_root == None: predicted = u"" total_predicted_time = 0 else: predicted = (search_tree_root.cropped_line[prefix_size:]) total_predicted_time = search_tree_root.total_predicted_time if len(ground_truth)>0: output.append(prefix_size) output.append(unicode(ground_truth).encode("utf-8")) output.append(unicode(predicted).encode("utf-8")) output.append(1 - distance.nlevenshtein(predicted, ground_truth)) dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth))) if dls<0: dls=0 # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case output.append(dls) output.append(1 - distance.jaccard(predicted, ground_truth)) output.append(ground_truth_t) output.append(total_predicted_time) output.append('') output.append(metrics.mean_absolute_error([ground_truth_t], [total_predicted_time])) output.append(metrics.median_absolute_error([ground_truth_t], [total_predicted_time])) spamwriter.writerow(output) print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
# Define the home directory (referred to below as ".")
HomeDir = 'D:/Karine/ArcGIS/Deal/PostStorm03112018'
#rms = sqrt(mean_squared_error(y_actual, y_predicted))

# List of sections and list of layers
rastersList = open(HomeDir + '/Python/AM_input/raster_file.txt').read().splitlines()
rastersList[0] = 'walk_DEM'

# Assign the filename: file
file = HomeDir + '/Python/AM_output/extracted_points_GCPs.csv'
# Read the file into a DataFrame : df
df = pd.read_csv(file, header=0)

# calculate RMSE in the vertical direction
RMSEz_GCP = {}
for item in rastersList:
    RMSEz_GCP[item] = sqrt(mean_squared_error(df['Field5'], df[item]))
print 'RMSEz_GCP'
print RMSEz_GCP

# calculate median absolute error
MAEz_GCP = {}
for item in rastersList:
    MAEz_GCP[item] = median_absolute_error(df['Field5'], df[item])
print 'Median Absolute Error, z GCP ='
print MAEz_GCP
#quick check on min and max predictions vs actual results np.exp(final_data1.logSalePrice).min() y_pred1.min() np.exp(final_data1.logSalePrice).max() y_pred1.max() # Regression metrics explained_variance1 = metrics.explained_variance_score( np.exp(final_data1.logSalePrice), y_pred1) mean_absolute_error1 = metrics.mean_absolute_error( np.exp(final_data1.logSalePrice), y_pred1) mse1 = metrics.mean_squared_error(np.exp(final_data1.logSalePrice), y_pred1) mean_squared_log_error1 = metrics.mean_squared_log_error( np.exp(final_data1.logSalePrice), y_pred1) median_absolute_error1 = metrics.median_absolute_error( np.exp(final_data1.logSalePrice), y_pred1) r21 = metrics.r2_score(np.exp(final_data1.logSalePrice), y_pred1) #Score results print('explained_variance: ', round(explained_variance1, 4)) print('mean_squared_log_error: ', round(mean_squared_log_error1, 4)) print('r2: ', round(r21, 4)) print('MAE: ', round(mean_absolute_error1, 4)) print('MSE: ', round(mse1, 4)) print('RMSE: ', round(np.sqrt(mse1), 4)) #Cross Validation reg_score = [] reg_score = pd.DataFrame(reg_score) reg_score['CV_MSE'] = cross_val_score(LinearRegression(), X1,
def run_experiments(server_replayer, log_name, models_folder, fold): beam_size = shared_variables.beam_size model_filename = shared_variables.extract_last_model_checkpoint( log_name, models_folder, fold, 'CFR') declare_model_filename = shared_variables.extract_declare_model_filename( log_name) log_settings_dictionary = shared_variables.log_settings[log_name] formula = log_settings_dictionary['formula'] prefix_size_pred_from = log_settings_dictionary['prefix_size_pred_from'] prefix_size_pred_to = log_settings_dictionary['prefix_size_pred_to'] start_time = time.time() # prepare the data lines, \ lines_id, \ lines_group, \ lines_t, \ lines_t2, \ lines_t3, \ lines_t4, \ maxlen, \ chars, \ chars_group, \ char_indices, \ char_indices_group, \ divisor, \ divisor2, \ divisor3, \ predict_size, \ target_indices_char, \ target_indices_char_group, \ target_char_indices, \ target_char_indices_group = prepare_testing_data(log_name) # find cycles and modify the probability functionality goes here stop_symbol_probability_amplifier_current = 1 # load model, set this to the model generated by train.py model = load_model(model_filename) # Get the predicted group symbol def get_symbol_group(predictions, vth_best=0): v = np.argsort(predictions)[len(predictions) - vth_best - 1] return target_indices_char_group[v] class NodePrediction: def __init__(self, data, trace_id, crop_line, crop_line_group, crop_times, tot_predicted_time, probability_of=0): self.data = data self.trace_id = trace_id self.cropped_line = crop_line self.cropped_line_group = crop_line_group self.cropped_times = crop_times self.total_predicted_time = tot_predicted_time self.probability_of = probability_of folder_path = shared_variables.outputs_folder + models_folder + '/' + str( fold) + '/results/LTL/' if not os.path.exists(folder_path): os.makedirs(folder_path) output_filename = folder_path + '%s_%s.csv' % (log_name, 'CFR') with open(output_filename, 'wb') as csvfile: spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) # headers for the new file spamwriter.writerow([ "Prefix length", "Ground truth", "Predicted", "Damerau-Levenshtein", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE", "Ground Truth Group", "Predicted Group", "Damerau-Levenshtein Resource" ]) # make predictions for different prefix sizes as specified in 'shared variables' for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to): print("prefix size: " + str(prefix_size)) curr_time = time.time() lines_s, \ lines_id_s, \ lines_group_s, \ lines_t_s, \ lines_t2_s, \ lines_t3_s, \ lines_t4_s = select_declare_verified_traces(server_replayer, declare_model_filename, lines, lines_id, lines_group, lines_t, lines_t2, lines_t3, lines_t4, prefix_size) print("formulas verified: " + str(len(lines_s)) + " out of : " + str(len(lines))) print('elapsed_time:', time.time() - curr_time) counterr = 0 for line, line_id, line_group, times, times2, times3, times4 in izip( lines_s, lines_id_s, lines_group_s, lines_t_s, lines_t2_s, lines_t3_s, lines_t4_s): times.append(0) cropped_line_id = line_id cropped_line = ''.join(line[:prefix_size]) cropped_line_group = ''.join(line_group[:prefix_size]) cropped_times = times[:prefix_size] cropped_times3 = times3[:prefix_size] cropped_times4 = times4[:prefix_size] if len(times2) < prefix_size: continue # make no prediction for this case, since this case has ended already # initialize root of the tree for beam search total_predicted_time_initialization = 0 search_node_root = 
NodePrediction( encode(cropped_line, cropped_line_group, cropped_times, cropped_times3, maxlen, chars, chars_group, char_indices, char_indices_group, divisor, divisor2), cropped_line_id, cropped_line, cropped_line_group, cropped_times4, total_predicted_time_initialization) ground_truth = ''.join(line[prefix_size:prefix_size + predict_size]) ground_truth_group = ''.join( line_group[prefix_size:prefix_size + predict_size]) ground_truth_t = times2[prefix_size - 1] case_end_time = times2[len(times2) - 1] ground_truth_t = case_end_time - ground_truth_t queue_next_steps = PriorityQueue() queue_next_steps.put( (-search_node_root.probability_of, search_node_root)) queue_next_steps_future = PriorityQueue() start_of_the_cycle_symbol = " " found_satisfying_constraint = False current_beam_size = beam_size current_prediction_premis = None for i in range(predict_size): for k in range(current_beam_size): if queue_next_steps.empty(): break _, current_prediction_premis = queue_next_steps.get() if not found_satisfying_constraint: if server_replayer.verify_formula_as_compliant( current_prediction_premis.cropped_line, formula, prefix_size): # the formula verified and we can just finish the predictions # beam size is 1 because predict only sequence of events current_beam_size = 1 current_prediction_premis.probability_of = 0.0 # overwrite new queue queue_next_steps_future = PriorityQueue() found_satisfying_constraint = True enc = current_prediction_premis.data temp_cropped_line = current_prediction_premis.cropped_line y = model.predict(enc, verbose=0) # make predictions # split predictions into seperate activity and time predictions y_char = y[0][0] y_group = y[1][0] y_t = y[2][0][0] if y_t < 0: y_t = 0 cropped_times.append(y_t) if not i == 0: stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = \ amplify(temp_cropped_line) # in not reached, function :choose_next_top_descendant: will backtrack y_t = y_t * divisor3 cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t)) for j in range(current_beam_size): temp_prediction = get_symbol_ampl( y_char, target_indices_char, target_char_indices, start_of_the_cycle_symbol, stop_symbol_probability_amplifier_current, j) temp_prediction_group = get_symbol_group(y_group) # end of case was just predicted, therefore, stop predicting further into the future if temp_prediction == '!': if server_replayer.verify_formula_as_compliant( temp_cropped_line, formula, prefix_size): stop_symbol_probability_amplifier_current = 1 # print('! 
predicted, end case') queue_next_steps = PriorityQueue() break else: continue temp_cropped_line = current_prediction_premis.cropped_line + temp_prediction temp_cropped_line_group = \ current_prediction_premis.cropped_line_group + temp_prediction_group # adds a fake timestamp to the list t = time.strptime(cropped_times4[-1], "%Y-%m-%d %H:%M:%S") new_timestamp = datetime.fromtimestamp( time.mktime(t)) + timedelta(0, 2000) cropped_times4.append( new_timestamp.strftime("%Y-%m-%d %H:%M:%S")) temp_total_predicted_time = current_prediction_premis.total_predicted_time + y_t temp_state_data = encode( temp_cropped_line, temp_cropped_line_group, cropped_times, cropped_times3, maxlen, chars, chars_group, char_indices, char_indices_group, divisor, divisor2) probability_this = np.sort(y_char)[len(y_char) - 1 - j] temp = NodePrediction( temp_state_data, cropped_line_id, temp_cropped_line, temp_cropped_line_group, cropped_times4, temp_total_predicted_time, current_prediction_premis.probability_of + np.log(probability_this)) queue_next_steps_future.put( (-temp.probability_of, temp)) # print 'INFORMATION: ' + str(counterr) + ' ' + str(i) + ' ' + str(k) + ' ' + str(j) + ' ' + \ # temp_cropped_line[prefix_size:] + " " + str(temp.probability_of) queue_next_steps = queue_next_steps_future queue_next_steps_future = PriorityQueue() counterr += 1 if current_prediction_premis is None: print "Cannot find any trace that is compliant with formula given current beam size" break output = [] if current_prediction_premis is None: predicted = u"" predicted_group = u"" total_predicted_time = 0 else: predicted = ( current_prediction_premis.cropped_line[prefix_size:]) predicted_group = (current_prediction_premis. cropped_line_group[prefix_size:]) total_predicted_time = current_prediction_premis.total_predicted_time if len(ground_truth) > 0: output.append(prefix_size) output.append(unicode(ground_truth).encode("utf-8")) output.append(unicode(predicted).encode("utf-8")) output.append( 1 - distance.nlevenshtein(predicted, ground_truth)) output.append(1 - distance.jaccard(predicted, ground_truth)) output.append(ground_truth_t) output.append(total_predicted_time) output.append('') output.append( metrics.mean_absolute_error([ground_truth_t], [total_predicted_time])) output.append( metrics.median_absolute_error([ground_truth_t], [total_predicted_time])) output.append(unicode(ground_truth_group).encode("utf-8")) output.append(unicode(predicted_group).encode("utf-8")) output.append(1 - distance.nlevenshtein( predicted_group, ground_truth_group)) spamwriter.writerow(output) print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
5 1.3 2.33 6 1.9 1.64 7 4.6 2.55 8 1.3 1.64 9 3.1 2.35 """ dataFrameWithTestVariables.head(50).plot() plt.savefig("charts/dataFrameWithTestVariables_CO(GT).png") plt.show() print("R^2 score: ", trainRegression.score(X_test, y_test)) #info errors print('Mean Squared Error: ', metrics.mean_squared_error(y_test, yNew)) print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, yNew)) print('Median Absolute Error: ', metrics.median_absolute_error(y_test, yNew)) # Plots for month, weekDay and hours level of CO airData.pivot_table(index='month', values='CO(GT)', aggfunc='mean').plot(kind='bar') plt.savefig("charts/month_CO(GT).png" ) #Save plot as a image - will be included in report airData.pivot_table(index='weekdayName', values='CO(GT)', aggfunc='mean').plot(kind='bar', color='g') plt.savefig("charts/weekDay_CO(GT).png") airData.pivot_table(index='hours', values='CO(GT)', aggfunc='mean').plot(kind='bar', color='y') plt.savefig("charts/hours_CO(GT).png") plt.show() # Plots for month, weekDay and hours level of C6H6
#Add "features" from previous year ##clean_data[['Ownership Type','NERC Region']] = alldata[i-1]['ops'].loc[clean_data.index,slice('Ownership Type','NERC Region')].copy() # alldata[i]['rel']['SAIDI With MED'].fillna(alldata[i]['rel']['SAIDI With MED.1'],inplace=True) # temp_ops = alldata['2013']['ops'].set_index(['Utility Name','State']) # temp_ops.loc[temp['SAIDI With MED'].index,:] # # alldata['2014']['rel'].loc[(slice(None),'WA'),:] ''' Super Simple Baseline Predictor: predict average value from prev. year ''' val_actual = clean_res['2015'].loc[:,'SAIDI With MED'] prev_avg = np.average(clean_res['2014'].loc[:,'SAIDI With MED']) worst = prev_avg*np.ones(val_actual.shape) basic_MSE = metrics.mean_squared_error(val_actual.values.reshape(-1,1), worst) basic_MAE = metrics.median_absolute_error(val_actual.values.reshape(-1,1), worst) print("Worst case MSE:",basic_MSE) print("Worst case MAE:",basic_MAE) #quick test to check baseline for predicting the utilities prev year SAIDI test14 = clean_res['2014'].copy() test15 = clean_res['2015'].copy() testcombo = pd.concat([test14,test15],axis=1,join_axes=[test14.index]) test_act = testcombo['SAIDI With MED'].values[:,0] test_pred = testcombo['SAIDI With MED'].fillna(0).values[:,1] metrics.mean_squared_error(test_act, test_pred) #548292.735939868 metrics.median_absolute_error(test_act, test_pred) #48.058499999999995 #plot mean/var of each year to understand underlying trends avg_saidi_MED = {yr:np.average(clean_res[yr].loc[:,'SAIDI With MED']) for yr in pred_year}
import sklearn.metrics as sm

x, y = np.loadtxt('./ml_data/abnormal.txt', delimiter=',', unpack=True, usecols=(0, 1))
# Turn the input into a 2-D array: one row per sample, one column per feature
x = x.reshape(-1, 1)  # becomes n rows, 1 column
model = lm.Ridge(150, fit_intercept=True, max_iter=1000)
model.fit(x, y)
pred_y = model.predict(x)  # feed the samples x into the model to get the predicted y

# Print the model's evaluation metrics
print('Mean absolute error:', sm.mean_absolute_error(y, pred_y))
print('Mean squared error:', sm.mean_squared_error(y, pred_y))
print('Median absolute error:', sm.median_absolute_error(y, pred_y))
print('R2 score:', sm.r2_score(y, pred_y))
# Output: Mean absolute error: 1.0717908951634179
#         Mean squared error: 3.7362971803503267
#         Median absolute error: 0.696470799282414
#         R2 score: 0.44530850891980656

# Plot the results
mp.figure("Linear Regression", facecolor='lightgray')
mp.title('Linear Regression', fontsize=16)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.xlabel('x')
mp.ylabel('y')
mp.scatter(x, y, s=60, marker='o', c='dodgerblue', label='Points')
with open(os.path.dirname(__file__) + "/single.txt", "r") as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])

x = np.array(x)
y = np.array(y)
model = lm.LinearRegression()  # build a linear regressor
model.fit(x, y)
pred_y = model.predict(x)
print("Mean absolute error", sm.mean_absolute_error(y, pred_y))
print("Mean squared error", sm.mean_squared_error(y, pred_y))
print("Median absolute error", sm.median_absolute_error(y, pred_y))

# Save the model
with open(os.path.dirname(__file__) + "/linear.pkl", "wb") as f:
    pickle.dump(model, f)

plt.figure("Linear Regression", facecolor="lightgray")
plt.title("Linear Regression", fontsize=14)
plt.xlabel("x", fontsize=12)
plt.ylabel("y", fontsize=12)
plt.tick_params(labelsize=10)
plt.grid(linestyle=":")
plt.scatter(x, y, c="dodgerblue", alpha=0.75, s=60, label='Sample')
sorted_indices = x.T[0].argsort()
def validate(self, model, val_data_loader, curr_epoch): batch_time_v = AverageMeter() data_time_v = AverageMeter() losses_v = AverageMeter() metric_values_v = AverageMeter() if self.goal_type == 'classification': all_result_v = np.zeros((0, self.cfg.model.n_classes)) elif self.goal_type == 'ordinal-regression': all_result_v = np.zeros((0)) elif self.goal_type == 'regression': all_result_v = np.zeros((0)) all_integer_outputs = np.zeros((0)) all_target_v = np.zeros((0)) all_uids_v = np.zeros((0)) all_loss_v = np.zeros((0)) end_time_v = time.time() model.eval() for k, (inputs_v, targets_v, _, uids_v, _) in enumerate(val_data_loader): # Update timer for data retrieval data_time_v.update(time.time() - end_time_v) # Move input to correct self.device if len(inputs_v) > 1: for p, inp in enumerate(inputs_v): inputs_v[p] = inp.to(self.device, non_blocking=True) else: inputs_v = inputs_v.to(self.device, non_blocking=True) if self.goal_type == 'classification': targets_v = targets_v.long().squeeze() targets_v = targets_v.to(self.device, non_blocking=True) with torch.no_grad(): # Get model validation output and validation loss with autocast(enabled=self.use_half_prec): outputs_v = model(inputs_v) if self.goal_type == 'ordinal-regression': loss_v = self.criterion(outputs_v, targets_v, model.thresholds) else: loss_v = self.criterion(outputs_v, targets_v) loss_mean_v = loss_v.mean() # Update timer for batch batch_time_v.update(time.time() - end_time_v) # Update metrics if self.cfg.evaluation.use_best_sample: if self.goal_type == 'classification': all_result_v = np.concatenate( (all_result_v, F.softmax(outputs_v, dim=1).cpu().detach().numpy())) all_target_v = np.concatenate( (all_target_v, targets_v.cpu().detach().numpy())) all_loss_v = np.concatenate( (all_loss_v, loss_v.cpu().detach().numpy())) elif self.goal_type == 'ordinal-regression': all_result_v = np.concatenate( (all_result_v, outputs_v.squeeze(axis=1).cpu().detach().numpy())) all_target_v = np.concatenate( (all_target_v, targets_v.squeeze(axis=1).cpu().detach().numpy())) all_loss_v = np.concatenate( (all_loss_v, loss_v.cpu().squeeze(axis=0).detach().numpy())) elif self.goal_type == 'regression': all_result_v = np.concatenate( (all_result_v, outputs_v.squeeze(axis=1).cpu().detach().numpy())) all_target_v = np.concatenate( (all_target_v, targets_v.squeeze(axis=1).cpu().detach().numpy())) all_loss_v = np.concatenate( (all_loss_v, loss_v.cpu().squeeze(axis=1).detach().numpy())) # Convert model output to nearest integer of % 5 == 0 out_v = (np.around( outputs_v.squeeze(axis=1).cpu().detach().numpy() / 5, decimals=0) * 5).astype(np.int) out_v[out_v < 20] = 20 out_v[out_v > 70] = 70 all_integer_outputs = np.concatenate( (all_integer_outputs, out_v)) all_uids_v = np.concatenate((all_uids_v, uids_v)) if k % 100 == 0 and is_master(): print( 'Validation Batch: [{}/{}] in epoch: {} \t ' 'Validation Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) \t ' 'Validation Data Time: {data_time.val:.3f} ({data_time.avg:.3f}) \t ' .format(k + 1, len(val_data_loader), curr_epoch + 1, batch_time=batch_time_v, data_time=data_time_v)) else: metric_targets_v = targets_v.cpu().detach().numpy() metric_outputs_v = outputs_v.cpu().detach().numpy() if self.goal_type == 'regression': metric_v = r2_score(metric_targets_v, metric_outputs_v) elif self.goal_type == 'classification': predictions_v = np.argmax(metric_outputs_v, 1) metric_v = accuracy_score(metric_targets_v, predictions_v) elif self.goal_type == 'ordinal-regression': labels_v = get_ORAT_labels(metric_outputs_v, 
model.thresholds) metric_v = accuracy_score(metric_targets_v, labels_v) metric_values_v.update(metric_v) losses_v.update(loss_mean_v) if k % 100 == 0 and is_master(): print(( 'Validation Batch: [{}/{}] in epoch: {} \t ' 'Validation Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) \t ' 'Validation Data Time: {data_time.val:.3f} ({data_time.avg:.3f}) \t ' 'Validation Loss: {loss.val:.4f} ({loss.avg:.4f}) \t ' 'Validation' + self.metric_name + ': {metric.val:.3f} ({metric.avg:.3f}) \t').format( k + 1, len(val_data_loader), curr_epoch + 1, batch_time=batch_time_v, data_time=data_time_v, loss=losses_v, metric=metric_values_v)) end_time_v = time.time() res = {} if self.cfg.evaluation.use_best_sample: # As results are over all possible combinations of views in each examination # each different combination needs to have a weight equal to its ratio. val_data = np.array((all_uids_v, all_target_v, all_loss_v)) val_data = val_data.transpose(1, 0) pd_val_data = pd.DataFrame(val_data, columns=['us_id', 'target', 'loss']) pd_val_data[['target', 'loss']] = pd_val_data[['target', 'loss']].astype(np.float32) val_ue = pd_val_data.drop_duplicates(subset='us_id')[[ 'us_id', 'target' ]] all_mean_loss = [] for ue in val_ue.itertuples(): exam_results = pd_val_data[pd_val_data['us_id'] == ue.us_id] num_combinations = len(exam_results) weight = 1 / num_combinations mean_exam_loss = exam_results['loss'].mean() all_mean_loss.append(mean_exam_loss) for indx in exam_results.index: pd_val_data.loc[indx, 'metric_weight'] = weight np_loss = np.array(all_mean_loss, dtype=np.float32) loss_mean_v = np_loss.mean() targets = pd_val_data['target'].to_numpy() results = all_result_v weights = pd_val_data['metric_weight'].to_numpy() if self.goal_type == 'regression': metric_v = r2_score(targets, results, sample_weight=weights) metric_v_r2_integer = r2_score(targets, all_integer_outputs, sample_weight=weights) metric_v_mean_ae = mean_absolute_error(targets, all_integer_outputs, sample_weight=weights) metric_v_median_ae = median_absolute_error( targets, all_integer_outputs, sample_weight=weights) val_mse = mean_squared_error(targets, all_integer_outputs, sample_weight=weights) target_classes = np.array( [convert_EF_to_classes(t) for t in targets]).astype(np.int) pred_classes = np.array( [convert_EF_to_classes(p) for p in results]).astype(np.int) res['val/r2'] = metric_v res['val/accuracy'] = accuracy_score(target_classes, pred_classes, sample_weight=weights) elif self.goal_type == 'classification': predictions_v = np.argmax(results, 1) metric_v = accuracy_score(targets.astype(np.int), predictions_v, sample_weight=weights) preds_ef = np.array([ convert_classes_to_EF(p) for p in predictions_v ]).astype(np.int) targets_ef = np.array( [convert_classes_to_EF(c) for c in targets]).astype(np.int) val_mse = mean_squared_error(targets_ef, preds_ef, sample_weight=weights) metric_v_r2_integer = r2_score(targets_ef, preds_ef, sample_weight=weights) metric_v_mean_ae = mean_absolute_error(targets_ef, preds_ef, sample_weight=weights) metric_v_median_ae = median_absolute_error( targets_ef, preds_ef, sample_weight=weights) res['val/accuracy'] = metric_v elif self.goal_type == 'ordinal-regression': labels_v = get_ORAT_labels(results, model.thresholds) metric_v = accuracy_score(targets, labels_v) preds_ef = np.array([ convert_classes_to_EF(p) for p in labels_v ]).astype(np.int) targets_ef = np.array( [convert_classes_to_EF(c) for c in targets]).astype(np.int) val_mse = mean_squared_error(targets_ef, preds_ef, sample_weight=weights) 
metric_v_r2_integer = r2_score(targets_ef, preds_ef, sample_weight=weights) metric_v_mean_ae = mean_absolute_error(targets_ef, preds_ef, sample_weight=weights) metric_v_median_ae = median_absolute_error( targets_ef, preds_ef, sample_weight=weights) res['val/accuracy'] = metric_v else: loss_mean_v = losses_v.avg metric_v = metric_values_v.avg res['val/mse_integer'] = val_mse res['val/r2_integer'] = metric_v_r2_integer res['val/mean_ae'] = metric_v_mean_ae res['val/median_ae'] = metric_v_median_ae res['val/loss'] = loss_mean_v # End of validation epoch prints and updates if is_master(): print(('Finished Validation Epoch: {} \t ' 'Validation Time: {batch_time.avg:.3f} \t ' 'Validation Data Time: {data_time.avg:.3f} \t ' 'Validation Loss: {loss:.4f} \t ' 'Validation ' + self.metric_name + ': {metric:.3f} \t').format(curr_epoch + 1, batch_time=batch_time_v, data_time=data_time_v, loss=loss_mean_v, metric=metric_v)) if self.goal_type == 'classification': outputs_v = torch.tensor(predictions_v) else: outputs_v = torch.squeeze(outputs_v) print('Example targets: {} \n Example outputs: {}'.format( torch.squeeze(targets_v), outputs_v)) return res
train1, test1, train2, test2 = cross_validation.train_test_split( split_data, data.price, test_size=0.4, train_size=0.6, random_state=13) # mean of prices mean = np.mean(data.price) # standard deviation to compare std = np.std(data.price) print("mean: " + str(mean)) print("standard deviation: " + str(std)) # linear regression testing linear_reg = linear_model.LinearRegression() linear_reg.fit(train1, train2) linear_reg_error = metrics.median_absolute_error(test2, linear_reg.predict(test1)) # ridge model testing ridge = linear_model.Ridge() ridge.fit(train1, train2) ridge_error = metrics.median_absolute_error(test2, ridge.predict(test1)) print("Linear Regression: " + str(linear_reg_error)) print("Ridge: " + str(ridge_error)) # ada boost regressor param_names = ["n_estimators", "learning_rate", "loss"] param_values = [[1], [1, 2], ['linear']] parameters = dict(zip(param_names, param_values)) abr = GridSearchCV(ensemble.AdaBoostRegressor(),
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=33)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train)

import skflow

tf_lr = skflow.TensorFlowLinearRegressor(steps=10000, learning_rate=0.01, batch_size=50)
tf_lr.fit(X_train, y_train)
tf_lr_y_predict = tf_lr.predict(X_test)

# ground truth goes first; note the second metric is the median (not mean squared) absolute error
print(
    'The mean absolute error of Tensorflow Linear Regressor on boston dataset is ',
    metrics.mean_absolute_error(y_test, tf_lr_y_predict))
print(
    'The median absolute error of Tensorflow Linear Regressor on boston dataset is ',
    metrics.median_absolute_error(y_test, tf_lr_y_predict))
print(
    'The R-squared value of Tensorflow Linear Regressor on boston dataset is ',
    metrics.r2_score(y_test, tf_lr_y_predict))
x = [] y = [] file_name = 'data/data_single.txt' with open(file_name, 'r') as fp: for line in fp.readlines(): xt, yt = [float(i) for i in line.split(',')] x.append(xt) y.append(yt) num_training = int(0.8 * len(x)) num_test = len(x) - num_training x_train = np.array(x[:num_training]).reshape((num_training, 1)) y_train = np.array(y[:num_training]) x_test = np.array(x[num_training:]).reshape((num_test, 1)) y_test = np.array(y[num_training:]) lr = linear_model.LinearRegression() lr.fit(x_train, y_train) y_test_prob = lr.predict(x_test) print('mean absolute err:', round(sm.mean_absolute_error(y_test, y_test_prob), 2)) print('mean squared err:', round(sm.mean_squared_error(y_test, y_test_prob), 2)) print('median absolute err:', round(sm.median_absolute_error(y_test, y_test_prob), 2)) print('variance:', round(sm.explained_variance_score(y_test, y_test_prob), 2)) print('r2:', round(sm.r2_score(y_test, y_test_prob), 2))
def create_final_model(input_data_dir, output_data_dir, output_model_summaries_dir): interim_modeling_df = pd.read_csv( "{}/step3_interim_modeling_data.csv".format(input_data_dir)) pred_df, response_df, invalid_preds, response_var = get_pred_and_response_dfs( interim_modeling_df) model_1_pred_train, model_1_pred_test, model_1_response_train, model_1_response_test = \ train_test_split(pred_df, response_df, test_size=0.5, random_state=223) model_2_pred_train = model_1_pred_test model_2_pred_test = model_1_pred_train model_2_response_train = model_1_response_test model_2_response_test = model_1_response_train final_preds = [ #"school_name", "dbn", # only incl these two for convenience in ID'ing rows "Average ELA Proficiency", "pct_math_level_3_or_4_2017_city_diff", "sa_attendance_90plus_2017", "pct_8th_graders_w_hs_credit_2017_city_diff", "min_dist_to_big_three" # "Collaborative Teachers Rating_Approaching Target", # "Collaborative Teachers Rating_Exceeding Target", # "Collaborative Teachers Rating_Not Meeting Target", # "Collaborative Teachers Rating_nan", # "Major N_proportion" ] model_1_train_pred_df = model_1_pred_train[final_preds] model_1_test_pred_df = model_1_pred_test[final_preds] model_2_train_pred_df = model_2_pred_train[final_preds] model_2_test_pred_df = model_2_pred_test[final_preds] stdized_model_1_train, stdized_model_1_test = standardize_cols( model_1_train_pred_df, model_1_test_pred_df) stdized_model_2_train, stdized_model_2_test = standardize_cols( model_2_train_pred_df, model_2_test_pred_df) model_1 = linear_model.LinearRegression().fit( stdized_model_1_train, model_1_response_train[response_var]) model_1_train_predicted = model_1.predict(stdized_model_1_train) model_1_test_predicted = model_1.predict(stdized_model_1_test) model_1_response_train[ "predicted_perc_testtakers"] = model_1_train_predicted model_1_response_test["predicted_perc_testtakers"] = model_1_test_predicted model_1_coefficients = pd.concat([ pd.DataFrame(stdized_model_1_train.columns), pd.DataFrame(np.transpose(model_1.coef_)) ], axis=1) model_1_coefficients.columns = ["model_1_pred_name", "model_1_coef"] model_1_full_train_df = pd.concat( [model_1_response_train, stdized_model_1_train], axis=1) model_1_full_test_df = pd.concat( [model_1_response_test, stdized_model_1_test], axis=1) model_2 = linear_model.LinearRegression().fit( stdized_model_2_train, model_2_response_train[response_var]) model_2_train_predicted = model_1.predict(stdized_model_2_train) model_2_test_predicted = model_2.predict(stdized_model_2_test) model_2_response_train[ "predicted_perc_testtakers"] = model_2_train_predicted model_2_response_test["predicted_perc_testtakers"] = model_2_test_predicted model_2_coefficients = pd.concat([ pd.DataFrame(stdized_model_2_train.columns), pd.DataFrame(np.transpose(model_2.coef_)) ], axis=1) model_2_coefficients.columns = ["model_2_pred_name", "model_2_coef"] model_2_full_train_df = pd.concat( [model_2_response_train, stdized_model_2_train], axis=1) model_2_full_test_df = pd.concat( [model_2_response_test, stdized_model_2_test], axis=1) final_train_set = pd.concat([model_1_full_train_df, model_2_full_train_df]) final_test_set = pd.concat([model_1_full_test_df, model_2_full_test_df]) final_train_set.to_csv("{}/full_train_dataset.csv".format(output_data_dir), index=False) final_test_set.to_csv("{}/full_test_dataset.csv".format(output_data_dir), index=False) model_1_coefficients.to_csv( "{}/model_1_coefficients.csv".format(output_model_summaries_dir), index=False) model_2_coefficients.to_csv( 
"{}/model_2_coefficients.csv".format(output_model_summaries_dir), index=False) final_r2 = metrics.r2_score(final_test_set["perc_testtakers"], final_test_set["predicted_perc_testtakers"]) final_median_abs_err = metrics.median_absolute_error( final_test_set["perc_testtakers"], final_test_set["predicted_perc_testtakers"]) print("\n\nmodel_1_coefficients: ") print(model_1_coefficients) print("\nmodel_2_coefficients: ") print(model_2_coefficients) print("\n\n** r2 of entire dataset: {}".format(final_r2)) print("** median_absolute_error of entire dataset: {}".format( final_median_abs_err)) print( "\n\ndropped output to folder: {}".format(output_model_summaries_dir))
def compute(labels, pred_scores): return median_absolute_error(labels, pred_scores)
def median_absolute_error(self): if self.multi_output is not None: print("Median absolute error is not supported for multi output") return None temp = median_absolute_error(self.y_true, self.y_pred) self.score_meae = np.round(temp, self.number_rounding)
def GetError(model, data, scaler, actual):
    scaled_data = scaler.transform(data)
    preds = model.predict(scaled_data)
    mse = metrics.mean_squared_error(actual, preds)
    # note: this is the *median* absolute error, hence the variable name
    medae = metrics.median_absolute_error(actual, preds)
    return mse, medae
rf = RandomForestRegressor(n_estimators=101) ada = AdaBoostRegressor(n_estimators=101) grad = GradientBoostingRegressor(n_estimators=101) bagging = BaggingRegressor(n_estimators=101) bayes = BayesianRidge() regressors = [knn,rf,ada,grad,bagging,bayes] regressor_names = ["KNN","Random Forests","AdaBoost","Gradient Boost","Bagging","Bayes"] X_train,X_test,y_train,y_test = cross_validation.train_test_split(data.values,predictor.values,test_size=0.2) feature_importances = [] for regressor,name in zip(regressors,regressor_names): regressor.fit(X_train,y_train) predicted_values = regressor.predict(X_test) if name == "Random Forests": feature_importances = regressor.feature_importances_ print "---------------------------\n" print "Absolute Mean Error for ", name , " : ", metrics.mean_absolute_error(y_test,predicted_values) print "Median Error for ", name ," : ", metrics.median_absolute_error(y_test,predicted_values) print "Mean Squared Error for ",name," : ",metrics.mean_squared_error(y_test,predicted_values) print "R2 Score for ",name," : ",metrics.r2_score(y_test,predicted_values) print "---------------------------\n" print "\n------------- Feature Importances------------\n" for name,importance in zip(data.columns,feature_importances): print name," : ",importance
for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = output[train_index], output[test_index]
    accuracy.append(np.std(y_test))

kf = KFold(len(data), n_folds=5)
for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = output[train_index], output[test_index]
    svm_model = svm.SVR()
    svm_model.fit(X_train, y_train)
    accuracy.append(np.sqrt(np.mean((svm_model.predict(X_test) - y_test) ** 2)))
    absolute_error.append(
        median_absolute_error(y_test, svm_model.predict(X_test)))

kf = KFold(len(data), n_folds=5)
for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = output[train_index], output[test_index]
    lasso = linear_model.Lasso(alpha=0.01)
    lasso.fit(X_train, y_train)
    accuracy.append(np.sqrt(np.mean((lasso.predict(X_test) - y_test) ** 2)))
    absolute_error.append(median_absolute_error(y_test, lasso.predict(X_test)))

kf = KFold(len(data), n_folds=5)
for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = output[train_index], output[test_index]
pca = PCA(n_components=2)
X_Modeled_test = pca.fit_transform(X_Modeled_test)

# Prediction using Gaussian Process
y_pred, cov = gp.predict(y_train, X_Modeled_test)

from sklearn.metrics import explained_variance_score, mean_absolute_error, \
    mean_squared_error, median_absolute_error, r2_score

# The best possible score is 1.0; lower values are worse.
print("Score of variance : {}".format(explained_variance_score(y_test, y_pred)))
print("Score of mean absolute error : {}".format(
    mean_absolute_error(y_test, y_pred)))
print("Score of mean squared error : {}".format(
    mean_squared_error(y_test, y_pred)))
print("Score of median absolute error : {}".format(
    median_absolute_error(y_test, y_pred)))
print("Score of r2 score : {}".format(r2_score(y_test, y_pred)))

# Save the results into raw files; open each file once and write one value
# per line (reopening in "w" mode inside the loop would overwrite the file).
y_pred = y_pred.tolist()
y_test = y_test.tolist()
with open("predict_temp", "w", encoding="utf8") as file_open:
    for element in y_pred:
        file_open.write(str(element))
        file_open.write('\n')
with open("real_temp", "w", encoding="utf8") as file_open:
    for element in y_test:
        file_open.write(str(element))
plt.show()

Y_test_pred = linear_regressor.predict(X_test)

plt.figure()
plt.scatter(X_test, Y_test, color='green')
plt.plot(X_test, Y_test_pred, color='black', linewidth=4)
plt.title('Test Data')
plt.xticks(())
plt.yticks(())
plt.show()

# Measure performance
print('Mean absolute error =', round(sm.mean_absolute_error(Y_test, Y_test_pred), 2))
print('Mean squared error =', round(sm.mean_squared_error(Y_test, Y_test_pred), 2))
print('Median absolute error =', round(sm.median_absolute_error(Y_test, Y_test_pred), 2))
print('Explained variance score =', round(sm.explained_variance_score(Y_test, Y_test_pred), 2))
print('R2 score =', round(sm.r2_score(Y_test, Y_test_pred), 2))

# Model persistence
output_model_file = '3_model_linear_regr.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)

y_test_pred_new = model_linregr.predict(X_test)
print('\nNew mean absolute error =', round(sm.mean_absolute_error(Y_test, y_test_pred_new), 2))
    fit_intercept=True, max_iter=10000)

# Train the model using the training sets
ridge_regressor.fit(X_train, Y_train)

# Test
y_test_pred_ridge = ridge_regressor.predict(X_test)

# Error metrics
print('Mean absolute error =', round(sm.mean_absolute_error(Y_test, y_test_pred_ridge), 2))
print('Mean squared error =', round(sm.mean_squared_error(Y_test, y_test_pred_ridge), 2))
print('Median absolute error =', round(sm.median_absolute_error(Y_test, y_test_pred_ridge), 2))
print('Explained variance score =', round(sm.explained_variance_score(Y_test, y_test_pred_ridge), 2))
print('R2 score =', round(sm.r2_score(Y_test, y_test_pred_ridge), 2))

# Collapse each training sample to the mean of its features
x = []
for row in X_train:
    total = 0
    count = 0
    for a in row:
        total += a
        count += 1
    x.append(total / count)

x_test = []
# Let's go back for now...
regressor = LinearRegression()
print("Swapped back to linear regression!")

regressor.fit(X_train, y_train)
print("Regressor fitted...")

predicted = regressor.predict(X_test)
print("Predictions made for X_test...")

# Definitions from http://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.metrics import median_absolute_error, r2_score

# Median absolute error is the median of all absolute differences between the
# target and the prediction. Lower is better; a large value indicates a big
# gap between target and prediction.
medae = median_absolute_error(y_test, predicted)
print("Median absolute error: {:.3g}".format(medae))

# R2 score is the coefficient of determination. The best possible score is 1.0
# (it can be negative for arbitrarily bad models); it measures how well future
# samples are likely to be predicted.
r2 = r2_score(y_test, predicted)
print("r2 score: {:.3g}".format(r2))

# Plot outputs, compare actual vs predicted values
# import matplotlib.pyplot as plt
#
# plt.scatter(
#     y_test,
#     predicted,
#     color='blue',
#     linewidth=1
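# To make the two metric definitions above concrete, here is a small,
# self-contained example (toy arrays borrowed from the scikit-learn docs),
# not part of the original script:
from sklearn.metrics import median_absolute_error, r2_score

y_true_demo = [3, -0.5, 2, 7]
y_pred_demo = [2.5, 0.0, 2, 8]

# Absolute errors are [0.5, 0.5, 0.0, 1.0], so the median is 0.5
print(median_absolute_error(y_true_demo, y_pred_demo))  # 0.5

# R2 = 1 - SS_res / SS_tot; here approximately 0.9486
print(r2_score(y_true_demo, y_pred_demo))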
history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_valid, y_valid),
                    callbacks=[earlystop],
                    verbose=1)

# Test the model
X_test = test_inputs['X']
y1_test = test_inputs['target_load']
y1_preds = model.predict(X_test)
y1_test = y_scaler.inverse_transform(y1_test)
y1_preds = y_scaler.inverse_transform(y1_preds)
y1_test, y1_preds = flatten_test_predict(y1_test, y1_preds)

mse = mean_squared_error(y1_test, y1_preds)
rmse_predict = RMSE(mse)
evs = explained_variance_score(y1_test, y1_preds)
mae = mean_absolute_error(y1_test, y1_preds)
msle = mean_squared_log_error(y1_test, y1_preds)
meae = median_absolute_error(y1_test, y1_preds)
r_square = r2_score(y1_test, y1_preds)
mape_v = mape(y1_preds.reshape(-1, 1), y1_test.reshape(-1, 1))

print("mse:", mse, 'rmse_predict:', rmse_predict, "mae:", mae, "mape:", mape_v,
      "r2:", r_square, "msle:", msle, "meae:", meae, "evs:", evs)
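# The snippet above calls RMSE() and mape() helpers that are defined elsewhere
# in the original script. A minimal sketch of what such helpers could look
# like follows; the names and signatures are assumptions, not the original code.
import numpy as np

def RMSE(mse):
    # Root mean squared error from an already-computed MSE
    return np.sqrt(mse)

def mape(y_pred, y_true):
    # Mean absolute percentage error (in percent); assumes y_true has no zeros
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y_true = np.asarray(y_true, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100.0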
def test_regression_metrics_at_limits():
    assert_almost_equal(mean_squared_error([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_squared_error([0.0], [0.0], squared=False), 0.0)
    assert_almost_equal(mean_squared_log_error([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_absolute_error([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_pinball_loss([0.0], [0.0]), 0.0)
    assert_almost_equal(mean_absolute_percentage_error([0.0], [0.0]), 0.0)
    assert_almost_equal(median_absolute_error([0.0], [0.0]), 0.0)
    assert_almost_equal(max_error([0.0], [0.0]), 0.0)
    assert_almost_equal(explained_variance_score([0.0], [0.0]), 1.0)
    assert_almost_equal(r2_score([0.0, 1], [0.0, 1]), 1.0)

    err_msg = ("Mean Squared Logarithmic Error cannot be used when targets "
               "contain negative values.")
    with pytest.raises(ValueError, match=err_msg):
        mean_squared_log_error([-1.0], [-1.0])
    err_msg = ("Mean Squared Logarithmic Error cannot be used when targets "
               "contain negative values.")
    with pytest.raises(ValueError, match=err_msg):
        mean_squared_log_error([1.0, 2.0, 3.0], [1.0, -2.0, 3.0])
    err_msg = ("Mean Squared Logarithmic Error cannot be used when targets "
               "contain negative values.")
    with pytest.raises(ValueError, match=err_msg):
        mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0])

    # Tweedie deviance error
    power = -1.2
    assert_allclose(mean_tweedie_deviance([0], [1.0], power=power),
                    2 / (2 - power), rtol=1e-3)
    with pytest.raises(ValueError,
                       match="can only be used on strictly positive y_pred."):
        mean_tweedie_deviance([0.0], [0.0], power=power)
    assert_almost_equal(mean_tweedie_deviance([0.0], [0.0], power=0), 0.00, 2)

    msg = "only be used on non-negative y and strictly positive y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=1.0)

    power = 1.5
    assert_allclose(mean_tweedie_deviance([0.0], [1.0], power=power),
                    2 / (2 - power))
    msg = "only be used on non-negative y and strictly positive y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)

    power = 2.0
    assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power),
                    0.00, atol=1e-8)
    msg = "can only be used on strictly positive y and y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)

    power = 3.0
    assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power),
                    0.00, atol=1e-8)
    msg = "can only be used on strictly positive y and y_pred."
    with pytest.raises(ValueError, match=msg):
        mean_tweedie_deviance([0.0], [0.0], power=power)

    with pytest.raises(ValueError,
                       match="is only defined for power<=0 and power>=1"):
        mean_tweedie_deviance([0.0], [0.0], power=0.5)
def fit(self, models="aefnsthd", lambday=None, atomic_arguments=None, weights=['equal', 'errors', 'cv.errors'], error_method=['MAE', 'MSE', 'MSLE', 'MEAE', 'RMSE'], cv_horizon=None, window_size=84, horizon_average=False, parallel=False, period=None): logging.info( 'Fitting with HybridForecast : models = {}'.format(models)) if len(self.ts) < 4: logging.fatal( "The input time series must have 4 or more observations.") return self.fitted self.setTimeSeries(period) rfreq = int(ro.r('frequency(r_timeseries)')[0]) # Note that if rfreq > 1 at this point, we want to also change atomic_arguments for 's' and 'h' to # have the argument 'period'. if atomic_arguments is None: atomic_arguments = dict() if rfreq > 1: if atomic_arguments.get('s', None) == None: atomic_arguments['s'] = {'period': rfreq} else: atomic_arguments['s'].update({'period': rfreq}) if atomic_arguments.get('h', None) == None: atomic_arguments['h'] = {'period': rfreq} else: atomic_arguments['h'].update({'period': rfreq}) if atomic_arguments.get('d', None) == None: atomic_arguments['d'] = {'period': rfreq} else: atomic_arguments['d'].update({'period': rfreq}) # Run checks on all the variables as appropriate if weights is list: logging.info( 'Selecting "equal" from ["equal", "errors", "cv.errors"]') weights = 'equal' elif weights not in ['equal', 'errors', 'cv.errors']: logging.warning('Invalid weight count or type - using "equal"') weights = 'equal' # Cross-validation errors are better accuracy wise, but slower if error_method is list: logging.info( 'Selecting "MSE" from ["MAE", "MSE", "MSLE", "MEAE", "RSME"]') error_method = 'MSE' elif error_method not in ['MAE', 'MSE', 'MSLE', 'MEAE']: logging.warning('Invalid error method - using MSE') error_method = 'MSE' if cv_horizon is None: cv_horizon = rfreq wexpanded_models = set(list(models.lower())) expanded_models = [] for m in wexpanded_models: if m in list(self.MODELS.keys()): expanded_models.append(m) logging.info('Using model ' + self.MODELS[m]) if len(expanded_models) < 1: logging.error( "At least one component model type must be specified.") return self.fitted # Validate cores and parallel arguments if type(parallel) is not bool: logging.warning( "Invalid type for parallel - assigning to run in parallel") parallel = True # Check for problems for specific models (e.g. long seasonality for ets and non-seasonal for stlm or nnetar) if rfreq >= 24: if 'e' in expanded_models: logging.warning( 'frequency >= 24, the ets model will not be used') expanded_models.remove('e') if 'f' in expanded_models: logging.warning( 'frequency >= 24, the theta model will not be used') expanded_models.remove('f') if 'f' in expanded_models and len(self.ts) < rfreq: logging.warning( 'The theta model requires more than a year of data. The theta model will not be used.' ) expanded_models.remove('f') if 's' in expanded_models: if rfreq < 2: logging.warning( "The stlm model requires that the input data be a seasonal ts object. The stlm model will not be used." ) expanded_models.remove('s') if rfreq * 2 >= len(self.ts): logging.warning( "The stlm model requres a series more than twice as long as the seasonal period. The stlm model will not be used." ) expanded_models.remove('s') if 'h' in expanded_models or 'd' in expanded_models: if rfreq < 2: logging.warning( "The holt-winters model requires that the input data be a seasonal ts object. The holt-winters model will not be used." 
) expanded_models.remove('h') if 'n' in expanded_models: if rfreq * 2 >= len(self.ts): logging.warning( "The nnetar model requres a series more than twice as long as the seasonal period. The nnetar model will not be used." ) expanded_models.remove('n') if len(expanded_models) < 1: logging.error("A hybrid model must contain one component model.") return self.fitted if weights == 'cv.errors' and window_size > int(len(self.ts) / 5): window_size = int(len(self.ts) / 5) if window_size < 5: logging.warning( 'Not enough data for rolling validation, using "error" weighting.' ) weights = 'errors' else: logging.warning( 'Window size is too large - reducing size to {}'.format( window_size)) logging.info('Fitting with models : {}'.format(expanded_models)) logging.info('Number of cores = {}'.format( multiprocessing.cpu_count())) # If we have extra cpus - get several working on tbats because it is slow in the fit # if extra_cpus >= 2: # ncpus = min(3, extra_cpus) # logging.info('Extra Core count = {}'.format(ncpus)) # if atomic_arguments is None or atomic_arguments.get('t', None) is None: # atomic_arguments = {'t':{'use.parallel':True, 'num.cores':ncpus}} # else: # atomic_arguments['t'].update({'use.parallel':True, 'num.cores':ncpus}) # extra_cpus -= ncpus if weights == 'cv.errors': # cv.errors try: # There are reasons for exceptions like a list too short for stlm. We will handle it robustly ## Wrapper on type of R rcvts = cvts.cvts() rcvts.RcvtsWrapper(ets.ets(), {"model": "ZZZ"}) ## ROLLING METHOD - slow rolling = cvts.cvts() pool = multiprocessing.Pool( processes=multiprocessing.cpu_count()) rolling_results = list() if 't' in expanded_models: rolling_results.append( rolling.rolling(self.ts, tbats.tbats, args=None, code='t', error_method=error_method, pool=pool)) if 'n' in expanded_models: rolling_results.append( rolling.rolling(self.ts, nnetar.nnetar, args=None, code='n', error_method=error_method, pool=pool)) if 'a' in expanded_models: rolling_results.append( rolling.rolling(self.ts, Arima.Arima, args=None, code='a', error_method=error_method, pool=pool)) if 'e' in expanded_models: rolling_results.append( rolling.rolling(self.ts, ets.ets, args=None, code='e', error_method=error_method, pool=pool)) if 'f' in expanded_models: rolling_results.append( rolling.rolling(self.ts, thetam.thetam, args=None, code='f', error_method=error_method, pool=pool)) if 's' in expanded_models: rolling_results.append( rolling.rolling(self.ts, stlm.stlm, args=None, code='s', error_method=error_method, pool=pool)) if 'h' in expanded_models: atomic_arguments['h']['model'] = 'MAM' rolling_results.append( rolling.rolling(self.ts, ets.ets, args=atomic_arguments['h'], code='h', error_method=error_method, pool=pool)) if 'd' in expanded_models: atomic_arguments['d']['model'] = 'AAA' rolling_results.append( rolling.rolling(self.ts, ets.ets, args=atomic_arguments['d'], code='d', error_method=error_method, pool=pool)) # Turn the array in model results into a dictionary (map) and extract even weighting... # Also, sometimes we are going to get NaNs so we are going to have to work with a dataframe # and handle each observation separately. 
self.fitted = np.ndarray(shape=[len(self.ts), 1]) temp = dict() column_errors = list() df = pd.DataFrame() for i in range(0, len(self.model_results)): column_errors.append(self.model_results[i]['measure']) tdict = { 'model': self.model_results[i]['model'].refit(self.ts), 'error': self.model_results[i]['measure'] } temp.update({self.model_results[i]['model_code']: tdict}) df = pd.DataFrame(np.asmatrix(self.model_results[i]['model'].fitted)) if df is None else \ df.append(pd.DataFrame(np.asmatrix(self.model_results[i]['model'].fitted)), ignore_index=True) df = df.transpose() self.model_results = temp tweights = 1.0 / np.asarray(column_errors) weightsnorm = np.linalg.norm(tweights, ord=1) weights = tweights / weightsnorm # Hybrid weighting which handles NaNs in rows for i in range(0, df.shape[0]): good_row_data = df.ix[i].dropna() # Use properly weighted if we have all the data. if len(good_row_data) == len(weights): self.fitted[i] = np.dot(good_row_data, weights) # Weight on the good values elif len(good_row_data) > 0: stweights = tweights[df.ix[i].isna() == False] stweightsn = np.linalg.norm(stweights, ord=1) self.fitted[i] = np.dot(good_row_data, stweights / stweightsn) else: self.fitted[i] = np.NaN except: logging.warning('Python Traceback: {}'.format( str(sys.exc_info()))) logging.warning('R traceback: {}'.format( self.rtracebackerror())) logging.warning( 'Cannot use weighting "cv.errors", using "errors"') weights = 'errors' if weights == 'equal' or weights == 'errors': itlist = list() if 't' in expanded_models: itlist.append((self.ts, atomic_arguments, 't')) if 'n' in expanded_models: itlist.append((self.ts, atomic_arguments, 'n')) if 'a' in expanded_models: itlist.append((self.ts, atomic_arguments, 'a')) if 'e' in expanded_models: itlist.append((self.ts, atomic_arguments, 'e')) if 'f' in expanded_models: itlist.append((self.ts, atomic_arguments, 'f')) if 's' in expanded_models: itlist.append((self.ts, atomic_arguments, 's')) if 'h' in expanded_models: atomic_arguments['h']['model'] = 'MAM' itlist.append((self.ts, atomic_arguments, 'h')) if 'd' in expanded_models: atomic_arguments['h']['model'] = 'AAA' itlist.append((self.ts, atomic_arguments, 'd')) # Initial fit of all models in parallel! pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) self.model_results = pool.starmap(all_workers, itlist) pool.close() # Turn the array in model results into a dictionary (map) and extract even weighting... # Also, sometimes we are going to get NaNs so we are going to have to work with a dataframe # and handle each observation separately. 
temp = dict() df = pd.DataFrame() for i in range(0, len(self.model_results)): temp.update( {self.model_results[i][0]: self.model_results[i][1]}) df = pd.concat( [df, pd.DataFrame(self.model_results[i][1].fitted)], axis=1, ignore_index=True) self.fitted = np.ndarray(shape=[len(self.ts), 1]) self.model_results = temp if weights == 'equal': for i in range(0, df.shape[0]): good_row_data = df.ix[i].dropna() if len(good_row_data) > 0: weights = np.ndarray(shape=(len(good_row_data), 1)) weights.fill(1.0 / len(good_row_data)) self.fitted[i] = np.dot(good_row_data, weights) else: self.fitted[i] = np.NaN else: # Measure the error - using some error measure between self.ts and df[,i] # ['MAE', 'MSE', 'MSLE', 'MEAE'] column_errors = list() for i in range(0, df.shape[1]): fitted_value = df[df.columns[i]] if error_method == 'MAE': column_errors.append( metrics.mean_absolute_error( self.ts.values[fitted_value.isna() == False], fitted_value[fitted_value.isna() == False])) elif error_method == 'MSE': column_errors.append( metrics.mean_squared_error( self.ts.values[fitted_value.isna() == False], fitted_value[fitted_value.isna() == False])) elif error_method == 'RMSE': column_errors.append( math.sqrt( metrics.mean_squared_error( self.ts.values[fitted_value.isna() == False], fitted_value[fitted_value.isna() == False]))) elif error_method == 'MSLE': column_errors.append( metrics.mean_squared_log_error( self.ts.values[fitted_value.isna() == False], fitted_value[fitted_value.isna() == False])) elif error_method == 'MEAE': column_errors.append( metrics.median_absolute_error( self.ts.values[fitted_value.isna() == False], fitted_value[fitted_value.isna() == False])) tweights = 1.0 / np.asarray(column_errors) weightsnorm = np.linalg.norm(tweights, ord=1) weights = tweights / weightsnorm # Hybrid weighting which handles NaNs in rows for i in range(0, df.shape[0]): good_row_data = df.ix[i].dropna() # Use properly weighted if we have all the data. if len(good_row_data) == len(weights): self.fitted[i] = np.dot(good_row_data, weights) # Weight on the good values elif len(good_row_data) > 0: stweights = tweights[df.ix[i].isna() == False] stweightsn = np.linalg.norm(stweights, ord=1) self.fitted[i] = np.dot(good_row_data, stweights / stweightsn) else: self.fitted[i] = np.NaN return self.fitted
def kmeans(X_train, y_train, X_val, y_val):
    n_clusters = 8
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, verbose=0,
                    n_jobs=int(0.8 * n_cores)).fit(X_train)
    c_train = kmeans.predict(X_train)
    c_pred = kmeans.predict(X_val)
    centroids = kmeans.cluster_centers_

    y_val_stats = None
    predicted_values = None
    y_train_stats = None
    labels_stats = None

    for i in range(n_clusters):
        print('--------analyzing cluster %d--------' % i)
        train_mask = c_train == i
        std_train = np.std(y_train[train_mask])
        mean_train = np.mean(y_train[train_mask])
        print("# examples & price mean & std for training set within cluster %d is:(%d, %.2f, %.2f)"
              % (i, train_mask.sum(), float(mean_train), float(std_train)))
        pred_mask = c_pred == i
        std_pred = np.std(y_val[pred_mask])
        mean_pred = np.mean(y_val[pred_mask])
        print("# examples & price mean & std for validation set within cluster %d is:(%d, %.2f, %.2f)"
              % (i, pred_mask.sum(), float(mean_pred), float(std_pred)))
        if pred_mask.sum() == 0:
            print('Zero membered test set! Skipping the test and training validation.')
            continue
        # LinearModelRidge(X_train[train_mask], y_train[train_mask], X_val[pred_mask], y_val[pred_mask])
        regr = Ridge(alpha=7)
        regr.fit(X_train[train_mask], y_train[train_mask])
        labels_pred = regr.predict(X_train[train_mask].values)
        y_pred = regr.predict(X_val[pred_mask].values)
        if y_val_stats is None:
            y_val_stats = copy.deepcopy(y_val[pred_mask])
            y_train_stats = copy.deepcopy(y_train[train_mask])
            predicted_values = copy.deepcopy(y_pred)
            labels_stats = copy.deepcopy(labels_pred)
        else:
            y_val_stats = y_val_stats.append(y_val[pred_mask])
            y_train_stats = y_train_stats.append(y_train[train_mask])
            predicted_values = np.append(predicted_values, y_pred)
            labels_stats = np.append(labels_stats, labels_pred)
        print('--------Finished analyzing cluster %d--------' % i)

    print("Mean absolute error: ",
          metrics.mean_absolute_error(y_val_stats, predicted_values))
    print("Median absolute error: ",
          metrics.median_absolute_error(y_val_stats, predicted_values))
    print("Mean squared error: ",
          metrics.mean_squared_error(y_val_stats, predicted_values))
    print("R2: ", metrics.r2_score(y_val_stats, predicted_values))
    print('------------TRAIN--------------------')
    print("Mean absolute error: ",
          metrics.mean_absolute_error(y_train_stats, labels_stats))
    print("Median absolute error: ",
          metrics.median_absolute_error(y_train_stats, labels_stats))
    print("Mean squared error: ",
          metrics.mean_squared_error(y_train_stats, labels_stats))
    print("R2: ", metrics.r2_score(y_train_stats, labels_stats))
    return c_pred, centroids
## Prediction ==============
y_pred_linear = linearRegressor.predict(x_test)
y_pred_decisionTree = decisionTreeRegressor.predict(x_test)
y_test = y_test.values

# Write results to a CSV file
x_test['pred_price'] = y_pred_decisionTree
x_test.to_csv("../homework/result.csv", sep=',', header=True, index=True)

## Scoring ==============
linear_score = r2_score(y_test, y_pred_linear)

y_test = y_test / 10000
y_pred_decisionTree = y_pred_decisionTree / 10000

mean_absolute_value = mean_absolute_error(y_test, y_pred_decisionTree)
print(f"Mean absolute error: {mean_absolute_value}")
mean_squared_value = mean_squared_error(y_test, y_pred_decisionTree)
print(f"Mean squared error: {mean_squared_value}")
median_absolute_value = median_absolute_error(y_test, y_pred_decisionTree)
print(f"Median absolute error: {median_absolute_value}")
explained_variance_value = explained_variance_score(y_test, y_pred_decisionTree)
print(f"Explained variance: {explained_variance_value}")
r2_value = r2_score(y_test, y_pred_decisionTree)
print(f"R2 score: {r2_value}")
                              num_epochs=num_epochs,
                              shuffle=shuffle,
                              batch_size=batch_size)

evaluations = []
STEPS = 400
for i in range(100):
    regressor.train(input_fn=wx_input_fn(X_train, y=y_train), steps=STEPS)
    evaluations.append(
        regressor.evaluate(
            input_fn=wx_input_fn(X_val, y_val, num_epochs=1, shuffle=False)))

loss_values = [ev['loss'] for ev in evaluations]
training_steps = [ev['global_step'] for ev in evaluations]

# Show the loss vs. training-steps graph
show_graph(training_steps, loss_values)

# Prediction
pred = regressor.predict(
    input_fn=wx_input_fn(X_test, num_epochs=1, shuffle=False))
predictions = np.array([p['predictions'][0] for p in pred])

print("The Explained Variance: %.2f" % explained_variance_score(y_test, predictions))
print("The Mean Absolute Error: %.2f degrees Celsius" % mean_absolute_error(y_test, predictions))
print("The Median Absolute Error: %.2f degrees Celsius" % median_absolute_error(y_test, predictions))
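# The training/evaluation loop above depends on a wx_input_fn helper whose
# definition is truncated at the top of the snippet. A plausible sketch,
# assuming the TensorFlow 1.x Estimator API; the defaults are guesses, not
# the original code.
import tensorflow as tf

def wx_input_fn(X, y=None, num_epochs=None, shuffle=True, batch_size=400):
    # Wrap a pandas DataFrame (and optional targets) as an Estimator input_fn
    return tf.estimator.inputs.pandas_input_fn(x=X,
                                               y=y,
                                               num_epochs=num_epochs,
                                               shuffle=shuffle,
                                               batch_size=batch_size)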
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("lstm2_en.h5")
print("Loaded model from disk")

# Apply sliding window function for test data
tx, ty = sliding_window(train_sc, 100)

# Convert to numpy arrays
t_features = np.array(tx)
t_labels = np.array(ty)

# Prediction
pred = loaded_model.predict(t_features, verbose=0)

# Plot results
rcParams['figure.figsize'] = 100, 72
plt.plot(t_labels)
plt.plot(pred)
plt.legend(['Actual Prices', 'Predicted Prices'])

# Save as png
plt.savefig('lstm2_gr_en.png')

# Print metrics
print('R-Squared: %f' % (r2_score(t_labels, pred)))
print('RMSE: %f' % (sqrt(mean_squared_error(t_labels, pred))))
print('MAE: %f' % (mean_absolute_error(t_labels, pred)))
print('medAE: %f' % (median_absolute_error(t_labels, pred)))