def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    # set up GAM
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])  # initial fit against the first column (placeholder target)
    # run full model
    GAM_results = {}
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)
            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p
            if get_importance:
                # get importances, defined as the predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)
        cv_scores = [{'r': np.corrcoef(y, pred)[0, 1],
                      'R2': np.corrcoef(y, pred)[0, 1]**2,
                      'MAE': mean_absolute_error(y, pred)}]
        # insample
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = [{'r': np.corrcoef(y, in_pred)[0, 1],
                      'R2': np.corrcoef(y, in_pred)[0, 1]**2,
                      'MAE': mean_absolute_error(y, in_pred)}]
        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
def evaluate(ytest, ypred, filename='metrics.txt'):
    true_result = [1 if item > 0.5 else 0 for item in ytest]
    pred_result = [1 if item > 0.5 else 0 for item in ypred]
    cm = confusion_matrix(true_result, pred_result)
    print('\nConfusion matrix:')
    print(cm)
    print("\nLoss classified as loss", cm[0][0])
    print("Wins classified as wins", cm[1][1])
    print("Wins classified as loss", cm[1][0])
    print("Loss classified as wins", cm[0][1])
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result))
    print('Precision:\t', precision_score(true_result, pred_result))
    print('Recall: \t', recall_score(true_result, pred_result))
    print('F1 score:\t', f1_score(true_result, pred_result))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred))
    # print to file; open it once instead of reopening for every line
    with open(filename, "a") as f:
        print("Loss classified as loss", cm[0][0], file=f)
        print("Wins classified as wins", cm[1][1], file=f)
        print("Wins classified as loss", cm[1][0], file=f)
        print("Loss classified as wins", cm[0][1], file=f)
        print('\nAccuracy:\t', accuracy_score(true_result, pred_result), file=f)
        print('Precision:\t', precision_score(true_result, pred_result), file=f)
        print('Recall: \t', recall_score(true_result, pred_result), file=f)
        print('F1 score:\t', f1_score(true_result, pred_result), file=f)
        print('Mean absolute error:\t', mean_absolute_error(ytest, ypred), file=f)
def test_regressor(train, test, feature_extractor, target_transformer, regressor):
    (train_raw_X, train_raw_y) = (train, train['SalaryNormalized'])
    (test_raw_X, test_raw_y) = (test, test['SalaryNormalized'])
    print('feature extraction ...')
    train_y = target_transformer.transform(train_raw_y)
    test_y = target_transformer.transform(test_raw_y)
    train_X = feature_extractor.fit_transform(train_raw_X, train_y)
    test_X = feature_extractor.transform(test_raw_X)
    print('fit regression model ...')
    try:
        regressor.fit(train_X, train_y)
        train_raw_yhat = target_transformer.r_transform(regressor.predict(train_X))
        test_raw_yhat = target_transformer.r_transform(regressor.predict(test_X))
    except TypeError:
        # some regressors cannot handle sparse input; fall back to dense arrays
        regressor.fit(train_X.toarray(), train_y)
        train_raw_yhat = target_transformer.r_transform(regressor.predict(train_X.toarray()))
        test_raw_yhat = target_transformer.r_transform(regressor.predict(test_X.toarray()))
    print('evaluate error metrics ...')
    train_error = metrics.mean_absolute_error(train_raw_y, train_raw_yhat)
    test_error = metrics.mean_absolute_error(test_raw_y, test_raw_yhat)
    print('Train error: ', train_error)
    print('Test error:', test_error)
def predict_variance_inf_phase1(budget, hum_train_means, temp_train_means,
                                hum_train_vars, temp_train_vars):
    """Method to make predictions based on max-variance active inference."""
    start_hum = 0
    window_hum = None
    window_temp = None
    i = 0
    hum_preds = np.ones((50, 96))
    temp_preds = np.ones((50, 96))
    for t in global_times:
        if budget > 0:
            window_hum = np.argpartition(hum_train_vars[t], -budget)[-budget:]
            window_temp = np.argpartition(temp_train_vars[t], -budget)[-budget:]
        else:
            window_hum = np.array([])
            window_temp = np.array([])
        hum_pred, temp_pred = makePreds_phase1(window_hum, window_temp,
                                               hum_train_means, temp_train_means, i, t)
        hum_preds[:, i] = copy.deepcopy(hum_pred)
        temp_preds[:, i] = copy.deepcopy(temp_pred)
        i += 1
    hum_mean_err = mean_absolute_error(hum_test, hum_preds)
    temp_mean_err = mean_absolute_error(temp_test, temp_preds)
    return hum_preds, temp_preds, hum_mean_err, temp_mean_err
def make_model(data, tc):
    train_data = data.sample(frac=.8)
    test_data = data.drop(train_data.index)
    train_y = train_data['T/Tc']
    train_X = train_data.drop(['T/Tc', 'temperature'], axis=1)
    test_y = test_data['T/Tc']
    test_X = test_data.drop(['T/Tc', 'temperature'], axis=1)
    # model = XGBClassifier(n_estimators=1000, max_depth=8, learning_rate=0.05)
    # model.fit(train_X, train_y, early_stopping_rounds=10,
    #           eval_set=[(test_X, test_y)], verbose=True)
    # xgb.plot_tree(model)
    model = svm.SVC(kernel='rbf', gamma=1, C=1, verbose=True)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    print("Mean Absolute Error : " + str(mean_absolute_error(np.array(predictions), test_y)))
    train_y = train_data['temperature'] / tc
    test_y = test_data['temperature'] / tc
    # model2 = XGBRegressor(n_estimators=1000, max_depth=8, learning_rate=0.05)
    # model2.fit(train_X, train_y, early_stopping_rounds=10, eval_metric='mae',
    #            eval_set=[(test_X, test_y)], verbose=True)
    model2 = svm.SVR(kernel='rbf', gamma=.5, C=1, verbose=True)
    model2.fit(train_X, train_y)
    predictions = model2.predict(test_X)
    print("Mean Absolute Error : " + str(mean_absolute_error(np.array(predictions), test_y)))
    return [model, model2]
def compute_mse(model, x_train_current_tmp, YTrain, x_test_current_tmp, YTest,
                score, values_TM=[]):
    model.fit(x_train_current_tmp, YTrain)
    y_pred_train = model.predict(x_train_current_tmp)
    y_pred_test = model.predict(x_test_current_tmp)
    if len(values_TM) != 0:
        abs_error_train = 100. * mean_absolute_error(YTrain, y_pred_train) * len(YTrain) \
            / (89.7 * values_TM[0, 0] * values_TM[0, 1])
        print("abs train", abs_error_train)
        abs_error_test = 100. * mean_absolute_error(YTest, y_pred_test) * len(YTest) \
            / (89.7 * values_TM[1, 0] * values_TM[1, 1])
        print("abs test", abs_error_test)
        mse_error_train = 100. * np.sqrt(mean_squared_error(YTrain, y_pred_train) * len(YTrain)
                                         / (values_TM[0, 0] * values_TM[0, 1])) / 89.7
        print("mean squared error train", mse_error_train)
        mse_error_test = 100. * np.sqrt(mean_squared_error(YTest, y_pred_test) * len(YTest)
                                        / (values_TM[1, 0] * values_TM[1, 1])) / 89.7
        print("mean squared error test", mse_error_test)
    if score == "mean_squared_error":
        new_loss = mean_squared_error(YTest, y_pred_test)
    elif score == "mean_absolute_error":
        new_loss = mean_absolute_error(YTest, y_pred_test)
    else:
        new_loss = r2_score(YTest, y_pred_test)
    beta = model.coef_
    if x_train_current_tmp.shape[1] == 1:
        beta = np.array([beta])
    beta = beta.reshape([len(beta), 1])
    return new_loss, beta
def cross_val(regressor_high, regressor_low, classifier, train):
    rows = random.sample(list(train.index), int(train.shape[0] * 0.75))
    sample = train.loc[rows]
    crime = pd.DataFrame(sample.Total_Crime_Risk, dtype=int)
    crime['highcrime'] = 0
    crime.loc[crime.Total_Crime_Risk > crime.Total_Crime_Risk.median(), 'highcrime'] = 1
    crime['GEOGRAPHY_ID'] = sample.GEOGRAPHY_ID
    sample = sample.drop(train.columns[[0, -2, -1]], axis=1)
    model = classifier.fit(sample, crime.highcrime)
    Highcrime = model.predict(sample)
    Highcrime = np.array(Highcrime)
    sample['predicted_highcrime'] = Highcrime
    high_areas = sample.loc[sample.predicted_highcrime == 1]
    high_areas = pd.merge(high_areas, crime, on='GEOGRAPHY_ID', how='inner')
    high_areas_crime = high_areas.Total_Crime_Risk
    high_areas = high_areas.drop(high_areas.columns[[-1, -2, -3]], axis=1)
    low_areas = sample.loc[sample.predicted_highcrime == 0]
    low_areas = pd.merge(low_areas, crime, on='GEOGRAPHY_ID', how='inner')
    low_areas_crime = low_areas.Total_Crime_Risk
    low_areas = low_areas.drop(low_areas.columns[[-1, -2, -3]], axis=1)
    model_high = regressor_high.fit(high_areas, high_areas_crime)
    high_crime = model_high.predict(high_areas)
    model_low = regressor_low.fit(low_areas, low_areas_crime)
    low_crime = model_low.predict(low_areas)
    high_error = mean_absolute_error(high_areas_crime, high_crime)
    low_error = mean_absolute_error(low_areas_crime, low_crime)
    print(high_error, low_error, (high_error + low_error) / 2)
def prediction_performance(model, Xtest, Ytest, numberCategories):
    # Calculate metrics for logistic regression performance.
    if numberCategories == 1:
        # Get metrics for binary classification.
        YDistribution = model.predict_proba(Xtest)[:, 1]
        YClassification = model.predict(Xtest)
        auc = roc_auc_score(Ytest, YDistribution)
        print("AUC", auc)
        MAE = mean_absolute_error(Ytest, YDistribution)
        print("MAE", MAE)
        accuracy = 1 - mean_absolute_error(YClassification, Ytest)
        print("Accuracy", accuracy)
        metrics = [accuracy, auc, MAE]
    else:
        # Get metrics for multiple class classification.
        YPredictions = model.predict(Xtest)
        YDistribution = model.predict_proba(Xtest)
        YTestLabels = label_data(Ytest)
        accuracy = model.score(Xtest, YTestLabels)
        print("Accuracy", accuracy)
        avAUC = evaluate_auc_score(model, Xtest, Ytest)
        print("Av AUC", avAUC)
        # auc = roc_auc_score(Ytest, YPredictions)
        MAE = mean_absolute_error(Ytest, YDistribution)
        print("MAE", MAE)
        metrics = [accuracy, avAUC, MAE]
    return metrics
def normalEquation(features, features_validation, values, values_validation):
    M = numpy.dot(features.T, features)
    print("Transpose of f times f")
    print(M.shape)
    M = numpy.array(M)
    print("Converted to array")
    print(M.shape)
    M = numpy.linalg.pinv(M)
    print("Inverse")
    print(M.shape)
    M = numpy.dot(M, features.T)
    print("Multiplied by transpose of f")
    print(M.shape)
    theta = numpy.dot(M, values)
    # M = numpy.linalg.pinv(M)
    print(theta.shape)
    print(features.shape)
    print(theta)
    predictions = numpy.dot(theta, features.T)
    pred_validation = numpy.dot(theta, features_validation.T)
    print(predictions)
    print("MEAN ABSOLUTE ERROR ")
    print(mean_absolute_error(values, predictions))
    print("MEAN ABSOLUTE ERROR (validation) ")
    print(mean_absolute_error(values_validation, pred_validation))
def test_continue_train(self):
    X, y = load_boston(True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {
        'objective': 'regression',
        'metric': 'l1',
        'verbose': -1
    }
    lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
    init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
    model_name = 'model.txt'
    init_gbm.save_model(model_name)
    evals_result = {}
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=30,
                    valid_sets=lgb_eval,
                    verbose_eval=False,
                    # test custom eval metrics
                    feval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)),
                    evals_result=evals_result,
                    init_model='model.txt')
    ret = mean_absolute_error(y_test, gbm.predict(X_test))
    self.assertLess(ret, 3.5)
    self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
    for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
        self.assertAlmostEqual(l1, mae, places=5)
    os.remove(model_name)
def tst(X, Y, k=3, rad=4, mode='k'):
    trX = X[:-1200]
    trY = Y[:-1200]
    tstX = X[-400:]
    tstY = Y[-400:]
    nnlr = NNLR(k, rad, mode)
    nnlr.fit(trX, trY)
    pred = nnlr.predict(trX)
    print('Training Set')
    print('Root Mean Squared Error')
    print(mean_squared_error(trY, pred)**.5)
    print('Mean Absolute Error')
    print(mean_absolute_error(trY, pred))
    # print(list(zip(pred, trX))[:5])
    print(nnlr.active)
    pred = nnlr.predict(tstX)
    print('Test Set')
    print('Root Mean Squared Error')
    print(mean_squared_error(tstY, pred)**.5)
    print('Mean Absolute Error')
    print(mean_absolute_error(tstY, pred))
    # print(list(zip(pred, tstY))[:5])
    print(nnlr.active)
def main():
    DOC = """
================================================================================
    Compare the prediction accuracy of different models on the boston dataset
================================================================================
    """
    print(DOC)
    from sklearn import cross_validation, datasets
    boston = datasets.load_boston()
    X, y = boston.data, np.round(boston.target)
    # X -= X.mean()
    y -= y.min()
    idx = np.argsort(y)
    X = X[idx]
    y = y[idx]
    cv = cross_validation.ShuffleSplit(y.size, n_iter=50, test_size=.1, random_state=0)
    score_logistic = []
    score_ordinal_logistic = []
    score_ridge = []
    for i, (train, test) in enumerate(cv):
        # test = train
        if not np.all(np.unique(y[train]) == np.unique(y)):
            # we need the train set to have all different classes
            continue
        assert np.all(np.unique(y[train]) == np.unique(y))
        train = np.sort(train)
        test = np.sort(test)
        w, theta = ordinal_logistic_fit(X[train], y[train], verbose=True, solver='TNC')
        pred = ordinal_logistic_predict(w, theta, X[test])
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (ORDINAL) fold %s: %s' % (i + 1, s))
        score_ordinal_logistic.append(s)

        from sklearn import linear_model
        clf = linear_model.LogisticRegression(C=1.)
        clf.fit(X[train], y[train])
        pred = clf.predict(X[test])
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (LOGISTIC) fold %s: %s' % (i + 1, s))
        score_logistic.append(s)

        clf = linear_model.Ridge(alpha=1.)
        clf.fit(X[train], y[train])
        pred = np.round(clf.predict(X[test]))
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (RIDGE) fold %s: %s' % (i + 1, s))
        score_ridge.append(s)
    print()
    print('MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC): %s' % np.mean(score_ordinal_logistic))
    print('MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): %s' % np.mean(score_logistic))
    print('MEAN ABSOLUTE ERROR (RIDGE REGRESSION): %s' % np.mean(score_ridge))
    # print('Chance level is at %s' % (1. / np.unique(y).size))
    return np.mean(score_ridge)
def testModel(model, layerSizes, Xtrain, Ytrain, Xtest, Ytest, learningRate,
              epochs, batchSize, optimizer, resultsFile="lossOptLog.txt",
              printResults=False, elapsedTime=False):
    # NOTE: dropouts, units2, units3 and patience are used below but never
    # defined here; they appear to be module-level globals.
    lossCategories = Ytrain.shape[1]
    numberHiddenLayers = len(layerSizes) - 2
    inputLayerSize = layerSizes[0]
    units1 = layerSizes[1]
    dropout1 = dropouts[0]
    dropout2 = dropouts[1]
    dropout3 = dropouts[2]
    # Test MAE of model on training data (to check for overfitting).
    trainingPredY = model.predict_proba(Xtrain, verbose=0)
    MAETrain = mean_absolute_error(Ytrain, trainingPredY)
    # Test MAE on test data.
    testPredY = model.predict(Xtest, verbose=0)
    MAE = mean_absolute_error(Ytest, testPredY)
    # Calculate AUC for each category.
    auc = [0] * lossCategories
    """
    for i in range(0, lossCategories):
        categoryValues = Ytest[:][i:(i+1)]
        categoryPredictions = testPredY[:][i:(i+1)]
        auc[i] = roc_auc_score(categoryPredictions, categoryValues)
    aucAverage = (sum(auc) / len(auc))
    """
    aucAverage = 0
    # Evaluate the model and write results to a file.
    scores = model.evaluate(Xtest, Ytest, verbose=0)
    testAccuracy = scores[1]
    scores = model.evaluate(Xtrain, Ytrain, verbose=0)
    trainAccuracy = scores[1]
    if printResults:
        print("Training MAE: %.2f%%" % (MAETrain * 100))
        print("acc: %.2f%%" % (testAccuracy * 100))
        print("auc: %.2f%%" % (aucAverage * 100))
        print("MAE: %.2f%%" % (MAE * 100))
        print("%s , %s , %s, %s, %s , %s , %s , %s , %s , %s, %s \n"
              % (units1, units2, units3, learningRate, epochs, batchSize,
                 patience, optimizer, dropout1, dropout2, dropout3))
        print("\n")
    # Write model results to a file.
    if elapsedTime is not False:
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s , %s , %s, %s , %s , %s , %s , %s , %s , %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (elapsedTime, MAETrain, trainAccuracy, testAccuracy, aucAverage,
                   MAE, units1, units2, units3, learningRate, epochs, batchSize,
                   patience, optimizer, dropout1, dropout2, dropout3))
    else:
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s , %s , %s , %s , %s , %s ,%s , %s, %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (MAETrain, trainAccuracy, testAccuracy, aucAverage, MAE,
                   units1, units2, units3, learningRate, epochs, batchSize,
                   optimizer, dropout1, dropout2, dropout3))
def main():
    # load the series
    dtst = Datasets()
    serie = dtst.Leitura_dados(dtst.bases_linear_graduais(3, 35))
    serie = np.asarray(serie)
    particao = Particionar_series(serie, [0.0, 0.0, 0.0], 0)
    serie = particao.Normalizar(serie)

    '''
    ELM = ELMRegressor()
    ELM.Tratamento_dados(serie, [0.8, 0.2, 0.2], 4)

    # build a list with the data splits
    lista_dados = []
    lista_dados.append(ELM.train_entradas)
    lista_dados.append(ELM.train_saidas)
    lista_dados.append(ELM.val_entradas)
    lista_dados.append(ELM.val_saidas)
    lista_dados.append(ELM.teste_entradas)
    lista_dados.append(ELM.teste_saidas)

    # optimize the architecture of an ELM
    ELM.Otimizar_rede(10, lista_dados)
    '''

    # train the ELM on the inputs and outputs
    # ELM = ELMRegressor(ELM.neuronios_escondidos)
    ELM = ELMRegressor(5)
    ELM.Tratamento_dados(serie, [0.8, 0.2, 0.2], 4)
    ELM.Treinar(ELM.train_entradas, ELM.train_saidas)

    # ELM prediction on the training set
    prediction_train = ELM.Predizer(ELM.train_entradas)
    MAE_train = mean_absolute_error(ELM.train_saidas, prediction_train)
    print('Training MAE: ', MAE_train)

    # ELM prediction on the test set
    prediction_test = ELM.Predizer(ELM.teste_entradas)
    MAE_test = mean_absolute_error(ELM.teste_saidas, prediction_test)
    print('Test MAE: ', MAE_test)

    # prediction plot for the training set
    plt.plot(ELM.train_saidas, label='Training (actual)', color='Blue')
    plt.plot(prediction_train, label='Training (predicted)', color='Red')
    plt.title('Training plot, MAE: %s' % MAE_train)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # prediction plot for the test set
    plt.plot(ELM.teste_saidas, label='Test (actual)', color='Blue')
    plt.plot(prediction_test, label='Test (predicted)', color='Red')
    plt.title('Test plot, MAE: %s' % MAE_test)
    plt.legend()
    plt.tight_layout()
    plt.show()
def blended_scorer(estimator, X, y):
    ols_preds = ols_preds_for_Xs(X)
    pred_y = estimator.predict(X)
    msg("BLENDED SCORES FOR a CV GROUP:")
    for blend in np.arange(0, 1.01, 0.1):
        blended_prediction = (blend * ols_preds) + ((1.0 - blend) * pred_y)
        blended_score = mean_absolute_error(blended_prediction, y)
        msg("%f * OLS yields score of %f" % (blend, blended_score))
    return mean_absolute_error(y, pred_y)
def test_clf(X, y, clf, test_size=0.2, num=20):
    ylist = y.T.tolist()[0]
    train = numpy.zeros(num)
    cross = numpy.zeros(num)
    for i in range(num):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, ylist, test_size=test_size)
        clf.fit(X_train, y_train)
        train[i] = mean_absolute_error(clf.predict(X_train), y_train)
        cross[i] = mean_absolute_error(clf.predict(X_test), y_test)
    return (train.mean(), train.std()), (cross.mean(), cross.std())
def build_SGDRegressor(train_X, train_y, test_X, test_y):
    ##########
    log_train_y = np.log(train_y)
    ##########
    sgd_regressor = linear_model.SGDRegressor(loss='huber', penalty='l1',
                                              alpha=0.001, l1_ratio=0.15,
                                              verbose=True, n_iter=50)
    sgd_regressor.fit(train_X, log_train_y)
    train_yhat = np.exp(sgd_regressor.predict(train_X))
    test_yhat = np.exp(sgd_regressor.predict(test_X))
    print(metrics.mean_absolute_error(train_y, train_yhat))
    print(metrics.mean_absolute_error(test_y, test_yhat))
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)
    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))
    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def build_boostedTree(train_X, train_y, test_X, test_y):
    bt = ensemble.GradientBoostingRegressor(loss='lad', learning_rate=0.1,
                                            n_estimators=100, subsample=0.3,
                                            max_depth=3, max_features=50,
                                            verbose=1)
    bt_train_X = train_X
    bt_test_X = test_X
    bt.fit(bt_train_X.toarray(), train_y)
    # predict with the boosted tree that was just fit
    train_yhat = bt.predict(bt_train_X.toarray())
    test_yhat = bt.predict(bt_test_X.toarray())
    print(metrics.mean_absolute_error(train_y, train_yhat))
    print(metrics.mean_absolute_error(test_y, test_yhat))
def zillow_keras(parameters, X_train, X_dev, Y_train, Y_dev, random_seed=None):
    # random seed
    if random_seed is not None:
        np.random.seed(random_seed)
    hidden_1, hidden_2, activation, epochs, learning_rate, batch_size, method, momentum = parameters
    if activation == 'tanh':
        initializer = keras.initializers.lecun_normal()
    elif activation == 'relu':
        initializer = keras.initializers.he_normal()
    else:
        raise ValueError('activation')
    regularizer = None
    # regularizer = regularizers.l2(0.01)
    model = Sequential()
    model.add(Dense(hidden_1, input_dim=X_train.shape[1],
                    kernel_initializer=initializer, activation=activation,
                    kernel_regularizer=regularizer))
    model.add(Dense(hidden_2, kernel_initializer=initializer,
                    activation=activation, kernel_regularizer=regularizer))
    model.add(Dense(1, kernel_initializer=initializer,
                    kernel_regularizer=regularizer))
    if method == 'GD':
        optimizer = optimizers.SGD(lr=learning_rate, momentum=momentum)
    elif method == 'RMSProp':
        optimizer = optimizers.RMSprop(lr=learning_rate)
    elif method == 'Adam':
        optimizer = optimizers.Adam(lr=learning_rate)
    else:
        raise ValueError('method')
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
    initial_epoch = 0
    # verbose = 1
    verbose = 0
    verbose2 = 100
    # verbose2 = None
    results = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                        callbacks=[KerasCallback(verbose2)],
                        validation_data=(X_dev, Y_dev), verbose=verbose,
                        initial_epoch=initial_epoch)
    Y_predict = model.predict(X_train)
    try:
        accuracy_train = mean_absolute_error(Y_train, Y_predict)
    except ValueError:
        accuracy_train = 0
    Y_predict = model.predict(X_dev)
    try:
        accuracy_dev = mean_absolute_error(Y_dev, Y_predict)
    except ValueError:
        accuracy_dev = 0
    print(accuracy_train, accuracy_dev, hidden_1, hidden_2, activation,
          epochs, learning_rate, batch_size, method, momentum)
def scorer_gbr_lad(clf, X, y, verbose=1):
    """Scorer for GradientBoostingRegressor with loss='lad'."""
    y_pred = clf.predict(X)
    score = -mean_absolute_error(y, y_pred)
    if verbose > 0:
        print("Eout=", -score, file=sys.stderr)
    if 'staged_predict' in dir(clf):
        if verbose > 0:
            print("Staged predicts (Eout)")
        for i, y_pred in enumerate(clf.staged_predict(X)):
            Eout = mean_absolute_error(y, y_pred)
            if verbose > 0:
                print("tree %3d, test score %f" % (i + 1, Eout))
    return score
def runRegressor(clf, featureMat, targets, no_of_training_example):
    try:
        clf.fit(featureMat[:no_of_training_example, :],
                targets[:no_of_training_example])
        y_pred = clf.predict(featureMat[no_of_training_example:, :])
        print('Variance Score')
        print(explained_variance_score(targets[no_of_training_example:], y_pred))
        print('Mean absolute error')
        print(mean_absolute_error(targets[no_of_training_example:], y_pred))
        print('Explained variance score')
        print(explained_variance_score(targets[no_of_training_example:], y_pred))
    except Exception as e:
        print(e)
def test_clf_kfold(X, y, clf, folds=10):
    train = numpy.zeros(folds)
    cross = numpy.zeros(folds)
    for i, (train_idx, test_idx) in enumerate(
            cross_validation.KFold(y.shape[0], n_folds=folds)):
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx].T.tolist()[0]
        y_test = y[test_idx].T.tolist()[0]
        clf.fit(X_train, y_train)
        train[i] = mean_absolute_error(clf.predict(X_train), y_train)
        cross[i] = mean_absolute_error(clf.predict(X_test), y_test)
    return (train.mean(), train.std()), (cross.mean(), cross.std())
def gd_method(f, f_valid, v, v_validation):
    m = len(v)
    m_validation = len(v_validation)

    # Normalize data
    features, mu, sigma = normalize_features(f)
    # Normalize validation data (using the mean and std calculated when
    # normalizing the model)
    features_validation, mu, sigma = normalize_features(f_valid)

    features['ones'] = numpy.ones(m)  # Add a column of 1s (y intercept)
    features_validation['ones'] = numpy.ones(m_validation)  # Add a column of 1s (y intercept)

    # Convert features and values to numpy arrays
    features_array = numpy.array(features)
    values_array = numpy.array(v)

    # Set values for alpha, number of iterations.
    alpha = 0.04  # please feel free to change this value
    num_iterations = 1000  # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = numpy.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(
        features_array, values_array, theta_gradient_descent, alpha, num_iterations)

    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    plot = plot_cost_history(alpha, cost_history)
    print(plot)
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed
    # the 30 second limit on the compute servers.

    # Predictions for the data used to define the model
    predictions = numpy.dot(features, theta_gradient_descent)
    print(predictions)

    # Predictions for the validation data
    pred_validation = numpy.dot(features_validation, theta_gradient_descent)
    print(pred_validation)

    # Compute error using mean absolute error
    print("MEAN ABSOLUTE ERROR ")
    print(mean_absolute_error(v, predictions))
    print("MEAN ABSOLUTE ERROR (validation) ")
    print(mean_absolute_error(v_validation, pred_validation))
def testModel(model, layerSizes, Xtrain, Ytrain, Xtest, Ytest, learningRate,
              epochs, batchSize, optimizer, resultsFile="selfOptLog.txt",
              printResults=False, elapsedTime=False):
    # NOTE: dropouts, units2, units3 and patience are used below but never
    # defined here; they appear to be module-level globals.
    numberHiddenLayers = len(layerSizes) - 2
    inputLayerSize = layerSizes[0]
    units1 = layerSizes[1]
    dropout1 = dropouts[0]
    dropout2 = dropouts[1]
    dropout3 = dropouts[2]
    # Test MAE of model on training data (to check for overfitting).
    trainingPredY = model.predict_proba(Xtrain, verbose=0)
    MAETrain = mean_absolute_error(Ytrain, trainingPredY)
    scores = model.evaluate(Xtrain, Ytrain, verbose=0)
    trainingAccuracy = scores[1] * 100
    # Test MAE, AUC, and Accuracy on test data.
    testPredY = model.predict_proba(Xtest, verbose=0)
    auc = roc_auc_score(Ytest, testPredY)
    MAE = mean_absolute_error(Ytest, testPredY)
    bestGuess = []  # The estimated 0 or 1 output.
    for predictionCount in range(0, len(testPredY)):
        if testPredY[predictionCount] < 0.5:
            bestGuess.append(0)
        else:
            bestGuess.append(1)
    testAccuracy = 1 - mean_absolute_error(bestGuess, Ytest)
    # Evaluate the model and write results to a file.
    if printResults:
        print("Training MAE: %.2f%%" % (MAETrain * 100))
        print("acc: %.2f%%" % (testAccuracy * 100))
        print("AUC: %.2f%%" % (auc * 100))
        print("MAE: %.2f%%" % (MAE * 100))
        print("%s , %s , %s, %s, %s , %s , %s , %s , %s , %s, %s \n"
              % (units1, units2, units3, learningRate, epochs, batchSize,
                 patience, optimizer, dropout1, dropout2, dropout3))
        print("\n")
    # Write model results to a file.
    if elapsedTime is not False:
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s, %s , %s , %s, %s , %s , %s , %s , %s , %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (elapsedTime, MAETrain, trainingAccuracy, testAccuracy, MAE,
                   units1, units2, units3, learningRate, epochs, batchSize,
                   patience, optimizer, dropout1, dropout2, dropout3))
    else:
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s , %s , %s , %s , %s , %s , %s , %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (MAETrain, trainingAccuracy, testAccuracy, auc, MAE,
                   units1, units2, units3, learningRate, epochs, batchSize,
                   optimizer, dropout1, dropout2, dropout3))
def test_regression_multioutput_array():
    y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
    y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]

    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r = r2_score(y_true, y_pred, multioutput='raw_values')
    evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')

    assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2)
    assert_array_almost_equal(mae, [0.25, 0.625], decimal=2)
    assert_array_almost_equal(r, [0.95, 0.93], decimal=2)
    assert_array_almost_equal(evs, [0.95, 0.93], decimal=2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    y_true = [[0, 0]] * 4
    y_pred = [[1, 1]] * 4

    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r = r2_score(y_true, y_pred, multioutput='raw_values')

    assert_array_almost_equal(mse, [1., 1.], decimal=2)
    assert_array_almost_equal(mae, [1., 1.], decimal=2)
    assert_array_almost_equal(r, [0., 0.], decimal=2)

    r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values')
    assert_array_almost_equal(r, [0, -3.5], decimal=2)
    assert_equal(np.mean(r),
                 r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
                          multioutput='uniform_average'))

    evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
                                   multioutput='raw_values')
    assert_array_almost_equal(evs, [0, -1.25], decimal=2)

    # Checking for the condition in which both numerator and denominator is
    # zero.
    y_true = [[1, 3], [-1, 2]]
    y_pred = [[1, 4], [-1, 1]]

    r2 = r2_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(r2, [1., -3.], decimal=2)
    assert_equal(np.mean(r2),
                 r2_score(y_true, y_pred, multioutput='uniform_average'))

    evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(evs, [1., -3.], decimal=2)
    assert_equal(np.mean(evs), explained_variance_score(y_true, y_pred))

    # Handling msle separately as it does not accept negative inputs.
    y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
    y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])

    msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
    msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred),
                               multioutput='raw_values')
    assert_array_almost_equal(msle, msle2, decimal=2)
def rf_regressor(rf_model, train_x, train_y, valid_x, valid_y,
                 generate_csv=False, f_selection=False, n_features=3):
    global fobj
    est = rf_model.fit(train_x, train_y)
    print('feature importance: ', est.feature_importances_)
    if f_selection:
        print("Feature Selection Enabled - feature count: %r" % (n_features))
        rf_model = RFE(rf_model, n_features, step=1)
    train_y_pred = est.predict(train_x)
    error = mt.mean_absolute_error(train_y, train_y_pred)
    fobj.write('Train Error: %r\n' % (error))
    valid_y_pred = est.predict(valid_x)
    return mt.mean_absolute_error(valid_y, valid_y_pred), error, est
def do_cmp(sn1, sn2="submission_model02"):
    def load(fn):
        print(fn, end=' ')
        fn = '%s/%s.csv.gz' % (Project().datapath, fn)
        # open in text mode so csv.reader receives strings
        with gzip.open(fn, 'rt') as fp:
            reader = csv.reader(fp)
            header = next(reader)
            return np.array(list(reader), dtype=float)[:, 1:]
    s1 = load(sn1)
    print()
    s2 = load(sn2)
    print("mae: %.4f" % mean_absolute_error(s1, s2))
    s2 = load('submission_mix')
    print("mae2: %.4f" % mean_absolute_error(s1, s2))
def test_trigonometric():
    """Check that using trig functions work and that results differ"""
    est1 = SymbolicRegressor(random_state=0)
    est1.fit(boston.data[:400, :], boston.target[:400])
    est1 = mean_absolute_error(est1.predict(boston.data[400:, :]),
                               boston.target[400:])

    est2 = SymbolicRegressor(trigonometric=True, random_state=0)
    est2.fit(boston.data[:400, :], boston.target[:400])
    est2 = mean_absolute_error(est2.predict(boston.data[400:, :]),
                               boston.target[400:])

    assert_true(abs(est1 - est2) > 0.01)
def test_subsample():
    """Check that subsample work and that results differ"""
    est1 = SymbolicRegressor(max_samples=1.0, random_state=0)
    est1.fit(boston.data[:400, :], boston.target[:400])
    est1 = mean_absolute_error(est1.predict(boston.data[400:, :]),
                               boston.target[400:])

    est2 = SymbolicRegressor(max_samples=0.7, random_state=0)
    est2.fit(boston.data[:400, :], boston.target[:400])
    est2 = mean_absolute_error(est2.predict(boston.data[400:, :]),
                               boston.target[400:])

    assert_true(abs(est1 - est2) > 0.01)
    random_state=0)

# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,   # how many decision trees to build; higher numbers usually make the
                         # model more accurate but increase the time required to run it
    learning_rate=0.1,   # how much each additional decision tree influences the overall
                         # prediction; lower rates usually lead to higher accuracy, but only
                         # when n_estimators is set to a high value
    max_depth=6,         # how many layers deep each individual decision tree can be
    min_samples_leaf=9,  # how many times a value must appear in the training set for a
                         # decision tree to make a decision based on it
    max_features=0.1,    # the percentage of features randomly considered each time a branch
                         # is created in a decision tree
    loss='huber'         # how scikit-learn calculates the model's error rate or cost as it
                         # learns; huber does a good job while not being too influenced by
                         # outliers in the data set
)

# Train the model on our training data set
model.fit(X_train, y_train)

# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')

# Find the error rate on the training set
mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mae)

# Find the error rate on the test set
mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mae)
def overall_mae(self):
    return mean_absolute_error(self._observed, self._predicted)
def getcoef(self, model, y, ypred):
    # The coefficients
    print('With coefficients: \n', model.coef_)
    # Mean absolute error: 0 is a perfect prediction
    e = mean_absolute_error(y, ypred)
    print("mean absolute error = " + str(e))
def evaluate_ensemble(members, weights, testX, testy):
    # make prediction
    yhat = ensemble_predictions(members, weights, testX, testy)
    # calculate MAE
    return mean_absolute_error(testy, yhat), yhat
dataset_labels = land_data['price']

from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = \
    train_test_split(dataset_features, dataset_labels, test_size=0.25,
                     random_state=21)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(train_features, train_labels)
predicted_price = regressor.predict(test_features)

from sklearn import metrics
print('Linear regression')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, predicted_price))
print('Mean Squared Error:', metrics.mean_squared_error(test_labels, predicted_price))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_labels, predicted_price)))

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(train_features, train_labels)
predicted_price = regressor.predict(test_features)

from sklearn import metrics
print('Random forest regressor')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, predicted_price))
sns.distplot(transformed_y[train:])

from sklearn.metrics import mean_absolute_error

regressor = MLPRegressor(hidden_layer_sizes=[20, 20],
                         activation='relu',
                         max_iter=1000,
                         random_state=1)
regressor.fit(shuffled_X[:train], transformed_y[:train])

"R2 %.3f, age error: %.2f, spread of age values %.2f" % (
    r2_score(
        shuffled_y[train:],
        target_processor.inverse_transform(
            regressor.predict(shuffled_X[train:]).reshape(-1, 1))),
    mean_absolute_error(
        shuffled_y[train:],
        target_processor.inverse_transform(
            regressor.predict(shuffled_X[train:]).reshape(-1, 1))),
    shuffled_y[train:].std())

# Now let's look at classification on the Iris dataset
# classifier = MLPClassifier(
#     hidden_layer_sizes=[32, 12],
#     activation='tanh',
#     max_iter=1000,
#     random_state=1)
# X_changed = MinMaxScaler(
#     feature_range=(-1, 1)
# ).fit_transform(X)
# scores = cross_val_score(
#     classifier,
#     X=X_changed,
def run(
    self, country: Countries, action: str = 'evaluate'
) -> Union[pd.DataFrame,
           Dict[Union[Literal["Singapore"], Literal["China"], Literal["India"]],
                Dict[str, Union[Union[float, str], Any]]]]:
    """
    >>> from q3_time_series.model import UnivariateMultiStepLSTM
    >>> # To Evaluate
    >>> evaluate_metrics = UnivariateMultiStepLSTM(3,2).run('Singapore', "evaluate")
    >>> # To Predict
    >>> prediction = UnivariateMultiStepLSTM(3,2).run('Singapore', 'predict')
    """
    assert country in Countries.__args__, \
        f"{country} is not supported, please choose between {Countries.__args__}"
    X_train, y_train = self.split_sequence(self.train[country].values,
                                           self.n_steps_in, self.n_steps_out)
    X_valid, y_valid = self.split_sequence(self.valid_arb[country].values,
                                           self.n_steps_in, self.n_steps_out)
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], self.n_features))
    X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], self.n_features))
    model = self.make_model()
    model.fit(X_train, y_train, epochs=200, verbose=0,
              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=20)],
              validation_data=(X_valid, y_valid))
    if action == 'predict':
        input = (self.df[country][-self.n_steps_in:].values).reshape(
            (1, self.n_steps_in, self.n_features))
        pred = model.predict(input, verbose=0)
        return pd.DataFrame(pred, columns=['2008', '2009'], index=[country]).T
    else:
        pred_valid = model.predict(X_valid, verbose=0)
        pred_train = model.predict(X_train, verbose=0)
        return {
            country: {
                'rmse_train': sqrt(mean_squared_error(
                    [y_train[i][0] for i in range(0, len(y_train))],
                    [pred_train[i][0] for i in range(0, len(pred_train))])),
                'rmse_val': sqrt(mean_squared_error(
                    [y_valid[i][0] for i in range(0, len(y_valid))],
                    [pred_valid[i][0] for i in range(0, len(pred_valid))])),
                'mae_train': mean_absolute_error(
                    [y_train[i][0] for i in range(0, len(y_train))],
                    [pred_train[i][0] for i in range(0, len(pred_train))]),
                'mae_val': mean_absolute_error(
                    [y_valid[i][0] for i in range(0, len(y_valid))],
                    [pred_valid[i][0] for i in range(0, len(pred_valid))]),
                'mape_train': f'{self.mean_absolute_percentage_error([y_train[i][0] for i in range(0, len(y_train))], [pred_train[i][0] for i in range(0, len(pred_train))])} %',
                'mape_val': f'{self.mean_absolute_percentage_error([y_valid[i][0] for i in range(0, len(y_valid))], [pred_valid[i][0] for i in range(0, len(pred_valid))])} %'
            }
        }
def UnivariateAnalysis(p_features, p_X_train, p_Y_train, p_X_val, p_Y_val,
                       p_top_k_features=5,
                       p_model='continuous',  # choose from continuous, binary, multinomial
                       p_target_distribution='gamma',
                       p_metric='L1 Error',  # choose from L1 Error, AUC
                       p_seed=0,
                       p_subsamplesize=1500,
                       p_n_buckets=20,
                       p_verbose=False):
    feature_error = []
    import sys
    # Import the library
    import statsmodels.api as sm
    from statsmodels.genmod.generalized_linear_model import GLMResults

    # Scoring parameter
    if p_metric == 'L1 Error':
        from sklearn.metrics import mean_absolute_error
    elif p_metric == 'AUC':
        from sklearn.metrics import auc
        from sklearn.metrics import roc_curve
    else:
        print('{} is not currently an option'.format(p_metric))
        sys.exit()

    for name, index in p_features:
        if p_verbose:
            print(name)
        # Fit the model
        # add intercept to continuous variables and classification models
        if (len(index) == 1) or (p_model in ['binary', 'multinomial']):
            train_data = sm.add_constant(p_X_train.iloc[:, index])
            val_data = sm.add_constant(p_X_val.iloc[:, index])
        else:
            train_data = p_X_train.iloc[:, index]
            val_data = p_X_val.iloc[:, index]
        if p_model == 'continuous':
            model = sm.GLM(p_Y_train, train_data,
                           family=sm.families.Gamma(sm.families.links.log))
            try:
                result = model.fit()
            except np.linalg.linalg.LinAlgError as err:
                print('{} failed to fit due to {} error'.format(name, err))
                continue
        elif p_model == 'binary':
            model = sm.Logit(p_Y_train, train_data)
            try:
                result = model.fit(disp=0)
            except np.linalg.linalg.LinAlgError as err:
                print('{} failed to fit due to {} error'.format(name, err))
                continue
        elif p_model == 'multinomial':
            model = sm.MNLogit(p_Y_train, train_data)
            try:
                result = model.fit(disp=0)
            except np.linalg.linalg.LinAlgError as err:
                print('{} failed to fit due to {} error'.format(name, err))
                continue
        else:
            print('{} is not an available model option'.format(p_model))
        # Calculate the error with the selected metric
        if p_metric == 'L1 Error':
            error = mean_absolute_error(p_Y_val, result.predict(val_data))
        elif p_metric == 'AUC':
            try:
                # TODO make this more specific as well
                fpr, tpr, thresholds = roc_curve(p_Y_val, result.predict(val_data))
            except:
                print('{} AUC calculation failed'.format(name))
                continue
            error = auc(fpr, tpr)
        feature_error.append([name, error, index])

    if p_metric in ['L1 Error']:
        df = pd.DataFrame(columns=['Variable', 'Validation Error', 'Index'],
                          data=feature_error)
        df_sorted = df.sort_values(by='Validation Error')
    elif p_metric in ['AUC']:
        df = pd.DataFrame(columns=['Variable', 'Validation AUC', 'Index'],
                          data=feature_error)
        df_sorted = df.sort_values(by='Validation AUC', ascending=False)
    top_k_features = df_sorted.iloc[:p_top_k_features, :]
    print(top_k_features.iloc[:, :2])

    for name, error, index in pd.DataFrame.as_matrix(top_k_features):
        if len(index) == 1:
            print('Feature: ' + str(name))
            if p_metric in ['L1 Error']:
                print('Validation Error: ' + str(error))
            elif p_metric == 'AUC':
                print('AUC: ' + str(error))
            X_train_const = sm.add_constant(p_X_train.iloc[:, index])
            # X_val_const = sm.add_constant(p_X_val.iloc[:, index])
            if p_model == 'continuous':
                model = sm.GLM(p_Y_train, X_train_const,
                               family=sm.families.Gamma(sm.families.links.log))
            elif p_model == 'binary':
                model = sm.Logit(p_Y_train, X_train_const)
            elif p_model == 'multinomial':
                model = sm.MNLogit(p_Y_train, X_train_const)
            result = model.fit(disp=0)
            print('Training AIC: ' + str(result.aic))
            # plot fitted vs observed on both training and validation data
            y_pred_train = result.predict(X_train_const)
            # y_pred_val = result.predict(X_val_const)
            plot_data_train = pd.DataFrame(
                np.column_stack([p_X_train.iloc[:, index], p_Y_train, y_pred_train]),
                columns=[list(p_X_train.columns[index])[0], 'y', 'y_pred'])
            if p_model == 'binary':
                x_values, y_values = AutoBucket(
                    plot_data_train[list(p_X_train.columns[index])[0]],
                    plot_data_train['y'], p_n_buckets)
            else:
                from random import sample, seed
                seed(p_seed)
                rand_vals = sample(range(len(plot_data_train)),
                                   k=min(p_subsamplesize, len(plot_data_train)))
                plot_data_train_sample = plot_data_train.iloc[rand_vals, :]
                plot_data_train_sample_sorted = plot_data_train_sample.sort_values(
                    by=list(p_X_train.columns[index])[0])
            fig, ax = plt.subplots(figsize=(12, 8))
            if p_model == 'binary':
                plot_data_train_sample_sorted = plot_data_train.sort_values(
                    by=list(p_X_train.columns[index])[0])
                plot_data_train_sample_sorted.plot(
                    x=list(p_X_train.columns[index])[0], y='y_pred',
                    ax=ax, linestyle='-', color='b')
                plt.plot(x_values, y_values, 'ro--')
            else:
                plot_data_train_sample_sorted.plot(
                    x=list(p_X_train.columns[index])[0], y='y_pred',
                    ax=ax, linestyle='-', color='b')
                plot_data_train_sample_sorted.plot(
                    x=list(p_X_train.columns[index])[0], y='y',
                    ax=ax, kind='scatter', color='r')
            plt.show()
            print(result.summary())
        else:
            # Add observed (average) values to the graph. Use automatic bucketing of indt variable
            # Add argument to choose between: predicted value, observed value, 95% confidence int
            print('Feature: ' + str(name))
            if p_metric in ['L1 Error']:
                print('Validation Error: ' + str(error))
            elif p_metric == 'AUC':
                print('AUC: ' + str(error))
            if p_model == 'continuous':
                model = sm.GLM(p_Y_train, p_X_train.iloc[:, index],
                               family=sm.families.Gamma(sm.families.links.log))
            elif p_model == 'binary':
                model = sm.Logit(p_Y_train, p_X_train.iloc[:, index])
            elif p_model == 'multinomial':
                model = sm.MNLogit(p_Y_train, p_X_train.iloc[:, index])
            result = model.fit(disp=0)
            print('Training AIC: ' + str(result.aic))
            # TODO add multinomial below
            fig, ax1 = plt.subplots(figsize=(12, 8))
            if p_model == 'continuous':
                upper_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 1]))})
                model = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                      'model': list(np.exp(result.params))})
                lower_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 0]))})
            elif p_model == 'binary':
                # TODO verify transformation below is correct
                upper_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 1]) /
                                                             (np.exp(GLMResults.conf_int(result)[:, 1]) + 1))})
                model = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                      'model': list(np.exp(result.params) / (1 + np.exp(result.params)))})
                lower_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 0]) /
                                                             (np.exp(GLMResults.conf_int(result)[:, 0]) + 1))})
            upper_bound.plot(x='Level', ax=ax1, linestyle='-', marker='o', color='r')
            model.plot(x='Level', ax=ax1, linestyle='-', marker='o', color='b')
            lower_bound.plot(x='Level', ax=ax1, linestyle='-', marker='o', color='g')
            ax1.set_ylabel('Response', color='b')
            ax1.tick_params('y', colors='b')
            ax1.legend(loc='upper left')
            weights = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                    'weight': list(p_X_train.iloc[:, index].sum(axis=0))})
            plt.xticks(rotation=90)
            ax2 = ax1.twinx()
            weights.plot(x='Level', ax=ax2, kind='bar', color='y', alpha=0.4)
            ax2.set_ylabel('Weight', color='y')
            ax2.set_ylim([0, max(weights.iloc[:, 1]) * 3])
            ax2.tick_params('y', colors='y')
            ax2.legend(loc='upper right')
            ax2.grid(False)
            # fig.tight_layout()
            plt.show()
            print(result.summary())
def calculate_measures_for_continues_labels(
        all_predictions: pd.DataFrame,
        final_total_payoff_prediction_column: str,
        total_payoff_label_column: str,
        label_options: list,
        raisha: str = 'All_raishas',
        round_number: str = 'All_rounds',
        bin_label: pd.Series = None,
        bin_predictions: pd.Series = None,
        already_calculated: bool = False,
        bin_label_column_name: str = 'bin_label',
        bin_prediction_column_name: str = 'bin_predictions',
        prediction_type: str = '') -> (pd.DataFrame, dict):
    """
    Calc and print the regression measures, including bin analysis
    :param all_predictions:
    :param total_payoff_label_column: the name of the label column
    :param final_total_payoff_prediction_column: the name of the prediction column
    :param label_options: list of the label option names
    :param raisha: if we run a raisha analysis, this is the raisha we worked with
    :param round_number: for per-round analysis
    :param bin_label: the bin label series; its index matches the total_payoff_label_column index
    :param bin_predictions: the bin predictions series; its index matches the total_payoff_label_column index
    :param prediction_type: if we want to use seq and reg predictions, so we have a different column for each
    :param already_calculated: if we already calculated the measures, we need to recalculate only the bin measures
    :param bin_label_column_name: the name of the bin label column if it is in the all_predictions df
    :param bin_prediction_column_name: the name of the bin prediction column if it is in the all_predictions df
    :return:
    """
    dict_key = f'{raisha} {round_number}'
    if 'is_train' in all_predictions.columns:
        data = all_predictions.loc[all_predictions.is_train == False]
    else:
        data = all_predictions
    results_dict = defaultdict(dict)
    predictions = data[final_total_payoff_prediction_column]
    gold_labels = data[total_payoff_label_column]
    mse = metrics.mean_squared_error(predictions, gold_labels)
    rmse = round(100 * math.sqrt(mse), 2)
    mae = round(100 * metrics.mean_absolute_error(predictions, gold_labels), 2)
    mse = round(100 * mse, 2)

    # calculate bin measures
    if bin_label_column_name in all_predictions.columns and \
            bin_prediction_column_name in all_predictions.columns:
        bin_label = all_predictions[bin_label_column_name]
        bin_predictions = all_predictions[bin_prediction_column_name]
    elif bin_label is None and bin_predictions is None:
        print('No bin labels and bin predictions')
        logging.info('No bin labels and bin predictions')
        raise Exception
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        bin_label, bin_predictions)
    num_bins = len(label_options)
    precision_micro, recall_micro, fbeta_score_micro, support_micro = \
        metrics.precision_recall_fscore_support(bin_label, bin_predictions, average='micro')
    precision_macro, recall_macro, fbeta_score_macro, support_macro = \
        metrics.precision_recall_fscore_support(bin_label, bin_predictions, average='macro')

    # number of DM chose stay home
    final_labels = list(range(len(support)))
    for my_bin in range(len(label_options)):
        status_size = bin_label.where(bin_label == my_bin).dropna().shape[0]
        if status_size in support:
            index_in_support = np.where(support == status_size)[0]
            if final_labels[index_in_support[0]] in label_options and index_in_support.shape[0] > 1:
                # 2 bins with the same size --> already assigned
                index_in_support = index_in_support[1]
            else:
                index_in_support = index_in_support[0]
            final_labels[index_in_support] = label_options[my_bin]
    for item in final_labels:
        if item not in label_options:  # status_size = 0
            final_labels.remove(item)

    accuracy = metrics.accuracy_score(bin_label, bin_predictions)
    results_dict[dict_key][f'Bin_{num_bins}_bins_Accuracy{prediction_type}'] = \
        round(accuracy * 100, 2)

    # create the results to return
    for measure, measure_name in [[precision, 'precision'], [recall, 'recall'],
                                  [fbeta_score, 'Fbeta_score']]:
        for i in range(len(measure)):
            if f'Bin_{measure_name}_{final_labels[i]}{prediction_type}' in [
                    'Bin_Fbeta_score_1', 'Bin_Fbeta_score_2', 'Bin_Fbeta_score_3',
                    'Bin_precision_1', 'Bin_precision_2', 'Bin_precision_3',
                    'Bin_recall_1', 'Bin_recall_2', 'Bin_recall_3']:
                print(f'Error: final_labels: {final_labels}, label_options: {label_options}, '
                      f'already_calculated: {already_calculated}, raisha: {raisha}, rounds: {round_number}')
            results_dict[dict_key][f'Bin_{measure_name}_{final_labels[i]}{prediction_type}'] = \
                round(measure[i] * 100, 2)
    for measure, measure_name in [[precision_micro, 'precision_micro'],
                                  [recall_micro, 'recall_micro'],
                                  [fbeta_score_micro, 'Fbeta_score_micro'],
                                  [precision_macro, 'precision_macro'],
                                  [recall_macro, 'recall_macro'],
                                  [fbeta_score_macro, 'Fbeta_score_macro']]:
        results_dict[dict_key][f'Bin_{num_bins}_bins_{measure_name}{prediction_type}'] = \
            round(measure * 100, 2)
    if not already_calculated:
        results_dict[dict_key][f'MSE{prediction_type}'] = mse
        results_dict[dict_key][f'RMSE{prediction_type}'] = rmse
        results_dict[dict_key][f'MAE{prediction_type}'] = mae
    results_pd = pd.DataFrame.from_dict(results_dict, orient='index')
    return results_pd, results_dict
for targetUser in userRatingsForSongDict[targetSong]:
    if j == 500:
        break
    j += 1
    similarityForSongDict = {}
    start = time.time()
    targetSongUserRatingsDict = userRatingsForSongDict[targetSong]
    actualTargetRating = userRatingsForSongDict[targetSong][targetUser]
    calcSimilarSongs()
    finalScore = calcFinalScore()
    print(targetUser, finalScore, actualTargetRating)
    print("")
    if finalScore != 0:
        predictedScores.append(finalScore)
        actualScores.append(actualTargetRating)
if actualScores and predictedScores:
    rmse = sqrt(mean_squared_error(actualScores, predictedScores))
    rmseSum += rmse
    mae = mean_absolute_error(actualScores, predictedScores)
    maeSum += mae
    print("Root mean squared error is: ", rmse, "and Mean absolute error is: ", mae)
    k += 1
print("Mean rmse is: ", rmseSum / k, "and Mean absolute error is: ", maeSum / k)
import numpy as np  # needed for np.sqrt below
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

boston = load_boston()
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                    test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
print(mae)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
print(mse)
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test))
print(rmse)
r2 = r2_score(y_pred=y_pred, y_true=y_test)
print(r2)

#=====================================================================
# K-Fold
#=====================================================================
def splitDataTraining(task, model, features, target, test_size, scoring):
    if test_size == 1:
        logger.info("The whole dataset will be used for training!")
        model.fit(features, target)
        params = np.append(model.intercept_, model.coef_)
        predictions = model.predict(features)
        newX = pd.DataFrame({"Constant": np.ones(len(features))}).join(pd.DataFrame(features))
        MSE = (sum((target - predictions)**2)) / (len(newX) - len(newX.columns))
        var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
        sd_b = np.sqrt(var_b)
        ts_b = params / sd_b
        p_values = [2 * (1 - stats.t.cdf(np.abs(i), (len(newX) - 1))) for i in ts_b]
        sd_b = np.round(sd_b, 3)
        ts_b = np.round(ts_b, 3)
        p_values = np.round(p_values, 3)
        params = np.round(params, 4)
        results = pd.DataFrame()
        results["Coefficients"], results["Standard Errors"], results["t values"], \
            results["Probabilites"] = [params, sd_b, ts_b, p_values]
        return results
    elif test_size < 1:
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=test_size, random_state=1)
        model.fit(x_train, y_train)
        model_train_pred = model.predict(x_train)
        model_test_pred = model.predict(x_test)
        results = pd.DataFrame()
        if task == "regression":
            if "neg_mean_absolute_error" in scoring:
                results['MAE_train'], results['MAE_test'] = \
                    [[mean_absolute_error(y_train, model_train_pred)],
                     [mean_absolute_error(y_test, model_test_pred)]]
            if "neg_mean_squared_error" in scoring:
                results['MSE_train'], results['MSE_test'] = \
                    [[mean_squared_error(y_train, model_train_pred)],
                     [mean_squared_error(y_test, model_test_pred)]]
            if "neg_mean_squared_log_error" in scoring:
                results['MSLE_train'], results['MSLE_test'] = \
                    [[mean_squared_log_error(y_train, model_train_pred)],
                     [mean_squared_log_error(y_test, model_test_pred)]]
            if "r2" in scoring:
                results['r2_train'], results['r2_test'] = \
                    [[r2_score(y_train, model_train_pred)],
                     [r2_score(y_test, model_test_pred)]]
            return results
        elif task == "classification":
            if "precision" in scoring:
                results['precision_train'], results['precision_test'] = \
                    [[precision_score(y_train, model_train_pred)],
                     [precision_score(y_test, model_test_pred)]]
            if "recall" in scoring:
                results['recall_train'], results['recall_test'] = \
                    [[recall_score(y_train, model_train_pred)],
                     [recall_score(y_test, model_test_pred)]]
            if "f1" in scoring:
                results['f1_train'], results['f1_test'] = \
                    [[f1_score(y_train, model_train_pred)],
                     [f1_score(y_test, model_test_pred)]]
            if "roc_auc" in scoring:
                results['roc_auc_train'], results['roc_auc_test'] = \
                    [[roc_auc_score(y_train, model_train_pred)],
                     [roc_auc_score(y_test, model_test_pred)]]
            return results
import numpy as np  # needed for np.expm1, np.sort and np.unique below
from etl import prepare_data, prepare_submission
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# load and split train/dev/test
(X_train, y_train), (X_test, test_id) = prepare_data()
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train,
                                                  test_size=0.2, random_state=69)

# run without hyperparams for fscore calculations
model = XGBRegressor()
model.fit(X_train, y_train)
y_hat = model.predict(X_dev)
mae = mean_absolute_error(np.expm1(y_dev), np.expm1(y_hat))
print("Mae: {}".format(mae))

thresholds = np.sort(model.feature_importances_)
thresholds = np.unique(thresholds)
threshold = 0
best_mae = mae
for thresh in thresholds[:50]:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBRegressor()
    selection_model.fit(select_X_train, y_train)
    # eval model
def testForCVData(self):
    self.r_sqr = self.clf.score(self.X_test, self.y_test)
    self.output = self.clf.predict(self.X_test)
    self.mae = mean_absolute_error(self.y_test, self.output)
    self.mse = mean_squared_error(self.y_test, self.output)
def run_models(grid_y, grid_x):
    X, Y = create_training_and_testing_data(grid_x, grid_y)
    data = Table(X, Y)
    # Earlier feature-selection / PCA experiments, kept for reference:
    # feature_method = og.preprocess.score.UnivariateLinearRegression()
    # selector = og.preprocess.SelectBestFeatures(method=feature_method, k=10)
    # out_data2 = selector(data)
    # pca = PCA(n_components=5)
    # model = pca(out_data2)
    # out_data = model(out_data2)

    test = og.data.Table(data.domain, random.sample(data, 60))
    train = og.data.Table(data.domain, [d for d in data if d not in test])

    lin = og.regression.linear.LinearRegressionLearner()
    rf = og.regression.random_forest.RandomForestRegressionLearner()
    nnr = og.regression.NNRegressionLearner()
    svm = og.regression.SVRLearner()
    knn = KNeighborsRegressor(n_neighbors=3)

    learners = [lin, rf, nnr, svm]
    regressors = [learner(train) for learner in learners]
    knn.fit(train.X, train.Y)

    # Persist the fitted models (the original pickled the unfitted learners,
    # which would not reproduce predictions when reloaded).
    prefix = "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y)
    for name, fitted in zip(["lin", "rf", "nnr", "svm", "knn"],
                            regressors + [knn]):
        with open(prefix + "_" + name + ".pickle", "wb") as f:
            pickle.dump(fitted, f)

    linPredict = regressors[0](test)
    rfPredict = regressors[1](test)
    nnrPredict = regressors[2](test)
    svmPredict = regressors[3](test)
    knnPredict = knn.predict(test.X)

    predictions = [linPredict, rfPredict, nnrPredict, svmPredict, knnPredict]

    # A built-in alternative kept from the original:
    # res = og.evaluation.CrossValidation(test, learners, k=10)
    # rmse = og.evaluation.RMSE(res); mae = og.evaluation.MAE(res)
    rmse = []
    mae = []
    for pred in predictions:
        rmse.append(math.sqrt(mean_squared_error(test.Y, pred)))
        mae.append(mean_absolute_error(test.Y, pred))
    return np.array(mae), np.array(rmse), np.array(predictions), test
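# Hedged sketch of reloading one of the pickles written by run_models above;
# the grid_x=0, grid_y=0 path is just an example of the naming scheme.
import pickle

with open("models1155_1683_1s/0_0_rf.pickle", "rb") as f:
    rf_model = pickle.load(f)
# Orange models are callable on a Table; the sklearn KNN uses .predict(X)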
def assessmentModel(self, cvNum=5):
    y_pred = self.__model.predict(self.__xTest)
    from sklearn.model_selection import cross_val_score
    if self.__typeLearning == 'Regression':
        from sklearn.metrics import (mean_squared_error, median_absolute_error,
                                     mean_squared_log_error, mean_absolute_error,
                                     explained_variance_score, r2_score)
        # Hold-out metrics (equivalent manual formulas noted alongside)
        print('MSE: \t\t', mean_squared_error(self.__yTest, y_pred))
        # np.mean((self.__yTest - y_pred) ** 2)
        print('RMSE: \t\t', np.sqrt(mean_squared_error(self.__yTest, y_pred)))
        print('median: \t\t', median_absolute_error(self.__yTest, y_pred))
        # np.median(np.abs(self.__yTest - y_pred))
        print('MAE: \t\t', mean_absolute_error(self.__yTest, y_pred))
        # np.mean(np.abs(self.__yTest - y_pred))
        print('MSLE: \t\t', mean_squared_log_error(self.__yTest, y_pred))
        # np.mean((np.log(self.__yTest + 1) - np.log(y_pred + 1)) ** 2)
        print('explained_variance: \t\t', explained_variance_score(self.__yTest, y_pred))
        # 1 - np.var(self.__yTest - y_pred) / np.var(self.__yTest)
        print('R2: \t\t', r2_score(self.__yTest, y_pred))
        # 1 - np.sum((self.__yTest - y_pred) ** 2) / np.sum((self.__yTest - np.mean(self.__yTest)) ** 2)
        # Cross-validated metrics (sklearn's neg_* scorers return negated values)
        for label, scorer in [('MSE', 'neg_mean_squared_error'),
                              ('r2', 'r2'),
                              ('explained_variance', 'explained_variance'),
                              ('MSLE', 'neg_mean_squared_log_error'),
                              ('MAE', 'neg_mean_absolute_error'),
                              ('median', 'neg_median_absolute_error'),
                              ('RMSE', 'neg_root_mean_squared_error')]:
            scoresval = cross_val_score(self.__model, self.__xData, self.__yData,
                                        cv=cvNum, scoring=scorer)
            print('cv %s mean: \t' % label, scoresval.mean())
    if self.__typeLearning == 'Classification':
        from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                                     precision_score, recall_score, f1_score,
                                     cohen_kappa_score)
        print('Accuracy: \t\t', accuracy_score(self.__yTest, y_pred))
        print('Balanced accuracy: \t', balanced_accuracy_score(self.__yTest, y_pred))
        print('F1-micro: \t\t', f1_score(self.__yTest, y_pred, average='micro'))
        print('F1-macro: \t\t', f1_score(self.__yTest, y_pred, average='macro'))
        print('F1-weighted: \t\t', f1_score(self.__yTest, y_pred, average='weighted'))
        print('Precision-micro: \t', precision_score(self.__yTest, y_pred, average='micro'))
        print('Precision-macro: \t', precision_score(self.__yTest, y_pred, average='macro'))
        print('Precision-weighted:', precision_score(self.__yTest, y_pred, average='weighted'))
        print('Recall-micro: \t', recall_score(self.__yTest, y_pred, average='micro'))
        print('Recall-macro: \t', recall_score(self.__yTest, y_pred, average='macro'))
        print('Recall-weighted:', recall_score(self.__yTest, y_pred, average='weighted'))
        print('Cohen\'s Kappa: \t', cohen_kappa_score(self.__yTest, y_pred))
        for label, scorer in [('accuracy', 'accuracy'),
                              ('balanced_accuracy', 'balanced_accuracy'),
                              ('f1_micro', 'f1_micro'),
                              ('f1_macro', 'f1_macro'),
                              ('f1_weighted', 'f1_weighted')]:
            scoresval = cross_val_score(self.__model, self.__xData, self.__yData,
                                        cv=cvNum, scoring=scorer)
            print('cv %s mean: \t' % label, scoresval.mean())
        from sklearn.metrics import classification_report
        print('Classification report: ', '\n', classification_report(self.__yTest, y_pred))
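# Hedged aside on the sign convention above: sklearn's error-based scorers are
# negated so that "greater is better" holds uniformly, which is why the cv
# MSE/MAE means print as negative numbers. A minimal illustration on toy data:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

X_toy = np.arange(20).reshape(-1, 1)
y_toy = 2 * X_toy.ravel() + np.random.RandomState(0).normal(size=20)
scores = cross_val_score(LinearRegression(), X_toy, y_toy, cv=5,
                         scoring='neg_mean_absolute_error')
print(-scores.mean())  # flip the sign to read it as an ordinary MAE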
# fitting the model now
titanic_model.fit(train_X, train_y)

# In[ ]:

# We're just testing how well fitted the model is here.
titanic_preds = titanic_model.predict(val_X)

# In[ ]:

from sklearn.metrics import mean_absolute_error

# Let's calculate the MAE (mean_absolute_error expects y_true first,
# though the metric itself is symmetric)
titanic_mae = mean_absolute_error(val_y, titanic_preds)
print(titanic_mae)

# # Applying The Model To The Given Test Data

# In[ ]:

# building the final feature matrix
final_X = test_data[features]
final_predictions = titanic_model.predict(final_X)
final_predictions = np.round(final_predictions)
final_predictions = final_predictions.astype(int)

# # Creating the submission
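# In[ ]:

# Hedged sketch of the submission step the heading above announces; the
# 'PassengerId' column and output filename follow the usual Kaggle Titanic
# convention and are assumptions here (pandas assumed imported as pd earlier).
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'],
                           'Survived': final_predictions})
submission.to_csv('submission.csv', index=False)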
# Need to invert the target scaling before computing the metrics
y_train = scaler_y_train.inverse_transform(y_train)
y_test = scaler_y_test.inverse_transform(y_test)
previsoes_train = scaler_y_train.inverse_transform(previsoes_train)
previsoes_test = scaler_y_test.inverse_transform(previsoes_test)

''' Data processing '''
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

# Training statistics
mae_train = mean_absolute_error(y_train, previsoes_train)
mse_train = mean_squared_error(y_train, previsoes_train)
rmse_train = sqrt(mean_squared_error(y_train, previsoes_train))

# Test statistics
mae_test = mean_absolute_error(y_test, previsoes_test)
mse_test = mean_squared_error(y_test, previsoes_test)
rmse_test = sqrt(mean_squared_error(y_test, previsoes_test))

# Commented-out plotting block (truncated in the original snippet)
'''
import matplotlib.pyplot as plt
# plt.title('Neural network prediction with 2 hidden neurons')
plt.xlabel('Actual values')
# error = absolute_error / 100
print("Mean Absolute Error is")
print(error)
f.write("Mean Absolute Error is " + str(error) + "\n")
f.write("Sample Solution" + "\n")
for rand in range(0, 5):
    f.write("%s " % predictions[rand])
f.close()

# For different metrics evaluation
flattened = np.concatenate(final_values[iteration_count]).ravel().tolist()
comp = pd.DataFrame({
    'original': target[top_70:len(needed_tweets)],
    'predicted': flattened
})
# the bare expressions below were notebook cell tails; printed here so the
# values are visible when run as a script
print(comp.corr(method='pearson'))

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

print(r2_score(comp['original'].tolist(), comp['predicted'].tolist()))
comp.to_csv("GCN_comp.csv")
print(mean_absolute_error(comp['original'].tolist(), comp['predicted'].tolist()))
def LSTM_Multivariate():
    dataset = read_csv("/Users/ange/Downloads/Training-Data-Sets.csv", header=0)
    print('----------------------------------------------------------------------------')
    print('Replace NaN values with Mean')
    dataset.fillna(dataset.mean(), inplace=True)
    print('----------------------------------------------------------------------------')
    values = dataset.values
    target = dataset.iloc[:, 1:2].values
    dat = dataset.iloc[:, [1, 4, 3, 18, 5, 14]].values

    t_df = read_csv("/Users/ange/Downloads/Test dataset v1.csv", header=0)
    t_df.fillna(t_df.mean(), inplace=True)
    t_dat = t_df.iloc[:, [1, 4, 3, 18, 5, 14]].values
    t_target = t_df.iloc[:, 1:2].values

    scaler = preprocessing.StandardScaler()

    # define input sequence
    train = dat
    test = t_dat
    target_train = target
    target_test = t_target[13:]

    # choose a number of time steps
    n_steps = 13
    # split into samples
    X, y = split_sequence(train, target, n_steps)
    # summarize the data
    for i in range(len(X)):
        print(X[i], y[i])
    # reshape from [samples, timesteps] into [samples, timesteps, features]
    n_features = 6
    X = X.reshape((X.shape[0], X.shape[1], n_features))
    print(X.shape, y.shape)

    # define model
    start_time = time.time()
    model = Sequential()
    model.add(Bidirectional(LSTM(25, activation='relu'),
                            input_shape=(n_steps, n_features)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    # fit model
    model.fit(X, y, epochs=100, verbose=0, shuffle=False)

    print('')
    print('Prediction')
    # build sliding windows over the test set
    test_inputs = t_dat
    test_features = []
    for i in range(n_steps, len(test_inputs)):
        test_features.append(test_inputs[i - n_steps:i, 0:n_features])
    test_features = np.array(test_features)
    print(test_features[0])
    test_features = np.reshape(test_features,
                               (test_features.shape[0], test_features.shape[1], n_features))
    print('Features - Shape')
    print(test_features.shape[1])
    print(test_features)

    predictions = model.predict(test_features, verbose=0)
    print('')
    print("Execution Time: %s seconds" % (time.time() - start_time))
    print('')

    # (the original also built an unused x_input copy of test_features and
    # scaled the predictions into an unused variable; both dropped)
    actual = target_test
    plt.figure(figsize=(10, 6))
    plt.plot(actual, color='blue', label='Actual Forecast')
    plt.plot(predictions, color='red', label='Predicted Forecast')
    plt.title('Sales Forecasting')
    plt.xlabel('Forecast Horizons (Day)')
    plt.ylabel('Sales')
    plt.legend()
    plt.show()

    print('-------------------------------------------------------')
    print('MAE:')
    print(metrics.mean_absolute_error(actual, predictions))
    print('')
    print('RMSE:')
    # the original took the square root of the MAE; RMSE is the root of the MSE
    print(np.sqrt(metrics.mean_squared_error(actual, predictions)))
    print('Epochs: 100')
    model.summary()
    print('')
    print('')
    print('-------------------------------------------------------')
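# split_sequence is called above but not defined in this snippet. A minimal
# sketch of what it presumably does (pairing each n_steps-long window of the
# inputs with the target that follows it); treat this as an assumption:
import numpy as np

def split_sequence(sequence, target, n_steps):
    X, y = [], []
    for i in range(len(sequence) - n_steps):
        X.append(sequence[i:i + n_steps])  # window of n_steps input rows
        y.append(target[i + n_steps])      # target right after the window
    return np.array(X), np.array(y)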
## ELM TRAINING
MAE_TRAIN_MINS = []
MAE_TEST_MINS = []
steps = 2     # new
neurons = 10  # new
predictions = []

def rmse(pred, true):
    # assumed helper; the original calls rmse() without defining it here
    return np.sqrt(mean_squared_error(true, pred))

for M in range(1, steps, 1):
    MAES_TRAIN = []
    MAES_TEST = []
    # print "Training with %s neurons..." % M
    for i in [10, 100, 300, 500]:
        print(f"Training {i} neurons in Step {M}")
        ELM = ELMRegressor(i)
        ELM.fit(X_train, y_train)
        prediction = ELM.predict(X_train)
        MAES_TRAIN.append(mean_absolute_error(y_train, prediction))
        prediction = ELM.predict(X_test)
        predictions.append(prediction)
        mae = mean_absolute_error(y_test, prediction)
        MAES_TEST.append(mae)
        print(f"MAE: {mae}")
        print(f"RMSE: {rmse(prediction, y_test)}")
        print(f"MSE: {mean_squared_error(y_test, prediction)}")
    MAE_TEST_MINS.append(min(MAES_TEST))
    # keep the training MAE of the configuration with the best test MAE
    MAE_TRAIN_MINS.append(MAES_TRAIN[np.argmin(MAES_TEST)])

print("Minimum MAE ELM =", min(MAE_TEST_MINS))
print("using amount of steps: ", steps)      # new
print("using amount of neurons: ", neurons)  # new
y = dataset['Deaths'].values

plt.figure(figsize=(15, 10))
plt.tight_layout()
seabornInstance.distplot(dataset['Deaths'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# pair each feature with its fitted coefficient (the original built this
# frame from X itself, which never exposed the coefficients)
coeff_df = pd.DataFrame(regressor.coef_, index=['Recovered', 'Confirmed'],
                        columns=['Coefficient'])

y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1 = df.head(25)
df1.plot(kind='bar', figsize=(10, 8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle='-', linewidth='0.5', color='black')
plt.show()

print('Mean absolute error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean squared error:', metrics.mean_squared_error(y_test, y_pred))
print('Root mean squared error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
                  # (constructor truncated in the original; these are its trailing arguments)
                  verbose=True, early_stopping=True, validation_fraction=0.2)
clf.fit(X_train, y_train)
print("Training-set score:", clf.score(X_train, y_train))
# (the original refit the model on the test set at this point, which leaks
# test data; that call is dropped so the scores below reflect the model above)
y_Pred = clf.predict(X_test)
y_Pred[y_Pred > 1] = 1
y_Pred[y_Pred < 0] = 0

# Model scoring
c1 = clf.score(X_test, y_test)
print("Test-set score:", c1)
# MAE
d1 = mean_absolute_error(y_true=y_test, y_pred=y_Pred)
print("mae:", d1)
# MSE
e1 = mean_squared_error(y_true=y_test, y_pred=y_Pred)
print('mean_squared_error: ', e1)
# R² (the original also computed an ad-hoc 1 - MAE/std ratio that was never
# printed; only the standard r2_score is kept)
print("R²:", r2_score(y_test, y_Pred))

# elapsed time
end = time.perf_counter()
print("final is in ", end - start)
rfr = RandomForestRegressor(  # constructor inferred; the original snippet starts mid-call
    max_features='auto',
    max_depth=8,
    min_samples_leaf=4,
    min_samples_split=8,
    oob_score=True,
    # random_state=42,
    criterion='mae',
    n_jobs=-1,
    bootstrap=True)
    # warm_start=False and max_leaf_nodes=30 were commented out in the original

rfr.fit(X_train, y_train)
predictions = rfr.predict(X_test)
# plt.scatter(y_test, predictions)

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('MAPE:', mean_absolute_percentage_error(y_test, predictions))

# columns = net_profit_percent.columns
# print(sorted(zip(map(lambda x: round(x, 4), rfr.feature_importances_), columns), reverse=True))

# subplots method of matplotlib
fig, axes = plt.subplots(nrows=2, ncols=1)
axes[0].scatter(y_test, predictions)
plt.sca(axes[1])  # use the pyplot interface to change just one subplot
plt.xticks(range(X_train.shape[1]), X_train.columns, color='r')
axes[1].bar(range(X_train.shape[1]), rfr.feature_importances_,
            color='b')  # call closed here; the original truncates mid-argument
from sklearn.model_selection import train_test_split

# training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=100)

# importing the LR model
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)

# prediction
predictions = lm.predict(X_test)

# quick visual check
plt.scatter(y_test, predictions)

# regression evaluation metrics (printed so the values are visible outside a notebook)
from sklearn import metrics

print(metrics.mean_absolute_error(y_test, predictions))
print(metrics.mean_squared_error(y_test, predictions))
print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
def polynomial_residual(degree, X, y):
    polynomial_regression = PolynomialRegression(degree=degree)
    polynomial_regression.fit(X, y)
    y_pred = polynomial_regression.predict(X)
    mae = mean_absolute_error(y, y_pred)
    return mae
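# A small usage sketch for polynomial_residual, assuming PolynomialRegression
# is the usual PolynomialFeatures + LinearRegression pipeline; it is defined
# explicitly here so the example is self-contained.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

def PolynomialRegression(degree=2):
    return make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())

rng = np.random.RandomState(0)
X_demo = np.sort(rng.uniform(-3, 3, 80)).reshape(-1, 1)
y_demo = X_demo.ravel() ** 2 + rng.normal(scale=0.5, size=80)
# in-sample MAE shrinks as the degree grows, which is why a residual curve
# like this is usually paired with a held-out set in practice
for d in (1, 2, 5):
    print(d, polynomial_residual(d, X_demo, y_demo))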
# Print '{name} has been fitted'
print(name, 'has been fitted.')

# Show best score
model.best_score_

# Show best parameters for given model
model.best_estimator_

# Import r2_score and mean_absolute_error functions
from sklearn.metrics import r2_score, mean_absolute_error

# Predict test set using fitted random forest
pred = fitted_models['rf'].predict(X_test)

# Calculate and print R^2 and MAE
print('R^2:', r2_score(y_test, pred))
print('MAE:', mean_absolute_error(y_test, pred))

# Classification:

# Display predicted class probabilities
model.predict_proba(X_test)

# Classification metrics
from sklearn.metrics import roc_curve, auc

# Helper function
def fit_and_plot_classifier(clf):
    # Fit model
    clf.fit(X_train, y_train)
    # Predict probabilities and plot the ROC curve (plausible completion;
    # the original snippet ends at the "Fit model" comment)
    probs = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, probs)
    plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
    plt.legend()
sns.displot(y_test - prediction)
# plt.show()
# plt.scatter(y_test, prediction)
# plt.show()

"""Regression Evaluation Metrics

Here are three common evaluation metrics for regression problems:

- Mean Absolute Error (MAE) is the mean of the absolute value of the errors.
- Mean Squared Error (MSE) is the mean of the squared errors.
- Root Mean Squared Error (RMSE) is the square root of the mean of the
  squared errors.

Comparing these metrics:

- MAE is the easiest to understand, because it's the average error.
- MSE is more popular than MAE, because MSE "punishes" larger errors, which
  tends to be useful in the real world.
- RMSE is even more popular than MSE, because RMSE is interpretable in the
  "y" units.

All of these are loss functions, because we want to minimize them."""

from sklearn.metrics import mean_squared_error, mean_absolute_error

print("MAE: ", mean_absolute_error(y_test, prediction))
print("MSE: ", mean_squared_error(y_test, prediction))
print("RMSE: ", np.sqrt(mean_squared_error(y_test, prediction)))

import pickle

# persist the fitted regressor (using a context manager so the file is closed)
with open("regression.pkl", 'wb') as file:
    pickle.dump(regressor, file)
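# A tiny numeric illustration of the comparison above (made-up numbers): two
# error vectors with the same MAE, where the one containing a single large
# error gets a much larger RMSE.
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_true = np.zeros(4)
even_errors = np.array([2.0, 2.0, 2.0, 2.0])    # MAE 2.0, RMSE 2.0
one_big_error = np.array([0.5, 0.5, 0.5, 6.5])  # MAE 2.0, RMSE ~3.28
for errs in (even_errors, one_big_error):
    print(mean_absolute_error(y_true, errs),
          np.sqrt(mean_squared_error(y_true, errs)))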
def run(self):
    """
    For experimental purposes; since the evaluation result is poor, this
    model will not be used for prediction.

    >>> from q3_time_series.model import MultivariateMultiStepLSTM
    >>> # To evaluate
    >>> evaluate_metrics = MultivariateMultiStepLSTM(3, 2).run()
    """
    dataset_train = np.hstack(self.hstacK_generator(self.train))
    dataset_valid = np.hstack(self.hstacK_generator(self.valid_arb))
    X_train, y_train = self.split_sequences(dataset_train, self.n_steps_in, self.n_steps_out)
    X_valid, y_valid = self.split_sequences(dataset_valid, self.n_steps_in, self.n_steps_out)

    model = self.make_model(X_train.shape[2])
    model.fit(X_train, y_train, epochs=200, verbose=0,
              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=20)],
              validation_data=(X_valid, y_valid))
    pred_valid = model.predict(X_valid, verbose=0)
    pred_train = model.predict(X_train, verbose=0)

    tmp = []
    for j, col in enumerate(self.df.columns):
        # first-step-ahead values for this column
        yt = [y_train[i][0][j] for i in range(len(y_train))]
        pt = [pred_train[i][0][j] for i in range(len(pred_train))]
        yv = [y_valid[i][0][j] for i in range(len(y_valid))]
        pv = [pred_valid[i][0][j] for i in range(len(pred_valid))]
        tmp.append({
            col: {
                'rmse_train': sqrt(mean_squared_error(yt, pt)),
                'rmse_val': sqrt(mean_squared_error(yv, pv)),
                'mae_train': mean_absolute_error(yt, pt),
                'mae_val': mean_absolute_error(yv, pv),
                'mape_train': f'{self.mean_absolute_percentage_error(yt, pt)} %',
                # the original compared the validation targets against the
                # training predictions here; fixed to use pred_valid
                'mape_val': f'{self.mean_absolute_percentage_error(yv, pv)} %'
            }
        })
    return tmp
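# The mean_absolute_percentage_error method used above is not shown in this
# snippet; a common definition (an assumption here, and undefined when any
# true value is zero) is:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    # percentage form, matching the '%' suffix in the report above
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100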
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2
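# Minimal usage sketch for eval_metrics, with the imports it relies on; the
# toy vectors are illustrative only.
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

actual = np.array([3.0, -0.5, 2.0, 7.0])
pred = np.array([2.5, 0.0, 2.0, 8.0])
rmse, mae, r2 = eval_metrics(actual, pred)
print(f"RMSE={rmse:.3f} MAE={mae:.3f} R2={r2:.3f}")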