def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    # set up GAM
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:,0])
    
    # run full model
    GAM_results = {}
    for name, y in Y.items():  # DataFrame.iteritems() was removed in pandas 2.0
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k:[] for k in X.columns}
        pred=np.zeros(y.shape[0])
        for train,test in CV.split(X,y):
            Xtrain = X.iloc[train,:]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test,:]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape)>1:
                p=p[:,0]
            pred[test]=p

            if get_importance:    
                # get importances, defined as the predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k,v in importance_out.items():
                    importances[k].append(v)
                    
        cv_scores = [{'r': np.corrcoef(y,pred)[0,1],
                      'R2': np.corrcoef(y,pred)[0,1]**2,
                      'MAE': mean_absolute_error(y,pred)}]
        
        
        # insample
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = [{'r': np.corrcoef(y,in_pred)[0,1],
                          'R2': np.corrcoef(y,in_pred)[0,1]**2,
                          'MAE': mean_absolute_error(y,in_pred)}]
        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
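
# Usage sketch (an illustration, not from the original project): run_GAM expects a
# pandas DataFrame of predictors X and a DataFrame Y with one column per target;
# LinearGAM, s, BalancedKFold and get_importances come from the surrounding module.
#
#   import numpy as np
#   import pandas as pd
#   X = pd.DataFrame(np.random.randn(200, 3), columns=['a', 'b', 'c'])
#   Y = pd.DataFrame({'outcome': 2 * X['a'] + np.random.randn(200)})
#   results = run_GAM(X, Y, folds=5)
#   print(results['outcome']['scores_cv'])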
Example #2
def evaluate(ytest, ypred, filename='metrics.txt'):
    true_result = [1 if item > 0.5 else 0 for item in ytest]
    pred_result = [1 if item > 0.5 else 0 for item in ypred]
    
    cm = confusion_matrix(true_result, pred_result)
    print('\nConfusion matrix:')
    print(cm)
    print("\nLoss classified as loss", cm[0][0])
    print("Wins classified as wins", cm[1][1])
    print("Wins classified as loss", cm[1][0])
    print("Loss classified as wins", cm[0][1])
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result))
    print('Precision:\t', precision_score(true_result, pred_result))
    print('Recall: \t', recall_score(true_result, pred_result))
    print('F1 score:\t', f1_score(true_result, pred_result))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred))
    
    # print to file
    with open(filename, "a") as f:
        print("Loss classified as loss", cm[0][0], file=f)
        print("Wins classified as wins", cm[1][1], file=f)
        print("Wins classified as loss", cm[1][0], file=f)
        print("Loss classified as wins", cm[0][1], file=f)
        print('\nAccuracy:\t', accuracy_score(true_result, pred_result), file=f)
        print('Precision:\t', precision_score(true_result, pred_result), file=f)
        print('Recall: \t', recall_score(true_result, pred_result), file=f)
        print('F1 score:\t', f1_score(true_result, pred_result), file=f)
        print('Mean absolute error:\t', mean_absolute_error(ytest, ypred), file=f)
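
# Usage sketch (an assumption, not part of the original source): evaluate() thresholds
# probability-like scores at 0.5, so it can be exercised with synthetic arrays; it
# relies on the sklearn metric functions imported by the surrounding module.
import numpy as np

rng = np.random.RandomState(0)
y_true_scores = rng.rand(100)  # synthetic ground-truth scores in [0, 1]
y_pred_scores = rng.rand(100)  # synthetic predicted scores in [0, 1]
evaluate(y_true_scores, y_pred_scores, filename='metrics.txt')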
Example #3
def test_regressor(train, test, feature_extractor, target_transformer, regressor):
    (train_raw_X, train_raw_y) = (train, train['SalaryNormalized'])
    (test_raw_X, test_raw_y) = (test, test['SalaryNormalized'])

    print('feature extraction ...')
    train_y = target_transformer.transform(train_raw_y)
    test_y = target_transformer.transform(test_raw_y)
    train_X = feature_extractor.fit_transform(train_raw_X, train_y)
    test_X = feature_extractor.transform(test_raw_X)

    print('fit regression model ...')
    try:
        regressor.fit(train_X, train_y)
        train_raw_yhat = target_transformer.r_transform(regressor.predict(train_X))
        test_raw_yhat = target_transformer.r_transform(regressor.predict(test_X))
    except TypeError:
        # some regressors cannot handle sparse input, so fall back to a dense array
        regressor.fit(train_X.toarray(), train_y)
        train_raw_yhat = target_transformer.r_transform(regressor.predict(train_X.toarray()))
        test_raw_yhat = target_transformer.r_transform(regressor.predict(test_X.toarray()))

    print('evaluate error metrics ...')
    train_error = metrics.mean_absolute_error(train_raw_y, train_raw_yhat)
    test_error = metrics.mean_absolute_error(test_raw_y, test_raw_yhat)
    print('Train error:', train_error)
    print('Test error:', test_error)
def predict_variance_inf_phase1(budget, hum_train_means, temp_train_means, hum_train_vars, temp_train_vars):
    """Method to make predictions based on max-variance active inference."""         
    start_hum = 0
    window_hum = None
    window_temp = None
    i = 0

    hum_preds = np.ones((50, 96))
    temp_preds = np.ones((50, 96))

    for t in global_times:
        if budget > 0:
            window_hum = np.argpartition(hum_train_vars[t], -budget)[-budget:]
            window_temp = np.argpartition(temp_train_vars[t], -budget)[-budget:]
        else:
            window_hum = np.array([])
            window_temp = np.array([])

        hum_pred, temp_pred = makePreds_phase1(window_hum, window_temp, hum_train_means, temp_train_means, i, t)

        hum_preds[:, i] = copy.deepcopy(hum_pred)
        temp_preds[:, i] = copy.deepcopy(temp_pred)
        
        i += 1

    hum_mean_err = mean_absolute_error(hum_test, hum_preds)
    temp_mean_err = mean_absolute_error(temp_test, temp_preds)

    return hum_preds, temp_preds, hum_mean_err, temp_mean_err
Example #5
def make_model(data,tc):

    train_data = data.sample(frac=.8)
    test_data = data.drop(train_data.index)
    train_y = train_data['T/Tc']
    train_X = train_data.drop(['T/Tc','temperature'], axis=1)
    test_y = test_data['T/Tc']
    test_X = test_data.drop(['T/Tc','temperature'], axis=1)

#    model = XGBClassifier(n_estimators = 1000,max_depth=8, learning_rate=0.05)
#    model.fit(train_X, train_y, early_stopping_rounds=10,
#                 eval_set=[(test_X, test_y)], verbose=True)
#    xgb.plot_tree(model)

    model = svm.SVC(kernel='rbf', gamma=1, C=1, verbose = True)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    print("Mean Absolute Error : " + str(mean_absolute_error(np.array(predictions), test_y)))

    train_y = train_data['temperature']/tc
    test_y = test_data['temperature']/tc

#    model2 = XGBRegressor(n_estimators = 1000,max_depth=8, learning_rate=0.05)
#    model2.fit(train_X, train_y, early_stopping_rounds=10,eval_metric='mae',
#                 eval_set=[(test_X, test_y)], verbose=True)

    model2 = svm.SVR(kernel='rbf', gamma=.5, C=1, verbose = True)
    model2.fit(train_X, train_y)

    predictions = model2.predict(test_X)
    print("Mean Absolute Error : " + str(mean_absolute_error(np.array(predictions), test_y)))

    return [model,model2]
Example #6
def compute_mse(model,x_train_current_tmp,YTrain,x_test_current_tmp,YTest, score ,values_TM = []):
    model.fit(x_train_current_tmp, YTrain)
    y_pred_train = model.predict(x_train_current_tmp)
    y_pred_test = model.predict(x_test_current_tmp)

    if len(values_TM)!=0:
        abs_error_train = 100.*mean_absolute_error(YTrain,y_pred_train)*len(YTrain)/(89.7* values_TM[0, 0] * values_TM[0,1])
        print("abs train", abs_error_train)

        abs_error_test = 100.*mean_absolute_error(YTest,y_pred_test)*len(YTest)/(89.7* values_TM[1, 0] * values_TM[1,1])
        print("abs test", abs_error_test)

        mse_error_train = 100.*np.sqrt(mean_squared_error(YTrain,y_pred_train)*len(YTrain)/(values_TM[0, 0] * values_TM[0, 1]))/(89.7)
        print("mean squared error train", mse_error_train )

        mse_error_test = 100.*np.sqrt(mean_squared_error(YTest,y_pred_test)*len(YTest)/(values_TM[1, 0] * values_TM[1, 1]))/(89.7)
        print("mean squared error test", mse_error_test )

    if score=="mean_squared_error":
        new_loss = mean_squared_error(YTest,y_pred_test)
    elif score== "mean_absolute_error":
        new_loss = mean_absolute_error(YTest,y_pred_test)
    else:
        new_loss = r2_score(YTest,y_pred_test)
    beta = model.coef_

    if x_train_current_tmp.shape[1]==1:
        beta = np.array([beta])
    beta = beta.reshape([len(beta),1])

    return new_loss, beta
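
# Usage sketch (an illustration, not from the original source): compute_mse expects a
# linear estimator that exposes coef_ after fitting, e.g. scikit-learn's Ridge, plus
# the metric functions imported by the surrounding module.
#
#   from sklearn.linear_model import Ridge
#   import numpy as np
#   rng = np.random.RandomState(0)
#   Xtr, Xte = rng.randn(80, 4), rng.randn(20, 4)
#   ytr, yte = Xtr.sum(axis=1) + rng.randn(80), Xte.sum(axis=1)
#   loss, beta = compute_mse(Ridge(alpha=1.0), Xtr, ytr, Xte, yte, "mean_absolute_error")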
Example #7
def cross_val(regressor_high,regressor_low,classifier,train):
	rows=random.sample(train.index, int(train.shape[0]*0.75))
	sample = train.ix[rows]

	crime=pd.DataFrame(sample.Total_Crime_Risk,dtype=int)
	crime['highcrime']=0
	crime.highcrime[crime.Total_Crime_Risk>crime.Total_Crime_Risk.median()]=1
	crime['GEOGRAPHY_ID']=sample.GEOGRAPHY_ID
	sample=sample.drop(train.columns[[0,-2,-1]], axis=1)
		
	model=classifier.fit(sample, crime.highcrime)
	Highcrime=model.predict(sample)
	Highcrime=np.array(Highcrime)
	sample['predicted_highcrime']=Highcrime
	
	high_areas=sample.ix[sample.predicted_highcrime==1]
	high_areas=pd.merge(high_areas, crime, on='GEOGRAPHY_ID', how= 'inner')
	high_areas_crime=high_areas.Total_Crime_Risk
	high_areas=high_areas.drop(high_areas.columns[[-1,-2,-3]],axis=1)

	low_areas=sample.ix[sample.predicted_highcrime==0]
	low_areas=pd.merge(low_areas, crime, on='GEOGRAPHY_ID', how= 'inner')
	low_areas_crime=low_areas.Total_Crime_Risk
	low_areas=low_areas.drop(low_areas.columns[[-1,-2,-3]],axis=1)

	model_high=regressor_high.fit(high_areas, high_areas_crime)
	high_crime=model_high.predict(high_areas)
	model_low=regressor_low.fit(low_areas, low_areas_crime)
	low_crime=model_low.predict(low_areas)

	high_error=mean_absolute_error(high_areas_crime,high_crime)
	low_error=mean_absolute_error(low_areas_crime,low_crime)
	print(high_error, low_error, (high_error + low_error) / 2)
Example #8
def prediction_performance(model, Xtest, Ytest, numberCategories):
    # Calculate metric for logistic regression performance.
    if(numberCategories == 1):
        # Get metrics for binary classification.
        YDistribution = model.predict_proba(Xtest)[:,1]
        YClassification = model.predict(Xtest)
        auc = roc_auc_score(Ytest, YDistribution)
        print("AUC", auc)
        MAE = mean_absolute_error(Ytest, YDistribution)
        print("MAE", MAE)
        accuracy = 1 - mean_absolute_error(YClassification, Ytest)
        print("Accuracy", accuracy)
        metrics = [accuracy, auc, MAE]
    else:
        # Get metric for multiple class classification.
        YPredictions = model.predict(Xtest)
        YDistribution = model.predict_proba(Xtest)
        YTestLabels = label_data(Ytest)
        accuracy = model.score(Xtest, YTestLabels)
        print("Accuracy", accuracy)
        avAUC = evaluate_auc_score(model, Xtest, Ytest)
        print("Av AUC", avAUC)
        #auc = roc_auc_score(Ytest, YPredictions)
        MAE = mean_absolute_error(Ytest, YDistribution)
        print("MAE", MAE)
        metrics = [accuracy, avAUC, MAE]
    return metrics
Example #9
def normalEquation(features, features_validation, values, values_validation):
    # Normal equation: theta = pinv(X^T X) X^T y

    M = numpy.dot(features.T, features)
    print("Transpose of features times features")
    print(M.shape)
    M = numpy.array(M)
    print("Converted to array")
    print(M.shape)
    M = numpy.linalg.pinv(M)
    print("Pseudo-inverse")
    print(M.shape)
    M = numpy.dot(M, features.T)
    print("Multiplied by the transpose of features")
    print(M.shape)
    theta = numpy.dot(M, values)
    #M = numpy.linalg.pinv(M)

    print(theta.shape)
    print(features.shape)
    print(theta)

    predictions = numpy.dot(theta, features.T)
    pred_validation = numpy.dot(theta, features_validation.T)

    print(predictions)

    print("MEAN ABSOLUTE ERROR")
    print(mean_absolute_error(values, predictions))

    print("MEAN ABSOLUTE ERROR (validation)")
    print(mean_absolute_error(values_validation, pred_validation))
Example #10
 def test_continue_train(self):
     X, y = load_boston(True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
         'objective': 'regression',
         'metric': 'l1',
         'verbose': -1
     }
     lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
     init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
     model_name = 'model.txt'
     init_gbm.save_model(model_name)
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=30,
                     valid_sets=lgb_eval,
                     verbose_eval=False,
                     # test custom eval metrics
                     feval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)),
                     evals_result=evals_result,
                     init_model='model.txt')
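     # The custom feval above follows LightGBM's contract of returning
     # (eval_name, eval_value, is_higher_better); MAE is an error, hence False.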
     ret = mean_absolute_error(y_test, gbm.predict(X_test))
     self.assertLess(ret, 3.5)
     self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
     for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
         self.assertAlmostEqual(l1, mae, places=5)
     os.remove(model_name)
Example #11
def tst(X, Y, k=3, rad=4, mode='k'):
    trX = X[:-1200]
    trY = Y[:-1200]
    tstX = X[-400:]
    tstY = Y[-400:]

    nnlr = NNLR(k, rad, mode)

    nnlr.fit(trX, trY)

    pred = nnlr.predict(trX)
    print('Training Set')
    print('Root Mean Squared Error')
    print(mean_squared_error(trY, pred) ** .5)
    print('Mean Absolute Error')
    print(mean_absolute_error(trY, pred))
    # print(list(zip(pred, trX))[:5])
    print(nnlr.active)

    pred = nnlr.predict(tstX)
    print('Test Set')
    print('Root Mean Squared Error')
    print(mean_squared_error(tstY, pred) ** .5)
    print('Mean Absolute Error')
    print(mean_absolute_error(tstY, pred))
    # print(list(zip(pred, tstY))[:5])
    print(nnlr.active)
Example #12
def main():
    DOC = """
================================================================================
    Compare the prediction accuracy of different models on the boston dataset
================================================================================
    """
    print(DOC)
    from sklearn import cross_validation, datasets
    boston = datasets.load_boston()
    X, y = boston.data, np.round(boston.target)
    #X -= X.mean()
    y -= y.min()

    idx = np.argsort(y)
    X = X[idx]
    y = y[idx]
    cv = cross_validation.ShuffleSplit(y.size, n_iter=50, test_size=.1, random_state=0)
    score_logistic = []
    score_ordinal_logistic = []
    score_ridge = []
    for i, (train, test) in enumerate(cv):
        #test = train
        if not np.all(np.unique(y[train]) == np.unique(y)):
            # we need the train set to have all different classes
            continue
        assert np.all(np.unique(y[train]) == np.unique(y))
        train = np.sort(train)
        test = np.sort(test)
        w, theta = ordinal_logistic_fit(X[train], y[train], verbose=True,
                                        solver='TNC')
        pred = ordinal_logistic_predict(w, theta, X[test])
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (ORDINAL)  fold %s: %s' % (i+1, s))
        score_ordinal_logistic.append(s)

        from sklearn import linear_model
        clf = linear_model.LogisticRegression(C=1.)
        clf.fit(X[train], y[train])
        pred = clf.predict(X[test])
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (LOGISTIC) fold %s: %s' % (i+1, s))
        score_logistic.append(s)

        from sklearn import linear_model
        clf = linear_model.Ridge(alpha=1.)
        clf.fit(X[train], y[train])
        pred = np.round(clf.predict(X[test]))
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (RIDGE)    fold %s: %s' % (i+1, s))
        score_ridge.append(s)


    print()
    print('MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC):    %s' % np.mean(score_ordinal_logistic))
    print('MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): %s' % np.mean(score_logistic))
    print('MEAN ABSOLUTE ERROR (RIDGE REGRESSION):    %s' % np.mean(score_ridge))
    # print('Chance level is at %s' % (1. / np.unique(y).size))
    
    return np.mean(score_ridge)
Example #13
def testModel(
    model, layerSizes, Xtrain, Ytrain, Xtest, Ytest, learningRate, epochs,
    batchSize, optimizer, resultsFile = "lossOptLog.txt", printResults = False,
    elapsedTime = False):
    lossCategories = Ytrain.shape[1]
    numberHiddenLayers = len(layerSizes) - 2
    inputLayerSize = layerSizes[0]
    units1 = layerSizes[1]
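    # NOTE: dropouts, units2, units3 and patience are assumed to be module-level
    # globals in the original project; they are not parameters of this function.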
    dropout1 = dropouts[0]
    dropout2 = dropouts[1]
    dropout3 = dropouts[2]
    # Test MAE of model on training data (to check for overfitting).
    trainingPredY = model.predict_proba(Xtrain, verbose = 0)
    MAETrain = mean_absolute_error(Ytrain, trainingPredY)

    # Test MAE on test data.
    testPredY = model.predict(Xtest, verbose = 0)
    MAE = mean_absolute_error(Ytest, testPredY)

    # Calculate AUC for each category.
    auc = [0] * lossCategories
    """
    for i in range(0, lossCategories):
        categoryValues = Ytest[:][i:(i+1)]
        categoryPredictions = testPredY[:][i:(i+1)]
        auc[i] = roc_auc_score(categoryPredictions, categoryValues)
    aucAverage = (sum(auc) / len(auc))
    """
    aucAverage = 0
    # Evaluate the model and write results to a file.
    scores = model.evaluate(Xtest, Ytest, verbose = 0)
    testAccuracy = scores[1]
    scores = model.evaluate(Xtrain, Ytrain, verbose = 0)
    trainAccuracy = scores[1]
    if(printResults):
        print("Training MAE: %.2f%%" % (MAETrain * 100))
        print("acc: %.2f%%" % (testAccuracy*100))
        print("auc: %.2f%%" % (aucAverage*100))
        print("MAE: %.2f%%" % (MAE*100))
        print("%s , %s , %s, %s, %s , %s , %s , %s , %s , %s, %s \n"
            % (units1, units2, units3, learningRate, epochs, batchSize,
            patience, optimizer, dropout1, dropout2, dropout3))
        print("\n")        
    # Write model results to a file.
    if(elapsedTime is not False):
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s , %s , %s, %s , %s , %s , %s , %s , %s , %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (elapsedTime, MAETrain, trainAccuracy, testAccuracy, aucAverage, MAE, units1,
                units2, units3, learningRate, epochs, batchSize, patience,
                optimizer, dropout1, dropout2, dropout3))
    else:
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s , %s , %s , %s , %s , %s ,%s , %s, %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (MAETrain, trainAccuracy, testAccuracy, aucAverage, MAE, units1, units2, units3,
                learningRate, epochs, batchSize, optimizer, dropout1, dropout2,
                dropout3))
Example #14
def main():
    # load the series
    dtst = Datasets()
    serie = dtst.Leitura_dados(dtst.bases_linear_graduais(3, 35))
    serie = np.asarray(serie)
    particao = Particionar_series(serie, [0.0, 0.0, 0.0], 0)
    serie = particao.Normalizar(serie)
    
    '''
    ELM = ELMRegressor()
    ELM.Tratamento_dados(serie, [0.8, 0.2, 0.2], 4)
    
    # create a list for the data
    lista_dados = []
    lista_dados.append(ELM.train_entradas)
    lista_dados.append(ELM.train_saidas)
    lista_dados.append(ELM.val_entradas)
    lista_dados.append(ELM.val_saidas)
    lista_dados.append(ELM.teste_entradas)
    lista_dados.append(ELM.teste_saidas)
    
    # optimize the ELM architecture
    ELM.Otimizar_rede(10, lista_dados)
    '''
        
    # train the ELM on the training inputs and outputs
    #ELM = ELMRegressor(ELM.neuronios_escondidos)
    ELM = ELMRegressor(5)
    ELM.Tratamento_dados(serie, [0.8, 0.2, 0.2], 4)
    ELM.Treinar(ELM.train_entradas, ELM.train_saidas)
    
    # ELM prediction on the training set
    prediction_train = ELM.Predizer(ELM.train_entradas)
    MAE_train = mean_absolute_error(ELM.train_saidas, prediction_train)
    print('Training MAE: ', MAE_train)
    
    # ELM prediction on the test set
    prediction_test = ELM.Predizer(ELM.teste_entradas)
    MAE_test = mean_absolute_error(ELM.teste_saidas, prediction_test)
    print('Test MAE: ', MAE_test)
    
    
    # prediction plot for the training set
    plt.plot(ELM.train_saidas, label='Training actual', color='Blue')
    plt.plot(prediction_train, label='Training prediction', color='Red')
    plt.title('Training plot, MAE: %s' % MAE_train)
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # prediction plot for the test set
    plt.plot(ELM.teste_saidas, label='Test actual', color='Blue')
    plt.plot(prediction_test, label='Test prediction', color='Red')
    plt.title('Test plot, MAE: %s' % MAE_test)
    plt.legend()
    plt.tight_layout()
    plt.show()
Example #15
def blended_scorer(estimator, X, y):
    ols_preds = ols_preds_for_Xs(X)
    pred_y = estimator.predict(X)
    msg("BLENDED SCORES FOR a CV GROUP:")
    for blend in np.arange(0, 1.01, 0.1):
        blended_prediction = (blend * ols_preds) + ((1.0 - blend) * pred_y)
        blended_score = mean_absolute_error(blended_prediction, y)
        msg("%f * OLS yields score of %f" % (blend, blended_score))
    return mean_absolute_error(y, pred_y)
Example #16
def test_clf(X, y, clf, test_size=0.2, num=20):
    ylist = y.T.tolist()[0]
    train = numpy.zeros(num)
    cross = numpy.zeros(num)
    for i in xrange(num):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, ylist, test_size=test_size)
        clf.fit(X_train, y_train)
        train[i] = mean_absolute_error(clf.predict(X_train), y_train)
        cross[i] = mean_absolute_error(clf.predict(X_test), y_test)
    return (train.mean(), train.std()), (cross.mean(), cross.std())
def build_SGDRegressor(train_X, train_y, test_X, test_y):
    ##########
    log_train_y = np.log(train_y)
    ##########
    sgd_regressor = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=0.001, l1_ratio=0.15, verbose=True, n_iter = 50)
    sgd_regressor.fit(train_X, log_train_y)
    train_yhat = np.exp(sgd_regressor.predict(train_X))
    test_yhat = np.exp(sgd_regressor.predict(test_X))
    print(metrics.mean_absolute_error(train_y, train_yhat))
    print(metrics.mean_absolute_error(test_y, test_yhat))
Example #18
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(True):
    # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(True):
    # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true),
                        0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def build_boostedTree(train_X, train_y, test_X, test_y):
    bt = ensemble.GradientBoostingRegressor(loss = 'lad', 
        learning_rate= 0.1, n_estimators=100, subsample=0.3, max_depth=3, max_features=50, 
        verbose = 1)
    bt_train_X = train_X
    bt_test_X = test_X
    bt.fit(bt_train_X.toarray(), train_y)
    train_yhat = bt.predict(bt_train_X.toarray())
    test_yhat = bt.predict(bt_test_X.toarray())
    print(metrics.mean_absolute_error(train_y, train_yhat))
    print(metrics.mean_absolute_error(test_y, test_yhat))
Example #20
File: test.py  Project: mircean/ML
def zillow_keras(parameters, X_train, X_dev, Y_train, Y_dev, random_seed=None):
    #random seed
    if random_seed != None:
        np.random.seed(random_seed)
    
    hidden_1, hidden_2, activation, epochs, learning_rate, batch_size, method, momentum = parameters

    if activation == 'tanh':
        initializer = keras.initializers.lecun_normal()
    elif activation == 'relu':
        initializer = keras.initializers.he_normal()
    else:
        raise ValueError('activation')

    regularizer = None
    #regularizer = regularizers.l2(0.01)

    model = Sequential()
    model.add(Dense(hidden_1, input_dim=X_train.shape[1], kernel_initializer=initializer, activation=activation, kernel_regularizer=regularizer))
    model.add(Dense(hidden_2, kernel_initializer=initializer, activation=activation, kernel_regularizer=regularizer))
    model.add(Dense(1, kernel_initializer=initializer, kernel_regularizer=regularizer))

    if method == 'GD':
        optimizer = optimizers.SGD(lr=learning_rate, momentum=momentum)
    elif method == 'RMSProp':
        optimizer = optimizers.RMSprop(lr=learning_rate)
    elif method == 'Adam':
        optimizer = optimizers.Adam(lr=learning_rate)
    else:
        raise ValueError('method')

    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])

    initial_epoch = 0
    #verbose = 1
    verbose = 0
    verbose2 = 100
    #verbose2 = None
    results = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, callbacks=[KerasCallback(verbose2)], validation_data = (X_dev, Y_dev), verbose=verbose, initial_epoch=initial_epoch)

    Y_predict = model.predict(X_train)
    try:
        accuracy_train = mean_absolute_error(Y_train, Y_predict)
    except ValueError:
        accuracy_train = 0
    Y_predict = model.predict(X_dev)
    try:
        accuracy_dev = mean_absolute_error(Y_dev, Y_predict)
    except ValueError:
        accuracy_dev = 0

    print(accuracy_train, accuracy_dev, hidden_1, hidden_2, activation, epochs, learning_rate, batch_size, method, momentum)
    
Example #21
def scorer_gbr_lad(clf, X, y, verbose=1):
    """Scorer for GradientBoostingRegressor with loss='lad'."""
    y_pred = clf.predict(X)
    score = -mean_absolute_error(y, y_pred)
    if verbose > 0:
        print("Eout=", -score, file=sys.stderr)
        if 'staged_predict' in dir(clf):
            print("Staged predicts (Eout)")
            for i, y_pred in enumerate(clf.staged_predict(X)):
                Eout = mean_absolute_error(y, y_pred)
                print("tree %3d, test score %f" % (i + 1, Eout))
    return score
Example #22
def runRegressor(clf, featureMat, targets, no_of_training_example):
    try:
        clf.fit(featureMat[:no_of_training_example, :], targets[:no_of_training_example])
        y_pred = clf.predict(featureMat[no_of_training_example:, :])
        print('Variance Score')
        print(explained_variance_score(targets[no_of_training_example:], y_pred))
        print('Mean absolute error')
        print(mean_absolute_error(targets[no_of_training_example:], y_pred))
        print('Explained variance score')
        print(explained_variance_score(targets[no_of_training_example:], y_pred))
    except Exception as e:
        print(e)
Example #23
def test_clf_kfold(X, y, clf, folds=10):
    train = numpy.zeros(folds)
    cross = numpy.zeros(folds)
    for i, (train_idx, test_idx) in enumerate(cross_validation.KFold(y.shape[0], n_folds=folds)):
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx].T.tolist()[0]
        y_test = y[test_idx].T.tolist()[0]
        clf.fit(X_train, y_train)
        train[i] = mean_absolute_error(clf.predict(X_train), y_train)
        cross[i] = mean_absolute_error(clf.predict(X_test), y_test)
    return (train.mean(), train.std()), (cross.mean(), cross.std())
Example #24
def gd_method(f, f_valid, v, v_validation):
    m = len(v)
    m_validation = len(v_validation)

    # Normalize data
    features, mu, sigma = normalize_features(f)

    # Normalize validation data (using the mean and std calculated when normalizing the model)
    features_validation, mu, sigma = normalize_features(f_valid)

    features['ones'] = numpy.ones(m)  # Add a column of 1s (y intercept)
    features_validation['ones'] = numpy.ones(m_validation)  # Add a column of 1s (y intercept)

    # Convert features and values to numpy arrays
    features_array = numpy.array(features)
    values_array = numpy.array(v)

    # Set values for alpha, number of iterations.
    alpha = 0.04  # please feel free to change this value
    num_iterations = 1000  # please feel free to change this value

    # Initialize theta, perform gradient descent
    theta_gradient_descent = numpy.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array,
                                                            values_array,
                                                            theta_gradient_descent,
                                                            alpha,
                                                            num_iterations)
    plot = None
    # -------------------------------------------------
    # Uncomment the next line to see your cost history
    # -------------------------------------------------
    plot = plot_cost_history(alpha, cost_history)
    print(plot)
    #
    # Please note, there is a possibility that plotting
    # this in addition to your calculation will exceed
    # the 30 second limit on the compute servers.

    # Predictions for the data used to define the model
    predictions = numpy.dot(features, theta_gradient_descent)
    print(predictions)

    # Predictions for the validation data
    pred_validation = numpy.dot(features_validation, theta_gradient_descent)
    print(pred_validation)

    # Compute error using mean absolute error
    print("MEAN ABSOLUTE ERROR")
    print(mean_absolute_error(v, predictions))

    print("MEAN ABSOLUTE ERROR (validation)")
    print(mean_absolute_error(v_validation, pred_validation))
def testModel(
    model, layerSizes, Xtrain, Ytrain, Xtest, Ytest, learningRate, epochs,
    batchSize, optimizer, resultsFile = "selfOptLog.txt", printResults = False,
    elapsedTime = False):
    numberHiddenLayers = len(layerSizes) - 2
    inputLayerSize = layerSizes[0]
    units1 = layerSizes[1]
    dropout1 = dropouts[0]
    dropout2 = dropouts[1]
    dropout3 = dropouts[2]
    # Test MAE of model on training data (to check for overfitting).
    trainingPredY = model.predict_proba(Xtrain, verbose = 0)
    MAETrain = mean_absolute_error(Ytrain, trainingPredY)
    scores = model.evaluate(Xtrain, Ytrain, verbose=0)
    trainingAccuracy = scores[1] * 100

    # Test MAE, AUC, and Accuracy on test data.
    testPredY = model.predict_proba(Xtest, verbose = 0)
    auc = roc_auc_score(Ytest, testPredY)
    MAE = mean_absolute_error(Ytest, testPredY)
    bestGuess = [] # The estimated 0 or 1 output.
    for predictionCount in range (0, len(testPredY)):
        if(testPredY[predictionCount] < 0.5):
            bestGuess.append(0)
        else:
            bestGuess.append(1)
    testAccuracy = 1 - mean_absolute_error(bestGuess, Ytest)
                                    
    # Evaluate the model and write results to a file.
    if(printResults):
        print("Training MAE: %.2f%%" % (MAETrain * 100))
        print("acc: %.2f%%" % (testAccuracy*100))
        print("AUC: %.2f%%" % (auc*100))
        print("MAE: %.2f%%" % (MAE*100))
        print("%s , %s , %s, %s, %s , %s , %s , %s , %s , %s, %s \n"
            % (units1, units2, units3, learningRate, epochs, batchSize,
            patience, optimizer, dropout1, dropout2, dropout3))
        print("\n")        
    # Write model results to a file.
    if(elapsedTime is not False):
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s, %s , %s , %s, %s , %s , %s , %s , %s , %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (elapsedTime, MAETrain, trainingAccuracy, testAccuracy, MAE, units1,
                units2, units3, learningRate, epochs, batchSize, patience,
                optimizer, dropout1, dropout2, dropout3))
    else:
        with open(resultsFile, "a") as text_file:
            text_file.write(
                "%s , %s , %s , %s , %s , %s , %s , %s, %s , %s , %s , %s , %s , %s, %s \n"
                % (MAETrain, trainingAccuracy, testAccuracy, auc, MAE, units1, units2, units3,
                learningRate, epochs, batchSize, optimizer, dropout1, dropout2,
                dropout3))
Example #26
def test_regression_multioutput_array():
    y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
    y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]

    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r = r2_score(y_true, y_pred, multioutput='raw_values')
    evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')

    assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2)
    assert_array_almost_equal(mae, [0.25, 0.625], decimal=2)
    assert_array_almost_equal(r, [0.95, 0.93], decimal=2)
    assert_array_almost_equal(evs, [0.95, 0.93], decimal=2)
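
    # Worked check of the per-output MAE above: column 0 absolute errors are
    # |1-1|, |2.5-2|, |4.5-5|, |5-5| -> mean 0.25; column 1 absolute errors are
    # |2-1|, |-1-(-1)|, |3-4|, |7-6.5| -> mean 0.625, matching [0.25, 0.625].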

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    y_true = [[0, 0]]*4
    y_pred = [[1, 1]]*4
    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r = r2_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(mse, [1., 1.], decimal=2)
    assert_array_almost_equal(mae, [1., 1.], decimal=2)
    assert_array_almost_equal(r, [0., 0.], decimal=2)

    r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values')
    assert_array_almost_equal(r, [0, -3.5], decimal=2)
    assert_equal(np.mean(r), r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
                 multioutput='uniform_average'))
    evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
                                   multioutput='raw_values')
    assert_array_almost_equal(evs, [0, -1.25], decimal=2)

    # Checking for the condition in which both numerator and denominator is
    # zero.
    y_true = [[1, 3], [-1, 2]]
    y_pred = [[1, 4], [-1, 1]]
    r2 = r2_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(r2, [1., -3.], decimal=2)
    assert_equal(np.mean(r2), r2_score(y_true, y_pred,
                 multioutput='uniform_average'))
    evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(evs, [1., -3.], decimal=2)
    assert_equal(np.mean(evs), explained_variance_score(y_true, y_pred))

    # Handling msle separately as it does not accept negative inputs.
    y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
    y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])
    msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
    msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred),
                               multioutput='raw_values')
    assert_array_almost_equal(msle, msle2, decimal=2)
Example #27
def rf_regressor(rf_model, train_x, train_y, valid_x, valid_y, generate_csv=False, f_selection=False, n_features=3):
    global fobj
    est = rf_model.fit(train_x, train_y)
    print('feature importance: ', est.feature_importances_)
    if f_selection:
        print("Feature Selection Enabled - feature count: %r" % (n_features))
        rf_model = RFE(rf_model, n_features, step=1)

    train_y_pred = est.predict(train_x)
    error = mt.mean_absolute_error(train_y, train_y_pred)
    fobj.write('Train Error: %r\n' % (error))
    valid_y_pred = est.predict(valid_x)
    return mt.mean_absolute_error(valid_y, valid_y_pred), error, est
Example #28
def do_cmp(sn1, sn2="submission_model02"):
    def load(fn):
        print(fn, end=' ')
        fn = '%s/%s.csv.gz' % (Project().datapath, fn)
        with gzip.open(fn, 'rt') as fp:
            reader = csv.reader(fp)
            header = next(reader)
            return np.array(list(reader), dtype=float)[:, 1:]
    s1 = load(sn1)
    print()
    s2 = load(sn2)
    print("mae: %.4f" % mean_absolute_error(s1, s2))
    s2 = load('submission_mix')
    print("mae2: %.4f" % mean_absolute_error(s1, s2))
Example #29
def test_trigonometric():
    """Check that using trig functions work and that results differ"""

    est1 = SymbolicRegressor(random_state=0)
    est1.fit(boston.data[:400, :], boston.target[:400])
    est1 = mean_absolute_error(est1.predict(boston.data[400:, :]),
                               boston.target[400:])

    est2 = SymbolicRegressor(trigonometric=True, random_state=0)
    est2.fit(boston.data[:400, :], boston.target[:400])
    est2 = mean_absolute_error(est2.predict(boston.data[400:, :]),
                               boston.target[400:])

    assert_true(abs(est1 - est2) > 0.01)
Example #30
def test_subsample():
    """Check that subsample work and that results differ"""

    est1 = SymbolicRegressor(max_samples=1.0, random_state=0)
    est1.fit(boston.data[:400, :], boston.target[:400])
    est1 = mean_absolute_error(est1.predict(boston.data[400:, :]),
                               boston.target[400:])

    est2 = SymbolicRegressor(max_samples=0.7, random_state=0)
    est2.fit(boston.data[:400, :], boston.target[:400])
    est2 = mean_absolute_error(est2.predict(boston.data[400:, :]),
                               boston.target[400:])

    assert_true(abs(est1 - est2) > 0.01)
                                                    random_state=0)

# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,    # number of decision trees to build; higher is usually more accurate but slower to train
    learning_rate=0.1,    # how much each additional tree influences the overall prediction; lower rates usually need a large n_estimators
    max_depth=6,          # how many layers deep each individual decision tree can be
    min_samples_leaf=9,   # how many times a value must appear in the training set before a tree may split on it
    max_features=0.1,     # fraction of features randomly considered each time a branch is created
    loss='huber'          # how the model's error (cost) is computed as it learns; huber is robust to outliers
)
# Train the model on our training data set by calling scikit-learn's fit function
model.fit(X_train, y_train)

# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')

# Find the error rate on the training set
mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mae)

# Find the error rate on the test set
mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mae)
Example #32
 def overall_mae(self):
     return mean_absolute_error(self._observed, self._predicted)
Example #33
 def getcoef(self, model, y, ypred):
     # The coefficients
     print('With coefficients: \n', model.coef_)
     # Mean absolute error of the predictions
     e = mean_absolute_error(self.y_train, ypred)
     print("mean absolute error = " + str(e))
def evaluate_ensemble(members, weights, testX, testy):
    # make prediction
    yhat = ensemble_predictions(members, weights, testX, testy)
    # calculate MAE
    return mean_absolute_error(testy, yhat), yhat
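
# ensemble_predictions is defined elsewhere in the original project; the sketch below
# is an assumption for illustration only, treating it as a weighted average of the
# member models' predictions.
def ensemble_predictions(members, weights, testX, testy=None):
    import numpy as np
    # stack each member model's predictions and combine them with the given weights
    member_preds = np.array([member.predict(testX) for member in members])
    return np.average(member_preds, axis=0, weights=weights)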
Example #35
dataset_labels = land_data['price']

from sklearn.model_selection import train_test_split

train_features,test_features,train_labels,test_labels = \
    train_test_split(dataset_features,dataset_labels,test_size=0.25, random_state=21)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(train_features, train_labels)
predicted_price = regressor.predict(test_features)

from sklearn import metrics
print('Linear regression')
print('Mean Absolute Error:',
      metrics.mean_absolute_error(test_labels, predicted_price))

print('Mean Squared Error:',
      metrics.mean_squared_error(test_labels, predicted_price))

print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(test_labels, predicted_price)))

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(train_features, train_labels)
predicted_price = regressor.predict(test_features)

from sklearn import metrics
print('Random forest regressor')
print('Mean Absolute Error:',
      metrics.mean_absolute_error(test_labels, predicted_price))
Example #36
sns.distplot(transformed_y[train:])

from sklearn.metrics import mean_absolute_error
regressor = MLPRegressor(hidden_layer_sizes=[20, 20],
                         activation='relu',
                         max_iter=1000,
                         random_state=1)
regressor.fit(shuffled_X[:train], transformed_y[:train])
"R2 %.3f, ошибка в возрасте: %.2f, разброс значений возраста %.2f" % (
    r2_score(
        shuffled_y[train:],
        target_processor.inverse_transform(
            regressor.predict(shuffled_X[train:]).reshape(-1, 1))),
    mean_absolute_error(
        shuffled_y[train:],
        target_processor.inverse_transform(
            regressor.predict(shuffled_X[train:]).reshape(
                -1, 1))), shuffled_y[train:].std())

# Now let's look at classification on the Iris dataset
# classifier = MLPClassifier(
#     hidden_layer_sizes=[32, 12],
#     activation='tanh',
#     max_iter=1000,
#     random_state=1)
# X_changed = MinMaxScaler(
#     feature_range=(-1, 1)
# ).fit_transform(X)
# scores = cross_val_score(
#     classifier,
#     X=X_changed,
Example #37
    def run(
        self,
        country: Countries,
        action: str = 'evaluate'
    ) -> Union[pd.DataFrame, Dict[Union[Literal["Singapore"], Literal["China"],
                                        Literal["India"]], Dict[str, Union[
                                            Union[float, str], Any]]]]:
        """
        >>> from q3_time_series.model import UnivariateMultiStepLSTM
        >>> # To Evaluate
        >>> evaluate_metrics = UnivariateMultiStepLSTM(3,2).run('Singapore', "evaluate")
        >>> # To Predict
        >>> prediction = UnivariateMultiStepLSTM(3,2).run('Singapore', 'predict')
        """
        assert country in Countries.__args__, \
            f"{country} is not supported, please choose between {Countries.__args__}"
        X_train, y_train = self.split_sequence(self.train[country].values,
                                               self.n_steps_in,
                                               self.n_steps_out)
        X_valid, y_valid = self.split_sequence(self.valid_arb[country].values,
                                               self.n_steps_in,
                                               self.n_steps_out)

        X_train = X_train.reshape(
            (X_train.shape[0], X_train.shape[1], self.n_features))
        X_valid = X_valid.reshape(
            (X_valid.shape[0], X_valid.shape[1], self.n_features))

        model = self.make_model()
        model.fit(X_train,
                  y_train,
                  epochs=200,
                  verbose=0,
                  callbacks=[
                      tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=20)
                  ],
                  validation_data=(X_valid, y_valid))

        if action == 'predict':
            input = (self.df[country][-self.n_steps_in:].values).reshape(
                (1, self.n_steps_in, self.n_features))
            pred = model.predict(input, verbose=0)

            return pd.DataFrame(pred,
                                columns=['2008', '2009'],
                                index=[country]).T

        else:
            pred_valid = model.predict(X_valid, verbose=0)
            pred_train = model.predict(X_train, verbose=0)

            return {
                country: {
                    'rmse_train':
                    sqrt(
                        mean_squared_error(
                            [y_train[i][0] for i in range(0, len(y_train))], [
                                pred_train[i][0]
                                for i in range(0, len(pred_train))
                            ])),
                    'rmse_val':
                    sqrt(
                        mean_squared_error(
                            [y_valid[i][0] for i in range(0, len(y_valid))], [
                                pred_valid[i][0]
                                for i in range(0, len(pred_valid))
                            ])),
                    'mae_train':
                    mean_absolute_error(
                        [y_train[i][0] for i in range(0, len(y_train))],
                        [pred_train[i][0] for i in range(0, len(pred_train))]),
                    'mae_val':
                    mean_absolute_error(
                        [y_valid[i][0] for i in range(0, len(y_valid))],
                        [pred_valid[i][0] for i in range(0, len(pred_valid))]),
                    'mape_train':
                    f'{self.mean_absolute_percentage_error([y_train[i][0] for i in range(0, len(y_train))], [pred_train[i][0] for i in range(0, len(pred_train))])} %',
                    'mape_val':
                    f'{self.mean_absolute_percentage_error([y_valid[i][0] for i in range(0, len(y_valid))], [pred_valid[i][0] for i in range(0, len(pred_valid))])} %'
                }
            }
Example #38
def UnivariateAnalysis(p_features,
                       p_X_train,
                       p_Y_train,
                       p_X_val,
                       p_Y_val,
                       p_top_k_features=5,
                       p_model='continuous', # choose from continuous, binary, multinomial
                       p_target_distribution='gamma',
                       p_metric='L1 Error', # choose from L1 Error, AUC
                       p_seed=0,
                       p_subsamplesize=1500,
                       p_n_buckets=20,
                       p_verbose=False):
    feature_error = []

    import sys

    # Import the library
    import statsmodels.api as sm
    from statsmodels.genmod.generalized_linear_model import GLMResults

    # Scoring parameter
    if p_metric == 'L1 Error':
        from sklearn.metrics import mean_absolute_error
    elif p_metric == 'AUC':
        from sklearn.metrics import auc
        from sklearn.metrics import roc_curve
    else:
        print('{} is not currently an option'.format(p_metric))
        sys.exit()

    for name, index in p_features:
        if p_verbose:
            print(name)
        # Fit the model
        # add intercept to continuous variables and classification models
        if (len(index) == 1) or (p_model in ['binary', 'multinomial']):
            train_data = sm.add_constant(p_X_train.iloc[:, index])
            val_data = sm.add_constant(p_X_val.iloc[:, index])
        else:
            train_data = p_X_train.iloc[:, index]
            val_data = p_X_val.iloc[:, index]

        if p_model == 'continuous':
            model = sm.GLM(p_Y_train, train_data, family=sm.families.Gamma(sm.families.links.log))
            try:
                result = model.fit()
            except np.linalg.linalg.LinAlgError as err:
                print('{} failed to fit due to {} error'.format(name, err))
                continue
        elif p_model == 'binary':
            model = sm.Logit(p_Y_train, train_data)
            try:
                result = model.fit(disp=0)
            except np.linalg.linalg.LinAlgError as err:
                print('{} failed to fit due to {} error'.format(name, err))
                continue
        elif p_model == 'multinomial':
            model = sm.MNLogit(p_Y_train, train_data)
            try:
                result = model.fit(disp=0)
            except np.linalg.linalg.LinAlgError as err:
                print('{} failed to fit due to {} error'.format(name, err))
                continue
        else:
            print('{} is not an available model option'.format(p_model))

        # Calculate the error with the selected metric
        if p_metric == 'L1 Error':
            error = mean_absolute_error(p_Y_val, result.predict(val_data))
        elif p_metric == 'AUC':
            try: # TODO make this more specific as well
                fpr, tpr, thresholds = roc_curve(p_Y_val, result.predict(val_data))
            except:
                print('{} AUC calculation failed'.format(name))
                continue
            error = auc(fpr, tpr)

        feature_error.append([name, error, index])

    if p_metric in ['L1 Error']:
        df = pd.DataFrame(columns=['Variable', 'Validation Error', 'Index'],
                          data=feature_error)
        df_sorted = df.sort_values(by='Validation Error')
    elif p_metric in ['AUC']:
        df = pd.DataFrame(columns=['Variable', 'Validation AUC', 'Index'],
                          data=feature_error)
        df_sorted = df.sort_values(by='Validation AUC', ascending=False)

    top_k_features = df_sorted.iloc[:p_top_k_features, :]

    print(top_k_features.iloc[:, :2])

    for name, error, index in top_k_features.values:  # DataFrame.as_matrix() was removed in pandas 1.0
        if len(index) == 1:
            print('Feature: ' + str(name))
            if p_metric in ['L1 Error']:
                print('Validation Error: ' + str(error))
            elif p_metric == 'AUC':
                print('AUC: ' + str(error))

            X_train_const = sm.add_constant(p_X_train.iloc[:, index])
            # X_val_const = sm.add_constant(p_X_val.iloc[:,index])

            if p_model == 'continuous':
                model = sm.GLM(p_Y_train, X_train_const, family=sm.families.Gamma(sm.families.links.log))
            elif p_model == 'binary':
                model = sm.Logit(p_Y_train, X_train_const)
            elif p_model == 'multinomial':
                model = sm.MNLogit(p_Y_train, X_train_const)
            result = model.fit(disp=0)

            print('Training AIC: ' + str(result.aic))

            # plot fitted vs observed on both training and validation data
            y_pred_train = result.predict(X_train_const)
            # y_pred_val = result.predict(X_val_const)

            plot_data_train = pd.DataFrame(np.column_stack([p_X_train.iloc[:, index], p_Y_train, y_pred_train]),
                                           columns=[list(p_X_train.columns[index])[0], 'y', 'y_pred'])

            if p_model == 'binary':
                x_values, y_values = AutoBucket(plot_data_train[list(p_X_train.columns[index])[0]], plot_data_train['y'], p_n_buckets)
            else:
                from random import sample, seed
                seed(p_seed)
                rand_vals = sample(range(len(plot_data_train)), k=min(p_subsamplesize, len(plot_data_train)))
                plot_data_train_sample = plot_data_train.iloc[rand_vals, :]
                plot_data_train_sample_sorted = plot_data_train_sample.sort_values(by=list(p_X_train.columns[index])[0])

            fig, ax = plt.subplots(figsize=(12, 8))

            if p_model == 'binary':
                plot_data_train_sample_sorted = plot_data_train.sort_values(by=list(p_X_train.columns[index])[0])
                plot_data_train_sample_sorted.plot(x=list(p_X_train.columns[index])[0], y='y_pred', ax=ax, linestyle='-', color='b')
                plt.plot(x_values, y_values, 'ro--')
            else:
                plot_data_train_sample_sorted.plot(x=list(p_X_train.columns[index])[0], y='y_pred', ax=ax, linestyle='-', color='b')
                plot_data_train_sample_sorted.plot(x=list(p_X_train.columns[index])[0], y='y', ax=ax, kind='scatter', color='r')
            plt.show()

            print(result.summary())
        else:
            # Add observed (average) values to the graph. Use automatic bucketing of indt variable
            # Add argument to choose between: predicted value, observed value, 95% confidence int
            print('Feature: ' + str(name))
            if p_metric in ['L1 Error']:
                print('Validation Error: ' + str(error))
            elif p_metric == 'AUC':
                print('AUC: ' + str(error))

            if p_model == 'continuous':
                model = sm.GLM(p_Y_train, p_X_train.iloc[:, index], family=sm.families.Gamma(sm.families.links.log))
            elif p_model == 'binary':
                model = sm.Logit(p_Y_train, p_X_train.iloc[:, index])
            elif p_model == 'multinomial':
                model = sm.MNLogit(p_Y_train, p_X_train.iloc[:, index])
            result = model.fit(disp=0)

            print('Training AIC: ' + str(result.aic))

            # TODO add multinomial below
            fig, ax1 = plt.subplots(figsize=(12, 8))
            if p_model == 'continuous':
                upper_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 1]))})
                model = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                      'model': list(np.exp(result.params))})
                lower_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 0]))})
            elif p_model == 'binary':
                # TODO verify transformation below is correct
                upper_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 1])/
                                                             (np.exp(GLMResults.conf_int(result)[:, 1]) + 1))})
                model = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                      'model': list(np.exp(result.params)/(1 + np.exp(result.params)))})
                lower_bound = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                            '95% C.I.': list(np.exp(GLMResults.conf_int(result)[:, 0])/
                                                             (np.exp(GLMResults.conf_int(result)[:, 0]) + 1))})
            upper_bound.plot(x='Level', ax=ax1, linestyle='-', marker='o', color='r')
            model.plot(x='Level', ax=ax1, linestyle='-', marker='o', color='b')
            lower_bound.plot(x='Level', ax=ax1, linestyle='-', marker='o', color='g')
            ax1.set_ylabel('Response', color='b')
            ax1.tick_params('y', colors='b')
            ax1.legend(loc='upper left')

            weights = pd.DataFrame({'Level': p_X_train.iloc[:, index].columns,
                                    'weight': list(p_X_train.iloc[:, index].sum(axis=0))})
            plt.xticks(rotation=90)

            ax2 = ax1.twinx()
            weights.plot(x='Level', ax=ax2, kind='bar', color='y', alpha=0.4)
            ax2.set_ylabel('Weight', color='y')
            ax2.set_ylim([0, max(weights.iloc[:, 1]) * 3])
            ax2.tick_params('y', colors='y')
            ax2.legend(loc='upper right')
            ax2.grid(False)

            # fig.tight_layout()
            plt.show()

            print(result.summary())
def calculate_measures_for_continues_labels(
        all_predictions: pd.DataFrame,
        final_total_payoff_prediction_column: str,
        total_payoff_label_column: str,
        label_options: list,
        raisha: str = 'All_raishas',
        round_number: str = 'All_rounds',
        bin_label: pd.Series = None,
        bin_predictions: pd.Series = None,
        already_calculated: bool = False,
        bin_label_column_name: str = 'bin_label',
        bin_prediction_column_name: str = 'bin_predictions',
        prediction_type: str = '') -> (pd.DataFrame, dict):
    """
    Calc and print the regression measures, including bin analysis
    :param all_predictions:
    :param total_payoff_label_column: the name of the label column
    :param final_total_payoff_prediction_column: the name of the prediction label
    :param label_options: list of the label option names
    :param raisha: if we run a raisha analysis this is the raisha we worked with
    :param round_number: for per round analysis
    :param bin_label: the bin label series, the index is the same as the total_payoff_label_column index
    :param bin_predictions: the bin predictions series, the index is the same as the total_payoff_label_column index
    :param prediction_type: if we want to use seq and reg predictions- so we have a different column for each.
    :param already_calculated: if we already calculated the measures, need to calculate again only the bin measures
    :param bin_label_column_name: the name of the bin label column if it is in the all_prediction df
    :param bin_prediction_column_name: the name of the bin prediction column if it is in the all_prediction df
    :return:
    """
    dict_key = f'{raisha} {round_number}'
    if 'is_train' in all_predictions.columns:
        data = all_predictions.loc[all_predictions.is_train == False]
    else:
        data = all_predictions

    results_dict = defaultdict(dict)
    predictions = data[final_total_payoff_prediction_column]
    gold_labels = data[total_payoff_label_column]
    mse = metrics.mean_squared_error(predictions, gold_labels)
    rmse = round(100 * math.sqrt(mse), 2)
    mae = round(100 * metrics.mean_absolute_error(predictions, gold_labels), 2)
    mse = round(100 * mse, 2)

    # calculate bin measures
    if bin_label_column_name in all_predictions.columns and bin_prediction_column_name in all_predictions.columns:
        bin_label = all_predictions[bin_label_column_name]
        bin_predictions = all_predictions[bin_prediction_column_name]
    elif bin_label is None or bin_predictions is None:
        print('No bin labels and bin predictions were provided')
        logging.info('No bin labels and bin predictions were provided')
        raise ValueError('No bin labels and bin predictions were provided')

    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(
        bin_label, bin_predictions)
    num_bins = len(label_options)
    precision_micro, recall_micro, fbeta_score_micro, support_micro =\
        metrics.precision_recall_fscore_support(bin_label, bin_predictions, average='micro')
    precision_macro, recall_macro, fbeta_score_macro, support_macro =\
        metrics.precision_recall_fscore_support(bin_label, bin_predictions, average='macro')

    # number of DM chose stay home
    final_labels = list(range(len(support)))
    for my_bin in range(len(label_options)):
        status_size = bin_label.where(bin_label == my_bin).dropna().shape[0]
        if status_size in support:
            index_in_support = np.where(support == status_size)[0]
            if final_labels[index_in_support[
                    0]] in label_options and index_in_support.shape[0] > 1:
                # 2 bins with the same size --> already assign
                index_in_support = index_in_support[1]
            else:
                index_in_support = index_in_support[0]
            final_labels[index_in_support] = label_options[my_bin]

    # drop labels that never occur (status_size = 0) without mutating the list during iteration
    final_labels = [item for item in final_labels if item in label_options]

    accuracy = metrics.accuracy_score(bin_label, bin_predictions)
    results_dict[dict_key][
        f'Bin_{num_bins}_bins_Accuracy{prediction_type}'] = round(
            accuracy * 100, 2)

    # create the results to return
    for measure, measure_name in [[precision, 'precision'], [recall, 'recall'],
                                  [fbeta_score, 'Fbeta_score']]:
        for i in range(len(measure)):
            if f'Bin_{measure_name}_{final_labels[i]}{prediction_type}' in [
                    'Bin_Fbeta_score_1', 'Bin_Fbeta_score_2',
                    'Bin_Fbeta_score_3', 'Bin_precision_1', 'Bin_precision_2',
                    'Bin_precision_3', 'Bin_recall_1', 'Bin_recall_2',
                    'Bin_recall_3'
            ]:
                print(
                    f'Error: final_labels: {final_labels}, label_options: {label_options},'
                    f'already_calculated: {already_calculated}, raisha: {raisha}, rounds: {round_number}'
                )
            results_dict[dict_key][
                f'Bin_{measure_name}_{final_labels[i]}{prediction_type}'] = round(
                    measure[i] * 100, 2)
    for measure, measure_name in [[precision_micro, 'precision_micro'],
                                  [recall_micro, 'recall_micro'],
                                  [fbeta_score_micro, 'Fbeta_score_micro'],
                                  [precision_macro, 'precision_macro'],
                                  [recall_macro, 'recall_macro'],
                                  [fbeta_score_macro, 'Fbeta_score_macro']]:
        results_dict[dict_key][
            f'Bin_{num_bins}_bins_{measure_name}{prediction_type}'] = round(
                measure * 100, 2)

    if not already_calculated:
        results_dict[dict_key][f'MSE{prediction_type}'] = mse
        results_dict[dict_key][f'RMSE{prediction_type}'] = rmse
        results_dict[dict_key][f'MAE{prediction_type}'] = mae

    results_pd = pd.DataFrame.from_dict(results_dict, orient='index')

    return results_pd, results_dict
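
# A minimal usage sketch (not part of the original code) for calculate_measures_for_continues_labels,
# using made-up column names and toy values purely for illustration.
if __name__ == '__main__':
    toy_predictions = pd.DataFrame({
        'total_payoff_prediction': [0.55, 0.72, 0.31, 0.90],   # hypothetical regression predictions
        'total_payoff_label': [0.50, 0.80, 0.25, 1.00],        # hypothetical gold labels
        'bin_label': [1, 2, 0, 2],                              # bin index per sample
        'bin_predictions': [1, 2, 0, 2],                        # predicted bin index per sample
    })
    scores_df, scores_dict = calculate_measures_for_continues_labels(
        all_predictions=toy_predictions,
        final_total_payoff_prediction_column='total_payoff_prediction',
        total_payoff_label_column='total_payoff_label',
        label_options=['low', 'medium', 'high'])
    print(scores_df)
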
Пример #40
0
    for targetUser in userRatingsForSongDict[targetSong]:
        if (j == 500):
            break
        j += 1
        similarityForSongDict = {}

        start = time.time()
        targetSongUserRatingsDict = userRatingsForSongDict[targetSong]
        actualTargetRating = userRatingsForSongDict[targetSong][targetUser]

        calcSimilarSongs()

        finalScore = calcFinalScore()

        print(targetUser, finalScore, actualTargetRating)
        print("")
        if finalScore != 0:
            predictedScores.append(finalScore)
            actualScores.append(actualTargetRating)

    if (actualScores and predictedScores):
        rmse = sqrt(mean_squared_error(actualScores, predictedScores))
        rmseSum += rmse
        mae = mean_absolute_error(actualScores, predictedScores)
        maeSum += mae
        print("Root mean squared error is: ", rmse,
              "and Mean absolute error is: ", mae)
        k += 1

print("Mean rmse is: ", rmseSum / k, "and Mean absolute error is: ",
      maeSum / k)
Пример #41
0
import numpy as np  # needed for np.sqrt below

from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

boston = load_boston()

x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                    test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
print(mae)

mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
print(mse)

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test))
print(rmse)

r2 = r2_score(y_pred=y_pred, y_true=y_test)
print(r2)

#=====================================================================
# K-Fold
#=====================================================================
import numpy as np
Пример #42
0
def splitDataTraining(task, model, features, target, test_size, scoring):
    if test_size == 1:
        logger.info("The whole dataset will be used for training!")
        model.fit(features, target)
        params = np.append(model.intercept_, model.coef_)
        predictions = model.predict(features)

        newX = pd.DataFrame({
            "Constant": np.ones(len(features))
        }).join(pd.DataFrame(features))
        MSE = (sum(
            (target - predictions)**2)) / (len(newX) - len(newX.columns))

        var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
        sd_b = np.sqrt(var_b)
        ts_b = params / sd_b

        p_values = [
            2 * (1 - stats.t.cdf(np.abs(i), (len(newX) - 1))) for i in ts_b
        ]

        sd_b = np.round(sd_b, 3)
        ts_b = np.round(ts_b, 3)
        p_values = np.round(p_values, 3)
        params = np.round(params, 4)

        results = pd.DataFrame()
        results["Coefficients"], results["Standard Errors"], results[
            "t values"], results["Probabilites"] = [
                params, sd_b, ts_b, p_values
            ]

        return results

    elif test_size < 1:
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=test_size, random_state=1)
        model.fit(x_train, y_train)
        model_train_pred = model.predict(x_train)
        model_test_pred = model.predict(x_test)
        results = pd.DataFrame()

        if task == "regression":
            if "neg_mean_absolute_error" in scoring:
                results['MAE_train'], results['MAE_test'] = [[
                    mean_absolute_error(y_train, model_train_pred)
                ], [mean_absolute_error(y_test, model_test_pred)]]
            if "neg_mean_squared_error" in scoring:
                results['MSE_train'], results['MSE_test'] = [[
                    mean_squared_error(y_train, model_train_pred)
                ], [mean_squared_error(y_test, model_test_pred)]]
            if "neg_mean_squared_log_error" in scoring:
                results['MSLE_train'], results['MSLE_test'] = [[
                    mean_squared_log_error(y_train, model_train_pred)
                ], [mean_squared_log_error(y_test, model_test_pred)]]
            if "r2" in scoring:
                results['r2_train'], results['r2_test'] = [[
                    r2_score(y_train, model_train_pred)
                ], [r2_score(y_test, model_test_pred)]]
            return results

        elif task == "classification":
            if "precision" in scoring:
                results['precision_train'], results['precision_test'] = [[
                    precision_score(y_train, model_train_pred)
                ], [precision_score(y_test, model_test_pred)]]
            if "recall" in scoring:
                results['recall_train'], results['recall_test'] = [[
                    recall_score(y_train, model_train_pred)
                ], [recall_score(y_test, model_test_pred)]]
            if "f1" in scoring:
                results['f1_train'], results['f1_test'] = [[
                    f1_score(y_train, model_train_pred)
                ], [f1_score(y_test, model_test_pred)]]
            if "roc_auc" in scoring:
                results['roc_auc_train'], results['roc_auc_test'] = [[
                    roc_auc_score(y_train, model_train_pred)
                ], [roc_auc_score(y_test, model_test_pred)]]

            return results
Пример #43
0
import numpy as np  # needed for np.expm1 / np.sort below

from etl import prepare_data, prepare_submission
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# load and split train/dev/test
(X_train, y_train), (X_test, test_id) = prepare_data()
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=69)

# run without hyperparams for fscore calculations
model = XGBRegressor()
model.fit(X_train, y_train)

y_hat = model.predict(X_dev)
mae = mean_absolute_error(np.expm1(y_dev), np.expm1(y_hat))
print("Mae: {}".format(mae))

thresholds = np.sort(model.feature_importances_)
thresholds = np.unique(thresholds)
threshold = 0
best_mae = mae

for thresh in thresholds[:50]:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBRegressor()
    selection_model.fit(select_X_train, y_train)

    # eval model (assumed completion: the original snippet is truncated here;
    # score the reduced feature set on the dev split and keep the best threshold)
    select_X_dev = selection.transform(X_dev)
    y_hat = selection_model.predict(select_X_dev)
    thresh_mae = mean_absolute_error(np.expm1(y_dev), np.expm1(y_hat))
    print("Thresh={:.5f}, n={}, MAE: {}".format(thresh, select_X_train.shape[1], thresh_mae))
    if thresh_mae < best_mae:
        best_mae = thresh_mae
        threshold = thresh
Пример #44
0
 def testForCVData(self):
     self.r_sqr = self.clf.score(self.X_test, self.y_test)
     self.output = self.clf.predict(self.X_test)
     self.mae = mean_absolute_error(self.y_test, self.output)
     self.mse = mean_squared_error(self.y_test, self.output)
Пример #45
0
def run_models(grid_y, grid_x):
    X, Y = create_training_and_testing_data(grid_x, grid_y)
    data = Table(X, Y)
    # print(data.Y)
    # np.savetxt('data/' + str(grid_x) + '_' + str(grid_y) + '.csv', np.array(data), delimiter=',', fmt='%10.5f')
    # print(out_data.domain)
    # print(out_data.Y)

    # feature_method = og.preprocess.score.UnivariateLinearRegression()
    # selector = og.preprocess.SelectBestFeatures(method=feature_method, k=10)
    # out_data2 = selector(data)
    # plot_input(out_data2.X, out_data2.Y)
    # print(out_data2.domain)

    # pca = PCA(n_components=5)
    # model = pca(out_data2)
    # out_data = model(out_data2)
    # print(out_data.domain)

    test = og.data.Table(data.domain, random.sample(data, 60))
    train = og.data.Table(data.domain, [d for d in data if d not in test])

    lin = og.regression.linear.LinearRegressionLearner()
    rf = og.regression.random_forest.RandomForestRegressionLearner()
    nnr = og.regression.NNRegressionLearner()
    svm = og.regression.SVRLearner()
    knn = KNeighborsRegressor(n_neighbors=3)

    learners = [lin, rf, nnr, svm]
    regressors = [learner(train) for learner in learners]
    knn.fit(train.X, train.Y)

    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_lin.pickle", "wb") as f:
        pickle.dump(lin, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_rf.pickle", "wb") as f:
        pickle.dump(rf, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_nnr.pickle", "wb") as f:
        pickle.dump(nnr, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_svm.pickle", "wb") as f:
        pickle.dump(svm, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_knn.pickle", "wb") as f:
        pickle.dump(knn, f)

    # print((r(test)[0] for r in regressors))
    linPredict = regressors[0](test)
    rfPredict = regressors[1](test)
    nnrPredict = regressors[2](test)
    svmPredict = regressors[3](test)
    knnPredict = knn.predict(test.X)

    predictions = []
    predictions.append(linPredict)
    predictions.append(rfPredict)
    predictions.append(nnrPredict)
    predictions.append(svmPredict)
    predictions.append(knnPredict)

    # print(knnPredict)

    # print("y   ", " ".join("%5s" % l.name for l in regressors))
    # for d in test:
    #     print(("{:<5}" + " {:5.1f}" * len(regressors)).format(d.get_class(), *(r(d)[0] for r in regressors)))

    # res = og.evaluation.CrossValidation(test, learners, k=10)
    # rmse = og.evaluation.RMSE(res)
    # mae = og.evaluation.MAE(res)
    # r2 = og.evaluation.R2(res)

    rmse = []
    mae = []
    rmse.append(math.sqrt(mean_squared_error(test.Y, linPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, rfPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, nnrPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, svmPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, knnPredict)))

    mae.append(mean_absolute_error(test.Y, linPredict))
    mae.append(mean_absolute_error(test.Y, rfPredict))
    mae.append(mean_absolute_error(test.Y, nnrPredict))
    mae.append(mean_absolute_error(test.Y, svmPredict))
    mae.append(mean_absolute_error(test.Y, knnPredict))

    return np.array(mae), np.array(rmse), np.array(predictions), test
Пример #46
0
    def assessmentModel(self,cvNum=5):
        y_pred = self.__model.predict(self.__xTest)
        from sklearn.model_selection import cross_val_score

        if self.__typeLearning == 'Regression':
            from sklearn.metrics import mean_squared_error,median_absolute_error,mean_squared_log_error,mean_absolute_error,explained_variance_score,r2_score

            #mean_squared_error
            print('MSE: \t\t',mean_squared_error(self.__yTest,y_pred))
            print('RMSE: \t\t',np.sqrt(mean_squared_error(self.__yTest,y_pred)))
            #print('MSE: ',np.mean((self.__yTest-y_pred)**2))

            #median_absolute_error
            #print('median: ',np.median(np.abs(self.__yTest-y_pred)))
            print('median: \t\t',median_absolute_error(self.__yTest,y_pred))
            
            #mean_absolute_error
            #print('MAE: ',np.mean(np.abs(self.__yTest-y_pred)))
            print('MAE: \t\t',mean_absolute_error(self.__yTest,y_pred))
            
            #mean_squared_log_error
            print('MSLE: \t\t',mean_squared_log_error(self.__yTest,y_pred))
            #print('MSLE: ',np.mean((np.log(self.__yTest+1)-np.log(y_pred+1))**2))
            
            #explained_variance_score
            print('explained_variance: \t\t',explained_variance_score(self.__yTest,y_pred))
            #print('explained_variance: ',1-np.var(self.__yTest-y_pred)/np.var(self.__yTest))
            
            #r2_score
            print('R2: \t\t',r2_score(self.__yTest,y_pred))
            #print('R2: ',1-(np.sum((self.__yTest-y_pred)**2))/np.sum((self.__yTest -np.mean(self.__yTest))**2))


            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='neg_mean_squared_error') 
            print('cv MSE mean: \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='r2') 
            print('cv r2 mean: \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='explained_variance') 
            print('cv explained_variance mean: \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='neg_mean_squared_log_error') 
            print('cv MSLE mean: \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='neg_mean_absolute_error') 
            print('cv MAE mean: \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='neg_median_absolute_error') 
            print('cv median mean: \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='neg_root_mean_squared_error') 
            print('cv RMSE mean: \t',scoresval.mean())

        if self.__typeLearning == 'Classification':
            from sklearn.metrics import accuracy_score,balanced_accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score,average_precision_score
            print('Accuracy: \t\t',accuracy_score(self.__yTest,y_pred))
            print('Balanced accuracy: \t',balanced_accuracy_score(self.__yTest,y_pred))
            
            print('F1-micro: \t\t',f1_score(self.__yTest,y_pred,average='micro'))
            print('F1-macro: \t\t',f1_score(self.__yTest,y_pred,average='macro'))
            print('F1-weighted: \t\t',f1_score(self.__yTest,y_pred,average='weighted'))
            
            print('Precision-micro: \t',precision_score(self.__yTest,y_pred,average='micro'))
            print('Precision-macro: \t',precision_score(self.__yTest,y_pred,average='macro'))
            print('Precision-weighted:',precision_score(self.__yTest,y_pred,average='weighted'))

            print('Recall-micro: \t',recall_score(self.__yTest,y_pred,average='micro'))
            print('Recall-macro: \t',recall_score(self.__yTest,y_pred,average='macro'))
            print('Recall-weighted:',recall_score(self.__yTest,y_pred,average='weighted'))

            print('Cohen\'s Kappa: \t',cohen_kappa_score(self.__yTest,y_pred))
            
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='accuracy') 
            print('cv accuracy mean:  \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='balanced_accuracy') 
            print('cv balanced_accuracy mean:',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='f1_micro') 
            print('cv f1_micro mean:  \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='f1_macro') 
            print('cv f1_macro mean:  \t',scoresval.mean())
            scoresval = cross_val_score(self.__model,self.__xData,self.__yData,cv=cvNum,scoring='f1_weighted') 
            print('cv f1_weighted mean:  \t',scoresval.mean())

            from sklearn.metrics import classification_report
            print('Classification report: ','\n',classification_report(self.__yTest,y_pred))
#fitting the model now
titanic_model.fit(train_X, train_y)

# In[ ]:

#We're just testing how well fitted the model is here.
titanic_preds = titanic_model.predict(val_X)

# In[ ]:

from sklearn.metrics import mean_absolute_error

#Let's calculate the MAE

titanic_mae = mean_absolute_error(titanic_preds, val_y)

print(titanic_mae)

# # Applying The Model To The Given Test Data

# In[ ]:

#building the feature matrix X from the test data

final_X = test_data[features]
final_predictions = titanic_model.predict(final_X)
final_predictions = np.round(final_predictions)
final_predictions = final_predictions.astype(int)

# # Creating the submission
Пример #48
0
                        # Need to undo the scaling (inverse transform)
                        y_train = scaler_y_train.inverse_transform(y_train)
                        y_test = scaler_y_test.inverse_transform(y_test)

                        previsoes_train = scaler_y_train.inverse_transform(
                            previsoes_train)
                        previsoes_test = scaler_y_test.inverse_transform(
                            previsoes_test)
                        '''
                        Data processing
                        '''
                        from sklearn.metrics import mean_absolute_error, mean_squared_error
                        from math import sqrt

                        # Training set statistics
                        mae_train = mean_absolute_error(
                            y_train, previsoes_train)
                        mse_train = mean_squared_error(y_train,
                                                       previsoes_train)
                        rmse_train = sqrt(
                            mean_squared_error(y_train, previsoes_train))

                        # Test set statistics
                        mae_test = mean_absolute_error(y_test, previsoes_test)
                        mse_test = mean_squared_error(y_test, previsoes_test)
                        rmse_test = sqrt(
                            mean_squared_error(y_test, previsoes_test))
                        '''
                        import matplotlib.pyplot as plt
                        
                        #plt.title('Neural network forecast with 2 hidden neurons')
                        plt.xlabel('Actual values')
Пример #49
0
        #error = absolute_error/100
        print("Mean Absolute Error is")
        print(error)
        f.write("Mean Absolute Error is " + str(error) + "\n")

        f.write("Sample Solution" + "\n")
        for rand in range(0, 5):
            f.write("%s " % predictions[rand])

f.close()

# For different metrics evaluation

flattened = np.concatenate(final_values[iteration_count]).ravel().tolist()

comp = pd.DataFrame({
    'original': target[top_70:len(needed_tweets)],
    'predicted': flattened
})

comp.corr(method='pearson')

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

r2_score(comp['original'].tolist(), comp['predicted'].tolist())

comp.to_csv("GCN_comp.csv")

mean_absolute_error(comp['original'].tolist(), comp['predicted'].tolist())
def LSTM_Multivariate():
    dataset = read_csv("/Users/ange/Downloads/Training-Data-Sets.csv", header=0)

    print('----------------------------------------------------------------------------')
    print('Replace NaN values with Mean')
    dataset.fillna(dataset.mean(), inplace=True)
    print('----------------------------------------------------------------------------')

    values = dataset.values
    target = dataset.iloc[:, 1:2].values
    dat = dataset.iloc[:, [1,4,3,18,5,14]].values

    t_df = read_csv("/Users/ange/Downloads/Test dataset v1.csv", header=0)
    t_df.fillna(t_df.mean(), inplace=True)
    t_dat = t_df.iloc[:, [1,4,3,18,5,14]].values
    t_target = t_df.iloc[:, 1:2].values
    scaler = preprocessing.StandardScaler()

    # define input sequence
    train = dat
    test = t_dat
    target_train = target
    target_test = t_target[13:]

    # choose a number of time steps
    n_steps = 13
    # split into samples (a sketch of the assumed split_sequence helper appears after this function)
    X, y = split_sequence(train,target, n_steps)
    # summarize the data
    for i in range(len(X)):
        print(X[i], y[i])

    # reshape from [samples, timesteps] into [samples, timesteps, features]
    n_features = 6
    X = X.reshape((X.shape[0], X.shape[1], n_features))
    print(X.shape, y.shape)

    # define model
    start_time = time.time()
    model = Sequential()
    model.add(Bidirectional(LSTM(25, activation='relu'), input_shape=(n_steps, n_features) ))

    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # fit model
    model.fit(X, y, epochs=100, verbose=0, shuffle = False)

    print('')
    print('Prediction')
    # demonstrate prediction
    #Testing
    test_inputs = t_dat


    test_features = []
    for i in range(n_steps, len(test_inputs)):
        test_features.append(test_inputs[i-n_steps:i, 0:n_features])   
    test_features = np.array(test_features)

    print(test_features[0])
    test_features = np.reshape(test_features, (test_features.shape[0], test_features.shape[1], n_features))
    print('Features - Shape')
    print(test_features.shape[1])
    print(test_features)

    x_input = array(test_features)
    print('')
    print(x_input.shape)
    predictions = model.predict(test_features, verbose=0)

    print('')
    print("Execution Time: %s seconds" % (time.time() - start_time))
    print('')

    actual = target_test
    pred = scaler.fit_transform(predictions)

    plt.figure(figsize=(10,6))
    plt.plot(actual, color='blue', label='Actual Forecast')
    plt.plot(predictions , color='red', label='Predicted Forecast')
    plt.title('Sales Forecasting')
    plt.xlabel('Forecast Horizons (Day)')
    plt.ylabel('Sales')
    plt.legend()
    plt.show()

    print('-------------------------------------------------------')
    print('MAE:')
    print(metrics.mean_absolute_error(actual,predictions))
    print('')

    print('RMSE:')
    print(np.sqrt(metrics.mean_squared_error(actual, predictions)))
    print('Epochs: 100')
    model.summary()
    print('')
    print('')
    print('-------------------------------------------------------')
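
# A minimal sketch (an assumption, not the original) of the split_sequence helper that
# LSTM_Multivariate above relies on: it slides a window of n_steps feature rows over the
# series and pairs each window with the target value that follows it.
import numpy as np

def split_sequence(features, target, n_steps):
    X, y = [], []
    for i in range(len(features) - n_steps):
        X.append(features[i:i + n_steps, :])   # window of n_steps rows of features
        y.append(target[i + n_steps])          # target value right after the window
    return np.array(X), np.array(y)
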
## ELM TRAINING
MAE_TRAIN_MINS = []
MAE_TEST_MINS = []
steps = 2  # new
neurons = 10  # new
predictions = []
for M in range(1, steps, 1):
    MAES_TRAIN = []
    MAES_TEST = []
    # print "Training with %s neurons..."%M
    for i in [10, 100, 300, 500]:
        print(f"Training {i} neurons in Step{M}")
        ELM = ELMRegressor(i)
        ELM.fit(X_train, y_train)
        prediction = ELM.predict(X_train)
        MAES_TRAIN.append(mean_absolute_error(y_train,
                                              prediction))

        prediction = ELM.predict(X_test)
        predictions.append(prediction)
        mae = mean_absolute_error(y_test, prediction)
        MAES_TEST.append(mae)
        print(f"MAE: {mae}")
        print(f"RMSE: {rmse(prediction, y_test)}")
        print(f"MSE: {mean_squared_error(prediction, y_test)}")
    MAE_TEST_MINS.append(min(MAES_TEST))
    MAE_TRAIN_MINS.append(MAES_TRAIN[np.argmin(MAES_TEST)])

print("Minimum MAE ELM =", min(MAE_TEST_MINS))
print("using amount of steps: ", steps)  # new
print("using amount of neurons: ", neurons)  # new
y = dataset['Deaths'].values

plt.figure(figsize=(15, 10))
plt.tight_layout()
seabornInstance.distplot(dataset['Deaths'])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

coeff_df = pd.DataFrame(X, columns=['Recovered', 'Confirmed'])

y_pred = regressor.predict(X_test)

df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1 = df.head(25)

df1.plot(kind='bar', figsize=(10, 8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle='-', linewidth='0.5', color='black')
plt.show()

print('Mean absolute error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean squared error:', metrics.mean_squared_error(y_test, y_pred))
print('Root mean squared error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Пример #53
0
                   verbose=True,
                   early_stopping=True,
                   validation_fraction=0.2)
clf.fit(X_train, y_train)
print("训练集准确率:", clf.score(X_train, y_train))
clf.fit(X_test, y_test)
y_Pred = clf.predict(X_test)
y_Pred[y_Pred > 1] = 1
y_Pred[y_Pred < 0] = 0

# 模型评分
y_pre = clf.predict(X_test)
c1 = clf.score(X_test, y_test)
print("测试集准确率:", c1)

# mae评估
d1 = mean_absolute_error(y_true=y_test, y_pred=y_Pred)
print("mae:", d1)

#rmae评估
e1 = mean_squared_error(y_true=y_test, y_pred=y_Pred)
print('mean_squared_error: ', e1)

#R²评估
r2 = 1 - (d1) / (np.std(y_test))
print("R²:", r2_score(y_test, y_Pred))

# Elapsed time
end = time.perf_counter()
print("Elapsed time:", end - start)
Пример #54
0
        max_features='auto',
        max_depth=8,
        min_samples_leaf=4,
        min_samples_split=8,
        oob_score=True,
        #random_state = 42,
        criterion='mae',
        n_jobs=-1,
        bootstrap=True)
    #warm_start=False,
    #max_leaf_nodes = 30)
    rfr.fit(X_train, y_train)
    predictions = rfr.predict(X_test)

    #plt.scatter(y_test,predictions)
    print('MAE:', metrics.mean_absolute_error(y_test, predictions))
    print('MSE:', metrics.mean_squared_error(y_test, predictions))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
    print('MAPE:', mean_absolute_percentage_error(y_test, predictions))

    #columns = net_profit_percent.columns
    #print (sorted(zip(map(lambda x: round(x, 4), rfr.feature_importances_), columns),reverse=True))

    #subplots method of matplotlib
    fig, axes = plt.subplots(nrows=2, ncols=1)
    axes[0].scatter(y_test, predictions)
    plt.sca(axes[1])  #Use the pyplot interface to change just one subplot
    plt.xticks(range(X_train.shape[1]), X_train.columns, color='r')
    axes[1].bar(range(X_train.shape[1]),
                rfr.feature_importances_,
                color='b',
Пример #55
0
from sklearn.model_selection import train_test_split

#training data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=100)

#importing LR model
from sklearn.linear_model import LinearRegression

lm = LinearRegression()

lm.fit(X_train, y_train)

#prediction
predictions = lm.predict(X_test)

#some visualize
plt.scatter(y_test, predictions)

#regression Evaluation methods
from sklearn import metrics

metrics.mean_absolute_error(y_test, predictions)

metrics.mean_squared_error(y_test, predictions)

np.sqrt(metrics.mean_squared_error(y_test, predictions))
def polynomial_residual(degree, X, y):
    polynomial_regression = PolynomialRegression(degree=degree)
    polynomial_regression.fit(X, y)
    y_pred = polynomial_regression.predict(X)
    mae = mean_absolute_error(y, y_pred)
    return mae
Пример #57
0
	    # Print '{name} has been fitted'
	    print(name, 'has been fitted.')

#Show best score
model.best_score_
#Show best parameters for given model
model.best_estimator_
# Import r2_score and mean_absolute_error functions
from sklearn.metrics import r2_score, mean_absolute_error

# Predict test set using fitted random forest
pred = fitted_models['rf'].predict(X_test)

# Calculate and print R^2 and MAE
print( 'R^2:', r2_score(y_test, pred ))
print( 'MAE:', mean_absolute_error(y_test, pred))




Classification:

#Display probability of prediction
model.predict_proba

# Classification metrics
from sklearn.metrics import roc_curve, auc

#Helper function
def fit_and_plot_classifier(clf):
    # Fit model
sns.displot(y_test - prediction)
#plt.show()

#plt.scatter(y_test, prediction)
#plt.show()
"""Regression Evaluation Metrics :-
Here are three common evaluation metrics for regression problems:

Mean Absolute Error (MAE) is the mean of the absolute value of the errors

Mean Squared Error (MSE) is the mean of the squared errors.
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors.

Comparing these metrics:

MAE is the easiest to understand, because it's the average error.
MSE is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
RMSE is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are loss functions, because we want to minimize them."""
from sklearn.metrics import mean_squared_error, mean_absolute_error
print("MAE: ", mean_absolute_error(y_test, prediction))
print("MSE: ", mean_squared_error(y_test, prediction))
print("RMSE: ", np.sqrt(mean_squared_error(y_test, prediction)))

import pickle
##Open a file , where we want to store the data
file = open("regression.pkl", 'wb')
##dump the information to that file
pickle.dump(regressor, file)
Пример #59
0
    def run(self):
        """
        For experimental purpose, since the evaluation result is bad, this model will not be use for prediction
        >>> from q3_time_series.model import MultivariateMultiStepLSTM
        >>> # To Evaluate
        >>> evaluate_metrics = MultivariateMultiStepLSTM(3,2).run()
        """
        dataset_train = np.hstack(self.hstacK_generator(self.train))
        dataset_valid = np.hstack(self.hstacK_generator(self.valid_arb))

        X_train, y_train = self.split_sequences(dataset_train, self.n_steps_in,
                                                self.n_steps_out)
        X_valid, y_valid = self.split_sequences(dataset_valid, self.n_steps_in,
                                                self.n_steps_out)

        model = self.make_model(X_train.shape[2])
        model.fit(X_train,
                  y_train,
                  epochs=200,
                  verbose=0,
                  callbacks=[
                      tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                       patience=20)
                  ],
                  validation_data=(X_valid, y_valid))

        pred_valid = model.predict(X_valid, verbose=0)
        pred_train = model.predict(X_train, verbose=0)

        tmp = []
        for j, col in enumerate(self.df.columns):
            tmp.append({
                col: {
                    'rmse_train':
                    sqrt(
                        mean_squared_error(
                            [y_train[i][0][j] for i in range(0, len(y_train))],
                            [
                                pred_train[i][0][j]
                                for i in range(0, len(pred_train))
                            ])),
                    'rmse_val':
                    sqrt(
                        mean_squared_error(
                            [y_valid[i][0][j] for i in range(0, len(y_valid))],
                            [
                                pred_valid[i][0][j]
                                for i in range(0, len(pred_valid))
                            ])),
                    'mae_train':
                    mean_absolute_error(
                        [y_train[i][0][j] for i in range(0, len(y_train))], [
                            pred_train[i][0][j]
                            for i in range(0, len(pred_train))
                        ]),
                    'mae_val':
                    mean_absolute_error(
                        [y_valid[i][0][j] for i in range(0, len(y_valid))], [
                            pred_valid[i][0][j]
                            for i in range(0, len(pred_valid))
                        ]),
                    'mape_train':
                    f'{self.mean_absolute_percentage_error([y_train[i][0][j] for i in range(0, len(y_train))], [pred_train[i][0][j] for i in range(0, len(pred_train))])} %',
                    'mape_val':
                    f'{self.mean_absolute_percentage_error([y_valid[i][0][j] for i in range(0, len(y_valid))], [pred_valid[i][0][j] for i in range(0, len(pred_valid))])} %'
                }
            })

        return tmp
Пример #60
0
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2