Example #1
def regression_score(true_data, predict_data):
    assert (true_data.shape == predict_data.shape)
    if len(true_data.shape) == 1 or true_data.shape[1] == 1:
        return explained_variance_score(true_data, predict_data)
    else:
        return np.mean([explained_variance_score(true_data[:, index], predict_data[:, index]) for index in
                        range(true_data.shape[1])])
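A brief usage sketch for the regression_score helper above (the toy arrays here are purely illustrative); averaging the per-column scores reproduces sklearn's multioutput='uniform_average' behaviour:

import numpy as np
from sklearn.metrics import explained_variance_score

true_data = np.array([[1.0, 2.0], [2.5, -1.0], [4.5, 3.0], [5.0, 7.0]])
predict_data = np.array([[1.0, 1.0], [2.0, -1.0], [5.0, 4.0], [5.0, 6.5]])

# both calls report the same unweighted average over the two output columns
print(regression_score(true_data, predict_data))
print(explained_variance_score(true_data, predict_data, multioutput='uniform_average'))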
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(record=True):
        # zero_one is deprecated and raises a DeprecationWarning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(record=True):
        # zero_one_score is deprecated and raises a DeprecationWarning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true),
                        0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def runRegressor( clf,featureMat,targets,no_of_training_example ):
	try:
		clf.fit(featureMat[:no_of_training_example,:], targets[:no_of_training_example])
		y_pred = clf.predict(featureMat[no_of_training_example:,:])
		print 'Variance Score'
		print explained_variance_score(targets[no_of_training_example:], y_pred)
		print 'Mean absolute error'
		print mean_absolute_error(targets[no_of_training_example:], y_pred)
		print 'Explained variance score'
		print explained_variance_score(targets[no_of_training_example:], y_pred)
	except Exception, e:	
		print e
Example #4
def test_regression_multioutput_array():
    y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]]
    y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]]

    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r = r2_score(y_true, y_pred, multioutput='raw_values')
    evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')

    assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2)
    assert_array_almost_equal(mae, [0.25, 0.625], decimal=2)
    assert_array_almost_equal(r, [0.95, 0.93], decimal=2)
    assert_array_almost_equal(evs, [0.95, 0.93], decimal=2)

    # mean_absolute_error and mean_squared_error are equal here because
    # every per-element error has magnitude 1, so |e| == e**2.
    y_true = [[0, 0]]*4
    y_pred = [[1, 1]]*4
    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r = r2_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(mse, [1., 1.], decimal=2)
    assert_array_almost_equal(mae, [1., 1.], decimal=2)
    assert_array_almost_equal(r, [0., 0.], decimal=2)

    r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values')
    assert_array_almost_equal(r, [0, -3.5], decimal=2)
    assert_equal(np.mean(r), r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
                 multioutput='uniform_average'))
    evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]],
                                   multioutput='raw_values')
    assert_array_almost_equal(evs, [0, -1.25], decimal=2)

    # Check the case in which both the numerator and denominator are zero.
    y_true = [[1, 3], [-1, 2]]
    y_pred = [[1, 4], [-1, 1]]
    r2 = r2_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(r2, [1., -3.], decimal=2)
    assert_equal(np.mean(r2), r2_score(y_true, y_pred,
                 multioutput='uniform_average'))
    evs = explained_variance_score(y_true, y_pred, multioutput='raw_values')
    assert_array_almost_equal(evs, [1., -3.], decimal=2)
    assert_equal(np.mean(evs), explained_variance_score(y_true, y_pred))

    # Handling msle separately as it does not accept negative inputs.
    y_true = np.array([[0.5, 1], [1, 2], [7, 6]])
    y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]])
    msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
    msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred),
                               multioutput='raw_values')
    assert_array_almost_equal(msle, msle2, decimal=2)
def EXP_VAR(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None, source_model=None,
        ss=None, source_model_args=None, method=None):
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)
        try:
            return explained_variance_score(flux_arr[inds], back_trans_flux, multioutput=multioutput)
        except:
            return float(np.mean(np.var(flux_arr[inds] - back_trans_flux, axis=1) / np.var(flux_arr[inds], axis=1)))
    else:
        try:
            return explained_variance_score(Y, y, multioutput=multioutput)
        except:
            return float(np.mean(np.var(Y - y, axis=1) / np.var(Y, axis=1)))
Example #6
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # symmetric
    assert_equal(zero_one(y_true, y_pred),
                 zero_one(y_pred, y_true))
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        mean_squared_error(y_pred, y_true))
    # not symmetric
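    # (both scores normalise the residuals by the spread of their first
    #  argument, so swapping y_true and y_pred changes the value)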
    assert_true(explained_variance_score(y_true, y_pred) !=
            explained_variance_score(y_pred, y_true))
    assert_true(r2_score(y_true, y_pred) !=
            r2_score(y_pred, y_true))
def svm(X,Y,k):

	if k > 0:
		y = Y
		totalRSS = 0
		totalR_sq = 0
		totalev = 0

		kf = KFold(len(X), n_folds=k)
		for train_index, test_index in kf:
			x_train, y_train = X[train_index], y[train_index]
			x_test,y_test = X[test_index], y[test_index]
			clf = SVR(kernel='rbf',C=1e5,degree=5)

			clf.fit(x_train,y_train)
			pred = clf.predict(x_test)
			#print("Residual sum of squares: %.5f"
			  #% np.mean((clf.predict(x_test) - y_test) ** 2))
			
			pred = clf.predict(x_test)
			totalRSS += np.mean((clf.predict(x_test) - y_test) ** 2)
			
			totalR_sq += r2_score(y_test, pred)
			
			totalev += explained_variance_score(y_test,pred)
		print("Residual sum of squares: {}".format(float(totalRSS)/float(k)))
		print('R^2 score: {}'.format(float(totalR_sq)/float(k)))
		print('explained variance score: {}'.format(float(totalev)/float(k)))
			
	else:
		X = preprocessing.scale(X)
		cutoff = int(len(X)*.7)
		x_train,y_train = X[:cutoff], Y[:cutoff]
		x_test, y_test = X[(cutoff+1):],Y[(cutoff+1):]

		clf = SVR(kernel='rbf',C=1e5,degree=5)

		clf.fit(x_train,y_train)
		pred = clf.predict(x_test)
		print("Residual sum of squares: %.5f"
		  % np.mean((clf.predict(x_test) - y_test) ** 2))
		pred = clf.predict(x_test)

		# Explained variance score: 1 is perfect prediction
		print('R^2 score: %.8f' % r2_score(y_test, pred))
		print('explained variance score: %.8f' %explained_variance_score(y_test,pred))

	return
def main(args):
    if len(args) < 2:
        print("USAGE: python linear_regression.py [feature matrix] [values]")
        exit(0)
    X = np.genfromtxt(args[0], delimiter=',')
    Y = np.genfromtxt(args[1], delimiter=',')  
    X = util.process_X(X)
    # X = util.item_item_collab_filtering(X, 100, -1)
    if('dap' in args[0]):
        X = util.fill_mean2(X)
    else:
        X = util.fill_mean(X,-1)
    print X
    X = util.variance_threshold(X, 1)
    kfolds = False
    if len(args) >= 3: 
        kfolds = True
    if kfolds:
        kf = KFold(len(X), n_folds=int(args[2]))
        for train_index, test_index in kf:
            x_train, y_train = X[train_index], Y[train_index]
            x_test,y_test = X[test_index], Y[test_index]
            regr = linear_model.LinearRegression()
            regr.fit(x_train,y_train)
            print("Residual sum of squares: %.5f"
              % np.mean((regr.predict(x_test) - y_test) ** 2))
            pred = regr.predict(x_test)

            # Explained variance score: 1 is perfect prediction
            print('Variance score: %.8f' % regr.score(x_test, y_test))
            print('R^2 score: %.8f' % r2_score(y_test, pred))
            print('explained variance score: %.8f' % explained_variance_score(y_test,pred))
            print '\n'
    else:
        cutoff = int(len(X)*.7)
        x_train, y_train = X[:cutoff], Y[:cutoff]
        x_test,y_test = X[(cutoff+1):], Y[(cutoff+1):]
        regr = linear_model.LinearRegression()
        regr.fit(x_train,y_train)
        print("Residual sum of squares: %.5f"
          % np.mean((regr.predict(x_test) - y_test) ** 2))
        pred = regr.predict(x_test)
        

        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.8f' % regr.score(x_test, y_test))
        print('R^2 score: %.8f' % r2_score(y_test, pred))
        print('explained variance score: %.8f' %explained_variance_score(y_test,pred))
def plotResults(predicted, expected, output):
    """
    Generate a simple plot demonstrating the results.
    """
    var = metrics.explained_variance_score(expected, predicted)
    mae = metrics.mean_absolute_error(expected, predicted)
    mse = metrics.mean_squared_error(expected, predicted)
    r2 = metrics.r2_score(expected, predicted)
    rms = np.sqrt(np.mean((expected - predicted) ** 2))

    print output
    print 'Explained variance (best possible score is 1.0, lower values are worse):', var
    print 'Mean Absolute Error (best is 0.0):', mae
    print 'Mean Squared Error (best is 0.0):', mse
    print 'R2 score (best is 1.0):', r2
    print 'RMS:', rms
    print '\n\n\n'

    title = 'RMS=%.4f, MSE=%.4f, R2=%.3f' % (rms, mse, r2)

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    plt.title(title)
    ax1.scatter(expected, predicted, alpha=0.2, s=5)
    ax1.set_xlabel("Spectroscopic Redshift")
    ax1.set_ylabel("Photo-z")
    ax1.plot([0, 8], [0, 8], '-r')
    ax1.set_xlim(0, 1.1*expected.max())
    ax1.set_ylim(0, 1.1*expected.max())
    plt.savefig(output+'Results.pdf')
    plt.close()
def performance_metric(label, prediction):
    """Calculate and return the appropriate error performance metric."""

    ###################################
    ### Step 3. YOUR CODE GOES HERE ###
    ###################################

    # The following page has a table of scoring functions in sklearn:
    # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics

    # In order to study all of the different performance metrics, I will simply
    # calculate them all and return a dictionary with all of the results
    l, p = label, prediction

    output = collections.OrderedDict()

    output["explained variance score"] = skmetrics.explained_variance_score(
        l, p)
    output["mean absolute error"] = skmetrics.mean_absolute_error(l, p)

    output["mean squared error"] = skmetrics.mean_squared_error(l, p)

    output["root mean squared error"] = np.sqrt(
        skmetrics.mean_squared_error(l, p))

    output["median absolute error"] = skmetrics.median_absolute_error(l, p)

    output["r2 score"] = skmetrics.r2_score(l, p)

    return output
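A short usage sketch with toy labels and predictions, assuming the module-level imports this function relies on (collections, numpy as np, sklearn.metrics as skmetrics) are already in place:

labels = [3.0, -0.5, 2.0, 7.0]
predictions = [2.5, 0.0, 2.0, 8.0]
# iterate over the OrderedDict of metrics returned by performance_metric
for name, value in performance_metric(labels, predictions).items():
    print("%s: %.3f" % (name, value))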
Example #11
def across_all_appliances(scores, mains, aggregate_predictions):
    total_sum_abs_diff = 0.0
    for appliance_scores in scores.values():
        total_sum_abs_diff += appliance_scores['sum_abs_diff']

    # Total energy correctly assigned
    # See Eq(1) on p5 of Kolter & Johnson 2011
    denominator = 2 * np.sum(mains)
    total_energy_correctly_assigned = 1 - (total_sum_abs_diff / denominator)
    total_energy_correctly_assigned = float(total_energy_correctly_assigned)

    # explained variance
    n = min(len(mains), len(aggregate_predictions))
    mains = mains[:n]
    aggregate_predictions = aggregate_predictions[:n]

    scores['across all appliances'] = {
        'total_energy_correctly_assigned': total_energy_correctly_assigned,
        'explained_variance_score': float(
            metrics.explained_variance_score(mains, aggregate_predictions)),
        'mean_absolute_error': float(
            np.mean(
                [scores[app]['mean_absolute_error']
                 for app in scores])),
        'relative_error_in_total_energy': float(
            np.mean(
                [scores[app]['relative_error_in_total_energy']
                 for app in scores])),
    }
    scores['across all appliances'].update({
        metric: float(np.mean([scores[app][metric] for app in scores]))
        for metric in METRICS['classification']
    })

    return scores
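For reference, the "total energy correctly assigned" value computed above follows Eq. (1) of Kolter & Johnson (2011); a sketch of that formula in the notation used here, with total_sum_abs_diff as the numerator and 2 * sum(mains) as the denominator:

\[
\mathrm{Acc} \;=\; 1 \;-\; \frac{\sum_t \sum_n \bigl|\hat{y}_t^{(n)} - y_t^{(n)}\bigr|}{2 \sum_t \bar{y}_t}
\]

where \(\hat{y}_t^{(n)}\) and \(y_t^{(n)}\) are the predicted and true power of appliance \(n\) at time \(t\), and \(\bar{y}_t\) is the aggregate (mains) reading.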
Example #12
File: models_skl.py  Project: satra/sad
def actvspred(modelname, predmodel):
    """
    plot the predicted vs. the actual score
    """
    predscores, actualscores, meanerr, rmsqerr = predmodel
    axmax = int(round(np.max([predscores,actualscores])))
    axmin = int(round(np.min([predscores,actualscores])))
    # fit line through the scores
    actualscores2 = actualscores.reshape(subject_num,1)
    model = lm.LinearRegression()
    model.fit(actualscores2, predscores)
    # get explained variance  
    rsqrd = skm.explained_variance_score(actualscores, predscores)
    x = np.array(range(axmin-5, axmax+6))
    y = model.coef_[0]*x+model.intercept_
    # plot scatterplot and lines
    plt.figure()
    plt.scatter(actualscores,predscores,s=70)
    plt.plot(x,x,'g',label='optimal model')  
    plt.plot(x,y,'k',label='our model',linewidth=2)
    plt.xlabel("actual lsas delta")
    plt.ylabel("predicted lsas delta")
    plt.title(modelname)
    plt.axis([axmin-5,axmax+5,axmin-5,axmax+5])
    axes = plt.axes()
    axes.grid(b=True)
    axes.text(0.05,0.8,"meanerr: %.2f\nrmse: %.2f\nexpl. var: %.2f"%(meanerr,rmsqerr,rsqrd),transform=axes.transAxes)
    #plt.legend()
    plt.savefig(os.path.join(outdir,"%s_crossval.png"%modelname),dpi=100,format="png")
Example #13
def plot_expl_var(y_true, y_pred, vari, lev, label=None):
    expl_var = metrics.explained_variance_score(y_true, y_pred,
                                                multioutput='raw_values')
    plt.plot(unpack(expl_var, vari, axis=0), lev, label=label)
    plt.ylim([np.amax(lev), np.amin(lev)])
    plt.ylabel('$\sigma$')
    plt.title('Explained Variance Regression Score')
Example #14
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    
    clf.fit(X_train, y_train)
    
    #print("Accuracy on training set:")
    #print(clf.score(X_train, y_train))
    #print("Accuracy on testing set:")
    #print(clf.score(X_test, y_test))
    
    y_predicted = clf.predict(X_test)
    
    vysledek = (mean_absolute_error(y_test, y_predicted),
                mean_squared_error(y_test, y_predicted),
                r2_score(y_test, y_predicted),
                explained_variance_score(y_test, y_predicted))
    
    #print("mean_absolute_error:")
    #print(vysledek[0])
    #print("mean_squared_error:")
    #print(vysledek[1])
    #print("r2_score:")
    #print(vysledek[2])
    #print("explained_variance_score:")
    #print(vysledek[3])
    
    return vysledek
Example #15
File: ml.py  Project: nadpriplod/rosa
def linreg(y,X):

    # Split the data into training/testing sets
    X_train = X[:-2000]
    X_test = X[-2000:]

    # Split the targets into training/testing sets
    y_train = y[:-2000]
    y_test = y[-2000:]

    # Create linear regression object
    regr = linear_model.LinearRegression(normalize=True)

    # Train the model using the training sets
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    # The intercept
    #print('Intercept: \n', regr.intercept_ )
    # The coefficients
    #print('Coefficients: \n', regr.coef_)
    # The mean square error
    print("Residual sum of squares:")
    print(((y_pred - y_test) ** 2).sum())
    #print((((y_test - y_test.mean()) ** 2).sum())/(len(y_test)-1))
    print("Variance:")
    print(y_test.var())
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.6f' % explained_variance_score(y_test, y_pred, multioutput='variance_weighted'))

    return regr
Example #16
File: summary.py  Project: TaRyu/fx
def score():
    methods = ['cro_cnn',
               'cro_knn',
               'cro_svm',
               'mon_ann',
               'mon_knn',
               'mon_svm',
               'day_ann',
               'day_knn',
               'day_svm']
    result_tmp1 = np.empty(0)
    result_tmp2 = np.empty(0)
    for fx in FX_LIST:
        data = pd.read_pickle('%s/summary_%s.pkl' % (PREX, fx))
        for method in methods:
            score1 = metrics.mean_squared_error(data['real'], data[method])
            result_tmp1 = np.append(result_tmp1, score1)
            score2 = metrics.explained_variance_score(
                data['real'], data[method])
            result_tmp2 = np.append(result_tmp2, score2)
    result1 = pd.DataFrame(result_tmp1.reshape(-1, len(methods)),
                           index=FX_LIST, columns=methods)
    result2 = pd.DataFrame(result_tmp2.reshape(-1, len(methods)),
                           index=FX_LIST, columns=methods)
    result1.to_pickle('%s/summary_mse.pkl' % PREX)
    result2.to_pickle('%s/summary_evs.pkl' % PREX)
    return result1, result2
def print_reg_metrics(y_test, y_pred):
    print '%s  %s' % ('metric'.center(20), 'value'.center(12))
    print '--------------------  ------------'
    print 'explained variance:  %12.3f' % metrics.explained_variance_score(y_test, y_pred)
    print 'mean absolute error: %12.3f' % metrics.mean_absolute_error(y_test, y_pred)
    print 'mean squared error:  %12.3f' % metrics.mean_squared_error(y_test, y_pred)
    print 'R-squared score:     %12.3f' % metrics.r2_score(y_test, y_pred)
Example #18
File: ml.py  Project: jarulraj/dbtune
def estimate_performance(file, features_to_discard):
    methods = [#("Lasso Regression", linear_model.Lasso(alpha = 0.05)),
               #("Gaussian Processes", gaussian_process.GaussianProcess(theta0=1e-2, corr='absolute_exponential')),
               ("SVR", svm.SVR(kernel="linear", C=1e3, degree=4))]

    [_, y_benchmark, num_benchmarks] =  preprocess(file, normalize_data, BENCHMARK_LABEL_FIELD, features_to_discard)
    [X_throughput_combined, y_throughput_combined, num_throughputs] = preprocess(file, normalize_data, THROUGHPUT_LABEL_FIELD, features_to_discard)

    for name, instance in methods:
        print("===========================================================================")
        print("Using method %s" % name)
        print("===========================================================================")

        for benchmark_number in range(num_benchmarks):
            print("-----------------------")
            print("Estimating for benchmark %s" % benchmark_list[benchmark_number][0])
            print("-----------------------")

            sample_filter = y_benchmark == benchmark_number
            X_throughput = X_throughput_combined[sample_filter, :]
            y_throughput = y_throughput_combined[sample_filter]

            print("Found %d samples, doing two-way CV" % X_throughput.shape[0])

            [X_train, y_train, X_test, y_test] = split_data(X_throughput, y_throughput, 2)
            instance.fit(X_train, y_train)
            y_pred = instance.predict(X_test)

            np.set_printoptions(suppress=True)
            print(y_test[:20])
            print(y_pred[:20])

            print("Estimator got R2 Score %f" % r2_score(y_test, y_pred))
            print("Estimator got Explained Variance %f" % explained_variance_score(y_test, y_pred))
Example #19
def cross_validate_predictor(data, features, clf_options, output_filename=None):
    print(clf_options)
    data_x = data[features].values
    data_y = data['ddg_exp'].values
    cv = cross_validation.LeaveOneLabelOut(data['label'].values)
    clf = ensemble.GradientBoostingRegressor(**clf_options)
    y_pred_all = []
    y_true_all = []
    for train, test in cv:
        x_train = data_x[train]
        y_train = data_y[train]
        x_test = data_x[test]
        y_test = data_y[test]
        clf.fit(x_train, y_train)
        probas_ = clf.predict(x_test)
        y_pred_all.extend(probas_)
        y_true_all.extend(y_test)
    results = clf_options.copy()
    results['n_features'] = len(features)
    results['features'] = ','.join(features)
    results['explained_variance_score'] = metrics.explained_variance_score(y_true_all, y_pred_all)
    results['mean_absolute_error'] = metrics.mean_absolute_error(y_true_all, y_pred_all)
    results['mean_squared_error'] = metrics.mean_squared_error(y_true_all, y_pred_all)
    results['r2_score'] = metrics.r2_score(y_true_all, y_pred_all)

    if output_filename is not None:
        write_row_to_file(results, output_filename)
    return results, y_true_all, y_pred_all
Example #20
File: train.py  Project: gbourdin/charlas
def cli(dataset_path, out_file):
    """Train a new model.

    This will train a new model using the provided dataset, trained model
    will be dumped to OUT file.
    """
    data = pd.read_csv(dataset_path)
    data = data.dropna(axis=0)  # Just drop empty values
    X = data[FEATURES]
    y = data['Price']

    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=1
    )

    model = HousePricePredictor()
    model.fit(train_X, train_y)
    model.dump(out_file)

    predictions = model.predict(test_X)
    print("Mean Absolute Error : " + str(
        mean_absolute_error(predictions, test_y)))
    print("Explained Variance Score :" + str(
        explained_variance_score(predictions, test_y)))
    print("R2 Score :" + str(r2_score(predictions, test_y)))
def print_evaluations(Y_true, Y_pred, classification=True):
    
    if classification:
        report = classification_report(Y_true, Y_pred)
        logging.info('Classification report:\n%s' % str(report))

        cm = confusion_matrix(Y_true, Y_pred)
        logging.info('Confusion Matrix:\n%s' % str(cm))
    
        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # cax = ax.matshow(cm)
        # fig.colorbar(cax)
        #
        # ax.set_xticklabels(['']+['-1', '0', '1'])
        # ax.set_yticklabels(['']+['-1', '0', '1'])
        #
        # plt.title('Confusion Matrix')
        # plt.ylabel('True label')
        # plt.xlabel('Predicted label')
        # plt.show(block=False)

    else:
        var = explained_variance_score(Y_true, Y_pred)
        logging.info('Explained variance (best=1.0): %f' % var)
        
        mae = mean_absolute_error(Y_true, Y_pred)
        logging.info('Mean absolute error (best=0.0): %f' % mae)
        
        mse = mean_squared_error(Y_true, Y_pred)
        logging.info('Mean squared error (best=0.0): %f' % mse)
        
        r2 = r2_score(Y_true, Y_pred)
        logging.info('R squared score (best=1.0): %f' % r2)
def regression_metrics( csv_test, csv_result, last_or_first ):

    real_results = []
    predicted_results = []

    with open(csv_test, 'rb') as csv_test_file:
        csv_test_reader = csv.reader(csv_test_file, delimiter=',', quotechar='"')
        for row in csv_test_reader:
            if last_or_first == 'first_field':
                real_results.append(float(row.pop(0)))
            else:
                real_results.append(float(row.pop()))
                
    with open(csv_result, 'rb') as csv_result_file:
        csv_result_reader = csv.reader(csv_result_file, delimiter=',', quotechar='"')
        for row in csv_result_reader:
            if last_or_first == 'first_field':
                predicted_results.append(float(row.pop(0)))
            else:
                predicted_results.append(float(row.pop()))            

    labels = list(set(real_results))           

    print('Explained variance score: %f' % explained_variance_score(real_results, predicted_results))  
    print('Mean squared error: %f' % mean_squared_error(real_results, predicted_results))  
    print('Mean absolute error: %f' % mean_absolute_error(real_results, predicted_results))  
Example #23
def display(reg, reg_name):
    reg=reg.fit(x_train,y_train)
    y_pred=reg.predict(x_test)
    r2 = reg.score(x_test,y_test)
    
    lst_reg.append(reg_name)    
        
    rms = sqrt(mean_squared_error(y_test, y_pred))
    #print("The Root mean square error for the Regressor is: "+str(rms))
    rms = round(rms,2)
    lst_rms.append(str(rms))
    
    r2 = r2_score(y_test,y_pred)
    #print("r squared value: "+str(r2))
    r2 = round(r2,2)
    lst_r2.append(r2)
    
    var_score = explained_variance_score(y_test,y_pred)
    #print("Variance Score: "+str(var_score))
    var_score = round(var_score,2)
    lst_vs.append(var_score)
    
    mean_abs_error=mean_absolute_error(y_test,y_pred)
    #print("Mean Absolute Error: "+str(mean_abs_error))
    mean_abs_error = round(mean_abs_error,2)
    lst_mae.append(mean_abs_error)
    
    #print(reg.coef_,reg.intercept_)
    
    dic['Regressor'] = lst_reg
    dic['RMSE'] = lst_rms
    dic['R Square'] = lst_r2
    dic['Var Score'] = lst_vs
    dic['Mean Abs Err'] = lst_mae
def pearso(name, X1, X2):
    print X1.shape, X2.shape
    print type(X1)    
    print "Pearson correlation %s %f" %(name, pearsonr(X1,X2)[0])
    print "Correct samples %s %f" %(name, 1-((X1-X2)!=0).sum()/float(len(X1)))
    print "RMSE %f" % sqrt(sum((X1-X2)**2)/len(X1))
    print "Explained Variance Score %f" % metrics.explained_variance_score(X1,X2)
def exp_var(
    rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
):
    """Calculate explained variance.

    Args:
        rating_true (pd.DataFrame): True data. There should be no duplicate (userID, itemID) pairs
        rating_pred (pd.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction

    Returns:
        float: Explained variance (best possible score is 1.0; the value can be negative).
    """

    y_true, y_pred = merge_rating_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
    )
    return explained_variance_score(y_true, y_pred)
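A minimal usage sketch for exp_var, passing the column names explicitly rather than relying on the DEFAULT_* constants, and assuming merge_rating_true_pred inner-joins the two frames on the user/item columns:

import pandas as pd

# toy ratings and predictions for two users and three items
rating_true = pd.DataFrame({"user": [1, 1, 2, 2], "item": [10, 11, 10, 12],
                            "rating": [5.0, 3.0, 4.0, 2.0]})
rating_pred = pd.DataFrame({"user": [1, 1, 2, 2], "item": [10, 11, 10, 12],
                            "score": [4.5, 3.5, 4.0, 2.5]})

print(exp_var(rating_true, rating_pred, col_user="user", col_item="item",
              col_rating="rating", col_prediction="score"))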
Example #26
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n = y_true.shape[0]

    assert_equal(zero_one(y_true, y_pred), 13)
    assert_almost_equal(mean_squared_error(y_true, y_pred), 12.999 / n, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def displayResults(clf, title):
    print "\n\n=== Reuslt of ", title, " ==="
    y_pred_raw = clf.predict(X_test)
    y_pred = scaler.inverse_transform(y_pred_raw[:20])
    y_true_raw = y_test
    y_true = scaler.inverse_transform(y_true_raw[:20])

    print "\npredicted result, true result"
    for i in range(len(y_true)):
        print y_pred[i], "\t", y_true[i]

    print "\nr2_score:"
    print r2_score(y_true_raw, y_pred_raw)
    print "\nexplained_variance_score:"
    print explained_variance_score(y_true_raw, y_pred_raw)
    print "\nmean_squared_error:"
    print mean_squared_error(y_true_raw, y_pred_raw)
Example #28
def test_regression_metrics(n_samples=50):
    y_true = np.arange(n_samples)
    y_pred = y_true + 1

    assert_almost_equal(mean_squared_error(y_true, y_pred), 1.)
    assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.)
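    # y_pred differs from y_true only by a constant offset, so the residual
    # variance is zero and the explained variance is exactly 1, even though
    # r2_score still penalises the bias (0.995 here)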
    assert_almost_equal(r2_score(y_true, y_pred),  0.995, 2)
    assert_almost_equal(explained_variance_score(y_true, y_pred), 1.)
Example #29
def printMetrics(estimator, X_train, y_train, y_test, y_pred):
    
    scores = cross_validation.cross_val_score(estimator, X_train,y_train, cv=5)
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
    print "EVS: %.4f" % explained_variance_score(y_test, y_pred)
    print "MAE: %.4f" % mean_absolute_error(y_test, y_pred)
    print "MSE: %.4f" % mean_squared_error(y_test, y_pred)
    print "R2: %.4f" % r2_score(y_test, y_pred)
Example #30
	def score(self, X, y):
		if self.model_type == 'classification':
			yhat = self.predict(X)
			return np.mean(yhat == y)
		elif self.model_type == 'regression':
			yhat = self.predict(X)
			return metrics.explained_variance_score(y, yhat)
		else:
			raise RuntimeError('unknown model type')
Example #31
def evaluate(y_actual, y_predicted):
    explained_variance = explained_variance_score(y_actual, y_predicted)
    pearson = pearsonr(y_actual, y_predicted)
    rms = sqrt(mean_squared_error(y_actual, y_predicted))
    return (explained_variance, pearson[0], rms)
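A quick usage sketch, assuming the module already imports explained_variance_score and mean_squared_error from sklearn.metrics, pearsonr from scipy.stats, and sqrt from math:

# toy actual and predicted values
ev, rho, rms = evaluate([3.0, 1.0, 2.0, 4.0], [2.5, 1.5, 2.0, 3.5])
print("explained variance: %.3f  pearson r: %.3f  rmse: %.3f" % (ev, rho, rms))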
for train_index, test_index in split(new_data, n_splits=3):
    # print("TRAIN:", train_index, "TEST:", test_index)
    new_data_train, new_data_test = new_data.ix[train_index], new_data.ix[
        test_index]
    target_train, target_test = target.ix[train_index], target.ix[test_index]
    # print(list(map(tuple, np.where(np.isnan(new_data_train)))))
    # print(new_data_train.ix[[605]])
    ###any nan or infinite
    # print(np.any(np.isnan(new_data_train)),np.all(np.isfinite(new_data_test)),
    ###
    estimator.fit(new_data_train, target_train)
    # print(new_data_test)
    target_pred = estimator.predict(new_data_test.values)
    print("r2 score:", r2_score(target_test,
                                target_pred), 'explained variance score:',
          explained_variance_score(target_test,
                                   target_pred), 'mean_squared_error',
          mean_squared_error(target_test, target_pred), 'mean_absolute_error',
          mean_absolute_error(target_test, target_pred),
          'median_absolute_error',
          median_absolute_error(target_test, target_pred))

# print(estimator.best_params_, estimator.best_estimator_)
# print(estimator.alpha_)
# print(estimator.best_estimator_.coef_, estimator.best_estimator_.residues_, estimator.best_estimator_.intercept_)

###samples to see the result of prediction
# print(good_data.ix[1624,'y'],estimator.predict(good_data.ix[1624,:].drop('y').values))
# print(good_data.ix[14,'y'],estimator.predict(good_data.ix[14,:].drop('y').values))
# print(good_data.ix[164,'y'],estimator.predict(good_data.ix[164,:].drop('y').values))
# print(good_data.ix[333,'y'],estimator.predict(good_data.ix[333,:].drop('y').values))
# print(good_data.ix[1000,'y'],estimator.predict(good_data.ix[1000,:].drop('y').values))
print('ELASTICNET REGRESSION')
print(df1)
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,learning_rate = 0.1, loss = 'ls')
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1 = df.head(10)
print("****************************************************************************") 
print('GRADIENTBOOST REGRESSION')
print(df1)
x=[regressor.score(X_train,y_train)*100,rr.score(X_train,y_train)*100,model_lasso.score(X_train,y_train)*100,model_enet.score(X_train,y_train)*100,clf.score(X_train,y_train)*100]
y=[regressor.score(X_test, y_test)*100,rr.score(X_test, y_test)*100,model_lasso.score(X_test, y_test)*100,model_enet.score(X_test, y_test)*100,clf.score(X_test, y_test)*100]
z=[metrics.mean_absolute_error(y_test, y_predd),metrics.mean_absolute_error(y_test,pred_test_rr),metrics.mean_absolute_error(y_test,pred_test_lasso),metrics.mean_absolute_error(y_test,pred_test_enet),metrics.mean_absolute_error(y_test, y_pred)]
h=[metrics.mean_squared_error(y_test, y_predd),metrics.mean_squared_error(y_test,pred_test_rr),metrics.mean_squared_error(y_test,pred_test_lasso),metrics.mean_squared_error(y_test,pred_test_enet),metrics.mean_squared_error(y_test, y_pred)]
g=[np.sqrt(metrics.mean_squared_error(y_test, y_predd)),np.sqrt(metrics.mean_squared_error(y_test,pred_test_rr)), np.sqrt(metrics.mean_squared_error(y_test,pred_test_lasso)),np.sqrt(metrics.mean_squared_error(y_test,pred_test_enet)),np.sqrt(metrics.mean_squared_error(y_test, y_pred))]
v=[metrics.explained_variance_score(y_test,y_predd),metrics.explained_variance_score(y_test,pred_test_rr),metrics.explained_variance_score(y_test,pred_test_lasso),metrics.explained_variance_score(y_test,pred_test_enet),metrics.explained_variance_score(y_test, y_pred)]
print("****************************************************************************") 
data = pd.DataFrame(np.column_stack([x,y,z,h,g,v]),columns=['Train Score','Test Score','Mean Absolute Error','Mean Squared Error','Root Mean Squared Error','Variance'],index= ['Linear Regression Model:','Ridge Regression Model:','Lasso Regression Model:','ElasticNet Regression Model:','GradientBoosting Regression Model:'])
print(data.to_string())
print("****************************************************************************")
fig = plt.figure(figsize=(10,5))
fig.add_subplot(3,2,1)
plt.scatter(y_test,y_predd)
plt.title(" MULTIPLE LINEAR REGRESSION ")
plt.ylabel('predicted value')
plt.xlabel('Actual price')
fig.add_subplot(3,2,2)
plt.scatter(y_test,pred_test_rr,color='purple')
plt.title("RIDGE REGRESSION ")
plt.ylabel('predicted value')
plt.xlabel('Actual price');
Example #34
x, y = shuffle(housing_data.data, housing_data.target, random_state=7)
num_training = int(len(x) * 0.8)
x_train, y_train = x[:num_training], y[:num_training]
x_test, y_test = x[num_training:], y[num_training:]

dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(x_train, y_train)

ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 n_estimators=400,
                                 random_state=7)
ab_regressor.fit(x_train, y_train)

y_test_pred_dt = dt_regressor.predict(x_test)
y_test_pred_ab = ab_regressor.predict(x_test)
ab_regressor_mse = round(mean_squared_error(y_test, y_test_pred_ab), 2)
ab_regressor_evs = round(explained_variance_score(y_test, y_test_pred_ab), 2)
dt_regressor_mse = round(mean_squared_error(y_test, y_test_pred_dt), 2)
dt_regressor_evs = round(explained_variance_score(y_test, y_test_pred_dt), 2)
print(
    "\nab_regressor mean_squared_error ={0:.2f}; explained_variance_score={1:.2f}"
    .format(ab_regressor_mse, ab_regressor_evs))
print(
    "\ndt_regressor mean_squared_error={0:.2f}; explained_variance_score={1:.2f}"
    .format(dt_regressor_mse, dt_regressor_evs))

plot_feature_importance(dt_regressor.feature_importances_, 'dt_regressor',
                        housing_data.feature_names)
plot_feature_importance(ab_regressor.feature_importances_, 'ab_regressor',
                        housing_data.feature_names)
Example #35
def test_losses_at_limits():
    # test limit cases
    assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2)
    assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2)
Example #36
def evaluate_regression(y_true, predictions, validation_loss, epoch_counter_train, roc_df):
	
	# y_true = np.ravel(np.reshape(y_true, (-1,1)))
	# predictions = np.ravel(np.reshape(predictions, (-1,1)))

	y_true = np.nan_to_num(y_true)
	predictions = np.nan_to_num(predictions)

	mse = mean_squared_error(y_true, predictions)
	r2 = r2_score(y_true, predictions)
	mae = mean_absolute_error(y_true, predictions)
	error_var = np.var(np.abs(y_true - predictions))
	explained_var = explained_variance_score(y_true, predictions)

	if r2 < 0.: r2 = 0.
	if explained_var < 0.: explained_var = 0.


	roc_df.append({
		'epoch': epoch_counter_train,
		# 'train_loss': np.round(last_train_epoch_loss, 5),
		'val_loss': np.round(validation_loss, 5),
		'mse': np.round(mse, 2),
		'r2': np.round(r2, 2),
		'mae': np.round(mae, 2),
		'error_var': np.round(error_var, 2),
		'explained_var': np.round(explained_var, 2),
		})

	print(pd.DataFrame(roc_df))
	pd.DataFrame(roc_df).to_csv(args['OUTPATH'] + 'result_df.csv')

	summary_writer.add_scalar('performance/mse', mse, epoch)
	summary_writer.add_scalar('performance/r2', r2, epoch)
	summary_writer.add_scalar('performance/mae', mae, epoch)
	summary_writer.add_scalar('performance/error_var', error_var, epoch)
	summary_writer.add_scalar('performance/explained_var', explained_var, epoch)


	plt.figure(figsize=(12,12))
	plt.title('epoch ' + str(epoch_counter_train) + ' | mae ' + str(np.round(mae, 2)) + ' | r2 ' + str(np.round(r2, 2)))
	plt.scatter(y_true, predictions, c='darkgreen', s=16, alpha=.4)
	plt.xscale('log')
	plt.yscale('log')
	if args['target_label'] == 'length_of_icu':
		plt.xlim(1., 1000.)
		plt.ylim(1., 1000.)
	if args['target_label'] == 'length_of_stay':
		plt.xlim(1., 2000.)
		plt.ylim(1., 2000.)
	plt.grid(which='both')
	plt.xlabel('Labels [hours spent in ICU]')
	plt.ylabel('Predictions [hours spent in ICU]')
	plt.savefig(args['OUTPATH'] + args['target_label'] + '/predictions/' + 'epoch_' + str(epoch_counter_train) + '.pdf')
	plt.close()

	performance_x_vec = np.linspace(0, epoch_counter_train, len(pd.DataFrame(roc_df)))

	plt.figure()
	plt.plot(performance_x_vec, pd.DataFrame(roc_df)['mse'], c='darkgreen', label='mse', linewidth=4, alpha=.6)
	plt.yscale('log')
	plt.xlabel('epochs')
	plt.ylabel('MSE Loss')
	plt.title('Mean Squared Error')
	plt.ylim(1e2,1e5)
	plt.grid(which='both')
	plt.legend()
	plt.savefig(args['OUTPATH'] + args['target_label'] + 'mse.pdf')
	plt.close()

	plt.figure()
	plt.plot(performance_x_vec, pd.DataFrame(roc_df)['r2'], c='darkgreen', label='r2', linewidth=4, alpha=.6)
	plt.xlabel('epochs')
	plt.ylabel('R Squared')
	plt.title('R Squared')
	plt.grid()
	plt.legend()
	plt.savefig(args['OUTPATH'] + args['target_label'] + 'r2.pdf')
	plt.close()

	plt.figure()
	plt.plot(performance_x_vec, pd.DataFrame(roc_df)['mae'], c='darkgreen', label='mae', linewidth=4, alpha=.6)
	plt.yscale('log')
	plt.xlabel('epochs')
	plt.ylabel('Mean Absolute Error [hours spent in ICU]')
	plt.title('Mean Absolute Error')
	plt.ylim(10.,100.)
	plt.yscale('log')
	plt.grid(which='both')
	plt.legend()
	plt.savefig(args['OUTPATH'] + args['target_label'] + 'mae_epoch' + str(epoch_counter_train) + '.pdf')
	plt.close()

	plt.figure()
	plt.plot(performance_x_vec, pd.DataFrame(roc_df)['explained_var'], c='darkgreen', label='explained_var', linewidth=4, alpha=.6)
	plt.xlabel('epochs')
	plt.ylabel('Explained Variance')
	plt.title('Explained Variance')
	plt.grid()
	plt.legend()
	plt.savefig(args['OUTPATH'] + args['target_label'] + 'explained_var.pdf')
	plt.close()

	return roc_df
Example #37
evaluations = []
STEPS = 400
for i in range(100):
    regressor.train(input_fn=wx_input_fn(X_train, y=y_train), steps=STEPS)
    evaluations.append(regressor.evaluate(input_fn=wx_input_fn(X_val,
                                                               y_val,
                                                               num_epochs=1,
                                                               shuffle=False)))
# (100 x 400) / 2 = 20,000 epochs in total (epochs = steps / 2)
evaluations[0]
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [14, 10]
loss_values = [ev['loss'] for ev in evaluations]
training_steps = [ev['global_step'] for ev in evaluations]
plt.scatter(x=training_steps, y=loss_values)
plt.xlabel('Training steps (Epochs = steps / 2)')
plt.ylabel('Loss (SSE)')
plt.show()
pred = regressor.predict(input_fn=wx_input_fn(X_test,
                                              num_epochs=1,
                                              shuffle=False))
predictions = np.array([p['predictions'][0] for p in pred])

print("The Explained Variance: %.2f" % explained_variance_score(
                                            y_test, predictions))  
print("The Mean Absolute Error: %.2f degrees Celcius" % mean_absolute_error(
                                            y_test, predictions))  
print("The Median Absolute Error: %.2f degrees Celcius" % median_absolute_error(
                                            y_test, predictions))
Example #38
pre = xgb_train.predict(x_test)
print('Score : ', explained_variance_score(y_test, pre))
print('MAE : ', mean_absolute_error(y_test, pre))

plt.plot(pre, 'r', y_test, 'b')
plt.show()
'''

#--------------------------------------------------
#RandomForest
rf = RandomForestRegressor(n_estimators=1000, random_state=42, max_depth=5)
rf.fit(x_train, y_train)

pre = rf.predict(x_test)
error = abs(pre - y_test)
print('Score : ', explained_variance_score(y_test, pre))
print("MAE : ", round(np.mean(error), 2))

plt.plot(pre, 'r', y_test, 'b')
plt.show()

#------------------------------------------
# Feature importance
feature_list = list('風' '大' '濕' '環' '模' '照')
importance = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2))
                       for feature, importance in zip(feature_list, importance)
                       ]
feature_importances = sorted(feature_importances,
                             key=lambda x: x[1],
                             reverse=True)
print("r2_CV:", r2.mean())
print("MSE_CV:", mean_squared_error.mean())
"""
 Test/Evaluation
"""
time3 = time.clock()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=3)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
time4 = time.clock()
print("testing time:", time4 - time3)

print("EVS_test:", metrics.explained_variance_score(y_test, y_pred))
print("R2_test", metrics.r2_score(y_test, y_pred))
print("MSE_test:", metrics.mean_squared_error(y_test, y_pred))
print("The weights are:", ridge.coef_)
"""
Visualization
"""
fig, ax = plt.subplots()
ax.scatter(y, predicted, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [predicted.min(), predicted.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.savefig("cv_ridge.png")

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
plt.ylabel('Predicted values')

# evaluate the model
from sklearn import metrics


print("MAE = ",metrics.mean_absolute_error(y_test,predictions)) 

print("MSE = ",metrics.mean_squared_error(y_test,predictions)) 

print("RMSE = ",np.sqrt(metrics.mean_squared_error(y_test,predictions)))


# variance score

print(metrics.explained_variance_score(y_test, predictions))

# Residuals

sns.distplot((y_test-predictions), bins=75)

# mobile or website or length of membership

cdf = pd.DataFrame(lm.coef_, X.columns, columns=['Coeff'])

# Coeff is like a weight 1:26 , 1:38, 1:0.19, 1:61.28

# Website is deficient or App is more effective


Example #41
    count = []
    plt.ion()
    for i in range(len(test_data) - 9):
        x = test_data[i:i + 9]
        y = test_data[i + 9:i + 10]
        x = x.reshape(-1, 1, 9)
        y = y.reshape(-1, 1)
        out = net(x)
        loss = loss_func(out, y)
        print(loss.item())
        label.append(y.numpy().reshape(-1))
        output.append(out.data.numpy().reshape(-1))
        count.append(i)
        plt.clf()
        label_icon, = plt.plot(count, label, linewidth=1, color="blue")
        output_icon, = plt.plot(count, output, linewidth=1, color="red")
        plt.legend([label_icon, output_icon], ["label", "output"],
                   loc="upper right",
                   fontsize=10)

        plt.pause(0.01)
    plt.savefig("./img.pdf")
    plt.ioff()
    plt.show()
    # print(np.shape(label))
    # print(np.shape(output))
    r2 = r2_score(label, output)
    variance = explained_variance_score(label, output)
    print(r2)
    print(variance)
Example #42
def compute_metrics(y_true_cts,
                    y_pred_cts,
                    y_true_bin,
                    y_pred_bin,
                    y_pred_score=None):
    #Linear Regression metrics

    regression_dict = {}
    if y_pred_cts is not None:

        y_true = y_true_cts
        y_pred = y_pred_cts

        regression_dict[
            'explained_variance_score'] = metrics.explained_variance_score(
                y_true, y_pred)
        #regression_dict['max_error'] = metrics.max_error(y_true, y_pred)
        regression_dict['mean_absolute_error'] = metrics.mean_absolute_error(
            y_true, y_pred)
        regression_dict['mean_squared_error'] = metrics.mean_squared_error(
            y_true, y_pred)
        #regression_dict['mean_squared_log_error'] = metrics.mean_squared_log_error(y_true, y_pred)
        regression_dict[
            'median_absolute_error'] = metrics.median_absolute_error(
                y_true, y_pred)
        regression_dict['r2'] = metrics.r2_score(y_true, y_pred)

    #create DataFrame
    regression_metrics = pd.DataFrame.from_dict(regression_dict,
                                                orient='index')

    # =============================================================================
    #Classification metrics
    classification_dict = {}

    if y_pred_bin is not None:

        y_true = y_true_bin
        y_pred = y_pred_bin

        classification_dict['accuracy_score'] = metrics.accuracy_score(
            y_true, y_pred)
        #classification_dict['avg_ps'] = metrics.average_precision_score(y_true, y_score)
        classification_dict['confusion_matrix'] = metrics.confusion_matrix(
            y_true, y_pred)
        classification_dict['f1_score'] = metrics.f1_score(y_true, y_pred)
        classification_dict['precision_score'] = metrics.precision_score(
            y_true, y_pred)
        classification_dict['recall_score'] = metrics.recall_score(
            y_true, y_pred)
        if y_pred_score is None:
            y_pred_score = y_pred
        classification_dict['roc_auc_score'] = metrics.roc_auc_score(
            y_true, y_pred_score)
        #classification_dict['roc_curve'] = metrics.roc_curve(y_true, y_score)
        classification_dict[
            'gini'] = 2 * classification_dict['roc_auc_score'] - 1
        classification_dict['sensibility'] = classification_dict[
            'confusion_matrix'][1, 1] / sum(
                classification_dict['confusion_matrix'][1, :])
        classification_dict['specificity'] = classification_dict[
            'confusion_matrix'][0, 0] / sum(
                classification_dict['confusion_matrix'][0, :])

    #create DataFrame
    classification_metrics = pd.DataFrame.from_dict(classification_dict,
                                                    orient='index')
    # =============================================================================
    print(classification_metrics)
    print(regression_metrics)
    return regression_metrics, classification_metrics
    y1_test = test_inputs['target_load']
    y2_test = test_inputs['target_imf9']
    y3_test = test_inputs['target_imf10']
    y4_test = test_inputs['target_imf8']
    y5_test = test_inputs['target_imf7']

    y1_preds, y2_preds, y3_preds, y4_preds, y5_preds = model.predict(
        [X_test, aux_test])
    # y1_preds, y2_preds, y3_preds, y4_preds = model.predict([X_test, aux_test])

    y1_test = y_scaler.inverse_transform(y1_test)
    y1_preds = y_scaler.inverse_transform(y1_preds)

    y1_test, y1_preds = flatten_test_predict(y1_test, y1_preds)

    rmse_predict = RMSE(y1_test, y1_preds)
    evs = explained_variance_score(y1_test, y1_preds)
    mae = mean_absolute_error(y1_test, y1_preds)
    mse = mean_squared_error(y1_test, y1_preds)
    msle = mean_squared_log_error(y1_test, y1_preds)
    meae = median_absolute_error(y1_test, y1_preds)
    r_square = r2_score(y1_test, y1_preds)

    mape_v = mape(y1_preds.reshape(-1, 1), y1_test.reshape(-1, 1))

    print('rmse_predict:', rmse_predict, "evs:", evs, "mae:", mae, "mse:", mse,
          "msle:", msle, "meae:", meae, "r2:", r_square, "mape", mape_v)

    store_predict_points(
        y1_test, y1_preds, output_dir + '/test_mtl_prediction_epochs_' +
        str(EPOCHS) + '_lag_' + str(time_step_lag) + '.csv')
    def foward_chain_cv(self, scoring_metric, greater_is_better=False):
        i = 1

        MAE = []
        Exp_var = []
        MSE = []
        r_squared = []
        params_used = {}


        y_pred_cont = []
        y_test_cont = []
        y_pred_cont_index = []
        split_dates = []

        fig = plt.figure(num='{}'.format(self.regressor))


        tscv = TimeSeriesSplit(n_splits=self.no_splits)
        for train_index, test_index in tqdm(tscv.split(X)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            X_test_index = X_test.index.values.tolist()


            if self.scalar is not None:
                # Scale Data
                scaler_X = self.scalar()
                scaler_y = self.scalar()
                scaler_X.fit(X_train)
                scaler_y.fit(y_train)
                X_train, X_test = scaler_X.transform(X_train), scaler_X.transform(X_test)
                y_train, y_test = scaler_y.transform(y_train), scaler_y.transform(y_test)
            else:
                X_train, X_test = np.asarray(X_train), np.asarray(X_test)
                y_train, y_test = np.asarray(y_train), np.asarray(y_test)

            # Find Best Params
            best_score, best_params = self.find_optimal_paramters(
                X_train, y_train, self.regressor, self.parameters, scoring_metric, greater_is_better)

            self.regressor.set_params(**best_params)
            self.regressor.fit(X_train, y_train.ravel())


            # predict y values
            y_pred = self.regressor.predict(X_test)

            if self.scalar is not None:
                # transform y values back to real scale for assessment
                y_pred = scaler_y.inverse_transform(y_pred)
                y_test = scaler_y.inverse_transform(y_test)

            # compute error metrics
            params_used[i] = best_params
            MAE.append(metrics.mean_absolute_error(y_test, y_pred))
            Exp_var.append(metrics.explained_variance_score(y_test, y_pred))
            MSE.append(metrics.mean_squared_error(y_test, y_pred))
            r_squared.append(metrics.r2_score(y_test, y_pred))

            # plot y_pred vs y_test
            y_df = pd.DataFrame(index=pd.to_datetime(X_test_index))
            y_pred = y_pred.reshape(len(y_pred), )
            y_test = y_test.reshape(len(y_test), )
            y_df['y_pred'] = y_pred
            y_df['y_test'] = y_test

            # plot the subplots
            ax = fig.add_subplot(int(sqrt(self.no_splits)), int(sqrt(self.no_splits)+1), i)
            ax.xaxis.set_major_formatter(DateFormatter('%m-%y'))
            y_df.plot(title = 'Split{}'.format(i), ax=ax, legend=False)
            ax.tick_params(axis='x', rotation=45, labelsize=8)
            if i == 1:
                fig.legend(loc=4)

            # convert arrays to list and append continuous y_pred vs y_test
            y_pred_cont_index = y_pred_cont_index + X_test_index
            split_dates.append(y_pred_cont_index[-1])
            y_pred_list = y_pred.tolist()
            y_test_list = y_test.tolist()
            y_pred_cont = y_pred_cont + y_pred_list
            y_test_cont = y_test_cont + y_test_list

            i += 1

        # Plot the continuous chart
        y_continuous_df = pd.DataFrame(index=pd.to_datetime(y_pred_cont_index))
        y_pred_cont = np.asarray(y_pred_cont)
        y_test_cont = np.asarray(y_test_cont)
        y_continuous_df['Model'] = y_pred_cont
        y_continuous_df['Actual'] = y_test_cont
        y_continuous_df.plot(title='Running Performance')
        plt.suptitle(str(self.regressor).split('(')[0])

        # add vertical lines to the running total output
        del split_dates[-1]
        for date in split_dates:
            date = datetime.strptime(date, '%m/%d/%Y %H:%M')
            plt.axvline(x=date, linestyle=':', color='red', linewidth=1, alpha=.8)

        # Calculate average metrics
        no_splits = tscv.get_n_splits()
        avg_mae = sum(MAE) / no_splits
        avg_exp_var = sum(Exp_var) / no_splits
        avg_mse = sum(MSE) / no_splits
        avg_rsquared = sum(r_squared) / no_splits

        print('\nMAE:{} \nMSE:{} \nExp Var Explained: {}\nr^2: {}\nParams:{}'.format(MAE, MSE, Exp_var, r_squared,
                                                                                     params_used))
        print('\nAvg MAE:', avg_mae,
              '\nAverage Explained Variance:', avg_exp_var,
              '\nAvg MSE:', avg_mse,
              '\nAvg r^2:', avg_rsquared)
        print('end')
        fig.tight_layout()
        plt.show()
Example #45
# Plot outputs
import matplotlib.pyplot as plt

plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Measure performance
import sklearn.metrics as sm

print "Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2) 
print "Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2) 
print "Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2) 
print "Explain variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2) 
print "R2 score =", round(sm.r2_score(y_test, y_test_pred), 2)

# Model persistence
import cPickle as pickle

output_model_file = '3_model_linear_regr.pkl'

with open(output_model_file, 'w') as f:
    pickle.dump(linear_regressor, f)

with open(output_model_file, 'r') as f:
    model_linregr = pickle.load(f)

y_test_pred_new = model_linregr.predict(X_test)
print "\nNew mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_new), 2) 
Example #46
def evaluateWithMetrics(true, lstm_predict, reg_predict, simple_avg_predict):
    sum_lstm_evs = 0
    sum_reg_evs = 0
    sum_simple_avg_evs = 0
    sum_lstm_mse = 0
    sum_reg_mse = 0
    sum_simple_avg_mse = 0
    sum_lstm_mae = 0
    sum_reg_mae = 0
    sum_simple_avg_mae = 0
    sum_lstm_r2_score = 0
    sum_reg_r2_score = 0
    sum_simple_avg_r2_score = 0
    
    lstm_evs = []
    reg_evs = []
    avg_evs = []
    
    lstm_mse = []
    reg_mse = []
    avg_mse = []
    
    lstm_mae = []
    reg_mae = []
    avg_mae = []
    
    lstm_r2_score = []
    reg_r2_score = []
    avg_r2_score = []
       
    for i in range(true.shape[0]):
        r = explained_variance_score(true[i], lstm_predict[i])
        lstm_evs.append(r)
        sum_lstm_evs = sum_lstm_evs + r
        
        r = explained_variance_score(true[i], reg_predict[i])
        reg_evs.append(r)
        sum_reg_evs = sum_reg_evs + r
        
        r = explained_variance_score(true[i], simple_avg_predict[i])
        avg_evs.append(r)
        sum_simple_avg_evs = sum_simple_avg_evs + r
        
        r = mean_squared_error(true[i], lstm_predict[i])
        lstm_mse.append(r)
        sum_lstm_mse = sum_lstm_mse + r
        
        r = mean_squared_error(true[i], reg_predict[i])
        reg_mse.append(r)
        sum_reg_mse = sum_reg_mse + r
        
        r = mean_squared_error(true[i], simple_avg_predict[i])
        avg_mse.append(r)       
        sum_simple_avg_mse = sum_simple_avg_mse + r
        
        r = mean_absolute_error(true[i], lstm_predict[i])
        lstm_mae.append(r)
        sum_lstm_mae = sum_lstm_mae + r
        
        r = mean_absolute_error(true[i], reg_predict[i])
        reg_mae.append(r)
        sum_reg_mae = sum_reg_mae + r
        
        r = mean_absolute_error(true[i], simple_avg_predict[i])
        avg_mae.append(r)
        sum_simple_avg_mae = sum_simple_avg_mae + r
        
        r = r2_score(true[i], lstm_predict[i])
        lstm_r2_score.append(r)
        sum_lstm_r2_score = sum_lstm_r2_score + r
        
        r = r2_score(true[i], reg_predict[i])
        reg_r2_score.append(r)
        sum_reg_r2_score = sum_reg_r2_score + r
        
        r = r2_score(true[i], simple_avg_predict[i])
        avg_r2_score.append(r)
        sum_simple_avg_r2_score = sum_simple_avg_r2_score + r
        print(f'mae:lstm:{mean_absolute_error(true[i], lstm_predict[i])}, avg:{mean_absolute_error(true[i], simple_avg_predict[i])}')
     
    plotMetrics2(lstm_evs, reg_evs, avg_evs, lstm_mse, reg_mse, avg_mse, lstm_mae, reg_mae, avg_mae, lstm_r2_score, reg_r2_score, avg_r2_score)
    
    avg_lstm_evs = sum_lstm_evs / true.shape[0]
    avg_reg_evs = sum_reg_evs / true.shape[0]
    avg_simple_avg_evs = sum_simple_avg_evs / true.shape[0]
    avg_lstm_mse = sum_lstm_mse / true.shape[0]
    avg_reg_mse = sum_reg_mse / true.shape[0]
    avg_simple_avg_mse = sum_simple_avg_mse / true.shape[0]
    avg_lstm_mae = sum_lstm_mae / true.shape[0]
    avg_reg_mae = sum_reg_mae / true.shape[0]
    avg_simple_avg_mae = sum_simple_avg_mae / true.shape[0]
    avg_lstm_r2_score = sum_lstm_r2_score / true.shape[0]
    avg_reg_r2_score = sum_reg_r2_score / true.shape[0]
    avg_simple_avg_r2_score = sum_simple_avg_r2_score / true.shape[0]
    
    print(f'explained variance score: lstm:{avg_lstm_evs}, regression:{avg_reg_evs}, simple avg:{avg_simple_avg_evs}')
    print(f'mean absolute error: lstm:{avg_lstm_mae}, regression:{avg_reg_mae}, simple avg: {avg_simple_avg_mae}')
    print(f'mean squared error: lstm:{avg_lstm_mse}, regression:{avg_reg_mse}, simple avg: {avg_simple_avg_mse}')
    print(f'r2 score: lstm:{avg_lstm_r2_score}, regression:{avg_reg_r2_score}, simple avg: {avg_simple_avg_r2_score}')
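# The per-sample loop above can be collapsed into single scikit-learn calls:
# transposing the arrays turns every sample's output vector into one "output"
# column, and multioutput='uniform_average' then averages the per-column
# scores. A minimal sketch (not part of the original code), assuming `true`
# and the three prediction arrays are 2-D NumPy arrays of shape
# (n_samples, n_outputs):
import numpy as np
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, r2_score)

def averaged_metrics(y_true, y_pred):
    # Each original sample becomes a column, so the uniform average over
    # columns equals the loop's mean over samples.
    t, p = np.asarray(y_true).T, np.asarray(y_pred).T
    return {
        'evs': explained_variance_score(t, p, multioutput='uniform_average'),
        'mae': mean_absolute_error(t, p, multioutput='uniform_average'),
        'mse': mean_squared_error(t, p, multioutput='uniform_average'),
        'r2': r2_score(t, p, multioutput='uniform_average'),
    }

# e.g. averaged_metrics(true, lstm_predict)['mae'] matches avg_lstm_mae above.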
# Split the data and labels into training and test sets
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(
    data, target)

#Build a linear regression model
from sklearn.linear_model import LinearRegression
clf = LinearRegression().fit(data_train, target_train)
quality_pre = clf.predict(data_test)

#Evaluation
from sklearn.metrics import mean_squared_error, median_absolute_error, explained_variance_score
print("Mean squared error of the linear regression model:", mean_squared_error(target_test, quality_pre))
print("Median absolute error of the linear regression model:", median_absolute_error(target_test, quality_pre))
print("Explained variance score of the linear regression model:", explained_variance_score(target_test, quality_pre))

#Plot the comparison between actual and predicted values
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(15, 6))
plt.plot(range(target_test.shape[0]),
         target_test,
         linewidth=1.5,
         linestyle='-')
plt.plot(range(target_test.shape[0]),
         quality_pre,
         linewidth=1.5,
         linestyle='-.')
plt.legend(["真实值", "预测值"])
Example #48
0
    ilr = parallel_ilr_inference(nb_jobs=args.nb_seeds,
                                 train_input=train_input,
                                 train_target=train_target,
                                 arguments=args)[0]

    # predict on training
    mu, var, std, nlpd = \
        ilr.meanfield_prediction(input, target, prediction=args.prediction)

    # metrics
    from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

    mse = mean_squared_error(target, mu)
    evar = explained_variance_score(target,
                                    mu,
                                    multioutput='variance_weighted')
    smse = 1. - r2_score(target, mu, multioutput='variance_weighted')

    print('TRAIN - EVAR:', evar, 'MSE:', mse, 'SMSE:', smse, 'NLPD:',
          nlpd.mean(), 'Components:', len(ilr.used_labels))

    fig, axes = plt.subplots(2, 1)

    # plot prediction
    sorter = np.argsort(input, axis=0).flatten()
    sorted_input, sorted_target = input[sorter, 0], target[sorter, 0]
    sorted_mu, sorted_std = mu[sorter, 0], std[sorter, 0]

    axes[0].plot(true_input, true_target, '--k')
    axes[0].scatter(train_input, train_target, marker='+', s=1.25, color='k')
Example #49
0
File: model.py  Project: raaraa/AlphaPy
def generate_metrics(model, partition):
    r"""Generate model evaluation metrics for all estimators.

    Parameters
    ----------
    model : alphapy.Model
        The model object with stored predictions.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    model : alphapy.Model
        The model object with the completed metrics.

    Notes
    -----
    AlphaPy takes a brute-force approach to calculating each metric.
    It calls every scikit-learn function without exception. If the
    calculation fails for any reason, then the evaluation will still
    continue without error.

    References
    ----------
    For more information about model evaluation and the associated metrics,
    refer to [EVAL]_.

    .. [EVAL] http://scikit-learn.org/stable/modules/model_evaluation.html

    """

    logger.info('='*80)
    logger.info("Metrics for: %s", partition)

    # Extract model parameters.

    model_type = model.specs['model_type']

    # Extract model data.

    if partition == Partition.train:
        expected = model.y_train
    else:
        expected = model.y_test

    # Generate Metrics

    if expected.any():
        # Add blended model to the list of algorithms.
        if len(model.algolist) > 1:
            algolist = copy(model.algolist)
            algolist.append('BLEND')
        else:
            algolist = model.algolist

        # get the metrics for each algorithm
        for algo in algolist:
            # get predictions for the given algorithm
            predicted = model.preds[(algo, partition)]
            # classification metrics
            if model_type == ModelType.classification:
                probas = model.probas[(algo, partition)]
                try:
                    model.metrics[(algo, partition, 'accuracy')] = accuracy_score(expected, predicted)
                except:
                    logger.info("Accuracy Score not calculated")
                try:
                    model.metrics[(algo, partition, 'average_precision')] = average_precision_score(expected, probas)
                except:
                    logger.info("Average Precision Score not calculated")
                try:
                    model.metrics[(algo, partition, 'balanced_accuracy')] = balanced_accuracy_score(expected, predicted)
                except:
                    logger.info("Accuracy Score not calculated")
                try:
                    model.metrics[(algo, partition, 'brier_score_loss')] = brier_score_loss(expected, probas)
                except:
                    logger.info("Brier Score not calculated")
                try:
                    model.metrics[(algo, partition, 'cohen_kappa')] = cohen_kappa_score(expected, predicted)
                except:
                    logger.info("Cohen's Kappa Score not calculated")
                try:
                    model.metrics[(algo, partition, 'confusion_matrix')] = confusion_matrix(expected, predicted)
                except:
                    logger.info("Confusion Matrix not calculated")
                try:
                    model.metrics[(algo, partition, 'f1')] = f1_score(expected, predicted)
                except:
                    logger.info("F1 Score not calculated")
                try:
                    model.metrics[(algo, partition, 'neg_log_loss')] = log_loss(expected, probas)
                except:
                    logger.info("Log Loss not calculated")
                try:
                    model.metrics[(algo, partition, 'precision')] = precision_score(expected, predicted)
                except:
                    logger.info("Precision Score not calculated")
                try:
                    model.metrics[(algo, partition, 'recall')] = recall_score(expected, predicted)
                except:
                    logger.info("Recall Score not calculated")
                try:
                    fpr, tpr, _ = roc_curve(expected, probas)
                    model.metrics[(algo, partition, 'roc_auc')] = auc(fpr, tpr)
                except:
                    logger.info("ROC AUC Score not calculated")
            # regression metrics
            elif model_type == ModelType.regression:
                try:
                    model.metrics[(algo, partition, 'explained_variance')] = explained_variance_score(expected, predicted)
                except:
                    logger.info("Explained Variance Score not calculated")
                try:
                    model.metrics[(algo, partition, 'neg_mean_absolute_error')] = mean_absolute_error(expected, predicted)
                except:
                    logger.info("Mean Absolute Error not calculated")
                try:
                    model.metrics[(algo, partition, 'neg_median_absolute_error')] = median_absolute_error(expected, predicted)
                except:
                    logger.info("Median Absolute Error not calculated")
                try:
                    model.metrics[(algo, partition, 'neg_mean_squared_error')] = mean_squared_error(expected, predicted)
                except:
                    logger.info("Mean Squared Error not calculated")
                try:
                    model.metrics[(algo, partition, 'neg_mean_squared_log_error')] = mean_squared_log_error(expected, predicted)
                except:
                    logger.info("Mean Squared Log Error not calculated")
                try:
                    model.metrics[(algo, partition, 'r2')] = r2_score(expected, predicted)
                except:
                    logger.info("R-Squared Score not calculated")
        # log the metrics for each algorithm
        for algo in model.algolist:
            logger.info('-'*80)
            logger.info("Algorithm: %s", algo)
            metrics = [(k[2], v) for k, v in list(model.metrics.items()) if k[0] == algo and k[1] == partition]
            for key, value in sorted(metrics):
                svalue = str(value)
                svalue = svalue.replace('\n', ' ')
                logger.info("%s: %s", key, svalue)
    else:
        logger.info("No labels for generating %s metrics", partition)

    return model
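# The docstring above describes a brute-force pattern: every metric call is
# wrapped in try/except so a single failing metric never aborts the run. A
# minimal sketch of that pattern as a reusable helper (a hypothetical
# `safe_metric`, not part of AlphaPy), assuming only the scikit-learn metric
# functions already imported:

def safe_metric(metric_fn, expected, predicted, name, logger=None):
    """Return metric_fn(expected, predicted), or None if the call raises."""
    try:
        return metric_fn(expected, predicted)
    except Exception:
        if logger is not None:
            logger.info("%s not calculated", name)
        return None

# e.g. model.metrics[(algo, partition, 'r2')] = safe_metric(
#          r2_score, expected, predicted, 'R-Squared Score', logger)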
model.fit(x=X_train,
          y=y_train,
          validation_data=(X_test, y_test),
          batch_size=128,
          epochs=400)

#140
losses = pd.DataFrame(model.history.history)
# This DataFrame has two columns: 'loss' (training loss) and 'val_loss'
# (loss on the validation data, i.e. the test set passed to fit). Comparing
# the two curves shows whether the model is overfitting the training data,
# and the simplest way to compare them is to plot both.
losses.plot()
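# A slightly more explicit version of the same overfitting check (a sketch,
# assuming the `losses` DataFrame above has 'loss' and 'val_loss' columns):
# plot both curves and mark the epoch where the validation loss bottoms out;
# training-loss improvements beyond that point are likely overfitting.
best_epoch = losses['val_loss'].idxmin()
ax = losses[['loss', 'val_loss']].plot()
ax.axvline(best_epoch, linestyle=':', color='red')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')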

#%% we can do some evaluation on our test data 140
"""Evaluation on Test Data"""
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

predictions = model.predict(X_test)

mean_absolute_error(y_test, predictions)
house['price'].mean()

explained_variance_score(y_test, predictions)

# Our predictions
plt.scatter(y_test, predictions)
# Perfect predictions
plt.plot(y_test, y_test, 'r')
Example #51
0
File: coal.py  Project: d-chambers/msha
def plot_predicted_injury_rates(prod_df, accident_df, mines_df):
    """Use simple features to predict accident rates and plot. """
    def get_dates_with_no_nan(df_list):
        """return a sorted list a index which contain no nans in any columns"""
        common_index = reduce(iand, [set(x.index) for x in df_list])

        for df in df_list:
            has_nulls = df[df.isnull().any(axis=1)].index
            for element in has_nulls:
                if element in common_index:
                    common_index.remove(element)
            # out.add(list(df[df.isnull().any(axis=1)].index))
        return sorted(common_index)

    def create_features_df(injuries, prod, norm_df):
        """Create a dataframe of features to predict accident rates."""
        grouper = pd.Grouper(key="date", freq="q")
        # get features from production df
        prod_cols = ["hours_worked", "employee_count", "coal_production"]
        prod_features = prod.groupby(grouper)[prod_cols].sum()
        exp_df = aggregate_descriptive_stats(injuries, "total_experience")
        size_df = aggregate_descriptive_stats(prod, "employee_count")
        # prod_per_hour = prod_features['coal_production'] / prod_features['hours_worked']
        # prod_features['coal_per_hour'] = prod_per_hour
        df_list = [exp_df, size_df, prod_features, norm_df]
        index = get_dates_with_no_nan(df_list)
        hours = prod_features.loc[index]
        exp = exp_df.loc[index]
        size = size_df.loc[index]
        # drop number of accidents info from exp df
        exp = exp.drop(columns="count")
        out = pd.concat([exp, size, hours],
                        keys=["exp", "size", "prod"],
                        axis=1)
        return out

    plt.clf()
    # get features and such
    prod, mines = get_ug_coal_prod_and_mines(prod_df, mines_df)
    injuries = accident_df[is_ug_gc_accidents(accident_df, only_injuries=True)]
    normed = normalize_injuries(injuries, prod, mines)
    # get experience, mine sizes (by employee count) and hours worked
    # combine into a feature dataframe
    feature_df = create_features_df(injuries, prod, normed)
    norm = normed.loc[feature_df.index]
    # get GC injury rate (injuries per 10^6 hours)
    target = norm["hours_worked"] * 1_000_000
    # select the most important features
    select_feats = select_k_best_regression(
        feature_df,
        target,
        k=5,
        normalize=True,
    )
    X = select_feats.values
    reg = LinearRegression(normalize=True).fit(X, target.values)
    x_pred = reg.predict(X)
    rmse = mean_squared_error(target.values, x_pred, squared=False)
    explained_var = explained_variance_score(
        target.values,
        x_pred,
    )
    # now plot
    plt.figure(figsize=(5.5, 3.5))
    plt.plot(target.index, target.values, color="b", label="GC injury rate")
    plt.plot(select_feats.index,
             x_pred,
             color="r",
             label="predicted injury rate")
    plt.legend()
    plt.xlabel("Year")
    plt.ylabel("GC Injures per $10^6$ Hours")
    return plt
Example #52
0
y_test.to_csv('y_test.csv')

# In[ ]:

ypred_df = pd.DataFrame(ypred)

# In[352]:

ypred_df.to_csv('ypred.csv')

# In[92]:

from sklearn.metrics import explained_variance_score

explained_variance_score(y_test, ypred)

# In[93]:

from sklearn.metrics import max_error

max_error(y_test, ypred)

# In[94]:

from sklearn.metrics import r2_score

r2 = r2_score(y_test, ypred)
r2

# In[95]:
Example #53
0
data =  pd.read_csv('./data/employee-perf.csv')
data_x = data[['Aptitude Test Score', 'Interview Score', 'Missed Training Classes']]
data_y = data['Annual Performance Rating']
model = linear_model.LinearRegression()
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state = 4)
model.fit(x_train,y_train)
preds = model.predict(x_test)
pprint.pprint(pd.DataFrame({'Actual':y_test, 'Predicted':preds}))
   Actual  Predicted
3      90  88.640209
4      85  81.412110
6      94  93.320892
print('MSE, MedAE, R^2, EVS: ' + str([mean_squared_error(y_test, preds),
                                       median_absolute_error(y_test, preds),
                                       r2_score(y_test, preds),
                                       explained_variance_score(y_test, preds)]))
MSE, MedAE, R^2, EVS: [5.0610589164729705, 1.3597910272418403, 0.62664319468642016, 0.8861576085020817]


# Read in the new, modified employee-perf file
data2 = pd.read_csv('./data/employee-perf2.csv')
# Predict the performance rating for data2 using the model trained on the first dataset
data_x = data[['Aptitude Test Score', 'Interview Score', 'Missed Training Classes']]
data_y = data['Annual Performance Rating']

predict_vars = data2[['Aptitude Test Score', 'Interview Score', 'Missed Training Classes']]
model = linear_model.LinearRegression()
model.fit(x_train, y_train)
preds = model.predict(predict_vars)

# the numbers are slightly off here from what I had in my answer document but
Example #54
0
def main():
    horses98 = HorseParserNoHandicaps('./../Data/born98.csv').horses
    horses05 = HorseParserNoHandicaps('./../Data/born05.csv').horses

    races98 = RaceParserNoHandicaps('./../Data/born98.csv').races
    races05 = RaceParserNoHandicaps('./../Data/born05.csv').races

    print ''' HorsesBorn98 Dataset '''
    horses_train_98, horses_test_98 = split_dataset(horses98)

    horses_98_X_train = []
    horses_98_y_train = []
    for h in horses_train_98:
        v,s = compute_vector(h)
        horses_98_X_train.append(v)
        horses_98_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_98_X_train)
    print len(horses_98_y_train)
    print ''

    horses_98_X_test = []
    horses_98_y_test = []
    for h in horses_test_98:
        v,s = compute_vector(h)
        horses_98_X_test.append(v)
        horses_98_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_98_X_test)
    print len(horses_98_y_test)
    print ''
    
    # Create linear regression object
    regr98 = linear_model.LinearRegression()

    # Train the model using the training sets
    regr98.fit(horses_98_X_train, horses_98_y_train)

    # Coefficients
    print 'Coefficients:'
    print regr98.coef_
    print ''

    # regr.score returns R^2 (coefficient of determination): 1 is perfect prediction
    # (see the note after this example)
    print 'Variance score:'
    print regr98.score(horses_98_X_test, horses_98_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_98_y_test, (regr98.predict(horses_98_X_test)))
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_98_y_test, (regr98.predict(horses_98_X_test)))
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_98_y_test, (regr98.predict(horses_98_X_test)))
    print ''


    print ''' HorsesBorn05 Dataset '''
    horses_train_05, horses_test_05 = split_dataset(horses05)

    horses_05_X_train = []
    horses_05_y_train = []
    for h in horses_train_05:
        v,s = compute_vector(h)
        horses_05_X_train.append(v)
        horses_05_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_05_X_train)
    print len(horses_05_y_train)
    print ''

    horses_05_X_test = []
    horses_05_y_test = []
    for h in horses_test_05:
        v,s = compute_vector(h)
        horses_05_X_test.append(v)
        horses_05_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_05_X_test)
    print len(horses_05_y_test)
    print ''
    
    # Create linear regression object
    regr05 = linear_model.LinearRegression(fit_intercept=True)

    # Train the model using the training sets
    regr05.fit(horses_05_X_train, horses_05_y_train)

    # Coefficients
    print 'Coefficients:'
    print regr05.coef_
    print ''

    # regr.score returns R^2 (coefficient of determination): 1 is perfect prediction
    print 'Variance score:'
    print regr05.score(horses_05_X_test, horses_05_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_05_y_test, (regr05.predict(horses_05_X_test)))
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_05_y_test, (regr05.predict(horses_05_X_test)))
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_05_y_test, (regr05.predict(horses_05_X_test)))
    print ''

    print 'R2 score:'
    print r2_score(horses_05_y_test, (regr05.predict(horses_05_X_test)))
    print ''

    print 'Mean absolute error based on training set:'
    print mean_absolute_error(horses_05_y_train, (regr05.predict(horses_05_X_train)))
    print ''


    # Plots
    horses_98_y_pred = regr98.predict(horses_98_X_test)
    horses_05_y_pred = regr05.predict(horses_05_X_test)

    plot_speeds(horses_98_y_pred, 'r', 'Predicted Speeds for Horses1998 Test Set')
    plot_speeds(horses_98_y_test, 'r', 'Actual Speeds for Horses1998 Test Set')

    plot_speeds(horses_05_y_pred, 'b', 'Predicted Speeds for Horses2005 Test Set')
    plot_speeds(horses_05_y_test, 'b', 'Actual Speeds for Horses2005 Test Set')
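# Note on the "Variance score" printed above: LinearRegression.score returns
# the coefficient of determination R^2 (the same value as r2_score), not the
# explained-variance score; the two coincide only when the residuals have zero
# mean. A self-contained sketch of the equivalence (synthetic data, not the
# horse datasets used in main()):
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, explained_variance_score

rng = np.random.RandomState(0)
X_demo = rng.rand(50, 3)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5]) + rng.rand(50)
reg_demo = LinearRegression().fit(X_demo, y_demo)
pred_demo = reg_demo.predict(X_demo)
print(reg_demo.score(X_demo, y_demo))             # R^2 via .score
print(r2_score(y_demo, pred_demo))                # identical value
print(explained_variance_score(y_demo, pred_demo))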
Example #55
0
def regression_test(test_data):
    data = pd.read_csv(
        "./Data-assignment-1/Traffic_flow/traffic_flow_data.csv")
    cols = []

    # for i in range((data.shape[1]) // 45):
    #     for j in range(5):
    #         cols.append(45 * i + j + 20)
    # y = data['Segment23_(t+1)']
    # X = data[data.columns[cols]]

    # # get every segment 23
    # y = data['Segment23_(t+1)']
    # data = data.iloc[:, 22::45]
    # X = data

    # print(data.shape[1])
    # new = data.copy()
    # for i in range(45, data.shape[1] - 1):
    #     new[new.columns[i]] = data[data.columns[i]] - data[data.columns[i - 45]]

    # data = new[data.columns[44:]]

    X, y = data.drop('Segment23_(t+1)', axis=1), data['Segment23_(t+1)']
    X_test, y_test = test_data.drop('Segment23_(t+1)',
                                    axis=1), test_data['Segment23_(t+1)']
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.2,
                                              random_state=42)

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    tuned_parameters = [{'alpha': [i**2 / 100 for i in range(1, 100, 2)]}]

    model = Ridge()
    scoring = {'r2': 'r2'}  #''mean_squared_error': 'neg_mean_squared_error'
    grid = GridSearchCV(model, tuned_parameters, scoring=scoring, refit='r2')

    grid.fit(X_train, y_train)
    print(grid.param_grid)

    results = grid.cv_results_

    graph('Traffic Flow', ['alpha', 'Score'], results, scoring, 'alpha')
    best = grid.best_estimator_

    # best = model.fit(X_train, y_train)
    predictions = best.predict(X_test)

    print(
        '*******************************************************************')
    print("Ridge Regression Traffic Flow")
    print("Mean squared error: {}".format(
        mean_squared_error(y_test, predictions)))
    print("Explained variance: {}".format(
        explained_variance_score(y_test, predictions)))

    tuned_parameters = {}

    model = LinearRegression()
    grid = GridSearchCV(model, tuned_parameters)

    grid.fit(X_train, y_train)
    best = grid.best_estimator_

    # best = model.fit(X_train, y_train)
    predictions = best.predict(X_test)

    print(
        '*******************************************************************')
    print("Linear Regression Traffic Flow")
    print("Mean squared error: {}".format(
        mean_squared_error(y_test, predictions)))
    print("Explained variance: {}".format(
        explained_variance_score(y_test, predictions)))
Example #56
0
    "SVMReg.",
    "ForestReg.",
]
classifiers = [
    KNeighborsRegressor(n_neighbors=1, algorithm="auto"),
    DecisionTreeRegressor(max_depth=5, splitter='best'),
    MLPRegressor(alpha=1, max_iter=1000),
    SVR(C=1.0, epsilon=0.2),
    RandomForestRegressor(n_estimators=100, random_state=0),
]

#Compare the following characteristics:
results = pd.DataFrame(
    index=['Absolute Error', 'Variance Score', 'Train Cost', 'Test Cost'],
    columns=names)

for name, clf in zip(names, classifiers):
    t1 = time.time()
    clf.fit(X_train, y_train)
    t2 = time.time()
    y_pred = clf.predict(X_test)
    t3 = time.time()
    results.at['Train Cost', name] = round(t2 - t1, 3)
    results.at['Test Cost', name] = round(t3 - t2, 3)
    results.at['Absolute Error', name] = mean_absolute_error(y_test, y_pred)
    results.at['Variance Score',
               name] = explained_variance_score(y_test, y_pred)

print('Results of Regression Classifiers')
print(results)
def log_rf(experimentID, run_name, params, X_train, X_test, y_train, y_test):
    import os
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import explained_variance_score, max_error
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.metrics import mean_squared_log_error, median_absolute_error
    from sklearn.metrics import r2_score, mean_poisson_deviance
    from sklearn.metrics import mean_gamma_deviance
    import tempfile

    with mlflow.start_run(experiment_id=experimentID,
                          run_name=run_name) as run:
        # Create model, train it, and create predictions
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        predictions = rf.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(rf, "random-forest-model")

        # Log params
        [mlflow.log_param(param, value) for param, value in params.items()]

        # Create metrics
        exp_var = explained_variance_score(y_test, predictions)
        max_err = max_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        rmse = mean_squared_error(y_test, predictions, squared=False)
        mslogerror = mean_squared_log_error(y_test, predictions)
        medianae = median_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        mean_poisson = mean_poisson_deviance(y_test, predictions)
        mean_gamma = mean_gamma_deviance(y_test, predictions)

        # Print metrics
        print("  explained variance: {}".format(exp_var))
        print("  max error: {}".format(max_err))
        print("  mae: {}".format(mae))
        print("  mse: {}".format(mse))
        print("  rmse: {}".format(rmse))
        print("  mean square log error: {}".format(mslogerror))
        print("  median abosulte error: {}".format(medianae))
        print("  R2: {}".format(r2))
        print("  mean poisson deviance: {}".format(mean_poisson))
        print("  mean gamma deviance: {}".format(mean_gamma))

        # Log metrics
        mlflow.log_metric("explained variance", exp_var)
        mlflow.log_metric("max error", max_err)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mean square log error", mslogerror)
        mlflow.log_metric("median abosulte error", medianae)
        mlflow.log_metric("R2", r2)
        mlflow.log_metric("mean poisson deviance", mean_poisson)
        mlflow.log_metric("mean gamma deviance", mean_gamma)

        # Create feature importance
        importance = pd.DataFrame(list(
            zip(df_pits_races_4_model_encoded.columns,
                rf.feature_importances_)),
                                  columns=["Feature", "Importance"
                                           ]).sort_values("Importance",
                                                          ascending=False)

        # Log importances using a temporary file
        temp = tempfile.NamedTemporaryFile(prefix="feature-importance-",
                                           suffix=".csv")
        temp_name = temp.name
        try:
            importance.to_csv(temp_name, index=False)
            mlflow.log_artifact(temp_name, "feature-importance.csv")
        finally:
            temp.close()  # Delete the temp file

        # Create plot
        fig, ax = plt.subplots()

        sns.residplot(predictions, y_test.values.ravel(), lowess=False)
        plt.xlabel("Predicted values pit duration")
        plt.ylabel("Residual")
        plt.title("Residual Plot for pitting")

        # Log residuals using a temporary file
        temp = tempfile.NamedTemporaryFile(prefix="residuals_pit_model",
                                           suffix=".png")
        temp_name = temp.name
        try:
            fig.savefig(temp_name)
            mlflow.log_artifact(temp_name, "residuals_pit_model.png")
        finally:
            temp.close()  # Delete the temp file

        display(fig)
        return run.info.run_uuid
Example #58
0
    best_thetas.append(theta)

(lm().fit(X_train, y_train)).coef_

y_predict_50 = X_test.dot(best_thetas[0])
y_predict_2000 = X_test.dot(best_thetas[1])
y_predict_10000 = X_test.dot(best_thetas[2])

for i in range(len(best_thetas)):
    print(f'minibatch size: {minibatch_size[i]}')
    print(f'Coefficients: {best_thetas[i]}')
    print("\n")
    print("Holdout mean squared error: %.2f" %
          metrics.mean_squared_error(y_test, X_test.dot(best_thetas[i])))
    print("Holdout explained variance: %.2f" %
          metrics.explained_variance_score(y_test, X_test.dot(best_thetas[i])))
    print("Holdout r-squared: %.2f" %
          metrics.r2_score(y_test, X_test.dot(best_thetas[i])))
    print("\n")

for epoch in range(n_iterations):
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_train[shuffled_indices]
    y_shuffled = y_train[shuffled_indices]
    for i in range(0, m, minibatch_size):
        xi = X_b_shuffled[i:i + minibatch_size]
        yi = y_shuffled[i:i + minibatch_size]
        gradients = 2 / minibatch_size * np.asarray(xi).T.dot(
            xi.dot(theta) - yi)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)
Example #59
0
    dt_regressor = DecisionTreeRegressor(max_depth=4)
    dt_regressor.fit(x_train, y_train)

    # Let's boost the decision tree's performance with AdaBoost, using
    # n_estimators=400 and random_state=7

    ab_regressor = AdaBoostRegressor(dt_regressor,
                                     n_estimators=400,
                                     random_state=7)
    ab_regressor.fit(x_train, y_train)

    # Performance of decision tree regressor

    y_pred_dt = dt_regressor.predict(x_test)
    mse = sm.mean_squared_error(y_test, y_pred_dt)
    evs = sm.explained_variance_score(y_test, y_pred_dt)
    print("\n#### Decision Tree performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score =", round(evs, 2))

    # Performance of decision tree regressor with Adaboost

    y_pred_dt = ab_regressor.predict(x_test)
    mse = sm.mean_squared_error(y_test, y_pred_dt)
    evs = sm.explained_variance_score(y_test, y_pred_dt)
    print("\n#### Decision Tree performance with Adaboost ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score =", round(evs, 2))

    # Feature importance
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Measure performance
import sklearn.metrics as sm

print("Mean absolute error =",
      round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =",
      round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =",
      round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explain variance score =",
      round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Model persistence
import pickle

output_model_file = '3_model_linear_regr.pkl'

with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)

y_test_pred_new = model_linregr.predict(X_test)
print("\nNew mean absolute error =",