示例#1
0
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    """Compare a ``Booster`` using a smooth quantile loss with sklearn's quantile GBM.

    Synthetic log-normal targets are generated from a random linear model,
    split into train/test parts, and both models are fitted.  The test
    asserts that the booster has lower quantile loss and higher R^2 on the
    test split, that the empirical coverage is within 0.05 of the target
    quantile, and that ``score_`` matches ``score`` on the training data.
    """
    np.random.seed(0)
    n_samples = 15000
    n_features = 10
    quantile = .8
    X = np.random.normal(size=(n_samples, n_features))
    coefficients = np.random.normal(size=n_features)
    y = np.random.lognormal(np.dot(X, coefficients))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33333333333333)
    smooth_loss = SmoothQuantileLossFunction(1, quantile, .0001)
    exact_loss = QuantileLossFunction(1, quantile)
    base_estimator = BaggingRegressor(
        Earth(max_degree=2, verbose=False, use_fast=True, max_terms=10))
    stopper = stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01)
    booster = Booster(base_estimator, smooth_loss, n_estimators=150,
                      stopper=stopper, verbose=True)

    # Predicting before fitting must fail.
    assert_raises(NotFittedError, lambda: booster.predict(X_train))

    booster.fit(X_train, y_train)
    booster_pred = booster.predict(X_test)

    # Reference model: sklearn gradient boosting with the exact quantile loss.
    reference = GradientBoostingRegressor(loss='quantile', alpha=quantile)
    reference.fit(X_train, y_train)
    reference_pred = reference.predict(X_test)

    assert_less(exact_loss(y_test, booster_pred), exact_loss(y_test, reference_pred))
    assert_greater(r2_score(y_test, booster_pred), r2_score(y_test, reference_pred))

    # Fraction of test targets at or below the prediction.
    coverage = np.mean(y_test <= booster_pred)
    assert_less(np.abs(coverage - quantile), .05)
    assert_greater(booster.score_, 0.)
    assert_approx_equal(booster.score(X_train, y_train), booster.score_)
def question_four():
    """Compare plain and Lasso linear regression on degree-12 polynomial features.

    Training models on high degree polynomial features can result in
    overly complex models that overfit, so we often use regularized versions
    of the model to constrain model complexity, as we saw with Ridge and
    Lasso linear regression.

    Two models are trained: a non-regularized LinearRegression model
    (default parameters) and a regularized Lasso model (``alpha=0.01``,
    ``max_iter=10000``), both on polynomial features of degree 12.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple
        ``(LinearRegression_R2_test_score, Lasso_R2_test_score)``.
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    poly = PolynomialFeatures(degree=12)
    X_poly_tr = poly.fit_transform(X_train.reshape(-1, 1))
    # Reuse the transformer fitted on the training split for the test split.
    X_poly_tst = poly.transform(X_test.reshape(-1, 1))

    linreg = LinearRegression().fit(X_poly_tr, y_train)
    linear_score = r2_score(y_test, linreg.predict(X_poly_tst))

    lasso_model = Lasso(alpha=0.01, max_iter=10000).fit(X_poly_tr, y_train)
    lasso_score = r2_score(y_test, lasso_model.predict(X_poly_tst))

    return linear_score, lasso_score
def answer_two():
    """Fit polynomial regressions of degree 0 through 9 and collect R^2 scores.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_result, test_result)``, each of shape ``(10,)``; entry ``d``
        is the R^2 of the degree-``d`` model on the train / test split.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    n_degrees = 10
    # Allocate 1-D result arrays directly instead of re-flattening (10, 1)
    # arrays on every iteration.
    train_result = np.zeros(n_degrees)
    test_result = np.zeros(n_degrees)

    X_train_col = X_train.reshape(-1, 1)
    X_test_col = X_test.reshape(-1, 1)

    for degree in range(n_degrees):
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(X_train_col)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(X_test_col)

        linreg = LinearRegression().fit(X_train_poly, y_train)

        train_result[degree] = r2_score(y_train, linreg.predict(X_train_poly))
        test_result[degree] = r2_score(y_test, linreg.predict(X_test_poly))

    return (train_result, test_result)
示例#4
0
def answer_four():
    """Return test R^2 of LinearRegression and Lasso on degree-12 polynomial features.

    Reads ``x``, ``y`` and ``train_test_split`` from module scope.  ``x`` is
    reshaped into a single column, expanded to degree-12 polynomial features
    and only then split into train and test parts (``random_state=0``).

    Returns
    -------
    tuple
        ``(Linreg_R2_test_score, Lasso_R2_test_score)``.
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    # Expand the raw feature to polynomial terms before splitting.
    X_poly = PolynomialFeatures(degree=12).fit_transform(x.reshape(-1, 1))

    X_train, X_test, y_train, y_test = train_test_split(X_poly,
                                                        y,
                                                        random_state=0)

    # Non-regularized model (default parameters).
    linreg = LinearRegression().fit(X_train, y_train)

    # Regularized model (`alpha=0.01`, `max_iter=10000`).
    lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_train, y_train)

    Linreg_R2_test_score = r2_score(y_test, linreg.predict(X_test))
    Lasso_R2_test_score = r2_score(y_test, lasso.predict(X_test))

    return Linreg_R2_test_score, Lasso_R2_test_score
示例#5
0
def answer_two():
    """Fit polynomial regressions of degree 0 through 9 and collect R^2 scores.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(score_train, score_test)``, one entry per degree.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    # ``reshape(-1, 1)`` infers the row count, so the shapes need not be
    # unpacked by hand.
    X_train_col = X_train.reshape(-1, 1)
    X_test_col = X_test.reshape(-1, 1)

    score_train = []
    score_test = []
    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(X_train_col)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(X_test_col)

        linreg = LinearRegression().fit(X_train_poly, y_train)

        score_train.append(r2_score(y_train, linreg.predict(X_train_poly)))
        score_test.append(r2_score(y_test, linreg.predict(X_test_poly)))

    return (np.array(score_train), np.array(score_test))
def main():
    """Train two models on the wine log data and report R^2 on a held-out sample.

    Loads ``analysis/wine/log_data.csv``, draws 250 row labels with a fixed
    seed as the held-out set, trains on the remaining rows with ``train`` and
    ``train2``, plots predicted mean against the observed ``quality`` and
    prints the R^2 score of each model.
    """
    path1 = "analysis/wine/log_data.csv"
    path2 = "analysis/wine/pc_data.csv"

    df1 = pd.read_csv(path1)
    # NOTE(review): df2 is only referenced by the commented-out code below.
    df2 = pd.read_csv(path2)

    np.random.seed(777)
    # NOTE(review): np.random.choice samples with replacement by default, so
    # the held-out set may contain duplicate rows -- confirm this is intended.
    randint = np.random.choice(df1.index, size=250)

    held_out = df1.loc[randint]
    remaining = df1.drop(randint)

    test_x = torch.from_numpy(held_out.drop('quality', axis=1).values).float()
    test_y = torch.from_numpy(held_out['quality'].values).float()
    train_x_pre = torch.from_numpy(
        remaining.drop('quality', axis=1).values).float()
    train_y_pre = torch.from_numpy(remaining.loc[:, 'quality'].values).float()

    observed = test_y.numpy()

    model, likelihood = train(train_x_pre, train_y_pre, training_iter=1000)
    observed_pred, mean, lower, upper = test(test_x, model, likelihood)

    predicted = mean.cpu().numpy()
    sns.scatterplot(x=observed, y=predicted)
    # The score used to be computed and silently discarded; report it.
    print("R2 (train): {}".format(r2_score(observed, predicted)))

    # test_x2 = torch.from_numpy(df2.loc[randint].drop('quality', axis=1).values).float()
    # test_y2 = torch.from_numpy(df2.loc[randint, 'quality'].values).float()
    # train_x_pre2 = torch.from_numpy(df2.drop(randint).drop('quality', axis=1).values).float()
    # train_y_pre2 = torch.from_numpy(df2.drop(randint).loc[:,'quality'].values).float()
    model2, likelihood2 = train2(train_x_pre, train_y_pre, training_iter=1000)
    observed_pred2, mean2, lower2, upper2 = test(test_x, model2, likelihood2)

    predicted2 = mean2.cpu().numpy()
    sns.scatterplot(x=observed, y=predicted2)
    print("R2 (train2): {}".format(r2_score(observed, predicted2)))
示例#7
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(R2_train, R2_test)``, each of shape ``(10,)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    # 1-D buffers from the start; no per-iteration flatten needed.
    R2_train = np.zeros(10)
    R2_test = np.zeros(10)

    train_features = X_train.reshape(-1, 1)
    test_features = X_test.reshape(-1, 1)

    for deg in range(10):
        poly = PolynomialFeatures(degree=deg)
        X_train_poly = poly.fit_transform(train_features)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(test_features)

        model = LinearRegression().fit(X_train_poly, y_train)

        R2_train[deg] = r2_score(y_train, model.predict(X_train_poly))
        R2_test[deg] = r2_score(y_test, model.predict(X_test_poly))

    return (R2_train, R2_test)
示例#8
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of list
        ``(r2_train, r2_test)``, one score per degree.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    r2_train, r2_test = [], []

    train_column = X_train.reshape(-1, 1)
    test_column = X_test.reshape(-1, 1)

    for degree in range(10):
        poly = PolynomialFeatures(degree)
        X_train_poly = poly.fit_transform(train_column)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(test_column)

        clf = LinearRegression()
        clf.fit(X_train_poly, y_train)

        r2_train.append(r2_score(y_train, clf.predict(X_train_poly)))
        r2_test.append(r2_score(y_test, clf.predict(X_test_poly)))

    return (r2_train, r2_test)
示例#9
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(r2_train, r2_test)``, one score per degree.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    x_train = X_train.reshape(-1, 1)
    x_test = X_test.reshape(-1, 1)

    # Collect in lists and convert once; np.append copies the whole array
    # on every call.
    train_scores = []
    test_scores = []
    for degree in range(10):
        poly_transform = PolynomialFeatures(degree=degree).fit(x_train)
        X_train_poly = poly_transform.transform(x_train)
        X_test_poly = poly_transform.transform(x_test)

        model = LinearRegression().fit(X_train_poly, y_train)

        train_scores.append(r2_score(y_train, model.predict(X_train_poly)))
        test_scores.append(r2_score(y_test, model.predict(X_test_poly)))

    r2_train = np.array(train_scores, dtype=float)
    r2_test = np.array(test_scores, dtype=float)
    return (r2_train, r2_test)
示例#10
0
def answer_two():
    """Generate a noisy sine data set and score polynomial fits of degree 0-9.

    Seeds NumPy's global random generator with 0, builds 15 noisy samples of
    ``sin(x) + x / 6``, splits them with ``train_test_split(random_state=0)``
    and fits one ``LinearRegression`` per polynomial degree.

    Returns
    -------
    tuple of numpy.ndarray
        ``(r2_train, r2_test)``, one R^2 score per degree.
    """
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(0)
    n = 15
    x = np.linspace(0, 10, n) + np.random.randn(n) / 5
    y = np.sin(x) + x / 6 + np.random.randn(n) / 10
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

    r2_train = []
    r2_test = []
    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        # The expanded training matrix is used for both fitting and scoring;
        # there is no need to recompute it.
        X_train_poly = poly.fit_transform(X_train)
        model = LinearRegression()
        model.fit(X_train_poly, y_train)

        r2_train.append(r2_score(y_train, model.predict(X_train_poly)))
        r2_test.append(r2_score(y_test, model.predict(poly.transform(X_test))))

    return np.array(r2_train), np.array(r2_test)
示例#11
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train``, ``y_test`` and the sklearn
    names (``PolynomialFeatures``, ``LinearRegression``, ``r2_score``) from
    module scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(R2_train, R2_test)``, one score per degree.
    """
    R2_train = []
    R2_test = []

    X_train_col = X_train.reshape(-1, 1)
    X_test_col = X_test.reshape(-1, 1)

    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(X_train_col)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(X_test_col)

        linreg = LinearRegression().fit(X_train_poly, y_train)

        # ``predict`` evaluates X @ coef_.T + intercept_, which is what the
        # manual dot product did before.
        R2_train.append(r2_score(y_train, linreg.predict(X_train_poly)))
        R2_test.append(r2_score(y_test, linreg.predict(X_test_poly)))

    return (np.array(R2_train), np.array(R2_test))
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_scores, test_scores)``, each of shape ``(10,)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    X_tr = X_train.reshape(-1, 1)
    X_tst = X_test.reshape(-1, 1)

    train_scores = np.zeros([10])
    test_scores = np.zeros([10])

    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_poly_tr = poly.fit_transform(X_tr)
        # Reuse the transformer fitted on the training split.
        X_poly_tst = poly.transform(X_tst)

        linreg = LinearRegression()
        linreg.fit(X_poly_tr, y_train)

        train_scores[degree] = r2_score(y_train, linreg.predict(X_poly_tr))
        test_scores[degree] = r2_score(y_test, linreg.predict(X_poly_tst))

    return train_scores, test_scores
示例#13
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(r2_train, r2_test)``, each of shape ``(10,)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    degrees = range(0, 10)
    # Flat arrays up front, so nothing has to be flattened inside the loop.
    r2_train = np.zeros(len(degrees))
    r2_test = np.zeros(len(degrees))

    train_col = X_train.reshape(-1, 1)
    test_col = X_test.reshape(-1, 1)

    for d in degrees:
        poly = PolynomialFeatures(degree=d)
        X_train_poly = poly.fit_transform(train_col)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(test_col)

        linreg = LinearRegression().fit(X_train_poly, y_train)

        r2_train[d] = r2_score(y_train, linreg.predict(X_train_poly))
        r2_test[d] = r2_score(y_test, linreg.predict(X_test_poly))

    return (r2_train, r2_test)
示例#14
0
 def evaluate(self, y_predict, y_true, target_names=None):
     """Compute the metric matching ``self.config.task_type`` and return it.

     :param y_predict: model outputs passed to the metric as predictions
                       (or as scores for the ranking task).
     :param y_true: ground-truth targets.
     :param target_names: forwarded to ``classification_report``; only used
                          for the classification task.
     :return: the result of ``classification_report``, ``roc_auc_score`` or
              ``r2_score`` for the 'classification', 'ranking' and
              'regression' task types respectively; ``None`` for any other
              task type.
     """
     task_type = self.config.task_type
     # The metric used to be computed and thrown away; hand it to the caller.
     if task_type == 'classification':
         return classification_report(y_true=y_true,
                                      y_pred=y_predict,
                                      target_names=target_names)
     if task_type == 'ranking':
         return roc_auc_score(y_true=y_true, y_score=y_predict)
     if task_type == 'regression':
         return r2_score(y_true=y_true, y_pred=y_predict)
     return None
示例#15
0
def answer_two():
    """Generate a noisy sine data set and score polynomial fits of degree 0-9.

    Seeds NumPy's global random generator with 0, builds 15 noisy samples of
    ``sin(x) + x / 6``, expands ``x`` to polynomial features of each degree,
    splits with ``train_test_split(random_state=0)`` (read from module scope)
    and fits one ``LinearRegression`` per degree.

    Returns
    -------
    tuple of numpy.ndarray
        ``(r2_train_arr, r2_test_arr)``, each of shape ``(10,)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    np.random.seed(0)
    n = 15
    x = np.linspace(0, 10, n) + np.random.randn(n) / 5
    y_true = (np.sin(x) + x / 6 + np.random.randn(n) / 10).reshape(-1, 1)
    x_column = x.reshape(-1, 1)

    train_scores = []
    test_scores = []

    for degree in range(0, 10):
        X_poly = PolynomialFeatures(degree=degree).fit_transform(x_column)

        # Split the targets generated above; the previous code passed ``y``,
        # which is not defined in this function.
        X_train, X_test, y_train, y_test = train_test_split(X_poly,
                                                            y_true,
                                                            random_state=0)

        linreg = LinearRegression().fit(X_train, y_train)

        train_scores.append(r2_score(y_train, linreg.predict(X_train)))
        test_scores.append(r2_score(y_test, linreg.predict(X_test)))

    r2_train_arr = np.array(train_scores, dtype=float)
    r2_test_arr = np.array(test_scores, dtype=float)

    # Sanity check
    assert r2_train_arr.shape == (10, ), str(r2_train_arr.shape)
    assert r2_test_arr.shape == (10, ), str(r2_test_arr.shape)

    return r2_train_arr, r2_test_arr
示例#16
0
def answer_four():
    """Return test R^2 of LinearRegression and Lasso on degree-12 polynomial features.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple
        ``(LinearRegression_R2_test_score, Lasso_R2_test_score)``.
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    poly = PolynomialFeatures(degree=12)
    # ``-1`` infers the row count, so the code no longer breaks when the
    # split sizes differ from 11 and 4.
    X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
    X_test_poly = poly.transform(X_test.reshape(-1, 1))

    linreg = LinearRegression().fit(X_train_poly, y_train)
    lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_train_poly, y_train)

    linreg_score = r2_score(y_test, linreg.predict(X_test_poly))
    lasso_score = r2_score(y_test, lasso.predict(X_test_poly))
    return (linreg_score, lasso_score)
示例#17
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(r2_train, r2_test)``, each of shape ``(10,)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    r2_train = np.zeros(10)
    r2_test = np.zeros(10)

    # ``-1`` infers the row count, so the code no longer breaks when the
    # split sizes differ from 11 and 4.
    train_column = X_train.reshape(-1, 1)
    test_column = X_test.reshape(-1, 1)

    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        X_poly = poly.fit_transform(train_column)
        linreg = LinearRegression().fit(X_poly, y_train)
        r2_train[degree] = r2_score(y_train, linreg.predict(X_poly))

        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(test_column)
        r2_test[degree] = r2_score(y_test, linreg.predict(X_test_poly))
    return (r2_train, r2_test)
    def test_r2_within_business_value(self):
        """Template test: score ``MyModel`` with R^2 on a protected data set.

        NOTE(review): this is an unfinished scaffold and cannot pass as
        written -- see the notes on the individual statements below.
        """
        m = MyModel()
        m.init()

        # Add code to load your super secret cross validation stuff
        # use secrets encoded in a variable group available in the ADO Pipeline
        # cuz scientists can't access that and game the system.
        # NOTE(review): still a placeholder; subscripting None below raises.
        super_secret_data = None

        # NOTE(review): ``x`` is not defined in this method -- presumably a
        # feature key/selector supplied elsewhere; confirm.
        y_predicted = m.predict(super_secret_data[x])

        # NOTE(review): the score is computed but not stored; ``y`` is not
        # defined in this method either.
        r2_score(super_secret_data[y], y_predicted)

        # NOTE(review): this compares the ``r2_score`` function object with 2
        # rather than the computed score, and R^2 never exceeds 1, so the
        # intended threshold needs to be confirmed.
        assert (r2_score > 2 < r2_score)
示例#19
0
def test_with_response_transformation():
    """Check that ``ResponseTransformingEstimator`` predicts on the log scale.

    An ``Earth`` model wrapped with a ``Log`` transform of the response is
    fitted on the Boston data; its predictions must explain the log of the
    target well (R^2 > .8) and the raw target poorly (R^2 < .1).
    """
    features, target = load_boston(return_X_y=True)

    # Keep the log of the raw target before it is wrapped in a DataFrame.
    log_target = np.log(target)

    column_names = ['x%d' % idx for idx in range(features.shape[1])]
    features = pandas.DataFrame(features, columns=column_names)
    target = pandas.DataFrame(target, columns=['y'])

    response_transformer = VariableTransformer(dict(y=Log(Identity('y'))))
    estimator = ResponseTransformingEstimator(Earth(), response_transformer)
    estimator.fit(features, target)

    predicted_log = estimator.predict(features)
    assert r2_score(log_target, predicted_log) > .8
    assert r2_score(target, predicted_log) < .1
示例#20
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of list
        ``(r2_train, r2_test)``, one score per degree.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    r2_train, r2_test = [], []
    X_train_data = X_train.reshape(-1, 1)
    X_test_data = X_test.reshape(-1, 1)

    for degree in range(0, 10):
        poly = PolynomialFeatures(degree=degree)
        X_poly = poly.fit_transform(X_train_data)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(X_test_data)

        linreg = LinearRegression().fit(X_poly, y_train)
        r2_train.append(r2_score(y_train, linreg.predict(X_poly)))
        r2_test.append(r2_score(y_test, linreg.predict(X_test_poly)))
    return r2_train, r2_test
def answer_four():
    """Return test R^2 of LinearRegression and Lasso on degree-12 polynomial features.

    Reads ``X_train``, ``X_test``, ``y_train``, ``y_test`` and the sklearn
    names (``PolynomialFeatures``, ``LinearRegression``, ``Lasso``,
    ``r2_score``) from module scope; the feature arrays are reshaped into a
    single column.

    Returns
    -------
    tuple
        ``(r_linear, r_lasso)``.
    """
    poly = PolynomialFeatures(degree=12)
    x_fit = poly.fit_transform(X_train.reshape(-1, 1))
    # Reuse the transformer fitted on the training split instead of
    # refitting it on the test data.
    x_test_fit = poly.transform(X_test.reshape(-1, 1))

    model = LinearRegression()
    model.fit(x_fit, y_train)
    r_linear = r2_score(y_test, model.predict(x_test_fit))

    lasso = Lasso(alpha=0.01, max_iter=10000)
    lasso.fit(x_fit, y_train)
    r_lasso = r2_score(y_test, lasso.predict(x_test_fit))

    return (r_linear, r_lasso)
示例#22
0
def main():
    """Fit a random forest on the data from ``get_data_without_cols(2)`` and print a summary.

    The printed line contains R^2, mean absolute error, the mean prediction
    and the mean target, each rounded to three decimals.
    """
    X_train, y_train, X_test, y_test = get_data_without_cols(2)

    # Left at their defaults: min_samples_split, max_depth, min_samples_leaf,
    # min_weight_fraction_leaf, min_impurity_decrease.
    forest_params = dict(
        n_estimators=100,
        n_jobs=-1,
        random_state=1,
        criterion="mse",
        max_features="sqrt",
    )
    forest = RandomForestRegressor(**forest_params)

    # Only the first target column is used for fitting.
    forest.fit(X_train, y_train.iloc[:, 0])
    predictions = forest.predict(X_test).reshape(-1, 1)

    r2 = r2_score(y_test, predictions)
    abs_error = mean_absolute_error(y_test, predictions)
    target_mean = y_test.mean(axis=0).mean()
    prediction_mean = predictions.mean(axis=0).mean()

    template = "{} | mean error : {} | mean {} / {}"
    rounded = [round(value, 3)
               for value in (r2, abs_error, prediction_mean, target_mean)]
    print(template.format(*rounded))
def answer_four():
    """Return test R^2 of LinearRegression and Lasso on degree-12 polynomial features.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple
        ``(lr_r2, ls_r2)``.
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    poly = PolynomialFeatures(12)
    X_train_new = poly.fit_transform(X_train.reshape(-1, 1))
    # Reuse the transformer fitted on the training split.
    X_test_new = poly.transform(X_test.reshape(-1, 1))

    lr = LinearRegression()
    lr.fit(X_train_new, y_train)
    lr_r2 = r2_score(y_test, lr.predict(X_test_new))

    lasso = Lasso(alpha=0.01, max_iter=10000)
    lasso.fit(X_train_new, y_train)
    ls_r2 = r2_score(y_test, lasso.predict(X_test_new))

    return (lr_r2, ls_r2)
示例#24
0
def answer_two():
    """Return train and test R^2 for polynomial regressions of degree 0-9.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from module
    scope; the feature arrays are reshaped into a single column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(results_train, results_test)``, each of shape ``(10,)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # Public import path: the private ``sklearn.metrics.regression`` module
    # was deprecated in scikit-learn 0.22 and is gone from later releases.
    from sklearn.metrics import r2_score

    results_train = np.zeros(10)
    results_test = np.zeros(10)

    train_inputs = X_train.reshape(-1, 1)
    test_inputs = X_test.reshape(-1, 1)

    for degree in range(0, 10):
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(train_inputs)
        # Reuse the transformer fitted on the training split.
        X_test_poly = poly.transform(test_inputs)

        linreg = LinearRegression().fit(X_train_poly, y_train)
        results_train[degree] = r2_score(y_train, linreg.predict(X_train_poly))
        results_test[degree] = r2_score(y_test, linreg.predict(X_test_poly))
    return (results_train, results_test)
示例#25
0
def plotScatterPlot(actual, predicted, outFileName):
    """Save a square scatter plot of predicted versus actual values.

    The axis labels and title describe the data as QM-calculated versus
    model-predicted H2 binding energy.  An identity line is drawn for
    reference and the R^2 of ``predicted`` against ``actual`` is printed and
    written onto the figure.

    :param actual: iterable of reference values (x axis).
    :param predicted: iterable of model values (y axis).
    :param outFileName: output path without extension; ``.png`` is appended.
    """
    plt.scatter(actual, predicted, s=7, color='#4b9da6')
    axes = plt.gca()

    # make plot square with equal x and y axes; the lower bound never
    # exceeds -1 because 0 is included in the minimum
    values = list(actual) + list(predicted)
    bounds = [min(values + [0]) - 1, max(values) + 1]
    plt.axis(bounds * 2)
    axes.set_aspect('equal', adjustable='box')

    # plot the identity line for visual reference
    plt.plot([bounds[0], bounds[1]], [bounds[0], bounds[1]], color='#d95d41')

    rSquared = r2_score(actual, predicted)
    print(rSquared)
    plt.figtext(0.6, 0.15, '$R^2 = $' + format(rSquared, '.4f'), fontsize=11)
    # Raw strings: "\m" is not a valid escape sequence in a normal literal.
    plt.xlabel(r'QM Calculated $\mathregular{H_{2}}$ Binding Energy',
               fontsize=10)
    plt.ylabel(r'Model Predicted $\mathregular{H_{2}}$ Binding Energy',
               fontsize=10)
    plt.title(
        r'Model Predicted vs. QM Calculated $\mathregular{H_{2}}$ Binding',
        fontsize=12)
    plt.tight_layout()
    plt.savefig(str(outFileName) + '.png')
    plt.clf()
示例#26
0
    def test_adjusted_r2_score(self):
        """Check ``adjusted_r2_score`` against the closed-form adjustment.

        For 0-3 coefficients the expected value is
        ``1 - (1 - r2) * (n - 1) / (n - df - 1)``; for 4 or 5 coefficients
        (too many degrees of freedom for 5 samples) the score must be 0.
        """
        y_true = [1, 2, 3, 4, 5]
        y_pred = [1, 3, 2, 5, 4]
        n = len(y_true)

        r2 = r2_score(y_true, y_pred)
        # Tolerance-based comparison instead of exact float equality.
        self.assertAlmostEqual(r2, 0.6)

        for df in range(4):
            f = (n - 1) / (n - df - 1)
            expected = 1 - (1 - r2) * f
            self.assertAlmostEqual(adjusted_r2_score(y_true, y_pred,
                                                     MockModel(coef_=[1] *
                                                               df)),
                                   expected,
                                   places=3,
                                   msg=f'failed for df={df}')

        # if degree of freedom gets too large returns 0
        for df in (4, 5):
            # Report the df under test; the old messages reused the last
            # value of the loop above.
            self.assertAlmostEqual(adjusted_r2_score(y_true, y_pred,
                                                     MockModel(coef_=[1] *
                                                               df)),
                                   0,
                                   places=3,
                                   msg=f'failed for df={df}')
示例#27
0
    def score(self, X, y, method="r2", verbose=False):
        """
        Produce multi-step prediction of y, and compute the metrics against y.
        Nan is ignored when computing the metrics.

        :param array-like X: exogenous input time series, shape = (n_samples,
                             n_exog_inputs)
        :param array-like y: target time series to predict, shape = (n_samples)
        :param string method: could be "r2" (R Square) or "mse" (Mean Square
                              Error).
        :param bool verbose: when True, print how many data points are
                             evaluated.

        :return: prediction metric. Nan is ignored when computing the metrics.
        :raises ValueError: if ``method`` is neither "r2" nor "mse".
        """
        ypred = self.predict(X, y)
        # Positions where either the target or the prediction is NaN.
        mask = np.isnan(y) | np.isnan(ypred)
        if verbose:
            print('Evaluating {} score, {} of {} data points are evaluated.'.
                  format(method, np.sum(~mask), y.shape[0]))
        if method == "r2":
            return r2_score(y[~mask], ypred[~mask])
        elif method == "mse":
            return mean_squared_error(y[~mask], ypred[~mask])
        else:
            # Fill in the placeholder; it used to be emitted as a literal "{}".
            raise ValueError(
                '{} method is not supported. Please choose from \"r2\" or \"mse\".'
                .format(method))
示例#28
0
def log_r2_score(y_true, y_pred, sample_weight=None, multioutput="uniform_average"):
    """ r squared on log of prediction

    Both targets and predictions are transformed with ``log(1 + value)``
    before ``r2_score`` is applied.

    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape = (n_samples), optional
        Sample weights.

    multioutput : string in ['raw_values', 'uniform_average'] \
            or array-like of shape = (n_outputs)

        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.

        'raw_values' :
            Returns a full set of errors when the input is of multioutput
            format.

        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.

    Raises
    ------
    ValueError
        If ``y_true`` or ``y_pred`` contains a negative value.

    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)

    # Reject negatives in EITHER array; the previous ``and`` only raised when
    # both arrays contained negative values.
    if (y_true < 0).any() or (y_pred < 0).any():
        raise ValueError("log_r2_score cannot be used when " "targets contain negative values.")

    # Keyword arguments: newer scikit-learn releases make these keyword-only.
    return r2_score(np.log1p(y_true), np.log1p(y_pred),
                    sample_weight=sample_weight, multioutput=multioutput)
示例#29
0
def get_scores(y_true,y_pred):
    """Return (sqrt of Brier score, log loss, ROC AUC, average precision, R^2).

    Every metric is called as ``metric(y_true, y_pred)``, in that order.
    """
    metric_functions = (brier_score_loss, log_loss, roc_auc_score,
                        average_precision_score, r2_score)
    results = [metric(y_true, y_pred) for metric in metric_functions]
    # Only the Brier score is reported as its square root.
    results[0] = math.sqrt(results[0])
    return tuple(results)
示例#30
0
def Hyd(actual, predicted, nlmos, slope, yint, outFileName):
    """Save a scatter plot of ``actual`` against ``nlmos`` with a fitted line.

    The axis labels describe the data as QM-calculated pKa versus Co-H NLMO
    energy (eV).  The line ``slope * x + yint`` is drawn between the fixed
    x values -0.17 and -0.38, and the R^2 of ``predicted`` against ``actual``
    is written onto the figure.

    :param actual: iterable of reference values (y axis).
    :param predicted: iterable of model values, used only for the R^2 label.
    :param nlmos: iterable of x-axis values.
    :param slope: slope of the reference line.
    :param yint: intercept of the reference line.
    :param outFileName: output path without extension; ``.png`` is appended.
    """
    xmin = -0.17
    xmax = -0.38
    # Kept as one-element lists so the printed output is unchanged.
    ymin = [xmin * slope + yint]
    ymax = [xmax * slope + yint]
    print(ymin, ymax)

    plt.scatter(nlmos, actual, s=7, color='#4b9da6')
    axes = plt.gca()
    boundsx = [min(list(nlmos)), max(list(nlmos))]
    print(boundsx[0], boundsx[1])
    # Float literal so the ratio is not truncated to 0 under Python 2.
    axes.set_aspect(aspect=1.0 / 280, adjustable='box')
    # plot the line slope * x + yint for visual reference
    plt.plot([xmin, xmax], [ymin, ymax], color='#d95d41')
    rSquared = r2_score(actual, predicted)
    plt.figtext(0.4, 0.15, '$R^2 = $' + format(rSquared, '.3f'), fontsize=12)
    # Raw strings: "\m" is not a valid escape sequence in a normal literal.
    plt.ylabel(r'QM Calculated $\mathregular{pK_{a}}$', fontsize=14)
    plt.xlabel('Co-H NLMO Energy (eV)', fontsize=14)
    plt.title(r'QM Calculated $\mathregular{pK_{a}}$ vs. Co-H NLMO Energy',
              fontsize=14)
    plt.tight_layout()
    plt.savefig(str(outFileName) + '.png')
    plt.clf()
示例#31
0
def answer_four():
    """Return test R^2 of LinearRegression and Lasso on degree-12 polynomial features.

    Reads ``X_train``, ``X_test``, ``y_train``, ``y_test`` and the sklearn
    names (``PolynomialFeatures``, ``LinearRegression``, ``Lasso``,
    ``r2_score``) from module scope; the feature arrays are reshaped into a
    single column.

    Returns
    -------
    tuple
        ``(LinearRegression_R2_test_score, Lasso_R2_test_score)``.
    """
    poly = PolynomialFeatures(degree=12)
    X_tr_poly = poly.fit_transform(X_train.reshape(-1, 1))
    # Reuse the transformer fitted on the training split instead of
    # refitting it on the test data.
    X_tt_poly = poly.transform(X_test.reshape(-1, 1))

    linreg = LinearRegression().fit(X_tr_poly, y_train)
    lasso = Lasso(alpha=0.01, max_iter=10000).fit(X_tr_poly, y_train)

    LinearRegression_R2_test_score = r2_score(y_test,
                                              linreg.predict(X_tt_poly))
    Lasso_R2_test_score = r2_score(y_test, lasso.predict(X_tt_poly))

    return LinearRegression_R2_test_score, Lasso_R2_test_score
示例#32
0
def get_scores(shots):
    """Score the ``pred`` attribute of each shot against its ``result``.

    Returns a tuple ``(sqrt of Brier score, log loss, ROC AUC, average
    precision, R^2)``.
    """
    outcomes = [shot.result for shot in shots]
    predictions = [shot.pred for shot in shots]

    brier = brier_score_loss(outcomes, predictions)
    cross_entropy = log_loss(outcomes, predictions)
    roc_auc = roc_auc_score(outcomes, predictions)
    avg_precision = average_precision_score(outcomes, predictions)
    r_squared = r2_score(outcomes, predictions)

    # Only the Brier score is reported as its square root.
    return (math.sqrt(brier), cross_entropy, roc_auc, avg_precision, r_squared)
示例#33
0
 def score(self, X, y, sample_weight=None):
     """Score ``self.predict(X)`` against ``y`` with a model-appropriate metric.

     :param X: inputs passed to ``self.predict``.
     :param y: ground-truth targets.
     :param sample_weight: optional weights forwarded to the metric.
     :return: accuracy when ``self.model`` is a ``Classifier``;
              variance-weighted R^2 when it is a ``Regressor``.
     :raises ValueError: if ``self.model`` is neither of the two.
     """
     from commonml.skchainer.classifier import Classifier
     from commonml.skchainer.regressor import Regressor
     # Metrics come from the public ``sklearn.metrics`` namespace; the private
     # ``sklearn.metrics.classification`` / ``.regression`` modules were
     # deprecated in scikit-learn 0.22 and are gone from later releases.
     if isinstance(self.model, Classifier):
         from sklearn.metrics import accuracy_score
         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
     elif isinstance(self.model, Regressor):
         from sklearn.metrics import r2_score
         return r2_score(y, self.predict(X), sample_weight=sample_weight,
                         multioutput='variance_weighted')
     else:
         raise ValueError('Unsupported model.')
示例#34
0
 def calc_error_metrics(self):
     """Fill ``self.scores`` with four error metrics of ``y_pred`` against ``y_true``.

     Stores log loss, mean squared error, mean absolute error and the R^2
     (coefficient of determination) score under their display names.
     Returns None.
     """
     labelled_metrics = (
         ('LogLoss', log_loss),
         ('Mean Squared Error', mean_squared_error),
         ('Mean Absolute Error', mean_absolute_error),
         ('R2 Score', r2_score),
     )
     for label, metric in labelled_metrics:
         self.scores[label] = metric(self.y_true, self.y_pred)

     # TBD compute the log-loss to consider boolean
     return None
from sklearn.preprocessing import MinMaxScaler

from dbn.tensorflow import SupervisedDBNRegression


# Loading dataset
boston = load_boston()
X, Y = boston.data, boston.target

# Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Data scaling: the scaler is fitted on the training split only and reused
# for the test split below.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)

# Training
regressor = SupervisedDBNRegression(hidden_layers_structure=[100],
                                    learning_rate_rbm=0.01,
                                    learning_rate=0.01,
                                    n_epochs_rbm=20,
                                    n_iter_backprop=200,
                                    batch_size=16,
                                    activation_function='relu')
regressor.fit(X_train, Y_train)

# Test
X_test = min_max_scaler.transform(X_test)
Y_pred = regressor.predict(X_test)
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('Done.\nR-squared: %f\nMSE: %f' % (r2_score(Y_test, Y_pred), mean_squared_error(Y_test, Y_pred)))
示例#36
0
文件: ML.py 项目: dlrsb/dream
def _concatSubsetSeries(parts, indexes, name):
    """Stack per-subset results into one Series, each part labelled with its own index."""
    return pd.concat(
        [pd.Series(data=part, name=name, index=index) for part, index in zip(parts, indexes)],
        axis=0,
    )


def evaluateAndPredict(estimator, opt, output, evaluate=True, predict=False, model_output=None, features_output=None):
    """Run the modelling pipeline for one estimator and log the outcome to ``output``.

    The datasets are loaded according to ``opt``. When ``opt["use_drug_info"]``
    is set, three subsets ("None", "One", "Both") are each preprocessed,
    feature-selected, tuned and modelled on their own and the results are
    joined afterwards; otherwise one combined set is used.

    Args:
        estimator: Indexable triple. ``estimator[0]`` is the name written to
            the report, ``estimator[1]`` the model object and ``estimator[2]``
            the parameters handed to ``optParameters``.
        opt: Dict of run options (dataset switches, "selection_method",
            "num_features", "exp_threshold", "scale", "cv_iterations", ...).
            Every key/value pair is echoed into the report.
        output: Path of the text report; an existing file is replaced.
        evaluate: If true, run ``evalModel`` per subset, save the joined
            predictions/confidences and write R2 plus the challenge scores
            to the report.
        predict: If true, fit each tuned model on its training data and save
            the predictions for the leaderboard set(s).
        model_output: Optional path; when given, ``estimator`` is dumped there
            with joblib.
        features_output: Optional path; when given, the selected feature names
            are stored there via ``saveFeatures``.

    Returns:
        None. All results are written to files by the helpers called here.
    """
    # Load data
    print("loading data")
    d = load_datasets(
        opt["subchallenge"],
        opt["final"],
        opt["filterQA"],
        opt["use_mut"],
        opt["use_CNV"],
        opt["use_exp"],
        opt["use_methyl"],
        opt["use_drug_info"],
        opt["use_cell_info"],
    )
    saveObserved(d)

    if opt["use_drug_info"]:
        # Three independent (X, y, leaderboard) sets, modelled separately
        data = [
            (d["comb_train_input_None"], d["comb_train_output_None"], d["leaderboard_None"]),
            (d["comb_train_input_One"], d["comb_train_output_One"], d["leaderboard_One"]),
            (d["comb_train_input_Both"], d["comb_train_output_Both"], d["leaderboard_Both"]),
        ]
    else:
        data = [(d["comb_train_input"], d["comb_train_output"], d["leaderboard"])]

    iters_cv_pred = []
    iters_predictions = []
    iters_confidence = []
    feature_lists = []

    # Start from a fresh report file
    if path.exists(output):
        remove(output)
    # The context manager closes the report even when a pipeline step raises.
    with open(output, "a") as f:
        f.write("Estimator:\t" + estimator[0] + "\n\n")
        f.write("Options:\n")
        for key in opt:
            f.write(key + "\t" + str(opt[key]) + "\n")
        f.write("\n")

        for i, (X_raw, y_raw, LB_raw) in enumerate(data):
            if len(data) > 1:
                f.write("Model " + str(i + 1) + "\n")

            # Preprocess data
            print("preprocessing data")
            X_train, LB, feature_names = preprocess(
                X_raw,
                LB_raw,
                d,
                opt["use_mut"],
                opt["use_CNV"],
                opt["use_exp"],
                opt["exp_threshold"],
                opt["use_methyl"],
                opt["use_cell_info"],
                opt["scale"],
            )
            y_train = y_raw.values

            # Feature selection (if using SelectKBest): done before tuning,
            # with the untuned model
            if opt["selection_method"] == "kbest":
                print("feature selection")
                # Asking for more features than exist falls back to "all"
                if opt["num_features"] > len(feature_names):
                    num_features = "all"
                else:
                    num_features = opt["num_features"]
                selected = featureSelection(
                    X_train, y_train, opt["selection_method"], estimator[1], num_features
                )
                X_train = X_train[:, selected]
                LB = LB[:, selected]

            # Optimize parameters (only does the actual optimization if the
            # parameters dict is not empty) and write best parameters to file
            opt_est = optParameters(estimator[1], X_train, y_train, estimator[2], X_raw)
            f.write("Best parameters:\n" + str(opt_est[1]) + "\n\n")

            # Feature selection (if not using SelectKBest): done after tuning,
            # with the tuned model
            if opt["selection_method"] != "kbest":
                selected = featureSelection(
                    X_train, y_train, opt["selection_method"], opt_est[0], opt["num_features"]
                )
                X_train = X_train[:, selected]
                LB = LB[:, selected]

            # `selected` is used as a per-feature mask here
            feature_lists.append([feature for (feature, mask) in zip(feature_names, selected) if mask])

            if evaluate:
                print("evaluating model")
                # Model evaluation
                # NOTE(review): the literal 5 is presumably the number of CV folds -- confirm in evalModel
                predictions, confidence = evalModel(opt_est[0], X_train, y_train, 5, opt["cv_iterations"])
                iters_cv_pred.append(predictions)
                iters_confidence.append(confidence)

            if predict:
                print("predicting")
                # Predict leaderboard/test output
                opt_est[0].fit(X_train, y_train)
                iters_predictions.append(opt_est[0].predict(LB))

        if features_output is not None:
            saveFeatures(feature_lists, features_output)

        # Join dataset subsets and calculate scores/save to file
        if evaluate:
            print("calculating scores")
            if opt["use_drug_info"]:
                train_indexes = [
                    d["comb_train_input_None"].index,
                    d["comb_train_input_One"].index,
                    d["comb_train_input_Both"].index,
                ]
                cv_predictions = _concatSubsetSeries(iters_cv_pred, train_indexes, "PREDICTION")
                confidence = _concatSubsetSeries(iters_confidence, train_indexes, "CONFIDENCE")
            else:
                if estimator[0] == "PLS":
                    # Convert every entry to a plain float (presumably PLS
                    # returns one-element rows -- verify against evalModel)
                    iters_cv_pred[0] = [float(value) for value in iters_cv_pred[0]]
                    iters_confidence[0] = [float(value) for value in iters_confidence[0]]
                cv_predictions = pd.Series(
                    data=iters_cv_pred[0], name="PREDICTION", index=d["comb_train_input"].index
                )
                confidence = pd.Series(
                    data=iters_confidence[0], name="CONFIDENCE", index=d["comb_train_input"].index
                )
            savePredictions(cv_predictions, d, CV=True)
            saveConfidence(confidence, d)

            # Calculate scores and write to file
            # NOTE(review): values are compared positionally; with use_drug_info this
            # assumes d["comb_train_output"] rows follow the None/One/Both order -- confirm
            observed = d["comb_train_output"].values
            r2 = r2_score(observed, cv_predictions.values)
            challenge_performance = drugCombiScore(observed, cv_predictions.values, d)
            challenge_global = globalScore(observed, cv_predictions.values, d)
            print(challenge_performance)
            print(challenge_global)
            f.write("Model Evaluation\n")
            f.write("R2:\t" + str(r2) + "\n")
            f.write("Performance score:\t" + str(challenge_performance[0]) + "\n")
            f.write("Standard error:\t" + str(challenge_performance[1]) + "\n")
            f.write("Global score:\t" + str(challenge_global[0]) + "\n")
            f.write("Primary metric:\t" + str(challenge_global[1]) + "\n")
            f.write("Tie-breaking metric:\t" + str(challenge_global[2]) + "\n")

    if predict:
        if opt["use_drug_info"]:
            # Every part carries the same name so the joined Series is named
            # "PREDICTION", consistent with the single-dataset branch below.
            predictions = _concatSubsetSeries(
                iters_predictions,
                [d["leaderboard_None"].index, d["leaderboard_One"].index, d["leaderboard_Both"].index],
                "PREDICTION",
            )
        else:
            predictions = pd.Series(data=iters_predictions[0], name="PREDICTION", index=d["leaderboard"].index)
        savePredictions(predictions, d, CV=False)

    # Save model
    # NOTE(review): this dumps the `estimator` triple as passed in, not opt_est;
    # whether that contains the tuned/fitted model is not visible here -- confirm
    if model_output is not None:
        joblib.dump(estimator, model_output)
示例#37
0
def _printLabelled(label, value):
    """Print ``label`` and ``value`` on one line, separated by a single space."""
    print("%s %s" % (label, value))


def validate(y_true, y_pred):
    """Print evaluation metrics for ``y_pred`` against ``y_true`` and plot PR and ROC curves.

    Printed, in order: the two-sample Kolmogorov-Smirnov test, mean squared
    error, mean absolute error, R2, log loss, average precision, then the
    ROC false/true positive rates, ROC thresholds and ROC AUC.  Two
    matplotlib figures are shown (each ``plt.show()`` call is made in turn):
    the precision-recall curve and the ROC curve.

    Args:
        y_true: Ground-truth targets, passed unchanged to every metric.
        y_pred: Predicted scores, passed unchanged to every metric.

    Returns:
        None.
    """
    _printLabelled('Kolmogorov-Smirnov test = ', ks_2samp(y_true, y_pred))
    _printLabelled('mean_squared_error = ', mean_squared_error(y_true, y_pred))
    _printLabelled('mean_absolute_error = ', mean_absolute_error(y_true, y_pred))
    _printLabelled('r2_score = ', r2_score(y_true, y_pred))

    # TBD: compute the log-loss to consider boolean
    # Log loss, aka logistic loss or cross-entropy loss.
    print("log_loss = " + str(log_loss(y_true, y_pred)))

    # Precision-recall pairs for different probability thresholds; the
    # thresholds themselves are not needed for the plot.
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    # Average precision (AP) from prediction scores
    average_precision = average_precision_score(y_true, y_pred)
    _printLabelled("average_precision_score = ", average_precision)

    ##############################################################################
    # Plot of the precision-recall curve
    plt.figure()
    # Recall goes on the x axis and precision on the y axis, matching the
    # axis labels set below.
    plt.plot(recall, precision, label='AUC = %0.2f' % average_precision)
    # Diagonal reference line, kept from the original figure
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower right")
    plt.show()
    ##############################################################################

    # Receiver operating characteristic (ROC)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    print("fpr = " + str(fpr))
    print("tpr = " + str(tpr))
    print("thresholds = " + str(thresholds))
    # Area Under the Curve (AUC) from prediction scores
    print("roc_auc_score = " + str(roc_auc_score(y_true, y_pred)))
    # Area under the computed ROC points, used in the plot legend
    roc_auc = auc(fpr, tpr)

    ##############################################################################
    # Plot of the ROC curve
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic ')
    plt.legend(loc="lower right")
    plt.show()
    ##############################################################################

    # Further metrics (Pearson/Spearman correlation, confusion matrix,
    # F-scores, hinge loss, ...) are intentionally not computed here.
    return