示例#1
0
def test_linear_regression_model(datatype, algorithm, nrows, column_info):
    """Compare cuML linear-regression predictions against scikit-learn's.

    The svd solver is skipped above 46340 rows (CUDA 10.x limitation noted
    in the skip message).
    """
    if algorithm == "svd" and nrows > 46340:
        # Fixed: the implicit string concatenation was missing separating
        # spaces, producing "morethan" and "version10.x" in the message.
        pytest.skip("svd solver is not supported for the data that has more "
                    "than 46340 rows or columns if you are using CUDA version "
                    "10.x")

    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train)
    cuols_predict = cuols.predict(X_test)

    # The sklearn cross-check is only run for smaller inputs.
    if nrows < 500000:
        # sklearn linear regression model initialization, fit and predict
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)

        assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
示例#2
0
def test_ols(datatype, X_type, y_type, algorithm):
    """Fit cuML OLS on a tiny, exactly linear problem (y = 5*x0 + 10*x1,
    no noise) and check its predictions against scikit-learn's.

    Removed leftover debug prints and commented-out code.
    """
    X = np.array([[2.0, 5.0], [6.0, 9.0], [2.0, 2.0], [2.0, 3.0]],
                 dtype=datatype)
    y = np.dot(X, np.array([5.0, 10.0]).astype(datatype))

    pred_data = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    # Reference fit with scikit-learn.
    skols = skLinearRegression(fit_intercept=True, normalize=False)
    skols.fit(X, y)

    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    if X_type == 'dataframe':
        # Same values as X, supplied as a cuDF DataFrame instead.
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([2, 6, 2, 2], dtype=datatype)
        gdf['1'] = np.asarray([5, 9, 2, 3], dtype=datatype)
        cuols.fit(gdf, y)

    elif X_type == 'ndarray':
        cuols.fit(X, y)

    sk_predict = skols.predict(pred_data)
    cu_predict = cuols.predict(pred_data).to_array()

    assert array_equal(sk_predict, cu_predict, 1e-3, with_sign=True)
    def test_score_matches_sklearn_performance(self):
        """Our model's R^2 on the test split should match scikit-learn's
        to one decimal place."""
        baseline = skLinearRegression()
        baseline.fit(self.X_train, self.Y_train)
        baseline_score = baseline.score(self.X_test, self.Y_test)

        our_score = self.test_LR.score(self.X_test, self.Y_test)
        np.testing.assert_almost_equal(baseline_score, our_score, decimal=1)
示例#4
0
def test_weighted_linear_regression(datatype, algorithm, fit_intercept,
                                    normalize, distribution):
    """Check sample-weighted cuML OLS against scikit-learn's weighted fit.

    Weights are drawn from the requested distribution. A seeded, local
    RandomState is used so the test is reproducible (the original drew from
    the unseeded global RNG, making the test nondeterministic) and global
    NumPy random state is left untouched.
    """
    nrows, ncols, n_info = 1000, 20, 10
    max_weight = 10
    noise = 20
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info, noise=noise
    )

    rng = np.random.RandomState(0)

    # set weight per sample to be from 1 to max_weight
    if distribution == "uniform":
        wt = rng.randint(1, high=max_weight, size=len(X_train))
    elif distribution == "exponential":
        wt = rng.exponential(size=len(X_train))
    else:
        wt = rng.lognormal(size=len(X_train))

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression(fit_intercept=fit_intercept,
                               normalize=normalize,
                               algorithm=algorithm)

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train, sample_weight=wt)
    cuols_predict = cuols.predict(X_test)

    # sklearn linear regression model initialization, fit and predict
    skols = skLinearRegression(fit_intercept=fit_intercept,
                               normalize=normalize)
    skols.fit(X_train, y_train, sample_weight=wt)

    skols_predict = skols.predict(X_test)

    assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
示例#5
0
def test_linear_regression_model(datatype, algorithm, nrows, column_info):
    """cuML OLS predictions should track scikit-learn's on synthetic data."""
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info)

    # Fit cuML's OLS and predict on the held-out split.
    cu_model = cuLinearRegression(fit_intercept=True,
                                  normalize=False,
                                  algorithm=algorithm)
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)

    # The sklearn comparison is only affordable for smaller inputs.
    if nrows < 500000:
        sk_model = skLinearRegression(fit_intercept=True, normalize=False)
        sk_model.fit(X_train, y_train)
        sk_pred = sk_model.predict(X_test)

        assert array_equal(sk_pred, cu_pred, 1e-1, with_sign=True)
示例#6
0
def test_LinearRegression_fit_intercept():
    """With the intercept disabled, our coefficients must match
    scikit-learn's on the same random data."""
    np.random.seed(0)
    X = np.random.random((10, 1))
    y = np.random.random(10)

    ours = LinearRegression(fit_intercept=False).fit(X, y)
    theirs = skLinearRegression(fit_intercept=False).fit(X, y)

    assert_allclose(ours.coef_, theirs.coef_)
示例#7
0
def test_LinearRegression_fit_intercept():
    """coef_ must agree with scikit-learn when fit_intercept is off."""
    np.random.seed(0)
    X = np.random.random((10, 1))
    y = np.random.random(10)

    # Fit both implementations on identical data, then compare weights.
    fitted = LinearRegression(fit_intercept=False).fit(X, y)
    reference = skLinearRegression(fit_intercept=False).fit(X, y)
    assert_allclose(fitted.coef_, reference.coef_)
示例#8
0
    def test_score_matches_sklearn_performance(self):
        """The ADVI regression's R^2 should agree with scikit-learn's to
        one decimal place on the held-out split."""
        print('')
        baseline = skLinearRegression()
        baseline.fit(self.X_train, self.y_train)
        baseline_score = baseline.score(self.X_test, self.y_test)

        self.advi_lr.fit(self.X_train, self.y_train)
        our_score = self.advi_lr.score(self.X_test, self.y_test)
        npt.assert_almost_equal(baseline_score, our_score, decimal=1)
示例#9
0
def test_linear_regression_model_default(datatype):
    """Default-constructed cuML OLS should match sklearn's defaults."""
    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    # cuML model with all-default hyperparameters.
    cu_model = cuLinearRegression()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)

    # Reference: sklearn with defaults on the same split.
    sk_model = skLinearRegression()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)

    assert array_equal(sk_pred, cu_pred, 1e-1, with_sign=True)
示例#10
0
def test_LinearRegression_err():
    """
    Test that errors are correctly accounted for
    By comparing to scikit-learn LinearRegression

    (Removed the unused X_fit local; it was computed but never read.)
    """
    np.random.seed(0)
    X = np.random.random((10, 1))
    y = np.random.random(10) + 1
    dy = 0.1

    y = np.random.normal(y, dy)

    # Fitting with a constant per-point error dy should be equivalent to an
    # unweighted sklearn fit on the error-scaled data.
    clf1 = LinearRegression().fit(X, y, dy)
    clf2 = skLinearRegression().fit(X / dy, y / dy)

    assert_allclose(clf1.coef_[1:], clf2.coef_)
    assert_allclose(clf1.coef_[0], clf2.intercept_ * dy)
示例#11
0
def test_LinearRegression_err():
    """
    Test that errors are correctly accounted for
    By comparing to scikit-learn LinearRegression

    (Removed the unused X_fit local; it was computed but never read.)
    """
    np.random.seed(0)
    X = np.random.random((10, 1))
    y = np.random.random(10) + 1
    dy = 0.1

    y = np.random.normal(y, dy)

    # Fitting with a constant per-point error dy should be equivalent to an
    # unweighted sklearn fit on the error-scaled data.
    clf1 = LinearRegression().fit(X, y, dy)
    clf2 = skLinearRegression().fit(X / dy, y / dy)

    assert_allclose(clf1.coef_[1:], clf2.coef_)
    assert_allclose(clf1.coef_[0], clf2.intercept_ * dy)
示例#12
0
def _regression_metrics(y_true, y_pred):
    """Return a two-column DataFrame of MAE / MSE / RMSE for one model.

    Extracted because the original built the same dict twice, and it also
    computed mean_squared_error twice per model (once for MSE, once inside
    the RMSE); here MSE is computed once and reused.
    """
    mse = mean_squared_error(y_true, y_pred)
    return pd.DataFrame(data={
        'Metrics': ['Mean Absolute Error:',
                    'Mean Squared Error:',
                    'Mean Root Squared Error:'],
        'Values': [mean_absolute_error(y_true, y_pred),
                   mse,
                   np.sqrt(mse)],
    })


def _show_metric_table(ax, frame, title):
    """Render one metrics DataFrame as a titled matplotlib table on *ax*."""
    ax.set_title(title)
    cell_text = [frame.iloc[row] for row in range(len(frame))]
    ax.table(cellText=cell_text, colLabels=frame.columns, loc='center')
    ax.axis(False)


def demo_helper(X, Y, learning_rate, iterations, title, x_label, y_label,
                x_lim=None, y_lim=None):
    """Fit the custom ML.py regressor and scikit-learn's on a train/test
    split of (X, Y), plot the raw data, both regression lines, and a table
    of error metrics per model, then print the custom model's weights.

    Parameters
    ----------
    X, Y : array-like feature/target data.
    learning_rate, iterations : hyperparameters for the custom regressor.
    title, x_label, y_label : plot labelling strings.
    x_lim, y_lim : optional axis ranges passed to plot_regression_line.
    """
    # Splitting the Dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.33, random_state=101)

    # Custom ML linear regression module
    ml_regressor = LinearRegression(learning_rate=learning_rate,
                                    iterations=iterations)

    # scikit-learn linear regression module
    scikit_regressor = skLinearRegression()

    # Perform linear regression using both scikit-learn and ML.py library
    ml_regressor.fit(X_train, y_train)      # custom version
    scikit_regressor.fit(X_train, y_train)  # scikit-learn version

    # Make predictions
    mlY_pred = ml_regressor.predict(X_test)
    skY_pred = scikit_regressor.predict(X_test)

    # Plot the X, Y data
    plt.scatter(X, Y)
    plt.title(title + ": Raw Data")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

    # Plot the X, Y_pred data
    custom_label = title + ": ML.py Linear Regression Module\nLearning Rate: " + \
                   str(learning_rate) + ", Iterations: " + str(iterations)
    plot_regression_line(mlY_pred, X_test, y_test, x_label, y_label,
                         custom_label, x_range=x_lim, y_range=y_lim)
    plot_regression_line(skY_pred, X_test, y_test, x_label, y_label,
                         title + ": scikit-learn Linear Regression Module",
                         x_range=x_lim, y_range=y_lim)

    # Evaluate each model's performance and render one table per model.
    df_ml = _regression_metrics(y_test, mlY_pred)
    df_sk = _regression_metrics(y_test, skY_pred)

    _, ax = plt.subplots(2)
    _show_metric_table(ax[0], df_ml, custom_label)
    _show_metric_table(ax[1], df_sk,
                       title + ": scikit-learn Linear Regression Module")
    plt.show()

    print("ML.py Linear Regression Weights for y = mx + b (" + title + "):")
    print("Slope:", ml_regressor.slope)
    print("Intercept:", ml_regressor.intercept, "\n")
                                [1, 4, 5, 16, 25, 64, 125]])
        result = myLinearRegression._perform_change_of_basis(
            test_array1, 3, True)
        if not np.array_equal(result, test_array2):
            errs += "\nLinear Regression: Failed to add polynomial features"

        dataset = load_dataset('basisData.pkl')
        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']

        print(" Testing least squares no regularization")
        model_mine = myLinearRegression('MSE', 1, False, None)
        model_mine.fit(X, y)
        model_sklearn = skLinearRegression(False)
        model_sklearn.fit(X, y)
        y_pred_mine = model_mine.predict(X)
        y_pred_sk = model_sklearn.predict(X)
        if not np.array_equal(y_pred_mine, y_pred_sk):
            errs += "\nLinear Regression: did not match sklearn for Least Squares no regularization"
        y_pred_mine = model_mine.predict(X, round=True)
        y_pred_sk = np.round(model_sklearn.predict(X), 0)
        if not np.array_equal(y_pred_mine, y_pred_sk):
            errs += "\nLinear Regression: did not match sklearn for Least Squares no regularization (rounded)"

        # print(" Testing least squares with L2 regularization")
        # model_mine = myLinearRegression('MSE', 1, False, 'L2', 1, 1000)
        # model_mine.fit(X, y)
        # model_sklearn = Ridge(fit_intercept=False)
        # model_sklearn.fit(X, y)
示例#14
0
def test_linear_models(datatype, X_type, y_type, algorithm, nrows, ncols,
                       n_info):
    """Fit cuML OLS and Ridge on ndarray or cuDF inputs and compare their
    predictions with scikit-learn's (comparison skipped above 500k rows).

    Added an explicit guard for unknown X_type values: previously an
    unrecognized X_type left cuols_predict/curidge_predict undefined and
    the assertions below raised a confusing NameError.
    """
    train_rows = np.int32(nrows * 0.8)
    X, y = make_regression(n_samples=(nrows),
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=0)
    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # Initialization of cuML's linear and ridge regression models
    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    curidge = cuRidge(fit_intercept=False, normalize=False, solver=algorithm)

    if X_type == 'dataframe':
        y_train = pd.DataFrame({'labels': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i]
             for i in range(X_train.shape[1])})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i]
             for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        # Flatten the single-column label frame into a cuDF Series.
        y_cudf = y_train.values
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)

        # fit and predict cuml linear regression model
        cuols.fit(X_cudf, y_cudf)
        cuols_predict = cuols.predict(X_cudf_test).to_array()

        # fit and predict cuml ridge regression model
        curidge.fit(X_cudf, y_cudf)
        curidge_predict = curidge.predict(X_cudf_test).to_array()

    elif X_type == 'ndarray':

        # fit and predict cuml linear regression model
        cuols.fit(X_train, y_train)
        cuols_predict = cuols.predict(X_test).to_array()

        # fit and predict cuml ridge regression model
        curidge.fit(X_train, y_train)
        curidge_predict = curidge.predict(X_test).to_array()

    else:
        raise ValueError("unsupported X_type: %r" % (X_type,))

    if nrows < 500000:
        # sklearn linear and ridge regression model initialization and fit
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)
        skridge_predict = skridge.predict(X_test)

        assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
        assert array_equal(skridge_predict,
                           curidge_predict,
                           1e-1,
                           with_sign=True)
示例#15
0
def PolynomialRegression(degree=3, **kwargs):
    """Build a pipeline: polynomial feature expansion, then OLS.

    Extra keyword arguments are forwarded to the linear-regression step.
    """
    steps = [PolynomialFeatures(degree), skLinearRegression(**kwargs)]
    return make_pipeline(*steps)