Example #1
def test_linear_regression_model(datatype, algorithm, nrows, column_info):

    if algorithm == "svd" and nrows > 46340:
        pytest.skip("svd solver is not supported for the data that has more"
                    "than 46340 rows or columns if you are using CUDA version"
                    "10.x")

    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train)
    cuols_predict = cuols.predict(X_test)

    if nrows < 500000:
        # sklearn linear regression model initialization, fit and predict
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)

        assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
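
These snippets are lifted from cuML's test suite, so the shared fixtures they call are not shown. Below is a minimal sketch of the imports and of make_regression_dataset / array_equal that the examples appear to assume; the helper bodies and tolerances are guesses, not cuML's exact test utilities.

import numpy as np
import pytest
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as skLinearRegression
from cuml.linear_model import LinearRegression as cuLinearRegression


def make_regression_dataset(datatype, nrows, ncols, n_info, noise=0.0):
    # Sketch: generate a dense regression problem and split it 80/20.
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, noise=noise,
                           random_state=0)
    return train_test_split(X.astype(datatype), y.astype(datatype),
                            train_size=0.8, random_state=0)


def array_equal(a, b, tol=1e-4, with_sign=True):
    # Sketch: element-wise comparison within an absolute tolerance;
    # with_sign=False compares magnitudes only.
    a, b = np.asarray(a), np.asarray(b)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    return np.allclose(a, b, atol=tol)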
Example #2
def test_ols(datatype, X_type, y_type, algorithm):

    X = np.array([[2.0, 5.0], [6.0, 9.0], [2.0, 2.0], [2.0, 3.0]],
                 dtype=datatype)
    y = np.dot(X, np.array([5.0, 10.0]).astype(datatype))

    pred_data = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    skols = skLinearRegression(fit_intercept=True, normalize=False)
    skols.fit(X, y)

    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    if X_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([2, 6, 2, 2], dtype=datatype)
        gdf['1'] = np.asarray([5, 9, 2, 3], dtype=datatype)
        cuols.fit(gdf, y)

    elif X_type == 'ndarray':
        cuols.fit(X, y)

    sk_predict = skols.predict(pred_data)
    cu_predict = cuols.predict(pred_data).to_array()

    # Debug output: compare the two predictions and inspect the dtypes used
    print(sk_predict)
    print(cu_predict)
    print(cuols.gdf_datatype)
    print(y.dtype)

    assert array_equal(sk_predict, cu_predict, 1e-3, with_sign=True)
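
A note on Series.to_array(): that method comes from older cuDF releases and has since been removed. On a current cuDF/cuML stack the equivalent conversion would be roughly the following (hedged, assuming predict still returns a cuDF Series as in the example above):

cu_predict = cuols.predict(pred_data).to_numpy()  # newer cuDF replaces .to_array()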
Example #3
def test_weighted_linear_regression(datatype, algorithm, fit_intercept,
                                    normalize, distribution):
    nrows, ncols, n_info = 1000, 20, 10
    max_weight = 10
    noise = 20
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info, noise=noise
    )

    # draw a per-sample weight from the requested distribution
    # (only the uniform case is bounded by max_weight)
    if distribution == "uniform":
        wt = np.random.randint(1, high=max_weight, size=len(X_train))
    elif distribution == "exponential":
        wt = np.random.exponential(size=len(X_train))
    else:
        wt = np.random.lognormal(size=len(X_train))

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression(fit_intercept=fit_intercept,
                               normalize=normalize,
                               algorithm=algorithm)

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train, sample_weight=wt)
    cuols_predict = cuols.predict(X_test)

    # sklearn linear regression model initialization, fit and predict
    skols = skLinearRegression(fit_intercept=fit_intercept,
                               normalize=normalize)
    skols.fit(X_train, y_train, sample_weight=wt)

    skols_predict = skols.predict(X_test)

    assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
Example #4
def test_linear_regression_model(datatype, algorithm, nrows, column_info):

    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(datatype,
                                                               nrows,
                                                               ncols,
                                                               n_info)

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train)
    cuols_predict = cuols.predict(X_test)

    if nrows < 500000:
        # sklearn linear regression model initialization, fit and predict
        skols = skLinearRegression(fit_intercept=True,
                                   normalize=False)
        skols.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)

        assert array_equal(skols_predict, cuols_predict,
                           1e-1, with_sign=True)
Example #5
def test_linreg_predict_convert_dtype(train_dtype, test_dtype):
    X, y = make_regression(n_samples=50, n_features=10,
                           n_informative=5, random_state=0)
    X = X.astype(train_dtype)
    y = y.astype(train_dtype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    clf = cuLinearRegression()
    clf.fit(X_train, y_train)
    clf.predict(X_test.astype(test_dtype))
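
test_linreg_predict_convert_dtype presumably carries a pytest parametrization over mixed float32/float64 train and test dtypes; a sketch of what that decorator likely looks like (the exact dtype pairs are an assumption):

@pytest.mark.parametrize('train_dtype', [np.float32, np.float64])
@pytest.mark.parametrize('test_dtype', [np.float32, np.float64])
def test_linreg_predict_convert_dtype(train_dtype, test_dtype):
    ...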
Example #6
def getTheModel(ensemble, ntrees, frate, seed, coll):
    """Return an unfitted regressor selected by the `ensemble` code, after
    clamping the feature-sampling rate and offsetting the random seed."""
    seed = seed + 10

    # if ntrees < 100:
    #     ntrees = 100
    if frate < 0.3:
        frate = 0.3
    if ensemble == 1:
        clf = RandomForestRegressor(n_estimators=ntrees,
                                    max_features=frate,
                                    max_leaf_nodes=100,
                                    min_samples_leaf=1,
                                    random_state=seed,
                                    n_jobs=-1)

    if ensemble == 2:
        clf = ExtraTreesRegressor(n_estimators=ntrees,
                                  max_leaf_nodes=100,
                                  min_samples_leaf=1,
                                  random_state=seed,
                                  n_jobs=-1)
    if ensemble == 3:
        clf = GradientBoostingRegressor(n_estimators=ntrees,
                                        learning_rate=0.1,
                                        max_depth=2,
                                        random_state=seed,
                                        loss='ls')
    if ensemble == 4:
        clf = cuLinearRegression(fit_intercept=True,
                                 normalize=True,
                                 algorithm='eig')
        # clf = linear_model.LinearRegression(n_jobs=-1)

    if ensemble == 5:
        clf = DecisionTreeRegressor(random_state=seed)

    if ensemble == 10:
        if "200" in coll:
            clf = DecisionTreeRegressor(random_state=seed,
                                        min_samples_leaf=30,
                                        min_samples_split=50,
                                        max_depth=3)
        else:
            clf = DecisionTreeRegressor(random_state=seed,
                                        min_samples_leaf=50,
                                        min_samples_split=100,
                                        max_depth=6)

    return clf
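
getTheModel only selects and configures a regressor; it does not fit anything. A hypothetical call site, purely for illustration (the data variables and the argument values are made up):

# Ensemble code 4 picks cuML's eig-based ordinary least squares.
clf = getTheModel(ensemble=4, ntrees=200, frate=0.5, seed=42, coll="train_200")
clf.fit(X_train, y_train)
preds = clf.predict(X_test)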
Example #7
def test_linear_regression_model_default(datatype):

    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression()

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train)
    cuols_predict = cuols.predict(X_test)

    # sklearn linear regression model initialization and fit
    skols = skLinearRegression()
    skols.fit(X_train, y_train)

    skols_predict = skols.predict(X_test)

    assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
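
small_regression_dataset is another fixture from the surrounding test module; a plausible sketch, assuming it mirrors make_regression_dataset with fixed small dimensions (the sizes below are assumptions):

def small_regression_dataset(datatype):
    X, y = make_regression(n_samples=500, n_features=20,
                           n_informative=10, random_state=10)
    return train_test_split(X.astype(datatype), y.astype(datatype),
                            train_size=0.8, random_state=10)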
Example #8
def test_linear_regression_single_column():
    '''Test that linear regression can be run on single column with more than
    46340 rows (a limitation on CUDA <11)'''
    model = cuLinearRegression()
    model.fit(cp.random.rand(46341), cp.random.rand(46341))
Example #9
def test_linear_models(datatype, X_type, y_type, algorithm, nrows, ncols,
                       n_info):
    train_rows = np.int32(nrows * 0.8)
    X, y = make_regression(n_samples=(nrows),
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=0)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows]).astype(datatype)

    # Initialization of cuML's linear and ridge regression models
    cuols = cuLinearRegression(fit_intercept=True,
                               normalize=False,
                               algorithm=algorithm)

    curidge = cuRidge(fit_intercept=False, normalize=False, solver=algorithm)

    if X_type == 'dataframe':
        y_train = pd.DataFrame({'labels': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i]
             for i in range(X_train.shape[1])})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i]
             for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        y_cudf = y_train.values
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)

        # fit and predict cuml linear regression model
        cuols.fit(X_cudf, y_cudf)
        cuols_predict = cuols.predict(X_cudf_test).to_array()

        # fit and predict cuml ridge regression model
        curidge.fit(X_cudf, y_cudf)
        curidge_predict = curidge.predict(X_cudf_test).to_array()

    elif X_type == 'ndarray':

        # fit and predict cuml linear regression model
        cuols.fit(X_train, y_train)
        cuols_predict = cuols.predict(X_test).to_array()

        # fit and predict cuml ridge regression model
        curidge.fit(X_train, y_train)
        curidge_predict = curidge.predict(X_test).to_array()

    if nrows < 500000:
        # sklearn linear and ridge regression model initialization and fit
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)
        skridge_predict = skridge.predict(X_test)

        assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
        assert array_equal(skridge_predict,
                           curidge_predict,
                           1e-1,
                           with_sign=True)
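
Example #9 additionally needs pandas, cuDF, and the Ridge estimators from both libraries. The aliases it uses map onto the following imports (a sketch matching the names above):

import pandas as pd
import cudf
from sklearn.linear_model import Ridge as skRidge
from cuml.linear_model import Ridge as cuRidge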