def test_mbsgd_regressor_default(datatype, nrows, column_info):
    """Compare default ``cumlMBSGRegressor`` and ``SGDRegressor`` R2 scores.

    A synthetic regression problem is built from ``nrows`` samples and
    ``column_info`` (unpacked as ``(ncols, n_info)``), cast to ``datatype``
    and split 80/20 with a fixed seed.  Both estimators are fitted with
    their default arguments, and the two R2 scores on the held-out split
    must agree to within 0.02.  A larger gap is reported through
    ``pytest.xfail`` instead of failing the test.
    """
    n_features, n_informative = column_info
    features, targets = make_regression(
        n_samples=nrows,
        n_features=n_features,
        n_informative=n_informative,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features.astype(datatype),
        targets.astype(datatype),
        train_size=0.8,
        random_state=0,
    )

    # Model under test, default hyper-parameters.
    cuml_model = cumlMBSGRegressor()
    cuml_model.fit(X_train, y_train)
    cuml_pred = cuml_model.predict(X_test).to_array()

    # Reference model, default hyper-parameters.
    sklearn_model = SGDRegressor()
    sklearn_model.fit(X_train, y_train)
    sklearn_pred = sklearn_model.predict(X_test)

    cuml_r2 = r2_score(cuml_pred, y_test, convert_dtype=datatype)
    sklearn_r2 = r2_score(sklearn_pred, y_test, convert_dtype=datatype)

    try:
        assert abs(cuml_r2 - sklearn_r2) <= 0.02
    except AssertionError:
        # A score mismatch is downgraded to an expected failure.
        pytest.xfail(
            "failed due to AssertionError error, "
            "fix will be merged soon"
        )
def test_mbsgd_regressor_vs_skl(lrate, penalty, make_dataset):
    """Check ``cumlMBSGRegressor`` R2 against ``SGDRegressor`` on one dataset.

    ``make_dataset`` is unpacked as
    ``(nrows, datatype, X_train, X_test, y_train, y_test)``.  The comparison
    only runs when ``nrows`` is below 500000; otherwise the test body does
    nothing.  Both models share the learning-rate schedule ``lrate``, the
    ``penalty`` and the remaining fixed hyper-parameters, and their R2
    scores must be within 0.02 of each other.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    # Datasets of 500000 rows or more skip the comparison entirely.
    if not nrows < 500000:
        return

    # Hyper-parameters common to both estimators.
    shared = dict(
        learning_rate=lrate,
        eta0=0.005,
        fit_intercept=True,
        tol=0.0,
        penalty=penalty,
    )

    cuml_model = cumlMBSGRegressor(epochs=100, batch_size=2, **shared)
    cuml_model.fit(X_train, y_train)
    cuml_pred = cuml_model.predict(X_test)
    cuml_r2 = r2_score(
        cp.asnumpy(cuml_pred),
        cp.asnumpy(y_test),
        convert_dtype=datatype,
    )

    # The reference model is fed arrays converted with cp.asnumpy.
    sklearn_model = SGDRegressor(max_iter=100, random_state=0, **shared)
    sklearn_model.fit(cp.asnumpy(X_train), cp.asnumpy(y_train).ravel())
    sklearn_pred = sklearn_model.predict(cp.asnumpy(X_test))
    sklearn_r2 = r2_score(
        sklearn_pred,
        cp.asnumpy(y_test),
        convert_dtype=datatype,
    )

    assert abs(cuml_r2 - sklearn_r2) <= 0.02
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows,
                         column_info):
    """Compare tuned ``cumlMBSGRegressor`` and ``SGDRegressor`` R2 scores.

    Data comes from ``make_regression`` using ``nrows`` and ``column_info``
    (unpacked as ``(ncols, n_info)``), is cast to ``datatype`` and split
    80/20 with a fixed seed.  Both estimators receive the same ``lrate``
    schedule and ``penalty``; their R2 scores on the held-out split must
    differ by at most 0.02.  ``input_type`` is accepted but not used in
    the body.
    """
    n_features, n_informative = column_info
    features, targets = make_regression(
        n_samples=nrows,
        n_features=n_features,
        n_informative=n_informative,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features.astype(datatype),
        targets.astype(datatype),
        train_size=0.8,
        random_state=0,
    )

    # Hyper-parameters common to both estimators.
    shared = dict(
        learning_rate=lrate,
        eta0=0.005,
        fit_intercept=True,
        tol=0.0,
        penalty=penalty,
    )

    cuml_model = cumlMBSGRegressor(epochs=100, batch_size=2, **shared)
    cuml_model.fit(X_train, y_train)
    cuml_pred = cuml_model.predict(X_test).to_array()

    sklearn_model = SGDRegressor(max_iter=100, random_state=0, **shared)
    sklearn_model.fit(X_train, y_train)
    sklearn_pred = sklearn_model.predict(X_test)

    cuml_r2 = r2_score(cuml_pred, y_test, convert_dtype=datatype)
    sklearn_r2 = r2_score(sklearn_pred, y_test, convert_dtype=datatype)

    assert abs(cuml_r2 - sklearn_r2) <= 0.02
def test_mbsgd_regressor(datatype, lrate, input_type, penalty, nrows, ncols):
    """Compare tuned ``cumlMBSGRegressor`` and ``SGDRegressor`` R2 scores.

    A regression problem with ``nrows`` samples and ``ncols`` features is
    generated with a fixed seed.  The first 80% of the rows are used for
    training and the rest for evaluation, all cast to ``datatype``.  Both
    estimators are configured with the same ``lrate`` schedule and
    ``penalty``, and their R2 scores on the evaluation rows must be within
    0.02 of each other.  ``input_type`` is accepted but not used in the
    body.
    """
    train_rows = int(nrows * 0.8)
    X, y = make_regression(n_samples=nrows, n_features=ncols, random_state=0)

    # Positional split: leading rows train, trailing rows evaluate.
    X_train = np.array(X[:train_rows, :], dtype=datatype)
    X_test = np.array(X[train_rows:, :], dtype=datatype)
    y_train = np.array(y[:train_rows], dtype=datatype)
    y_test = np.array(y[train_rows:], dtype=datatype)

    cu_mbsgd_regressor = cumlMBSGRegressor(
        learning_rate=lrate,
        eta0=0.005,
        epochs=100,
        fit_intercept=True,
        batch_size=2,
        tol=0.0,
        penalty=penalty,
    )
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test).to_array()

    skl_sgd_regressor = SGDRegressor(
        learning_rate=lrate,
        eta0=0.005,
        max_iter=100,
        fit_intercept=True,
        tol=0.0,
        penalty=penalty,
        random_state=0,
    )
    skl_sgd_regressor.fit(X_train, y_train)
    skl_pred = skl_sgd_regressor.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test)
    skl_r2 = r2_score(skl_pred, y_test)

    # Use the absolute gap: a signed difference would accept any result in
    # which the model under test scores arbitrarily lower than the
    # reference, because the difference is then negative.
    assert abs(cu_r2 - skl_r2) <= 0.02
def test_mbsgd_regressor_set_params():
    """Check that ``set_params`` matches passing arguments to the constructor.

    Three ``cumlMBSGRegressor`` models are fitted on the same ``y = 2 * x``
    data: one with defaults, one constructed with ``eta0=0.1`` and
    ``fit_intercept=False``, and one built with defaults and then given the
    same two values through ``set_params``.  The default model's ``coef_``
    must differ from the configured one, and the two configured models
    must produce equal ``coef_``.
    """
    features = np.linspace(0, 1, 50)
    targets = features * 2

    def _fitted_coef(estimator):
        # Fit on the shared data and hand back the learned coefficients.
        estimator.fit(features, targets)
        return estimator.coef_

    coef_default = _fitted_coef(cumlMBSGRegressor())
    coef_from_ctor = _fitted_coef(
        cumlMBSGRegressor(eta0=0.1, fit_intercept=False)
    )

    reconfigured = cumlMBSGRegressor()
    reconfigured.set_params(eta0=0.1, fit_intercept=False)
    coef_from_set_params = _fitted_coef(reconfigured)

    assert coef_default != coef_from_ctor
    assert coef_from_ctor == coef_from_set_params
def test_mbsgd_regressor_attributes():
    """Check that a fitted ``cumlMBSGRegressor`` exposes the expected attributes.

    The model is fitted with default arguments on the output of
    ``make_blobs()`` and every name in the expected list must be present
    on the fitted instance.
    """
    expected_attrs = (
        "dtype",
        "solver_model",
        "coef_",
        "intercept_",
        "l1_ratio",
        "n_cols",
        "loss",
        "eta0",
        "batch_size",
        "epochs",
    )

    X, y = make_blobs()
    model = cumlMBSGRegressor()
    model.fit(X, y)

    # Stops at the first missing attribute, like the per-attribute assert.
    assert all(hasattr(model, name) for name in expected_attrs)
def test_mbsgd_regressor_default(make_dataset):
    """Check that a near-default ``cumlMBSGRegressor`` reaches R2 above 0.9.

    ``make_dataset`` is unpacked as
    ``(nrows, datatype, X_train, X_test, y_train, y_test)``.  The only
    non-default argument is ``batch_size``, set to one hundredth of
    ``nrows``.  Predictions and targets are converted with ``cp.asnumpy``
    before scoring.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    # Floor division keeps batch_size an integer; true division ("/")
    # would pass a float such as 5.0.
    cu_mbsgd_regressor = cumlMBSGRegressor(batch_size=nrows // 100)
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)

    cu_r2 = r2_score(
        cp.asnumpy(cu_pred),
        cp.asnumpy(y_test),
        convert_dtype=datatype,
    )

    assert cu_r2 > 0.9
def test_mbsgd_regressor_default(make_dataset):
    """Compare default ``cumlMBSGRegressor`` and ``SGDRegressor`` R2 scores.

    ``make_dataset`` is unpacked as
    ``(nrows, datatype, X_train, X_test, y_train, y_test)``.  The model
    under test is always fitted and scored, but the comparison against the
    reference model (and therefore the only assertion) runs only when
    ``nrows`` is below 500000.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    cuml_model = cumlMBSGRegressor()
    cuml_model.fit(X_train, y_train)
    cuml_pred = cuml_model.predict(X_test)
    cuml_r2 = r2_score(cuml_pred, y_test, convert_dtype=datatype)

    # Datasets of 500000 rows or more skip the reference comparison.
    if not nrows < 500000:
        return

    sklearn_model = SGDRegressor()
    sklearn_model.fit(X_train, y_train)
    sklearn_pred = sklearn_model.predict(X_test)
    sklearn_r2 = r2_score(sklearn_pred, y_test, convert_dtype=datatype)

    assert abs(cuml_r2 - sklearn_r2) <= 0.02
def test_mbsgd_regressor(lrate, penalty, make_dataset):
    """Check that a tuned ``cumlMBSGRegressor`` reaches R2 of at least 0.9.

    ``make_dataset`` is unpacked as
    ``(nrows, datatype, X_train, X_test, y_train, y_test)``.  The model is
    configured with the ``lrate`` schedule, the ``penalty`` and a
    ``batch_size`` of one hundredth of ``nrows``, then fitted and scored on
    the held-out split.
    """
    nrows, datatype, X_train, X_test, y_train, y_test = make_dataset

    cu_mbsgd_regressor = cumlMBSGRegressor(
        learning_rate=lrate,
        eta0=0.005,
        epochs=100,
        fit_intercept=True,
        # Floor division keeps batch_size an integer; true division ("/")
        # would pass a float such as 5.0.
        batch_size=nrows // 100,
        tol=0.0,
        penalty=penalty,
    )
    cu_mbsgd_regressor.fit(X_train, y_train)
    cu_pred = cu_mbsgd_regressor.predict(X_test)

    cu_r2 = r2_score(cu_pred, y_test, convert_dtype=datatype)

    assert cu_r2 >= 0.9
def test_mbsgd_regressor_default(datatype, nrows, column_info):
    """Compare default ``cumlMBSGRegressor`` and ``SGDRegressor`` R2 scores.

    Data comes from ``make_regression`` using ``nrows`` and ``column_info``
    (unpacked as ``(ncols, n_info)``), is cast to ``datatype`` and split
    80/20 with a fixed seed.  The model under test is always fitted and
    scored, but the comparison against the reference model (and therefore
    the only assertion) runs only when ``nrows`` is below 500000.
    """
    n_features, n_informative = column_info
    features, targets = make_regression(
        n_samples=nrows,
        n_features=n_features,
        n_informative=n_informative,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features.astype(datatype),
        targets.astype(datatype),
        train_size=0.8,
        random_state=0,
    )

    cuml_model = cumlMBSGRegressor()
    cuml_model.fit(X_train, y_train)
    cuml_pred = cuml_model.predict(X_test).to_array()
    cuml_r2 = r2_score(cuml_pred, y_test, convert_dtype=datatype)

    # Datasets of 500000 rows or more skip the reference comparison.
    if not nrows < 500000:
        return

    sklearn_model = SGDRegressor()
    sklearn_model.fit(X_train, y_train)
    sklearn_pred = sklearn_model.predict(X_test)
    sklearn_r2 = r2_score(sklearn_pred, y_test, convert_dtype=datatype)

    assert abs(cuml_r2 - sklearn_r2) <= 0.02