def test_logistic_regression_predict_proba(dtype, nrows, column_info,
                                           num_classes, fit_intercept):
    """Compare ``cuLog`` and ``skLog`` probabilities for identical weights.

    A ``cuLog`` model is fitted, its learned coefficients (and intercept,
    when enabled) are copied into an unfitted ``skLog`` instance, and the
    ``predict_proba`` / ``predict_log_proba`` outputs of both models on the
    test split are compared with ``array_equal``.

    Parameters
    ----------
    dtype
        Forwarded as ``datatype`` to ``make_classification_dataset`` and used
        to cast the label arrays.
    nrows
        Number of rows requested from ``make_classification_dataset``.
    column_info
        Pair ``(ncols, n_info)`` forwarded to ``make_classification_dataset``.
    num_classes
        Number of classes; values above 2 select the multinomial ``skLog``
        configuration.
    fit_intercept
        Whether both models use an intercept term.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = \
        make_classification_dataset(datatype=dtype, nrows=nrows,
                                    ncols=ncols, n_info=n_info,
                                    num_classes=num_classes)
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept)
    culog.fit(X_train, y_train)

    if num_classes > 2:
        sklog = skLog(fit_intercept=fit_intercept, solver="lbfgs",
                      multi_class="multinomial")
    else:
        sklog = skLog(fit_intercept=fit_intercept)

    # The skLog model is never fitted: it is populated with the parameters
    # learned by culog so that only the prediction code paths are compared.
    sklog.coef_ = culog.coef_.copy_to_host().T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_.copy_to_host()
    else:
        # Assign on the instance. Writing to the skLog class instead would
        # leak a class-level ``intercept_`` into every other skLog object.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test).get()
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test).get()
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
def test_logistic_regression(num_classes, dtype, penalty, l1_ratio,
                             fit_intercept, nrows, column_info, C, tol):
    """Check that ``cuLog`` scores no worse than ``skLog`` minus a tolerance.

    Both models are fitted on the same training split and their ``score``
    on the test split is compared; ``cuLog`` may trail ``skLog`` by at most
    0.06.

    Parameters
    ----------
    num_classes
        Number of classes requested from ``make_classification_dataset``.
    dtype
        Forwarded as ``datatype`` to ``make_classification_dataset`` and used
        to cast the label arrays.
    penalty, l1_ratio, C, fit_intercept, tol
        Hyper-parameters forwarded to the models (``tol`` and ``l1_ratio``
        only where the corresponding constructor call passes them).
    nrows
        Number of rows requested from ``make_classification_dataset``.
    column_info
        Pair ``(ncols, n_info)`` forwarded to ``make_classification_dataset``.
    """
    # NOTE: pytest.xfail stops the test immediately, so for 'l1' and
    # 'elasticnet' nothing below this line is executed.
    if penalty in ['l1', 'elasticnet']:
        pytest.xfail("OWL numerical stability is being improved")

    ncols, n_info = column_info

    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0")
    if not sk_check and penalty == 'elasticnet':
        # The two literals used to be concatenated without a separating
        # space ("withelastic net."), and the message said "> 0.21" while
        # the check above is ">= 0.21.0".
        pytest.skip("Need sklearn >= 0.21 for testing logistic with "
                    "elastic net.")

    X_train, X_test, y_train, y_test = \
        make_classification_dataset(datatype=dtype, nrows=nrows,
                                    ncols=ncols, n_info=n_info,
                                    num_classes=num_classes)
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(penalty=penalty, l1_ratio=l1_ratio, C=C,
                  fit_intercept=fit_intercept, tol=tol, verbose=0)
    culog.fit(X_train, y_train)

    # Only solver=saga supports elasticnet in scikit
    if penalty in ['elasticnet', 'l1']:
        if sk_check:
            sklog = skLog(penalty=penalty, l1_ratio=l1_ratio, solver='saga',
                          C=C, fit_intercept=fit_intercept,
                          multi_class='auto')
        else:
            # Older versions are called without the l1_ratio argument.
            sklog = skLog(penalty=penalty, solver='saga', C=C,
                          fit_intercept=fit_intercept, multi_class='auto')
    else:
        sklog = skLog(penalty=penalty, solver='lbfgs', C=C,
                      fit_intercept=fit_intercept, multi_class='auto')

    sklog.fit(X_train, y_train)

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - 0.06
def test_logistic_regression_decision_function(
    dtype, nrows, column_info, num_classes, fit_intercept
):
    """Compare ``decision_function`` of ``cuLog`` and ``skLog``.

    A ``cuLog`` model is fitted, its learned coefficients (and intercept,
    when enabled) are copied into an unfitted ``skLog`` instance, and the
    decision-function values of both models on the test split are compared
    with ``array_equal``.

    Parameters
    ----------
    dtype
        Forwarded as ``datatype`` to ``make_classification_dataset`` and used
        to cast the label arrays.
    nrows
        Number of rows requested from ``make_classification_dataset``.
    column_info
        Pair ``(ncols, n_info)`` forwarded to ``make_classification_dataset``.
    num_classes
        Number of classes; for more than two classes the ``cuLog`` result is
        transposed before the comparison.
    fit_intercept
        Whether both models use an intercept term.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols, n_info=n_info,
        num_classes=num_classes
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    # The skLog model is never fitted: it is populated with the parameters
    # learned by culog so that only the decision function is compared.
    sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # Assign on the instance. Writing to the skLog class instead would
        # leak a class-level ``intercept_`` into every other skLog object.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_dec_func = culog.decision_function(X_test)
    if num_classes > 2:
        # Transposed so the layout lines up with the skLog output.
        cu_dec_func = cu_dec_func.T
    sk_dec_func = sklog.decision_function(X_test)

    assert array_equal(cu_dec_func, sk_dec_func)
def test_logistic_regression_model_default(dtype):
    """Fit default-constructed models and compare their test scores.

    ``cuLog`` is built with no arguments and ``skLog`` with
    ``multi_class="auto"``; both are fitted on the training split returned
    by ``small_classification_dataset(dtype)``. The ``cuLog`` test score may
    fall at most 0.022 below the ``skLog`` test score.

    Parameters
    ----------
    dtype
        Passed to ``small_classification_dataset`` and used to cast the
        label arrays.
    """
    features_train, features_test, labels_train, labels_test = \
        small_classification_dataset(dtype)
    labels_train = labels_train.astype(dtype)
    labels_test = labels_test.astype(dtype)

    cu_model = cuLog()
    cu_model.fit(features_train, labels_train)

    sk_model = skLog(multi_class="auto")
    sk_model.fit(features_train, labels_train)

    cu_score = cu_model.score(features_test, labels_test)
    sk_score = sk_model.score(features_test, labels_test)
    assert cu_score >= sk_score - 0.022
def test_logistic_regression_predict_proba(
    dtype, nrows, column_info, num_classes, fit_intercept, sparse_input
):
    """Compare ``cuLog`` and ``skLog`` probabilities for identical weights.

    A ``cuLog`` model is fitted (optionally on ``csr_matrix`` inputs), its
    learned coefficients (and intercept, when enabled) are copied into an
    unfitted ``skLog`` instance, and the ``predict_proba`` /
    ``predict_log_proba`` outputs of both models on the test split are
    compared with ``array_equal``.

    Parameters
    ----------
    dtype
        Forwarded as ``datatype`` to ``make_classification_dataset`` and used
        to cast the label arrays.
    nrows
        Number of rows requested from ``make_classification_dataset``.
    column_info
        Pair ``(ncols, n_info)`` forwarded to ``make_classification_dataset``.
    num_classes
        Number of classes; values above 2 select the multinomial ``skLog``
        configuration.
    fit_intercept
        Whether both models use an intercept term.
    sparse_input
        When truthy, the train and test feature matrices are wrapped in
        ``csr_matrix`` before being used.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols, n_info=n_info,
        num_classes=num_classes
    )
    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    if num_classes > 2:
        sklog = skLog(
            fit_intercept=fit_intercept, solver="lbfgs",
            multi_class="multinomial"
        )
    else:
        sklog = skLog(fit_intercept=fit_intercept)

    # The skLog model is never fitted: it is populated with the parameters
    # learned by culog so that only the prediction code paths are compared.
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # Assign on the instance. Writing to the skLog class instead would
        # leak a class-level ``intercept_`` into every other skLog object.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test)
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test)
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
def test_logistic_regression_weighting(regression_dataset, option,
                                       test_status):
    """Compare weighted ``cuLog`` and ``skLog`` fits on the same dataset.

    Both models are fitted without an intercept using the weighting scheme
    selected by ``option``. Their normalised coefficients and their
    predictions on the training data are then compared with ``array_equal``
    under tolerances that depend on the regression type.

    Parameters
    ----------
    regression_dataset
        Indexable by ``test_status``; each entry unpacks to
        ``(regression_type, data, coef, output)``.
    option
        ``'sample_weight'`` draws random per-sample weights,
        ``'class_weight'`` draws random weights for classes 0 and 1,
        ``'balanced'`` uses ``class_weight='balanced'``; any other value
        leaves both weightings as ``None``.
    test_status
        Key selecting the dataset entry from ``regression_dataset``.

    Raises
    ------
    ValueError
        If ``regression_type`` is neither ``'binary'`` nor starts with
        ``'multiclass'`` (previously this surfaced as a ``NameError`` on the
        unset tolerances).
    """
    regression_type, data, coef, output = regression_dataset[test_status]

    class_weight = None
    sample_weight = None
    if option == 'sample_weight':
        n_samples = data.shape[0]
        sample_weight = np.abs(np.random.rand(n_samples))
    elif option == 'class_weight':
        class_weight = np.random.rand(2)
        class_weight = {0: class_weight[0], 1: class_weight[1]}
    elif option == 'balanced':
        class_weight = 'balanced'

    culog = cuLog(fit_intercept=False, class_weight=class_weight)
    culog.fit(data, output, sample_weight=sample_weight)

    sklog = skLog(fit_intercept=False, class_weight=class_weight)
    sklog.fit(data, output, sample_weight=sample_weight)

    skcoef = np.squeeze(sklog.coef_)
    cucoef = np.squeeze(culog.coef_)

    # np.squeeze and .T return views, so an in-place ``/=`` here would
    # rewrite sklog.coef_ (and possibly culog's coefficients) before
    # predict() is called below. Normalise into new arrays instead.
    if regression_type == 'binary':
        skcoef = skcoef / np.linalg.norm(skcoef)
        cucoef = cucoef / np.linalg.norm(cucoef)
        unit_tol = 0.04
        total_tol = 0.08
    elif regression_type.startswith('multiclass'):
        skcoef = skcoef.T
        skcoef = skcoef / np.linalg.norm(skcoef, axis=1)[:, None]
        cucoef = cucoef / np.linalg.norm(cucoef, axis=1)[:, None]
        unit_tol = 0.2
        total_tol = 0.3
    else:
        raise ValueError(
            "Unexpected regression type: {!r}".format(regression_type)
        )

    equality = array_equal(skcoef, cucoef,
                           unit_tol=unit_tol, total_tol=total_tol)
    if not equality:
        # Dump the reference and cuLog coefficients to help diagnose the
        # failure before the assertion fires.
        print('\ncoef.shape: ', coef.shape)
        print('coef:\n', coef)
        print('cucoef.shape: ', cucoef.shape)
        print('cucoef:\n', cucoef)
    assert equality

    cuOut = culog.predict(data)
    skOut = sklog.predict(data)
    assert array_equal(skOut, cuOut, unit_tol=unit_tol, total_tol=total_tol)
def test_logistic_regression(
    num_classes, dtype, penalty, l1_ratio, fit_intercept, nrows,
    column_info, C, tol
):
    """Check ``cuLog`` train/test scores against ``skLog`` within tolerance.

    Both models are fitted on the same training split. The ``cuLog`` score
    may trail the ``skLog`` score by at most ``tol_train`` on the training
    data and ``tol_test`` on the test data, and the ``cuLog`` predictions
    must contain as many distinct values as ``y_test``.

    Parameters
    ----------
    num_classes
        Number of classes requested from ``make_classification_dataset``.
    dtype
        Forwarded as ``datatype`` to ``make_classification_dataset`` and used
        to cast the label arrays.
    penalty, l1_ratio, C, fit_intercept, tol
        Hyper-parameters forwarded to the models (``tol`` and ``l1_ratio``
        only where the corresponding constructor call passes them).
    nrows
        Number of rows requested from ``make_classification_dataset``.
    column_info
        Pair ``(ncols, n_info)`` forwarded to ``make_classification_dataset``.
    """
    ncols, n_info = column_info

    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0")
    if not sk_check and penalty == "elasticnet":
        # The two literals used to be concatenated without a separating
        # space ("withelastic net."), and the message said "> 0.21" while
        # the check above is ">= 0.21.0".
        pytest.skip(
            "Need sklearn >= 0.21 for testing logistic with "
            "elastic net."
        )

    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols, n_info=n_info,
        num_classes=num_classes
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(
        penalty=penalty, l1_ratio=l1_ratio, C=C,
        fit_intercept=fit_intercept, tol=tol
    )
    culog.fit(X_train, y_train)

    # Only solver=saga supports elasticnet in scikit
    if penalty in ["elasticnet", "l1"]:
        if sk_check:
            sklog = skLog(
                penalty=penalty,
                l1_ratio=l1_ratio,
                solver="saga",
                C=C,
                fit_intercept=fit_intercept,
                multi_class="auto",
            )
        else:
            # Older versions are called without the l1_ratio argument.
            sklog = skLog(
                penalty=penalty,
                solver="saga",
                C=C,
                fit_intercept=fit_intercept,
                multi_class="auto",
            )
    else:
        sklog = skLog(
            penalty=penalty,
            solver="lbfgs",
            C=C,
            fit_intercept=fit_intercept,
            multi_class="auto",
        )

    sklog.fit(X_train, y_train)

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    cu_preds = culog.predict(X_test)

    tol_test = 0.012
    tol_train = 0.006
    if num_classes == 10 and penalty in ["elasticnet", "l1"]:
        # The 10-class l1/elasticnet configurations get a 10x wider margin.
        tol_test *= 10
        tol_train *= 10

    assert culog.score(X_train, y_train) >= sklog.score(X_train, y_train) - \
        tol_train
    assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - \
        tol_test

    # The cuLog predictions must cover as many distinct labels as y_test.
    assert len(np.unique(cu_preds)) == len(np.unique(y_test))
def test_logistic_regression(num_classes, dtype, penalty, l1_ratio,
                             fit_intercept):
    """Compare ``cuLog`` and ``skLog`` predictions on a synthetic dataset.

    A 100000-row dataset is generated with ``make_classification`` and split
    80/20 into train and test rows. Both models are fitted on the training
    rows and the fraction of test rows on which their predictions disagree
    must stay below a penalty-dependent threshold.

    Parameters
    ----------
    num_classes
        Passed to ``make_classification`` as ``n_features``.
        NOTE(review): the name suggests a class count, but it is only used
        as the feature count here; confirm this is intended.
    dtype
        Type the feature and label arrays are cast to.
    penalty, l1_ratio, fit_intercept
        Hyper-parameters forwarded to the models (``l1_ratio`` only where
        the corresponding constructor call passes it).
    """
    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0")
    if not sk_check and penalty == 'elasticnet':
        # The two literals used to be concatenated without a separating
        # space ("withelastic net."), and the message said "> 0.21" while
        # the check above is ">= 0.21.0".
        pytest.skip("Need sklearn >= 0.21 for testing logistic with "
                    "elastic net.")

    nrows = 100000
    train_rows = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows, n_features=num_classes,
                               n_redundant=0, n_informative=2)

    X_test = np.asarray(X[train_rows:, 0:]).astype(dtype)
    X_train = np.asarray(X[0:train_rows, :]).astype(dtype)
    y_train = np.asarray(y[0:train_rows, ]).astype(dtype)

    culog = cuLog(penalty=penalty, l1_ratio=l1_ratio, C=5.0,
                  fit_intercept=fit_intercept, tol=1e-8)
    culog.fit(X_train, y_train)

    # Only solver=saga supports elasticnet in scikit
    if penalty in ['elasticnet', 'l1']:
        if sk_check:
            sklog = skLog(penalty=penalty, l1_ratio=l1_ratio, solver='saga',
                          C=5.0, fit_intercept=fit_intercept)
        else:
            sklog = skLog(penalty=penalty, solver='saga', C=5.0,
                          fit_intercept=fit_intercept)
    elif penalty == 'l2':
        sklog = skLog(penalty=penalty, solver='lbfgs', C=5.0,
                      fit_intercept=fit_intercept)
    else:
        if sk_check:
            sklog = skLog(penalty=penalty, solver='lbfgs', C=5.0,
                          fit_intercept=fit_intercept)
        else:
            # Older versions fall back to an l2 penalty with a very large
            # C (1e9) in place of the requested penalty value.
            sklog = skLog(penalty='l2', solver='lbfgs', C=1e9,
                          fit_intercept=fit_intercept)

    sklog.fit(X_train, y_train)

    preds = culog.predict(X_test)
    skpreds = sklog.predict(X_test)

    # Fraction of test rows on which the two models disagree. The
    # denominator is the real test-row count (20000 for the sizes above)
    # rather than a hard-coded constant.
    n_test = X_test.shape[0]
    mismatch_rate = np.sum(preds.to_array() != skpreds) / n_test

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    if penalty in ['elasticnet', 'l1', 'l2']:
        assert mismatch_rate < 1e-1
    else:
        # This is the only case where cuml and sklearn actually do a similar
        # lbfgs, other cases cuml does owl or sklearn does saga
        assert mismatch_rate < 1e-3