def test_linear_regression_model(datatype, algorithm, nrows, column_info):
    """Compare cuML OLS predictions against scikit-learn on synthetic data.

    Fits cuLinearRegression on a generated regression dataset and, for
    dataset sizes where sklearn is still practical (< 500k rows), checks
    the predictions agree within 1e-1.
    """
    if algorithm == "svd" and nrows > 46340:
        # Bug fix: the adjacent string literals previously concatenated
        # without spaces, yielding "morethan" and "version10.x".
        pytest.skip(
            "svd solver is not supported for the data that has more "
            "than 46340 rows or columns if you are using CUDA version "
            "10.x"
        )
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # Initialization of cuML's linear regression model
    cuols = cuLinearRegression(fit_intercept=True, normalize=False,
                               algorithm=algorithm)

    # fit and predict cuml linear regression model
    cuols.fit(X_train, y_train)
    cuols_predict = cuols.predict(X_test)

    if nrows < 500000:
        # sklearn linear regression model initialization, fit and predict
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)
        skols_predict = skols.predict(X_test)
        assert array_equal(skols_predict, cuols_predict, 1e-1,
                           with_sign=True)
def test_ols(datatype, X_type, y_type, algorithm):
    """Check cuML OLS matches sklearn on a tiny hand-built dataset.

    The target is an exact linear combination of the features, so both
    implementations should recover it; predictions must agree to 1e-3.
    Accepts the input either as a cudf DataFrame or a numpy ndarray.

    Fix: removed leftover debug print() calls (including one on
    `cuols.gdf_datatype`) and commented-out code from the test body.
    """
    X = np.array([[2.0, 5.0], [6.0, 9.0], [2.0, 2.0], [2.0, 3.0]],
                 dtype=datatype)
    # y = X @ [5, 10] exactly — a noiseless linear relationship
    y = np.dot(X, np.array([5.0, 10.0]).astype(datatype))
    pred_data = np.array([[3.0, 5.0], [2.0, 5.0]]).astype(datatype)

    skols = skLinearRegression(fit_intercept=True, normalize=False)
    skols.fit(X, y)

    cuols = cuLinearRegression(fit_intercept=True, normalize=False,
                               algorithm=algorithm)

    if X_type == 'dataframe':
        # Same values as X, fed through cudf columns
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([2, 6, 2, 2], dtype=datatype)
        gdf['1'] = np.asarray([5, 9, 2, 3], dtype=datatype)
        cuols.fit(gdf, y)
    elif X_type == 'ndarray':
        cuols.fit(X, y)

    sk_predict = skols.predict(pred_data)
    cu_predict = cuols.predict(pred_data).to_array()

    assert array_equal(sk_predict, cu_predict, 1e-3, with_sign=True)
def test_score_matches_sklearn_performance(self):
    """The model's R^2 score should match scikit-learn's to one decimal."""
    reference = skLinearRegression()
    reference.fit(self.X_train, self.Y_train)
    expected = reference.score(self.X_test, self.Y_test)

    actual = self.test_LR.score(self.X_test, self.Y_test)
    np.testing.assert_almost_equal(expected, actual, decimal=1)
def test_weighted_linear_regression(datatype, algorithm, fit_intercept,
                                    normalize, distribution):
    """Compare cuML and sklearn OLS when per-sample weights are supplied.

    Weights are drawn from a uniform, exponential, or lognormal
    distribution; predictions from the two fitted models must agree
    within 1e-1.
    """
    nrows, ncols, n_info = 1000, 20, 10
    max_weight = 10
    noise = 20
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info, noise=noise
    )

    # Draw one weight per training sample from the requested distribution
    n_samples = len(X_train)
    if distribution == "uniform":
        # integers in [1, max_weight)
        wt = np.random.randint(1, high=max_weight, size=n_samples)
    elif distribution == "exponential":
        wt = np.random.exponential(size=n_samples)
    else:
        wt = np.random.lognormal(size=n_samples)

    # cuML model: fit with sample weights, then predict
    cuols = cuLinearRegression(fit_intercept=fit_intercept,
                               normalize=normalize,
                               algorithm=algorithm)
    cuols.fit(X_train, y_train, sample_weight=wt)
    cuols_predict = cuols.predict(X_test)

    # Reference sklearn model with identical settings and weights
    skols = skLinearRegression(fit_intercept=fit_intercept,
                               normalize=normalize)
    skols.fit(X_train, y_train, sample_weight=wt)
    skols_predict = skols.predict(X_test)

    assert array_equal(skols_predict, cuols_predict, 1e-1, with_sign=True)
def test_linear_regression_model(datatype, algorithm, nrows, column_info):
    """cuML OLS predictions should match sklearn's within 1e-1.

    The sklearn comparison is skipped for very large datasets
    (>= 500k rows) where fitting it would be too slow.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_regression_dataset(
        datatype, nrows, ncols, n_info
    )

    # Fit the cuML model and predict on the held-out split
    cu_model = cuLinearRegression(fit_intercept=True, normalize=False,
                                  algorithm=algorithm)
    cu_model.fit(X_train, y_train)
    cu_preds = cu_model.predict(X_test)

    if nrows < 500000:
        # Reference predictions from sklearn on the same data
        sk_model = skLinearRegression(fit_intercept=True, normalize=False)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        assert array_equal(sk_preds, cu_preds, 1e-1, with_sign=True)
def test_LinearRegression_fit_intercept():
    """With fit_intercept=False, coefficients should match scikit-learn's."""
    np.random.seed(0)
    features = np.random.random((10, 1))
    targets = np.random.random(10)

    ours = LinearRegression(fit_intercept=False).fit(features, targets)
    reference = skLinearRegression(fit_intercept=False).fit(features, targets)

    assert_allclose(ours.coef_, reference.coef_)
def test_score_matches_sklearn_performance(self):
    """The ADVI model's score should match scikit-learn's to one decimal.

    Fix: removed a stray debug print('') left at the top of the test.
    """
    sklr = skLinearRegression()
    sklr.fit(self.X_train, self.y_train)
    sklr_score = sklr.score(self.X_test, self.y_test)

    self.advi_lr.fit(self.X_train, self.y_train)
    score = self.advi_lr.score(self.X_test, self.y_test)
    npt.assert_almost_equal(sklr_score, score, decimal=1)
def test_linear_regression_model_default(datatype):
    """Default-constructed cuML OLS should agree with default sklearn OLS."""
    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    # cuML model with all-default hyperparameters
    cu_model = cuLinearRegression()
    cu_model.fit(X_train, y_train)
    cu_preds = cu_model.predict(X_test)

    # sklearn reference, also with defaults
    sk_model = skLinearRegression()
    sk_model.fit(X_train, y_train)
    sk_preds = sk_model.predict(X_test)

    assert array_equal(sk_preds, cu_preds, 1e-1, with_sign=True)
def test_LinearRegression_err():
    """
    Test that errors are correctly accounted for
    By comparing to scikit-learn LinearRegression

    Fitting with per-point error dy should be equivalent to an unweighted
    sklearn fit on the data scaled by 1/dy.

    Fix: removed the unused local `X_fit` (a deterministic linspace that
    consumed no RNG state and was never referenced).
    """
    np.random.seed(0)
    X = np.random.random((10, 1))
    y = np.random.random(10) + 1
    dy = 0.1

    y = np.random.normal(y, dy)

    clf1 = LinearRegression().fit(X, y, dy)
    # Equivalent unweighted fit on error-scaled data
    clf2 = skLinearRegression().fit(X / dy, y / dy)

    # coef_[0] holds the intercept in this implementation; the sklearn
    # intercept was computed on scaled data, so scale it back by dy.
    assert_allclose(clf1.coef_[1:], clf2.coef_)
    assert_allclose(clf1.coef_[0], clf2.intercept_ * dy)
def demo_helper(X, Y, learning_rate, iterations, title, x_label, y_label,
                x_lim=None, y_lim=None):
    """Fit both the custom ML.py and scikit-learn linear regressors on
    (X, Y), plot the raw data and each model's regression line, show a
    table of error metrics for each, and print the custom model's
    learned slope/intercept.

    Parameters:
        X, Y: feature and target arrays for the demo dataset.
        learning_rate, iterations: hyperparameters for the custom model.
        title, x_label, y_label: plot labeling strings.
        x_lim, y_lim: optional axis ranges forwarded to the line plots.
    """
    # Splitting the Dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.33, random_state=101)

    # Custom ML linear regression module
    ml_regressor = LinearRegression(learning_rate=learning_rate,
                                    iterations=iterations)

    # scikit-learn linear regression module
    scikit_regressor = skLinearRegression()

    # Perform linear regression using both scikit-learn and ML.py library
    ml_regressor.fit(X_train, y_train)  # custom version
    scikit_regressor.fit(X_train, y_train)  # scikit-learn version

    # Make predictions
    mlY_pred = ml_regressor.predict(X_test)
    skY_pred = scikit_regressor.predict(X_test)

    # Plot the X, Y data
    plt.scatter(X, Y)
    plt.title(title + ": Raw Data")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

    # Plot the X, Y_pred data
    custom_label = title + ": ML.py Linear Regression Module\nLearning Rate: " + \
        str(learning_rate) + ", Iterations: " + str(iterations)
    plot_regression_line(mlY_pred, X_test, y_test, x_label, y_label,
                         custom_label, x_range=x_lim, y_range=y_lim)
    plot_regression_line(skY_pred, X_test, y_test, x_label, y_label,
                         title + ": scikit-learn Linear Regression Module",
                         x_range=x_lim, y_range=y_lim)

    # Evaluate Each Model's Performance (MAE, MSE, RMSE for each model)
    d_ml = {
        'Metrics': ['Mean Absolute Error:', 'Mean Squared Error:',
                    'Mean Root Squared Error:'],
        'Values': [mean_absolute_error(y_test, mlY_pred),
                   mean_squared_error(y_test, mlY_pred),
                   np.sqrt(mean_squared_error(y_test, mlY_pred))]
    }
    d_sk = {
        'Metrics': ['Mean Absolute Error:', 'Mean Squared Error:',
                    'Mean Root Squared Error:'],
        'Values': [mean_absolute_error(y_test, skY_pred),
                   mean_squared_error(y_test, skY_pred),
                   np.sqrt(mean_squared_error(y_test, skY_pred))]
    }
    df_ml = pd.DataFrame(data=d_ml)
    df_sk = pd.DataFrame(data=d_sk)

    # Render the two metric tables as matplotlib table artists,
    # one subplot per model, with the plot axes hidden.
    fig, ax = plt.subplots(2)
    cell_text = []
    for row in range(len(df_ml)):
        cell_text.append(df_ml.iloc[row])
    ax[0].set_title(title + ": ML.py Linear Regression Module\nLearning Rate: " +
                    str(learning_rate) + ", Iterations: " + str(iterations))
    ax[0].table(cellText=cell_text, colLabels=df_ml.columns, loc='center')
    ax[0].axis(False)
    cell_text = []
    for row in range(len(df_sk)):
        cell_text.append(df_sk.iloc[row])
    ax[1].set_title(title + ": scikit-learn Linear Regression Module")
    ax[1].table(cellText=cell_text, colLabels=df_sk.columns, loc='center')
    ax[1].axis(False)
    plt.show()

    # Report the custom model's learned parameters
    print("ML.py Linear Regression Weights for y = mx + b (" + title + "):")
    print("Slope:", ml_regressor.slope)
    print("Intercept:", ml_regressor.intercept, "\n")
[1, 4, 5, 16, 25, 64, 125]]) result = myLinearRegression._perform_change_of_basis( test_array1, 3, True) if not np.array_equal(result, test_array2): errs += "\nLinear Regression: Failed to add polynomial features" dataset = load_dataset('basisData.pkl') X = dataset['X'] y = dataset['y'] Xtest = dataset['Xtest'] ytest = dataset['ytest'] print(" Testing least squares no regularization") model_mine = myLinearRegression('MSE', 1, False, None) model_mine.fit(X, y) model_sklearn = skLinearRegression(False) model_sklearn.fit(X, y) y_pred_mine = model_mine.predict(X) y_pred_sk = model_sklearn.predict(X) if not np.array_equal(y_pred_mine, y_pred_sk): errs += "\nLinear Regression: did not match sklearn for Least Squares no regularization" y_pred_mine = model_mine.predict(X, round=True) y_pred_sk = np.round(model_sklearn.predict(X), 0) if not np.array_equal(y_pred_mine, y_pred_sk): errs += "\nLinear Regression: did not match sklearn for Least Squares no regularization (rounded)" # print(" Testing least squares with L2 regularization") # model_mine = myLinearRegression('MSE', 1, False, 'L2', 1, 1000) # model_mine.fit(X, y) # model_sklearn = Ridge(fit_intercept=False) # model_sklearn.fit(X, y)
def test_linear_models(datatype, X_type, y_type, algorithm, nrows, ncols,
                       n_info):
    """Compare cuML linear and ridge regression against scikit-learn.

    Generates a regression dataset, fits cuLinearRegression and cuRidge
    (with the given solver), accepting the input either as cudf
    DataFrames or numpy ndarrays, and — for nrows < 500k — checks
    predictions against sklearn's LinearRegression and Ridge within 1e-1.
    """
    # 80/20 train/test split by row index
    train_rows = np.int32(nrows * 0.8)
    X, y = make_regression(n_samples=(nrows), n_features=ncols,
                           n_informative=n_info, random_state=0)
    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # Initialization of cuML's linear and ridge regression models
    cuols = cuLinearRegression(fit_intercept=True, normalize=False,
                               algorithm=algorithm)
    curidge = cuRidge(fit_intercept=False, normalize=False,
                      solver=algorithm)

    if X_type == 'dataframe':
        # Convert the numpy splits to pandas, then to cudf; the label
        # column is extracted back out as a cudf Series for fitting.
        y_train = pd.DataFrame({'labels': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i] for i in range(X_train.shape[1])})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i] for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        y_cudf = y_train.values
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)

        # fit and predict cuml linear regression model
        cuols.fit(X_cudf, y_cudf)
        cuols_predict = cuols.predict(X_cudf_test).to_array()

        # fit and predict cuml ridge regression model
        curidge.fit(X_cudf, y_cudf)
        curidge_predict = curidge.predict(X_cudf_test).to_array()
    elif X_type == 'ndarray':
        # fit and predict cuml linear regression model
        cuols.fit(X_train, y_train)
        cuols_predict = cuols.predict(X_test).to_array()

        # fit and predict cuml ridge regression model
        curidge.fit(X_train, y_train)
        curidge_predict = curidge.predict(X_test).to_array()

    if nrows < 500000:
        # sklearn linear and ridge regression model initialization and fit
        # NOTE(review): in the 'dataframe' branch X_train/y_train are now
        # pandas objects; sklearn accepts those, so the fits below work
        # for both input types.
        skols = skLinearRegression(fit_intercept=True, normalize=False)
        skols.fit(X_train, y_train)
        skridge = skRidge(fit_intercept=False, normalize=False)
        skridge.fit(X_train, y_train)

        skols_predict = skols.predict(X_test)
        skridge_predict = skridge.predict(X_test)

        assert array_equal(skols_predict, cuols_predict, 1e-1,
                           with_sign=True)
        assert array_equal(skridge_predict, curidge_predict, 1e-1,
                           with_sign=True)
def PolynomialRegression(degree=3, **kwargs):
    """Build a pipeline: polynomial feature expansion followed by OLS.

    Extra keyword arguments are forwarded to the LinearRegression step.
    """
    expand = PolynomialFeatures(degree)
    regress = skLinearRegression(**kwargs)
    return make_pipeline(expand, regress)