def __init__(self,
             treatment_cols,
             nusiance_cols,
             effect_estimator=LinearRegression(fit_intercept=False),
             treatment_estimator=LinearRegression(fit_intercept=False),
             y_estimator=LinearRegression(fit_intercept=False)):
    """Record the column selections and the three estimators on the instance.

    Nothing is validated, copied or fitted here: every argument is stored
    unchanged under an attribute of the same name.

    Parameters
    ----------
    treatment_cols
        Stored as ``self.treatment_cols``.
    nusiance_cols
        Stored as ``self.nusiance_cols`` (spelling kept as in the signature).
    effect_estimator, treatment_estimator, y_estimator
        Estimator objects stored as-is. Each defaults to
        ``LinearRegression(fit_intercept=False)``.
    """
    # The default estimators are built once, when this function is defined,
    # so every instance that relies on a default holds the very same object.
    self.treatment_cols = treatment_cols
    self.nusiance_cols = nusiance_cols
    self.y_estimator = y_estimator
    self.treatment_estimator = treatment_estimator
    self.effect_estimator = effect_estimator
def test_k_best_feature_selector():
    """BestKFeatureSelector(k=3) must select the three largest-effect columns."""
    np.random.seed(0)
    n_rows, n_cols = 100000, 6
    decay = .9
    X = np.random.normal(size=(n_rows, n_cols))

    # Geometrically decaying coefficients starting at 100, then shuffled and
    # reshaped into a column vector.
    coefs = 100 * np.ones(shape=n_cols)
    for pos in range(1, n_cols):
        coefs[pos] = decay * coefs[pos - 1]
    coefs = np.random.permutation(coefs)[:, None]

    noise = 0.01 * np.random.normal(size=(n_rows, 1))
    y = np.dot(X, coefs) + noise

    # Expected support: the indices of the three largest squared coefficients.
    ascending = np.ravel(np.argsort(coefs**2, axis=0))
    top_three = ascending[::-1][:3]
    expected_support = np.zeros(shape=n_cols, dtype=bool)
    expected_support[top_three] = True

    importance = UnivariateFeatureImportanceEstimatorCV(LinearRegression())
    selector = BestKFeatureSelector(importance, k=3)
    selector.fit(X, y)
    np.testing.assert_array_equal(selector.support_, expected_support)
def __init__(self,
             base_estimator: RegressorMixin = None,
             n_trees: int = 50,
             sigma_a: float = 0.001,
             sigma_b: float = 0.001,
             n_samples: int = 200,
             n_burn: int = 200,
             p_grow: float = 0.5,
             p_prune: float = 0.5,
             alpha: float = 0.95,
             beta: float = 2.):
    """Set up the base estimator and forward the hyper-parameters.

    Parameters
    ----------
    base_estimator
        Estimator to use. When given, a ``clone`` of it is stored so the
        caller's object is left untouched; when ``None``, a fresh
        ``LinearRegression()`` is used.
    n_trees, sigma_a, sigma_b, n_samples, n_burn, p_grow, p_prune, alpha, beta
        Passed unchanged, by keyword, to the parent class ``__init__``.
    """
    # Exactly one assignment per branch, so the clone can never be
    # overwritten by the original estimator.
    if base_estimator is None:
        self.base_estimator = LinearRegression()
    else:
        self.base_estimator = clone(base_estimator)
    super().__init__(n_trees=n_trees,
                     sigma_a=sigma_a,
                     sigma_b=sigma_b,
                     n_samples=n_samples,
                     n_burn=n_burn,
                     p_grow=p_grow,
                     p_prune=p_prune,
                     alpha=alpha,
                     beta=beta)
def linearRegression_sales(self):
    """Fit a linear regression of Sales on TV and Radio and plot the result.

    Reads '4.Advertising.csv' through ``self.readFile``, splits the rows
    into train and test sets (``random_state=1``), fits ``LinearRegression``
    on the training part, prints the fitted model, its coefficients and
    intercept, then the test MSE and RMSE, and finally shows a plot of the
    test targets against the predictions.
    """
    # Linear regression
    path = u'4.Advertising.csv'
    data = self.readFile(path)
    x = data[['TV', 'Radio']]
    y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    # print() calls instead of Python 2 print statements, so the function
    # parses under Python 3.
    print(model)
    print(linreg.coef_)
    print(linreg.intercept_)

    y_hat = linreg.predict(np.array(x_test))
    mse = np.average((y_hat - y_test)**2)
    rmse = np.sqrt(mse)
    print(mse, rmse)

    # Plot targets and predictions over the test-sample position.
    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
    plt.grid()
    plt.legend(loc='upper right')
    plt.show()
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
    """Fitting on a CSR matrix must match fitting on the dense array."""
    rng = check_random_state(0)
    n_samples, n_features = 200, 2

    X_dense = rng.randn(n_samples, n_features)
    X_dense[X_dense < 0.1] = 0.  # zero out every entry below 0.1
    X_csr = sparse.csr_matrix(X_dense)
    y = rng.rand(n_samples)

    settings = {'normalize': normalize, 'fit_intercept': fit_intercept}
    dense_model = LinearRegression(**settings)
    sparse_model = LinearRegression(**settings)
    dense_model.fit(X_dense, y)
    sparse_model.fit(X_csr, y)

    assert dense_model.intercept_ == pytest.approx(sparse_model.intercept_)
    assert_allclose(dense_model.coef_, sparse_model.coef_)
def test_backward_elimination_estimation():
    """Features must be eliminated in order of increasing squared coefficient."""
    np.random.seed(0)
    n_rows, n_cols = 100000, 6
    decay = .9
    X = np.random.normal(size=(n_rows, n_cols))

    # Geometrically decaying coefficients starting at 100, shuffled into a
    # random column order and reshaped to a column vector.
    coefs = 100 * np.ones(shape=n_cols)
    for pos in range(1, n_cols):
        coefs[pos] = decay * coefs[pos - 1]
    coefs = np.random.permutation(coefs)[:, None]

    noise = 0.01 * np.random.normal(size=(n_rows, 1))
    y = np.dot(X, coefs) + noise

    # Expected sequence: column indices sorted by ascending squared coefficient.
    expected_sequence = np.ravel(np.argsort(coefs**2, axis=0))

    importance = SingleEliminationFeatureImportanceEstimatorCV(
        LinearRegression())
    eliminator = BackwardEliminationEstimator(importance)
    eliminator.fit(X, y)

    np.testing.assert_array_equal(eliminator.elimination_sequence_,
                                  expected_sequence)
def __add_trend_feature(self, arr, abs_values=False):
    """Fit a line to ``arr`` against its positions and return ``coef_[0]``.

    The regressor is the position index 0..len(arr)-1 as a single column.
    When ``abs_values`` is true the regression is run on ``np.abs(arr)``.
    For a one-dimensional ``arr`` the returned value is the fitted slope.
    """
    positions = np.arange(len(arr)).reshape(-1, 1)
    values = np.abs(arr) if abs_values else arr
    trend_model = LinearRegression()
    trend_model.fit(positions, values)
    return trend_model.coef_[0]
def compare_panorama_cubic(greenery_measure="vegetation", **kwargs):
    """
    Compare the greenery computed from panoramic and from cubic pictures.

    Two TileManagers are built with identical settings apart from
    `cubic_pictures`; their `green_direct()` results are passed through
    `_remove_missing`, a linear regression of the cubic values on the
    panoramic values is fitted, its score, slope and intercept are
    printed, and scatter/greenery plots are shown.
    """
    green_kwargs = select_green_model(greenery_measure)
    panorama_tiler = TileManager(cubic_pictures=False, **kwargs,
                                 **green_kwargs)
    cubic_tiler = TileManager(cubic_pictures=True, **kwargs, **green_kwargs)

    pano_result = panorama_tiler.green_direct()
    cubic_result = cubic_tiler.green_direct()
    _remove_missing(pano_result, cubic_result)

    # Regress cubic greenery on panoramic greenery.
    pano_values = np.array(pano_result["green"]).reshape(-1, 1)
    cubic_values = np.array(cubic_result["green"])
    fit = LinearRegression().fit(pano_values, cubic_values)
    print(fit.score(pano_values, cubic_values))
    print(fit.coef_[0], fit.intercept_)

    # Scatter plot with the fitted line drawn on a fixed 0..0.8 grid.
    line_x = np.arange(0, 0.8, 0.01)
    plt.figure()
    plt.scatter(pano_result["green"], cubic_result["green"])
    plt.plot(line_x, fit.predict(line_x.reshape(-1, 1)))
    plt.xlabel("panoramas")
    plt.ylabel("cubic")
    # Axis limits are 10% above the data maximum, but never below 0.001.
    x_upper = max(0.001, max(pano_result["green"])*1.1)
    y_upper = max(0.001, max(cubic_result["green"])*1.1)
    plt.xlim(0, x_upper)
    plt.ylim(0, y_upper)

    plot_greenery(pano_result, show=False, title="panorama")
    plot_greenery(cubic_result, show=False, title="cubic")
    plt.show()
def test_predict_hdf_dataframe(self):
    """Runtime prediction on an HDF-stored dataset must equal the local one."""
    # Toy data: y = 2 * x for x in 0..9, taken as pandas Series.
    x = np.arange(10)
    frame = pd.DataFrame({'x': x, 'y': x * 2})
    X, Y = frame['x'], frame['y']

    # Store both series in Omega as HDF, with tasks executed eagerly.
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.pure_python = True
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax', as_hdf=True)
    om.datasets.put(Y, 'datay', as_hdf=True)

    # Fit and predict locally, then store the fitted model.
    local_model = LinearRegression()
    local_model.fit(reshaped(X), reshaped(Y))
    pred = local_model.predict(reshaped(X))
    om.models.put(local_model, 'mymodel2')

    # Predict through the runtime using the dataset stored above.
    pred2 = om.runtime.model('mymodel2').predict('datax').get()

    self.assertTrue((pred == pred2).all(),
                    "runtimes prediction is different(1)")
    self.assertTrue((pred == pred2).all(),
                    "runtimes prediction is different(2)")
def test_fit_pipeline(self):
    """A Pipeline fitted by the runtime must predict like a local fit."""
    # Toy data: y = 2 * x for x in 0..9, kept as single-column DataFrames.
    x = np.arange(10)
    frame = pd.DataFrame({'x': x, 'y': x * 2})
    X, Y = frame[['x']], frame[['y']]

    # Store the data in Omega, with tasks executed eagerly.
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')

    # Store the pipeline in Omega before it has been fitted.
    pipeline = Pipeline([('lr', LinearRegression())])
    om.models.put(pipeline, 'mymodel2')
    self.assertIn('mymodel2', om.models.list('*'))

    # Local reference prediction.
    pipeline.fit(reshaped(X), reshaped(Y))
    pred = pipeline.predict(reshaped(X))

    # Fit and predict through the runtime.
    om.runtime.model('mymodel2').fit('datax', 'datay').get()
    pred1 = om.runtime.model('mymodel2').predict('datax').get()

    self.assertTrue((pred == pred1).all(),
                    "runtimes prediction is different(1)")
def test_multiple_response_regressor():
    """Two masked estimators joined with ``&`` each fit their own response."""
    np.random.seed(1)
    n_rows, n_cols = 100000, 10
    X = np.random.normal(size=(n_rows, n_cols))
    beta1 = np.random.normal(size=(n_cols, 1))
    beta2 = np.random.normal(size=(n_cols, 1))

    # Column 0: exact linear response. Column 1: Bernoulli draws whose
    # success probability is the logistic transform of X @ beta2.
    y1 = np.dot(X, beta1)
    p2 = 1. / (1. + np.exp(-np.dot(X, beta2)))
    y2 = np.random.binomial(n=1, p=p2)
    y = np.concatenate([y1, y2], axis=1)

    linear_part = MaskedEstimator(LinearRegression(), [True, False])
    logistic_part = MaskedEstimator(
        ProbaPredictingEstimator(LogisticRegression()), [False, True])
    model = linear_part & logistic_part
    model.fit(X, y)

    # NOTE(review): both checks average *signed* differences (no abs), so
    # errors of opposite sign can cancel -- confirm this is intended.
    linear_coef = model.estimators_[0].estimator_.coef_
    logistic_coef = model.estimators_[1].estimator_.estimator_.coef_
    assert np.mean(beta1 - linear_coef) < .01
    assert np.mean(beta2 - logistic_coef) < .01

    # Smoke checks: these calls only need to run without raising.
    model.get_params()
    model.predict(X)
def polynomial_linear_regression(self):
    """Fit a degree-2 polynomial regression and score it as a 0/1 classifier.

    The pipeline is fitted on ``self.X_train`` / ``self.Y_train``. Each
    prediction for ``self.X_test`` is mapped to 1 when it is strictly closer
    to 1 than to 0, otherwise to 0, and the accuracy against
    ``self.Y_test`` is computed. If that accuracy is above zero the model
    is stored as ``self.best_model``. The best degree is printed and the
    fitted pipeline is returned.
    """
    degree = 2  # only this single degree is evaluated
    best_accuracy = 0
    best_degree = 0

    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(self.X_train, self.Y_train)

    # Check accuracy using the test dataset.
    raw_predictions = model.predict(self.X_test)
    labels = [1 if abs(1 - val) < abs(val) else 0 for val in raw_predictions]
    accuracy = accuracy_score(self.Y_test, labels)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_degree = degree
        self.best_model = model

    print(best_degree)
    return model
def __init__(self, base_estimator: RegressorMixin = None, **kwargs):
    """Set up the base estimator and forward all other options.

    Parameters
    ----------
    base_estimator
        Estimator to use. When given, a ``clone`` of it is stored so the
        caller's object is left untouched; when ``None``, a fresh
        ``LinearRegression()`` is used.
    **kwargs
        Passed unchanged to the parent class ``__init__``.
    """
    # Exactly one assignment per branch, so the clone can never be
    # overwritten by the original estimator.
    if base_estimator is None:
        self.base_estimator = LinearRegression()
    else:
        self.base_estimator = clone(base_estimator)
    super().__init__(**kwargs)
def test_pipeline():
    """Backward elimination chained into OLS must recover the non-zero betas."""
    np.random.seed(1)
    n_rows, n_cols = 10000, 10
    X = np.random.normal(size=(n_rows, n_cols))

    # Random coefficients, a random subset of which is forced to zero
    # (each with probability 2 / n_cols).
    beta = np.random.normal(size=(n_cols, 1))
    zeroed = np.random.binomial(p=2.0 / float(n_cols), n=1, size=n_cols)
    beta[zeroed.astype(bool)] = 0

    noise = 0.5 * np.random.normal(size=(n_rows, 1))
    y = np.dot(X, beta) + noise
    beta_nonzero = beta[beta != 0]

    importance = SingleEliminationFeatureImportanceEstimatorCV(
        LinearRegression())
    model = BackwardEliminationEstimator(importance)
    model >>= LinearRegression()
    model.fit(X, y)

    worst_error = np.max(np.abs(model.final_stage_.coef_ - beta_nonzero))
    assert worst_error < .1
def train():
    """Fit LinearRegression on a 3x4 toy set and print its parameters.

    Prints the fitted coefficients, the intercept and the predictions for
    a second 3x4 matrix.
    """
    features = np.array([
        [1, 2, 3, 4],
        [4, 5, 6, 7],
        [7, 8, 9, 10],
    ])
    targets = np.array([10, 20, 30])
    unseen = np.array([
        [10, 20, 30, 40],
        [40, 50, 60, 70],
        [70, 80, 90, 100],
    ])

    regressor = LinearRegression()
    regressor.fit(features, targets)

    print('coef_:', regressor.coef_)
    print('intercept_:', regressor.intercept_)
    print('predict:', regressor.predict(unseen))
def test_delta_transformer():
    """A DoublePipeline with delta steps on X and Y must reproduce Y.

    Uses the module-level ``X`` and ``Y``. The regression fitted inside the
    x pipeline is also expected to have coefficients close to [1, 0, 0].
    """
    x_steps = [
        ('xdelta', DeltaTransformer()),
        ('linreg', LinearRegression(fit_intercept=False)),
    ]
    y_steps = [('ydelta', DeltaTransformer())]
    fit_model = DoublePipeline(x_steps, y_steps).fit(X, Y)

    predictions_match = np.isclose(fit_model.predict(X), np.squeeze(Y))
    assert predictions_match.all()

    # The last step of the x pipeline is the fitted regression.
    fitted_regression = fit_model.x_pipe_.steps[-1][1]
    assert np.isclose(fitted_regression.coef_, [1.0, 0.0, 0.0]).all()
def test_fit_intercept():
    """``coef_`` must have the same shape with and without an intercept.

    Checked for a two-feature and a three-feature design; the number of
    dimensions of ``coef_`` must also agree between the two designs.
    """
    X2 = np.array([[0.38349978, 0.61650022],
                   [0.58853682, 0.41146318]])
    X3 = np.array([[0.27677969, 0.70693172, 0.01628859],
                   [0.08385139, 0.20692515, 0.70922346]])
    y = np.array([1, 1])

    def fitted_coef(X, fit_intercept):
        # Coefficients of a model fitted on (X, y) with the given setting.
        return LinearRegression(fit_intercept=fit_intercept).fit(X, y).coef_

    coef2_plain = fitted_coef(X2, False)
    coef2_icpt = fitted_coef(X2, True)
    coef3_plain = fitted_coef(X3, False)
    coef3_icpt = fitted_coef(X3, True)

    assert_equal(coef2_icpt.shape, coef2_plain.shape)
    assert_equal(coef3_icpt.shape, coef3_plain.shape)
    assert_equal(coef2_plain.ndim, coef3_plain.ndim)
def test_linear_regression_n_jobs():
    """
    Check n_jobs after calling fit with a third positional argument of 4:
    the object returned by fit and the estimator must report the same
    n_jobs, and the estimator's n_jobs must be 1.
    """
    features = [[1], [2]]
    targets = [1, 2]
    estimator = LinearRegression()
    returned = estimator.fit(features, targets, 4)
    assert_equal(returned.n_jobs, estimator.n_jobs)
    assert_equal(estimator.n_jobs, 1)
def test_fit(self):
    """Runtime fit/predict must match a local fit, for stored and local data."""
    model_name = 'mymodel2'

    # Toy data: y = 2 * x for x in 0..9, kept as single-column DataFrames.
    x = np.arange(10)
    frame = pd.DataFrame({'x': x, 'y': x * 2})
    X, Y = frame[['x']], frame[['y']]

    # Store the data in Omega, with tasks executed eagerly.
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')

    # Store the model in Omega before it has been fitted.
    local_model = LinearRegression()
    om.models.put(local_model, model_name)
    self.assertIn(model_name, om.models.list('*'))

    # Local reference prediction.
    local_model.fit(X, Y)
    pred = local_model.predict(X)

    # Predicting with the stored, still unfitted model must fail.
    with self.assertRaises(NotFittedError):
        om.runtime.model(model_name).predict('datax').get()

    # Fit through the runtime on the stored datasets.
    om.runtime.model(model_name).fit('datax', 'datay').get()

    # The model metadata must now carry the X/Y references.
    meta = om.models.metadata(model_name)
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)

    # Predict using data already in Omega.
    pred1 = om.runtime.model(model_name).predict('datax').get()

    # Fit and predict again, this time passing the data objects directly.
    result = om.runtime.model(model_name).fit(X, Y)
    result = om.runtime.model(model_name).predict(X)
    pred2 = result.get()

    # The locally provided fit data must have been stored as intended.
    meta = om.models.metadata(model_name)
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    self.assertIn('_fitX', meta.attributes.get('metaX').get('collection'))
    self.assertIn('_fitY', meta.attributes.get('metaY').get('collection'))

    self.assertTrue((pred == pred1).all(),
                    "runtimes prediction is different(1)")
    self.assertTrue((pred == pred2).all(),
                    "runtimes prediction is different(2)")
def __init__(self,
             cols_1,
             cols_2,
             estimator_1=LinearRegression(fit_intercept=False),
             estimator_2=RandomForestRegressor(),
             iters=2):
    """Record two column selections, two estimators and an iteration count.

    Every argument is stored unchanged under an attribute of the same
    name; nothing is validated or fitted here.

    Parameters
    ----------
    cols_1, cols_2
        Column selections stored as given.
    estimator_1
        Defaults to ``LinearRegression(fit_intercept=False)``.
    estimator_2
        Defaults to ``RandomForestRegressor()``.
    iters
        Stored as ``self.iters``; defaults to 2.
    """
    # The default estimators are built once, when this function is defined,
    # so every instance that relies on a default holds the very same object.
    self.cols_1, self.cols_2 = cols_1, cols_2
    self.estimator_1, self.estimator_2 = estimator_1, estimator_2
    self.iters = iters
def test_super_learner():
    """Check SuperLearner quality and its generated prediction code.

    A cross-validated SuperLearner (linear + Earth components, linear
    meta-learner) is fitted on the Boston data. Its cross-validated R^2
    must reach at least 90% of the best component's cross-validated R^2,
    and the code produced by ``sklearn2code`` must reproduce
    ``model.predict``.
    """
    np.random.seed(0)
    X, y = load_boston(return_X_y=True)
    X = pandas.DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])

    components = [('linear', LinearRegression()),
                  ('earth', Earth(max_degree=2))]
    model = CrossValidatingEstimator(
        SuperLearner(components, LinearRegression(), cv=5, n_jobs=1), cv=5)
    cv_pred = model.fit_predict(X, y)
    pred = model.predict(X)
    cv_r2 = r2_score(y, cv_pred)

    # Score every component. Scoring only the first one repeatedly would
    # not give the best component.
    best_component_cv_r2 = max(
        r2_score(y, component.cv_predictions_)
        for component in
        model.estimator_.cross_validating_estimators_.values())
    assert cv_r2 >= .9 * best_component_cv_r2

    # The exported code must give the same predictions as the model.
    code = sklearn2code(model, ['predict'], numpy_flat)
    module = exec_module('module', code)
    test_pred = module.predict(**X)
    try:
        assert_array_almost_equal(np.ravel(pred), np.ravel(test_pred))
    except AssertionError:
        # Print the disagreeing entries for diagnosis, then re-raise.
        idx = np.abs(np.ravel(pred) - np.ravel(test_pred)) > .000001
        print(np.ravel(pred)[idx])
        print(np.ravel(test_pred)[idx])
        raise

    print(r2_score(y, pred))
    print(r2_score(y, cv_pred))
    print(best_component_cv_r2)
def test_linear_regression_sparse(random_state=0):
    """Test that linear regression also works with sparse data."""
    rng = check_random_state(random_state)
    size = 100

    # With a sparse identity design the response equals the coefficients.
    design = sparse.eye(size, size)
    true_beta = rng.rand(size)
    response = design * true_beta[:, np.newaxis]

    model = LinearRegression()
    model.fit(design, response.ravel())

    assert_array_almost_equal(true_beta, model.coef_ + model.intercept_)
    assert_array_almost_equal(model.residues_, 0)
def test_linear_regression():
    """Check LinearRegression on a two-point and a one-point dataset."""
    # Each case: (X, Y, expected coef_, expected intercept_, expected predict).
    cases = [
        # a simple dataset
        ([[1], [2]], [1, 2], [1], [0], [1, 2]),
        # degenerate input: a single sample
        ([[1]], [0], [0], [0], [0]),
    ]
    for X, Y, coef, intercept, predicted in cases:
        model = LinearRegression()
        model.fit(X, Y)
        assert_array_almost_equal(model.coef_, coef)
        assert_array_almost_equal(model.intercept_, intercept)
        assert_array_almost_equal(model.predict(X), predicted)
def test_linear_regression_multiple_outcome(random_state=0):
    """A two-column target made of identical columns must fit like one column."""
    X, y = make_regression(random_state=random_state)
    n_features = X.shape[1]
    doubled_target = np.vstack((y, y)).T

    model = LinearRegression(fit_intercept=True)

    # Multi-output fit: one row of coefficients per target column.
    model.fit(X, doubled_target)
    assert_equal(model.coef_.shape, (2, n_features))
    multi_pred = model.predict(X)

    # Refit the same estimator on the single target and compare.
    model.fit(X, y)
    single_pred = model.predict(X)
    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=3)
def test_linear_regression_multiple_outcome(random_state=0):
    """Multi-output predictions must match the single-output ones per column."""
    X, y = make_regression(random_state=random_state)
    two_targets = np.vstack((y, y)).T
    feature_count = X.shape[1]

    regressor = LinearRegression()

    # Fit on the duplicated target: coef_ gets one row per output.
    regressor.fit(X, two_targets)
    assert regressor.coef_.shape == (2, feature_count)
    pred_two = regressor.predict(X)

    # Refit on the single target and duplicate its predictions.
    regressor.fit(X, y)
    pred_one = regressor.predict(X)
    expected = np.vstack((pred_one, pred_one)).T

    assert_array_almost_equal(expected, pred_two, decimal=3)
def eval_linear(data_set, test_size=0.4):
    """Cross-validate and visualise a normalised linear regression.

    Prints the cross-validated mean squared error, then fits a fresh model
    on a shuffled train split (``random_state=0``), prints its coefficients
    and the model itself, and shows scatter plots of target against
    prediction for the train and test splits.

    Parameters
    ----------
    data_set
        Object whose ``load_training_data()`` returns the ``(x, y)`` pair.
    test_size
        Passed to ``train_test_split``; defaults to 0.4.
    """
    # Load training data from the feature matrix.
    x, y = data_set.load_training_data()

    # Cross-validated score; the scorer returns negated MSE, hence the
    # sign flip when printing.
    cv_scores = cross_val_score(LinearRegression(normalize=True), x, y,
                                scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-cv_scores))

    # For the plots: hold out part of the data and train on the rest.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=0)
    model = LinearRegression(normalize=True).fit(x_train, y_train)
    print(model.coef_)
    pprint(model)

    # Train performance.
    train_predictions = model.predict(x_train)
    plt.figure()
    plt.title('train')
    plt.scatter(y_train, train_predictions)

    # Test performance.
    test_predictions = model.predict(x_test)
    plt.figure()
    plt.title('test')
    plt.scatter(y_test, test_predictions)

    plt.show()
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    """Multi-output fitting on a sparse COO design must match single-output."""
    rng = check_random_state(random_state)
    dense_X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(dense_X)
    n_features = X.shape[1]
    doubled_target = np.vstack((y, y)).T

    model = LinearRegression()

    # Fit on the duplicated target: coef_ gets one row per output.
    model.fit(X, doubled_target)
    assert_equal(model.coef_.shape, (2, n_features))
    multi_pred = model.predict(X)

    # Refit on the flattened single target and compare column-wise.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)
    stacked_single = np.vstack((single_pred, single_pred)).T
    assert_array_almost_equal(stacked_single, multi_pred, decimal=3)
def MethodSelect(Xt, XT, Yt, YT):
    """Fit LinearRegression on (Xt, Yt) and return its error on (XT, YT).

    The error is whatever ``dp.rmsErr(predictions, YT)`` returns. When it
    exceeds 100, diagnostics are printed: the prediction for ``XT[0]``,
    then the entries of ``XT[0]`` paired with those of ``Xt[0]``.
    """
    model = LinearRegression()
    model.fit(Xt, Yt)
    err = dp.rmsErr(model.predict(XT), YT)

    if err > 100:
        first_test = XT[0]
        first_train = Xt[0]
        single_prediction = model.predict([first_test])
        print(single_prediction[0])
        for pos in range(len(first_test)):
            print(first_test[pos], first_train[pos])
        print('\n\n\n')

    return err
def test_column_subset_transformer():
    """ColumnSubsetTransformer must pick columns by position and by name.

    The same checks are run on a plain array with integer column indices
    and on a DataFrame with string column names. Finally a transformer
    chained into LinearRegression with ``>>`` is fitted, used for
    prediction and scored as a smoke test.
    """
    n_rows, n_cols = 1000, 10
    X = np.random.normal(size=(n_rows, n_cols))
    x_cols = [0, 3, 4, 5]
    y_cols = 9
    sample_weight_cols = 8
    exposure_cols = 7

    def check_subsetter(subsetter, data):
        # transform() must return the x columns, and update() must fill
        # the argument dict with each configured subset. Expected values
        # always come from the raw array X.
        np.testing.assert_array_equal(subsetter.transform(data),
                                      X[:, x_cols])
        call_args = {'X': data}
        subsetter.update(call_args)
        np.testing.assert_array_equal(call_args['X'], X[:, x_cols])
        np.testing.assert_array_equal(call_args['y'], X[:, y_cols])
        np.testing.assert_array_equal(call_args['sample_weight'],
                                      X[:, sample_weight_cols])
        np.testing.assert_array_equal(call_args['exposure'],
                                      X[:, exposure_cols])

    # Positional selection on the raw array.
    by_position = ColumnSubsetTransformer(
        x_cols=x_cols, y_cols=y_cols,
        sample_weight_cols=sample_weight_cols,
        exposure_cols=exposure_cols)
    check_subsetter(by_position, X)

    # Name-based selection on a DataFrame with columns 'x0' .. 'x9'.
    X_ = pandas.DataFrame(X, columns=['x%d' % idx for idx in range(10)])
    x_cols_ = ['x%d' % idx for idx in x_cols]
    y_cols_ = 'x%d' % y_cols
    sample_weight_cols_ = 'x%d' % sample_weight_cols
    exposure_cols_ = 'x%d' % exposure_cols
    by_name = ColumnSubsetTransformer(
        x_cols=x_cols_, y_cols=y_cols_,
        sample_weight_cols=sample_weight_cols_,
        exposure_cols=exposure_cols_)
    check_subsetter(by_name, X_)

    # Smoke test: these calls only need to run without raising.
    lin = (ColumnSubsetTransformer(x_cols=x_cols_, y_cols=y_cols_)
           >> LinearRegression())
    lin.fit(X_)
    lin.predict(X_.loc[:, x_cols_])
    lin.score(X_)
def test_non_null_row_subset_fitter():
    """NonNullSubsetFitter must recover the coefficients despite missing cells."""
    np.random.seed(1)
    n_rows, n_cols = 10000, 10

    # Noise-free linear response: y = X @ beta.
    X = np.random.normal(size=(n_rows, n_cols))
    beta = np.random.normal(size=(n_cols, 1))
    y = np.ravel(np.dot(X, beta))

    # Blank out roughly 0.1% of the cells after y has been computed.
    missing = np.random.binomial(p=.001, n=1, size=(n_rows, n_cols)) == 1
    X[missing] = None

    model = NonNullSubsetFitter(LinearRegression())
    model.fit(X, y)

    worst_error = np.max(np.abs(np.ravel(beta) - model.estimator_.coef_))
    assert worst_error < .001