class PredictLoss(BaseLR):
    """Learning-rate callback that fits a linear trend to the recent loss
    history and rescales the learning rate by the trend strength.

    Raises EarlyStopException once `posmax` net checks have seen a
    positive (worsening) loss slope.
    """

    def __init__(self, hist=30, posmax=15, lr=0.2):
        # BUG FIX: sklearn.linear_model.base is a private module that was
        # removed in scikit-learn 0.24; import from the public package.
        from sklearn.linear_model import LinearRegression
        from collections import deque
        self.hist = hist
        self.track = deque(maxlen=self.hist)  # rolling window of losses
        self.regr = LinearRegression()
        self.poscases = 0  # running count of positive-slope observations
        self.posmax = posmax
        self.lr = lr

    def __call__(self, env):
        if len(self.track) > 5:
            y = np.array(self.track)
            # BUG FIX: the original built x from range(len(y.shape)) — a
            # single index regardless of window size — so the regression
            # was degenerate. The fit must run over the window positions.
            x = np.arange(len(y)).reshape(-1, 1)
            self.regr.fit(x, y)
            coef_ = self.regr.coef_[0]
            preds = self.regr.predict(x)
            fst = preds[0]
            lst = preds[-1]
            # RMSE of the linear fit, used as a noise estimate
            e = np.sqrt(((y - preds) ** 2).mean())
            if coef_ > 0:
                self.poscases += 1
                if self.poscases >= self.posmax:
                    raise EarlyStopException
            else:
                self.poscases -= 1
                if self.poscases < 0:
                    self.poscases = 0
            diff = np.abs(fst - lst)
            # trend magnitude relative to fit noise, clipped to (0, 1]
            coef = np.clip(diff / e, 1e-6, 1)
            lr = self.lr * coef
            # BUG FIX: open the log via a context manager instead of
            # leaking one file handle per call
            with open('log.txt', 'a') as logf:
                print(lr, e, diff, coef_, coef, file=logf)
            env.model.set_param("learning_rate", lr)
def __add_trend_feature(self, arr, abs_values=False):
    """Return the slope of a least-squares line fitted to *arr* over its index.

    With abs_values=True the trend of the absolute values is used instead.
    """
    if abs_values:
        arr = np.abs(arr)
    positions = np.array(range(len(arr))).reshape(-1, 1)
    trend_model = LinearRegression()
    trend_model.fit(positions, arr)
    return trend_model.coef_[0]
def test_predict_hdf_dataframe(self):
    """Runtime prediction on HDF-stored data must match the local model."""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df['x']
    Y = df['y']
    # put into Omega -- assume a client with pandas, scikit learn
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.pure_python = True
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax', as_hdf=True)
    om.datasets.put(Y, 'datay', as_hdf=True)
    # fit locally for comparison, then store the fitted model
    lr = LinearRegression()
    lr.fit(reshaped(X), reshaped(Y))
    pred = lr.predict(reshaped(X))
    om.models.put(lr, 'mymodel2')
    # -- using data already stored in Omega
    # note this is the same as
    # om.datasets.put(X, 'foo')
    # om.runtimes.model('mymodel2').predict('foo')
    result = om.runtime.model('mymodel2').predict('datax')
    pred2 = result.get()
    # BUG FIX: the original asserted the identical (pred == pred2)
    # condition twice with two different messages (copy-paste of a
    # pred1/pred2 pair from a sibling test); one assertion suffices.
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different")
def get_scikit_prediction(x=np.array([1, 2, 3]), y=np.array([1, 2, 3])):
    """Fit scikit-learn's LinearRegression on (x, y) and predict on x.

    Accepts a 1-D feature vector (reshaped to a single-column matrix, as
    scikit-learn requires 2-D X) as well as an already-2-D array.
    """
    # BUG FIX: sklearn.linear_model.base was removed in scikit-learn 0.24;
    # LinearRegression lives in the public sklearn.linear_model module.
    from sklearn.linear_model import LinearRegression as ScikitLinearRegression
    x = np.asarray(x)
    if x.ndim == 1:
        # BUG FIX: the 1-D default made .fit() raise — sklearn expects a
        # (n_samples, n_features) matrix.
        x = x.reshape(-1, 1)
    regression = ScikitLinearRegression()
    regression.fit(x, y)
    return regression.predict(x)
def train():
    """Fit a toy linear model on three samples and print its parameters
    plus predictions for a scaled-up test matrix."""
    X = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]])
    y = np.array([10, 20, 30])
    X_test = np.array([[10, 20, 30, 40], [40, 50, 60, 70], [70, 80, 90, 100]])
    model = LinearRegression()
    model.fit(X, y)
    print('coef_:', model.coef_)
    print('intercept_:', model.intercept_)
    print('predict:', model.predict(X_test))
def test_fit(self):
    """End-to-end test of fitting through the Omega runtime: an unfitted
    model must raise NotFittedError on predict, a runtime fit must record
    dataset metadata, and runtime predictions must match a local fit."""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    df = pd.DataFrame({'x': x, 'y': y})
    X = df[['x']]
    Y = df[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, store (unfitted) in Omega
    lr = LinearRegression()
    om.models.put(lr, 'mymodel2')
    self.assertIn('mymodel2', om.models.list('*'))
    # predict locally for comparison
    lr.fit(X, Y)
    pred = lr.predict(X)
    # try predicting without fitting
    with self.assertRaises(NotFittedError):
        result = om.runtime.model('mymodel2').predict('datax')
        result.get()
    # have Omega fit the model then predict
    result = om.runtime.model('mymodel2').fit('datax', 'datay')
    result.get()
    # check the new model version metadata includes the datax/y references
    meta = om.models.metadata('mymodel2')
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    # -- using data already in Omega
    result = om.runtime.model('mymodel2').predict('datax')
    pred1 = result.get()
    # -- using data provided locally
    # note this is the same as
    # om.datasets.put(X, 'foo')
    # om.runtimes.model('mymodel2').predict('foo')
    result = om.runtime.model('mymodel2').fit(X, Y)
    result = om.runtime.model('mymodel2').predict(X)
    pred2 = result.get()
    # -- check the local data provided to fit was stored as intended
    meta = om.models.metadata('mymodel2')
    self.assertIn('metaX', meta.attributes)
    self.assertIn('metaY', meta.attributes)
    self.assertIn('_fitX', meta.attributes.get('metaX').get('collection'))
    self.assertIn('_fitY', meta.attributes.get('metaY').get('collection'))
    self.assertTrue(
        (pred == pred1).all(), "runtimes prediction is different(1)")
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different(2)")
def test_linear_regression_sparse(random_state=0):
    """Linear regression must recover known coefficients on sparse data."""
    rng = check_random_state(random_state)
    n = 100
    X = sparse.eye(n, n)
    beta = rng.rand(n)
    y = X * beta[:, np.newaxis]
    model = LinearRegression()
    model.fit(X, y.ravel())
    # with X = I the fitted coef + intercept must reproduce beta
    assert_array_almost_equal(beta, model.coef_ + model.intercept_)
    assert_array_almost_equal(model.residues_, 0)
def test_linear_regression_multiple_outcome(random_state=0):
    "Stacking the same target twice must reproduce the single-target fit"
    X, y = make_regression(random_state=random_state)
    stacked_Y = np.vstack((y, y)).T
    clf = LinearRegression(fit_intercept=True)
    clf.fit(X, stacked_Y)
    assert_equal(clf.coef_.shape, (2, X.shape[1]))
    stacked_pred = clf.predict(X)
    clf.fit(X, y)
    single_pred = clf.predict(X)
    assert_array_almost_equal(
        np.vstack((single_pred, single_pred)).T, stacked_pred, decimal=3)
def test_linear_regression_multiple_outcome(random_state=0):
    # Fitting a duplicated two-column target must give, in each column,
    # the predictions of the single-target fit.
    X, y = make_regression(random_state=random_state)
    Y2 = np.vstack((y, y)).T
    clf = LinearRegression(fit_intercept=True)
    clf.fit(X, Y2)
    assert_equal(clf.coef_.shape, (2, X.shape[1]))
    pred_multi = clf.predict(X)
    clf.fit(X, y)
    pred_single = clf.predict(X)
    assert_array_almost_equal(
        np.vstack((pred_single, pred_single)).T, pred_multi, decimal=3)
def test_linear_regression_multiple_outcome(random_state=0):
    # Two identical target columns: each must recover the single-target fit.
    X, y = make_regression(random_state=random_state)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]
    model = LinearRegression()
    model.fit(X, Y)
    assert model.coef_.shape == (2, n_features)
    Y_pred = model.predict(X)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert_array_almost_equal(
        np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    "Multi-output linear regression must also work on sparse input"
    rng = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(X)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]
    model = LinearRegression()
    model.fit(X, Y)
    assert_equal(model.coef_.shape, (2, n_features))
    Y_pred = model.predict(X)
    model.fit(X, y.ravel())
    y_pred = model.predict(X)
    assert_array_almost_equal(
        np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    # Sparse data, two stacked target columns: each column of the
    # multi-output prediction must match the single-output prediction.
    random_state = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=random_state)
    X = sparse.coo_matrix(X)
    Y = np.vstack((y, y)).T
    ols = LinearRegression()
    ols.fit(X, Y)
    assert_equal(ols.coef_.shape, (2, X.shape[1]))
    multi_pred = ols.predict(X)
    ols.fit(X, y.ravel())
    single_pred = ols.predict(X)
    assert_array_almost_equal(
        np.vstack((single_pred, single_pred)).T, multi_pred, decimal=3)
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
    """Dense and CSR-sparse inputs must produce the same linear regression."""
    rng = check_random_state(0)
    n_samples, n_features = 200, 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.  # sparsify: zero out most entries
    Xcsr = sparse.csr_matrix(X)
    y = rng.rand(n_samples)
    params = dict(normalize=normalize, fit_intercept=fit_intercept)
    dense_model = LinearRegression(**params)
    sparse_model = LinearRegression(**params)
    dense_model.fit(X, y)
    sparse_model.fit(Xcsr, y)
    assert dense_model.intercept_ == pytest.approx(sparse_model.intercept_)
    assert_allclose(dense_model.coef_, sparse_model.coef_)
def MethodSelect(Xt, XT, Yt, YT):
    """Fit on the training split (Xt, Yt), return the RMS error on the
    test split (XT, YT); when the error is large, dump diagnostics for
    the first test sample against the first training sample."""
    model = LinearRegression()
    model.fit(Xt, Yt)
    err = dp.rmsErr(model.predict(XT), YT)
    if err > 100:
        test_row = XT[0]
        train_row = Xt[0]
        print(model.predict([test_row])[0])
        for i in range(len(test_row)):
            print(test_row[i], train_row[i])
        print('\n\n\n')
    return err
def linearRegression_sales(self):
    """Linear regression on the Advertising dataset: fit TV/Radio spend
    against Sales, report MSE/RMSE, and plot test vs. predicted values."""
    path = u'4.Advertising.csv'
    data = self.readFile(path)
    # x = data[['TV', 'Radio', 'Newspaper']]
    x = data[['TV', 'Radio']]
    y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
    linreg = LinearRegression()
    model = linreg.fit(x_train, y_train)
    # BUG FIX: the original used Python 2 print statements, which are
    # syntax errors under Python 3.
    print(model)
    print(linreg.coef_)
    print(linreg.intercept_)
    y_hat = linreg.predict(np.array(x_test))
    mse = np.average((y_hat - y_test) ** 2)
    rmse = np.sqrt(mse)
    print(mse, rmse)
    t = np.arange(len(x_test))
    plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
    plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
    plt.grid()
    plt.legend(loc='upper right')
    plt.show()
def test_ridge_vs_lstsq():
    """With alpha=0., Ridge must match ordinary least squares."""
    # we need more samples than features
    n_samples, n_features = 5, 4
    y = rng.randn(n_samples)
    X = rng.randn(n_samples, n_features)
    ridge = Ridge(alpha=0., fit_intercept=False)
    ols = LinearRegression(fit_intercept=False)
    # fit twice: the second round guards against state leaking between fits
    for _ in range(2):
        ridge.fit(X, y)
        ols.fit(X, y)
        assert_almost_equal(ridge.coef_, ols.coef_)
class LinearRegressionImpl():
    """Thin wrapper exposing scikit-learn's LinearRegression behind a
    fit/predict facade with stored hyperparameters."""

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True, n_jobs=None):
        # hyperparameters are stored and only applied when fit() builds
        # the underlying estimator
        self._hyperparams = {
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'copy_X': copy_X,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        """Create the underlying model and fit it; returns self."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is None:
            self._sklearn_model.fit(X)
        else:
            self._sklearn_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the fitted underlying model."""
        return self._sklearn_model.predict(X)
def test_linear_regression_n_jobs():
    """n_jobs passed positionally to fit must be reflected on the estimator."""
    X = [[1], [2]]
    Y = [1, 2]
    clf = LinearRegression()
    fitted = clf.fit(X, Y, 4)
    assert_equal(fitted.n_jobs, clf.n_jobs)
    assert_equal(clf.n_jobs, 1)
def test_raises_value_error_if_sample_weights_greater_than_1d():
    # Sample weights must be either scalar or 1D; the valid shapes here
    # must all be accepted by fit.
    for n_samples, n_features in zip([2, 3], [3, 2]):
        X = rng.randn(n_samples, n_features)
        y = rng.randn(n_samples)
        weights_1d = rng.randn(n_samples) ** 2 + 1
        reg = LinearRegression()
        # make sure the "OK" sample weights actually work
        for weights in (weights_1d, 1., 2.):
            reg.fit(X, y, weights)
def test_raises_value_error_if_sample_weights_greater_than_1d():
    # Sample weights must be either scalar or 1D.
    n_sampless = [2, 3]
    n_featuress = [3, 2]
    for n_samples, n_features in zip(n_sampless, n_featuress):
        X = rng.randn(n_samples, n_features)
        y = rng.randn(n_samples)
        vector_weights = rng.randn(n_samples) ** 2 + 1
        model = LinearRegression()
        # valid weight forms: a 1-D vector, then two scalars
        model.fit(X, y, vector_weights)
        model.fit(X, y, 1.)
        model.fit(X, y, 2.)
def test_linear_regression_sample_weights():
    """Weighted LinearRegression must match the closed-form weighted LS."""
    # TODO: loop over sparse data as well
    rng = np.random.RandomState(0)
    # It would not work with under-determined systems
    for n_samples, n_features in ((6, 5), ):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)
        for intercept in (True, False):
            # LinearRegression with explicit sample_weight
            reg = LinearRegression(fit_intercept=intercept)
            reg.fit(X, y, sample_weight=sample_weight)
            coefs1 = reg.coef_
            inter1 = reg.intercept_
            # sanity checks
            assert_equal(reg.coef_.shape, (X.shape[1], ))
            assert_greater(reg.score(X, y), 0.5)
            # Closed form of the weighted least square:
            # theta = (X^T W X)^(-1) * X^T W y
            W = np.diag(sample_weight)
            if intercept:
                dummy_column = np.ones(shape=(n_samples, 1))
                X_aug = np.concatenate((dummy_column, X), axis=1)
            else:
                X_aug = X
            coefs2 = linalg.solve(
                X_aug.T.dot(W).dot(X_aug), X_aug.T.dot(W).dot(y))
            if intercept:
                assert_array_almost_equal(coefs1, coefs2[1:])
                assert_almost_equal(inter1, coefs2[0])
            else:
                assert_array_almost_equal(coefs1, coefs2)
def test_score(self):
    """Score a locally fitted model, then store it in Omega."""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    frame = pd.DataFrame({'x': x, 'y': y})
    X = frame[['x']]
    Y = frame[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, fit it, store in Omega
    lr = LinearRegression()
    lr.fit(X, Y)
    scores = lr.score(X, Y)  # NOTE(review): score is computed but never asserted
    om.models.put(lr, 'mymodel')
def test_linear_regression_sample_weights():
    # Weighted fit must agree with the analytic weighted least squares.
    # TODO: loop over sparse data as well
    rng = np.random.RandomState(0)
    # under-determined systems are excluded on purpose
    for n_samples, n_features in ((6, 5), ):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)
        for intercept in (True, False):
            model = LinearRegression(fit_intercept=intercept)
            model.fit(X, y, sample_weight=sample_weight)
            fitted_coefs = model.coef_
            fitted_intercept = model.intercept_
            assert_equal(model.coef_.shape, (X.shape[1], ))  # sanity check
            assert_greater(model.score(X, y), 0.5)
            # closed form: theta = (X^T W X)^(-1) X^T W y
            W = np.diag(sample_weight)
            if intercept:
                X_aug = np.concatenate(
                    (np.ones(shape=(n_samples, 1)), X), axis=1)
            else:
                X_aug = X
            closed_form = linalg.solve(
                X_aug.T.dot(W).dot(X_aug), X_aug.T.dot(W).dot(y))
            if intercept:
                assert_array_almost_equal(fitted_coefs, closed_form[1:])
                assert_almost_equal(fitted_intercept, closed_form[0])
            else:
                assert_array_almost_equal(fitted_coefs, closed_form)
def test_linear_regression():
    """LinearRegression on trivial data, including a degenerate one-point set."""
    for X, Y, coef, intercept, preds in (
            ([[1], [2]], [1, 2], [1], [0], [1, 2]),   # simple dataset
            ([[1]], [0], [0], [0], [0])):             # degenerate input
        clf = LinearRegression()
        clf.fit(X, Y)
        assert_array_almost_equal(clf.coef_, coef)
        assert_array_almost_equal(clf.intercept_, intercept)
        assert_array_almost_equal(clf.predict(X), preds)
def test_intercept_flag(rows=10, columns=9):
    """Compare DAAL's fitted intercept against scikit-learn's on the
    same random data."""
    inout = get_random_array(rows, columns)
    test_overfitting(rows, columns)
    x = inout[0]
    y = inout[1]
    ntX = HomogenNumericTable(x)
    ntY = HomogenNumericTable(y)
    lr_train = linear_training.Batch()
    lr_train.input.set(linear_training.data, ntX)
    lr_train.input.set(linear_training.dependentVariables, ntY)
    result = lr_train.compute()
    model = result.get(linear_training.model)
    beta_coeff = model.getBeta()
    np_beta = getNumpyArray(beta_coeff)
    # beta[0, 0] holds the intercept term in DAAL's beta layout
    daal_intercept = np_beta[0, 0]
    # BUG FIX: sklearn.linear_model.base is a private module that was
    # removed in scikit-learn 0.24; import from the public package.
    from sklearn.linear_model import LinearRegression as ScikitLinearRegression
    regression = ScikitLinearRegression()
    regression.fit(x, y)
    scikit_intercept = regression.intercept_
    assert_array_almost_equal(scikit_intercept, [daal_intercept])
def test_linear_regression_sample_weights():
    """Weighted fit must match the equivalent sqrt-weight rescaled fit."""
    rng = np.random.RandomState(0)
    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)
        clf = LinearRegression()
        clf.fit(X, y, sample_weight)
        coefs1 = clf.coef_
        assert_equal(clf.coef_.shape, (X.shape[1], ))
        assert_greater(clf.score(X, y), 0.9)
        assert_array_almost_equal(clf.predict(X), y)
        # Sample weight can be implemented via a simple rescaling
        # for the square loss.
        scaled_y = y * np.sqrt(sample_weight)
        scaled_X = X * np.sqrt(sample_weight)[:, np.newaxis]
        # BUG FIX: the original refit on the *unscaled* (X, y), so the
        # rescaling equivalence described above was never actually
        # tested; the comparison fit must use the scaled data.
        clf.fit(scaled_X, scaled_y)
        coefs2 = clf.coef_
        assert_array_almost_equal(coefs1, coefs2)
def test_predict(self):
    """Runtime predictions must match a locally fitted model, both for
    data stored in Omega and for data passed in directly."""
    # create some data
    x = np.array(list(range(0, 10)))
    y = x * 2
    frame = pd.DataFrame({'x': x, 'y': y})
    X = frame[['x']]
    Y = frame[['y']]
    # put into Omega
    os.environ['DJANGO_SETTINGS_MODULE'] = ''
    om = Omega()
    om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
    om.datasets.put(X, 'datax')
    om.datasets.put(Y, 'datay')
    om.datasets.get('datax')
    om.datasets.get('datay')
    # create a model locally, fit it, store in Omega
    lr = LinearRegression()
    lr.fit(X, Y)
    pred = lr.predict(X)
    om.models.put(lr, 'mymodel')
    self.assertIn('mymodel', om.models.list('*'))
    # have Omega predict, first from data already stored in Omega ...
    pred1 = om.runtime.model('mymodel').predict('datax').get()
    # ... then from data provided locally; this is equivalent to
    # om.datasets.put(X, 'foo') followed by predict('foo')
    pred2 = om.runtime.model('mymodel').predict(X).get()
    self.assertTrue(
        (pred == pred1).all(), "runtimes prediction is different(1)")
    self.assertTrue(
        (pred == pred2).all(), "runtimes prediction is different(2)")
class StackedRegression(LinearModel, RegressorMixin):
    """Stacking ensemble: base estimators' predictions are combined by a
    LinearRegression meta-model ("stacker")."""

    def __init__(self, weights=None, cv_train_size=None):
        estimators = []
        estimators.append(KNeighborsRegressor(n_neighbors=3))
        estimators.append(DecisionTreeRegressor())
        estimators.append(BayesianRidge())
        # estimators.append(BayesianRidge())
        self.estimators = estimators
        self.stacker = LinearRegression()
        # optional per-estimator weights; defaults to an empty mapping
        self.weights = weights if weights is not None else {}
        # fraction of the data used to train the base estimators in fit_stack
        self.cv_train_size = cv_train_size if cv_train_size is not None else 0.7
        self._is_fitted = False

    def fit_stack(self, X, y):
        # Train base estimators on the first cv_train_size fraction, then
        # fit the stacker on their predictions for the held-out remainder.
        print('fitting')
        print(X.shape)
        n_train = int(X.shape[0] * self.cv_train_size)
        for estimator in self.estimators:
            estimator.fit(X[:n_train, :], y[:n_train])
        # column-stack each estimator's held-out predictions into the
        # meta-model's feature matrix
        predictions = np.concatenate([np.matrix(estimator.predict(X[n_train:, :])).transpose() for estimator in self.estimators], axis=1)
        self.stacker.fit(predictions, y[n_train:])
        self._is_fitted = True
        print('fitted')
        # NOTE(review): residues_ was removed from recent scikit-learn
        # releases -- this line may raise AttributeError there; confirm.
        print(self.stacker.residues_)

    def fit(self, X, y):
        # Refit the base estimators on the full data; the stacker itself
        # is only trained in fit_stack, which must have been called first.
        if not self._is_fitted:
            raise NotFittedError('StackedRegression must call fit_stack before fit.')
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X):
        # Column-stack base predictions and let the stacker combine them.
        predictions = np.concatenate([np.matrix(estimator.predict(X)).transpose() for estimator in self.estimators], axis=1)
        return self.stacker.predict(predictions)
def eval_linear(data_set, test_size=0.4):
    """Cross-validate a normalized linear model on *data_set*, then refit
    on a train split and scatter-plot train/test predictions."""
    # load training data from feature matrix
    x, y = data_set.load_training_data()
    # cross validation evaluation
    model = LinearRegression(normalize=True)
    # model = RFE(model, 10)
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-score))
    # to visualize: split data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=0)
    # train model on train set
    model = LinearRegression(normalize=True)
    model = model.fit(x_train, y_train)
    print(model.coef_)
    pprint(model)
    # plot train and test performance as truth-vs-prediction scatters
    for title, truth, predicted in (
            ('train', y_train, model.predict(x_train)),
            ('test', y_test, model.predict(x_test))):
        plt.figure()
        plt.title(title)
        plt.scatter(truth, predicted)
    plt.show()
import pandas as pd
from matplotlib.pyplot import scatter, plot, show
# BUG FIX: sklearn.linear_model.base is a private module that was removed
# in scikit-learn 0.24; import from the public package.
from sklearn.linear_model import LinearRegression

# fit life expectancy against BMI
bmi_life_data = pd.read_csv('bmi_to_life_expect.csv')
x_data = bmi_life_data[['BMI']]
y_data = bmi_life_data[['Life expectancy']]

bmi_life_model = LinearRegression()
bmi_life_model.fit(x_data, y_data)

# BUG FIX: predict expects a 2-D (n_samples, n_features) array; the bare
# [50.00] of the original raises on current scikit-learn.
laos_life_exp = bmi_life_model.predict([[50.00]])
print(laos_life_exp)

# scatter the raw data with the fitted regression line
scatter(x_data, y_data)
plot(x_data, bmi_life_model.predict(x_data))
show()
### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))

# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here! Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
# BUG FIX: sklearn.linear_model.base is a removed private module; use the
# public sklearn.linear_model package.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)
print("Score = ", reg.score(ages_test, net_worths_test))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
'''
Created on Jan 28, 2017

@author: Angel

tip_orderamount_whodel

TODO: how to create a complete regression analysis with dummy variable,
not just coefficients.
'''
import pandas as pd
import numpy as np
# BUG FIX: sklearn.linear_model.base is a private module that was removed
# in scikit-learn 0.24; import from the public package.
from sklearn.linear_model import LinearRegression

path = r'C:\Users\Angel\OneDrive\Documents\data_training\data\RawDelData.csv'
data = pd.read_csv(path)
data["isAngel"] = ""
data["isAngel"] = np.nan
# keep rows with a sane order amount and build an is-Angel dummy variable
dat = data.loc[data['OrderAmount'] > -100.00][['Tip', 'OrderAmount', 'PersonWhoDelivered', 'isAngel']]
dat.loc[dat.PersonWhoDelivered == 'Angel', ['isAngel']] = 1
dat.loc[dat.PersonWhoDelivered == 'Sammie', ['isAngel']] = 0
feature_cols = ['OrderAmount', 'isAngel']
X = dat[feature_cols]
y = dat.Tip
lm = LinearRegression()
lm.fit(X, y)
# BUG FIX: Python 2 print statements are syntax errors under Python 3.
print("Tips can be modeled by Y = 1.61 + 0.0829X_1 - 0.0533X2")
print("In other words, tips go up by 8 cents for every dollar")
print("increase in order amount and decrease by 5 cents if Angel is making the delivery.")
from sklearn.model_selection._validation import cross_val_predict

# Boston housing dataset: boston.data holds the 13 feature columns and
# boston.target the labels; see boston.DESCR for the full description.
boston = load_boston()

# split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=2)

# plain linear regression
model1 = LinearRegression(normalize=True)
model1.fit(X_train, y_train)
# goodness of fit on the held-out data
simpleScore = model1.score(X_test, y_test)
print(simpleScore)
# coefficients are in model1.coef_, the intercept in model1.intercept_

# evaluate on the test set using mean squared error
y_pred = model1.predict(X_test)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))
### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)
for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
# BUG FIX: the "train" legend marker was drawn at a *test* data point;
# use a training point so the marker matches the training data.
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

# BUG FIX: sklearn.linear_model.base is a private module that was removed
# in scikit-learn 0.24; import from the public package.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)
print("Score = ", reg.score(feature_test, target_test))

### draw the regression line, once it's coded
try:
    plt.plot(feature_test, reg.predict(feature_test))
except NameError:
    pass
# refit on the test data and overlay the second regression line
reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="b")
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
def draw_data_size_vs_performance_chart():
    '''Create the data-size vs. F1 figure for the paper, then extrapolate
    the corpus size needed to reach F1 = 0.75 / 0.8.'''
    paths = glob('output/data-sizes/*.results/*/results.json') + \
        glob('output/model-h2048p512-mfs-true.results/*/results.json')
    df = read_json_files(paths)

    def parse_path(val):
        # the full-model directory is the 100% run; other directories
        # encode the data percentage in their name
        if 'model-h2048p512' in val:
            return 100
        return float(re.search(r'(\d+)\.results', val).group(1))

    df['data-pct'] = df['path'].apply(parse_path)
    df['words'] = 1.8e9 * df['data-pct'] / 100
    # reference points from Yuan et al. (1e11-token corpus)
    # BUG FIX: the original append list contained verbatim duplicates of
    # the SemEval13/OMSTI and Senseval2/SemCor records; each reference
    # result is appended exactly once here.
    df = df.append([{
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        "competition": "SemEval13",
        'F1': 0.670
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        "competition": "SemEval13",
        'F1': 0.673
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        "competition": "Senseval2",
        'F1': 0.736
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        "competition": "Senseval2",
        'F1': 0.724
    }])
    print(df)

    def get_xy(competition, model):
        # rows whose model name contains the literal tag, ordered by size
        sub_df = df[df['model'].str.contains(model, regex=False)]
        sub_df = sub_df.query('competition == "%s"' % competition).sort_values('words')
        return sub_df['words'], sub_df['F1']

    with PdfPages('output/data_size_vs_performance.pdf') as pdf:
        se13_semcor_handle, = plt.plot(*get_xy('SemEval13', '(T: SemCor)'),
                                       '-o', label='SemEval13 (T: SemCor)')
        se13_mun_handle, = plt.plot(*get_xy('SemEval13', '(T: SemCor+OMSTI)'),
                                    '--o', label='SemEval13 (T: OMSTI)')
        se2_semcor_handle, = plt.plot(*get_xy('Senseval2', '(T: SemCor)'),
                                      ':o', label='Senseval2 (T: SemCor)')
        se2_mun_handle, = plt.plot(*get_xy('Senseval2', '(T: SemCor+OMSTI)'),
                                   '-.o', label='Senseval2 (T: OMSTI)')
        plt.legend(handles=[
            se13_semcor_handle, se13_mun_handle,
            se2_semcor_handle, se2_mun_handle
        ], loc='lower right')
        plt.axis([1.5e7, 1.1e11, 0, 1])
        plt.ylabel('F1')
        plt.xlabel('Tokens')
        plt.xscale('log')
        pdf.savefig()
        plt.show()
        plt.close()

    # extrapolate from data: regress log10(words) on F1 for our model's
    # SemEval13 curve and predict the size needed for F1 = 0.75 and 0.8
    lr = LinearRegression()
    words, f1s = get_xy('SemEval13', 'Our LSTM (T: SemCor)')
    lr.fit(f1s.values.reshape([-1, 1]), np.log10(words.values.reshape([-1, 1])))
    print('Extrapolated data size (words):')
    print(lr.predict([[0.75], [0.8]]))
# split the Boston data (boston.data features, boston.target labels)
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=2)

# Adding polynomial features lets the linear model fit the data better,
# but as the degree grows it can easily overfit the training set.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
# NOTE(review): conventionally this would be poly.transform(X_test);
# for PolynomialFeatures the result is the same.
X_test_poly = poly.fit_transform(X_test)

# polynomial linear regression
model2 = LinearRegression(normalize=True)
model2.fit(X_train_poly, y_train)
mutilScore = model2.score(X_test_poly, y_test)
print(mutilScore)

# evaluate the fit on the test set via mean squared error
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# cross-validated predictions over the full dataset
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted))

# plot predicted vs. actual values
import matplotlib.pyplot as plt
plt.scatter(boston.target, predicted, color="y", marker="o")
def get_error(x, y):
    """Mean absolute error of a normalized linear fit evaluated on (x, y) itself."""
    fitted = LinearRegression(normalize=True).fit(x, y)
    return np.average(np.abs(y - fitted.predict(x)))
def get_cv_error(x_train, x_test, y_train, y_test):
    """Mean absolute error on the held-out split for a normalized linear fit."""
    fitted = LinearRegression(normalize=True).fit(x_train, y_train)
    return np.average(np.abs(y_test - fitted.predict(x_test)))
inp_prices = list() features = list() def get_inp_features(self): return self.inp_features def get_inp_prices(self): return self.inp_prices def get_features(self): return self.features def read(self): F, N = map(int, raw_input().split(' ')) for _ in range(N): inp_f = map(float, raw_input().strip().split()) self.inp_features.append(inp_f[:F:]) self.inp_prices.append(inp_f[F::]) questions = int(raw_input()) for _ in range(questions): self.features.append(map(float, raw_input().split())) reader = inp_reader() reader.read() inp_features = reader.get_inp_features() inp_prices = reader.get_inp_prices() features = reader.get_features() model = LinearRegression() model.fit(inp_features, inp_prices) prices=model.predict(features) for el in prices: print (el[0])