def runMain():
    """Train a LinearRegression on the Boston housing data and print scores.

    Fixes: Python-2 print statements converted to Python 3, and the
    `train_scor` typo corrected.

    Returns:
        str: an empty string (kept for backward compatibility with callers).
    """
    boston_data = datasets.load_boston()
    X = boston_data.data
    y = boston_data.target
    lr_model = LinearRegression()
    # random_state pins the shuffle so every run gets the same split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3., random_state=3)
    lr_model.fit(X_train, y_train)
    print('xtrain====', X_train)
    print('X_test====', X_test)
    print('y_train====', y_train)
    print('y_test====', y_test)
    # constructor parameters of the model
    params = lr_model.get_params()
    print('参数:')
    print(params)
    train_score = lr_model.score(X_train, y_train)  # was `train_scor`
    test_score = lr_model.score(X_test, y_test)
    print('训练集打分:')
    print(train_score)
    print('测试集打分:')
    print(test_score)
    return ''
class LinearRegressionModel:
    """Wrapper around scikit-learn's LinearRegression that transparently
    converts project TabularData arguments into plain arrays before
    delegating, and forwards unknown attribute lookups to the wrapped model.
    """

    def __init__(self, *args, **kwargs):
        # The actual sklearn LinearRegression instance being wrapped.
        self.model = LinearRegression(*args, **kwargs)

    def get_model(self):
        """Return the underlying sklearn LinearRegression object."""
        return self.model

    def fit(self, X, y, sample_weight=None):
        """Fit the model; X and y may be TabularData or array-like."""
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        if isinstance(y, TabularData):
            y = DataConversion.extract_y(y)
        self.model.fit(X, y, sample_weight)
        return self

    def get_params(self, deep=True):
        return self.model.get_params(deep)

    def predict(self, X):
        # Converting here (and not again in score) avoids converting
        # TabularData twice when predict() is called inside score().
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        return self.model.predict(X)

    def score(self, X, y, sample_weight=None):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        if isinstance(y, TabularData):
            y = DataConversion.extract_y(y)
        return self.model.score(X, y, sample_weight)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def __getattribute__(self, item):
        # Prefer attributes defined on this wrapper; anything the wrapper
        # does not define (coef_, intercept_, ...) is fetched from the
        # wrapped sklearn model.
        try:
            return super().__getattribute__(item)
        except AttributeError:
            # was a bare `except:`; narrowed to the only expected failure
            pass
        return getattr(self.model, item)
def search_bestparam_LinearRegression(X_train, y_train, df_search_best_param):
    """Run the shared hyper-parameter search for a LinearRegression model."""
    print(f"Search best params for LinearRegression ...")
    estimator = LinearRegression()
    print("Supported params", estimator.get_params())
    grid = {
        'normalize': [True, False],
        'fit_intercept': [True, False],
    }
    search_bestparam(estimator, grid, X_train, y_train, df_search_best_param)
def sklearn_mode():
    """Fit LinearRegression on Boston housing data and print its attributes."""
    dataset = datasets.load_boston()
    features, target = dataset.data, dataset.target
    reg = LinearRegression()
    reg.fit(features, target)
    print(reg.coef_)        # slope terms, e.g. the 0.1 in y = 0.1x + 0.3
    print(reg.intercept_)
    print(reg.get_params())
    print(reg.score(features, target))  # R^2 coefficient of determination
def main():
    """Fit a LinearRegression on synthetic, standardized regression data."""
    features, target = datasets.make_regression(100, 1, noise=5)
    features = preprocessing.scale(features)
    target = preprocessing.scale(target)
    print(features.shape, target.shape)
    feat_train, feat_test, targ_train, targ_test = train_test_split(
        features, target, test_size=0.3)
    regressor = LinearRegression(n_jobs=8)
    regressor.fit(feat_train, targ_train)
    print(regressor.coef_, regressor.intercept_)
    print(regressor.get_params())
    print(regressor.score(feat_test, targ_test))
class LinearRegression(Model):
    """Thin wrapper around an underlying LinearRegressionModel.

    X holds the features, Y holds the labels; `prediction`/`model` are
    kept as class-level declarations for backward compatibility.
    """

    X = None
    Y = None
    prediction = None
    model = None

    def __init__(self, X=None, Y=None, cfg=False):
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        self.model = LinearRegressionModel()
        self.cfg = cfg

    def fit(self, X=None, Y=None):
        """Train on the given (or previously stored) features and labels."""
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        print('Linear Regression Train started............')
        self.model.fit(self.X, self.Y)
        print('Linear Regression completed..........')
        return self.model

    def predict(self, test_features):
        """Return predictions for test_features (also stored on self)."""
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        print('Prediction completed..........')
        return self.predictions

    def save(self):
        """Persist the model's params to disk when cfg is truthy."""
        if self.cfg:
            # context manager guarantees the handle is closed (the original
            # used a bare open()/close() pair)
            with open('linearregression_configs.txt', 'w') as f:
                f.write(json.dumps(self.model.get_params()))
        # message fixed: it previously said "lasso" inside the linear
        # regression class
        print('No models will be saved for linear regression')

    def featureImportance(self):
        """Return the fitted coefficients as a feature-importance proxy."""
        return self.model.coef_
def run_regression(n_classes):
    """Fit a LinearRegression and report it as an n-class classifier."""
    X, y = get_scaled_data()
    y = clean_y_data(y, n_classes)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
    model = LinearRegression().fit(X_tr, y_tr)
    print(model.get_params())
    predictions = model.predict(X_te)
    # truncate the real-valued regression output to integer class labels
    for idx in range(len(predictions)):
        predictions[idx] = int(predictions[idx])
    print_model_info(y_te, predictions,
                     ("Run LinearRegression for", n_classes, "classes:"))
def train(stock, x_df, y_df):
    """Fit a LinearRegression for one stock and record score and params."""
    # flag any NaNs in the inputs
    print(x_df.isnull().values.any())
    print(y_df.isnull().values.any())
    features = x_df[:].values
    target = y_df[:].values
    model = LinearRegression().fit(features, target)
    entry = {
        'stock': stock,
        'score': model.score(features, target),
        'params': model.get_params(),
    }
    linear_regression_results.append(entry)
def train(stock, x_df, y_df):
    """Fit a LinearRegression for one stock on the first 180 rows, score it
    on the remainder, dump predictions to CSV, and append a result record
    to the module-level `linear_regression_results` list.
    """
    x = x_df.values
    y = y_df.values
    print(stock)
    tc = 180  # split point: first tc rows train, the rest test
    x_train = x[:tc]
    y_train = y[:tc]
    x_test = x[tc:]
    y_test = y[tc:]
    try:
        reg = LinearRegression().fit(x_train, y_train)
    except Exception as e:
        # BUG FIX: the original fell through and crashed on the unbound
        # `reg` below; abort this stock instead.
        print(e)
        return
    res = {}
    res['stock'] = stock
    res['score'] = reg.score(x_test, y_test)
    res['params'] = reg.get_params()
    # test prediction over the full series
    pY = reg.predict(x)
    print(pY.shape)
    py_df = pd.DataFrame(pY)
    pred_df = pd.concat([y_df, py_df], axis=1)
    pred_df.to_csv(ROOT + '/data/test_pred/' + stock[0:4] + '.csv')
    # custom score: squared-error total and RMSE over the whole series
    # (renamed from `sum` to avoid shadowing the builtin)
    squared_error_sum = 0
    for i, predY in enumerate(pY):
        diff = predY - y[i]
        squared_error_sum += (diff * diff)
    print(squared_error_sum)
    res['custom_training_error_sum'] = squared_error_sum
    res['custom_training_error'] = math.sqrt(squared_error_sum / len(pY))
    linear_regression_results.append(res)
class LinRegClassifierSKLearn(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Binary classifier built on top of sklearn's LinearRegression."""

    def __init__(self, *args, **kwargs):
        self.clf = LinearRegression(*args, **kwargs)
        self.threshold = 0.0

    def fit(self, X, y):
        # map {0, 1} labels onto {-1, +1} before regressing
        self.clf.fit(X, (2 * y) - 1)
        return self

    def predict(self, X):
        scores = self.clf.predict(X)
        labels = (2 * (scores > self.threshold)) - 1
        labels[labels == -1] = 0  # decode -1 back to class 0
        return labels

    def get_params(self, deep=True):
        return self.clf.get_params(deep=deep)
class LinearRegression():
    """Thin delegating wrapper around an external `LR` estimator."""

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=1):
        # pass by keyword so the wrapper does not depend on LR's
        # positional parameter order
        self.LR = LR(fit_intercept=fit_intercept, normalize=normalize,
                     copy_X=copy_X, n_jobs=n_jobs)

    def decision_function(self, x):
        return self.LR.decision_function(x)

    def fit(self, x, y):
        return self.LR.fit(x, y)

    def get_params(self, deep=True):
        # `deep` added with a default, so existing get_params() calls
        # keep working unchanged
        return self.LR.get_params(deep)

    def predict(self, x):
        return self.LR.predict(x)

    def set_params(self, **params):
        # BUG FIX: was `self.LR.set_params(params)`, which passed the dict
        # as a single positional argument instead of expanding it; also
        # propagate the underlying return value.
        return self.LR.set_params(**params)
class LinRegClassifierSKLearn(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Wraps sklearn LinearRegression so it behaves like a 0/1 classifier."""

    def __init__(self, *args, **kwargs):
        self.threshold = 0.0
        self.clf = LinearRegression(*args, **kwargs)

    def fit(self, X, y):
        shifted = (2 * y) - 1  # encode labels as -1 / +1
        self.clf.fit(X, shifted)
        return self

    def predict(self, X):
        raw = self.clf.predict(X)
        out = (2 * (raw > self.threshold)) - 1
        out[out == -1] = 0  # map -1 back to class 0
        return out

    def get_params(self, deep=True):
        return self.clf.get_params(deep=deep)
def RegressionModel(model, params, features, target, scoring, kFold):
    """Build and evaluate a regression model.

    Args:
        model: model name; only "linear regression" is supported.
        params: dict with 'fit_intercept', 'normalize' and 'copy_X'.
        features, target: training data.
        scoring: iterable of sklearn scoring names to report.
        kFold: 0 for a single train/test split, >= 3 for cross-validation.

    Returns:
        A results string (single split), the cross_validate dict (k-fold),
        or None when the model name or kFold value is unsupported.
    """
    if model == "linear regression":
        model = LinearRegression(fit_intercept=params['fit_intercept'],
                                 normalize=params['normalize'],
                                 copy_X=params['copy_X'])
        print('************************************************')
        print(model.get_params())
        print('************************************************')
    else:
        print('Sorry, we are still developing other regression methods.')
        # BUG FIX: execution previously continued with `model` still a
        # string and crashed on model.fit(); bail out explicitly instead.
        return None
    if kFold == 0:
        x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                            random_state=1)
        model.fit(x_train, y_train)
        model_train_pred = model.predict(x_train)
        model_test_pred = model.predict(x_test)
        results = str()
        if "neg_mean_absolute_error" in scoring:
            results = 'MAE train data: %.3f, MAE test data: %.3f' % (
                mean_absolute_error(y_train, model_train_pred),
                mean_absolute_error(y_test, model_test_pred))
        if "neg_mean_squared_error" in scoring:
            results = results + '\n' + 'MSE train data: %.3f, MSE test data: %.3f' % (
                mean_squared_error(y_train, model_train_pred),
                mean_squared_error(y_test, model_test_pred))
        if "neg_mean_squared_log_error" in scoring:
            results = results + '\n' + 'MSLE train data: %.3f, MSLE test data: %.3f' % (
                mean_squared_log_error(y_train, model_train_pred),
                mean_squared_log_error(y_test, model_test_pred))
        if "r2" in scoring:
            results = results + '\n' + 'R2 train data: %.3f, R2 test data: %.3f' % (
                r2_score(y_train, model_train_pred),
                r2_score(y_test, model_test_pred))
        return results
    elif kFold > 2:
        results = cross_validate(model, features, target, scoring=scoring,
                                 cv=kFold, error_score=np.nan)
        return results
    else:
        print("K-Fold has to be an integer (>=3) or 0 (No cross validation)")
        return None
def simple_linear(X_train, y_train, X_test, y_test):
    """Fit plain LinearRegression, print a summary, then cross-validate R2."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('\nLinear Regression Summary')
    print('R2:', model.score(X_test, y_test))
    print('Intercept:', model.intercept_, '\nCoefficients:', model.coef_)
    print('Parameters:', model.get_params())
    # Predict how well the model will perform on test data.
    # http://scikit-learn.org/stable/modules/model_evaluation.html
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    # scoring='wrong' to see valid scoring options
    # Common regression metrics: R2, RMSE, quantiles/MAPE, precision accuracy
    cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train,
                                fit_params=None, scoring='r2', cv=5, n_jobs=-1)
    print('Mean Cross Validation Score:', cv_scores.mean())
class LinRegClassifierSKLearn(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Linear-Regression-based classifier.

    Uses the Linear Regression matcher from scikit-learn under the hood
    and thresholds its continuous output to produce 0/1 labels.
    """

    def __init__(self, *args, **kwargs):
        # Underlying scikit-learn Linear Regression matcher.
        self.clf = LinearRegression(*args, **kwargs)
        # Decision threshold separating the two classes.
        self.threshold = 0.0

    def fit(self, X, y):
        """Fit on X with labels remapped from {0, 1} to {-1, +1}."""
        remapped = (2 * y) - 1
        self.clf.fit(X, remapped)
        return self

    def predict(self, X):
        """Predict 0/1 labels by thresholding the regression output."""
        continuous = self.clf.predict(X)
        signed = (2 * (continuous > self.threshold)) - 1
        signed[signed == -1] = 0
        return signed

    def get_params(self, deep=True):
        """Expose the wrapped matcher's params to scikit-learn tooling."""
        return self.clf.get_params(deep=deep)
def linear_regression(df, significant_cols, target, cat_cols, num_cols):
    """Cross-validate and hold-out-validate a LinearRegression pipeline.

    Returns:
        (r2, rmse, r2_variance, rmse_variance, r2_validation,
         rmse_validation, params)
    """
    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first', sparse=False)
    X = df[significant_cols]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0)
    # encode categoricals / scale numerics, fitting on the training fold only
    train_cat = encoder.fit_transform(X_train[cat_cols])
    train_num = scaler.fit_transform(X_train[num_cols])
    test_cat = encoder.transform(X_test[cat_cols])
    test_num = scaler.transform(X_test[num_cols])
    train_data = np.c_[train_cat, train_num]
    test_data = np.c_[test_cat, test_num]
    estimator = LinearRegression(n_jobs=-1)
    # 3-fold CV on the training fold for both metrics
    r2_cv_scores = cross_val_score(estimator, train_data, y_train,
                                   scoring='r2', cv=3, n_jobs=-1)
    rmse_cv_scores = cross_val_score(estimator, train_data, y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3, n_jobs=-1)
    r2 = np.mean(r2_cv_scores)
    rmse = np.abs(np.mean(rmse_cv_scores))
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.abs(np.var(rmse_cv_scores, ddof=1))
    # final fit + hold-out validation
    estimator.fit(train_data, y_train)
    y_pred = estimator.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))
    params = estimator.get_params()
    return (r2, rmse, r2_variance, rmse_variance,
            r2_validation, rmse_validation, params)
def sklearn_dateset_test():
    """Exercise sklearn's bundled datasets with a LinearRegression model."""
    # load the Boston housing data
    loaded = datasets.load_boston()
    X, y = loaded.data, loaded.target
    # default-parameter model; tune later based on prediction accuracy
    model = LinearRegression()
    model.fit(X, y)
    # predictions for the first four samples vs. the actual targets
    print("predict target:")
    print(model.predict(X[:4, :]))
    print("actual data:")
    print(y[:4])
    # fitted slope(s) and intercept
    print("model.coef_:\n", model.coef_)
    print("model.intercept_:\n", model.intercept_)
    # parameters the model was constructed with
    print("model.get_params:\n", model.get_params())
    # R^2 score: the closer to 1, the better the fitted regression
    print("model.score:\n", model.score(X, y))
    # synthetic data: 100 samples, 1 feature; larger noise -> more scatter
    X1, y1 = datasets.make_regression(n_samples=100, n_features=1,
                                      n_targets=1, noise=10)
    X2, y2 = datasets.make_regression(n_samples=100, n_features=1,
                                      n_targets=1, noise=50)
    plt.scatter(X1, y1)
    plt.scatter(X2, y2)
    plt.show()
    return 0
def linear_regression():
    """Fit LinearRegression on Boston data, print its attributes, and plot
    a synthetic regression dataset."""
    dataset = datasets.load_boston()
    X, y = dataset.data, dataset.target
    model = LinearRegression()
    model.fit(X, y)
    print(model.coef_)       # slope(s)
    print(model.intercept_)  # intercept
    print(model.get_params())
    print(model.predict(X[:4, :]))
    print(X[:4, :])
    xs, ys = datasets.make_regression(n_samples=100, n_features=1,
                                      n_targets=1, noise=1)
    plt.scatter(xs, ys)
    plt.show()
class LinearRegressionModel(object):
    """Pickle-able wrapper around sklearn's LinearRegression."""

    def __init__(self):
        self.name = 'Linear Regression'
        self.clf = LinearRegression()

    def get_params(self):
        return self.clf.get_params()

    def train(self, dataframe):
        """Fit on features/response extracted from the dataframe."""
        self.clf.fit(get_features(dataframe), get_response(dataframe))

    def predict(self, X):
        return self.clf.predict(X)

    def save(self, filename):
        """Serialize the fitted estimator to disk."""
        with open(filename, 'wb') as output_file:
            pickle.dump(self.clf, output_file, pickle.HIGHEST_PROTOCOL)

    def load(self, filename):
        """Restore a previously saved estimator from disk."""
        with open(filename, 'rb') as input_file:
            self.clf = pickle.load(input_file)
def train_model(xy):
    """Fit a 1-D linear model (column 0 -> column 1 of xy) and report
    regression metrics as a nested dict."""
    inputs = xy[:, 0].reshape(-1, 1)
    targets = xy[:, 1]
    model = LinearRegression()
    model.fit(inputs, targets)
    fitted = model.predict(inputs)
    coef = model.coef_
    intercept = model.intercept_
    params = model.get_params()
    print(params)
    r2 = sm.r2_score(targets, fitted)                    # R^2 score
    abs_err = sm.mean_absolute_error(targets, fitted)    # mean absolute error
    sq_err = sm.mean_squared_error(targets, fitted)      # mean squared error
    med_err = sm.median_absolute_error(targets, fitted)  # median absolute error
    drawing(xy, inputs, fitted)
    return {
        'linear_score': {
            'linear_r2': round(r2, 5),
            'linear_absolute': round(abs_err, 5),
            'linear_squared': round(sq_err, 5),
            'linear_median': round(med_err, 5)
        }
    }
train_x, train_y = data[:, 2:], data[:, 0] return train_x, train_y, data # ''' # X, y = make_regression(random_state=0) # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) train_x, train_y, data = generator_data(data) reg = LinearRegression() print(reg) # for i in range(10): reg.fit(train_x, train_y) print('the whole parameter of the model : ', reg.get_params()) # GradientBoostingRegressor(random_state=0) pre = reg.predict(train_x) # print('Predict regression target for x :', pre) # print(pre.shape) r = reg.score(train_x, train_y) print('Return the coefficient of determination R2 of the prediction : ', r) re_index(observed_v=train_y, predicted_v=pre) print(keys[7:]) print(reg.coef_) feature_importance = reg.coef_ # feature_importance=(feature_importance/feature_importance.max())
print features.shape values = np.empty(features.shape[0], float) for i, res in enumerate(result.Result.all()): if i % 100 == 0: print i, if i >= num_results: break for j, param in enumerate(param_features): features[i, j] = getattr(res.spec, param) #values[i] = analysis.horizontal_surface_area.compute_by_result(res, flush=False) values[i] = analysis.horizontal_surface_area.func(res) print regression = LinearRegression() regression.fit(features, values, n_jobs=-1) print 'Features', param_features print 'Coeff', regression.coef_ print 'Intercept', regression.intercept_ print 'Params', regression.get_params() print 'R2', regression.score(features, values) elif arg == 'dump': from biofilm import util path = util.results_h5_path(sys.argv[2]) util.set_h5(path) import numpy as np from biofilm.model import spec, result, analysis param_features = [] for param, r in param_ranges.iteritems(): if isinstance(r, tuple): param_features.append(param) with open(path + '.dump', 'w') as dump:
# Fit a linear regression on polynomial-expanded features.
# NOTE(review): X_poly / yTrain / poly_reg / XTest / yTest come from earlier,
# unseen code — presumably a PolynomialFeatures expansion of the training set;
# confirm against the surrounding file.
regressor = LinearRegression()
regressor.fit(X_poly, yTrain)
print('dvj')  # leftover debug output — consider removing

# Calculate errors on the held-out set.
# NOTE(review): fit_transform on the TEST data refits the polynomial expander;
# transform() alone is the usual pattern — confirm this is intended.
XTest_poly = poly_reg.fit_transform(XTest)
yTestPredict = regressor.predict(XTest_poly)
mse = mean_squared_error(yTest, yTestPredict, squared=True)
rmse = mean_squared_error(yTest, yTestPredict, squared=False)  # root of MSE
mae = mean_absolute_error(yTest, yTestPredict)
mape = mean_absolute_percentage_error(yTest, yTestPredict)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
print("The root Mean Square Error (RMSE) on test set: {:.4f}".format(rmse))
print("The mean absolute error on test set: {:.4f}".format(mae))
print("The mean absolute percentage error on test set: {:.4f}".format(mape))
print(regressor.get_params(deep=True))

# Prediction part: build one sample from fixed concurrency/resource settings.
Order_API_Concurrency = 5
Carts_API_Concurrency = 5
Order_Cores = 0.2
Order_DB_Cores = 0.2
Carts_Cores = 0.2
Carts_DB_Cores = 0.2
new_X = [
    Order_API_Concurrency, Carts_API_Concurrency, Order_Cores, Order_DB_Cores,
    Carts_Cores, Carts_DB_Cores
]
print()
print('X value ', new_X)
from sklearn.linear_model import LinearRegression # 通用的学习模式 loaded_data = datasets.load_boston() # 加载房价的数据库 data_X = loaded_data.data data_y = loaded_data.target model = LinearRegression() # 调用线性回归模式 model.fit(data_X, data_y) # 训练 print(model.predict(data_X[:4, :])) # 测试 print(data_y[:4]) print(model.coef_) # 斜率,即输入特征的各比重 print(model.intercept_) # 截距 print(model.get_params()) # 返回model定义时的参数 # {'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': False} print(model.score(data_X, data_y)) # 将数据及结果传入,给线性模型打分,准确度 import matplotlib.pyplot as plt # 生成数据集X,对应的线性结果集y X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10) print(X[:5, :]) plt.scatter(X, y) plt.show() from sklearn import preprocessing
#!/usr/bin/python3 # coding: utf-8 from sklearn import datasets from sklearn.linear_model import LinearRegression ################################################################## ## 加载数据 loaded_data = datasets.load_boston(); print(loaded_data) data_X = loaded_data.data data_y = loaded_data.target ################################################################## ## 加载 Model model = LinearRegression() model.fit(data_X, data_y) ################################################################## ## Model attribute && method # 下面的要在 Model fit() 结束以后执行 print(model.predict(data_X[:4, :])) print(model.coef_) # 系数, 很多; y = 0.1x + 0.3 中的 0.1 print(model.intercept_) # 常数; 和 y 轴的交点 print(model.get_params()) # LinearRegression() 定义的参数 print(model.score(data_X, data_y)) # R^2 coefficient of determination; 使用参数进行打分
# %% [markdown] # # Load Data: # %% X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0) # %% [markdown] # # Linear Regression: # %% model_linreg = LinearRegression() print_dict(model_linreg.get_params(), 'LinearRegressor params:') model_linreg.fit(X_train, y_train) y_predict_linreg = model_linreg.predict(X_test) y_predict_linreg = np.round(y_predict_linreg).astype( int) # Regressor -> classifier! error_rate_linreg = test(y_predict_linreg, y_test) print(f'Linear Regressor score: {model_linreg.score(X_test, y_test):.3g}') # %% [markdown] # # Naive Bayesian: # %% model_bayes = CategoricalNB() print_dict(model_bayes.get_params(), 'CategoricalNB params:') model_bayes.fit(X_train, y_train) y_predict_bayes = model_bayes.predict(X_test) error_rate_bayes = test(y_predict_bayes, y_test)
# Target is the last column of the training dataframe.
Y_Target = df.iloc[:, -1]
# Standardize features using statistics fitted on the training data.
padronizacao = StandardScaler().fit(X_Data)
X_p = padronizacao.transform(X_Data)

d_Test = pd.read_csv('dados/test.csv')
# Exam score columns used as features.
colunas = ('NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO')
#colunas = ('NU_NOTA_CN','NU_NOTA_LC')
df_test = d_Test.loc[:, colunas]
# Sentinel fill for missing scores.
# NOTE(review): -236 looks arbitrary — confirm where this value comes from.
df_test.update(df_test.fillna(-236))
X_Data_Test = df_test
X_r = padronizacao.transform(X_Data_Test)

ols = LinearRegression()
print(ols.get_params().keys())
ols_params = {'fit_intercept': [True, False], 'normalize': [True, False]}
X_train, X_test, Y_train, Y_test = train_test_split(X_p, Y_Target,
                                                    test_size=0.25,
                                                    random_state=5)
# NOTE(review): the model is fitted on ALL of X_p, so the score on X_test
# below is not a true hold-out evaluation — confirm whether this is intended.
ols.fit(X_p, Y_Target)
pred_train = ols.predict(X_train)
pred_test = ols.predict(X_test)
final = ols.predict(X_r)
print(ols.score(X_test, Y_test))
lista = final
x = 0
#########################
# Create some simple data
# import pandas as pd
# df = pd.read_csv('09-regression-test.csv')
import numpy as np

X = [[50, 80], [80, 65], [60, 60], [95, 80], [95, 50], [40, 90]]  # Features
y = [65, 83, 69, 92, 84, 55]

# Fit a linear regression to it
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
model.fit(X, y)

# Report results (Python-2 print statements converted to Python 3)
print('intercept_ :', model.intercept_)
print('coef_ :', model.coef_)
print('get_params :', model.get_params())

# Model the prediction
predict_data = [[52, 81], [81, 66], [60, 62], [0, 8]]
y_hat = model.predict(predict_data)

# Plot the data
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(X, y, 'o')
ax.plot(predict_data, y_hat)
plt.show()
# Notebook-style exploration: inspect fitted models, then plot diagnostics.
# NOTE(review): lm, lm_multi1, lm_multi2, X1, X2, y and df come from earlier,
# unseen cells.
lm_multi2.coef_
lm_multi1.score(X1, y)
lm_multi2.score(X2, y)
sns.regplot(x='highway-mpg', y='price', data=df)
sns.regplot(x='peak-rpm', y='price', data=df)
df[['highway-mpg', 'peak-rpm', 'price']].corr()
sns.residplot(x=df['highway-mpg'], y=df['price'], lowess=True)
lm.fit(X1, y)
dir(lm)
# NOTE(review): _decision_function and _get_tags are private sklearn APIs and
# may disappear between versions — prefer predict()/get_params().
lm._decision_function(X1)
lm.get_params(True)
lm._get_tags()
y_hat = lm.predict(X1)


def PlotPolly(model, independent_variable, dependent_variabble, Name):
    """Plot the polynomial `model` evaluated on a fine grid over the data.

    NOTE(review): the function body appears to continue beyond this chunk.
    """
    x_new = np.linspace(15, 55, 100)
    y_new = model(x_new)
    plt.plot(independent_variable, dependent_variabble, '.', x_new, y_new, '-')
    plt.title('Polynomial Fit with Matplotlib for Price ~ Length')
    ax = plt.gca()
    ax.set_facecolor((0.898, 0.898, 0.898))
    fig = plt.gcf()
    plt.xlabel(Name)
    plt.ylabel('Price of Cars')
#-------------------------------------------- # 交差検証:テスト実施 #-------------------------------------------- z = 0 # 訓練で一番良かったものをセット y_pred = base_model[z].predict(X_test) mse = mean_squared_error(y_test, y_pred) mae = mean_absolute_error(y_test, y_pred) print( "**** Test set score( {} ): MSE={:.3f} RMSE={:.3f} MAE={:.3f} Score={:.3f} ****" .format(z, round(mse, 3), round(np.sqrt(mse), 3), round(mae, 3), regr.score(X_test, y_test))) print('Parameters currently in use:') from pprint import pprint pprint(regr.get_params()) # 過学習気味?なのでチューニングしてみる # # 5. RandomForest(ランダムフォレスト) # # * n_estimators =フォレスト内の樹木の数 # * max_features =ノードの分割に考慮されるフィーチャの最大数 # * max_depth =各決定木のレベルの最大数 # * min_samples_split =ノードが分割される前にノードに配置されたデータポイントの最小数 # * min_samples_leaf =リーフノードで許容されるデータポイントの最小数 # * bootstrap =データポイントをサンプリングする方法(置換の有無にかかわらず) # In[26]: # データをリセット
def scikit_tutorial():
    """scikit-learn walkthrough: preprocessing, model fitting, cross-validation.

    Covers (1) dataset preparation and feature scaling, (2) fitting a
    LinearRegression on the Boston housing data, and (3) K-nearest-neighbour
    classification with K chosen by 10-fold cross-validation.
    """
    # 1. Prepare a toy dataset.
    X = np.random.randint(0, 100, (10, 4))
    y = np.random.randint(0, 3, 10)
    y.sort()
    print('样本:')
    print(X)
    print('标签:', y)
    # Split into train/test sets; random_state seeds the shuffle so every
    # run produces the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3., random_state=7)
    print('训练集:')
    print(X_train)
    print(y_train)
    print('测试集:')
    print(X_test)
    print(y_test)
    # Feature scaling.
    x1 = np.random.randint(0, 1000, 5).reshape(5, 1)
    x2 = np.random.randint(0, 10, 5).reshape(5, 1)
    x3 = np.random.randint(0, 100000, 5).reshape(5, 1)
    print(x1)
    print(np.random.randint(0, 1000, (5, 1)))
    X = np.concatenate([x1, x2, x3], axis=1)
    print(X)
    print(preprocessing.scale(X))
    # Generate classification data to demonstrate why scaling matters.
    X, y = make_classification(n_samples=300, n_features=2, n_redundant=0,
                               n_informative=2, random_state=25,
                               n_clusters_per_class=1, scale=100)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.show()
    # Comment out the next line to skip feature scaling.
    # X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3., random_state=7)
    svm_classifier = svm.SVC()
    svm_classifier.fit(X_train, y_train)
    svm_classifier.score(X_test, y_test)
    # 2. Train a regression model.
    boston_data = datasets.load_boston()
    X = boston_data.data
    y = boston_data.target
    print('样本:')
    print(X[:5, :])
    print('标签:')
    print(y[:5])
    # Choose a linear regression model.
    lr_model = LinearRegression()
    # BUG FIX: train_test_split returns (X_train, X_test, y_train, y_test);
    # the original unpacked it as (X_train, y_train, X_test, y_test), so the
    # model was fitted on mismatched data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3, random_state=7)
    lr_model.fit(X_train, y_train)
    # Model parameters and train/test R^2 scores.
    lr_model.get_params()
    lr_model.score(X_train, y_train)
    lr_model.score(X_test, y_test)
    # 3. Cross-validation with a K-nearest-neighbour classifier.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3., random_state=10)
    k_range = range(1, 31)
    cv_scores = []
    for n in k_range:
        knn = KNeighborsClassifier(n)
        # accuracy scoring for classification problems
        scores = cross_val_score(knn, X_train, y_train, cv=10,
                                 scoring='accuracy')
        # for regression use e.g.:
        # scores = cross_val_score(knn, X_train, y_train, cv=10,
        #                          scoring='neg_mean_squared_error')
        cv_scores.append(scores.mean())
    plt.plot(k_range, cv_scores)
    plt.xlabel('K')
    plt.ylabel('Accuracy')
    plt.show()
    # Pick the best K.
    best_knn = KNeighborsClassifier(n_neighbors=5)
    best_knn.fit(X_train, y_train)
    print(best_knn.score(X_test, y_test))
    print(best_knn.predict(X_test))
def linear_model(x, attr, xvars, fit_intercept=None, name=None, cut=None,
                 residuals=True, quiet=True, model='LinearRegression'):
    """Make a linear model for attr based on xvars as free parameters.

    Currently only model='LinearRegression' implemented.  Uses scikit-learn.

    residuals: Name of attribute for residuals (default: attr+"_residuals")
    """
    # BUG FIX: `is not` compared string *identity*; use != for value equality.
    if model != 'LinearRegression':
        raise Exception("Currently only model='LinearRegression' implmented.")
    from sklearn.linear_model import LinearRegression
    import pandas as pd
    import numpy as np
    import xarray as xr
    lm = LinearRegression()
    if not name:
        name = '{:}_model'.format(attr)
    if not quiet:
        # Python-2 print statements converted to Python 3 throughout.
        print('\nUsing scikit-learn LinearRegression to build model for {:} from variables:\n {:}'.format(attr, str(xvars)))
    allattrs = xvars + [attr]
    if cut:
        allattrs += [cut]
    # keep only rows where the target attribute is finite
    xx = x.reset_coords()[allattrs].where(
        np.isfinite(x.reset_coords()[attr]), drop=True)
    df_xvars0 = xx[xvars].to_dataframe()
    if cut:
        # restrict the training data to rows passing the cut
        df_xvars = xx[xvars].where(xx[cut] == 1, drop=True).to_dataframe()
        xdata = xx[attr].where(xx[cut] == 1, drop=True).data
        if not quiet:
            print('\nUsing following cut in buildiing model')
            print(xx[cut])
    else:
        df_xvars = df_xvars0
        xdata = xx[attr].data
    if fit_intercept is not None:
        lm.fit_intercept = fit_intercept
    lm.fit(df_xvars, xdata)
    # predict over ALL rows (not just the cut) and attach as a new DataArray
    x[name] = xr.DataArray(lm.predict(df_xvars0),
                           coords=[('time', df_xvars0.index)])
    x[name].attrs.update(**lm.get_params())
    x[name].attrs['unit'] = x[attr].attrs.get('unit', '')
    x[name].attrs['doc'] = 'LinearRegression scikit-learn model for {:} training data'.format(attr)
    x[name].attrs['model'] = model
    x[name].attrs['variables'] = xvars
    x[name].attrs['coef_'] = lm.coef_
    x[name].attrs['intercept_'] = lm.intercept_
    x[name].attrs['score'] = lm.score(df_xvars, xdata)
    if not quiet:
        print('\n****Model Results****')
        print(x.reset_coords()[name])
    if residuals:
        # a truthy non-string value requests the default residuals name
        if not isinstance(residuals, str):
            residuals = '{:}_residuals'.format(attr)
        x[residuals] = (['time'], x[attr] - x[name])
        x[residuals].attrs['doc'] = 'Residuals for {:} based on LinearRegression model {:}'.format(attr, name)
        if not quiet:
            print('\n****Model Residuals****')
            print(x.reset_coords()[residuals])
    return x
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Linear regression on a bundled sklearn dataset.
loaded_data = datasets.load_boston()
data_X = loaded_data.data
data_y = loaded_data.target

model = LinearRegression()
model.fit(data_X, data_y)

print(model.predict(data_X[:4, :]))
print(data_y[:4])

# Fitted parameters.
print(model.coef_)  # if y = 0.1x + 0.3 this line prints the 0.1
print(model.intercept_)  # this line prints the 0.3
print(model.get_params())  # params the model was defined with, else defaults
print(model.score(
    data_X, data_y))  # score the trained model; LinearRegression uses the
                      # R^2 coefficient of determination
# Build a time-trend + month-seasonality design matrix and fit a linear model.
time_series['trend'] = range(time_series.shape[0])
time_series['month'] = time_series['month'].astype('category')

# Drop identifier/target columns before one-hot encoding the rest.
X = time_series.drop(['week', 'year', 'date', 'total_sales'], axis=1)
names = pd.get_dummies(X).columns
X = pd.get_dummies(X).values
y = time_series.total_sales.values

model = LinearRegression()
model.fit(X, y)
model.get_params()
model.coef_
dict1 = list(zip(names, model.coef_))  # (feature, coefficient) pairs
prediction = model.predict(X)
time_series['prediction'] = prediction

import matplotlib.pyplot as plt

plt.plot(time_series.date, time_series.total_sales, label='Actual')
plt.plot(time_series.date, time_series.prediction, label='prediction')
# BUG FIX: 'upperleft' is not a valid matplotlib location string (it raises
# ValueError on recent matplotlib); the valid spelling is 'upper left'.
plt.legend(loc='upper left')
plt.show()

#####forecasting
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the Boston housing data.
loaded_data = datasets.load_boston()
data_X = loaded_data.data
data_Y = loaded_data.target

model = LinearRegression()
model.fit(data_X, data_Y)

# Predictions for the first four samples vs. the actual targets.
print(model.predict(data_X[:4, :]))
print(data_Y[:4])

print(model.coef_)  # slope(s), e.g. the 0.1 in y = 0.1x + 0.3
print(model.intercept_)
print(model.get_params())
# R^2 Coefficient of Determination: how well predictions match the target.
print(model.score(data_X, data_Y))

# X, Y = datasets.make_regression(
#     n_samples=100, n_features=1, n_targets=1, noise=0.001)
# plt.scatter(X, Y)
# plt.show()
from sklearn import datasets
from sklearn.linear_model import LinearRegression

# Load the Boston housing data.
loaded_data = datasets.load_boston()
data_X = loaded_data.data
data_y = loaded_data.target

model = LinearRegression()
model.fit(data_X, data_y)

print(model.coef_)  # for y = ax + b this prints a
print(model.intercept_)  # prints b
print(model.get_params())
print(model.score(data_X, data_y))  # R^2 coefficient of determination
class LinRegClassifierSKLearn(BaseEstimator, ClassifierMixin, TransformerMixin):
    """
    Linear Regression classifier.

    Wraps the Linear Regression matcher from scikit-learn and thresholds its
    continuous output so it behaves like a binary (0/1) classifier.
    """

    def __init__(self, *args, **kwargs):
        # Underlying scikit-learn Linear Regression matcher.
        self.clf = LinearRegression(*args, **kwargs)
        # Decision threshold separating the two classes.
        self.threshold = 0.0
        # Class labels exposed to scikit-learn tooling.
        self.classes_ = np.array([0, 1], np.int64)

    def fit(self, X, y):
        """Fit after remapping {0, 1} labels to {-1, +1}."""
        remapped = (2 * y) - 1
        self.clf.fit(X, remapped)
        return self

    def predict(self, X):
        """Threshold the regression output and return 0/1 labels."""
        raw = self.clf.predict(X)
        labels = (2 * (raw > self.threshold)) - 1
        labels[labels == -1] = 0
        return labels

    def predict_proba(self, X):
        """Return degenerate 0/1 "probabilities" derived from predict().

        Linear Regression has no probability model, so each sample gets
        probability 1 for its predicted class and 0 for the other.
        """
        logger.warning('There is no proba function defined for Linear Regression '
                       'Matcher in scikit learn. So we return the probs as 1')
        labels = self.predict(X)
        probs = np.ndarray(shape=[len(labels), 2])
        for row in range(len(labels)):
            if labels[row] == 1:
                probs[row][0] = 0
                probs[row][1] = 1
            elif labels[row] == 0:
                probs[row][0] = 1
                probs[row][1] = 0
        return probs

    def get_params(self, deep=True):
        """Delegate to the wrapped matcher for scikit-learn compatibility."""
        return self.clf.get_params(deep=deep)