def _compare_with_lasso(self, lasso_X, lasso_y, wlasso_X, wlasso_y, sample_weight,
                        alpha_range=None, params=None):
    """Fit a plain Lasso and a WeightedLasso on the given data and assert that
    their coefficients (and intercepts, when fitted) agree.

    Parameters
    ----------
    lasso_X, lasso_y : training data for the unweighted Lasso
    wlasso_X, wlasso_y : training data for the WeightedLasso
    sample_weight : weights passed to WeightedLasso.fit
    alpha_range : iterable of alphas to test (default [0.01])
    params : extra estimator params applied to both models (default {})
    """
    # Avoid mutable default arguments (shared across calls); the effective
    # defaults are unchanged from the original signature.
    if alpha_range is None:
        alpha_range = [0.01]
    if params is None:
        params = {}
    for alpha in alpha_range:
        lasso = Lasso(alpha=alpha)
        lasso.set_params(**params)
        lasso.fit(lasso_X, lasso_y)
        wlasso = WeightedLasso(alpha=alpha)
        wlasso.set_params(**params)
        wlasso.fit(wlasso_X, wlasso_y, sample_weight=sample_weight)
        # Check results are similar with tolerance 1e-6
        if np.ndim(lasso_y) > 1:
            # Multi-output target: compare per-output coefficient rows.
            for i in range(lasso_y.shape[1]):
                np.testing.assert_allclose(lasso.coef_[i], wlasso.coef_[i])
                if lasso.get_params()["fit_intercept"]:
                    self.assertAlmostEqual(lasso.intercept_[i], wlasso.intercept_[i])
        else:
            np.testing.assert_allclose(lasso.coef_, wlasso.coef_)
            self.assertAlmostEqual(lasso.intercept_, wlasso.intercept_)
def lasso_model(xy):
    """Fit a Lasso model (alpha chosen by 5-fold grid search), plot it, and
    return a dict of regression metrics.

    Parameters
    ----------
    xy : 2-column array; column 0 is the feature, column 1 the target.
    """
    x = xy[:, 0].reshape(-1, 1)
    y = xy[:, 1]
    model = Lasso()
    # BUG FIX: the original searched np.linspace(-1, 10, 30); Lasso requires a
    # non-negative alpha, so negative candidates are invalid and rejected by
    # scikit-learn. Search a strictly positive range instead.
    alpha_can = np.linspace(0.001, 10, 30)
    model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    model.fit(x, y)
    print(model.best_params_)
    pred_y = model.predict(x)
    params = model.get_params()
    print(params)
    # Metrics are computed on the training data itself (no held-out set).
    lasso_r2 = sm.r2_score(y, pred_y)
    lasso_absolute = sm.mean_absolute_error(y, pred_y)
    lasso_squared = sm.mean_squared_error(y, pred_y)
    lasso_median = sm.median_absolute_error(y, pred_y)
    drawing_lasso(xy, x, pred_y, model.best_params_)
    return {
        'lasso_score': {
            'lasso_r2': round(lasso_r2, 5),
            'lasso_absolute': round(lasso_absolute, 5),
            'lasso_squared': round(lasso_squared, 5),
            'lasso_median': round(lasso_median, 5)
        }
    }
def generate_regression_team(self):
    """Fit a Lasso regressor on the prepared team training data and persist
    the model's and encoder's hyper-parameters as JSON files."""
    l = Lasso()
    training_set = self.load_training_set()
    train, test, encoder = self.prepare_team_data(training_set, .999)
    l.fit(train.X, train.y)
    # Use context managers so the file handles are closed even on error.
    with open("resources/team_predict.json", 'w+') as fp:
        json.dump(l.get_params(), fp)
    # BUG FIX: the original wrote to the misspelled path 'resoures/encoder.json',
    # inconsistent with the 'resources/' directory used on the line above.
    with open('resources/encoder.json', 'w+') as fp:
        json.dump(encoder.get_params(), fp)
def main():
    """Load the NARAC SNP dataset and compare several classifiers on
    predicting the Affection label."""
    np.set_printoptions(suppress=True)
    narac_file_path = "../../tigress/arburton/plink_data/narac_rf"
    csv_data = []
    # Read in chunks to bound peak memory on the large genotype table.
    for chunk in pd.read_csv(narac_file_path, delim_whitespace=True,
                             index_col=0, chunksize=20000):
        csv_data.append(chunk)
    samples = pd.concat(csv_data, axis=0)
    del csv_data
    # Pull out affection column as y.
    # BUG FIX: `columns` must be a list of labels; the original passed the bare
    # string "Affection", which pandas does not treat as a single label here.
    affection = pd.DataFrame(samples, columns=["Affection"])
    samples = samples.drop([
        "Affection", "Sex", "DRB1_1", "DRB1_2", "SENum", "SEStatus", "AntiCCP", "RFUW"
    ], axis=1)
    # BUG FIX: get_dummies expects column labels; the original passed a boolean
    # mask (samples.columns != "ID"), which would be looked up as the literal
    # labels True/False. Select the label array instead.
    samples = pd.get_dummies(samples, columns=samples.columns[samples.columns != "ID"])
    sample_train, sample_test, affection_train, affection_test = train_test_split(
        samples, affection, test_size=0.8)
    # TODO: potentially make sample weights percentage of non ?? SNPs
    # RANDOM FOREST CLASSIFIER
    rf = RandomForestClassifier(n_estimators=5000, max_features=40, n_jobs=2)
    rf.fit(sample_train, affection_train)
    print("Random forest accuracy: {}".format(
        rf.score(sample_test, affection_test)))
    print("Random forest feature importances:")
    print(rf.feature_importances_)
    print("Random forest parameters:")
    print(rf.get_params())
    # LASSO CLASSIFIER
    # NOTE(review): Lasso is a regressor; .score here is R^2, not accuracy —
    # the printed label is misleading. Confirm whether a classifier was meant.
    lasso = Lasso()
    lasso.fit(sample_train, affection_train)
    print("LASSO accuracy: {}".format(lasso.score(sample_test, affection_test)))
    print("LASSO parameters:")
    print(lasso.get_params())
    # LOG REGRESSION
    log_reg = LogisticRegression(n_jobs=2)
    log_reg.fit(sample_train, affection_train)
    print("Log regression accuracy: {}".format(
        log_reg.score(sample_test, affection_test)))
    print("Log regression parameters:")
    print(log_reg.get_params())
    # NEURAL NETS
    mlp_classifier = MLPClassifier()
    mlp_classifier.fit(sample_train, affection_train)
    print("MLP Classifier accuracy: {}".format(
        mlp_classifier.score(sample_test, affection_test)))
    print("MLP Classifier parameters:")
    print(mlp_classifier.get_params())
def lassoDict(currentX, currentY, eps, lam, currentColumns, colWorth):
    """Fit Lasso(alpha=lam) on (currentX, currentY), accumulate each column's
    absolute coefficient into colWorth (mutated in place), and return the list
    of column names whose absolute coefficient is below eps.

    Parameters
    ----------
    currentX, currentY : training features / target
    eps : threshold below which a coefficient marks its column irrelevant
    lam : Lasso regularization strength (alpha)
    currentColumns : column names aligned with currentX's columns
    colWorth : dict mapping column name -> accumulated |coefficient|
    """
    irrelevant = []
    model = Lasso(alpha=lam, fit_intercept=True)
    model.fit(currentX, currentY)
    # (Removed an unused `params = model.get_params()` local from the original.)
    print(model.coef_.sum())
    for i in range(model.coef_.shape[0]):
        colWorth[currentColumns[i]] += np.abs(model.coef_[i])
        if np.abs(model.coef_[i]) < eps:
            irrelevant.append(currentColumns[i])
    return irrelevant
def test_parameters(self):
    """
    Testing parameters of Model class.

    For each supported algorithm, build a Model with explicit parameters and a
    reference scikit-learn estimator, then verify every parameter *name*
    exposed by the wrapped model is a valid parameter of the reference.

    NOTE(review): only parameter names are compared, never values — the
    reference estimators are deliberately built with different values (e.g.
    scale="svd" vs False). Confirm value-level comparison is not intended.
    """
    #1.) PLS regression
    pls_parameters = {"n_components": 20, "scale": False, "max_iter": 200}
    model = Model(algorithm="PlsRegression", parameters=pls_parameters)
    pls_model = PLSRegression(n_components=20, scale="svd", max_iter=200)
    # Iterate keys only: the values were unpacked but never used (PERF102).
    for k in model.model.get_params():
        self.assertIn(k, list(pls_model.get_params()))
    #2.) Random forest
    rf_parameters = {"n_estimators": 200, "max_depth": 50, "min_samples_split": 10}
    model = Model(algorithm="RandomForest", parameters=rf_parameters)
    rf_model = RandomForestRegressor(n_estimators=200, max_depth=50, min_samples_split=10)
    for k in model.model.get_params():
        self.assertIn(k, list(rf_model.get_params()))
    #3.) K nearest neighbours
    knn_parameters = {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"}
    model = Model(algorithm="KNN", parameters=knn_parameters)
    knn_model = KNeighborsRegressor(n_neighbors=10, weights='distance', algorithm="kd_tree")
    for k in model.model.get_params():
        self.assertIn(k, list(knn_model.get_params()))
    #4.) Support vector regression
    svr_parameters = {"kernel": "poly", "degree": 5, "coef0": 1}
    model = Model(algorithm="SVR", parameters=svr_parameters)
    svr_model = SVR(kernel='poly', degree=5, coef0=1)
    for k in model.model.get_params():
        self.assertIn(k, list(svr_model.get_params()))
    #5.) AdaBoost
    ada_parameters = {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"}
    model = Model(algorithm="AdaBoost", parameters=ada_parameters)
    ada_model = AdaBoostRegressor(n_estimators=150, learning_rate=1.2, loss="square")
    for k in model.model.get_params():
        self.assertIn(k, list(ada_model.get_params()))
    #6.) Bagging
    bagging_parameters = {"n_estimators": 50, "max_samples": 1.5, "max_features": 2}
    model = Model(algorithm="Bagging", parameters=bagging_parameters)
    bagging_model = BaggingRegressor(n_estimators=50, max_samples=1.5, max_features="square")
    for k in model.model.get_params():
        self.assertIn(k, list(bagging_model.get_params()))
    #7.) Lasso
    lasso_parameters = {"alpha": 1.5, "max_iter": 500, "tol": 0.004}
    model = Model(algorithm="lasso", parameters=lasso_parameters)
    lasso_model = Lasso(alpha=1.5, max_iter=500, tol=0.004)
    for k in model.model.get_params():
        self.assertIn(k, list(lasso_model.get_params()))
class Lasso(Model):
    """Wrapper around a Lasso regression estimator supporting both regression
    and (rounded-prediction) classification workflows, with helpers for label
    encoding, metrics, and diagnostic plots."""

    # X represents the features, Y represents the labels
    X = None
    Y = None
    prediction = None
    model = None

    def __init__(self, X=None, Y=None, label_headers=None, alpha=1, type='regressor', cfg=False):
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        self.type = type
        self.cfg = cfg
        # Cached string-label -> integer mapping built by map_str_to_number.
        self.mapping_dict = None
        self.label_headers = label_headers
        self.model = LassoRegression(alpha=alpha)

    def fit(self, X=None, Y=None):
        """Fit the underlying model; string labels are encoded first when
        running in classifier mode. Returns the fitted estimator."""
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y
        if self.type == 'classifier':
            self.Y = self.map_str_to_number(self.Y)
        print('Lasso Train started............')
        self.model.fit(self.X, self.Y)
        print('Lasso completed..........')
        return self.model

    def predict(self, test_features):
        """Predict on test_features; classifier mode rounds to label codes."""
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        if self.type == 'classifier':
            # BUG FIX: the original did `predictions = predictions.round()`,
            # reading an unbound local (NameError) and discarding the result.
            self.predictions = self.predictions.round()
        print('Prediction completed..........')
        return self.predictions

    def save(self):
        """Persist hyper-parameters when cfg is set; model weights are not saved."""
        if self.cfg:
            # Context manager ensures the handle is closed (original leaked it).
            with open('lasso_configs.txt', 'w') as f:
                f.write(json.dumps(self.model.get_params()))
        print('No models will be saved for lasso')

    def featureImportance(self):
        # For a linear model, the coefficients act as feature importances.
        return self.model.coef_

    def map_str_to_number(self, Y):
        """Encode string labels in each label column to integer codes,
        caching the mapping on the instance for reuse."""
        mapping_flag = False
        if self.mapping_dict is not None:
            for label_header in self.label_headers:
                Y[label_header] = Y[label_header].map(self.mapping_dict)
            return Y
        mapping_dict = None
        for label_header in self.label_headers:
            check_list = pd.Series(Y[label_header])
            for item in check_list:
                if type(item) == str:
                    mapping_flag = True
                    break
            if mapping_flag:
                classes = Y[label_header].unique()
                mapping_dict = {}
                index = 0
                for c in classes:
                    mapping_dict[c] = index
                    index += 1
                Y[label_header] = Y[label_header].map(mapping_dict)
                mapping_flag = False
        self.mapping_dict = mapping_dict
        return Y

    def map_number_to_str(self, Y, classes):
        """Decode integer codes in Y back to their class labels."""
        Y = Y.round()
        Y = Y.astype(int)
        if self.mapping_dict is not None:
            mapping_dict = self.mapping_dict
        else:
            # BUG FIX: the original built {index: class} here and then inverted
            # it below, producing a {class: index} map applied to integer codes
            # (all lookups missed). Build {class: index} so the inversion gives
            # the intended {index: class}, matching the cached-mapping branch.
            mapping_dict = {}
            index = 0
            for c in classes:
                mapping_dict[c] = index
                index += 1
        inv_map = {v: k for k, v in mapping_dict.items()}
        return Y.map(inv_map)

    def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8):
        """Classifier: exact-match accuracy. Regressor: fraction of predictions
        within the hitmissr relative tolerance of the truth."""
        if self.type == 'classifier':
            correct = 0
            df = pd.DataFrame(data=predictions.flatten())
            test_labels = self.map_str_to_number(test_labels.copy())
            for i in range(len(df)):
                if (df.values[i] == test_labels.values[i]):
                    correct = correct + 1
        else:
            correct = 0
            df = pd.DataFrame(data=predictions.flatten())
            for i in range(len(df)):
                if 1 - abs(df.values[i] - test_labels.values[i]) / abs(
                        df.values[i]) >= hitmissr:
                    correct = correct + 1
        return float(correct) / len(df)

    def getConfusionMatrix(self, test_labels, predictions, label_headers):
        """Plot one normalized confusion matrix per label column (classifier only)."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'classifier':
            index = 0
            for label_header in label_headers:
                classes = test_labels[label_header].unique()
                # .ix was removed from pandas; use positional .iloc instead.
                df_tmp = self.map_number_to_str(df.iloc[:, index], classes)
                title = 'Normalized confusion matrix for Lasso (' + label_header + ')'
                self.plot_confusion_matrix(test_labels.iloc[:, index], df_tmp,
                                           classes=classes, normalize=True, title=title)
                index = index + 1
        else:
            return 'No Confusion Matrix for Regression'

    def getROC(self, test_labels, predictions, label_headers):
        """Plot a ROC curve (classifier only)."""
        predictions = pd.DataFrame(data=predictions.flatten())
        predictions.columns = test_labels.columns.values
        if self.type == 'classifier':
            test_labels = self.map_str_to_number(test_labels)
            fpr, tpr, _ = roc_curve(test_labels, predictions)
            plt.figure(1)
            plt.plot([0, 1], [0, 1], 'k--')
            plt.plot(fpr, tpr)
            plt.xlabel('False positive rate')
            plt.ylabel('True positive rate')
            plt.title('ROC curve')
            plt.show()
        else:
            # NOTE(review): message says "Confusion Matrix" but this is the ROC
            # helper — kept byte-identical in case callers compare the string.
            return 'No Confusion Matrix for Regression'

    def getRSquare(self, test_labels, predictions, mode='single'):
        """R^2 score (regressor only); mode='multiple' variance-weights outputs."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            if mode == 'multiple':
                errors = r2_score(test_labels, df, multioutput='variance_weighted')
            else:
                errors = r2_score(test_labels, df)
            return errors
        else:
            return 'No RSquare for Classification'

    def getMSE(self, test_labels, predictions):
        """Mean squared error (regressor only)."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = mean_squared_error(test_labels, df)
            return errors
        else:
            return 'No MSE for Classification'

    def getMAPE(self, test_labels, predictions):
        """Mean absolute percentage error (regressor only)."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = np.mean(np.abs(
                (test_labels - df.values) / test_labels)) * 100
            return errors.values[0]
        else:
            return 'No MAPE for Classification'

    def getRMSE(self, test_labels, predictions):
        """Root mean squared error (regressor only)."""
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = sqrt(mean_squared_error(test_labels, df))
            return errors
        else:
            return 'No RMSE for Classification'
model_ridge.fit(train_X, train_y)
print('训练集预测的确定系数R ^ 2: ', model_ridge.score(train_X, train_y))
print('验证集预测的确定系数R ^ 2: ', model_ridge.score(test_X, test_y))
pred_1 = model_ridge.predict(test_X)
print('模型误差: ', mean_squared_error(test_y, pred_1))

# RidgeCV cross-validates over the candidate alphas to select the best one.
model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1.0])
model.fit(train_X, train_y)
print("模型参数:", model.get_params())
print("模型详情:", model)
# alpha_ exists on RidgeCV only, not on plain Ridge.
print('最佳alpha', model.alpha_)
print('训练集预测的确定系数R ^ 2: ', model.score(train_X, train_y))
print('验证集预测的确定系数R ^ 2: ', model.score(test_X, test_y))
pred_2 = model.predict(test_X)
print('Ridge模型误差: ', mean_squared_error(test_y, pred_2))

# Lasso regression — LassoLarsCV selects alpha by cross-validation.
# (The original also created Lasso(alpha=0.01) and LassoCV() instances that
# were immediately overwritten; those dead assignments are removed.)
model_lasso = LassoLarsCV()
model_lasso.fit(train_X, train_y)
print("模型参数:", model_lasso.get_params())
print("模型详情:", model_lasso)
#print('最佳alpha',model_lasso.alpha_)
print('训练集预测的确定系数R ^ 2: ', model_lasso.score(train_X, train_y))
print('验证集预测的确定系数R ^ 2: ', model_lasso.score(test_X, test_y))
pred_3 = model_lasso.predict(test_X)
print('Lasso模型误差: ', mean_squared_error(test_y, pred_3))
def main():
    """CLI entry point: train one of several regressors on the formatted
    dataset (target is log-transformed), then persist parameters, the pickled
    model, and the validation predictions/scores to --output_dir."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default="data/formatted.csv", type=str, required=False,
                        help="the input dataset to be used to train the model")
    parser.add_argument("--output_dir", default="SGDRegressor_5", type=str, required=False,
                        help="the output file for the ")
    parser.add_argument("--model_type", default="SVR", type=str, required=False,
                        help="the kind of model to use "
                             "[Lasso, SGDRegressor, ElasticNet, SVR, LinearRegression]")
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # load data into numpy array
    X_train, y_train, X_val, y_val = load_data(args.data_dir, args)
    # create model
    if args.model_type == "Lasso":
        model = Lasso(alpha=.1, fit_intercept=True, normalize=True,
                      precompute=False, copy_X=True, max_iter=100000,
                      tol=0.000001, warm_start=False, positive=True,
                      random_state=None, selection='cyclic')
    elif args.model_type == "SGDRegressor":
        # BUG FIX: this call contained an unresolved git merge conflict
        # (<<<<<<< HEAD ... >>>>>>>), which is a syntax error. Resolved by
        # keeping the HEAD branch (elasticnet penalty, tol=1e-8, verbose=1);
        # confirm against the intended experiment configuration.
        model = SGDRegressor(loss='squared_epsilon_insensitive',
                             penalty='elasticnet', alpha=0.1, l1_ratio=0.15,
                             fit_intercept=True, max_iter=10000, tol=.00000001,
                             shuffle=True, verbose=1, epsilon=0.1,
                             random_state=None, learning_rate='optimal',
                             eta0=0.001, power_t=0.25, early_stopping=False,
                             validation_fraction=0.1, n_iter_no_change=100,
                             warm_start=False, average=False, n_iter=None)
    elif args.model_type == "ElasticNet":
        model = ElasticNet(alpha=.000001, l1_ratio=0.5, fit_intercept=True,
                           normalize=True, precompute=False, max_iter=10000,
                           copy_X=True, tol=0.0001, warm_start=False,
                           positive=True, random_state=None, selection='cyclic')
    elif args.model_type == "SVR":
        model = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
                    gamma='auto_deprecated', kernel='rbf', max_iter=1000,
                    shrinking=True, tol=0.001, verbose=False)
    elif args.model_type == "LinearRegression":
        model = LinearRegression(fit_intercept=True, normalize=False,
                                 copy_X=True, n_jobs=None)
    # train the model with the X, and y train numpy arrays
    # (targets are fit in log space: log(y + 1))
    model.fit(X_train, np.log(y_train+1))
    # get score with the X, and y dev numpy arrays
    test_score = model.score(X_val, np.log(y_val+1))
    train_score = model.score(X_train, np.log(y_train+1))
    print("train: {}, test: {}".format(train_score, test_score))
    # save parameters
    parameters = model.get_params()
    with open(os.path.join(args.output_dir, "params.json"), "w") as fp:
        json.dump(parameters, fp)
    # save the model weights (context manager: original leaked the handle)
    model_weights_filename = os.path.join(args.output_dir, "trained_model.sav")
    with open(model_weights_filename, 'wb') as fp:
        pickle.dump(model, fp)
    # get outputs
    output = ""
    for prediction, label in zip(run_regressor(X_val, model_weights_filename), y_val):
        output += "{}, {}\n".format(prediction, label)
    # save score and outputs
    with open(os.path.join(args.output_dir, "score.txt"), "w") as fp:
        fp.write("train score: {}, test score:{}".format(train_score, test_score))
        fp.write(output)
def lasso(X, Y, kfold=3, feature_set=None):
    """Tune a Lasso regressor with RandomizedSearchCV then a refined
    GridSearchCV, compare both against an untuned baseline, save comparison
    plots, and return (grid predictor, best estimator).

    Parameters
    ----------
    X, Y : pandas objects; .values is used for indexing
    kfold : number of predefined CV folds
    feature_set : unused; kept for interface compatibility
    """
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)
    # Keep the indices of the last predefined split as train/test.
    for train, test in ps.split():
        train_index = train
        test_index = test
    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid.
    # NOTE(review): linspace starts at 0; Lasso(alpha=0) is degenerate
    # (plain least squares) and may warn — confirm whether 0 is intended.
    alpha = np.linspace(0, 1, 10)
    random_grid = {'alpha': alpha}
    lasso = Lasso(random_state=42)
    # Look at parameters used by our current model
    print('Parameters currently in use:\n')
    pprint(lasso.get_params())
    # Random search of parameters using the predefined folds.
    lasso_random = RandomizedSearchCV(estimator=lasso,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2.split(), verbose=2,
                                      random_state=42, n_jobs=-1)
    lasso_random.fit(train_X, train_y)
    pprint(lasso_random.best_params_)
    cv_result_rd = lasso_random.cv_results_
    BestPara_random = lasso_random.best_params_

    # Grid search refined around the random-search optimum.
    from sklearn.model_selection import GridSearchCV
    # BUG FIX: the original lower bound (best alpha - 0.2) can be negative,
    # which is invalid for Lasso; clamp it to a small positive value.
    lower = max(BestPara_random["alpha"] - 0.2, 1e-6)
    alpha = np.linspace(lower, BestPara_random["alpha"] + 0.2, 10)
    grid_grid = {'alpha': alpha}
    lasso_grid = GridSearchCV(estimator=lasso, param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(), verbose=2, n_jobs=-1)
    lasso_grid.fit(train_X, train_y)
    BestPara_grid = lasso_grid.best_params_
    pprint(lasso_grid.best_params_)
    cv_results_grid = lasso_grid.cv_results_

    # Fit the baseline (untuned) model.
    lasso.fit(train_X, train_y)

    # Predictions from all three models.
    predict_y = lasso_random.predict(test_X)
    predict_y_grid = lasso_grid.predict(test_X)
    predict_y_base = lasso.predict(test_X)
    # (Removed an unused nested RMLSE helper from the original; the errors
    # below are plain MSE, as in the original execution path.)
    errors_Grid_CV = (mean_squared_error(predict_y_grid, test_y))
    errors_Random_CV = (mean_squared_error(predict_y, test_y))
    errors_baseline = (mean_squared_error(predict_y_base, test_y))
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    print('lasso results:', results)

    if True:
        # Error comparison bar chart.
        fig = plt.figure(figsize=(20, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        plt.savefig('lasso_error_compare.png')
        # Prediction vs. ground-truth curve.
        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(predict_y_grid))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, predict_y_grid, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        plt.savefig('lasso_prediction.png')
    return lasso_grid.predict, lasso_grid.best_estimator_
    # Tail of generate_features_and_targets(...): its `def` line lies above
    # this chunk; it returns the feature matrix and target vector as arrays.
    return np.array(features), np.array(targets)


features, targets = generate_features_and_targets(math_student_data)
xTrain, xTest, yTrain, yTest = train_test_split(features, targets)  # random_state=7
#all of the following models were tried and the one that preforms best, the Random Forest decision tree, was left uncommented
#model = KNeighborsRegressor(n_neighbors=2)
#model = LinearRegression()
#model =
#model = MLPRegressor(solver='lbfgs', random_state=2, hidden_layer_sizes=[100, 100])
# NOTE(review): despite the comment above, the model actually left active
# here is Lasso, not a Random Forest — confirm which was intended.
model = Lasso(alpha=.03, max_iter=1000)
print(model.get_params())
# Scale features to [0, 1] before fitting the (scale-sensitive) Lasso.
model = Pipeline([("scaler", MinMaxScaler()), ("model", model)])
''' This code was used to select the best parameters and model.'''
''' param_grid = [{'model__n_estimators': [5, 10, 50, 100, 150, 250, 500]}, {'model__alpha': [.001, .01, .1, 1, 10, 100, 1000]}, {'model__alpha': [.001, .01, .1, 1, 10, 100, 1000], 'model__max_iter': [1000, 5000, 10000, 50000, 100000, 500000]}, {'model__C': [.001, .01, .1, 1, 10, 100, 1000], 'model__gamma': [.001, .01, .1, 1, 10, 100, 1000]}] models = [RandomForestRegressor(n_estimators=500), Ridge(), Lasso(), SVR()] #, random_state=9 for i in range(len(models)): model = Pipeline([("scaler", MinMaxScaler()), ("model", models[i])]) grid_search = GridSearchCV(model, param_grid[i], cv=20, return_train_score=True) model = grid_search.fit(xTrain, yTrain) print(model.best_params_) print(model.score(xTest, yTest)) print('') #print(cross_val_score(model, features, targets).mean()) #cross-val scores were compared between all the model choices to select one '''
#Lasso Regression from sklearn.linear_model import Lasso lasso=Lasso(alpha=0.0007196856730011522) lasso.fit(X_train,y_train) lasso_coef=lasso.coef_ lasso_intercept=lasso.intercept_ names=dataset.drop('MV',axis=1).columns plt.plot(range(len(names)),lasso_coef) plt.ylabel('coefficients') plt.show() lasso.score(X_test,y_test) ##hyper tunung for lasso from sklearn.model_selection import GridSearchCV from sklearn.cross_validation import train_test_split lasso.get_params() c_space=np.logspace(-5,8,15) param_grid={'alpha':c_space} logistic_cv=GridSearchCV(lasso,param_grid,cv=5) logistic_cv.fit(X_train,y_train) logistic_cv.best_params_ logistic_cv.best_score_ #mean absolute error from sklearn import metrics print('MAE:',metrics.mean_absolute_error(y_test,regressor.predict(X_test))) print('MSE:',metrics.mean_squared_error(y_test,regressor.predict(X_test)))
def train_model():
    """Pivot monthly sales per goods_code, engineer summary features
    (mean/median/std/OLS residual plus external time-series features), fit a
    Lasso model, plot diagnostics, and return (result frame with MAPE,
    nonzero lasso coefficients)."""
    start_time = time.time()
    data_inp = data_clean(df)
    pivot = data_inp.pivot(index='goods_code', columns='dis_month', values='sale')
    # Rename pivoted columns to sales_0 .. sales_N.
    col_name = []
    for i in range(len(pivot.columns)):
        col_name.append('sales_' + str(i))
    pivot.columns = col_name
    pivot.fillna(0, inplace=True)
    sub = pivot.reset_index()
    test_features = ['goods_code']
    trian_features = ['goods_code']
    # The two most recent months (sales_1, sales_2) form the test window.
    for i in range(1, 3):
        test_features.append('sales_' + str(i))
    # The earlier months (sales_3 .. sales_22) form the training window.
    for i in range(3, 23):
        trian_features.append('sales_' + str(i))
    sub.fillna(0, inplace=True)
    sub.drop_duplicates(subset=['goods_code'], keep='first', inplace=True)
    # BUG FIX: the original repeated both append loops verbatim here, so every
    # feature name (and therefore every column of X_train/X_test) appeared
    # twice. The duplicate loops are removed.
    X_train = sub[trian_features]
    y_train = sub[['sales_0', 'goods_code']]
    X_test = sub[test_features]
    sales_type = 'sales_'
    # Mean feature
    X_train['mean_sale'] = X_train.apply(
        lambda x: np.mean([x[sales_type+'3'], x[sales_type+'4'], x[sales_type+'5'],
                           x[sales_type+'6'], x[sales_type+'7'], x[sales_type+'8'],
                           x[sales_type+'9'], x[sales_type+'10'], x[sales_type+'11'],
                           x[sales_type+'12'], x[sales_type+'13'], x[sales_type+'14'],
                           x[sales_type+'15'], x[sales_type+'16'], x[sales_type+'17'],
                           x[sales_type+'18'], x[sales_type+'19'], x[sales_type+'20'],
                           x[sales_type+'21'], x[sales_type+'22']]), axis=1)
    X_test['mean_sale'] = X_test.apply(
        lambda x: np.mean([x[sales_type+'1'], x[sales_type+'2']]), axis=1)
    train_mean = X_train['mean_sale']
    test_mean = X_test['mean_sale']
    train_mean = pd.Series(train_mean)
    test_mean = pd.Series(test_mean)
    # Median feature
    X_train['median_sale'] = X_train.apply(
        lambda x: np.median([x[sales_type+'3'], x[sales_type+'4'], x[sales_type+'5'],
                             x[sales_type+'6'], x[sales_type+'7'], x[sales_type+'8'],
                             x[sales_type+'9'], x[sales_type+'10'], x[sales_type+'11'],
                             x[sales_type+'12'], x[sales_type+'13'], x[sales_type+'14'],
                             x[sales_type+'15'], x[sales_type+'16'], x[sales_type+'17'],
                             x[sales_type+'18'], x[sales_type+'19'], x[sales_type+'20'],
                             x[sales_type+'21'], x[sales_type+'22']]), axis=1)
    X_test['median_sale'] = X_test.apply(
        lambda x: np.median([x[sales_type+'1'], x[sales_type+'2']]), axis=1)
    # Standard-deviation feature
    X_train['std_sale'] = X_train.apply(
        lambda x: np.std([x[sales_type+'3'], x[sales_type+'4'], x[sales_type+'5'],
                          x[sales_type+'6'], x[sales_type+'7'], x[sales_type+'8'],
                          x[sales_type+'9'], x[sales_type+'10'], x[sales_type+'11'],
                          x[sales_type+'12'], x[sales_type+'13'], x[sales_type+'14'],
                          x[sales_type+'15'], x[sales_type+'16'], x[sales_type+'17'],
                          x[sales_type+'18'], x[sales_type+'19'], x[sales_type+'20'],
                          x[sales_type+'21'], x[sales_type+'22']]), axis=1)
    X_test['std_sale'] = X_test.apply(
        lambda x: np.std([x[sales_type+'1'], x[sales_type+'2']]), axis=1)
    train_median = X_train['median_sale']
    test_median = X_test['median_sale']
    train_std = X_train['std_sale']
    test_std = X_test['std_sale']
    X_train = sub[trian_features]
    X_test = sub[test_features]
    formas_train = [train_mean, train_median, train_std]
    formas_test = [test_mean, test_median, test_std]
    train_inp = pd.concat(formas_train, axis=1)
    test_inp = pd.concat(formas_test, axis=1)
    # Residual feature from an OLS fit of the target on the summary features.
    lr_Y = y_train['sales_0']
    lr_train_x = train_inp
    re_train = sm.OLS(lr_Y, lr_train_x).fit()
    train_inp['resid'] = re_train.resid
    lr_Y = y_train['sales_0']
    lr_test_x = test_inp
    # NOTE(review): the test residuals regress the TRAINING target on the test
    # features — confirm this is intentional rather than a copy/paste slip.
    re_test = sm.OLS(lr_Y, lr_test_x).fit()
    test_inp['resid'] = re_test.resid
    train_inp = pd.concat([y_train, train_inp], axis=1)
    ts_test_pro, ts_train_pro = split_ts(df)
    ts_train_ = ts_train_pro.reset_index()
    train_inp = pd.merge(train_inp, ts_train_, left_on='goods_code', right_on='id', how='left')
    test_inp = pd.concat([y_train, test_inp], axis=1)
    ts_test_ = ts_test_pro.reset_index()
    test_inp = pd.merge(test_inp, ts_test_, left_on='goods_code', right_on='id', how='left')
    train_inp.drop(['sales_0', 'goods_code'], axis=1, inplace=True)
    test_inp.drop(['sales_0', 'goods_code'], axis=1, inplace=True)
    train_inp.fillna(0, inplace=True)
    train_inp.replace(np.inf, 0, inplace=True)
    test_inp.replace(np.inf, 0, inplace=True)
    test_inp.fillna(0, inplace=True)
    # Lasso: standardize features, then cross-validate over candidate alphas.
    ss = StandardScaler()
    train_inp_s = ss.fit_transform(train_inp)
    test_inp_s = ss.transform(test_inp)
    alpha_ridge = [1e-4, 1e-3, 1e-2, 0.1, 1]
    coeffs = {}
    for alpha in alpha_ridge:
        r = Lasso(alpha=alpha, normalize=True, max_iter=1000000)
        r = r.fit(train_inp_s, y_train['sales_0'])
    grid_search = GridSearchCV(Lasso(alpha=alpha, normalize=True),
                               scoring='neg_mean_squared_error',
                               param_grid={'alpha': alpha_ridge}, cv=5, n_jobs=-1)
    grid_search.fit(train_inp_s, y_train['sales_0'])
    alpha = alpha_ridge
    rmse = list(np.sqrt(-grid_search.cv_results_['mean_test_score']))
    plt.figure(figsize=(6, 5))
    lasso_cv = pd.Series(rmse, index=alpha)
    lasso_cv.plot(title="Validation - LASSO", logx=True)
    plt.xlabel("alpha")
    plt.ylabel("rmse")
    plt.show()
    # NOTE(review): min(alpha) just picks the smallest candidate alpha; the
    # grid search's best_params_ is computed above but ignored — confirm.
    least_lasso = min(alpha)
    lasso = Lasso(alpha=least_lasso, normalize=True)
    model_lasso = lasso.fit(train_inp_s, y_train['sales_0'])
    print("lasso feature.......................")
    lasso_coef = pd.Series(model_lasso.coef_, index=train_inp.columns)
    lasso_coef = lasso_coef[lasso_coef != 0.0000]
    lasso_coef = lasso_coef.astype(float)
    print(".....lasso_coef..............")
    print(lasso_coef.sort_values(ascending=False).head(10))
    print(" R^2,拟合优度")
    matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
    # Show the largest positive and negative coefficients.
    imp_coef = pd.concat([lasso_coef.sort_values().head(5),
                          lasso_coef.sort_values().tail(5)])
    imp_coef.plot(kind="barh")
    plt.title("Coefficients in the Lasso Model")
    print(lasso.score(train_inp_s, y_train['sales_0']))
    print(lasso.get_params())
    print('参数信息')
    # NOTE(review): set_params mutates the already-fitted estimator (only its
    # printed repr is used here) — confirm this side effect is intended.
    print(lasso.set_params(fit_intercept=False))
    lasso_preds = model_lasso.predict(test_inp_s)
    # Scatter plot of predictions vs. ground truth.
    fig, ax = plt.subplots()
    ax.scatter(y_train['sales_0'], lasso_preds)
    ax.plot([y_train['sales_0'].min(), y_train['sales_0'].max()],
            [y_train['sales_0'].min(), y_train['sales_0'].max()], 'k--', lw=4)
    ax.set_xlabel('y_true')
    ax.set_ylabel('Pred')
    plt.show()
    y_pred = pd.DataFrame(lasso_preds, columns=['y_pred'])
    matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
    preds = pd.DataFrame({"preds": y_pred['y_pred'], "true": y_train['sales_0']})
    preds["residuals"] = preds["true"] - preds["preds"]
    print("打印预测值描述.....................")
    preds = preds.astype(float)
    print(preds.head())
    print(preds.describe())
    print(preds.shape)
    preds.plot(x="preds", y="residuals", kind="scatter")
    plt.title("True and residuals")
    plt.show()
    data_out = [y_train['goods_code'], y_train['sales_0'], y_pred]
    result = pd.concat(data_out, axis=1)
    # Compute MAPE per row.
    result['mape'] = abs((result['sales_0'] - result['y_pred']) / result['sales_0'] * 100)
    return result, lasso_coef
############################################################
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('ss', StandardScaler()),
    # Degree-3 polynomial feature expansion for the linear model.
    # interaction_only=True (default False) would drop pure-power terms
    # such as a^2 and b^2 from the expansion.
    ('poly', PolynomialFeatures(degree=3, include_bias=True)),
    # ElasticNetCV cross-validates over the given l1_ratio/alpha grids.
    # BUG FIX: max_iter must be an integer (1e3 is a float, rejected by
    # scikit-learn's parameter validation).
    ('linear', ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.99, 1],
                            alphas=np.logspace(-3, 2, 5),
                            fit_intercept=False, max_iter=1000, cv=3))
])
model.fit(x_train, y_train.ravel())
# Retrieve the fitted ElasticNetCV step by name via named_steps instead of
# get_params('linear'), whose positional argument is the `deep` flag rather
# than a step name.
linear = model.named_steps['linear']
y_pred = model.predict(x_test)
# Coefficient of determination (R^2)
r2 = model.score(x_test, y_test)
# Mean squared error
mse = mean_squared_error(y_test, y_pred)