def _compare_with_lasso(self,
                         lasso_X,
                         lasso_y,
                         wlasso_X,
                         wlasso_y,
                         sample_weight,
                         alpha_range=[0.01],
                         params={}):
     for alpha in alpha_range:
         lasso = Lasso(alpha=alpha)
         lasso.set_params(**params)
         lasso.fit(lasso_X, lasso_y)
         wlasso = WeightedLasso(alpha=alpha)
         wlasso.set_params(**params)
         wlasso.fit(wlasso_X, wlasso_y, sample_weight=sample_weight)
         # Check that the plain Lasso and WeightedLasso fits agree closely
         if np.ndim(lasso_y) > 1:
             for i in range(lasso_y.shape[1]):
                 np.testing.assert_allclose(lasso.coef_[i], wlasso.coef_[i])
                 if lasso.get_params()["fit_intercept"]:
                     self.assertAlmostEqual(lasso.intercept_[i],
                                            wlasso.intercept_[i])
         else:
             np.testing.assert_allclose(lasso.coef_, wlasso.coef_)
             self.assertAlmostEqual(lasso.intercept_, wlasso.intercept_)
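A quick way to sanity-check the weighting semantics the test above relies on: in recent scikit-learn (1.0+), Lasso.fit itself accepts sample_weight, and integer weights should behave like row duplication. A minimal sketch, not part of the original test suite:

# Hedged sketch: integer sample weights vs. duplicated rows (assumes
# scikit-learn >= 1.0, where Lasso.fit accepts sample_weight with
# repetition semantics).
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1.0, 0.0, -2.0]) + 0.1 * rng.randn(50)
w = rng.randint(1, 4, size=50)  # integer per-row weights

weighted = Lasso(alpha=0.1, tol=1e-10).fit(X, y, sample_weight=w)
repeated = Lasso(alpha=0.1, tol=1e-10).fit(np.repeat(X, w, axis=0), np.repeat(y, w))
np.testing.assert_allclose(weighted.coef_, repeated.coef_, rtol=1e-5, atol=1e-8)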
Example #2
def lasso_model(xy):
    # Lasso model
    x = xy[:, 0].reshape(-1, 1)
    y = xy[:, 1]
    model = Lasso()
    alpha_can = np.logspace(-3, 2, 30)  # alphas must be non-negative for Lasso
    model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    model.fit(x, y)
    print(model.best_params_)
    pred_y = model.predict(x)
    params = model.best_estimator_.get_params()  # params of the tuned Lasso, not the search object
    print(params)
    lasso_r2 = sm.r2_score(y, pred_y)
    lasso_absolute = sm.mean_absolute_error(y, pred_y)
    lasso_squared = sm.mean_squared_error(y, pred_y)
    lasso_median = sm.median_absolute_error(y, pred_y)
    drawing_lasso(xy, x, pred_y, model.best_params_)
    return {
        'lasso_score': {
            'lasso_r2': round(lasso_r2, 5),
            'lasso_absolute': round(lasso_absolute, 5),
            'lasso_squared': round(lasso_squared, 5),
            'lasso_median': round(lasso_median, 5)
        }
    }
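One pitfall in the example above is worth spelling out: get_params() on a GridSearchCV returns the search object's own settings, not the tuned Lasso's. A small sketch on synthetic data:

# Sketch (synthetic data, not from the original example): where the tuned
# parameters actually live after a grid search.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=100, n_features=5, noise=1.0, random_state=0)
search = GridSearchCV(Lasso(), {'alpha': np.logspace(-3, 1, 10)}, cv=5)
search.fit(X, y)
print(search.get_params()['estimator'])              # the unfitted template Lasso
print(search.best_estimator_.get_params()['alpha'])  # the alpha the search selected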
Example #3
 def generate_regression_team(self):
     l = Lasso()
     training_set = self.load_training_set()
     train, test, encoder = self.prepare_team_data(training_set, .999)
     l.fit(train.X, train.y)
     with open("resources/team_predict.json", 'w+') as f:
         json.dump(l.get_params(), f)
     with open('resources/encoder.json', 'w+') as f:
         json.dump(encoder.get_params(), f)
Example #4
def main():
    np.set_printoptions(suppress=True)
    narac_file_path = "../../tigress/arburton/plink_data/narac_rf"
    csv_data = []
    for chunk in pd.read_csv(narac_file_path,
                             delim_whitespace=True,
                             index_col=0,
                             chunksize=20000):
        csv_data.append(chunk)
    samples = pd.concat(csv_data, axis=0)
    del csv_data
    # TODO: pull out affection column as y
    affection = samples["Affection"]
    samples = samples.drop([
        "Affection", "Sex", "DRB1_1", "DRB1_2", "SENum", "SEStatus", "AntiCCP",
        "RFUW"
    ],
                           axis=1)
    samples = pd.get_dummies(samples, columns=samples.columns[samples.columns != "ID"])
    sample_train, sample_test, affection_train, affection_test = train_test_split(
        samples, affection, test_size=0.8)
    # TODO: potentially make sample weights percentage of non ?? SNPs

    # RANDOM FOREST CLASSIFIER

    rf = RandomForestClassifier(n_estimators=5000, max_features=40, n_jobs=2)
    rf.fit(sample_train, affection_train)
    print("Random forest accuracy: {}".format(
        rf.score(sample_test, affection_test)))
    print("Random forest feature importances:")
    print(rf.feature_importances_)
    print("Random forest parameters:")
    print(rf.get_params())

    # LASSO (a regressor: score() below reports R^2, not classification accuracy)
    lasso = Lasso()
    lasso.fit(sample_train, affection_train)
    print("LASSO R^2 score: {}".format(lasso.score(sample_test,
                                                  affection_test)))
    print("LASSO parameters:")
    print(lasso.get_params())

    # LOG REGRESSION
    log_reg = LogisticRegression(n_jobs=2)
    log_reg.fit(sample_train, affection_train)
    print("Log regression accuracy: {}".format(
        log_reg.score(sample_test, affection_test)))
    print("Log regression parameters:")
    print(log_reg.get_params())

    # NEURAL NETS
    mlp_classifier = MLPClassifier()
    mlp_classifier.fit(sample_train, affection_train)
    print("MLP Classifier accuracy: {}".format(
        mlp_classifier.score(sample_test, affection_test)))
    print("MLP Classifier parameters:")
    print(mlp_classifier.get_params())
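Since Lasso is a regressor, its score() above is R^2 rather than classification accuracy; getting an accuracy-like number out of it requires thresholding the predictions. A self-contained sketch on synthetic data (nothing here comes from the NARAC script):

# Hedged sketch: R^2 vs. thresholded accuracy for a Lasso on a binary target.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
reg = Lasso(alpha=0.01).fit(X_tr, y_tr)
print("R^2:", reg.score(X_te, y_te))
print("accuracy:", ((reg.predict(X_te) >= 0.5).astype(int) == y_te).mean())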
Example #5
def lassoDict(currentX, currentY, eps, lam, currentColumns, colWorth):
    irrelevant = []
    model = Lasso(alpha=lam, fit_intercept=True)
    model.fit(currentX, currentY)
    params = model.get_params()  # snapshot of the fitted model's configuration
    print(model.coef_.sum())
    for i in range(model.coef_.shape[0]):
        colWorth[currentColumns[i]] += np.abs(model.coef_[i])
        if np.abs(model.coef_[i]) < eps:
            irrelevant.append(currentColumns[i])
    return irrelevant
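A hypothetical call showing how lassoDict's inputs fit together (assuming the function above and its imports, Lasso and numpy, are in scope): colWorth accumulates |coef| per column, and the returned list names columns whose coefficient fell below eps.

# Hedged usage sketch for lassoDict (synthetic data, made-up column names).
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=4, n_informative=2, random_state=0)
cols = ['a', 'b', 'c', 'd']
worth = {c: 0.0 for c in cols}
to_drop = lassoDict(X, y, eps=1e-3, lam=0.5, currentColumns=cols, colWorth=worth)
print(to_drop)   # columns Lasso zeroed out (below eps)
print(worth)     # accumulated |coef| per column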
Example #6
    def test_parameters(self):
        """ Testing parameters of Model class. """
#1.)
        #create an instance of the PLS model via the Model class and another
        #   directly via scikit-learn, then check that both expose the same parameter names
        pls_parameters = {"n_components": 20, "scale": False, "max_iter": 200}
        model = Model(algorithm="PlsRegression", parameters=pls_parameters)
        pls_model = PLSRegression(n_components=20, scale=False, max_iter=200)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(pls_model.get_params()))
#2.)
        rf_parameters = {"n_estimators": 200, "max_depth": 50,"min_samples_split": 10}
        model = Model(algorithm="RandomForest", parameters=rf_parameters)
        rf_model = RandomForestRegressor(n_estimators=200, max_depth=50, min_samples_split=10)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(rf_model.get_params()))
#3.)
        knn_parameters = {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"}
        model = Model(algorithm="KNN", parameters=knn_parameters)
        knn_model = KNeighborsRegressor(n_neighbors=10, weights='distance', algorithm="ball_tree")

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(knn_model.get_params()))
#4.)
        svr_parameters = {"kernel": "poly", "degree": 5, "coef0": 1}
        model = Model(algorithm="SVR",parameters=svr_parameters)
        svr_model = SVR(kernel='poly', degree=5, coef0=1)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(svr_model.get_params()))
#5.)
        ada_parameters = {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"}
        model = Model(algorithm="AdaBoost", parameters=ada_parameters)
        ada_model = AdaBoostRegressor(n_estimators=150, learning_rate=1.2, loss="square")

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(ada_model.get_params()))
#6.)
        bagging_parameters = {"n_estimators": 50, "max_samples": 1.5, "max_features": 2}
        model = Model(algorithm="Bagging", parameters=bagging_parameters)
        bagging_model = BaggingRegressor(n_estimators=50, max_samples=1.5, max_features=2)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(bagging_model.get_params()))
#7.)
        lasso_parameters = {"alpha": 1.5, "max_iter": 500, "tol": 0.004}
        model = Model(algorithm="lasso", parameters=lasso_parameters)
        lasso_model = Lasso(alpha=1.5, max_iter=500, tol=0.004)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(lasso_model.get_params()))
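The assertions above compare only parameter names, which is why mismatched values would not fail the test: get_params() keys are fixed by the estimator class. A one-line sketch:

# Sketch: parameter names depend on the class, not on the values passed.
from sklearn.linear_model import Lasso
assert set(Lasso(alpha=0.1).get_params()) == set(Lasso(alpha=10, tol=1e-2).get_params())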
Example #7
class Lasso(Model):

    # X represents the features, Y represents the labels
    X = None
    Y = None
    prediction = None
    model = None

    def __init__(self,
                 X=None,
                 Y=None,
                 label_headers=None,
                 alpha=1,
                 type='regressor',
                 cfg=False):

        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        self.type = type
        self.cfg = cfg

        self.mapping_dict = None
        self.label_headers = label_headers

        self.model = LassoRegression(alpha=alpha)

    def fit(self, X=None, Y=None):
        if X is not None:
            self.X = X

        if Y is not None:
            self.Y = Y

        if self.type == 'classifier':
            self.Y = self.map_str_to_number(self.Y)

        print('Lasso Train started............')
        self.model.fit(self.X, self.Y)
        print('Lasso completed..........')

        return self.model

    def predict(self, test_features):
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        if self.type == 'classifier':
            self.predictions = self.predictions.round()
        print('Prediction completed..........')
        return self.predictions

    def save(self):
        if self.cfg:
            with open('lasso_configs.txt', 'w') as f:
                f.write(json.dumps(self.model.get_params()))
        else:
            print('No models will be saved for lasso')

    def featureImportance(self):
        return self.model.coef_

    def map_str_to_number(self, Y):
        mapping_flag = False
        if self.mapping_dict is not None:
            for label_header in self.label_headers:
                Y[label_header] = Y[label_header].map(self.mapping_dict)
            return Y

        mapping_dict = None
        for label_header in self.label_headers:
            check_list = pd.Series(Y[label_header])
            for item in check_list:
                if type(item) == str:
                    mapping_flag = True
                    break
            if mapping_flag:
                classes = Y[label_header].unique()
                mapping_dict = {}
                index = 0
                for c in classes:
                    mapping_dict[c] = index
                    index += 1

                Y[label_header] = Y[label_header].map(mapping_dict)
                mapping_flag = False

        self.mapping_dict = mapping_dict
        return Y

    def map_number_to_str(self, Y, classes):
        Y = Y.round()
        Y = Y.astype(int)
        if self.mapping_dict is not None:
            mapping_dict = self.mapping_dict
        else:
            mapping_dict = {}
            index = 0
            for c in classes:
                mapping_dict[index] = c
                index += 1

        inv_map = {v: k for k, v in mapping_dict.items()}
        return Y.map(inv_map)

    def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8):
        if self.type == 'classifier':
            correct = 0
            df = pd.DataFrame(data=predictions.flatten())
            test_labels = self.map_str_to_number(test_labels.copy())
            for i in range(len(df)):
                if (df.values[i] == test_labels.values[i]):
                    correct = correct + 1
        else:
            correct = 0
            df = pd.DataFrame(data=predictions.flatten())
            for i in range(len(df)):
                if 1 - abs(df.values[i] - test_labels.values[i]) / abs(
                        df.values[i]) >= hitmissr:
                    correct = correct + 1
        return float(correct) / len(df)

    def getConfusionMatrix(self, test_labels, predictions, label_headers):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'classifier':
            index = 0
            for label_header in label_headers:
                classes = test_labels[label_header].unique()
                df_tmp = self.map_number_to_str(df.iloc[:, index], classes)
                title = 'Normalized confusion matrix for Lasso (' + label_header + ')'
                self.plot_confusion_matrix(test_labels.iloc[:, index],
                                           df_tmp,
                                           classes=classes,
                                           normalize=True,
                                           title=title)
                index = index + 1
        else:
            return 'No Confusion Matrix for Regression'

    def getROC(self, test_labels, predictions, label_headers):
        predictions = pd.DataFrame(data=predictions.flatten())
        predictions.columns = test_labels.columns.values
        if self.type == 'classifier':
            test_labels = self.map_str_to_number(test_labels)
            fpr, tpr, _ = roc_curve(test_labels, predictions)
            plt.figure(1)
            plt.plot([0, 1], [0, 1], 'k--')
            plt.plot(fpr, tpr)
            plt.xlabel('False positive rate')
            plt.ylabel('True positive rate')
            plt.title('ROC curve')
            plt.show()
        else:
            return 'No ROC for Regression'

    def getRSquare(self, test_labels, predictions, mode='single'):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            if mode == 'multiple':
                errors = r2_score(test_labels,
                                  df,
                                  multioutput='variance_weighted')
            else:
                errors = r2_score(test_labels, df)
            return errors
        else:
            return 'No RSquare for Classification'

    def getMSE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = mean_squared_error(test_labels, df)
            return errors
        else:
            return 'No MSE for Classification'

    def getMAPE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = np.mean(np.abs(
                (test_labels - df.values) / test_labels)) * 100
            return errors.values[0]
        else:
            return 'No MAPE for Classification'

    def getRMSE(self, test_labels, predictions):
        df = pd.DataFrame(data=predictions.flatten())
        if self.type == 'regressor':
            errors = sqrt(mean_squared_error(test_labels, df))
            return errors
        else:
            return 'No RMSE for Classification'
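The save() method above relies on get_params() being JSON-serializable for plain linear models; the same dict can be fed back through set_params() to rebuild an identical configuration. A minimal round-trip sketch, not part of the class:

# Hedged sketch: round-tripping a Lasso configuration through JSON.
import json
from sklearn.linear_model import Lasso

params = Lasso(alpha=0.5, max_iter=2000).get_params()
restored = Lasso().set_params(**json.loads(json.dumps(params)))
assert restored.alpha == 0.5 and restored.max_iter == 2000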
Example #8
model_ridge.fit(train_X, train_y)
print('Training-set R^2: ', model_ridge.score(train_X, train_y))
print('Validation-set R^2: ', model_ridge.score(test_X, test_y))
pred_1 = model_ridge.predict(test_X)
print('Model MSE: ', mean_squared_error(test_y, pred_1))

# RidgeCV takes several candidate alphas and selects the best one via cross-validation
model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1.0])
model.fit(train_X, train_y)
print("Model parameters:", model.get_params())
print("Model details:", model)
print('Best alpha:', model.alpha_)  # Ridge() has no alpha_ attribute; only RidgeCV does
print('Training-set R^2: ', model.score(train_X, train_y))
print('Validation-set R^2: ', model.score(test_X, test_y))

pred_2 = model.predict(test_X)
print('Ridge model MSE: ', mean_squared_error(test_y, pred_2))

# Lasso regression: each assignment below replaces the previous model,
# so only LassoLarsCV() is actually fitted
model_lasso = Lasso(alpha=0.01)
model_lasso = LassoCV()
model_lasso = LassoLarsCV()
model_lasso.fit(train_X, train_y)
print("Model parameters:", model_lasso.get_params())
print("Model details:", model_lasso)
#print('Best alpha', model_lasso.alpha_)
print('Training-set R^2: ', model_lasso.score(train_X, train_y))
print('Validation-set R^2: ', model_lasso.score(test_X, test_y))

pred_3 = model_lasso.predict(test_X)
print('Lasso model MSE: ', mean_squared_error(test_y, pred_3))
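The commented-out line above hints at a real difference: plain Lasso has no alpha_ attribute, while LassoCV and LassoLarsCV expose the alpha they selected after fitting. A short sketch on synthetic data:

# Sketch (synthetic data): LassoCV exposes the cross-validated alpha_.
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV

X, y = make_regression(n_samples=200, n_features=10, noise=5.0, random_state=0)
cv_model = LassoCV(cv=5).fit(X, y)
print(cv_model.alpha_)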
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir",
                        default="data/formatted.csv",
                        type=str,
                        required=False,
                        help="the input dataset to be used to train the model")
    parser.add_argument("--output_dir",
                        default="SGDRegressor_5",
                        type=str,
                        required=False,
                        help="the output file for the ")
    parser.add_argument("--model_type",
                        default="SVR",
                        type=str,
                        required=False,
                        help="the kind of model to use "
                        "[Lasso, SGDRegressor, ElasticNet, SVR, LinearRegression]")

    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # load data into numpy array
    X_train, y_train, X_val, y_val = load_data(args.data_dir, args)

    # create model
    if args.model_type == "Lasso":
        # adjust the alpha value as needed
        model = Lasso(alpha=.1, fit_intercept=True, normalize=True,
            precompute=False, copy_X=True, max_iter=100000, tol=0.000001,
            warm_start=False, positive=True, random_state=None,
            selection='cyclic')
    elif args.model_type == "SGDRegressor":
        model = SGDRegressor(loss='squared_epsilon_insensitive',
            penalty='elasticnet', alpha=0.1,
            l1_ratio=0.15, fit_intercept=True, max_iter=10000, tol=.00000001,
            shuffle=True, verbose=1, epsilon=0.1, random_state=None,
            learning_rate='optimal', eta0=0.001, power_t=0.25,
            early_stopping=False, validation_fraction=0.1,
            n_iter_no_change=100, warm_start=False, average=False,
            n_iter=None)
    elif args.model_type == "ElasticNet":
        model = ElasticNet(alpha=.000001, l1_ratio=0.5, fit_intercept=True,
            normalize=True, precompute=False, max_iter=10000, copy_X=True,
            tol=0.0001, warm_start=False, positive=True, random_state=None,
            selection='cyclic')
    elif args.model_type == "SVR":
        model = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=1000, shrinking=True,
    tol=0.001, verbose=False)
    elif args.model_type == "LinearRegression":
        model = LinearRegression(fit_intercept=True, normalize=False,
            copy_X=True, n_jobs=None)

    # train the model with the X, and y train numpy arrays
    model.fit(X_train, np.log(y_train+1))

    # get score with the X, and y dev numpy arrays
    test_score = model.score(X_val, np.log(y_val+1))
    train_score = model.score(X_train, np.log(y_train+1))
    print("train: {}, test: {}".format(train_score, test_score))

    # save_parameters
    parameters = model.get_params()
    with open(os.path.join(args.output_dir, "params.json"), "w") as fp:
        json.dump(parameters, fp)

    # save the model weights
    model_weights_filename = os.path.join(args.output_dir, "trained_model.sav")
    pickle.dump(model, open(model_weights_filename, 'wb'))

    # get outputs
    output = str()
    for prediction, label in zip(run_regressor(X_val, model_weights_filename), y_val):
        output+="{}, {}\n".format(prediction, label)

    # save score outputs
    with open(os.path.join(args.output_dir, "score.txt"), "w") as fp:
        fp.write("train score: {}, test score:{}".format(train_score, test_score))
        fp.write(output)
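Because the script fits on np.log(y + 1), raw model.predict outputs live on the log scale; mapping them back (a step this script does not perform before writing the output file) would use np.expm1. A tiny sanity sketch:

# Sketch: log1p and expm1 invert each other, so np.expm1(prediction) recovers
# the original target scale from a model trained on np.log(y + 1).
import numpy as np
y = np.array([0.0, 9.0, 99.0])
assert np.allclose(np.expm1(np.log1p(y)), y)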
Example #10
def lasso(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(0.01, 1, 10)  # strictly positive alphas; alpha=0 reduces Lasso to OLS
    random_grid = {'alpha': alpha}

    lasso = Lasso(random_state=42)

    # Look at parameters used by the current model
    print('Parameters currently in use:\n')
    pprint(lasso.get_params())

    # Use the random grid to search for good hyperparameters:
    # randomized search over the alpha grid, using the predefined split
    # for cross-validation and all available cores
    lasso_random = RandomizedSearchCV(estimator=lasso,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2.split(),
                                      verbose=2,
                                      random_state=42,
                                      n_jobs=-1)

    # Fit the random search model
    lasso_random.fit(train_X, train_y)
    pprint(lasso_random.best_params_)

    cv_result_rd = lasso_random.cv_results_

    BestPara_random = lasso_random.best_params_

    ## Grid search of parameters, refined around the random-search result
    from sklearn.model_selection import GridSearchCV

    # Refine the alpha grid around the best value from the random search,
    # keeping alphas strictly positive
    alpha = np.linspace(max(BestPara_random["alpha"] - 0.2, 1e-4),
                        BestPara_random["alpha"] + 0.2, 10)

    # Create the refined grid
    grid_grid = {'alpha': alpha}

    lasso_grid = GridSearchCV(estimator=lasso,
                              param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(),
                              verbose=2,
                              n_jobs=-1)
    # Fit the grid search model
    lasso_grid.fit(train_X, train_y)
    BestPara_grid = lasso_grid.best_params_

    pprint(lasso_grid.best_params_)
    cv_results_grid = lasso_grid.cv_results_

    # Fit the base line search model
    lasso.fit(train_X, train_y)

    #prediction
    predict_y = lasso_random.predict(test_X)
    predict_y_grid = lasso_grid.predict(test_X)
    predict_y_base = lasso.predict(test_X)

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(
            predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(
            mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = (mean_squared_error(predict_y_grid,
                                         test_y))  #,squared = False))
    errors_Random_CV = (mean_squared_error(predict_y,
                                           test_y))  #,squared = False))
    errors_baseline = (mean_squared_error(predict_y_base,
                                          test_y))  #,squared = False))
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]

    print('lasso results:', results)

    if True:

        fig = plt.figure(figsize=(20, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        #plt.show()
        plt.savefig('lasso_error_compare.png')

        #feature importance
        #num_feature = len(lasso.best_estimator_.feature_importances_)
        #plt.figure(figsize=(24,6))
        #plt.bar(range(0,num_feature*4,4),lasso.best_estimator_.feature_importances_)
        #label_name = X.keys()
        #plt.xticks(range(0,num_feature*4,4), label_name)
        #plt.title("Feature Importances"+",kfold="+str(kfold))
        #plt.show()
        #plt.savefig('lasso_feature_importance.png')

        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(predict_y_grid))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, predict_y_grid, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        #plt.show()
        plt.savefig('lasso_prediction.png')

    return lasso_grid.predict, lasso_grid.best_estimator_
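The PredefinedSplit trick above deserves a standalone illustration: entries of -1 in the test_fold array keep rows in training for every split, and any other value assigns the row to that numbered test fold.

# Sketch of PredefinedSplit semantics (toy array, independent of index_splitter).
import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([-1, -1, -1, 0, 0])  # last two rows form the only test fold
for train_idx, test_idx in PredefinedSplit(test_fold).split():
    print(train_idx, test_idx)  # -> [0 1 2] [3 4]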
Example #11
    return np.array(features), np.array(targets)


features, targets = generate_features_and_targets(math_student_data)

xTrain, xTest, yTrain, yTest = train_test_split(features,
                                                targets)  # random_state=7

#All of the following models were tried; the best performer was left uncommented
#model = KNeighborsRegressor(n_neighbors=2)
#model = LinearRegression()
#model = MLPRegressor(solver='lbfgs', random_state=2, hidden_layer_sizes=[100, 100])
model = Lasso(alpha=.03, max_iter=1000)
print(model.get_params())
model = Pipeline([("scaler", MinMaxScaler()), ("model", model)])
''' This code was used to select the best parameters and model.'''
'''
param_grid = [{'model__n_estimators': [5, 10, 50, 100, 150, 250, 500]}, {'model__alpha': [.001, .01, .1, 1, 10, 100, 1000]}, {'model__alpha': [.001, .01, .1, 1, 10, 100, 1000], 'model__max_iter': [1000, 5000, 10000, 50000, 100000, 500000]}, {'model__C': [.001, .01, .1, 1, 10, 100, 1000], 'model__gamma': [.001, .01, .1, 1, 10, 100, 1000]}]
models = [RandomForestRegressor(n_estimators=500), Ridge(), Lasso(), SVR()] #, random_state=9
for i in range(len(models)):
    model = Pipeline([("scaler", MinMaxScaler()), ("model",  models[i])])
    grid_search = GridSearchCV(model, param_grid[i], cv=20, return_train_score=True)
    model = grid_search.fit(xTrain, yTrain)
    print(model.best_params_)
    print(model.score(xTest, yTest))
    print('')

#print(cross_val_score(model, features, targets).mean()) #cross-val scores were compared between all the model choices to select one
'''
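The commented-out grid above uses keys like 'model__alpha' because parameters of a step inside a Pipeline are addressed as <step name>__<param>. A quick check:

# Sketch: pipeline step parameters are exposed with a 'step__param' prefix.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso

pipe = Pipeline([("scaler", MinMaxScaler()), ("model", Lasso())])
print('model__alpha' in pipe.get_params())  # True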
Example #12
#Lasso Regression
from sklearn.linear_model import Lasso
lasso=Lasso(alpha=0.0007196856730011522)
lasso.fit(X_train,y_train)
lasso_coef=lasso.coef_
lasso_intercept=lasso.intercept_
names=dataset.drop('MV',axis=1).columns 
plt.plot(range(len(names)),lasso_coef)
plt.ylabel('coefficients')
plt.show()
lasso.score(X_test,y_test)

## hyperparameter tuning for lasso
from sklearn.model_selection import GridSearchCV, train_test_split
lasso.get_params()
c_space=np.logspace(-5,8,15)
param_grid={'alpha':c_space}
lasso_cv=GridSearchCV(lasso,param_grid,cv=5)
lasso_cv.fit(X_train,y_train)
lasso_cv.best_params_
lasso_cv.best_score_

#mean absolute and squared error
from sklearn import metrics
print('MAE:',metrics.mean_absolute_error(y_test,regressor.predict(X_test)))
print('MSE:',metrics.mean_squared_error(y_test,regressor.predict(X_test)))
def train_model():
    start_time=time.time()
    data_inp=data_clean(df)
    pivot = data_inp.pivot(index='goods_code', columns='dis_month', values='sale')
    # rename the pivoted columns
    col_name=[]
    for i in range(len(pivot.columns)):
        col_name.append('sales_'+str(i))
    pivot.columns=col_name
    pivot.fillna(0, inplace=True)
    sub=pivot.reset_index()
    test_features = ['goods_code']
    train_features = ['goods_code']
    # the most recent two months (sales_1, sales_2) form the test set
    for i in range(1, 3):
        test_features.append('sales_' + str(i))
    # the earlier months (sales_3 .. sales_22) form the training set
    for i in range(3, 23):
        train_features.append('sales_' + str(i))

    sub.fillna(0, inplace=True)
    sub.drop_duplicates(subset=['goods_code'], keep='first', inplace=True)
    X_train = sub[train_features]
    y_train = sub[['sales_0', 'goods_code']]
    X_test = sub[test_features]
    sales_type = 'sales_'
    
    # mean feature
    X_train['mean_sale'] = X_train.apply(
        lambda x: np.mean([x[sales_type+'3'], x[sales_type+'4'],x[sales_type+'5'], 
                              x[sales_type+'6'], x[sales_type+'7'],x[sales_type+'8'], x[sales_type+'9'], 
                           x[sales_type+'10'], x[sales_type+'11'],x[sales_type+'12'],x[sales_type+'13'], 
                              x[sales_type+'14'],
                           x[sales_type+'15'], x[sales_type+'16'], x[sales_type+'17'],x[sales_type+'18'],
                           x[sales_type+'19'], x[sales_type+'20'], x[sales_type+'21'], x[sales_type+'22']]), axis=1)
    
    X_test['mean_sale'] = X_test.apply(
        lambda x: np.mean([x[sales_type+'1'], x[sales_type+'2']]), axis=1)
    train_mean=X_train['mean_sale']
    test_mean=X_test['mean_sale']
    train_mean=pd.Series(train_mean)
    test_mean=pd.Series(test_mean)
    
     # median feature
    X_train['median_sale'] = X_train.apply(
        lambda x: np.median([ x[sales_type+'3'], x[sales_type+'4'],
                      x[sales_type+'5'], x[sales_type+'6'], x[sales_type+'7'],x[sales_type+'8'], 
                             x[sales_type+'9'], x[sales_type+'10'], x[sales_type+'11'],x[sales_type+'12'],
                             x[sales_type+'13'], x[sales_type+'14'],x[sales_type+'15'], x[sales_type+'16'], 
                             x[sales_type+'17'],x[sales_type+'18'], x[sales_type+'19'], x[sales_type+'20'],
                             x[sales_type+'21'], x[sales_type+'22']]), axis=1)
    X_test['median_sale'] = X_test.apply(
        lambda x: np.median([x[sales_type+'1'], x[sales_type+'2']]), axis=1)
    
    # standard deviation feature
    X_train['std_sale'] = X_train.apply(
        lambda x: np.std([ x[sales_type+'3'], x[sales_type+'4'],x[sales_type+'5'], x[sales_type+'6'], 
                          x[sales_type+'7'],x[sales_type+'8'], x[sales_type+'9'], x[sales_type+'10'], 
                          x[sales_type+'11'],x[sales_type+'12'],x[sales_type+'13'], x[sales_type+'14'],
                        x[sales_type+'15'], x[sales_type+'16'], x[sales_type+'17'],x[sales_type+'18'], 
                        x[sales_type+'19'], x[sales_type+'20'], x[sales_type+'21'], x[sales_type+'22']]), axis=1)
    X_test['std_sale'] = X_test.apply(
        lambda x: np.std([x[sales_type+'1'], x[sales_type+'2']]), axis=1)
    
    train_median=X_train['median_sale']
    test_median=X_test['median_sale']

    train_std=X_train['std_sale']
    test_std=X_test['std_sale']

    X_train = sub[train_features]
    X_test = sub[test_features]
    
    formas_train=[train_mean,train_median,train_std]
    formas_test=[test_mean,test_median,test_std]
    train_inp=pd.concat(formas_train,axis=1)
    test_inp=pd.concat(formas_test,axis=1)
    
    # residual feature
    lr_Y=y_train['sales_0']
    lr_train_x=train_inp
    re_train= sm.OLS(lr_Y,lr_train_x).fit()
    train_inp['resid']=re_train.resid
    
    lr_Y=y_train['sales_0']
    lr_test_x=test_inp
    re_test= sm.OLS(lr_Y,lr_test_x).fit()
    test_inp['resid']=re_test.resid
    
    train_inp=pd.concat([y_train,train_inp],axis=1)
    
    ts_test_pro,ts_train_pro=split_ts(df)
    
    ts_train_=ts_train_pro.reset_index()
    train_inp=pd.merge(train_inp,ts_train_,left_on='goods_code',right_on='id',how='left')
    test_inp=pd.concat([y_train,test_inp],axis=1)
    
    ts_test_=ts_test_pro.reset_index()
    test_inp=pd.merge(test_inp,ts_test_,left_on='goods_code',right_on='id',how='left')
    train_inp.drop(['sales_0','goods_code'],axis=1,inplace=True)
    test_inp.drop(['sales_0','goods_code'],axis=1,inplace=True)
    
    train_inp.fillna(0,inplace=True)
    train_inp.replace(np.inf,0,inplace=True)
    test_inp.replace(np.inf,0,inplace=True)
    test_inp.fillna(0,inplace=True)

    #lasso
    ss = StandardScaler()
    train_inp_s= ss.fit_transform(train_inp) 
    test_inp_s= ss.transform(test_inp)
    alpha_ridge = [1e-4,1e-3,1e-2,0.1,1]

    coeffs = {}
    for alpha in alpha_ridge:
        r = Lasso(alpha=alpha, normalize=True, max_iter=1000000)
        r = r.fit(train_inp_s, y_train['sales_0'])

    grid_search = GridSearchCV(Lasso(alpha=alpha, normalize=True), scoring='neg_mean_squared_error',
                           param_grid={'alpha': alpha_ridge}, cv=5, n_jobs=-1)
    grid_search.fit(train_inp_s, y_train['sales_0'])
    
    alpha = alpha_ridge
    rmse = list(np.sqrt(-grid_search.cv_results_['mean_test_score']))
    plt.figure(figsize=(6,5))
    
    lasso_cv = pd.Series(rmse, index = alpha)
    lasso_cv.plot(title = "Validation - LASSO", logx=True)
    plt.xlabel("alpha")
    plt.ylabel("rmse")
    plt.show()
    
    best_alpha = grid_search.best_params_['alpha']
    lasso = Lasso(alpha=best_alpha, normalize=True)
    model_lasso=lasso.fit(train_inp_s,y_train['sales_0'])
    
    print("lasso feature.......................")
    lasso_coef = pd.Series(model_lasso.coef_,index = train_inp.columns)
    lasso_coef=lasso_coef[lasso_coef!=0.0000]
    lasso_coef=lasso_coef.astype(float)
    print(".....lasso_coef..............")

    print(lasso_coef.sort_values(ascending=False).head(10))
    print(" R^2,拟合优度")
    
    matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
    imp_coef = pd.concat([lasso_coef.sort_values().head(5),
                     lasso_coef.sort_values().tail(5)])  # bottom and top 5 coefficients

    imp_coef.plot(kind = "barh")
    plt.title("Coefficients in the Lasso Model")
    
    print(lasso.score(train_inp_s,y_train['sales_0']))
    
    print(lasso.get_params())  
    print('parameter info')
    print(lasso.set_params(fit_intercept=False)) 
    lasso_preds =model_lasso.predict(test_inp_s)
    # scatter plot of predictions vs. true values
    fig, ax = plt.subplots()
    ax.scatter(y_train['sales_0'],lasso_preds)
    ax.plot([y_train['sales_0'].min(), y_train['sales_0'].max()], [y_train['sales_0'].min(), y_train['sales_0'].max()], 'k--', lw=4)
    ax.set_xlabel('y_true')
    ax.set_ylabel('Pred')
    plt.show()
    y_pred=pd.DataFrame(lasso_preds,columns=['y_pred'])
    
    matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
    preds = pd.DataFrame({"preds":y_pred['y_pred'], "true":y_train['sales_0']}) 
    preds["residuals"] = preds["true"] - preds["preds"]
    
    print("打印预测值描述.....................")
    preds=preds.astype(float)
    print(preds.head())
    print(preds.describe())
    print(preds.shape)
    preds.plot(x = "preds", y = "residuals",kind = "scatter")
    plt.title("True and residuals")
    plt.show()
    
    data_out=[y_train['goods_code'],y_train['sales_0'],y_pred]
    result=pd.concat(data_out,axis=1)
    # compute MAPE
    result['mape']=abs((result['sales_0']-result['y_pred'])/result['sales_0']*100)    
    return result,lasso_coef
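One caveat on the MAPE computed at the end: it divides by sales_0 and therefore blows up on items with zero sales. A guarded variant (a hypothetical helper, not in the original function) skips zero denominators:

# Hedged sketch: a MAPE that ignores rows whose true value is zero.
import numpy as np

def safe_mape(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    mask = y_true != 0
    return np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]).mean() * 100

print(safe_mape([10, 0, 20], [12, 1, 18]))  # the zero-sales row is ignored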
############################################################

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('ss', StandardScaler()),
    # degree-3 polynomial feature expansion for the linear model
    ('poly', PolynomialFeatures(degree=3, include_bias=True)),
    # degree controls the polynomial order; interaction_only defaults to False,
    # and setting it to True drops self-interaction terms (no a^2 or b^2)
    ('linear',
     ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.99, 1],
                  alphas=np.logspace(-3, 2, 5),
                  fit_intercept=False,
                  max_iter=1000,
                  cv=3))
])

model.fit(x_train, y_train.ravel())
linear = model.named_steps['linear']  # the fitted ElasticNetCV step
# print('coefficients:', linear.coef_.ravel())
y_pred = model.predict(x_test)
# R^2 score
r2 = model.score(x_test, y_test)
# mean squared error
mse = mean_squared_error(y_test, y_pred)
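For completeness, named_steps (used above) also gives access to fitted attributes such as the cross-validated alpha_. A self-contained sketch on random data:

# Sketch: reaching the fitted ElasticNetCV step of a Pipeline via named_steps.
import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
y = X @ np.array([1.0, -1.0, 0.0, 0.5]) + 0.1 * rng.randn(100)

pipe = Pipeline([('ss', StandardScaler()), ('linear', ElasticNetCV(cv=3))])
pipe.fit(X, y)
print(pipe.named_steps['linear'].alpha_)  # alpha chosen by cross-validation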