Example #1
def lasso_cv_rmse(X, y, l_alphas):
    lasso_rmse = []

    # initialize model
    lasso = Lasso()
    # iterate through Lasso alphas
    for la in l_alphas:

        # set the current alpha to the model
        lasso.set_params(alpha=la)

        # keep track of fold RMSE
        lasso_scores = cross_val_score(lasso,
                                       X,
                                       y,
                                       scoring="neg_mean_squared_error")
        lasso_rmse.append(np.sqrt(np.mean(-1 * lasso_scores)))

    # We can compare against a multiple linear regression without regularisation
    linreg = LinearRegression()

    linreg_scores = cross_val_score(linreg,
                                    X,
                                    y,
                                    scoring="neg_mean_squared_error")
    linreg_rmse = np.sqrt(np.mean(-1 * linreg_scores))

    return lasso_rmse, linreg_rmse
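A minimal driver for the snippet above, assuming the imports it relies on; make_regression stands in for real data and the alpha grid is only illustrative:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import cross_val_score

# synthetic data in place of a real design matrix
X, y = make_regression(n_samples=200, n_features=20, noise=10.0, random_state=0)
lasso_rmse, linreg_rmse = lasso_cv_rmse(X, y, l_alphas=np.logspace(-3, 1, 20))
print(min(lasso_rmse), linreg_rmse)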
Example #2
    def fit_lasso(self, split_frac=0.5, cv=10):

        # Split into training and test sets
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            self.X, self.y, test_size=split_frac, random_state=1)

        #Lasso CV
        lasso = Lasso(max_iter=10000, normalize=False, fit_intercept=False)
        lassocv = LassoCV(alphas=None,
                          cv=cv,
                          max_iter=10000,
                          normalize=False,
                          fit_intercept=False)
        lassocv.fit(X_train, y_train)

        lasso.set_params(alpha=lassocv.alpha_)
        lasso.fit(X_train, y_train)
        pred_train = np.squeeze(lasso.predict(X_train))
        pred = np.squeeze(lasso.predict(X_test))

        train_corr = np.corrcoef(y_train, pred_train)[0, 1]
        mse = mean_squared_error(y_test, pred)
        test_corr = np.corrcoef(y_test, pred)[0, 1]
        nnz_coef = int(np.sum(np.abs(lasso.coef_) > 1e-10))  # count of non-zero coefficients

        return lassocv.alpha_, train_corr, test_corr, lasso.coef_, nnz_coef, mse
Example #3
        def Lasso_regression(X_train, y_train, X_test, params):
            # Grid search for Lasso; with so few hyperparameters, 'GridSearch' and 'RandomGridSearch' amount to the same thing
            if hyperparameters == 'RandomGridSearch' or hyperparameters == 'GridSearch':
                # Grid search with cross-validation (3 folds)
                alphas = np.arange(1, 800, 50)
                param_grid = {'alpha': alphas}
                # Define the model to be trained
                estimator = Lasso()
                # Fit the model over the given parameter grid
                optimizer = GridSearchCV(estimator, param_grid, cv=3, scoring='neg_mean_absolute_error')
                optimizer.fit(X_train, y_train)
                regression = optimizer.best_estimator_
                predicted = regression.predict(X_test)
                validation_score = optimizer.best_score_
            elif hyperparameters == 'Custom':
                estimator = Lasso()
                # Set the requested parameters
                estimator.set_params(**params)

                # Cross-validation check
                fold = KFold(n_splits=3, shuffle=True)
                validation_score = cross_val_score(estimator=estimator, X=X_train, y=y_train, cv=fold, scoring='neg_mean_absolute_error')

                # Finally, fit the model on all the data
                estimator.fit(X_train, np.ravel(y_train))
                predicted = estimator.predict(X_test)
            return predicted, validation_score
Example #4
def est_coefs(i):
    #coefs=[]
    cross_val_par = 5
    lasso = Lasso(fit_intercept=False, max_iter=10000, normalize=True)
    lassocv = LassoCV(alphas=None,
                      fit_intercept=False,
                      cv=cross_val_par,
                      max_iter=100000,
                      normalize=True)

    y = Y[:, i]
    y = y.reshape(-1, 1)

    X_train = lagged_ret[train_index, :]
    X_test = lagged_ret[test_index, :]
    y_train = y[train_index, :].ravel()

    y_test = y[test_index, :]
    lassocv.fit(X_train, y_train)
    # print("\n")
    # print("Optimal cv parameter: ", lassocv.alpha_)

    lasso.set_params(alpha=lassocv.alpha_)
    lasso.fit(X_train, y_train.ravel())

    #coefs.append([col_names[i], lasso.coef_])
    coefs = lasso.coef_
    # coefs = lasso.coef_.reshape(1, -1)

    # print("MSE: ", mean_squared_error(y_test, lasso.predict(X_test)))
    # print("Lasso coefs: ", coefs)

    return coefs
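Examples like this one pass the `normalize` argument, which scikit-learn deprecated in 1.0 and removed in 1.2. On current versions the usual substitute is a scaling step in a pipeline; a rough sketch (the selected alpha is not directly comparable, since `normalize` scaled columns by their l2 norm rather than their standard deviation):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Approximate modern replacement for LassoCV(..., normalize=True)
model = make_pipeline(StandardScaler(), LassoCV(cv=5, max_iter=100000))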
Example #5
    def get_estimator(self):
        estimator = self.kwargs.get("estimator", self.ESTIMATOR)
        if estimator == "Lasso":
            model = Lasso()
        elif estimator == "Ridge":
            model = Ridge()
        elif estimator == "Linear":
            model = LinearRegression()
        elif estimator == "GBM":
            model = GradientBoostingRegressor()
        elif estimator == "RandomForest":
            model = RandomForestRegressor()
            self.model_params = {  # 'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 10)],
                'max_features': ['auto', 'sqrt'],
                'n_estimators': range(60, 220, 40)
            }
            # 'max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)]}

        else:
            model = Lasso()
        estimator_params = self.kwargs.get("estimator_params", {})
        self.mlflow_log_param("estimator", estimator)
        model.set_params(**estimator_params)
        print(colored(model.__class__.__name__, "red"))
        return model
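The dict-splat into set_params is the key move in this dispatcher; a small standalone illustration of the same pattern with a Ridge model:

from sklearn.linear_model import Ridge

model = Ridge()
estimator_params = {"alpha": 0.5, "fit_intercept": False}
model.set_params(**estimator_params)  # accepts any keyword the constructor accepts
print(model.get_params()["alpha"])    # 0.5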
Example #6
 def _compare_with_lasso(self,
                         lasso_X,
                         lasso_y,
                         wlasso_X,
                         wlasso_y,
                         sample_weight,
                         alpha_range=[0.01],
                         params={}):
     for alpha in alpha_range:
         lasso = Lasso(alpha=alpha)
         lasso.set_params(**params)
         lasso.fit(lasso_X, lasso_y)
         wlasso = WeightedLasso(alpha=alpha)
         wlasso.set_params(**params)
         wlasso.fit(wlasso_X, wlasso_y, sample_weight=sample_weight)
         # Check that the weighted and unweighted results agree
         if np.ndim(lasso_y) > 1:
             for i in range(lasso_y.shape[1]):
                 np.testing.assert_allclose(lasso.coef_[i], wlasso.coef_[i])
                 if lasso.get_params()["fit_intercept"]:
                     self.assertAlmostEqual(lasso.intercept_[i],
                                            wlasso.intercept_[i])
         else:
             np.testing.assert_allclose(lasso.coef_, wlasso.coef_)
             self.assertAlmostEqual(lasso.intercept_, wlasso.intercept_)
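A quick sanity check of the same property outside the test class, assuming scikit-learn >= 0.23 (where Lasso.fit accepts sample_weight): with uniform weights the weighted fit should match the plain one.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso

X, y = make_regression(n_samples=100, n_features=10, random_state=0)
plain = Lasso(alpha=0.1).fit(X, y)
# uniform weights should leave the solution unchanged
weighted = Lasso(alpha=0.1).fit(X, y, sample_weight=np.ones(len(y)))
np.testing.assert_allclose(plain.coef_, weighted.coef_)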
Example #7
def vendor_lasso(df):
    '''
    Create a subset of the full dataset ('new_train') and run lasso
    regression on the data with a one-hot-encoded vendor column.
    '''
    new_train = df[[
        'Order.Month', 'Order.Day', 'Order.Year', 'Vendor Number', 'Pack',
        'Bottle Volume (L)', 'Volume Sold (Liters)', 'Profit'
    ]].copy()

    sale_data = new_train.loc[:, new_train.columns != 'Profit']
    sale_target = new_train['Profit']

    X_train, X_test, y_train, y_test = train_test_split(sale_data,
                                                        sale_target,
                                                        test_size=0.2,
                                                        random_state=3)

    new_train_cols = X_train[[
        'Order.Day', 'Order.Year', 'Order.Month', 'Bottle Volume (L)', 'Pack',
        'Volume Sold (Liters)'
    ]]

    new_train_nomcols = X_train[['Vendor Number']]

    enc = OneHotEncoder(drop='first', sparse=False)

    encodecols = enc.fit_transform(new_train_nomcols)
    feature_names = enc.get_feature_names(['Vendor Number'])

    X_train = pd.concat([
        new_train_cols.reset_index(drop=True),
        pd.DataFrame(encodecols,
                     columns=feature_names).astype(int).reset_index(drop=True)
    ],
                        axis=1)
    '''lasso = Lasso()
	lasso.set_params(alpha=1, normalize=True)
	lasso.fit(X_train, y_train)
	print('The intercept is %.4f' %(lasso.intercept_))
	lassoCoef = pd.Series(lasso.coef_, index=X_train.columns).sort_values(ascending = False)
	print('The slopes are \n%s' %(lassoCoef))'''

    lasso = Lasso()
    coefs = []

    alphaRange = np.linspace(1e-3, 20, 20)
    for alpha in alphaRange:
        lasso.set_params(alpha=alpha, normalize=True)
        lasso.fit(X_train, y_train)
        coefs.append(lasso.coef_)

    coefs = pd.DataFrame(np.array(coefs), columns=X_train.columns)

    for name in coefs.columns:
        plt.plot(alphaRange, coefs[name])
    plt.xlabel('Alpha')
    plt.ylabel("Coefficients")
    plt.title('Change of Lasso Slopes Varying Alpha')
Example #8
def trylasso5times(array_alphas, cv=5, max_iter=1e6, normalize=True):
    # Use the function's own arguments; X_train and y_train come from the enclosing scope
    lassocv = LassoCV(alphas=array_alphas, cv=cv, max_iter=int(max_iter), normalize=normalize)
    lassocv.fit(X_train, y_train)
    lasso = Lasso()
    lasso.set_params(alpha=lassocv.alpha_)
    lasso.fit(X_train, y_train)
    return lasso, lassocv
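A hedged call site for the helper above; X_train and y_train must already exist in the enclosing scope, as the function expects:

import numpy as np

lasso, lassocv = trylasso5times(np.logspace(-4, 1, 50), cv=5)
print("chosen alpha:", lassocv.alpha_)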
Example #9
def LinearModelLasso(X_train, y_train, X_test, y_test):

    alphas = 10**np.linspace(10, -2, 100) * 0.5
    lasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=100000, cv=10)
    lasso_cv.fit(X_train, y_train)

    print("Value of lasso tuning parameter", lasso_cv.alpha_)
    lasso = Lasso()

    lasso.set_params(alpha=lasso_cv.alpha_)
    lasso.fit(X_train, y_train)

    print_evaluation_metrics(lasso, "Lasso Model", X_train, y_train, X_test,
                             y_test)

    print("\n")
    print("--- Feature Selection -----")

    coef = pd.Series(lasso.coef_, index=X_train.columns)

    print("Lasso picked " + str(sum(coef != 0)) +
          " variables and eliminated the other " + str(sum(coef == 0)) +
          " variables")

    for e in sorted(list(zip(list(X_train), lasso.coef_)),
                    key=lambda e: -abs(e[1])):
        if e[1] != 0:
            print("\t{}, {:.3f}".format(e[0], e[1]))
Example #10
def lasso(x, y):
    lassocv = LassoCV(alphas=None, cv=10, max_iter=10000, normalize=True)
    lassocv.fit(x, y)
    lassoreg = Lasso(alpha=.00005, normalize=True, max_iter=1e5)
    lassoreg.set_params(alpha=lassocv.alpha_)
    lassoreg.fit(x, y)
    r2 = round(lassoreg.score(x, y), 2)
    return r2, lassoreg.coef_
Example #11
def lasso_coefs(X, Y, alphas):
    coefs = []
    lasso_reg = Lasso()
    for a in alphas:
        lasso_reg.set_params(alpha=a)
        lasso_reg.fit(X, Y)
        coefs.append(lasso_reg.coef_)
    return coefs
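The returned list lines up with the alphas, which makes plotting a regularization path straightforward; a sketch assuming matplotlib and some X, Y already loaded:

import numpy as np
import matplotlib.pyplot as plt

alphas = np.logspace(-3, 1, 50)
paths = np.array(lasso_coefs(X, Y, alphas))  # shape: (n_alphas, n_features)
plt.plot(alphas, paths)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('coefficient value')
plt.show()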
Example #12
def DoLasso(alphas, currentCountry, otherCountries):
    lasso = Lasso(normalize=True)  # alpha is set inside the loop below
    coefs = []
    for a in alphas:
        lasso.set_params(alpha=a)
        lasso.fit(otherCountries.transpose(), currentCountry)
        coefs.append(lasso.coef_)
    return coefs
Example #14
        def lasso_regression(X_train,
                             y_train,
                             X_test,
                             params,
                             use_cv: bool = True):
            # If there are not enough points for cross validation
            if use_cv is False:
                if params is None:
                    model = Lasso()
                else:
                    model = Lasso(**params)
                model.fit(X_train, y_train)
                predicted = model.predict(X_test)

                # Calculate score on train
                train_predicted = model.predict(X_train)
                validation_score = mean_absolute_error(
                    np.ravel(y_train), np.ravel(train_predicted))
                return predicted, validation_score

            # Grid search for Lasso; with so few hyperparameters, 'GridSearch' and 'RandomGridSearch' amount to the same thing
            if hyperparameters == 'RandomGridSearch' or hyperparameters == 'GridSearch':
                # We search the grid with cross-validation (the number of folds is 3)
                alphas = np.arange(1, 800, 50)
                param_grid = {'alpha': alphas}
                # Setting the model to train
                estimator = Lasso()
                # We train the model with the specified parameter options (we search the grid)
                optimizer = GridSearchCV(estimator,
                                         param_grid,
                                         cv=3,
                                         scoring='neg_mean_absolute_error')
                optimizer.fit(X_train, y_train)
                regression = optimizer.best_estimator_
                predicted = regression.predict(X_test)
                validation_score = optimizer.best_score_
            elif hyperparameters == 'Custom':
                estimator = Lasso()
                # Setting the necessary parameters
                estimator.set_params(**params)

                # Cross-validation check
                fold = KFold(n_splits=3, shuffle=True)
                validation_score = cross_val_score(
                    estimator=estimator,
                    X=X_train,
                    y=y_train,
                    cv=fold,
                    scoring='neg_mean_absolute_error')

                # Training the model already on all data
                estimator.fit(X_train, np.ravel(y_train))
                predicted = estimator.predict(X_test)
            return predicted, validation_score
Example #15
class LassoModel(Model):
    def create_model(self):
        self.lasso = Lasso()

    def fit(self, train_x, train_y):
        self.lasso_pip = make_pipeline(RobustScaler(), self.lasso)
        self.lasso_pip.fit(train_x, train_y)

    def set_config(self, config):
        self.lasso.set_params(**config)

    def predict(self, test_x):
        return self.lasso_pip.predict(test_x)
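A hedged usage sketch, assuming the Model base class needs no constructor arguments and that train/test arrays are already prepared:

model = LassoModel()
model.create_model()              # builds the underlying Lasso
model.set_config({'alpha': 0.1})  # forwarded to Lasso.set_params
model.fit(train_x, train_y)       # wraps the model in a RobustScaler pipeline
predictions = model.predict(test_x)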
Example #16
def lasso_prediction(x, y):

    # normalize expects a boolean, not the string "True"
    lasso = Lasso(normalize=True, tol=0.01, max_iter=500000)
    w, q = x.shape
    
    coefs = []
    preds = []
    
    i = 0
    
    while i < q:
        
        start = timer() 
        x_i = x
        print("\n------------------------------------\n")
        name ="Fitting for country no.: %s" %i
        print(name)
        
        alpha = 100
        lasso.set_params(alpha = alpha)
        population_vector = x.iloc[:, i]
        x_i = x.drop([i], axis = 1)
        y_i = y.drop([i], axis = 1)
        lasso.fit(x_i, population_vector)
        
        while np.count_nonzero(lasso.coef_) > 5:
            alpha = alpha + 500
            lasso.set_params(alpha = alpha)
            population_vector = x.iloc[:, i]
            x_i = x.drop([i], axis = 1)
            lasso.fit(x_i, population_vector)
                
        prediction = lasso.predict(y_i)
        coefs.append(lasso.coef_)
        preds.append(prediction)
        
        end_timer = timer() - start
        
        print("Alpha value: " + str(alpha))
        time_statement = "Fitting time for country no. %s: " %i
        print(time_statement + str(round(end_timer, 4)) + " seconds.")
        print("Number of countries used for fitting: " + str(np.count_nonzero(lasso.coef_)))
        

        i = i + 1
        
    np.savetxt("population_parameters.csv", coefs, delimiter=",")
    np.savetxt("population_prediction.csv", preds, delimiter=",") 
    
    
    return coefs
Example #17
def lasso_prediction(x, y):

    lasso = Lasso(normalize=True, tol=0.001, max_iter=5000000)  # normalize expects a boolean
    w, q = x.shape

    coefs = []
    preds = []

    i = 0

    alphas = 10**(np.linspace(0.01, 8, 1000))

    while i < q:

        x_i = x
        print("\n------------------------------------\n")
        name = "Fitting for country no.: %s" % (i + 1)
        print(name)

        lasso.set_params(alpha=1000000000)
        population_vector = x.iloc[:, i]
        y_true = y.iloc[:, i]
        lasso.fit(x_i, population_vector)
        prediction = lasso.predict(y)
        error = mse(y_true, prediction)
        # fall back to the high-alpha baseline if no candidate alpha improves on it
        coefs_best, pred_best = lasso.coef_, prediction
        alpha_value, country_number = 1000000000, np.count_nonzero(lasso.coef_)

        for a in alphas:

            lasso.set_params(alpha=a)
            population_vector = x.iloc[:, i]
            y_true = y.iloc[:, i]
            lasso.fit(x, population_vector)
            prediction = lasso.predict(y)
            mean_error = mse(y_true, prediction)

            if (mean_error < error) and 0 < np.count_nonzero(lasso.coef_) <= 5:
                coefs_best = lasso.coef_
                pred_best = prediction
                error = mean_error
                alpha_value = a
                country_number = np.count_nonzero(lasso.coef_)

        coefs.append(coefs_best)
        preds.append(pred_best)
        print("Aplha value: " + str(alpha_value))
        print("Number of countries used for fitting: " + str(country_number))
        i = i + 1

    return coefs, preds
Example #18
def Lasso_Reg(Phi_train, Y_train, Phi_test, Y_test, alphas):
    reg = Lasso()
    coefs = []
    train_MSE = []
    test_MSE = []
    for a in alphas:
        reg.set_params(alpha=a)
        reg.fit(Phi_train, Y_train)
        coefs.append(reg.coef_)
        train_pred = (reg.predict(Phi_train))
        train_MSE.append(mean_squared_error(Y_train, train_pred))
        test_pred = (reg.predict(Phi_test))
        test_MSE.append(mean_squared_error(Y_test, test_pred))

    return {'coefs': coefs, 'train_MSE': train_MSE, 'test_MSE': test_MSE}
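Picking the alpha that minimizes held-out MSE from the returned dict is then a one-liner; a sketch assuming the same inputs:

import numpy as np

results = Lasso_Reg(Phi_train, Y_train, Phi_test, Y_test, alphas)
best = int(np.argmin(results['test_MSE']))
print('best alpha:', alphas[best], 'test MSE:', results['test_MSE'][best])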
Example #19
def lasso_regress(X, y, a=None, b=None):
    """

        This function fits a LASSO regression to the input data.

    Parameters
    ----------
        X: an array or array-like predictors.
           It should be scaled by StandardScaler.
        y: an array or array-like target.
           It should have dimensions compatible with input X.
        a, b: an array or array-like, optional.
           another set of data, such as a = X_test, b = y_test.

    Returns
    -------
        coefs_LASSO: list.
                     a list of coefficients from LASSO with different lambdas
        lambdas_LASSO: list.
                       a list of lambdas used in this LASSO
        error1_LASSO: list.
                      a list of MSE of prediction from first input set (X, y)
        error2_LASSO: list.
                      a list of MSE of prediction from second input set (a, b).
                      Return as None if a and b are not defined.
        modelLASSO: the Lasso() model object used for the fits
    """

    # LASSO coefficients vs lambda
    coefs_LASSO = []
    error1_LASSO = []
    error2_LASSO = []
    # Tuning parameter (lambda)
    lambdas_LASSO = np.logspace(-4, 8, 200)
    modelLASSO = Lasso(max_iter=1e5)

    # loop over lambda values to determine the best by mse
    for l in lambdas_LASSO:
        modelLASSO.set_params(alpha=l)
        modelLASSO.fit(X, y)
        coefs_LASSO.append(modelLASSO.coef_)
        error1_LASSO.append(mean_squared_error(y, modelLASSO.predict(X)))
        if a is not None and b is not None:
            error2_LASSO.append(mean_squared_error(b, modelLASSO.predict(a)))
        else:
            error2_LASSO = None
    return coefs_LASSO, lambdas_LASSO, error1_LASSO, error2_LASSO, modelLASSO
Example #20
 def get_estimator(self):
     estimator = self.kwargs.get("estimator", self.ESTIMATOR)
     if estimator == "Lasso":
         model = Lasso()
     elif estimator == "Ridge":
         model = Ridge()
     elif estimator == "Linear":
         model = LinearRegression()
     elif estimator == "GBM":
         model = GradientBoostingRegressor()
     else:
         model = Lasso()
     estimator_params = self.kwargs.get("estimator_params", {})
     model.set_params(**estimator_params)
     print(model.__class__.__name__)
     return model
Example #21
    def binarySearch(self, X, y, select_num = 50):
        model = Lasso()
        betaM = np.zeros([X.shape[1]])
        #max(abs(crossprod(xx, yy / sqrt(sum(yy ^ 2)) / sqrt(n))))
        iteration = 0
        min_lambda = 1e-15
        max_lambda = 1e15

        minFactor = 0.9
        maxFactor = 1.1

        stuckCount = 1
        previousC = -1

        patience = 20

        while min_lambda < max_lambda and iteration < 50:
            iteration += 1
            lmbd = np.exp((np.log(min_lambda) + np.log(max_lambda)) / 2.0)

            # print "\t\tIter:{}\tlambda:{}".format(iteration, lmbd),
            model.set_params(alpha=lmbd)
            model.fit(X, y)
            beta = model.coef_

            c = len(np.where(np.abs(beta) > 0)[0])  # we choose regularizers based on the number of non-zeros it reports
            # print "# Chosen:{}".format(c)
            if c < select_num * minFactor:  # Regularizer too strong
                max_lambda = lmbd
                betaM = beta
            elif c > select_num * maxFactor:  # Regularizer too weak
                min_lambda = lmbd
                betaM = beta
            else:
                betaM = beta
                break
            if c == previousC:
                stuckCount += 1
            else:
                previousC = c
                stuckCount = 1
            if stuckCount > patience:
                # print 'Run out of patience'
                break

        return betaM
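The midpoint step above takes the geometric mean of the current bounds, which keeps the bisection uniform on a log scale; a one-line check of that identity:

import numpy as np

lo, hi = 1e-15, 1e15
mid = np.exp((np.log(lo) + np.log(hi)) / 2.0)
print(mid, np.sqrt(lo * hi))  # both 1.0: the geometric mean of the bounds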
Example #22
def build_models(x, y):
    """
    Build a Ridge regression model
    """
    # Alpha values uniformly
    # spaced between 0.01 and 0.02
    alpha_range = np.linspace(0, 0.5, 200)
    model = Lasso(normalize=True)
    coeffiecients = []
    # Fit a model for each alpha value
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        # Track the coeffiecients for plot
        coeffiecients.append(model.coef_)
    # Plot coeffients weight decay vs alpha value
    # Plot model RMSE vs alpha value
    coeff_path(alpha_range, coeffiecients)
Example #24
def Lasso_model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    real_train_tar = np.expm1(train_linear_tar)
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea,
                                                        train_linear_tar,
                                                        test_size=0.2,
                                                        random_state=0)
    real_train_tar = np.expm1(train_linear_tar)
    """
        . Lasso model
    """

    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)

    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso = pd.Series(lassocv.coef_,
                           index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    test_prediction_lasso = np.expm1(lasso.predict(test_linear))
    write_pkl(
        lassocv_alpha,
        '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl'
    )
    return test_prediction_lasso
Example #25
class LassoRegressor(LinearRegressor):
    """
  Linear Regression via Lasso Regression (OLS Regression w/ L1 penalty)
  -------------------------------------------------------------------------------
  This class is a wrapper for scikit-learn's "sklearn.linear_model.Lasso" class.
  """
    def __init__(self, std=True, max_iter=10000):
        self.regressor_type = "Lasso"
        self.sklearn_Lasso = Lasso(fit_intercept=std,
                                   normalize=std,
                                   max_iter=max_iter)
        super().__init__(std)
        self.std = False  # let scikit-learn handle preprocessing

    # regress X on Y
    def compute_coeff(self):
        self.sklearn_Lasso.set_params(alpha=self.alpha)
        self.sklearn_Lasso.fit(self.X, self.Y)
        self.coeff = self.sklearn_Lasso.coef_
Example #26
def lasso_reg(alphas, x, y):

    # normalize expects a boolean, not the string "True"
    lasso = Lasso(normalize=True, tol=0.001, max_iter=50000)

    w, q = x.shape
    f = 1

    error = []

    for a in alphas:

        coefs = []
        preds = []

        lasso.set_params(alpha=a)
        i = 0

        while i < q:

            population_vector = x.iloc[:, i]
            train1 = x.drop([i], axis=1)
            lasso.fit(train1, population_vector)
            test1 = y.drop([i], axis=1)
            prediction = lasso.predict(test1)
            coefs.append(lasso.coef_)
            preds.append(prediction)
            i = i + 1

        name_coef = "coef_no_%s.csv" % f
        name_preds = "pred_no_%s.csv" % f

        np.savetxt(name_coef, coefs, delimiter=",")
        np.savetxt(name_preds, preds, delimiter=",")

        y_true = y.values

        result = mse(y_true, np.transpose(preds))
        error.append(result)

        f = f + 1

    return lasso, preds, error
Example #27
def fit(x, y):

    a = 50000000

    coefs = []
    model = Lasso(tol=0.02, max_iter=500000)
    x = x.T
    w, q = x.shape

    i = 0

    while i < q:
        t = x.iloc[:, i]
        x_i = x.drop(x.columns[i], axis=1)  # drop the target column, keep the rest
        model.set_params(alpha=a)
        model.fit(x_i, t)
        coefs.append(model.coef_)
        i = i + 1

    return coefs
Example #28
def best_parameter(x, y, method, alphas):
    Xm = x.to_numpy()  # as_matrix() was removed in pandas 1.0
    ym = y.to_numpy()

    if method == "lasso":
        model = Lasso(fit_intercept=True)
    elif method == "ridge":
        model = Ridge(fit_intercept=True)

    # KFold from sklearn.model_selection replaces the removed cross_validation module;
    # materialize the folds so they can be reused for every alpha
    k_fold = list(KFold(n_splits=10).split(Xm))
    best_cv_mse = float("inf")

    for a in alphas:
        model.set_params(alpha=a)
        mse_list_k10 = [
                    MSE(model.fit(Xm[train], ym[train]).predict(Xm[val]), ym[val])
                    for train, val in k_fold]
        if np.mean(mse_list_k10) < best_cv_mse:
            best_cv_mse = np.mean(mse_list_k10)
            best_alpha = a
            print(method, "BEST PARAMETER=%f, MSE(CV)=%f" % (best_alpha, best_cv_mse))
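The same search can be written with GridSearchCV, which handles the fold bookkeeping; a sketch under the same inputs:

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(Lasso(fit_intercept=True),
                      {'alpha': alphas},
                      cv=10,
                      scoring='neg_mean_squared_error')
search.fit(Xm, ym)
print(search.best_params_, -search.best_score_)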
Example #29
def get_lasso(X, Y):
    #running lasso with alpha 0 (MLR)
    lasso = Lasso()
    lasso.set_params(alpha=0, normalize=True)
    lasso.fit(X, Y)

    #Grid search
    alphas_lasso = np.linspace(0, 10, 50)
    tuned_parameters_r = [{'alpha': alphas_lasso}]
    n_folds = 5
    cv = KFold(n_splits=n_folds, shuffle=True)

    tune_lasso = GridSearchCV(lasso,
                              tuned_parameters_r,
                              cv=cv,
                              refit=True,
                              return_train_score=True,
                              scoring='neg_mean_squared_error')
    tune_lasso.fit(X, Y)

    print(tune_lasso.best_params_)
    print(np.max(tune_lasso.cv_results_['mean_test_score']))
    print(np.min(tune_lasso.cv_results_['mean_test_score']))

    lasso_best = tune_lasso.best_estimator_
    lasso_best.fit(X, Y)
    print(lasso_best.score(X, Y))

    suffix = str(datetime.datetime.now())
    model_filename = 'lasso' + suffix + '.sav'
    pickle.dump(lasso_best, open(model_filename, 'wb'))
    csv_filename = 'lasso' + suffix + '.csv'

    raw_test, test_IDs = load_test()
    predict = lasso_best.predict(raw_test)
    predict = np.exp(predict)
    predict = pd.DataFrame(predict)
    predict = pd.concat([test_IDs, predict], axis=1)
    predict.columns = ['Id', 'SalePrice']
    predict.to_csv(csv_filename, index=False)
Example #31
    def lr(self):
        self.alphas = np.logspace(self.model_params_obj.lasso_params_obj.minAlpha, 
            self.model_params_obj.lasso_params_obj.maxAlpha,
            num=self.model_params_obj.lasso_params_obj.lasso_n_alphas,base=np.e)
        print('alphas', self.alphas)
        lasso = Lasso(max_iter=self.model_params_obj.lasso_params_obj.lasso_max_iter, 
            normalize=False, fit_intercept=True)
        #initialize scores for each alpha
        self.scores = np.empty_like(self.alphas)
        self.weights = []
        for i,a in enumerate(self.alphas):
            lasso.set_params(alpha=a)
            lasso.fit(self.X_train, self.y_train)
            self.weights.append(lasso.coef_)
            #add scores for each alpha
            self.scores[i] = lasso.score(self.X_train, self.y_train)
        print('r-2',self.scores)
        tuned_parameters = [{'alpha': self.alphas}]
        n_folds = 6
        gridsearch_lasso_cv = GridSearchCV(lasso, tuned_parameters, 
            scoring=['neg_mean_squared_error', 'explained_variance', 'r2'], 
            cv=n_folds, refit='neg_mean_squared_error')

        gridsearch_lasso_cv.fit(self.X_train, self.y_train)

        #plot_lasso_permutation_importance(gridsearch_lasso_cv, _X_train, _y_train)

        self.rmse["lr"] = gridsearch_lasso_cv.cv_results_['mean_test_neg_mean_squared_error'].tolist()
        self.r2["lr"] = gridsearch_lasso_cv.cv_results_['mean_test_r2'].tolist()
        self.expvarscore["lr"] = gridsearch_lasso_cv.cv_results_['mean_test_explained_variance'].tolist()
        self.plotsobj.weights_vs_alphas(self.weights, self.alphas, self.features)
        
        self.compute_results(gridsearch_lasso_cv, "Lasso Regression", 'lr')
        plot_lasso_permutation_importance([gridsearch_lasso_cv, self.X_train, self.y_train], self.plotsobj, self.features)
        return {'results': [self.plotsobj.results['lr_train'], self.plotsobj.results['lr_test']], \
                 'rmse':self.rmse['lr'], 'r2':self.r2['lr'], 'expvar': self.expvarscore['lr']}
Example #32
File: utils.py Project: jdnc/ml-project
 def set_params(self, C=None):
     # The enclosing class presumably mixes in Lasso/Ridge, so the unbound
     # set_params is called on self, mapping an SVM-style C onto the alpha penalty.
     if self.l1:
         Lasso.set_params(self, alpha=C)
     else:
         Ridge.set_params(self, alpha=1.0/(2.0*C))
     return self
Example #33
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))    
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
df = df.drop("train", axis=1)
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled["lpsa"] = df["lpsa"]
X = df_scaled.iloc[:, :-1]  # .ix was removed in pandas 1.0
N = X.shape[0]
X.insert(X.shape[1], "intercept", np.ones(N))
y = df_scaled["lpsa"]
names_regressors = ["Lcavol", "Lweight", "Age", "Lbph", "Svi", "Lcp", "Gleason", "Pgg45"]


X = X.drop("intercept", axis=1)
Xtrain = X[istrain]
ytrain = y[istrain]
names_regressors = ["Lcavol", "Lweight", "Age", "Lbph", "Svi", "Lcp", "Gleason", "Pgg45"]


alphas_ = np.logspace(1, -2, base=10)
Xm = Xtrain.to_numpy()
ym = ytrain.to_numpy()
k_fold = list(KFold(n_splits=10).split(Xm))  # materialized so the folds can be reused per alpha
best_cv_mse = float("inf")
model = Lasso(fit_intercept=True)
for a in alphas_:
    model.set_params(alpha=a)
    mse_list_k10 = [MSE(model.fit(Xm[train], ym[train]).predict(Xm[val]), ym[val]) for train, val in k_fold]
    if np.mean(mse_list_k10) < best_cv_mse:
        best_cv_mse = np.mean(mse_list_k10)
        best_alpha = a
        print "LASSO BEST PARAMETER=%f, MSE(CV)=%f" % (best_alpha, best_cv_mse)
Example #35
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) #reverse axis
plt.xlabel('Alpha')
plt.ylabel('Weights')
plt.title('Regularization Path RIDGE')
plt.axis('tight')
plt.legend(loc=2)
plt.show()

# Part B

clf = Lasso(fit_intercept=True)
alphas_2 = np.logspace(1,-2,base=10)
coefs = []
for a in alphas_2:
    clf.set_params(alpha=a)
    clf.fit(Xtrain, ytrain)
    coefs.append(clf.coef_)
ax = plt.gca()

for y_arr, label in zip(np.squeeze(coefs).T, names_regressors):
    plt.plot(alphas_2, y_arr, label=label)
plt.legend()
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) #reverse axis
plt.xlabel('Alpha')
plt.ylabel('Weights')
plt.title('Regularization Path LASSO')
plt.axis('tight')
plt.legend(loc=2)
plt.show()