Example #1
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from yellowbrick.regressor import AlphaSelection


def testFunc8(savepath='Results/bikeshare_RidgeCV_AlphaSelection.png'):
    '''
    Use AlphaSelection on the bike share data
    '''
    data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    X = data[[
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]]
    Y = data["riders"]
    alphas = np.logspace(-10, 1, 200)
    visualizer = AlphaSelection(RidgeCV(alphas=alphas))
    visualizer.fit(X, Y)
    visualizer.poof(outpath=savepath)
Example #2
import numpy as np
import pandas as pd
from math import sqrt

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from yellowbrick.regressor import AlphaSelection

# Assumes X, y and a train/test split (X_train, y_train, X_test, y_test)
# are defined earlier in the source file

############################### MODELING #####################################
###############################################################################

################################### RIDGE REGRESSION #########################

####### a) looking for the best parameters
# Run it to find the best alpha
# Set a range of alphas
alphas_range = np.arange(1, 200, 5)
# Cross-validate over the alphas
regr_cv = RidgeCV(alphas=alphas_range)
# Visualize alpha selection
visualizer = AlphaSelection(regr_cv)
# Fit the linear regression
visualizer.fit(X, y)
g = visualizer.poof()
visualizer.alpha_  # the best alpha turns out to be 81

####### b) Implement Ridge Regression
ridge = Ridge(alpha=visualizer.alpha_)  # this parameter is chosen by RidgeCV
ridge.fit(X_train, y_train)  # Fit a ridge regression on the training data
coefs_ridge = pd.DataFrame(ridge.coef_.T,
                           index=X.columns)  # Collect coefficients
coefs_ridge = coefs_ridge.rename(columns={0: 'coef_value'})

# TRAIN SET
pred_train = ridge.predict(X_train)  # Use this model to predict the train data
# Calculate RMSE train
print("RMSE for Train:", sqrt(mean_squared_error(y_train, pred_train)))  #RMSE
print("R^2 for Train:", ridge.score(X_train, y_train))  #R2
Example #3
File: lasso.py  Project: mirzask/summer19
# Excerpt: reconstructed head of the scatter call, plotting each coefficient
# colored by its sign (assumes a feature matrix X_poly and a fitted `lasso`,
# as in the LassoLars example later on this page)
plt.scatter(range(X_poly.shape[1]),
            lasso.coef_,
            c=np.sign(lasso.coef_),
            cmap="bwr_r")

######## Yellowbrick

from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoCV

### Find optimal alpha

alphas = np.logspace(-10, 1, 400)

lasso_alpha = LassoCV(alphas=alphas)
lasso_yb = AlphaSelection(lasso_alpha)
lasso_yb.fit(X, y)
lasso_yb.poof()
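
# `lasso` above comes from earlier in the source file; a minimal sketch to
# make this excerpt self-contained, reusing the alpha selected by LassoCV and
# assuming an existing train/test split:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=lasso_alpha.alpha_).fit(X_train, y_train)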

### RVF plot

lasso_yb = ResidualsPlot(lasso, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()

### Prediction Error

lasso_yb = PredictionError(lasso)  # PredictionError takes no hist parameter
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()
Example #4
def main(processed_path="data/processed",
         models_path="models",
         visualizations_path="visualizations"):
    """Creates visualizations."""

    # logging
    logger = logging.getLogger(__name__)
    
    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))
    visualizations_path = os.path.normpath(visualizations_path)
    logger.debug("Path to visualizations normalized: {}"
                 .format(visualizations_path))
    
    #%% load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))
    
    # load models
    mod = pickle.load(open(
            os.path.join(models_path, 'sklearn_ElasticNetCV.pkl'), 'rb'))
    logger.info("Loaded sklearn_ElasticNetCV.pkl.")
    mod_sm = pickle.load(open(
            os.path.join(models_path, 'sm_OLS_fit_regularized.pkl'), 'rb'))
    logger.info("Loaded sm_OLS_fit_regularized.")
    
    #%% split selected_df into dependent and independent variables
    teams_df = selected_df.iloc[:, :9]
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    yX = pd.concat([y, X], axis=1)
    logger.debug("Splitted selected_df to teams_df, y, X and yX.")
    
    #%% start visualization
    
    start = time()
    sns.set_context('paper')
    logger.debug("Set seaborn context to 'paper'.")
    rcParams.update({'figure.autolayout': True})
    logger.debug("Set figure.autoLayout to True.")
    
    #%% correlation coefficient matrix
    
    logger.info("Start visualizing correlation_coefficient_matrix.png.")
    corr = yX.corr()
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)  # np.bool is removed in newer NumPy
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(240, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    fig = sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1,
                      center=0, square=True, linewidths=.5,
                      cbar_kws={"shrink": .5}).get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'correlation_coefficient_matrix.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing correlation_coefficient_matrix.png.")
    
    #%% histograms of transformation
    
    sns.set_style("darkgrid")
    logger.debug("Set seaborn_style to darkgrid.")
    
    logger.info("Start visualizing histograms.")
    # histogram of ranking
    fig = sns.distplot(teams_df.Ranking, rug=True,
                       axlabel='ranking').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_ranking.png'), dpi=300)
    fig.clear()
    plt.close()
    
    # histogram of ranking_log
    fig = sns.distplot(y, rug=True, axlabel='ranking_log').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_ranking_log.png'), dpi=300)
    fig.clear()
    plt.close()
    
    # histogram of loc_max
    fig = sns.distplot(np.e**X.loc_max_log, rug=True,
                       axlabel='loc_max').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_loc_max.png'), dpi=300)
    fig.clear()
    plt.close()
    
    # histogram of loc_max_log
    fig = sns.distplot(X.loc_max_log, rug=True,
                       axlabel='loc_max_log').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                         'histogram_loc_max_log.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing histograms.")
    
    #%% standardize
    
    logger.info("Start standardizing X.")
    scaler = StandardScaler()
    not_standardize = ['core',
                       'visualization',
                       'machine_learning',
                       'deep_learning']
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index = X.index,
                                  columns = X.columns.drop(not_standardize))
    X_not_standardized = X[not_standardize]
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    logger.debug("After Standardization:\n{}".format(X.describe().to_string))
    # update yX
    yX = pd.concat([y, X], axis=1)
    logger.info("Finished standardizing X.")
    
    #%% boxplot
    logger.info("Start visualizing boxplot.png.")
    f, ax = plt.subplots(figsize=(12, 8))
    fig = sns.boxplot(data=yX)
    fig.set_xticklabels(fig.get_xticklabels(), rotation=270)
    fig.get_figure().savefig(os.path.join(visualizations_path,
                                          'boxplot.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing boxplot.png.")
    
    #%% residual plot
    logger.info("Start visualizing residplot.png.")
    f, ax = plt.subplots(figsize=(5, 5))
    fig = sns.residplot(x=mod_sm.fittedvalues, y=y, data=X).get_figure()
    fig.savefig(os.path.join(visualizations_path, 'residplot.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing residplot.png.")

    #%% plot ElasticNetCV results
    
    # need to refit model with fixed l1_ratio (to best l1_ratio)
    # in order to visualize correctly
    mod.set_params(l1_ratio=mod.l1_ratio_)
    logger.info("Fixed l1_ratio to {}".format(mod.l1_ratio_))
    mod.fit(X.values, y.values)
    logger.info("Refitted ElaticNetCV model.")
    
    # print MSE's across folds
    logger.info("Start visualizing ElasticNetCV_MSE_per_fold.png.")
    alphas = mod.alphas_
    fig = plt.figure()
    plt.plot(alphas, mod.mse_path_, ':')
    plt.plot(alphas, mod.mse_path_.mean(axis=-1), 'b',
                   label='Average over the folds')
    plt.axvline(mod.alpha_, linestyle='--', color='k',
                      label="$\\alpha={:0.3f}$".format(mod.alpha_))
    plt.legend()
    plt.xlabel('alpha')
    plt.ylabel('mean squared error')
    plt.title('ElasticNetCV Alpha Error (per CV-fold)')
    plt.axis('tight')
    fig.savefig(os.path.join(visualizations_path,
                             'ElasticNetCV_MSE_per_fold.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing ElasticNetCV_MSE_per_fold.png.")
    
    # plot CV error across alphas with Yellowbrick's AlphaSelection
    logger.info("Start visualizing ElasticNetCV_MSE.png.")
    visualizer = AlphaSelection(mod)
    visualizer.fit(X, y)
    visualizer.poof(outpath=os.path.join(visualizations_path,
                                         'ElasticNetCV_MSE.png'), dpi=300)
    plt.close()
    logger.info("Finished visualizing ElasticNetCV_MSE.png.")
    
    #%% pairplot not performed since too big
    
#    X_used = X.loc[:, mod.coef_ != 0]
#    fig = sns.pairplot(pd.concat([y, X_used], axis=1), kind='reg')
#    fig.savefig(os.path.join(visualizations_path,
#                             'pairplot.png'), dpi=100)
#    fig.clear()
#    plt.close()
        
    #%% logging time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end-start).round(freq='s')
    logger.info("Time needed to create visualizations: {}"
                .format(time_passed))
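
# A minimal invocation sketch (assumes the surrounding module imports logging
# and that the caller wants default log handling):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()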
Example #5
# Excerpt: reconstructed head of the scatter call, plotting each coefficient
# colored by its sign (assumes a feature matrix X_poly and a fitted `ridge`,
# as in the LassoLars example later on this page)
plt.scatter(range(X_poly.shape[1]),
            ridge.coef_,
            c=np.sign(ridge.coef_),
            cmap="bwr_r")

######## Yellowbrick

from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import RidgeCV

### Find optimal alpha

alphas = np.logspace(-10, 1, 400)

ridge_alpha = RidgeCV(alphas=alphas)
ridge_yb = AlphaSelection(ridge_alpha)
ridge_yb.fit(X, y)
ridge_yb.poof()
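
# As in the lasso example above, `ridge` comes from earlier in the source
# file; a minimal sketch, reusing the alpha selected by RidgeCV and assuming
# an existing train/test split:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=ridge_alpha.alpha_).fit(X_train, y_train)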

### RVF plot

ridge_yb = ResidualsPlot(ridge, hist=True)
ridge_yb.fit(X_train, y_train)
ridge_yb.score(X_test, y_test)
ridge_yb.poof()

### Prediction Error

ridge_yb = PredictionError(ridge)  # PredictionError takes no hist parameter
ridge_yb.fit(X_train, y_train)
ridge_yb.score(X_test, y_test)
ridge_yb.poof()
Example #6
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from yellowbrick.regressor import AlphaSelection

# Loaded but not used below; the example switches to the diabetes data
df = pd.read_csv('D:/VS/Pt/ML/Keras/hitters.csv')

diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
#X = diabetes.data
#y = diabetes.target

alphas = np.logspace(-5, -0.5, 30)

# Instantiate the linear model and visualizer
model = LassoCV(alphas=alphas)
visualizer = AlphaSelection(model)

visualizer.fit(X, y)
g = visualizer.poof()
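
# A hedged variant: the KFold import above is otherwise unused here; it could
# supply the CV splitter explicitly, as the Lasso example further down does:
cv = KFold(n_splits=5, shuffle=True, random_state=7)
visualizer = AlphaSelection(LassoCV(alphas=alphas, cv=cv))
visualizer.fit(X, y)
g = visualizer.poof()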

Example #7
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV

from yellowbrick.regressor import AlphaSelection


if __name__ == '__main__':
    # Load the regression data set
    df = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age']
    target_name = 'strength'

    # Get the X and y data from the DataFrame
    X = df[feature_names]
    y = df[target_name]

    # Instantiate the linear model and visualizer
    alphas = np.logspace(-10, 1, 400)
    visualizer = AlphaSelection(LassoCV(alphas=alphas))

    visualizer.fit(X, y)
    g = visualizer.poof(outpath="images/alpha_selection.png")
Example #8
# Excerpt: assumes a DataFrame `data` with columns t, ytest, yhat, error;
# the figure is created here to make the excerpt self-contained
fig, ax = plt.subplots()
plt.plot('t', 'ytest', data=data, color='blue', linewidth=1, label='actual')
plt.plot('t',
         'yhat',
         data=data,
         color='orange',
         marker='o',
         linestyle="None",
         label='predicted',
         alpha=0.5)
plt.plot('t', 'error', data=data, color='gray')
plt.title('Lasso')
plt.legend()
fig.savefig('lasso_total.png')
plt.show()

# Alpha Selection
fig, ax = plt.subplots()
alphas = np.logspace(-2, 1, 250)
cv = KFold(n_splits=5, shuffle=True, random_state=7)
lasso = LassoCV(alphas=alphas,
                n_alphas=250,  # ignored: an explicit alphas grid is supplied
                fit_intercept=True,
                normalize=False,
                cv=cv,
                tol=0.0001,
                n_jobs=-1,
                verbose=1)
visualizer = AlphaSelection(lasso, ax=ax)
visualizer.fit(Xtrain, ytrain)
visualizer.poof(outpath="lasso_alphaselection.png")
Example #9
# Excerpt: `res` is a results matrix computed earlier in the source file
sns.heatmap(res, annot=True, cmap="YlGnBu")

######## Yellowbrick

from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import ElasticNetCV

### Find optimal alpha

alphas = np.logspace(-10, 1, 400)

elastic_alpha = ElasticNetCV(alphas=alphas)
elastic_yb = AlphaSelection(elastic_alpha)
elastic_yb.fit(X, y)
elastic_yb.poof()
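
# The plots below expect a fitted `elastic` estimator defined earlier in the
# source file; a minimal sketch, reusing the parameters selected by
# ElasticNetCV and assuming an existing train/test split:
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(alpha=elastic_alpha.alpha_,
                     l1_ratio=elastic_alpha.l1_ratio_).fit(X_train, y_train)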

### RVF plot

elastic_yb = ResidualsPlot(elastic, hist=True)
elastic_yb.fit(X_train, y_train)
elastic_yb.score(X_test, y_test)
elastic_yb.poof()

### Prediction Error

elastic_yb = PredictionError(elastic)  # PredictionError takes no hist parameter
elastic_yb.fit(X_train, y_train)
elastic_yb.score(X_test, y_test)
elastic_yb.poof()
Example #10
# Excerpt: `grid` is a fitted GridSearchCV from earlier in the source file
lasso_lars = grid.best_estimator_
plt.scatter(range(X_poly.shape[1]),
            lasso_lars.coef_,
            c=np.sign(lasso_lars.coef_),
            cmap="bwr_r")

######## Yellowbrick

from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoLarsCV

### Find optimal alpha

lassolars_yb = AlphaSelection(LassoLarsCV())
lassolars_yb.fit(X, y)
lassolars_yb.poof()

### RVF plot

lasso_yb = ResidualsPlot(lasso_lars, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()

### Prediction Error

lasso_yb = PredictionError(lasso_lars)  # PredictionError takes no hist parameter
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()
Example #11
import numpy as np
import bikeshare
from sklearn.linear_model import RidgeCV
from yellowbrick.regressor import AlphaSelection

alphas = np.logspace(-10, 1, 200)
visualizer = AlphaSelection(RidgeCV(alphas=alphas))
visualizer.fit(bikeshare.X, bikeshare.y)
visualizer.poof()
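
# A minimal sketch of what the assumed `bikeshare` helper module might expose,
# based on the column list in Example #1 (the module itself is not shown):
import pandas as pd
data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
X = data[["season", "month", "hour", "holiday", "weekday", "workingday",
          "weather", "temp", "feelslike", "humidity", "windspeed"]]
y = data["riders"]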
Example #12
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV

from yellowbrick.regressor import AlphaSelection

if __name__ == '__main__':
    # Load the regression data set
    df = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame
    X = df[feature_names].values  # .as_matrix() was removed from pandas
    y = df[target_name].values

    # Instantiate the linear model and visualizer
    alphas = np.logspace(-10, 1, 400)
    visualizer = AlphaSelection(LassoCV(alphas=alphas))

    visualizer.fit(X, y)
    g = visualizer.poof(outpath="images/alpha_selection.png")