import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from yellowbrick.regressor import AlphaSelection


def testFunc8(savepath='Results/bikeshare_RidgeCV_AlphaSelection.png'):
    '''Run AlphaSelection on the bike-share data.'''
    data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    X = data[[
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]]
    Y = data["riders"]
    alphas = np.logspace(-10, 1, 200)
    visualizer = AlphaSelection(RidgeCV(alphas=alphas))
    visualizer.fit(X, Y)
    visualizer.poof(outpath=savepath)
############################### MODELING #####################################
###############################################################################
############################ RIDGE REGRESSION ################################

####### a) Looking for the best parameters
# Run RidgeCV to find the best alpha
alphas_range = np.arange(1, 200, 5)      # set a range of alphas
regr_cv = RidgeCV(alphas=alphas_range)   # cross-validate over the alphas

# Visualize alpha selection
visualizer = AlphaSelection(regr_cv)
visualizer.fit(X, y)                     # fit the linear regression
g = visualizer.poof()
visualizer.alpha_                        # best alpha turns out to be 81

####### b) Implement ridge regression
ridge = Ridge(alpha=visualizer.alpha_)   # this parameter is chosen by RidgeCV
ridge.fit(X_train, y_train)              # fit a ridge regression on the training data
coefs_ridge = pd.DataFrame(ridge.coef_.T, index=[X.columns])  # coefficients
coefs_ridge = coefs_ridge.rename(columns={0: 'coef_value'})

# TRAIN SET
pred_train = ridge.predict(X_train)      # use this model to predict the train data
# Calculate train RMSE and R^2
print("RMSE for Train:", sqrt(mean_squared_error(y_train, pred_train)))  # RMSE
print("R^2 for Train:", ridge.score(X_train, y_train))                   # R^2
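# TEST SET (a minimal sketch of the matching held-out evaluation, assuming
# X_test and y_test come from the same train/test split as X_train and
# y_train; they are not defined in this excerpt)
pred_test = ridge.predict(X_test)        # use this model to predict the test data
print("RMSE for Test:", sqrt(mean_squared_error(y_test, pred_test)))     # RMSE
print("R^2 for Test:", ridge.score(X_test, y_test))                      # R^2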
plt.scatter(range(X_poly.shape[1]), lasso.coef_,
            c=np.sign(lasso.coef_), cmap="bwr_r")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoCV

### Find the optimal alpha
alphas = np.logspace(-10, 1, 400)
lasso_alpha = LassoCV(alphas=alphas)
lasso_yb = AlphaSelection(lasso_alpha)
lasso_yb.fit(X, y)
lasso_yb.poof()

### RVF plot
lasso_yb = ResidualsPlot(lasso, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()

### Prediction error
lasso_yb = PredictionError(lasso, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()
import logging
import os
import pickle
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from sklearn.preprocessing import StandardScaler
from yellowbrick.regressor import AlphaSelection


def main(processed_path="data/processed",
         models_path="models",
         visualizations_path="visualizations"):
    """Creates visualizations."""

    # logging
    logger = logging.getLogger(__name__)

    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))
    visualizations_path = os.path.normpath(visualizations_path)
    logger.debug("Path to visualizations normalized: {}"
                 .format(visualizations_path))

    #%% load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))

    # load models
    mod = pickle.load(open(
        os.path.join(models_path, 'sklearn_ElasticNetCV.pkl'), 'rb'))
    logger.info("Loaded sklearn_ElasticNetCV.pkl.")
    mod_sm = pickle.load(open(
        os.path.join(models_path, 'sm_OLS_fit_regularized.pkl'), 'rb'))
    logger.info("Loaded sm_OLS_fit_regularized.")

    #%% split selected_df into dependent and independent variables
    teams_df = selected_df.iloc[:, :9]
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    yX = pd.concat([y, X], axis=1)
    logger.debug("Split selected_df into teams_df, y, X and yX.")

    #%% start visualization
    start = time()
    sns.set_context('paper')
    logger.debug("Set seaborn context to 'paper'.")
    rcParams.update({'figure.autolayout': True})
    logger.debug("Set figure.autolayout to True.")

    #%% correlation coefficient matrix
    logger.info("Start visualizing correlation_coefficient_matrix.png.")
    corr = yX.corr()
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(240, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    fig = sns.heatmap(corr, mask=mask, cmap=cmap,
                      vmin=-1, vmax=1, center=0, square=True,
                      linewidths=.5, cbar_kws={"shrink": .5}).get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'correlation_coefficient_matrix.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing correlation_coefficient_matrix.png.")

    #%% histograms of transformation
    sns.set_style("darkgrid")
    logger.debug("Set seaborn style to darkgrid.")
    logger.info("Start visualizing histograms.")
    # histogram of ranking
    fig = sns.distplot(teams_df.Ranking, rug=True,
                       axlabel='ranking').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_ranking.png'), dpi=300)
    fig.clear()
    plt.close()
    # histogram of ranking_log
    fig = sns.distplot(y, rug=True, axlabel='ranking_log').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_ranking_log.png'), dpi=300)
    fig.clear()
    plt.close()
    # histogram of loc_max
    fig = sns.distplot(np.e**X.loc_max_log, rug=True,
                       axlabel='loc_max').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_loc_max.png'), dpi=300)
    fig.clear()
    plt.close()
    # histogram of loc_max_log
    fig = sns.distplot(X.loc_max_log, rug=True,
                       axlabel='loc_max_log').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_loc_max_log.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing histograms.")

    #%% standardize
    logger.info("Start standardizing X.")
    scaler = StandardScaler()
    not_standardize = ['core', 'visualization',
                       'machine_learning', 'deep_learning']
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index=X.index,
                                  columns=X.columns.drop(not_standardize))
    X_not_standardized = X[not_standardize]
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    logger.debug("After standardization:\n{}"
                 .format(X.describe().to_string()))
    # update yX
    yX = pd.concat([y, X], axis=1)
    logger.info("Finished standardizing X.")

    #%% boxplot
    logger.info("Start visualizing boxplot.png.")
    f, ax = plt.subplots(figsize=(12, 8))
    fig = sns.boxplot(data=yX)
    fig.set_xticklabels(fig.get_xticklabels(), rotation=270)
    fig.get_figure().savefig(os.path.join(visualizations_path,
                                          'boxplot.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing boxplot.png.")

    #%% residual plot
    logger.info("Start visualizing residplot.png.")
    f, ax = plt.subplots(figsize=(5, 5))
    fig = sns.residplot(x=mod_sm.fittedvalues, y=y, data=X).get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'residplot.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing residplot.png.")

    #%% plot ElasticNetCV results
    # need to refit the model with a fixed l1_ratio (the best l1_ratio)
    # in order to visualize correctly
    mod.set_params(l1_ratio=mod.l1_ratio_)
    logger.info("Fixed l1_ratio to {}".format(mod.l1_ratio_))
    mod.fit(X.values, y.values)
    logger.info("Refitted ElasticNetCV model.")

    # plot MSEs across the folds
    logger.info("Start visualizing ElasticNetCV_MSE_per_fold.png.")
    alphas = mod.alphas_
    fig = plt.figure()
    plt.plot(alphas, mod.mse_path_, ':')
    plt.plot(alphas, mod.mse_path_.mean(axis=-1), 'b',
             label='Average over the folds')
    plt.axvline(mod.alpha_, linestyle='--', color='k',
                label="$\\alpha={:0.3f}$".format(mod.alpha_))
    plt.legend()
    plt.xlabel('alpha')
    plt.ylabel('error (or score)')
    plt.title('ElasticNetCV Alpha Error (per CV-fold)')
    plt.axis('tight')
    fig.savefig(os.path.join(visualizations_path,
                             'ElasticNetCV_MSE_per_fold.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing ElasticNetCV_MSE_per_fold.png.")

    # plot R^2 errors (minimizing them is equivalent to minimizing MSE)
    logger.info("Start visualizing ElasticNetCV_MSE.png.")
    visualizer = AlphaSelection(mod)
    visualizer.fit(X, y)
    visualizer.poof(outpath=os.path.join(visualizations_path,
                                         'ElasticNetCV_MSE.png'), dpi=300)
    plt.close()
    logger.info("Finished visualizing ElasticNetCV_MSE.png.")

    #%% pairplot not performed since it is too big
    # X_used = X.loc[:, mod.coef_ != 0]
    # fig = sns.pairplot(pd.concat([y, X_used], axis=1), kind='reg')
    # fig.savefig(os.path.join(visualizations_path,
    #                          'pairplot.png'), dpi=100)
    # fig.clear()
    # plt.close()

    #%% log time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end - start).round(freq='s')
    logger.info("Time needed to create visualizations: {}"
                .format(time_passed))
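# A minimal entry point for main() (a sketch; the original driver is not shown
# in this excerpt). It only configures basic logging and uses the default paths.
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(name)s %(levelname)s %(message)s')
    main()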
plt.scatter(range(X_poly.shape[1]), ridge.coef_,
            c=np.sign(ridge.coef_), cmap="bwr_r")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import RidgeCV

### Find the optimal alpha
alphas = np.logspace(-10, 1, 400)
ridge_alpha = RidgeCV(alphas=alphas)
ridge_yb = AlphaSelection(ridge_alpha)
ridge_yb.fit(X, y)
ridge_yb.poof()

### RVF plot
ridge_yb = ResidualsPlot(ridge, hist=True)
ridge_yb.fit(X_train, y_train)
ridge_yb.score(X_test, y_test)
ridge_yb.poof()

### Prediction error
ridge_yb = PredictionError(ridge, hist=True)
ridge_yb.fit(X_train, y_train)
ridge_yb.score(X_test, y_test)
ridge_yb.poof()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import GridSearchCV, KFold
from yellowbrick.regressor import AlphaSelection

df = pd.read_csv('D:/VS/Pt/ML/Keras/hitters.csv')

diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
# X = diabetes.data
# y = diabetes.target

alphas = np.logspace(-5, -0.5, 30)

# Instantiate the linear model and visualizer
model = LassoCV(alphas=alphas)
visualizer = AlphaSelection(model)
visualizer.fit(X, y)
g = visualizer.poof()
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection

if __name__ == '__main__':
    # Load the regression data set
    df = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = ['cement', 'slag', 'ash', 'water', 'splast',
                     'coarse', 'fine', 'age']
    target_name = 'strength'

    # Get the X and y data from the DataFrame
    X = df[feature_names]
    y = df[target_name]

    # Instantiate the linear model and visualizer
    alphas = np.logspace(-10, 1, 400)
    visualizer = AlphaSelection(LassoCV(alphas=alphas))
    visualizer.fit(X, y)
    g = visualizer.poof(outpath="images/alpha_selection.png")
plt.plot('t', 'ytest', data=data, color='blue', linewidth=1, label='actual')
plt.plot('t', 'yhat', data=data, color='orange', marker='o',
         linestyle="None", label='predicted', alpha=0.5)
plt.plot('t', 'error', data=data, color='gray')
plt.title('Lasso')
plt.legend()
fig.savefig('lasso_total.png')
plt.show()

# Alpha Selection
fig, ax = plt.subplots()
alphas = np.logspace(-2, 1, 250)
cv = KFold(n_splits=5, shuffle=True, random_state=7)
lasso = LassoCV(alphas=alphas, n_alphas=250, fit_intercept=True,
                normalize=False, cv=cv, tol=0.0001, n_jobs=-1, verbose=1)
visualizer = AlphaSelection(lasso, ax=ax)
visualizer.fit(Xtrain, ytrain)
visualizer.poof(outpath="lasso_alphaselection.png")
sns.heatmap(res, annot=True, cmap="YlGnBu")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import ElasticNetCV

### Find the optimal alpha
alphas = np.logspace(-10, 1, 400)
elastic_alpha = ElasticNetCV(alphas=alphas)
elastic_yb = AlphaSelection(elastic_alpha)
elastic_yb.fit(X, y)
elastic_yb.poof()

### RVF plot
elastic_yb = ResidualsPlot(elastic, hist=True)
elastic_yb.fit(X_train, y_train)
elastic_yb.score(X_test, y_test)
elastic_yb.poof()

### Prediction error
elastic_yb = PredictionError(elastic, hist=True)
elastic_yb.fit(X_train, y_train)
elastic_yb.score(X_test, y_test)
elastic_yb.poof()
lasso_lars = grid.best_estimator_
plt.scatter(range(X_poly.shape[1]), lasso_lars.coef_,
            c=np.sign(lasso_lars.coef_), cmap="bwr_r")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoLarsCV

### Find the optimal alpha
lassolars_yb = AlphaSelection(LassoLarsCV())
lassolars_yb.fit(X, y)
lassolars_yb.poof()

### RVF plot
lassolars_yb = ResidualsPlot(lasso_lars, hist=True)
lassolars_yb.fit(X_train, y_train)
lassolars_yb.score(X_test, y_test)
lassolars_yb.poof()

### Prediction error
lassolars_yb = PredictionError(lasso_lars, hist=True)
lassolars_yb.fit(X_train, y_train)
lassolars_yb.score(X_test, y_test)
lassolars_yb.poof()
import numpy as np
import bikeshare

from sklearn.linear_model import RidgeCV
from yellowbrick.regressor import AlphaSelection

alphas = np.logspace(-10, 1, 200)
visualizer = AlphaSelection(RidgeCV(alphas=alphas))
visualizer.fit(bikeshare.X, bikeshare.y)
visualizer.poof()
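# The bikeshare helper module imported above is not shown. A minimal sketch of
# what it is assumed to provide: a hypothetical bikeshare.py exposing X and y,
# mirroring the column selection in the fixtures-based example earlier.
import pandas as pd

_data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')   # assumed data path
X = _data[["season", "month", "hour", "holiday", "weekday", "workingday",
           "weather", "temp", "feelslike", "humidity", "windspeed"]]
y = _data["riders"]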
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection

if __name__ == '__main__':
    # Load the regression data set
    df = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame as NumPy arrays
    X = df[feature_names].values
    y = df[target_name].values

    # Instantiate the linear model and visualizer
    alphas = np.logspace(-10, 1, 400)
    visualizer = AlphaSelection(LassoCV(alphas=alphas))
    visualizer.fit(X, y)
    g = visualizer.poof(outpath="images/alpha_selection.png")