def classification_cvscores(outpath="images/cv_scores_classifier.png", **kwargs):
    X, y = load_occupancy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = StratifiedKFold(12)
    oz = CVScores(
        MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted'
    )

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
def regression_cvscores(outpath="images/cv_scores_regressor.png", **kwargs):
    X, y = load_energy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = KFold(12)
    oz = CVScores(
        Ridge(), ax=ax, cv=cv, scoring='r2'
    )

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
def visualizeKFoldCrossValidation(classifier, features, labels):
    cv = StratifiedKFold(10)

    # Create the cv score visualizer
    oz = CVScores(classifier, cv=cv, scoring='precision')
    oz.fit(
        features.drop(["appid", "name"], axis=1),
        list(map(convertLabelToNumber, labels))
    )
    oz.poof()
def run_crossvalidation(model, x_train, y_train, cv=5, scoring="accuracy", report=None, model_name=None):
    """
    Runs cross validation on a certain model.

    Parameters
    ----------
    model : Model
        Model to cross validate
    x_train : nd-array
        Training data
    y_train : nd-array
        Training labels
    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5
    scoring : str, optional
        Scoring method, by default 'accuracy'
    report : optional
        If provided, the figure is also saved to disk, by default None
    model_name : str, optional
        Model name used in the saved figure path, by default None
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    visualizer_scores = CVScores(model, cv=cv, scoring=scoring, ax=axes[0])
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.finalize()

    visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring, ax=axes[1])
    visualizer_lcurve.fit(x_train, y_train)
    visualizer_lcurve.finalize()

    visualizer_scores.show()
    visualizer_lcurve.show()

    if report or _global_config['track_experiments']:  # pragma: no cover
        fig.savefig(os.path.join(IMAGE_DIR, model_name, "cv.png"))
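# A minimal usage sketch for the helper above, assuming a scikit-learn dataset
# and estimator; the estimator, dataset, and split below are illustrative, not
# from the original source. The report/model_name arguments are left at their
# defaults, which assumes _global_config['track_experiments'] is falsy so no
# figure is written to disk.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

X, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

run_crossvalidation(
    RandomForestClassifier(n_estimators=100, random_state=0),
    x_train, y_train,
    cv=StratifiedKFold(n_splits=5),
    scoring="f1_weighted",
)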
def regression_cvscores(outpath="images/cv_scores_regressor.png", **kwargs):
    X, y = load_energy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = KFold(12)
    oz = CVScores(Ridge(), ax=ax, cv=cv, scoring='r2')

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
def classification_cvscores(outpath="images/cv_scores_classifier.png", **kwargs):
    X, y = load_occupancy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = StratifiedKFold(12)
    oz = CVScores(MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted')

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
def cv_scores_regressor(path="images/cv_scores_regressor.png"):
    data = pd.read_csv(os.path.join(FIXTURES, "energy", "energy.csv"))

    targets = ["heating load", "cooling load"]
    features = [col for col in data.columns if col not in targets]

    X = data[features]
    y = data[targets[1]]

    _, ax = plt.subplots()

    oz = CVScores(RidgeCV(), ax=ax, scoring='r2')
    oz.fit(X, y)
    oz.poof(outpath=path)
def run_crossvalidation(model, x_train, y_train, cv=5, scoring="accuracy", learning_curve=False):
    """
    Runs cross validation on a certain model.

    Parameters
    ----------
    model : Model
        Model to cross validate
    x_train : nd-array
        Training data
    y_train : nd-array
        Training labels
    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5
    scoring : str, optional
        Scoring method, by default 'accuracy'
    learning_curve : bool, optional
        If True, also plot the learning curve, by default False

    Returns
    -------
    list
        Cross validation scores, one per fold
    """
    # TODO: Make curves slightly bigger
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring)
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.show()

    if learning_curve:
        visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring)
        visualizer_lcurve.fit(x_train, y_train)
        visualizer_lcurve.show()

    return visualizer_scores.cv_scores_
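# A hypothetical call to the run_crossvalidation variant above, with the
# learning-curve option enabled; the dataset and estimator are illustrative
# assumptions, not from the original source.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

scores = run_crossvalidation(
    LogisticRegression(max_iter=1000),
    X, y,
    cv=5,
    scoring="accuracy",
    learning_curve=True,
)
print(scores)  # per-fold scores taken from CVScores.cv_scores_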
def cv_scores_classifier(path="images/cv_scores_classifier.png"):
    data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv"))

    target = "outcome"
    features = [col for col in data.columns if col != target]

    X = pd.get_dummies(data[features])
    y = data[target]

    _, ax = plt.subplots()

    cv = StratifiedKFold(12)
    oz = CVScores(MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted')
    oz.fit(X, y)
    oz.poof(outpath=path)
def classification_structure(ml, feature, model_, kFold=False, LOO=False, PCA_data=False,
                             constant_split=False, structured=True,
                             plot_correlation_matrix=False, pred=False, disc=True,
                             bal=True, conf=True):
    """
    ml : ML-ready feature vector containing experimental and kinematic data
    feature : labels for each class (vectorized using blist and get_ML_labels)
    model_ : classifier (sk-Learn compatible)
    kFold : int, number of folds if using kFold cross-validation from sk-Learn
    LOO : boolean flag, set True if using LOO cross-validation from sk-Learn
    PCA_data : boolean flag or int, set to a component count if using PCA to reduce dimensions of feature vectors
    constant_split : boolean flag, set True if comparing results between classifiers
    structured : boolean flag, set True to run the per-class (structured) classification path
    plot_correlation_matrix : boolean flag, set True to plot a Pearson correlation matrix of the training features
    pred, disc, bal, conf : boolean flags forwarded to visualize_model
    """
    # split before norming to prevent bias in test data
    classifier_pipeline = make_pipeline(preprocessing.StandardScaler(), model_)

    if constant_split:
        cs = []
        X_train, X_test, y_train, y_test = split_ML_array(ml, feature, t=0.2)
        train_labels = get_ML_labels(y_train)
        X_train = norm_and_zscore_ML_array(X_train, robust=False, decomp=False, gauss=False)
        X_test = norm_and_zscore_ML_array(X_test, robust=False, decomp=False, gauss=False)
        for i, vals in enumerate(train_labels):
            cs.append(run_classifier(model_, X_train, X_test, vals, 0))
            # need to add cross_val_score for X_train, X_test splits
        return cs, model_

    # Create Classifier Pipeline Object in SciKit Learn
    if PCA_data:
        classifier_pipeline = make_pipeline(
            preprocessing.StandardScaler(),
            decomposition.PCA(n_components=int(PCA_data)),
            model_)
    else:
        classifier_pipeline = make_pipeline(preprocessing.StandardScaler(), model_)

    # For simple Classifier:
    X_train, X_test, y_train, y_test = split_ML_array(ml, feature, t=0.2)
    # generate correct labels for test/train labels
    train_labels = get_ML_labels(y_train)
    # norm and z-score test/train features
    X_train = norm_and_zscore_ML_array(X_train, robust=False, decomp=False, gauss=False)
    X_test = norm_and_zscore_ML_array(X_test, robust=False, decomp=False, gauss=False)

    # Feature Work
    if PCA_data:
        pcs = decomposition.PCA()
        X_train = pcs.fit_transform(X_train)
        X_test = pcs.transform(X_test)
        # keep the leading components that explain 99% of the variance
        n_comps = X_train.shape[1]
        for ii, mi in enumerate(pcs.explained_variance_ratio_.cumsum()):
            if mi > .99:
                n_comps = ii
                break
        X_train = X_train[:, 0:n_comps]
        X_test = X_test[:, 0:n_comps]

    if plot_correlation_matrix:
        pearson_features(X_train)

    preds = []
    if structured:
        for idx, vals in enumerate(train_labels):
            # check for important class, then train inputs
            if idx == 0:  # Reach vs Null
                # Save ML predictions, models
                preds.append(
                    cross_val_score(classifier_pipeline,
                                    ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                    get_ML_labels(feature)[idx], cv=kFold))
                # Plot in Yellowbrick
                visualizer = CVScores(classifier_pipeline, cv=kFold, scoring='f1_weighted')
                visualizer.fit(ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                               get_ML_labels(feature)[idx])
                visualizer.show()
                visualize_model(ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                get_ML_labels(feature)[idx], classifier_pipeline,
                                pred=pred, disc=disc, conf=conf, bal=bal)
            if idx == 1:  # num reaches, 1 vs >1
                # Save ML predictions, models
                preds.append(
                    cross_val_score(classifier_pipeline,
                                    ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                    get_ML_labels(feature)[idx], cv=kFold))
                # Plot in Yellowbrick
                visualizer = CVScores(classifier_pipeline, cv=kFold, scoring='f1_weighted')
                visualizer.fit(ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                               get_ML_labels(feature)[idx])
                visualizer.show()
                visualize_model(ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                get_ML_labels(feature)[idx], classifier_pipeline,
                                pred=pred, disc=disc, conf=conf, bal=bal)
            if idx == 3:  # l/r vs lra,bi,rla
                # Save ML predictions, models
                preds.append(
                    cross_val_score(classifier_pipeline,
                                    ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                    get_ML_labels(feature)[idx], cv=kFold))
                # Plot in Yellowbrick
                visualizer = CVScores(classifier_pipeline, cv=kFold, scoring='f1_weighted')
                visualizer.fit(ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                               get_ML_labels(feature)[idx])
                visualizer.show()
                visualize_model(ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                get_ML_labels(feature)[idx], classifier_pipeline,
                                pred=pred, disc=disc, conf=conf, bal=bal)
    else:
        # loop over each layer of classifier, this just does classification
        for i, vals in enumerate(train_labels):
            try:
                if kFold:
                    preds.append(
                        cross_val_score(classifier_pipeline,
                                        ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                        get_ML_labels(feature)[i], cv=kFold))
                elif LOO:
                    preds.append(
                        cross_val_score(classifier_pipeline,
                                        ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                                        get_ML_labels(feature)[i], cv=ml.shape[0] - 10))
                else:  # simple classification
                    preds.append(run_classifier(model_, X_train, X_test, vals, 0))
                    continue
            except:
                print('Bad Classifier Entry (Line 500)')
                pdb.set_trace()

    try:
        print_preds(preds, train_labels)
    except:
        print('')

    return preds, model_
def cvscores():
    X, y = load_energy()
    oz = CVScores(Ridge(), scoring="r2", cv=10, ax=newfig())
    oz.fit(X, y)
    savefig(oz, "cv_scores")
cd_visualizer = cooks_distance(X=X_train, y=y_train_log)

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Cross Validation through YellowBrick
# ### **Evaluation of R2 over 4-fold Cross-Validation**
# - the linear log model is evaluated via 4-fold cross-validation

# +
from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores

# Instantiate the KFold settings
cv = KFold(n_splits=4, random_state=42)

cv_visualizer = CVScores(model=lr_log, cv=cv, scoring="r2")
cv_visualizer.fit(X=X_train_log, y=y_train_log)  # fit data into visualizer
cv_visualizer.poof()

# + [markdown] pycharm={"name": "#%% md\n"}
# - Median cross-validation R2 score is 89% and fairly consistent.
# - Evaluating next via scikit-learn's model selection package
#
# -

# ### **Evaluation of RMSE (Root Mean Square Error)**

# +
from sklearn.model_selection import cross_val_score

lr_r2_scores = cross_val_score(estimator=lr_log,
# ### Task 9: Cross Validation Scores

# In[36]:

from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores

# Create a new figure and axes
_, ax = plt.subplots()

cv = KFold(12)
oz = CVScores(
    Lasso(), ax=ax, cv=cv, scoring='r2'
)

x = oz.fit(X_train, y_train)
oz.poof()

# ### Task 10: Learning Curves

# In[38]:

from yellowbrick.model_selection import LearningCurve
from sklearn.linear_model import LassoCV
from pylab import rcParams
case_name = "mg_sizing_dataset_with_loc" df = pd.read_csv("results/" + case_name + ".csv", sep=";|,", engine="python", index_col='index') #df = df.loc[df['off-grid'] == 1] X = df[features] scaler.fit(X) X = scaler.transform(X) # X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns) targets = ["PV","BAT","RBAT","INV","GEN","NPV"] y = df[targets] cv = StratifiedKFold(12) param_range = np.arange(1, 30, 1) cv = KFold(n_splits=12, random_state=40, shuffle=True) viz = ValidationCurve( KNeighborsRegressor(), param_name="n_neighbors", param_range=param_range, scoring="r2", cv=cv, n_jobs=8 ) viz.fit(X, y) viz.show() visualizer = LearningCurve(KNeighborsRegressor(), scoring='r2', random_state=2, cv=cv, shuffle=True) visualizer.fit(X, y) visualizer.show() vis = CVScores(KNeighborsRegressor(), cv=cv, scoring='r2') vis.fit(X, y) # Fit the data to the visualizer vis.show()
visualizer = ConfusionMatrix(model)
visualizer.score(X_test, y_test)
visualizer.show()

# Discrimination threshold selection
visualizer = DiscriminationThreshold(model)
visualizer.fit(X_train, y_train)
visualizer.show()

# Learning curve
visualizer = LearningCurve(model, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Cross validation
visualizer = CVScores(model, cv=5, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Feature importances
visualizer = FeatureImportances(model)
visualizer.fit(X_train, y_train)
visualizer.show()

# Recursive feature elimination
visualizer = RFECV(model, cv=5, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Feature selection
visualizer = ValidationCurve(model,
ax.bar(March_avg_counts['day'], March_avg_counts['average_count'])
ax.set_ylabel('Average Count from hours 12 to 18')
ax.set_xlabel('Day')
plt.title("Average counts in March (1st to 19th)")
plt.xticks(np.arange(0, 20, step=1))
plt.show()

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(March_test_avg_counts['day'], March_test_avg_counts['average_count'] - 60)
ax.set_ylabel('Average Count from hours 12 to 18')
ax.set_xlabel('Day')
plt.title("Average counts in March (20th to 31st)")
plt.xticks(np.arange(20, 32, step=1))
plt.show()

# Cross-Validation
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import KFold

X_train, X_test, y_train, y_test = train_test_split(train_100[columns1], train_100['count'], test_size=0.20)
dt = DecisionTreeRegressor(random_state=0, criterion="mae")
dt_fit = dt.fit(X_train, y_train)

cv = KFold(n_splits=12, random_state=42)
visualizer = CVScores(dt, cv=cv, scoring='r2')
visualizer.fit(train_100[columns1], train_100['count'])
visualizer.show()
cd_visualizer = cooks_distance(X=X_train, y=y_train_log)

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Cross Validation through YellowBrick
# - the linear log model is evaluated via 4-fold cross-validation

# + pycharm={"is_executing": false}
from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores

# Instantiate the KFold settings
cv = KFold(n_splits=4, random_state=42)

cv_visualizer = CVScores(model=lr_log, cv=cv, scoring="r2")
cv_visualizer.fit(X=X_train_log, y=y_train_log)  # fit data into visualizer
cv_visualizer.poof()

# + [markdown] pycharm={"name": "#%% md\n"}
# - Median cross-validation R2 score is 89% and fairly consistent.
# - Evaluating next via scikit-learn's model selection package
#

# + pycharm={"is_executing": false}
from sklearn.model_selection import cross_val_score

lr_r2_scores = cross_val_score(estimator=lr_log, X=X_train_log, y=y_train_log, scoring='r2',
def draw_cross_validation_scores(self, cv, scoring='accuracy'):
    visualizer = CVScores(model=self.model, cv=cv, scoring=scoring)
    visualizer.fit(self.training_data, self.training_labels)
    visualizer.poof()
for train_index, test_index in cv.split(m2_pch):
    print("Train index: ", train_index, "\n")
    print("Test index: ", test_index)

    X_train, X_test, y_train, y_test = (
        m2_pch_strs_broken.iloc[train_index],
        m2_pch_strs_broken.iloc[test_index],
        y_pch.iloc[train_index],
        y_pch.iloc[test_index],
    )
    ridger5.fit(X_train, y_train)
    scores.append(ridger5.score(X_test, y_test))

print(np.mean(scores))

from yellowbrick.model_selection import CVScores

cv = KFold(n_splits=10, random_state=12345, shuffle=False)
ridger5 = Ridge(alpha=5)
visualizer = CVScores(ridger5, cv=cv, scoring='r2')

visualizer.fit(m2_pch_strs_broken, y_pch)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure

# Instantiate the linear model and visualizer
#from yellowbrick.regressor import ResidualsPlot
#visr5 = ResidualsPlot(ridger5)
#visr5.fit(m2_pch_train_strs_broken, y_pch_train)  # Fit the training data to the visualizer
#visr5.score(m2_pch_test_strs_broken, y_pch_test)  # Evaluate the model on the test data
#visr5.show()  # Finalize and render the figure

#from yellowbrick.regressor import PredictionError
#ridger5 = Ridge(alpha=5)
#visualizer = PredictionError(ridger5)
# Cross Validation Model
from sklearn.model_selection import (cross_val_score, StratifiedShuffleSplit)
from yellowbrick.model_selection import CVScores

cv = StratifiedShuffleSplit(n_splits=5, random_state=0)
cvs = cross_val_score(SVC(kernel='rbf', C=100, gamma='auto'), X_scale, y_scale, cv=cv, scoring='f1_macro')

print('\nCross Validation\n')
print('Cross Validation Score : ', cvs.mean())

cv_vis = CVScores(SVC(kernel='rbf', C=100, gamma='auto'), cv=cv, scoring='f1_macro')
cv_vis.fit(X_scale, y_scale)
cv_vis.show()
print('\nVisualization...')

# Scoring Estimator
print('\nScoring Estimator\n')

## Classification Report
from sklearn.metrics import (classification_report, confusion_matrix)

name = ['edible', 'poisonous']
cr = classification_report(y_test, y_pred, target_names=name)
print('Classification Report : \n', cr)

## Confusion Matrix