def classification_cvscores(outpath="images/cv_scores_classifier.png", **kwargs):
    X, y = load_occupancy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = StratifiedKFold(12)

    oz = CVScores(
        MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted'
    )

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)


def regression_cvscores(outpath="images/cv_scores_regressor.png", **kwargs):
    X, y = load_energy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = KFold(12)

    oz = CVScores(
        Ridge(), ax=ax, cv=cv, scoring='r2'
    )

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
Example #3
def visualizeKFoldCrossValidation(classifier, features, labels):

    cv = StratifiedKFold(10)
    # Create the cv score visualizer
    oz = CVScores(classifier, cv=cv, scoring='precision')

    oz.fit(features.drop(["appid", "name"], axis=1),
           list(map(convertLabelToNumber, labels)))
    oz.poof()
Example #4
File: util.py  Project: nperera0/aethos
def run_crossvalidation(model,
                        x_train,
                        y_train,
                        cv=5,
                        scoring="accuracy",
                        report=None,
                        model_name=None):
    """
    Runs cross validation on a certain model.
    
    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training data

    y_train : nd-array
        Training labels

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    report : optional
        If provided (or if experiment tracking is enabled), the rendered figure is saved to disk, by default None

    model_name : str, optional
        Model name used in the image save path, by default None
    """

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring, ax=axes[0])
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.finalize()

    visualizer_lcurve = LearningCurve(model,
                                      cv=cv,
                                      scoring=scoring,
                                      ax=axes[1])
    visualizer_lcurve.fit(x_train, y_train)
    visualizer_lcurve.finalize()

    visualizer_scores.show()
    visualizer_lcurve.show()

    if report or _global_config['track_experiments']:  # pragma: no cover
        fig.savefig(os.path.join(IMAGE_DIR, model_name, "cv.png"))
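A minimal usage sketch for this helper; the estimator, dataset, and model name below are illustrative assumptions, not code from the aethos project.

# Hypothetical call (assumed names), exercising the CVScores/LearningCurve plots above.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = load_breast_cancer(return_X_y=True)
run_crossvalidation(
    RandomForestClassifier(n_estimators=100),
    X_demo,
    y_demo,
    cv=5,
    scoring="f1_weighted",
    model_name="rf_demo",  # only used for the save path if a report/experiment tracking is active
)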
Example #5
def regression_cvscores(outpath="images/cv_scores_regressor.png", **kwargs):
    X, y = load_energy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = KFold(12)

    oz = CVScores(Ridge(), ax=ax, cv=cv, scoring='r2')

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
Example #6
def classification_cvscores(outpath="images/cv_scores_classifier.png",
                            **kwargs):
    X, y = load_occupancy()

    # Create a new figure and axes
    _, ax = plt.subplots()

    cv = StratifiedKFold(12)

    oz = CVScores(MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted')

    oz.fit(X, y)

    # Save to disk
    oz.poof(outpath=outpath)
Example #7
def cv_scores_regressor(path="images/cv_scores_regressor.png"):

    data = pd.read_csv(os.path.join(FIXTURES, "energy", "energy.csv"))

    targets = ["heating load", "cooling load"]
    features = [col for col in data.columns if col not in targets]

    X = data[features]
    y = data[targets[1]]

    _, ax = plt.subplots()

    oz = CVScores(RidgeCV(), ax=ax, scoring='r2')
    oz.fit(X, y)
    oz.poof(outpath=path)
Example #8
def run_crossvalidation(model,
                        x_train,
                        y_train,
                        cv=5,
                        scoring="accuracy",
                        learning_curve=False):
    """
    Runs cross validation on a certain model.
    
    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training data

    y_train : nd-array
        Training labels

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    learning_curve : bool, optional
        If true plot learning curve, by default False
    
    Returns
    -------
    array-like
        Cross-validation scores, one per fold
    """

    # TODO: Make curves slightly bigger
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring)
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.show()

    if learning_curve:
        visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring)
        visualizer_lcurve.fit(x_train, y_train)
        visualizer_lcurve.show()

    return visualizer_scores.cv_scores_
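This variant returns the per-fold scores so the caller can aggregate them; a short hedged usage sketch follows (the estimator and dataset are illustrative assumptions, not part of the original project).

# Hypothetical call (assumed names): collect the fold scores and report their mean.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = load_iris(return_X_y=True)
fold_scores = run_crossvalidation(
    LogisticRegression(max_iter=1000),
    X_demo,
    y_demo,
    cv=5,
    scoring="accuracy",
    learning_curve=True,
)
print("mean CV accuracy:", sum(fold_scores) / len(fold_scores))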
Example #9
def cv_scores_classifier(path="images/cv_scores_classifier.png"):

    data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv"))

    target = "outcome"
    features = [col for col in data.columns if col != target]

    X = pd.get_dummies(data[features])
    y = data[target]

    _, ax = plt.subplots()
    cv = StratifiedKFold(12)

    oz = CVScores(MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted')

    oz.fit(X, y)
    oz.poof(outpath=path)


def classification_structure(ml,
                             feature,
                             model_,
                             kFold=False,
                             LOO=False,
                             PCA_data=False,
                             constant_split=False,
                             structured=True,
                             plot_correlation_matrix=False,
                             pred=False,
                             disc=True,
                             bal=True,
                             conf=True):  # add kFold argument
    """
    ml : ML-ready feature vector containing experimental and kinematic data
    feature : labels for each class (vectorized using blist and get_ML_labels)
    model_ : classifier (scikit-learn compatible)
    kFold : int, number of folds if using k-fold cross-validation from scikit-learn
    LOO : boolean flag, set True if using LOO cross-validation from scikit-learn
    PCA_data : boolean flag (or int number of components), set True if using PCA to reduce dimensions of feature vectors
    constant_split : boolean flag, set True if comparing results between classifiers
    structured : boolean flag, set True to run the structured per-class-layer pipeline
    plot_correlation_matrix : boolean flag, set True to plot a Pearson correlation matrix of the training features
    pred, disc, bal, conf : boolean flags forwarded to visualize_model to toggle its plots

    """
    # split before norming to prevent bias in test data
    classifier_pipeline = make_pipeline(preprocessing.StandardScaler(), model_)
    if constant_split:
        cs = []
        X_train, X_test, y_train, y_test = split_ML_array(ml, feature, t=0.2)
        train_labels = get_ML_labels(y_train)
        X_train = norm_and_zscore_ML_array(X_train,
                                           robust=False,
                                           decomp=False,
                                           gauss=False)
        X_test = norm_and_zscore_ML_array(X_test,
                                          robust=False,
                                          decomp=False,
                                          gauss=False)
        for i, vals in enumerate(train_labels):
            cs.append(run_classifier(model_, X_train, X_test, vals, 0))
            # need to add cross_val_score for X_train,X_test splits

        return cs, model_

    # Create Classifier Pipeline Object in SciKit Learn
    if PCA_data:
        classifier_pipeline = make_pipeline(
            preprocessing.StandardScaler(),
            decomposition.PCA(n_components=int(PCA_data)), model_)
    else:
        classifier_pipeline = make_pipeline(preprocessing.StandardScaler(),
                                            model_)
    # For simple Classifier:
    X_train, X_test, y_train, y_test = split_ML_array(ml, feature, t=0.2)
    # generate correct labels for test/train labels
    train_labels = get_ML_labels(y_train)
    # norm and z-score test/train features
    X_train = norm_and_zscore_ML_array(X_train,
                                       robust=False,
                                       decomp=False,
                                       gauss=False)
    X_test = norm_and_zscore_ML_array(X_test,
                                      robust=False,
                                      decomp=False,
                                      gauss=False)
    # Feature Work
    if PCA_data:
        pcs = decomposition.PCA()
        X_train = pcs.fit_transform(X_train)
        X_test = pcs.transform(X_test)
        # keep the components that together explain 99% of the training variance
        cum_var, n_comps = 0.0, X_train.shape[1]
        for ii, mi in enumerate(pcs.explained_variance_ratio_):
            cum_var += mi
            if cum_var > .99:
                n_comps = ii + 1
                break
        X_train = X_train[:, :n_comps]
        X_test = X_test[:, :n_comps]

    if plot_correlation_matrix:
        pearson_features(X_train)

    preds = []
    if structured:
        for idx, vals in enumerate(train_labels):
            # check for important class, then train inputs
            if idx == 0:  # Reach vs Null
                # Save ML predictions, models
                preds.append(
                    cross_val_score(classifier_pipeline,
                                    ml.reshape(ml.shape[0],
                                               ml.shape[1] * ml.shape[2]),
                                    get_ML_labels(feature)[idx],
                                    cv=kFold))
                # Plot in Yellowbrick
                visualizer = CVScores(classifier_pipeline,
                                      cv=kFold,
                                      scoring='f1_weighted')
                visualizer.fit(
                    ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                    get_ML_labels(feature)[idx])
                visualizer.show()
                visualize_model(ml.reshape(ml.shape[0],
                                           ml.shape[1] * ml.shape[2]),
                                get_ML_labels(feature)[idx],
                                classifier_pipeline,
                                pred=pred,
                                disc=disc,
                                conf=conf,
                                bal=bal)
            if idx == 1:  # num reaches, 1 vs >1
                # Save ML predictions, models
                preds.append(
                    cross_val_score(classifier_pipeline,
                                    ml.reshape(ml.shape[0],
                                               ml.shape[1] * ml.shape[2]),
                                    get_ML_labels(feature)[idx],
                                    cv=kFold))
                # Plot in Yellowbrick
                visualizer = CVScores(classifier_pipeline,
                                      cv=kFold,
                                      scoring='f1_weighted')
                visualizer.fit(
                    ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                    get_ML_labels(feature)[idx])
                visualizer.show()
                visualize_model(ml.reshape(ml.shape[0],
                                           ml.shape[1] * ml.shape[2]),
                                get_ML_labels(feature)[idx],
                                classifier_pipeline,
                                pred=pred,
                                disc=disc,
                                conf=conf,
                                bal=bal)
            if idx == 3:  # l/r vs lra,bi,rla
                # Save ML predictions, models
                preds.append(
                    cross_val_score(classifier_pipeline,
                                    ml.reshape(ml.shape[0],
                                               ml.shape[1] * ml.shape[2]),
                                    get_ML_labels(feature)[idx],
                                    cv=kFold))
                # Plot in YellowBrick
                visualizer = CVScores(classifier_pipeline,
                                      cv=kFold,
                                      scoring='f1_weighted')
                visualizer.fit(
                    ml.reshape(ml.shape[0], ml.shape[1] * ml.shape[2]),
                    get_ML_labels(feature)[idx])
                visualizer.show()
                visualize_model(ml.reshape(ml.shape[0],
                                           ml.shape[1] * ml.shape[2]),
                                get_ML_labels(feature)[idx],
                                classifier_pipeline,
                                pred=pred,
                                disc=disc,
                                conf=conf,
                                bal=bal)
    else:
        # loop over each layer of the classifier; this branch just does classification
        for i, vals in enumerate(train_labels):
            try:
                if kFold:
                    preds.append(
                        cross_val_score(classifier_pipeline,
                                        ml.reshape(ml.shape[0],
                                                   ml.shape[1] * ml.shape[2]),
                                        get_ML_labels(feature)[i],
                                        cv=kFold))
                elif LOO:
                    preds.append(
                        cross_val_score(classifier_pipeline,
                                        ml.reshape(ml.shape[0],
                                                   ml.shape[1] * ml.shape[2]),
                                        get_ML_labels(feature)[i],
                                        cv=ml.shape[0] - 10))
                else:  # simple classification
                    preds.append(
                        run_classifier(model_, X_train, X_test, vals, 0))
                    continue
            except:
                print('Bad Classifier Entry (Line 500)')
                pdb.set_trace()
        try:
            print_preds(preds, train_labels)
        except:
            print('')
    return preds, model_
Example #11
def cvscores():
    X, y = load_energy()
    oz = CVScores(Ridge(), scoring="r2", cv=10, ax=newfig())
    oz.fit(X, y)
    savefig(oz, "cv_scores")
cd_visualizer = cooks_distance(X=X_train, y=y_train_log)

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Cross Validation through YellowBrick
# ### **Evaluation of R2 over 4-fold Cross-Validation**
# - the linear log model is evaluated via 4-fold cross-validation

# +
from sklearn.model_selection import KFold

from yellowbrick.model_selection import CVScores

# Instantiate the KFold settings
cv = KFold(n_splits=4, shuffle=True, random_state=42)  # shuffle must be enabled for random_state to take effect

cv_visualizer = CVScores(model=lr_log, cv=cv, scoring="r2")

cv_visualizer.fit(X=X_train_log, y=y_train_log)  # fit data into visualizer
cv_visualizer.poof()

# + [markdown] pycharm={"name": "#%% md\n"}
# - The median cross-validation R2 score is about 0.89 and fairly consistent.
# - Next, the model is evaluated via scikit-learn's model_selection package
#
# -

# ### **Evaluation of RMSE (Root Mean Square Error)**

# +
from sklearn.model_selection import cross_val_score
lr_r2_scores = cross_val_score(estimator=lr_log,
                               X=X_train_log,
                               y=y_train_log,
                               scoring='r2',
                               cv=cv)  # cv assumed: the 4-fold KFold defined above
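Since the heading above mentions RMSE, here is a hedged sketch of that evaluation with the same estimator; the scorer name is scikit-learn's standard negated-RMSE scorer and is an assumption here, not taken from the original notebook.

# Hypothetical RMSE evaluation (assumed continuation of the cell above).
lr_rmse_scores = -cross_val_score(estimator=lr_log,
                                  X=X_train_log,
                                  y=y_train_log,
                                  scoring='neg_root_mean_squared_error',
                                  cv=cv)
print("mean CV RMSE:", lr_rmse_scores.mean())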
Example #13
# ### Task 9: Cross Validation Scores

# In[36]:


from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores

# Create a new figure and axes
_, ax = plt.subplots()

cv = KFold(12)

oz = CVScores(
    Lasso(), ax=ax, cv=cv, scoring='r2'
)

x = oz.fit(X_train, y_train)

oz.poof()


# ### Task 10: Learning Curves

# In[38]:


from yellowbrick.model_selection import LearningCurve
from sklearn.linear_model import LassoCV
from pylab import rcParams
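The Task 10 cell is cut off at the example boundary; below is a minimal sketch of what a learning-curve cell could look like, assuming the same X_train, y_train, and 12-fold split used in Task 9 (the cell body is an assumption, not the notebook's original code).

# Hypothetical continuation of Task 10 (assumed), reusing the Task 9 data and splitter.
rcParams['figure.figsize'] = 10, 6

_, ax = plt.subplots()
viz = LearningCurve(LassoCV(), ax=ax, cv=KFold(12), scoring='r2')
viz.fit(X_train, y_train)
viz.poof()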
Example #14
case_name = "mg_sizing_dataset_with_loc"
df = pd.read_csv("results/" + case_name + ".csv", sep=";|,", engine="python", index_col='index')
#df = df.loc[df['off-grid'] == 1]
X = df[features]
scaler.fit(X)
X = scaler.transform(X)
# X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
targets = ["PV","BAT","RBAT","INV","GEN","NPV"]
y = df[targets]
param_range = np.arange(1, 30, 1)
cv = KFold(n_splits=12, random_state=40, shuffle=True)

viz = ValidationCurve(
    KNeighborsRegressor(), param_name="n_neighbors", param_range=param_range, scoring="r2", cv=cv, n_jobs=8
)

viz.fit(X, y)
viz.show()

visualizer = LearningCurve(KNeighborsRegressor(), scoring='r2', random_state=2, cv=cv, shuffle=True)

visualizer.fit(X, y)
visualizer.show()

vis = CVScores(KNeighborsRegressor(), cv=cv, scoring='r2')

vis.fit(X, y)        # Fit the data to the visualizer
vis.show()
Example #15
    visualizer = ConfusionMatrix(model)
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Discrimination threshold selection
    visualizer = DiscriminationThreshold(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Learning curve
    visualizer = LearningCurve(model, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Cross-validation scores
    visualizer = CVScores(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Feature importances
    visualizer = FeatureImportances(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Recursive feature elimination
    visualizer = RFECV(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Validation curve
    visualizer = ValidationCurve(model,
Example #16
ax.bar(March_avg_counts['day'], March_avg_counts['average_count'])
ax.set_ylabel('Average Count from hours 12 to 18')
ax.set_xlabel('Day')
plt.title("Average counts in March (1st to 19th)")
plt.xticks(np.arange(0, 20, step=1))
plt.show()

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(March_test_avg_counts['day'], March_test_avg_counts['average_count']-60)
ax.set_ylabel('Average Count from hours 12 to 18')
ax.set_xlabel('Day')
plt.title("Average counts in March (20th to 31st)")
plt.xticks(np.arange(20, 32, step=1))
plt.show()

#Cross-Validation
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import KFold

X_train, X_test, y_train, y_test = train_test_split(train_100[columns1], train_100['count'], test_size=0.20)

dt = DecisionTreeRegressor(random_state=0, criterion="mae")
dt_fit = dt.fit(X_train, y_train)

cv = KFold(n_splits=12, shuffle=True, random_state=42)

visualizer = CVScores(dt, cv=cv, scoring='r2')

visualizer.fit(train_100[columns1], train_100['count'])
visualizer.show()
Example #17
cd_visualizer = cooks_distance(X=X_train, y=y_train_log)

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Cross Validation through YellowBrick
# - the linear log model is evaluated via 4-fold cross-validation

# + pycharm={"is_executing": false}
from sklearn.model_selection import KFold

from yellowbrick.model_selection import CVScores

# Instantiate the KFold settings
cv = KFold(n_splits=4, shuffle=True, random_state=42)  # shuffle must be enabled for random_state to take effect

cv_visualizer = CVScores(model=lr_log, cv=cv, scoring="r2")

cv_visualizer.fit(X=X_train_log, y=y_train_log)  # fit data into visualizer
cv_visualizer.poof()

# + [markdown] pycharm={"name": "#%% md\n"}
# - The median cross-validation R2 score is about 0.89 and fairly consistent.
# - Next, the model is evaluated via scikit-learn's model_selection package
#

# + pycharm={"is_executing": false}
from sklearn.model_selection import cross_val_score
lr_r2_scores = cross_val_score(estimator=lr_log,
                               X=X_train_log,
                               y=y_train_log,
                               scoring='r2',
                               cv=cv)  # cv assumed: the 4-fold KFold defined above
Example #18
def draw_cross_validation_scores(self, cv, scoring='accuracy'):
    visualizer = CVScores(model=self.model, cv=cv, scoring=scoring)
    visualizer.fit(self.training_data, self.training_labels)
    visualizer.poof()
for train_index, test_index in cv.split(m2_pch):
    print("Train index: ", train_index, "\n")
    print("Test index: ", test_index)

    X_train, X_test, y_train, y_test = m2_pch_strs_broken.iloc[
        train_index], m2_pch_strs_broken.iloc[test_index], y_pch.iloc[
            train_index], y_pch.iloc[test_index]
    ridger5.fit(X_train, y_train)
    scores.append(ridger5.score(X_test, y_test))

print(np.mean(scores))

from yellowbrick.model_selection import CVScores
cv = KFold(n_splits=10, shuffle=True, random_state=12345)  # random_state only takes effect when shuffle=True
ridger5 = Ridge(alpha=5)
visualizer = CVScores(ridger5, cv=cv, scoring='r2')
visualizer.fit(m2_pch_strs_broken, y_pch)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure

#Instantiate the linear model and visualizer
#from yellowbrick.regressor import ResidualsPlot

#visr5 = ResidualsPlot(ridger5)
#visr5.fit(m2_pch_train_strs_broken, y_pch_train)  # Fit the training data to the visualizer
#visr5.score(m2_pch_test_strs_broken, y_pch_test)  # Evaluate the model on the test data
#visr5.show() # Finalize and render the figure

#from yellowbrick.regressor import PredictionError
#ridger5 = Ridge(alpha=5)
#visualizer = PredictionError(ridger5)
# Cross Validation Model
from sklearn.model_selection import (cross_val_score, StratifiedShuffleSplit)
from yellowbrick.model_selection import CVScores

cv = StratifiedShuffleSplit(n_splits=5, random_state=0)
cvs = cross_val_score(SVC(kernel='rbf', C=100, gamma='auto'),
                      X_scale,
                      y_scale,
                      cv=cv,
                      scoring='f1_macro')
print('\nCross Validation\n')
print('Cross Validation Score : ', cvs.mean())

cv_vis = CVScores(SVC(kernel='rbf', C=100, gamma='auto'),
                  cv=cv,
                  scoring='f1_macro')
cv_vis.fit(X_scale, y_scale)
cv_vis.show()
print('\nVisualization...')

# Scoring Estimator
print('\nScoring Estimator\n')
## Classification Report
from sklearn.metrics import (classification_report, confusion_matrix)

name = ['edible', 'poisonous']
cr = classification_report(y_test, y_pred, target_names=name)
print('Classification Report : \n', cr)

## Confusion Matrix