import numpy as np
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve


def plot_learning_curve(X, y, model, figPath=None):
    """Plots a learning curve for a model, optionally saving it to disk.

    Args:
        X (numpy array): Features for the evaluation.
        y (numpy array): Targets for the evaluation.
        model (sklearn Model): The model to visualize.
        figPath (str, optional): Directory in which to save the figure.
            If None, the figure is displayed without being saved.

    """

    # Create a stratified cross-validation strategy so that every class is
    # represented in each fold. NUM_FOLDS and SEED are assumed to be
    # module-level constants.
    cv = StratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

    visualizer = LearningCurve(model,
                               cv=cv,
                               scoring='f1_weighted',
                               train_sizes=np.linspace(0.3, 1.0, 10),
                               n_jobs=-1)
    visualizer.fit(X, y)

    if figPath is None:
        visualizer.show()
    else:
        visualizer.show(outpath=f'{figPath}/learning.png')
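
# A minimal usage sketch (an addition, not from the original snippet); the
# function relies on module-level NUM_FOLDS and SEED constants, assumed here:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

NUM_FOLDS, SEED = 5, 42  # assumed values for the module constants
X_demo, y_demo = make_classification(n_samples=500, random_state=SEED)
plot_learning_curve(X_demo, y_demo, LogisticRegression(max_iter=1000))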
Example #2
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
from yellowbrick.classifier import (ClassificationReport, ClassPredictionError,
                                    ConfusionMatrix, PrecisionRecallCurve,
                                    ROCAUC)
from yellowbrick.model_selection import LearningCurve


def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    ## learning curve
    plt.clf()
    viz_LC = LearningCurve(
        rfe, scoring='f1_weighted', n_jobs=4
    )
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    ## classification report
    plt.clf()
    viz_CR = ClassificationReport(rfe, classes=class_names, support=True)
    viz_CR.fit(X, y)
    viz_CR.score(X_test, y_test)
    viz_CR.show(outpath=outdir + '/CR.png')

    ## confusion matrix
    plt.clf()
    viz_CM = ConfusionMatrix(rfe, classes=class_names)
    viz_CM.fit(X, y)
    viz_CM.score(X_test, y_test)
    viz_CM.show(outpath=outdir + '/CM.png')

    ## precision recall curve
    plt.clf()
    # size is a visualizer argument rather than a savefig option, so it is
    # passed to the constructor instead of show()
    viz_PRC = PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                                   fill_area=False, micro=False,
                                   classes=class_names, size=(1080, 720))
    viz_PRC.fit(X, y)
    viz_PRC.score(X_test, y_test)
    viz_PRC.show(outpath=outdir + '/PRC.png')

    ## class prediction error
    plt.clf()
    viz_CPE = ClassPredictionError(
        rfe, classes=class_names
    )
    viz_CPE.fit(X, y)
    viz_CPE.score(X_test, y_test)
    viz_CPE.show(outpath=outdir + '/CPE.png')

    ## ROCAUC
    plt.clf()
    viz_RA = ROCAUC(rfe, classes=class_names, size=(1080, 720))
    viz_RA.fit(X, y)
    # score on the held-out test set, consistent with the other visualizers
    viz_RA.score(X_test, y_test)
    viz_RA.show(outpath=outdir + '/RA.png')

    fit = rfe.fit(X, y)
    y_predict = fit.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')

    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df = pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)

    return f1
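
# A hedged usage sketch (an addition, not from the original source): X must be
# a pandas DataFrame, since the function reads X.columns, and outdir must
# already exist.
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris(as_frame=True)
X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target,
                                          stratify=data.target, random_state=0)
os.makedirs('eval_out', exist_ok=True)
f1 = eva_model(c=1.0, n=3, X=X_tr, y=y_tr, X_test=X_te, y_test=y_te,
               class_names=data.target_names, outdir='eval_out')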
Example #3
def generate_learning_curve(model, clf_name, scoring, sizes, cv, n_jobs,
                            dataset_name, X_train, y_train):
    viz = LearningCurve(model,
                        cv=cv,
                        scoring=scoring,
                        train_sizes=sizes,
                        n_jobs=n_jobs)
    viz.fit(X_train, y_train)
    viz.show("results/{}_learning_curve_{}.png".format(clf_name, dataset_name))
    plt.clf()
Example #4
def run_crossvalidation(model,
                        x_train,
                        y_train,
                        cv=5,
                        scoring="accuracy",
                        learning_curve=False):
    """
    Runs cross validation on a given model.
    
    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training data

    y_train : nd-array
        Training target data

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    learning_curve : bool, optional
        If True, also plot a learning curve, by default False
    
    Returns
    -------
    list
        List of cross validation scores
    """

    # TODO: Make curves slightly bigger
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring)
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.show()

    if learning_curve:
        visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring)
        visualizer_lcurve.fit(x_train, y_train)
        visualizer_lcurve.show()

    return visualizer_scores.cv_scores_
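
# A minimal usage sketch (an addition, not from the original source); the
# snippet above assumes CVScores and LearningCurve are already imported:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.model_selection import CVScores, LearningCurve

X_cv, y_cv = make_classification(n_samples=400, random_state=0)
scores = run_crossvalidation(DecisionTreeClassifier(random_state=0),
                             X_cv, y_cv, cv=5, scoring='f1_weighted',
                             learning_curve=True)
print(scores.mean())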
Example #5
def run_crossvalidation(model,
                        x_train,
                        y_train,
                        cv=5,
                        scoring="accuracy",
                        report=None,
                        model_name=None):
    """
    Runs cross validation on a given model.
    
    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training data

    y_train : nd-array
        Training target data

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    report : optional
        If truthy, save the figure to disk, by default None

    model_name : str, optional
        Subdirectory name used when saving the figure, by default None
    """

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring, ax=axes[0])
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.finalize()

    visualizer_lcurve = LearningCurve(model,
                                      cv=cv,
                                      scoring=scoring,
                                      ax=axes[1])
    visualizer_lcurve.fit(x_train, y_train)
    visualizer_lcurve.finalize()

    visualizer_scores.show()
    visualizer_lcurve.show()

    if report or _global_config['track_experiments']:  # pragma: no cover
        fig.savefig(os.path.join(IMAGE_DIR, model_name, "cv.png"))
Example #6
def yellow_brick_learning_curve(model, x, y, cpu_count, cv_count,
                                scoring_metric):
    """

    """
    # Create the learning curve visualizer
    cv = StratifiedKFold(n_splits=cv_count)
    sizes = np.linspace(0.3, 1.0, 10)

    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model,
                               cv=cv,
                               scoring=scoring_metric,
                               train_sizes=sizes,
                               n_jobs=cpu_count)

    visualizer.fit(x, y)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
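
# A minimal usage sketch (an addition), assuming the numpy, StratifiedKFold,
# and LearningCurve imports used above are in scope:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X_bc, y_bc = load_breast_cancer(return_X_y=True)
yellow_brick_learning_curve(LogisticRegression(max_iter=5000), X_bc, y_bc,
                            cpu_count=-1, cv_count=5,
                            scoring_metric='f1_weighted')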
Example #7
def learning_curves(models, X, y):
    """
    :params models:  Modelos a serem avaliados
    :params X:       Dados de Treino variaveis independentes
    :params y:       Dados de Treino variavel dependente
    :return:         Viz da curvas de apendizagem
    """

    cv_strategy = StratifiedKFold(n_splits=3)
    for model in models:

        sizes = np.linspace(0.3, 1.0, 10)
        viz = LearningCurve(model,
                            cv=cv_strategy,
                            scoring='roc_auc',
                            train_sizes=sizes,
                            n_jobs=4)
        viz.fit(X, y)
        viz.show()
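
# A minimal usage sketch (an addition): compare two classifiers on the same
# binary task. The roc_auc scoring above assumes a binary target, and numpy,
# StratifiedKFold, and LearningCurve must already be imported.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X_cmp, y_cmp = make_classification(n_samples=600, random_state=1)
learning_curves([LogisticRegression(max_iter=1000),
                 RandomForestClassifier(random_state=1)], X_cmp, y_cmp)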
Example #8
# Do some scoring on XGB estimators
# Validation curve
viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"),
                      param_name="max_depth",
                      param_range=np.arange(1, 11),
                      cv=5,
                      scoring="r2")
viz.fit(x_train, y_train)
viz.show()

# Learning curve
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
viz_2.fit(x_train, y_train)
viz_2.show()

# Recursive feature elimination with cross validation; the .show() call
# implies this is yellowbrick.model_selection.RFECV (sklearn's RFECV has no
# show method)
model = RFECV(LassoCV(), cv=5, scoring='r2')
model.fit(x_train, y_train)
model.show()
"""
Section: 5
Time-Series Algorithms
"""
# Fitting ARIMA
# Original Series
# plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})
fig, axes = plt.subplots(3, 1, sharex=True)
plot_acf(main_data.traffic_volume, ax=axes[0])

# 1st Differencing
Example #9
#class weight balanced didn't improve it!

from sklearn.model_selection import permutation_test_score
score, permutation_scores, pvalue = permutation_test_score(
    tree1, X_train, y_train, scoring="accuracy", cv=5, n_permutations=20, n_jobs=1)
print("Classification score %s (pvalue : %s)" % (score, pvalue))

from sklearn.metrics import make_scorer, f1_score

scoring = make_scorer(f1_score, average='micro')

from yellowbrick.model_selection import LearningCurve
visualizer = LearningCurve(
    tree1, cv=10, scoring=scoring, verbose=0
)

visualizer.fit(X_train, y_train)        # Fit the data to the visualizer
visualizer.show()

"""# > **Random Forest**"""

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(oob_score=True, n_estimators=250)

clf.get_params()

clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)

clf.score(X_test, y_test)

clf.score(X_train, y_train)
Example #10
print(classification_report(y_test, y_test_pred))

# Training and testing error with new data
print(classification_report(y_train_New, y_train_pred_New))
print(classification_report(y_test_New, y_test_pred_New))

# Learning curve

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

cv = StratifiedKFold(n_splits=12)
sizes = np.linspace(0.3, 1.0, 10)
# roc_auc scoring with stratified folds requires a classifier; the original
# ExtraTreesRegressor has no predict_proba, so a classifier is used instead
model = ExtraTreesClassifier(n_estimators=250,
                             random_state=0,
                             max_depth=80,
                             max_features='sqrt')
visualizer = LearningCurve(model,
                           cv=cv,
                           scoring='roc_auc',
                           train_sizes=sizes,
                           n_jobs=5)

visualizer.fit(X_train_New, y_train_New)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure
Example #11
def upper_region_classifier():
    ur_dataset = pd.read_csv("../resources/datasets/ur_dataset.csv",
                             na_values='?',
                             dtype='category')
    print("UR classes", ur_dataset.groupby(SECOND_LEVEL_TARGET).size())

    # Separate training feature & training labels
    X = ur_dataset.drop(['class'], axis=1)
    y = ur_dataset['class']

    # Spot check
    # spot_check_algorithms(X, y)

    # pipeline = Pipeline([
    #     ('bow', CountVectorizer()),
    #     ('classifier', BernoulliNB()),
    # ])

    # Create a cross-validation strategy. Repeated stratified k-fold ensures
    # every class in each split is represented with the same proportion.
    cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)

    # https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/
    # create feature union
    features = []
    # features.append(('pca', MCA(n_components=3)))
    features.append(('select_best', SelectKBest(k=15)))
    feature_union = FeatureUnion(features)
    # create pipeline; note the step named 'logistic' actually wraps a random
    # forest, and RandomOverSampler only works inside an imblearn Pipeline.
    # This pipeline is superseded by imba_pipeline below.
    estimators = []
    estimators.append(('feature_union', feature_union))
    estimators.append(('ROS', RandomOverSampler(random_state=42)))
    estimators.append(('logistic', RandomForestClassifier(random_state=13)))
    model = Pipeline(estimators)

    # make_pipeline must come from imblearn.pipeline so that RandomOverSampler
    # is applied only to the training folds during cross validation
    imba_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                                  SelectKBest(k=15),
                                  RandomForestClassifier(random_state=13))
    scores = cross_val_score(imba_pipeline,
                             X,
                             y,
                             scoring='f1_micro',
                             cv=cv,
                             n_jobs=-1)
    print("After oversampling mean", scores.mean())

    ############################################# Hyper-parameter Tuning ###########################################
    params = {
        'n_estimators': [5, 10, 20, 30],
        'max_depth': [4, 6, 10, 12],
        'random_state': [13]
    }

    new_params = {
        'randomforestclassifier__' + key: params[key]
        for key in params
    }
    grid_imba = GridSearchCV(imba_pipeline,
                             param_grid=new_params,
                             cv=cv,
                             scoring='f1_micro',
                             return_train_score=True)
    grid_imba.fit(X, y)
    print(grid_imba.best_params_)
    print(grid_imba.best_score_)
    # refer: https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn

    model = grid_imba.best_estimator_

    sizes = np.linspace(0.3, 1.0, 10)
    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model,
                               cv=cv,
                               scoring='f1_micro',
                               train_sizes=sizes,
                               n_jobs=4)
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure