Example #1
def show_learning_curve(
    est: BaseEstimator,
    conf_mat_labels: List,
    X_train: DataFrame,
    y_train: Series,
    X_test: DataFrame,
    y_test: Series,
    scoring_metric: str = "f1_micro",
    cv: StratifiedKFold = StratifiedKFold(n_splits=12),
    sizes: np.ndarray = np.linspace(0.3, 1.0, 10),
    fig_size: Tuple[int, int] = (8, 8),
    savefig: Path = Path.cwd() / "reports" / "figures" / "learning_curve.png",
) -> None:
    """Plot the learning curve and save it if the target file does not exist.

    Note: conf_mat_labels, X_test and y_test are accepted for interface
    compatibility but are not used here; LearningCurve cross-validates on
    the training data only.
    """
    fig, ax = plt.subplots(figsize=fig_size)
    viz = LearningCurve(est,
                        ax=ax,
                        cv=cv,
                        scoring=scoring_metric,
                        train_sizes=sizes,
                        n_jobs=-1)
    viz.fit(X_train, y_train)
    viz.finalize()
    if not savefig.is_file():
        savefig.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(savefig, bbox_inches="tight", dpi=300)
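A minimal driver for the function above, on synthetic data (a sketch: the toy dataset, model, and output path are illustrative additions, and the snippet's own imports — typing, pathlib, pandas, numpy, matplotlib, scikit-learn, yellowbrick — are assumed to be in scope):

import pandas as pd
from pathlib import Path
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=600, n_classes=3, n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(X), pd.Series(y), stratify=y, random_state=0)

show_learning_curve(
    RandomForestClassifier(n_estimators=50, random_state=0),
    conf_mat_labels=[0, 1, 2],
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    savefig=Path("learning_curve.png"),  # write locally instead of reports/figures
)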
Example #2
def evaluation(estimator, X, Y, x, y):
    # X, Y: training split; x, y: held-out test split
    classes = [Y[1], Y[0]]  # class labels, in display order
    f, (ax, ax1, ax2) = plt.subplots(1, 3, figsize=(18, 6))

    # Confusion Matrix
    cmm = ConfusionMatrix(model=estimator,
                          ax=ax1,
                          classes=classes,
                          label_encoder={
                              0.0: 'Negative',
                              1.0: 'Positive'
                          })
    cmm.fit(X, Y)  # ConfusionMatrix must be fitted before scoring
    cmm.score(x, y)

    # ROCAUC
    viz = ROCAUC(model=estimator, ax=ax2)
    viz.fit(X, Y)
    viz.score(x, y)

    # Learning Curve
    cv_strategy = StratifiedKFold(n_splits=3)
    sizes = np.linspace(0.3, 1.0, 10)
    visualizer = LearningCurve(estimator,
                               ax=ax,
                               cv=cv_strategy,
                               scoring='roc_auc',
                               train_sizes=sizes,
                               n_jobs=4)
    visualizer.fit(X, Y)

    # finalize all three visualizers on the shared figure, then render once
    cmm.finalize()
    viz.finalize()
    visualizer.finalize()
    plt.show()
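For reference, a snippet like the one above assumes imports along these lines (a sketch; the source project's exact layout may differ):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.model_selection import LearningCurve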
Example #3
def learning_curve_clusterer(path="images/learning_curve_clusterer.png"):

    X, y = make_blobs(n_samples=1000, centers=5)

    _, ax = plt.subplots()
    sizes = np.linspace(0.3, 1.0, 10)

    oz = LearningCurve(
        KMeans(), ax=ax, train_sizes=sizes, scoring="adjusted_rand_score"
    )
    oz.fit(X, y)
    oz.show(outpath=path)  # show() is the current name for the deprecated poof()
Example #4
def plot_learning_curve(X, y, model, figPath=None):
    """Plots a learning curve for a model (with optional file saving).

    Args:
        X (numpy array): Features for the evaluation.
        y (numpy array): Targets for the evaluation.
        model (sklearn Model): The model to visualize.
        figPath (str): Where to save the figure. figPath=None does not save the figure.

    """

    # create a stratified cross validation to ensure good sample
    # representation.
    cv = StratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

    visualizer = LearningCurve(model,
                               cv=cv,
                               scoring='f1_weighted',
                               train_sizes=np.linspace(0.3, 1.0, 10),
                               n_jobs=-1)
    visualizer.fit(X, y)

    if figPath is None:
        visualizer.show()
    else:
        visualizer.show(outpath=f'{figPath}/learning.png')
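NUM_FOLDS and SEED are module-level constants in the source project; a sketch of plausible values and a call (the values, dataset, and model here are illustrative, not the project's):

NUM_FOLDS = 5  # illustrative value
SEED = 42      # illustrative value

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
plot_learning_curve(X, y, LogisticRegression(max_iter=1000))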
Example #5
def learning_curve_sklearn_example(path="images/learning_curve_sklearn_example.png"):
    digits = load_digits()
    X, y = digits.data, digits.target

    _, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(9,4))

    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    oz = LearningCurve(GaussianNB(), ax=ax[0], cv=cv, n_jobs=4)
    oz.fit(X, y)
    oz.finalize()

    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    oz = LearningCurve(SVC(gamma=0.001), ax=ax[1], cv=cv, n_jobs=4)
    oz.fit(X, y)
    oz.show(outpath=path)
Example #6
def learning_curve_regressor(path="images/learning_curve_regressor.png"):

    data = pd.read_csv(os.path.join(FIXTURES, "energy", "energy.csv"))

    targets = ["heating load", "cooling load"]
    features = [col for col in data.columns if col not in targets]

    X = data[features]
    y = data[targets[0]]

    _, ax = plt.subplots()
    sizes = np.linspace(0.3, 1.0, 10)

    oz = LearningCurve(RidgeCV(), ax=ax, train_sizes=sizes, scoring='r2')
    oz.fit(X, y)
    oz.show(outpath=path)
Example #7
def learning_curve_classifier(path="images/learning_curve_classifier.png"):

    data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv"))

    target = "outcome"
    features = [col for col in data.columns if col != target]

    X = pd.get_dummies(data[features])
    y = data[target]

    _, ax = plt.subplots()
    cv = StratifiedKFold(12)
    sizes = np.linspace(0.3, 1.0, 10)

    oz = LearningCurve(
        MultinomialNB(), ax=ax, cv=cv, n_jobs=4,
        train_sizes=sizes, scoring='f1_weighted'
    )

    oz.fit(X, y)
    oz.show(outpath=path)
Example #8
def draw_learning_curve(self, cv, scoring='accuracy', n_jobs=5):
    visualizer = LearningCurve(self.model,
                               cv=cv,
                               scoring=scoring,
                               n_jobs=n_jobs)
    visualizer.fit(self.training_data, self.training_labels)
    visualizer.show()
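draw_learning_curve is excerpted from a class; a minimal enclosing class carrying the attributes it reads might look like this (a sketch, not the source project's actual class):

class ModelEvaluator:
    # Hypothetical container exposing the attributes the method uses.
    def __init__(self, model, training_data, training_labels):
        self.model = model
        self.training_data = training_data
        self.training_labels = training_labels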
Example #9
def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    ## learning curve
    plt.clf()
    viz_LC = LearningCurve(
        rfe, scoring='f1_weighted', n_jobs=4
    )
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    ## classification report
    plt.clf()
    viz_CR = ClassificationReport(rfe, classes=class_names, support=True)
    viz_CR.fit(X, y)
    viz_CR.score(X_test, y_test)
    viz_CR.show(outpath=outdir + '/CR.png')

    ## confusion matrix
    plt.clf()
    viz_CM = ConfusionMatrix(rfe, classes=class_names)
    viz_CM.fit(X, y)
    viz_CM.score(X_test, y_test)
    viz_CM.show(outpath=outdir + '/CM.png')

    ## precision recall curve
    plt.clf()
    viz_PRC = PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                                   fill_area=False, micro=False,
                                   classes=class_names, size=(1080, 720))
    viz_PRC.fit(X, y)
    viz_PRC.score(X_test, y_test)
    viz_PRC.show(outpath=outdir + '/PRC.png')  # size is set on the visualizer, not on show()

    ## class prediction error
    plt.clf()
    viz_CPE = ClassPredictionError(
        rfe, classes=class_names
    )
    viz_CPE.fit(X, y)
    viz_CPE.score(X_test, y_test)
    viz_CPE.show(outpath=outdir + '/CPE.png')

    ## ROCAUC
    plt.clf()
    viz_RA = ROCAUC(rfe, classes=class_names, size=(1080,720))
    viz_RA.fit(X, y)
    viz_RA.score(X_test, y_test)  # evaluate on the held-out split, as above
    viz_RA.show(outpath=outdir + '/RA.png')

    fit = rfe.fit(X, y)
    y_predict = fit.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')

    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df = pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)

    return f1
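A sketch of how eva_model might be driven (the dataset, parameter values, and output directory are placeholders, and the module is assumed to import svm, RFE, f1_score, pandas, and the yellowbrick visualizers used above):

import os
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

data = load_wine()
X = pd.DataFrame(data.data, columns=data.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X, data.target, random_state=0)

outdir = "results"
os.makedirs(outdir, exist_ok=True)
f1 = eva_model(c=1.0, n=8, X=X_train, y=y_train, X_test=X_test, y_test=y_test,
               class_names=list(data.target_names), outdir=outdir)
print("weighted F1:", round(f1, 3))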
Example #10
def generate_learning_curve(model, clf_name, scoring, sizes, cv, n_jobs,
                            dataset_name, X_train, y_train):
    viz = LearningCurve(model,
                        cv=cv,
                        scoring=scoring,
                        train_sizes=sizes,
                        n_jobs=n_jobs)
    viz.fit(X_train, y_train)
    viz.show("results/{}_learning_curve_{}.png".format(clf_name, dataset_name))
    plt.clf()
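show(outpath=...) does not create directories, so results/ must exist before the call; e.g. (a sketch, with model, X_train and y_train coming from your own pipeline):

import os
import numpy as np

os.makedirs("results", exist_ok=True)
generate_learning_curve(model, "rf", "f1_weighted", np.linspace(0.3, 1.0, 10),
                        cv=5, n_jobs=-1, dataset_name="mydata",
                        X_train=X_train, y_train=y_train)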
Example #11
File: util.py Project: nperera0/aethos
def run_crossvalidation(model,
                        x_train,
                        y_train,
                        cv=5,
                        scoring="accuracy",
                        report=None,
                        model_name=None):
    """
    Runs cross validation on a certain model.
    
    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training data

    y_train : nd-array
        Training labels

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    report : bool, optional
        If truthy, save the rendered figure under the report image directory

    model_name : str, optional
        Model name, used to build the saved figure's path
    """

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring, ax=axes[0])
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.finalize()

    visualizer_lcurve = LearningCurve(model,
                                      cv=cv,
                                      scoring=scoring,
                                      ax=axes[1])
    visualizer_lcurve.fit(x_train, y_train)
    visualizer_lcurve.finalize()

    visualizer_scores.show()
    visualizer_lcurve.show()

    if report or _global_config['track_experiments']:  # pragma: no cover
        fig.savefig(os.path.join(IMAGE_DIR, model_name, "cv.png"))
Example #12
def visualizeLearningCurve(classifier, features, labels, scoring='precision'):

    sizes = numpy.linspace(0.1, 1.0, 10)
    cv = StratifiedKFold(10)
    visualizer = LearningCurve(classifier,
                               cv=cv,
                               train_sizes=sizes,
                               scoring=scoring,
                               n_jobs=10)

    visualizer.fit(features.drop(["appid", "name"], axis=1),
                   list(map(convertLabelToNumber, labels)))
    visualizer.show()
Example #15
def run_crossvalidation(model,
                        x_train,
                        y_train,
                        cv=5,
                        scoring="accuracy",
                        learning_curve=False):
    """
    Runs cross validation on a certain model.
    
    Parameters
    ----------
    model : Model
        Model to cross validate

    x_train : nd-array
        Training data

    y_train : nd-array
        Training labels

    cv : int, Crossvalidation Generator, optional
        Cross validation method, by default 5

    scoring : str, optional
        Scoring method, by default 'accuracy'

    learning_curve : bool, optional
        If true plot learning curve, by default False
    
    Returns
    -------
    numpy.ndarray
        Array of cross-validation scores, one per fold
    """

    # TODO: Make curves slightly bigger
    visualizer_scores = CVScores(model, cv=cv, scoring=scoring)
    visualizer_scores.fit(x_train, y_train)
    visualizer_scores.show()

    if learning_curve:
        visualizer_lcurve = LearningCurve(model, cv=cv, scoring=scoring)
        visualizer_lcurve.fit(x_train, y_train)
        visualizer_lcurve.show()

    return visualizer_scores.cv_scores_
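cv accepts either an integer fold count or a scikit-learn splitter object, so a stratified strategy can be passed straight through to both visualizers (a sketch; model and the training split come from the caller):

from sklearn.model_selection import StratifiedKFold

scores = run_crossvalidation(model, x_train, y_train,
                             cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
                             scoring="f1_weighted",
                             learning_curve=True)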
Example #16
def yellow_brick_learning_curve(model, x, y, cpu_count, cv_count,
                                scoring_metric):
    """

    """
    # Create the learning curve visualizer
    cv = StratifiedKFold(n_splits=cv_count)
    sizes = np.linspace(0.3, 1.0, 10)

    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model,
                               cv=cv,
                               scoring=scoring_metric,
                               train_sizes=sizes,
                               n_jobs=cpu_count)

    visualizer.fit(x, y)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
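A quick way to exercise the function above (a sketch; the dataset and model are illustrative):

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
yellow_brick_learning_curve(LogisticRegression(max_iter=5000), X, y,
                            cpu_count=2, cv_count=5, scoring_metric="f1")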
Example #17
def hyperparameter_tuning(fname="hyperparameter_tuning.png"):
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))

    # Load the concrete dataset
    data = load_concrete(split=False)

    # Create a list of alphas to cross-validate against
    alphas = np.logspace(-10, 1, 400)

    # Add AlphaSelection to the left
    oz = AlphaSelection(LassoCV(alphas=alphas), ax=axes[0])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Add LearningCurve to the right
    oz = LearningCurve(RandomForestRegressor(), scoring='r2', ax=axes[1])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
Example #18
def learning_curves(models, X, y):
    """
    :params models:  Modelos a serem avaliados
    :params X:       Dados de Treino variaveis independentes
    :params y:       Dados de Treino variavel dependente
    :return:         Viz da curvas de apendizagem
    """

    cv_strategy = StratifiedKFold(n_splits=3)
    for model in models:

        sizes = np.linspace(0.3, 1.0, 10)
        viz = LearningCurve(model,
                            cv=cv_strategy,
                            scoring='roc_auc',
                            train_sizes=sizes,
                            n_jobs=4)
        viz.fit(X, y)
        viz.show()
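For example (a sketch: any list of classifiers works, with X and y being a training split whose target is binary, since scoring='roc_auc' is used):

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

learning_curves([LogisticRegression(max_iter=1000), DecisionTreeClassifier()], X, y)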
Example #19
def learning_curve(model, X, y):
    # from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import RepeatedStratifiedKFold

    from yellowbrick.model_selection import LearningCurve

    # Create the learning curve visualizer
    # cv = StratifiedKFold(12)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)
    sizes = np.linspace(0.3, 1.0, 10)

    viz = LearningCurve(model,
                        cv=cv,
                        train_sizes=sizes,
                        scoring='neg_log_loss',
                        n_jobs=4)

    # Fit and show the visualizer
    viz.fit(X, y)
    viz.show()
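Because scoring='neg_log_loss' needs probability estimates, the model passed in must implement predict_proba; e.g. (a sketch, assuming numpy is imported as np at module level as elsewhere on this page):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, random_state=1)
learning_curve(LogisticRegression(max_iter=1000), X, y)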
Example #21
        residuals_plot(model, x_test, preds, hist=False, ax=ax[index])
        prediction_error(model, x_test, preds, ax=ax)

# Do some scoring on XGB estimators
# Validation curve
viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"),
                      param_name="max_depth",
                      param_range=np.arange(1, 11),
                      cv=5,
                      scoring="r2")
viz.fit(x_train, y_train)
viz.show()

# Learning curve
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
viz_2.fit(x_train, y_train)
viz_2.show()

model = RFECV(LassoCV(), cv=5, scoring='r2')
model.fit(x_train, y_train)
model.show()
"""
Section: 5
Time-Series Algorithms
"""
# Fitting ARIMA
# Original Series
# plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})
fig, axes = plt.subplots(3, 1, sharex=True)
plot_acf(main_data.traffic_volume, ax=axes[0])
Example #22
def learning():
    X, y = load_energy()
    sizes = np.linspace(0.3, 1.0, 10)
    oz = LearningCurve(RidgeCV(), train_sizes=sizes, scoring="r2", ax=newfig())
    oz.fit(X, y)
    savefig(oz, "learning_curve")
Example #23
from yellowbrick.classifier.rocauc import roc_auc
roc_auc(tree1, X_train, y_train, X_test=X_test, y_test=y_test, classes=["Low_damage","Medium_damage","High_damage"])

#class weight balanced didn't improve it!

from sklearn.model_selection import permutation_test_score
score, permutation_scores, pvalue = permutation_test_score(
    tree1, X_train, y_train, scoring="accuracy", cv=5, n_permutations=20, n_jobs=1)
print("Classification score %s (pvalue : %s)" % (score, pvalue))

scoring = make_scorer(f1_score, average='micro')

from yellowbrick.model_selection import LearningCurve
visualizer = LearningCurve(
    tree1, cv=10, scoring=scoring, verbose=0
)

visualizer.fit(X_train, y_train)        # Fit the data to the visualizer
visualizer.show()

"""# > **Random Forest**"""

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(oob_score = True,n_estimators=250)

clf.get_params()  # note the parentheses; bare clf.get_params only echoes the method

clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
Example #24
# ### Task 10: Learning Curves

# In[38]:


from yellowbrick.model_selection import LearningCurve
from sklearn.linear_model import LassoCV
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

# Create the learning curve visualizer
sizes = np.linspace(0.3, 1.0, 10)

# Fit and show the visualizer
viz = LearningCurve(LassoCV(), train_sizes=sizes, scoring='r2')
viz.fit(X, y)

viz.show()



# ### Task 11:  Hyperparameter Tuning

# The `AlphaSelection` Visualizer demonstrates how different values of alpha influence model selection during the regularization of linear models.

# In[16]:


from sklearn.linear_model import LassoCV
Example #25
print(classification_report(y_test, y_test_pred))

# In[42]:

#Training and Testing error with new data
print(classification_report(y_train_New, y_train_pred_New))
print(classification_report(y_test_New, y_test_pred_New))

# In[44]:

#Learning Curve

from sklearn.model_selection import cross_validate
from yellowbrick.model_selection import LearningCurve
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=12)
sizes = np.linspace(0.3, 1.0, 10)
# roc_auc and StratifiedKFold require a classifier, so swap the original
# ExtraTreesRegressor for its classifier counterpart.
model = ExtraTreesClassifier(n_estimators=250,
                             random_state=0,
                             max_depth=80,
                             max_features='auto')
visualizer = LearningCurve(model,
                           cv=cv,
                           scoring='roc_auc',
                           train_sizes=sizes,
                           n_jobs=5)

visualizer.fit(X_train_New, y_train_New)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure
Example #26
result2 = cross_val_score(arbre, x_train, y_train, cv=kplis_strat)
print('stratified k-fold cross-validation scores: {}'.format(result2))
print('mean cross-validation score: {:.2f}'.format(result2.mean()))

result3 = cross_val_score(arbre, x_train, y_train, cv=shuffle)
print('shuffle-split cross-validation scores: {}'.format(result3))
print('mean cross-validation score: {:.2f}'.format(result3.mean()))

cm = ConfusionMatrix(arbre, classes=[0, 1, 2, 3, 4, 5, 6], percent=True)
cm.fit(x_train, y_train)
cm.score(x_test, y_test)
cm.show()

size = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
# keyword is `scoring` (not `score`), and r2 is not a classification metric
lc = LearningCurve(DecisionTreeClassifier(), train_sizes=size, scoring='accuracy')
lc.fit(x_train, y_train)
lc.show()
''' ---------------------- Random forest ------------------------'''

foret = RandomForestClassifier(n_estimators=120,
                               max_features='sqrt',
                               n_jobs=-1,
                               random_state=0)
foret.fit(x_train, y_train)

result = cross_val_score(foret, x_train, y_train, cv=5)
print('cross-validation scores: {}'.format(result))
print('mean cross-validation score: {:.2f}'.format(result.mean()))

result1 = cross_val_score(foret, x_train, y_train, cv=kplis)
Example #28
    visualizer = ClassificationReport(model)
    visualizer.fit(X_train, y_train)  # fit is required before score
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Confusion matrix
    visualizer = ConfusionMatrix(model)
    visualizer.fit(X_train, y_train)  # fit is required before score
    visualizer.score(X_test, y_test)
    visualizer.show()

    # Discrimination threshold
    visualizer = DiscriminationThreshold(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Learning curve
    visualizer = LearningCurve(model, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Cross-validation scores
    visualizer = CVScores(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Feature importances
    visualizer = FeatureImportances(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Recursive feature elimination
    visualizer = RFECV(model, cv=5, scoring='f1_weighted')
Example #29
case_name = "mg_sizing_dataset_with_loc"
df = pd.read_csv("results/" + case_name + ".csv", sep=";|,", engine="python", index_col='index')
#df = df.loc[df['off-grid'] == 1]
X = df[features]
scaler.fit(X)
X = scaler.transform(X)
# X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
targets = ["PV","BAT","RBAT","INV","GEN","NPV"]
y = df[targets]
param_range = np.arange(1, 30, 1)
# KFold rather than StratifiedKFold: the targets here are continuous
cv = KFold(n_splits=12, random_state=40, shuffle=True)

viz = ValidationCurve(
    KNeighborsRegressor(), param_name="n_neighbors", param_range=param_range, scoring="r2", cv=cv, n_jobs=8
)

viz.fit(X, y)
viz.show()

visualizer = LearningCurve(KNeighborsRegressor(), scoring='r2', random_state=2, cv=cv, shuffle=True)

visualizer.fit(X, y)
visualizer.show()

vis = CVScores(KNeighborsRegressor(), cv=cv, scoring='r2')

vis.fit(X, y)        # Fit the data to the visualizer
vis.show()
Example #30
fig, ax = plt.subplots(figsize=(16, 9))
# NOTE: this snippet was truncated at the top; the ValidationCurve call is
# reconstructed from the parallel LearningCurve block below.
val_curve = ValidationCurve(
    KNeighborsRegressor(),
    param_name='n_neighbors',
    param_range=n_neighbors,
    cv=5,
    scoring=rmse_score,
    #                       n_jobs=-1,
    ax=ax)
val_curve.fit(X, y)
val_curve.show()
fig.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(16, 9))
l_curve = LearningCurve(
    KNeighborsRegressor(n_neighbors=best_k),
    train_sizes=np.arange(.1, 1.01, .1),
    scoring=rmse_score,
    cv=5,
    #                         n_jobs=-1,
    ax=ax)
l_curve.fit(X, y)
l_curve.show()
fig.tight_layout()
plt.show()
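rmse_score is defined elsewhere in the source notebook; one plausible definition (an assumption, not the notebook's actual code) wraps RMSE in make_scorer:

import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

# Hypothetical reconstruction of the notebook's rmse_score scorer.
rmse_score = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False,
)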

# Binary Classification
y_binary = (y > y.median()).astype(int)
n_neighbors = tuple(range(5, 151, 10))
n_folds = 5
scoring = 'roc_auc'

pipe = Pipeline([('scaler', StandardScaler()),
Example #31
viz.fit(x_data, y_data)
viz.show()


# %%
best_params = dt.hyperParameterTuning(x_train,y_train)
print(best_params)
dt_tuned = DecisionTreeClassifier(max_depth=best_params['max_depth'], min_samples_leaf=best_params['min_samples_leaf'], random_state=rs)


# %%
sizes = np.linspace(0.3, 1.0, 10)

# Instantiate the classification model and visualizer
visualizer = LearningCurve(
    dt_tuned, scoring='f1_weighted', train_sizes=sizes, n_jobs=4
)

visualizer.fit(x_data, y_data)  # Fit the data to the visualizer
visualizer.show()  

# %% [markdown]
# # 3. Support Vector Machine

# %%
class SupportVectorMachine:
    def trainTest(self, x_train, x_test, y_train, y_test):
        scaler = StandardScaler()
        scaled_x_train = scaler.fit_transform(x_train)
        scaled_x_test = scaler.transform(x_test)  # transform only: reuse the fit from the training data
        cs = [x/10000 for x in [1, 10, 100, 1000, 10000, 100000, 1000000]]
Example #32
def upper_region_classifier():
    ur_dataset = pd.read_csv("../resources/datasets/ur_dataset.csv",
                             na_values='?',
                             dtype='category')
    print("UR classes", ur_dataset.groupby(SECOND_LEVEL_TARGET).size())

    # Separate training feature & training labels
    X = ur_dataset.drop(['class'], axis=1)
    y = ur_dataset['class']

    # Spot check
    # spot_check_algorithms(X, y)

    # pipeline = Pipeline([
    #     ('bow', CountVectorizer()),
    #     ('classifier', BernoulliNB()),
    # ])

    # Create a cross-validation strategy: repeated stratified k-fold keeps
    # every class represented in each split in the same proportion.
    cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)

    # https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/
    # create feature union
    features = []
    # features.append(('pca', MCA(n_components=3)))
    features.append(('select_best', SelectKBest(k=15)))
    feature_union = FeatureUnion(features)
    # create pipeline
    estimators = []
    estimators.append(('feature_union', feature_union))
    estimators.append(('ROS', RandomOverSampler(random_state=42)))
    estimators.append(('logistic', RandomForestClassifier(random_state=13)))
    model = Pipeline(estimators)

    imba_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                                  SelectKBest(k=15),
                                  RandomForestClassifier(random_state=13))
    scores = cross_val_score(imba_pipeline,
                             X,
                             y,
                             scoring='f1_micro',
                             cv=cv,
                             n_jobs=-1)
    print("After oversampling mean", scores.mean())

    ############################################# Hyper-parameter Tuning ###########################################
    params = {
        'n_estimators': [5, 10, 20, 30],
        'max_depth': [4, 6, 10, 12],
        'random_state': [13]
    }

    new_params = {
        'randomforestclassifier__' + key: params[key]
        for key in params
    }
    grid_imba = GridSearchCV(imba_pipeline,
                             param_grid=new_params,
                             cv=cv,
                             scoring='f1_micro',
                             return_train_score=True)
    grid_imba.fit(X, y)
    print(grid_imba.best_params_)
    print(grid_imba.best_score_)
    #refer - https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn

    model = grid_imba.best_estimator_

    sizes = np.linspace(0.3, 1.0, 10)
    # Instantiate the classification model and visualizer
    visualizer = LearningCurve(model,
                               cv=cv,
                               scoring='f1_micro',
                               train_sizes=sizes,
                               n_jobs=4)
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure