예제 #1
0
def test_validation_curve_cv_splits_consistency():
    """Check that ``validation_curve`` uses the same CV folds for every value.

    The parameter grid repeats each ``C`` value twice, so the scores of the
    1st/2nd and of the 3rd/4th settings can only differ if the train/test
    folds differ between parameter settings.
    """
    n_samples = 100
    n_splits = 5
    X, y = make_classification(n_samples=n_samples, random_state=0)
    param_range = [0.1, 0.1, 0.2, 0.2]

    def assert_duplicated_settings_agree(scores):
        # Stack train and test scores side by side (one row per parameter
        # setting), reorder the rows to (0, 2, 1, 3) and split them in two
        # halves, so that row 0 is compared with row 1 and row 2 with row 3.
        stacked = np.hstack(scores)[[0, 2, 1, 3], :]
        assert_array_almost_equal(*np.vsplit(stacked, 2))

    # The OneTimeSplitter is a non-re-entrant cv splitter. Unless `split` is
    # called once per parameter setting, settings sharing the same C value
    # must produce identical results.
    scores1 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C', param_range=param_range,
                               cv=OneTimeSplitter(n_splits=n_splits,
                                                  n_samples=n_samples))
    assert_duplicated_settings_agree(scores1)

    # With a shuffling KFold the duplicated settings stay consistent only if
    # the train/test folds are shared between the parameter settings.
    scores2 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C', param_range=param_range,
                               cv=KFold(n_splits=n_splits, shuffle=True))
    assert_duplicated_settings_agree(scores2)

    scores3 = validation_curve(SVC(kernel='linear', random_state=0), X, y,
                               param_name='C', param_range=param_range,
                               cv=KFold(n_splits=n_splits))

    # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check.
    assert_array_almost_equal(np.array(scores3), np.array(scores1))
예제 #2
0
def validation_crv(estimator, X, y, title, n_jobs=1):
    """Plot a validation curve for ``estimator`` and return ``plt``.

    Training and cross-validation accuracy are computed with
    ``validation_curve`` (``cv=10``) over five log-spaced values between
    1e-6 and 1e-1 of the ``max_features`` parameter. The mean scores are
    drawn on a log-scaled x axis with a band of one standard deviation
    around each curve.

    NOTE(review): the x axis is labelled gamma although the parameter that is
    varied is ``max_features`` -- confirm which one is intended.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``validation_curve``.
    X, y : array-like
        Data and targets handed to ``validation_curve``.
    title : str
        Title of the plot.
    n_jobs : int, default 1
        Forwarded to ``validation_curve``.

    Returns
    -------
    module
        The ``plt`` module, so the caller can show or save the figure.
    """
    param_range = np.logspace(-6, -1, 5)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name="max_features", param_range=param_range,
        cv=10, scoring="accuracy", n_jobs=n_jobs)
    # One mean / standard deviation per parameter value (axis 1 = CV folds).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title(title)
    # Raw string: "\g" is not a valid escape sequence in a normal literal.
    plt.xlabel(r"$\gamma$")
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    return plt
예제 #3
0
 def plot_validation_curve(self, X_train, X_test, y_train, y_test, pipeline, param_name, param_range, title, filename, cv=4, param_range_plot=None):
     """Compute a validation curve on the training data and plot it.

     ``validation_curve`` is run for ``pipeline`` on ``X_train`` / ``y_train``
     and the per-parameter mean and standard deviation of the training and
     validation scores are handed to ``plot_series``.

     Parameters
     ----------
     X_train, y_train : array-like
         Data used for the cross-validated curve.
     X_test, y_test : array-like
         Accepted for interface compatibility; not used by this method.
     pipeline : estimator
         Estimator passed to ``validation_curve``.
     param_name : str
         Parameter to vary; also used as the x-axis label.
     param_range : sequence
         Values of the parameter to evaluate.
     title : str
         Plot title.
     filename : str
         Passed through to ``plot_series``.
     cv : int or CV splitter, default 4
         Cross-validation strategy.
     param_range_plot : sequence, optional
         Values to show on the x axis instead of ``param_range``.
     """
     train_scores, test_scores = validation_curve(estimator=pipeline,
                                                  X=X_train,
                                                  y=y_train,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  cv=cv)

     train_mean = np.mean(train_scores, axis=1)
     train_std = np.std(train_scores, axis=1)
     test_mean = np.mean(test_scores, axis=1)
     test_std = np.std(test_scores, axis=1)

     # Identity test: `!= None` on a numpy array compares element-wise and
     # makes the `if` raise "truth value of an array is ambiguous".
     if param_range_plot is not None:
         param_range = param_range_plot

     plot_series(param_range,
                 [train_mean, test_mean],
                 [train_std, test_std],
                 ['training accuracy', 'validation accuracy'],
                 ['blue', 'green'],
                 ['o', 's'],
                 title,
                 param_name,
                 'Accuracy',
                 filename)
예제 #4
0
def test_validation_curve_clone_estimator():
    """Run ``validation_curve`` with an estimator that allows one ``fit`` call.

    The test has no explicit assertion: it passes as long as the call below
    completes for ten parameter values and two folds without raising.
    """
    dataset_options = dict(n_samples=2, n_features=1, n_informative=1,
                           n_redundant=0, n_classes=2,
                           n_clusters_per_class=1, random_state=0)
    X, y = make_classification(**dataset_options)

    descending_values = np.linspace(1, 0, 10)
    estimator = MockEstimatorWithSingleFitCallAllowed()
    # Unpacking keeps the check that two score arrays come back.
    _train_scores, _test_scores = validation_curve(
        estimator, X, y,
        param_name="param", param_range=descending_values, cv=2,
    )
예제 #5
0
def test_validation_curve():
    """Check the scores ``validation_curve`` reports for the mock estimator.

    The call must not emit any warning. The mean training score per parameter
    value must equal the parameter value and the mean test score must equal
    one minus it.
    """
    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    grid = np.linspace(0, 1, 10)

    with warnings.catch_warnings(record=True) as caught:
        train_scores, test_scores = validation_curve(
            MockEstimatorWithParameter(), X, y,
            param_name="param", param_range=grid, cv=2,
        )
    # Any recorded warning is treated as a failure.
    if caught:
        raise RuntimeError("Unexpected warning: %r" % caught[0].message)

    mean_train = train_scores.mean(axis=1)
    mean_test = test_scores.mean(axis=1)
    assert_array_almost_equal(mean_train, grid)
    assert_array_almost_equal(mean_test, 1 - grid)
예제 #6
0
    def test_validation_curve(self):
        """Compare the ModelFrame ``validation_curve`` with scikit-learn's.

        Both are run on the digits dataset for two ``gamma`` values of an SVC
        and must return two score arrays that match element-wise.
        """
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        param_range = np.logspace(-2, -1, 2)

        svc = df.svm.SVC(random_state=self.random_state)
        result = df.model_selection.validation_curve(svc, 'gamma',
                                                     param_range)
        # param_name / param_range are passed by keyword: recent scikit-learn
        # releases no longer accept them positionally.
        expected = ms.validation_curve(svm.SVC(random_state=self.random_state),
                                       digits.data, digits.target,
                                       param_name='gamma',
                                       param_range=param_range)

        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assert_numpy_array_almost_equal(result[1], expected[1])
예제 #7
0
def validation_curve(estimator, epochs, y, param_name, param_range, cv=None):
    """Compute a validation curve for an estimator on epochs data.

    Parameters
    ----------
    estimator : GlobalAutoReject
        The estimator to evaluate. Any other type is rejected.
    epochs : instance of mne.Epochs
        The epochs whose data is evaluated.
    y : array
        The labels.
    param_name : str
        Name of the parameter to vary. Note that the scikit-learn call below
        is always made with ``param_name="thresh"``; this argument is not
        forwarded.
    param_range : array
        The values of the parameter that will be evaluated.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation strategy.

    Returns
    -------
    train_scores : array
        The scores in the training set.
    test_scores : array
        The scores in the test set.

    Raises
    ------
    NotImplementedError
        If ``estimator`` is not a ``GlobalAutoReject`` instance.
    ValueError
        If ``epochs`` is not an MNE epochs object.
    """
    # Imported under an alias because this function has the same name.
    from sklearn.model_selection import validation_curve as sk_validation_curve

    if not isinstance(estimator, GlobalAutoReject):
        raise NotImplementedError(
            'No guarantee that it will work on this estimator.')

    if not isinstance(epochs, _get_epochs_type()):
        raise ValueError('Only accepts MNE epochs objects.')

    picks = _handle_picks(epochs.info, picks=None)
    data = epochs.get_data()[:, picks, :]
    n_epochs, n_channels, n_times = data.shape

    # Record the per-epoch layout on the estimator before the data is
    # flattened to one row per epoch.
    estimator.n_channels = n_channels
    estimator.n_times = n_times

    flat_data = data.reshape(n_epochs, -1)
    train_scores, test_scores = sk_validation_curve(
        estimator, flat_data, y=y,
        param_name="thresh", param_range=param_range,
        cv=cv, n_jobs=1, verbose=0)

    return train_scores, test_scores
예제 #8
0
 def plot_validation_curve(self, estimator, x_train, y_train, cv, data_label, param_range, param_name, n_jobs=-1):
     """Plot a validation curve for ``estimator`` and save it as a PNG.

     The current figure is cleared, the curve is computed with
     ``validation_curve`` and the mean training / validation scores are drawn
     with a band of one standard deviation. The image is written to
     ``self.save_path + data_label + '_' + param_name + '_validationcurve.png'``.

     Parameters
     ----------
     estimator : estimator
         Estimator passed to ``validation_curve``.
     x_train, y_train : array-like
         Data used for the cross-validated curve.
     cv : int or CV splitter
         Cross-validation strategy.
     data_label : str
         Used in the plot title and in the output file name.
     param_range : sequence
         Values of the parameter to evaluate (x axis).
     param_name : str
         Parameter to vary; also the x-axis label.
     n_jobs : int, default -1
         Forwarded to ``validation_curve``.
     """
     # Start from a clean figure.
     plt.clf()

     train_scores, test_scores = validation_curve(estimator=estimator,
                                                  X=x_train,
                                                  y=y_train,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  cv=cv,
                                                  n_jobs=n_jobs)

     train_mean = np.mean(train_scores, axis=1)
     train_std = np.std(train_scores, axis=1)
     test_mean = np.mean(test_scores, axis=1)
     test_std = np.std(test_scores, axis=1)

     plt.plot(param_range, train_mean,
              color='blue', marker='o',
              markersize=5,
              label='training accuracy')

     plt.fill_between(param_range,
                      train_mean + train_std,
                      train_mean - train_std,
                      alpha=0.15, color='blue')

     plt.plot(param_range, test_mean,
              color='green', marker='s',
              markersize=5, linestyle='--',
              label='validation accuracy')

     plt.fill_between(param_range,
                      test_mean + test_std,
                      test_mean - test_std,
                      alpha=0.15, color='green')

     plt.grid()
     plt.title("Validation curve: %s" % (data_label))
     plt.xlabel(param_name)
     # Fixed spelling of the axis label (was 'Accurancy').
     plt.ylabel('Accuracy')
     plt.legend(loc='lower right')
     fn = self.save_path + data_label + '_' + param_name + '_validationcurve.png'
     plt.savefig(fn)
예제 #9
0
def plot_cv_parameters(classifier, X_train, y_train, param, param_range, cv=10):
    """Show a validation curve for one hyper-parameter of ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Estimator passed to ``validation_curve``.
    X_train, y_train : array-like
        Data used for the cross-validated curve.
    param : str
        Name of the parameter to vary; also the x-axis label.
    param_range : sequence
        Values of the parameter to evaluate (drawn on a log-scaled x axis).
    cv : int or CV splitter, default 10
        Cross-validation strategy. It is now forwarded to
        ``validation_curve``; it used to be ignored in favour of a
        hard-coded 10.
    """
    train_scores, test_scores = validation_curve(estimator=classifier,
                                                 X=X_train,
                                                 y=y_train,
                                                 param_name=param,
                                                 param_range=param_range,
                                                 cv=cv)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 5))

    plt.plot(param_range,
             train_mean,
             color='blue',
             marker='o',
             markersize=5,
             label='training accuracy')
    plt.fill_between(param_range,
                     train_mean + train_std,
                     train_mean - train_std,
                     alpha=0.15, color='blue')
    plt.plot(param_range,
             test_mean,
             color='green',
             linestyle="--",
             marker='s',
             markersize=5,
             label='validation accuracy')
    plt.fill_between(param_range,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.6, 1.1])
    plt.show()
def ModelComplexity(X, y):
    """Plot training and validation R^2 of a decision tree against its depth.

    A ``DecisionTreeRegressor`` is evaluated for ``max_depth`` 1 through 10
    on ten shuffled splits that each hold out 20% of the data. The mean
    scores are shown with a band of one standard deviation.
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    depths = np.arange(1, 11)

    train_scores, test_scores = validation_curve(
        DecisionTreeRegressor(), X, y,
        param_name="max_depth", param_range=depths,
        cv=splitter, scoring='r2')

    # (mean, std, colour, legend label) for the training and validation curve.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         'r', 'Training Score'),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
         'g', 'Validation Score'),
    ]

    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    # Lines first, then the shaded bands.
    for mean, _, colour, label in curves:
        pl.plot(depths, mean, 'o-', color=colour, label=label)
    for mean, std, colour, _ in curves:
        pl.fill_between(depths, mean - std, mean + std,
                        alpha=0.15, color=colour)

    # Legend, axis labels and limits.
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
예제 #11
0
        w = min(W,dx)
        image(temppath,imgx,imgy,width=w)
        imgy = imgy + dy + 20
        os.remove(temppath)
        size(W, HEIGHT+dy+40)
else:
    def pltshow(mplpyplot):
        mplpyplot.show()
# nodebox section end

# Load the digits dataset as a feature matrix and a target vector.
digits = load_digits()
X, y = digits.data, digits.target

# Evaluate an SVC for five log-spaced gamma values between 1e-6 and 1e-1.
param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name="gamma", param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)
# One mean / standard deviation per gamma value (axis 1 = CV folds).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
# Raw string: "\g" is not a valid escape sequence in a normal literal.
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
예제 #12
0
# In[ ]:

from sklearn import svm
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
get_ipython().magic(u'matplotlib inline')

# Search for a better 'gamma' while C is fixed at 0.6: twenty log-spaced
# candidates between 1e-2 and 1e0.
param_range = np.logspace(-2, 0, 20)
print(param_range)
train_scores, test_scores = validation_curve(svm.SVC(C=0.6),
                                             whole_data_x_scaled,
                                             whole_data_y,
                                             param_name="gamma",
                                             param_range=param_range,
                                             cv=10,
                                             scoring="accuracy",
                                             n_jobs=1)

# One mean / standard deviation per gamma value (axis 1 = CV folds).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
# Raw string: "\g" is not a valid escape sequence in a normal literal.
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.6, 1.1)
lw = 2
plt.semilogx(param_range,
예제 #13
0
def plot_validation_curve(model, partition, pname, prange):
    r"""Generate one scikit-learn validation curve plot per algorithm.

    For every algorithm in ``model.algolist`` the matching estimator is
    evaluated with ``validation_curve`` over ``prange`` and the mean training
    and cross-validation scores are plotted with a band of one standard
    deviation. Each figure is handed to ``write_plot``.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy array
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """

    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Model specifications used for the cross-validation runs.
    specs = model.specs
    cv_folds = specs['cv_folds']
    n_jobs = specs['n_jobs']
    scorer = specs['scorer']
    verbosity = specs['verbosity']  # read but not used below

    # Data of the requested partition.
    X, y = get_partition_data(model, partition)

    # Plotting constants: x-axis margin and opacity of the deviation bands.
    spacing = 0.5
    alpha = 0.2

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        train_scores, test_scores = validation_curve(
            model.estimators[algo],
            X,
            y,
            param_name=pname,
            param_range=prange,
            cv=cv_folds,
            scoring=scorer,
            n_jobs=n_jobs)
        # (mean, std, legend label, colour) per curve, training first.
        curves = [
            (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
             "Training Score", "r"),
            (np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
             "Cross-Validation Score", "g"),
        ]

        # New figure in the classic style.
        plt.style.use('classic')
        plt.figure()
        plt.title(BSEP.join([algo, "Validation Curve [", pstring, "]"]))

        # x axis: parameter range widened by the margin on both sides.
        lower = min(prange) - spacing
        upper = max(prange) + spacing
        plt.xlabel(pname)
        plt.xlim(lower, upper)

        # y axis
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)

        # Mean line followed by its deviation band, for each curve.
        for mean, std, label, colour in curves:
            plt.plot(prange, mean, label=label, color=colour)
            plt.fill_between(prange,
                             mean - std,
                             mean + std,
                             alpha=alpha,
                             color=colour)
        plt.legend(loc="best")

        # Save the plot.
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'validation_curve', tag, plot_dir)
예제 #14
0
def validation_curve_model(X,
                           Y,
                           model,
                           param_name,
                           parameters,
                           cv,
                           ylim,
                           log=True):
    """Plot an accuracy validation curve for ``model`` and return ``plt``.

    Parameters
    ----------
    X, Y : array-like
        Data and targets handed to ``validation_curve``.
    model : estimator
        Estimator to evaluate.
    param_name : str
        Parameter to vary. It is also used in the x-axis label, which reads
        ``'Parameter <param_name>'`` (previously always ``'Parameter C'``).
    parameters : sequence
        Values of the parameter to evaluate.
    cv : int or CV splitter
        Cross-validation strategy.
    ylim : sequence or None
        Limits unpacked into ``plt.ylim``; ``None`` keeps the default limits.
    log : bool, default True
        Draw the curves on a log-scaled x axis when true.

    Returns
    -------
    module
        The ``plt`` module, so the caller can show or save the figure.
    """
    train_scores, test_scores = validation_curve(model,
                                                 X,
                                                 Y,
                                                 param_name=param_name,
                                                 param_range=parameters,
                                                 cv=cv,
                                                 scoring="accuracy")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title("Validation curve")
    plt.fill_between(parameters,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(parameters,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")

    # Both branches drew the same two curves; only the plotting function
    # differs between the log-scaled and the linear x axis.
    draw_curve = plt.semilogx if log else plt.plot
    draw_curve(parameters,
               train_scores_mean,
               'o-',
               color="r",
               label="Training score")
    draw_curve(parameters,
               test_scores_mean,
               'o-',
               color="g",
               label="Cross-validation score")

    if ylim is not None:
        plt.ylim(*ylim)

    plt.ylabel('Score')
    # Label the axis with the parameter that was actually varied.
    plt.xlabel('Parameter {}'.format(param_name))
    plt.legend(loc="best")

    return plt
예제 #15
0
import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from hw1_util import create_two_spirals, get_hill_valley_data

# NOTE(review): the two-spirals data is loaded and immediately replaced by
# the hill-valley data; only the second dataset is evaluated below. Confirm
# whether the first call is still needed.
X, y = create_two_spirals()
X, y = get_hill_valley_data()

# Accuracy of a decision tree for max_depth 1..59, cv=10.
param_range = range(1, 60)
train_scores, test_scores = validation_curve(
    # 'sqrt' is what 'auto' meant for classifiers; recent scikit-learn
    # releases no longer accept 'auto'.
    DecisionTreeClassifier(max_features='sqrt'),
    X,
    y,
    param_name="max_depth",
    param_range=param_range,
    cv=10,
    scoring="accuracy",
    n_jobs=1)
# One mean / standard deviation per depth (axis 1 = CV folds).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# NOTE(review): plt is not imported in the visible part of this script.
plt.title("Validation Curve with DecisionTreeClassifier")
plt.xlabel("Max depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.plot(param_range,
예제 #16
0
    def tune_hyper_parameter(self, param_name, param_range):
        """Plot and save a model-complexity curve for one hyper-parameter.

        ``self.model`` is evaluated on ``self.X_train`` / ``self.y_train``
        with ``validation_curve`` (``cv=5``, F1 scoring, all cores). The mean
        training and cross-validation scores are drawn with a band of one
        standard deviation and the figure is saved as
        ``MC_Curve_<param_name>.png`` inside ``OUTPUT_PATH/<self.model_type>``.
        The directory is not created here.

        Parameters
        ----------
        param_name : str
            Name of the parameter to vary.
        param_range : sequence
            Values of the parameter to evaluate.
        """
        # param_name / param_range are passed by keyword: recent scikit-learn
        # releases no longer accept them positionally.
        train_scores, valid_scores = validation_curve(self.model,
                                                      self.X_train,
                                                      self.y_train,
                                                      param_name=param_name,
                                                      param_range=param_range,
                                                      cv=5,
                                                      scoring='f1',
                                                      n_jobs=-1,
                                                      verbose=51)

        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        valid_scores_mean = np.mean(valid_scores, axis=1)
        valid_scores_std = np.std(valid_scores, axis=1)

        title = "MC Curve for " + param_name + "\n" + self.model_type
        title_dic = {'fontsize': 7, 'fontweight': 'bold'}
        fig, ax1 = plt.subplots(1, 1, figsize=(3, 2))
        ax1.set_title(title, title_dic)
        ax1.set_ylabel("Mean F1 Score", title_dic)
        ax1.tick_params(axis="x", labelsize=7)
        ax1.tick_params(axis="y", labelsize=7)
        ax1.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))

        # 'max_iter' gets a friendlier axis label than its parameter name.
        x_label = "iterations" if param_name == 'max_iter' else param_name
        ax1.set_xlabel(x_label, title_dic)

        ax1.fill_between(param_range,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.2,
                         color="darkred",
                         lw=2)
        ax1.fill_between(param_range,
                         valid_scores_mean - valid_scores_std,
                         valid_scores_mean + valid_scores_std,
                         alpha=0.2,
                         color="navy",
                         lw=2)

        ax1.plot(param_range,
                 train_scores_mean,
                 "r",
                 linewidth=2,
                 label="train")
        ax1.plot(param_range,
                 valid_scores_mean,
                 "b",
                 linewidth=2,
                 label="cross val")

        ax1.legend(loc='best', fontsize=6)
        ax1.grid()
        plt.tight_layout()
        filename = os.path.join(OUTPUT_PATH, self.model_type,
                                "MC_Curve_" + param_name + ".png")
        plt.savefig(filename)
예제 #17
0
def drawvalidationCurve():
	"""Compute a validation curve for ``Ridge`` over three ``alpha`` values.

	Uses the module-level ``X`` and ``y`` and three log-spaced values of
	``alpha`` between 1e-7 and 1e3.

	Returns
	-------
	tuple
		``(train_scores, valid_scores)`` as returned by ``validation_curve``.
		The scores used to be computed and discarded.
	"""
	# param_name / param_range are passed by keyword: recent scikit-learn
	# releases no longer accept them positionally.
	train_scores, valid_scores = validation_curve(
		Ridge(), X, y,
		param_name="alpha", param_range=np.logspace(-7, 3, 3))
	return train_scores, valid_scores
예제 #18
0
# %% [markdown]
# Then, create an `AdaBoostRegressor`. Use the function
# `sklearn.model_selection.validation_curve` to get training and test scores
# by varying the number of estimators.
# *Hint: vary the number of estimators between 1 and 60.*

# %%
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import validation_curve

adaboost = AdaBoostRegressor()
# Log-spaced candidates between 10**0 and 10**1.8, truncated to integers;
# np.unique removes the duplicates that the truncation creates.
param_range = np.unique(np.logspace(0, 1.8, num=30).astype(int))
# Training and test scores for every candidate number of estimators.
train_scores, test_scores = validation_curve(adaboost,
                                             data_train,
                                             target_train,
                                             param_name="n_estimators",
                                             param_range=param_range,
                                             n_jobs=-1)

# %% [markdown]
# Plot both the mean training and test scores. You can also plot the
# standard deviation of the scores.

# %%
import matplotlib.pyplot as plt

# Mean training score per parameter value, with the standard deviation
# across axis 1 drawn as error bars.
plt.errorbar(param_range,
             train_scores.mean(axis=1),
             yerr=train_scores.std(axis=1),
             label="Training score",
             alpha=0.7)
예제 #19
0
def boost_plot(train_set,
               label_train,
               validation_set,
               label_validation,
               depth=8):
    """Fit two boosting regressors, print their errors and plot a curve.

    An AdaBoost regressor and a gradient-boosting regressor are fitted on the
    training data. For each one the mean absolute error and the mean of the
    row-wise maximum absolute error on the validation data are printed. A
    validation curve over ``learning_rate`` is then computed for a
    gradient-boosting regressor on the concatenated training and validation
    data and shown.

    Parameters
    ----------
    train_set, label_train : array-like
        Training features and targets.
    validation_set, label_validation : array-like
        Validation features and targets.
    depth : int, default 8
        Not used by this function.
    """
    print("Boosting test")

    ada_model = ensemble.AdaBoostRegressor(
        tree.DecisionTreeRegressor(criterion='mse', max_depth=25),
        n_estimators=600,
        learning_rate=1.5)
    grad_model = ensemble.GradientBoostingRegressor(
        max_depth=7, n_estimators=600, learning_rate=0.05)
    ada_model.fit(train_set, label_train)
    grad_model.fit(train_set, label_train)

    def report_errors(tag, predictions):
        # Mean absolute error, then the mean over rows of the largest
        # absolute error in each row.
        mae = sklearn.metrics.mean_absolute_error(
            label_validation, predictions, multioutput='uniform_average')
        print(tag + " mean error is : ", mae)
        abs_diff = np.absolute(np.subtract(predictions, label_validation))
        row_max_mean = np.mean(np.amax(abs_diff, axis=1))
        print(tag + " mean max error per row is : ", row_max_mean)

    report_errors("ada", ada_model.predict(validation_set))
    report_errors("grad", grad_model.predict(validation_set))

    # The validation curve is cross-validated on all available data.
    X = np.concatenate((train_set, validation_set), axis=0)
    y = np.concatenate((label_train, label_validation), axis=0)

    rates = np.arange(0.1, 2.5, 0.5)
    train_scores, test_scores = validation_curve(
        ensemble.GradientBoostingRegressor(max_depth=9, n_estimators=200),
        X,
        y,
        param_name="learning_rate",
        param_range=rates,
        cv=3,
        scoring="neg_mean_absolute_error",
        n_jobs=1)

    # (mean, std, legend label, colour) per curve, training first.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         "Training score", "darkorange"),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
         "Cross-validation score", "navy"),
    ]
    print(test_scores)

    plt.title(
        "Eye Validation Curve with Boosting (GradBoost) : criterion : 'entropy'"
    )
    plt.xlabel("learning_rate")
    plt.ylabel("Score")
    plt.ylim(-10, 10)
    line_width = 2
    # Mean line followed by its deviation band, for each curve.
    for mean, std, label, colour in curves:
        plt.plot(rates, mean, label=label, color=colour, lw=line_width)
        plt.fill_between(rates,
                         mean - std,
                         mean + std,
                         alpha=0.2,
                         color=colour,
                         lw=line_width)
    plt.legend(loc="best")
    plt.show()
# Report the best cross-validation score and the training score at the same
# index.
# NOTE(review): train_sizes, test_mean and train_mean are defined above this
# excerpt -- confirm they belong to the preceding analysis.
print(train_sizes)
print("Max cross-validation score:")
# print((train_sizes[np.argmax(test_mean)]))
print(test_mean[np.argmax(test_mean)])
print("Max training score at the same point as max cross-validation score:")
print(train_mean[np.argmax(test_mean)])

# In[89]:

# Model complexity analysis: accuracy of an entropy-based decision tree for
# max_depth 2..50 with 5-fold cross-validation.
print("Max depth model complexity analysis")
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(criterion='entropy'),
    x_train,
    y_train,
    param_name="max_depth",
    param_range=range(2, 51),
    scoring="accuracy",
    n_jobs=-1,
    cv=5)
# One mean score per depth (axis 1 = CV folds).
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
# Despite its name, train_sizes holds the max_depth values for the x axis.
train_sizes = list(range(2, 51))
plt.close()
# Only the cross-validation curve is plotted here; train_mean is not passed.
plot_data(train_sizes,
          test_mean,
          title="Figure 5 (Decision Tree Validation Curve - Wine Quality)",
          x_label="Max depth",
          y_label="Accuracy Score",
          color="blue",
          label='Cross-validation score')
예제 #21
0
# Load the dataset and standardise both the features and the target.
X, y = load_boston(return_X_y=True)
X = StandardScaler().fit_transform(X)
# y is reshaped into a single column before it is scaled.
y = StandardScaler().fit_transform(y.reshape(-1, 1))

# Regression by decision trees, evaluated on five shuffled splits that each
# hold out 33% of the data.
subsets = ShuffleSplit(n_splits=5, test_size=0.33, random_state=23)

dtree_model = DecisionTreeRegressor()
dtree_max_depth = range(1, 11)

# R^2 on the training and test part of every split, for max_depth 1..10.
trn_scores, tst_scores = validation_curve(dtree_model,
                                          X,
                                          y,
                                          param_name='max_depth',
                                          param_range=dtree_max_depth,
                                          cv=subsets,
                                          scoring='r2')

# One mean score per depth (axis 1 = splits).
mean_trn_scores = np.mean(trn_scores, axis=1)
mean_tst_scores = np.mean(tst_scores, axis=1)
dtree_scores = pd.concat([
    pd.DataFrame({
        'max_depth': dtree_max_depth,
        'score': mean_trn_scores,
        'subset': 'train'
    }),
    pd.DataFrame({
        'max_depth': dtree_max_depth,
        'score': mean_tst_scores,
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

# Load the digits dataset as a feature matrix and a target vector.
digits = load_digits()
X, y = digits.data, digits.target

# Evaluate an SVC for five log-spaced gamma values between 1e-6 and 1e-1.
param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(SVC(),
                                             X,
                                             y,
                                             param_name="gamma",
                                             param_range=param_range,
                                             cv=10,
                                             scoring="accuracy",
                                             n_jobs=1)
# One mean / standard deviation per gamma value (axis 1 = CV folds).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
# Raw string: "\g" is not a valid escape sequence in a normal literal.
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range,
             train_scores_mean,
예제 #23
0
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

# Sweep the SVC `gamma` hyperparameter over 4 log-spaced values between
# 1e-3 and 1e3 using 3-fold cross-validation.
param_range = np.logspace(-3, 3, 4)
train_scores, test_scores = validation_curve(SVC(), X, y,
                                            param_name='gamma',
                                            param_range=param_range, cv=3)

print(train_scores)
print(test_scores)

# This code based on scikit-learn validation_plot example
#  See:  http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
# Raw string: "\g" is not a valid escape sequence, so a plain string literal
# triggers an invalid-escape warning. The resulting text is unchanged.
plt.xlabel(r'$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
            color='darkorange', lw=lw)

plt.fill_between(param_range, train_scores_mean - train_scores_std,
예제 #24
0


# ## Addressing over- and underfitting with validation curves

# In[16]:


from sklearn.model_selection import validation_curve


# Candidate values for the `C` parameter of the pipeline step named
# 'logisticregression'.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# Score `pipe_lr` on the training data once per candidate, with 10-fold CV.
train_scores, test_scores = validation_curve(
                estimator=pipe_lr, 
                X=X_train, 
                y=y_train, 
                param_name='logisticregression__C', 
                param_range=param_range,
                cv=10)

# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean training score as a function of C.
plt.plot(param_range, train_mean, 
         color='blue', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
plt.scatter(x_test, y_test, color='blue', label='Test set')
plt.title('The data')
plt.legend(loc='best')

############################################################
# Plot a validation curve
from sklearn.model_selection import validation_curve

# Polynomial degrees 1..20 to evaluate.
degrees = np.arange(1, 21)

# Pipeline: polynomial feature expansion followed by ordinary least squares.
model = make_pipeline(PolynomialFeatures(), LinearRegression())

# The parameter to vary is the "degree" of the pipeline step
# "polynomialfeatures". `x` is given a trailing axis so it is 2-D.
train_scores, validation_scores = validation_curve(
                 model, x[:, np.newaxis], y,
                 param_name='polynomialfeatures__degree',
                 param_range=degrees)

# Plot the mean training score and validation score (averaged over axis 1)
# against the polynomial degree. No `scoring` is passed, so the values come
# from the estimator's default scorer.
plt.figure(figsize=(6, 4))
plt.plot(degrees, validation_scores.mean(axis=1), lw=2,
         label='cross-validation')
plt.plot(degrees, train_scores.mean(axis=1), lw=2, label='training')

plt.legend(loc='best')
plt.xlabel('degree of fit')
plt.ylabel('explained variance')
plt.title('Validation curve')
plt.tight_layout()

예제 #26
0
                 alpha=0.15,
                 color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
plt.show()

# Candidate values for the `C` parameter of the pipeline step named
# 'logisticregression'.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# Score `pipe_lr` on the training data once per candidate, with 10-fold CV.
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10)

# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean training score as a function of C.
plt.plot(param_range,
         train_mean,
         color='blue',
         marker='o',
         markersize=5,
         label='training accuracy')
예제 #27
0
def plot_validation_curve(estimator,
                          title,
                          X,
                          y,
                          param_name,
                          param_range,
                          ylim=None,
                          cv=None,
                          scoring='accuracy',
                          n_jobs=-1,
                          verbose=1,
                          islog=False):
    """Compute a validation curve for one hyperparameter and show it.

    A new figure is opened, ``validation_curve`` is run for ``param_name``
    over ``param_range``, and the mean score with a +/- one standard
    deviation band is drawn for both the training and the cross-validation
    scores. The figure is displayed with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    estimator : estimator passed through to ``validation_curve``.
    title : figure title.
    X, y : data and targets passed through to ``validation_curve``.
    param_name : name of the hyperparameter to vary; also the x-axis label.
    param_range : values to try; also used as the x tick positions.
    ylim : optional ``(low, high)`` pair unpacked into ``plt.ylim``.
    cv, scoring, n_jobs, verbose : forwarded to ``validation_curve``.
    islog : if true, draw the curves with a logarithmic x axis
        (``plt.semilogx``) using the darkorange/navy palette; otherwise use
        linear axes with red/green marker lines.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xticks(param_range)
    plt.xlabel(param_name)
    plt.ylabel("Score")

    fold_train, fold_test = validation_curve(estimator,
                                             X,
                                             y,
                                             param_name=param_name,
                                             param_range=param_range,
                                             cv=cv,
                                             scoring=scoring,
                                             n_jobs=n_jobs,
                                             verbose=verbose)

    # One (label, mean, std) triple per curve, training first.
    curves = (
        ("Training score",
         np.mean(fold_train, axis=1),
         np.std(fold_train, axis=1)),
        ("Cross-validation score",
         np.mean(fold_test, axis=1),
         np.std(fold_test, axis=1)),
    )

    if not islog:
        linear_palette = ("r", "g")
        # Shaded bands first, then the marker lines on top of them.
        for (_, centre, spread), shade in zip(curves, linear_palette):
            plt.fill_between(param_range,
                             centre - spread,
                             centre + spread,
                             alpha=0.1,
                             color=shade)
        for (caption, centre, _), shade in zip(curves, linear_palette):
            plt.plot(param_range,
                     centre,
                     'o-',
                     color=shade,
                     label=caption)
    else:
        line_width = 2
        log_palette = ("darkorange", "navy")
        # Each curve is drawn immediately followed by its own band.
        for (caption, centre, spread), shade in zip(curves, log_palette):
            plt.semilogx(param_range,
                         centre,
                         label=caption,
                         color=shade,
                         lw=line_width)
            plt.fill_between(param_range,
                             centre - spread,
                             centre + spread,
                             alpha=0.2,
                             color=shade,
                             lw=line_width)

    plt.legend(loc="best")
    plt.show()
예제 #28
0
def plot_validation_curve(estimator,
                          X,
                          y,
                          title=None,
                          ylim=None,
                          param_name=None,
                          param_range=None,
                          cv=10,
                          scoring=None,
                          n_jobs=None):
    """Plot a validation curve for one hyperparameter on a log x axis.

    Opens a new figure, runs ``validation_curve`` for ``param_name`` over
    ``param_range`` and draws the mean training and cross-validation scores
    with a +/- one standard deviation band around each.

    Parameters
    ----------
    estimator : estimator passed through to ``validation_curve``.
    X, y : data and targets passed through to ``validation_curve``.
    title : figure title; defaults to ``"Validation Curve"`` when ``None``.
    ylim : ``(low, high)`` pair for the y axis; defaults to ``(-0.1, 1.1)``
        when ``None``.
    param_name : name of the hyperparameter to vary; also the x-axis label.
    param_range : values to try; defaults to ``[1, 100, 1000, 10000]``.
    cv, scoring, n_jobs : forwarded to ``validation_curve``.

    Returns
    -------
    The ``plt`` module object, so callers can chain ``savefig``/``show``.
    """
    # Build the default inside the call instead of sharing one mutable list
    # object across every invocation.
    if param_range is None:
        param_range = [1, 100, 1000, 10000]

    plt.figure()
    train_scores, test_scores = validation_curve(estimator,
                                                 X,
                                                 y,
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cv,
                                                 scoring=scoring,
                                                 n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Honour the caller's title / ylim. Previously both were set and then
    # unconditionally overwritten, so the arguments had no effect.
    plt.title("Validation Curve" if title is None else title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    if ylim is None:
        plt.ylim(-0.1, 1.1)
    else:
        plt.ylim(*ylim)

    lw = 2
    plt.semilogx(param_range,
                 train_scores_mean,
                 label="Training score",
                 color="darkorange",
                 lw=lw)
    plt.fill_between(param_range,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="darkorange",
                     lw=lw)
    plt.semilogx(param_range,
                 test_scores_mean,
                 label="Cross-validation score",
                 color="navy",
                 lw=lw)
    plt.fill_between(param_range,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="navy",
                     lw=lw)
    plt.legend(loc="best")
    return plt
예제 #29
0
# 10-fold splitter used for the plain cross-validation scores below.
kfold = KFold(n_splits=10)
cv = kfold

scores = cross_val_score(rbf_kernel_svm_clf, X,y,cv=cv)
print('gamma : {}'.format(gamma))
print('Cross-Validation scores: {}'.format(scores))

# Learning curve, saved to "<name>_lc.png".
# NOTE(review): this rebinds the name `plt` to whatever plot_learning_curve
# returns; the plt.* calls below rely on that object behaving like pyplot.
title = "Learning Curves ("+name+")"
plt = plot_learning_curve(rbf_kernel_svm_clf, title, X, y, cv=10)
plt.savefig(name+'_lc.png')
print(name+'_lc.png')
plt.show()

# Validation curve over the `gamma` of the pipeline step named 'svc'.
# No `cv` argument is passed here, so the `kfold` splitter defined above is
# not used for this call.
param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
     rbf_kernel_svm_clf, X, y, param_name='svc__gamma', param_range=param_range,
    scoring="accuracy", n_jobs=1)
# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve (" + name +")")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
# Mean training score with a +/- one standard deviation band.
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
예제 #30
0
def _validation_curve_frame(estimator, X, Y, param_name, param_values, cv,
                            scoring, label, n_jobs=4):
    """Run one validation curve and summarise it as a labelled DataFrame."""
    # param_name / param_range are passed by keyword: recent scikit-learn
    # releases declare them keyword-only.
    train, test = validation_curve(estimator,
                                   X,
                                   Y,
                                   param_name=param_name,
                                   param_range=param_values,
                                   cv=cv,
                                   scoring=scoring,
                                   n_jobs=n_jobs)
    frame = pd.DataFrame({
        'ParamList': param_values,
        'TrainScore': np.mean(train, axis=1),
        'TrainStd': np.std(train, axis=1),
        'CV-Score': np.mean(test, axis=1),
        'CV-Std': np.std(test, axis=1)
    })
    frame['Algorithm'] = label
    return frame


def validation_curves(X, Y, alg1, alg2, alg3, CV, score, csv_name):
    """
    Run validation curves for three learners and save the results to CSV.

    For each learner, one or two hyperparameters are swept over a fixed grid
    and the mean and standard deviation of the training and cross-validation
    scores are recorded per grid value. The sweeps are:

    * ``alg1``: ``n_neighbors`` (rows labelled 'knn')
    * ``alg2``: ``max_depth``, then ``max_features`` (rows labelled 'dtree')
    * ``alg3``: ``gamma``, then ``C`` (rows labelled 'svr')

    Parameters
    ----------
    X : dataframe that includes all training data
    Y : series with labels for training data
    alg1 : learner exposing an ``n_neighbors`` parameter
    alg2 : learner exposing ``max_depth`` and ``max_features`` parameters
    alg3 : learner exposing ``gamma`` and ``C`` parameters
    CV : cross-validation generator passed to ``validation_curve`` as ``cv``
    score : scoring specification passed to ``validation_curve`` as
            ``scoring``
    csv_name : string prefix of the output file name

    Returns
    -------
    None. Writes ``<csv_name>_validation_curve.csv`` holding the
    concatenated results of all five sweeps.

    """
    # (estimator, parameter name, parameter grid, algorithm label) per sweep,
    # in the order the rows appear in the output file.
    sweeps = [
        (alg1, 'n_neighbors', np.linspace(1, 25, 10).astype(int), 'knn'),
        (alg2, 'max_depth', np.linspace(3, 25, 10).astype(int), 'dtree'),
        (alg2, 'max_features', np.linspace(5, 47, 10).astype(int), 'dtree'),
        (alg3, 'gamma', np.logspace(-4, -1, 10), 'svr'),
        (alg3, 'C', np.logspace(0, 5, 10), 'svr'),
    ]

    frames = [
        _validation_curve_frame(estimator, X, Y, param_name, grid, CV, score,
                                label)
        for estimator, param_name, grid, label in sweeps
    ]

    vc_data = pd.concat(frames)
    vc_data.to_csv(csv_name + '_validation_curve.csv')
    return
예제 #31
0
def draw_validation_curve(model,
                          param_range,
                          param,
                          name,
                          ax,
                          features,
                          labels,
                          metric='roc_auc',
                          cv=3,
                          n_jobs=1,
                          title=None):
    '''
    Draw a cross-validated validation curve on a logarithmic x axis.

    model = the algorithm or pipeline to evaluate
    param_range = the hyperparameter values to try
    param = the hyperparameter name handed to validation_curve
    name = x-axis label for the hyperparameter
    ax = accepted but not used by this function; drawing goes through the
         pyplot state machine (plt.*)
    features, labels = data and targets handed to validation_curve
    metric = scoring specification
    cv, n_jobs = forwarded to validation_curve
    title = algorithm name interpolated into the figure title
    '''
    train_part, test_part = validation_curve(model,
                                             features,
                                             labels,
                                             param_name=param,
                                             param_range=param_range,
                                             cv=cv,
                                             scoring=metric,
                                             n_jobs=n_jobs)

    # (legend label, colour, mean, std) per curve, training first.
    curves = [
        ("Training score", "darkorange",
         np.mean(train_part, axis=1), np.std(train_part, axis=1)),
        ("Cross-validation score", "navy",
         np.mean(test_part, axis=1), np.std(test_part, axis=1)),
    ]

    plt.title("Validation Curve with %s" % title)
    plt.xlabel(name)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)

    line_width = 2
    for legend_text, shade, centre, spread in curves:
        # Mean curve followed by its +/- one standard deviation band.
        plt.semilogx(param_range,
                     centre,
                     label=legend_text,
                     color=shade,
                     lw=line_width)
        plt.fill_between(param_range,
                         centre - spread,
                         centre + spread,
                         alpha=0.2,
                         color=shade,
                         lw=line_width)
    plt.legend(loc="best")
# We can use :func:`~sklearn.model_selection.validation_curve` to inspect
# the impact of varying the parameter `k_neighbors`. In this case, we need
# to supply a scorer to measure generalization performance during the
# cross-validation.

# %%
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import validation_curve

# Wrap Cohen's kappa as a scorer object usable by validation_curve.
scorer = make_scorer(cohen_kappa_score)
# Try k_neighbors = 1..10 on the pipeline step named 'smote', 3-fold CV.
param_range = range(1, 11)
train_scores, test_scores = validation_curve(
    model,
    X,
    y,
    param_name="smote__k_neighbors",
    param_range=param_range,
    cv=3,
    scoring=scorer,
)

# %%
# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

# %% [markdown]
# We can now plot the results of the cross-validation for the different
# parameter values that we tried.
예제 #33
0
"""
from __future__ import print_function
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the digits dataset.
digits = load_digits()
X = digits.data
y = digits.target
# Sweep SVC `gamma` over 5 log-spaced values between 1e-6 and 10**-2.3.
param_range = np.logspace(-6, -2.3, 5)
# NOTE(review): SVC is a classifier but is scored here with negative mean
# squared error on the predicted labels - confirm this metric is intended.
train_loss, test_loss = validation_curve(SVC(),
                                         X,
                                         y,
                                         param_name='gamma',
                                         param_range=param_range,
                                         cv=10,
                                         scoring='neg_mean_squared_error')
# The scorer is negated MSE, so flip the sign to get a positive loss and
# average along axis 1 (one value per entry of param_range).
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

plt.plot(param_range, train_loss_mean, 'o-', color="r", label="Training")
plt.plot(param_range,
         test_loss_mean,
         'o-',
         color="g",
         label="Cross-validation")

plt.xlabel("gamma")
plt.ylabel("Loss")
예제 #34
0
import numpy as np
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split

# NOTE(review): `load_boston` was removed from scikit-learn in release 1.2;
# this line fails on current versions - confirm the targeted version.
boston = datasets.load_boston()
print(boston.data.shape)
#print(boston.data)
X = boston.data
y = boston.target

# Hold out a test split; only the training part is used for the curve below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 4)

# Fit the model for different values of gamma in order to choose gamma
param_range = np.logspace(-6, -3, 10)
train_loss, test_loss = validation_curve(SVR(), X_train, y_train, param_name = "gamma",
                                    param_range = param_range, 
                                    cv = 10, scoring = "neg_mean_squared_error")

# The scorer is negated MSE, so flip the sign to get a positive loss and
# average along axis 1 (one value per entry of param_range).
train_loss_mean = -np.mean(train_loss, axis = 1)
test_loss_mean = -np.mean(test_loss, axis = 1)

plt.plot(param_range, train_loss_mean, "o-", color = "r", label = "Training")
plt.plot(param_range, test_loss_mean, "o-", color = "g", label = "Cross_validation")

plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc = "best")
plt.show()

# Fit the model for different values of C in order to choose C
param_range = np.logspace(-1, 2, 10)
예제 #35
0
# plt.grid(True)
# plt.xlabel("C values")
# plt.ylabel("accuracy Score")
# plt.axis([0, max(x) + 0.001, min(min(train_scores), min(valid_scores)), max(max(train_scores), max(valid_scores))])
# plt.title("Validation Curve SVM-C")
# plt.legend()

# plt.show()

# plt.clf()

# Numeric x positions standing in for the two kernel names on the plot.
x = [1, 2]
# Compare the 'linear' and 'rbf' kernels with 5-fold CV accuracy.
# param_name / param_range are passed by keyword: recent scikit-learn
# releases declare them keyword-only, so positional use raises TypeError.
train_scores, valid_scores = validation_curve(SVC(gamma=10, random_state=42),
                                              X_train,
                                              y_train,
                                              param_name="kernel",
                                              param_range=['linear', 'rbf'],
                                              cv=5,
                                              verbose=1000,
                                              n_jobs=-1,
                                              scoring='accuracy')

# Average along axis 1: one score per kernel.
train_scores = np.mean(train_scores, axis=1)
valid_scores = np.mean(valid_scores, axis=1)

plt.plot(x, train_scores, label="Train score")
plt.plot(x, valid_scores, label="Validation score")
plt.grid(True)
plt.xlabel("Kernel")
plt.ylabel("accuracy Score")
plt.axis([
    0,
    max(x) + 0.001,
# Wrap Cohen's kappa as a scorer object usable by validation_curve.
scorer = metrics.make_scorer(metrics.cohen_kappa_score)

# Generate the dataset: 5000 samples, 20 features, two classes weighted
# 10% / 90%.
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)
# NOTE(review): `os` here must be an alias that provides SMOTE (presumably
# imblearn.over_sampling), not the standard-library `os` module - confirm
# against the imports.
smote = os.SMOTE(random_state=RANDOM_STATE)
cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, cart)

# Try k_neighbors = 1..10 on the pipeline step named 'smote', 3-fold CV.
param_range = range(1, 11)
train_scores, test_scores = ms.validation_curve(
    pipeline, X, y, param_name="smote__k_neighbors", param_range=param_range,
    cv=3, scoring=scorer, n_jobs=1)
# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Mean test score with a +/- one standard deviation band.
plt.plot(param_range, test_scores_mean, label='SMOTE')
ax.fill_between(param_range, test_scores_mean + test_scores_std,
                test_scores_mean - test_scores_std, alpha=0.2)
# Index of the parameter value with the highest mean test score.
idx_max = np.argmax(test_scores_mean)
plt.scatter(param_range[idx_max], test_scores_mean[idx_max],
            label=r'Cohen Kappa: ${0:.2f}\pm{1:.2f}$'.format(
예제 #37
0
# Take column 1 of each base model's probability output and reshape it into
# a column vector of shape (m, 1).
Y_class1_train_LR = Y_probas_train_LR[np.newaxis, :, 1].T
Y_class1_train_DT = Y_probas_train_DT[np.newaxis, :, 1].T
Y_class1_train_RF = Y_probas_train_RF[np.newaxis, :, 1].T

X_meta_train = np.concatenate(
    (Y_class1_train_svm, Y_class1_train_knn, Y_class1_train_LR,
     Y_class1_train_DT, Y_class1_train_RF),
    axis=1)  #concatenate horizontally, final shape (m, 5)
Y_meta_train = Y_train

# Candidate values for the LogisticRegression `C` parameter.
x = [0.1, 1, 10, 100, 1000, 1100, 1300, 1500]
# NOTE(review): X_meta_train is built above but X_train is what gets passed
# here - confirm whether the meta-features were meant to be used instead.
# param_name / param_range are passed by keyword: recent scikit-learn
# releases declare them keyword-only, so positional use raises TypeError.
train_scores, valid_scores = validation_curve(
    LogisticRegression(random_state=42),
    X_train,
    Y_meta_train,
    param_name="C",
    param_range=x,
    cv=3,
    verbose=1000,
    n_jobs=-1,
    scoring='accuracy')

# Average along axis 1: one score per candidate C.
train_scores = np.mean(train_scores, axis=1)
valid_scores = np.mean(valid_scores, axis=1)

plt.plot(x, train_scores, label="Train score")
plt.plot(x, valid_scores, label="Validation score")
plt.grid(True)
plt.xlabel("C values")
plt.ylabel("accuracy Score")
plt.axis([
    0,
예제 #38
0
파일: tmp.py 프로젝트: yuanjingjy/Oliguria
]  #16273个样本假设检验不通过的
# Drop the columns listed in `delnames`, then standardise what remains.
dataMat = dataMat.drop(delnames, axis=1)

dataMat = StandardScaler().fit_transform(dataMat)

# param_range = np.logspace(-6, -1, 5)
# Candidate values for the SVC `C` parameter.
param_range = [0.0001, 0.001, 0.01, 0.1]
# RBF-kernel SVC with balanced class weights; `C` is varied by
# validation_curve with 10-fold CV accuracy.
train_scores, test_scores = validation_curve(estimator=SVC(
    kernel='rbf',
    gamma='auto',
    shrinking=True,
    probability=True,
    tol=0.0001,
    cache_size=1000,
    max_iter=-1,
    class_weight='balanced',
    decision_function_shape='ovr',
    random_state=None),
                                             X=dataMat,
                                             y=labelMat,
                                             param_name='C',
                                             param_range=param_range,
                                             cv=10,
                                             scoring="accuracy",
                                             n_jobs=1)

# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
from sklearn.model_selection import validation_curve #validation_curve模块
from sklearn.datasets import load_digits 
from sklearn.svm import SVC 
import matplotlib.pyplot as plt 
import numpy as np

# The digits dataset
digits = load_digits()
X = digits.data
y = digits.target

# Build the set of parameter values to test
param_range = np.logspace(-6, -2.3, 5)

# Use validation_curve to quickly see how the parameter affects the model
train_loss, test_loss = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range, cv=10, scoring='neg_mean_squared_error')

# Average the mean squared error over the rounds (sign flipped because the
# scorer returns the negated MSE)
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# Visualise the result
plt.plot(param_range, train_loss_mean, 'o-', color="r",
         label="Training")
plt.plot(param_range, test_loss_mean, 'o-', color="g",
        label="Cross-validation")

plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
def main():
    """
    Machine-learning workflow built on a pipeline (scikit-learn's Pipeline class).
    Evaluates the model's generalisation performance with a learning curve and a validation curve.
    """
    print("Enter main()")

    # Load the data
    prePro = DataPreProcess.DataPreProcess()
    prePro.setDataFrameFromCsvFile(
        "https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wdbc/wdbc.data"
    )
    #prePro.print( "Breast Cancer Wisconsin dataset" )

    # Columns from 2 onward are the features; column 1 is the class label.
    # NOTE(review): these arrays are extracted before the label encoding
    # below runs - confirm dat_y is meant to hold the un-encoded labels.
    dat_X = prePro.df_.loc[:, 2:].values
    dat_y = prePro.df_.loc[:, 1].values

    #===========================================
    # Preprocessing
    #===========================================
    # Handle missing data
    #prePro.meanImputationNaN()

    # Encode the label data
    prePro.encodeClassLabelByLabelEncoder(colum=1)
    prePro.print("Breast Cancer Wisconsin dataset")

    # Split the data into training data and test data
    X_train, X_test, y_train, y_test \
    = DataPreProcess.DataPreProcess.dataTrainTestSplit( X_input = dat_X, y_input = dat_y, ratio_test = 0.2 )

    #-------------------------------------------
    # Pipeline setup
    #-------------------------------------------
    # Register each transformer and estimator in the pipeline
    pipe_logReg = Pipeline(steps=[  # given as tuples (arbitrary identifier string, transformer or estimator instance)
        ("scl", StandardScaler()),  # scaling: a transformer class (has a fit() function)
        ("clf", LogisticRegression(penalty='l2', random_state=0)
         )  # logistic regression (L2 regularisation): an estimator class (has a predict() function)
    ])

    # Run the fit() function of the transformers registered in the pipeline
    #pipe_logReg.fit( X_train, y_train )

    #
    #print( "Test Accuracy: %.3f" % pipe_logReg.score( X_test, y_test ) )

    #============================================
    # Learning Process
    #===========================================
    # Run predict() of the estimator registered in the pipeline
    #y_predict = pipe_logReg.predict(X_test)
    #print("predict : ", y_predict )

    #===========================================
    # Check generalisation performance with a learning curve
    #===========================================
    # Compute cross-validated accuracy with the learning_curve() function
    train_sizes, train_scores, test_scores \
    = learning_curve(
          estimator = pipe_logReg,                      # estimator: the logistic regression registered in the Pipeline
          X = X_train,                                  # 
          y = y_train,                                  # 
          train_sizes = numpy.linspace(0.1, 1.0, 10),   # absolute or relative numbers of training samples
                                                        # here: 10 evenly spaced relative values of the training-set size
          cv = 10,                                      # number of cross-validation folds (splits)
          n_jobs = -1                                   # run in parallel on all CPUs
      )

    # Compute the means and standard deviations
    train_means = numpy.mean(train_scores, axis=1)  # axis = 1 : row-wise
    train_stds = numpy.std(train_scores, axis=1)
    test_means = numpy.mean(test_scores, axis=1)
    test_stds = numpy.std(test_scores, axis=1)

    #
    print("train_sizes : \n",
          train_sizes)  # the training-set sizes returned by learning_curve()
    print("train_scores : \n", train_scores)
    print("test_scores : \n", test_scores)
    print("train_means : \n", train_means)
    print("train_stds : \n", train_stds)
    print("test_means : \n", test_means)
    print("test_stds : \n", test_stds)

    #-------------------------------------------
    # Draw the learning curve
    #-------------------------------------------
    Plot2D.Plot2D.drawLearningCurve(train_sizes=train_sizes,
                                    train_means=train_means,
                                    train_stds=train_stds,
                                    test_means=test_means,
                                    test_stds=test_stds)
    plt.title("Learning Curve \n LogisticRegression (L2 regularization)")
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.01])
    plt.tight_layout()

    plt.savefig("./MachineLearningPipeline_scikit-learn_1.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    #===========================================
    # Check generalisation performance with a validation curve
    #===========================================
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

    # Compute cross-validated accuracy with the validation_curve() function
    train_scores, test_scores \
    = validation_curve(
          estimator = pipe_logReg,      # 
          X = X_train,
          y = y_train,
          param_name = 'clf__C',        # the C parameter of the pipeline step named "clf"
          param_range = param_range,
          cv = 10
      )

    # Overwrite the learning-curve statistics computed above
    train_means = numpy.mean(train_scores, axis=1)
    train_stds = numpy.std(train_scores, axis=1)
    test_means = numpy.mean(test_scores, axis=1)
    test_stds = numpy.std(test_scores, axis=1)

    #
    print("param_range : \n", param_range)
    print("train_scores : \n", train_scores)
    print("test_scores : \n", test_scores)
    print("train_means : \n", train_means)
    print("train_stds : \n", train_stds)
    print("test_means : \n", test_means)
    print("test_stds : \n", test_stds)

    #-------------------------------------------
    # Draw the validation curve
    #-------------------------------------------
    Plot2D.Plot2D.drawValidationCurve(param_range=param_range,
                                      train_means=train_means,
                                      train_stds=train_stds,
                                      test_means=test_means,
                                      test_stds=test_stds)
    plt.xscale('log')  # log scale
    plt.title("Validation Curve \n LogisticRegression (L2 regularization)")
    plt.xlabel('Parameter C [Reverse regularization parameter]')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.01])
    plt.tight_layout()

    plt.savefig("./MachineLearningPipeline_scikit-learn_2.png",
                dpi=300,
                bbox_inches='tight')
    plt.show()

    print("Finish main()")
    return
예제 #41
0
         out_performance,
         color="navy",
         marker='o',
         label="Mean test Acc. from 5-fold CV")
plt.grid(True)
plt.legend()
plt.title('5-fold CV Curve for Decision Tree on max_depth by cross_val_score')
plt.show()

##########

# Sweep `max_depth` of a decision tree over `candidate_max_depth`, scoring
# accuracy with 5-fold cross-validation on all available cores.
base_clf = DecisionTreeClassifier(random_state=0)
train_scores, test_scores = validation_curve(base_clf,
                                             X,
                                             y,
                                             param_name="max_depth",
                                             param_range=candidate_max_depth,
                                             scoring="accuracy",
                                             n_jobs=-1,
                                             cv=5)
# Average along axis 1: one score per candidate depth.
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
plt.figure(1, figsize=(8, 8))
plt.ylabel("Accuracy")
plt.xlabel("max_depth")
# The size of axis 1 of the score array is reported as k in the title.
plt.title(
    "K-fold(k={}) CV Curve for Decision Tree on max_depth by validation_curve".
    format(train_scores.shape[1]))

plt.plot(candidate_max_depth,
         train_scores_mean,
         marker='o',
예제 #42
0
train_x = np.array(train_x).T  # transpose back into the encoded sample matrix

# Define the model
model = se.RandomForestClassifier(
    max_depth=8,  # maximum tree depth
    random_state=7  # random seed
)
# Build the array of candidate values to validate
n_estimators = np.arange(50, 550, 50)  # 50, 100, ..., 500
print('n_estimators:', n_estimators)

# Build one random forest per candidate value and validate its score.
# param_name / param_range are passed by keyword: recent scikit-learn
# releases declare them keyword-only, so positional use raises TypeError.
train_scores, test_scores = ms.validation_curve(
    model,
    train_x,
    train_y,
    param_name='n_estimators',  # name of the parameter to validate
    param_range=n_estimators,  # values of the parameter to validate
    cv=5  # number of folds
)
# print(test_scores)

train_mean = train_scores.mean(axis=1)  # mean training score per candidate
test_mean = test_scores.mean(axis=1)  # mean test score per candidate

# Visualisation
# NOTE(review): no `scoring` is passed above, so the values come from the
# estimator's default scorer, while the y axis is labelled 'F1 Score' -
# confirm which metric is intended.
mp.figure('n_estimators', facecolor='lightgray')
mp.title('n_estimators', fontsize=20)
mp.xlabel('n_estimators', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
예제 #43
0
# plt.plot(param_range, test_scores_mean, label="Cross-validation score",
#              color="navy", lw=lw)
# plt.fill_between(param_range, test_scores_mean - test_scores_std,
#                  test_scores_mean + test_scores_std, alpha=0.2,
#                  color="navy", lw=lw)
# plt.legend(loc="best")
# print('plot time bro')
# plt.show()

# LEARNING RATE INIT
# Sweep the initial learning rate from 0.001 up to (but excluding) 0.05 in
# steps of 0.001, for an SGD-trained MLP with hidden layers of 100 and 5
# units, scoring accuracy with 10-fold cross-validation.
param_range = np.arange(0.001, 0.05, 0.001)
train_scores, test_scores = validation_curve(MLPClassifier(
    hidden_layer_sizes=(100, 5), solver='sgd', learning_rate='constant'),
                                             X,
                                             y,
                                             param_name="learning_rate_init",
                                             param_range=param_range,
                                             cv=10,
                                             scoring="accuracy",
                                             n_jobs=1)
# Reduce along axis 1 so that there is one mean/std per entry of param_range.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with MLPClassifier")
plt.xlabel("Learning Rate")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
X_test = np.linspace(-0.1, 1.1, 500)[:, None]

plt.scatter(X.ravel(), y, color='black')
axis = plt.axis()
for degree in [1, 3, 5]:
    y_test = PolynomialRegression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={0}'.format(degree))
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.legend(loc='best');

#%%
# Validation curve over the polynomial degree of the regression pipeline.
from sklearn.model_selection import validation_curve

degree = np.arange(0, 21)
# `param_name` and `param_range` are keyword-only in scikit-learn >= 1.0;
# passing them by keyword works on older releases as well.
train_score, val_score = validation_curve(
    PolynomialRegression(), X, y,
    param_name='polynomialfeatures__degree',
    param_range=degree,
    cv=7,
)

# Scores have one row per candidate degree and one column per CV fold;
# summarise each degree by the median over its folds.
plt.plot(degree, np.median(train_score, axis=1), color='blue',
         label='training score')
plt.plot(degree, np.median(val_score, axis=1), color='red',
         label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('degree')
plt.ylabel('score');

#%%
# Scatter the raw data and remember the axis limits that pyplot chose for it.
plt.scatter(X.ravel(), y)
lim = plt.axis()
# Fit PolynomialRegression(3) on (X, y) and draw its predictions over X_test.
y_test = PolynomialRegression(3).fit(X, y).predict(X_test)
plt.plot(X_test.ravel(), y_test);
# Re-apply the limits captured above so the fitted curve does not rescale the view.
plt.axis(lim);
예제 #45
0
def draw_validation_curve(X, y, clf_type):
    """Plot training and cross-validation accuracy against the ``C`` parameter.

    The estimator, the number of CV folds and the number of parallel jobs are
    selected from ``clf_type``; ``C`` is swept over ``np.logspace(0, 2, 5)``.
    The mean score per value of ``C`` is drawn on a log-scaled x-axis together
    with a band of one standard deviation on either side, then the figure is
    shown.

    Args:
        X: Feature data, passed through to ``validation_curve``.
        y: Targets, passed through to ``validation_curve``.
        clf_type: Classifier selector, compared to the ``CLF_TYPE`` members by
            its ``str()`` form. Supported members are ``K_SVM`` (``SVC``,
            3 folds, 2 jobs), ``Logistic`` (``LogisticRegression``, 5 folds,
            1 job) and ``SVM`` (``LinearSVC``, 5 folds, 1 job).

    Returns:
        None. When ``clf_type`` matches no supported member a message is
        printed and nothing is plotted.
    """
    # (selector, estimator class, cv folds, n_jobs); the first match wins.
    candidates = (
        (CLF_TYPE.K_SVM, SVC, 3, 2),
        (CLF_TYPE.Logistic, LogisticRegression, 5, 1),
        (CLF_TYPE.SVM, LinearSVC, 5, 1),
    )
    wanted = str(clf_type)
    for member, estimator_cls, cv, n_jobs in candidates:
        if str(member) == wanted:
            break
    else:
        print('clf type not implemented')
        return

    param_range = np.logspace(0, 2, 5)
    train_scores, test_scores = validation_curve(estimator_cls(),
                                                 X,
                                                 y,
                                                 param_name="C",
                                                 verbose=2,
                                                 param_range=param_range,
                                                 cv=cv,
                                                 scoring="accuracy",
                                                 n_jobs=n_jobs)

    plt.title("Validation Curve with " + str(clf_type))
    # The swept hyper-parameter is C for every supported estimator, so the
    # axis is labelled accordingly.
    plt.xlabel("C")
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    lw = 2
    curves = (
        (train_scores, "Training score", "darkorange"),
        (test_scores, "Cross-validation score", "navy"),
    )
    for scores, label, color in curves:
        # One row per value of C, one column per fold.
        scores_mean = np.mean(scores, axis=1)
        scores_std = np.std(scores, axis=1)
        plt.semilogx(param_range,
                     scores_mean,
                     label=label,
                     color=color,
                     lw=lw)
        plt.fill_between(param_range,
                         scores_mean - scores_std,
                         scores_mean + scores_std,
                         alpha=0.2,
                         color=color,
                         lw=lw)
    plt.legend(loc="best")
    plt.show()
                 alpha=0.15, color='blue')
# Validation-accuracy line over the training-set sizes, with a shaded band
# spanning test_mean - test_std to test_mean + test_std.
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')
# Final cosmetics: grid, axis labels, legend, and a y-range zoomed to 0.9-1.0.
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9, 1.0])
plt.show()  # evidence of slight overfit due to gap between curves
# Validation curves for C (the inverse regularization parameter of the
# logistic regression step, addressed in the pipeline as 'clf__C').
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = ms.validation_curve(estimator=PIPE_LR,
                                                X=FEAT_TRAIN,
                                                y=LABEL_TRAIN,
                                                param_name='clf__C',
                                                param_range=param_range,
                                                cv=10)
# Mean and standard deviation over the 10 folds for every value of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
# Plot each curve as its mean accuracy plus a band of one standard deviation
# on either side. Both edges of a band are centred on that curve's own mean.
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='validation accuracy')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')