def test_learning_curve_with_shuffle():
    """Check shuffled learning curves on grouped data, batch vs. incremental.

    This test case was designed to verify the code changes made in pull
    request #7506.
    """
    features = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [11, 12], [13, 14],
                         [15, 16], [17, 18], [19, 20], [7, 8], [9, 10],
                         [11, 12], [13, 14], [15, 16], [17, 18]])
    labels = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4])
    group_ids = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4])
    # Without shuffling, splits on these groups fail: the first iteration of
    # the learning curve has no sample with label 4 in its training set.
    clf = PassiveAggressiveClassifier(shuffle=False)
    splitter = GroupKFold(n_splits=2)

    # Keyword arguments shared by every learning_curve call below.
    base_kwargs = dict(cv=splitter, n_jobs=1,
                       train_sizes=np.linspace(0.3, 1.0, 3),
                       groups=group_ids)
    shuffled_kwargs = dict(base_kwargs, shuffle=True, random_state=2)

    # Batch mode with shuffling: compare against the recorded mean scores.
    _, batch_train, batch_test = learning_curve(
        clf, features, labels, **shuffled_kwargs)
    assert_array_almost_equal(batch_train.mean(axis=1),
                              np.array([0.75, 0.3, 0.36111111]))
    assert_array_almost_equal(batch_test.mean(axis=1),
                              np.array([0.36111111, 0.25, 0.25]))

    # The same call without shuffling is expected to raise.
    assert_raises(ValueError, learning_curve, clf, features, labels,
                  **base_kwargs)

    # Incremental mode must reproduce the batch-mode mean scores.
    _, inc_train, inc_test = learning_curve(
        clf, features, labels, exploit_incremental_learning=True,
        **shuffled_kwargs)
    assert_array_almost_equal(inc_train.mean(axis=1),
                              batch_train.mean(axis=1))
    assert_array_almost_equal(inc_test.mean(axis=1),
                              batch_test.mean(axis=1))
def test_learning_curve():
    """Check learning_curve output with and without training-set shuffling.

    For each shuffle setting this verifies that no warning is recorded, that
    both score arrays have shape (10, 3), that the tick sizes and per-tick
    mean scores match the expected linear ramps, and that a splitter which
    can be iterated only once (OneTimeSplitter) gives the same scores as
    KFold.
    """
    n_samples = 30
    n_splits = 3
    X, y = make_classification(n_samples=n_samples, n_features=1,
                               n_informative=1, n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(n_samples * ((n_splits - 1) / n_splits))

    for shuffle_train in (False, True):
        with warnings.catch_warnings(record=True) as caught:
            sizes, kfold_train, kfold_test = learning_curve(
                estimator, X, y, cv=KFold(n_splits=n_splits),
                train_sizes=np.linspace(0.1, 1.0, 10),
                shuffle=shuffle_train)
        if caught:
            raise RuntimeError("Unexpected warning: %r" % caught[0].message)

        assert_equal(kfold_train.shape, (10, 3))
        assert_equal(kfold_test.shape, (10, 3))
        assert_array_equal(sizes, np.linspace(2, 20, 10))
        assert_array_almost_equal(kfold_train.mean(axis=1),
                                  np.linspace(1.9, 1.0, 10))
        assert_array_almost_equal(kfold_test.mean(axis=1),
                                  np.linspace(0.1, 1.0, 10))

        # Repeat with a custom cv splitter that can iterate only once.
        single_use_cv = OneTimeSplitter(n_splits=n_splits,
                                        n_samples=n_samples)
        with warnings.catch_warnings(record=True) as caught:
            _, once_train, once_test = learning_curve(
                estimator, X, y, cv=single_use_cv,
                train_sizes=np.linspace(0.1, 1.0, 10),
                shuffle=shuffle_train)
        if caught:
            raise RuntimeError("Unexpected warning: %r" % caught[0].message)

        assert_array_almost_equal(once_train, kfold_train)
        assert_array_almost_equal(once_test, kfold_test)
def plot_learning_curve(self, estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
                        filename=None):
    """Plot training and cross-validation learning curves for an estimator.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Data and targets handed to ``learning_curve``.
    ylim : sequence of two numbers, optional
        Unpacked into ``plt.ylim`` when given.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.
    filename : str, optional
        When given, the figure is also saved with ``plt.savefig(filename)``.

    Returns
    -------
    The ``plt`` object used for plotting, so the caller can show or further
    modify the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Mean and spread across the second axis of the score arrays.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    # Shaded bands: one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")

    # Identity test: only a missing filename skips saving.
    if filename is not None:
        plt.savefig(filename)
    return plt
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0,
                        plot=True):
    """Compute, and optionally draw, the learning curve of a model on data.

    Parameters
    ----------
    estimator : the classifier being evaluated
    title : title of the figure
    X : input features (numpy array)
    y : target vector
    ylim : tuple (ymin, ymax) giving the lowest and highest y-axis values
    cv : cross-validation setting forwarded to ``learning_curve`` (number of
        parts the data is split into, one of which is held out each time)
    n_jobs : number of parallel jobs (default 1)
    train_sizes, verbose : forwarded to ``learning_curve``
    plot : when true, draw and show the figure

    Returns
    -------
    (midpoint, diff) computed at the last training size, where
    ``upper = train mean + train std`` and ``lower = test mean - test std``:
    ``midpoint`` is their average and ``diff`` is ``upper - lower``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        verbose=verbose)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("训练样本数")
        plt.ylabel("得分")
        plt.gca().invert_yaxis()
        plt.grid()

        curves = (
            (train_mean, train_std, "b", "训练集上得分"),
            (test_mean, test_std, "r", "交叉验证集上得分"),
        )
        # Bands first, then the lines, in the same order as before.
        for mean, std, colour, _ in curves:
            plt.fill_between(train_sizes, mean - std, mean + std,
                             alpha=0.1, color=colour)
        for mean, _, colour, caption in curves:
            plt.plot(train_sizes, mean, 'o-', color=colour, label=caption)

        plt.legend(loc="best")
        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()

    upper = train_mean[-1] + train_std[-1]
    lower = test_mean[-1] - test_std[-1]
    midpoint = (upper + lower) / 2
    diff = upper - lower
    return midpoint, diff
def plot(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
         train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and test error curves (1 - score) for an estimator.

    The scores returned by ``learning_curve`` are converted to errors by
    subtracting them from 1.0. The mean error per training size is drawn as
    a line with a one-standard-deviation band around it.

    Returns the ``plt`` object used for plotting.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(u'Veri nokta sayısı')
    plt.ylabel(u"Hata")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Turn scores into errors.
    train_err = 1.0 - train_scores
    test_err = 1.0 - test_scores

    series = (
        (np.mean(train_err, axis=1), np.std(train_err, axis=1),
         "r", u'Eğitim hatası'),
        (np.mean(test_err, axis=1), np.std(test_err, axis=1),
         "g", u'Test hatası'),
    )

    plt.grid()
    for centre, spread, colour, _ in series:
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, caption in series:
        plt.plot(train_sizes, centre, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    return plt
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw and show the training and cross-validation learning curves.

    Adapted from the scikit-learn website example.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Training data handed to ``learning_curve``.

    y : array-like, optional
        Targets handed to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Minimum and maximum y values plotted.

    cv : optional
        Cross-validation setting forwarded to ``learning_curve``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like
        Training-set sizes to evaluate.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)

    plt.grid()

    # One-standard-deviation bands around each mean curve.
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                     alpha=0.1, color="g")

    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
def plot_learning_curve(self):
    """Compute the learning curve of ``self.model`` and display it.

    Uses ``self.X_train`` / ``self.y_train`` with ``cv=3`` and
    ``scoring='neg_mean_squared_error'``; the actual drawing is delegated to
    ``self.plot_learning_curve_helper``.
    """
    plt.figure(figsize=(9, 6))
    curve = learning_curve(self.model,
                           X=self.X_train,
                           y=self.y_train,
                           cv=3,
                           scoring='neg_mean_squared_error')
    sizes, fit_scores, held_out_scores = curve
    self.plot_learning_curve_helper(sizes, fit_scores, held_out_scores,
                                    'Learning Curve')
    plt.show()
def test_learning_curve(self):
    """ModelFrame's learning_curve accessor must match ``ms.learning_curve``."""
    digits = datasets.load_digits()
    frame = pdml.ModelFrame(digits)

    actual = frame.learning_curve.learning_curve(frame.naive_bayes.GaussianNB())
    reference = ms.learning_curve(nb.GaussianNB(), digits.data, digits.target)

    self.assertEqual(len(actual), 3)
    # Compare the three returned arrays element by element.
    for position in range(3):
        self.assert_numpy_array_almost_equal(actual[position],
                                             reference[position])
def plot_learning_curve(est, X, y):
    """Add the mean training and test score curves of ``est`` to the plot.

    The curve is computed at 20 training fractions from 0.1 to 1 using a
    shuffled 20-fold ``KFold`` with ``random_state=1``. The training curve is
    dashed, the test curve solid, and both share one colour.
    """
    folds = KFold(20, shuffle=True, random_state=1)
    sizes, fit_scores, held_out_scores = learning_curve(
        est, X, y, train_sizes=np.linspace(.1, 1, 20), cv=folds)

    name = est.__class__.__name__
    dashed = plt.plot(sizes, fit_scores.mean(axis=1), '--',
                      label="training " + name)[0]
    # Reuse the colour of the training line for the test line.
    plt.plot(sizes, held_out_scores.mean(axis=1), '-',
             label="test " + name, c=dashed.get_color())

    plt.xlabel('Training set size')
    plt.ylabel('Score (R^2)')
    plt.ylim(0, 1.1)
def learning_curve(self, graphs, targets, cv=5, n_steps=10,
                   start_fraction=0.1):
    """Return (train_sizes, train_scores, test_scores) for ``self.model``.

    The graphs and targets are shuffled together with ``paired_shuffle``,
    the graphs are converted with ``self.transform``, and the curve is
    scored with 'roc_auc' at ``n_steps`` training fractions evenly spaced
    from ``start_fraction`` to 1.0.

    Note: inside this body the bare name ``learning_curve`` is looked up at
    module level, so it is not a recursive call to this method.
    """
    shuffled_graphs, shuffled_targets = paired_shuffle(graphs, targets)
    features = self.transform(shuffled_graphs)
    fractions = np.linspace(start_fraction, 1.0, n_steps)
    sizes, fit_scores, held_out_scores = learning_curve(
        self.model, features, shuffled_targets,
        cv=cv, train_sizes=fractions, scoring='roc_auc')
    return sizes, fit_scores, held_out_scores
def test_learning_curve_batch_and_incremental_learning_are_equal():
    """Batch and incremental learning curves must agree on sizes and means."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    fractions = np.linspace(0.2, 1.0, 5)
    clf = PassiveAggressiveClassifier(n_iter=1, shuffle=False)

    # Incremental run first, then the batch run, as in the original order.
    inc_sizes, inc_train, inc_test = learning_curve(
        clf, X, y, train_sizes=fractions, cv=3,
        exploit_incremental_learning=True)
    batch_sizes, batch_train, batch_test = learning_curve(
        clf, X, y, cv=3, train_sizes=fractions,
        exploit_incremental_learning=False)

    assert_array_equal(inc_sizes, batch_sizes)
    assert_array_almost_equal(inc_train.mean(axis=1),
                              batch_train.mean(axis=1))
    assert_array_almost_equal(inc_test.mean(axis=1),
                              batch_test.mean(axis=1))
def test_learning_curve_unsupervised():
    """learning_curve with ``y=None`` must give the expected linear ramps."""
    data, _ = make_classification(n_samples=30, n_features=1,
                                  n_informative=1, n_redundant=0,
                                  n_classes=2, n_clusters_per_class=1,
                                  random_state=0)
    mock = MockImprovingEstimator(20)

    sizes, fit_scores, held_out_scores = learning_curve(
        mock, data, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))

    assert_array_equal(sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(fit_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(held_out_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
def main():
    """Print learning-curve scores for a regressor and a classifier.

    Command line: a positional ``data_dir`` and an optional ``--method/-m``
    integer (0-4) selecting the classifier via ``get_classifier``. Data is
    loaded with ``load_data_set``; a ``LinearRegression`` is evaluated on the
    threshold data and the chosen classifier on the silent-classification
    data, both with 5-fold cross-validation at training fractions 0.3, 0.6
    and 1.0. The raw score arrays are printed.

    ``print`` is written as a single-argument call so the function parses and
    prints identically on Python 2 and Python 3.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("data_dir")
    parser.add_argument('--method', '-m', type=int, default=0,
                        choices=range(5),
                        help="""chose methods from: 0:linear_svc 1:logistic regression 2:naive bayes 3:decision tree 4:ExtraTreesClassifier """)
    args = parser.parse_args()

    (silent_feature_vector, threshold_feature_vector, threshold_vector,
     silent_classification_vector) = load_data_set(args.data_dir)

    regr = linear_model.LinearRegression()
    clf = get_classifier(args.method)

    #regr_train_sizes = gene_train_sizes(len(threshold_feature_vector))
    #clf_train_sizes = gene_train_sizes(len(silent_feature_vector))
    regr_train_sizes = [0.3, 0.6, 1.0]
    clf_train_sizes = [0.3, 0.6, 1.0]

    print("cross validation:")
    regr_train_sizes, regr_train_scores, regr_valid_scores = learning_curve(
        regr, threshold_feature_vector, threshold_vector,
        train_sizes=regr_train_sizes, cv=5)
    clf_train_sizes, clf_train_scores, clf_valid_scores = learning_curve(
        clf, silent_feature_vector, silent_classification_vector,
        train_sizes=clf_train_sizes, cv=5)

    print("Thresholding:")
    print(regr_train_scores)
    print(regr_valid_scores)
    print("-" * 20)
    print("Classification:")
    print(clf_train_scores)
    print(clf_valid_scores)
def test_learning_curve_with_boolean_indices():
    """learning_curve with an explicit 3-fold KFold splitter.

    Checks the tick sizes and the per-tick mean train/test scores against
    the expected linear ramps.
    """
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    # The fold count keyword is ``n_splits``, matching the other KFold usage
    # in this file; ``n_folds`` is not accepted by the released splitter API.
    cv = KFold(n_splits=3)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
def ModelLearning(X, y):
    """Plot learning curves of decision-tree regressors of varying depth.

    For each max_depth in (1, 3, 6, 10) the training and validation r2
    scores are computed at nine training-set sizes and drawn, with
    one-standard-deviation bands, in a 2x2 grid of subplots.
    """
    # Ten shuffled splits, each holding out 20% of the data.
    splitter = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)

    # Nine sizes evenly spaced between 1 and (80% of the samples) - 1,
    # rounded to integers.
    size_grid = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int)

    fig = pl.figure(figsize=(10,7))

    # One subplot per tree depth; subplot positions are 1-based.
    for position, depth in enumerate([1,3,6,10], 1):
        tree = DecisionTreeRegressor(max_depth = depth)

        sizes, fit_scores, valid_scores = learning_curve(
            tree, X, y, cv = splitter, train_sizes = size_grid,
            scoring = 'r2')

        # Mean and standard deviation used for the lines and bands.
        fit_std = np.std(fit_scores, axis = 1)
        fit_mean = np.mean(fit_scores, axis = 1)
        valid_std = np.std(valid_scores, axis = 1)
        valid_mean = np.mean(valid_scores, axis = 1)

        ax = fig.add_subplot(2, 2, position)
        ax.plot(sizes, fit_mean, 'o-', color = 'r', label = 'Training Score')
        ax.plot(sizes, valid_mean, 'o-', color = 'g',
                label = 'Validation Score')
        ax.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                        alpha = 0.15, color = 'r')
        ax.fill_between(sizes, valid_mean - valid_std, valid_mean + valid_std,
                        alpha = 0.15, color = 'g')

        ax.set_title('max_depth = %s'%(depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('r2_score')
        ax.set_xlim([0, X.shape[0]*0.8])
        ax.set_ylim([-0.05, 1.05])

    # A single legend, attached to the last subplot, plus overall layout.
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left',
              borderaxespad = 0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize = 16, y = 1.03)
    fig.tight_layout()
    fig.show()
def __calc_learning_curve(self, algorithm):
    """Return the mean learning curve of ``algorithm.estimator``.

    The curve is computed on ``self.data.X`` / ``self.data.y`` with this
    object's ``cv``, ``scoring`` and ``n_jobs`` settings.

    Returns a dict with keys 'x' (training sizes), 'y_train' (mean training
    score per size) and 'y_cv' (mean cross-validation score per size).
    """
    # n_jobs lets the cross-validation runs execute in parallel.
    sizes, fit_scores, cv_scores = learning_curve(
        algorithm.estimator,
        self.data.X,
        self.data.y,
        cv=self.cv,
        scoring=self.scoring,
        n_jobs=self.n_jobs)

    result = {}
    result['x'] = sizes
    result['y_train'] = np.mean(fit_scores, axis=1)
    result['y_cv'] = np.mean(cv_scores, axis=1)
    return result
def test_learning_curve_incremental_learning():
    """Incremental learning curves give the expected ramps, shuffled or not."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    mock = MockIncrementalImprovingEstimator(20)

    for shuffle_train in (False, True):
        sizes, fit_scores, held_out_scores = learning_curve(
            mock, X, y, cv=3, exploit_incremental_learning=True,
            train_sizes=np.linspace(0.1, 1.0, 10), shuffle=shuffle_train)

        assert_array_equal(sizes, np.linspace(2, 20, 10))
        assert_array_almost_equal(fit_scores.mean(axis=1),
                                  np.linspace(1.9, 1.0, 10))
        assert_array_almost_equal(held_out_scores.mean(axis=1),
                                  np.linspace(0.1, 1.0, 10))
def test_learning_curve_implementation():
    """
    Test to ensure that the learning curve results match scikit-learn
    """
    # Unlike the other tests, which use regression data, this one is meant to
    # fail if our implementation diverges from scikit-learn's, so it doubles
    # as a regression test.

    # Reference results from scikit-learn on the digits data set.
    digits = load_digits()
    X, y = digits.data, digits.target

    cv_folds = 10
    random_state = 123456789
    splitter = ShuffleSplit(n_splits=cv_folds, test_size=0.2,
                            random_state=random_state)
    train_sizes = np.linspace(.1, 1.0, 5)
    ref_sizes, ref_train, ref_test = learning_curve(
        MultinomialNB(), X, y, cv=splitter, train_sizes=train_sizes,
        scoring='accuracy')

    # Wrap the same data in a FeatureSet so it can go through the SKLL API:
    # one {feature name: value} dict per row.
    names = ['f{:02}'.format(col) for col in range(X.shape[1])]
    rows = [dict(zip(names, row)) for row in X]
    fs = FeatureSet('train', features=rows, labels=y,
                    ids=list(range(X.shape[0])))

    # Keep every feature, since scikit-learn does no feature filtering.
    learner = Learner('MultinomialNB', min_feature_count=0)
    skll_train, skll_test, skll_sizes = learner.learning_curve(
        fs, cv_folds=cv_folds, train_sizes=train_sizes, metric='accuracy')

    assert np.all(ref_sizes == skll_sizes)
    assert np.allclose(ref_train, skll_train)
    assert np.allclose(ref_test, skll_test)
def plot_learning_curve(mod, X, y, cv, n_jobs, title, ax=None, invert=True):
    '''
    Generates a simple plot of test & training learning curves.

    Inspired from
    https://github.com/cs109/a-2017/blob/master/Sections/Standard/section_9_student.ipynb
    and from lecture/section.

    Inputs:
    -----------------------------------------------------------------
    mod: model for which learning curve must be plotted
    X: predictor data
    y: true labels; flattened to one dimension before use
    cv: cross-validation setting, forwarded to learning_curve
    n_jobs: number of cores (-1 for all available)
    title: title set on the axes
    ax: optional matplotlib Axes object on which to plot; a new 12x7 figure
        is created when omitted
    invert: when true, the y axis is inverted

    Outputs:
    -----------------------------------------------------------------
    None: plotted learning curves
    '''
    plt.style.use('seaborn-whitegrid')

    # Use the caller's y, cv and n_jobs. Previously a global ``y_train`` and
    # hard-coded cv=20 / n_jobs=-1 were used, so the arguments were ignored.
    train_sizes, train_scores, test_scores = learning_curve(
        mod, X=X, y=np.asarray(y).ravel(), cv=cv, n_jobs=n_jobs)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 7))
    if invert:
        ax.invert_yaxis()

    ax.plot(train_sizes, train_scores_mean, 'o-', color='r',
            label='training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g',
            label='test score')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Score')
    ax.set_title(title)
    ax.grid(alpha=0.5)
    sns.despine(bottom=True, left=True)
    ax.legend(loc='best')

    # One-standard-deviation bands around each mean curve.
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1,
                    color="g")
    return None
def test_learning_curve():
    """learning_curve with cv=3: no warnings, right shapes, expected ramps."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    mock = MockImprovingEstimator(20)

    with warnings.catch_warnings(record=True) as caught:
        sizes, fit_scores, held_out_scores = learning_curve(
            mock, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
    # Any recorded warning is treated as a failure.
    if caught:
        raise RuntimeError("Unexpected warning: %r" % caught[0].message)

    expected_shape = (10, 3)
    assert_equal(fit_scores.shape, expected_shape)
    assert_equal(held_out_scores.shape, expected_shape)
    assert_array_equal(sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(fit_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(held_out_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
def test_learning_curve_verbose():
    """With verbose=1, learning_curve must write its tag to stdout."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    mock = MockImprovingEstimator(20)

    # Temporarily swap stdout for an in-memory buffer.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        _sizes, _fit_scores, _held_out_scores = learning_curve(
            mock, X, y, cv=3, verbose=1)
    finally:
        # Always collect the output and restore the real stdout.
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = saved_stdout

    assert "[learning_curve]" in out
def __plot_learning_curve(self, dname=None):
    """Save one learning-curve PNG per algorithm in ``self.algorithms``.

    Each figure shows the mean training and cross-validation scores with
    one-standard-deviation bands. Files are named
    ``learning_curve_<EstimatorClass>.png`` and written into ``dname`` when
    given (the directory is created if missing), otherwise into the current
    working directory.
    """
    for alg in self.algorithms:
        if self.verbose:
            print(' %s' % alg.name)

        estimator = alg.estimator
        cls_name = estimator.__class__.__name__

        # n_jobs lets the cross-validation runs execute in parallel.
        sizes, fit_scores, cv_scores = learning_curve(
            estimator, self.data.X, self.data.y,
            cv=self.cv, scoring=self.scoring, n_jobs=self.n_jobs)

        fit_mean = np.mean(fit_scores, axis=1)
        fit_std = np.std(fit_scores, axis=1)
        cv_mean = np.mean(cv_scores, axis=1)
        cv_std = np.std(cv_scores, axis=1)

        plt.figure()
        plt.title(cls_name)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        plt.grid()

        plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                         alpha=0.1, color="r")
        plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                         alpha=0.1, color="g")
        plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
        plt.plot(sizes, cv_mean, 'o-', color="g",
                 label="Cross-validation score")
        plt.legend(loc="lower right")

        # Decide where the image goes, creating the directory on demand.
        if dname is None:
            target = 'learning_curve_%s.png' % cls_name
        else:
            if not os.path.exists(dname):
                os.mkdir(dname)
            target = '%s/learning_curve_%s.png' % (dname, cls_name)

        plt.savefig(target, bbox_inches='tight', dpi=75)
        plt.close()
def plot_cv_accuracy(classifier, X_train, y_train, cv=10, n_jobs=1):
    """Plot training and validation accuracy against training-set size.

    Parameters
    ----------
    classifier : estimator handed to ``learning_curve``
    X_train, y_train : training data and labels
    cv : cross-validation setting forwarded to ``learning_curve``
        (default 10)
    n_jobs : number of parallel jobs forwarded to ``learning_curve``
        (default 1)

    The curve is evaluated at ten training fractions from 0.1 to 1.0. Mean
    scores are drawn as lines with one-standard-deviation bands, the y axis
    is limited to [0.6, 1.1] and the figure is shown.
    """
    # Forward the caller's cv / n_jobs. They used to be hard-coded to the
    # default values, so passing other values had no effect.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=classifier,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=cv,
        n_jobs=n_jobs)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 5))
    plt.plot(train_sizes, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle="--",
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.6, 1.1])
    plt.show()
def plot_learning_curve(estimator, X, y, train_sizes=np.linspace(.1, 1.0, 5),
                        cv=None, n_jobs=1, ax=None):
    '''
    Plot the learning curve for `estimator`.

    Parameters
    ----------
    estimator : sklearn.Estimator
    X : array-like
    y : array-like
    train_sizes : array-like
        list of floats between 0 and 1
    cv : int
    n_jobs : int
    ax : matplotlib.axes
        Axes to draw on; a new figure and axes are created when omitted.

    Returns
    -------
    ax : the axes that were drawn on
    '''
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    if ax is None:
        _fig, ax = plt.subplots()
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Draw through ``ax`` rather than the pyplot state machine, so that a
    # caller-supplied axes actually receives the curves.
    ax.grid()
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1,
                    color="g")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
            label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
            label="Cross-validation score")
    ax.legend(loc="best")
    return ax
def plot_learning_curve(self, estimator, x_train, y_train, cv, data_label,
                        n_jobs=-1):
    """Plot the learning curve of ``estimator`` and save it as a PNG.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``
    x_train, y_train : training data and labels
    cv : cross-validation setting forwarded to ``learning_curve``
    data_label : str
        Used in the plot title and in the output file name.
    n_jobs : int
        Number of parallel jobs forwarded to ``learning_curve``.

    The current figure is cleared first. The image is written to
    ``self.save_path + data_label + '_learncurve.png'`` (plain string
    concatenation, so ``self.save_path`` must already end with a path
    separator if it names a directory).
    """
    plt.clf()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator, X=x_train, y=y_train, cv=cv, n_jobs=n_jobs)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(train_sizes, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', marker='s',
             markersize=5, linestyle='--', label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')

    plt.grid()
    plt.title("Learning curve: %s" % (data_label))
    plt.xlabel('Number of training samples')
    # Label spelling corrected (was 'Accurancy').
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')

    fn = self.save_path + data_label + '_learncurve.png'
    plt.savefig(fn)
def main():
    """Print f1 learning-curve scores for the selected classifier.

    Command line: a positional ``top_data_dir`` and an optional
    ``--method/-m`` integer (0-4) selecting the classifier via
    ``get_classifier``. The curve is computed with 5-fold cross-validation
    at seven training fractions; the raw score arrays and the per-size
    averages are printed.

    ``print`` is written as a single-argument call so the function parses and
    prints identically on Python 2 and Python 3.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("top_data_dir")
    parser.add_argument('--method', '-m', type=int, default=0,
                        choices=range(5),
                        help="""chose methods from: 0:linear_svc 1:logistic regression 2:naive bayes 3:decision tree 4:ExtraTreesClassifier """)
    args = parser.parse_args()

    training_dataset, testing_dataset = load_data(args.top_data_dir)
    clf = get_classifier(args.method)

    print("cross validation:")
    clf_train_sizes = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    # NOTE(review): the curve is computed on testing_dataset while
    # training_dataset is never used -- confirm this is intended.
    clf_train_sizes, clf_train_scores, clf_valid_scores = learning_curve(
        clf, testing_dataset.X, testing_dataset.y,
        train_sizes=clf_train_sizes, cv=5, scoring='f1')

    print("-" * 20)
    print("Classification:")
    print("Training:")
    print(clf_train_scores)
    print("Validation:")
    print(clf_valid_scores)

    # Average over the scores recorded for each training size.
    print("Average")
    print("Training:")
    for i in clf_train_scores:
        print("%f" % (sum(i) / len(i)))
    print("Validation:")
    for i in clf_valid_scores:
        print("%f" % (sum(i) / len(i)))
def run_nn(X, y):
    """Plot the learning curve of a default MLPClassifier on (X, y).

    Written for the neural-network classification problem of shots made.
    Scores are accuracy from 10-fold cross-validation at 50 training
    fractions between 0.01 and 1.0, using all cores. The average of the mean
    training and test scores is printed, then the curves are drawn with
    one-standard-deviation bands and the figure is shown.

    ``print`` is written as a single-argument call so the function parses and
    prints identically on Python 2 and Python 3.
    """
    # Create CV training and test scores for various training set sizes.
    train_sizes1, train_scores1, test_scores1 = learning_curve(
        MLPClassifier(), X, y,
        # Number of folds in cross-validation
        cv=10,
        # Evaluation metric
        scoring='accuracy',
        # Use all computer cores
        n_jobs=-1,
        # 50 different sizes of the training set
        train_sizes=np.linspace(0.01, 1.0, 50))

    # Create means and standard deviations of training set scores
    train_mean1 = np.mean(train_scores1, axis=1)
    train_std1 = np.std(train_scores1, axis=1)
    # "%s %s" reproduces the space the print statement put between items.
    print("%s %s" % ("Avg. Accuracy Score of Training Set: ",
                     np.mean(train_mean1)))

    # Create means and standard deviations of test set scores
    test_mean1 = np.mean(test_scores1, axis=1)
    test_std1 = np.std(test_scores1, axis=1)
    print("%s %s" % ("Avg. Accuracy Score of Test Set: ",
                     np.mean(test_mean1)))

    # Draw lines
    plt.plot(train_sizes1, train_mean1, '--', color="#111111",
             label="Training score")
    plt.plot(train_sizes1, test_mean1, color="#111111",
             label="Cross-validation score")

    # Draw bands
    plt.fill_between(train_sizes1, train_mean1 - train_std1,
                     train_mean1 + train_std1, color="#DDDDDD")
    plt.fill_between(train_sizes1, test_mean1 - test_std1,
                     test_mean1 + test_std1, color="#DDDDDD")

    # Create plot
    plt.title("Learning Curve for Shot Made Classification Problem Neural Network")
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy Score")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
# Learning curve of a LightGBM classifier on the feature frame `finalfsdf`.
X = finalfsdf
import lightgbm as lgb
params = {
    'max_depth': 10,
    # Key fixed: it was 'n_estimators ' with a trailing space, so the value
    # was never applied to the estimator's n_estimators setting.
    'n_estimators': 10,
    'objective': 'binary',
    'colsample_bytree': 0.8,
    "class_weight": {0: 1, 1: 20},
    # NOTE(review): base_score looks like an XGBoost option rather than a
    # LightGBM one -- confirm it has any effect here.
    "base_score": 0.2,
    "n_jobs": -1,
    "metric": "auc",
    "reg_alpha": 0.4,
    "reg_lambda": 0.18,
}
clf = lgb.LGBMClassifier(**params)

from sklearn.model_selection import learning_curve
# Ten training fractions from 10% to 100%, 5-fold cross-validation.
train_sizes, train_scores, valid_scores = learning_curve(
    clf, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5)
train_sizes

# NOTE(review): [:, 1] keeps a single column of each score array (one CV
# fold) rather than the mean across folds -- confirm this is intended.
lcurveplotdf = pd.DataFrame({"train_size": train_sizes,
                             "train_score": train_scores[:, 1],
                             "valid_score": valid_scores[:, 1]})
ggplot(lcurveplotdf) + \
    geom_line(aes(x="train_size", y="train_score"), color="red") + \
    geom_line(aes(x="train_size", y="valid_score"), color="green")
# Tune a k-nearest-neighbours classifier with a grid search scored by
# accuracy (10-fold CV, all cores), then rebuild it with the best parameters.
clf = KNeighborsClassifier()
grid_object = GridSearchCV(estimator=clf, param_grid=param_dict, scoring='accuracy', cv=10, n_jobs=-1)
grid_object.fit(X_train, y_train)
best_params = grid_object.best_params_
print(best_params)
opt_clf = KNeighborsClassifier(**best_params)

# Learning Curve Plots
# Evaluate the tuned classifier at 100 training fractions from 0.1 to 1.0.
train_sizes, train_scores, validation_scores = learning_curve(
    opt_clf, X_train, y_train, train_sizes=np.linspace(0.1, 1.0, 100), n_jobs=-1)
# Average the scores along axis 1 to get one value per training size.
av_train_scores = np.mean(train_scores, axis=1)
av_validation_scores = np.mean(validation_scores, axis=1)

# LC Plot
plt.plot(train_sizes, av_train_scores, label='train scores')
plt.plot(train_sizes, av_validation_scores, label='validation scores')
plt.title("Learning Curve")
plt.xlabel("Training Examples")
plt.ylabel("Scores")
plt.ylim([0.60, 1.02])
plt.legend()
plt.show()

# Validation Curves
model.fit(x_train, y_train, verbose=1) #, callbacks=[tb]) # accuracy # train_sizes, train_scores_model, test_scores_model = \ # learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10), # scoring="accuracy", cv=8, shuffle=True, random_state=42, verbose=1) # train_scores_mean = np.mean(train_scores_model, axis=1) # train_scores_std = np.std(train_scores_model, axis=1) # test_scores_mean = np.mean(test_scores_model, axis=1) # test_scores_std = np.std(test_scores_model, axis=1) # log loss train_sizes, train_scores_model, test_scores_model = \ learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10), scoring='neg_log_loss', cv=8, shuffle=True, random_state=42) # accuracy # plt.plot(train_sizes, train_scores_mean, 'o-', color="r", # label="Training score") # plt.plot(train_sizes, test_scores_mean, 'o-', color="g", # label="validation score") # log loss plt.plot(train_sizes, -train_scores_model.mean(1), 'o-', color="r", label="log_loss") plt.plot(train_sizes, -test_scores_model.mean(1), 'o-', color="g", label="val log_loss") plt.xlabel("Train size") plt.ylabel("Log loss") # plt.ylabel("Accuracy") plt.title('lgbm')
def function_plot_learning_curve(estimator, features, target, train_sizes, cv, title):
    """Plot training and cross-validation error against training-set size.

    The estimator is evaluated with ``learning_curve`` using the
    ``'neg_mean_squared_error'`` scorer; the per-size mean scores are negated
    so the plotted curves show the mean squared error itself. A shaded band of
    one standard deviation (across CV splits) is drawn around each curve.

    Parameters
    ----------
    estimator : estimator instance
        Passed unchanged to ``learning_curve``.
    features : array-like
        Input data, passed as ``X`` to ``learning_curve``.
    target : array-like
        Target values, passed as ``y`` to ``learning_curve``.
    train_sizes : array-like
        Relative or absolute training-set sizes to evaluate.
    cv : int, cross-validation generator or iterable
        Cross-validation strategy, passed unchanged to ``learning_curve``.
    title : str
        Title of the chart.

    Returns
    -------
    module
        The ``plt`` module, so the caller can show or save the current figure.
    """
    _, axes = plt.subplots(figsize=(8, 5))

    axes.set_title(title)
    axes.set_xlabel("Training examples")
    # The scorer below is mean *squared* error, so the axis is labelled MSE
    # (it previously read "MAE", which did not match the plotted quantity).
    axes.set_ylabel("MSE")

    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, features, target, train_sizes=train_sizes, cv=cv,
        scoring='neg_mean_squared_error')

    # Scores are negated errors; flip the sign to plot positive error values.
    train_scores_mean = -train_scores.mean(axis=1)
    test_scores_mean = -validation_scores.mean(axis=1)
    # The standard deviation is unaffected by the sign flip.
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_std = np.std(validation_scores, axis=1)

    # Plot learning curve
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1,
                      color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")
    axes.legend(loc="lower left")

    return plt
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring='accuracy'):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves. When None, a new 1x3 figure is
        created.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted on the first panel,
        e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    scoring : str or callable, default='accuracy'
        Scorer forwarded to ``learning_curve``. The default keeps the previous
        hard-coded behaviour; pass a regression scorer (for example
        ``'neg_mean_squared_error'``) when ``estimator`` is a regressor.

    Returns
    -------
    module
        The ``plt`` module, so the caller can show or save the figure.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # return_times=True adds fit and score times; score times are not plotted.
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True,
                       scoring=scoring)
    # Aggregate across CV splits (axis 1) for every training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt
def runMLP():
    """Train an MLP classifier, print test metrics and plot its curves.

    Reads the module-level ``trainData``, ``trainLabels`` and ``data``
    (``data.instanceAttriTest`` / ``data.instanceLabelTest``). The features are
    optionally standardised, an ``MLPClassifier`` is fitted, a learning curve
    is computed with 5-fold cross-validation, and weighted precision, accuracy
    and the confusion matrix on the test set are printed. Finally a figure with
    three stacked panels (loss curve, training scores, validation scores) is
    shown.

    Returns
    -------
    None
    """
    # Solver: "sgd" (alternatives: "lbfgs", "adam").
    # We use Stochastic Gradient Descent to be able to get learning errors, and
    # Adam is more suited for larger datasets. Lbfgs would be better than sgd
    # but does not provide error information on learning.

    # Activation: "relu" (alternatives: "identity", "logistic", "tanh").
    # We disregard the tanh activation function: it ranges from -1 to 1,
    # mapping negative inputs distinctly towards -1 and inputs near 0 close to
    # 0. That behaviour makes it well suited for classification between two
    # classes, thus not our choice. For much the same reason we disregard the
    # logistic (sigmoid) activation function.
    # Relu goes from 0 to infinity and can suffer from 'dying' when the input
    # includes negative values. Not a great concern for us. It has the
    # advantage of not suffering from the vanishing gradient problem (a benefit
    # over sigmoid and tanh, although it would be more relevant if our model
    # included a greater number of layers). Setting all negative inputs to 0
    # also helps produce a sparser model, silencing low-importance neurons.

    # L2 regularisation strength. NOTE(review): an earlier comment called this
    # the default, but scikit-learn's documented default is 0.0001 - confirm
    # which value is intended. Could be optimised further with trial and error.
    alpha = 0.001

    # Learning rate schedule: "adaptive" (alternatives: "constant",
    # "invscaling"), a perk of using a gradient descent algorithm.
    learning_rate_init = 0.001

    # To avoid needless learning we cap training at 200 iterations.
    max_iters = 200

    hidden_layer_sizes = (50, 10)

    applyStandardization = True

    mlpTrainData = trainData
    mlpTestData = data.instanceAttriTest

    if applyStandardization:
        # Fit the scaler on the training data only and reuse it for the test
        # data, so no test-set statistics leak into training.
        scaler = StandardScaler()
        scaler.fit(mlpTrainData)

        mlpTrainData = scaler.transform(mlpTrainData)
        mlpTestData = scaler.transform(mlpTestData)

    mlp = MLPClassifier(solver="sgd",
                        activation="relu",
                        alpha=alpha,
                        learning_rate="adaptive",
                        max_iter=max_iters,
                        learning_rate_init=learning_rate_init,
                        hidden_layer_sizes=hidden_layer_sizes)

    mlp.fit(mlpTrainData, trainLabels)

    print(mlp.loss_curve_)

    # error_score='raise' surfaces any fitting failure instead of recording a
    # placeholder score. The scorer is negated MSE, so values are <= 0.
    t_sizes, t_scores, valid_scores = learning_curve(
        mlp,
        mlpTrainData,
        trainLabels,
        train_sizes=np.linspace(0.1, 1.0, 5),
        cv=5,
        scoring='neg_mean_squared_error',
        error_score='raise')

    prediction = mlp.predict(mlpTestData)

    print(
        f"Precision: {precision_score(data.instanceLabelTest, prediction, average='weighted')}"
    )
    print(f"Accuracy: {accuracy_score(data.instanceLabelTest, prediction)}")
    print(
        f"confusion Matrix:\n {confusion_matrix(data.instanceLabelTest, prediction)}\n"
    )

    fig, axs = plt.subplots(3)
    axs[0].plot(np.arange(0, mlp.n_iter_, 1), mlp.loss_curve_)
    axs[0].set_title("Loss Curve")
    axs[1].plot(t_sizes, t_scores.mean(axis=1))
    axs[1].set_title("Training Scores")
    axs[2].plot(t_sizes, valid_scores.mean(axis=1))
    axs[2].set_title("Validation Scores")

    # Label both score panels explicitly. The pyplot-level xlabel/ylabel calls
    # used before only reached the last subplot, and the legend call had no
    # labelled curves to show.
    for score_axis in axs[1:]:
        score_axis.set_xlabel('Set Size')
        score_axis.set_ylabel('Error/Score')

    fig.tight_layout()

    plt.show()
def eval_lc(model, x, y, train_sizes):
    """Compute a 5-fold learning curve for ``model`` and print its scores.

    Parameters
    ----------
    model : estimator instance
        Passed unchanged to ``sm.learning_curve``.
    x, y : array-like
        Input data and targets.
    train_sizes : array-like
        Training-set sizes to evaluate.

    Returns
    -------
    tuple
        ``(train_sizes, train_scores, test_scores)`` exactly as returned by
        ``sm.learning_curve``.
    """
    curve = sm.learning_curve(model, x, y, train_sizes=train_sizes, cv=5)
    sizes, fit_scores, held_out_scores = curve

    # Print training scores first, then test scores.
    for score_table in (fit_scores, held_out_scores):
        print(score_table)

    return sizes, fit_scores, held_out_scores
# `.item()` unwraps the single Python object stored in the .npy file. Loading
# such an object array requires allow_pickle=True on current NumPy (the default
# is False and raises ValueError).
# NOTE: unpickling can execute arbitrary code - only load trusted files.
info = np.load(op.join(cfg.path_data, 'info_allch.npy'),
               allow_pickle=True).item()
picks = mne.pick_types(info, meg=meg)

fname = op.join(cfg.path_outputs, 'covs_allch_oas.h5')
covs = mne.externals.h5io.read_hdf5(fname)
# Keep only entries that carry a subject id, and restrict each covariance
# stack to the picked channels along both of its last two axes.
subjects = [d['subject'] for d in covs if 'subject' in d]
covs = [d['covs'][:, picks][:, :, picks] for d in covs if 'subject' in d]
X = np.array(covs)
n_sub, n_fb, n_ch, _ = X.shape

# Ages, selected by subject id in the same order as X.
part = pd.read_csv(op.join(cfg.path_data, 'participants.csv'))
y = part.set_index('Observations').age.loc[subjects]

common = ProjCommonSpace(scale=scale, n_compo=n_compo, reg=reg)
riemann = Riemann(n_fb=n_fb, metric=metric)
sc = StandardScaler()
ridge = RidgeCV(alphas=np.logspace(-3, 5, 100))
cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
pipe = make_pipeline(common, riemann, sc, ridge)

train_sizes = np.linspace(0.1, 1, 5)
# Scores are negated mean absolute errors (higher is better).
train_sizes, train_scores, test_scores = learning_curve(
    pipe, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
    scoring='neg_mean_absolute_error')
scores = {'train_sizes': train_sizes,
          'train_scores': train_scores,
          'test_scores': test_scores}
# The dict is stored as a pickled object; read it back with
# np.load(..., allow_pickle=True).item().
np.save(op.join(cfg.path_outputs, 'all_scores_learning_curves.npy'), scores)
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, scoring=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic panels for ``estimator``.

    Panel 0 shows the training and cross-validation score against the number
    of training examples, panel 1 the fit time against the number of training
    examples, and panel 2 the cross-validation score against the fit time.
    Shaded bands mark one standard deviation across CV splits.

    Parameters
    ----------
    estimator : estimator instance
        Passed unchanged to ``learning_curve``.
    title : str
        Title of the first panel.
    X, y : array-like
        Input data and targets.
    axes : indexable of three Axes, default=None
        Axes to draw on; a new 1x3 figure is created when None.
    ylim : tuple, default=None
        ``(ymin, ymax)`` applied to the first panel when given.
    cv, n_jobs, scoring, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``plt`` module.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("")
    score_ax.set_ylabel("Score")

    sizes, fit_scores, cv_scores, fit_durations, _ = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring,
        train_sizes=train_sizes, return_times=True)

    # Per-size mean and spread across the CV splits.
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)
    time_mean = np.mean(fit_durations, axis=1)
    time_std = np.std(fit_durations, axis=1)

    # Panel 0: learning curve (bands first, then the lines).
    score_ax.grid()
    score_ax.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                          alpha=0.1, color="r")
    score_ax.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                          alpha=0.1, color="g")
    score_ax.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    score_ax.plot(sizes, cv_mean, 'o-', color="g",
                  label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 1: number of samples vs fit time.
    time_ax = axes[1]
    time_ax.grid()
    time_ax.plot(sizes, time_mean, 'o-')
    time_ax.fill_between(sizes, time_mean - time_std, time_mean + time_std,
                         alpha=0.1)
    time_ax.set_xlabel("")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 2: fit time vs cross-validation score.
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(time_mean, cv_mean, 'o-')
    perf_ax.fill_between(time_mean, cv_mean - cv_std, cv_mean + cv_std,
                         alpha=0.1)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt
plt.ylim(-.65, .0) plt.xlim(.5 * train_sizes.min(), train_sizes.max()) plt.xticks((100, 1000), ('100', '1000'), size=13) plt.yticks(()) plt.ylabel('Error') plt.xlabel('Number of samples ') plt.subplots_adjust(left=.07, bottom=.22, top=.99, right=.99) plt.savefig(name, edgecolor='none', facecolor='none') # Degree 9 model = make_pipeline(PolynomialFeatures(degree=9), LinearRegression()) train_sizes, train_scores, test_scores = model_selection.learning_curve( model, X, y, cv=model_selection.ShuffleSplit(n_splits=20), train_sizes=np.logspace(-2.5, -.3, 30)) idx_to_plot = [0, 7, 19, 29] for i in idx_to_plot: n_train = train_sizes[i] if i > 0: symbol_train = '--' symbol_test = '' else: symbol_train = 'o' symbol_test = 'o' plt.figure(figsize=(4.5, 3)) test_plot = plt.semilogx(train_sizes[:i + 1],
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5), verbose=0):
    '''
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds. Specific
        cross-validation objects can be passed as well. ``None`` uses the
        default of ``learning_curve``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Relative or absolute training-set sizes to evaluate
        (default: np.linspace(0.1, 1.0, 5)).

    verbose : integer, optional
        Verbosity level forwarded to ``learning_curve`` (default 0).

    Returns
    -------
    module
        The ``plt`` module, so the caller can show or save the figure.
    '''
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # `verbose` used to be accepted but silently dropped; forward it so the
    # caller's setting takes effect (the default 0 keeps the old behaviour).
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        verbose=verbose)
    # Aggregate across CV splits (axis 1) for every training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
# average=None yields one F1 value per class rather than a single aggregate.
f1Score = f1_score(y_test, y_pred, average=None)
print('\n\n\n\n', ' f1 score is : ', f1Score)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve

# Coming up with training sizes (absolute sample counts)
train_sizes = [1, 50, 100, 150, 200, 250, 300, 4000, 8000]
# `shuffle` must be a real boolean: the string 'True' only worked by being
# truthy and is rejected by parameter validation in recent scikit-learn.
train_sizes, training_scores, test_scores = learning_curve(
    DecisionTreeClassifier(criterion='gini',
                           max_depth=1,
                           min_samples_split=2,
                           min_weight_fraction_leaf=0.0,
                           splitter='best'),
    X,
    y,
    train_sizes=train_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
    shuffle=True)

print('\n\n\nTraining scores:\n\n', training_scores)
print('\n', '-' * 70)  # separator to make the output easy to read
print('\nValidation scores:\n\n', test_scores)

# The scorer returns negated errors; flip the sign to report positive values.
training_scores_mean = -training_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)
print('\n\nMean training scores\n\n',
      pd.Series(training_scores_mean, index=train_sizes))
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0,
                        plot=True):
    """
    Compute, and optionally draw, the learning curve of a model on the data.

    Parameters
    ----------
    estimator : the classifier to evaluate.
    title : title of the chart.
    X : input features (numpy array).
    y : input target vector.
    ylim : tuple ``(ymin, ymax)`` setting the lowest and highest points of the
        y-axis.
    cv : cross-validation setting, forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs (default 1).
    train_sizes : training-set sizes to evaluate, forwarded to
        ``learning_curve``.
    verbose : verbosity level, forwarded to ``learning_curve``.
    plot : when true, draw and show the figure.

    Returns
    -------
    midpoint, diff : for the largest training size, the midpoint of and the
        gap between (training mean + std) and (cross-validation mean - std).
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        verbose=verbose)

    # Per-size mean and spread across the CV splits.
    fit_mean = np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    cv_mean = np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                         alpha=0.1, color="b")
        plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                         alpha=0.1, color="r")
        plt.plot(sizes, fit_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(sizes, cv_mean, 'o-', color="r", label=u"交叉验证集上得分")

        plt.legend(loc="best")

        plt.draw()
        # Second inversion: flips the y-axis again before the figure is shown.
        plt.gca().invert_yaxis()
        plt.show()

    # Band edges at the largest training size.
    fit_upper = fit_mean[-1] + fit_std[-1]
    cv_lower = cv_mean[-1] - cv_std[-1]
    midpoint = (fit_upper + cv_lower) / 2
    diff = fit_upper - cv_lower
    return midpoint, diff
test_size=0.2, random_state=0) # Cross validation cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0) # # Generate C param C = np.arange(1, 10) # # LEARNING CURVE SCORE # # Create three different models based on max_depth for k, C in enumerate(C): # Create a Decision tree regressor at max_depth = depth regressor = LogisticRegression(C=C) # Calculate the training and testing scores sizes, train_scores, test_scores = learning_curve( regressor, X, y, cv=cv, n_jobs=4, scoring=make_scorer(accuracy_score)) print('C:', C) print('score train:', np.mean(train_scores)) print('score test:', np.mean(test_scores)) # MODEL COMPLEX SCORE # Calculate the training and testing scores C = np.arange(1, 10) regressor = LogisticRegression() train_scores, test_scores = validation_curve( regressor, X, y, cv=cv, param_name='C',
# Weighted precision/recall/F-score/support; only the first two entries
# (precision, recall) are reported below.
more_scores = precision_recall_fscore_support(y_test,
                                              y_pred_test,
                                              average='weighted')
print('Precision: ', more_scores[0])
print('Recall: ', more_scores[1])

# Define a ShuffleSplit CV with 10 splits, each holding out 11 % of the
# training set (train_temp) for validation.
# 11 %, not 10 %, because the validation split is being used instead of the test split.
cv = ShuffleSplit(n_splits=10, test_size=0.11, random_state=0)

# Compute learning curves over the 10 splits, for ten training-set fractions
# from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=svm,
    X=X_train_temp_centered,
    y=y_train_temp,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv,
    n_jobs=1)
# Mean and standard deviation across the splits for every training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig = plt.figure()
plt.plot(train_sizes,
         train_mean,
         color='tab:blue',
         marker='o',
         markersize=5,
         label='training accuracy')
def plot_learning_curve(estimator, x, y, file_name=None, **estimator_info):
    '''
    Plot the learning curve of an estimator configured with a specific set of
    parameters, scored with ``ks_scorer`` under 5-fold cross-validation.

    estimator: unfitted estimator with its parameters already set
    x: numpy array of shape (N, K), with N samples and K features
    y: target values for the N samples
    file_name: file to save the plot to; if None, the plot is shown in a window
    estimator_info: details about the estimator and parameter configuration,
        rendered into the chart title via ``dict_to_string``
    return: None
    '''
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, x, y, cv=5, scoring=ks_scorer, n_jobs=6)

    plt.figure()
    plt.title(dict_to_string(**estimator_info))
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # (mean, std, colour, legend label) for the training and CV series.
    series = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    )

    plt.grid()
    # Bands first, then the lines, matching the original drawing order.
    for mean, std, colour, _ in series:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, legend_text in series:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_text)

    plt.legend(loc="best")

    if file_name is None:
        plt.show()
    else:
        plt.savefig(file_name)
    plt.close()
features, labels, param_name='svc__gamma', param_range=param_range) plt.title("Validation Curve with SVM") plt.xlabel("gamma") plt.ylabel("Score") plt.plot(param_range, validation_scores.mean(axis=1), label='cross-validation') plt.plot(param_range, train_scores.mean(axis=1), label='training') plt.legend(loc='best') plt.show() plt.figure() train_sizes, train_scores, validation_scores = learning_curve( tree_model, features, labels, train_sizes=np.logspace(-1, 0, 20)) plt.xlabel('Trainging Examples') plt.ylabel('Score') plt.title('Learning Curve') plt.plot(train_sizes, validation_scores.mean(axis=1), label='cross-validation') plt.plot(train_sizes, train_scores.mean(axis=1), label='training') plt.legend(loc='best') plt.show() fpr = dict() tpr = dict() roc_auc = dict() for i in range(0, len(classes)): fpr[i], tpr[i], _ = roc_curve(te_lab[:, i], y_p[:, i])
# Raw string so the backslashes of the Windows path are taken literally (the
# previous literal relied on the invalid escape sequence "\S").
data = pd.read_excel(r'D:\SVM\test_all.xlsx')
# Drop the first column; the next 53 columns are features, column 53 the label.
pre_data = data.iloc[0:, 1:]
X = pre_data.iloc[:, :53]
y = pre_data.iloc[:, 53]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.75)
clf_rbf = svm.SVC(kernel='rbf')
clf_rbf.fit(x_train, y_train.ravel())
print('rbf_train:%.2f' % clf_rbf.score(x_train, y_train))
print('rbf_test:%.2f' % clf_rbf.score(x_test, y_test))

# Plot the learning curve
# Shuffle the full data set before handing it to learning_curve.
X_shuffle, y_shuffle = shuffle(X, y)
plt.figure(figsize=(7, 5))
train_sizes, train_scores, test_scores = learning_curve(clf_rbf, X_shuffle, y_shuffle)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
# The legend labels show the scores of the classifier fitted above on the
# train/test split, not values taken from the learning curve itself.
plt.plot(train_sizes, train_scores_mean, 'o-', color='red',
         label='Train_Score = %0.2f' % clf_rbf.score(x_train, y_train))
plt.plot(train_sizes, test_scores_mean, 'o-', color='blue',
         label='Test_Score = %0.2f' % clf_rbf.score(x_test, y_test))
plt.xlim([0.0, 200.0])
plt.ylim([0.5, 1.2])
plt.legend(loc="lower right")
plt.title("Learning Curve")

# Plot the ROC curve
# NOTE(review): the F1 value computed here is discarded - print or store it
# if it is meant to be reported.
metrics.f1_score(y_test, clf_rbf.predict(x_test))
# Label value 2 is treated as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(y_test, clf_rbf.decision_function(x_test), pos_label=2)
roc_auc = metrics.auc(fpr, tpr)
def plot_learing_curve(pipeline, title):
    """Fit ``pipeline`` on the training news data and show its learning curve.

    Uses ``dataprep.train_news["Statement"]`` as input and
    ``dataprep.train_news["Label"]`` as target. The pipeline is fitted on the
    full data, then ``learning_curve`` is evaluated at five training-set
    fractions from 10% to 100%. Mean scores are plotted with a band of one
    standard deviation across CV splits, and the figure is shown.

    Parameters
    ----------
    pipeline : estimator instance
        Estimator (pipeline) to fit and evaluate.
    title : str
        Title of the chart.

    Returns
    -------
    None
    """
    size = 10000
    # NOTE(review): if KFold is sklearn.model_selection.KFold, the first
    # positional argument is n_splits, i.e. this requests 10000 folds. The
    # legacy cross_validation API took the sample count here - confirm intent.
    cv = KFold(size, shuffle=True)

    X = dataprep.train_news["Statement"]
    y = dataprep.train_news["Label"]

    pl = pipeline
    pl.fit(X, y)

    train_sizes, train_scores, test_scores = learning_curve(
        pl,
        X,
        y,
        n_jobs=-1,
        cv=cv,
        train_sizes=np.linspace(.1, 1.0, 5),
        verbose=0)

    # Aggregate across CV splits (axis 1) for every training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # NOTE(review): the explicit plt.ylim(-.1, 1.1) below sets bottom < top
    # again, which appears to undo this inversion - confirm which is wanted.
    plt.gca().invert_yaxis()

    # box-like grid
    plt.grid()

    # plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")

    # plot the average training and test score lines at each training set size
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    # The legend is built from curves that already exist, so it has to come
    # after the plot calls; called before them it was empty.
    plt.legend(loc="best")

    # sizes the window for readability and displays the plot
    # shows error from 0 to 1.1
    plt.ylim(-.1, 1.1)
    plt.show()
def plot_learning_curve2(estimator, fn, title, X, y, ylim=None, cv=None,
                         n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Save a learning-curve plot and a fit-time scalability plot to disk.

    Two image files are written relative to the current directory: ``fn``
    (training and validation score vs training-set size, with one-standard-
    deviation bands) and ``fn + "_scale"`` (mean fit time vs training-set
    size).

    Parameters
    ----------
    estimator : estimator instance
        Passed unchanged to ``learning_curve``.
    fn : str
        Base file name for the saved figures.
    title : str
        Title of the learning-curve chart.
    X, y : array-like
        Input data and targets.
    ylim : tuple, optional
        ``(ymin, ymax)`` for the learning-curve chart.
    cv : int, cross-validation generator or iterable, optional
        Forwarded to ``learning_curve``.
    n_jobs : int, optional
        Number of parallel jobs, forwarded to ``learning_curve`` (default 1).
    train_sizes : array-like, optional
        Training-set sizes to evaluate.

    Returns
    -------
    tuple
        ``(train_scores, train_sizes, test_scores, fit_times)`` - note that
        the scores come before the sizes.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # Honour the caller's n_jobs; it used to be overridden by a hard-coded 1
    # (the default is still 1, so default callers see no change).
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True)
    # Aggregate across CV splits (axis 1) for every training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training Score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Validation Score")

    plt.legend(loc="best")
    plt.savefig("./%s" % fn)
    plt.close()

    # timing
    time_mean = np.mean(fit_times, axis=1)

    # Draw lines (on a fresh implicit figure, since the first one was closed)
    plt.plot(train_sizes, time_mean, label="Fit Time")

    # Create plot
    plt.title("Scalability (w/ regards to time)")
    plt.xlabel("Training Set Size")
    plt.ylabel("Time")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig("./%s_scale" % fn)
    plt.close()
    return train_scores, train_sizes, test_scores, fit_times
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot the training and cross-validation learning curve.

    The plot is meant for judging under-fitting versus over-fitting: mean
    scores are drawn against the number of training examples, with a shaded
    band of one standard deviation across CV splits.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation splitting strategy, forwarded to ``learning_curve``:
        None for its default, an integer number of folds, a CV splitter, or
        an iterable yielding (train, test) index arrays.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel; ``-1`` means using all processors.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative (float, within (0, 1]) or absolute (int) numbers of training
        examples used to generate the learning curve.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    module
        The ``plt`` module, so the caller can show or save the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for the training and CV series.
    series = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    )

    plt.grid()
    # Bands first, then the lines, matching the original drawing order.
    for mean, std, colour, _ in series:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, legend_text in series:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_text)

    plt.legend(loc="best")
    return plt
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw the training and cross-validation learning curves on a new figure.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        Model passed to ``learning_curve``; it is cloned for each
        validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Minimum and maximum y values plotted; left automatic when None.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation strategy, forwarded unchanged to
        ``learning_curve`` (None, an integer number of folds, a CV
        splitter, or an iterable yielding (train, test) index arrays).

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel, forwarded to ``learning_curve``.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Training-set sizes used for the curve: floats are fractions of the
        maximum training size (within (0, 1]), other dtypes are absolute
        sample counts. (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    plt
        The ``plt`` object used for drawing.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    tick_sizes, scores_train, scores_cv = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Per-tick mean and standard deviation across the CV folds (axis 1).
    summaries = [(np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1))
                 for fold_scores in (scores_train, scores_cv)]
    colours = ("r", "g")
    labels = ("Training score", "Cross-validation score")

    plt.grid()

    # Shaded +/- one standard deviation bands.
    for (centre, spread), colour in zip(summaries, colours):
        lower = centre - spread
        upper = centre + spread
        plt.fill_between(tick_sizes, lower, upper, alpha=0.1, color=colour)

    # Mean curves with markers.
    for (centre, _), colour, label in zip(summaries, colours, labels):
        plt.plot(tick_sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt
def plot_learning_curve(clf, X, y, title='Learning Curve', cv=None,
                        train_sizes=None, n_jobs=1, ax=None):
    """Plot the train and test learning curves of a classifier on an axes.

    Args:
        clf: Classifier instance that implements ``fit`` and ``predict``
            methods.

        X (array-like, shape (n_samples, n_features)):
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y (array-like, shape (n_samples) or (n_samples, n_features)):
            Target relative to X for classification or regression;
            None for unsupervised learning.

        title (string, optional): Title of the generated plot. Defaults to
            "Learning Curve"

        cv (int, cross-validation generator, iterable, optional):
            Cross-validation strategy, forwarded unchanged to
            ``learning_curve``: None for that function's default, an
            integer number of folds, a cross-validation generator, or an
            iterable yielding train/test splits.

        train_sizes (iterable, optional): Training sizes at which the curve
            is evaluated. If None, ``np.linspace(.1, 1.0, 5)`` is used.

        n_jobs (int, optional): Number of jobs to run in parallel. Defaults
            to 1.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
            plot the learning curve. If None, the plot is drawn on a new set
            of axes.

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
            drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> rf = RandomForestClassifier()
        >>> skplt.plot_learning_curve(rf, X, y)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_learning_curve.png
           :align: center
           :alt: Learning Curve
    """
    # Create a fresh single-axes figure only when the caller supplied none.
    if ax is None:
        _, ax = plt.subplots(1, 1)

    if train_sizes is None:
        train_sizes = np.linspace(.1, 1.0, 5)

    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        clf, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean over folds, std over folds, colour, legend label) per curve.
    series = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    )

    ax.grid()
    for mean, std, colour, _ in series:
        ax.fill_between(sizes, mean - std, mean + std,
                        alpha=0.1, color=colour)
    for mean, _, colour, label in series:
        ax.plot(sizes, mean, 'o-', color=colour, label=label)
    ax.legend(loc="best")

    return ax
plt.fill_between(max_depth2, train_mean2 - train_std2, \ train_mean2 + train_std2, alpha = 0.15, color = 'r') plt.fill_between(max_depth2, test_mean2 - test_std2, \ test_mean2 + test_std2, alpha = 0.15, color = 'g') # Visual aesthetics plt.legend(loc = 'lower right') plt.xlabel('Maximum Depth') plt.ylabel('Score') plt.ylim([-0.05,1.05]) plt.show() """ regressor = DecisionTreeRegressor(max_depth=depth) sizes, train_score, test_score = learning_curve(regressor, features, price, train_sizes=train_sizes, cv=cv) train_std = np.std(train_score, axis=1) train_mean = np.mean(train_score, axis=1) test_std = np.std(test_score, axis=1) test_mean = np.mean(test_score, axis=1) ax = fig.add_subplot(2, 2, K + 1) ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score') ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score') ax.fill_between(sizes, train_mean - train_std, \ train_mean + train_std, alpha = 0.15, color = 'r') ax.fill_between(sizes, test_mean - test_std, \ test_mean + test_std, alpha = 0.15, color = 'g')
# Report the outcome of the preceding grid search.
print(
    "\n\nBest Accuracy Score %f\n Best Parameters %s\n Best Splits %i" %
    (gridResults.best_score_, gridResults.best_params_,
     gridResults.n_splits_))

from sklearn.model_selection import learning_curve

#Coming up with training sizes
# Absolute sample counts (ints), not fractions of the training set.
train_sizes = [100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
#Features=['Mean of the integrated profile',' Standard deviation of the integrated profile',' Excess kurtosis of the integrated profile',' Skewness of the integrated profile',' Mean of the DM-SNR curve',' Standard deviation of the DM-SNR curve',' Excess kurtosis of the DM-SNR curve',' Skewness of the DM-SNR curve']
#target='target_class'

# `shuffle` must be a real boolean: the string 'True' only worked because any
# non-empty string is truthy, and it is rejected outright by scikit-learn
# releases that validate parameter types.
# NOTE(review): no random_state is passed, so the shuffling presumably differs
# from run to run -- confirm whether reproducibility matters here.
train_sizes, training_scores, test_scores = learning_curve(
    SVC(kernel='rbf', random_state=0, degree=1, shrinking=True, gamma='auto'),
    X,
    y,
    train_sizes=train_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
    shuffle=True)

print('Training scores:\n\n', training_scores)
print('\n', '-' * 70)  # separator to make the output easy to read
print('\nValidation scores:\n\n', test_scores)

# The scorer returns *negated* MSE, so flip the sign to report plain MSE,
# averaged over the CV folds (axis 1) for each training size.
training_scores_mean = -training_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

print('Mean training scores\n\n',
      pd.Series(training_scores_mean, index=train_sizes))
print('\n', '-' * 20)  # separator
print('\nMean test scores\n\n',
      pd.Series(test_scores_mean, index=train_sizes))
label="%s (test)" % name) plt.xscale("log") plt.yscale("log") plt.xlabel("Train size") plt.ylabel("Time (seconds)") plt.title('Execution Time') plt.legend(loc="best") # Visualize learning curves plt.figure() svr = SVR(kernel='rbf', C=1e1, gamma=0.1) kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1) train_sizes, train_scores_svr, test_scores_svr = \ learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10), scoring="neg_mean_squared_error", cv=10) train_sizes_abs, train_scores_kr, test_scores_kr = \ learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10), scoring="neg_mean_squared_error", cv=10) plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r", label="SVR") plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g", label="KRR") plt.xlabel("Train size") plt.ylabel("Mean Squared Error") plt.title('Learning curves') plt.legend(loc="best") plt.show()
import numpy as np from sklearn.model_selection import learning_curve from sklearn.metrics import make_scorer from sklearn.metrics import matthews_corrcoef import time start = time.time() train_sizes_lc_lgbm, train_scores_lc_lgbm, test_scores_lc_lgbm = learning_curve( estimator = classifier_lgbm, X = X, y = Y, train_sizes = np.linspace(0.1, 1.0, 20), cv = 10, scoring = make_scorer(matthews_corrcoef), shuffle = True, random_state = 42 ) end = time.time() print("Tempo de Execução: {:.2f} min".format((end - start)/60)) #Tempo de Execução: 1058.66 min train_mean_lc_lgbm = np.mean(train_scores_lc_lgbm, axis = 1) train_std_lc_lgbm = np.std(train_scores_lc_lgbm, axis = 1) test_mean_lc_lgbm = np.mean(test_scores_lc_lgbm, axis = 1) test_std_lc_lgbm = np.std(test_scores_lc_lgbm, axis = 1) plt.figure(figsize = (14, 7)) plt.plot( train_sizes_lc_lgbm, train_mean_lc_lgbm,
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the training learning curve of an estimator.

    Only the training score is drawn (mean over the CV folds with a shaded
    band of one standard deviation). The cross-validation scores returned
    by ``learning_curve`` are not plotted by this variant.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation strategy, forwarded unchanged to
        ``learning_curve`` (None for its default, an integer number of
        folds, a cross-validation generator, or an iterable yielding
        train/test splits).

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``
        (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    plt
        The ``plt`` object the figure was drawn with.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # The validation scores are still computed by learning_curve but are
    # deliberately discarded: this variant draws the training curve only.
    train_sizes, train_scores, _ = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate across CV folds (axis 1) for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)

    plt.grid()
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")

    plt.legend(loc="best")
    return plt
def plot_learning_curve(
    estimator,
    X,
    y,
    ylim=None,
    cv=None,
    # Leave one core free, but never fall to 0: on a single-CPU machine
    # ``cpu_count() - 1`` is 0, which joblib rejects as meaningless.
    n_jobs=max(1, multiprocessing.cpu_count() - 1),
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None,
    title="Learning Curve",
):
    """
    Generate a simple plot of the test and training learning curve and
    display it with ``plt.show()``.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation strategy, forwarded unchanged to
        ``learning_curve`` (None for its default, an integer number of
        folds, a cross-validation generator, or an iterable yielding
        train/test splits).

    n_jobs : int or None, optional
        Number of jobs to run in parallel. Defaults to one less than the
        number of CPUs, with a minimum of 1. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used
        to generate the learning curve. If the dtype is float, it is
        regarded as a fraction of the maximum size of the training set,
        i.e. it has to be within (0, 1]. Otherwise it is interpreted as
        absolute sizes of the training sets.
        (default: np.linspace(0.1, 1.0, 5))

    scoring : optional
        Scoring specification forwarded unchanged to ``learning_curve``.

    title : string, optional
        Title for the chart (default "Learning Curve").

    Returns
    -------
    None
        The figure is shown, not returned.
    """
    # learning curves in scikit learn
    # https://devdocs.io/scikit_learn/modules/generated/sklearn.model_selection.learning_curve#sklearn.model_selection.learning_curve
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes, cv=cv, n_jobs=n_jobs,
        scoring=scoring
    )

    # https://devdocs.io/scikit_learn/auto_examples/model_selection/plot_learning_curve#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)

    # Aggregate across CV folds (axis 1) for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded +/- one standard deviation bands.
    plt.fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="#00AAAA",
    )
    plt.fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="#AA00AA",
    )

    # Mean curves.
    plt.plot(
        train_sizes, train_scores_mean, "o-", color="#00AAAA",
        label="Training score"
    )
    plt.plot(
        train_sizes,
        test_scores_mean,
        "o-",
        color="#AA00AA",
        label="Cross-validation score",
    )

    plt.legend(loc="best")
    plt.show()
def createLearningCurve(estimator, x, y, cv=None, name="", keras=False,
                        is_estimator=True, scores=None):
    """Plot a learning curve and save it under ``./results/``.

    The image is written to ``./results/<name>_learning_curve.png``. Three
    modes are supported:

    * ``keras`` truthy: ``estimator`` is treated as an already-run training
      history; ``estimator.history['acc']`` and
      ``estimator.history['val_acc']`` are plotted on the current figure.
    * ``is_estimator`` truthy: the curve is computed with
      ``learning_curve(estimator, x, y, cv=cv)`` at five training sizes from
      10% to 100%.
    * otherwise: the curve is built from precomputed ``scores``.

    Parameters
    ----------
    estimator : object
        Estimator to evaluate, or a history-like object when ``keras`` is
        truthy.
    x, y : array-like
        Data and targets; only used in the ``is_estimator`` mode.
    cv : optional
        Cross-validation strategy forwarded to ``learning_curve``.
    name : str, optional
        Prefix of the output file name.
    keras : bool, optional
        Select the keras-history mode.
    is_estimator : bool, optional
        Select between computing the curve (True) and using ``scores``.
    scores : sequence, optional
        One entry per run. For each run, index 1 holds the train scores and
        index 3 the test scores, one value per training size; the training
        sizes are read from ``scores[0][0]``. Every run must provide as
        many values as the first run.

    Raises
    ------
    ValueError
        If ``is_estimator`` is false and ``scores`` is None.
    """
    fname = "./results/" + name + '_learning_curve.png'

    # if the estimator is the keras model, training already ran: just plot
    # the recorded history and stop.
    if keras:
        plt.plot(estimator.history['acc'])
        plt.plot(estimator.history['val_acc'])
        plt.savefig(fname)
        return

    if is_estimator:
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, x, y, cv=cv, train_sizes=np.linspace(.1, 1.0, 5))
    else:
        if scores is None:
            raise ValueError(
                "scores must be provided when is_estimator is False")
        train_sizes = scores[0][0]
        # Transpose run-major scores into size-major rows so that axis 1
        # runs over the repeated runs, matching learning_curve's layout.
        train_scores = [[run[1][j] for run in scores]
                        for j in range(len(scores[0][1]))]
        test_scores = [[run[3][j] for run in scores]
                       for j in range(len(scores[0][3]))]

    # create plots
    _, plots = plt.subplots(figsize=(20, 5))
    plots.set_xlabel("Training examples")
    plots.set_ylabel("Score")

    # Mean and spread across folds/runs for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plots.fill_between(train_sizes,
                       train_scores_mean - train_scores_std,
                       train_scores_mean + train_scores_std,
                       alpha=0.1, color="r")
    plots.fill_between(train_sizes,
                       test_scores_mean - test_scores_std,
                       test_scores_mean + test_scores_std,
                       alpha=0.1, color="g")
    plots.plot(train_sizes, train_scores_mean, 'o-', color="r",
               label="Training score")
    plots.plot(train_sizes, test_scores_mean, 'o-', color="g",
               label="Cross validation score")

    # The legend must be attached before saving, otherwise the written image
    # does not contain it.
    plots.legend(loc="best")
    plt.savefig(fname)
''' ## Adjust parameters = {#'n_estimators': [10,20,30,40,50,60,70,80,90,100,120,140,160,180,200]} #'min_samples_leaf': [ 3, 10,20,30,40,50,60,70,80,90,100,120,140,160,180,200,300,400,500]} #'alpha': [0.1, 0.3, 0.6, 0.9] #'max_features':[30,32,34,36,38,40,42,44,46,48,50] 'max_depth':[i for i in range(10,200)] } # 定义要优化的参数信息 model_gs = GridSearchCV(estimator=vr, param_grid=parameters, cv=10) model_gs.fit(X_train,y_train) print(model_gs.best_params_, model_gs.best_score_) ''' # learning_curve train_sizes, train_scores, test_scores = learning_curve(estimator=gbr, X=X_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy') plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue') plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='test accuracy') plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green') plt.grid() plt.xlabel('Number of training samples') plt.ylabel('Accuracy') plt.legend(loc='lower right') plt.ylim([0.01, 1.0])
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training size.

    Each curve is the per-size mean over the CV folds; the shaded region
    around it covers one standard deviation above and below.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        Model passed to ``learning_curve``; it is cloned for each
        validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation strategy, handed to ``learning_curve`` as-is
        (None for its default, an integer number of folds, a
        cross-validation generator, or an iterable yielding train/test
        splits).

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes handed to ``learning_curve``
        (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    plt
        The ``plt`` object the figure was drawn with.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    n_examples, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Fold-wise statistics (axis 1 indexes the CV folds).
    fit_mu = np.mean(fit_scores, axis=1)
    fit_sigma = np.std(fit_scores, axis=1)
    val_mu = np.mean(val_scores, axis=1)
    val_sigma = np.std(val_scores, axis=1)

    plt.grid()

    band_style = {"alpha": 0.1}
    plt.fill_between(n_examples, fit_mu - fit_sigma, fit_mu + fit_sigma,
                     color="r", **band_style)
    plt.fill_between(n_examples, val_mu - val_sigma, val_mu + val_sigma,
                     color="g", **band_style)

    plt.plot(n_examples, fit_mu, 'o-', color="r", label="Training score")
    plt.plot(n_examples, val_mu, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
#dataset = datasets.load_diabetes()
# fit a model to the data
import pandas as pd

mydata = pd.read_csv('winequality-red.csv')
# `dataset` is the same DataFrame object; `.target` and `.data` are attached
# to it as plain attributes to mimic the sklearn dataset layout.
dataset = mydata
# The csv must have a header row, with the label column named "quality".
dataset.target = mydata["quality"]
# Select all but the last column as data. `.iloc` replaces the `.ix` indexer,
# which was removed from pandas; the slice is positional either way.
# NOTE(review): this assumes "quality" is the last column -- confirm against
# the csv, otherwise the label leaks into the features.
dataset.data = mydata.iloc[:, :-1]

model = ensemble.AdaBoostClassifier()
model.fit(dataset.data, dataset.target)
print(model)

# make predictions (on the training data itself)
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))

# Learning curve at absolute training-set sizes, 5-fold cross-validation.
train_sizes, train_scores, valid_scores = learning_curve(
    model, dataset.data, dataset.target,
    train_sizes=[100, 200, 300, 400, 500, 600, 700, 800], cv=5)
print(train_scores)
print(valid_scores)