def shuffle_split():
    """Print the partition sizes ``ShuffleSplit`` produces on the iris data.

    Three fraction-based configurations (both sizes, train only, test only)
    are run with three splits each, printing ``len(train), len(test)`` per
    split followed by a blank line. A fourth configuration uses absolute
    counts (100 train / 50 test); for it the size and first ten indices of
    every test fold are printed, then the sorted list of all test indices
    gathered across the folds.
    """
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target

    fractional_configs = (
        {"train_size": 0.6, "test_size": 0.4},
        {"train_size": 0.6},
        {"test_size": 0.4},
    )
    for config in fractional_configs:
        splitter = model_selection.ShuffleSplit(n_splits=3, **config)
        for train_index, test_index in splitter.split(features, labels):
            print(len(train_index), len(test_index))
        print()

    # Absolute sample counts instead of fractions.
    counted = model_selection.ShuffleSplit(train_size=100, test_size=50,
                                           n_splits=3)
    total = []
    for _, test_index in counted.split(features, labels):
        print(len(test_index), test_index[:10])
        total.extend(test_index)
    print(sorted(total))
def et1(train2, y, test2, v, z):
    """Fit seeded ExtraTrees classifiers on shuffle splits and store predictions.

    For each of three seeds (13, 14, 15) a five-split ``ShuffleSplit`` is
    created; on every split an ``ExtraTreesClassifier`` is fitted on the
    training rows, its class-1 probabilities for the validation rows are
    passed through ``pconvert`` and added to ``v``, and its converted
    probabilities for ``test2`` are added to ``z``. Log-loss per split is
    printed and collected.

    Parameters
    ----------
    train2, y : indexable by integer index arrays
        Training features and labels.
    test2 : array-like
        Features to predict for every fitted model.
    v, z : DataFrame-like
        Receive a column named after this function (``"et1"``); ``v`` is
        updated through ``v.loc[ival, cname]``.

    Returns
    -------
    None
        Results are written into ``v`` and ``z`` in place.
    """
    # The column name is read from the current frame, so this lookup has to
    # stay directly inside this function.
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    scores = []
    num_seeds = 3
    num_splits = 5
    base_seed = 13
    # A splitter is built per seed inside the loop; the extra unseeded one
    # that used to be created here was never used and has been dropped.
    for seed in range(base_seed, base_seed + num_seeds):
        ss = model_selection.ShuffleSplit(n_splits=num_splits,
                                          random_state=seed)
        for n, (itrain, ival) in enumerate(ss.split(train2, y)):
            reg = ensemble.ExtraTreesClassifier(max_depth=15,
                                                random_state=seed,
                                                n_estimators=2500,
                                                n_jobs=-2)
            reg.fit(train2[itrain], y[itrain])
            p = reg.predict_proba(train2[ival])[:, 1]
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            print(cname, 'seed %d step %d: ' % (seed, n + 1), score, now())
            scores.append(score)
            z[cname] += pconvert(reg.predict_proba(test2)[:, 1])
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    # Every fitted model contributed once to z.
    z[cname] /= num_splits * num_seeds
    # NOTE(review): v is divided by the number of seeds although how often a
    # row lands in a validation fold depends on the random splits - confirm
    # this normalisation is intended.
    v[cname] /= num_seeds
def test_classifier(clf):
    """Cross-validate ``clf`` on the module-level ``X`` and ``y`` and print the result.

    Ten shuffle splits with 20% of the rows held out (seed 0) are scored via
    ``cross_val_score``; the mean and standard deviation of the fold scores
    are printed under the label "Accuracy".
    """
    splitter = ms.ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    fold_scores = ms.cross_val_score(clf, X, y, cv=splitter)
    summary = (fold_scores.mean(), fold_scores.std())
    print("Accuracy: %0.4f (+/- %0.2f)" % summary)
def apply_svm_cross_validation(X, y, svc_args=None):
    """Cross-validate an ``SGDClassifier`` and return mean test metrics.

    Parameters
    ----------
    X, y
        Features and labels handed to ``cross_validate``.
    svc_args : dict, optional
        Keyword arguments for ``linear_model.SGDClassifier``. When omitted,
        the defaults below are used (hinge loss, elastic-net penalty,
        ``alpha=0.001``, ``max_iter=1000``, ``tol=1e-3``, seed 123456, no
        class weights).

    Returns
    -------
    list
        ``[mean test precision, mean test recall, mean test f1]`` over ten
        shuffle splits with 10% of the rows held out.
    """
    # A fresh dict per call replaces the former mutable default argument.
    if svc_args is None:
        svc_args = {
            'loss': 'hinge',
            'penalty': 'elasticnet',
            'max_iter': 1000,
            'alpha': 0.001,
            'tol': 1e-3,
            'random_state': 123456,
            'class_weight': None,
        }
    clf = linear_model.SGDClassifier(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1,
                                      random_state=123456)
    scores = model_selection.cross_validate(
        clf, X, y, cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True)
    print(scores)
    return [
        np.mean(scores['test_precision']),
        np.mean(scores['test_recall']),
        np.mean(scores['test_f1']),
    ]
def __call__(self, table):
    """Return ``(other_indices, sample_indices)`` for a random sample of ``table``.

    * ``self.replace`` set: draw ``self.n`` row indices with replacement;
      the first element holds the indices that were never drawn.
    * ``self.n == len(table)``: the sample is a shuffled ``arange`` and the
      first element is an empty integer array.
    * otherwise: the first split of a shuffle splitter with ``self.n`` test
      rows is returned as produced by the splitter. A stratified splitter is
      used when ``self.stratified`` is set and the domain has a discrete
      class; its test size is raised to at least the number of class values.
    """
    n_rows = len(table)

    if self.replace:
        # pylint: disable=no-member
        rng = np.random.RandomState(self.random_state)
        drawn = rng.randint(0, n_rows, self.n)
        untouched = np.ones(n_rows, dtype=bool)
        untouched[drawn] = False
        return np.flatnonzero(untouched), drawn

    if self.n == n_rows:
        rng = np.random.RandomState(self.random_state)
        order = np.arange(self.n)
        rng.shuffle(order)
        return np.array([], dtype=int), order

    if self.stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), self.n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1, test_size=test_size,
            train_size=n_rows - test_size,
            random_state=self.random_state)
        splitter.get_n_splits(table.X, table.Y)
        splits = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1, test_size=self.n,
                                    random_state=self.random_state)
        splitter.get_n_splits(table)
        splits = splitter.split(table)
    return next(iter(splits))
def cv_models(self):
    """Cross-validate every model in ``self.models`` and record RMSE statistics.

    Each model is scored with three 70/30 shuffle splits (seed 43) using
    negative mean squared error; the square root of the negated scores is
    printed together with the elapsed time, and its mean and standard
    deviation are written to ``self.MLA`` at the model's position
    (columns ``CVScoreMean`` and ``CVScoreSTD``). ``self.MLA`` is printed at
    the end.
    """
    nfold = 3
    cv_split = model_selection.ShuffleSplit(n_splits=nfold, test_size=0.3,
                                            train_size=0.7, random_state=43)
    for index, (model_name, model) in enumerate(self.models.items()):
        # cross_val_score runs with several worker processes to speed this up
        print(" ---> Work on CV for %s " % model_name)
        start = time.time()
        neg_mse = cross_val_score(model, self.train_x.values, self.train_y,
                                  scoring='neg_mean_squared_error',
                                  cv=cv_split, n_jobs=5)
        rmse = np.sqrt(-neg_mse)
        print(rmse)
        end = time.time()
        print(" time spent: ", end - start)
        self.MLA.loc[index, 'CVScoreMean'] = rmse.mean()
        self.MLA.loc[index, 'CVScoreSTD'] = rmse.std()
    print(self.MLA)
def compare_classifiers(X, y):
    """Cross-validate every entry of the module-level ``classifiers`` list.

    Each classifier is evaluated on ten shuffle splits that use 60% of the
    rows for training and 30% for evaluation (seed 0).

    Parameters
    ----------
    X, y
        Features and labels handed to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with its class name, mean train score, mean
        and 3x standard deviation of the dev (test) score and mean fit time,
        sorted by mean dev score in descending order. The frame is empty
        (with the same columns) when ``classifiers`` is empty.
    """
    # Fixed column list: the frame can be built even when there are no rows,
    # instead of reading the keys of a first row that may not exist.
    columns = [
        'Name',
        'Train Accuracy Mean',
        'Dev Accuracy Mean',
        'Dev Accuracy 3*STD',
        'Time',
    ]
    shuffle_split_class = model_selection.ShuffleSplit(n_splits=10,
                                                       test_size=0.3,
                                                       train_size=0.6,
                                                       random_state=0)
    rows = []
    for classifier in classifiers:
        cross_validation = model_selection.cross_validate(
            classifier, X, y, cv=shuffle_split_class,
            return_train_score=True)
        rows.append({
            'Name': classifier.__class__.__name__,
            'Train Accuracy Mean': cross_validation['train_score'].mean(),
            'Dev Accuracy Mean': cross_validation['test_score'].mean(),
            'Dev Accuracy 3*STD': cross_validation['test_score'].std() * 3,
            'Time': cross_validation['fit_time'].mean(),
        })
    classifier_comparison = pd.DataFrame(rows, columns=columns)
    classifier_comparison.sort_values(by=['Dev Accuracy Mean'],
                                      ascending=False, inplace=True)
    return classifier_comparison
def predict_age(dataset):
    """Predict age for the rows of ``dataset`` whose ``Age`` is missing.

    A grid search over ``XGBRegressor`` hyper-parameters is fitted on the
    rows with a known age, using ``Pclass``, ``SibSp``, ``Parch`` and
    ``Fare`` as features; the best estimator then predicts the rows whose
    age is null.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``Pclass``, ``SibSp``, ``Parch``, ``Fare``
        and ``Age``.

    Returns
    -------
    The predictions returned by the best estimator's ``predict`` for the
    rows with a missing age.
    """
    data_p = dataset[['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']]
    age_missing = data_p['Age'].isnull()
    # ``axis`` is passed by keyword: the positional form is not accepted by
    # recent pandas releases.
    x_train = data_p.loc[~age_missing, :].drop('Age', axis=1)
    y_train = data_p.loc[~age_missing, 'Age']
    x_test = data_p.loc[age_missing, :].drop('Age', axis=1)
    print('初步处理完')
    param_grid = {
        'learning_rate': [.001, .005, .01, .05, .1],
        'max_depth': [2, 4, 6, 8],
        'n_estimators': [50, 100, 300, 500, 1000],
        'seed': [2018],
    }
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3,
                                            train_size=.6, random_state=0)
    tune_model = model_selection.GridSearchCV(
        XGBRegressor(nthread=-1),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv_split)
    print('model tuned')
    tune_model.fit(x_train, y_train)
    print('model fitted')
    print(tune_model.best_params_)
    y_test = tune_model.best_estimator_.predict(x_test)
    print('model predicted')
    # Slice instead of ``.head(5)``: a NumPy array returned by ``predict``
    # has no ``head`` method, while slicing works for arrays and Series.
    print(y_test[:5])
    return y_test
def apply_logreg_cross_validation_coeff(X, y, svc_args=None):
    """Cross-validate a ``LogisticRegression`` and return metrics plus mean coefficients.

    Parameters
    ----------
    X, y
        Features and labels handed to ``cross_validate``.
    svc_args : dict, optional
        Keyword arguments for ``linear_model.LogisticRegression``. When
        omitted, the defaults below are used (L2 penalty, ``C=1.0``, lbfgs
        solver, ``max_iter=1000``, seed 123456, verbose output).

    Returns
    -------
    list
        ``[mean test precision, mean test recall, mean test f1,
        mean of coef_[0] across the fitted fold estimators]``.
    """
    # A fresh dict per call replaces the former mutable default argument.
    if svc_args is None:
        svc_args = {
            'penalty': 'l2',
            'C': 1.0,
            'random_state': 123456,
            'multi_class': "auto",
            'class_weight': None,
            'solver': "lbfgs",
            'max_iter': 1000,
            'verbose': 1,
        }
    clf = linear_model.LogisticRegression(**svc_args)
    # Three splits; the original annotates this as the setting "for l1" and
    # keeps a ten-split variant (commented out) "for l2".
    cv = model_selection.ShuffleSplit(n_splits=3, test_size=0.1,
                                      random_state=123456)
    scores = model_selection.cross_validate(
        clf, X, y, cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True,
        return_estimator=True)
    print(scores)
    return [
        np.mean(scores['test_precision']),
        np.mean(scores['test_recall']),
        np.mean(scores['test_f1']),
        np.mean([model.coef_[0] for model in scores['estimator']], axis=0),
    ]
def identical_distribution_split(target, n_splits=3, train_size=0.7,
                                 test_size=0.3, cos_theta_lim=0.7):
    """Draw shuffle splits until both sides of every split look alike.

    A new ``ShuffleSplit`` is drawn repeatedly; for each of its splits the
    two index sets select values from ``target``, both selections are
    normalised separately with ``data_normalization`` and compared with
    ``data_similarity.hist_similarity``. Drawing stops as soon as the
    smallest similarity of a round is not below ``cos_theta_lim``.

    Parameters
    ----------
    target : array indexable by integer index arrays
    n_splits, train_size, test_size
        Forwarded to ``model_selection.ShuffleSplit``.
    cos_theta_lim : float
        Minimum similarity every split of the accepted round must reach.

    Returns
    -------
    tuple
        ``(cv, index, cos_theta_list)``: the accepted splitter, a flat list
        ``[idx1, idx2, idx1, idx2, ...]`` of its index arrays, and the
        similarity of each split.

    Notes
    -----
    The loop has no retry limit, so it does not terminate if the threshold
    can never be met.
    """
    # Always draw at least once: previously a non-positive ``cos_theta_lim``
    # skipped the loop entirely and left ``cv`` and ``index`` undefined.
    while True:
        cos_theta_list = []
        index = []
        cv = model_selection.ShuffleSplit(n_splits=n_splits,
                                          train_size=train_size,
                                          test_size=test_size)
        for index1, index2 in cv.split(target):
            # The test set must be normalised separately from the
            # training/validation sets; never normalise them together.
            dataset1 = target[index1][:, np.newaxis]
            dataset2 = target[index2][:, np.newaxis]
            cos_theta, _, _ = data_similarity.hist_similarity(
                data_normalization(dataset1), data_normalization(dataset2))
            cos_theta_list.append(cos_theta)
            index.extend([index1, index2])
        # Same stop test as the original ``while min(...) < limit`` loop.
        if not min(cos_theta_list) < cos_theta_lim:
            break
    return cv, index, cos_theta_list
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a dataset from input data table that are not in the sample. Also
    uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample.
        NumPy floating scalars are treated like Python floats.
        If int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.

    Returns a tuple ``(sample, rest)`` of two selections from ``table``.
    """
    # isinstance also accepts float subclasses and NumPy floating scalars,
    # which the former exact type comparison let through as raw fractions.
    if isinstance(n, (float, np.floating)):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.RandomState(random_state)
        picked = rgen.randint(0, len(table), n)
        # Rows that were never drawn form the second returned selection.
        untouched = np.ones(len(table))
        untouched[picked] = 0
        others = np.nonzero(untouched)[0]
        return table[picked], table[others]

    # The splitters are configured through the size of the complement.
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        # At least one test instance per class value.
        test_size = max(len(table.domain.class_var.values), n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state,
        )
        splitter.get_n_splits(table.X, table.Y)
        ind = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1, test_size=n,
                                    random_state=random_state)
        splitter.get_n_splits(table)
        ind = splitter.split(table)
    ind = next(ind)
    return table[ind[0]], table[ind[1]]
def gridsearch_params(MLA_compare, X_train, y_train, top):
    """Grid-search the first ``top`` classifiers listed in ``MLA_compare``.

    For each of the first ``top`` entries of the ``'MLA Name'`` column, the
    estimator ``MLA[name]`` is tuned with the grid ``grid_param[index]``
    (``index`` being the row label in ``MLA_compare``) on five 80/20 shuffle
    splits scored by ROC AUC. Runtime information is printed along the way.

    Returns a dict with keys ``'cls'`` (the classifier names), ``'param'``
    (each ``MLA`` estimator after ``set_params`` with its best parameters)
    and ``'score'`` (the best cross-validated scores).
    """
    names = MLA_compare['MLA Name']
    best_classifiers = names.values[:top]
    best_cls_ind = names.index[:top]
    cv_split = model_selection.ShuffleSplit(n_splits=5, test_size=.2,
                                            train_size=.8, random_state=39)
    tuned_estimators = []
    best_scores = []
    best_params_dict = {
        'cls': best_classifiers,
        'param': tuned_estimators,
        'score': best_scores,
    }
    start_total = time()
    for ind, clf in zip(best_cls_ind, best_classifiers):
        start = time()
        best_search = model_selection.GridSearchCV(estimator=MLA[clf],
                                                   param_grid=grid_param[ind],
                                                   cv=cv_split,
                                                   scoring='roc_auc',
                                                   n_jobs=-1)
        best_search.fit(X_train, y_train)
        run = time() - start
        best_param = best_search.best_params_
        # set_params updates the estimator stored in MLA and returns it.
        tuned_estimators.append(MLA[clf].set_params(**best_param))
        best_scores.append(best_search.best_score_)
        print(f'{clf}\nBest Parameters: {best_param}\nRuntime: {run:.2f} seconds.')
        print('-' * 10)
    run_total = time() - start_total
    print(f'Total optimization time was {(run_total / 60):.2f} minutes.')
    return best_params_dict
def keras_common(train3, y, test3, v, z, num_splits, cname, build_model, seed = 1234):
    """Train a Keras model on shuffle splits and accumulate its predictions.

    For each of ``num_splits`` shuffle splits (validation fraction
    ``1/num_splits``, splitter seed 11) a fresh model from ``build_model()``
    is fitted, weights are reloaded from ``model_path``, and predictions
    passed through ``pconvert`` are added to ``v[cname]`` (validation rows)
    and ``z[cname]`` (all rows of ``test3``). Log-loss per fold is printed
    and collected; ``z[cname]`` is finally divided by ``num_splits``.

    Parameters
    ----------
    train3, y : indexable by integer index arrays
        Training features and labels.
    test3 : model input used for the test predictions.
    v, z : DataFrame-like objects receiving the column ``cname``.
    num_splits : number of shuffle splits.
    cname : column name and prefix of the temporary weights file.
    build_model : zero-argument callable returning a new model.
    seed : value passed to ``np.random.seed``.
    """
    v[cname], z[cname] = 0, 0
    np.random.seed(seed)
    # Print the architecture once using a throw-away model instance.
    build_model().summary(line_length=120)
    model_path = '../data/working/' + cname + '_keras_model.h5'
    ss = model_selection.ShuffleSplit(n_splits=num_splits, random_state=11, test_size=1/num_splits)
    scores = list()
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        # NOTE(review): presumably the callbacks stop training early and
        # checkpoint weights to model_path - confirm in
        # build_keras_fit_callbacks.
        model.fit(
            xtrain, ytrain,
            batch_size = 128,
            epochs=10000,
            validation_data=(xval, yval),
            verbose=0,
            callbacks=build_keras_fit_callbacks(model_path),
            shuffle=True
        )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: '%(n+1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3)).ravel()
        # Drop the model and collect every GC generation before the next fold.
        del model
        for i in range(3): gc.collect(i)
    # Remove the temporary weights file.
    os.remove(model_path)
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    # Average the accumulated test predictions over the folds.
    z[cname] /= num_splits
def _cross_validate_individual_classifier(self, classifier, test_size_ratio):
    """Cross-validate ``classifier`` on the stored data with a given test ratio.

    Uses ten shuffle splits seeded with ``RANDOM_SEED`` and returns the
    result of ``cross_validate`` (accuracy scoring, train scores included).
    """
    splitter = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=test_size_ratio,
                                            random_state=RANDOM_SEED)
    return model_selection.cross_validate(
        classifier,
        self._all_predictor_variables,
        self._all_response_variables,
        scoring="accuracy",
        cv=splitter,
        return_train_score=True,
        verbose=1,
    )
def fit(self, X, y):
    """Standardise ``X`` and grid-search an ``ElasticNet`` over a fixed grid.

    The fitted scaler is stored as ``self.standardizer`` and the fitted
    ``GridSearchCV`` as ``self.clf``. Candidates are compared on five
    shuffle splits with 20% of the rows held out (seed 0). The parameter
    grid, the best parameters and the best score are printed.
    """
    # Single-argument print(...) calls produce the same output under
    # Python 2 and are valid syntax under Python 3.
    print("Fitting a restricted ElasticNetCV regressor...")
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.2,
                                      random_state=0)
    alpha_range = [
        0.005, 0.007, 0.002, 0.0025, 0.004, 0.003,
        0.0035877427142009029, 0.01, 0.001
    ]
    param_grid = [
        dict(alpha=alpha_range,
             l1_ratio=[.1, .2, .25, .3, .35, .4, .5, .6, .65, .7, .8],
             normalize=[True],
             max_iter=[10000]),
    ]
    print("Using param grid " + str(param_grid))
    self.clf = model_selection.GridSearchCV(linear_model.ElasticNet(),
                                            param_grid=param_grid,
                                            cv=cv,
                                            n_jobs=12)
    self.clf.fit(X, y)
    print("Best params: " + str(self.clf.best_params_) +
          " and corresponding score is " + str(self.clf.best_score_))
def run_classification(K, C):
    """Evaluate a precomputed-kernel SVC on ten random 50/50 splits.

    ``K`` is indexed as a square kernel matrix and ``C`` holds the labels.
    For every split the classifier is fitted on the train-by-train block and
    predicts from the test-by-train block; the average accuracy, precision
    and recall over the splits are written to standard error.
    """
    clf = svm.SVC(kernel='precomputed', probability=True)
    ss = model_selection.ShuffleSplit(n_splits=10, test_size=0.5)
    accuracy_scores, precision_scores, recall_scores = [], [], []

    for train, test in ss.split(K):
        # The test kernel needs the values between all training vectors
        # (columns) and all test vectors (rows).
        kTrain = K[train][:, train]
        kTest = K[test][:, train]
        clf.fit(kTrain, C[train])
        prediction = clf.predict(kTest)
        expected = C[test]
        for bucket, metric in ((accuracy_scores, accuracy_score),
                               (precision_scores, precision_score),
                               (recall_scores, recall_score)):
            bucket.append(metric(expected, prediction))

    for label, values in (("Average accuracy: ", accuracy_scores),
                          ("Average precision:", precision_scores),
                          ("Average recall: ", recall_scores)):
        print(label, sum(values) / len(values), file=sys.stderr)
def getAndScoreVotingEnsemble(self, trainingDataFrame, predictorColumns,
                              labelColumn, votingMethod="hard"):
    """Cross-validate and fit a voting ensemble built from ``self.MLA``.

    Parameters
    ----------
    trainingDataFrame : DataFrame-like, indexed by column name
    predictorColumns : columns used as model inputs
    labelColumn : column holding the labels
    votingMethod : str
        Passed to ``VotingClassifier`` as ``voting``.

    The ensemble is evaluated on ten 70/30 shuffle splits (seed 0) and the
    mean train score, mean test score and 3x test standard deviation are
    printed as percentages. Nothing is returned.
    """
    trainingInputs = trainingDataFrame[predictorColumns]
    trainingLabels = trainingDataFrame[labelColumn]
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3,
                                            train_size=.7, random_state=0)
    voter = ensemble.VotingClassifier(estimators=self.MLA,
                                      voting=votingMethod)
    # Train scores are read below, so they must be requested explicitly;
    # without the flag the 'train_score' key is not guaranteed to exist.
    voter_cv = model_selection.cross_validate(voter, trainingInputs,
                                              trainingLabels, cv=cv_split,
                                              return_train_score=True)
    voter.fit(trainingDataFrame[predictorColumns],
              trainingDataFrame[labelColumn])
    print("{} Voting Training mean Score: {:.2f}".format(
        votingMethod, voter_cv['train_score'].mean() * 100))
    print("{} Voting Test mean Score: {:.2f}".format(
        votingMethod, voter_cv['test_score'].mean() * 100))
    print("{} Voting Test Score 3*std: +/- {:.2f}".format(
        votingMethod, voter_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
def nationality_trainer_v1():
    """Score a Naive Bayes nationality-vs-ordinary-word classifier.

    Reads ``nationalities.txt`` (positive examples) and ``words.txt``
    (negatives; a random selection as large as the nationality list), encodes
    every word as a fixed-width row of integer n-gram codes and prints the
    ``cross_val_score`` result of a ``MultinomialNB`` on five shuffle splits
    with 30% of the rows held out.

    Row layout (72 columns): columns 0-24 are reserved for single letters
    and stay zero (that encoding is disabled), columns 25-48 hold up to 24
    two-letter codes, columns 49-71 hold up to 23 three-letter codes. A code
    is the concatenation of the decimal code points of its letters. Words
    longer than 25 characters are truncated to the n-grams that fit.
    """
    # Load nationality list
    with codecs.open("nationalities.txt", "r", "utf-8") as f:
        nationalities = [line.strip().lower() for line in f.readlines()]
    # Load words list
    with codecs.open("words.txt", "r", "utf-8") as f:
        words = [line.strip().lower() for line in f.readlines()]
    random.shuffle(words)
    wordstrain = words[:len(nationalities)]

    # Nationalities are the positive class, ordinary words the negative one.
    y = [1] * len(nationalities) + [0] * len(wordstrain)
    totallist = nationalities + wordstrain

    single_width, bigram_width, trigram_width = 25, 24, 23
    bigram_offset = single_width
    trigram_offset = single_width + bigram_width
    longest = single_width + bigram_width + trigram_width
    X = np.zeros((len(totallist), longest), dtype=np.int64)

    for i, word in enumerate(totallist):
        # Capping the n-gram count keeps long words inside their own column
        # range; previously they overran the row and raised IndexError.
        # two letters
        for j in range(min(len(word) - 1, bigram_width)):
            letter1 = ord(word[j])
            letter2 = ord(word[j + 1])
            X[i, bigram_offset + j] = int("{0}{1}".format(letter1, letter2))
        # three letters
        for j in range(min(len(word) - 2, trigram_width)):
            letter1 = ord(word[j])
            letter2 = ord(word[j + 1])
            letter3 = ord(word[j + 2])
            X[i, trigram_offset + j] = int(
                "{0}{1}{2}".format(letter1, letter2, letter3))

    # Load Naive Bayes
    clf = naive_bayes.MultinomialNB()
    # Use shuffle split for N way splitting
    cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.3)
    # score the classifier
    print(model_selection.cross_val_score(clf, X, y, cv=cv))
def quick_fitted_tree(X, Y, model_type=('GridSearch', 'FeatureSelection'),
                      test_split=None, random_state=None):
    """Fit a decision tree, optionally with feature selection and grid search.

    Parameters
    ----------
    X, Y : array-like with ``copy()``; ``X`` must support ``X[:, mask]``
        when feature selection is requested.
    model_type : collection of str or None
        May contain ``'FeatureSelection'`` (RFECV-based column selection)
        and/or ``'GridSearch'`` (tuning of criterion and depth). ``None``
        or an empty collection fits a plain tree.
    test_split : float, optional
        When a float, that fraction is held out via ``train_test_split``.
    random_state : seed forwarded to the splitters and trees.

    Returns
    -------
    tuple
        ``(model, sel_cols, splitted_data)``: the fitted tree (the best
        estimator when grid search ran), the boolean column mask or ``None``,
        and ``(x, x_test, y, y_test)`` when a split was made, otherwise
        ``(None, x, None, y)``.
    """
    from sklearn import tree, model_selection, feature_selection

    # None is treated as "no optional steps" instead of failing on ``in``.
    steps = () if model_type is None else model_type
    hold_out = isinstance(test_split, float)
    sel_cols = None
    x = X.copy()
    y = Y.copy()
    if hold_out:
        x, x_test, y, y_test = model_selection.train_test_split(
            x, y, test_size=test_split, random_state=random_state)

    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3,
                                            train_size=.6,
                                            random_state=random_state)
    model = tree.DecisionTreeClassifier(random_state=random_state)

    if 'FeatureSelection' in steps:
        dtree_rfe = feature_selection.RFECV(
            tree.DecisionTreeClassifier(random_state=random_state),
            step=1, scoring='accuracy', cv=cv_split)
        dtree_rfe.fit(x, y)
        sel_cols = dtree_rfe.get_support()
        x = x[:, sel_cols]
        if hold_out:
            x_test = x_test[:, sel_cols]

    use_grid_search = 'GridSearch' in steps
    if use_grid_search:
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 4, 6, 8, 10, None],
            'random_state': [0],
        }
        model = model_selection.GridSearchCV(
            tree.DecisionTreeClassifier(random_state=random_state),
            param_grid=param_grid, scoring='accuracy', cv=cv_split)

    model.fit(x, y)
    # Only a grid search exposes ``best_estimator_``; a plain tree is
    # returned as fitted.
    if use_grid_search:
        model = model.best_estimator_

    if hold_out:
        splitted_data = (x, x_test, y, y_test)
    else:
        splitted_data = (None, x, None, y)
    return model, sel_cols, splitted_data
def bayes_search():
    """Run a Bayesian hyper-parameter search for ``XGBRegressor``.

    The training data from ``load_train()`` is transformed by the pipeline
    from ``create_pipeline()`` and searched with ``BayesSearchCV`` on ten
    shuffle splits (60% train / 30% test, seed 0), scored by negated
    ``rmse``. After every iteration the status callback prints the best
    score and parameters so far and writes all results to
    ``<EstimatorClass>_cv_results.csv``. Nothing is returned.
    """
    data, target = load_train()
    pipeline = create_pipeline()
    data = pipeline.fit_transform(data)
    nrmse_scorer = make_scorer(lambda x, y: rmse(x, y) * -1)
    bayes_cv_tuner = BayesSearchCV(
        estimator=XGBRegressor(),
        search_spaces={
            'learning_rate': (0.01, 1.0, 'log-uniform'),
            # This key used to appear twice, as (0, 10) and later (0, 5);
            # only the last value took effect, so that one is kept.
            'min_child_weight': (0, 5),
            'max_depth': (0, 50),
            'max_delta_step': (0, 20),
            'subsample': (0.01, 1.0, 'uniform'),
            'colsample_bytree': (0.01, 1.0, 'uniform'),
            'colsample_bylevel': (0.01, 1.0, 'uniform'),
            'reg_lambda': (1e-9, 1000, 'log-uniform'),
            'reg_alpha': (1e-9, 1.0, 'log-uniform'),
            'gamma': (1e-9, 0.5, 'log-uniform'),
            'n_estimators': (50, 2000),
            'scale_pos_weight': (1e-6, 500, 'log-uniform'),
        },
        scoring=nrmse_scorer,
        cv=model_selection.ShuffleSplit(n_splits=10, test_size=.3,
                                        train_size=.6, random_state=0),
        n_jobs=1,
        n_iter=10000,
        verbose=0,
        refit=True,
        random_state=42,
    )

    def status_print(optim_result):
        """Status callback during the Bayesian hyper-parameter search."""
        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)
        print('Model #{}\nBest MSE: {}\nBest params: {}\n'.format(
            len(all_models),
            np.round(bayes_cv_tuner.best_score_, 4),
            bayes_cv_tuner.best_params_
        ))
        # Save all model results
        clf_name = bayes_cv_tuner.estimator.__class__.__name__
        all_models.to_csv(clf_name + "_cv_results.csv")

    # Fit the model
    bayes_cv_tuner.fit(data, target, callback=status_print)
def cross_validation_split_2():
    """Return the shared shuffle-split cross-validation splitter.

    An alternative to ``train_test_split``: ten repetitions with 60% of the
    rows for training and 30% for testing (seed 0), intentionally leaving
    10% out of every repetition. See
    http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    """
    split_options = dict(n_splits=10, test_size=.3, train_size=.6,
                         random_state=0)
    return model_selection.ShuffleSplit(**split_options)
def split_train_test_shuffle(data, n_splits, test_ratio, seed=None):
    """Split ``data`` into a train and a test frame with ``ShuffleSplit``.

    Parameters
    ----------
    data : pandas.DataFrame
    n_splits : int
        Number of shuffle splits to draw. Only the last one is returned;
        earlier ones are overwritten.
    test_ratio : test size forwarded to ``ShuffleSplit``.
    seed : random state forwarded to ``ShuffleSplit``.

    Returns
    -------
    tuple
        ``(train_set, test_set)``. If the splitter yields no split at all,
        the valueless placeholder frames (same index and columns as
        ``data``) are returned.
    """
    train_set = pd.DataFrame(data=None, columns=data.columns,
                             index=data.index)
    test_set = pd.DataFrame(data=None, columns=data.columns,
                            index=data.index)
    rs = ms.ShuffleSplit(n_splits=n_splits, test_size=test_ratio,
                         random_state=seed)
    for train_idx, test_idx in rs.split(data):
        # The splitter yields row positions, so select by position; label
        # lookup picks wrong or missing rows for a non-default index.
        train_set = data.iloc[train_idx]
        test_set = data.iloc[test_idx]
    return train_set, test_set
def plot_learning_curve_1(model, X, y, n_splits=3, train_size=0.8, test_size=0.2, scoring="neg_mean_squared_error", train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot train and cross-validation scores against the training-set size.

    For every fraction in ``train_sizes`` the first ``m`` rows of ``X`` and
    ``y`` are cross-validated with a shuffle splitter built from
    ``n_splits``, ``train_size`` and ``test_size``; the mean train and test
    scores per subset size are drawn as two lines and the figure is shown.
    """
    subset_sizes = (X.shape[0] * train_sizes).astype(int)
    cv = model_selection.ShuffleSplit(n_splits=n_splits,
                                      train_size=train_size,
                                      test_size=test_size)

    train_per_size = []
    cv_per_size = []
    for m in subset_sizes:
        result = model_selection.cross_validate(model, X[:m], y[:m],
                                                scoring=scoring, cv=cv,
                                                n_jobs=-1,
                                                return_train_score=True)
        train_per_size.append(result.get("train_score"))
        cv_per_size.append(result.get("test_score"))

    # Average over the splits of each subset size.
    train_scores = np.mean(np.array(train_per_size), axis=1)
    cv_scores = np.mean(np.array(cv_per_size), axis=1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel("Training size")
    ax.set_ylabel('Score')
    ax.set_title('Learning curve')
    training_line, = ax.plot(subset_sizes, train_scores, lw=2)
    cv_line, = ax.plot(subset_sizes, cv_scores, lw=2)
    ax.legend(handles=[training_line, cv_line],
              labels=["Training score", "Cross validation score"],
              loc="upper right", prop={"size": 8})

    plt.tight_layout()
    plt.show()
def learning_curve_example():
    """Show learning curves and a validation curve on the digits data set.

    First draws learning curves for Gaussian Naive Bayes and for an
    RBF-kernel SVC through ``plot_learning_curve``, then plots train and
    cross-validation accuracy of an SVC over a range of ``gamma`` values.

    REF [site] >> http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
    REF [site] >> http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
    """
    digits = datasets.load_digits()
    X, y = digits.data, digits.target

    title = 'Learning Curves (Naive Bayes)'
    # Cross validation with 100 iterations to get smoother mean test and
    # train score curves, each time with 20% data randomly selected as a
    # validation set.
    cv = model_selection.ShuffleSplit(n_splits=100, test_size=0.2,
                                      random_state=0)
    estimator = naive_bayes.GaussianNB()
    plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv,
                        n_jobs=4)

    # Raw string: "\g" is not a valid escape sequence in a normal literal.
    title = r'Learning Curves (SVM, RBF kernel, $\gamma=0.001$)'
    # SVC is more expensive so we do a lower number of CV iterations:
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.2,
                                      random_state=0)
    estimator = svm.SVC(gamma=0.001)
    # ylim is passed by keyword, matching the call above, so it cannot be
    # bound to a different positional parameter.
    plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv,
                        n_jobs=4)
    plt.show()

    digits = datasets.load_digits()
    X, y = digits.data, digits.target
    param_range = np.logspace(-6, -1, 5)
    train_scores, test_scores = model_selection.validation_curve(
        svm.SVC(), X, y, param_name='gamma', param_range=param_range,
        cv=10, scoring='accuracy', n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title('Validation Curve with SVM')
    plt.xlabel(r'$\gamma$')
    plt.ylabel('Score')
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label='Training score',
                 color='darkorange', lw=lw)
    plt.fill_between(param_range,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2, color='darkorange', lw=lw)
    plt.semilogx(param_range, test_scores_mean,
                 label='Cross-validation score', color='navy', lw=lw)
    plt.fill_between(param_range,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.2, color='navy', lw=lw)
    plt.legend(loc='best')
    plt.show()
def assertClassifierWorksWithCV(self, classifier):
    """Assert that ``classifier`` can be cross-validated on each dense test set.

    Running ``cross_val_score`` exercises the usual estimator requirements
    (whether the classifier is clonable, etc.); one score per shuffle split
    is expected.
    """
    expected_scores = 3
    for X, y in self.get_multilabel_data_for_tests('dense'):
        splitter = model_selection.ShuffleSplit(n_splits=expected_scores,
                                                test_size=0.5,
                                                random_state=0)
        scores = model_selection.cross_val_score(classifier, X, y=y,
                                                 cv=splitter,
                                                 scoring='accuracy')
        self.assertEqual(len(scores), expected_scores)
def fit(self, X, y):
    """Standardise ``X`` and fit a ``RidgeCV`` regressor on it.

    The fitted scaler is stored as ``self.standardizer`` and the fitted
    model as ``self.clf``. Alphas 0.01, 0.1, 1 and 10 are compared on five
    shuffle splits with 20% of the rows held out (seed 0).
    """
    # Single-argument print(...) gives the same output under Python 2 and
    # is valid syntax under Python 3.
    print("Fitting a RidgeCV regressor...")
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.2,
                                      random_state=0)
    # ``normalize`` takes one boolean. The previous value was the list
    # [True, False], which is not searched over; a non-empty list is truthy,
    # so True keeps the effective setting wherever the flag is only
    # truth-tested. NOTE(review): confirm against the installed
    # scikit-learn release.
    self.clf = linear_model.RidgeCV(alphas=[0.01, 0.1, 1., 10.],
                                    cv=cv,
                                    normalize=True)
    self.clf.fit(X, y)
def fit(self, X, y):
    """Standardise ``X`` and fit a ``LassoCV`` regressor on it.

    The fitted scaler is stored as ``self.standardizer`` and the fitted
    model as ``self.clf``. One hundred alphas are compared on five shuffle
    splits with 20% of the rows held out (seed 0), using seven jobs.
    """
    # Single-argument print(...) gives the same output under Python 2 and
    # is valid syntax under Python 3.
    print("Fitting a LassoCV regressor...")
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.2,
                                      random_state=0)
    # ``normalize`` takes one boolean. The previous value was the list
    # [True, False], which is not searched over; a non-empty list is truthy,
    # so True keeps the effective setting wherever the flag is only
    # truth-tested. NOTE(review): confirm against the installed
    # scikit-learn release.
    self.clf = linear_model.LassoCV(n_alphas=100,
                                    cv=cv,
                                    n_jobs=7,
                                    normalize=True)
    self.clf.fit(X, y)
def assertClassifierWorksWithCV(self, classifier):
    """Assert that ``classifier`` can be cross-validated on generated multilabel data.

    Running ``cross_val_score`` exercises the usual estimator requirements
    (whether the classifier is clonable, etc.); one score per shuffle split
    is expected.
    """
    features, targets = make_multilabel_classification(
        sparse=False, return_indicator='dense')
    expected_scores = 3
    splitter = model_selection.ShuffleSplit(n_splits=expected_scores,
                                            test_size=0.5,
                                            random_state=0)
    scores = model_selection.cross_val_score(classifier, features,
                                             y=targets, cv=splitter,
                                             scoring='accuracy')
    self.assertEqual(len(scores), expected_scores)
def _cross_validate_individual_classifier_number(self, classifier, number_of_training_observations):
    """Cross-validate ``classifier`` with a fixed number of training rows.

    The test size is the number of stored observations minus
    ``number_of_training_observations``. Ten shuffle splits seeded with
    ``RANDOM_SEED`` are used and the result of ``cross_validate`` (accuracy
    scoring, train scores included) is returned.
    """
    predictors = self._all_predictor_variables
    responses = self._all_response_variables
    held_out = len(predictors) - number_of_training_observations
    splitter = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=held_out,
                                            random_state=RANDOM_SEED)
    return model_selection.cross_validate(
        classifier,
        predictors,
        responses,
        scoring="accuracy",
        cv=splitter,
        return_train_score=True,
        verbose=1,
    )
def apply_svm_cross_validation(X, y, svc_args=None, kernel_args=None):
    """Cross-validate an ``SGDClassifier`` and return mean test metrics.

    Parameters
    ----------
    X, y
        Features and labels handed to ``cross_validate``.
    svc_args : dict, optional
        Keyword arguments for ``linear_model.SGDClassifier``. When omitted,
        the defaults below are used (hinge loss, elastic-net penalty,
        ``alpha=1e-9``, ``max_iter=1000``, ``tol=1e-3``, seed 123456, no
        class weights).
    kernel_args : dict, optional
        Not used by this function: the kernel-approximation step it was
        meant for is disabled. Kept so existing callers keep working.

    Returns
    -------
    list
        ``[mean test precision, mean test recall, mean test f1]`` over ten
        shuffle splits with 10% of the rows held out.
    """
    # Fresh dict per call replaces the former mutable default argument.
    if svc_args is None:
        svc_args = {
            'loss': 'hinge',
            'penalty': 'elasticnet',
            'max_iter': 1000,
            'alpha': 1e-9,
            'tol': 1e-3,
            'random_state': 123456,
            'class_weight': None,
        }
    print("SVM")
    clf = linear_model.SGDClassifier(**svc_args)
    cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.1,
                                      random_state=123456)
    scores = model_selection.cross_validate(
        clf, X, y, cv=cv,
        scoring=['precision', 'recall', 'f1'],
        return_train_score=True)
    print(scores)
    return [
        np.mean(scores['test_precision']),
        np.mean(scores['test_recall']),
        np.mean(scores['test_f1']),
    ]