def IrisData(self, args):
    """Load the iris data and map files, re-index class labels to integers,
    and write stratified train/test (and optionally labeled/unlabeled train)
    splits plus the class-name map to the configured output files.

    args: CLI-style list; args[1] (optional) overrides self.test_size,
    args[2] (optional) overrides self.train_unlabeled_size.
    """
    # init splitting size from optional arguments
    if (len(args) > 1):
        self.test_size = float(args[1])
    if (len(args) > 2):
        self.train_unlabeled_size = float(args[2])
    # read data (everything kept as strings; labels re-coded below)
    data_load = np.loadtxt(self.iris_data_file, dtype='str', delimiter=',')
    map_load = np.genfromtxt(self.iris_map_file, dtype='str', delimiter=',')
    # re-index class to number 0, 1, ..., c
    index_map = {}
    for i in range(len(map_load)):
        index_map[map_load[i]] = i
    for i, d in enumerate(data_load):
        # last column is the class name; replace it with its integer index
        d[-1] = index_map.get(d[-1])
    # split data into 3 parts with nearly the same class proportions:
    # first split train vs. test, then split train into labeled and unlabeled
    sss1 = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=self.test_size, random_state=0)
    train_indices = []  # just in case the split yields nothing
    for train_indices, test_indices in sss1.split(data_load, data_load.T[-1]):
        # write the test partition first; train_indices survives the loop
        np.savetxt(self.iris_output_test, data_load[test_indices], fmt="%s", delimiter=',')
    if self.train_unlabeled_size == 0:
        # no unlabeled portion requested: the whole remainder is labeled train data
        np.savetxt(self.iris_output_train[0], data_load[train_indices], fmt='%s', delimiter=',')
    else:
        sss2 = model_selection.StratifiedShuffleSplit(
            n_splits=1, test_size=self.train_unlabeled_size, random_state=0)
        for train_label_indices, train_unlabel_indices in sss2.split(
                data_load[train_indices], data_load[train_indices].T[-1]):
            # labeled portion -> iris_output_train[0], unlabeled -> iris_output_train[1]
            np.savetxt(self.iris_output_train[0], data_load[train_indices][train_label_indices], fmt='%s', delimiter=',')
            np.savetxt(self.iris_output_train[1], data_load[train_indices][train_unlabel_indices], fmt='%s', delimiter=',')
    # generate the map file (first row of the map as a single line)
    np.savetxt(self.iris_output_map, np.mat(map_load)[0], fmt="%s", delimiter=',')
def train_test_split(X, Y, test_size=.2, use_examples_num=None, random_state=None):
    """Optionally subsample, then stratified-split (X, Y) into train/test.

    Stratification labels are taken as ``np.argmax(Y, axis=1)`` (one-hot Y).

    Returns:
        (X_train, X_test, Y_train, Y_test). When ``test_size`` is falsy the
        test parts are empty arrays.

    Raises:
        ValueError: if ``use_examples_num`` exceeds the number of rows in X.
    """
    # First - use only the required number of examples (stratified subsample)
    if use_examples_num:
        if use_examples_num > X.shape[0]:
            raise ValueError('Too big total_size')
        cv = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                    test_size=use_examples_num,
                                                    random_state=random_state)
        _, index = next(cv.split(X, np.argmax(Y, axis=1)))
        X, Y = X[index], Y[index]
    if not test_size:
        return X, np.array([]), Y, np.array([])
    # Second - split train/test
    cv = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=random_state)
    train_idx, test_idx = next(cv.split(X, np.argmax(Y, axis=1)))
    # BUG FIX: this previously returned the raw (train_idx, test_idx) index
    # arrays — a 2-tuple of indices — while the no-test path above returns a
    # 4-tuple of data arrays. Return split data consistently.
    return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]
def __call__(self, table):
    """Draw a sample of ``self.n`` rows from ``table``.

    Returns a pair of index arrays ``(others, sample)`` in the with-replacement
    and n == len(table) branches; in the sklearn-splitter branches the returned
    pair is whatever ``split`` yields first (train indices, test indices).
    NOTE(review): the two conventions order the "rest" and the "sample"
    differently depending on the branch — confirm callers expect this.
    """
    if self.replace:
        # pylint: disable=no-member
        # sampling WITH replacement: sample may contain duplicate indices
        rgen = np.random.RandomState(self.random_state)
        sample = rgen.randint(0, len(table), self.n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]  # indices never drawn
        return others, sample
    if self.n == len(table):
        # whole table requested: return a shuffled permutation, nothing left over
        rgen = np.random.RandomState(self.random_state)
        sample = np.arange(self.n)
        rgen.shuffle(sample)
        return np.array([], dtype=int), sample
    elif self.stratified and table.domain.has_discrete_class:
        # ensure at least one instance per class value in the sampled part
        test_size = max(len(table.domain.class_var.values), self.n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1, test_size=test_size,
            train_size=len(table) - test_size,
            random_state=self.random_state)
        splitter.get_n_splits(table.X, table.Y)
        ind = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1,
                                    test_size=self.n,
                                    random_state=self.random_state)
        splitter.get_n_splits(table)
        ind = splitter.split(table)
    # consume the single split from the generator
    return next(iter(ind))
def fit(self, X, y):
    # Fit an elastic-net SGD classifier: standardize X, then grid-search
    # loss/alpha/l1_ratio with 5-fold stratified shuffle CV.
    # NOTE: Python 2 print statements and the legacy SGDClassifier ``n_iter``
    # parameter — this block targets an old scikit-learn (< 0.19).
    print "Fitting an SGD Elasticnet Classification model..."
    t_start = time.time()
    # sklearn heuristic: aim for ~10^6 total weight updates
    n_iter = np.ceil(10**6 / float(len(y)))
    self.standardizer = preprocessing.StandardScaler()
    X = self.standardizer.fit_transform(X)
    # alpha grid: 1e-1 .. 1e-6
    alpha_range = 10.0**-np.arange(1, 7)
    param_grid = []
    param_grid.append(
        dict(loss=['log', 'modified_huber'],
             alpha=alpha_range,
             n_iter=[n_iter],
             penalty=['elasticnet'],
             l1_ratio=[.1, .5, .7, .9, .95, .99, 1.]))
    print "Using param grid " + str(param_grid)
    self.clf = linear_model.SGDClassifier(random_state=1337)
    cv = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    # self.clf ends up as the fitted GridSearchCV wrapper, not the bare SGD model
    self.clf = model_selection.GridSearchCV(self.clf, param_grid=param_grid, cv=cv, n_jobs=7)
    self.clf.fit(X, y)
    print "Best params: " + str(
        self.clf.best_params_) + " and corresponding score is " + str(
            self.clf.best_score_)
    utime = time.time() - t_start
    print " Done fitting. Took time " + str(utime)
def fit(self, X, y):
    # Fit a soft-voting ensemble of two pipelines — categorical features ->
    # BernoulliNB, numerical features -> GaussianNB — grid-searching the pair
    # of voting weights. NOTE: Python 2 print statements.
    # Split into categorical,numerical categories:
    self.cat_clf = pipeline.Pipeline((('cat-tf', CategoricalTransformer()),
                                      ('bnb', naive_bayes.BernoulliNB())))
    self.num_clf = pipeline.Pipeline(
        (('num-tf', NumericalTransformer()), ('gnb', naive_bayes.GaussianNB())))
    # candidate (num-clf, cat-clf) weight pairs, always summing to 1.0
    weights_range = [[
        a, 1.0 - a
    ] for a in [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0]]
    voting_range = ['soft']
    param_grid = dict(voting=voting_range, weights=weights_range)
    print "Using param grid " + str(param_grid)
    cv = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    self.clf = ensemble.VotingClassifier(
        estimators=[('num-clf', self.num_clf), ('cat-clf', self.cat_clf)])
    # self.clf becomes the fitted GridSearchCV wrapper around the ensemble
    self.clf = model_selection.GridSearchCV(self.clf, param_grid=param_grid, cv=cv, n_jobs=7)
    self.clf.fit(X, y)
    print "Best params: " + str(
        self.clf.best_params_) + " and corresponding score is " + str(
            self.clf.best_score_)
def _split(data, test_size, random_state):
    """Split ``data`` into train and validation frames, stratified jointly on
    data provider and ISUP grade.

    Mutates ``data`` by adding/overwriting a 'fold' column (1 = train,
    0 = valid). NOTE(review): the original docstring mentioned grouping
    similar examples via _/input/similar_examples_hashXXX.npy, but this body
    never reads that file — confirm whether that step lives elsewhere.
    """
    data['fold'] = -1
    # joint stratification label: +6 offset for the 'karolinska' provider
    # keeps its grades in a disjoint label range from the other provider's
    strata = np.where(data.data_provider == 'karolinska', 6, 0) + data.isup_grade.values
    splitter = model_selection.StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state)
    train_rows, valid_rows = next(splitter.split(X=data.image_id, y=strata))
    data.loc[valid_rows, 'fold'] = 0
    data.loc[train_rows, 'fold'] = 1
    return data[data.fold != 0], data[data.fold == 0]
def test_StratifiedShuffleSplit(self):
    """Check that the ModelFrame-wrapped StratifiedShuffleSplit produces the
    same index splits as plain sklearn, and that iterate() yields ModelFrames
    partitioning the original rows."""
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    sf1 = df.model_selection.StratifiedShuffleSplit(
        random_state=self.random_state)
    sf2 = ms.StratifiedShuffleSplit(random_state=self.random_state)
    # consume generator
    ind1 = [x for x in sf1.split(df.data.values, df.target.values)]
    ind2 = [x for x in sf2.split(iris.data, iris.target)]
    for i1, i2 in zip(ind1, ind2):
        self.assertIsInstance(i1, tuple)
        self.assertEqual(len(i1), 2)
        self.assertIsInstance(i2, tuple)
        self.assertEqual(len(i2), 2)
        # BUG FIX: this previously compared i1[0] with itself, which always
        # passed; compare the wrapped splitter's train indices with sklearn's.
        self.assert_numpy_array_equal(i1[0], i2[0])
        self.assert_numpy_array_equal(i1[1], i2[1])
    sf1 = df.model_selection.StratifiedShuffleSplit(
        random_state=self.random_state)
    with tm.assert_produces_warning(FutureWarning):
        gen = df.model_selection.iterate(sf1)  # StratifiedShuffleSplit is not a subclass of BaseCrossValidator
    for train_df, test_df in gen:
        self.assertIsInstance(train_df, pdml.ModelFrame)
        self.assertIsInstance(test_df, pdml.ModelFrame)
        self.assert_index_equal(df.columns, train_df.columns)
        self.assert_index_equal(df.columns, test_df.columns)
        # BUG FIX: was assertTrue(a, b) — assertTrue takes a single condition
        # (b was silently treated as the failure message) — and summed a row
        # count with a COLUMN count. Assert the row partition explicitly.
        self.assertEqual(df.shape[0], train_df.shape[0] + test_df.shape[0])
def classifier_tester(classifier, x, y):
    """Cross-validate ``classifier`` on (x, y) with 5 stratified 50/50 shuffle
    splits and print per-fold accuracies, their mean/std, and scoring times."""
    splitter = model_selection.StratifiedShuffleSplit(n_splits=5,
                                                      test_size=0.5,
                                                      random_state=0)
    results = model_selection.cross_validate(classifier, x, y,
                                             scoring='accuracy', cv=splitter)
    accuracies = results['test_score']
    print('accuracies=', accuracies * 100)
    print('total acc=', round(accuracies.mean() * 100, 2), round(accuracies.std() * 100, 2))
    print('test time=', results['score_time'])
def fit(self, x, y, sample_weight=None, check_input=True, x_idx_sorted=None):
    """Fit the wrapped learner; when pruning is enabled (``self._alpha`` set),
    hold out a stratified 20% validation slice for ``self.prune()``.

    Returns self (or the learner's fit result on the no-pruning fast path).
    """
    if self._alpha is None:
        # no pruning requested: delegate straight to the underlying learner
        return self._learner.fit(x, y, sample_weight=sample_weight,
                                 check_input=check_input,
                                 X_idx_sorted=x_idx_sorted)
    if sample_weight is None:
        sample_weight = np.ones(x.shape[0])
    # keep private copies so the hold-out carving below can't alias caller data
    self.training_x = x.copy()
    self.training_y = y.copy()
    self.training_weights = sample_weight.copy()
    # TODO: Make this tunable? at least random_state?
    sss = ms.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
    # n_splits=1, so this loop runs exactly once
    for train_index, test_index in sss.split(self.training_x, self.training_y):
        # validation slice first (reads training_x/y before they are shrunk)
        self.value_x = self.training_x[test_index]
        self.value_y = self.training_y[test_index]
        self.training_x = self.training_x[train_index]
        self.training_y = self.training_y[train_index]
        self.value_weights = sample_weight[test_index]
        self.training_weights = sample_weight[train_index]
    self._learner.fit(self.training_x, self.training_y,
                      self.training_weights, check_input, x_idx_sorted)
    self.prune()
    return self
def eval_classification(self, session, labels, train_size):
    """Evaluate learned node embeddings with a logistic-regression classifier
    over 5 stratified shuffle splits; print and return mean train/test
    accuracy and macro-F1.

    NOTE(review): ``train_size`` is accepted but never used — the split is
    hard-coded to test_size=0.8 (i.e. train on 20%); confirm intent.
    """
    sk_graph = self._skipgram_graph
    # pull the current embedding matrix out of the TF session
    node_embeddings = session.run(sk_graph["normalized_embeddings"])
    # Classifier choice
    classifier = linear_model.LogisticRegression(C=10)
    #classifier = svm.SVC(C=1)
    scoring = ['accuracy', 'f1_macro', 'f1_micro']
    # no random_state: results vary run to run
    shuffle = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.8)
    cv_scores = model_selection.cross_validate(classifier,
                                               node_embeddings,
                                               labels,
                                               scoring=scoring,
                                               cv=shuffle,
                                               return_train_score=True)
    train_acc = cv_scores['train_accuracy'].mean()
    train_f1 = cv_scores['train_f1_macro'].mean()
    test_acc = cv_scores['test_accuracy'].mean()
    test_f1 = cv_scores['test_f1_macro'].mean()
    print("Train acc: {:0.3f}, f1: {:0.3f}".format(train_acc, train_f1))
    print("Test acc: {:0.3f}, f1: {:0.3f}".format(test_acc, test_f1))
    return {
        'train_acc': train_acc,
        'test_acc': test_acc,
        'train_f1': train_f1,
        'test_f1': test_f1
    }
def abide1_subtype_stability_core(n_cpu):
    """Run a 1000-resample bootstrap of subtype stability on ABIDE1 seed maps
    and save the train indices plus per-resample results to an .npz file.

    n_cpu: number of worker threads for the thread pool.
    """
    # Hardcoded variables
    scale = 20
    state = 1
    n_boot = 1000
    dist_thr = 0.99
    part_thr = 20
    regressors = 'AGE_AT_SCAN+fd_scrubbed+SITE_ID'
    # Paths (relative to the repository's data directory, two levels up)
    root_p = pal.Path(__file__).resolve().parents[2] / 'data'
    pheno_p = root_p / 'pheno/ABIDE1_Pheno_PSM_matched_minimum_10.tsv'
    # Data
    sca_p = root_p / f'preprocessed/seed_maps/abide_1/MIST_{scale}'
    sca_t = f'sub_{{}}_ses_{{}}_run{{}}_mist_{scale}_nocereb.npy'
    # Output
    out_d = root_p / f'processed/stability/abide_1/'
    out_p = out_d / f'abide_1_subtype_stability_mist_{scale}_core_{part_thr:d}_within_{dist_thr*100:.0f}.npz'
    if not out_d.is_dir():
        out_d.mkdir()
    pheno = pd.read_csv(pheno_p, sep='\t')
    # one seed-map file per pheno row, named from SUB_ID / session / run
    seed_paths = [
        sca_p / sca_t.format(row['SUB_ID'], row['session'], row['run'])
        for rid, row in pheno.iterrows()
    ]
    subject_stack = np.array([np.load(p) for p in seed_paths])
    n_sub, n_vox, n_roi = subject_stack.shape
    splitter = skm.StratifiedShuffleSplit(n_splits=n_boot,
                                          test_size=0.5,
                                          random_state=state)
    asd_label = (pheno.DX_GROUP == 'Autism').values.astype(int)
    n_samples = len(asd_label)
    if not n_samples == n_sub:
        raise Exception(
            f'got {n_sub} subjects in residual but {n_samples} in the pheno file. Doesnt work.'
        )
    # one job per bootstrap half-sample (train indices of each split)
    # data_stack, mode='classic', n_subtypes=3, dist_thr=0.7, part_thr=20
    job_arg_list = [{
        'data_stack': subject_stack,
        'sbt_idx': train,
        'dist_thr': dist_thr,
        'part_thr': part_thr,
        'regressors': regressors,
        'pheno': pheno
    } for train, test in splitter.split(X=np.zeros(n_samples), y=asd_label)]
    # the splitter is re-run here; with the fixed random_state it reproduces
    # exactly the same splits as the comprehension above
    train_indices_list, _ = zip(
        *list(splitter.split(X=np.zeros(n_samples), y=asd_label)))
    # run the subtype function over all bootstrap samples in a thread pool
    ex = futures.ThreadPoolExecutor(max_workers=n_cpu)
    results = {
        run_id: res
        for run_id, res in zip(
            range(len(job_arg_list)),
            list(
                tqdm(ex.map(wrap_subtype_stability, job_arg_list),
                     total=len(job_arg_list))))
    }
    # Store the results
    np.savez(out_p, train_idx=train_indices_list, partitions=results)
def SVM_hyper(X_train_pca, y_train, X_test_pca, y_test):
    """Sweep the polynomial-SVM ``coef0`` hyperparameter (1..49) over 20
    stratified 50/50 shuffle splits of the training data and plot mean +/- std
    of train and validation accuracy.

    NOTE(review): ``X_test_pca`` and ``y_test`` are accepted but never used;
    the green curve labeled "Test score" is actually validation score.
    """
    k_list = list(range(1, 50, 1))  # candidate coef0 values
    all_train = []
    all_test = []
    sss = model_selection.StratifiedShuffleSplit(n_splits=20,
                                                 test_size=0.5,
                                                 random_state=0)
    for train_index, test_index in sss.split(X_train_pca, y_train):
        train_scores = []
        val_scores = []
        split_X_train = X_train_pca[train_index]
        split_y_train = y_train[train_index]
        split_X_val = X_train_pca[test_index]
        split_y_val = y_train[test_index]
        for k in k_list:
            # degree-3 polynomial kernel; k sweeps the independent term coef0
            clf_SVM = SVC(kernel='poly', degree=3, gamma='scale', coef0=k, C=1)
            #clf_knn = neighbors.KNeighborsClassifier(n_neighbors=k)
            clf_SVM.fit(split_X_train, split_y_train)
            # Test the classifier on the training data and plot
            score_train = clf_SVM.score(split_X_train, split_y_train)
            score_val = clf_SVM.score(split_X_val, split_y_val)
            train_scores.append(score_train)
            val_scores.append(score_val)
        all_train.append(train_scores)
        all_test.append(val_scores)
    # Create numpy array of scores and calculate the mean and std
    all_train = np.array(all_train)
    all_test = np.array(all_test)
    train_scores_mean = all_train.mean(axis=0)
    train_scores_std = all_train.std(axis=0)
    test_scores_mean = all_test.mean(axis=0)
    test_scores_std = all_test.std(axis=0)
    # Plot the mean scores and the std as shading
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.grid()
    ax.fill_between(k_list,
                    train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std,
                    alpha=0.1,
                    color="r")
    ax.fill_between(k_list,
                    test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std,
                    alpha=0.1,
                    color="g")
    ax.plot(k_list, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(k_list, test_scores_mean, 'o-', color="g", label="Test score")
    plt.show()
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and a dataset
    from input data table that are not in the sample. Also uses several
    sampling functions from `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents the proportion
        of data instances in the resulting sample. If int, n is the number of
        data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and match
        distribution of class values in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    if type(n) == float:
        n = int(n * len(table))
    if replace:
        # with replacement: the sample may contain duplicate rows; "others"
        # are the rows that were never drawn
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.mtrand.RandomState(random_state)
        sample = rgen.randint(0, len(table), n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return table[sample], table[others]
    # invert n: the sklearn splitters below size the COMPLEMENT, so the first
    # returned part (train side) ends up with the requested n rows
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        # ensure the test side holds at least one row per class value
        test_size = max(len(table.domain.class_var.values), n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state,
        )
        splitter.get_n_splits(table.X, table.Y)
        ind = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1,
                                    test_size=n,
                                    random_state=random_state)
        splitter.get_n_splits(table)
        ind = splitter.split(table)
    # single split: (sample indices, leftover indices)
    ind = next(ind)
    return table[ind[0]], table[ind[1]]
def fit(self, X, Y, sample_weight=None, check_input=True, X_idx_sorted=None):
    """Fit the parent estimator on a stratified 80% slice of (X, Y), keeping
    the remaining 20% (plus its weights) as a validation hold-out consumed by
    ``self.prune()``. Returns self."""
    if sample_weight is None:
        sample_weight = np.ones(X.shape[0])
    # private copies; the hold-out carving below reassigns these attributes
    self.trgX = X.copy()
    self.trgY = Y.copy()
    self.trgWts = sample_weight.copy()
    splitter = ms.StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                         random_state=123)
    # exactly one split is produced
    train_index, test_index = next(splitter.split(self.trgX, self.trgY))
    self.valX = self.trgX[test_index]
    self.valY = self.trgY[test_index]
    self.valWts = sample_weight[test_index]
    self.trgX = self.trgX[train_index]
    self.trgY = self.trgY[train_index]
    self.trgWts = sample_weight[train_index]
    super().fit(self.trgX, self.trgY, self.trgWts, check_input, X_idx_sorted)
    self.prune()
    return self
def print_data(train_file_addr, val_file_addr, x, y):
    """Write stratified train/validation partitions of ``x`` to the two files.

    Two shuffle splits are generated and each iteration rewrites both files,
    so only the final split's contents persist. Rows are written as ``str(row)``
    with no separator, matching the original behavior.
    """
    def _dump(path, rows):
        # overwrite the file with the string form of every row
        with open(path, 'w') as handle:
            for row in rows:
                handle.write(str(row))
    splitter = model_selection.StratifiedShuffleSplit(2)
    for train_idx, val_idx in splitter.split(x, y):
        _dump(train_file_addr, x[train_idx])
        _dump(val_file_addr, x[val_idx])
def split_data(X, y):
    """
    Splits training data into train and test sets in a stratified fashion
    preserving class distribution in the data
    :param X: the features
    :param y: the labels
    :return: a generator that returns exactly one train/test split in a
        stratified fashion based on labels (95% train / 5% test, seeded)
    """
    splitter = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                      test_size=.05,
                                                      random_state=10)
    split_generator = splitter.split(X=X, y=y)
    return split_generator
def paramSearch(self, xMat, yMat):
    """Grid-search C and gamma for an SVC over 3 stratified shuffle splits
    and print the best parameter combination with its CV score."""
    search_space = {
        'gamma': np.logspace(-3, 2, 6),  # 1e-3 .. 1e2
        'C': np.logspace(-3, 3, 6),      # 1e-3 .. 1e3
    }
    folds = ms.StratifiedShuffleSplit(n_splits=3, test_size=0.33,
                                      random_state=42)
    searcher = ms.GridSearchCV(svm.SVC(), param_grid=search_space, cv=folds)
    searcher.fit(xMat, yMat)
    print("The best parameters are %s with a score of %0.2f" %
          (searcher.best_params_, searcher.best_score_))
def split_train_test(self, folds, test_size, seed, remove_duplicates,
                     kfold_statified_shuffle_splits):
    """Split the working dataframe into per-fold train/test lists, stratifying
    on the per-recording minimum class.

    folds < 0: no split (all data as train, empty test).
    folds < 2: a single stratified train_test_split.
    otherwise: StratifiedKFold or StratifiedShuffleSplit, chosen by
    ``kfold_statified_shuffle_splits``.

    Returns (train, test): parallel lists of dataframes, one pair per fold.
    NOTE(review): ``remove_duplicates`` is accepted but its handling is
    commented out below — confirm whether it should be re-enabled.
    """
    # use only true-positive rows unless false positives are enabled
    data = self.data_tp if not self.use_fp else self.data_all
    if folds < 0:
        return [data], [[]]
    # one row per recording; its class is the minimum class over its rows
    grouped = data[[self.k_recording_id, self.classes_column_name
                    ]].groupby(self.k_recording_id).min()
    classes = grouped[self.classes_column_name].tolist()
    train = []
    test = []
    if folds < 2:
        res = model_selection.train_test_split(grouped,
                                               test_size=test_size,
                                               random_state=seed,
                                               stratify=classes)
        train.append(res[0].index)
        test.append(res[1].index)
    else:
        if not kfold_statified_shuffle_splits:
            folds_generator = model_selection.StratifiedKFold(
                folds, shuffle=True, random_state=seed)
        else:
            folds_generator = model_selection.StratifiedShuffleSplit(
                folds, random_state=seed, test_size=test_size)
        for train_part, test_part in folds_generator.split(grouped, classes):
            # store recording-id indices; rows are resolved below
            train.append(grouped.iloc[train_part, :].index)
            test.append(grouped.iloc[test_part, :].index)
            # train.append(self.data.iloc[train_part, :])
            # test.append(self.data.iloc[test_part, :])
    # expand recording-id splits back to full data rows
    for split_id in range(len(train)):
        train_part, test_part = train[split_id], test[split_id]
        train_part = data.loc[data[self.k_recording_id].isin(train_part)]
        test_part = data.loc[data[self.k_recording_id].isin(test_part)]
        if self.sample_val_fp:
            train_part, test_part = self.perform_sampling_for_val_fp(
                train_part, test_part)
        # if remove_duplicates:
        #     duplicated_records = train_part[self.k_recording_id].duplicated(keep=False)
        #     test_part = pd.concat([test_part, train_part.loc[duplicated_records, :]])
        #     train_part = train_part.drop_duplicates(self.k_recording_id, keep=False)
        train[split_id], test[split_id] = train_part, test_part
    return train, test
def data_from_scaling(self, size):
    """Build a scaled-down Dataset: keep the fraction ``size[0]`` of labeled
    and ``size[1]`` of unlabeled training data (stratified so per-class counts
    stay nearly equal), copying test data and metadata unchanged."""
    splited_data = Dataset()
    # splitting data, guaranteeing that numbers of samples per class are nearly equal
    # labeled data splitting
    if size[0] == 1:
        # full fraction requested: shallow-copy the labeled set as-is
        splited_data.train_xl = self.train_xl[:]
        splited_data.train_yl = self.train_yl[:]
    else:
        sss1 = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                      test_size=size[0],
                                                      random_state=0)
        # notice: random_state is a constant so that the next (larger) scaling
        # is an expansion of the previous data set
        for data_indecices, labeled_indices in sss1.split(self.train_xl, self.train_yl.T):
            # keep the "test" side of the split — it has the requested fraction
            splited_data.train_xl = self.train_xl[labeled_indices]
            splited_data.train_yl = self.train_yl[0, labeled_indices]
    # unlabeled data splitting
    if size[1] == 1:
        splited_data.train_xu = self.train_xu
    else:
        sss2 = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                      test_size=size[1],
                                                      random_state=0)
        for data_indecices, unlabeled_indices in sss2.split(self.train_xu, self.train_yu.T):
            splited_data.train_xu = self.train_xu[unlabeled_indices]
    # update parameters (test data and metadata are carried over unchanged)
    splited_data.problem_type = self.problem_type
    splited_data.test_x = self.test_x
    splited_data.test_y = self.test_y
    splited_data.class_name = self.class_name
    splited_data.class_number = self.class_number
    splited_data.feature_number = self.feature_number
    splited_data.instance_label_number = len(splited_data.train_xl)
    splited_data.instance_unlabel_number = len(splited_data.train_xu)
    splited_data.instance_test_number = self.instance_test_number
    return splited_data
def split_train_test_strat(data, split_category, n_splits, test_ratio, seed):
    """Stratified shuffle split of ``data`` on ``data[split_category]``.

    Note: when n_splits > 1 the loop below overwrites earlier splits, so the
    LAST generated split is returned (original behavior, preserved).

    Returns (strat_train_set, strat_test_set) as DataFrames.
    """
    # BUG FIX: previously two full-size empty DataFrames were pre-allocated
    # (pd.DataFrame(data=None, columns=..., index=...)) only to be discarded
    # by the first loop iteration; drop the dead allocations.
    strat_train_set = None
    strat_test_set = None
    sd = ms.StratifiedShuffleSplit(n_splits=n_splits,
                                   test_size=test_ratio,
                                   random_state=seed)
    for train_idx, test_idx in sd.split(data, data[split_category]):
        strat_train_set = data.loc[train_idx]
        strat_test_set = data.loc[test_idx]
    return strat_train_set, strat_test_set
def get_indices(self, data):
    """Return a list of (train, test) index pairs for ``self.n_resamples``
    shuffle splits of ``data``, stratified on the class when requested and
    the domain has a discrete class."""
    stratify = self.stratified and data.domain.has_discrete_class
    splitter_cls = skl.StratifiedShuffleSplit if stratify else skl.ShuffleSplit
    splitter = splitter_cls(
        n_splits=self.n_resamples,
        train_size=self.train_size,
        test_size=self.test_size,
        random_state=self.random_state,
    )
    if stratify:
        splitter.get_n_splits(data.X, data.Y)
        return list(splitter.split(data.X, data.Y))
    splitter.get_n_splits(data)
    return list(splitter.split(data))
def split(ds, testSplit, testSplitSeed, stratified=False, groupFunc=None):
    """Split dataset ``ds`` into (train, test) SubDataSets.

    stratified=True: take the first of 4 stratified shuffle splits.
    otherwise: seeded random shuffle, last ``testSplit`` fraction goes to test.
    """
    rn = list(range(0, len(ds)))
    if stratified:
        data_classes = dataset_classes(ds, groupFunc)
        # NOTE(review): n_splits and test_size are passed positionally; newer
        # scikit-learn versions require these as keyword arguments — confirm
        # the pinned sklearn version.
        vals = ms.StratifiedShuffleSplit(4, testSplit,
                                         random_state=testSplitSeed).split(
                                             rn, data_classes)
        # return on the first iteration: only the first of 4 splits is used
        for v in vals:
            return SubDataSet(ds, v[0]), SubDataSet(ds, v[1])
    random.seed(testSplitSeed)
    random.shuffle(rn)
    # boundary index: everything before dm is train, the rest is test
    dm = round(len(ds) - len(ds) * testSplit)
    return SubDataSet(ds, rn[:dm]), SubDataSet(ds, rn[dm:])
def setup_indices(self, train_data, test_data):
    """Populate ``self.indices`` with (train, test) index pairs drawn from
    ``test_data`` via ``self.n_resamples`` shuffle splits, stratified on the
    class when requested and available. ``train_data`` is unused here."""
    stratify = self.stratified and test_data.domain.has_discrete_class
    common = dict(
        n_splits=self.n_resamples,
        train_size=self.train_size,
        test_size=self.test_size,
        random_state=self.random_state,
    )
    if stratify:
        splitter = skl.StratifiedShuffleSplit(**common)
        splitter.get_n_splits(test_data.X, test_data.Y)
        self.indices = list(splitter.split(test_data.X, test_data.Y))
    else:
        splitter = skl.ShuffleSplit(**common)
        splitter.get_n_splits(test_data)
        self.indices = list(splitter.split(test_data))
def stratified_split(data, cat, bins, test_size):
    """Bin ``data[cat]`` into ``bins`` and perform a single stratified shuffle
    split on the binned labels.

    Returns (train, test) DataFrames with the temporary bin column removed
    from the returned frames. Note: the "temp" column added to ``data`` itself
    is left in place (original behavior preserved).
    """
    temp_cat = "temp"
    bin_labels = list(range(len(bins) - 1))
    data[temp_cat] = pd.cut(data[cat], bins=bins, labels=bin_labels)
    splitter = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                      test_size=test_size,
                                                      random_state=42)
    # exactly one split is produced
    train_index, test_index = next(splitter.split(data, data[temp_cat]))
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]
    strat_train_set.drop(temp_cat, axis=1, inplace=True)
    strat_test_set.drop(temp_cat, axis=1, inplace=True)
    return strat_train_set, strat_test_set
def evaluate(model, which="dev"):
    """Cross-validate a logistic-regression classifier whose features are
    ``model.distance`` outputs over pairs from the given data split.

    Returns the dict produced by sklearn's cross_validate.
    NOTE(review): if ``model.distance`` returns a scalar, X is a 1-D list,
    which sklearn estimators reject — presumably it returns a feature vector;
    confirm against the model implementation.
    """
    X, y = [], []
    for (s1, s2), label in three_class_data_iter(which):
        d = model.distance(s1, s2)
        X.append(d)
        y.append(label)
    scores = sel.cross_validate(
        lm.LogisticRegression(),
        X,
        y=y,
        scoring=metrics.make_scorer(metrics.accuracy_score),
        cv=sel.StratifiedShuffleSplit(n_splits=5),
    )
    return scores
def split_example():
    """Demo of sklearn splitter classes on a tiny fixture. The ``if False``
    chain selects one splitter to try; only the final ShuffleSplit branch is
    active as written."""
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])
    if False:
        # The entry test_fold[i] represents the index of the test set that sample i belongs to.
        # It is possible to exclude sample i from any test set (i.e. include sample i in every training set) by setting test_fold[i] equal to -1.
        test_fold = [0, 1, -1, 1]
        split = PredefinedSplit(test_fold)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # The stratified folds are made by preserving the percentage of samples for each class.
        split = model_selection.StratifiedShuffleSplit(n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        split = model_selection.GroupShuffleSplit(n_splits=3, test_size=0.25, random_state=None)
        #print('#splits =', split.get_n_splits(X, y, groups))
        print('#splits =', split.get_n_splits(groups=groups))
    elif False:
        split = model_selection.TimeSeriesSplit(n_splits=3, max_train_size=None)
        print('#splits =', split.get_n_splits())
    else:
        split = model_selection.ShuffleSplit(n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', split.get_n_splits(X))
    print('Split:', split)
    #for train_indices, test_indices in split.split():
    #for train_indices, test_indices in split.split(X, y):
    #for train_indices, test_indices in split.split(X, y, groups):
    for train_indices, test_indices in split.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)
        # split the fixture by the generated indices (results are unused;
        # this is illustrative only)
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
def evaluateModel(self, model, features, classes, train_size=0.7):
    """Split (features, classes), train ``model`` on the train part, and score
    predictions on the held-out part.

    Returns (f1, precision, recall, accuracy).
    """
    # BUG FIX: train_size was passed POSITIONALLY, so sklearn's
    # train_test_split treated it as a third array to split (raising or
    # mis-splitting) rather than as the train fraction; pass it by keyword.
    XT, XF, YT, YF = model_selection.train_test_split(
        features, classes, train_size=train_size)
    # --- reference catalogue of CV splitters (illustrative, unused below) ---
    # https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
    # https://chrisalbon.com/machine_learning/model_evaluation/cross_validation_parameter_tuning_grid_search/
    # NOTE(review): sklearn >= 0.24 rejects random_state together with
    # shuffle=False (kf1, kf3) — confirm the pinned version.
    # Each element lands in the test set exactly once, in original order:
    kf1 = model_selection.KFold(n_splits=5, shuffle=False, random_state=12345)
    # Each element lands in the test set exactly once, random order:
    kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=12345)
    # All test sets hold roughly equal counts of each class:
    kf3 = model_selection.StratifiedKFold(n_splits=5, shuffle=False, random_state=12345)
    # Random splits; elements may repeat across test sets:
    kf4 = model_selection.ShuffleSplit(n_splits=10, random_state=12345)
    # Random splits; elements may repeat, test sets roughly class-balanced:
    kf5 = model_selection.StratifiedShuffleSplit(n_splits=10, random_state=12345)
    # N test sets, each containing one element in turn:
    kf6 = model_selection.LeaveOneOut()
    # ------------------------------------------------------------------------
    self.trainModel(model, XT, YT)
    YP = self.predictModel(model, XF)
    acc = metrics.accuracy_score(YF, YP)
    prec = metrics.precision_score(YF, YP)
    rec = metrics.recall_score(YF, YP)
    f1 = metrics.f1_score(YF, YP)
    return f1, prec, rec, acc
def load_data_set(file_name):
    """Load a CSV, separate the 'drowsy' target column, and produce a single
    seeded stratified 80/20 train/test split.

    Returns (xTrain, yTrain, xTest, yTest).
    """
    data = pandas.read_csv(file_name)
    goal = data['drowsy']
    data = data.drop('drowsy', axis=1)
    # BUG FIX: n_splits was 2 and the loop kept only the LAST split, so the
    # first split was computed and silently discarded; generate exactly the
    # one split that is used.
    model = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                   test_size=0.2,
                                                   random_state=1)
    train_idx, test_idx = next(model.split(data, goal))
    xTrain = data.loc[train_idx]
    yTrain = goal.loc[train_idx]
    xTest = data.loc[test_idx]
    yTest = goal.loc[test_idx]
    return xTrain, yTrain, xTest, yTest
def build_sklearn(self, splitter_id, splitter_params):
    """Build splitters wrapping sklearn.

    Returns a SplitterWrapper for known ``splitter_id`` values, or None for
    an unrecognized id.
    """
    plain_splitters = {
        'mangoml_sklearn_KFold': model_selection.KFold,
        'mangoml_sklearn_StratifiedKFold': model_selection.StratifiedKFold,
        'mangoml_sklearn_ShuffleSplit': model_selection.ShuffleSplit,
        'mangoml_sklearn_StratifiedShuffleSplit':
            model_selection.StratifiedShuffleSplit,
    }
    if splitter_id in plain_splitters:
        return SplitterWrapper(plain_splitters[splitter_id](**splitter_params))
    if splitter_id == 'mangoml_sklearn_GroupKFold':
        # 'group_column' is wrapper-specific, not a sklearn kwarg: pop it
        # before forwarding the remaining params
        group_column = splitter_params.pop('group_column')
        return SplitterWrapper(
            model_selection.GroupKFold(**splitter_params), group_column)
    return None
def load_data(file_name, training=False):
    """Load a CSV and separate the 'answer' target column.

    training=False: return (data, target) unsplit.
    training=True: return (train_x, train_y, test_x, test_y) from a single
    seeded stratified 80/20 split.
    """
    data = pandas.read_csv(file_name)
    target = data['answer']
    data = data.drop('answer', axis=1)
    if training:
        # BUG FIX: n_splits was 2 and the loop kept only the LAST split,
        # so the first split was computed and silently discarded; generate
        # exactly the one split that is used.
        model = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                       test_size=0.2,
                                                       random_state=1)
        train_index, test_index = next(model.split(data, target))
        train_x = data.loc[train_index]
        train_y = target.loc[train_index]
        test_x = data.loc[test_index]
        test_y = target.loc[test_index]
        return train_x, train_y, test_x, test_y
    return data, target