def test_stratified_shuffle_split_iter_no_indices():
    """Mask-based and index-based splits must pick the same test samples."""
    y = np.asarray([0, 1, 2] * 10)

    # Both splitters get the same seed; only the output representation differs.
    mask_splitter = cval.StratifiedShuffleSplit(y, indices=False,
                                                random_state=0)
    train_mask, test_mask = next(iter(mask_splitter))

    index_splitter = cval.StratifiedShuffleSplit(y, indices=True,
                                                 random_state=0)
    train_indices, test_indices = next(iter(index_splitter))

    positions_from_mask = np.where(test_mask)[0]
    assert_array_equal(sorted(test_indices), positions_from_mask)
def sample_random_n(table, n, stratified=False, replace=False,
                    random_state=None):
    """Return the first split produced by a scikit-learn splitter for ``table``.

    table : data table whose length (and, without replacement, class
        variable) drives the split.
    n : positive number; truncated to ``int`` and used as the train size.
    stratified : if true and the class variable is discrete, a
        ``StratifiedShuffleSplit`` over ``table.Y`` is used.
    replace : if true, a ``Bootstrap`` splitter is used instead.
    random_state : forwarded unchanged to the splitter.
    """
    assert n > 0
    n = int(n)

    if replace:
        splitter = cross_validation.Bootstrap(
            len(table), train_size=n, random_state=random_state)
        return next(iter(splitter))

    use_strata = stratified and is_discrete(table.domain.class_var)

    # The train part is never smaller than the number of class values; the
    # test part is whatever remains (never negative).
    train_size = max(len(table.domain.class_var.values), n)
    test_size = max(len(table) - train_size, 0)

    if use_strata:
        splitter = cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=train_size,
            random_state=random_state)
    else:
        splitter = cross_validation.ShuffleSplit(
            len(table), n_iter=1,
            test_size=test_size, train_size=train_size,
            random_state=random_state)
    return next(iter(splitter))
def sample_random_n(table, n, stratified=False, replace=False,
                    random_state=None):
    """Return index arrays splitting ``table`` into "others" and a sample.

    table : data table; only ``len(table)`` is used when ``replace`` is true.
    n : int, number of instances in the sample.
    stratified : if true and the class variable is discrete, a
        ``StratifiedShuffleSplit`` over ``table.Y`` is used and the sample is
        at least as large as the number of class values.
    replace : if true, draw ``n`` indices uniformly with replacement.
    random_state : seed for the draw (``None`` uses numpy's global generator);
        forwarded to the scikit-learn splitter when ``replace`` is false.

    Returns a pair ``(others, sample)``. With replacement, ``sample`` may
    contain duplicates and ``others`` holds every index that was never drawn.
    """
    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.RandomState(random_state)
        # randint's upper bound is exclusive, so this draws from
        # 0 .. len(table) - 1 exactly as the deprecated
        # random_integers(0, len(table) - 1, n) did.
        sample = rgen.randint(0, len(table), n)
        not_drawn = np.ones(len(table))
        not_drawn[sample] = 0
        others = np.nonzero(not_drawn)[0]
        return others, sample

    if stratified and is_discrete(table.domain.class_var):
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(
            len(table), n_iter=1, test_size=n, random_state=random_state)
    return next(iter(ind))
def testing_cycle(data, classes, gamma=0, C=0, loops=50):
    """Train and score an RBF-kernel SVM on ``loops`` stratified 70/30 splits.

    data : indexable collection of samples.
    classes : indexable collection of class labels, parallel to ``data``.
    gamma, C : passed straight to ``svm.SVC``.
    loops : number of random stratified splits to evaluate.

    Prints the mean and best accuracy over all splits and shows a histogram
    of the per-split accuracies. Returns nothing.
    """
    # Random (but class-balanced) train/test groups.
    splits = cval.StratifiedShuffleSplit(classes, n_iter=loops, test_size=0.3)

    scores = []
    for train_indices, test_indices in splits:
        # A fresh classifier per split so nothing leaks between iterations.
        classifier = svm.SVC(gamma=gamma, C=C, kernel='rbf')

        train_data = [data[i] for i in train_indices]
        train_classes = [classes[i] for i in train_indices]
        test_data = [data[i] for i in test_indices]
        test_classes = [classes[i] for i in test_indices]

        classifier.fit(train_data, train_classes)
        predictions = classifier.predict(test_data)

        hits = sum(1 for guess, truth in zip(predictions, test_classes)
                   if guess == truth)
        scores.append(hits / float(len(predictions)))

    # Single-argument print calls give the same text on Python 2 and 3
    # (the doubled space reproduces the old statement's separator).
    print('Average score:  %s' % np.mean(scores))
    print('Best score:  %s' % max(scores))
    print('\n\n\n')
    plt.hist(scores)
    plt.show()
def fit_ann(data_X, data_Y):
    """Fit a fixed-configuration MLP classifier and report its scores.

    The network is fitted once on all of ``data_X`` / ``data_Y`` (timed),
    then scored on that same data and handed to ``training_score`` together
    with a 5-iteration stratified 70/30 splitter.

    Returns ``(cv_score, train_score, auc, time_ann)`` where ``train_score``
    is a percentage and ``time_ann`` is the fit time as measured by ``time()``.
    """
    ann = MLPClassifier(
        alpha=1,
        hidden_layer_sizes=(12, 12, 12),
        solver='adam',
        learning_rate='adaptive',
        learning_rate_init=0.001,
        momentum=0.4,
        max_iter=500)

    t_start = time()
    ann.fit(data_X, data_Y)
    time_ann = time() - t_start

    cv = cross_validation.StratifiedShuffleSplit(
        data_Y, n_iter=5, test_size=0.3, random_state=42)

    train_score = round(ann.score(data_X, data_Y), 4) * 100
    cv_score, auc = training_score(ann, data_X, data_Y, cv)

    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("\ntrain {0:.2f} cv: {1:.2f} auc: {2:.2f} time {3:.4f}".format(
        train_score, cv_score, auc, time_ann))

    return cv_score, train_score, auc, time_ann
def cv_select(y, random_state, n_cv, cv, test_size=0.1):
    """Turn a cross-validation name into a splitter object.

    y : target values; used directly by the stratified splitters and via
        ``len(y)`` by the others.
    random_state : forwarded to the randomised splitters.
    n_cv : number of iterations / folds.
    cv : either one of the names ``'shuffle'``, ``'loo'``, ``'kfold'``,
        ``'boot'``, ``'boot632'``, ``'_shuffle'``, ``'_kfold'`` (the
        underscore variants are the non-stratified ones for regression), or
        any non-string object, which is returned unchanged.
    test_size : test fraction for the shuffle / bootstrap splitters.

    Raises ValueError for an unknown name.
    """
    if not isinstance(cv, basestring):
        return cv

    # Builders are lazy so only the requested splitter is ever constructed.
    builders = {
        'shuffle': lambda: cross_validation.StratifiedShuffleSplit(
            y, n_cv, test_size=test_size, random_state=random_state),
        # NOTE(review): LeaveOneOut receives n_cv, not len(y) -- confirm that
        # callers pass the sample count as n_cv for 'loo'.
        'loo': lambda: cross_validation.LeaveOneOut(n_cv),
        'kfold': lambda: cross_validation.StratifiedKFold(y, n_folds=n_cv),
        'boot': lambda: cross_validation.Bootstrap(
            len(y), n_iter=n_cv, train_size=(1 - test_size),
            random_state=random_state),
        'boot632': lambda: bootstrap_632(
            len(y), n_iter=n_cv, random_state=random_state),
        '_shuffle': lambda: cross_validation.ShuffleSplit(
            len(y), n_iter=n_cv, test_size=test_size,
            random_state=random_state),
        '_kfold': lambda: cross_validation.KFold(len(y), n_folds=n_cv),
    }
    if cv not in builders:
        raise ValueError("bad cv:%s" % cv)
    return builders[cv]()
def train_on_features(self, clf=None, parameters=None, cv=None):
    """Grid-search a classifier on ``self.features`` / ``self.labels``.

    clf : estimator to tune; a ``LinearSVC`` is used when omitted.
    parameters : parameter grid for ``GridSearchCV``.
    cv : cross-validation splitter; defaults to a 10-iteration stratified
        50/50 shuffle split over ``self.labels``.

    The best estimator found is stored in ``self.svc``. Returns ``self``.
    Raises ValueError when features or labels have not been created yet.
    """
    if not hasattr(self, "features"):
        raise ValueError(
            "No features present, have you run create_features_set?")
    if not hasattr(self, "labels"):
        raise ValueError(
            "No labels present, have you run create_features_set?")

    self.svc = LinearSVC() if clf is None else clf

    if cv is None:
        cv = cross_validation.StratifiedShuffleSplit(
            (self.labels), test_size=0.5, n_iterations=10)
    if parameters is None:
        parameters = {"dual": [False, False]}

    search = GridSearchCV(self.svc, parameters, score_func=f1_score,
                          cv=cv, verbose=0, n_jobs=1)
    search.fit(self.features, self.labels)
    self.svc = search.best_estimator_
    return self
def test_stratified_shuffle_split_iter():
    """Stratified shuffle splits keep class sets, proportions and disjointness."""
    ys = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50),
    ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0, indices=True)
        for train, test in sss:
            # Every class must appear on both sides of the split.
            assert_array_equal(unique(y[train]), unique(y[test]))

            # Class proportions must agree to one decimal place.
            p_train = (np.bincount(unique(y[train], return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(unique(y[test], return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)

            # The two parts together cover every sample exactly once.
            assert_equal(y[train].size + y[test].size, y.size)
            # Public np.intersect1d replaces the private np.lib.arraysetops
            # path, which newer NumPy releases no longer expose.
            assert_array_equal(np.intersect1d(train, test), [])
def do_cv(clf, X, y, n_samples=1000, n_iter=3, test_size=0.1, quiet=False,
          scoring=None, stratified=False, fit_params=None,
          reseed_classifier=True, n_jobs=-1):
    """Cross-validate ``clf`` on shuffle splits and summarise the scores.

    clf : estimator to evaluate (re-seeded first when ``reseed_classifier``).
    X, y : data and targets handed to ``cross_val_score``.
    n_samples : size passed to the splitter; floats are truncated and the
        value is capped at ``X.shape[0]`` when ``X`` exposes a shape.
    n_iter, test_size : splitter settings.
    quiet : suppress the timing message.
    scoring : scorer name; falls back to ``cfg['scoring']``.
    stratified : use ``StratifiedShuffleSplit`` over ``y`` instead of
        ``ShuffleSplit``.
    fit_params, n_jobs : forwarded to ``cross_val_score``.

    Returns ``(mean, standard error)`` of the test scores.
    """
    t0 = time.time()
    if reseed_classifier:
        reseed(clf)

    # isinstance also covers float subclasses such as numpy.float64.
    if isinstance(n_samples, float):
        n_samples = int(n_samples)

    try:
        if n_samples > X.shape[0]:
            n_samples = X.shape[0]
    except Exception:
        # Best effort only: X may not expose a usable .shape. Unlike a bare
        # except, this still lets KeyboardInterrupt / SystemExit propagate.
        pass

    if stratified:
        cv = cross_validation.StratifiedShuffleSplit(
            y, n_iter, train_size=n_samples, test_size=test_size,
            random_state=cfg['sys_seed'])
    else:
        cv = cross_validation.ShuffleSplit(
            n_samples, n_iter=n_iter, test_size=test_size,
            random_state=cfg['sys_seed'])

    test_scores = cross_validation.cross_val_score(
        clf, X, y, cv=cv, scoring=scoring or cfg['scoring'],
        fit_params=fit_params, n_jobs=n_jobs)

    if not quiet:
        dbg('%s took: %.2fm' % (mean_score(test_scores),
                                (time.time() - t0) / 60))
    return (np.mean(test_scores), sem(test_scores))
def splitDatasetInBlocks(data, labels, trainBlockSizes, testSetPercentage):
    """Build five stratified train/test splits for every requested block size.

    data, labels : parallel collections supporting index-array selection.
    trainBlockSizes : sequence of train sizes, one block per entry.
    testSetPercentage : multiplier applied to each train size to obtain the
        matching test size.

    Returns four lists (train data, train labels, test data, test labels);
    each has one entry per block size and each entry lists the five splits.
    """
    trainDataBlocks, trainLabelBlocks = [], []
    testDataBlocks, testLabelBlocks = [], []

    for blockSize in trainBlockSizes:
        # NOTE(review): this product is a float whenever testSetPercentage is
        # one -- confirm the splitter is meant to receive it in that form.
        testSize = testSetPercentage * blockSize
        splitter = cross_validation.StratifiedShuffleSplit(
            labels, 5, train_size=blockSize, test_size=testSize)

        folds = list(splitter)
        trainDataBlocks.append([data[trainIdx] for trainIdx, _ in folds])
        trainLabelBlocks.append([labels[trainIdx] for trainIdx, _ in folds])
        testDataBlocks.append([data[testIdx] for _, testIdx in folds])
        testLabelBlocks.append([labels[testIdx] for _, testIdx in folds])

    return trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks
def train_and_test_model(data, response, labels, model_type, split_by, c,
                         impute=True, varname=""):
    """Cross-validate, plot, then refit a model for one response variable.

    The model, its kind and its display name come from ``models[model_type]``.
    Kind ``'c'`` is evaluated on a single stratified shuffle split of
    ``response``; every other kind uses leave-one-label-out over ``labels``.
    Held-out predictions are collected (rows never held out stay zero),
    plotted against ``response``, and the model is finally refitted on all
    data and returned.

    ``split_by``, ``c`` and ``impute`` are accepted but not used here.
    """
    model, model_kind, model_string = models[model_type]

    if model_kind == 'c':
        splitter = cross_validation.StratifiedShuffleSplit(response, 1, 0.2)
    else:
        # KFold and LeavePLabelOut were tried here before settling on this.
        splitter = cross_validation.LeaveOneLabelOut(labels)

    predicted = np.zeros(response.shape)
    for train_rows, test_rows in splitter:
        model.fit(data[train_rows], response[train_rows])
        predicted[test_rows] = model.predict(data[test_rows])

    plot_obs_pred(predicted, response,
                  "%s Model Performance" % model_string, varname)

    model.fit(data, response)
    return model
def loadData(filename):
    """Load a comma-separated data set and split it 70/30, stratified by class.

    Every non-blank line holds the feature values followed by the class name
    in the last column. Class names are mapped to integer codes (the position
    of the name in ``list(set(names))``) and additionally one-hot encoded.

    Returns ``(X_train, X_test, y_train, targets_train, targets_test)`` where
    ``y_train`` is the one-hot matrix for the training rows and the
    ``targets_*`` arrays hold the integer class codes.
    """
    # The with-block closes the file even if parsing fails; blank lines
    # (e.g. a trailing newline) are skipped so rows stay rectangular.
    with open(filename, 'r') as infile:
        rows = [line.strip().split(',') for line in infile if line.strip()]
    data = np.array(rows)

    # Encode class names through a lookup table instead of overwriting the
    # string column in place: in-place assignment could confuse a freshly
    # written code with a not-yet-processed class name (e.g. classes named
    # "0" and "1") and could truncate codes to the column's string width.
    raw_targets = data[:, -1]
    target_names = list(set(raw_targets))
    code_of = dict((name, code) for code, name in enumerate(target_names))
    targets = np.array([code_of[name] for name in raw_targets], dtype=int)

    X = data[:, :-1].astype(float)

    # Split 70/30, preserving the share of samples of each class. The global
    # seed makes the split reproducible (the splitter has no random_state).
    np.random.seed(0)
    cv = cross_validation.StratifiedShuffleSplit(targets, n_iter=1,
                                                 test_size=0.3)

    # One-hot encode: 0 -> (1,0,0), 1 -> (0,1,0), 2 -> (0,0,1), ...
    y = np.zeros((len(targets), len(target_names)))
    y[np.arange(len(targets)), targets] = 1

    train_index, test_index = next(iter(cv))
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    targets_train, targets_test = targets[train_index], targets[test_index]

    return X_train, X_test, y_train, targets_train, targets_test
def test_ionosphere():
    """Grid-evaluate BasicOSI on the ionosphere data with 5x2 cross-validation.

    For every (window, num_particles) pair, five stratified 50/50 splits are
    drawn and each is evaluated in both directions. The mean MSE, accuracy
    and evaluation count are printed, and the summary plus the raw
    per-fold values are appended to the results file.
    """
    # The with-block guarantees the results file is closed even when a run
    # fails part-way through.
    with open("../results/basic_ionosphere_cv_results.txt", 'w') as f:
        rng = np.random.RandomState()
        params = {
            'window': [5, 10, 15, 20, 25, 30],
            'num_particles': [5, 10, 15, 20]
        }

        X = np.genfromtxt('../data/ionosphere.data', delimiter=',')[:, :-1]
        Y = np.genfromtxt('../data/ionosphere.data', delimiter=',',
                          usecols=[-1], dtype='str')
        le = LabelEncoder()
        y = le.fit_transform(Y)

        for w in params['window']:
            for p in params['num_particles']:
                # do a 5x2 cross val
                sss = cv.StratifiedShuffleSplit(y, n_iter=5, test_size=0.5,
                                                random_state=rng)
                mses, accs, evals = [], [], []
                for train_index, test_index in sss:
                    # Each half serves once for fitting and once for testing.
                    for fit_index, eval_index in ((train_index, test_index),
                                                  (test_index, train_index)):
                        mse, acc, ev = xval(
                            BasicOSI(n_hidden=[5], num_particles=p, window=w,
                                     random_state=rng, validation_size=0.33,
                                     verbose=False),
                            X, y, fit_index, eval_index)
                        mses.append(mse)
                        accs.append(acc)
                        evals.append(ev)

                summary = ",".join(map(str, [w, p, np.mean(mses),
                                             np.mean(accs), np.mean(evals)]))
                # Single-argument print behaves the same on Python 2 and 3.
                print(summary)
                f.write("\n" + summary)
                f.write("\n" + ",".join(map(str, mses)))
                f.write("\n" + ",".join(map(str, accs)))
                f.write("\n" + ",".join(map(str, evals)))
                # Flush per configuration so partial results survive a crash.
                f.flush()
def genTrainTest(df, features, queries, ranks, ts=.5):
    """Split a data frame into scaled train/test features, ranks and blocks.

    df : data frame holding all columns.
    features : column selector for the feature matrix.
    queries : column name of the query/block identifiers.
    ranks : column name of the rank targets; also used for stratification.
    ts : test-set share handed to the splitter.

    Returns ``(X_train, y_train, b_train, X_test, y_test, b_test)``. Features
    are min-max scaled with a scaler fitted on the training part only.
    """
    X = np.asarray(df[features])
    blocks = np.asarray(list(df[queries]))
    y = np.asarray(list(df[ranks]))

    # Only the first split of the stratified splitter is used. next(...) works
    # on Python 2.6+ and 3, unlike the Python-2-only iterator .next() method.
    cv = cross_validation.StratifiedShuffleSplit(df[ranks], test_size=ts)
    train, test = next(iter(cv))

    X_train, y_train, b_train = X[train], y[train], blocks[train]
    X_test, y_test, b_test = X[test], y[test], blocks[test]

    # Scale features to [0, 1] on the training data. Mean/SD scaling doesn't
    # make sense with so many factor variables (each topic would have a
    # different range).
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    # Same transformation on the test data, which may therefore fall outside
    # [0, 1].
    X_test = scaler.transform(X_test)

    return X_train, y_train, b_train, X_test, y_test, b_test
def load_dataset(limit=None, skip=0):
    """Load the data set via ``db2np`` and carve out a validation part.

    limit, skip : forwarded to ``db2np``.

    The split is a single stratified shuffle split on the first column of
    ``y``. Returns ``(X_train, y_train, X_val, y_val, X, y, ids)``.
    """
    X, y, ids = db2np(db_trans, limit=limit, skip=skip)

    splitter = cross_validation.StratifiedShuffleSplit(
        y[:, 0], n_iter=1, test_size=VALIDATION_SIZE, random_state=SEED)
    # n_iter=1, so the first split is the only one.
    train_rows, val_rows = next(iter(splitter))

    X_train, y_train = X[train_rows], y[train_rows]
    X_val, y_val = X[val_rows], y[val_rows]
    return X_train, y_train, X_val, y_val, X, y, ids
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    the instances of the input table that are not in the sample. Uses
    sampling functions from
    `scikit-learn <http://scikit-learn.org>`_ when sampling without
    replacement.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    # isinstance (rather than an exact type comparison) also recognises float
    # subclasses such as numpy.float64 as a proportion.
    if isinstance(n, float):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.RandomState(random_state)
        picked = rgen.randint(0, len(table), n)
        # Everything that was never drawn forms the complement.
        not_drawn = np.ones(len(table))
        not_drawn[picked] = 0
        others = np.nonzero(not_drawn)[0]
        return table[picked], table[others]

    # The splitters below are parameterised by the size of the complement
    # (their "test" part); the requested sample is their "train" part.
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(
            len(table), n_iter=1, test_size=n, random_state=random_state)
    ind = next(iter(ind))
    return table[ind[0]], table[ind[1]]
def test_iris():
    """Grid-evaluate BasicOSI on the iris data with 5x2 cross-validation.

    For every (window, num_particles) pair, five stratified 50/50 splits are
    drawn and each is evaluated in both directions. The mean MSE, accuracy
    and evaluation count are printed, and the summary plus the raw
    per-fold values are appended to the results file.
    """
    # The with-block guarantees the results file is closed even when a run
    # fails part-way through.
    with open("../results/basic_iris_cv_results.txt", 'w') as f:
        rng = np.random.RandomState()
        params = {
            'window': [5, 10, 15, 20, 25, 30],
            'num_particles': [5, 10, 15, 20]
        }

        iris = datasets.load_iris()

        for w in params['window']:
            for p in params['num_particles']:
                # do a 5x2 cross val
                sss = cv.StratifiedShuffleSplit(iris.target, n_iter=5,
                                                test_size=0.5,
                                                random_state=rng)
                mses, accs, evals = [], [], []
                for train_index, test_index in sss:
                    # Each half serves once for fitting and once for testing.
                    for fit_index, eval_index in ((train_index, test_index),
                                                  (test_index, train_index)):
                        mse, acc, ev = xval(
                            BasicOSI(n_hidden=[3], num_particles=p, window=w,
                                     random_state=rng, validation_size=0.33,
                                     verbose=False),
                            iris.data, iris.target, fit_index, eval_index)
                        mses.append(mse)
                        accs.append(acc)
                        evals.append(ev)

                summary = ",".join(map(str, [w, p, np.mean(mses),
                                             np.mean(accs), np.mean(evals)]))
                # Single-argument print behaves the same on Python 2 and 3.
                print(summary)
                f.write("\n" + summary)
                f.write("\n" + ",".join(map(str, mses)))
                f.write("\n" + ",".join(map(str, accs)))
                f.write("\n" + ",".join(map(str, evals)))
                # Flush per configuration so partial results survive a crash.
                f.flush()
def load_dataset(limit=None, skip=0):
    """Load the MongoDB training set and carve out a validation part.

    limit, skip : forwarded to ``db2np``.

    The split is a single stratified shuffle split on the second column of
    ``y``. Returns ``(X_train, y_train, X_val, y_val, X_val, y_val)``.
    """
    client = pymongo.MongoClient("192.168.0.99:30000")
    db_trans = client["google"]["trainingset"]
    X, y = db2np(db_trans, limit=limit, skip=skip)

    splitter = cross_validation.StratifiedShuffleSplit(
        y[:, 1], n_iter=1, test_size=VALIDATION_SIZE, random_state=SEED)
    # n_iter=1, so the first split is the only one.
    train_rows, val_rows = next(iter(splitter))

    X_train, y_train = X[train_rows], y[train_rows]
    X_val, y_val = X[val_rows], y[val_rows]
    # NOTE(review): the validation arrays are returned twice; confirm whether
    # the last two values were meant to be the full X and y.
    return X_train, y_train, X_val, y_val, X_val, y_val
def setup_indices(self, train_data, test_data):
    """Create the resampling splitter for ``test_data`` in ``self.indices``.

    A stratified shuffle split over ``test_data.Y`` is used when
    stratification is enabled and the class is discrete; otherwise a plain
    shuffle split over ``len(test_data)``. ``train_data`` is not used.
    """
    use_strata = self.stratified and test_data.domain.has_discrete_class

    # Settings shared by both splitter flavours.
    common = dict(
        n_iter=self.n_resamples,
        train_size=self.train_size,
        test_size=self.test_size,
        random_state=self.random_state,
    )

    if use_strata:
        self.indices = skl_cross_validation.StratifiedShuffleSplit(
            test_data.Y, **common)
    else:
        self.indices = skl_cross_validation.ShuffleSplit(
            len(test_data), **common)
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    """Return train and test index arrays from one stratified split.

    Labels are derived from ``files`` via ``get_names`` / ``get_labels`` and
    the split is stratified on their first column. Every split index ``i`` is
    expanded to the pair ``2*i`` and ``2*i + 1`` (all even entries first,
    then all odd ones).
    """
    # NOTE(review): the ``labels`` argument is overwritten right away, so the
    # value passed by the caller is never used -- confirm this is intended.
    labels = get_labels(get_names(files), per_patient=True)

    splitter = cross_validation.StratifiedShuffleSplit(
        labels[:, 0], test_size=test_size,
        random_state=random_state, n_iter=1)
    train_part, test_part = next(iter(splitter))

    def expand(idx):
        return np.hstack([idx * 2, idx * 2 + 1])

    return expand(train_part), expand(test_part)
def cv_loop(X, y, model, rseed=42, n_iter=8):
    """Return the mean ROC-AUC of ``model`` over stratified shuffle splits.

    rseed : random state of the splitter.
    n_iter : number of splits to evaluate.
    """
    splitter = cross_validation.StratifiedShuffleSplit(
        y, random_state=rseed, n_iter=n_iter)
    auc_per_split = cross_validation.cross_val_score(
        model, X, y, scoring='roc_auc', n_jobs=1, cv=splitter)
    return np.mean(auc_per_split)
def get_classifier_scores(clf_class, clf_kwargs, features_train,
                          features_test, labels_train, labels_test,
                          feature_list):
    """Tune a classifier by grid search and score it on the test data.

    clf_class : classifier class, instantiated without arguments.
    clf_kwargs : parameter grid handed to ``GridSearchCV``.
    features_train, labels_train : data used for the recall-scored grid
        search (50 stratified shuffle splits).
    features_test, labels_test : data used for the reported scores.
    feature_list : feature names used when printing the feature ranking.

    Returns ``(precision, recall, accuracy, best_params, best_configuration)``
    where ``best_configuration`` is the string form of the best estimator.
    """
    clf = clf_class()

    crossval = cross_validation.StratifiedShuffleSplit(
        labels_train, 50, test_size=gl_test_size,
        random_state=gl_random_state)

    grid_search = GridSearchCV(clf, clf_kwargs, cv=crossval,
                               scoring='recall')
    grid_search.fit(features_train, labels_train)
    best_clf = grid_search.best_estimator_

    predictions = best_clf.predict(features_test)
    scores = {
        "accuracy": accuracy_score(labels_test, predictions),
        "precision": precision_score(labels_test, predictions),
        "recall": recall_score(labels_test, predictions),
    }

    # Snapshot as a string so the value cannot change with the estimator.
    best_configuration = str(grid_search.best_estimator_)

    # Feature ranking is best effort: not every classifier exposes
    # feature_importances_. "except Exception" keeps that tolerance while
    # letting KeyboardInterrupt / SystemExit propagate.
    try:
        importances = best_clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        print("Feature ranking:")
        for f in range(features_train.shape[1]):
            print("%d. feature %s (%f)" % (f + 1, feature_list[indices[f]],
                                           importances[indices[f]]))
    except Exception:
        print("no importances available for classifier " + str(clf_class))

    return scores["precision"], scores["recall"], scores["accuracy"], \
        grid_search.best_params_, best_configuration
def cv_loop(X, y, model, N, N_JOBS=4, seed=25):
    """Return the average ROC-AUC of ``model`` over ``N`` stratified splits.

    N_JOBS : used both as ``n_jobs`` and ``pre_dispatch``.
    seed : random state of the splitter.
    """
    splitter = cross_validation.StratifiedShuffleSplit(
        y, random_state=seed, n_iter=N)
    auc_per_split = cross_validation.cross_val_score(
        model, X, y,
        scoring='roc_auc',
        pre_dispatch=N_JOBS,
        n_jobs=N_JOBS,
        cv=splitter)
    total = sum(auc_per_split)
    return total / N
def create_test_split(X, y, test_size=0.3):
    """Split ``X`` and ``y`` once, stratified on ``y``.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    splitter = cross_validation.StratifiedShuffleSplit(
        y, 1, test_size=test_size)
    # A single iteration is requested, so the first split is the only one.
    train_rows, test_rows = next(iter(splitter))
    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]
def train_final_model(traindata, targets):
    """Cross-validate, report and fit the final logistic-regression model.

    The model is scored with ``metrics.auc_score`` on four stratified shuffle
    splits; the mean and half the standard deviation of those scores are
    printed. The model is then fitted on all of ``traindata`` and returned.
    """
    model = linear_model.LogisticRegression(penalty='l2', dual=True, C=0.1,
                                            fit_intercept=True)

    cv = cross_validation.StratifiedShuffleSplit(targets, n_iter=4)
    scores = cross_validation.cross_val_score(
        model, traindata, targets,
        cv=cv, n_jobs=-1, score_func=metrics.auc_score)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Cross-validation accuracy on the training set for final model:")
    print("%0.3f (+/-%0.03f)" % (scores.mean(), scores.std() / 2))

    model.fit(traindata, targets)
    return model
def test_stratified_shuffle_split_overlap_train_test_bug():
    """Train and test indices of a stratified shuffle split never overlap.

    Regression test for
    https://github.com/scikit-learn/scikit-learn/issues/6121
    """
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splitter = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                           test_size=0.5, random_state=0)
    train_part, test_part = next(iter(splitter))

    shared = np.intersect1d(train_part, test_part)
    assert_array_equal(shared, [])
def createValidation(data, labels, test, train, ids):
    """Split ``data`` / ``labels`` once with a fixed-seed stratified split.

    test, train : test and train sizes handed to the splitter.
    ids : indexed with the test rows, but the result is not returned.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    splitter = cross_validation.StratifiedShuffleSplit(
        labels, 1, test_size=test, train_size=train, random_state=0)
    # A single iteration is requested, so the first split is the only one.
    train_rows, test_rows = next(iter(splitter))

    X_train, X_test = data[train_rows], data[test_rows]
    y_train, y_test = labels[train_rows], labels[test_rows]
    # Still looked up although unused: an earlier variant returned it too.
    test_ids = ids[test_rows]
    return X_train, X_test, y_train, y_test
def _split_shuffle(self, data, all_labels):
    """Split ``data`` into train and test lists, stratified on ``all_labels``.

    data : sequence of dict-like elements, each with a ``"label"`` entry
        that is iterated over.
    all_labels : labels handed to the stratified splitter.

    Side effects: pickles the chosen split to ``./data/sto/exp_data.json``
    and prints per-label counts and the sizes of both parts.

    Returns ``(data_train, data_test)``.
    """
    # Collect the positions of elements carrying the "Other" label (only when
    # those are not configured to be removed).
    other_indices = []
    ## extract ids of posts classified as "Other"
    if not self.config.remove_other:
        for i, ele in enumerate(data):
            for label in ele["label"]:
                if label == self.config.other_id:
                    other_indices.append(i)
    all_indices = np.arange(len(data))
    # NOTE(review): re_indices and other_indices are only referenced by the
    # disabled random-shuffle variant mentioned at the end of this function.
    re_indices = list(set(all_indices) - set(other_indices))
    ## extract samples according to distributions of classes
    skf = cross_validation.StratifiedShuffleSplit(all_labels, 2, test_size=0.4, random_state=0)
    # NOTE(review): two splits are requested; as laid out here each iteration
    # overwrites the previous one, so only the last split is used below --
    # confirm that this matches the intended loop extent.
    for train_index, test_index in skf:
        # X_train, X_test = re_indices[train_index], re_indices[test_index]
        data_train = [data[id] for id in train_index]
        data_test = [data[id] for id in test_index]
    # Persist the split (pickle format, despite the .json file name).
    exp_data = {"data_train": data_train, "data_test": data_test}
    exp_fw = open(os.path.join(".", "data", "sto", "exp_data.json"), "wb")
    pickle.dump(exp_data, exp_fw)
    exp_fw.close()
    # Count how often each label occurs in the training part.
    train_dict = {}
    for data_ele in data_train:
        for sub_label in data_ele["label"]:
            if sub_label not in train_dict:
                train_dict[sub_label] = 0
            train_dict[sub_label] += 1
    print("training data ", train_dict)
    # Same label count for the test part.
    test_dict = {}
    for data_ele in data_test:
        for sub_label in data_ele["label"]:
            if sub_label not in test_dict:
                test_dict[sub_label] = 0
            test_dict[sub_label] += 1
    print("test data ", test_dict)
    print("Len of training data is ", len(data_train), "; Len of test data is ", len(data_test))
    ## random shuffle (disabled alternative)
    # np.random.shuffle(re_indices)
    # num_test = int(len(re_indices) * 0.2)
    #
    # data_train = [data[id] for id in re_indices[:-num_test]]
    # data_test = [data[id] for id in re_indices[-num_test:] + other_indices]
    return data_train, data_test
def split(df, fraction_test):
    """Split ``df`` once, stratified on its ``yyyymm`` column.

    fraction_test : test size handed to the splitter.

    Returns ``(test, train)`` -- note the order -- as ``df.iloc`` selections.
    """
    splitter = cross_validation.StratifiedShuffleSplit(
        y=df.yyyymm,
        n_iter=1,
        test_size=fraction_test,
        train_size=None,
        random_state=control.random_seed,
    )
    assert len(splitter) == 1

    train_rows, test_rows = next(iter(splitter))
    train = df.iloc[train_rows]
    test = df.iloc[test_rows]
    return test, train
def load_dataset(limit=None, skip=0):
    """Load the MongoDB "transformedset" and carve out a validation part.

    limit, skip : forwarded to ``db2np``.

    The split is a single stratified shuffle split on ``y`` with a 0.2 test
    share and a fixed seed. Returns
    ``(X_train, y_train, X_val, y_val, X, y)``.
    """
    # db2np is taken from the enclosing scope; its import from
    # get_data.build_trainingset is disabled here.
    import pymongo
    from sklearn import cross_validation

    client = pymongo.MongoClient("192.168.0.99:30000")
    db_trans = client["google"]["transformedset"]
    X, y = db2np(db_trans, limit=limit, skip=skip)

    splitter = cross_validation.StratifiedShuffleSplit(
        y, n_iter=1, test_size=.2, random_state=3476)
    # n_iter=1, so the first split is the only one.
    train_rows, val_rows = next(iter(splitter))

    X_train, y_train = X[train_rows], y[train_rows]
    X_val, y_val = X[val_rows], y[val_rows]
    return X_train, y_train, X_val, y_val, X, y