import numpy as np
from pylearn2.utils.rng import make_np_rng
from sklearn.cross_validation import StratifiedShuffleSplit


def split_patients(patients, valid_percent, test_percent, rng=(2014, 10, 22)):
    # Accept either a ready-made RNG or a seed tuple/list.
    if isinstance(rng, (list, tuple)):
        rng = make_np_rng(None, rng, which_method='uniform')
    # Wrap in list() so this also works on Python 3 dict views.
    vals = np.asarray(list(patients.values()))
    keys = np.asarray(list(patients.keys()))
    # First split: carve the test set out of all patients, stratified on the
    # patient labels (legacy sklearn.cross_validation API, where the labels
    # are passed to the constructor).
    sss = StratifiedShuffleSplit(
        vals, n_iter=1, test_size=test_percent, random_state=rng)
    remaining_idx, test_idx = next(iter(sss))
    if valid_percent > 0:
        # Rate of samples required to build validation set
        valid_rate = valid_percent / (1 - test_percent)
        sss = StratifiedShuffleSplit(
            vals[remaining_idx], n_iter=1, test_size=valid_rate,
            random_state=rng)
        tr_idx, val_idx = next(iter(sss))
        # Map indices of the second split back to the original arrays.
        valid_idx = remaining_idx[val_idx]
        train_idx = remaining_idx[tr_idx]
    else:
        train_idx = remaining_idx
        valid_idx = []
    train_patients = dict(zip(keys[train_idx], vals[train_idx]))
    valid_patients = dict(zip(keys[valid_idx], vals[valid_idx]))
    test_patients = dict(zip(keys[test_idx], vals[test_idx]))
    return train_patients, valid_patients, test_patients
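

# Usage sketch for split_patients (toy data; the patient ids and labels
# below are made up for illustration). Each patient id maps to its class
# label, which is what both splits stratify on.
patients = {'p%02d' % i: i % 2 for i in range(20)}
train, valid, test = split_patients(patients, valid_percent=0.2,
                                    test_percent=0.2)
print(len(train), len(valid), len(test))  # roughly a 12 / 4 / 4 split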


from sklearn import datasets
from sklearn.cross_validation import StratifiedShuffleSplit


def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Generate simple classification task for training.

    Parameters
    ----------
    n_samples : int
        Number of samples in dataset.
    n_features : int
        Number of features for each sample.
    random_state : int
        Random state to make results reproducible.

    Returns
    -------
    tuple
        Tuple that contains 4 variables: input train, input test,
        target train and target test, respectively.
    """
    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        random_state=random_state)
    # Stratified split that keeps the class ratio in both folds;
    # train_size=0.6 fixes the size of the training fold.
    shuffle_split = StratifiedShuffleSplit(y, 1, train_size=0.6,
                                           random_state=random_state)
    train_index, test_index = next(iter(shuffle_split))
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    return x_train, x_test, y_train, y_test
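

# Usage sketch: both folds keep the class ratio of the full dataset. The
# test-fold size follows the legacy API's test_size default, so only the
# train-fold size (60% of the rows) is asserted here.
x_train, x_test, y_train, y_test = simple_classification(n_samples=200)
assert x_train.shape[0] == 120
print(y_train.mean(), y_test.mean())  # similar positive-class rates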


# Requires: from sklearn import datasets
#           from sklearn.cross_validation import StratifiedShuffleSplit
def setUp(self):
    super(QuasiNewtonTestCase, self).setUp()
    X, y = datasets.make_classification(n_samples=100, n_features=10,
                                        random_state=33)
    # Fixed random_state keeps the fixture deterministic across test runs.
    shuffle_split = StratifiedShuffleSplit(y, 1, train_size=0.6,
                                           random_state=33)
    train_index, test_index = next(iter(shuffle_split))
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    self.X, self.y = X, y
    self.data = (x_train, x_test, y_train, y_test)
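

# A hypothetical test that would consume the fixture above (illustrative
# only; QuasiNewtonTestCase's real test methods are not shown here).
def test_fixture_shapes(self):
    x_train, x_test, y_train, y_test = self.data
    # train_size=0.6 of 100 samples gives a 60-row training fold
    assert x_train.shape[0] == 60
    assert x_train.shape[0] == y_train.shape[0]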


from random import shuffle
from sklearn.cross_validation import StratifiedShuffleSplit


# get_config and get_rng are project-local helpers (not shown here).
def get_rows_msr(data):
    conf = get_config()
    rng = get_rng()
    train = [data[1][idx] for idx in data[3]]
    test = [data[1][idx] for idx in data[2]]
    # test = [data[1][idx] for idx in data[4]]
    shuffle(train, rng.rand)
    train_y = [y for y, o, p in train]
    # Build a dev set from 20% of the training rows, stratified on the
    # labels so both folds keep the label distribution.
    sss = StratifiedShuffleSplit(
        train_y, 1, train_size=0.8, test_size=0.2, random_state=rng)
    train_index, dev_index = next(iter(sss))
    return ([train[i] for i in train_index],
            [train[i] for i in dev_index],
            test)
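

# Minimal sketch of the split get_rows_msr performs, on toy labels
# (legacy sklearn.cross_validation API; the label list is made up).
from sklearn.cross_validation import StratifiedShuffleSplit
labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
sss = StratifiedShuffleSplit(labels, 1, train_size=0.8, test_size=0.2,
                             random_state=0)
train_index, dev_index = next(iter(sss))
print(len(train_index), len(dev_index))  # 8 train rows, 2 dev rows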


dataset = task.get_dataset()
# Impute the values - while values would be imputed when calculating some
# meta-features anyway, this gives more control.
X, y, categorical = dataset.get_data(target=task.target_feature,
                                     return_categorical_indicator=True)
# X, categorical = remove_zero_columns(impute_values(X, categorical), categorical)

# Subsample landmarkers need folds; the train+test set of a subsample
# landmarker should be 500 instances, since that is the size of our smallest
# dataset. We first create a fold of 500 stratified samples, and then divide
# that selection again into 10 folds.
max_size = 500
number_of_classes = len(np.unique(y))
if y.shape[0] < (max_size + number_of_classes):
    subset_indices = np.arange(max_size)
else:
    subset_split = StratifiedShuffleSplit(y, n_iter=1, test_size=500,
                                          random_state=0)
    _, subset_indices = next(iter(subset_split))
mapped_folds = StratifiedShuffleSplit(y[subset_indices], n_iter=10,
                                      test_size=0.2, random_state=0)
# Map the fold indices back to positions in the full dataset.
subsample_folds = [(subset_indices[train], subset_indices[test])
                   for train, test in mapped_folds]

# Because the subsamples are of constant size (always 500), we calculate them
# just once per dataset, not once for every subsample of every dataset
# (those are stratified anyway).
log("subsample-mf")
subsample_features = subsample_metafeatures(X, y, categorical, subsample_folds)

# We also take subsets of the original dataset, because that creates a bigger
# meta-dataset to learn from.
for i in np.arange(0.1, 1.01, 0.1):
    # We want a minimum size of 500, otherwise predicting runtime is not that
    # useful anyway, and it avoids issues with train/test splits being too
    # small and timing not being measured accurately.
    if int(i * len(y)) >= 500: