def build_dataset_ids(oids, test, seed):
    """Build split dataset ids from all ids.

    Uses sklearn's ``cross_validation.LabelShuffleSplit`` twice: first to
    split ``oids`` into a train set (``tr_ids``) and a test set
    (``te_ids``), then to split the train ids 80/20 into ``trtr_ids``
    (train-from-train -- the model is fitted ONLY on these) and
    ``trte_ids`` (used to evaluate the model for saving).  The final
    prediction of classes is done on ``te_ids``.

    :param oids: sequence of ids to split; presumably ids from the NCBI
        database -- each id acts as a group label for the shuffle split.
        TODO(review): confirm against callers.
    :param test: test size as a fraction of the whole dataset (< 1.0) or
        an absolute count (integer).
    :param seed: random seed for replicating experiments.
    :return: dictionary with keys ``tr_ids``, ``te_ids``, ``trtr_ids``
        and ``trte_ids``, each mapping to a list of ids.
    """
    tr_ids, te_ids = _label_shuffle_split(oids, test, seed)
    # The intersection of the train and test id sets must be empty.
    assert set(tr_ids).intersection(set(te_ids)) == set()
    # Split the train ids again for model evaluation; the 0.2 test size
    # is hardcoded on purpose (80-20 split of the train data).
    trtr_ids, trte_ids = _label_shuffle_split(tr_ids, 0.2, seed)
    return {
        "tr_ids": tr_ids,
        "te_ids": te_ids,
        "trtr_ids": trtr_ids,
        "trte_ids": trte_ids,
    }


def _label_shuffle_split(ids, test_size, seed):
    """Run one LabelShuffleSplit iteration over *ids*.

    :param ids: sequence of ids used as group labels for the split.
    :param test_size: fraction (< 1.0) or absolute count for the test set.
    :param seed: random seed forwarded to LabelShuffleSplit.
    :return: ``(train_ids, test_ids)`` as two lists of ids.
    """
    splitter = cross_validation.LabelShuffleSplit(
        ids, n_iter=1, test_size=test_size, random_state=seed)
    train_ids, test_ids = [], []
    # n_iter=1, so this loop body runs exactly once.
    for train_index, test_index in splitter:
        train_ids = [ids[i] for i in train_index]
        test_ids = [ids[i] for i in test_index]
    return train_ids, test_ids
def test_label_shuffle_split():
    """Sanity checks for LabelShuffleSplit on several label layouts."""
    label_arrays = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
    ]
    n_iter = 6
    test_size = 1. / 3
    for labels in label_arrays:
        splitter = cval.LabelShuffleSplit(labels, n_iter,
                                          test_size=test_size,
                                          random_state=0)

        # The repr must not raise.
        repr(splitter)

        # The reported number of iterations matches what was requested.
        assert_equal(len(splitter), n_iter)

        unique_labels = np.unique(labels)
        n_unique = len(unique_labels)
        for train_idx, test_idx in splitter:
            train_labels = np.unique(labels[train_idx])
            test_labels = np.unique(labels[test_idx])

            # First check: no train label leaks into the test side and
            # vice versa.
            assert_false(np.any(np.in1d(labels[train_idx], test_labels)))
            assert_false(np.any(np.in1d(labels[test_idx], train_labels)))

            # Second check: train and test together cover all the data.
            assert_equal(labels[train_idx].size + labels[test_idx].size,
                         labels.size)

            # Third check: train and test index sets are disjoint.
            assert_array_equal(np.intersect1d(train_idx, test_idx), [])

            # Fourth check: the unique label counts on each side match the
            # requested proportions, allowing +-1 for rounding error.
            assert_true(
                abs(len(test_labels) - round(test_size * n_unique)) <= 1)
            assert_true(
                abs(len(train_labels)
                    - round((1.0 - test_size) * n_unique)) <= 1)