Example #1
File: load.py  Project: mkopar/VirClass
from sklearn import cross_validation  # deprecated module; removed in scikit-learn 0.20


def build_dataset_ids(oids, test, seed):
    """
    Build splitted dataset_ids from all ids.

    In this function we build datasets ids from NCBI database.
    With sklearn function LabelShuffleSplit we split whole dataset into train and test set. When built, test and train
    datasets are saved for later use.
    We want to evaluate our model only with train data, so we split train data into two parts
    (with LabelShuffleSplit function) by 80-20. Files tr_ (trX and trY) are actually trtr_ (train from train - so we
    are going to train our model ONLY on this data). With files trte_ we are going to evaluate our model for saving,
    and with te_ files we are going to actually predict final classes.
    The params helps you set some rules for building the data you want.
    :param oids: TODO
    :param test: test size in percentage of whole dataset (smaller than 1.0 or an integer)
    :param seed: random seed for replicating experiments
    :return: dictionary with all split ids for every dataset
    """
    datasets_ids = {"tr_ids": [], "te_ids": [], "trtr_ids": [], "trte_ids": []}

    ss = cross_validation.LabelShuffleSplit(oids, n_iter=1, test_size=test, random_state=seed)
    for train_index, test_index in ss:
        # we split ids to train and test
        datasets_ids["tr_ids"] = list(oids[i] for i in train_index)
        datasets_ids["te_ids"] = list(oids[i] for i in test_index)

        # intersection of train and test must be empty set
        assert set(datasets_ids["tr_ids"]).intersection(set(datasets_ids["te_ids"])) == set()

        # get train test IDs for evaluating model
        # hardcode test_size for train evaluation
        tr_ids = datasets_ids["tr_ids"]
        ss_tr = cross_validation.LabelShuffleSplit(tr_ids, n_iter=1, test_size=0.2, random_state=seed)
        for train_train_index, train_test_index in ss_tr:
            datasets_ids["trtr_ids"] = list(tr_ids[i] for i in train_train_index)
            datasets_ids["trte_ids"] = list(tr_ids[i] for i in train_test_index)

    return datasets_ids
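
A minimal usage sketch (the ids and parameter values below are made up for illustration; in the real project oids comes from NCBI records):

# hypothetical organism ids; repeats share a label, and LabelShuffleSplit
# keeps all samples with the same label on one side of the split
oids = ["v1", "v1", "v2", "v2", "v3", "v3", "v4", "v4"]

datasets_ids = build_dataset_ids(oids, test=0.25, seed=42)
print(sorted(datasets_ids))  # ['te_ids', 'tr_ids', 'trte_ids', 'trtr_ids']
assert set(datasets_ids["tr_ids"]).isdisjoint(datasets_ids["te_ids"])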
Example #2
import numpy as np

from sklearn import cross_validation as cval
from sklearn.utils.testing import (assert_array_equal, assert_equal,
                                   assert_false, assert_true)


def test_label_shuffle_split():
    ys = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
    ]

    for y in ys:
        n_iter = 6
        test_size = 1. / 3
        slo = cval.LabelShuffleSplit(y,
                                     n_iter,
                                     test_size=test_size,
                                     random_state=0)

        # Make sure the repr works
        repr(slo)

        # Test that the length is correct
        assert_equal(len(slo), n_iter)

        y_unique = np.unique(y)

        for train, test in slo:
            # First test: no train label is in the test set and vice versa
            y_train_unique = np.unique(y[train])
            y_test_unique = np.unique(y[test])
            assert_false(np.any(np.in1d(y[train], y_test_unique)))
            assert_false(np.any(np.in1d(y[test], y_train_unique)))

            # Second test: train and test add up to all the data
            assert_equal(y[train].size + y[test].size, y.size)

            # Third test: train and test are disjoint
            assert_array_equal(np.intersect1d(train, test), [])

            # Fourth test: the numbers of unique train and test labels are
            #              correct, +- 1 for rounding error
            assert_true(
                abs(len(y_test_unique) -
                    round(test_size * len(y_unique))) <= 1)
            assert_true(
                abs(
                    len(y_train_unique) -
                    round((1.0 - test_size) * len(y_unique))) <= 1)
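
For reference, the sklearn.cross_validation module exercised above was deprecated in scikit-learn 0.18 and removed in 0.20; the equivalent label-grouped splitting now lives in sklearn.model_selection as GroupShuffleSplit. A minimal sketch of the same invariant under the newer API (not part of the original test):

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

y = np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3])
gss = GroupShuffleSplit(n_splits=6, test_size=1. / 3, random_state=0)

# `groups` plays the role the labels array played for LabelShuffleSplit:
# whole groups go to either the train or the test side, never both
for train, test in gss.split(np.zeros((len(y), 1)), groups=y):
    assert set(y[train]).isdisjoint(y[test])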