예제 #1
0
def test_load_digits():
    digits = load_digits()
    assert_equal(digits.data.shape, (1797, 64))
    assert_equal(numpy.unique(digits.target).size, 10)

    # test return_X_y option
    check_return_X_y(digits, partial(load_digits))
예제 #2
0
def test_fetch_rcv1():
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")

    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test ordering of categories
    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
예제 #3
0
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))

    fetch_func = partial(fetch_kddcup99, 'smtp')
    check_return_X_y(data, fetch_func)
예제 #4
0
def test_load_wine():
    res = load_wine()
    assert_equal(res.data.shape, (178, 13))
    assert_equal(res.target.size, 178)
    assert_equal(res.target_names.size, 3)
    assert res.DESCR

    # test return_X_y option
    check_return_X_y(res, partial(load_wine))
예제 #5
0
def test_load_diabetes():
    res = load_diabetes()
    assert_equal(res.data.shape, (442, 10))
    assert_true(res.target.size, 442)
    assert_equal(len(res.feature_names), 10)
    assert_true(res.DESCR)

    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
예제 #6
0
def test_load_wine():
    res = load_wine()
    assert res.data.shape == (178, 13)
    assert res.target.size == 178
    assert res.target_names.size == 3
    assert res.DESCR

    # test return_X_y option
    check_return_X_y(res, partial(load_wine))
예제 #7
0
def test_load_diabetes():
    res = load_diabetes()
    assert_equal(res.data.shape, (442, 10))
    assert res.target.size, 442
    assert_equal(len(res.feature_names), 10)
    assert res.DESCR

    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
예제 #8
0
def test_load_wine():
    res = load_wine()
    assert_equal(res.data.shape, (178, 13))
    assert_equal(res.target.size, 178)
    assert_equal(res.target_names.size, 3)
    assert_true(res.DESCR)

    # test return_X_y option
    check_return_X_y(res, partial(load_wine))
예제 #9
0
def test_fetch_rcv1(fetch_rcv1_fxt):
    data1 = fetch_rcv1_fxt(shuffle=False)
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert 60915113 == X1.data.size
    assert 2606875 == Y1.data.size

    # test shapes
    assert (804414, 47236) == X1.shape
    assert (804414, 103) == Y1.shape
    assert (804414, ) == s1.shape
    assert 103 == len(cat_list)

    # test descr
    assert data1.DESCR.startswith(".. _rcv1_dataset:")

    # test ordering of categories
    first_categories = ["C11", "C12", "C13", "C14", "C15", "C151"]
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories
    some_categories = ("GMIL", "E143", "CCAT")
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert num == Y1[:, j].data.size

    # test shuffling and subset
    data2 = fetch_rcv1_fxt(shuffle=True, subset="train", random_state=77)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset="train")
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
예제 #10
0
def test_load_fake_lfw_people():
    lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
                                  min_faces_per_person=3,
                                  download_if_missing=False)

    # The data is croped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert lfw_people.images.shape == (10, 62, 47)
    assert lfw_people.data.shape == (10, 2914)

    # the target is array of person integer ids
    assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])

    # names of the persons can be found using the target_names array
    expected_classes = ["Abdelatif Smith", "Abhati Kepler", "Onur Lopez"]
    assert_array_equal(lfw_people.target_names, expected_classes)

    # It is possible to ask for the original data without any croping or color
    # conversion and not limit on the number of picture per person
    lfw_people = fetch_lfw_people(
        data_home=SCIKIT_LEARN_DATA,
        resize=None,
        slice_=None,
        color=True,
        download_if_missing=False,
    )
    assert lfw_people.images.shape == (17, 250, 250, 3)
    assert lfw_people.DESCR.startswith(
        ".. _labeled_faces_in_the_wild_dataset:")

    # the ids and class names are the same as previously
    assert_array_equal(lfw_people.target,
                       [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2])
    assert_array_equal(
        lfw_people.target_names,
        [
            "Abdelatif Smith",
            "Abhati Kepler",
            "Camara Alvaro",
            "Chen Dupont",
            "John Lee",
            "Lin Bauman",
            "Onur Lopez",
        ],
    )

    # test return_X_y option
    fetch_func = partial(
        fetch_lfw_people,
        data_home=SCIKIT_LEARN_DATA,
        resize=None,
        slice_=None,
        color=True,
        download_if_missing=False,
    )
    check_return_X_y(lfw_people, fetch_func)
예제 #11
0
def test_load_iris():
    res = load_iris()
    assert res.data.shape == (150, 4)
    assert res.target.size == 150
    assert res.target_names.size == 3
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_iris))
예제 #12
0
def test_load_iris():
    res = load_iris()
    assert_equal(res.data.shape, (150, 4))
    assert_equal(res.target.size, 150)
    assert_equal(res.target_names.size, 3)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_iris))
예제 #13
0
def test_load_boston():
    res = load_boston()
    assert_equal(res.data.shape, (506, 13))
    assert_equal(res.target.size, 506)
    assert_equal(res.feature_names.size, 13)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
예제 #14
0
def test_load_iris():
    res = load_iris()
    assert_equal(res.data.shape, (150, 4))
    assert_equal(res.target.size, 150)
    assert_equal(res.target_names.size, 3)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_iris))
예제 #15
0
def test_load_boston():
    res = load_boston()
    assert res.data.shape == (506, 13)
    assert res.target.size == 506
    assert res.feature_names.size == 13
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
예제 #16
0
def test_load_breast_cancer():
    res = load_breast_cancer()
    assert res.data.shape == (569, 30)
    assert res.target.size == 569
    assert res.target_names.size == 2
    assert res.DESCR
    assert os.path.exists(res.filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_breast_cancer))
예제 #17
0
def test_load_boston():
    res = load_boston()
    assert_equal(res.data.shape, (506, 13))
    assert_equal(res.target.size, 506)
    assert_equal(res.feature_names.size, 13)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_boston))
예제 #18
0
def test_load_breast_cancer():
    res = load_breast_cancer()
    assert_equal(res.data.shape, (569, 30))
    assert_equal(res.target.size, 569)
    assert_equal(res.target_names.size, 2)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_breast_cancer))
예제 #19
0
def test_load_breast_cancer():
    res = load_breast_cancer()
    assert_equal(res.data.shape, (569, 30))
    assert_equal(res.target.size, 569)
    assert_equal(res.target_names.size, 2)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_breast_cancer))
예제 #20
0
def test_fetch():
    try:
        data = fetch()
    except IOError:
        raise SkipTest("California housing dataset can not be loaded.")
    assert ((20640, 8) == data.data.shape)
    assert ((20640, ) == data.target.shape)

    # test return_X_y option
    fetch_func = partial(fetch)
    check_return_X_y(data, fetch_func)
예제 #21
0
def test_load_linnerud():
    res = load_linnerud()
    assert_equal(res.data.shape, (20, 3))
    assert_equal(res.target.shape, (20, 3))
    assert_equal(len(res.target_names), 3)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.data_filename))
    assert_true(os.path.exists(res.target_filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_linnerud))
def test_fetch():
    try:
        data = fetch()
    except IOError:
        raise SkipTest("California housing dataset can not be loaded.")
    assert((20640, 8) == data.data.shape)
    assert((20640, ) == data.target.shape)

    # test return_X_y option
    fetch_func = partial(fetch)
    check_return_X_y(data, fetch_func)
예제 #23
0
def test_load_linnerud():
    res = load_linnerud()
    assert_equal(res.data.shape, (20, 3))
    assert_equal(res.target.shape, (20, 3))
    assert_equal(len(res.target_names), 3)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.data_filename))
    assert_true(os.path.exists(res.target_filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_linnerud))
예제 #24
0
def test_load_linnerud():
    res = load_linnerud()
    assert res.data.shape == (20, 3)
    assert res.target.shape == (20, 3)
    assert len(res.target_names) == 3
    assert res.DESCR
    assert os.path.exists(res.data_filename)
    assert os.path.exists(res.target_filename)

    # test return_X_y option
    check_return_X_y(res, partial(load_linnerud))
예제 #25
0
def test_olivetti_faces(fetch_olivetti_faces_fxt):
    data = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)

    assert isinstance(data, Bunch)
    for expected_keys in ('area_data', 'images', 'target', 'DESCR'):
        assert expected_keys in data.keys()

    assert data.data.shape == (400, 4096)
    assert data.images.shape == (400, 64, 64)
    assert data.target.shape == (400, )
    assert_array_equal(np.unique(np.sort(data.target)), np.arange(40))

    # test the return_X_y option
    check_return_X_y(data, fetch_olivetti_faces_fxt)
예제 #26
0
def test_olivetti_faces(fetch_olivetti_faces_fxt):
    data = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)

    assert isinstance(data, Bunch)
    for expected_keys in ("data", "images", "target", "DESCR"):
        assert expected_keys in data.keys()

    assert data.data.shape == (400, 4096)
    assert data.images.shape == (400, 64, 64)
    assert data.target.shape == (400, )
    assert_array_equal(np.unique(np.sort(data.target)), np.arange(40))
    assert data.DESCR.startswith(".. _olivetti_faces_dataset:")

    # test the return_X_y option
    check_return_X_y(data, fetch_olivetti_faces_fxt)
예제 #27
0
def test_fetch(fetch_covtype_fxt):
    data1 = fetch_covtype_fxt(shuffle=True, random_state=42)
    data2 = fetch_covtype_fxt(shuffle=True, random_state=37)

    X1, X2 = data1['data'], data2['data']
    assert (581012, 54) == X1.shape
    assert X1.shape == X2.shape

    assert X1.sum() == X2.sum()

    y1, y2 = data1['target'], data2['target']
    assert (X1.shape[0],) == y1.shape
    assert (X1.shape[0],) == y2.shape

    # test return_X_y option
    fetch_func = partial(fetch_covtype_fxt)
    check_return_X_y(data1, fetch_func)
예제 #28
0
def test_fetch(fetch_covtype_fxt):
    data1 = fetch_covtype_fxt(shuffle=True, random_state=42)
    data2 = fetch_covtype_fxt(shuffle=True, random_state=37)

    X1, X2 = data1["data"], data2["data"]
    assert (581012, 54) == X1.shape
    assert X1.shape == X2.shape

    assert X1.sum() == X2.sum()

    y1, y2 = data1["target"], data2["target"]
    assert (X1.shape[0],) == y1.shape
    assert (X1.shape[0],) == y2.shape

    descr_prefix = ".. _covtype_dataset:"
    assert data1.DESCR.startswith(descr_prefix)
    assert data2.DESCR.startswith(descr_prefix)

    # test return_X_y option
    fetch_func = partial(fetch_covtype_fxt)
    check_return_X_y(data1, fetch_func)
def test_fetch():
    try:
        data1 = fetch(shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Covertype dataset can not be loaded.")

    data2 = fetch(shuffle=True, random_state=37)

    X1, X2 = data1['data'], data2['data']
    assert (581012, 54) == X1.shape
    assert X1.shape == X2.shape

    assert X1.sum() == X2.sum()

    y1, y2 = data1['target'], data2['target']
    assert (X1.shape[0], ) == y1.shape
    assert (X1.shape[0], ) == y2.shape

    # test return_X_y option
    fetch_func = partial(fetch)
    check_return_X_y(data1, fetch_func)
예제 #30
0
def test_fetch():
    try:
        data1 = fetch(shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Covertype dataset can not be loaded.")

    data2 = fetch(shuffle=True, random_state=37)

    X1, X2 = data1['data'], data2['data']
    assert_equal((581012, 54), X1.shape)
    assert_equal(X1.shape, X2.shape)

    assert_equal(X1.sum(), X2.sum())

    y1, y2 = data1['target'], data2['target']
    assert_equal((X1.shape[0],), y1.shape)
    assert_equal((X1.shape[0],), y2.shape)

    # test return_X_y option
    fetch_func = partial(fetch)
    check_return_X_y(data1, fetch_func)
예제 #31
0
def test_load_fake_lfw_people():
    lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
                                  min_faces_per_person=3,
                                  download_if_missing=False)

    # The data is croped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert_equal(lfw_people.images.shape, (10, 62, 47))
    assert_equal(lfw_people.data.shape, (10, 2914))

    # the target is array of person integer ids
    assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])

    # names of the persons can be found using the target_names array
    expected_classes = ['Abdelatif Smith', 'Abhati Kepler', 'Onur Lopez']
    assert_array_equal(lfw_people.target_names, expected_classes)

    # It is possible to ask for the original data without any croping or color
    # conversion and not limit on the number of picture per person
    lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA, resize=None,
                                  slice_=None, color=True,
                                  download_if_missing=False)
    assert_equal(lfw_people.images.shape, (17, 250, 250, 3))

    # the ids and class names are the same as previously
    assert_array_equal(lfw_people.target,
                       [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2])
    assert_array_equal(lfw_people.target_names,
                       ['Abdelatif Smith', 'Abhati Kepler', 'Camara Alvaro',
                        'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez'])

    # test return_X_y option
    fetch_func = partial(fetch_lfw_people, data_home=SCIKIT_LEARN_DATA,
                         resize=None,
                         slice_=None, color=True,
                         download_if_missing=False)
    check_return_X_y(lfw_people, fetch_func)
예제 #32
0
def test_fetch_rcv1():
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")

    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert 60915113 == X1.data.size
    assert 2606875 == Y1.data.size

    # test shapes
    assert (804414, 47236) == X1.shape
    assert (804414, 103) == Y1.shape
    assert (804414, ) == s1.shape
    assert 103 == len(cat_list)

    # test ordering of categories
    first_categories = ['C11', 'C12', 'C13', 'C14', 'C15', 'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert num == Y1[:, j].data.size

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True,
                       subset='train',
                       random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1,
                         shuffle=False,
                         subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
예제 #33
0
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    # fetches a dataset in three various ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, string_types)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target, data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
예제 #34
0
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    # fetches a dataset in three various ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, string_types)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target, data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
예제 #35
0
def test_fetch_kddcup99_return_X_y(fetch_kddcup99_fxt):
    fetch_func = partial(fetch_kddcup99_fxt, subset='smtp')
    data = fetch_func()
    check_return_X_y(data, fetch_func)
예제 #36
-1
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)