def test_20news_vectorized():
    """Smoke-test the vectorized 20 newsgroups loader for every subset.

    Checks that each subset comes back as a CSR matrix of float64 with the
    documented number of samples/features, and that return_X_y works.
    """
    try:
        datasets.fetch_20newsgroups(subset='all', download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    n_train, n_test, n_features = 11314, 7532, 130107

    # subset = train
    train = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert sp.isspmatrix_csr(train.data)
    assert train.data.shape == (n_train, n_features)
    assert train.target.shape[0] == n_train
    assert train.data.dtype == np.float64

    # subset = test
    test = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert sp.isspmatrix_csr(test.data)
    assert test.data.shape == (n_test, n_features)
    assert test.target.shape[0] == n_test
    assert test.data.dtype == np.float64

    # test return_X_y option (against the test subset just fetched)
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized,
                         subset='test')
    check_return_X_y(test, fetch_func)

    # subset = all
    combined = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert sp.isspmatrix_csr(combined.data)
    assert combined.data.shape == (n_train + n_test, n_features)
    assert combined.target.shape[0] == n_train + n_test
    assert combined.data.dtype == np.float64
def test_percent10():
    """Check shapes of the kddcup99 10-percent dataset and its named subsets."""
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert data.data.shape == (494021, 41)
    assert data.target.shape == (494021, )

    # shuffling must not alter the shapes
    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert data_shuffled.data.shape == data.data.shape
    assert data_shuffled.target.shape == data.target.shape

    # each named subset has a fixed, documented shape
    expected_shapes = {
        'SA': ((100655, 41), (100655, )),
        'SF': ((73237, 4), (73237, )),
        'http': ((58725, 3), (58725, )),
        'smtp': ((9571, 3), (9571, )),
    }
    for subset, (data_shape, target_shape) in expected_shapes.items():
        data = fetch_kddcup99(subset)
        assert data.data.shape == data_shape
        assert data.target.shape == target_shape

    # test return_X_y option (data still holds the 'smtp' subset here)
    check_return_X_y(data, partial(fetch_kddcup99, 'smtp'))
def test_load_digits():
    """The digits dataset has 1797 flattened 8x8 images over 10 classes."""
    bunch = load_digits()
    assert bunch.data.shape == (1797, 64)
    assert numpy.unique(bunch.target).size == 10

    # test return_X_y option
    check_return_X_y(bunch, partial(load_digits))
def test_load_wine():
    """Wine dataset: 178 samples, 13 features, 3 classes, non-empty DESCR."""
    bunch = load_wine()
    assert bunch.data.shape == (178, 13)
    assert bunch.target.size == 178
    assert bunch.target_names.size == 3
    assert bunch.DESCR

    # test return_X_y option
    check_return_X_y(bunch, partial(load_wine))
def test_load_diabetes():
    """Diabetes dataset: 442 samples, 10 named features, non-empty DESCR."""
    res = load_diabetes()
    assert res.data.shape == (442, 10)
    # BUG FIX: the original read ``assert res.target.size, 442`` which only
    # asserts ``res.target.size`` is truthy (442 became the assert message),
    # so the expected value was never actually compared.
    assert res.target.size == 442
    assert len(res.feature_names) == 10
    assert res.DESCR

    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
def test_load_boston():
    """Boston dataset: 506 samples, 13 features, DESCR and data file present."""
    bunch = load_boston()
    assert bunch.data.shape == (506, 13)
    assert bunch.target.size == 506
    assert bunch.feature_names.size == 13
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # test return_X_y option
    check_return_X_y(bunch, partial(load_boston))
def test_load_breast_cancer():
    """Breast cancer dataset: 569 samples, 30 features, 2 classes."""
    bunch = load_breast_cancer()
    assert bunch.data.shape == (569, 30)
    assert bunch.target.size == 569
    assert bunch.target_names.size == 2
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # test return_X_y option
    check_return_X_y(bunch, partial(load_breast_cancer))
def test_load_iris():
    """Iris dataset: 150 samples, 4 features, 3 classes."""
    bunch = load_iris()
    assert bunch.data.shape == (150, 4)
    assert bunch.target.size == 150
    assert bunch.target_names.size == 3
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # test return_X_y option
    check_return_X_y(bunch, partial(load_iris))
def test_load_linnerud():
    """Linnerud dataset: 20 samples with a 3-column multi-output target."""
    bunch = load_linnerud()
    assert bunch.data.shape == (20, 3)
    assert bunch.target.shape == (20, 3)
    assert len(bunch.target_names) == 3
    assert bunch.DESCR
    # both the data and the target CSV files must exist on disk
    assert os.path.exists(bunch.data_filename)
    assert os.path.exists(bunch.target_filename)

    # test return_X_y option
    check_return_X_y(bunch, partial(load_linnerud))
def test_fetch():
    """California housing: 20640 samples / 8 features; return_X_y works."""
    try:
        data = fetch()
    except IOError:
        raise SkipTest("California housing dataset can not be loaded.")

    assert data.data.shape == (20640, 8)
    assert data.target.shape == (20640, )

    # test return_X_y option
    check_return_X_y(data, partial(fetch))
def test_olivetti_faces():
    """Olivetti faces: Bunch keys, shapes, label set and return_X_y option."""
    data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0)

    assert isinstance(data, Bunch)
    for key in ('data', 'images', 'target', 'DESCR'):
        assert key in data.keys()

    assert data.data.shape == (400, 4096)
    assert data.images.shape == (400, 64, 64)
    assert data.target.shape == (400, )
    # 40 distinct subjects, labelled 0..39
    assert_array_equal(np.unique(np.sort(data.target)), np.arange(40))

    # test the return_X_y option
    check_return_X_y(data, datasets.fetch_olivetti_faces)
def test_load_fake_lfw_people():
    """Exercise the (mocked) LFW people loader in cropped and raw modes."""
    lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
                                  min_faces_per_person=3,
                                  download_if_missing=False)

    # The data is croped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert lfw_people.images.shape == (10, 62, 47)
    assert lfw_people.data.shape == (10, 2914)

    # the target is array of person integer ids
    assert_array_equal(lfw_people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])

    # names of the persons can be found using the target_names array
    assert_array_equal(lfw_people.target_names,
                       ['Abdelatif Smith', 'Abhati Kepler', 'Onur Lopez'])

    # It is possible to ask for the original data without any croping or
    # color conversion and not limit on the number of picture per person
    lfw_people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
                                  resize=None, slice_=None, color=True,
                                  download_if_missing=False)
    assert lfw_people.images.shape == (17, 250, 250, 3)

    # the ids and class names are the same as previously
    assert_array_equal(lfw_people.target,
                       [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2])
    assert_array_equal(lfw_people.target_names,
                       ['Abdelatif Smith', 'Abhati Kepler', 'Camara Alvaro',
                        'Chen Dupont', 'John Lee', 'Lin Bauman',
                        'Onur Lopez'])

    # test return_X_y option
    fetch_func = partial(fetch_lfw_people, data_home=SCIKIT_LEARN_DATA,
                         resize=None, slice_=None, color=True,
                         download_if_missing=False)
    check_return_X_y(lfw_people, fetch_func)
def test_fetch():
    """Covertype: two shuffles agree in shape and content; return_X_y works."""
    try:
        data1 = fetch(shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Covertype dataset can not be loaded.")
    data2 = fetch(shuffle=True, random_state=37)

    X1, X2 = data1['data'], data2['data']
    assert X1.shape == (581012, 54)
    assert X2.shape == X1.shape
    # shuffling only permutes rows, so the element totals must match
    assert X1.sum() == X2.sum()

    y1, y2 = data1['target'], data2['target']
    assert y1.shape == (X1.shape[0], )
    assert y2.shape == (X1.shape[0], )

    # test return_X_y option
    check_return_X_y(data1, partial(fetch))
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    """Fetch one OpenML dataset three ways and validate the result.

    Fetches by (name, version), by name only, and by data_id, then checks
    shapes, dtypes, feature/target names, nominal categories, sparsity and
    missing-value count against the expected values. Note that this function
    can be mocked (by invoking _monkey_patch_webbased_functions before
    invoking this function). Returns the Bunch fetched by data_id.
    """
    # fetch by name and version. Please note that cache=False is crucial,
    # as the monkey patched files are not consistent with reality
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    # fetch by name only; without specifying the version, there is no
    # guarantee that the data id will be the same
    fetch_openml(name=data_name, cache=False)

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations,
                                     expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
        assert data_by_id.target_names == [target_column]
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
        assert data_by_id.target_names == target_column
    # BUG FIX: the data dtype was hard-coded to np.float64, silently
    # ignoring the expected_data_dtype parameter callers pass in.
    assert data_by_id.data.dtype == expected_data_dtype
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, str)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        # nominal features are encoded as indices into their category list
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target,
                                  data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(data_by_id.data)) ==
                expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
def test_fetch_rcv1():
    """RCV1: sparsity, shapes, category ordering, shuffling and return_X_y."""
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
        # BUG FIX: other IOErrors used to be silently swallowed, leaving
        # ``data1`` unbound and crashing below with a confusing NameError.
        raise

    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert 60915113 == X1.data.size
    assert 2606875 == Y1.data.size

    # test shapes
    assert (804414, 47236) == X1.shape
    assert (804414, 103) == Y1.shape
    assert (804414, ) == s1.shape
    assert 103 == len(cat_list)

    # test ordering of categories
    first_categories = ['C11', 'C12', 'C13', 'C14', 'C15', 'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert num == Y1[:, j].data.size

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)