def test_fetch_openml_australian_pandas_error_sparse(monkeypatch):
    """Fetching data_id 292 with as_frame=True must raise a ValueError."""
    data_id = 292
    expected_message = 'Cannot return dataframe with sparse data'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    with pytest.raises(ValueError, match=expected_message):
        fetch_openml(data_id=data_id, as_frame=True, cache=False)
def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch):
    """A tiny working_memory setting must produce a UserWarning."""
    pytest.importorskip('pandas')

    data_id = 1119
    expected_warning = 'Could not adhere to working_memory config.'
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    # Both context managers are entered in the same order as nested ``with``
    # blocks would be: the warning recorder first, then the config override.
    with pytest.warns(UserWarning, match=expected_warning), \
            config_context(working_memory=1e-6):
        fetch_openml(data_id=data_id, as_frame=True, cache=False)
def test_fetch_openml_adultcensus_pandas(monkeypatch):
    """Check types, shapes and dtype counts of data_id 1119 as a frame."""
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    # Check because of the numeric row attribute (issue #12329)
    data_id = 1119
    target_column = 'class'
    data_shape = (10, 14)
    target_shape = (10, )
    frame_shape = (10, 15)
    expected_data_categories = 8
    expected_data_floats = 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape

    # Count the feature columns per dtype family.
    n_categories = sum(1 for dtype in data.dtypes
                       if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in data.dtypes if dtype.kind == 'f')
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch):
    """Check the (X, y) pair returned for data_id 1119 with as_frame=True."""
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 1119
    target_column = 'class'
    data_shape = (10, 14)
    target_shape = (10, )
    expected_data_categories = 8
    expected_data_floats = 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                        return_X_y=True)

    assert isinstance(X, pd.DataFrame)
    assert X.shape == data_shape

    # Count the feature columns per dtype family.
    n_categories = sum(1 for dtype in X.dtypes
                       if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in X.dtypes if dtype.kind == 'f')
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(y, pd.Series)
    assert y.shape == target_shape
    assert y.name == target_column
def test_fetch_openml_anneal_pandas(monkeypatch):
    """Check types, shapes and dtype counts of data_id 2 as a frame."""
    # classification dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 2
    target_column = 'class'
    data_shape = (11, 38)
    target_shape = (11,)
    frame_shape = (11, 39)
    expected_data_categories = 32
    expected_data_floats = 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True,
                         target_column=target_column, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape

    # Count the feature columns per dtype family.
    n_categories = sum(1 for dtype in data.dtypes
                       if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in data.dtypes if dtype.kind == 'f')
    assert expected_data_categories == n_categories
    assert expected_data_floats == n_floats

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert isinstance(target.dtype, CategoricalDtype)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
def _test_features_list(data_id):
    """Compare the fetched, encoded data with the raw ARFF data per column.

    Raises ValueError when the dataset description reports the
    ``sparse_arff`` format.
    """
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        column = data_bunch.data[:, col_idx]
        col_name = data_bunch.feature_names[col_idx]
        if col_name not in data_bunch.categories:
            # non-nominal attribute
            return column

        # XXX: This would be faster with np.take, although it does not
        # handle missing values fast (also not with mode='wrap')
        cat = data_bunch.categories[col_name]
        decoded = [None if is_scalar_nan(idx) else cat[int(idx)]
                   for idx in column]
        return np.array(decoded, dtype='O')

    data_bunch = fetch_openml(data_id=data_id, cache=False,
                              target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'], sparse,
                                    None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    # XXX: Test per column, as this makes it easier to avoid problems with
    # missing values
    for col_idx, _ in enumerate(data_bunch.feature_names):
        np.testing.assert_array_equal(data_downloaded[:, col_idx],
                                      decode_column(data_bunch, col_idx))
def test_fetch_openml_iris_pandas_equal_to_no_frame(monkeypatch):
    """Data and target must match between as_frame=True and as_frame=False."""
    # as_frame = True returns the same underlying data as as_frame = False
    pytest.importorskip('pandas')
    data_id = 61

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    with_frame = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    frame_data, frame_target = with_frame.data, with_frame.target

    without_frame = fetch_openml(data_id=data_id, as_frame=False, cache=False)
    norm_data, norm_target = without_frame.data, without_frame.target

    assert_allclose(norm_data, frame_data)
    assert_array_equal(norm_target, frame_target)
def test_fetch_openml_titanic_pandas(monkeypatch):
    """Check columns, shapes and dtypes of data_id 40945 as a frame."""
    # dataset with strings
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40945
    data_shape = (1309, 13)
    target_shape = (1309, )
    frame_shape = (1309, 14)
    target_name = 'survived'

    name_to_dtype = {
        'pclass': np.float64,
        'name': object,
        'sex': CategoricalDtype(['female', 'male']),
        'age': np.float64,
        'sibsp': np.float64,
        'parch': np.float64,
        'ticket': object,
        'fare': np.float64,
        'cabin': object,
        'embarked': CategoricalDtype(['C', 'Q', 'S']),
        'boat': object,
        'body': np.float64,
        'home.dest': object,
        'survived': CategoricalDtype(['0', '1'])
    }

    frame_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp',
                     'parch', 'ticket', 'fare', 'cabin', 'embarked',
                     'boat', 'body', 'home.dest']
    frame_dtypes = [name_to_dtype[col] for col in frame_columns]
    # The feature columns are the frame columns, in order, minus the target.
    feature_names = [col for col in frame_columns if col != target_name]

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.columns == feature_names)
    assert bunch.target_names == [target_name]

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.name == target_name
    assert target.dtype == name_to_dtype[target_name]

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == frame_dtypes)
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    """With target_column=None all columns are data and target is None."""
    data_id = 61
    expected_shape = (150, 5)  # (observations, features)

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    bunch = fetch_openml(data_id=data_id, target_column=None, cache=False)

    assert bunch.data.shape == expected_shape
    assert bunch.target is None
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    """Check that a repeated fetch with cache=True does not call urlopen.

    The dataset is fetched once through the monkey-patched web functions.
    ``urlopen`` is then replaced by a stub that always raises, so the second
    fetch can only succeed without going through ``urlopen``. Both results
    must be equal.
    """
    def _mock_urlopen_raise(request):
        # Adjacent string literals are concatenated verbatim, so each piece
        # needs its own trailing space.
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)

    # First fetch: goes through the patched web functions.
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    # From here on any call to urlopen fails the test.
    monkeypatch.setattr(sklearn_lib.datasets._openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)

    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
def test_fetch_openml_cpu_pandas(monkeypatch):
    """Check columns, shapes and dtypes of data_id 561 as a frame."""
    # regression dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 561
    data_shape = (209, 7)
    target_shape = (209, )
    frame_shape = (209, 8)
    target_name = 'class'
    feature_names = ['vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH',
                     'CHMIN', 'CHMAX']

    vendor_categories = [
        'adviser', 'amdahl', 'apollo', 'basf', 'bti', 'burroughs',
        'c.r.d', 'cdc', 'cambex', 'dec', 'dg', 'formation',
        'four-phase', 'gould', 'hp', 'harris', 'honeywell', 'ibm',
        'ipl', 'magnuson', 'microdata', 'nas', 'ncr', 'nixdorf',
        'perkin-elmer', 'prime', 'siemens', 'sperry', 'sratus', 'wang',
    ]
    cat_dtype = CategoricalDtype(vendor_categories)
    # One categorical column followed by six float columns.
    data_dtypes = [cat_dtype] + [np.float64] * 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.dtypes == data_dtypes)
    assert np.all(data.columns == feature_names)
    assert np.all(bunch.feature_names == feature_names)
    assert bunch.target_names == [target_name]

    assert isinstance(target, pd.Series)
    assert target.shape == target_shape
    assert target.dtype == np.float64
    assert target.name == target_name

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
def test_fetch_openml_iris_multitarget_pandas(monkeypatch):
    """Check data_id 61 fetched as a frame with two target columns."""
    # classification dataset with numeric only columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 61
    data_shape = (150, 3)
    target_shape = (150, 2)
    frame_shape = (150, 5)
    target_column = ['petalwidth', 'petallength']

    cat_dtype = CategoricalDtype(['Iris-setosa', 'Iris-versicolor',
                                  'Iris-virginica'])
    data_names = ['sepallength', 'sepalwidth', 'class']
    data_dtypes = [np.float64, np.float64, cat_dtype]
    target_names = ['petalwidth', 'petallength']
    target_dtypes = [np.float64, np.float64]
    frame_dtypes = [np.float64] * 4 + [cat_dtype]

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                         target_column=target_column)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert np.all(data.dtypes == data_dtypes)
    assert data.shape == data_shape
    assert np.all(data.columns == data_names)
    assert np.all(bunch.feature_names == data_names)
    assert bunch.target_names == target_names

    # With several target columns the target is a DataFrame.
    assert isinstance(target, pd.DataFrame)
    assert np.all(target.dtypes == target_dtypes)
    assert target.shape == target_shape
    assert np.all(target.columns == target_names)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == frame_dtypes)
def test_fetch_openml_miceprotein_pandas(monkeypatch):
    """Check types, shapes and dtype counts of data_id 40966 as a frame."""
    # JvR: very important check, as this dataset defined several row ids
    # and ignore attributes. Note that data_features json has 82 attributes,
    # and row id (1), ignore attributes (3) have been removed.
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40966
    target_column = 'class'
    data_shape = (7, 77)
    target_shape = (7, )
    frame_shape = (7, 78)
    frame_n_categories = 1
    frame_n_floats = 77

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape
    assert np.all(data.dtypes == np.float64)

    assert isinstance(target, pd.Series)
    assert isinstance(target.dtype, CategoricalDtype)
    assert target.shape == target_shape
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape

    # Count the frame columns per dtype family.
    n_categories = sum(1 for dtype in frame.dtypes
                       if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in frame.dtypes if dtype.kind == 'f')
    assert frame_n_categories == n_categories
    assert frame_n_floats == n_floats
def test_fetch_openml_emotions_pandas(monkeypatch):
    """Check data_id 40589 fetched as a frame with six target columns."""
    # classification dataset with multiple targets (natively)
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40589
    target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm',
                     'quiet.still', 'sad.lonely', 'angry.aggresive']
    data_shape = (13, 72)
    target_shape = (13, 6)
    frame_shape = (13, 78)
    expected_frame_categories = 6
    expected_frame_floats = 72

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                         target_column=target_column)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == data_shape

    assert isinstance(target, pd.DataFrame)
    assert target.shape == target_shape
    assert np.all(target.columns == target_column)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape

    # Count the frame columns per dtype family.
    n_categories = sum(1 for dtype in frame.dtypes
                       if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in frame.dtypes if dtype.kind == 'f')
    assert expected_frame_categories == n_categories
    assert expected_frame_floats == n_floats
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    """Fetch one dataset several ways and validate the result.

    The dataset is fetched by name and version, by name only, and by id;
    the by-id bunch is checked against the expected values and returned.
    This function can be mocked by invoking
    _monkey_patch_webbased_functions before calling it.

    NOTE(review): ``expected_data_dtype`` is accepted but never read here;
    the data dtype is always asserted to be ``np.float64``.
    """
    # fetch by name and version; this must resolve to the requested id
    by_name_and_version = fetch_openml(name=data_name, version=data_version,
                                       cache=False)
    assert int(by_name_and_version.details['id']) == data_id

    # Please note that cache=False is crucial, as the monkey patched files are
    # not consistent with reality
    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations,
                                     expected_features)

    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
        assert data_by_id.target_names == [target_column]
    elif isinstance(target_column, list):
        # multi target, so target is array
        expected_target_shape = (expected_observations, len(target_column))
        assert data_by_id.target.shape == expected_target_shape
        assert data_by_id.target_names == target_column

    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    assert all(isinstance(feature, str)
               for feature in data_by_id.feature_names)

    # TODO: pass in a list of expected nominal features
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        codes = np.unique(data_by_id.data[:, feature_idx])
        finite_codes = codes[np.isfinite(codes)]
        # every finite code must be an index into the category list
        assert set(finite_codes) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        for attr in ('data', 'target'):
            fetched = getattr(data_by_id, attr)
            default = getattr(data_by_id_default, attr)
            if fetched.dtype == np.float64:
                np.testing.assert_allclose(fetched, default)
            else:
                assert np.array_equal(fetched, default)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        n_missing = np.count_nonzero(np.isnan(data_by_id.data))
        assert n_missing == expected_missing

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id