def test_fetch_openml_australian_pandas_error_sparse(monkeypatch):
    """Asking for a dataframe of dataset 292 must raise a ValueError."""
    data_id = 292
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    expected_message = 'Cannot return dataframe with sparse data'
    fetch_kwargs = dict(data_id=data_id, as_frame=True, cache=False)
    with pytest.raises(ValueError, match=expected_message):
        fetch_openml(**fetch_kwargs)
def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch):
    """A tiny ``working_memory`` setting must produce a UserWarning."""
    pytest.importorskip('pandas')

    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    expected_warning = 'Could not adhere to working_memory config.'
    # pytest.warns stays the outer manager so the warning emitted while the
    # restricted config is active is the one being captured.
    with pytest.warns(UserWarning, match=expected_warning), \
            config_context(working_memory=1e-6):
        fetch_openml(data_id=data_id, as_frame=True, cache=False)
def test_fetch_openml_adultcensus_pandas(monkeypatch):
    """Fetch dataset 1119 as a frame; check container types, shapes, dtypes."""
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    # Check because of the numeric row attribute (issue #12329)
    data_id = 1119
    n_rows = 10
    expected_shapes = {
        'data': (n_rows, 14),
        'target': (n_rows, ),
        'frame': (n_rows, 15),
    }
    expected_data_categories = 8
    expected_data_floats = 6
    target_column = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    # Feature matrix
    assert isinstance(data, pd.DataFrame)
    assert data.shape == expected_shapes['data']
    n_categories = sum(
        1 for dtype in data.dtypes if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in data.dtypes if dtype.kind == 'f')
    assert n_categories == expected_data_categories
    assert n_floats == expected_data_floats

    # Target column
    assert isinstance(target, pd.Series)
    assert target.shape == expected_shapes['target']
    assert target.name == target_column

    # Full frame (features + target)
    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == expected_shapes['frame']
def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch):
    """Same checks as the bunch variant, but using ``return_X_y=True``."""
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 1119
    n_rows, n_features = 10, 14
    expected_data_categories = 8
    expected_data_floats = 6
    target_column = 'class'

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                        return_X_y=True)

    assert isinstance(X, pd.DataFrame)
    assert X.shape == (n_rows, n_features)

    # Count the categorical and floating point feature columns.
    n_categories = sum(
        1 for dtype in X.dtypes if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in X.dtypes if dtype.kind == 'f')
    assert n_categories == expected_data_categories
    assert n_floats == expected_data_floats

    assert isinstance(y, pd.Series)
    assert y.shape == (n_rows, )
    assert y.name == target_column
def test_fetch_openml_iris_pandas_equal_to_no_frame(monkeypatch):
    """``as_frame=True`` and ``as_frame=False`` expose the same values."""
    pytest.importorskip('pandas')
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    def _fetch(as_frame):
        return fetch_openml(data_id=data_id, as_frame=as_frame, cache=False)

    # Frame-based fetch first, then the plain array fetch.
    frame_bunch = _fetch(True)
    norm_bunch = _fetch(False)

    assert_allclose(norm_bunch.data, frame_bunch.data)
    assert_array_equal(norm_bunch.target, frame_bunch.target)
def test_fetch_openml_titanic_pandas(monkeypatch):
    """Fetch dataset 40945 (contains string columns) as a frame."""
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40945
    n_rows = 1309
    target_name = 'survived'

    # Expected dtype of every column, grouped by kind.
    name_to_dtype = {
        name: np.float64
        for name in ('pclass', 'age', 'sibsp', 'parch', 'fare', 'body')
    }
    name_to_dtype.update({
        name: object
        for name in ('name', 'ticket', 'cabin', 'boat', 'home.dest')
    })
    name_to_dtype.update({
        'sex': CategoricalDtype(['female', 'male']),
        'embarked': CategoricalDtype(['C', 'Q', 'S']),
        'survived': CategoricalDtype(['0', '1']),
    })

    frame_columns = [
        'pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch',
        'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'
    ]
    frame_dtypes = [name_to_dtype[col] for col in frame_columns]
    # The feature columns are the frame columns minus the target.
    feature_names = [col for col in frame_columns if col != target_name]

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == (n_rows, 13)
    assert np.all(data.columns == feature_names)

    assert isinstance(target, pd.Series)
    assert target.shape == (n_rows, )
    assert target.name == target_name
    assert target.dtype == name_to_dtype[target_name]

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (n_rows, 14)
    assert np.all(frame.dtypes == frame_dtypes)
def load_mnist(n_samples=None, class_0='0', class_1='8'):
    """Return a shuffled two-class subset of MNIST.

    The dataset is fetched from http://openml.org/d/554, restricted to the
    samples whose target equals ``class_0`` or ``class_1``, shuffled with a
    fixed seed and, when ``n_samples`` is given, cut to its first
    ``n_samples`` rows.
    """
    mnist = fetch_openml('mnist_784', version=1)
    labels = mnist.target

    # Boolean selector for the two classes used for binary classification.
    is_class_0 = labels == class_0
    is_class_1 = labels == class_1
    keep = np.logical_or(is_class_0, is_class_1)

    X, y = shuffle(mnist.data[keep], labels[keep], random_state=42)
    if n_samples is None:
        return X, y
    return X[:n_samples], y[:n_samples]
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    """With ``target_column=None`` every column is data and target is None."""
    data_id = 61
    expected_shape = (150, 5)  # (observations, features)

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    bunch = fetch_openml(data_id=data_id, target_column=None, cache=False)

    assert bunch.data.shape == expected_shape
    assert bunch.target is None
def get_data(dataset_name):
    """Load or generate the data matrix for the named benchmark dataset.

    Parameters
    ----------
    dataset_name : str
        One of the explicitly handled names ('lfw_people', '20newsgroups',
        'olivetti_faces', 'rcv1', 'CIFAR', 'SVHN', 'low rank matrix',
        'uncorrelated matrix', 'big sparse matrix'). Any other value is
        passed to ``fetch_openml`` as the dataset name.

    Returns
    -------
    X : array-like or sparse matrix, or None
        The data matrix. ``None`` is returned when the 'CIFAR' or 'SVHN'
        folder check signals that the dataset should be skipped.
    """
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        # NOTE(review): this branch compares against "skip" while the SVHN
        # branch compares against 0 -- confirm which sentinel
        # handle_missing_dataset actually returns.
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        # Flatten every image (indexed by the last axis) into one row.
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3)
              for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        # The builtin int replaces np.int, which was only an alias for it
        # and has been removed from recent NumPy releases.
        X = make_low_rank_matrix(n_samples=500, n_features=int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        # Draw sparsity / 10 values and repeat each of them 10 times so
        # that `data` has exactly `sparsity` entries.
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name).data
    return X
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    """A second fetch with ``cache=True`` must be served without urlopen.

    The dataset is fetched once into a temporary data home, then
    ``urlopen`` is replaced by a function that always raises, and the same
    dataset is fetched again. Both results must be equal.
    """
    def _mock_urlopen_raise(request):
        # Adjacent literals are concatenated verbatim, so each piece needs
        # its own trailing space.
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    # First fetch populates the cache directory.
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    # From here on any call to urlopen fails the test.
    monkeypatch.setattr(mrex.datasets.openml, 'urlopen', _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Fetch 'mnist_784', optionally shuffle it, and scale features by 1/255.

    The data is validated/converted with ``check_array`` using the given
    ``dtype`` and memory ``order``. When ``shuffle`` is true, X and y are
    shuffled together using ``seed`` as the random state.

    Returns the tuple ``(X, y)``.
    """
    print("Loading dataset...")
    mnist = fetch_openml('mnist_784')
    X = check_array(mnist['data'], dtype=dtype, order=order)
    y = mnist["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features (in place)
    X /= 255
    return X, y
def test_fetch_openml_iris_multitarget_pandas(monkeypatch):
    """Fetch dataset 61 as a frame with two columns selected as targets."""
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 61
    n_rows = 150
    target_column = ['petalwidth', 'petallength']
    target_names = ['petalwidth', 'petallength']
    data_names = ['sepallength', 'sepalwidth', 'class']

    cat_dtype = CategoricalDtype(
        ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
    data_dtypes = [np.float64, np.float64, cat_dtype]
    target_dtypes = [np.float64] * 2
    frame_dtypes = [np.float64] * 4 + [cat_dtype]

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                         target_column=target_column)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    # Remaining feature columns
    assert isinstance(data, pd.DataFrame)
    assert np.all(data.dtypes == data_dtypes)
    assert data.shape == (n_rows, 3)
    assert np.all(data.columns == data_names)
    assert np.all(bunch.feature_names == data_names)

    # Two targets, so the target is a DataFrame
    assert isinstance(target, pd.DataFrame)
    assert np.all(target.dtypes == target_dtypes)
    assert target.shape == (n_rows, 2)
    assert np.all(target.columns == target_names)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (n_rows, 5)
    assert np.all(frame.dtypes == frame_dtypes)
def test_fetch_openml_emotions_pandas(monkeypatch):
    """Fetch dataset 40589 as a frame with six target columns."""
    # classification dataset with multiple targets (natively)
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40589
    target_column = [
        'amazed.suprised', 'happy.pleased', 'relaxing.calm', 'quiet.still',
        'sad.lonely', 'angry.aggresive'
    ]
    n_rows = 13
    expected_frame_categories = 6
    expected_frame_floats = 72

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False,
                         target_column=target_column)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == (n_rows, 72)

    assert isinstance(target, pd.DataFrame)
    assert target.shape == (n_rows, 6)
    assert np.all(target.columns == target_column)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (n_rows, 78)

    # Count categorical and floating point columns of the full frame.
    n_categories = sum(
        1 for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in frame.dtypes if dtype.kind == 'f')
    assert n_categories == expected_frame_categories
    assert n_floats == expected_frame_floats
def test_fetch_openml_cpu_pandas(monkeypatch):
    """Fetch dataset 561 (one categorical and six float features) as a frame."""
    # regression dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 561
    n_rows = 209
    target_name = 'class'
    feature_names = [
        'vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX'
    ]
    vendor_categories = [
        'adviser', 'amdahl', 'apollo', 'basf', 'bti', 'burroughs', 'c.r.d',
        'cdc', 'cambex', 'dec', 'dg', 'formation', 'four-phase', 'gould',
        'hp', 'harris', 'honeywell', 'ibm', 'ipl', 'magnuson', 'microdata',
        'nas', 'ncr', 'nixdorf', 'perkin-elmer', 'prime', 'siemens',
        'sperry', 'sratus', 'wang'
    ]
    cat_dtype = CategoricalDtype(vendor_categories)
    # First column is the categorical one; the remaining six are floats.
    data_dtypes = [cat_dtype] + [np.float64] * 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == (n_rows, 7)
    assert np.all(data.dtypes == data_dtypes)
    assert np.all(data.columns == feature_names)
    assert np.all(bunch.feature_names == feature_names)

    assert isinstance(target, pd.Series)
    assert target.shape == (n_rows, )
    assert target.dtype == np.float64
    assert target.name == target_name

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (n_rows, 8)
def load_data(dtype=np.float32, order='F'):
    """Fetch 'mnist_784', scale it by 1/255 and split it into train/test.

    The data is validated/converted with ``check_array`` using the given
    ``dtype`` and memory ``order``. The first 60000 samples form the
    training set and the remaining ones the test set.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    # Load dataset
    print("Loading dataset...")
    mnist = fetch_openml('mnist_784')
    features = check_array(mnist['data'], dtype=dtype, order=order)
    labels = mnist["target"]

    # Normalize features
    features = features / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train, X_test = features[:n_train], features[n_train:]
    y_train, y_test = labels[:n_train], labels[n_train:]
    return X_train, X_test, y_train, y_test
def test_fetch_openml_miceprotein_pandas(monkeypatch):
    """Fetch dataset 40966 as a frame; all features float, target categorical."""
    # JvR: very important check, as this dataset defined several row ids
    # and ignore attributes. Note that data_features json has 82 attributes,
    # and row id (1), ignore attributes (3) have been removed.
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 40966
    n_rows = 7
    target_column = 'class'
    frame_n_categories = 1
    frame_n_floats = 77

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == (n_rows, 77)
    assert np.all(data.dtypes == np.float64)

    assert isinstance(target, pd.Series)
    assert isinstance(target.dtype, CategoricalDtype)
    assert target.shape == (n_rows, )
    assert target.name == target_column

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (n_rows, 78)

    # Count categorical and floating point columns of the full frame.
    n_categories = sum(
        1 for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in frame.dtypes if dtype.kind == 'f')
    assert n_categories == frame_n_categories
    assert n_floats == frame_n_floats
def test_fetch_openml_anneal_pandas(monkeypatch):
    """Fetch dataset 2 as a frame with an explicit 'class' target column."""
    # classification dataset with numeric and categorical columns
    pd = pytest.importorskip('pandas')
    CategoricalDtype = pd.api.types.CategoricalDtype

    data_id = 2
    target_column = 'class'
    n_rows = 11
    expected_data_categories = 32
    expected_data_floats = 6

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch = fetch_openml(data_id=data_id, as_frame=True,
                         target_column=target_column, cache=False)
    data, target, frame = bunch.data, bunch.target, bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert data.shape == (n_rows, 38)

    # Count categorical and floating point feature columns.
    n_categories = sum(
        1 for dtype in data.dtypes if isinstance(dtype, CategoricalDtype))
    n_floats = sum(1 for dtype in data.dtypes if dtype.kind == 'f')
    assert n_categories == expected_data_categories
    assert n_floats == expected_data_floats

    assert isinstance(target, pd.Series)
    assert target.shape == (n_rows, )
    assert isinstance(target.dtype, CategoricalDtype)

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (n_rows, 39)
def _test_features_list(data_id):
    """Compare every fetched column, decoded, with the raw ARFF download.

    Raises ValueError when the dataset description reports the
    'sparse_arff' format.
    """
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        column = data_bunch.data[:, col_idx]
        col_name = data_bunch.feature_names[col_idx]
        if col_name not in data_bunch.categories:
            # non-nominal attribute
            return column
        # XXX: This would be faster with np.take, although it does not
        # handle missing values fast (also not with mode='wrap')
        cat = data_bunch.categories[col_name]
        decoded = [
            None if is_scalar_nan(idx) else cat[int(idx)] for idx in column
        ]
        return np.array(decoded, dtype='O')

    data_bunch = fetch_openml(data_id=data_id, cache=False,
                              target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'], sparse,
                                    None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    # XXX: Test per column, as this makes it easier to avoid problems with
    # missing values
    for col_idx, _ in enumerate(data_bunch.feature_names):
        np.testing.assert_array_equal(data_downloaded[:, col_idx],
                                      decode_column(data_bunch, col_idx))
def load_mauna_loa_atmospheric_co2():
    """Fetch OpenML dataset 41187 and average its target per month.

    The first two data columns are combined into a fractional time value
    ``col0 + (col1 - 1) / 12``. Consecutive samples sharing the same value
    are averaged.

    Returns
    -------
    months : ndarray of shape (n_months, 1)
        The distinct consecutive time values.
    avg_ppmvs : ndarray of shape (n_months,)
        Mean target value for each entry of ``months``.
    """
    ml_data = fetch_openml(data_id=41187)
    years = ml_data.data[:, 0]
    month_numbers = ml_data.data[:, 1]
    month_float = years + (month_numbers - 1) / 12

    months, ppmv_sums, counts = [], [], []
    for month, ppmv in zip(month_float, ml_data.target):
        if months and month == months[-1]:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1
            continue
        # First sample of a new month opens a new bucket.
        months.append(month)
        ppmv_sums.append(ppmv)
        counts.append(1)

    avg_ppmvs = np.asarray(ppmv_sums) / counts
    months = np.asarray(months).reshape(-1, 1)
    return months, avg_ppmvs
# License: BSD 3 clause import numpy as np from mrex.compose import ColumnTransformer from mrex.datasets import fetch_openml from mrex.pipeline import Pipeline from mrex.impute import SimpleImputer from mrex.preprocessing import StandardScaler, OneHotEncoder from mrex.linear_model import LogisticRegression from mrex.model_selection import train_test_split, GridSearchCV np.random.seed(0) # Load data from https://www.openml.org/d/40945 X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) # Alternatively X and y can be obtained directly from the frame attribute: # X = titanic.frame.drop('survived', axis=1) # y = titanic.frame['survived'] # We will train our classifier with the following features: # Numeric Features: # - age: float. # - fare: float. # Categorical Features: # - embarked: categories encoded as strings {'C', 'S', 'Q'}. # - sex: categories encoded as strings {'female', 'male'}. # - pclass: ordinal integers {1, 2, 3}. # We create the preprocessing pipelines for both numeric and categorical data.
# Author: Adam Kleczewski # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt from mrex.datasets import fetch_openml from mrex.multioutput import ClassifierChain from mrex.model_selection import train_test_split from mrex.multiclass import OneVsRestClassifier from mrex.metrics import jaccard_score from mrex.linear_model import LogisticRegression print(__doc__) # Load a multi-label dataset from https://www.openml.org/d/40597 X, Y = fetch_openml('yeast', version=4, return_X_y=True) Y = Y == 'TRUE' X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0) # Fit an independent logistic regression model for each class using the # OneVsRestClassifier wrapper. base_lr = LogisticRegression() ovr = OneVsRestClassifier(base_lr) ovr.fit(X_train, Y_train) Y_pred_ovr = ovr.predict(X_test) ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples') # Fit an ensemble of logistic regression classifier chains and take the
from mrex.linear_model import LogisticRegression
from mrex.model_selection import train_test_split
from mrex.preprocessing import StandardScaler
from mrex.utils import check_random_state

print(__doc__)

# Author: Arthur Mensch <*****@*****.**>
# License: BSD 3 clause

# Turn down for faster convergence
# NOTE(review): `time` and `fetch_openml` are not imported in the visible
# part of this script -- presumably imported above this fragment; confirm.
t0 = time.time()
train_samples = 5000

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Shuffle X and y with the same fixed-seed permutation.
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
# Flatten every sample into a single row.
X = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_samples, test_size=10000)

# Fit the scaler on the training data only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Turn up tolerance for faster convergence
def _fetch_dataset_from_openml(data_id, data_name, data_version,
                               target_column,
                               expected_observations, expected_features,
                               expected_missing,
                               expected_data_dtype, expected_target_dtype,
                               expect_sparse, compare_default_target):
    """Fetch one dataset several ways and assert the results are consistent.

    The dataset is fetched by name and version, by name only, and by id
    with ``target_column``. The by-id result is checked for name, shapes,
    dtypes, feature names, category encoding, container type and (for
    dense data) the number of missing values, and is then returned.

    When ``compare_default_target`` is true, the by-id result is also
    compared with a fetch that uses the default target column.

    NOTE(review): ``expected_data_dtype`` is never read in this function;
    the data dtype is always compared against ``np.float64`` -- confirm
    that this is intended before relying on that parameter.
    """
    # fetches a dataset in three various ways from OpenML, using the
    # fetch_openml function, and does various checks on the validity of the
    # result. Note that this function can be mocked (by invoking
    # _monkey_patch_webbased_functions before invoking this function)
    data_by_name_id = fetch_openml(name=data_name, version=data_version,
                                   cache=False)
    assert int(data_by_name_id.details['id']) == data_id

    # Please note that cache=False is crucial, as the monkey patched files are
    # not consistent with reality
    fetch_openml(name=data_name, cache=False)
    # without specifying the version, there is no guarantee that the data id
    # will be the same

    # fetch with dataset id
    data_by_id = fetch_openml(data_id=data_id, cache=False,
                              target_column=target_column)
    assert data_by_id.details['name'] == data_name
    assert data_by_id.data.shape == (expected_observations, expected_features)
    if isinstance(target_column, str):
        # single target, so target is vector
        assert data_by_id.target.shape == (expected_observations, )
    elif isinstance(target_column, list):
        # multi target, so target is array
        assert data_by_id.target.shape == (expected_observations,
                                           len(target_column))
    assert data_by_id.data.dtype == np.float64
    assert data_by_id.target.dtype == expected_target_dtype
    assert len(data_by_id.feature_names) == expected_features
    for feature in data_by_id.feature_names:
        assert isinstance(feature, str)

    # TODO: pass in a list of expected nominal features
    # Every finite value of a categorical column must be a valid index into
    # that column's category list.
    for feature, categories in data_by_id.categories.items():
        feature_idx = data_by_id.feature_names.index(feature)
        values = np.unique(data_by_id.data[:, feature_idx])
        values = values[np.isfinite(values)]
        assert set(values) <= set(range(len(categories)))

    if compare_default_target:
        # check whether the data by id and data by id target are equal
        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
        # Float arrays are compared with a tolerance, anything else exactly.
        if data_by_id.data.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.data,
                                       data_by_id_default.data)
        else:
            assert np.array_equal(data_by_id.data, data_by_id_default.data)
        if data_by_id.target.dtype == np.float64:
            np.testing.assert_allclose(data_by_id.target,
                                       data_by_id_default.target)
        else:
            assert np.array_equal(data_by_id.target,
                                  data_by_id_default.target)

    if expect_sparse:
        assert isinstance(data_by_id.data, scipy.sparse.csr_matrix)
    else:
        assert isinstance(data_by_id.data, np.ndarray)
        # np.isnan doesn't work on CSR matrix
        assert (np.count_nonzero(np.isnan(
            data_by_id.data)) == expected_missing)

    # test return_X_y option
    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
                         target_column=target_column)
    check_return_X_y(data_by_id, fetch_func)
    return data_by_id
# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] plt.figure() for dataset_name in datasets: # loading and vectorization print('loading data') if dataset_name in ['http', 'smtp', 'SA', 'SF']: dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=random_state) X = dataset.data y = dataset.target if dataset_name == 'shuttle': dataset = fetch_openml('shuttle') X = dataset.data y = dataset.target # we remove data with label 4 # normal data are then those of class 1 s = (y != 4) X = X[s, :] y = y[s] y = (y != 1).astype(int) if dataset_name == 'forestcover': dataset = fetch_covtype() X = dataset.data y = dataset.target # normal data are those with attribute 2 # abnormal those with attribute 4