def setup_working_with_text_data(): if IS_PYPY and os.environ.get('CI', None): raise SkipTest('Skipping too slow test with PyPy on CI') check_skip_network() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests")
def fetch_species_distributions(data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) Read more in the :ref:`User Guide <datasets>`. Parameters ---------- data_home : optional, default: None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns -------- The data is returned as a Bunch object with the following attributes: coverages : array, shape = [14, 1592, 1212] These represent the 14 features measured at each point of the map grid. The latitude/longitude values for the grid are discussed below. Missing data is represented by the value -9999. train : record array, shape = (1623,) The training points for the data. Each point has three fields: - train['species'] is the species name - train['dd long'] is the longitude, in degrees - train['dd lat'] is the latitude, in degrees test : record array, shape = (619,) The test points for the data. Same format as the training data. Nx, Ny : integers The number of longitudes (x) and latitudes (y) in the grid x_left_lower_corner, y_left_lower_corner : floats The (x,y) position of the lower-left corner, in degrees grid_size : float The spacing between points of the grid, in degrees References ---------- * `"Maximum entropy modeling of species geographic distributions" <http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006. Notes ----- This dataset represents the geographic distribution of species. The dataset is provided by Phillips et. al. (2006). The two species are: - `"Bradypus variegatus" <http://www.iucnredlist.org/details/3038/0>`_ , the Brown-throated Sloth. - `"Microryzomys minutus" <http://www.iucnredlist.org/details/13408/0>`_ , also known as the Forest Small Rice Rat, a rodent that lives in Peru, Colombia, Ecuador, Peru, and Venezuela. * For an example of using this dataset with scikit-learn, see :ref:`examples/applications/plot_species_distribution_modeling.py <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`. """ data_home = get_data_home(data_home) if not exists(data_home): makedirs(data_home) # Define parameters for the data files. These should not be changed # unless the data model changes. They will be saved in the npz file # with the downloaded data. extra_params = dict(x_left_lower_corner=-94.8, Nx=1212, y_left_lower_corner=-56.05, Ny=1592, grid_size=0.05) dtype = np.int16 archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME) if not exists(archive_path): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") logger.info('Downloading species data from %s to %s' % (SAMPLES.url, data_home)) samples_path = _fetch_remote(SAMPLES, dirname=data_home) with np.load(samples_path) as X: # samples.zip is a valid npz for f in X.files: fhandle = BytesIO(X[f]) if 'train' in f: train = _load_csv(fhandle) if 'test' in f: test = _load_csv(fhandle) remove(samples_path) logger.info('Downloading coverage data from %s to %s' % (COVERAGES.url, data_home)) coverages_path = _fetch_remote(COVERAGES, dirname=data_home) with np.load(coverages_path) as X: # coverages.zip is a valid npz coverages = [] for f in X.files: fhandle = BytesIO(X[f]) logger.debug(' - converting {}'.format(f)) coverages.append(_load_coverage(fhandle)) coverages = np.asarray(coverages, dtype=dtype) remove(coverages_path) bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params) joblib.dump(bunch, archive_path, compress=9) else: bunch = joblib.load(archive_path) return bunch
def setup_working_with_text_data(): check_skip_network() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests")
def setup_twenty_newsgroups(): data_home = get_data_home() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests")
def fetch_species_distributions(data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) Read more in the :ref:`User Guide <datasets>`. Parameters ---------- data_home : optional, default: None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns -------- The data is returned as a Bunch object with the following attributes: coverages : array, shape = [14, 1592, 1212] These represent the 14 features measured at each point of the map grid. The latitude/longitude values for the grid are discussed below. Missing data is represented by the value -9999. train : record array, shape = (1623,) The training points for the data. Each point has three fields: - train['species'] is the species name - train['dd long'] is the longitude, in degrees - train['dd lat'] is the latitude, in degrees test : record array, shape = (619,) The test points for the data. Same format as the training data. Nx, Ny : integers The number of longitudes (x) and latitudes (y) in the grid x_left_lower_corner, y_left_lower_corner : floats The (x,y) position of the lower-left corner, in degrees grid_size : float The spacing between points of the grid, in degrees References ---------- * `"Maximum entropy modeling of species geographic distributions" <http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259, 2006. Notes ----- This dataset represents the geographic distribution of species. The dataset is provided by Phillips et. al. (2006). The two species are: - `"Bradypus variegatus" <http://www.iucnredlist.org/details/3038/0>`_ , the Brown-throated Sloth. - `"Microryzomys minutus" <http://www.iucnredlist.org/details/13408/0>`_ , also known as the Forest Small Rice Rat, a rodent that lives in Peru, Colombia, Ecuador, Peru, and Venezuela. - For an example of using this dataset with scikit-learn, see :ref:`examples/applications/plot_species_distribution_modeling.py <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`. """ data_home = get_data_home(data_home) if not exists(data_home): makedirs(data_home) # Define parameters for the data files. These should not be changed # unless the data model changes. They will be saved in the npz file # with the downloaded data. extra_params = dict(x_left_lower_corner=-94.8, Nx=1212, y_left_lower_corner=-56.05, Ny=1592, grid_size=0.05) dtype = np.int16 archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME) if not exists(archive_path): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") logger.info('Downloading species data from %s to %s' % ( SAMPLES.url, data_home)) samples_path = _fetch_remote(SAMPLES, dirname=data_home) X = np.load(samples_path) # samples.zip is a valid npz remove(samples_path) for f in X.files: fhandle = BytesIO(X[f]) if 'train' in f: train = _load_csv(fhandle) if 'test' in f: test = _load_csv(fhandle) logger.info('Downloading coverage data from %s to %s' % ( COVERAGES.url, data_home)) coverages_path = _fetch_remote(COVERAGES, dirname=data_home) X = np.load(coverages_path) # coverages.zip is a valid npz remove(coverages_path) coverages = [] for f in X.files: fhandle = BytesIO(X[f]) logger.debug(' - converting {}'.format(f)) coverages.append(_load_coverage(fhandle)) coverages = np.asarray(coverages, dtype=dtype) bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params) joblib.dump(bunch, archive_path, compress=9) else: bunch = joblib.load(archive_path) return bunch
def setup_twenty_newsgroups(): data_home = get_data_home() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests")
def fetch_classic(data_home=None, subset='all', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True): """Load the filenames and data from the 20 newsgroups dataset. Read more in the :ref:`User Guide <20newsgroups>`. Parameters ---------- subset: 'train' or 'test', 'all', optional Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' for both, with shuffled ordering. data_home: optional, default: None Specify a download and cache folder for the datasets. If None, all scikit-learn data is stored in '~/scikit_learn_data' subfolders. categories: None or collection of string or unicode If None (default), load all the categories. If not None, list of category names to load (other categories ignored). shuffle: bool, optional Whether or not to shuffle the data: might be important for models that make the assumption that the samples are independent and identically distributed (i.i.d.), such as stochastic gradient descent. random_state: numpy random number generator or seed integer Used to shuffle the dataset. download_if_missing: optional, True by default If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site. remove: tuple May contain any subset of ('headers', 'footers', 'quotes'). Each of these are kinds of text that will be detected and removed from the newsgroup posts, preventing classifiers from overfitting on metadata. 'headers' removes newsgroup headers, 'footers' removes blocks at the ends of posts that look like signatures, and 'quotes' removes lines that appear to be quoting another post. 'headers' follows an exact standard; the other filters are not always correct. """ data_home = get_data_home(data_home=data_home) cache_path = _pkl_filepath(data_home, CACHE_NAME) classic_home = os.path.join(data_home, CLASSIC_HOME) cache = None if os.path.exists(cache_path): try: with open(cache_path, 'rb') as f: compressed_content = f.read() uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) except Exception as e: print(80 * '_') print('Cache loading failed') print(80 * '_') print(e) if cache is None: if download_if_missing: cache = download_classic(target_dir=classic_home, cache_path=cache_path) else: raise IOError('classic dataset not found') if subset in ('train', 'test'): data = cache[subset] elif subset == 'all': data_lst = list() target = list() filenames = list() for subset in ('train', 'test'): data = cache[subset] data_lst.extend(data.data) target.extend(data.target) filenames.extend(data.filenames) data.data = data_lst data.target = np.array(target) data.filenames = np.array(filenames) else: raise ValueError( "subset can only be 'train', 'test' or 'all', got '%s'" % subset) data.description = 'the classic dataset' # if 'headers' in remove: # data.data = [strip_newsgroup_header(text) for text in data.data] # if 'footers' in remove: # data.data = [strip_newsgroup_footer(text) for text in data.data] # if 'quotes' in remove: # data.data = [strip_newsgroup_quoting(text) for text in data.data] if categories is not None: labels = [(data.target_names.index(cat), cat) for cat in categories] # Sort the categories to have the ordering of the labels labels.sort() labels, categories = zip(*labels) mask = np.in1d(data.target, labels) data.filenames = data.filenames[mask] data.target = data.target[mask] # searchsorted to have continuous labels data.target = np.searchsorted(labels, data.target) data.target_names = list(categories) # Use an object array to shuffle: avoids memory copy data_lst = np.array(data.data, dtype=object) data_lst = data_lst[mask] data.data = data_lst.tolist() if shuffle: random_state = check_random_state(random_state) indices = np.arange(data.target.shape[0]) random_state.shuffle(indices) data.filenames = data.filenames[indices] data.target = data.target[indices] # Use an object array to shuffle: avoids memory copy data_lst = np.array(data.data, dtype=object) data_lst = data_lst[indices] data.data = data_lst.tolist() return data
def fetch_higgs(data_home=None, download_if_missing=True, random_state=None, shuffle=False): """Load the Higgs dataset, downloading it if necessary. Read more in the :ref:`User Guide <datasets>`. Parameters ---------- data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, optional (default=None) Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. shuffle : bool, default=False Whether to shuffle dataset. Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (11000000, 28) Each row corresponds to the 28 features in the dataset. dataset.target : numpy array of shape (11000000,) The targets are binary: 0 (background) - 1 (signal). dataset.DESCR : string Description of the Higgs dataset. """ data_home = get_data_home(data_home=data_home) higgs_dir = join(data_home, "higgs") samples_path = _pkl_filepath(higgs_dir, "samples") targets_path = _pkl_filepath(higgs_dir, "targets") available = exists(samples_path) if download_if_missing and not available: makedirs(higgs_dir, exist_ok=True) # logger.warning("Downloading %s" % URL) print("Downloading {}".format(URL)) f = BytesIO(urlopen(URL).read()) Xy = np.genfromtxt(GzipFile(fileobj=f), delimiter=',', dtype=np.float32) X = Xy[:, 1::] y = Xy[:, 0].astype(np.int32) joblib.dump(X, samples_path, compress=9) joblib.dump(y, targets_path, compress=9) print("Dumped the data") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) print("Data loaded from pickle") if shuffle: ind = np.arange(X.shape[0]) rng = check_random_state(random_state) rng.shuffle(ind) X = X[ind] y = y[ind] return Bunch(data=X, target=y, DESCR=__doc__)
def fetch_uci_glass_outlier(data_home=None, shuffle=False, random_state=0, download_if_missing=True): """Load the UCI glass data-set from AT&T (classification). Download it if necessary. ================= ===================== Classes 6 Samples total 214 Dimensionality 9 Features real ================= ===================== Parameters ---------- data_home : optional, default: None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. shuffle : boolean, optional If True the order of the dataset is shuffled to avoid having images of the same person grouped. random_state : int, RandomState instance or None (default=0) Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. download_if_missing : optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : numpy array of shape (214, 9) Each row corresponds to a glass feature of 9 dimension target : numpy array of shape (214, ) Labels associated to each glas. Those labels are from [1,2,3,5,6,7] and correspond to the Subject IDs. """ global GLASS data_home = get_data_home(data_home=data_home) if not exists(data_home): makedirs(data_home) filepath = _pkl_filepath(data_home, 'uci_glass_outlier.pkz') if not exists(filepath): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") print('downloading UCI GLASS from %s to %s' % (GLASS.url, data_home)) data_path = _fetch_remote(GLASS, dirname=data_home) glass = np.genfromtxt(data_path, delimiter=",") # the class 6 (minority) as outlier and all other classes as inliers glass[:, -1] = 2 * (glass[:, -1] != 6) - 1 _joblib.dump(glass, filepath, compress=6) remove(data_path) else: glass = _joblib.load(filepath) feature = glass[:, 1:-1] target = glass[:, -1] if shuffle: random_state = check_random_state(random_state) order = random_state.permutation(len(glass)) feature = glass[order] target = target[order] return (feature, target)
def fetch_classic(data_home=None, subset='all', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True): """Load the filenames and data from the 20 newsgroups dataset. Read more in the :ref:`User Guide <20newsgroups>`. Parameters ---------- subset: 'train' or 'test', 'all', optional Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' for both, with shuffled ordering. data_home: optional, default: None Specify a download and cache folder for the datasets. If None, all scikit-learn data is stored in '~/scikit_learn_data' subfolders. categories: None or collection of string or unicode If None (default), load all the categories. If not None, list of category names to load (other categories ignored). shuffle: bool, optional Whether or not to shuffle the data: might be important for models that make the assumption that the samples are independent and identically distributed (i.i.d.), such as stochastic gradient descent. random_state: numpy random number generator or seed integer Used to shuffle the dataset. download_if_missing: optional, True by default If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site. remove: tuple May contain any subset of ('headers', 'footers', 'quotes'). Each of these are kinds of text that will be detected and removed from the newsgroup posts, preventing classifiers from overfitting on metadata. 'headers' removes newsgroup headers, 'footers' removes blocks at the ends of posts that look like signatures, and 'quotes' removes lines that appear to be quoting another post. 'headers' follows an exact standard; the other filters are not always correct. """ data_home = get_data_home(data_home=data_home) cache_path = _pkl_filepath(data_home, CACHE_NAME) classic_home = os.path.join(data_home, CLASSIC_HOME) cache = None if os.path.exists(cache_path): try: with open(cache_path, 'rb') as f: compressed_content = f.read() uncompressed_content = codecs.decode( compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) except Exception as e: print(80 * '_') print('Cache loading failed') print(80 * '_') print(e) if cache is None: if download_if_missing: cache = download_classic(target_dir=classic_home, cache_path=cache_path) else: raise IOError('classic dataset not found') if subset in ('train', 'test'): data = cache[subset] elif subset == 'all': data_lst = list() target = list() filenames = list() for subset in ('train', 'test'): data = cache[subset] data_lst.extend(data.data) target.extend(data.target) filenames.extend(data.filenames) data.data = data_lst data.target = np.array(target) data.filenames = np.array(filenames) else: raise ValueError( "subset can only be 'train', 'test' or 'all', got '%s'" % subset) data.description = 'the classic dataset' # if 'headers' in remove: # data.data = [strip_newsgroup_header(text) for text in data.data] # if 'footers' in remove: # data.data = [strip_newsgroup_footer(text) for text in data.data] # if 'quotes' in remove: # data.data = [strip_newsgroup_quoting(text) for text in data.data] if categories is not None: labels = [(data.target_names.index(cat), cat) for cat in categories] # Sort the categories to have the ordering of the labels labels.sort() labels, categories = zip(*labels) mask = np.in1d(data.target, labels) data.filenames = data.filenames[mask] data.target = data.target[mask] # searchsorted to have continuous labels data.target = np.searchsorted(labels, data.target) data.target_names = list(categories) # Use an object array to shuffle: avoids memory copy data_lst = np.array(data.data, dtype=object) data_lst = data_lst[mask] data.data = data_lst.tolist() if shuffle: random_state = check_random_state(random_state) indices = np.arange(data.target.shape[0]) random_state.shuffle(indices) data.filenames = data.filenames[indices] data.target = data.target[indices] # Use an object array to shuffle: avoids memory copy data_lst = np.array(data.data, dtype=object) data_lst = data_lst[indices] data.data = data_lst.tolist() return data
def setup_working_with_text_data(): check_skip_network() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests")
def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None, download_if_missing=True, return_X_y=False): """Load the 20 newsgroups dataset and vectorize it into token counts \ (classification). Download it if necessary. This is a convenience function; the transformation is done using the default settings for :class:`sklearn.feature_extraction.text.CountVectorizer`. For more advanced usage (stopword filtering, n-gram extraction, etc.), combine fetch_20newsgroups with a custom :class:`sklearn.feature_extraction.text.CountVectorizer`, :class:`sklearn.feature_extraction.text.HashingVectorizer`, :class:`sklearn.feature_extraction.text.TfidfTransformer` or :class:`sklearn.feature_extraction.text.TfidfVectorizer`. ================= ========== Classes 20 Samples total 18846 Dimensionality 130107 Features real ================= ========== Read more in the :ref:`User Guide <20newsgroups_dataset>`. Parameters ---------- subset : 'train' or 'test', 'all', optional Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' for both, with shuffled ordering. remove : tuple May contain any subset of ('headers', 'footers', 'quotes'). Each of these are kinds of text that will be detected and removed from the newsgroup posts, preventing classifiers from overfitting on metadata. 'headers' removes newsgroup headers, 'footers' removes blocks at the ends of posts that look like signatures, and 'quotes' removes lines that appear to be quoting another post. data_home : optional, default: None Specify an download and cache folder for the datasets. If None, all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : optional, True by default If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site. return_X_y : boolean, default=False. If True, returns ``(data.data, data.target)`` instead of a Bunch object. .. versionadded:: 0.20 Returns ------- bunch : Bunch object bunch.data: sparse matrix, shape [n_samples, n_features] bunch.target: array, shape [n_samples] bunch.target_names: list, length [n_classes] bunch.DESCR: a description of the dataset. (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.20 """ data_home = get_data_home(data_home=data_home) filebase = '20newsgroup_vectorized' if remove: filebase += 'remove-' + ('-'.join(remove)) target_file = _pkl_filepath(data_home, filebase + ".pkl") # we shuffle but use a fixed seed for the memoization data_train = fetch_data(data_home=data_home, subset='train', categories=None, shuffle=True, random_state=12, remove=remove, download_if_missing=download_if_missing) data_test = fetch_data(data_home=data_home, subset='test', categories=None, shuffle=True, random_state=12, remove=remove, download_if_missing=download_if_missing) if os.path.exists(target_file): X_train, X_test = _joblib.load(target_file) else: vectorizer = CountVectorizer(dtype=np.int16) X_train = vectorizer.fit_transform(data_train.data).tocsr() X_test = vectorizer.transform(data_test.data).tocsr() _joblib.dump((X_train, X_test), target_file, compress=9) # the data is stored as int16 for compactness # but normalize needs floats X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) normalize(X_train, copy=False) normalize(X_test, copy=False) target_names = data_train.target_names if subset == "train": data = X_train target = data_train.target elif subset == "test": data = X_test target = data_test.target elif subset == "all": data = sp.vstack((X_train, X_test)).tocsr() target = np.concatenate((data_train.target, data_test.target)) else: raise ValueError("%r is not a valid subset: should be one of " "['train', 'test', 'all']" % subset) module_path = dirname(__file__) with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file: fdescr = rst_file.read() if return_X_y: return data, target return Bunch(data=data, target=target, target_names=target_names, DESCR=fdescr)
def fetch_data(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True): """Load the filenames and data from the 20 newsgroups dataset \ (classification). Download it if necessary. ================= ========== Classes 20 Samples total 18846 Dimensionality 1 Features text ================= ========== Read more in the :ref:`User Guide <20newsgroups_dataset>`. Parameters ---------- data_home : optional, default: None Specify a download and cache folder for the datasets. If None, all scikit-learn data is stored in '~/scikit_learn_data' subfolders. subset : 'train' or 'test', 'all', optional Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' for both, with shuffled ordering. categories : None or collection of string or unicode If None (default), load all the categories. If not None, list of category names to load (other categories ignored). shuffle : bool, optional Whether or not to shuffle the data: might be important for models that make the assumption that the samples are independent and identically distributed (i.i.d.), such as stochastic gradient descent. random_state : int, RandomState instance or None (default) Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. remove : tuple May contain any subset of ('headers', 'footers', 'quotes'). Each of these are kinds of text that will be detected and removed from the newsgroup posts, preventing classifiers from overfitting on metadata. 'headers' removes newsgroup headers, 'footers' removes blocks at the ends of posts that look like signatures, and 'quotes' removes lines that appear to be quoting another post. 'headers' follows an exact standard; the other filters are not always correct. download_if_missing : optional, True by default If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- bunch : Bunch object bunch.data: list, length [n_samples] bunch.target: array, shape [n_samples] bunch.filenames: list, length [n_classes] bunch.DESCR: a description of the dataset. """ data_home = get_data_home(data_home=data_home) cache_path = _pkl_filepath(data_home, CACHE_NAME) twenty_home = os.path.join(data_home, "20news_home") cache = None if os.path.exists(cache_path): try: with open(cache_path, 'rb') as f: compressed_content = f.read() uncompressed_content = codecs.decode( compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) except Exception as e: print(80 * '_') print('Cache loading failed') print(80 * '_') print(e) if cache is None: if download_if_missing: logger.info("Downloading 20news dataset. " "This may take a few minutes.") cache = _download_20newsgroups(target_dir=twenty_home, cache_path=cache_path) else: raise IOError('20Newsgroups dataset not found') if subset in ('train', 'test'): data = cache[subset] elif subset == 'all': data_lst = list() target = list() filenames = list() for subset in ('train', 'test'): data = cache[subset] data_lst.extend(data.data) target.extend(data.target) filenames.extend(data.filenames) data.data = data_lst data.target = np.array(target) data.filenames = np.array(filenames) else: raise ValueError( "subset can only be 'train', 'test' or 'all', got '%s'" % subset) module_path = dirname(__file__) with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file: fdescr = rst_file.read() data.DESCR = fdescr if 'headers' in remove: data.data = [strip_newsgroup_header(text) for text in data.data] if 'footers' in remove: data.data = [strip_newsgroup_footer(text) for text in data.data] if 'quotes' in remove: data.data = [strip_newsgroup_quoting(text) for text in data.data] if categories is not None: labels = [(data.target_names.index(cat), cat) for cat in categories] # Sort the categories to have the ordering of the labels labels.sort() labels, categories = zip(*labels) mask = np.in1d(data.target, labels) data.filenames = data.filenames[mask] data.target = data.target[mask] # searchsorted to have continuous labels data.target = np.searchsorted(labels, data.target) data.target_names = list(categories) # Use an object array to shuffle: avoids memory copy data_lst = np.array(data.data, dtype=object) data_lst = data_lst[mask] data.data = data_lst.tolist() if shuffle: random_state = check_random_state(random_state) indices = np.arange(data.target.shape[0]) random_state.shuffle(indices) data.filenames = data.filenames[indices] data.target = data.target[indices] # Use an object array to shuffle: avoids memory copy data_lst = np.array(data.data, dtype=object) data_lst = data_lst[indices] data.data = data_lst.tolist() return data