def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert sp.isspmatrix_csr(bunch.data)
    assert bunch.data.shape == (11314, 130107)
    assert bunch.target.shape[0] == 11314
    assert bunch.data.dtype == np.float64

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert sp.isspmatrix_csr(bunch.data)
    assert bunch.data.shape == (7532, 130107)
    assert bunch.target.shape[0] == 7532
    assert bunch.data.dtype == np.float64

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized,
                         subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert sp.isspmatrix_csr(bunch.data)
    assert bunch.data.shape == (11314 + 7532, 130107)
    assert bunch.target.shape[0] == 11314 + 7532
    assert bunch.data.dtype == np.float64
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
def load_20newsgroup_vectorized(folder=SCIKIT_LEARN_DATA, one_hot=True,
                                partitions_proportions=None, shuffle=False,
                                binary_problem=False, as_tensor=True,
                                minus_value=-1.):
    data_train = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='train')
    data_test = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='test')

    X_train = data_train.data
    X_test = data_test.data
    y_train = data_train.target
    y_test = data_test.target
    if binary_problem:
        y_train[data_train.target < 10] = minus_value
        y_train[data_train.target >= 10] = 1.
        y_test[data_test.target < 10] = minus_value
        y_test[data_test.target >= 10] = 1.
    if one_hot:
        y_train = to_one_hot_enc(y_train)
        y_test = to_one_hot_enc(y_test)
    # if shuffle and sk_shuffle:
    #     xtr = X_train.tocoo()
    #     xts = X_test.tocoo()

    d_train = Dataset(data=X_train, target=y_train,
                      info={'target names': data_train.target_names})
    d_test = Dataset(data=X_test, target=y_test,
                     info={'target names': data_train.target_names})
    res = [d_train, d_test]
    if partitions_proportions:
        res = redivide_data([d_train, d_test],
                            partition_proportions=partitions_proportions,
                            shuffle=False)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return Datasets.from_list(res)
def load_20_newsgroups_data():
    """
    Load the 20 newsgroups dataset.
    """
    newsgroups_train = fetch_20newsgroups_vectorized(subset="train")
    newsgroups_test = fetch_20newsgroups_vectorized(subset="test")
    training_data = newsgroups_train.data
    training_labels = newsgroups_train.target
    test_data = newsgroups_test.data
    test_labels = newsgroups_test.target
    return training_data, training_labels, test_data, test_labels
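# A minimal usage sketch for the helper above (an assumption, not part of the
# original source): fit a multinomial Naive Bayes classifier on the returned
# sparse TF-IDF matrices and report test accuracy. MultinomialNB and
# accuracy_score are standard scikit-learn APIs; everything else here is
# illustrative.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

X_train, y_train, X_test, y_test = load_20_newsgroups_data()
clf = MultinomialNB(alpha=0.01).fit(X_train, y_train)
print("test accuracy:", accuracy_score(y_test, clf.predict(X_test)))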
def test_20news_normalization():
    try:
        X = datasets.fetch_20newsgroups_vectorized(normalize=False,
                                                   download_if_missing=False)
        X_ = datasets.fetch_20newsgroups_vectorized(normalize=True,
                                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    X_norm = X_['data'][:100]
    X = X['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X))
    assert np.allclose(np.linalg.norm(X_norm.todense(), axis=1), 1)
def load_news():
    # Load the 20 newsgroups dataset
    data = fetch_20newsgroups_vectorized(subset='train',
                                         remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups_vectorized(subset='test',
                                         remove=('headers', 'footers', 'quotes'))

    # select features
    ch2 = SelectKBest(chi2, k=500)
    X_train = ch2.fit_transform(data.data, data.target)
    X_test = ch2.transform(test.data)
    X_train = X_train.toarray()
    X_test = X_test.toarray()

    Y_train = data['target']
    Y_test = test['target']

    def shared_dataset(data_xy, borrow=True):
        """Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into GPU memory (when code is run on a GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats,
        # therefore we store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as indices, and if they are
        # floats it doesn't make sense), therefore instead of returning
        # ``shared_y`` we cast it to int. This little hack lets us get
        # around the issue.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset([X_test, Y_test])
    valid_set_x, valid_set_y = [test_set_x, test_set_y]
    train_set_x, train_set_y = shared_dataset([X_train, Y_train])

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target
    y[y < y.mean()] = -1
    y[y >= y.mean()] = 1

    Xt, Xh, yt, yh = cross_validation.train_test_split(
        X, y, test_size=.5, random_state=0)

    # compute the scores
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(
            solver='lbfgs', C=np.exp(-a), fit_intercept=False,
            tol=1e-6, max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)
    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(max_iter=50)
    clf.fit(Xt, yt, Xh, yh)
    np.testing.assert_array_less(np.abs(clf.alpha_ - best_alpha), 0.5)
def load_data(subset, idx, n):
    X, y = fetch_20newsgroups_vectorized(subset=subset, return_X_y=True)
    start = X.shape[0] // n * idx
    end = X.shape[0] // n * (idx + 1)
    return X[start:end], y[start:end]
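# Hypothetical usage sketch for the sharding helper above (not part of the
# original source): load the training split in four roughly equal partitions
# and print the shard sizes. Note that because of the floor division, any
# remainder rows beyond the last full shard are silently dropped.
n_shards = 4
shards = [load_data('train', i, n_shards) for i in range(n_shards)]
for i, (X_part, y_part) in enumerate(shards):
    print("shard %d: %d samples" % (i, X_part.shape[0]))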
def generate_data(case, sparse=False):
    # Generate regression / classification data.
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    return data
def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target
    y[y < y.mean()] = -1
    y[y >= y.mean()] = 1

    Xt, Xh, yt, yh = model_selection.train_test_split(X, y, test_size=.5,
                                                      random_state=0)

    # compute the scores
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(solver='lbfgs', C=np.exp(-a),
                                             fit_intercept=False,
                                             tol=1e-6, max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)
    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(max_iter=50)
    clf.fit(Xt, yt, Xh, yh)
    np.testing.assert_array_less(np.abs(clf.alpha_ - best_alpha), 0.5)
def all_vector():
    print("all_vector load:")
    t0 = time()
    # raw_data = fetch_20newsgroups(subset='train').data
    # data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
    tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
    tfidf_test_3 = fetch_20newsgroups_vectorized(subset='test')
    duration = time() - t0
    print("done in %fs" % duration)
    # print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
    print("the shape of train is " + repr(tfidf_train_3.data.shape))
    print("the shape of test is " + repr(tfidf_test_3.data.shape))
    return tfidf_train_3.data, tfidf_test_3.data
def fetch_newsgroups(self, ncols=100):
    """
    Return an ``ncols``-column random sample of the newsgroups dataset.
    """
    news = fetch_20newsgroups_vectorized().data
    [nn, dd] = news.shape
    cols = np.random.choice(dd, size=ncols, replace=False)
    # rows = np.random.choice(nn, size=nsamples, replace=False)
    return news[:, cols].toarray()
def __init__(self):
    n_samples = 10000

    dataset = fetch_20newsgroups_vectorized('all')
    X = dataset.data
    y = dataset.target
    X = X[:n_samples]
    y = y[:n_samples]

    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        X, y, random_state=42, stratify=y, test_size=0.1)
def test_20news_vectorized():
    # This test is slow.
    raise SkipTest

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_equal(bunch.data.shape, (11314, 107130))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_equal(bunch.data.shape, (7532, 107130))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_equal(bunch.data.shape, (11314 + 7532, 107130))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X
def test_build():
    X, y = fetch_20newsgroups_vectorized(return_X_y=True)
    # Select only the first 500 samples
    X = X[:500]
    y = y[:500]
    # Precompute cosine distance matrix
    diss = cosine_distances(X)
    # run build
    ske = KMedoids(20, "precomputed", init="build", max_iter=0)
    ske.fit(diss)
    assert ske.inertia_ <= 230
    assert len(np.unique(ske.labels_)) == 20
def test_20news_vectorized():
    # This test is slow.
    raise SkipTest("Test too slow.")

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 107428))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 107428))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 107428))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
def load_data(name, partition_id, n_partitions):
    """load partition of data into global var `name`"""
    from sklearn.datasets import fetch_20newsgroups_vectorized
    from sklearn.utils import gen_even_slices
    dataset = fetch_20newsgroups_vectorized('test')
    size = dataset.data.shape[0]
    slices = list(gen_even_slices(size, n_partitions))
    part = dataset.data[slices[partition_id]]
    # put it in globals
    globals().update({name: part})
    return part.shape
def get_mldata(dataset, save_dir):
    # Use scikit-learn to grab datasets and save them to save_dir.
    # save_dir = FLAGS.save_dir
    filename = os.path.join(save_dir, dataset[1] + '.pkl')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not gfile.Exists(filename):
        if dataset[0][-3:] == 'csv':
            data = get_csv_data(dataset[0])
        elif dataset[0] == 'breast_cancer':
            data = load_breast_cancer()
        elif dataset[0] == 'iris':
            data = load_iris()
        elif dataset[0] == 'newsgroup':
            # Removing header information to make sure that no newsgroup
            # identifying information is included in the data
            data = fetch_20newsgroups_vectorized(subset='all',
                                                 remove=('headers',))
            tfidf = TfidfTransformer(norm='l2')
            X = tfidf.fit_transform(data.data)
            data.data = X
        elif dataset[0] == 'rcv1':
            sklearn.datasets.rcv1.URL = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
            sklearn.datasets.rcv1.URL_topics = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
            data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
        elif dataset[0] == 'wikipedia_attack':
            data = get_wikipedia_talk_data()
        elif dataset[0] == 'cifar10':
            data = get_cifar10()
        elif 'keras' in dataset[0]:
            data = get_keras_data(dataset[0])
        else:
            try:
                data = fetch_mldata(dataset[0])
            except:
                print('error')
                raise Exception('ERROR: failed to fetch data from mldata.org')
        X = data.data
        y = data.target
        if X.shape[0] != y.shape[0]:
            X = np.transpose(X)
        assert X.shape[0] == y.shape[0]
        data = {'data': X, 'target': y}
        with open(filename, 'wb') as f:
            pickle.dump(data, f)
        print(len(data), filename)
def nb_news2():
    news = fetch_20newsgroups_vectorized()
    # print(news.data)
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target,
                                                        random_state=22)
    estimator = MultinomialNB()
    estimator.fit(x_train, y_train)
    score = estimator.score(x_test, y_test)
    print('score:', score)
def get_mldata(dataset):
    # Use scikit-learn to grab datasets and save them to save_dir.
    save_dir = FLAGS.save_dir
    filename = os.path.join(save_dir, dataset[1] + '.pkl')
    if not gfile.Exists(save_dir):
        # gfile.MkDir cannot create nested directories; os.makedirs() could be
        # used instead
        gfile.MkDir(save_dir)
    if not gfile.Exists(filename):
        if dataset[0][-3:] == 'csv':
            data = get_csv_data(dataset[0])
        elif dataset[0] == 'breast_cancer':
            data = load_breast_cancer()
        elif dataset[0] == 'iris':
            data = load_iris()
        elif dataset[0] == 'newsgroup':
            # Removing header information to make sure that no newsgroup
            # identifying information is included in the data
            data = fetch_20newsgroups_vectorized(subset='all',
                                                 remove=('headers',))
            tfidf = TfidfTransformer(norm='l2')
            X = tfidf.fit_transform(data.data)
            data.data = X
        elif dataset[0] == 'rcv1':
            sklearn.datasets.rcv1.URL = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
            sklearn.datasets.rcv1.URL_topics = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
            data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
        elif dataset[0] == 'wikipedia_attack':
            data = get_wikipedia_talk_data()
        elif dataset[0] == 'cifar10':
            data = get_cifar10()
        elif 'keras' in dataset[0]:
            data = get_keras_data(dataset[0])
        else:
            try:
                data = fetch_mldata(dataset[0])
            except:
                raise Exception('ERROR: failed to fetch data from mldata.org')
        X = data.data
        y = data.target
        if X.shape[0] != y.shape[0]:
            # transpose() swaps the array axes, i.e. takes the matrix transpose
            X = np.transpose(X)
        # assert raises an exception when the condition is false
        assert X.shape[0] == y.shape[0]
        data = {'data': X, 'target': y}
        pickle.dump(data, gfile.GFile(filename, 'w'))
def create_plot_curve():
    clients = parallel.Client()
    lview = clients.load_balanced_view()
    dview = clients[:]
    dview['data'] = fetch_20newsgroups_vectorized(remove=('headers',
                                                          'footers',
                                                          'quotes'))
    lview.block = True
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]
    with dview.sync_imports():
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score
    res = lview.map(grid_search, alphas)
    return res
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
def generate_data(case):
    """Generate regression/classification data."""
    if case == "regression":
        X, y = datasets.load_diabetes(return_X_y=True)
    elif case == "classification":
        X, y = datasets.fetch_20newsgroups_vectorized(subset="all",
                                                      return_X_y=True)
    X, y = shuffle(X, y)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    data = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }
    return data
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == "regression":
        bunch = datasets.load_boston()
    elif case == "classification":
        bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {"X_train": X_train, "X_test": X_test,
            "y_train": y_train, "y_test": y_test}
    return data
def generate_data(case):
    """Generate regression/classification data."""
    if case == "regression":
        X, y = datasets.load_diabetes(return_X_y=True)
        train_size = 0.8
    elif case == "classification":
        X, y = datasets.fetch_20newsgroups_vectorized(subset="all",
                                                      return_X_y=True)
        train_size = 0.4  # to make the example run faster
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=0)
    data = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }
    return data
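# Hypothetical usage of the helper above (not in the original source): build
# the classification split and inspect the resulting shapes before running a
# benchmark. The dictionary keys are the ones defined by generate_data itself.
data = generate_data("classification")
print(data["X_train"].shape, data["X_test"].shape)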
def load(name):
    """
    Load the database from Lazy Initialized Dictionary with its known name.

    :param name: Name of database
    :return: tuple(X, y)
    """
    databases = LazyDict({
        'breast_cancer': lambda: load_breast_cancer(return_X_y=True),
        'cov_type': lambda: itemgetter('data', 'target')(fetch_covtype()),
        'digits': lambda: load_digits(return_X_y=True),
        'iris': lambda: load_iris(return_X_y=True),
        'kddcup99': lambda: load_kddcup99(),
        'lfw': lambda: fetch_lfw_people(return_X_y=True),
        'mnist': lambda: openml.fetch_openml('mnist_784', version=1,
                                             return_X_y=True),
        'news_groups': lambda: itemgetter('data', 'target')(
            fetch_20newsgroups_vectorized(subset='all')),
        'olivetti_faces': lambda: itemgetter('data', 'target')(
            fetch_olivetti_faces()),
        'rcv1': lambda: fetch_rcv1(random_state=0, return_X_y=True),
        'wine': lambda: load_wine(return_X_y=True)
    })
    return databases.get(name)
def get_results():
    # get data
    data = fetch_20newsgroups_vectorized(remove=('headers', 'footers',
                                                 'quotes'))
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]

    # set up dview for imports
    clients = parallel.Client()
    dview = clients[:]
    with dview.sync_imports():
        # doesn't seem to like "import numpy as np", using numpy instead
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score
    dview.block = True
    # send data to clients
    dview['data'] = data

    # set up load balanced view for parallel processing
    lview = clients.load_balanced_view()
    # set blocking to True to get all results once processing is done
    lview.block = True
    results = lview.map(get_single_result, alphas)
    return results
import numpy

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.grid_search import RandomizedSearchCV


def best_cv_num(n):
    return int(1 + numpy.log2(n))


def best_n_iter(n):
    return numpy.ceil(10**6 / n)


if __name__ == '__main__':
    d = fetch_20newsgroups_vectorized(
        remove=('headers', 'footers', 'quotes'))
    X = d.data
    #X = StandardScaler(with_mean=False).fit_transform(X)
    #X = TruncatedSVD(n_components=400).fit_transform(X)
    y = d.target
    _n = X.shape[0]

    clf = MultinomialNB()
    params = {
        'alpha': 10**numpy.linspace(-7, 0, 1000)
    }

    # http://scikit-learn.org/stable/modules/sgd.html#tips-on-practical-use
    """
    clf = SGDClassifier(n_iter=best_n_iter(_n))
    params = {
__author__ = 'dmt101'

import logging

from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized

categories = None

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

data = fetch_20newsgroups_vectorized(subset='all')
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import dataset
import featurelearning


def evaluate(dataset_name, fl, ratio):
    print dataset_name, fl.__name__, ratio
    d = dataset.load_dataset(dataset_name)
    fea = d.data
    label = d.target
    fea = fl(fea)
    ss = StratifiedShuffleSplit(label, 3, test_size=(1 - ratio),
                                random_state=0)
    svc = LinearSVC()
    for train, test in ss:
        svc.fit(fea[train, :], label[train, :])
        predict = svc.predict(fea[test, :])
        acc = accuracy_score(label[test, :], predict)
        print acc


if __name__ == '__main__':
    pca = PCA()
    train = fetch_20newsgroups_vectorized('train')
    test = fetch_20newsgroups_vectorized('test')
    svc = LinearSVC()
    train_data = pca.fit_transform(train.data.toarray())
    svc.fit(train_data, train.target)
    test_data = pca.transform(test.data.toarray())
    predict = svc.predict(test_data)
    acc = accuracy_score(test.target, predict)
    print acc
    # evaluate('20newsgroups', featurelearning.TF_IDF, 0.1)
    # evaluate('20newsgroups', featurelearning.LDA, 0.1)
import sparse_interaction

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

dat = fetch_20newsgroups_vectorized()
X = dat.data
Y = dat.target
cv = StratifiedKFold(Y)

X = X[:, :20000]

si = sparse_interaction.SparseInteractionFeatures()
X_i = si.transform(X)

scores, scores_i = [], []
clf = SGDClassifier(penalty='l1', n_iter=10)
for train, test in cv:
    clf.fit(X[train], Y[train])
    scores.append(f1_score(Y[test], clf.predict(X[test]),
                           average='macro', pos_label=None))
    clf.fit(X_i[train], Y[train])
    scores_i.append(f1_score(Y[test], clf.predict(X_i[test]),
                             average='macro', pos_label=None))

print sum(scores), sum(scores_i)
#!/usr/bin/python3
# coding: utf-8
##################################################################
## No download needed: these built-in datasets are small and ship with scikit-learn
from sklearn import datasets
# datasets.load_iris()
##################################################################
## Cached under ~/scikit_learn_data
## There is no download prompt; the files are fetched automatically
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized

fetch_20newsgroups(subset='all')             # 15M; 20news-bydate_py3.pkz
fetch_20newsgroups_vectorized(subset='all')  # 6.3M; 20newsgroup_vectorized_py3.pkl
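# A small follow-up sketch (not part of the original snippet, added for
# illustration) showing what the returned Bunch contains: a sparse TF-IDF
# matrix in ``data``, integer labels 0-19 in ``target`` and the 20 newsgroup
# names in ``target_names``. The exact number of feature columns depends on
# the scikit-learn version.
bunch = fetch_20newsgroups_vectorized(subset='all')
print(type(bunch.data), bunch.data.shape)  # scipy sparse matrix, n_samples x n_features
print(bunch.target[:10])                   # integer class indices
print(len(bunch.target_names))             # 20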
plt.savefig(data_set_title + str("_") + str(lambda_value) + ".jpg")


if __name__ == '__main__':
    '''
    We can import this file safely into other files and use RandomSampler.
    The driver in this section only runs when you execute "python3 RandomSampling.py".
    '''
    print("Start")
    training_size = 100  # 100
    max_unlabeled_size = 400  # 400
    # lambda_value = 10**(-4)  # This needs to be tuned

    # Newsgroup Data
    train_dataset = fetch_20newsgroups_vectorized(subset='train')
    X_train_base = train_dataset.data
    y_train_base = train_dataset.target

    test_dataset = fetch_20newsgroups_vectorized(subset='test')
    X_test = test_dataset.data
    X_test = vstack([X_test, X_train_base[2000:, :]]).toarray()
    y_test = test_dataset.target
    y_test = np.append(y_test, y_train_base[2000:])

    X_train_base = X_train_base[:2000, :]
    y_train_base = y_train_base[:2000]

    X_train, y_train = X_train_base[:training_size], y_train_base[:training_size]
}

###############################################################################
# Data

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--estimators', nargs="+", required=True,
                        choices=ESTIMATORS)
    args = vars(parser.parse_args())

    data_train = fetch_20newsgroups_vectorized(subset="train")
    data_test = fetch_20newsgroups_vectorized(subset="test")
    X_train = check_array(data_train.data, dtype=np.float32,
                          accept_sparse="csc")
    X_test = check_array(data_test.data, dtype=np.float32,
                         accept_sparse="csr")
    y_train = data_train.target
    y_test = data_test.target

    print("20 newsgroups")
    print("=============")
    print("X_train.shape = {0}".format(X_train.shape))
    print("X_train.format = {0}".format(X_train.format))
    print("X_train.dtype = {0}".format(X_train.dtype))
    print("X_train density = {0}"
          "".format(X_train.nnz / np.product(X_train.shape)))
def exp(solvers, penalty, single_target,
        n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    dtypes_mapping = {
        "float64": np.float64,
        "float32": np.float32,
    }

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(fit_single)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            dtype=dtype,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for dtype in dtypes_mapping.values())

    res = []
    idx = 0
    for dtype_name in dtypes_mapping.keys():
        for solver in solvers:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                dtype=dtype_name,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
for n_samples, color in zip(n_samples_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")

# Part 2: perform sparse random projection of some digits images which are
# quite low dimensional and dense or documents of the 20 newsgroups dataset
# which is both high dimensional and sparse

if '--twenty-newsgroups' in sys.argv:
    # Need an internet connection hence not enabled by default
    data = fetch_20newsgroups_vectorized().data[:500]
else:
    data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
            self.client = DistributedRPC.Client(
                TBinaryProtocol.TBinaryProtocol(self.conn))
            self.conn.open()
        except Thrift.TException, exc:
            print exc

    def close(self):
        self.conn.close()

    def execute(self, func, args):
        try:
            return self.client.execute(func, args)
        except Thrift.TException, exc:
            print exc.message()
        except DRPCExecutionException, exc:
            print exc


if __name__ == '__main__':
    '''Send a batch of test documents to the server for prediction.'''
    client = Client()
    test = datasets.fetch_20newsgroups_vectorized(subset='test')
    data_size = 40
    input_json = json.dumps(test.data[:data_size].toarray().tolist())
    print 'data prepared'
    result = client.execute('svm', input_json)
    print 'data predicted'
    result = array(json.loads(result))
    print metrics.classification_report(test.target[:data_size], result)
    client.close()
###############################################################################
# Routines


# initialize random generator
np.random.seed(0)


def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    return data
import time

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from GPSVI.core.GPClassifier import GPClassifier

np.random.seed(0)

data = datasets.fetch_20newsgroups_vectorized()
xTr, xTe, yTr, yTe = train_test_split(data.data, data.target, test_size=0.80)

svd = TruncatedSVD(algorithm='randomized', n_components=3, tol=0.0)
svd.fit(xTr)
x = svd.transform(xTr)

fig = plt.figure('Show data')
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=yTr, cmap=matplotlib.cm.rainbow)

t0 = time.time()
clf_lr = LogisticRegression(C=2.0)
clf_lr.fit(xTr, yTr)
lr_score = clf_lr.score(xTe, yTe)
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
print(__doc__)

# Author: Arthur Mensch

t0 = time.clock()

# We use SAGA solver
solver = 'saga'

# Turn down for faster run time
n_samples = 10000

# Memoized fetch_20newsgroups_vectorized for faster access
dataset = fetch_20newsgroups_vectorized('all')

X = dataset.data
y = dataset.target
X = X[:n_samples]
y = y[:n_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=y,
                                                    test_size=0.1)
train_samples, n_features = X_train.shape
n_classes = np.unique(y).shape[0]

print('Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i'
      % (train_samples, n_features, n_classes))
def plot_sparse_logistic_regression_20newsgroups():
    warnings.filterwarnings("ignore", category=ConvergenceWarning,
                            module="sklearn")
    t0 = timeit.default_timer()

    # We use SAGA solver
    solver = 'saga'

    # Turn down for faster run time
    n_samples = 10000

    X, y = fetch_20newsgroups_vectorized(subset='all', return_X_y=True)
    X = X[:n_samples]
    y = y[:n_samples]

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        stratify=y,
                                                        test_size=0.1)
    train_samples, n_features = X_train.shape
    n_classes = np.unique(y).shape[0]

    print(
        'Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i'
        % (train_samples, n_features, n_classes))

    models = {
        'ovr': {
            'name': 'One versus Rest',
            'iters': [1, 2, 4]
        },
        'multinomial': {
            'name': 'Multinomial',
            'iters': [1, 3, 7]
        }
    }

    for model in models:
        # Add initial chance-level values for plotting purpose
        accuracies = [1 / n_classes]
        times = [0]
        densities = [1]

        model_params = models[model]

        # Small number of epochs for fast runtime
        for this_max_iter in model_params['iters']:
            print('[model=%s, solver=%s] Number of epochs: %s' %
                  (model_params['name'], solver, this_max_iter))
            lr = LogisticRegression(
                solver=solver,
                multi_class=model,
                penalty='l1',
                max_iter=this_max_iter,
                random_state=42,
            )
            t1 = timeit.default_timer()
            lr.fit(X_train, y_train)
            train_time = timeit.default_timer() - t1

            y_pred = lr.predict(X_test)
            accuracy = np.sum(y_pred == y_test) / y_test.shape[0]
            density = np.mean(lr.coef_ != 0, axis=1) * 100
            accuracies.append(accuracy)
            densities.append(density)
            times.append(train_time)
        models[model]['times'] = times
        models[model]['densities'] = densities
        models[model]['accuracies'] = accuracies
        print('Test accuracy for model %s: %.4f' % (model, accuracies[-1]))
        print('%% non-zero coefficients for model %s, '
              'per class:\n %s' % (model, densities[-1]))
        print('Run time (%i epochs) for model %s:'
              '%.2f' % (model_params['iters'][-1], model, times[-1]))

    fig = plt.figure()
    ax = fig.add_subplot(111)

    for model in models:
        name = models[model]['name']
        times = models[model]['times']
        accuracies = models[model]['accuracies']
        ax.plot(times, accuracies, marker='o', label='Model: %s' % name)
        ax.set_xlabel('Train time (s)')
        ax.set_ylabel('Test accuracy')
        ax.legend()

    fig.suptitle('Multinomial vs One-vs-Rest Logistic L1\n'
                 'Dataset %s' % '20newsgroups')
    fig.tight_layout()
    fig.subplots_adjust(top=0.85)
    run_time = timeit.default_timer() - t0
    print('Example run in %.3f s' % run_time)
    plt.show()
print(__doc__)

import numpy as np
from scipy.linalg import svd

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets.samples_generator import make_classification
from sklearn.feature_selection import SelectKBest, chi2

from lightning.classification import FistaClassifier


def rank(M, eps=1e-9):
    U, s, V = svd(M, full_matrices=False)
    return np.sum(s > eps)


bunch = fetch_20newsgroups_vectorized(subset="train")
X_train = bunch.data
y_train = bunch.target

# Reduces dimensionality to make the example faster
ch2 = SelectKBest(chi2, k=5000)
X_train = ch2.fit_transform(X_train, y_train)

bunch = fetch_20newsgroups_vectorized(subset="test")
X_test = bunch.data
y_test = bunch.target
X_test = ch2.transform(X_test)

clf = FistaClassifier(C=1.0 / X_train.shape[0], max_iter=200, penalty="trace",
""" import numpy as np from sklearn.datasets import fetch_20newsgroups_vectorized try: from sklearn.model_selection import train_test_split except ImportError: from sklearn.cross_validation import train_test_split from lightning.classification import CDClassifier from lightning.classification import LinearSVC from lightning.classification import SGDClassifier # Load News20 dataset from scikit-learn. bunch = fetch_20newsgroups_vectorized(subset="all") X = bunch.data y = bunch.target # Select a subset of the classes for faster training. ind = np.arange(X.shape[0]) subset = y < 5 X = X[ind[subset]] y = y[subset] # Train / test split. X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=0)
#!/usr/bin/env python
# -*- encoding: utf8 -*-
'''Train an SVM model using 20newsgroup data.
'''
from sklearn import datasets, svm
import cPickle as pkl

__author__ = 'noahsark'

train = datasets.fetch_20newsgroups_vectorized(subset='train')

clf = svm.LinearSVC()
clf.fit(train.data, train.target)

with open('storm-starter/multilang/resources/svm_model.pkl', 'wb') as fp_:
    pkl.dump(clf, fp_)
analyze = tv.build_analyzer()
tv.get_feature_names()  # statistical features/terms


# (precision * recall) / (precision + recall)
def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall)
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred))


# Alternatively, use scikit-learn's pre-packaged feature extraction helper,
# fetch_20newsgroups_vectorized
print '*************************\nfetch_20newsgroups_vectorized\n*************************'
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
tfidf_test_3 = fetch_20newsgroups_vectorized(subset='test')
print "the shape of train is " + repr(tfidf_train_3.data.shape)
print "the shape of test is " + repr(tfidf_test_3.data.shape)

# Classification
######################################################
# Multinomial Naive Bayes Classifier
print '*************************\nNaive Bayes\n*************************'
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
fea_test = vectorizer.fit_transform(newsgroups_test.data)
# create the Multinomial Naive Bayesian Classifier
clf = MultinomialNB(alpha=0.01)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

# import data
newsgroups_train = fetch_20newsgroups_vectorized(subset='train')
data_train = newsgroups_train.data
target_train = newsgroups_train.target
newsgroups_test = fetch_20newsgroups_vectorized(subset='test')
data_test = newsgroups_test.data
target_test = newsgroups_test.target
n_samples, n_features = data_train.shape
n_classes = np.unique(target_train).shape[0]

# classify and calculate scores
classifier = OneVsRestClassifier(MultinomialNB())
y_score = classifier.fit(data_train, target_train).predict_proba(data_test)

# binarize the target classes of test set
y_true = label_binarize(target_test, range(n_classes))

# for each class, compute ROC curve and AUC
fpr, tpr = dict(), dict()
import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized

train_data = fetch_20newsgroups_vectorized(subset='train', remove=(),
                                           data_home=None,
                                           download_if_missing=False)
test_data = fetch_20newsgroups_vectorized(subset='test', remove=(),
                                          data_home=None,
                                          download_if_missing=False)

train_x = train_data.data.toarray()
train_x = np.hstack((np.ones((train_x.shape[0], 1)), train_x))
train_y = train_data.target
test_x = test_data.data.toarray()
test_x = np.hstack((np.ones((test_x.shape[0], 1)), test_x))
test_y = test_data.target
class_names = train_data.target_names

weights = np.zeros((20, test_x.shape[1]))


def update_perceptron(i, alpha):
    labels = np.where(train_y == i, 1, -1)
    for indx in range(train_x.shape[0]):
        y_hat = weights[i] @ train_x[indx]
        if np.sign(labels[indx]) != np.sign(y_hat):
            weights[i] += alpha * labels[indx] * train_x[indx]


def accuracy_report():
    report_accuracies = [[0, 0] for x in range(20)]
    scores = np.zeros(20)
    total = 0
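# A hedged sketch (not part of the original file) of how the helpers above
# could be driven: run one pass of the perceptron update for every class, then
# score the learned weight matrix on the test split with plain numpy. The
# learning rate value is illustrative only.
learning_rate = 1.0
for class_idx in range(20):
    update_perceptron(class_idx, learning_rate)

# predict by taking the highest-scoring one-vs-rest response per test document
predictions = np.argmax(test_x @ weights.T, axis=1)
print("test accuracy:", np.mean(predictions == test_y))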
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
plt.show()

# Part 2: perform sparse random projection of some digits images which are
# quite low dimensional and dense or documents of the 20 newsgroups dataset
# which is both high dimensional and sparse

if '--twenty-newsgroups' in sys.argv:
    # Need an internet connection hence not enabled by default
    data = fetch_20newsgroups_vectorized().data[:500]
else:
    data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
"naive_bayes": MultinomialNB(), "adaboost": AdaBoostClassifier(n_estimators=10), } ############################################################################### # Data if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-e', '--estimators', nargs="+", required=True, choices=ESTIMATORS) args = vars(parser.parse_args()) data_train = fetch_20newsgroups_vectorized(subset="train") data_test = fetch_20newsgroups_vectorized(subset="test") X_train = check_array(data_train.data, dtype=np.float32, accept_sparse="csc") X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") y_train = data_train.target y_test = data_test.target print("20 newsgroups") print("=============") print("X_train.shape = {0}".format(X_train.shape)) print("X_train.format = {0}".format(X_train.format)) print("X_train.dtype = {0}".format(X_train.dtype)) print("X_train density = {0}" "".format(X_train.nnz / np.product(X_train.shape))) print("y_train {0}".format(y_train.shape))
        clf._finalize_coef()
        y_pred = clf.decision_function(self.X).ravel()
        loss = (np.maximum(1 - self.y * y_pred, 0) ** 2).mean()
        coef = clf.coef_.ravel()
        regul = 0.5 * clf.alpha * np.dot(coef, coef)
        self.obj.append(loss + regul)
        self.test_time += time.clock() - test_time
        self.times.append(time.clock() - self.start_time - self.test_time)


try:
    dataset = sys.argv[1]
except:
    dataset = "synthetic"

if dataset == "news20":
    bunch = fetch_20newsgroups_vectorized(subset="all")
    X = bunch.data
    y = bunch.target
    y[y >= 1] = 1

    etas = (0.5, 1e-1, 1e-2)
    n_inners = (1.0, 2.0, 3.0)
else:
    X, y = make_classification(n_samples=10000,
                               n_features=100,
                               n_classes=2,
                               random_state=0)
    etas = (1e-3, 1e-4, 1e-5)
    n_inners = (0.25, 0.5, 1.0, 1.5)

y = y * 2 - 1
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized,
                         subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)