Example #1
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all', download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert sp.isspmatrix_csr(bunch.data)
    assert bunch.data.shape == (11314, 130107)
    assert bunch.target.shape[0] == 11314
    assert bunch.data.dtype == np.float64

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert sp.isspmatrix_csr(bunch.data)
    assert bunch.data.shape == (7532, 130107)
    assert bunch.target.shape[0] == 7532
    assert bunch.data.dtype == np.float64

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert sp.isspmatrix_csr(bunch.data)
    assert bunch.data.shape == (11314 + 7532, 130107)
    assert bunch.target.shape[0] == 11314 + 7532
    assert bunch.data.dtype == np.float64
Example #2
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all', download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example #3
def load_20newsgroup_vectorized(folder=SCIKIT_LEARN_DATA, one_hot=True, partitions_proportions=None,
                                shuffle=False, binary_problem=False, as_tensor=True, minus_value=-1.):
    data_train = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='train')
    data_test = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='test')

    X_train = data_train.data
    X_test = data_test.data
    y_train = data_train.target
    y_test = data_test.target
    if binary_problem:
        y_train[data_train.target < 10] = minus_value
        y_train[data_train.target >= 10] = 1.
        y_test[data_test.target < 10] = minus_value
        y_test[data_test.target >= 10] = 1.
    if one_hot:
        y_train = to_one_hot_enc(y_train)
        y_test = to_one_hot_enc(y_test)

    # if shuffle and sk_shuffle:
    #     xtr = X_train.tocoo()
    #     xts = X_test.tocoo()

    d_train = Dataset(data=X_train,
                      target=y_train, info={'target names': data_train.target_names})
    d_test = Dataset(data=X_test,
                     target=y_test, info={'target names': data_train.target_names})
    res = [d_train, d_test]
    if partitions_proportions:
        res = redivide_data([d_train, d_test], partition_proportions=partitions_proportions, shuffle=False)

    if as_tensor: [dat.convert_to_tensor() for dat in res]

    return Datasets.from_list(res)
Example #4
def load_20_newsgroups_data():
    """
    Load the 20 newsgroups dataset.
    """
    newsgroups_train = fetch_20newsgroups_vectorized(subset="train")
    newsgroups_test = fetch_20newsgroups_vectorized(subset="test")

    training_data = newsgroups_train.data
    training_labels = newsgroups_train.target
    test_data = newsgroups_test.data
    test_labels = newsgroups_test.target

    return training_data, training_labels, test_data, test_labels
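A minimal companion sketch (assuming a scikit-learn release that supports return_X_y for this fetcher, as several later examples use): the same four arrays can be obtained without going through the Bunch attributes.

# Equivalent to load_20_newsgroups_data() above, using return_X_y.
from sklearn.datasets import fetch_20newsgroups_vectorized

training_data, training_labels = fetch_20newsgroups_vectorized(subset="train", return_X_y=True)
test_data, test_labels = fetch_20newsgroups_vectorized(subset="test", return_X_y=True)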
Example #5
def test_20news_normalization():
    try:
        X = datasets.fetch_20newsgroups_vectorized(normalize=False,
                                                   download_if_missing=False)
        X_ = datasets.fetch_20newsgroups_vectorized(normalize=True,
                                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    X_norm = X_['data'][:100]
    X = X['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X))
    assert np.allclose(np.linalg.norm(X_norm.todense(), axis=1), 1)
Example #6
def load_news():
    # Load the 20 newsgroups dataset
    data = fetch_20newsgroups_vectorized(subset='train',
                                         remove=('headers', 'footers',
                                                 'quotes'))
    test = fetch_20newsgroups_vectorized(subset='test',
                                         remove=('headers', 'footers',
                                                 'quotes'))

    #select features
    ch2 = SelectKBest(chi2, k=500)
    X_train = ch2.fit_transform(data.data, data.target)
    X_test = ch2.transform(test.data)
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    Y_train = data['target']
    Y_test = test['target']

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every time
        is needed (the default behaviour if the data is not in a shared
        variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as index, and if they are
        # floats it doesn't make sense) therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset([X_test, Y_test])
    valid_set_x, valid_set_y = [test_set_x, test_set_y]
    train_set_x, train_set_y = shared_dataset([X_train, Y_train])

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
Example #7
def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target

    y[y < y.mean()] = -1
    y[y >= y.mean()] = 1
    Xt, Xh, yt, yh = cross_validation.train_test_split(
        X, y, test_size=.5, random_state=0)

    # compute the scores
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(
            solver='lbfgs', C=np.exp(-a), fit_intercept=False, tol=1e-6,
            max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)

    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(max_iter=50)
    clf.fit(Xt, yt, Xh, yh)
    np.testing.assert_array_less(np.abs(clf.alpha_ - best_alpha), 0.5)
Example #8
def load_data(subset, idx, n):
    X, y = fetch_20newsgroups_vectorized(subset=subset, return_X_y=True)

    start = X.shape[0] // n * idx
    end = X.shape[0] // n * (idx + 1)

    return X[start:end], y[start:end]
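A hypothetical usage sketch of the load_data helper defined above (load_data is from this example, not a scikit-learn API), pulling the third of four equal slices of the test subset.

# Assumes load_data from the example above is in scope.
X_part, y_part = load_data(subset="test", idx=2, n=4)
print(X_part.shape, y_part.shape)  # roughly a quarter of the 7532 test documents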
Example #9
def generate_data(case, sparse=False):
    # Generate regression / classification data.
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

    return data
Example #10
def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target

    y[y < y.mean()] = -1
    y[y >= y.mean()] = 1
    Xt, Xh, yt, yh = model_selection.train_test_split(X,
                                                      y,
                                                      test_size=.5,
                                                      random_state=0)

    # compute the scores
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(solver='lbfgs',
                                             C=np.exp(-a),
                                             fit_intercept=False,
                                             tol=1e-6,
                                             max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)

    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(max_iter=50)
    clf.fit(Xt, yt, Xh, yh)
    np.testing.assert_array_less(np.abs(clf.alpha_ - best_alpha), 0.5)
Example #11
def all_vector():
    print("all_vector load:")
    t0 = time()
    # raw_data = fetch_20newsgroups(subset='train').data
    # data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6

    tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
    tfidf_test_3 = fetch_20newsgroups_vectorized(subset='test')

    duration = time() - t0
    print("done in %fs" % duration)
    # print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))

    print("the shape of train is " + repr(tfidf_train_3.data.shape))
    print("the shape of test is " + repr(tfidf_test_3.data.shape))
    return tfidf_train_3.data, tfidf_test_3.data
Example #12
 def fetch_newsgroups(self, ncols=100):
     """
     Returns a 100 column sample of the newsgroups datasets.
     """
     news = fetch_20newsgroups_vectorized().data
     [nn, dd] = news.shape
     cols = np.random.choice(dd, size=ncols, replace=False)
     #rows = np.random.choice(nn,size=nsamples,replace=False)
     return news[:, cols].toarray()
Example #13
 def __init__(self):
     n_samples = 10000
     dataset = fetch_20newsgroups_vectorized('all')
     X = dataset.data
     y = dataset.target
     X = X[:n_samples]
     y = y[:n_samples]
     self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
         X, y, random_state=42, stratify=y, test_size=0.1)
Example #14
def test_20news_vectorized():
    # This test is slow.
    raise SkipTest

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_equal(bunch.data.shape, (11314, 107130))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_equal(bunch.data.shape, (7532, 107130))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_equal(bunch.data.shape, (11314 + 7532, 107130))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example #16
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(n_samples=500,
                                        n_features=10000,
                                        random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X
Example #17
def test_build():
    X, y = fetch_20newsgroups_vectorized(return_X_y=True)
    # Select only the first 500 samples
    X = X[:500]
    y = y[:500]
    # Precompute cosine distance matrix
    diss = cosine_distances(X)
    # run build
    ske = KMedoids(20, "precomputed", init="build", max_iter=0)
    ske.fit(diss)
    assert ske.inertia_ <= 230
    assert len(np.unique(ske.labels_)) == 20
Example #18
def test_20news_vectorized():
    # This test is slow.
    raise SkipTest("Test too slow.")

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 107428))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 107428))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 107428))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example #20
def load_data(name, partition_id, n_partitions):
    """load partition of data into global var `name`"""
    from sklearn.datasets import fetch_20newsgroups_vectorized
    from sklearn.utils import gen_even_slices
    dataset = fetch_20newsgroups_vectorized('test')
    size = dataset.data.shape[0]
    slices = list(gen_even_slices(size, n_partitions))
    
    part = dataset.data[slices[partition_id]]
    # put it in globals
    globals().update({name : part})
    return part.shape
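For reference, a small standalone sketch of how gen_even_slices (used above) splits a range into nearly equal contiguous slices; the output shown in the comment is what it is expected to yield.

from sklearn.utils import gen_even_slices

# Three slices covering 10 rows as evenly as possible.
print(list(gen_even_slices(10, 3)))
# expected: [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]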
Example #21
def get_mldata(dataset, save_dir):
    # Use scikit-learn to grab datasets and save them in save_dir.
    # save_dir = FLAGS.save_dir
    filename = os.path.join(save_dir, dataset[1] + '.pkl')

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not gfile.Exists(filename):
        if dataset[0][-3:] == 'csv':
            data = get_csv_data(dataset[0])
        elif dataset[0] == 'breast_cancer':
            data = load_breast_cancer()
        elif dataset[0] == 'iris':
            data = load_iris()
        elif dataset[0] == 'newsgroup':
            # Removing header information to make sure that no newsgroup identifying
            # information is included in data
            data = fetch_20newsgroups_vectorized(subset='all',
                                                 remove=('headers'))
            tfidf = TfidfTransformer(norm='l2')
            X = tfidf.fit_transform(data.data)
            data.data = X
        elif dataset[0] == 'rcv1':
            sklearn.datasets.rcv1.URL = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
            sklearn.datasets.rcv1.URL_topics = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
            data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
        elif dataset[0] == 'wikipedia_attack':
            data = get_wikipedia_talk_data()
        elif dataset[0] == 'cifar10':
            data = get_cifar10()
        elif 'keras' in dataset[0]:
            data = get_keras_data(dataset[0])
        else:
            try:
                data = fetch_mldata(dataset[0])
            except:
                print('error')
                raise Exception('ERROR: failed to fetch data from mldata.org')
        X = data.data
        y = data.target
        if X.shape[0] != y.shape[0]:
            X = np.transpose(X)
        assert X.shape[0] == y.shape[0]

        data = {'data': X, 'target': y}
        with open(filename, 'wb') as f:
            pickle.dump(data, f)
        print(len(data), filename)
Example #22
def nb_news2():
    news = fetch_20newsgroups_vectorized()
    # print(news.data)

    x_train, x_test, y_train, y_test = train_test_split(news.data,
                                                        news.target,
                                                        random_state=22)

    estimator = MultinomialNB()
    estimator.fit(x_train, y_train)

    score = estimator.score(x_test, y_test)
    print('score:', score)
Example #23
def get_mldata(dataset):
    # Use scikit-learn to grab datasets and save them in save_dir.
    save_dir = FLAGS.save_dir
    filename = os.path.join(save_dir, dataset[1] + '.pkl')

    if not gfile.Exists(save_dir):
        gfile.MkDir(save_dir)  # gfile.MkDir cannot create nested directories; os.makedirs() could be used instead
    if not gfile.Exists(filename):
        if dataset[0][-3:] == 'csv':
            data = get_csv_data(dataset[0])
        elif dataset[0] == 'breast_cancer':
            data = load_breast_cancer()
        elif dataset[0] == 'iris':
            data = load_iris()
        elif dataset[0] == 'newsgroup':
            # Removing header information to make sure that no newsgroup identifying
            # information is included in data
            data = fetch_20newsgroups_vectorized(subset='all',
                                                 remove=('headers'))
            tfidf = TfidfTransformer(norm='l2')
            X = tfidf.fit_transform(data.data)
            data.data = X
        elif dataset[0] == 'rcv1':
            sklearn.datasets.rcv1.URL = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
            sklearn.datasets.rcv1.URL_topics = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
            data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
        elif dataset[0] == 'wikipedia_attack':
            data = get_wikipedia_talk_data()
        elif dataset[0] == 'cifar10':
            data = get_cifar10()
        elif 'keras' in dataset[0]:
            data = get_keras_data(dataset[0])
        else:
            try:
                data = fetch_mldata(dataset[0])
            except:
                raise Exception('ERROR: failed to fetch data from mldata.org')
        X = data.data
        y = data.target
        if X.shape[0] != y.shape[0]:
            X = np.transpose(X)  # transpose() swaps the row and column axes, i.e. takes the matrix transpose
        assert X.shape[0] == y.shape[0]  # assert raises an exception when the condition is false

        data = {'data': X, 'target': y}
        pickle.dump(data, gfile.GFile(filename, 'w'))
Example #24
def create_plot_curve():
    clients = parallel.Client()
    lview = clients.load_balanced_view()
    dview = clients[:]
    dview['data']= fetch_20newsgroups_vectorized(remove=('headers', 'footers', 'quotes'))
    lview.block = True
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]

    with dview.sync_imports():
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score

    res = lview.map(grid_search, alphas)
    return res
Example #25
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity/10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
Example #26
def generate_data(case):
    """Generate regression/classification data."""
    if case == "regression":
        X, y = datasets.load_diabetes(return_X_y=True)
    elif case == "classification":
        X, y = datasets.fetch_20newsgroups_vectorized(subset="all",
                                                      return_X_y=True)
    X, y = shuffle(X, y)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    data = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }
    return data
Example #27
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == "regression":
        bunch = datasets.load_boston()
    elif case == "classification":
        bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
    return data
Example #28
def generate_data(case):
    """Generate regression/classification data."""
    if case == "regression":
        X, y = datasets.load_diabetes(return_X_y=True)
        train_size = 0.8
    elif case == "classification":
        X, y = datasets.fetch_20newsgroups_vectorized(subset="all",
                                                      return_X_y=True)
        train_size = 0.4  # to make the example run faster

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size,
                                                        random_state=0)

    data = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }
    return data
Example #29
def load(name):
    """
    Load a dataset from the lazily initialized dictionary by its known name.
    :param name: Name of the dataset
    :return: tuple(X, y)
    """
    databases = LazyDict({
        'breast_cancer': lambda: load_breast_cancer(return_X_y=True),
        'cov_type': lambda: itemgetter('data', 'target')(fetch_covtype()),
        'digits': lambda: load_digits(return_X_y=True),
        'iris': lambda: load_iris(return_X_y=True),
        'kddcup99': lambda: load_kddcup99(),
        'lfw': lambda: fetch_lfw_people(return_X_y=True),
        'mnist': lambda: openml.fetch_openml('mnist_784', version=1,
                                             return_X_y=True),
        'news_groups': lambda: itemgetter('data', 'target')(
            fetch_20newsgroups_vectorized(subset='all')),
        'olivetti_faces': lambda: itemgetter('data', 'target')(
            fetch_olivetti_faces()),
        'rcv1': lambda: fetch_rcv1(random_state=0, return_X_y=True),
        'wine': lambda: load_wine(return_X_y=True)
    })
    return databases.get(name)
Example #30
def get_results():
    # get data
    data = fetch_20newsgroups_vectorized(remove=('headers',
                                                 'footers',
                                                 'quotes'))
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]
    # set up dview for imports
    clients = parallel.Client()
    dview = clients[:]
    with dview.sync_imports():
        # doesn't seem to like import numpy as np, using numpy instead
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score
    dview.block = True
    # send data to clients
    dview['data'] = data
    # set up load balanced view for parallel processing
    lview = clients.load_balanced_view()
    # set blocking to True to get all results once processing is done
    lview.block = True
    results = lview.map(get_single_result, alphas)
    return results
Example #31
File: ml.py Project: arosh/ml
import numpy
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.grid_search import RandomizedSearchCV

def best_cv_num(n):
    return int(1+numpy.log2(n))

def best_n_iter(n):
    return numpy.ceil(10**6 / n)

if __name__ == '__main__':
    d = fetch_20newsgroups_vectorized(
            remove=('headers', 'footers', 'quotes'))
    X = d.data
    #X = StandardScaler(with_mean=False).fit_transform(X)
    #X = TruncatedSVD(n_components=400).fit_transform(X)
    y = d.target
    _n = X.shape[0]

    clf = MultinomialNB()
    params = {
            'alpha': 10**numpy.linspace(-7,0,1000)
    }

    # http://scikit-learn.org/stable/modules/sgd.html#tips-on-practical-use
    """
    clf = SGDClassifier(n_iter=best_n_iter(_n))
    params = {
Example #32
__author__ = 'dmt101'


import logging

from sklearn.datasets import fetch_20newsgroups,fetch_20newsgroups_vectorized
categories=None

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

data=fetch_20newsgroups_vectorized(subset='all')

Example #33
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.decomposition import PCA
import featurelearning

def evaluate(dataset_name, fl, ratio):
    print dataset_name, fl.__name__, ratio
    d = dataset.load_dataset(dataset_name)
    fea = d.data
    label = d.target
    fea = fl(fea)
    ss = StratifiedShuffleSplit(label, 3, test_size=(1-ratio), random_state=0)
    svc = LinearSVC()
    for train, test in ss:
        svc.fit(fea[train,:], label[train,:])
        predict = svc.predict(fea[test, :])
        acc = accuracy_score(label[test, :], predict)
        print acc

if __name__ == '__main__':
    pca = PCA()
    train = fetch_20newsgroups_vectorized('train')
    test = fetch_20newsgroups_vectorized('test')
    svc = LinearSVC()
    train_data = pca.fit_transform(train.data.toarray())
    svc.fit(train_data, train.target)
    test_data = pca.transform(test.data.toarray())
    predict = svc.predict(test_data)
    acc = accuracy_score(test.target, predict)
    print acc
    # evaluate('20newsgroups', featurelearning.TF_IDF, 0.1)
    # evaluate('20newsgroups', featurelearning.LDA, 0.1)
Example #34
import sparse_interaction
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

dat = fetch_20newsgroups_vectorized()
X = dat.data
Y = dat.target
cv = StratifiedKFold(Y)

X = X[:, :20000]

si = sparse_interaction.SparseInteractionFeatures()
X_i = si.transform(X)

scores, scores_i = [], []

clf = SGDClassifier(penalty='l1', n_iter=10)

for train, test in cv:
    clf.fit(X[train], Y[train])
    scores.append(f1_score(Y[test], clf.predict(X[test]), average='macro', pos_label=None))
    clf.fit(X_i[train], Y[train])
    scores_i.append(f1_score(Y[test], clf.predict(X_i[test]), average='macro', pos_label=None))
print sum(scores), sum(scores_i)
Example #35
#!/usr/bin/python3
# coding: utf-8
##################################################################
## No download needed; the dataset is small and bundled with scikit-learn
from sklearn import datasets
# datasets.load_iris()

##################################################################
## ~/scikit_learn_data
## No download prompt; the files are fetched automatically
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized
fetch_20newsgroups(subset='all')  # 15M; 20news-bydate_py3.pkz
fetch_20newsgroups_vectorized(subset='all')  # 6.3M; 20newsgroup_vectorized_py3.pkl
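A hedged variant of the calls above showing the cache-related keyword arguments used by other examples in this list (data_home and download_if_missing); the directory name is only an illustration.

from sklearn.datasets import fetch_20newsgroups_vectorized

# Cache under a custom directory instead of ~/scikit_learn_data, and raise
# an error instead of downloading if the files are not already present.
bunch = fetch_20newsgroups_vectorized(subset='all',
                                      data_home='/tmp/scikit_learn_data',  # hypothetical path
                                      download_if_missing=False)
print(bunch.data.shape)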
Example #36
    plt.savefig(data_set_title + str("_") + str(lambda_value) + ".jpg")


if __name__ == '__main__':
    '''
    This file can be imported safely into other files to use RandomSampler.
    The driver below only runs when you execute "python3 RandomSampling.py" directly.
    '''

    print("Start")
    training_size = 100  #100
    max_unlabeled_size = 400  #400
    # lambda_value = 10**(-4)#This needs to be tuned

    #Newsgroup Data
    train_dataset = fetch_20newsgroups_vectorized(subset='train')
    X_train_base = train_dataset.data
    y_train_base = train_dataset.target

    test_dataset = fetch_20newsgroups_vectorized(subset='test')
    X_test = test_dataset.data
    X_test = vstack([X_test, X_train_base[2000:, :]]).toarray()
    y_test = test_dataset.target
    y_test = np.append(y_test, y_train_base[2000:])

    X_train_base = X_train_base[:2000, :]
    y_train_base = y_train_base[:2000]

    X_train, y_train = X_train_base[:training_size], y_train_base[:training_size]
Example #37
}

###############################################################################
# Data

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--estimators',
                        nargs="+",
                        required=True,
                        choices=ESTIMATORS)
    args = vars(parser.parse_args())

    data_train = fetch_20newsgroups_vectorized(subset="train")
    data_test = fetch_20newsgroups_vectorized(subset="test")
    X_train = check_array(data_train.data,
                          dtype=np.float32,
                          accept_sparse="csc")
    X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr")
    y_train = data_train.target
    y_test = data_test.target

    print("20 newsgroups")
    print("=============")
    print("X_train.shape = {0}".format(X_train.shape))
    print("X_train.format = {0}".format(X_train.format))
    print("X_train.dtype = {0}".format(X_train.dtype))
    print("X_train density = {0}"
          "".format(X_train.nnz / np.product(X_train.shape)))
Example #38
def exp(solvers, penalty, single_target,
        n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    dtypes_mapping = {
        "float64": np.float64,
        "float32": np.float32,
    }

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(fit_single)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            dtype=dtype,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for dtype in dtypes_mapping.values())

    res = []
    idx = 0
    for dtype_name in dtypes_mapping.keys():
        for solver in solvers:
            if not (skip_slow and
                    solver == 'lightning' and
                    penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                dtype=dtype_name,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
Example #39
for n_samples, color in zip(n_samples_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")

# Part 2: perform sparse random projection of some digits images which are
# quite low dimensional and dense or documents of the 20 newsgroups dataset
# which is both high dimensional and sparse

if '--twenty-newsgroups' in sys.argv:
    # Need an internet connection hence not enabled by default
    data = fetch_20newsgroups_vectorized().data[:500]
else:
    data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
Example #40
            self.client = DistributedRPC.Client(
                TBinaryProtocol.TBinaryProtocol(self.conn))
            self.conn.open()
        except Thrift.TException, exc:
            print exc

    def close(self):
        self.conn.close()

    def execute(self, func, args):
        try:
            return self.client.execute(func, args)
        except Thrift.TException, exc:
            print exc.message()
        except DRPCExecutionException, exc:
            print exc


if __name__ == '__main__':
    '''Send a batch of test documents to the server for prediction.'''
    client = Client()
    test = datasets.fetch_20newsgroups_vectorized(subset='test')
    data_size = 40
    input_json = json.dumps(test.data[:data_size].toarray().tolist())
    print 'data prepared'
    result = client.execute('svm', input_json)
    print 'data predicted'
    result = array(json.loads(result))
    print metrics.classification_report(test.target[:data_size], result)
    client.close()
Example #41
###############################################################################
# Routines


# initialize random generator
np.random.seed(0)


def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train,
            'y_test': y_test}
    return data
Example #42
import time
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from GPSVI.core.GPClassifier import GPClassifier

np.random.seed(0)

data = datasets.fetch_20newsgroups_vectorized()

xTr, xTe, yTr, yTe = train_test_split(data.data, data.target, test_size=0.80)

svd = TruncatedSVD(algorithm='randomized', n_components=3, tol=0.0)
svd.fit(xTr)
x = svd.transform(xTr)
fig = plt.figure('Show data')
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:,0], x[:,1], x[:,2], c=yTr, cmap=matplotlib.cm.rainbow)


t0 = time.time()
clf_lr = LogisticRegression(C=2.0)
clf_lr.fit(xTr, yTr)
lr_score = clf_lr.score(xTe, yTe)
Example #43
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
Example #44
# nodebox section end


print(__doc__)
# Author: Arthur Mensch

t0 = time.clock()

# We use SAGA solver
solver = 'saga'

# Turn down for faster run time
n_samples = 10000

# fetch_20newsgroups_vectorized caches the data locally for faster repeated access
dataset = fetch_20newsgroups_vectorized('all')
X = dataset.data
y = dataset.target
X = X[:n_samples]
y = y[:n_samples]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=y,
                                                    test_size=0.1)
train_samples, n_features = X_train.shape
n_classes = np.unique(y).shape[0]

print('Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i'
      % (train_samples, n_features, n_classes))
Example #45
def plot_sparse_logistic_regression_20newsgroups():
    warnings.filterwarnings("ignore",
                            category=ConvergenceWarning,
                            module="sklearn")
    t0 = timeit.default_timer()

    # We use SAGA solver
    solver = 'saga'

    # Turn down for faster run time
    n_samples = 10000

    X, y = fetch_20newsgroups_vectorized(subset='all', return_X_y=True)
    X = X[:n_samples]
    y = y[:n_samples]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y,
                                                        test_size=0.1)
    train_samples, n_features = X_train.shape
    n_classes = np.unique(y).shape[0]

    print(
        'Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i' %
        (train_samples, n_features, n_classes))

    models = {
        'ovr': {
            'name': 'One versus Rest',
            'iters': [1, 2, 4]
        },
        'multinomial': {
            'name': 'Multinomial',
            'iters': [1, 3, 7]
        }
    }

    for model in models:
        # Add initial chance-level values for plotting purpose
        accuracies = [1 / n_classes]
        times = [0]
        densities = [1]

        model_params = models[model]

        # Small number of epochs for fast runtime
        for this_max_iter in model_params['iters']:
            print('[model=%s, solver=%s] Number of epochs: %s' %
                  (model_params['name'], solver, this_max_iter))
            lr = LogisticRegression(
                solver=solver,
                multi_class=model,
                penalty='l1',
                max_iter=this_max_iter,
                random_state=42,
            )
            t1 = timeit.default_timer()
            lr.fit(X_train, y_train)
            train_time = timeit.default_timer() - t1

            y_pred = lr.predict(X_test)
            accuracy = np.sum(y_pred == y_test) / y_test.shape[0]
            density = np.mean(lr.coef_ != 0, axis=1) * 100
            accuracies.append(accuracy)
            densities.append(density)
            times.append(train_time)
        models[model]['times'] = times
        models[model]['densities'] = densities
        models[model]['accuracies'] = accuracies
        print('Test accuracy for model %s: %.4f' % (model, accuracies[-1]))
        print('%% non-zero coefficients for model %s, '
              'per class:\n %s' % (model, densities[-1]))
        print('Run time (%i epochs) for model %s:'
              '%.2f' % (model_params['iters'][-1], model, times[-1]))

    fig = plt.figure()
    ax = fig.add_subplot(111)

    for model in models:
        name = models[model]['name']
        times = models[model]['times']
        accuracies = models[model]['accuracies']
        ax.plot(times, accuracies, marker='o', label='Model: %s' % name)
        ax.set_xlabel('Train time (s)')
        ax.set_ylabel('Test accuracy')
    ax.legend()
    fig.suptitle('Multinomial vs One-vs-Rest Logistic L1\n'
                 'Dataset %s' % '20newsgroups')
    fig.tight_layout()
    fig.subplots_adjust(top=0.85)
    run_time = timeit.default_timer() - t0
    print('Example run in %.3f s' % run_time)
    plt.show()
Example #46
print(__doc__)
import numpy as np
from scipy.linalg import svd

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets.samples_generator import make_classification
from sklearn.feature_selection import SelectKBest, chi2

from lightning.classification import FistaClassifier

def rank(M, eps=1e-9):
    U, s, V = svd(M, full_matrices=False)
    return np.sum(s > eps)


bunch = fetch_20newsgroups_vectorized(subset="train")
X_train = bunch.data
y_train = bunch.target

# Reduces dimensionality to make the example faster
ch2 = SelectKBest(chi2, k=5000)
X_train = ch2.fit_transform(X_train, y_train)

bunch = fetch_20newsgroups_vectorized(subset="test")
X_test = bunch.data
y_test = bunch.target
X_test = ch2.transform(X_test)

clf = FistaClassifier(C=1.0 / X_train.shape[0],
                      max_iter=200,
                      penalty="trace",
Example #47
"""
import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from lightning.classification import CDClassifier
from lightning.classification import LinearSVC
from lightning.classification import SGDClassifier

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Select a subset of the classes for faster training.
ind = np.arange(X.shape[0])
subset = y < 5
X = X[ind[subset]]
y = y[subset]

# Train / test split.
X_tr, X_te, y_tr, y_te = train_test_split(X,
                                          y,
                                          train_size=0.75,
                                          test_size=0.25,
                                          random_state=0)
Example #48
#!/usr/bin/env python
# -*- encoding: utf8 -*-
'''Train an SVM model using the 20 newsgroups data.
'''

from sklearn import datasets, svm
import cPickle as pkl


__author__ = 'noahsark'


train = datasets.fetch_20newsgroups_vectorized(subset='train')
clf = svm.LinearSVC()
clf.fit(train.data, train.target)
with open('storm-starter/multilang/resources/svm_model.pkl', 'wb') as fp_:
    pkl.dump(clf, fp_)
Example #49
analyze = tv.build_analyzer()
tv.get_feature_names()#statistical features/terms

# (precision * recall) / (precision + recall)
def calculate_result(actual,pred):
    m_precision = metrics.precision_score(actual,pred);
    m_recall = metrics.recall_score(actual,pred);
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall);
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual,pred));

# Alternatively, use sklearn's built-in feature extraction helper, fetch_20newsgroups_vectorized
print '*************************\nfetch_20newsgroups_vectorized\n*************************'
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset = 'train');
tfidf_test_3 = fetch_20newsgroups_vectorized(subset = 'test');
print "the shape of train is "+repr(tfidf_train_3.data.shape)
print "the shape of test is "+repr(tfidf_test_3.data.shape)

# Classification
######################################################
#Multinomial Naive Bayes Classifier
print '*************************\nNaive Bayes\n*************************'
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
newsgroups_test = fetch_20newsgroups(subset = 'test',
                                     categories = categories);
fea_test = vectorizer.fit_transform(newsgroups_test.data);
#create the Multinomial Naive Bayesian Classifier
clf = MultinomialNB(alpha = 0.01)
Example #50
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

# import data
newsgroups_train = fetch_20newsgroups_vectorized(subset='train')
data_train = newsgroups_train.data
target_train = newsgroups_train.target

newsgroups_test = fetch_20newsgroups_vectorized(subset='test')
data_test = newsgroups_test.data
target_test = newsgroups_test.target

n_samples, n_features = data_train.shape
n_classes = np.unique(target_train).shape[0]

# classify and calculate scores
classifier = OneVsRestClassifier(MultinomialNB())
y_score = classifier.fit(data_train, target_train).predict_proba(data_test)

# binarize the target classes of test set
y_true = label_binarize(target_test, range(n_classes))

# for each class, compute ROC curve and AUC
fpr, tpr = dict(), dict()
Example #51
import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized

train_data = fetch_20newsgroups_vectorized(subset='train', remove=(), data_home=None, download_if_missing=False)
test_data = fetch_20newsgroups_vectorized(subset='test', remove=(), data_home=None, download_if_missing=False)

train_x = train_data.data.toarray()
train_x = np.hstack((np.ones((train_x.shape[0],1)),train_x))
train_y = train_data.target

test_x = test_data.data.toarray()
test_x = np.hstack((np.ones((test_x.shape[0],1)),test_x))
test_y = test_data.target

class_names = train_data.target_names

weights = np.zeros((20,test_x.shape[1]))


def update_perceptron(i, alpha):
    labels = np.where(train_y == i, 1, -1)
    for indx in range(train_x.shape[0]):
        y_hat = weights[i] @ train_x[indx]
        if np.sign(labels[indx]) != np.sign(y_hat):
            weights[i] += alpha * labels[indx] * train_x[indx]


def accuracy_report():
    report_accuracies = [[0,0] for x in range(20)]
    scores = np.zeros(20)
    total = 0
Example #52
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
plt.show()

# Part 2: perform sparse random projection of some digits images which are
# quite low dimensional and dense or documents of the 20 newsgroups dataset
# which is both high dimensional and sparse

if '--twenty-newsgroups' in sys.argv:
    # Need an internet connection hence not enabled by default
    data = fetch_20newsgroups_vectorized().data[:500]
else:
    data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections" %
      (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
Example #53
    "naive_bayes": MultinomialNB(),
    "adaboost": AdaBoostClassifier(n_estimators=10),
}


###############################################################################
# Data

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--estimators', nargs="+", required=True,
                        choices=ESTIMATORS)
    args = vars(parser.parse_args())

    data_train = fetch_20newsgroups_vectorized(subset="train")
    data_test = fetch_20newsgroups_vectorized(subset="test")
    X_train = check_array(data_train.data, dtype=np.float32,
                          accept_sparse="csc")
    X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr")
    y_train = data_train.target
    y_test = data_test.target

    print("20 newsgroups")
    print("=============")
    print("X_train.shape = {0}".format(X_train.shape))
    print("X_train.format = {0}".format(X_train.format))
    print("X_train.dtype = {0}".format(X_train.dtype))
    print("X_train density = {0}"
          "".format(X_train.nnz / np.product(X_train.shape)))
    print("y_train {0}".format(y_train.shape))
Example #54
        clf._finalize_coef()
        y_pred = clf.decision_function(self.X).ravel()
        loss = (np.maximum(1 - self.y * y_pred, 0) ** 2).mean()
        coef = clf.coef_.ravel()
        regul = 0.5 * clf.alpha * np.dot(coef, coef)
        self.obj.append(loss + regul)
        self.test_time += time.clock() - test_time
        self.times.append(time.clock() -  self.start_time - self.test_time)

try:
    dataset = sys.argv[1]
except:
    dataset = "synthetic"

if dataset == "news20":
    bunch = fetch_20newsgroups_vectorized(subset="all")
    X = bunch.data
    y = bunch.target
    y[y >= 1] = 1

    etas = (0.5, 1e-1, 1e-2)
    n_inners = (1.0, 2.0, 3.0)
else:
    X, y = make_classification(n_samples=10000,
                               n_features=100,
                               n_classes=2,
                               random_state=0)
    etas = (1e-3, 1e-4, 1e-5)
    n_inners = (0.25, 0.5, 1.0, 1.5)

y = y * 2 - 1
Example #55
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)