Exemplo n.º 1
0
def test_lda_transform_before_fit():
    """
    Call `transform` on an `OnlineLDA` instance that was never fitted.

    NOTE(review): the body asserts nothing itself; presumably the expected
    failure is checked outside this function (e.g. by a decorator) -- confirm.
    """
    # Seeded generator: a reproducible 20 x 10 matrix of integers in [0, 4).
    counts = np.random.RandomState(0).randint(4, size=(20, 10))
    unfitted = OnlineLDA()
    unfitted.transform(counts)
Exemplo n.º 2
0
def test_lda_transform_before_fit():
    """
    Exercise `OnlineLDA.transform` without any preceding `fit` call.

    NOTE(review): no assertion is made here; presumably an expected-exception
    check lives outside this function -- confirm.
    """
    seeded = np.random.RandomState(0)
    shape = (20, 10)
    # Integers in [0, 4), reproducible thanks to the fixed seed.
    doc_term = seeded.randint(4, size=shape)
    OnlineLDA().transform(doc_term)
Exemplo n.º 3
0
def lda_online_example():
    """
    Example for LDA online update

    Feeds the 20 newsgroups documents to `OnlineLDA.partial_fit` in chunks of
    `chunk_size` and, after every chunk, prints the `n_top_words`
    highest-weighted words of each topic. The vocabulary is learned from the
    first chunk only; later chunks are vectorized with that same vocabulary.
    """
    def chunks(seq, size):
        # Yield consecutive slices of `seq` holding at most `size` items.
        # `range` (not the Python 2-only `xrange`) keeps the example runnable
        # on Python 3 as well.
        for start in range(0, len(seq), size):
            yield seq[start:start + size]

    # In default, we set topic number to 10, and both hyperparameter
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(max_df=0.8,
                                 max_features=n_features,
                                 min_df=3,
                                 stop_words='english')

    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha,
                    eta=eta,
                    kappa=0.7,
                    tau=512.,
                    n_jobs=-1,
                    n_docs=1e4,
                    random_state=0,
                    verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            # The first chunk fixes the vocabulary (and thus the column
            # layout) used for every later chunk.
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model
        print("\nFitting LDA models with online udpate on chunk %d..." %
              chunk_no)
        lda.partial_fit(doc_mtx)
        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            # Walk the argsort result backwards to list the `n_top_words`
            # largest weights first.
            print(" ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]))
Exemplo n.º 4
0
def test_lda_normalize_docs():
    """
    Check that every row returned by `fit_transform` sums to 1.
    """
    seeded = np.random.RandomState(0)
    n_topics, alpha, eta, doc_term = _build_sparse_mtx()
    model = OnlineLDA(random_state=seeded,
                      eta=eta,
                      alpha=alpha,
                      n_topics=n_topics)
    doc_topic = model.fit_transform(doc_term)

    # One total per input row, compared against a vector of ones.
    row_totals = doc_topic.sum(axis=1)
    expected = np.ones(doc_term.shape[0])
    assert_array_almost_equal(row_totals, expected)
Exemplo n.º 5
0
def test_lda_normalize_docs():
    """
    The per-document output of `fit_transform` must sum to 1 along axis 1.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    params = dict(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    doc_topic = OnlineLDA(**params).fit_transform(X)

    n_rows = X.shape[0]
    assert_array_almost_equal(doc_topic.sum(axis=1), np.ones(n_rows))
Exemplo n.º 6
0
def test_lda_fit_transform():
    """
    Compare `fit_transform` with a subsequent `transform` on the same data.

    Both results must agree to 4 decimal places.
    """
    seeded = np.random.RandomState(0)
    n_topics, alpha, eta, doc_term = _build_sparse_mtx()
    model = OnlineLDA(random_state=seeded,
                      eta=eta,
                      alpha=alpha,
                      n_topics=n_topics)

    from_fit = model.fit_transform(doc_term)
    from_transform = model.transform(doc_term)
    assert_array_almost_equal(from_fit, from_transform, 4)
Exemplo n.º 7
0
def test_lda_fit_transform():
    """
    `transform` after fitting must reproduce the `fit_transform` output
    (checked to 4 decimal places).
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    settings = dict(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    estimator = OnlineLDA(**settings)

    fitted_output = estimator.fit_transform(X)
    assert_array_almost_equal(fitted_output, estimator.transform(X), 4)
Exemplo n.º 8
0
def test_lda_batch():
    """
    Test LDA batch training (`fit` method).

    After fitting, the three highest-weighted indices of every row of
    `components_` must form one of the expected index groups.
    """
    seeded = np.random.RandomState(0)
    n_topics, alpha, eta, doc_term = _build_sparse_mtx()
    model = OnlineLDA(random_state=seeded,
                      eta=eta,
                      alpha=alpha,
                      n_topics=n_topics)
    model.fit(doc_term)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in model.components_:
        # The last three argsort entries are the largest weights; their
        # order is irrelevant because the key is sorted.
        strongest = set(component.argsort()[-3:])
        key = tuple(sorted(strongest))
        assert_true(key in expected_groups)
Exemplo n.º 9
0
def test_lda_batch():
    """
    Fit with `fit` and verify the top-3 indices of each topic.

    Each row of `components_` must have its three largest entries at one of
    the index triples (0, 1, 2), (3, 4, 5) or (6, 7, 8).
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    settings = dict(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    estimator = OnlineLDA(**settings)
    estimator.fit(X)

    valid_triples = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for weights in estimator.components_:
        order = weights.argsort()
        top_three = tuple(sorted(set(order[-3:])))
        assert_true(top_three in valid_triples)
Exemplo n.º 10
0
def test_lda_transform_mismatch():
    """
    Fit on a 10-column matrix, then call `transform` with an 8-column one.

    NOTE(review): no assertion is made here; presumably the expected failure
    is checked outside this function (e.g. by a decorator) -- confirm.
    """
    seeded = np.random.RandomState(0)
    # Draw order matters for reproducibility: training matrix, then the
    # narrower matrix, then the topic count.
    train = seeded.randint(4, size=(20, 10))
    narrower = seeded.randint(4, size=(10, 8))
    n_topics = seeded.randint(3, 6)

    prior = 1.0 / n_topics
    model = OnlineLDA(n_topics=n_topics,
                      alpha=prior,
                      eta=prior,
                      random_state=seeded)
    model.partial_fit(train)
    model.transform(narrower)
Exemplo n.º 11
0
def test_lda_dense_input():
    """
    Run `fit_transform` on a dense NumPy array of counts.

    Passes when at least one entry of the result is strictly positive.
    """
    seeded = np.random.RandomState(0)
    dense_counts = seeded.randint(5, size=(20, 10))

    n_topics = 3
    prior = 1.0 / n_topics
    model = OnlineLDA(n_topics=n_topics,
                      alpha=prior,
                      eta=prior,
                      random_state=seeded)

    doc_topic = model.fit_transform(dense_counts)
    has_positive = (doc_topic > 0.0).any()
    assert_true(has_positive)
Exemplo n.º 12
0
def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit

    Calls `partial_fit` twice on the same model, the second time with a
    matrix that has one more column than the first.

    NOTE(review): no assertion is made here; presumably the expected failure
    is checked outside this function (e.g. by a decorator) -- confirm.
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1.0 / n_topics

    n_col = rng.randint(6, 10)
    # Draw both matrices from the seeded generator rather than the global
    # `np.random` state, so the test data is identical on every run.
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0, tau=5.0,
                    n_docs=20, random_state=rng)
    for X in [X_1, X_2]:
        lda.partial_fit(X)
Exemplo n.º 13
0
def test_lda_online_multi_jobs():
    """
    Test LDA online training with multi CPU

    Runs `partial_fit` three times on the same matrix with `n_jobs=2`, then
    checks that the three highest-weighted indices of every row of
    `components_` form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, n_jobs=2,
                    tau=5.0, n_docs=30, random_state=rng)

    # `range` rather than the Python 2-only `xrange`, so the test also runs
    # on Python 3; the loop counter itself is not needed.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Last three argsort entries are the largest weights.
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Exemplo n.º 14
0
def lda_online_example():
    """
    Example for LDA online update

    Splits the 20 newsgroups documents into chunks of `chunk_size`, passes
    each chunk to `OnlineLDA.partial_fit`, and prints the `n_top_words`
    highest-weighted words of every topic after each chunk. Only the first
    chunk is used to fit the vectorizer; later chunks reuse its vocabulary.
    """

    def chunks(seq, size):
        # Yield consecutive slices of `seq` holding at most `size` items.
        # `range` replaces the Python 2-only `xrange` so the example also
        # runs on Python 3.
        for start in range(0, len(seq), size):
            yield seq[start:start + size]

    # In default, we set topic number to 10, and both hyperparameter
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(
        shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(
        max_df=0.8, max_features=n_features, min_df=3, stop_words='english')

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, kappa=0.7,
                    tau=512., n_jobs=-1, n_docs=1e4, random_state=0, verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            # The first chunk determines the vocabulary for all later ones.
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model
        print("\nFitting LDA models with online udpate on chunk %d..." %
              chunk_no)
        lda.partial_fit(doc_mtx)
        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            # Reverse slice of argsort: largest `n_top_words` weights first.
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
Exemplo n.º 15
0
def _lda_simple_example():
    """
    Tiny debugging example: fit `OnlineLDA` on twelve toy documents built
    from ten two-letter words and print the top 3 words of each topic.
    """

    from sklearn.feature_extraction.text import CountVectorizer

    test_words = ['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj']
    # Fixed word -> column index mapping handed to the vectorizer.
    test_vocab = {word: idx for idx, word in enumerate(test_words)}

    # group 1: aa, bb, cc, dd
    # group 2: ee ff gg
    # group 3: hh ii jj
    # The corpus is the same six documents repeated twice.
    six_docs = [
        'aa bb cc dd aa aa',
        'ee ee ff ff gg gg',
        'hh ii hh ii jj jj jj jj',
        'aa bb cc dd aa aa dd aa bb cc',
        'ee ee ff ff gg gg',
        'hh ii hh ii jj jj jj jj',
    ]
    test_docs = six_docs * 2

    vectorizer = CountVectorizer(vocabulary=test_vocab,
                                 min_df=1,
                                 max_df=0.9,
                                 token_pattern=r"(?u)\b[^\d\W]\w+\b")

    doc_word_count = vectorizer.fit_transform(test_docs)

    # LDA setting
    n_topics = 3
    prior = 1. / n_topics
    n_top_words = 3

    model = OnlineLDA(n_topics=n_topics, eta=prior, alpha=prior,
                      random_state=0, n_jobs=1, verbose=0)
    model.fit(doc_word_count)

    vocabulary = vectorizer.get_feature_names()
    for topic_no, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_no)
        # Highest weights first, truncated to `n_top_words` entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        print(" ".join([vocabulary[word_id] for word_id in ranked]))
Exemplo n.º 16
0
def test_lda_dense_input():
    """
    `fit_transform` must accept a dense array and return at least one
    strictly positive value.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))

    n_topics = 3
    prior = 1. / n_topics
    settings = dict(n_topics=n_topics, alpha=prior, eta=prior,
                    random_state=rng)
    transformed = OnlineLDA(**settings).fit_transform(X)

    positive_mask = transformed > 0.0
    assert_true(positive_mask.any())
Exemplo n.º 17
0
def test_lda_online():
    """
    Test LDA online training(`partial_fit` method)
    (same as test_lda_batch)

    Runs `partial_fit` three times on the same matrix, then checks that the
    three highest-weighted indices of every row of `components_` form one of
    the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, tau=30.0,
                    random_state=rng)

    # `range` rather than the Python 2-only `xrange`, so the test also runs
    # on Python 3; the loop counter itself is not needed.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Last three argsort entries are the largest weights.
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Exemplo n.º 18
0
def _lda_simple_example():
    """
    Debug helper: train `OnlineLDA` on a hand-written toy corpus and print
    the three highest-weighted words of every topic.
    """

    from sklearn.feature_extraction.text import CountVectorizer

    test_words = ['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj']
    # Map each word to its position in `test_words`.
    test_vocab = dict(zip(test_words, range(len(test_words))))

    # group 1: aa, bb, cc, dd
    # group 2: ee ff gg
    # group 3: hh ii jj
    group_1_short = 'aa bb cc dd aa aa'
    group_1_long = 'aa bb cc dd aa aa dd aa bb cc'
    group_2 = 'ee ee ff ff gg gg'
    group_3 = 'hh ii hh ii jj jj jj jj'
    test_docs = [
        group_1_short, group_2, group_3,
        group_1_long, group_2, group_3,
        group_1_short, group_2, group_3,
        group_1_long, group_2, group_3,
    ]

    vectorizer = CountVectorizer(
        token_pattern=r"(?u)\b[^\d\W]\w+\b",
        max_df=0.9,
        min_df=1,
        vocabulary=test_vocab,
    )

    doc_word_count = vectorizer.fit_transform(test_docs)

    # LDA setting
    n_topics = 3
    alpha = eta = 1. / n_topics
    n_top_words = 3

    lda = OnlineLDA(
        n_topics=n_topics,
        eta=eta,
        alpha=alpha,
        random_state=0,
        n_jobs=1,
        verbose=0,
    )
    lda.fit(doc_word_count)

    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        # Reverse slice of argsort: largest `n_top_words` weights first.
        top_words = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            top_words.append(feature_names[i])
        print(" ".join(top_words))
Exemplo n.º 19
0
def lda_batch_example():
    """
    Example for LDA batch update

    Vectorizes the first `n_samples` documents of the 20 newsgroups data
    into word counts, fits `OnlineLDA` with `fit`, and prints the
    `n_top_words` highest-weighted words of every topic.
    """

    # By default we use 10 topics and set both hyperparameters,
    # eta and alpha, to 0.1 (`1 / n_topics`)
    n_topics = 10
    prior = 1. / n_topics

    # batch update is slow, so only use the first 4000 records
    # of the 20 news groups data
    n_samples, n_features, n_top_words = 4000, 1000, 15

    print('Example of LDA with bath update')
    print("Loading 20 news groups dataset...")
    newsgroups = fetch_20newsgroups(
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
        random_state=1,
    )

    print("convert text into sparse matrix...")
    count_vectorizer = CountVectorizer(
        stop_words='english',
        min_df=3,
        max_df=0.8,
        max_features=n_features,
    )

    corpus = newsgroups.data[:n_samples]
    doc_word_count = count_vectorizer.fit_transform(corpus)

    print("Fitting LDA models with batch udpate...")
    model = OnlineLDA(
        n_topics=n_topics,
        alpha=prior,
        eta=prior,
        n_jobs=-1,
        random_state=0,
        verbose=1,
    )

    vocabulary = count_vectorizer.get_feature_names()
    model.fit(doc_word_count, max_iters=10)
    for topic_no, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_no)
        # Highest weights first, truncated to `n_top_words` entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        print(" ".join([vocabulary[word_id] for word_id in ranked]))
Exemplo n.º 20
0
def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit

    Calls `partial_fit` twice on the same model, the second time with a
    matrix that has one more column than the first.

    NOTE(review): no assertion is made here; presumably the expected failure
    is checked outside this function (e.g. by a decorator) -- confirm.
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1. / n_topics

    n_col = rng.randint(6, 10)
    # Use the seeded generator instead of the global `np.random` state so
    # both matrices are identical on every run.
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))
    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha0,
                    eta=eta0,
                    tau=5.,
                    n_docs=20,
                    random_state=rng)
    for X in [X_1, X_2]:
        lda.partial_fit(X)
Exemplo n.º 21
0
def test_lda_online():
    """
    Test LDA online training(`partial_fit` method)
    (same as test_lda_batch)

    Runs `partial_fit` three times on the same matrix, then checks that the
    three highest-weighted indices of every row of `components_` form one of
    the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha,
                    eta=eta,
                    tau=30.,
                    random_state=rng)

    # `range` rather than the Python 2-only `xrange`, so the test also runs
    # on Python 3; the loop counter itself is not needed.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Last three argsort entries are the largest weights.
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Exemplo n.º 22
0
def test_lda_online_multi_jobs():
    """
    Test LDA online training with multi CPU

    Runs `partial_fit` three times on the same matrix with `n_jobs=2`, then
    checks that the three highest-weighted indices of every row of
    `components_` form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha,
                    eta=eta,
                    n_jobs=2,
                    tau=5.,
                    n_docs=30,
                    random_state=rng)

    # `range` rather than the Python 2-only `xrange`, so the test also runs
    # on Python 3; the loop counter itself is not needed.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Last three argsort entries are the largest weights.
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Exemplo n.º 23
0
def test_lda_preplexity():
    """
    Compare batch-training perplexity after 1 and after 10 iterations.

    Two identically configured models are fitted on the same data; the value
    returned by `preplexity` for the 1-iteration model must be greater than
    or equal to the one for the 10-iteration model.
    """
    n_topics, alpha, eta, X = _build_sparse_mtx()
    shared = dict(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0)
    short_run, long_run = OnlineLDA(**shared), OnlineLDA(**shared)

    scores = []
    for model, n_iter in ((short_run, 1), (long_run, 10)):
        doc_topic = model.fit_transform(X, max_iters=n_iter)
        scores.append(model.preplexity(X, doc_topic, sub_sampling=False))

    assert_greater_equal(scores[0], scores[1])
Exemplo n.º 24
0
def lda_batch_example():
    """
    Example for LDA batch update

    Turns the first `n_samples` 20 newsgroups documents into a word-count
    matrix, trains `OnlineLDA` on it with `fit`, and prints the top
    `n_top_words` words of each topic.
    """

    # Default setup: 10 topics, with both hyperparameters (eta and alpha)
    # equal to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = eta = 1. / n_topics

    # batch update is slow, so only use the first 4000 records
    # of the 20 news groups data
    n_samples = 4000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with bath update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    print("convert text into sparse matrix...")
    vectorizer = CountVectorizer(max_df=0.8,
                                 max_features=n_features,
                                 min_df=3,
                                 stop_words='english')

    documents = dataset.data[:n_samples]
    doc_word_count = vectorizer.fit_transform(documents)

    print("Fitting LDA models with batch udpate...")
    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha,
                    eta=eta,
                    n_jobs=-1,
                    random_state=0,
                    verbose=1)

    feature_names = vectorizer.get_feature_names()
    lda.fit(doc_word_count, max_iters=10)
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        # Reverse slice of argsort: largest `n_top_words` weights first.
        top_words = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            top_words.append(feature_names[i])
        print(" ".join(top_words))
Exemplo n.º 25
0
def test_lda_preplexity():
    """
    Perplexity after 1 batch iteration must be at least as large as after 10.

    Both models share the same settings and input; only `max_iters` differs.
    """
    n_topics, alpha, eta, X = _build_sparse_mtx()

    def make_model():
        return OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta,
                         random_state=0)

    def fit_and_score(model, n_iter):
        # Fit for `n_iter` iterations, then score the resulting distribution.
        doc_topic = model.fit_transform(X, max_iters=n_iter)
        return model.preplexity(X, doc_topic, sub_sampling=False)

    one_pass = make_model()
    ten_pass = make_model()

    assert_greater_equal(fit_and_score(one_pass, 1),
                         fit_and_score(ten_pass, 10))
Exemplo n.º 26
0
def test_lda_transform_mismatch():
    """
    `partial_fit` on 10 columns followed by `transform` on 8 columns.

    NOTE(review): nothing is asserted in the body; presumably the expected
    failure is checked outside this function (e.g. by a decorator) -- confirm.
    """
    rng = np.random.RandomState(0)
    # Keep this draw order: fitted matrix, mismatched matrix, topic count.
    fitted_on = rng.randint(4, size=(20, 10))
    mismatched = rng.randint(4, size=(10, 8))
    n_topics = rng.randint(3, 6)

    prior = 1. / n_topics
    settings = dict(n_topics=n_topics, alpha=prior, eta=prior,
                    random_state=rng)
    estimator = OnlineLDA(**settings)
    estimator.partial_fit(fitted_on)
    estimator.transform(mismatched)