Exemplo n.º 1
0
def lda_online_example():
    """
    Example for LDA online update.

    Fetches the 20 newsgroups corpus, converts it to a bag-of-words
    matrix, and trains an OnlineLDA model one chunk at a time with
    `partial_fit`, printing the top words per topic after each chunk.
    """
    def chunks(l, n):
        # Yield successive n-sized slices of sequence `l`.
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    # In default, we set topic number to 10, and both hyperparameter
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(max_df=0.8,
                                 max_features=n_features,
                                 min_df=3,
                                 stop_words='english')

    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha,
                    eta=eta,
                    kappa=0.7,
                    tau=512.,
                    n_jobs=-1,
                    n_docs=1e4,
                    random_state=0,
                    verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            # First chunk: learn the vocabulary as well as transform.
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model (typo fixed: "udpate" -> "update")
        print("\nFitting LDA models with online update on chunk %d..." %
              chunk_no)
        lda.partial_fit(doc_mtx)
        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            # argsort is ascending; the reversed slice picks the
            # n_top_words highest-weight terms.
            print(" ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]))
Exemplo n.º 2
0
def test_lda_transform_mismatch():
    """
    test n_vocab mismatch in fit and transform
    """
    rng = np.random.RandomState(0)
    # Train on a 10-feature matrix, then transform an 8-feature one.
    train_mtx = rng.randint(4, size=(20, 10))
    other_mtx = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    prior = 1.0 / n_topics
    model = OnlineLDA(n_topics=n_topics, alpha=prior, eta=prior,
                      random_state=rng)
    model.partial_fit(train_mtx)
    model.transform(other_mtx)
Exemplo n.º 3
0
def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1.0 / n_topics

    n_col = rng.randint(6, 10)
    # FIX: draw from the seeded `rng` instead of the global np.random so
    # the test data is reproducible regardless of global random state.
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0, tau=5.0,
                    n_docs=20, random_state=rng)
    # Second matrix has one extra column, triggering the vocab-size
    # mismatch path inside partial_fit.
    for X in [X_1, X_2]:
        lda.partial_fit(X)
Exemplo n.º 4
0
def test_lda_online_multi_jobs():
    """
    Test LDA online training with multi CPU
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    model = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, n_jobs=2,
                      tau=5.0, n_docs=30, random_state=rng)

    # Three online passes over the same chunk.
    for _ in xrange(3):
        model.partial_fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in model.components_:
        # Indices of the three largest weights in this topic.
        strongest = set(component.argsort()[-3:][::-1])
        assert_true(tuple(sorted(strongest)) in expected_groups)
Exemplo n.º 5
0
def lda_online_example():
    """
    Example for LDA online update.

    Fetches the 20 newsgroups corpus, converts it to a bag-of-words
    matrix, and trains an OnlineLDA model one chunk at a time with
    `partial_fit`, printing the top words per topic after each chunk.
    """

    def chunks(l, n):
        # Yield successive n-sized slices of sequence `l`.
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    # In default, we set topic number to 10, and both hyperparameter
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(
        shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(
        max_df=0.8, max_features=n_features, min_df=3, stop_words='english')

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, kappa=0.7,
                    tau=512., n_jobs=-1, n_docs=1e4, random_state=0, verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            # First chunk: learn the vocabulary as well as transform.
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model (typo fixed: "udpate" -> "update")
        print("\nFitting LDA models with online update on chunk %d..." %
              chunk_no)
        lda.partial_fit(doc_mtx)
        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            # argsort is ascending; the reversed slice picks the
            # n_top_words highest-weight terms.
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
Exemplo n.º 6
0
def test_lda_transform_mismatch():
    """
    test n_vocab mismatch in fit and transform
    """
    rng = np.random.RandomState(0)
    # Fit on a 10-word vocabulary, then transform an 8-word matrix.
    fit_data = rng.randint(4, size=(20, 10))
    transform_data = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    sym_prior = 1. / n_topics
    model = OnlineLDA(n_topics=n_topics,
                      alpha=sym_prior,
                      eta=sym_prior,
                      random_state=rng)
    model.partial_fit(fit_data)
    model.transform(transform_data)
Exemplo n.º 7
0
def test_lda_online():
    """
    Test LDA online training(`partial_fit` method)
    (same as test_lda_batch)
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    model = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, tau=30.0,
                      random_state=rng)

    # Run three online updates on the same chunk.
    for _ in xrange(3):
        model.partial_fit(X)

    valid_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for comp in model.components_:
        # Top-3 weighted word indices for this topic.
        top_words = set(comp.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_words)) in valid_groups)
Exemplo n.º 8
0
def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1. / n_topics

    n_col = rng.randint(6, 10)
    # FIX: draw from the seeded `rng` instead of the global np.random so
    # the test data is reproducible regardless of global random state.
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))
    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha0,
                    eta=eta0,
                    tau=5.,
                    n_docs=20,
                    random_state=rng)
    # Second matrix has one extra column, triggering the vocab-size
    # mismatch path inside partial_fit.
    for X in [X_1, X_2]:
        lda.partial_fit(X)
Exemplo n.º 9
0
def test_lda_online():
    """
    Test LDA online training(`partial_fit` method)
    (same as test_lda_batch)
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    model = OnlineLDA(n_topics=n_topics,
                      alpha=alpha,
                      eta=eta,
                      tau=30.,
                      random_state=rng)

    # Three successive online updates on the same chunk.
    for _ in xrange(3):
        model.partial_fit(X)

    allowed = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic in model.components_:
        # Pick the indices of the three largest topic weights.
        heaviest = set(topic.argsort()[-3:][::-1])
        assert_true(tuple(sorted(heaviest)) in allowed)
Exemplo n.º 10
0
def test_lda_online_multi_jobs():
    """
    Test LDA online training with multi CPU
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    model = OnlineLDA(n_topics=n_topics,
                      alpha=alpha,
                      eta=eta,
                      n_jobs=2,
                      tau=5.,
                      n_docs=30,
                      random_state=rng)

    # Three online passes over the same chunk with two worker jobs.
    for _ in xrange(3):
        model.partial_fit(X)

    allowed = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic in model.components_:
        # Indices of the three heaviest words in this topic.
        heaviest = set(topic.argsort()[-3:][::-1])
        assert_true(tuple(sorted(heaviest)) in allowed)