Example #1
import numpy as np

# `OnlineLDA`, `_build_sparse_mtx`, and `assert_true` are assumed to be
# defined in the surrounding test module.
def test_lda_batch():
    """
    Test LDA batch training (`fit` method)
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=rng)
    lda.fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
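The assertion only makes sense if `_build_sparse_mtx` returns a block-diagonal document-word matrix whose three word blocks line up with the three index groups. That helper is not shown on this page; the following is a minimal sketch of what it plausibly does (the shape, counts, and priors are assumptions, not the original helper):

import numpy as np
import scipy.sparse as sp
from scipy.linalg import block_diag

def _build_sparse_mtx():
    # Three topics, each owning three words: documents from one block
    # only ever use that block's words, so the learned topics should
    # concentrate on the index groups (0, 1, 2), (3, 4, 5), (6, 7, 8).
    n_topics = 3
    block = n_topics * np.ones((3, 3))
    X = sp.csr_matrix(block_diag(block, block, block))
    alpha = 1. / n_topics
    eta = 1. / n_topics
    return n_topics, alpha, eta, X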
Example #2
def _lda_simple_example():
    """
    Debugging helper: fit LDA on a tiny hand-built corpus and print
    the top words per topic.
    """

    from sklearn.feature_extraction.text import CountVectorizer

    test_words = ['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj']
    test_vocab = {}
    for idx, word in enumerate(test_words):
        test_vocab[word] = idx

    # group 1: aa, bb, cc, dd
    # group 2: ee, ff, gg
    # group 3: hh, ii, jj
    test_docs = [
        'aa bb cc dd aa aa', 'ee ee ff ff gg gg', 'hh ii hh ii jj jj jj jj',
        'aa bb cc dd aa aa dd aa bb cc', 'ee ee ff ff gg gg',
        'hh ii hh ii jj jj jj jj', 'aa bb cc dd aa aa', 'ee ee ff ff gg gg',
        'hh ii hh ii jj jj jj jj', 'aa bb cc dd aa aa dd aa bb cc',
        'ee ee ff ff gg gg', 'hh ii hh ii jj jj jj jj'
    ]

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9,
                                 min_df=1,
                                 vocabulary=test_vocab)

    doc_word_count = vectorizer.fit_transform(test_docs)

    # LDA setting
    n_topics = 3
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_top_words = 3

    lda = OnlineLDA(n_topics=n_topics,
                    eta=eta,
                    alpha=alpha,
                    random_state=0,
                    n_jobs=1,
                    verbose=0)
    lda.fit(doc_word_count)
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
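The slice `topic.argsort()[:-n_top_words - 1:-1]` is the standard trick for pulling the indices of the `n_top_words` largest weights in descending order: `argsort()` sorts ascending, and the negative-step slice reads its tail backwards. A quick standalone check of the idiom:

import numpy as np

weights = np.array([0.1, 0.7, 0.2, 0.9])
n_top_words = 2
# argsort() gives [0, 2, 1, 3]; the reversed slice keeps the last
# n_top_words entries, largest weight first.
print(weights.argsort()[:-n_top_words - 1:-1])  # [3 1]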
Example #3
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# `OnlineLDA` is assumed to be available from the in-development LDA module.
def lda_batch_example():
    """
    Example for LDA batch update
    """

    # By default, set the number of topics to 10 and both hyperparameters
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # batch update is slow, so only use the first 4000 records of 20 newsgroups
    n_samples = 4000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with batch update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    print("convert text into sparse matrix...")
    vectorizer = CountVectorizer(max_df=0.8,
                                 max_features=n_features,
                                 min_df=3,
                                 stop_words='english')

    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])

    print("Fitting LDA models with batch udpate...")
    lda = OnlineLDA(n_topics=n_topics,
                    alpha=alpha,
                    eta=eta,
                    n_jobs=-1,
                    random_state=0,
                    verbose=1)

    feature_names = vectorizer.get_feature_names()
    lda.fit(doc_word_count, max_iters=10)
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
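`OnlineLDA` and `fit(X, max_iters=...)` reflect a pre-merge API; in released scikit-learn the estimator ships as `LatentDirichletAllocation`, with the priors and the iteration count moved into the constructor. A rough modern equivalent of the fit step above, reusing the variables defined in this example (the parameter mapping is an assumption based on the names used here):

from sklearn.decomposition import LatentDirichletAllocation

# Assumed mapping: n_topics -> n_components, alpha -> doc_topic_prior,
# eta -> topic_word_prior, max_iters -> max_iter.
lda = LatentDirichletAllocation(n_components=n_topics,
                                doc_topic_prior=alpha,
                                topic_word_prior=eta,
                                learning_method='batch',
                                max_iter=10,
                                n_jobs=-1,
                                random_state=0)
lda.fit(doc_word_count)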