def lda_online_example():
    """
    Example for LDA online update.

    Loads the 20 newsgroups dataset, splits the documents into chunks of
    `chunk_size` records and feeds them one chunk at a time to
    `OnlineLDA.partial_fit`. After every chunk the `n_top_words`
    highest-weighted words of each topic are printed.

    The vocabulary is learned from the first chunk only; later chunks are
    merely transformed with the already fitted vectorizer.
    """

    def chunks(seq, size):
        """Yield consecutive slices of `seq`, each at most `size` long."""
        # `range` (not the Python 2-only `xrange`) keeps the example
        # runnable on both Python 2 and Python 3.
        for start in range(0, len(seq), size):
            yield seq[start:start + size]

    # By default, we set the topic number to 10, and both hyperparameters
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, kappa=0.7,
                    tau=512., n_jobs=-1, n_docs=1e4, random_state=0,
                    verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            # The first chunk defines the vocabulary used for all chunks.
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model
        print("\nFitting LDA models with online update on chunk %d..." %
              chunk_no)
        lda.partial_fit(doc_mtx)

        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            # argsort is ascending, so walk it backwards to get the
            # `n_top_words` largest weights first.
            top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
            print(" ".join([feature_names[i] for i in top_word_ids]))
def test_lda_transform_mismatch():
    """
    test n_vocab mismatch in fit and transform

    The model is fitted on a matrix with 10 columns and is then asked to
    transform a matrix with only 8 columns.

    NOTE(review): nothing in this block catches an exception; presumably
    an enclosing decorator/harness checks the expected failure - confirm.
    """
    rng = np.random.RandomState(0)

    # The draw order below is kept on purpose: it fixes which values each
    # variable receives from the seeded generator.
    fit_mtx = rng.randint(4, size=(20, 10))
    narrow_mtx = rng.randint(4, size=(10, 8))
    n_topics = rng.randint(3, 6)

    prior = 1.0 / n_topics
    model = OnlineLDA(
        n_topics=n_topics,
        alpha=prior,
        eta=prior,
        random_state=rng,
    )

    model.partial_fit(fit_mtx)
    model.transform(narrow_mtx)
def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit

    Calls `partial_fit` twice on the same model: first with a matrix of
    `n_col` columns, then with one of `n_col + 1` columns.

    NOTE(review): nothing in this block catches an exception; presumably
    an enclosing decorator/harness checks the expected failure - confirm.
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1.0 / n_topics

    n_col = rng.randint(6, 10)
    # Draw the data from the seeded generator rather than the global
    # `np.random` state so the test input is reproducible across runs.
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0,
                    tau=5.0, n_docs=20, random_state=rng)
    for X in [X_1, X_2]:
        lda.partial_fit(X)
def test_lda_online_multi_jobs():
    """
    Test LDA online training with multi CPU

    Runs `partial_fit` three times on the matrix returned by
    `_build_sparse_mtx` with `n_jobs=2`, then checks that the three
    highest-weighted feature indices of every topic form one of the
    expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, n_jobs=2,
                    tau=5.0, n_docs=30, random_state=rng)

    # `range` instead of the Python 2-only `xrange`; the counter itself
    # is not needed.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # argsort is ascending: the last three entries are the indices of
        # the three largest weights. Sorting makes the comparison
        # independent of their relative order.
        top_idx = tuple(sorted(c.argsort()[-3:]))
        assert_true(top_idx in correct_idx_grps)
def lda_online_example():
    """
    Example for LDA online update.

    The 20 newsgroups documents are processed in chunks of `chunk_size`
    records. Each chunk is converted to a document-term matrix and passed
    to `OnlineLDA.partial_fit`; afterwards the `n_top_words` words with
    the largest weight in every topic are printed.

    Only the first chunk is used to fit the vectorizer, so the vocabulary
    is fixed after chunk 0.
    """

    def chunks(items, n):
        """Yield successive slices of `items` holding at most `n` entries."""
        # `xrange` only exists on Python 2; `range` works on 2 and 3.
        for offset in range(0, len(items), n):
            yield items[offset:offset + n]

    # By default, we set the topic number to 10, and both hyperparameters
    # eta and alpha to 0.1 (`1 / n_topics`)
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    # chunk_size is how many records we want to use
    # in each online iteration
    chunk_size = 2000
    n_features = 1000
    n_top_words = 15

    print('Example of LDA with online update')
    print("Loading 20 news groups dataset...")
    dataset = fetch_20newsgroups(
        shuffle=True, random_state=1,
        remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(
        max_df=0.8, max_features=n_features,
        min_df=3, stop_words='english')

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, kappa=0.7,
                    tau=512., n_jobs=-1, n_docs=1e4, random_state=0,
                    verbose=0)

    for chunk_no, doc_list in enumerate(chunks(dataset.data, chunk_size)):
        if chunk_no == 0:
            # Learn the vocabulary once, from the first chunk.
            doc_mtx = vectorizer.fit_transform(doc_list)
            feature_names = vectorizer.get_feature_names()
        else:
            doc_mtx = vectorizer.transform(doc_list)

        # fit model
        print("\nFitting LDA models with online update on chunk %d..." %
              chunk_no)
        lda.partial_fit(doc_mtx)

        print("Topics after training chunk %d:" % chunk_no)
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            # Reverse slice of the ascending argsort: indices of the
            # `n_top_words` largest weights, largest first.
            print(" ".join([feature_names[i] for i in
                            topic.argsort()[:-n_top_words - 1:-1]]))
def test_lda_transform_mismatch():
    """
    test n_vocab mismatch in fit and transform

    `partial_fit` receives a (20, 10) matrix, `transform` a (10, 8) one,
    so the number of columns differs between the two calls.

    NOTE(review): the block itself does not assert anything; presumably
    the expected error is checked by a decorator outside this view.
    """
    rng = np.random.RandomState(0)

    # Keep this exact sequence of draws - each one advances `rng`.
    wide = rng.randint(4, size=(20, 10))
    narrow = rng.randint(4, size=(10, 8))
    n_topics = rng.randint(3, 6)

    shared_prior = 1. / n_topics
    estimator = OnlineLDA(n_topics=n_topics,
                          alpha=shared_prior,
                          eta=shared_prior,
                          random_state=rng)
    estimator.partial_fit(wide)
    estimator.transform(narrow)
def test_lda_online():
    """
    Test LDA online training(`partial_fit` method)
    (same as test_lda_batch)

    Runs `partial_fit` three times on the matrix returned by
    `_build_sparse_mtx`, then checks that the three highest-weighted
    feature indices of every topic form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta,
                    tau=30.0, random_state=rng)

    # `range` instead of the Python 2-only `xrange`; the counter itself
    # is not needed.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # The last three argsort entries are the indices of the three
        # largest weights; sort them so order does not matter.
        top_idx = tuple(sorted(c.argsort()[-3:]))
        assert_true(top_idx in correct_idx_grps)
def test_lda_partial_fit_dim_mismatch():
    """
    test n_vocab mismatch in partial_fit

    The same model is given two matrices in a row whose column counts
    differ by one (`n_col` and `n_col + 1`).

    NOTE(review): the block itself does not assert anything; presumably
    the expected error is checked by a decorator outside this view.
    """
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    alpha0 = eta0 = 1. / n_topics

    n_col = rng.randint(6, 10)
    # Use the seeded `rng` (not the global `np.random` state) so that the
    # generated matrices are identical on every run.
    X_1 = rng.randint(4, size=(10, n_col))
    X_2 = rng.randint(4, size=(10, n_col + 1))

    lda = OnlineLDA(n_topics=n_topics, alpha=alpha0, eta=eta0,
                    tau=5., n_docs=20, random_state=rng)
    for X in [X_1, X_2]:
        lda.partial_fit(X)
def test_lda_online():
    """
    Test LDA online training(`partial_fit` method)
    (same as test_lda_batch)

    After three `partial_fit` passes over the matrix from
    `_build_sparse_mtx`, the top three feature indices of each topic must
    match one of the groups in `correct_idx_grps`.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta,
                    tau=30., random_state=rng)

    # `xrange` does not exist on Python 3; `range` works everywhere and
    # the loop index is unused.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Indices of the three largest weights, in ascending index order
        # so they can be compared with the expected tuples.
        top_idx = tuple(sorted(c.argsort()[-3:]))
        assert_true(top_idx in correct_idx_grps)
def test_lda_online_multi_jobs():
    """
    Test LDA online training with multi CPU

    After three `partial_fit` passes (model created with `n_jobs=2`) over
    the matrix from `_build_sparse_mtx`, the top three feature indices of
    each topic must match one of the groups in `correct_idx_grps`.
    """
    rng = np.random.RandomState(0)
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, n_jobs=2,
                    tau=5., n_docs=30, random_state=rng)

    # `xrange` does not exist on Python 3; `range` works everywhere and
    # the loop index is unused.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Indices of the three largest weights, in ascending index order
        # so they can be compared with the expected tuples.
        top_idx = tuple(sorted(c.argsort()[-3:]))
        assert_true(top_idx in correct_idx_grps)