def test_generate_wordclouds_for_topic_words():
    """Generate per-topic word clouds; check both image and raw WordCloud output."""
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    loaded = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = loaded['model']
    vocab = loaded['vocab']

    phi = model.topic_word_
    assert phi.shape == (5, len(vocab))

    # default call: one PIL image per topic, keyed "topic_1" .. "topic_5"
    clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10)
    assert len(clouds) == 5
    assert set(clouds.keys()) == {'topic_%d' % i for i in range(1, 6)}
    assert all(isinstance(wc, PIL.Image.Image) for wc in clouds.values())

    # restrict to two topics; request raw WordCloud objects with explicit size
    clouds = visualize.generate_wordclouds_for_topic_words(
        phi, vocab, 10,
        which_topics=('topic_1', 'topic_2'),
        return_images=False,
        width=640, height=480)
    assert set(clouds.keys()) == {'topic_1', 'topic_2'}
    assert all(isinstance(wc, WordCloud) for wc in clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in clouds.values())
def test_generate_wordclouds_for_document_topics():
    """Generate per-document topic clouds; check both image and raw WordCloud output."""
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    loaded = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = loaded['model']
    doc_labels = loaded['doc_labels']

    theta = model.doc_topic_
    assert theta.shape == (len(doc_labels), 5)

    # default call: one PIL image per document, keyed by document label
    clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3)
    assert len(clouds) == len(doc_labels)
    assert set(clouds.keys()) == set(doc_labels)
    assert all(isinstance(wc, PIL.Image.Image) for wc in clouds.values())

    # restrict to two documents; request raw WordCloud objects with explicit size
    selected = doc_labels[:2]
    assert len(selected) == 2
    clouds = visualize.generate_wordclouds_for_document_topics(
        theta, doc_labels, 3,
        which_documents=selected,
        return_images=False,
        width=640, height=480)
    assert set(clouds.keys()) == set(selected)
    assert all(isinstance(wc, WordCloud) for wc in clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in clouds.values())
def test_exclude_topics(exclude, pass_topic_word, renormalize, return_new_topic_mapping):
    """Exercise model_stats.exclude_topics across all optional-output combinations.

    Checks result-tuple shape, array dimensions, (re)normalization of the
    document-topic distribution, and that the returned old->new topic index
    mapping points at identical columns/rows of the original matrices.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    loaded = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = loaded['model']

    exclude_ind = list(set(exclude))   # de-duplicate requested topic indices
    n_exclude = len(exclude_ind)

    res = model_stats.exclude_topics(
        exclude_ind,
        model.doc_topic_,
        model.topic_word_ if pass_topic_word else None,
        renormalize=renormalize,
        return_new_topic_mapping=return_new_topic_mapping)

    # unpack the result according to which optional outputs were requested
    if pass_topic_word and return_new_topic_mapping:
        assert isinstance(res, tuple)
        assert len(res) == 3
        new_theta, new_phi, topic_mapping = res
    elif pass_topic_word and not return_new_topic_mapping:
        assert isinstance(res, tuple)
        assert len(res) == 2
        new_theta, new_phi = res
    elif not pass_topic_word and return_new_topic_mapping:
        assert isinstance(res, tuple)
        assert len(res) == 2
        new_theta, topic_mapping = res
    else:   # neither topic-word matrix nor mapping requested: bare array
        assert not isinstance(res, tuple)
        new_theta = res

    # theta loses one column per excluded topic
    assert new_theta.shape == (model.doc_topic_.shape[0],
                               model.doc_topic_.shape[1] - n_exclude)
    if pass_topic_word:
        # phi loses one row per excluded topic
        assert new_phi.shape == (model.topic_word_.shape[0] - n_exclude,
                                 model.topic_word_.shape[1])

    if new_theta.shape[1] > 0:
        row_sums = np.sum(new_theta, axis=1)
        if renormalize:
            assert np.allclose(row_sums, 1)
        else:
            assert np.all(row_sums <= 1 + 1e-5)

    if return_new_topic_mapping:
        old_indices = list(topic_mapping.keys())
        new_indices = list(topic_mapping.values())
        assert len(old_indices) == len(new_indices) == new_theta.shape[1]

        # all indices stay within their respective matrix bounds
        assert 0 <= min(old_indices) < model.doc_topic_.shape[1]
        assert 0 <= max(old_indices) < model.doc_topic_.shape[1]
        assert 0 <= min(new_indices) < new_theta.shape[1]
        assert 0 <= max(new_indices) < new_theta.shape[1]

        # mapped columns/rows must contain the original data
        for old_ind, new_ind in topic_mapping.items():
            assert np.allclose(model.doc_topic_[:, old_ind], new_theta[:, new_ind])
            if pass_topic_word:
                assert np.allclose(model.topic_word_[old_ind, :], new_phi[new_ind, :])
def test_generate_wordclouds_for_document_topics():
    """Generate per-document topic clouds (Py2/Py3 pickle variant) and verify output types."""
    suffix = '.py3' if six.PY3 else ''
    loaded = model_io.load_ldamodel_from_pickle(
        'tests/data/tiny_model_reuters_5_topics%s.pickle' % suffix)
    model = loaded['model']
    doc_labels = loaded['doc_labels']

    theta = model.doc_topic_
    assert theta.shape == (len(doc_labels), 5)

    # default call: one PIL image per document, keyed by document label
    clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3)
    assert len(clouds) == len(doc_labels)
    assert set(clouds.keys()) == set(doc_labels)
    assert all(isinstance(wc, PIL.Image.Image) for wc in clouds.values())

    # restrict to two documents; request raw WordCloud objects with explicit size
    selected = doc_labels[:2]
    assert len(selected) == 2
    clouds = visualize.generate_wordclouds_for_document_topics(
        theta, doc_labels, 3,
        which_documents=selected,
        return_images=False,
        width=640, height=480)
    assert set(clouds.keys()) == set(selected)
    assert all(isinstance(wc, WordCloud) for wc in clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in clouds.values())
def test_generate_wordclouds_for_topic_words():
    """Generate per-topic word clouds (Py2/Py3 pickle variant) and verify output types."""
    suffix = '.py3' if six.PY3 else ''
    loaded = model_io.load_ldamodel_from_pickle(
        'tests/data/tiny_model_reuters_5_topics%s.pickle' % suffix)
    model = loaded['model']
    vocab = loaded['vocab']

    phi = model.topic_word_
    assert phi.shape == (5, len(vocab))

    # default call: one PIL image per topic, keyed "topic_1" .. "topic_5"
    clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10)
    assert len(clouds) == 5
    assert set(clouds.keys()) == {'topic_%d' % i for i in range(1, 6)}
    assert all(isinstance(wc, PIL.Image.Image) for wc in clouds.values())

    # restrict to two topics; request raw WordCloud objects with explicit size
    clouds = visualize.generate_wordclouds_for_topic_words(
        phi, vocab, 10,
        which_topics=('topic_1', 'topic_2'),
        return_images=False,
        width=640, height=480)
    assert set(clouds.keys()) == {'topic_1', 'topic_2'}
    assert all(isinstance(wc, WordCloud) for wc in clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in clouds.values())
def test_save_load_ldamodel_pickle():
    """Round-trip a fitted LDA model through save/load pickle helpers.

    Fits a tiny 2-topic model on a 4x2 DTM, pickles it together with vocab
    and doc labels, reloads it, and asserts that all components survive the
    round trip unchanged. The pickle file is removed afterwards so the test
    leaves no artifact in tests/data (previously it was never cleaned up).
    """
    # consistent with the other tests: skip when the optional lda package is missing
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    pfile = 'tests/data/test_pickle_unpickle_ldamodel.pickle'

    dtm = np.array([[0, 1], [2, 3], [4, 5], [6, 0]])
    doc_labels = ['doc_' + str(i) for i in range(dtm.shape[0])]
    vocab = ['word_' + str(i) for i in range(dtm.shape[1])]

    model = lda.LDA(2, n_iter=1)
    model.fit(dtm)

    try:
        model_io.save_ldamodel_to_pickle(pfile, model, vocab, doc_labels)
        unpickled = model_io.load_ldamodel_from_pickle(pfile)

        assert np.array_equal(model.doc_topic_, unpickled['model'].doc_topic_)
        assert np.array_equal(model.topic_word_, unpickled['model'].topic_word_)
        assert vocab == unpickled['vocab']
        assert doc_labels == unpickled['doc_labels']
    finally:
        # clean up the round-trip artifact even if an assertion fails
        if os.path.exists(pfile):
            os.remove(pfile)
def test_write_wordclouds_to_folder(tmpdir):
    """Write topic word clouds (Py2/Py3 pickle variant) to disk; check one PNG per topic exists."""
    out_dir = tmpdir.mkdir('wordclouds').dirname
    suffix = '.py3' if six.PY3 else ''
    loaded = model_io.load_ldamodel_from_pickle(
        'tests/data/tiny_model_reuters_5_topics%s.pickle' % suffix)
    model = loaded['model']
    vocab = loaded['vocab']

    phi = model.topic_word_
    assert phi.shape == (5, len(vocab))

    clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10)
    visualize.write_wordclouds_to_folder(clouds, out_dir, 'cloud_{label}.png')

    # every generated cloud must end up as a file named after its topic label
    for label in clouds.keys():
        assert os.path.exists(os.path.join(out_dir, 'cloud_{label}.png'.format(label=label)))
def test_write_wordclouds_to_folder(tmpdir):
    """Write topic word clouds to a temp folder; check one PNG per topic exists."""
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    out_dir = tmpdir.mkdir('wordclouds').dirname
    loaded = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = loaded['model']
    vocab = loaded['vocab']

    phi = model.topic_word_
    assert phi.shape == (5, len(vocab))

    clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10)
    visualize.write_wordclouds_to_folder(clouds, out_dir, 'cloud_{label}.png')

    # every generated cloud must end up as a file named after its topic label
    for label in clouds.keys():
        assert os.path.exists(os.path.join(out_dir, 'cloud_{label}.png'.format(label=label)))