def test_from_texts():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(docs, labels)

    # the PMI matrix is (n_words, n_labels): 9 distinct words, 4 labels
    assert_equal(actual.shape[1], 4)
    assert_equal(actual.shape[0], 9)
    assert_equal(cal.index2word_,
                 {0: u'information', 1: u'language', 2: u'learning',
                  3: u'machine', 4: u'mining', 5: u'natural',
                  6: u'processing', 7: u'retrieval', 8: u'text'})
    assert_equal(cal.index2label_,
                 {0: 'information retrieval'.split(),
                  1: 'machine learning'.split(),
                  2: 'natural language processing'.split(),
                  3: 'text mining'.split()})
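# NOTE: the tests in this module rely on module-level `docs` (tokenized
# documents) and `labels` (candidate label token tuples) fixtures that are
# not shown in this excerpt.  The assignments below are an illustrative
# sketch consistent with the assertions (9 distinct words, 4 surviving
# labels), not the repository's actual fixtures.
docs = [['information', 'retrieval', 'machine', 'learning'],
        ['natural', 'language', 'processing', 'text', 'mining'],
        ['machine', 'learning', 'information', 'retrieval'],
        ['text', 'mining', 'natural', 'language', 'processing']]
labels = [('information', 'retrieval'),
          ('machine', 'learning'),
          ('natural', 'language', 'processing'),
          ('text', 'mining')]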
def test_from_texts_nonexisting_label():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(
        docs,
        labels[:2] + [('haha', 'lala')] + labels[2:] + [('non', 'existing')])

    # the two labels that never occur in the corpus are dropped
    assert_equal(actual.shape[1], 4)
    assert_equal(cal.index2label_,
                 {0: 'information retrieval'.split(),
                  1: 'machine learning'.split(),
                  2: 'natural language processing'.split(),
                  3: 'text mining'.split()})
def main():
    with open('machine_learning_tweets.json', 'r') as f:
        tweets = json.load(f)['tweets']

    clean_tweets = prepare_tweets(tweets)
    test_sample = random.sample(clean_tweets, int(0.1 * len(clean_tweets)))
    data_sample = list(set(clean_tweets) - set(test_sample))

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_sample)

    # `n_topics` was renamed `n_components` in newer scikit-learn versions
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)

    docs = [nltk.word_tokenize(doc) for doc in data_sample]

    finder = BigramLabelFinder('pmi', min_freq=label_min_df,
                               pos=tag_constraints)
    cand_labels = finder.find(docs, top_n=n_cand_labels)

    pmi_cal = PMICalculator(
        doc2word_vectorizer=CountVectorizer(max_df=0.95, min_df=5,
                                            max_features=n_features,
                                            stop_words='english'),
        doc2label_vectorizer=LabelCountVectorizer())
    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

    ranker = LabelRanker(apply_intra_topic_coverage=False)
    ranked_labels = ranker.top_k_labels(topic_models=lda.components_,
                                        pmi_w2l=pmi_w2l,
                                        index2label=pmi_cal.index2label_,
                                        label_models=None,
                                        k=n_labels)

    print(u"Labels\n")
    for i, labels in enumerate(ranked_labels):
        print(u"Topic {}: {}\n".format(
            i, ', '.join(map(lambda l: ' '.join(l), labels))))
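# The constants and helpers referenced in main() (n_features, n_topics,
# n_top_words, n_cand_labels, label_min_df, n_labels, tag_constraints,
# prepare_tweets, print_top_words) are defined elsewhere in the script.
# The values below are illustrative placeholders, not the script's actual
# settings.
n_features = 1000
n_topics = 10
n_top_words = 15
n_cand_labels = 100
label_min_df = 5
n_labels = 8
tag_constraints = None  # e.g. [('NN', 'NN'), ('JJ', 'NN')] to restrict label POS patterns

if __name__ == '__main__':
    main()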
import numpy as np
from scipy.sparse import csr_matrix

from nose.tools import assert_equal
from numpy.testing import assert_array_almost_equal

from chowmein.pmi import PMICalculator

# 3 documents x 4 words
d2w = np.asarray([[2, 0, 0, 1],
                  [0, 0, 1, 1],
                  [0, 3, 0, 1]])
d2w_sparse = csr_matrix(d2w)

# 3 documents x 2 labels
d2l = np.asarray([[1, 0],
                  [0, 2],
                  [1, 1]])
d2l_sparse = csr_matrix(d2l)

cal = PMICalculator()


def test_from_matrices_no_smoothing():
    # some warning will be output (log of zero for the word-label
    # pairs that never co-occur)
    expected = np.log(
        3 * np.asarray([[0.25, 0.],
                        [0.16666667, 0.11111111],
                        [0., 0.33333333],
                        [0.16666667, 0.11111111]]))

    # dense input
    assert_array_almost_equal(cal.from_matrices(d2w, d2l, pseudo_count=0),
                              expected)

    # sparse input
    assert_array_almost_equal(
        cal.from_matrices(d2w_sparse, d2l_sparse, pseudo_count=0),
        expected)
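# Hypothetical companion check (not part of the original test module): with
# a positive pseudo_count the zero co-occurrence cells should no longer
# produce log(0), so every entry is expected to be finite.  Only the shape
# and finiteness are asserted here, since the exact smoothed values depend
# on PMICalculator's internal estimation.
def test_from_matrices_with_smoothing():
    result = cal.from_matrices(d2w, d2l, pseudo_count=1)
    assert_equal(result.shape, (4, 2))
    assert np.all(np.isfinite(result))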
def get_topic_labels(corpus_path, n_topics,
                     n_top_words,
                     preprocessing_steps,
                     n_cand_labels, label_min_df,
                     label_tags, n_labels,
                     lda_random_state,
                     lda_n_iter):
    """
    Refer to `create_parser` for the meaning of the arguments.
    """
    print("Loading docs...")
    docs = load_line_corpus(corpus_path)

    if 'wordlen' in preprocessing_steps:
        print("Word length filtering...")
        wl_filter = CorpusWordLengthFilter(minlen=3)
        docs = wl_filter.transform(docs)

    if 'stem' in preprocessing_steps:
        print("Stemming...")
        stemmer = CorpusStemmer()
        docs = stemmer.transform(docs)

    if 'tag' in preprocessing_steps:
        print("POS tagging...")
        tagger = CorpusPOSTagger()
        tagged_docs = tagger.transform(docs)

    tag_constraints = []
    if label_tags != ['None']:
        for tags in label_tags:
            tag_constraints.append(tuple(map(lambda t: t.strip(),
                                             tags.split(','))))

    if len(tag_constraints) == 0:
        tag_constraints = None

    print("Tag constraints: {}".format(tag_constraints))

    print("Generating candidate bigram labels (with POS filtering)...")
    finder = BigramLabelFinder('pmi', min_freq=label_min_df,
                               pos=tag_constraints)
    if tag_constraints:
        assert 'tag' in preprocessing_steps, \
            'If a tag constraint is applied, POS tagging ("tag") must be performed'
        cand_labels = finder.find(tagged_docs, top_n=n_cand_labels)
    else:  # if no constraint, use the untagged docs
        cand_labels = finder.find(docs, top_n=n_cand_labels)

    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculating the PMI scores...")
    pmi_cal = PMICalculator(
        doc2word_vectorizer=WordCountVectorizer(
            min_df=5,
            stop_words=load_lemur_stopwords()),
        doc2label_vectorizer=LabelCountVectorizer())
    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter,
                    random_state=lda_random_state)
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        top_word_ids = np.argsort(topic_dist)[:-n_top_words:-1]
        topic_words = [pmi_cal.index2word_[id_] for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(topic_models=model.topic_word_,
                               pmi_w2l=pmi_w2l,
                               index2label=pmi_cal.index2label_,
                               label_models=None,
                               k=n_labels)
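# Illustrative call, e.g. from an interactive session; the argument values
# are examples only (the real CLI defaults live in `create_parser`) and the
# corpus path is hypothetical.
topic_labels = get_topic_labels(corpus_path='data/some_corpus.txt',
                                n_topics=8,
                                n_top_words=15,
                                preprocessing_steps=['wordlen', 'stem', 'tag'],
                                n_cand_labels=100,
                                label_min_df=5,
                                label_tags=['NN,NN', 'JJ,NN'],
                                n_labels=8,
                                lda_random_state=12345,
                                lda_n_iter=400)
for i, labels in enumerate(topic_labels):
    print("Topic {}: {}".format(i, ', '.join(map(' '.join, labels))))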