Пример #1
0
def test_from_texts():
    """Check the shape and the index mappings produced by `from_texts`."""
    calculator = PMICalculator(
        doc2word_vectorizer=CountVectorizer(min_df=0),
        doc2label_vectorizer=LabelCountVectorizer())
    pmi = calculator.from_texts(docs, labels)

    # Second axis is compared against 4 (one entry per expected label below),
    # first axis against 9 (one entry per expected word below).
    assert_equal(pmi.shape[1], 4)
    assert_equal(pmi.shape[0], 9)

    # Word indices are expected to follow this (alphabetical) listing.
    expected_index2word = dict(enumerate([
        u'information',
        u'language',
        u'learning',
        u'machine',
        u'mining',
        u'natural',
        u'processing',
        u'retrieval',
        u'text',
    ]))
    assert_equal(calculator.index2word_, expected_index2word)

    # Each label is stored as its list of tokens.
    expected_index2label = dict(enumerate([
        'information retrieval'.split(),
        'machine learning'.split(),
        'natural language processing'.split(),
        'text mining'.split(),
    ]))
    assert_equal(calculator.index2label_, expected_index2label)
Пример #2
0
def test_from_texts_nonexisting_label():
    """Check that the two extra labels do not show up in the result.

    ``('haha', 'lala')`` and ``('non', 'existing')`` are mixed into the
    candidate labels; the assertions expect only the four original labels to
    be kept (presumably because the extras never occur in `docs` -- confirm
    against the fixture).
    """
    calculator = PMICalculator(
        doc2word_vectorizer=CountVectorizer(min_df=0),
        doc2label_vectorizer=LabelCountVectorizer())

    # One extra label goes in the middle of the list, one at the very end.
    noisy_labels = (labels[:2] + [('haha', 'lala')] +
                    labels[2:] + [('non', 'existing')])
    pmi = calculator.from_texts(docs, noisy_labels)

    assert_equal(pmi.shape[1], 4)

    surviving_labels = [
        'information retrieval'.split(),
        'machine learning'.split(),
        'natural language processing'.split(),
        'text mining'.split(),
    ]
    expected_index2label = {
        position: tokens for position, tokens in enumerate(surviving_labels)
    }
    assert_equal(calculator.index2label_, expected_index2label)
Пример #3
0
def test_from_texts_nonexisting_label():
    """Check that extra candidate labels are absent from `index2label_`.

    Six labels are passed in (the four from `labels` plus
    ``('haha', 'lala')`` and ``('non', 'existing')``), but only four label
    columns and four `index2label_` entries are expected.
    """
    word_vectorizer = CountVectorizer(min_df=0)
    label_vectorizer = LabelCountVectorizer()
    calculator = PMICalculator(doc2word_vectorizer=word_vectorizer,
                               doc2label_vectorizer=label_vectorizer)

    candidate_labels = (
        labels[:2] + [('haha', 'lala')] + labels[2:] + [('non', 'existing')])
    pmi = calculator.from_texts(docs, candidate_labels)
    assert_equal(pmi.shape[1], 4)

    expected_index2label = dict(enumerate(
        phrase.split() for phrase in (
            'information retrieval',
            'machine learning',
            'natural language processing',
            'text mining',
        )))
    assert_equal(calculator.index2label_, expected_index2label)
Пример #4
0
def main():
    """Fit LDA on the tweet corpus, rank candidate labels and print them.

    Steps, in order:

    1. Load ``machine_learning_tweets.json`` and clean the tweets with
       `prepare_tweets`.
    2. Randomly set aside 10% of the cleaned tweets; the rest is the
       training data for the topic model.
    3. Fit a `LatentDirichletAllocation` model on bag-of-words counts.
    4. Extract candidate bigram labels and compute the word-to-label PMI
       matrix.
    5. Print the top words of every topic, followed by the ranked labels.

    The hyper-parameters (`n_features`, `n_topics`, `label_min_df`,
    `tag_constraints`, `n_cand_labels`, `n_top_words`, `n_labels`) are read
    from module-level names.
    """
    with open('machine_learning_tweets.json', 'r') as f:
        tweets = json.load(f)['tweets']

    clean_tweets = prepare_tweets(tweets)
    test_sample = random.sample(clean_tweets, int(0.1 * len(clean_tweets)))
    # NOTE: the set difference also drops duplicate tweets and does not
    # preserve the original order of `clean_tweets`.
    data_sample = list(set(clean_tweets) - set(test_sample))

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_sample)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)

    docs = [nltk.word_tokenize(doc) for doc in data_sample]
    finder = BigramLabelFinder('pmi', min_freq=label_min_df,
                               pos=tag_constraints)
    cand_labels = finder.find(docs, top_n=n_cand_labels)

    pmi_cal = PMICalculator(
        doc2word_vectorizer=CountVectorizer(max_df=0.95, min_df=5,
                                            max_features=n_features,
                                            stop_words='english'),
        doc2label_vectorizer=LabelCountVectorizer())
    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    ranked_labels = ranker.top_k_labels(topic_models=lda.components_,
                                        pmi_w2l=pmi_w2l,
                                        index2label=pmi_cal.index2label_,
                                        label_models=None,
                                        k=n_labels)

    # A single call-style print emits the heading plus one blank line on both
    # Python 2 and Python 3 (the former `print 'Labels'` / bare `print`
    # statements are a SyntaxError on Python 3).
    print('Labels\n')
    for i, topic_labels in enumerate(ranked_labels):
        # Every label is a sequence of tokens; join tokens with spaces and
        # labels with commas.
        label_text = ', '.join(' '.join(label) for label in topic_labels)
        print(u"Topic {}: {}\n".format(i, label_text))
Пример #5
0
def test_from_texts():
    """Verify `from_texts` output shape plus the word and label indices."""
    word_vectorizer = CountVectorizer(min_df=0)
    label_vectorizer = LabelCountVectorizer()
    calculator = PMICalculator(doc2word_vectorizer=word_vectorizer,
                               doc2label_vectorizer=label_vectorizer)

    pmi = calculator.from_texts(docs, labels)
    assert_equal(pmi.shape[1], 4)
    assert_equal(pmi.shape[0], 9)

    # Expected vocabulary, listed in index order.
    vocabulary = [
        u'information',
        u'language',
        u'learning',
        u'machine',
        u'mining',
        u'natural',
        u'processing',
        u'retrieval',
        u'text',
    ]
    assert_equal(calculator.index2word_,
                 {idx: word for idx, word in enumerate(vocabulary)})

    # Expected labels, each as a list of tokens, listed in index order.
    label_tokens = [
        'information retrieval'.split(),
        'machine learning'.split(),
        'natural language processing'.split(),
        'text mining'.split(),
    ]
    assert_equal(calculator.index2label_,
                 {idx: tokens for idx, tokens in enumerate(label_tokens)})
Пример #6
0
from nose.tools import assert_equal
from numpy.testing import assert_array_almost_equal
from chowmein.pmi import PMICalculator

# Document-to-word fixture: one row per document (3), one column per word (4).
d2w = np.asarray([
    [2, 0, 0, 1],
    [0, 0, 1, 1],
    [0, 3, 0, 1],
])

# Document-to-label fixture: one row per document (3), one column per label (2).
d2l = np.asarray([
    [1, 0],
    [0, 2],
    [1, 1],
])

# CSR copies of both fixtures, used to exercise the sparse-input path.
d2w_sparse = csr_matrix(d2w)
d2l_sparse = csr_matrix(d2l)

# Calculator shared by the tests in this module.
cal = PMICalculator()


def test_from_matrices_no_smoothing():
    """Compare `from_matrices` with `pseudo_count=0` to hand-computed values.

    The same expectation is checked for dense and for sparse inputs.
    """
    # The zero entries make np.log emit a warning and evaluate to -inf.
    unscaled = np.asarray([[0.25, 0.],
                           [0.16666667, 0.11111111],
                           [0., 0.33333333],
                           [0.16666667, 0.11111111]])
    expected = np.log(3 * unscaled)

    # Dense input first, then sparse input.
    input_pairs = ((d2w, d2l), (d2w_sparse, d2l_sparse))
    for word_matrix, label_matrix in input_pairs:
        actual = cal.from_matrices(word_matrix, label_matrix, pseudo_count=0)
        assert_array_almost_equal(actual, expected)
Пример #7
0
def get_topic_labels(corpus_path, n_topics,
                     n_top_words,
                     preprocessing_steps,
                     n_cand_labels, label_min_df,
                     label_tags, n_labels,
                     lda_random_state,
                     lda_n_iter):
    """
    Fit an LDA model on a line corpus and rank candidate labels per topic.

    Refer the arguments to `create_parser`

    Parameters
    ----------
    corpus_path : path handed to `load_line_corpus`.
    n_topics : number of topics for `lda.LDA`.
    n_top_words : number of top-weighted words printed for each topic.
    preprocessing_steps : container of step names; 'wordlen', 'stem' and
        'tag' are the names checked here.
    n_cand_labels : `top_n` passed to `BigramLabelFinder.find`.
    label_min_df : `min_freq` passed to `BigramLabelFinder`.
    label_tags : iterable of comma-separated POS tag strings, or
        ``['None']`` to disable tag constraints.
    n_labels : `k` passed to `LabelRanker.top_k_labels`.
    lda_random_state : `random_state` passed to `lda.LDA`.
    lda_n_iter : `n_iter` passed to `lda.LDA`.

    Returns
    -------
    Whatever `LabelRanker.top_k_labels` returns for the fitted topics.

    Raises
    ------
    AssertionError
        If tag constraints are given but 'tag' is not among
        `preprocessing_steps`.
    """
    print("Loading docs...")
    docs = load_line_corpus(corpus_path)

    if 'wordlen' in preprocessing_steps:
        print("Word length filtering...")
        wl_filter = CorpusWordLengthFilter(minlen=3)
        docs = wl_filter.transform(docs)

    if 'stem' in preprocessing_steps:
        print("Stemming...")
        stemmer = CorpusStemmer()
        docs = stemmer.transform(docs)

    if 'tag' in preprocessing_steps:
        print("POS tagging...")
        tagger = CorpusPOSTagger()
        # Kept separate from `docs`: the PMI step below uses untagged docs.
        tagged_docs = tagger.transform(docs)

    # Each "A,B" entry becomes the tuple ('A', 'B'); ['None'] means that no
    # constraint was requested.
    tag_constraints = []
    if label_tags != ['None']:
        tag_constraints = [tuple(tag.strip() for tag in tags.split(','))
                           for tags in label_tags]

    if not tag_constraints:
        tag_constraints = None

    print("Tag constraints: {}".format(tag_constraints))

    print("Generate candidate bigram labels(with POS filtering)...")
    finder = BigramLabelFinder('pmi', min_freq=label_min_df,
                               pos=tag_constraints)
    if tag_constraints:
        # Explicit raise instead of `assert` so the check also runs under
        # `python -O`; without it `tagged_docs` would be unbound below.
        # AssertionError is kept so callers see the same exception type.
        if 'tag' not in preprocessing_steps:
            raise AssertionError(
                'If tag constraint is applied, pos tagging(tag) should be performed')
        cand_labels = finder.find(tagged_docs, top_n=n_cand_labels)
    else:  # if no constraint, then use untagged docs
        cand_labels = finder.find(docs, top_n=n_cand_labels)

    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculate the PMI scores...")

    pmi_cal = PMICalculator(
        doc2word_vectorizer=WordCountVectorizer(
            min_df=5,
            stop_words=load_lemur_stopwords()),
        doc2label_vectorizer=LabelCountVectorizer())

    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter,
                    random_state=lda_random_state)
    # Reuse the document-word matrix stored on the PMI calculator.
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        # The stop index must be -(n + 1): a reversed slice stopping at -n
        # yields only n - 1 items.
        top_word_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_words = [pmi_cal.index2word_[id_]
                       for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(topic_models=model.topic_word_,
                               pmi_w2l=pmi_w2l,
                               index2label=pmi_cal.index2label_,
                               label_models=None,
                               k=n_labels)
Пример #8
0
def get_topic_labels(
    corpus_path,
    n_topics,
    n_top_words,
    preprocessing_steps,
    n_cand_labels,
    label_min_df,
    label_tags,
    n_labels,
    lda_random_state,
    lda_n_iter,
):
    """
    Fit an LDA model on a line corpus and rank candidate labels per topic.

    Refer the arguments to `create_parser`

    Parameters
    ----------
    corpus_path : path handed to `load_line_corpus`.
    n_topics : number of topics for `lda.LDA`.
    n_top_words : number of top-weighted words printed for each topic.
    preprocessing_steps : container of step names; "wordlen", "stem" and
        "tag" are the names checked here.
    n_cand_labels : `top_n` passed to `BigramLabelFinder.find`.
    label_min_df : `min_freq` passed to `BigramLabelFinder`.
    label_tags : iterable of comma-separated POS tag strings, or
        ``["None"]`` to disable tag constraints.
    n_labels : `k` passed to `LabelRanker.top_k_labels`.
    lda_random_state : `random_state` passed to `lda.LDA`.
    lda_n_iter : `n_iter` passed to `lda.LDA`.

    Returns
    -------
    Whatever `LabelRanker.top_k_labels` returns for the fitted topics.

    Raises
    ------
    AssertionError
        If tag constraints are given but "tag" is not among
        `preprocessing_steps`.
    """
    print("Loading docs...")
    docs = load_line_corpus(corpus_path)

    if "wordlen" in preprocessing_steps:
        print("Word length filtering...")
        wl_filter = CorpusWordLengthFilter(minlen=3)
        docs = wl_filter.transform(docs)

    if "stem" in preprocessing_steps:
        print("Stemming...")
        stemmer = CorpusStemmer()
        docs = stemmer.transform(docs)

    if "tag" in preprocessing_steps:
        print("POS tagging...")
        tagger = CorpusPOSTagger()
        # Kept separate from `docs`: the PMI step below uses untagged docs.
        tagged_docs = tagger.transform(docs)

    # Each "A,B" entry becomes the tuple ("A", "B"); ["None"] means that no
    # constraint was requested.
    tag_constraints = []
    if label_tags != ["None"]:
        tag_constraints = [tuple(tag.strip() for tag in tags.split(",")) for tags in label_tags]

    if not tag_constraints:
        tag_constraints = None

    print("Tag constraints: {}".format(tag_constraints))

    print("Generate candidate bigram labels(with POS filtering)...")
    finder = BigramLabelFinder("pmi", min_freq=label_min_df, pos=tag_constraints)
    if tag_constraints:
        # Explicit raise instead of `assert` so the check also runs under
        # `python -O`; without it `tagged_docs` would be unbound below.
        # AssertionError is kept so callers see the same exception type.
        if "tag" not in preprocessing_steps:
            raise AssertionError("If tag constraint is applied, pos tagging(tag) should be performed")
        cand_labels = finder.find(tagged_docs, top_n=n_cand_labels)
    else:  # if no constraint, then use untagged docs
        cand_labels = finder.find(docs, top_n=n_cand_labels)

    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculate the PMI scores...")

    pmi_cal = PMICalculator(
        doc2word_vectorizer=WordCountVectorizer(min_df=5, stop_words=load_lemur_stopwords()),
        doc2label_vectorizer=LabelCountVectorizer(),
    )

    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter, random_state=lda_random_state)
    # Reuse the document-word matrix stored on the PMI calculator.
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        # The stop index must be -(n + 1): a reversed slice stopping at -n
        # yields only n - 1 items.
        top_word_ids = np.argsort(topic_dist)[: -(n_top_words + 1) : -1]
        topic_words = [pmi_cal.index2word_[id_] for id_ in top_word_ids]
        print("Topic {}: {}".format(i, " ".join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(
        topic_models=model.topic_word_, pmi_w2l=pmi_w2l, index2label=pmi_cal.index2label_, label_models=None, k=n_labels
    )