Example #1
from gensim import corpora, models
from gensim.models import LsiModel


class MyModel:
    def __init__(self, dict_file=None, corpus_model=None, corpus_file=None):
        self.dict_file = dict_file
        self.dictionary = None
        self.corpus = None

        if dict_file is not None:
            self.dictionary = corpora.Dictionary.load(dict_file)
        if corpus_model:
            # A ready-made corpus object takes precedence over a corpus file.
            self.corpus = corpus_model
        elif corpus_file:
            self.corpus = corpora.MmCorpus(corpus_file)

        self.tf_idf_model = None
        self.corpus_tf_idf = None
        self.lsi_model = None
        self.corpus_lsi = None

        self.lda_model = None
        self.corpus_lda = None

    def tf_idf(self):
        if self.corpus is None:
            return
        self.tf_idf_model = models.TfidfModel(corpus=self.corpus, normalize=True)
        # Lazily re-weight the bag-of-words corpus with the trained TF-IDF model.
        self.corpus_tf_idf = self.tf_idf_model[self.corpus]

    def lsi(self):
        self.tf_idf()
        if self.corpus_tf_idf is None:
            return
        # Pass the dictionary (when available) so topics print as words, not ids.
        self.lsi_model = LsiModel(self.corpus_tf_idf, id2word=self.dictionary,
                                  num_topics=2)
        self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
        if self.dictionary:
            # Topics are zero-indexed: with num_topics=2, valid ids are 0 and 1.
            print(self.lsi_model.print_topic(1))

    def lda(self):
        self.lda_model = models.LdaModel(corpus=self.corpus, id2word=self.dictionary)
        self.corpus_lda = self.lda_model[self.corpus]

    def add_document_lsi(self, addition_corpus_tf_idf, addition_vector_tf_idf):
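        # Fold already TF-IDF-weighted documents into the trained LSI space,
        # then project a single query vector into that space.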
        self.lsi_model.add_documents(addition_corpus_tf_idf)
        lsi_vector = self.lsi_model[addition_vector_tf_idf]
        return lsi_vector

    def save_lsi(self, name='/serialise/model.lsi'):
        self.lsi_model.save(name)

    def save_lda(self, name='/serialise/model.lda'):
        self.lda_model.save(name)

    @staticmethod
    def load_lsi(name='/tmp/model.lsi'):
        my_model = MyModel()
        my_model.lsi_model = models.LsiModel.load(name)
        return my_model
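A minimal driver sketch for the class above. The two toy documents and the /tmp paths are illustrative assumptions, not part of the original example:

from gensim import corpora

texts = [['human', 'machine', 'interface'],
         ['graph', 'minors', 'survey']]

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/example.dict')
bow_corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/example.mm', bow_corpus)

model = MyModel(dict_file='/tmp/example.dict', corpus_file='/tmp/example.mm')
model.lsi()  # chains tf_idf() -> LsiModel and prints one topic
model.save_lsi(name='/tmp/model.lsi')
reloaded = MyModel.load_lsi('/tmp/model.lsi')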
Example #2
import numpy as np
import lda  # the standalone `lda` package (collapsed Gibbs sampling)
from gensim.corpora import Dictionary
from gensim.models import LsiModel, ldamodel

# load_line_corpus, CorpusWordLengthFilter, CorpusStemmer, CorpusPOSTagger,
# BigramLabelFinder, PMICalculator, WordCountVectorizer, LabelCountVectorizer,
# load_lemur_stopwords and LabelRanker are project-local helpers.


def get_topic_labels(corpus_path, n_topics,
                     n_top_words,
                     preprocessing_steps,
                     n_cand_labels, label_min_df,
                     label_tags, n_labels,
                     lda_random_state,
                     lda_n_iter):
    """
    Refer the arguments to `create_parser`
    """
    # gensim baselines: fit LDA and LSI on a bag-of-words corpus for comparison
    print("Loading docs...")
    docs = load_line_corpus(corpus_path)
    common_dictionary = Dictionary(docs)
    common_corpus = [common_dictionary.doc2bow(text) for text in docs]

    lda1 = ldamodel.LdaModel(common_corpus, num_topics=10,
                             id2word=common_dictionary)
    lsi = LsiModel(corpus=common_corpus, id2word=common_dictionary,
                   num_topics=400)

    lda1.print_topics(5)
    print("---------------------------------------")
    print(lsi.print_topic(0))

    if 'wordlen' in preprocessing_steps:
        print("Word length filtering...")
        wl_filter = CorpusWordLengthFilter(minlen=3)
        docs = wl_filter.transform(docs)

    if 'stem' in preprocessing_steps:
        print("Stemming...")
        stemmer = CorpusStemmer()
        docs = stemmer.transform(docs)

    if 'tag' in preprocessing_steps:
        print("POS tagging...")
        tagger = CorpusPOSTagger()
        tagged_docs = tagger.transform(docs)

    tag_constraints = []
    if label_tags != ['None']:
        for tags in label_tags:
            tag_constraints.append(tuple(map(lambda t: t.strip(),
                                             tags.split(','))))

    if len(tag_constraints) == 0:
        tag_constraints = None

    print("Tag constraints: {}".format(tag_constraints))

    print("Generate candidate bigram labels(with POS filtering)...")
    finder = BigramLabelFinder('pmi', min_freq=label_min_df,
                               pos=tag_constraints)
    if tag_constraints:
        assert 'tag' in preprocessing_steps, \
            'If a tag constraint is applied, POS tagging (tag) must be performed'
        cand_labels = finder.find(tagged_docs, top_n=n_cand_labels)
    else:  # if no constraint, then use untagged docs
        cand_labels = finder.find(docs, top_n=n_cand_labels)

    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculate the PMI scores...")

    pmi_cal = PMICalculator(
        doc2word_vectorizer=WordCountVectorizer(
            min_df=5,
            stop_words=load_lemur_stopwords()),
        doc2label_vectorizer=LabelCountVectorizer())

    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)
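    # `from_texts` builds both the document-word count matrix (pmi_cal.d2w_,
    # fed to LDA below) and the word-label PMI matrix returned here.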

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter,
                    random_state=lda_random_state)
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        # argsort ascending, then take the last n_top_words ids in reverse order
        top_word_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_words = [pmi_cal.index2word_[id_]
                       for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(topic_models=model.topic_word_,
                               pmi_w2l=pmi_w2l,
                               index2label=pmi_cal.index2label_,
                               label_models=None,
                               k=n_labels)
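A hypothetical invocation of the function above. Every argument value here is an illustrative stand-in for what `create_parser` would supply, and the loop assumes `top_k_labels` returns, per topic, a list of bigram label tuples:

labels = get_topic_labels(corpus_path='data/docs.txt',
                          n_topics=10,
                          n_top_words=15,
                          preprocessing_steps=['wordlen', 'stem', 'tag'],
                          n_cand_labels=100,
                          label_min_df=5,
                          label_tags=['NN,NN', 'JJ,NN'],
                          n_labels=8,
                          lda_random_state=12345,
                          lda_n_iter=400)

for i, topic_labels in enumerate(labels):
    print('Topic {}: {}'.format(
        i, ', '.join(' '.join(label) for label in topic_labels)))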