Python LdaMallet.read_doctopics 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gensim.models.wrappers

클래스/타입: LdaMallet

메소드/함수: read_doctopics

hotexamples.com에서의 예제들: 2

Python LdaMallet.read_doctopics - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python gensim.models.wrappers.LdaMallet.read_doctopics의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 개선하는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

LdaMallet(30)

load(24)

save(17)

show_topics(13)

print_topics(8)

fdoctopics(5)

get_topics(4)

show_topic(4)

get_document_topics(2)

read_doctopics(2)

load_document_topics(1)

print_topic(1)

train(1)

예제 #1

파일 보기

파일: data.py 프로젝트: Ali-Omrani/NTAP

    def lda(self, column, method='mallet', save_model=None, load_model=None):
        """Train a MALLET LDA model on ``column`` and store document-topic weights.

        The bag-of-words corpus for ``column`` is taken from the
        ``__bag_of_words`` cache when that column is cached, otherwise it is
        built with ``__get_bag_of_words``. After training, one dense weight
        vector per document is stored in ``self.features["lda"]`` and the
        result of ``model.get_topics()`` in ``self.feature_names["lda"]``.

        Args:
            column: Key identifying the text column to model.
            method: LDA backend; only ``'mallet'`` is supported.
            save_model: Currently unused by this method.
            load_model: Currently unused by this method.

        Raises:
            ValueError: If ``method`` is anything other than ``'mallet'``.
        """
        if method != 'mallet':
            raise ValueError("Invalid paramater for LDA.method: {}".format(method))
        print("Mallet LDA")

        # MALLET writes its working files under this prefix directory.
        tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(tmp_dir, exist_ok=True)

        if not hasattr(self, "vocab"):
            self.__learn_vocab(column)

        # Look the column up directly: a non-empty cache that only holds
        # *other* columns must fall back to building the corpus instead of
        # raising KeyError.
        if column in self.__bag_of_words:
            docs, id2word = self.__bag_of_words[column]
        else:
            docs, id2word = self.__get_bag_of_words(column)

        model = LdaMallet(mallet_path=self.mallet_path,
                          id2word=id2word,
                          prefix=tmp_dir,
                          num_topics=self.num_topics,
                          iterations=self.lda_max_iter,
                          optimize_interval=20)
        model.train(docs)

        # Place each weight at its topic id so every row has num_topics
        # entries in topic order, even if read_doctopics omits or reorders
        # (topic_id, weight) pairs -- NOTE(review): confirm against the gensim
        # version in use.
        doc_topics = []
        for doc_vec in model.read_doctopics(model.fdoctopics()):
            weights = np.zeros(self.num_topics)
            for topic_id, weight in doc_vec:
                weights[topic_id] = weight
            doc_topics.append(weights)

        self.features["lda"] = np.array(doc_topics)
        self.feature_names["lda"] = model.get_topics()

예제 #2

파일 보기

파일: gensim_topic_model.py 프로젝트: flnasc/dr_concept_searching

def main():
    """Run the MALLET LDA pipeline on the corpus and export topic tables.

    Steps: load the corpus from ``CORPUS_PATH``, drop short segments, keep
    only tokens of length >= 3 that are not stop words, train an
    ``LdaMallet`` model, then write the topic-word and topic-document
    tables to CSV files named after the model's seed, iteration count and
    topic count.

    Returns:
        int: Always ``0``.
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(CORPUS_PATH)

    # The vectorizer is only handed to remove_short_segs; the bag-of-words
    # corpus itself is built with a gensim Dictionary further down.
    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(
        lowercase=True,
        max_df=MAX_DF,
        min_df=MIN_DF,
        token_pattern=r"(?u)\b\w\w\w+\b",
    )

    _, segment_texts = remove_short_segs(corpus, vectorizer)
    tokenized_segments = [text.split() for text in segment_texts]

    # One (possibly empty) token list per segment, stop words and tokens
    # shorter than three characters removed.
    filtered_segments = [
        [token for token in tokens
         if token not in stop_words and len(token) >= 3]
        for tokens in tokenized_segments
    ]

    # Map tokens to ids and convert every segment to bag-of-words form.
    id2word = Dictionary(filtered_segments)
    corp = [id2word.doc2bow(tokens) for tokens in filtered_segments]

    # Passing corpus= to the constructor trains the model immediately.
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(
        path_to_mallet_binary,
        corpus=corp,
        num_topics=14,
        id2word=id2word,
        optimize_interval=1,
        random_seed=9,
        iterations=5,
    )

    def run_suffix():
        # "<seed>_<iterations>_<num_topics>.csv", read from the model at
        # call time.
        parts = (
            mallet_model.random_seed,
            mallet_model.iterations,
            mallet_model.num_topics,
        )
        return "_".join(str(part) for part in parts) + ".csv"

    # Materialise the document-topic rows (the result is not used further).
    doc_topics = list(
        mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False)
    )

    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    topic_word.write_to_csv("../output/topic_" + run_suffix())

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + run_suffix(), num_docs=50)

    return 0