예제 #1
0
파일: data.py 프로젝트: Ali-Omrani/NTAP
    def lda(self, column, method='mallet', save_model=None, load_model=None):
        """Fit a Mallet LDA topic model on ``column`` and store the result.

        The per-document topic weights are written to
        ``self.features["lda"]`` and the value of ``model.get_topics()`` to
        ``self.feature_names["lda"]``. Mallet's working files are placed
        under a ``mallet_lda/`` directory inside the system temp directory.

        Args:
            column: Column identifier passed to the vocabulary and
                bag-of-words helpers and used as the key into the
                bag-of-words cache.
            method: LDA backend. Only ``'mallet'`` is accepted.
            save_model: Not used by this method.
            load_model: Not used by this method.

        Raises:
            ValueError: If ``method`` is anything other than ``'mallet'``.
        """
        if method != 'mallet':
            raise ValueError("Invalid paramater for LDA.method: {}".format(method))
        print("Mallet LDA")

        # exist_ok=True avoids the check-then-create race when two runs
        # start at the same time. The trailing slash is kept because the
        # path is handed to LdaMallet as a file-name prefix.
        tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
        os.makedirs(tmp_dir, exist_ok=True)

        if not hasattr(self, "vocab"):
            self.__learn_vocab(column)

        # Test for this column specifically: a cache that only holds other
        # columns must fall through to recomputation instead of raising
        # KeyError. (Assumes the cache is a mapping keyed by column.)
        if column in self.__bag_of_words:
            docs, id2word = self.__bag_of_words[column]
        else:
            docs, id2word = self.__get_bag_of_words(column)

        model = LdaMallet(mallet_path=self.mallet_path,
                          id2word=id2word,
                          prefix=tmp_dir,
                          num_topics=self.num_topics,
                          iterations=self.lda_max_iter,
                          optimize_interval=20)
        model.train(docs)

        # Each document comes back as (topic_id, weight) pairs; only the
        # weights are kept, in the order Mallet reports them.
        doc_topics = []
        for doc_vec in model.read_doctopics(model.fdoctopics()):
            _, weights = zip(*doc_vec)
            doc_topics.append(np.array(weights))

        self.features["lda"] = np.array(doc_topics)
        self.feature_names["lda"] = model.get_topics()
def main():
    """Run the Mallet LDA concept-detection pipeline and write CSV reports.

    Loads the corpus from ``CORPUS_PATH``, drops short segments via
    ``remove_short_segs``, removes stop words and tokens shorter than three
    characters, trains an ``LdaMallet`` model on the resulting bag-of-words
    corpus, and writes a topic-word report and a topic-document report.

    Returns:
        int: Always 0.
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(CORPUS_PATH)

    # Build the stop-word set once so the per-token membership test below
    # is a constant-time lookup.
    stop_words = frozenset(load_stop_words("data/stopwords-fr.txt"))

    # In this function the vectorizer is only handed to remove_short_segs.
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    _, segment_texts = remove_short_segs(corpus, vectorizer)

    # Whitespace-tokenize every segment, keeping tokens that are not stop
    # words and are at least three characters long. A segment with no
    # surviving tokens is kept as an empty list so positions are preserved.
    filtered_docs = [
        [token for token in segment.split()
         if token not in stop_words and len(token) >= 3]
        for segment in segment_texts
    ]

    # Build the gensim vocabulary and the bag-of-words corpus.
    id2word = Dictionary(filtered_docs)
    corp = [id2word.doc2bow(tokens) for tokens in filtered_docs]

    # Passing corpus= to the constructor supplies the training data up front.
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=14,
                             id2word=id2word,
                             optimize_interval=1,
                             random_seed=9,
                             iterations=5)

    # Shared "<seed>_<iterations>_<num_topics>" part of both report names.
    run_tag = "{}_{}_{}".format(mallet_model.random_seed,
                                mallet_model.iterations,
                                mallet_model.num_topics)

    # NOTE(review): this report is written under "../output/" while the
    # topic-document report below is written under "output/" -- confirm
    # which directory is intended.
    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    topic_word.write_to_csv("../output/topic_" + run_tag + ".csv")

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + run_tag + ".csv",
                           num_docs=50)

    return 0