class LdaMalletHandler:
    """Convenience wrapper that trains, saves, loads and queries an ``LdaMallet`` model.

    All artifacts of a model live under ``ldamodels/<model_name>/`` relative to
    the current working directory.  A ``DocumentRetriever`` built from the
    model's doc-topics file is created lazily on first use and discarded
    whenever a different model is trained or loaded, so queries never hit the
    doc-topics file of a previous model.
    """

    def __init__(self, mallet_path):
        """Store the path of the MALLET executable used for training/inference."""
        self.mallet_path = mallet_path
        # Lazily-built DocumentRetriever for the current model (None = not built).
        self.doc_retriever = None

    def _retriever(self):
        """Return the DocumentRetriever for the current model, building it on demand."""
        # getattr() keeps this working on instances created without __init__.
        if getattr(self, 'doc_retriever', None) is None:
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever

    def run_model(self, model_name, corpus, **kwargs):
        """Train a new model named ``model_name`` on ``corpus``.

        Args:
            model_name: Name of the sub-directory of ``ldamodels/`` that
                receives the MALLET working files.
            corpus: Tokenised documents.  It is iterated twice (dictionary
                construction, then bag-of-words conversion), so it must be
                re-iterable.
            **kwargs: Forwarded unchanged to ``LdaMallet``.
        """
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(
            self.mallet_path,
            corpus_bow,
            id2word=self.dictionary,
            prefix="./ldamodels/" + model_name + "/",
            **kwargs
        )
        # A retriever built for a previous model would read stale data.
        self.doc_retriever = None

    def save_model(self):
        """Persist the current model and its dictionary under ``ldamodels/<model_name>/``."""
        self.model.save("ldamodels/" + self.model_name + "/model.model")
        self.dictionary.save("ldamodels/" + self.model_name + "/dict.dict")

    def load_model(self, model_name):
        """Load the model and dictionary previously written by ``save_model``.

        The loaded model's ``mallet_path`` is overwritten with this handler's
        path, so the executable location stored at save time is not used.
        """
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load("ldamodels/" + self.model_name + "/dict.dict")
        self.model = LdaMallet.load("ldamodels/" + self.model_name + "/model.model")
        self.model.mallet_path = self.mallet_path
        # A retriever built for a previous model would read stale data.
        self.doc_retriever = None

    def doc_topics(self, doc_idx):
        """Return ``DocumentRetriever.doc_topics(doc_idx)`` for the current model."""
        return self._retriever().doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        """Infer topics for a document outside the training corpus.

        Args:
            ext_doc: Tokenised document.

        Returns:
            The entries produced by ``self.model[bow]``, ordered by their
            second element (the topic weight) from highest to lowest.
        """
        doc_bow = self.dictionary.doc2bow(ext_doc)
        return sorted(self.model[doc_bow], key=lambda entry: entry[1], reverse=True)

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        """Find the ``n`` training documents most similar to an external document.

        The topic weights inferred for ``ext_doc`` are passed, in the order the
        model returns them, to ``DocumentRetriever.n_most_similar`` together
        with ``n`` and ``metric``; its result is returned unchanged.
        """
        retriever = self._retriever()
        doc_bow = self.dictionary.doc2bow(ext_doc)
        weights = [entry[1] for entry in self.model[doc_bow]]
        return retriever.n_most_similar(weights, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        """Find the ``n`` documents closest to a pure single-topic distribution.

        A vector of ``num_topics`` zeros with a 1 at index ``topic`` is used as
        the query for ``DocumentRetriever.n_most_similar``.
        """
        retriever = self._retriever()
        one_hot = np.zeros(self.model.num_topics)
        one_hot[topic] = 1
        return retriever.n_most_similar(one_hot, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        """Return each topic as a space-separated string of its words.

        Args:
            num_topics: Number of topics to request from ``print_topics``;
                ``-1`` means all topics of the model.
            num_words: Number of words requested per topic.

        Returns:
            One string per topic, in the order given by ``print_topics``.
        """
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics, num_words=num_words):
            # The description looks like 0.5*"word" + ...; after splitting on
            # double quotes, the quoted words sit at the odd positions.
            words = topic[1].split("\"")[1::2]
            string_topics.append(" ".join(words))
        return string_topics
# Example #2
# 0
    def lda(self, column, method='mallet', save_model=None, load_model=None):
        """Train a MALLET LDA model on ``column`` and store the doc-topic features.

        Results are written to ``self.features["lda"]`` (one array of topic
        weights per document) and ``self.feature_names["lda"]`` (the value of
        ``model.get_topics()``).

        Args:
            column: Column whose bag-of-words representation is modelled.
            method: Only ``'mallet'`` is accepted.
            save_model: Currently unused.
            load_model: Currently unused.

        Raises:
            ValueError: If ``method`` is not ``'mallet'``.
        """
        if method != 'mallet':
            raise ValueError("Invalid paramater for LDA.method: {}".format(method))
        print("Mallet LDA")

        # MALLET working files go to a fixed sub-directory of the system temp
        # dir; exist_ok avoids the check-then-create race between processes.
        tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
        os.makedirs(tmp_dir, exist_ok=True)

        if not hasattr(self, "vocab"):
            self.__learn_vocab(column)

        # NOTE(review): a non-empty cache is assumed to contain `column`;
        # confirm that every column is cached together.
        if len(self.__bag_of_words) != 0:
            docs, id2word = self.__bag_of_words[column]
        else:
            docs, id2word = self.__get_bag_of_words(column)

        model = LdaMallet(mallet_path=self.mallet_path,
                          id2word=id2word,
                          prefix=tmp_dir,
                          num_topics=self.num_topics,
                          iterations=self.lda_max_iter,
                          optimize_interval=20)
        model.train(docs)

        # Each entry read back is a sequence of (topic_id, weight) pairs; only
        # the weights are kept, in the order they appear.
        doc_topics = [
            np.array([weight for _, weight in doc_vec])
            for doc_vec in model.read_doctopics(model.fdoctopics())
        ]
        self.features["lda"] = np.array(doc_topics)
        self.feature_names["lda"] = model.get_topics()
        return
def main():
    """Train a MALLET LDA model on the CSV corpus and export topic tables.

    Steps: load the corpus from ``CORPUS_PATH``, drop short segments, remove
    stop words and tokens shorter than three characters, train ``LdaMallet``
    and write the topic-word and topic-document tables to CSV files whose
    names encode the random seed, iteration count and number of topics.

    Returns:
        0 once both CSV files have been written.
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(CORPUS_PATH)

    # A set gives constant-time membership tests in the token filter below.
    # Assumes load_stop_words yields hashable tokens (strings) - confirm.
    stop_words = set(load_stop_words("data/stopwords-fr.txt"))

    # The vectorizer is only handed to remove_short_segs here.
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    # Only the second return value (the segment texts) is needed.
    _, segment_texts = remove_short_segs(corpus, vectorizer)

    # Tokenise on whitespace; keep tokens that are not stop words and have at
    # least three characters.
    filtered_docs = [
        [
            token for token in segment.split()
            if token not in stop_words and len(token) >= 3
        ]
        for segment in segment_texts
    ]

    id2word = Dictionary(filtered_docs)
    corp = [id2word.doc2bow(text) for text in filtered_docs]

    path_to_mallet_binary = "Mallet/bin/mallet"

    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=14,
                             id2word=id2word,
                             optimize_interval=1,
                             random_seed=9,
                             iterations=5)

    # Shared "<seed>_<iterations>_<num_topics>.csv" tail of both output names.
    name_suffix = "{}_{}_{}.csv".format(mallet_model.random_seed,
                                        mallet_model.iterations,
                                        mallet_model.num_topics)

    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    # NOTE(review): this file goes to "../output/" while the topic-doc file
    # below goes to "output/" - confirm which directory is intended.
    topic_word.write_to_csv("../output/topic_" + name_suffix)

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + name_suffix, num_docs=50)

    return 0
# Example #4
# 0
class LdaMalletHandler(TransformerMixin, BaseEstimator):
    """Estimator/transformer wrapper around ``LdaMallet``.

    ``fit`` expects a sparse document-term matrix (documents in rows) produced
    by ``vectorizer``; ``transform`` turns such a matrix into a dense
    documents x topics matrix.

    Args:
        n_components: Number of topics.
        mallet_path: Path to the MALLET executable.
        prefix: Prefix passed to ``LdaMallet`` for its working files.
        iterations: Number of training iterations.
        vectorizer: Fitted vectorizer exposing ``vocabulary_`` (word -> column
            index) that produced the matrices given to ``fit``.
    """

    def __init__(self,
                 n_components=100,
                 mallet_path=None,
                 prefix=None,
                 iterations=1000,
                 vectorizer=None):
        self.n_components = n_components
        self.mallet_path = mallet_path
        self.prefix = prefix
        self.iterations = iterations
        self.vectorizer = vectorizer

    def vect2gensim(self, vectorizer, dtmatrix):
        """Convert a sparse document-term matrix into a gensim corpus and dictionary.

        Returns:
            A ``(corpus, dictionary)`` tuple.
        """
        corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
        # vocabulary_ maps word -> column index; gensim needs index -> word.
        id2word = {
            index: word for word, index in vectorizer.vocabulary_.items()
        }
        dictionary = Dictionary.from_corpus(corpus_vect_gensim,
                                            id2word=id2word)
        return (corpus_vect_gensim, dictionary)

    def fit(self, X, y=None):
        """Train the MALLET model on document-term matrix ``X``.

        ``y`` is ignored.  Returns ``self``.
        """
        print('vect2gensim')
        corpus, dictionary = self.vect2gensim(self.vectorizer, X)
        # prefix was previously stored but never forwarded, so the configured
        # location was silently ignored.
        self.model = LdaMallet(self.mallet_path,
                               iterations=self.iterations,
                               corpus=corpus,
                               num_topics=self.n_components,
                               id2word=dictionary,
                               prefix=self.prefix)
        return self

    def transform(self, X):
        """Return a dense ``(X.shape[0], n_components)`` matrix of topic weights.

        Topics that the model does not report for a document stay at 0.
        """
        corpus = Sparse2Corpus(X, documents_columns=False)
        doc_topic = self.model[corpus]
        mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
        for row, doc in enumerate(doc_topic):
            for topic in doc:
                mat[row][topic[0]] = topic[1]
        return mat

    def get_doc_topic_matrix(self):
        """Parse the model's doc-topics file into a float64 array.

        The first two whitespace-separated fields of every line are dropped
        and the remaining fields are parsed as floats.  Blank lines and lines
        starting with ``#`` (a header) are skipped.
        """
        # The context manager closes the file even if parsing fails.
        with open(self.model.fdoctopics(), "r") as doctopics_file:
            lines = doctopics_file.read().splitlines()
        rows = []
        for line in lines:
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                continue
            rows.append(fields[2:])
        return np.array(rows, dtype=np.float64)

    def get_topic_words_matrix(self):
        """Return ``self.model.get_topics()``."""
        return self.model.get_topics()
# Example #5
# 0
def main():
    """Train a 10-topic MALLET LDA model and export its topics to Excel files.

    The dictionary and model are rebuilt and saved on every run.

    NOTE(review): the executable and all output locations are hard-coded
    absolute Windows paths; consider making them parameters.

    Returns:
        A tuple ``(texts, word_id, topic_words, doc_topics, num_topics)``.
    """
    num_topics = 10
    # Raw string: "\M" is not a valid escape sequence in a normal literal.
    mallet_path = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")

    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]

    mallet_lda_model = LdaMallet(mallet_path=mallet_path,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')

    # Top 20 words of every topic, exported to a spreadsheet.
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )

    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))

    # Document-topic data is read back from the file MALLET wrote.
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics