Python LdaMallet.fdoctopics примеры использования

Язык программирования: Python

Пространство имен/Пакет: gensim.models.wrappers

Класс/Тип: LdaMallet

Метод/Функция: fdoctopics

Примеров на hotexamples.com: 5

Python LdaMallet.fdoctopics — найдено 5 примеров. Это лучшие примеры Python-кода для gensim.models.wrappers.LdaMallet.fdoctopics, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

LdaMallet(30)

load(24)

save(17)

show_topics(13)

print_topics(8)

fdoctopics(5)

get_topics(4)

show_topic(4)

get_document_topics(2)

read_doctopics(2)

load_document_topics(1)

print_topic(1)

train(1)

Пример #1

Показать файл

Файл: LdaMalletHandler.py Проект: siqueiralex/LdaMalletHandler

class LdaMalletHandler:
    """Convenience wrapper around a gensim ``LdaMallet`` topic model.

    Models and dictionaries are written to / read from
    ``ldamodels/<model_name>/`` relative to the current working directory.
    Document-level queries go through a ``DocumentRetriever`` that is built
    lazily from the model's doc-topics file and cached on the instance.
    """

    def __init__(self, mallet_path):
        """Store the path of the Mallet executable handed to ``LdaMallet``."""
        self.mallet_path = mallet_path
        self.doc_retriever = None

    def _get_doc_retriever(self):
        """Return the cached ``DocumentRetriever``, creating it on first use."""
        # getattr() rather than a plain attribute read so that instances
        # created without running __init__ keep working.
        if getattr(self, 'doc_retriever', None) is None:
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever

    def run_model(self, model_name, corpus, **kwargs):
        """Train a new model on ``corpus`` and keep it on the instance.

        Args:
            model_name: Sub-directory of ``ldamodels`` used for Mallet's
                working files (and later by ``save_model``).
            corpus: Documents to train on. It is traversed twice (once to
                build the dictionary, once to build the bag-of-words
                corpus), so it must be re-iterable, not a one-shot generator.
            **kwargs: Forwarded unchanged to ``LdaMallet``.
        """
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/"+model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path, corpus_bow, id2word=self.dictionary, prefix="./ldamodels/"+model_name+"/", **kwargs)
        # A retriever built for a previous model would be stale now.
        self.doc_retriever = None

    def save_model(self):
        """Persist the current model and dictionary under ``ldamodels/<model_name>/``."""
        self.model.save("ldamodels/"+self.model_name+"/model.model")
        self.dictionary.save("ldamodels/"+self.model_name+"/dict.dict")

    def load_model(self, model_name):
        """Load a model and dictionary previously written by ``save_model``.

        The loaded model's ``mallet_path`` is overwritten with this handler's
        path, so a model saved on another machine can still be used here.
        """
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load("ldamodels/"+self.model_name+"/dict.dict")
        self.model = LdaMallet.load("ldamodels/"+self.model_name+"/model.model")
        self.model.mallet_path = self.mallet_path
        # A retriever built for a previous model would be stale now.
        self.doc_retriever = None

    def doc_topics(self, doc_idx):
        """Return the retriever's topic information for training document ``doc_idx``."""
        return self._get_doc_retriever().doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        """Infer topics for an unseen document.

        Args:
            ext_doc: The document, in the form ``dictionary.doc2bow`` accepts.

        Returns:
            The model's ``(topic_id, weight)`` pairs, sorted by descending
            weight.
        """
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        """Find the ``n`` training documents most similar to an unseen document.

        Args:
            ext_doc: The document, in the form ``dictionary.doc2bow`` accepts.
            n: Number of results requested from the retriever.
            metric: Distance metric name forwarded to the retriever.

        Returns:
            Whatever ``DocumentRetriever.n_most_similar`` returns.
        """
        retriever = self._get_doc_retriever()
        doc_bow = self.dictionary.doc2bow(ext_doc)
        # Place each weight at its topic id instead of relying on position:
        # gensim omits topics whose weight is below its eps threshold, which
        # would otherwise shorten and misalign the query vector.
        topics = [0.0] * self.model.num_topics
        for topic_id, weight in self.model[doc_bow]:
            topics[topic_id] = weight
        return retriever.n_most_similar(topics, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        """Find the ``n`` training documents closest to a pure ``topic`` vector.

        Args:
            topic: Index of the topic of interest.
            n: Number of results requested from the retriever.

        Returns:
            Whatever ``DocumentRetriever.n_most_similar`` returns.
        """
        retriever = self._get_doc_retriever()
        # One-hot vector: all mass on the requested topic.
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        return retriever.n_most_similar(topics, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        """Return each topic as a space-separated string of its top words.

        Args:
            num_topics: How many topics to render; ``-1`` means all of them.
            num_words: How many words to request per topic.

        Returns:
            A list with one string per topic returned by ``print_topics``.
        """
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics, num_words=num_words):
            # The words are the double-quoted parts of the topic string, i.e.
            # the odd-indexed pieces after splitting on '"'.
            pieces = topic[1].split("\"")
            string_topics.append(" ".join(pieces[1::2]))
        return string_topics

Пример #2

Показать файл

Файл: data.py Проект: Ali-Omrani/NTAP

    def lda(self, column, method='mallet', save_model=None, load_model=None):
        """Train a Mallet LDA model on ``column`` and store the resulting features.

        Mallet's working files go to a ``mallet_lda/`` directory under the
        system temp directory. On success ``self.features["lda"]`` holds one
        row of topic weights per document and ``self.feature_names["lda"]``
        holds the result of the model's ``get_topics()``.

        Args:
            column: Name of the column whose text is modelled.
            method: LDA backend; only ``'mallet'`` is supported.
            save_model: Currently unused.
            load_model: Currently unused.

        Raises:
            ValueError: If ``method`` is not ``'mallet'``.
        """
        if method != 'mallet':
            raise ValueError("Invalid parameter for LDA.method: {}".format(method))
        print("Mallet LDA")

        tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
        # exist_ok avoids the race between an exists() check and makedirs().
        os.makedirs(tmp_dir, exist_ok=True)

        if not hasattr(self, "vocab"):
            self.__learn_vocab(column)

        # Look up this specific column: a cache that holds only *other*
        # columns must not be indexed with this one.
        # NOTE(review): assumes __bag_of_words is a mapping keyed by column.
        if column in self.__bag_of_words:
            docs, id2word = self.__bag_of_words[column]
        else:
            docs, id2word = self.__get_bag_of_words(column)

        model = LdaMallet(mallet_path=self.mallet_path,
                          id2word=id2word,
                          prefix=tmp_dir,
                          num_topics=self.num_topics,
                          iterations=self.lda_max_iter,
                          optimize_interval=20)
        model.train(docs)

        doc_topics = []
        for doc_vec in model.read_doctopics(model.fdoctopics()):
            # Fill by topic id so every row has num_topics entries in id
            # order, even when gensim drops near-zero weights from doc_vec.
            row = np.zeros(model.num_topics)
            for topic_id, weight in doc_vec:
                row[topic_id] = weight
            doc_topics.append(row)

        self.features["lda"] = np.array(doc_topics)
        self.feature_names["lda"] = model.get_topics()
        return

Пример #3

Показать файл

Файл: gensim_topic_model.py Проект: flnasc/dr_concept_searching

def main():
    """Run the LDA concept-detection pipeline and write topic tables to CSV.

    Loads the corpus from ``CORPUS_PATH``, drops short segments, filters
    stop words and tokens shorter than three characters, trains a 14-topic
    ``LdaMallet`` model, and exports the topic-word and topic-document
    tables. Always returns 0.
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(CORPUS_PATH)

    # The vectorizer is only handed to remove_short_segs below.
    french_stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    _, segment_texts = remove_short_segs(corpus, vectorizer)
    tokenised_segments = [segment.split() for segment in segment_texts]

    # Keep tokens that are not stop words and have at least three characters.
    filtered_segments = [
        [
            token for token in tokens
            if token not in french_stop_words and len(token) >= 3
        ]
        for tokens in tokenised_segments
    ]

    # Build the gensim dictionary and bag-of-words corpus.
    id2word = Dictionary(filtered_segments)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in filtered_segments]

    # Train the model.
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=bow_corpus,
                             num_topics=14,
                             id2word=id2word,
                             optimize_interval=1,
                             random_seed=9,
                             iterations=5)

    # Read the doc-topics file eagerly; the result itself is not used below.
    doc_topics = list(
        mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))

    # NOTE(review): this file goes to "../output/" while the one below goes
    # to "output/" -- confirm that both locations are intended.
    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    topic_word.write_to_csv("../output/topic_{}_{}_{}.csv".format(
        mallet_model.random_seed,
        mallet_model.iterations,
        mallet_model.num_topics))

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc{}_{}_{}.csv".format(
        mallet_model.random_seed,
        mallet_model.iterations,
        mallet_model.num_topics),
                           num_docs=50)

    return 0

Пример #4

Показать файл

class LdaMalletHandler(TransformerMixin, BaseEstimator):
    """scikit-learn style transformer backed by a gensim ``LdaMallet`` model.

    ``fit`` trains the model on a document-term matrix and ``transform``
    maps a document-term matrix to a dense document-topic matrix.
    """

    def __init__(self,
                 n_components=100,
                 mallet_path=None,
                 prefix=None,
                 iterations=1000,
                 vectorizer=None):
        """Store the hyper-parameters; no work is done until ``fit``.

        Args:
            n_components: Number of topics.
            mallet_path: Path to the Mallet executable.
            prefix: Prefix for Mallet's working files, forwarded to
                ``LdaMallet`` (``None`` keeps gensim's default).
            iterations: Number of training iterations.
            vectorizer: Fitted vectorizer whose ``vocabulary_`` maps words
                to the column ids of the matrices passed to ``fit``.
        """
        self.n_components = n_components
        self.mallet_path = mallet_path
        self.prefix = prefix
        self.iterations = iterations
        self.vectorizer = vectorizer

    def vect2gensim(self, vectorizer, dtmatrix):
        """Convert a sparse document-term matrix into a gensim corpus and dictionary.

        Args:
            vectorizer: Object exposing a ``vocabulary_`` word -> id mapping.
            dtmatrix: Sparse matrix with one row per document.

        Returns:
            Tuple ``(corpus, dictionary)``.
        """
        corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
        # Invert word -> id into the id -> word mapping gensim expects.
        id2word = {
            word_id: word for word, word_id in vectorizer.vocabulary_.items()
        }
        dictionary = Dictionary.from_corpus(corpus_vect_gensim,
                                            id2word=id2word)
        return (corpus_vect_gensim, dictionary)

    def fit(self, X, y=None):
        """Train the Mallet model on document-term matrix ``X``.

        ``y`` is ignored. Returns ``self``.
        """
        print('vect2gensim')
        corpus, dictionary = self.vect2gensim(self.vectorizer, X)
        # prefix is forwarded so the constructor argument actually takes
        # effect; passing None matches LdaMallet's own default.
        self.model = LdaMallet(self.mallet_path,
                               iterations=self.iterations,
                               corpus=corpus,
                               num_topics=self.n_components,
                               id2word=dictionary,
                               prefix=self.prefix)
        return self

    def transform(self, X):
        """Return the dense ``(n_documents, n_components)`` topic matrix for ``X``.

        Topics the model does not report for a document stay at 0.
        """
        corpus = Sparse2Corpus(X, documents_columns=False)
        doc_topic = self.model[corpus]
        mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
        for did, doc in enumerate(doc_topic):
            for topic_id, weight in doc:
                mat[did][topic_id] = weight
        return mat

    def get_doc_topic_matrix(self):
        """Parse the model's doc-topics file into a float64 array.

        The first two whitespace-separated fields of every line are
        discarded and the remaining fields become one row. Blank lines and
        ``#`` header lines (written by older Mallet versions) are skipped.
        """
        rows = []
        # Context manager so the file handle is always closed.
        with open(self.model.fdoctopics(), "r") as handle:
            for line in handle:
                if not line.strip() or line.startswith("#"):
                    continue
                rows.append(line.split()[2:])
        return np.array(rows, dtype=np.float64)

    def get_topic_words_matrix(self):
        """Return the model's ``get_topics()`` result."""
        return self.model.get_topics()

Пример #5

Показать файл

Файл: mallet_model.py Проект: Yuxin-9/graduation-project

def main():
    """Train a 10-topic Mallet LDA model and export its topic tables.

    Builds a dictionary and bag-of-words corpus from
    ``wenzhang_Lemmatizer1.texts2``, trains and saves the model, then writes
    the top-20 topic words and the document-topic matrix via the
    ``write...ToExcleFile`` helpers.

    Returns:
        Tuple ``(texts, word_id, topic_words, doc_topics, num_topics)``.
    """
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    # Raw string: "\M" is not a valid escape sequence, so the non-raw form
    # only worked by accident and triggers a warning on current Python.
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it back
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')

    # Top 20 words of every topic, exported to a spreadsheet.
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))

    # Document-topic table, parsed from the file Mallet wrote during training.
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics