예제 #1
0
class LDARecommender(Recommender):
    """Recommender that infers MALLET LDA topics for an input text.

    ``train`` either loads a previously saved ``LdaMallet`` model or fits a
    new one and saves it; ``recommend`` prints the topic distribution of
    the given text.
    """

    def __init__(self):
        # Both attributes are filled in by train(); declaring them here
        # guarantees they exist even when train() takes the "load" path.
        self.model = None
        self.corpus = None

    def preprocess(self, text):
        """Return ``text`` processed by ``preprocessing.cleanTokens``."""
        return preprocessing.cleanTokens(text)

    def train(self, train_filename):
        """Load the cached model for ``train_filename`` or fit a new one.

        The model file is named ``<basename>.lda_model`` and is looked up
        in the current working directory. When a new model is fitted, it
        is saved and its topics are written to
        ``<basename>.lda_model.topics``.
        """
        print("train LDA")
        train_name = os.path.basename(train_filename)
        model_filename = train_name + ".lda_model"
        if os.path.isfile(model_filename):
            self.model = LdaMallet.load(model_filename)
        else:
            self.corpus = preprocessing.GensimCorpus(train_filename)
            self.model = LdaMallet(mallet_path,
                                   self.corpus,
                                   num_topics=100,
                                   id2word=self.corpus.dictionary)
            self.model.save(model_filename)
            topics_str = self.model.show_topics(num_topics=-1)
            # Context manager so the handle is flushed and closed
            # deterministically (it used to be left to the GC).
            with open(train_name + ".lda_model.topics", 'w') as topics_file:
                topics_file.write(str(topics_str))

    def recommend(self, input_text):
        """Print the LDA topics of ``input_text`` and return the text.

        Raises:
            RuntimeError: if ``train`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("train() must be called before recommend()")
        # Use the dictionary stored on the model rather than on
        # self.corpus: self.corpus is only set when the model was fitted
        # in this process, not when it was loaded from disk.
        # NOTE(review): assumes the model's ``id2word`` is the same
        # dictionary object passed at training time (it exposes doc2bow).
        dictionary = self.model.id2word
        input_bow = dictionary.doc2bow(self.preprocess(input_text))
        input_topics = self.model[input_bow]
        print("lda topics: " + str(input_topics))
        return input_text
예제 #2
0
    def gensim_mallet_lda(self, num_topics=5, num_words=15):
        """Fit a MALLET LDA model through gensim and print its topics.

        Uses ``self.gensim_corpus`` and ``self.id2word`` as the training
        input, prints the top words of every topic, then prints the score
        returned by ``self.coherence_score``.

        Args:
            num_topics (int): Number of topics to fit.
            num_words (int): Number of words shown per topic.
        """
        model = LdaMallet(
            self.mallet_path,
            corpus=self.gensim_corpus,
            num_topics=num_topics,
            id2word=self.id2word,
        )

        column_label = self.data_frame.columns.to_numpy()[self.col_num]
        print(f"Column {self.col_num} - Label: {column_label}\n")
        print(f"MALLET LDA Topic Modeling via Gensim with {num_topics} topics:\n")

        # Unformatted topics come back as (topic, [(word, weight), ...]).
        raw_topics = model.show_topics(
            num_topics=num_topics,
            num_words=num_words,
            log=False,
            formatted=False,
        )
        word_lists = []
        for entry in raw_topics:
            word_lists.append((entry[0], [pair[0] for pair in entry[1]]))

        for topic_id, topic_words in word_lists:
            print(f"Topic {topic_id}:\n{topic_words}\n")

        score = self.coherence_score(
            model, self.gensim_words_nostops, self.id2word
        )
        print(f"Coherence: {score}")
예제 #3
0
def make_mallet_model(main_df, d_path, stop, field, ntopics):
    """Fit a MALLET LDA model on ``main_df[field]`` and export its topics.

    Rows are kept only if the stripped ``abstract`` is not ``'nan.'``,
    ``abstract_length`` is above 20 and ``Title`` is not null. The text is
    vectorized into 1- to 3-gram counts, the model is fitted with a fixed
    seed, and the per-topic term weights are written to
    ``<d_path>/models/mallet_topic_df.csv``.

    Args:
        main_df: DataFrame with ``abstract``, ``abstract_length``,
            ``Title`` and ``field`` columns.
        d_path: Base directory; the CSV goes into its ``models`` folder.
        stop: Stop words passed to ``CountVectorizer``.
        field: Name of the text column to model.
        ntopics: Number of topics.

    Returns:
        DataFrame indexed by vocabulary term, one ``Topic N`` column per
        topic (1-based), holding the weights reported by ``show_topics``.
    """
    mallet_path = 'mallet/bin/mallet'

    usable = main_df[main_df['abstract'].str.strip() != 'nan.'].copy()
    usable = usable[usable['abstract_length'] > 20]
    usable = usable[usable['Title'].notnull()]

    token_vectorizer = CountVectorizer(
        tokenizer=reflection_tokenizer,
        stop_words=stop,
        ngram_range=(1, 3))
    token_vectorizer.fit(usable[field])
    # Transposed so that columns are documents, which is the layout
    # Sparse2Corpus is given here.
    doc_word = token_vectorizer.transform(usable[field]).transpose()
    corpus = matutils.Sparse2Corpus(doc_word)

    # vocabulary_ maps term -> column index; the model needs the inverse.
    id2word = {
        index: term
        for term, index in token_vectorizer.vocabulary_.items()
    }

    ldamallet = LdaMallet(mallet_path,
                          corpus=corpus,
                          num_topics=ntopics,
                          id2word=id2word,
                          random_seed=77)

    mallet_topics = pd.DataFrame(
        index=list(id2word.values()),
        columns=['Topic {}'.format(n) for n in range(1, ntopics + 1)])
    print(
        ldamallet.show_topics(num_topics=ntopics, num_words=10,
                              formatted=True))

    # Request every vocabulary term per topic so the table is fully filled.
    all_topics = ldamallet.show_topics(num_topics=ntopics,
                                       num_words=len(id2word),
                                       formatted=False)
    for topic in all_topics:
        column = 'Topic {}'.format(topic[0] + 1)
        for tupler in topic[1]:
            mallet_topics.loc[tupler[0], column] = tupler[1]

    mallet_topics.to_csv(os.path.join(d_path, 'models', 'mallet_topic_df.csv'))
    return mallet_topics
예제 #4
0
def model_mallet(clean_doc, dictionary, doc_term_matrix):
    """Fit a 25-topic MALLET model, report it, and return its conversion.

    Prints the unformatted topics and the ``c_v`` coherence computed on
    ``clean_doc``, then returns the result of
    ``ldamallet.malletmodel2ldamodel`` applied to the fitted model.
    """
    fitted = LdaMallet(
        mallet_path,
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=25,
        workers=3,
    )

    print("Topics generated with the mallet LDA model are:\n")
    pprint(fitted.show_topics(formatted=False))
    print("----------------------------------------------------")

    scorer = CoherenceModel(
        model=fitted,
        texts=clean_doc,
        dictionary=dictionary,
        coherence='c_v',
    )
    score = scorer.get_coherence()
    print(f"coherence score: {score}")

    return ldamallet.malletmodel2ldamodel(fitted)
예제 #5
0
def main():
    """Fit a 10-topic MALLET model and export topics and document topics.

    Builds a dictionary and bag-of-words corpus from
    ``wenzhang_Lemmatizer1.texts2``, fits and saves the model, then writes
    the top-20 topic words and the document-topic matrix to the
    hard-coded output paths below.

    Returns:
        Tuple ``(texts, word_id, topic_words, doc_topics, num_topics)``.
    """
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    # Raw string: "\M" is not a valid escape sequence, so the non-raw
    # literal only worked by accident and triggers a warning on modern
    # Python. The resulting path is unchanged.
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it back
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    # fdoctopics() supplies the path that txt_to_numpy reads.
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics
# Read the whole speeches file; the context manager closes the handle
# even if read() raises.
with open("discursos_all.txt", "r") as f:
    discursos_file = f.read()

# NOTE(review): eval() executes whatever expression the file contains.
# This is only acceptable if discursos_all.txt is fully trusted; if it
# holds a plain Python literal, ast.literal_eval would be a safer choice.
res = eval(discursos_file)

elapsed_time = time.time() - start_time
# "%m" is the month number, not a time unit, so it always printed ":01";
# the elapsed time is reported as hours:minutes:seconds only.
print(time.strftime("Discursos importados, demorou %H:%M:%S",
                    time.gmtime(elapsed_time)))

start_time = time.time()

# One whitespace-tokenized document per entry of ``res``.
data = [a.split() for a in res]

dictionary = Dictionary(data)

corpus = [dictionary.doc2bow(t) for t in data]

mallet_path = 'X:\\Programs\\mallet\\mallet-2.0.8\\bin\\mallet.bat'

lda = LdaMallet(mallet_path, corpus=corpus, id2word=dictionary, num_topics=500)

elapsed_time = time.time() - start_time
print(time.strftime("Lda model criado, demorou %H:%M:%S", time.gmtime(elapsed_time)))

# NOTE(review): show_topics is called without num_topics, so only its
# default number of topics is written although the model has 500 and the
# file name suggests all of them - confirm the intent.
with open("topics_500_latest.txt", 'w+') as f:
    for index, topic in lda.show_topics(formatted=False, num_words=15):
        f.write('[{}] - '.format(index))
        f.write(', '.join(str(line[0]) for line in topic))
        f.write('\n')
예제 #7
0
        # Fit a 300-topic MALLET model on the prepared corpus.
        ldamallet = LdaMallet(mallet_path,
                              corpus=corpus,
                              num_topics=300,
                              id2word=id2word_dictionary)
        print('LDA Model trained')

        try:
            ldamallet.save('ldamallet_mag.model')
        except OverflowError:
            # Fallback: pickle the model directly with the highest
            # available protocol.
            print("Trying to pickle model using protocol 4")
            with open('ldamallet_mag.model', 'wb') as pick:
                # dump() lives on the pickle module; the file object has
                # no such method, so the old ``pick.dump`` call raised
                # AttributeError.
                pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
        print("Lda model saved to disk")

        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        # Compute Coherence Score (c_v) against the stemmed texts.
        coherence_model_ldamallet = CoherenceModel(
            model=ldamallet,
            texts=data_stemmed,
            dictionary=id2word_dictionary,
            coherence='c_v')

        coherence_ldamallet = coherence_model_ldamallet.get_coherence()
        print('\nCoherence Score: ', coherence_ldamallet)

# Memory-friendly
# Create generator
# docstream is a generator which will be passed to Dictionary to create a Gensim dictionary
# docstream = (tokens for tokens in stream_from_file(filename))
예제 #8
0
    - 8 topics, ~ local optimum
    - 30 topic, ~ global optimum
"""

# Fit the 8-topic MALLET model with a fixed seed.
lda_8 = LdaMallet(
    mallet_path,
    corpus=corpus,
    id2word=dictionary,
    num_topics=8,
    random_seed=123,
)
# Request the 20 top words per topic (the return value is not kept).
lda_8.print_topics(num_topics=8, num_words=20)
# Rebind the name to the output of gensim's malletmodel2ldamodel.
lda_8 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_8)

# Formatted topic strings, 10 terms per topic.
top_terms_line = lda_8.show_topics(num_topics=8, num_words=10)

# Flatten into rows of [topic id, rank within topic, weight, term].
# Each topic string is a "+"-separated list of weight*"term" items.
top_terms_m = []
for topic_entry in top_terms_line:
    topic_id = topic_entry[0]
    for rank, item in enumerate(topic_entry[1].split("+")):
        pieces = item.split("*")
        top_terms_m.append(
            [topic_id, rank, float(pieces[0]), pieces[1].strip('"| ')]
        )
df = pd.DataFrame(top_terms_m)

# Give the positional columns descriptive names.
old_names = [0, 1, 2, 3]
new_names = ["topic_n", "term_sort", "weight", "term"]
cols = dict(zip(old_names, new_names))
df.rename(columns=cols, inplace=True)
예제 #9
0
# Vectorize the documents; the matrix is transposed so that columns are
# documents before it is wrapped as a gensim corpus.
count_vectorizer.fit(docs)
doc_word = count_vectorizer.transform(docs).transpose()
corpus = matutils.Sparse2Corpus(doc_word)

# vocab creation: vocabulary_ maps term -> index; build both directions
# and attach them to an otherwise empty gensim Dictionary.
word2id = {term: index for term, index in count_vectorizer.vocabulary_.items()}
id2word = {index: term for term, index in count_vectorizer.vocabulary_.items()}
dictionary = corpora.Dictionary()
dictionary.id2token = id2word
dictionary.token2id = word2id

# topic modeling
ldamallet = LdaMallet(MALLET_PATH,
                      corpus=corpus,
                      num_topics=num_topics,
                      id2word=id2word,
                      iterations=400)

# save topic model to file; the context manager closes the file even if
# pickling fails part-way.
with open("english_topics_{}.pkl".format(sys.argv[1]), "wb") as topic_file:
    pickle.dump(ldamallet.show_topics(formatted=False, num_topics=num_topics),
                topic_file)

# get NPMI coherence
coherence = CoherenceModel(model=ldamallet,
                           texts=texts,
                           dictionary=dictionary,
                           coherence='c_npmi')
print("coherence:", coherence.get_coherence())
예제 #10
0
파일: gensim_lda.py 프로젝트: Loielaine/NLP
    # Options shared by both show_topics calls below.
    show_kwargs = dict(num_topics=args.num_topics,
                       num_words=50,
                       log=True,
                       formatted=False)

    # --- gensim LdaModel: fit, record elapsed time, report topics ---
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        alpha=0.1,
        eta='auto',
        iterations=args.num_iterations,
        num_topics=args.num_topics,
    )
    total1 = time.time() - start1
    lda_prefix = args.output + "_lda"
    with open(lda_prefix + ".times", 'w') as out:
        out.write('time: {:f} s'.format(float(total1)))
    topics1 = lda_model.show_topics(**show_kwargs)
    report_topics(lda_prefix, topics1, limit=50)

    # --- MALLET wrapper: same steps, timed separately ---
    start2 = time.time()
    lda_mallet_model = LdaMallet(
        './Mallet/bin/mallet',
        corpus=corpus,
        id2word=dictionary,
        alpha=0.1,
        iterations=args.num_iterations,
        num_topics=args.num_topics,
    )
    total2 = time.time() - start2
    mallet_prefix = args.output + "_mallet"
    with open(mallet_prefix + ".times", 'w') as out:
        out.write('time: {:f} s'.format(float(total2)))
    topics2 = lda_mallet_model.show_topics(**show_kwargs)
    report_topics(mallet_prefix, topics2, limit=50)
예제 #11
0
def model_topics(era, n_topics=8, n_iterations=2500):
    """Conducts topic modeling with supplied parameters.
    Relies on a MALLET binary in the src directory. Note: this
    binary is not included in the GitHub repository due to
    storage restrictions.

    Saves topic modeling numerical results to CSV file and
    topic words to text file.

    Parameters
    ----------
    era : str
        Century label, '19th' or '20th'. In this function it is only
        used to name the output files; the corpus itself is read from
        ``data/corpus.pickle`` as-is.
    n_topics : int, optional
        Number of topics to assume for modeling, by default 8
    n_iterations : int, optional
        Number of iterations to run LDA algorithm, by default 2500
    """
    parent_dir = Path(__file__).parents[1]
    seed = 1921

    # Context manager so the pickle file handle is closed right away.
    with open(parent_dir / 'data/corpus.pickle', 'rb') as corpus_file:
        dictionary, bow_corpus, IDs = pickle.load(corpus_file)
    # cast posix path to string for gensim connection to mallet
    path_to_mallet_binary = str(parent_dir / 'src//mallet-2.0.8/bin/mallet')

    model = LdaMallet(path_to_mallet_binary,
                      corpus=bow_corpus,
                      num_topics=n_topics,
                      id2word=dictionary,
                      iterations=n_iterations,
                      random_seed=seed
                      )

    # Map each document ID to its list of topic percentages; documents
    # and IDs are matched by position.
    docs = list(model.load_document_topics())
    topics_table = {}
    for i, doc_id in enumerate(tqdm(IDs, desc='Reading results into dataframe')):
        topics_table[doc_id] = [t[1] for t in docs[i]]

    # Build ID -> title / ID -> year in a single pass. The first row of
    # each ID wins, matching the previous per-ID ``.values[0]`` lookup,
    # which rescanned the whole frame for every ID.
    reference = pd.read_csv(parent_dir / 'data/reference.csv')
    first_rows = reference.drop_duplicates(subset='ID', keep='first')
    title_dict = OrderedDict(zip(first_rows['ID'], first_rows['title']))
    year_dict = OrderedDict(zip(first_rows['ID'], first_rows['date']))

    column_names = [f'topic_{i}' for i in range(0, n_topics)]
    results = pd.DataFrame.from_dict(topics_table,
                                     orient='index',
                                     columns=column_names)
    results.index.name = "ID"
    # NOTE(review): the inserted columns are aligned by position, which
    # assumes reference.csv lists the same IDs in the same order as the
    # pickled ``IDs`` - confirm.
    results.insert(0, "title", list(title_dict.values()))  # add titles
    results.insert(1, "ID", IDs)  # add ids
    results.insert(2, "year", list(year_dict.values()))  # add years

    # save results to file
    results.to_csv(parent_dir / 'data' / f'{era}_topics.csv', index=False)

    # save topic words to text file (opened in append mode, so repeated
    # runs add to the existing file)
    topics = model.show_topics(num_topics=-1)
    with open(parent_dir / 'data' / f'{era}_topics.txt', 'a') as output:
        output.writelines(str(line)+'\n' for line in topics)
예제 #12
0
'''

# Fit the 9-topic MALLET model with a fixed seed.
lda_9 = LdaMallet(
    mallet_path,
    corpus=corpus,
    id2word=dictionary,
    num_topics=9,
    random_seed=123,
)

# Request the 20 top words per topic (the return value is not kept).
lda_9.print_topics(num_topics=9, num_words=20)
# Rebind the name to the output of gensim's malletmodel2ldamodel.
lda_9 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_9)
# Formatted topic strings, 10 terms per topic.
top_terms_line = lda_9.show_topics(num_topics=9, num_words=10)

# Flatten into rows of [topic id, rank within topic, weight, term].
# Each topic string is a '+'-separated list of weight*"term" items.
top_terms_m = []
for topic_entry in top_terms_line:
    topic_id = topic_entry[0]
    for rank, item in enumerate(topic_entry[1].split('+')):
        pieces = item.split('*')
        top_terms_m.append(
            [topic_id, rank, float(pieces[0]), pieces[1].strip('"| ')]
        )
df = pd.DataFrame(top_terms_m)

# Mapping from positional column labels to descriptive names.
old_names = [0, 1, 2, 3]
new_names = ['topic_n', 'term_sort', 'weight', 'term']
cols = dict(zip(old_names, new_names))
예제 #13
0
# Fit a 140-topic MALLET model; ``prefix`` is set to the output folder.
output_path = 'd:/code/gc_text_analysis/mallet_output/'
num_topics = 140
model = LdaMallet(
    path_to_mallet_binary,
    corpus=bow_docs,
    workers=4,
    iterations=2000,
    num_topics=num_topics,
    id2word=dictionary,
    prefix=output_path,
)

model.save('gc_lda_model.pkl')

# Rebuild id -> token from token2id, then pair every token with its
# entry in ``dictionary.dfs``, sorted by that count in descending order.
dictionary.id2token = {
    token_id: token for token, token_id in dictionary.token2id.items()
}
words_freq = sorted(
    [
        (dictionary.id2token[token_id], count)
        for token_id, count in dictionary.dfs.items()
    ],
    key=lambda pair: pair[1],
    reverse=True,
)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])

# c_v coherence of the fitted model against the n-gram documents.
coherence_model_lda = CoherenceModel(
    model=model,
    texts=ngram_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

# Keep only the second element of every (topic id, terms) pair.
topic_rows = model.show_topics(
    num_topics=num_topics,
    num_words=10,
    log=False,
    formatted=False,
)
topics = list(zip(*topic_rows))[1]

# Topic inference for the last 73 documents of the corpus.
gc_topics = model[bow_docs[-73:]]