class LDARecommender(Recommender):
    def __init__(self):
        return

    def preprocess(self, text):
        return preprocessing.cleanTokens(text)

    def train(self, train_filename):
        print("train LDA")
        train_name = os.path.basename(train_filename)
        model_filename = train_name + ".lda_model"
        if os.path.isfile(model_filename):
            # Note: this branch reloads the model but does not rebuild self.corpus,
            # which recommend() needs for its dictionary lookup.
            self.model = LdaMallet.load(model_filename)
        else:
            self.corpus = preprocessing.GensimCorpus(train_filename)
            self.model = LdaMallet(mallet_path, self.corpus, num_topics=100,
                                   id2word=self.corpus.dictionary)
            self.model.save(model_filename)
            topics_str = self.model.show_topics(num_topics=-1)
            open(train_name + ".lda_model.topics", 'w').write(str(topics_str))

    def recommend(self, input_text):
        input_bow = self.corpus.dictionary.doc2bow(self.preprocess(input_text))
        input_topics = self.model[input_bow]
        print("lda topics: " + str(input_topics))
        return input_text
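# A hedged usage sketch for LDARecommender (file names here are placeholders, not
# the project's actual paths): train() builds or reloads the MALLET model, then
# recommend() prints the topic mixture inferred for new text. recommend() assumes
# the model was trained in the current session, since the load branch above never
# sets self.corpus.
recommender = LDARecommender()
recommender.train("corpus/train_documents.txt")  # placeholder training file
recommender.recommend("graph based recommendation for computer systems")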
class LdaMalletHandler:
    def __init__(self, mallet_path):
        self.mallet_path = mallet_path

    def run_model(self, model_name, corpus, **kwargs):
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path, corpus_bow, id2word=self.dictionary,
                               prefix="./ldamodels/" + model_name + "/", **kwargs)

    def save_model(self):
        self.model.save("ldamodels/" + self.model_name + "/model.model")
        self.dictionary.save("ldamodels/" + self.model_name + "/dict.dict")

    def load_model(self, model_name):
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load("ldamodels/" + self.model_name + "/dict.dict")
        self.model = LdaMallet.load("ldamodels/" + self.model_name + "/model.model")
        self.model.mallet_path = self.mallet_path

    def doc_topics(self, doc_idx):
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever.doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        topics = [topic[1] for topic in doc_topics]
        return self.doc_retriever.n_most_similar(topics, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        return self.doc_retriever.n_most_similar(topics, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics, num_words=num_words):
            splitted = topic[1].split("\"")
            result = [splitted[2 * i + 1] for i in range(0, int(len(splitted) / 2))]
            string_topics.append(" ".join(result))
        return string_topics
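# A minimal usage sketch for LdaMalletHandler, assuming a local MALLET install and a
# tiny tokenized corpus; the binary path, model name, and documents are placeholders.
tokenized_corpus = [["graph", "minors", "survey"],
                    ["user", "interface", "system"],
                    ["system", "human", "engineering"]]
handler = LdaMalletHandler(mallet_path="/opt/mallet-2.0.8/bin/mallet")
handler.run_model("demo_model", tokenized_corpus, num_topics=2, iterations=100)
handler.save_model()

# Later: reload the model and query topics for an unseen, already tokenized document.
handler.load_model("demo_model")
print(handler.ext_doc_topics(["graph", "survey"]))
print(handler.get_string_topics(num_words=5))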
def fit_lda(prefix, tokenized_docs, id2word, mallet_path=os.environ["MALLET_PATH"],
            num_topics=500, iterations=500):
    if not os.path.isdir(prefix):
        os.makedirs(prefix)
    if os.path.exists(os.path.join(prefix, "saved_model.pkl")):
        return utils.SaveLoad.load(os.path.join(prefix, "saved_model.pkl"))
    elif tokenized_docs is None:
        raise ValueError("LDA model not found at {}/{}".format(prefix, "saved_model.pkl"))
    if mallet_path is None or mallet_path == "":
        raise ValueError("No mallet path specified")
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs.values.tolist()]
    lda_model = LdaMallet(mallet_path=mallet_path, prefix=prefix, corpus=corpus,
                          id2word=id2word, iterations=iterations, workers=4,
                          num_topics=num_topics, optimize_interval=20)
    lda_model.save(os.path.join(prefix, "saved_model.pkl"))
    id2word.save_as_text(os.path.join(prefix, "id2word"))
    # save clean lda weights for later analysis
    W = lda_model.get_topics()
    W = pd.DataFrame(W).rename(columns=id2word)
    W.index = pd.Series(["lda.{}".format(i) for i in range(len(W))], name="topic_id")
    W.to_csv(os.path.join(prefix, "lda_weights.csv"))
    return lda_model
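# A hedged usage sketch for fit_lda, assuming MALLET_PATH is set in the environment
# and that tokenized_docs is a pandas Series of token lists (fit_lda calls
# .values.tolist() on it); the toy data and output prefix are placeholders.
import pandas as pd
from gensim.corpora import Dictionary

tokenized_docs = pd.Series([["topic", "model", "mallet"],
                            ["mallet", "lda", "gensim"],
                            ["topic", "coherence", "gensim"]])
id2word = Dictionary(tokenized_docs.tolist())

# Trains a new model under ./lda_demo/, or silently reloads saved_model.pkl if present.
model = fit_lda("./lda_demo/", tokenized_docs, id2word, num_topics=2, iterations=100)
print(model.show_topics(num_topics=2, num_words=5))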
def run():
    # Get the preprocessed dataset
    df = pd.read_pickle('./data/tmp/preprocessed.pkl')

    if os.path.isfile('./models/MALLET/mallet_model.pkl'):
        # Let's not do any model retraining without building in topic stability constraints,
        # e.g. number of docs or tokens now in different topics
        seen = False  # data we provide is new and unseen for the model
        with open('./models/MALLET/mallet_model.pkl', 'rb') as modelfile:
            topic_model = pickle.load(modelfile)
        with open('./models/MALLET/mallet_dict.pkl', 'rb') as dictfile:
            dictionary = pickle.load(dictfile)
        df['bow'] = df['tokens'].apply(dictionary.doc2bow)
    else:
        seen = True  # any data we provide is used to train the model
        with Timer('Train the LDA Model'):
            test_range = (5, 50)
            df, corpus, dictionary = get_corpus_and_dict(df, 'tokens')
            list_of_models, scores = topic_count_selection(
                dictionary, corpus, list(df['tokens']), test_range)
            plot_coherence(
                test_range, scores).savefig('./models/MALLET/ModelCoherence.png')
            # Save the model with the highest coherence
            num_topics = test_range[0] + scores.index(max(scores)) + 1
            topic_model = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                                    corpus=corpus,
                                    num_topics=num_topics,
                                    id2word=dictionary,
                                    iterations=1000,
                                    prefix=f'{os.getcwd()}/models/MALLET/',
                                    random_seed=42)
            print(f"* Chosen Model with {num_topics} topics")
            with open('./models/MALLET/mallet_model.pkl', 'wb') as modelfile:
                topic_model.save(modelfile)
            with open('./models/MALLET/mallet_corpus.pkl', 'wb') as corpusfile:
                pickle.dump(corpus, corpusfile)
            with open('./models/MALLET/mallet_dict.pkl', 'wb') as dictfile:
                pickle.dump(dictionary, dictfile)

    df = get_topic_model_scores(df, topic_model, seen=seen)
    df.to_pickle('./data/tmp/scored.pkl')
    print("\nSample")
    print(df.head(), "\n")
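# topic_count_selection and plot_coherence are not shown above; this is only a sketch
# of what such helpers could look like (signatures and sweep range are assumptions,
# chosen so that run()'s "test_range[0] + scores.index(max(scores)) + 1" lines up).
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

MALLET_BIN = '/home/hadoop/Mallet-master/bin/mallet'  # same binary run() uses

def topic_count_selection(dictionary, corpus, texts, test_range):
    """Train one model per candidate topic count and score its c_v coherence."""
    models, scores = [], []
    for k in range(test_range[0] + 1, test_range[1] + 1):
        model = LdaMallet(MALLET_BIN, corpus=corpus, num_topics=k, id2word=dictionary)
        cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        models.append(model)
        scores.append(cm.get_coherence())
    return models, scores

def plot_coherence(test_range, scores):
    """Plot coherence against topic count and return the figure for savefig()."""
    fig, ax = plt.subplots()
    ax.plot(range(test_range[0] + 1, test_range[1] + 1), scores)
    ax.set_xlabel("number of topics")
    ax.set_ylabel("c_v coherence")
    return fig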
def set_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              model_file_path: str, language_processed_data: list):
    my_path = os.path.abspath(os.path.dirname(__file__))
    logging.info("---- Creating LDA Mallet model")
    logging.info("------ Getting LDA Mallet model file")
    mallet_path = os.path.join(my_path, "../../statics/mallet-2.0.8/bin/mallet")
    # Accessing the dictionary once forces gensim to populate its id2token mapping.
    temp = self.essentials.dictionary[0]
    model = LdaMallet(mallet_path,
                      corpus=self.essentials.corpus,
                      num_topics=self.number_of_topics,
                      id2word=self.essentials.dictionary.id2token)
    model.save(model_file_path)
    self.model = model
    logging.info("---- LDA Mallet model is created")
    metrics = self.get_model_evaluation_metrics(language_processed_data)
    parameters = self.get_model_parameters()
    self.write_model_evaluation_metrics(lang, data_version, dictionary_version,
                                        model_version, param_name, param_version,
                                        metrics, parameters)
    return
def get_topics(num, corpus, id2word, output_dir, all_sentences):
    print(num)
    ldamallet = LdaMallet(args.mallet_dir, corpus=corpus, num_topics=num,
                          prefix=output_dir + "/" + str(num), workers=4,
                          id2word=id2word, iterations=1000, random_seed=42)
    coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=all_sentences,
                                               dictionary=id2word, coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)
    keywords = {i: ", ".join([word for word, prop in ldamallet.show_topic(i)])
                for i in range(ldamallet.num_topics)}
    with open(output_dir + "/" + str(num) + '_words.json', 'w') as f:
        f.write(json.dumps(keywords))
    ldamallet.save(output_dir + "/" + str(num))
    # ldamallet.show_topics(num_topics=num, formatted=True)
    return coherence_ldamallet
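# A hedged sketch of how get_topics could drive a topic-count sweep; it assumes the
# module-level `args.mallet_dir` used inside get_topics is set, and that `corpus`,
# `id2word` and `all_sentences` were built upstream. The candidate counts are illustrative.
scores = {}
for k in [10, 20, 50, 100]:
    scores[k] = get_topics(k, corpus, id2word, "output", all_sentences)

best_k = max(scores, key=scores.get)
print(f"Best coherence {scores[best_k]:.3f} at {best_k} topics")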
def main():
    num_topics = 10
    # doc_topics_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    MALLET_PATH = os.path.join("D:\\Mallet", "mallet-2.0.8", "bin", "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    # dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load corpus
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH, corpus=corpus,
                                 num_topics=num_topics, id2word=dictionary)
    mallet_lda_model.save('C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    # mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics, num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls')
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  # doc_topics_path
    # print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3')
    return texts, word_id, topic_words, doc_topics, num_topics
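# txt_to_numpy is not defined in this snippet; a minimal sketch of such a helper,
# assuming MALLET's usual doctopics layout (doc index, doc name, then one proportion
# per topic, tab separated), could look like this.
import numpy as np

def txt_to_numpy(doctopics_path):
    """Parse a MALLET doctopics file into a (num_docs, num_topics) numpy array."""
    rows = []
    with open(doctopics_path, encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue  # skip header lines
            parts = line.strip().split("\t")
            rows.append([float(x) for x in parts[2:]])  # drop doc index and name columns
    return np.array(rows)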
# TOO BIG TO SERIALIZE
# Save the Dict and Corpus
try:
    corpora.MmCorpus.serialize('mag_bow_corpus.mm', corpus)  # save corpus to disk
except OverflowError:
    # Don't save corpus, call LDA directly
    print("Overflow while saving corpus, skip and train.")

ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=300,
                      id2word=id2word_dictionary)
print('LDA Model trained')

try:
    ldamallet.save('ldamallet_mag.model')
except OverflowError:
    print("Trying to pickle model using protocol 4")
    with open('ldamallet_mag.model', 'wb') as pick:
        pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
print("Lda model saved to disk")

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet, texts=data_stemmed,
    dictionary=id2word_dictionary, coherence='c_v')
DICT_PATH = 'docs.dict'
MODEL_PATH = 'docs.model'

raw_corpus = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
docs = [doc.split() for doc in raw_corpus]

if exists(MODEL_PATH):
    print('Testing...\n')
    dict = corpora.Dictionary.load(DICT_PATH)
    lda = LdaMallet.load(MODEL_PATH)
    for doc in docs:
        topics = lda[dict.doc2bow(doc)]
        print(topics, doc)
else:
    print('Training...\n')
    dictionary = corpora.Dictionary(docs)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in docs]
    lda = LdaMallet(MALLET_PATH, corpus=corpus, num_topics=3, workers=60,
                    id2word=dictionary, iterations=50, prefix=PREFIX)
    lda.save(MODEL_PATH)
try:
    corpus = corpora.MmCorpus('unpaywallmag_bow_corpus.mm')
except FileNotFoundError:
    corpus = [id2word_dictionary.doc2bow(textlist) for textlist in tqdm(data_stemmed)]
    print("Doc2Bow corpus created")

    # TOO BIG TO SERIALIZE
    # Save the Dict and Corpus
    try:
        corpora.MmCorpus.serialize('unpaywallmag_bow_corpus.mm', corpus)  # save corpus to disk
    except OverflowError:
        # Don't save corpus, call LDA directly
        print("Overflow while saving corpus, skip and train.")

ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=300,
                      id2word=id2word_dictionary)
print('LDA Model trained')

try:
    ldamallet.save('ldamallet_model.model')
except OverflowError:
    print("Trying to pickle model using protocol 4")
    with open('ldamallet_model.model', 'wb') as pick:
        pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
print("Lda model saved to disk")

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_stemmed,
                                           dictionary=id2word_dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
def extract_features(max_documents=50000000, max_words_per_doc=50000000, incl_tf=True, incl_df=True, incl_graph=True, incl_w2v=True, incl_topic_model=True, incl_atm=True): ######### SIMPLE FREQUENCY MEASURES ###################################################### if incl_df or incl_tf or incl_graph: doc_cnt = max_documents # set containers: tf, df, network = Counter(), Counter(), nx.Graph() doc_ner_idx = {} dir_ner_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified', max_documents=max_documents, max_words_per_doc=max_words_per_doc, get='wiki') dir_filename_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified', max_documents=max_documents, max_words_per_doc=max_words_per_doc, get='filename') for filename, words in zip(dir_filename_iterator, dir_ner_iterator): # count the ners: ner_cnt = Counter() ner_cnt.update(words) if ner_cnt: # collect which ners appear in which doc: doc_ner_idx[os.path.basename(filename)] = set([n for n in ner_cnt]) # update global tf and df: for k, v in ner_cnt.items(): tf[k] += v df[k] += 1 # update nodes in network: for ner in ner_cnt: if ner not in network: network.add_node(ner) # update edges in network: for ner1, ner2 in combinations(ner_cnt, 2): try: network[ner1][ner2]['weight'] += 1 except KeyError: network.add_edge(ner1, ner2, weight=1) # dump for reuse: pickle.dump(tf, open('../workspace/tf.m', 'wb')) pickle.dump(df, open('../workspace/df.m', 'wb')) pickle.dump(doc_ner_idx, open('../workspace/doc_ner_idx.m', 'wb')) pickle.dump(network, open('../workspace/nx.m', 'wb')) # scale network values: max_weight = float(max([network[n1][n2]['weight']\ for n1, n2 in network.edges_iter()])) for n1, n2 in network.edges_iter(): network[n1][n2]['weight'] /= max_weight nx.write_gexf(network, '../workspace/dbnl_network.gexf', prettyprint=True) ######### WORD2VEC MODEL ###################################################### if incl_w2v: # build w2v model: dir_w2v_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified', max_documents=max_documents, max_words_per_doc=max_words_per_doc, get='w2v') w2v_model = Word2Vec(dir_w2v_iterator, window=15, min_count=10, size=150, workers=10, negative=5) w2v_model.init_sims(replace=True) w2v_model.save(os.path.abspath('../workspace/w2v_model.m')) ######### STANDARD TOPIC MODEL ###################################################### if incl_topic_model: # build vocab for lda: vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified', max_documents=max_documents, max_words_per_doc=max_words_per_doc, get='lda_vocab') lda_dict = corpora.Dictionary(vocab_lda_iterator) lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000) # build lda model: dir_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified', max_documents=max_documents, max_words_per_doc=max_words_per_doc, get='lda', lda_dict=lda_dict) lda_workspace_path = '../workspace/mallet_output/' if not os.path.isdir(lda_workspace_path): os.mkdir(lda_workspace_path) mallet_path = '/home/mike/GitRepos/dbnl/code/mallet-2.0.8RC2/bin/mallet' lda_model = LdaMallet(mallet_path, dir_lda_iterator, num_topics=150, id2word=lda_dict, iterations=1900, prefix=lda_workspace_path) lda_model.save('../workspace/lda_model.m') ######### AUTHOR TOPIC MODEL ###################################################### if incl_atm: # build vocab for lda: vocab_lda_iterator = 
DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified', max_documents=max_documents, max_words_per_doc=max_words_per_doc, get='lda_vocab') lda_dict = corpora.Dictionary(vocab_lda_iterator) lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000) lda_dict.compactify() atm_vocab = [] for i, w in lda_dict.items(): atm_vocab.append(w) print(len(atm_vocab), 'vocab') atm_vocab = tuple(atm_vocab) corpus, doc_author = [], [] for filename in sorted(glob.glob('../workspace/wikified_periodicals/*.wikified')): doc_words, auth_set = [], set() max_documents -= 1 if max_documents % 100 == 0: print('\t-', max_documents, 'to go') if max_documents <= 1: break word_cnt = max_words_per_doc for line in codecs.open(filename, 'r', encoding='utf8'): comps = line.strip().split('\t') if comps: idx, token, lemma, pos, pos_conf, ner, wiki = comps if wiki != 'X': auth_set.add(wiki) elif pos.startswith(('N(', 'ADJ(')): try: doc_words.append(atm_vocab.index(token.lower())) except: pass word_cnt -= 1 if word_cnt <= 0: break if auth_set and doc_words: corpus.append(sorted(doc_words)) doc_author.append(sorted(list(auth_set))) atm_author_idx = {} for i1, authors in enumerate(doc_author): for i2, auth in enumerate(authors): if auth not in atm_author_idx: atm_author_idx[auth] = len(atm_author_idx) doc_author[i1][i2] = atm_author_idx[auth] n_topic = 30 atm_model = AuthorTopicModel(n_doc=len(corpus), n_voca=len(atm_vocab), n_topic=n_topic, n_author=len(atm_author_idx)) atm_model.fit(corpus, doc_author, max_iter=10) for k in range(n_topic): top_words = get_top_words(atm_model.TW, atm_vocab, k, 10) print('topic ', k , ','.join(top_words)) author_id = 7 fig = plt.figure(figsize=(12,6)) plt.bar(range(n_topic), atm_model.AT[author_id]/np.sum(atm_model.AT[author_id])) #plt.title(author_idx[author_id]) plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(atm_model.TW, atm_vocab, k, 10)) for k in range(n_topic)]) #plt.show() plt.savefig('atm1.pdf') pickle.dump(atm_vocab, open('../workspace/atm_vocab.m', 'wb')) pickle.dump(atm_model, open('../workspace/atm_model.m', 'wb')) pickle.dump(atm_author_idx, open('../workspace/atm_author_idx.m', 'wb'))
# TOO BIG TO SERIALIZE
# Save the Dict and Corpus
try:
    corpora.MmCorpus.serialize('arxivmag_bow_corpus.mm', corpus)  # save corpus to disk
except OverflowError:
    # Don't save corpus, call LDA directly
    print("Overflow while saving corpus, skip and train.")

ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=300,
                      id2word=id2word_dictionary)
print('LDA Model trained')

try:
    ldamallet.save('ldamallet_arxiv.model')
except OverflowError:
    print("Trying to pickle model using protocol 4")
    with open('ldamallet_arxiv.model', 'wb') as pick:
        pickle.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL)
print("Lda model saved to disk")

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet, texts=data_stemmed,
    dictionary=id2word_dictionary, coherence='c_v')
def generate_topics(data, topics, gender):
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(text) for text in data]
    print('performing topic modeling with', topics, 'topics')
    ldamodel = LdaMallet(TopicModeling.MALLET_PATH, corpus=corpus,
                         num_topics=topics, id2word=dictionary)
    ldamodel.save('ldamodel.' + gender + '.' + str(topics))
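# A hedged usage sketch; the tokenized documents and gender label are placeholders.
female_docs = [["mother", "work", "family"], ["career", "school", "family"]]
generate_topics(female_docs, topics=20, gender="female")
# The saved model can later be reloaded with:
# ldamodel = LdaMallet.load('ldamodel.female.20')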
    print('Train an LDA model over the given corpus using the given dictionary.')
    print('If num_topics is not specified, use the default of 100.')
    print('If num_passes is specified, makes multiple passes over the corpus.')
    print('This uses MALLET to train a topic model.')
else:
    _, mm_fname, dict_fname, model_fname = sys.argv[:4]
    num_topics = int(sys.argv[4]) if len(sys.argv) >= 5 else 100
    try:
        mallet_path = sep.join([os.environ['MALLET_HOME'], 'bin', 'mallet'])
    except KeyError:
        logging.error('please set the MALLET_HOME environment variable to '
                      'the root directory of your MALLET installation')
        exit()
    mm = MmCorpus(mm_fname)
    id2word = Dictionary.load(dict_fname)
    lda_model = LdaMallet(mallet_path, corpus=normalize_langs(mm), id2word=id2word,
                          num_topics=num_topics, prefix=model_fname[:-6],
                          iterations=100)
    lda_model.save(model_fname)
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])
acronyms = words_freq[words_freq.word.str.len() <= 3]
acronyms.to_csv('acronyms.csv')

if train:
    print('begin training mallet LDA model')
    mallet_lda_model = LdaMallet(path_to_mallet_binary, corpus=bow_corpus,
                                 iterations=3900, num_topics=140, alpha=60,
                                 id2word=dictionary, prefix=path_to_mallet_output,
                                 workers=multiprocessing.cpu_count())
    mallet_lda_model.save('{}lda_model.pkl'.format(path_to_mallet_output))
    # mallet_lda_model.save('{}lda_model_{}.pkl'.format(path_to_mallet_output, uuid))

    print('calculate model coherence C_v score')
    coherence_model_lda = CoherenceModel(model=mallet_lda_model, texts=docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('model coherence score: {}'.format(coherence_lda))
else:
    print('load mallet LDA model')
    # mallet_lda_model = LdaMallet.load('{}lda_model.pkl'.format(path_to_mallet_output))
    mallet_lda_model = LdaMallet.load('{}lda_model_{}.pkl'.format(
        path_to_mallet_output, uuid))

# convert the model to gensim format
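# A minimal sketch of the conversion the comment above refers to: gensim's
# malletmodel2ldamodel turns the wrapper model into a native LdaModel, which is
# needed for e.g. get_document_topics() or pyLDAvis. The bow_corpus index is illustrative.
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel

gensim_lda_model = malletmodel2ldamodel(mallet_lda_model)
print(gensim_lda_model.get_document_topics(bow_corpus[0]))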
class NlPipe: def __init__(self, list_of_docs, path, document_ids=None, language_model="en_core_web_lg", tagger=False, parser=False, ner=False, categorization=False, remove_stopwords=True, remove_punctuation=True, set_lower=True, remove_num=True, expand_stopwords=True, language_detection=False, allowed_languages=frozenset({'en'}), no_processes=None): """ :param list_of_docs: List of strings where every document is one string. :param document_ids: The ids of the documents, matching the order of the list_of_docs :param language_model: Spacy language model to be used for text preprocessing :param tagger: Use spacy part-of-speech tagger. :param parser: Use spacy to annotate syntactic dependencies in documents. :param ner: Use spacy for entity recognition and annotation. :param categorization: Use spacy to assign document labels :param remove_stopwords: Remove stop words during text preprocessing. :param remove_punctuation: Remove punctuation during text prssing. :param set_lower: Convert all strings to lowercase during text preprocessing. :param remove_num: Remove numeric characters during text preprocessing. :param expand_stopwords: Remove non-alpha-characters in stop words and add them to the stop words. :param language_detection: Detect language of docs. :param allowed_languages: Allowed language for the documents. """ self.path = path self.pipe_disable = [] if not tagger: self.pipe_disable.append("tagger") if not parser: self.pipe_disable.append("parser") if not ner: self.pipe_disable.append("ner") if not categorization: self.pipe_disable.append("textcat") self.remove_punctuation = remove_punctuation self.remove_stop_words = remove_stopwords self.remove_num = remove_num self.set_lower = set_lower self.input_docs = list_of_docs self.document_ids = np.array(document_ids) self.use_gpu = spacy.prefer_gpu() self.nlp = spacy.load(language_model) if expand_stopwords: stops = [stop for stop in self.nlp.Defaults.stop_words] for stop in stops: self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop)) self.spacy_docs = None self.preprocessed_docs = None self.bag_of_words = None self.preprocessing_batch_size = 50000 if no_processes is None: self.processes = psutil.cpu_count(logical=False) - 1 else: self.processes = no_processes self.lda_model = None self.result_df = None self.word_topic_df = None self.allowed_languages = allowed_languages self.language_detection = language_detection self.id2word = None self.coherence_dict = None self.max_df = None self.min_df = None self.use_phrases = None self.filter_extremes_value = None self.keep_n = None self.keep_tokens = None def enable_pipe_component(self, component): """ Method to enable components of the spacy pipeline after initialization of the class. :param component: Component to enable (see https://spacy.io/usage/processing-pipelines/ for available components). """ if component in self.pipe_disable: self.pipe_disable.remove(component) def disable_pipe_component(self, component): """ Method to disable components of the spacy pipeline after initialization of the class. :param component: Component to disable (see https://spacy.io/usage/processing-pipelines/ for available components). """ if component not in self.pipe_disable: self.pipe_disable.append(component) def preprocess_spacy(self, load_existing=True, save_data=True, filter_loaded=None): """ Method to preprocess the documents using spacy with the enabled pipeline components. 
""" if os.path.exists( f"{self.path}text_df_preprocessed_spacy") and load_existing: preprocessed_df = pd.read_pickle( f"{self.path}text_df_preprocessed_spacy") if filter_loaded is None: self.spacy_docs = preprocessed_df['preprocessed_text'].to_list( ) else: self.spacy_docs = preprocessed_df['preprocessed_text'].loc[ filter_loaded].to_list() else: if self.language_detection: self.spacy_docs = [ doc for doc in tqdm(self.nlp.pipe( self.input_docs, disable=self.pipe_disable, n_process=self.processes, batch_size=self.preprocessing_batch_size), desc="Preprocessing text with spacy: ") if detect(doc.text) in self.allowed_languages ] else: self.spacy_docs = [] for doc in tqdm(self.nlp.pipe( self.input_docs, disable=self.pipe_disable, n_process=self.processes, batch_size=self.preprocessing_batch_size), desc="Preprocessing spacy"): self.spacy_docs.append(doc) if save_data: temp_df = pd.DataFrame([self.document_ids, self.spacy_docs]).transpose() temp_df.columns = ['thread_id', 'preprocessed_text'] temp_df.to_pickle(f"{self.path}text_df_preprocessed_spacy") def preprocess(self, load_existing=True, filter_loaded=None): """ Remove stop words, numbers and punctation as well as lower case all of the tokens, depending on the settings passed to the class during initialization. """ if os.path.exists( f"{self.path}/text_df_preprocessed") and load_existing: print("Found preprocessed data. Loading") preprocessed_df = pd.read_pickle( f"{self.path}/text_df_preprocessed") if filter_loaded is None: self.preprocessed_docs = preprocessed_df[ 'preprocessed_text'].to_list() print('Preprocessed data loaded.') else: self.preprocessed_docs = preprocessed_df[ 'preprocessed_text'].loc[filter_loaded].to_list() if isinstance(self.document_ids, np.ndarray): self.document_ids = self.document_ids[filter_loaded] print( f'{sum(filter_loaded)} preprocessed docs of {len(self.input_docs)} docs loaded.' ) else: self.preprocessed_docs = [] if not self.spacy_docs: self.preprocess_spacy() for spacy_doc in tqdm( self.spacy_docs, desc="Removing stop words/punctuation/numeric chars: "): doc = [] for token in spacy_doc: # todo: check if useful condition if not self.remove_stop_words and token.is_stop: word = token.text elif token.is_stop: continue else: word = token.lemma_ if self.set_lower: word = word.lower() if self.remove_num: word = re.sub(r"[\d]", "", word) if self.remove_punctuation: word = re.sub(r"[\W]", "", word) if len(word) >= 2 and word != "wbr": doc.append(word) self.preprocessed_docs.append(doc) temp_df = pd.DataFrame([self.document_ids, self.preprocessed_docs]).\ transpose() temp_df.columns = ['thread_id', 'preprocessed_text'] temp_df.to_pickle(f"{self.path}/text_df_preprocessed") def create_bag_of_words(self, filter_extremes=True, min_df=5, max_df=0.5, keep_n=100000, keep_tokens=None, use_phrases=None, bigram_min_count=1000, bigram_threshold=100, trigram_threshold=100, load_existing=True, tfidf=False): """ :param filter_extremes: En-/Disable filtering of tokens that occur too frequent/not frequent enough (https://radimrehurek.com/gensim/corpora/dictionary.html) :param min_df: Keep only tokens that appear in at least n documents (see link above) :param max_df: Keep only tokens that appear in less than the fraction of documents (see link above) :param keep_n: Keep only n most frequent tokens (see link above) :param keep_tokens: Iterable of tokens not to be remove (see link above) :param use_phrases: Set to bigram or trigram if the use of Gensmin Phrases (https://radimrehurek.com/gensim/models/phrases.html) is wanted. 
Will create bigrams/trigrams of frequently co-occuring words (e.g. "new", "york" => "new_yor)k"). :param bigram_min_count: Minimum occurrence of bigrams to be considered by Gensmin Phrases. :param bigram_threshold: Threshold for Gensim Phrases bigram settings. :param trigram_threshold: Threshold for Gensim Phrases trigram settings. """ if use_phrases not in {None, "bigram", "trigram"}: raise Exception( "Please use valid option (None, 'bigram' or 'trigram) to make use of this function." ) #todo: check logic else: if use_phrases == "bigram" and not isinstance( bigram_threshold, int) and not isinstance( bigram_min_count, int): raise Exception( "Thresholds or minimum count for bigrams/trigrams not integer. Please provide " "threshold and minimum count for bigrams (and trigrams) as integer." ) elif use_phrases == "trigram" and not isinstance(bigram_threshold, int) \ or not isinstance(trigram_threshold, int) or not isinstance(bigram_min_count, int): raise Exception( "Thresholds or minimum count for bigrams/trigrams not integer. Please provide " "threshold and minimum count for bigrams (and trigrams) as integer." ) if not self.preprocessed_docs: self.preprocess() if os.path.exists(f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}") \ and load_existing: self.load_dict( path= f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}" ) self.filter_extremes_value = filter_extremes self.min_df = min_df self.max_df = max_df self.use_phrases = use_phrases else: #todo: add auto check for existing dictionary here. if use_phrases == "bigram" or use_phrases == "trigram": self.create_bigrams(bigram_min_count=bigram_min_count, bigram_threshold=bigram_threshold) if use_phrases == "trigram": self.create_bigrams(bigram_min_count=bigram_min_count, bigram_threshold=bigram_threshold) self.create_trigrams(trigram_threshold=trigram_threshold) self.create_dictionary(filter_extremes=filter_extremes, min_df=min_df, max_df=max_df, keep_n=keep_n, keep_tokens=keep_tokens, use_phrases=use_phrases) self.create_bag_of_words_matrix(tfidf=tfidf) def create_bigrams(self, bigram_min_count, bigram_threshold): self.bigram_phrases = Phrases(self.preprocessed_docs, min_count=bigram_min_count, threshold=bigram_threshold) self.bigram_phraser = Phraser(self.bigram_phrases) self.preprocessed_docs = [ self.bigram_phraser[doc] for doc in tqdm(self.preprocessed_docs, desc="Extracting bigrams") ] def create_trigrams(self, trigram_threshold): trigram_phrases = Phrases(self.bigram_phrases[self.preprocessed_docs], threshold=trigram_threshold) trigram_phraser = Phraser(trigram_phrases) self.preprocessed_docs = [ trigram_phraser[self.bigram_phraser[doc]] for doc in tqdm(self.preprocessed_docs, desc="Extracting trigrams") ] def create_bag_of_words_matrix(self, tfidf=False): self.bag_of_words = [ self.id2word.doc2bow(doc) for doc in tqdm(self.preprocessed_docs, desc='Creating bag of words') ] if tfidf: self.create_tfidf() def create_dictionary(self, filter_extremes, min_df, max_df, keep_n, keep_tokens, use_phrases): print('Creating dictionary.') self.id2word = corpora.Dictionary(self.preprocessed_docs) # todo: add autosave of dictionary here self.max_df = max_df self.min_df = min_df self.use_phrases = use_phrases self.filter_extremes_value = filter_extremes self.keep_n = keep_n self.keep_tokens = keep_tokens if filter_extremes: self.filter_extremes(min_df=self.min_df, max_df=self.max_df, keep_n=self.keep_n, keep_tokens=self.keep_tokens) self.save_dict( path= 
f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}" ) def filter_extremes(self, min_df, max_df, keep_n, keep_tokens=[]): self.filter_extremes_value = True self.max_df = max_df self.min_df = min_df self.keep_n = keep_n self.keep_tokens = keep_tokens self.id2word.filter_extremes(no_below=self.min_df, no_above=self.max_df, keep_n=keep_n, keep_tokens=keep_tokens) def create_tfidf(self): tfidf_model = TfidfModel(self.bag_of_words) self.bag_of_words = [ tfidf_model[vector] for vector in tqdm(self.bag_of_words, desc="Creating tf-idf matrix") ] def create_lda_model(self, no_topics=10, random_state=42, passes=5, alpha='auto', eta=None, workers=None, chunksize=2000): """ :param no_topics: Number of topics that are to be explored by lda model :param random_state: Random state for reproducible results (default 42, gensim default is None) :param passes: Number of times the whole corpus is processed. :param alpha: set topic-document distribution prior alpha to "symmetric" or "asymmetric" (gensim default is "symmetric") :param eta: Word-topic distribution prior eta (beta) :param workers: number of workers to use. Defaulting to one as there seems to be a bug in gensim. 1 already uses all available cores. Higher number of workers results in a load bigger than the number of cores. :param chunksize: chunsize parameter of gensim """ if eta is None: eta = 1 / no_topics if workers is None: workers = self.processes if self.bag_of_words is None: self.create_bag_of_words() self.lda_model = LdaMulticore(corpus=self.bag_of_words, id2word=self.id2word, num_topics=no_topics, eta=eta, workers=workers, random_state=random_state, alpha=alpha, passes=passes, chunksize=chunksize) def create_mallet_lda_model(self, no_topics, random_state=42, workers=None, mallet_path="mallet-2.0.8/bin/mallet", iterations=1000, custom_prefix=None): """ Method to create a mallet lda model using gensim wrapper for lda mallet :param no_topics: Number of topics for lda model :param random_state: Random state to be able to reprocude model creation :param workers: Number of workers to use :param mallet_path: path to mallet binary, e.g. "mallet-2.0.8/bin/mallet" :param iterations: iterations over the corpus?! """ if workers is None: workers = self.processes if self.bag_of_words is None: self.create_bag_of_words() if custom_prefix is None: prefix = f"{self.path}mallet_temp_" else: prefix = f"{self.path}mallet_temp_{custom_prefix}_" self.lda_model = LdaMallet(num_topics=no_topics, mallet_path=mallet_path, corpus=self.bag_of_words, id2word=self.id2word, random_seed=random_state, iterations=iterations, workers=workers, prefix=prefix) def calculate_coherence(self, model=None, coherence_score='c_v', workers=None): """ Method to calculate the coherence score of a given lda model. The model can either be provided or will be taken from the class. :param model: Model to use instead of the model saved within the class. :param coherence_score: Coherence score to calculate :param workers: Number of workers to use for coherence evaluation. :return: Return coherence model, which also contains the coherence score of a model. 
""" if workers is None: workers = self.processes if model is None: model = self.lda_model else: model = model if coherence_score != 'u_mass': coherence_model = CoherenceModel(model=model, texts=self.preprocessed_docs, dictionary=self.id2word, coherence=coherence_score, processes=workers) else: coherence_model = CoherenceModel(model=model, corpus=self.bag_of_words, dictionary=self.id2word, coherence=coherence_score, processes=workers) return coherence_model def search_best_model(self, topic_list=frozenset({2, 3, 4, 5, 10, 15, 20, 25}), alphas=[0.9, 0.5, 0.1], etas=['auto', 0.9, 0.5, 0.1], save_best_model=True, save_models=False, return_best_model=False, passes=1, coherence_scores=['c_v'], chunksize=2000, workers=None, coherence_suffix=None): #todo: save best model within class. """ Method to search for the best lda model for a given number of topics. The best model will be determined by its coherence score. :param topic_list: Iterable of integers of topics to test the coherence score for. :param alphas: Iterable of floats between 0 and 1 for determining the dirichlet prior of the lda model. :param save_best_model: Set to true if the best model has to be saved within the class. :param save_models: If set to false (default) only the coherence score for each combination of numbers of topics and alphas will be saved. If set to true, the lda model, the coherence score and the coherence model will be saved. :param return_best_model: If true, the method will return the best found model and the number of topics of this model. :return: Number of topics for the best result and the model with the best result of the coherence score """ if coherence_suffix is None: path = f"{self.path}coherence_results" else: path = f"{self.path}coherence_results_{coherence_suffix}" if os.path.exists(path): print("coherence results found") with open(path, "rb") as f: self.coherence_dict = pickle.load(f) else: self.coherence_dict = {} if workers is None: workers = self.processes if return_best_model and not save_best_model: raise Exception( "To return the best model, the parameter save_best_model has to be set to True." 
) if self.coherence_dict and save_best_model: try: best_score = self.coherence_dict['best_score'] except: best_score = 0 else: best_score = 0 for no_topics in tqdm(topic_list, desc="Calculating topic coherences: "): for alpha in tqdm(alphas, desc='Alphas'): for eta in tqdm(etas, desc='Etas'): coherence_key = f"no={no_topics}-a={alpha}-e={eta}-filter={self.filter_extremes_value}" \ f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \ f"-k_n={self.keep_n}-k_t={self.keep_tokens}" if coherence_key in self.coherence_dict.keys(): print("coherence value found, skipping") continue else: self.create_lda_model(no_topics=no_topics, alpha=alpha, eta=eta, passes=passes, chunksize=chunksize, workers=workers) self.coherence_dict[coherence_key] = {} if save_models: self.coherence_dict[coherence_key][ "lda_model"] = self.lda_model for coherence_score in coherence_scores: coherence_model = self.calculate_coherence( coherence_score=coherence_score, workers=workers) coherence_result = coherence_model.get_coherence() if save_models: self.coherence_dict[coherence_key][ "coherence_model"] = coherence_model self.coherence_dict[coherence_key][ coherence_score] = coherence_result if save_best_model and coherence_result > best_score: self.coherence_dict[ "best_score"] = coherence_result self.coherence_dict[ "best_model"] = self.lda_model self.coherence_dict[ "best_topic_no"] = no_topics self.coherence_dict["best_alpha"] = alpha self.coherence_dict["best_eta"] = eta if coherence_result > best_score: best_score = coherence_result with open(path, "wb") as f: pickle.dump(self.coherence_dict, f) if return_best_model: #returns number of topics and the lda_model return self.coherence_dict["best_topic_no"], self.coherence_dict[ "best_model"] def search_best_model_mallet(self, topic_list=frozenset( {2, 3, 4, 5, 10, 15, 20, 25}), save_best_model=True, save_models=False, return_best_model=False, coherence_scores=['c_v'], workers=None, coherence_workers=None, coherence_suffix=None, random_state=42, mallet_path="mallet-2.0.8/bin/mallet", iterations=1000): """ :param topic_list: :param save_best_model: :param save_models: :param return_best_model: :param coherence_scores: :param workers: :param coherence_suffix: :param random_state: :param mallet_path: :param iterations: :return: """ if coherence_suffix is None: path = f"{self.path}coherence_results_mallet" else: path = f"{self.path}coherence_results_mallet_{coherence_suffix}" if os.path.exists(path): print("coherence results found") with open(path, "rb") as f: self.coherence_dict = pickle.load(f) else: self.coherence_dict = {} if workers is None: workers = self.processes if coherence_workers is None: coherence_workers = self.processes if return_best_model and not save_best_model: raise Exception( "To return the best model, the parameter save_best_model has to be set to True." 
) if self.coherence_dict and save_best_model: try: best_score = self.coherence_dict['best_score'] except: best_score = 0 else: best_score = 0 for no_topics in tqdm(topic_list, desc="Calculating topic coherences: "): coherence_key = f"mallet-no={no_topics}-filter={self.filter_extremes_value}" \ f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \ f"-k_n={self.keep_n}-k_t={self.keep_tokens}" if coherence_key in self.coherence_dict.keys(): print("coherence value found, skipping") continue else: self.create_mallet_lda_model(no_topics=no_topics, workers=workers, random_state=random_state, mallet_path=mallet_path, iterations=iterations) self.coherence_dict[coherence_key] = {} if save_models: self.coherence_dict[coherence_key][ "lda_model"] = self.lda_model for coherence_score in coherence_scores: coherence_model = self.calculate_coherence( coherence_score=coherence_score, workers=coherence_workers) coherence_result = coherence_model.get_coherence() if save_models: self.coherence_dict[coherence_key][ "coherence_model"] = coherence_model self.coherence_dict[coherence_key][ coherence_score] = coherence_result if save_best_model and coherence_result > best_score: self.coherence_dict["best_score"] = coherence_result self.coherence_dict["best_model"] = self.lda_model self.coherence_dict["best_topic_no"] = no_topics self.coherence_dict[ "best_alpha"] = self.lda_model.alpha if coherence_result > best_score: best_score = coherence_result with open(path, "wb") as f: pickle.dump(self.coherence_dict, f) if return_best_model: #returns number of topics and the lda_model return self.coherence_dict["best_topic_no"], self.coherence_dict[ "best_model"] def create_document_topic_df(self, model=None, no_topics=10): """ Creates a dataframe containing the the result of the LDA model for each document. Will set the topic with the highest share within the document as the dominant topic. :param model: LDA model to use for the calculation of the topic distribution of each document. :param no_topics: Number of topics in case no LDA model is provided. """ if model is None: model = self.lda_model if isinstance(model, LdaMallet): model = malletmodel2ldamodel(model) topic_result_list = [] for doc in model.get_document_topics(bow=self.bag_of_words): temp_dict = {} for topic, probability in doc: temp_dict[topic] = probability topic_result_list.append(temp_dict) self.result_df = pd.DataFrame(data=topic_result_list, columns=range(model.num_topics)) self.result_df = self.result_df.fillna(0) if self.document_ids is not None and not self.language_detection: self.result_df.index = self.document_ids elif self.document_ids is not None and self.language_detection: raise Warning( "Using document ids and language detection together is not implemented (yet)." 
) dominant_topic = np.argmax(self.result_df.values, axis=1) self.result_df['dominant_topic'] = dominant_topic def plot_document_topic_distribution(self): #todo: log normalize if self.result_df is None: raise Exception( "Please create the topic distribution dataframe using the 'create_document_topic_df' " "method") counter = Counter(self.result_df.dominant_topic) topic_dict = OrderedDict( sorted(counter.items(), key=lambda x: x[1], reverse=True)) plt.figure(figsize=(10, 6)) g = sns.barplot(x=list(topic_dict.values()), y=list(topic_dict.keys()), order=list(topic_dict.keys()), orient='h') g.set_ylabel("topic number") g.set_xlabel("count") plt.show() def evaluate_model(self, no_words=30): #todo: update 4 gensim keywords = np.array(self.vectorizer.get_feature_names()) topic_keywords = [] for topic_weights in self.lda_model.components_: top_keyword_locations = (-topic_weights).argsort()[:no_words] topic_keywords.append(keywords.take(top_keyword_locations)) self.word_topic_df = pd.DataFrame( topic_keywords, columns=[f"word_{x}" for x in range(no_words)]) def evaluate_pyldavis(self, model=None, use_jupyter=None): """ Method for a visual evaluation of the LDA topic model using pyldavis. :param model: LDA model that is to be evaluated. If 'None', it will use the last model that has been saved within the class. :param use_jupyter: set how the pyldavis panel is displayed. If default (None), it will try to find out if run from jupyter and set the method accordingly :return: """ if model is None: if self.lda_model is None: raise Exception( "Please create a LDA model for evaluation before running this method." ) model = self.lda_model if isinstance(model, LdaMallet): model = malletmodel2ldamodel(model) panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word) if use_jupyter is None: try: is_jupyter = os.environ['_'].split( "/")[-1] == "jupyter-notebook" if is_jupyter: pyLDAvis.enable_notebook() except KeyError: is_jupyter = False if is_jupyter: pyLDAvis.display(panel) else: pyLDAvis.show(panel) else: if use_jupyter: pyLDAvis.enable_notebook() pyLDAvis.display(panel) elif not use_jupyter: pyLDAvis.show(panel) def print_bow(self, doc_positions): print([[(self.id2word[token_id], freq) for token_id, freq in doc] for doc in compress(self.bag_of_words, doc_positions)]) def save_model(self, path): self.lda_model.save(path) def load_model(self, path): self.lda_model = LdaMulticore.load(path) def save_dict(self, path): self.id2word.save(path) print("dict saved") def load_dict(self, path): self.id2word = corpora.Dictionary.load(path)
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from gensim.models import CoherenceModel

path_to_mallet_binary = 'd:/mallet-2.0.8/bin/mallet'
output_path = 'd:/code/gc_text_analysis/mallet_output/'

num_topics = 140
model = LdaMallet(path_to_mallet_binary, corpus=bow_docs, workers=4, iterations=2000,
                  num_topics=num_topics, id2word=dictionary, prefix=output_path)
model.save('gc_lda_model.pkl')

dictionary.id2token = dict((v, k) for k, v in dictionary.token2id.items())
words_freq = [(dictionary.id2token[id], cnt) for id, cnt in dictionary.dfs.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])

coherence_model_lda = CoherenceModel(model=model, texts=ngram_docs,
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

topics = model.show_topics(num_topics=num_topics, num_words=10,