def model(self, topic_num_best: int = None, topic_num_list: List[int] = range(2, 22, 2)):
    docs = self.docs
    time_slice = self.time_slice
    pkuseg = PKUSegment()
    docs_segmented = list()
    word_segment_list = list()
    tag_segment_list = list()
    time_slice_segmented = list()
    time_doc_count_accumulate = 0
    for time_doc_count in time_slice:
        doc_list_part, word_segment_list_part, tag_segment_list_part = pkuseg.segment_docs(
            docs[time_doc_count_accumulate: time_doc_count_accumulate + time_doc_count],
            include_tag_list=['a', 'ad', 'j', 'l', 'n', 'ns', 'nt', 'nz', 'v', 'vd', 'vn'],
            min_length=2
        )
        docs_segmented.extend(doc_list_part)
        word_segment_list.extend(word_segment_list_part)
        tag_segment_list.extend(tag_segment_list_part)
        time_slice_segmented.append(len(word_segment_list_part))
        time_doc_count_accumulate += time_doc_count
    dictionary, corpus = word_segment_list_to_dictionary_corpus(word_segment_list)
    self.dictionary = dictionary
    self.corpus = corpus
    self.word_segment_list = word_segment_list
    self.tag_segment_list = tag_segment_list
    self.docs = docs_segmented
    lda_model = LdaModelSLab('中共', docs_segmented)
    lda_model.word_segment_list = word_segment_list
    lda_model.corpus = corpus
    lda_model.dictionary = dictionary
    # Determine the best number of topics if none was given
    if topic_num_best is None:
        coherence_list, coherence_best, model_best, topic_num_best = \
            lda_model.select_best_topic_num(topic_num_list)
    # Train the model
    self.dtm_model = DtmModel('dtm-win64.exe', corpus, time_slice_segmented,
                              num_topics=topic_num_best,
                              id2word=dictionary,
                              initialize_lda=True,
                              lda_sequence_min_iter=30,
                              lda_sequence_max_iter=100,
                              lda_max_em_iter=50)
    # Map each document to its dominant topic
    self.topic_index_list = np.argmax(self.dtm_model.gamma_, axis=1)
    self.topic_num = topic_num_best
    df = pd.DataFrame({'doc': docs_segmented, 'topic': self.topic_index_list})
    self.df = df
    return df
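# The helper word_segment_list_to_dictionary_corpus() is not shown in this
# snippet; a minimal sketch of what it presumably does, assuming the standard
# gensim Dictionary/doc2bow pipeline (name and behavior inferred, not the
# original implementation):
from typing import List
from gensim import corpora

def word_segment_list_to_dictionary_corpus(word_segment_list: List[List[str]]):
    # Build an id <-> token mapping from the tokenized documents ...
    dictionary = corpora.Dictionary(word_segment_list)
    # ... and convert each document to bag-of-words (id, count) pairs
    corpus = [dictionary.doc2bow(words) for words in word_segment_list]
    return dictionary, corpus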
def dtm_draw_topic(dtm_model: DtmModel, topic_index, time_num=None, topn=10):
    # Auto-detect the number of time slices by probing until show_topic fails
    if time_num is None:
        time_num = 0
        while True:
            try:
                dtm_model.show_topic(topic_index, time_num, topn)
                time_num += 1
            except Exception:
                break
    x = range(time_num)
    # Collect the top keywords across all time slices
    word_set = set()
    for time_index in range(time_num):
        for prob, word in dtm_model.show_topic(topic_index, time_index, topn):
            word_set.add(word)
    word_stat = {word: [] for word in word_set}
    # For each time slice, look up each keyword's probability
    max_prob = 0
    for time_index in range(time_num):
        try:
            word_dict = {
                word: prob
                for prob, word in dtm_model.show_topic(topic_index, time_index, topn)
            }
        except Exception:
            break
        for word in word_set:
            if word in word_dict:
                word_stat[word].append(word_dict[word])
                if word_dict[word] > max_prob:
                    max_prob = word_dict[word]
            else:
                word_stat[word].append(0)
    # Plot one subplot per keyword
    subplot_num = len(word_stat)
    subplot_col = 4
    subplot_row = math.ceil(float(subplot_num) / subplot_col)
    plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
    for word_index, (word, prob_list) in enumerate(word_stat.items()):
        plt.subplot(subplot_row, subplot_col, word_index + 1)
        plt.plot(x, prob_list, label=word)
        plt.ylim(0, max_prob)
        plt.legend()
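# The probing loop in dtm_draw_topic() can be avoided when the number of time
# slices is needed: the gensim DtmModel wrapper keeps the time_slices it was
# trained with (the main(args) snippet further below relies on this), so a
# minimal alternative is:
def dtm_time_slice_count(dtm_model: DtmModel) -> int:
    # time_slices holds one document count per slice; its length is the
    # number of slices, so no try/except probing is needed.
    return len(dtm_model.time_slices)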
def dtm(dtm_path, corpus, dictionary, time_slices, num_topics=40, load=False):
    # dtm_path should point to your local binary of Blei's DTM
    print("Running DTM")
    if load:
        return DtmModel.load('DTM')
    model = DtmModel(dtm_path, corpus, time_slices, num_topics=num_topics,
                     id2word=dictionary, initialize_lda=True)
    model.save("DTM")
    return model
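# A minimal usage sketch for dtm() above; the tokenized corpus is a toy
# stand-in and the binary path is an assumed location of Blei's compiled DTM:
from gensim import corpora

texts = [['solar', 'panel'], ['wind', 'turbine'], ['solar', 'cell'], ['wind', 'farm']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
# Two time slices of two documents each; trains, saves to "DTM", and returns
model = dtm('/usr/local/bin/dtm-linux64', corpus, dictionary, [2, 2], num_topics=2)
# Later calls can reload the saved model instead of retraining
model = dtm('/usr/local/bin/dtm-linux64', corpus, dictionary, [2, 2], load=True)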
def dtm(time_query_list, topic_num):
    '''
    Train a DTM model from a list of query time ranges.
    Returns the word lists, dictionary, corpus, and model.
    :param time_query_list:
    :param topic_num:
    :return:
    '''
    words_slice = []
    total_words_ls = []
    for time_query in time_query_list:
        words_ls, _, _ = segment_comment(query_comment(*time_query))
        words_slice.append(len(words_ls))
        total_words_ls.extend(words_ls)
    dictionary, corpus = dict_corpus_comment(total_words_ls)
    dtm_model = DtmModel('dtm-win64.exe', corpus, words_slice,
                         num_topics=topic_num,
                         id2word=dictionary,
                         initialize_lda=True,
                         lda_sequence_min_iter=30,
                         lda_sequence_max_iter=100,
                         lda_max_em_iter=50)
    return total_words_ls, dictionary, corpus, dtm_model
def dtm_model(dtm_path, corpus=None, time_seq=None, num_topics=10, id2word=None,
              alpha=0.01, rng_seed=0, model='fixed'):
    """
    :param dtm_path: path to dtm wrapper, see: https://github.com/blei-lab/dtm
    :param corpus: documents in bag-of-words format
    :param time_seq: pre-defined timestamps
    :param num_topics: number of topics
    :param id2word: mapping between token ids and words from corpus
    :param alpha: hyperparameter of the Dirichlet distribution that affects the document-topic sparsity
    :param rng_seed: random seed
    :param model: 'fixed' if document influence is needed, 'dtm' otherwise
    :return: dtm model trained on the available corpus
    """
    print("initializing the model...")
    model = DtmModel(dtm_path=dtm_path, corpus=corpus, time_slices=time_seq,
                     num_topics=num_topics, id2word=id2word, alpha=alpha,
                     rng_seed=rng_seed,
                     model=model)  # 'fixed' enables the document influence variant
    print('DTM model loaded')
    return model
def run_dtm(args, corpus, dictionary, time_slices, pre):
    """
    Function to run DTM over corpus.

    input:
        args (argparse object): input arguments
        corpus: corpus to run LDA over
        dictionary: dictionary from corpus
        time_slices (list): list containing number of files per time slice
        pre (str): path to save all results to

    returns DTM model
    """
    DTM_PATH = os.environ.get('DTM_PATH', None)
    if not DTM_PATH:
        raise ValueError("You need to set the DTM path.")
    # Run the model
    model = DtmModel(DTM_PATH, corpus=corpus, num_topics=args.num_topics,
                     id2word=dictionary, time_slices=time_slices, prefix=pre,
                     lda_sequence_max_iter=args.num_iterations)
    return model
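# A usage sketch for run_dtm(); the environment variable value, the toy data,
# and the argparse fields are assumptions for illustration:
import os
from argparse import Namespace
from gensim import corpora

os.environ['DTM_PATH'] = '/usr/local/bin/dtm-linux64'
texts = [['gene', 'expression'], ['protein', 'binding'],
         ['gene', 'sequence'], ['cell', 'protein']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
args = Namespace(num_topics=2, num_iterations=20)
model = run_dtm(args, corpus, dictionary, time_slices=[2, 2], pre='./dtm_out/')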
def create_model(self):
    self.model = DtmModel(self.dtm_path, self.corpus, self.time_seq,
                          num_topics=15,
                          id2word=self.corpus.dictionary,
                          initialize_lda=True)
    return self.model
def dtm_print_topic_all_time(dtm_model: DtmModel, topic_index, topn=10):
    # Print the topic for every time slice until the index runs out of range
    time_index = 0
    while True:
        try:
            msg = dtm_model.print_topic(topic_index, time_index, topn)
            print(msg)
        except Exception:
            return
        time_index += 1
def main(args):
    if args.model_type == "lda":
        loaded_model = LdaModel.load(args.model)
        for topic_num, topic in enumerate(loaded_model.show_topics(num_topics=-1)):
            topic_num, topic_str = topic
            print(str(topic_num) + ':', end=' ')
            for term in topic_str.split(' + '):
                weight, word = term.split('*')
                if args.model_type == "dtm":  # never true inside this branch
                    word = "\"" + word + "\""
                print(word, end=' ')
            print()
    elif args.model_type == "dtm":
        loaded_model = DtmModel.load(args.model)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slices)):
                top_words = loaded_model.show_topic(topic_id, time, topn=10)
                print("Topic", str(topic_id) + ", time slice", str(time) + ':', end=' ')
                for weight, word in top_words:
                    print(word, end=', ')
                print()
            print()
    elif args.model_type == "ldaseq":
        loaded_model = LdaSeqModel.load(args.model)
        # maybe use dtm_coherence?
        print(loaded_model.num_topics)
        print(loaded_model.time_slice)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slice)):
                top_words = loaded_model.print_topic(topic=topic_id, time=time, top_terms=20)
                print("Topic", str(topic_id) + ", time slice", str(time) + ':', end=' ')
                for word, weight in top_words:
                    print(word, end=' ')
                print()
            print()
    else:
        print("Unknown model type provided: " + args.model_type)
        sys.exit(1)
def initialize_model(self):
    mydict = corpora.Dictionary()
    # Build the dictionary on the fly while converting documents to bag-of-words
    mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in self.flat_list]
    start = time.time()
    model = DtmModel(dtm_path, mycorpus, self.time_slices,
                     num_topics=num_topics,
                     id2word=mydict,
                     initialize_lda=True,
                     top_chain_var=0.05,
                     lda_sequence_min_iter=15,
                     lda_sequence_max_iter=50)
    print(time.time() - start)
    return model, mycorpus, mydict
def create_dtm_encoding(corpus, vector_size, dictionary, slices):
    path = './external/dtm_bin/'
    link = 'https://github.com/magsilva/dtm/tree/master/bin'
    content = [
        f for f in os.listdir(path)
        if os.path.isfile(os.path.join(path, f)) and 'dtm' in f
    ]
    if len(content) != 1:
        print("Please place the appropriate binary file (and only this one) "
              "from {} into '{}'.".format(link, path))
        sys.exit(1)
    mod_path = path + content[0]
    dictionary.filter_extremes(keep_n=5000)
    bow_corpus = [dictionary.doc2bow(x) for x in corpus]
    mod = DtmModel(mod_path, corpus=bow_corpus, id2word=dictionary,
                   time_slices=slices, num_topics=vector_size)
    return mod.gamma_, mod
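# A usage sketch for create_dtm_encoding(); data and sizes are toy assumptions.
# gamma_ has one row per document and one column per topic, so it doubles as a
# dense document encoding of dimension vector_size:
from gensim import corpora

texts = [['market', 'stock'], ['stock', 'price'], ['rain', 'storm'], ['storm', 'wind']]
dictionary = corpora.Dictionary(texts)
gamma, model = create_dtm_encoding(texts, vector_size=2, dictionary=dictionary, slices=[2, 2])
print(gamma.shape)  # (4 documents, 2 topics)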
class DtmlModelSLab():
    def __init__(self, namespace: str, docs: List[str], time_slice: List[int]):
        self.namespace = namespace
        Path(namespace).mkdir(exist_ok=True, parents=True)
        self.docs = docs
        self.time_slice = time_slice
        self.dictionary = None
        self.corpus = None
        self.topic_num = None
        self.topic_index_list = None
        self.dtm_model = None

    def model(self, topic_num_best: int = None, topic_num_list: List[int] = range(2, 22, 2)):
        pkuseg = PKUSegment()
        docs_segmented = list()
        word_segment_list = list()
        tag_segment_list = list()
        time_slice_segmented = list()
        time_doc_count_accumulate = 0
        for time_doc_count in self.time_slice:
            doc_list_part, word_segment_list_part, tag_segment_list_part = pkuseg.segment_docs(
                self.docs[time_doc_count_accumulate:time_doc_count_accumulate + time_doc_count],
                include_tag_list=[
                    'a', 'ad', 'j', 'l', 'n', 'ns', 'nt', 'nz', 'v', 'vd', 'vn'
                ],
                min_length=2)
            docs_segmented.extend(doc_list_part)
            word_segment_list.extend(word_segment_list_part)
            tag_segment_list.extend(tag_segment_list_part)
            time_slice_segmented.append(len(word_segment_list_part))
            time_doc_count_accumulate += time_doc_count
        dictionary, corpus = word_segment_list_to_dictionary_corpus(word_segment_list)
        self.dictionary = dictionary
        self.corpus = corpus
        self.word_segment_list = word_segment_list
        self.tag_segment_list = tag_segment_list
        self.docs = docs_segmented
        self.time_slice = time_slice_segmented
        lda_model = LdaModelSLab(self.namespace, docs_segmented)
        lda_model.word_segment_list = word_segment_list
        lda_model.corpus = corpus
        lda_model.dictionary = dictionary
        # Determine the best number of topics if none was given
        if topic_num_best is None:
            coherence_list, coherence_best, model_best, topic_num_best = \
                lda_model.select_best_topic_num(topic_num_list)
        self.topic_num = topic_num_best
        # Train the model
        self.dtm_model = DtmModel('dtm-win64.exe', corpus, time_slice_segmented,
                                  num_topics=topic_num_best,
                                  id2word=dictionary,
                                  initialize_lda=True,
                                  lda_sequence_min_iter=30,
                                  lda_sequence_max_iter=100,
                                  lda_max_em_iter=50)
        # Map each document to its dominant topic
        self.topic_index_list = np.argmax(self.dtm_model.gamma_, axis=1)
        df = pd.DataFrame({
            'doc': docs_segmented,
            'topic': self.topic_index_list
        })
        self.df = df
        return df

    def save(self):
        pickle_to_file(self, f'{self.namespace}/dtm_slab.pkl')
        # self.dtm_model.save(f'{self.namespace}/dtm_{self.topic_num}.model')
        # pickle_to_file(self.docs, f'{self.namespace}/docs.pkl')
        # pickle_to_file(self.df, f'{self.namespace}/dtm_df.pkl')

    @classmethod
    def load(cls, namespace: str):
        # docs = unpickle_from_file(f'{namespace}/docs.pkl')
        # instance = cls(namespace, docs)
        # instance.df = unpickle_from_file(f'{namespace}/dtm_df.pkl')
        instance = unpickle_from_file(f'{namespace}/dtm_slab.pkl')
        return instance

    def draw_topics(self, topn=10):
        for topic_index in range(self.topic_num):
            self.draw_topic(topic_index, topn)
        # Document count per topic
        df_topic = pd.DataFrame(np.argmax(self.dtm_model.gamma_, axis=1), columns=['topic'])
        df_g = df_topic.groupby('topic').size()
        # Box plot of the per-topic document counts (a Series has no .boxplot(),
        # so plot with kind='box')
        df_g.plot(kind='box')
        plt.savefig(f'{self.namespace}/dtm_topic_num.png')

    def draw_topic(self, topic_index: int, topn=10):
        time_length = len(self.time_slice)
        x = range(time_length)
        # Collect the top keywords across all time slices
        word_set = set()
        for time_index in range(time_length):
            for prob, word in self.dtm_model.show_topic(topic_index, time_index, topn):
                word_set.add(word)
        word_stat = {word: [] for word in word_set}
        # For each time slice, look up each keyword's probability;
        # max_prob caps the plot's y-axis
        max_prob = 0
        for time_index in range(time_length):
            word_dict = {
                word: prob
                for prob, word in self.dtm_model.show_topic(topic_index, time_index, topn)
            }
            for word in word_set:
                if word in word_dict:
                    word_stat[word].append(word_dict[word])
                    if word_dict[word] > max_prob:
                        max_prob = word_dict[word]
                else:
                    word_stat[word].append(0)
        # Count the documents assigned to the current topic
        current_topic_doc_num = pd.Series(
            np.argmax(self.dtm_model.gamma_, axis=1)).value_counts().sort_index()[topic_index]
        total_doc_num = len(np.argmax(self.dtm_model.gamma_, axis=1))
        # Plot one subplot per keyword
        subplot_num = len(word_stat)
        subplot_col = 4
        subplot_row = math.ceil(float(subplot_num) / subplot_col)
        plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
        plt.suptitle(
            f'Topic ID: {topic_index} of {self.dtm_model.num_topics} topics, '
            f'documents in this topic: {current_topic_doc_num}/{total_doc_num}'
        )
        for word_index, (word, prob_list) in enumerate(word_stat.items()):
            plt.subplot(subplot_row, subplot_col, word_index + 1)
            plt.plot(x, prob_list, label=word)
            plt.xticks([*range(0, x[-1], 2), x[-1]])
            plt.ylim(0, max_prob)
            plt.legend()
        # Save before show(): show() clears the figure, so saving afterwards
        # would write an empty image
        plt.savefig(f'{self.namespace}/dtm_topic{topic_index}.png')
        plt.show()

    def print_topic_all_time_slice(self, topic_index, topn=10):
        time_index = 0
        while True:
            try:
                msg = self.dtm_model.print_topic(topic_index, time_index, topn)
                print(msg)
            except Exception:
                return
            time_index += 1
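# A usage sketch for DtmlModelSLab; the documents and slice sizes are toy
# assumptions (the class expects Chinese text, since it segments with pkuseg):
docs = ['文档一……', '文档二……', '文档三……', '文档四……']
slab = DtmlModelSLab('my_namespace', docs, time_slice=[2, 2])
df = slab.model()          # segment, pick a topic count, train the DTM
slab.draw_topics(topn=10)  # per-topic keyword trend plots under my_namespace/
slab.save()                # pickle the whole object for later reuse
slab = DtmlModelSLab.load('my_namespace')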
pickle.dump(bow_path_by_artist, open(MODEL_SAVE_NAME + "bow_paths.pk", "wb"))

class BoWCorpus(object):
    def __iter__(self, bow_path_by_artist=bow_path_by_artist):
        for artist_id, artist_path, year in bow_path_by_artist:
            # Extract features for the first song in the directory
            bow = np.load(BOW_DIR + artist_path + os.listdir(BOW_DIR + artist_path)[0])
            # Convert to sparse (index, count) encoding
            bow_sparse = [(idx, count) for (idx, count) in enumerate(bow) if count > 0]
            yield bow_sparse

corpus = BoWCorpus()
start = time()
model = DtmModel(dtm_path, corpus, time_seq,
                 num_topics=NUM_TOPICS,
                 initialize_lda=True,
                 model='fixed')
# Save model
model.save(MODEL_SAVE_NAME)
print('Model fit in', ((time() - start) / 60.) / 60., 'hours')
def DTMimplementForDatasets():
    cnxn = pyodbc.connect(
        'DRIVER={SQL Server};SERVER=DESKTOP-P61DTNE;DATABASE=Medline;UID=sa;PWD=0000'
    )
    cursor = cnxn.cursor()
    cursor1 = cnxn.cursor()  # defined up front so it always exists for close()
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    with open('F:\\publication work\\Data\\temp\\sample_dataset.txt', 'r') as f2:
        s2 = f2.read()
    all_dataset = ast.literal_eval(s2)
    for key, value in all_dataset.items():
        dataset_name = key
        # Build the WHERE clause over the dataset's topic numbers
        stri = 'where'
        for j, i in enumerate(value):
            if j == 0:
                stri = stri + ' ' + 'TOPIC_NO=' + str(i)
            else:
                stri = stri + ' or ' + 'TOPIC_NO=' + str(i)
        # Declare lists to hold the whole document set
        doc_set = list()
        list_of_topics = list()
        temp_dist_of_docs_over_topics = list()
        dist_of_docs_over_topics = list()
        pubmed_identifier_list = list()
        topic_doc_dictionary = {}
        cursor.execute(
            "SELECT [TOPIC_NO],[SERIAL_NO],[PUBMED_IDENTIFIER],[ABSTRACT] "
            "FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005]" + stri + ";")
        for row1 in cursor.fetchall():
            abstract = row1.ABSTRACT.strip()
            serial_no = row1.SERIAL_NO
            topic_no = row1.TOPIC_NO
            pubmed_identifier = row1.PUBMED_IDENTIFIER
            pubmed_identifier_list.append(pubmed_identifier)
            if topic_no in topic_doc_dictionary.keys():
                topic_doc_dictionary[topic_no].append(pubmed_identifier)
            else:
                topic_doc_dictionary[topic_no] = list()
                topic_doc_dictionary[topic_no].append(pubmed_identifier)
            number_of_topics_in_a_dataset = len(topic_doc_dictionary.keys())
            # Fall back to the title when the abstract is empty
            if not abstract:
                cursor1.execute(
                    "SELECT [TITLE] FROM [Medline].[dbo].[OnlyDeeplyRelaGeno2005] "
                    "where SERIAL_NO='" + str(serial_no) + "';")
                for row in cursor1.fetchall():
                    abstract = row.TITLE.strip()
            doc_set.append(abstract)
        number_of_topics_produced = len(doc_set) // 24
        print('Number of documents: ' + str(len(doc_set)))
        print('Number of topics produced: ' + str(number_of_topics_produced))
        print('Number of passes: ' + str(number_of_passes))
        print('Number of clusters: ' + str(number_of_topics_in_a_dataset))
        # Tokenize, remove stop words, and stem each document
        texts = []
        for i in doc_set:
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [t for t in tokens if t not in en_stop]
            stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
            texts.append(stemmed_tokens)
        # turn the tokenized documents into an id <-> term dictionary
        dictionary = corpora.Dictionary(texts)
        # convert tokenized documents into a document-term matrix
        Corpus = [dictionary.doc2bow(text) for text in texts]
        # An alternative 30/70 split over two slices was tried and commented out;
        # here all documents fall into a single time slice
        time_seq = [len(doc_set), 0]
        dtm_path = 'C:\\Program Files\\DTM\\dtm-win64.exe'
        dtmModel = DtmModel(dtm_path, Corpus, time_seq,
                            num_topics=number_of_topics_produced,
                            id2word=dictionary, initialize_lda=True)
        for i in range(0, number_of_topics_produced):
            list_of_topics.append(dtmModel.show_topic(i, 1, 10))
        for i in range(0, len(doc_set)):
            temp_dist_of_docs_over_topics.append(dtmModel.gamma_[i])
        dist_of_docs_over_topics = []
        dist_of_docs_over_topicsindexer = -1
        for i in temp_dist_of_docs_over_topics:
            dist_of_docs_over_topicsindexer += 1
            dist_of_docs_over_topics.append([])
            for j in i:
                dist_of_docs_over_topics[dist_of_docs_over_topicsindexer].append(j)
        with open('F:\\publication work\\Data\\temp\\8.pubmed_identifier.txt', 'w') as f3:
            f3.write(str(pubmed_identifier_list))
        with open('F:\\publication work\\Data\\temp\\9.topics_list.txt', 'w') as f1:
            f1.write(str(list_of_topics))
        with open('F:\\publication work\\Data\\temp\\12.distribution_of_topics_in_docs_bracket_replaced_only_prob.txt', 'w') as f5:
            f5.write(str(dist_of_docs_over_topics))
        with open('F:\\publication work\\Data\\temp\\17.topic_doc_dictionary.txt', 'w') as f4:
            f4.write(str(topic_doc_dictionary))
        del doc_set
        del list_of_topics
        del dist_of_docs_over_topics
        del pubmed_identifier_list
        del stopped_tokens
        del stemmed_tokens
        del texts
        del dictionary
        del Corpus
        del tokens
        del dtmModel
        del topic_doc_dictionary
        # replaceBrackets()
        # keepingOnlyProbability(number_of_topics_produced)
        distanceFromJSD()
        print('Measuring distance completed')
        similarityFromJSD()
        print('Measuring similarity completed')
        spectralClustering(number_of_topics_in_a_dataset)
        print('Spectral clustering completed')
        combiningClusterResult(dataset_name)
        print('Combining clustering result completed')
        accuracyMeasure(dataset_name)
        print('Measuring accuracy completed')
    with open('F:\\publication work\\Data\\temp\\18.NMI_dictionary.txt', 'w') as f6:
        f6.write(str(accuracy_dictionary))
    # files opened with `with` are closed automatically
    cursor.close()
    cursor1.close()
    cnxn.close()
# Read the time slices
t = open(main_path + 'corpus/dtm_o/time_series.txt', 'r')
time_series = [int(i) for i in t.read().split()]
t.close()

# Build the model
model_gen = DtmModel(dtm_path,
                     corpus=corpus,
                     time_slices=time_series,
                     mode=para['mode'],
                     model=para['model'],
                     num_topics=para['num_topics'],
                     id2word=corpus.dictionary,
                     prefix=None,
                     lda_sequence_min_iter=para['lda_sequence_min_iter'],
                     lda_sequence_max_iter=para['lda_sequence_max_iter'],
                     lda_max_em_iter=para['lda_max_em_iter'],
                     alpha=para['alpha'],
                     top_chain_var=para['top_chain_var'],
                     rng_seed=para['rng_seed'],
                     initialize_lda=para['initialize_lda'])
# model_gen = LdaSeqModel(corpus=corpus, time_slice=time_series, id2word=dictionary, num_topics=num_topics)
print('model training finished')
model_gen.save(main_path + 'result/dtm_o_' + sys.platform + '_topic_' +
               str(para['num_topics']) + '.model')
print('model saving finished')
# model1 = DtmModel.load('topic1.model')
# topics = model1.show_topic(topicid=0, time=0, topn=10)
# model_DTM = DtmModel(dtm_path, corpus_EI_toy, time_slices_EI_toy, num_topics=num_topics, id2word=corpus_EI_toy.dictionary, initialize_lda=True)
# model_DTM.save('dtm_ei_10')
print("Finished training the DTM model for EI\n")
print("\n---------------------\n")

print("Starting training of the DIM model for AE\n")
# model_DIM = DtmModel(dtm_path, corpus_AE_toy, time_slices_AE_toy, num_topics=num_topics, id2word=corpus_AE_toy.dictionary, initialize_lda=True, model='fixed')
# model_DIM.save('dim_ae_10')
print("Finished training the DIM model for AE\n")

print("Starting training of the DIM model for RI\n")
model_DIM = DtmModel(dtm_path, corpus_RI_toy, time_slices_RI_toy,
                     num_topics=num_topics,
                     id2word=corpus_RI_toy.dictionary,
                     initialize_lda=True, model='fixed')
model_DIM.save('dim_ri_10')
print("Finished training the DIM model for RI\n")

print("Starting training of the DIM model for EI\n")
model_DIM = DtmModel(dtm_path, corpus_EI_toy, time_slices_EI_toy,
                     num_topics=num_topics,
                     id2word=corpus_EI_toy.dictionary,
                     initialize_lda=True, model='fixed')
model_DIM.save('dim_ei_10')
import time
import pickle
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim import corpora

start_time = time.time()
dtm_path = "dtm-linux64"

# Load the list of lemmatized texts
corpus = pickle.load(open('corpus_geo.pkl', 'rb'))

# Convert to gensim format
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]

# For 10 topics
time_slice = [11468] * 9
time_slice.append(11472)
# For 20 topics
# time_slice = [5734] * 9
# time_slice.append(5738)

nb_topics = 10
model = DtmModel(dtm_path, corpus, time_slice,
                 num_topics=nb_topics,
                 id2word=dictionary,
                 initialize_lda=True)
model.save("DTMModel")
print("---- %s seconds ----" % (time.time() - start_time))
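# After training, the saved model can be reloaded and inspected per time
# slice; a minimal sketch using the file name from the script above:
model = DtmModel.load("DTMModel")
for t in range(len(model.time_slices)):
    # Top 5 (probability, word) pairs for topic 0 at time slice t
    print(t, model.show_topic(0, t, topn=5))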
def dtm_draw_topic(dtm_model: DtmModel, topic_index: int, time_num: int = None, topn=10):
    # Auto-detect the number of time slices by probing until show_topic fails
    if time_num is None:
        time_num = 0
        while True:
            try:
                dtm_model.show_topic(topic_index, time_num, topn)
                time_num += 1
            except Exception:
                break
    x = range(time_num)
    # Collect the top keywords across all time slices
    word_set = set()
    for time_index in range(time_num):
        for prob, word in dtm_model.show_topic(topic_index, time_index, topn):
            word_set.add(word)
    word_stat = {word: [] for word in word_set}
    # For each time slice, look up each keyword's probability
    max_prob = 0
    for time_index in range(time_num):
        word_dict = {
            word: prob
            for prob, word in dtm_model.show_topic(topic_index, time_index, topn)
        }
        for word in word_set:
            if word in word_dict:
                word_stat[word].append(word_dict[word])
                if word_dict[word] > max_prob:
                    max_prob = word_dict[word]
            else:
                word_stat[word].append(0)
    # Count the documents assigned to the current topic
    current_topic_doc_num = pd.Series(
        np.argmax(dtm_model.gamma_, axis=1)).value_counts().sort_index()[topic_index]
    total_doc_num = len(np.argmax(dtm_model.gamma_, axis=1))
    # Plot one subplot per keyword
    subplot_num = len(word_stat)
    subplot_col = 4
    subplot_row = math.ceil(float(subplot_num) / subplot_col)
    plt.figure(figsize=(4 * subplot_col, 4 * subplot_row))
    plt.suptitle(
        f'Topic ID: {topic_index} of {dtm_model.num_topics} topics, '
        f'documents in this topic: {current_topic_doc_num}/{total_doc_num}'
    )
    for word_index, (word, prob_list) in enumerate(word_stat.items()):
        plt.subplot(subplot_row, subplot_col, word_index + 1)
        plt.plot(x, prob_list, label=word)
        plt.xticks([*range(0, x[-1], 2), x[-1]])
        plt.ylim(0, max_prob)
        plt.legend()
    plt.show()
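# A call sketch for dtm_draw_topic(), assuming a trained model and the imports
# used above (matplotlib.pyplot as plt, numpy as np, pandas as pd, math):
dtm_draw_topic(dtm_model, topic_index=0, topn=10)  # probes the slice count
dtm_draw_topic(dtm_model, topic_index=0, time_num=len(dtm_model.time_slices))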
with open('NuclearEnergy/Data/total.txt', 'rb') as p:
    total = pickle.load(p)
total = total[total['press'].str.contains('한수원|원자력문화재단|원자력안전위원회|산자부') == True]
time_slice = list(total.groupby('pubtime')['pubtime'].count())
dic = corpora.Dictionary(total['article'])
tf = [dic.doc2bow(i) for i in total['article']]
tfidfm = models.TfidfModel(tf)
tfidf = tfidfm[tf]
# Note: TransformedCorpus.corpus is the *underlying* bag-of-words corpus,
# so the model below trains on raw term counts, not tf-idf weights
corpus = tfidf.corpus
model = DtmModel('C:/dtm-win64.exe', corpus, time_slice, num_topics=20, id2word=dic)
with open('NuclearEnergy/Result/model1.txt', 'wb') as p:
    pickle.dump(model, p)
with open('NuclearEnergy/Result/model1.txt', 'rb') as p:
    model = pickle.load(p)
for i in range(0, 36):
    doc_topic_dists = pd.DataFrame(model.dtm_vis(corpus, i)[0])
    doc_topic_dists.index.name = 'doc'
    doc_topic_dists.columns.name = 'topic'
    doc_lengths = pd.Series(model.dtm_vis(corpus, i)[2])
    doc_lengths.name = 'doc_lengths'
    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
dialogues = [x for x in dialogues.values()]
no_topics = 2

# Corpus class wrapper
class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)

corpus = DTMcorpus(dialogues)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = DtmModel(path_to_dtm, corpus, time_slices=[1, 1, 1, 1, 1, 1],
                 num_topics=no_topics,
                 id2word=corpus.dictionary,
                 initialize_lda=True)
for i in range(0, no_topics):
    print(model.show_topic(topicid=i, time=0, num_words=8))

m = np.array(model.gamma_)
sns.set(style="whitegrid")
f, ax = plt.subplots()
for j in range(0, no_topics):
    plt.plot(np.arange(1, 7, 1), m[:, j], '-o', label="Topic {}".format(j))
f.legend(*ax.get_legend_handles_labels(), loc="center right", fontsize='x-large')
# Corpus class for DTM data load
class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)

corpus = DTMcorpus(documents)
# path where the dtm binary is installed
dtm_path = "/home/ankit/NLP_Project/dtm/dtm/dtm"
model = DtmModel.load("DTMMOdel.txt")
# model.save("DTModel.txt")
# Show the top words of every topic
tp = model.show_topics(num_topics=-1, times=1, num_words=100, log=False, formatted=False)
print(tp)
print(type(tp))
for i in tp:
    for j in i:
        # under Python 2 the original decoded bytes here: j[1].decode("utf-8")
        print(type(j), j[1])
# (tail of the preprocess(text) helper used below)
    tagged_words = pos_tag(words)
    lemmatized = [wnl.lemmatize(word.lower(), pos=penn2morphy(tag))
                  for word, tag in tagged_words]
    lemmatized = list(filter(
        lambda w: not any(p in w for p in punctuation)
        and w not in stopword_list
        and w not in punctuation
        and len(w) >= 3, lemmatized))
    return lemmatized

def timestamp():
    return datetime.now().strftime('%x %X')

print('({}) DTM training data preprocessing started'.format(timestamp()))
start = datetime.now()
orig_df = pd.read_pickle(r'dfs\2020-03-22-to-2020-11-18-1000-daily')
orig_texts = [preprocess(text) for text in orig_df['full_text']]
orig_dictionary = corpora.Dictionary(orig_texts)
orig_corpus = [orig_dictionary.doc2bow(text) for text in orig_texts]
dtm_model = DtmModel.load(r'dtm\2020-03-22-to-2020-11-18-1000-daily')
print('Time to preprocess training texts:', str(datetime.now() - start))

######################################
##### DAY-BY-DAY TOPIC LABELLING #####
######################################

conn = sqlite3.connect('database/tweets.db')
# df = pd.read_pickle(r'dfs\2020-03-22-to-2020-08-19-4000-daily')
# df = pd.read_sql_query('select * from tweets where "user.screen_name" in (select screen_name from labels) and created_at between \'2020-03-22\' and \'2020-11-18\'', conn)
# comment out n_tweets_per_day lines and cumulative_tweets lines when using full dataset
# n_tweets_per_day = df['created_at'].apply(lambda x: x[:10]).value_counts().sort_index().values.tolist()
# cumulative_tweets[i] is the index of the first Tweet from i days after START_DATE
    u'competitive', u'package', u'bonus', u'corporate', u'equity',
    u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays',
    u'insurance', u'flexible', u'disability', u'insurance', u'technologies',
    u'disability', u'accommodation', u'recruiter', u'techexpousa'
]]  # (tail of the `documents` token lists)

time_seq = [3, 7]  # first 3 documents are from time slice one
                   # and the other 7 are from the second time slice.

class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)

corpus = DTMcorpus(documents)

# trying to use compiled dtm C++ code but it won't work
# (note: the wrapper expects the dtm executable itself, e.g. dtm-darwin64,
# not a .app bundle)
dtm_path = "/Users/anselmscreen/github/dtm/gensim/dtm-darwin64.app"
model = DtmModel(dtm_path, corpus, time_seq,
                 num_topics=2,
                 id2word=corpus.dictionary,
                 initialize_lda=True)
g.load_raw_corpus()
time_slices = [13, 11, 11, 10, 15]  # number of documents for each month

class DTMcorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)

corpus = DTMcorpus(g.corpus_raw)
model = DtmModel(dtm_compiled_path, corpus, time_slices,
                 num_topics=2,
                 id2word=corpus.dictionary,
                 initialize_lda=True)

# collect probabilities for chosen keyterms
words_of_interest = ['Tuerkei', 'Fluechtlinge', 'Oesterreich']
topic_choice = 0
results = {w: [] for w in words_of_interest}
for i in range(5):
    for (p, w) in model.show_topic(topic_choice, i):
        if w in results:  # keep only the chosen keyterms
            results[w].append(p)

# plot
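# The plotting step is elided above ("# plot"); a minimal sketch of what it
# could look like, assuming matplotlib (not the original author's code):
import matplotlib.pyplot as plt

for word, probs in results.items():
    # One line per keyterm, tracked across the time slices it appeared in
    plt.plot(range(len(probs)), probs, marker='o', label=word)
plt.xlabel('time slice')
plt.ylabel('probability in topic {}'.format(topic_choice))
plt.legend()
plt.show()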
# save preprocessed corpus
with open(r'dtm\full-preprocessed-pickle', 'wb') as f:
    pickle.dump(texts, f)

# get time slices (number of tweets each day)
time_slices = df['created_at'].apply(
    lambda x: x[:10]).value_counts().sort_index().values.tolist()

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

################################################################################

dtm_exe_path = r'C:\Program Files\DTM\dtm-win64.exe'
print('({}) Model started training'.format(timestamp()))
start = datetime.now()
dtm_model = DtmModel(dtm_exe_path, corpus=corpus[:], time_slices=time_slices,
                     num_topics=20, id2word=dictionary)
elapsed = datetime.now() - start
print('({}) Model finished training'.format(timestamp()))
print('Elapsed time:', elapsed)
print('Saving model...')
dtm_model.save(dtm_out_path)
print('({}) Model saved'.format(timestamp()))
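# The saved model can be reloaded later for the day-by-day labelling stage; a
# minimal sketch (dtm_out_path as above; argmax picks each tweet's dominant topic):
import numpy as np

loaded = DtmModel.load(dtm_out_path)
dominant_topic = np.argmax(loaded.gamma_, axis=1)  # one topic label per tweet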
class Pipeline(object):
    def __init__(self, key="Y02E_10", m_type="LDA", num_topics=7, min_slice_size=200):
        self.num_topics = num_topics
        self.key = key
        self.data_file = '../Data/{}.csv'.format(key)
        self.dict_file = "{}.dict".format(key)
        self.corpus_file = "{}.mm".format(key)
        self.coords_file = "{}_coords.csv".format(key)
        self.topics_file = "{}_Topics.txt".format(key)
        self.approved_ids_file = "{}_approved_ids".format(key)
        self.m_type = m_type
        self.min_slice_size = min_slice_size
        dtm_home = os.environ.get('DTM_HOME', "dtm-master")
        self.dtm_path = os.path.join(dtm_home, 'bin', 'dtm-darwin64') if dtm_home else None

    def make_corpus(self):
        # stop list from nltk
        stoplist = set(nltk.corpus.stopwords.words("english"))
        if self.m_type in ["DTM", "DIM"]:
            # time slice shapes
            self.time_seq, self.approved_ids = get_time_seq(self.data_file, self.min_slice_size)
            filehandler = open(_MODELS_DIR + self.approved_ids_file + ".obj", "wb")
            pickle.dump(self.approved_ids, filehandler)
            filehandler.close()
            self.approved_ids = list(itertools.chain(*self.approved_ids))
            # self.corpus = DTMcorpus(self.corpus)
            # warning, this reads the whole corpus into memory!
            self.corpus = MyCorpus(self.data_file, self.approved_ids)
        else:
            # instantiate a memory-friendly corpus object
            self.corpus = MyCorpus(self.data_file)
        print("Making dictionary")
        t0 = time.time()
        # create dictionary, remove stopwords and words occurring only once, apply stemming
        self.corpus.make_dictionary(stoplist=stoplist, minfreq=25)  # minfreq=25 to match Blei's paper
        print(time.time() - t0)
        print("Saving dictionary")
        t0 = time.time()
        self.corpus.dictionary.save(_MODELS_DIR + self.dict_file)
        print(time.time() - t0)
        print("Saving corpus")
        t0 = time.time()
        gensim.corpora.MmCorpus.serialize(_MODELS_DIR + self.corpus_file, self.corpus)
        print(time.time() - t0)

    def run_model(self):
        '''
        Run the LDA model on a given corpus and dictionary
        '''
        if not hasattr(self, 'corpus'):
            # if there's no corpus present, read in the saved corpus
            self.corpus = gensim.corpora.MmCorpus(
                os.path.join(_MODELS_DIR, self.corpus_file))
        if not hasattr(self.corpus, 'dictionary'):
            # if there's no dictionary attached, read in the saved dictionary
            self.corpus.dictionary = gensim.corpora.Dictionary.load(
                os.path.join(_MODELS_DIR, self.dict_file))
        if self.m_type == "LDA":
            print("Running LDA Model")
            t0 = time.time()
            self.lda = gensim.models.LdaModel(self.corpus,
                                              id2word=self.corpus.dictionary,
                                              num_topics=self.num_topics)
            print(time.time() - t0)
        if self.m_type == "DTM":
            print("Running DTM Model")
            t0 = time.time()
            self.lda = DtmModel(self.dtm_path, self.corpus, self.time_seq,
                                num_topics=self.num_topics,
                                id2word=self.corpus.dictionary,
                                initialize_lda=True)
            print(time.time() - t0)
        if self.m_type == "DIM":
            print("Running DIM Model")
            t0 = time.time()
            self.lda = DtmModel(self.dtm_path, self.corpus, self.time_seq,
                                num_topics=self.num_topics,
                                model="fixed",
                                id2word=self.corpus.dictionary,
                                initialize_lda=True)
            print(time.time() - t0)

    def save_model(self, model_name="LDA_model"):
        '''
        Save the current LDA model to an object file in the saved models folder.
        '''
        filehandler = open(_MODELS_DIR + model_name + ".obj", "wb")
        pickle.dump(self.lda, filehandler)
        filehandler.close()

    def topics(self, model_name=None, save=True, viz=True):
        '''
        Print and optionally save the topics of a given LDA model.
        Uses the LDA model present in the object by default unless an
        alternate saved version is specified.
        '''
        if model_name is not None:
            # read a previously saved model back from the models folder
            filehandler = open(_MODELS_DIR + model_name + ".obj", 'rb')
            self.lda = pickle.load(filehandler)
            filehandler.close()
        if not hasattr(self, 'lda'):
            print("no LDA model detected")
        else:
            print(self.lda.print_topics(self.num_topics, num_words=25))
            self.topics = self.lda.show_topics(num_topics=self.num_topics, num_words=20)
            # save the topics and their constituent words
            if save:
                f = open(_MODELS_DIR + self.topics_file, 'w')
                print(self.topics, end="", file=f)
                f.close()
            if viz:
                if not hasattr(self, "topics"):
                    with open(os.path.join(_MODELS_DIR, self.topics_file), 'r') as f:
                        self.topics = f.readlines()[0]
                # parsing topic string
                lines = str(self.topics).strip("[()").strip("]\n").split("(")
                lines = [i.strip("),").split(", u'")[1] for i in lines]
                # plotting word clouds of each topic
                curr_topic = 0
                # classes = np.array(target_labels)[np.array(list(manual_best)) - 1]
                for j, line in enumerate(lines):
                    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
                    words = [x.split("*")[1].strip("'), ") for x in line.split(" + ")]
                    freqs = []
                    for word, score in zip(words, scores):
                        freqs.append((word, score))
                    wc = WordCloud(max_words=100)
                    elements = wc.fit_words(freqs)
                    default_colors = wc.to_array()
                    plt.figure()
                    plt.title("Topic {}".format(j))  # classes[j]
                    plt.imshow(default_colors)
                    plt.axis("off")
                    plt.show()
                    curr_topic += 1
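# A usage sketch for the Pipeline class; the key and model type are examples,
# and DTM_HOME must point at a checkout of Blei's dtm with a built binary:
pipe = Pipeline(key="Y02E_10", m_type="DTM", num_topics=7)
pipe.make_corpus()  # build and persist the dictionary and MmCorpus
pipe.run_model()    # train DTM over the saved corpus
pipe.save_model(model_name="DTM_model")
pipe.topics(save=True, viz=True)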