def main():
    """Train a dynamic topic model and report topic quality per time step.

    Fits ``DtmModel`` on all training time slices except the last one, then
    for each of the ``T`` time steps:

    * converts that step's slice of ``show_topics`` output to a topic-word
      array and tensor,
    * computes per-topic coherences and the value returned by ``ppl`` on
      ``gensim_data.test``,
    * appends the results to the module-level accumulators ``TWmatrix``,
      ``TWlist``, ``COHs``, ``PPLs`` and ``avg_COHs``,
    * hands the topics to ``display_topics`` with ``config.topic_file``.

    Finally both series are plotted under ``config.output_path``.
    """
    trace('---train topics---', config.log_file)
    training_slices = train_set.time_slices[:-1]
    model = DtmModel(
        dtm_path,
        corpus=gensim_data.corpus,
        id2word=gensim_data.dictionary,
        time_slices=training_slices,
        num_topics=config.z_dim,
        lda_sequence_min_iter=50,
        lda_sequence_max_iter=config.epochs,
    )
    trace('---model trained---', config.log_file)

    sample_topic = model.dtm_coherence(time=0, num_words=10)
    first_topic_words = ' '.join(sample_topic[0])
    # NOTE(review): config.log_file is passed as a second value to print(),
    # so it is printed next to the message rather than used as a target.
    print('sample topic is like: {}'.format(first_topic_words), config.log_file)

    tw_nps = model.show_topics(
        num_topics=config.z_dim,
        times=-1,
        num_words=train_set.vocab_size(),
        formatted=False,
    )
    for t in range(T):
        # show_topics() output is consumed in consecutive runs of z_dim
        # topics, one run per time step.
        first = t * config.z_dim
        last = (t + 1) * config.z_dim
        step_topics = tw_nps[first:last]
        tw_np = get_topic_np(step_topics, config.z_dim,
                             gensim_data.dictionary.token2id)
        tw_tensor = torch.from_numpy(tw_np)
        tw_list_t = get_tw_list(tw_tensor, gensim_data.dictionary)

        # coherence and perplexity for this time step
        cohs_t = get_cohs(tw_list_t)
        p = ppl(gensim_data.test, tw_tensor)

        TWmatrix.append(tw_np)
        TWlist.append(tw_list_t)
        COHs.append(cohs_t)
        PPLs.append(p)
        avg_COHs.append((sum(cohs_t) / len(cohs_t)))

        seg = '---------- topics in time {}/{} ----------'.format(t + 1, T)
        display_topics(tw_list=tw_list_t, cohs=cohs_t, head='topics',
                       seg=seg, file=config.topic_file)

    trace('topic result(coherence) written.', file=config.log_file)

    p_file = os.path.join(config.output_path, 'ppl.jpg')
    draw_ppl(PPLs, title='perplexities over time', file=p_file)

    a_file = os.path.join(config.output_path, 'avg_coh.jpg')
    draw_ppl(avg_COHs, title='avg coherence over time', file=a_file)
def dtm_run(data, times, dtm_path, **kw):
    """Run DTM model, reusing a previously saved one when it exists.

    Args:
        data: Input passed to ``lda_get_dictionary`` to build the
            dictionary and bag-of-words corpus.
        times: Passed to ``DtmModel`` as ``time_slices``.
        dtm_path: Passed to ``DtmModel`` as ``dtm_path``.
        **kw: ``name`` (default ``'_temp_'``) is the save-name suffix and
            ``save`` (default ``True``) is forwarded to
            ``lda_get_dictionary``.  Other keys are ignored.

    Returns:
        The result of ``pickle_load(key)`` if a file for this key already
        exists under ``PKLDIR``; otherwise a newly built ``DtmModel`` that
        has been saved to that location.
    """
    run_name = kw.pop("name", '_temp_')
    save_flag = kw.pop('save', True)
    dictionary, bow_corpus = lda_get_dictionary(data, save=save_flag,
                                                name=run_name)

    cache_key = f"lda_dtm_{ncomps}_" + run_name
    cache_path = os.path.join(PKLDIR, cache_key)

    # Early exit: a model saved under this key is reused as-is.
    if os.path.exists(cache_path):
        return pickle_load(cache_key)

    fitted = DtmModel(dtm_path=dtm_path, corpus=bow_corpus,
                      id2word=dictionary, time_slices=times, **dtm_defaults)
    fitted.save(cache_path)
    return fitted
def DTM(path, time_slices, num_topics, corpus):
    """Returns the results of the dynamic topic model and the document-topic matrix.

    Arguments:
        path: The path to the binary dtm.
        time_slices: A sequence with one entry per time slice, passed to
            ``DtmModel`` as ``time_slices``.
        num_topics: The number of topics.
        corpus: A collection of texts in bow format. It must also expose a
            ``dictionary`` attribute, which is used as ``id2word``.

    Returns:
        dtm_results: A list of lists of lists containing the results
            over the time slices, indexed as
            ``dtm_results[topic][time_slice][rank]``.
        doc_topic_matrix: The proportion of the topics for each document.

    Note:
        The number of words per topic is read from the module-level name
        ``top_words``. The fitted model is saved as ``'DTM_model'`` in the
        current working directory.
    """
    # Set the DTM model.  The corpus has to be handed to the constructor:
    # the wrapper only trains when it receives one, and without training
    # the show_topic()/dtm_vis() calls below have nothing to read.
    model = DtmModel(dtm_path=path,
                     corpus=corpus,
                     time_slices=time_slices,
                     num_topics=num_topics,
                     id2word=corpus.dictionary,
                     top_chain_var=0.01,
                     alpha=50/num_topics,
                     rng_seed=101,
                     initialize_lda=True)  # Use LDA in DTM analysis

    # Save the DTM model for later use
    model.save('DTM_model')

    # Create a list of lists of lists of the top words for each topic.
    # show_topic() is called once per (topic, time slice); element [1] of
    # each returned entry is the one that is kept.
    dtm_results = []
    for topic in range(num_topics):
        dtm_results.append([
            [entry[1] for entry in
             model.show_topic(topicid=topic, time=i, topn=top_words)]
            for i in range(len(time_slices))
        ])

    # Generate the document-topic matrix
    doc_topic_matrix = model.dtm_vis(corpus, time=0)[0]

    return dtm_results, doc_topic_matrix
def train_model(self):
    """Build the DTM model from the instance's settings into ``self.model``.

    Prints the current time immediately before and after the model is
    constructed.
    """
    started_at = datetime.datetime.now()
    print("Start time of DTM training: {}".format(started_at))

    self.model = DtmModelClass(
        self.path_to_dtm_binary,
        corpus=self.doc_term_matrix,
        id2word=self.dictionary,
        time_slices=self.time_slices,
        num_topics=self.num_topics,
        rng_seed=self.seed,
    )

    finished_at = datetime.datetime.now()
    print("End time of DTM training: {}".format(finished_at))
def getCoherenceScores(nTopics):
    """Fit and save a DTM with ``nTopics`` topics; return u_mass coherence per time slice.

    The model is built from the module-level ``path_to_dtm_binary``,
    ``corpus``, ``dictionary`` and ``timeSlice`` and saved to
    ``./Models/model{nTopics}Topics``.

    Returns:
        A list with one coherence score for every entry of ``timeSlice``.
    """
    model = DtmModel(path_to_dtm_binary,
                     corpus=corpus,
                     num_topics=nTopics,
                     id2word=dictionary,
                     time_slices=timeSlice)
    model.save(f'./Models/model{nTopics}Topics')

    slice_count = len(timeSlice)

    # Phase 1: the topics (as word lists) of every time slice.
    topics_per_slice = []
    for slice_no in range(0, slice_count):
        topics_per_slice.append(model.dtm_coherence(time=slice_no))

    # Phase 2: one coherence model per time slice.
    scorers = []
    for slice_topics in topics_per_slice:
        scorers.append(CoherenceModel(topics=slice_topics,
                                      corpus=corpus,
                                      dictionary=dictionary,
                                      coherence='u_mass'))

    # Phase 3: evaluate them in time order.
    return [scorer.get_coherence() for scorer in scorers]
def load_model(self):
    """Load the model stored at ``self.output_file_path`` into ``self.model``."""
    source_path = self.output_file_path
    self.model = DtmModelClass.load(source_path)
    print(f"Model loaded from {source_path}")
class DtmModel:
    """Workflow helper around a dynamic topic model.

    Holds the settings needed to build the underlying model (referenced
    here as ``DtmModelClass``), and offers helpers that turn the fitted
    model into pandas tables.

    Attributes set by ``__init__`` mirror its parameters.  In addition:
        time_slice_labels: Unique values of the time reference column
            (set by ``prepare_data``).
        time_slices: Number of rows per time reference value
            (set by ``prepare_data``).
        model: The underlying model (set by ``train_model`` /
            ``load_model``).
        topic_df_list: Per-topic data frames
            (set by ``generate_topic_detail_tables``).
    """

    def __init__(self, date_col, time_ref_col, path_to_dtm_binary,
                 dictionary, doc_term_matrix, seed, num_topics,
                 output_file_path, files):
        """Store the configuration; nothing is trained or loaded here.

        ``files`` is used by the table helpers as the ordered list of time
        slice names (column names of the per-topic tables).
        """
        self.date_col = date_col
        self.time_ref_col = time_ref_col
        self.path_to_dtm_binary = path_to_dtm_binary
        self.dictionary = dictionary
        self.doc_term_matrix = doc_term_matrix
        self.seed = seed
        self.num_topics = num_topics
        self.output_file_path = output_file_path
        self.files = files
        self.time_slice_labels = None
        self.time_slices = None
        self.model = None
        self.topic_df_list = None

    def prepare_data(self, df):
        """Add a ``year`` column to ``df`` and derive the time slices.

        ``df`` is modified in place and also returned.
        """
        # Add year column to data frame
        df["year"] = df[self.date_col].apply(lambda stamp: stamp.year)

        # Get time slice labels
        # NOTE(review): unique() keeps order of appearance while
        # groupby().size() is ordered by key; the two only line up when df
        # is already ordered by the time reference column - confirm.
        self.time_slice_labels = df[self.time_ref_col].unique()
        self.time_slices = df.groupby(self.time_ref_col).size()
        print("Time_slices\n", self.time_slices)
        return df

    def train_model(self):
        """Build the underlying model and store it in ``self.model``."""
        # train DTM model
        print("Start time of DTM training: {}".format(
            datetime.datetime.now()))
        self.model = DtmModelClass(self.path_to_dtm_binary,
                                   corpus=self.doc_term_matrix,
                                   id2word=self.dictionary,
                                   time_slices=self.time_slices,
                                   num_topics=self.num_topics,
                                   rng_seed=self.seed)
        print("End time of DTM training: {}".format(
            datetime.datetime.now()))

    def save_model(self):
        """Save ``self.model`` to ``self.output_file_path``."""
        # Save to file
        self.model.save(self.output_file_path)
        print(f"Dynamic topic model saved to {self.output_file_path}")

    def load_model(self):
        """Load the model at ``self.output_file_path`` into ``self.model``."""
        # Load model
        self.model = DtmModelClass.load(self.output_file_path)
        print(f"Model loaded from {self.output_file_path}")

    def top_term_table(self, topic, slices, topn=10):
        """Returns a dataframe with the top n terms in the topic for each of the given time slices."""
        data = {
            "Topic_ID": [topic] * topn,
            "Word_Rank": list(range(topn)),
        }
        for time_slice in slices:
            # Position of the label among the known labels = time index.
            time = np.where(self.time_slice_labels == time_slice)[0][0]
            data[time_slice] = [
                term for p, term in self.model.show_topic(
                    topic, time=time, topn=topn)
            ]
        return pd.DataFrame(data)

    def get_doc_topics(self, doc_term_matrix, df_agg):
        """Append per-document topic weights and the dominant topic to ``df_agg``.

        Returns ``df_agg`` concatenated column-wise with one ``topic_<i>``
        column per topic and a ``topic_no`` column holding the index of the
        largest weight.  The number of topic columns follows the width of
        the document-topic matrix instead of being fixed at 20.
        """
        # Get topic assignment for each document
        doc_topic, _, _, _, _ = self.model.dtm_vis(doc_term_matrix, 0)

        # Create topic label vector
        doc_topic_no = [np.argmax(array) for array in doc_topic]

        # Create document topic matrix
        weights = np.asarray(doc_topic)
        n_topics = weights.shape[1] if weights.ndim == 2 else self.num_topics
        topic_cols = [f"topic_{i}" for i in range(n_topics)]
        df_doc_topic = pd.DataFrame(doc_topic, columns=topic_cols)
        df_doc_topic["topic_no"] = doc_topic_no
        return pd.concat([df_agg, df_doc_topic], axis=1)

    @staticmethod
    def _safe_div(x, y):
        """Return ``x / y``, or 0 when ``y`` is 0."""
        if y == 0:
            return 0
        return x / y

    def generate_topic_tables(self):
        """Build one long-format table of word loads over topics and time.

        Each row is ``word, topic, time, time_no, load, dif_e5, dif_fraq``
        where ``dif_e5`` is the change of the load against the same word
        and topic in the previous time slice (times 100000) and
        ``dif_fraq`` that change relative to the previous load.  Rows of
        the first time slice carry 0 for both.

        :return: Data frame with the columns listed above
        """
        time_slices = self.files

        # Gather data for each word in each topic in each time slice
        all_topics = []
        for time_id, time in enumerate(time_slices):
            # Get all topic-word distributions for time slice i
            _, topic_term, _, _, vocab = self.model.dtm_vis(
                self.doc_term_matrix, time_id)
            n_topics = len(topic_term)

            for topic_id in range(n_topics):
                # Topic-word distribution for one topic at time slice i
                topic_at_time_slice = topic_term[topic_id]
                n_words = len(topic_at_time_slice)

                for word_id in range(n_words):
                    data_load = topic_at_time_slice[word_id]

                    # Calculate difference of word load in previous time slice
                    if time == time_slices[0]:
                        data_dif_big = 0
                        data_dif_fraq = 0
                    else:
                        # One time slice contributes n_words * n_topics
                        # rows, so stepping that far back from the row
                        # about to be appended lands on the same
                        # word/topic in the previous slice.
                        prev_row = all_topics[len(all_topics) -
                                              (n_words * n_topics)]
                        data_load_prev = prev_row[4]
                        data_dif = data_load - data_load_prev
                        data_dif_fraq = self._safe_div(data_dif,
                                                       data_load_prev)
                        data_dif_big = data_dif * 100000

                    all_topics.append([
                        vocab[word_id], topic_id, time, time_id, data_load,
                        data_dif_big, data_dif_fraq
                    ])
            print(f"Finished gathering data from time slice {time}\n")

        return pd.DataFrame(all_topics,
                            columns=[
                                "word", "topic", "time", "time_no", "load",
                                "dif_e5", "dif_fraq"
                            ])

    def generate_topic_detail_tables(self):
        """
        Generate a list with a data frame for each topic, where rows denote a word and columns a time slice.

        Each data frame is also written to ``output/topics/topic_<id>.csv``
        and the list is stored in ``self.topic_df_list``.  Column names are
        taken from ``self.files``.
        """
        time_slices = self.files
        topic_df_list = []

        # Query the model once per time slice instead of once per
        # (topic, time slice) pair; the result for a given time slice does
        # not depend on the topic being processed.
        _, first_topic_term, _, _, vocab = self.model.dtm_vis(
            self.doc_term_matrix, 0)
        n_topics = len(first_topic_term)
        topic_term_by_time = []
        for time_id in range(len(time_slices)):
            if time_id == 0:
                topic_term_by_time.append(first_topic_term)
                continue
            _, topic_term, _, _, vocab = self.model.dtm_vis(
                self.doc_term_matrix, time_id)
            topic_term_by_time.append(topic_term)

        for topic_id in range(n_topics):
            # Create data frame with dummy column having the length of the vocab
            df_topic = pd.DataFrame([0] * len(vocab))

            # For each time slice
            for time_id in range(len(time_slices)):
                # Topic-word distribution for one topic at time slice i
                df_topic[time_slices[time_id]] = \
                    topic_term_by_time[time_id][topic_id]

            df_topic.index = vocab
            df_topic = df_topic.drop(columns=[0])
            df_topic["topic"] = topic_id
            print(f"Finished gathering data for topic {topic_id}")

            file_path = f"output/topics/topic_{topic_id}.csv"
            df_topic.to_csv(file_path)
            print(f"Topic detail data frame written to {file_path}")

            topic_df_list.append(df_topic)

        self.topic_df_list = topic_df_list

    # def write_topic_df_to_excel(self, file_path):
    #
    #     # Create a Pandas Excel writer using XlsxWriter as the engine.
    #     writer = pd.ExcelWriter(file_path, engine='xlsxwriter')
    #
    #     # Write each topic dataframe to a different worksheet.
    #     for i in range(len(self.topic_df_list)):
    #         self.topic_df_list[i].to_excel(writer, sheet_name=f'topic_{i}')
    #
    #         print(f"Topic {i} written to excel sheet")
    #
    #     # Close the Pandas Excel writer and output the Excel file.
    #     writer.save()
    #
    #     print(f"Topic dataframes written to excel file under {file_path}")

    def calculate_word_dif(self, folder_path):
        """Add first and second differences over time to every topic table.

        Reads ``<folder_path>topic_<id>.csv`` for each topic, adds
        ``dif_<slice>`` (difference to the previous time slice) and
        ``dif_dif_<slice>`` (difference of those differences) columns and
        writes the result to ``<folder_path>topic_dif_<id>.csv``.

        ``folder_path`` is used as a plain string prefix, so it must end
        with a path separator.
        """
        files = list(self.files)
        # Output layout: loads, then first differences, then second
        # differences, then the topic id - derived from self.files rather
        # than from a fixed list of slice names.
        columns = (files
                   + [f"dif_{name}" for name in files[1:]]
                   + [f"dif_dif_{name}" for name in files[2:]]
                   + ["topic"])

        for topic_id in range(self.num_topics):
            df = pd.read_csv(folder_path + f"topic_{topic_id}.csv",
                             index_col=0)

            # Calculate difference of word probabilities to last time slice
            for i in range(len(files) - 1):
                df[f"dif_{files[i + 1]}"] = df[files[i + 1]] - df[files[i]]

                # Calculate difference of word probabilities differences to last time slice
                if i > 0:
                    df[f"dif_dif_{files[i + 1]}"] = (
                        df[f"dif_{files[i + 1]}"] - df[f"dif_{files[i]}"])

            # Change order of columns
            df = df[columns]

            output_file_path = folder_path + f"topic_dif_{topic_id}.csv"
            df.to_csv(output_file_path)
            print(
                f"Finished calculating differences and created file {output_file_path}"
            )

    def construct_final_topic_data(self, folder_path):
        """Merge all ``topic_dif_<id>.csv`` files into ``all_topics.csv``.

        Produces one row per (word, topic, time slice) with the load and
        its difference to the previous time slice (0 for the first one).
        Words for which a lookup fails are reported and skipped.
        """
        df_ref = pd.read_csv(folder_path + "topic_dif_0.csv", index_col=0)
        vocab = df_ref.index
        topic_list = []

        for topic_id in range(self.num_topics):
            df = pd.read_csv(folder_path + f"topic_dif_{topic_id}.csv",
                             index_col=0)
            # NOTE(review): the result of dropna() is discarded, so this
            # call leaves df unchanged.
            df.dropna()
            time_slices = self.files

            for word in vocab:
                # Account for the nan word
                try:
                    for i in range(len(time_slices)):
                        time = time_slices[i]
                        load = df.loc[word, time]
                        if i == 0:
                            dif = 0
                        else:
                            dif = df.loc[word, "dif_" + time]
                        topic_list.append(
                            [word, topic_id, time, i, load, dif])
                except Exception:
                    # Best effort as before, but no longer swallows
                    # KeyboardInterrupt / SystemExit.
                    print(f"Error at topic {topic_id} and word {word}")
            print(f"Finished reformating topic {topic_id}")

        df_output = pd.DataFrame(
            topic_list,
            columns=["word", "topic", "time", "time_no", "load", "dif"])
        df_output.to_csv(folder_path + "all_topics.csv")

    def generate_project_topic_table(self, df_raw):
        """Reshape wide per-project topic loads into a long table.

        ``df_raw`` must contain ``rcn``, ``startDate``, ``fp``, ``fp_no``,
        ``title`` and one ``topic_<i>`` column for each of the
        ``self.num_topics`` topics.  The result has one row per
        (project, topic) with the columns ``rcn``, ``topic`` (the topic
        number as a string), ``load`` and the project information.
        """
        columns = [f"topic_{i}" for i in range(self.num_topics)]
        multicol1 = pd.MultiIndex.from_tuples(
            [("topic", name) for name in columns])

        # Set rcn as index
        df_raw = df_raw.set_index("rcn")

        # Only keep topic columns
        df_flat = df_raw[columns]

        # Create multi index data frame
        df_flat_multi = pd.DataFrame(df_flat.values,
                                     index=df_flat.index,
                                     columns=multicol1)

        # Stack data frame
        df_stacked = df_flat_multi.stack()

        # Set rcn as single index
        df_stacked = df_stacked.reset_index().set_index("rcn")

        # Rename columns
        df_stacked = df_stacked.rename(columns={
            "level_1": "topic",
            "topic": "load"
        })

        # Remove unnecessary prefix from topic column
        df_stacked["topic"] = df_stacked["topic"].apply(
            lambda text: re.sub("topic_", "", text))

        # Join project information and make rcn normal column again
        df_project_info = df_raw[["startDate", "fp", "fp_no", "title"]]
        df_project_topics = df_stacked.join(df_project_info,
                                            how="left").reset_index()

        return df_project_topics
corpus = DTMcorpus(docs_lem)

# creating decade variable that serves as a time frame for DTM
data['decade'] = (data['year'] - 1850) / 10
data['decade'] = data['decade'].apply(np.floor)

# Number of documents per decade, ordered by decade.  value_counts() orders
# its result by frequency, so reversing it only gives chronological order
# when the counts happen to grow with every decade; sort on the decade
# value instead.
# NOTE(review): this assumes the documents in docs_lem are themselves in
# chronological order - confirm against where docs_lem is built.
time_slices = data['decade'].value_counts().sort_index().tolist()

# dtm path
dtm_path = "/home/sami/dtm/dtm/main"

# estimating DTM with 2 topics per decade
model = DtmModel(dtm_path, corpus, time_slices, num_topics=2,
                 id2word=corpus.dictionary)

# displaying top 10 words in topic number 1 during the second decade
model.show_topic(topicid=1, time=1, topn=10)

doc_number = 0
num_topics = 2

# topic distribution of document `doc_number`
for i in range(0, num_topics):
    print("Distribution of topic %d %f" % (i, model.gamma_[doc_number, i]))

results = pd.DataFrame()
frequency[token] += 1
# NOTE(review): the statement above appears to belong to a counting loop
# that starts before this excerpt.

# Keep only tokens whose recorded frequency is above 0.
texts = [[token for token in text if frequency[token] > 0] for text in texts]
dictionary = corpora.Dictionary(texts)
dictionary.save('dict.pickle')  # store the dictionary, for future reference

#Corpus Created
corpus = [dictionary.doc2bow(text) for text in texts]
print("Corpus Created")

num_topics = 30
path_to_dtm_binary = "/home/khan/DTM/dtm/dtm/main"
# No id2word is passed to the model; the word field of show_topic() entries
# is converted with int() and looked up in id2token further down.
model = DtmModel(path_to_dtm_binary, corpus=corpus, time_slices=ts, mode='fit', model='dtm', num_topics=num_topics)
training_time = time.time() - start_time
print("Model fitted")

id2token = id2TokenFn()
topics = []
# For every time slice and topic: take the top 50 entries and rewrite each
# as (token, weight rounded to 4 decimals), followed by a tuple naming the
# time slice and topic id.
for t in range(len(ts)):
    for j in range(num_topics):
        topic = model.show_topic(topicid = j, time = t, topn = 50)
        new_topic = []
        info = ("Time,", t, "TopicID,", j)
        for item in topic:
            new_topic.append((id2token[int(item[1])], round(item[0], 4)))
        new_topic.append(info)
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    """Fit a dynamic topic model on a column of tokenized documents.

    Args:
        table: Input data frame.
        input_col: Name of the column holding the tokenized documents.
        topic_name: Name of the output column that receives the 1-based
            index of each document's strongest topic.  Must not already
            exist in ``table``.
        num_topic: Number of topics.
        num_topic_word: Number of words listed per topic.
        max_iter: Passed to the model as ``lda_sequence_max_iter``.
        time_slice: Number of documents per time slice; must sum to the
            number of documents.  ``None`` means one slice with all
            documents.
        coherence: ``'u_mass'`` selects u_mass coherence; any other value
            selects ``'c_v'``.
        vis_time: Index of the time slice that is visualized.
        seed: Optional value passed to the model as ``rng_seed``.

    Returns:
        A dict with ``out_table`` (copy of ``table`` plus the topic column
        and ``topic_distribution``), ``topic_table`` (top words per topic
        and time slice) and ``model`` (model dict including the report).

    Raises:
        BrighticsFunctionException: If ``topic_name`` is already a column
            of ``table``.
        Whatever ``raise_runtime_error`` raises for an inconsistent
        ``time_slice`` or an out-of-range ``vis_time``.
    """
    import subprocess

    # Select the bundled DTM binary for this platform.
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        dtm_filename = 'dtm-linux64' if is_os_64bit else 'dtm-linux32'
    elif running_os == 'Windows':
        dtm_filename = 'dtm-win64.exe' if is_os_64bit else 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        # Same "chmod +x" as before, but with an argument list so that a
        # path containing spaces or shell metacharacters is passed intact.
        # check=False keeps it best-effort: a failing chmod is ignored.
        subprocess.run(["chmod", "+x", dtm_path], check=False)

    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    # Reject a clashing output column before the model is trained rather
    # than after; the exception is the same, it is just raised earlier.
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])

    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    # Top words per time slice (rows) and topic (columns), rendered as
    # "<second field>: <first field>" of each show_topic() entry.
    n_slices = len(time_slice)
    topic_time = [[dtm_model.show_topic(topicid=topic_id, time=t, topn=num_topic_word)
                   for topic_id in range(num_topic)]
                  for t in range(n_slices)]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in slice_topics]
                  for slice_topics in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    # Per-document topic proportions; the strongest topic is reported
    # 1-based to match the topic_<n> column names above.
    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(t) for t in range(n_slices)]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus,
                                  coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence()
                   for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
# Keep only tokens whose recorded frequency is above 1.
texts = [[token for token in text if frequency[token] > 1] for text in texts]
dictionary = corpora.Dictionary(texts)
dictionary.save(
    'deerwester.dict')  # store the dictionary, for future reference

#Corpus Created
corpus = [dictionary.doc2bow(text) for text in texts]
print("Corpus Created")

path_to_dtm_binary = "/home/khan/DTM/dtm/dtm/main"
# time_slices=[1] * len(corpus) puts every document in a slice of its own.
model = DtmModel(path_to_dtm_binary,
                 corpus,
                 time_slices=[1] * len(corpus),
                 mode='fit',
                 model='dtm',
                 num_topics=20)
print("Model fitted")

topics = model.show_topic(topicid=1, time=1, topn=10)
print(topics)
print("Topics finding")
training_time = time.time() - start_time

# Converting topic into an excel file
print("Putting in DataFrame started")
df = pd.DataFrame(topics)
writer = pd.ExcelWriter("DTM_topics.xlsx")
# sheet_name is passed by keyword: passing it positionally is deprecated in
# pandas 2.x and no longer accepted once the argument is keyword-only.
# NOTE(review): the writer is not closed within this excerpt - confirm that
# the code following it does so, otherwise the file is never written.
df.to_excel(writer, sheet_name='Sheet1')
time_slices=times, **dtm_defaults) mod.save(os.path.join(PKLDIR, key)) return mod
# NOTE(review): the line above is the tail of a definition that starts
# before this excerpt; it is left exactly as found.

# DTM Analysis
nips = NipsData()
dat = nips.load_data(sample_frac=1)
sections = ['title', 'abstract']
ncomps = dtm_defaults['num_topics']
dat = nips.combined_sections(sections, data=dat)
yrs, cnts = papers_per_year(nips.raw)
sname = get_save_name(sections)
docs = lda_get_corpus(dat, name=sname, save=True)
d, bow = lda_get_dictionary(dat, name=sname, save=True)
# Training is disabled here; a previously saved model is loaded instead.
## mod = dtm_run(dat, cnts, dtm_path, name=sname, save=True)
mod = DtmModel.load(os.path.join(PKLDIR, 'lda_dtm_15_title_abstract'))

def print_dtm_top_words_for_year(model, years, n_topics, n_words):
    """Print top n_words from top n_topics for year in years.

    Years are mapped to time indices by their position in 1987..2017;
    entries of ``years`` outside that range are ignored.
    """
    print(f"Top {n_words} from top {n_topics} for year(s) {years}:")
    # (time index, year) pairs: index 0 is 1987, index 30 is 2017.
    yrs = enumerate(range(1987, 2018))
    inds = [(i,yr) for i, yr in yrs if yr in years]
    for i, yr in inds:
        print(f"Year {yr}:")
        # dtm_coherence() yields one word list per topic; only the first
        # n_topics of them are printed.
        for topic, words in enumerate(model.dtm_coherence(i, n_words)[:n_topics]):
            print(f" Topic #{topic}: " + ', '.join(words))

def dtm_coherence(model, corpus, d, year):
    """Get coherence for DTM model at year."""
import gensim.models
from gensim.models.wrappers import DtmModel
import gensim.corpora
import gensim.matutils
import numpy as np
import pickle


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes instead of being left to the garbage collector.
    """
    # NOTE(review): pickle.load can execute arbitrary code; these inputs
    # are presumably produced locally by an earlier step - confirm they
    # never come from an untrusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Inputs prepared by an earlier step and handed over as pickles.
corpus = _load_pickle("collections/tmp/test_corpus")
dictionary = _load_pickle("collections/tmp/test_dictionary")
num_topics = _load_pickle("collections/tmp/num_topics")
top_chain_var = _load_pickle("collections/tmp/top_chain_variance")
time_slices = _load_pickle("collections/tmp/time_slices")
alpha = _load_pickle("collections/tmp/alpha")

path_to_dtm_binary = "global/dtm-linux64"

model = DtmModel(path_to_dtm_binary,
                 corpus=corpus,
                 id2word=dictionary,
                 time_slices=time_slices,
                 num_topics=num_topics,
                 mode="fit",
                 top_chain_var=top_chain_var,
                 initialize_lda=True,
                 alpha=alpha)
def loadDTM(amounttopics):
    """Load and return the model saved for ``amounttopics`` topics.

    The model is read from ``./Models/model{amounttopics}Topics``.
    """
    model_path = f'./Models/model{amounttopics}Topics'
    return DtmModel.load(model_path)