def vl(dataground, subK=3, topNum=15):
    """Retrain an LDA model on the target subset and return its top words."""
    w_mi = dataground.subset_w_mi()
    new_lda_model = LDA(K=subK, dictionary=dataground.dictionary)
    new_lda_model.fit(w_mi)
    sub_n_kt = new_lda_model.n_kt
    topWordTokens = _top_words(sub_n_kt, topNum)
    return topWordTokens
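# _top_words is assumed to be a module-private helper that picks, for each
# topic, the topNum words with the highest counts in the topic-word count
# matrix n_kt. A minimal sketch of one plausible implementation follows; it is
# a hypothetical reconstruction, not the project's actual helper, which may
# also map word ids back to word strings through the dictionary.
import numpy as np

def _top_words(n_kt, topNum):
    # For each topic (row of the K x T count matrix), take the ids of the
    # topNum highest-count words, in descending order of count.
    return [np.argsort(row)[::-1][:topNum] for row in np.asarray(n_kt)]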
def vl(dataground, subK=5):
    """Retrain an LDA model on the training split of the target subset."""
    w_mi_train = dataground.subset_w_mi_train()
    new_lda_model = LDA(K=subK, dictionary=dataground.dictionary)
    new_lda_model.fit(w_mi_train)
    sub_n_kt = new_lda_model.n_kt
    return ReprModel(sub_n_kt)
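# Note: unlike the first vl variant, this one fits the sub-model only on the
# training split produced by DataGround (defined below) and returns a
# ReprModel wrapper around the topic-word counts instead of extracting top
# words. ReprModel is assumed to be defined elsewhere in the project.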
import random

import numpy as np

from mylda import LDA, Dictionary, default_dict  # assumed import paths


class DataGround:
    """DataGround stores the corpus-level information."""

    def __init__(self, K=100, n_early_stop=30, use_default_dict=False,
                 dict_file=None, min_tf=5, min_df=2, max_dict_len=8000,
                 stem=False, held_out_ratio=0.2):
        self.K = K
        self.n_early_stop = n_early_stop
        self.held_out_ratio = held_out_ratio
        if use_default_dict:
            self.dictionary = Dictionary(dict_file=default_dict)
        elif dict_file:
            self.dictionary = Dictionary(dict_file=dict_file)
        else:
            self.dictionary = None
        self._min_tf = min_tf
        self._min_df = min_df
        self._max_dict_len = max_dict_len
        self._stem = stem

    def load_data(self, corpus):
        if self.dictionary is not None:
            self.lda_model = LDA(dictionary=self.dictionary, K=self.K,
                                 n_early_stop=self.n_early_stop)
            self.lda_model.fit(corpus)
        else:
            self.lda_model = LDA(K=self.K, n_early_stop=self.n_early_stop)
            self.lda_model.fit(corpus, min_df=self._min_df, min_tf=self._min_tf,
                               max_dict_len=self._max_dict_len, stem=self._stem)
            self.dictionary = self.lda_model.dictionary
        # Default target subset: the first 5% of the documents.
        self.set_target_subset(range(int(self.lda_model.M / 20)), random_state=0)

    def _split(self, length, held_out_ratio=0.2, random_state=0):
        """Shuffle token positions of one document and split them into
        train/held-out index lists."""
        train_length = int(length * (1 - held_out_ratio))
        indexes = list(range(length))
        random.seed(random_state)
        random.shuffle(indexes)
        train_index = indexes[:train_length]
        test_index = indexes[train_length:]
        return train_index, test_index

    def set_target_subset(self, selected_ids, random_state=0):
        self.target_subset_ids = selected_ids
        subset_w_mi = self.subset_w_mi()
        indexes = [self._split(len(doc), self.held_out_ratio, random_state)
                   for doc in subset_w_mi]
        self.train_index = [x[0] for x in indexes]
        self.test_index = [x[1] for x in indexes]

    def subset_w_mi(self, with_dictionary=False):
        subset_w_mi = [self.lda_model.w_mi[x] for x in self.target_subset_ids]
        if with_dictionary:
            return (subset_w_mi, self.dictionary)
        else:
            return subset_w_mi

    def subset_w_mi_train(self):
        subset_w_mi = [np.array(doc) for doc in self.subset_w_mi()]
        subset_w_mi_train = [doc[index] for (doc, index)
                             in zip(subset_w_mi, self.train_index)]
        return subset_w_mi_train

    def subset_w_mi_test(self):
        subset_w_mi = [np.array(doc) for doc in self.subset_w_mi()]
        subset_w_mi_test = [doc[index] for (doc, index)
                            in zip(subset_w_mi, self.test_index)]
        return subset_w_mi_test

    def subset_z_mi(self):
        lda_model = self.lda_model
        z_mi = []
        for doc_id in self.target_subset_ids:
            start = lda_model.I_m[doc_id]
            end = start + lda_model.N_m[doc_id]
            z_mi.append(lda_model.Z[start:end])
        return z_mi

    def subset_z_mi_train(self):
        subset_z_mi = self.subset_z_mi()
        subset_z_mi_train = [z[index] for (z, index)
                             in zip(subset_z_mi, self.train_index)]
        return subset_z_mi_train

    def subset_z_mi_test(self):
        subset_z_mi = self.subset_z_mi()
        subset_z_mi_test = [z[index] for (z, index)
                            in zip(subset_z_mi, self.test_index)]
        return subset_z_mi_test

    def subset_n_mk(self):
        n_mk = self.lda_model.n_mk
        return n_mk[self.target_subset_ids]

    def subset_n_mk_train(self):
        """Rebuild doc-topic counts from the training split's topic assignments."""
        z_mi_train = self.subset_z_mi_train()
        M = len(z_mi_train)
        n_mk_train = np.zeros((M, self.K))
        for m in range(M):
            for z_i in z_mi_train[m]:
                n_mk_train[m, z_i] += 1
        return n_mk_train

    def subset_n_mk_test(self):
        """Rebuild doc-topic counts from the held-out split's topic assignments."""
        z_mi_test = self.subset_z_mi_test()
        M = len(z_mi_test)
        n_mk_test = np.zeros((M, self.K))
        for m in range(M):
            for z_i in z_mi_test[m]:
                n_mk_test[m, z_i] += 1
        return n_mk_test
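# Putting the pieces together: a minimal usage sketch, with corpus loading
# borrowed from the demo script below (demo_dataset_dir is assumed to be
# importable from mylda, as shown there).
import os

from mylda import demo_dataset_dir

filenames = [os.path.join(demo_dataset_dir, f) for f in os.listdir(demo_dataset_dir)]
documents = [open(f, encoding="utf8").read() for f in filenames]

dataground = DataGround(K=20, n_early_stop=10, held_out_ratio=0.2)
dataground.load_data(documents)          # fit corpus LDA; default subset = first 5% of docs
dataground.set_target_subset(range(10))  # or pick the target documents explicitly

w_mi_train = dataground.subset_w_mi_train()  # per-document token ids, training split
n_mk_train = dataground.subset_n_mk_train()  # doc-topic counts rebuilt from that split

top_words = vl(dataground, subK=3, topNum=15)  # first vl variant: retrain, get top words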
import os

from mylda import LDA, demo_dataset_dir

filenames = [os.path.join(demo_dataset_dir, x)
             for x in os.listdir(demo_dataset_dir)]
documents = [open(x, encoding="utf8").read() for x in filenames]

# Generate a corpus-based dictionary
lda_model = LDA(K=5, n_early_stop=20)
lda_model.fit(documents, max_dict_len=5000, min_tf=5, stem=False)
lda_model.show_topic(topNum=15)

# Or use the standalone default dictionary
lda_model = LDA(K=5, n_early_stop=20, use_default_dict=True)
# You can also use your own dictionary
# lda_model = LDA(K=5, dict_file="yourdictionary.txt")
lda_model.fit(documents)
lda_model.show_topic(topNum=15)