def lda_model(self, text_data, save_path, topic_number=20):
    """Train an LDA model on ``text_data``, save it, and print labelled topics.

    Args:
        text_data: Iterable of documents; each item is handed to
            ``add_doc`` unchanged and echoed to stdout with its position.
        save_path: Destination passed to ``save``.
        topic_number: Number of topics (``k``) of the model.

    Returns:
        The trained ``tp.LDAModel``.
    """
    model = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=topic_number)
    for position, document in enumerate(text_data):
        print(str(position) + " : " + str(document))
        model.add_doc(document)

    model.burn_in = 100
    # Zero-iteration call made before the vocabulary statistics are printed.
    model.train(0)
    print('Num docs:', len(model.docs), ', Vocab size:', model.num_vocabs,
          ', Num words:', model.num_words)
    print('Removed top words:', model.removed_top_words)

    print('Training...', file=sys.stderr, flush=True)
    progress_line = 'Iteration: {}\tLog-likelihood: {}'
    for iteration in range(0, 1500, 10):
        model.train(10)
        print(progress_line.format(iteration, model.ll_per_word))

    print('Saving...', file=sys.stderr, flush=True)
    model.save(save_path, True)

    # Candidate phrases for automatic topic labelling.
    candidates = tp.label.PMIExtractor(
        min_cf=10, min_df=5, max_len=5, max_cand=10000).extract(model)
    # Ranks the candidates per topic.
    labeler = tp.label.FoRelevance(model, candidates, min_df=5, smoothing=1e-2, mu=0.25)

    for topic_id in range(model.k):
        print("== Topic #{} ==".format(topic_id))
        ranked = labeler.get_topic_labels(topic_id, top_n=5)
        print("Labels:", ', '.join(label for label, score in ranked))
        for word, prob in model.get_topic_words(topic_id, top_n=10):
            print(word, prob, sep='\t')
        print()
    return model
def infer_new_doc():
    '''
    Train LDA on 'enwiki-stemmed-1000.txt' and print the inferred topic
    distribution of every line of that same file.

    Prior to version 0.10.0, we had to make instances of `Document` using
    `make_doc` first and call `infer`.
    '''
    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
    # Context manager so the handle is closed once the corpus has consumed it.
    with open('enwiki-stemmed-1000.txt', encoding='utf-8') as train_file:
        train_corpus.process(train_file)

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs),
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    mdl.summary()

    # One unseen-document instance per line, tokenised by whitespace.
    with open('enwiki-stemmed-1000.txt', encoding='utf-8') as doc_file:
        docs = [mdl.make_doc(line.lower().split()) for line in doc_file]

    # The second element of the result (log-likelihoods) is not used here.
    topic_distributions, _ = mdl.infer(docs)

    # print topic distributions of each document
    for topic_dist in topic_distributions:
        print(topic_dist)
def word_prior_example(input_file):
    """Train LDA with word priors that pin three words to fixed topics.

    Args:
        input_file: Path of a UTF-8 text file fed to ``Corpus.process``.
    """
    num_topics = 20

    def _prior_for(topic_id):
        # Weight 1.0 on the chosen topic and 0.1 on every other topic.
        return [1.0 if k == topic_id else 0.1 for k in range(num_topics)]

    corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    with open(input_file, encoding='utf-8') as source:
        corpus.process(source)

    # make LDA model and train
    mdl = tp.LDAModel(k=num_topics, min_cf=10, min_df=5, corpus=corpus)
    # The word 'church' is assigned to Topic 0 with a weight of 1.0 and to the
    # remaining topics with a weight of 0.1, so a topic related to 'church'
    # can be fixed at Topic 0.
    mdl.set_word_prior('church', _prior_for(0))
    # Topic 1 for a topic related to 'softwar'
    mdl.set_word_prior('softwar', _prior_for(1))
    # Topic 2 for a topic related to 'citi'
    mdl.set_word_prior('citi', _prior_for(2))

    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    for k in range(mdl.k):
        print("== Topic #{} ==".format(k))
        for word, prob in mdl.get_topic_words(k, top_n=10):
            print(word, prob, sep='\t')
        print()
def raw_corpus_and_labeling_example(input_file):
    """Build a stemmed corpus from raw text, train LDA and print labelled topics.

    Args:
        input_file: Path of a UTF-8 text file fed to ``Corpus.process``.
    """
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords

    stemmer = PorterStemmer()
    stops = set(stopwords.words('english'))
    # Tokens of length <= 2 and English stop words are filtered out.
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
        stopwords=lambda x: len(x) <= 2 or x in stops)
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    with open(input_file, encoding='utf-8') as source:
        corpus.process(source)

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs),
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    mdl.summary()

    # extract candidates for auto topic labeling
    extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
    cands = extractor.extract(mdl)

    labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
    for k in range(mdl.k):
        print("== Topic #{} ==".format(k))
        print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
        for word, prob in mdl.get_topic_words(k, top_n=10):
            print(word, prob, sep='\t')
        print()
def infer_new_corpus():
    '''
    Train LDA on 'enwiki-stemmed-1000.txt' and print the topic distribution
    inferred for every document of 'corpus_to_be_inferred.txt'.

    Since 0.10.0 version, inference using an instance of `Corpus` was supported.
    '''
    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
    # Context managers so both handles are closed after being consumed.
    with open('enwiki-stemmed-1000.txt', encoding='utf-8') as train_file:
        train_corpus.process(train_file)

    test_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
    with open('corpus_to_be_inferred.txt', encoding='utf-8') as test_file:
        test_corpus.process(test_file)

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs),
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    mdl.summary()

    # The second element of the result (log-likelihoods) is not used here.
    inferred_corpus, _ = mdl.infer(test_corpus)

    # print topic distributions of each document
    for doc in inferred_corpus:
        # doc.raw holds the raw string and list(doc) the words, if needed.
        print(doc.get_topic_dist())
def tp_one_trial(dataset, model_type, topic_size, sample_size, min_cf=3, rm_top=5,
                 max_iter=1000, min_iter=None, checkpoint=None, stop_increase=1,
                 metric='ll'):
    """Train one tomotopy model with checkpoint-based early stopping.

    Args:
        dataset: Indexable collection of ``(doc, label)`` pairs supporting ``len``.
        model_type: One of ``'lda'``, ``'ctm'``, ``'slda'``, ``'hdp'``.
        topic_size: Number of topics (initial number for ``'hdp'``).
        sample_size: Maximum number of dataset items to add as documents.
        min_cf: Passed to the model constructor.
        rm_top: Passed to the model constructor.
        max_iter: Upper bound on monitored training iterations.
        min_iter: Iterations trained before monitoring starts; defaults to
            ``max_iter // 5``.
        checkpoint: Number of iterations averaged per check; defaults to
            ``max_iter // 5`` (at least 1).
        stop_increase: Number of non-improving checkpoints that stops training.
        metric: ``'ll'`` (log-likelihood per word) or ``'pp'`` (perplexity).

    Returns:
        Tuple ``(model, final_metric)``; ``final_metric`` is the model's
        perplexity for ``'pp'`` and its ``ll_per_word`` otherwise.

    Raises:
        AssertionError: If ``model_type`` or ``metric`` is not supported.
    """
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'

    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'ctm':
        model = tp.CTModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars="b", tw=tp.TermWeight.ONE,
                             min_cf=min_cf, rm_top=rm_top)
    else:  # 'hdp'
        model = tp.HDPModel(initial_k=topic_size, tw=tp.TermWeight.ONE,
                            min_cf=min_cf, rm_top=rm_top)

    sample_size = min(sample_size, len(dataset))

    model.burn_in = max_iter // 5  # set burn-in: 20 percent of max iterations

    for idx in range(sample_size):
        doc, label = dataset[idx]
        if model_type == 'slda':
            # sLDA receives the label as a one-element response vector.
            model.add_doc(doc, [float(label)])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        # At least 1, so that `i % checkpoint` cannot divide by zero.
        checkpoint = max(1, max_iter // 5)

    model.train(min_iter)

    # `np.inf` rather than the alias `np.infty`, which NumPy 2.0 removed.
    pre_metric = -np.inf
    stop_increase_cnt = 0
    cur_metric = 0.
    # Tracked separately so the final report is right even if the loop never runs.
    monitored_iters = 0
    for i in range(1, max_iter + 1):
        model.train(1)
        monitored_iters = i
        # Metric is always larger, better
        if metric == 'll':
            cur_metric += model.ll_per_word
        if metric == 'pp':
            cur_metric += -model.perplexity  # smaller perplexity is better.

        if i % checkpoint == 0:
            cur_metric /= checkpoint
            print(f'Current loss: {cur_metric:.5f}')
            if cur_metric >= pre_metric:
                pre_metric = cur_metric
            else:
                stop_increase_cnt += 1
            cur_metric = 0.

        if stop_increase_cnt >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word
    print(f'Trial iterations: {monitored_iters + min_iter}.')
    return model, final_metric
def train_model(self, dataset, hyperparameters):
    """Fit a tomotopy LDA model on ``dataset`` and return its topic outputs.

    Args:
        dataset: Object exposing ``get_partitioned_corpus`` and ``get_corpus``.
        hyperparameters: Not read by this method; the model is configured from
            ``self.hyperparameters``. NOTE(review): confirm this is intended.

    Returns:
        Dict with ``'topics'`` and ``'topic-document-matrix'``, plus
        ``'test-topic-document-matrix'`` when a test partition is used.
    """
    if self.use_partitions:
        X_train, X_test = dataset.get_partitioned_corpus(use_validation=False)
    else:
        X_train, X_test = dataset.get_corpus(), None

    params = self.hyperparameters
    model = tp.LDAModel(
        k=params['num_topics'],
        alpha=params['alpha'],
        eta=params['eta'],
        min_cf=params['min_cf'],
        min_df=params['min_df'],
        rm_top=params['rm_top'],
    )
    for document in X_train:
        model.add_doc(document)
    model.train(params['max_iter'])

    # Topic-word distribution matrix: computed as in the original flow, but
    # not part of the returned dict.
    topic_word_matrix = np.stack(
        [model.get_topic_word_dist(topic) for topic in range(model.k)])
    # Topic distribution of every training document.
    topic_document_matrix = np.stack(
        [doc.get_topic_dist() for doc in model.docs])

    # First element of each get_topic_words() entry, per topic.
    top_words = [
        [entry[0] for entry in model.get_topic_words(topic)]
        for topic in range(model.k)
    ]

    result = {
        'topics': np.asarray(top_words),
        'topic-document-matrix': topic_document_matrix,
    }

    # Inference on the test set
    if X_test is not None:
        unseen = [model.make_doc(document) for document in X_test]
        test_dist, _ = model.infer(unseen)
        result['test-topic-document-matrix'] = np.asarray(test_dist)

    return result
def test_coherence():
    """Train LDA on ``sample.txt`` and print four coherence scores."""
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, k=20, min_df=5, rm_top=5)
    # Context manager so the sample file is closed after loading.
    with open(curpath + '/sample.txt', encoding='utf-8') as sample:
        for line in sample:
            mdl.add_doc(line.strip().split())
    mdl.train(1000)

    for coh in ('u_mass', 'c_uci', 'c_npmi', 'c_v'):
        coherence = tp.coherence.Coherence(corpus=mdl, coherence=coh)
        print(coherence.get_score())
def bench_tomotopy(k, ps, w=0):
    """Time 200 LDA training iterations on the module-level ``filename``.

    Args:
        k: Number of topics.
        ps: Value passed as ``parallel`` to ``train``.
        w: Value passed as ``workers`` to ``train``.
    """
    model = tp.LDAModel(k=k)
    # Context manager so the input file is closed before timing starts.
    with open(filename, encoding='utf-8') as source:
        for text in source:
            # '.' tokens are dropped from every document.
            model.add_doc([word for word in text.strip().split() if word != '.'])

    start_time = time.time()
    model.train(200, workers=w, parallel=ps)
    print('K=%d\tW=%d\tTime: %.5g' % (k, w, time.time() - start_time), end='\t')
    print('LL: %g' % model.ll_per_word, flush=True)
def create_lda(tw=tp.TermWeight.IDF, min_cf=0, min_df=5, rm_top=0, k=2,
               alpha=0.1, eta=1, seed=101, corpus=None):
    """
    Create a tomotopy ``LDAModel`` from the given settings.

    Every argument is forwarded unchanged to ``tp.LDAModel``; see
    https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight for the
    term-weighting schemes.

    Parameters:
        tw: Union[int, TermWeight]
            Term weighting scheme. Defaults to inverse document frequency, so
            cards that appear in almost all decks weigh less than cards that
            appear in very few decks.
        min_cf: int
            Forwarded as ``min_cf`` (minimum number of times a card must appear
            across all decks). Because most cards can be included at most once
            per deck, this will almost always equal ``min_df``.
        min_df: int
            Forwarded as ``min_df``; default 5. NOTE(review): presumably the
            minimum number of decks a card must appear in; confirm against the
            tomotopy documentation.
        rm_top: int
            Number of the most popular cards to remove. Default 0.
        k: int
            Number of themes/archetypes to sort decks into (1 ~ 32,767).
            Default 2.
        alpha: float
            Hyperparameter of the Dirichlet distribution for document-topic.
            Default 0.1, following advice from Eduardo Coronado
            (@ecoronado92 on Twitter).
        eta: float
            Hyperparameter of the Dirichlet distribution for topic-word.
            Default 1, chosen by experimentation.
        seed: int
            Random seed. Default 101; per the original author, runs have not
            been reliably reproducible even with a fixed seed.
        corpus: tomotopy Corpus
            Documents to add to the model. If None, documents must be added
            with ``LDAModel.add_doc()`` before training.

    :return: tomotopy LDA model object
    """
    return tp.LDAModel(
        tw=tw,
        min_cf=min_cf,
        min_df=min_df,
        rm_top=rm_top,
        k=k,
        alpha=alpha,
        eta=eta,
        seed=seed,
        corpus=corpus,
    )
def test_uid():
    """Check that document uids carry over from a ``Corpus`` into the model."""
    uids = ("001", "abc", "0x1f")

    cps = tp.utils.Corpus()
    for uid in uids:
        cps.add_doc("test text".split(), uid=uid)

    mdl = tp.LDAModel(k=2, corpus=cps)
    model_docs = mdl.docs
    assert len(cps) == len(model_docs)
    assert cps[0].uid == mdl.docs[0].uid

    # Model documents are looked up by uid string.
    for uid in uids:
        print(mdl.docs[uid])
def train_lda_model_from_data(filtered_data, topics=30):
    """Train an LDA model on the flattened token groups in ``filtered_data``.

    Args:
        filtered_data: Mapping whose values hold, at index 1, an iterable of
            token iterables; these are chained into one document per key.
        topics: Number of topics.

    Returns:
        The trained ``tp.LDAModel``.
    """
    model = tp.LDAModel(k=topics)
    for key in tqdm.tqdm(filtered_data.keys()):
        token_groups = filtered_data[key][1]
        model.add_doc(chain.from_iterable(token_groups))

    print("Beginning LDA training...")
    # 100 rounds of 10 iterations; progress is reported every 10th round.
    for round_no in range(100):
        model.train(10)
        if round_no % 10 == 0:
            print('Iteration: {}\tLog-likelihood: {}'.format(
                round_no * 10, model.ll_per_word))
    print("Finished Training")
    return model
def train_tomotopy_model(corpus: dict, num_topics: int = 300, verbose: bool = True):
    """Train an LDA model on the non-empty documents of ``corpus``.

    Args:
        corpus: Mapping whose values are documents; values with ``len == 0``
            are skipped.
        num_topics: Number of topics.
        verbose: When true, print the model summary after training.

    Returns:
        The trained ``tp.LDAModel``.
    """
    mdl = tp.LDAModel(k=num_topics)
    for doc in corpus.values():
        if len(doc) > 0:
            mdl.add_doc(doc)

    for i in range(0, 150, 10):
        mdl.train(10)
        print("Iteration: {}".format(i))

    if verbose:
        # summary() writes the report itself; wrapping it in print() only
        # added a stray "None" line.
        mdl.summary()
    return mdl
def tm_estimation(corpus_, k_):
    """Estimate an LDA model with ``k_`` topics on ``corpus_`` and return it.

    Progress is printed every 20 iterations up to 1000, followed by the model
    summary.
    """
    # NOTE(review): the enum value is constructed and discarded; it does not
    # appear to configure anything.
    tp.ParallelScheme(0)
    lda = tp.LDAModel(min_df=5, rm_top=20, k=k_, seed=000, corpus=corpus_)
    lda.train(0)

    # log
    doc_count = len(lda.docs)
    vocab_count = len(lda.used_vocabs)
    print("Num docs:{}, Num Vocabs:{}, Total Words:{}".format(
        doc_count, vocab_count, lda.num_words))
    print("Removed Top words: ", *lda.removed_top_words)

    def _report(iteration):
        # One progress line with the current per-word log-likelihood.
        print("Iteration: {:04}, LL per word: {:.4}".format(
            iteration, lda.ll_per_word))

    # model training
    completed = 0
    while completed < 1000:
        _report(completed)
        lda.train(20)
        completed += 20
    _report(1000)

    # summary
    lda.summary()
    return lda
def lda_example(input_file, save_path):
    """Train a 20-topic LDA model on ``input_file``, save it and print topics.

    Args:
        input_file: Path of a UTF-8 text file; each line is split on
            whitespace into one document.
        save_path: Destination passed to ``mdl.save``.
    """
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=20)
    # Context manager so the input file is closed after loading.
    with open(input_file, encoding='utf-8') as source:
        for line in source:
            mdl.add_doc(line.strip().split())

    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)

    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    for k in range(mdl.k):
        print('Topic #{}'.format(k))
        for word, prob in mdl.get_topic_words(k):
            print('\t', word, prob, sep='\t')
def test_docs():
    """Exercise document accessors on a raw ``Corpus`` and on a trained model."""
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords

    stemmer = PorterStemmer()
    # Separate name so the imported `stopwords` module is not shadowed.
    stop_set = set(stopwords.words('english'))
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
        stopwords=lambda x: len(x) <= 2 or x in stop_set)
    # Context manager so the sample file is closed after loading.
    with open(curpath + '/sample_raw.txt', encoding='utf-8') as raw_file:
        for i, line in enumerate(raw_file):
            corpus.add_doc(raw=line, uid='doc{:05}'.format(i), etc=len(line))

    def _test_doc(doc, etc=False):
        # Print the accessors of one document; `etc` is read only on request.
        print("doc", doc)
        print("len(doc)", len(doc))
        print("doc.__getitem__", doc[0], doc[1], doc[2], doc[3])
        if etc:
            print("doc.etc", doc.etc)
        print("doc.words", doc.words[:10])
        print("doc.span", doc.span[:10])
        print("doc.raw", doc.raw[:10])

    print("len(corpus)", len(corpus))
    print("len(corpus[:10])", len(corpus[:10]))
    _test_doc(corpus[0], etc=True)

    mdl = tp.LDAModel(k=10, corpus=corpus)
    mdl.train(100)
    print("len(mdl.docs)", len(mdl.docs))
    print("len(mdl.docs[:10])", len(mdl.docs[:10]))

    ch = tp.coherence.Coherence(corpus=mdl, coherence='u_mass')
    for k in range(mdl.k):
        print('Coherence of #{} : {}'.format(k, ch.get_score(topic_id=k)))

    _test_doc(mdl.docs[0])
# Load the spaCy pipeline used to lemmatise the reviews.
nlp = en_core_web_lg.load()

# Lemmatise every review, skipping stop words, punctuation and number-like
# tokens; one token list per review.
docs_tokens, tmp_tokens = [], []
docs_tokens.extend(
    [
        token.lemma_
        for token in nlp(item)
        if not (token.is_stop or token.is_punct or token.like_num)
    ]
    for item in df.loc[:, "Review"].to_list()
)

# %% Tomotopy LDA estimation
# Collect the token lists into a tomotopy corpus.
corpus = tp.utils.Corpus()
for item in docs_tokens:
    corpus.add_doc(words=item)

# Model with 10 topics over that corpus.
lda = tp.LDAModel(k=10, corpus=corpus)

# Train in rounds of 10 iterations, reporting after each round.
for i in range(0, 100, 10):
    lda.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, lda.ll_per_word))

# Show the ten top words of each topic.
for k in range(lda.k):
    heading = "Top 10 words of topic #{}".format(k)
    print(heading)
    print(lda.get_topic_words(k, top_n=10))

# Persist the estimates.
lda.save('hotel_review_lda_estimates.bin')
try:
    # load if preprocessed corpus exists
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    # No cached corpus: build it from the 20 newsgroups training data.
    porter_stemmer = nltk.PorterStemmer().stem
    # Stop words are stemmed so they can be compared with stemmed tokens.
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    # Keep only tokens made of two or more lowercase ASCII letters.
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x)
    )
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

mdl.summary()
def main():
    """
    Run the synthetic-data experiment comparing LDA and PLDA representations.

    In order: parse arguments and load the configuration, generate data with
    ``data_generating_process``, build training and full tomotopy corpora,
    train ``tp.LDAModel`` and ``tp.PLDAModel`` one ``train(1)`` call at a time
    while recording likelihood and parameter traces, optionally plot them,
    then fit logistic-regression classifiers on the inferred topic
    distributions and score them with ``score_model``.

    Scores are dumped to ``<output_dir>/<run_id>.scores.json`` when both
    ``config.output_dir`` and ``config.run_id`` are set.
    """
    ######################
    ### Setup
    ######################
    ## Parse Command Line
    args = parse_arguments()
    ## Load Configuration
    config = Config(filepath=args.config)
    ## Output
    if config.output_dir is not None and not os.path.exists(config.output_dir):
        _ = os.makedirs(config.output_dir)
    ## Cache Config
    # NOTE(review): the copy is a shell command built from unquoted paths and
    # its exit status is ignored.
    if config.output_dir is not None and config.run_id is not None:
        _ = os.system(
            f"cp {args.config} {config.output_dir}/{config.run_id}.config.json"
        )
    ## Set Random State
    if config.random_state is not None:
        np.random.seed(config.random_state)
    ######################
    ### Data Generating Process
    ######################
    ## Generate Data
    X_latent, X, y, D, theta, phi = data_generating_process(
        config.N,
        config.sigma_0,
        config.p_domain,
        config.gamma,
        config.V,
        config.theta,
        config.coef,
        beta=config.beta,
        random_state=config.random_state)
    ## Data Distribution Plot
    if args.make_plots:
        fig, ax = fit_latent_regression(X_latent, y, D, config.coef)
        plt.show()
    ######################
    ### Fit Topic Models
    ######################
    ## Split Data into Training and Test
    # First 80% of the indices are training, the remainder test.
    train_ind = list(range(int(config.N * .8)))
    test_ind = list(range(int(config.N * .8), config.N))
    ## Generate Corpus
    train_corpus = tp.utils.Corpus()
    full_corpus = tp.utils.Corpus()
    ## Add Training Data
    for n in range(X.shape[0]):
        doc_n = doc_to_str(X[n])
        full_corpus.add_doc(doc_n, label=[str(D[n])])
        # Works because train_ind is the contiguous prefix 0..train_ind[-1].
        if n <= train_ind[-1]:
            train_corpus.add_doc(doc_n, label=[str(D[n])])
    assert len(train_corpus) == len(train_ind)
    ## Initialize Models (3 Topics Total)
    lda = tp.LDAModel(k=3,
                      corpus=train_corpus,
                      seed=config.random_state if config.random_state
                      is not None else np.random.randint(1e6))
    plda = tp.PLDAModel(latent_topics=1,
                        topics_per_label=1,
                        corpus=train_corpus,
                        seed=config.random_state if config.random_state
                        is not None else np.random.randint(1e6))
    ## Initialize Sampler
    lda.train(1)
    plda.train(1)
    ## Update Parameters based on Corpus
    V_nn = lda.num_vocabs
    ## MCMC Storage
    # Arrays start as NaN; rows beyond a model's own iteration count stay NaN.
    n_iter = max(config.n_iter_lda, config.n_iter_plda)
    likelihood = np.zeros((n_iter, 2)) * np.nan
    theta_lda = np.zeros((n_iter, config.N, 3)) * np.nan
    theta_plda = np.zeros((n_iter, config.N, 3)) * np.nan
    phi_lda = np.zeros((n_iter, 3, V_nn)) * np.nan
    phi_plda = np.zeros((n_iter, 3, V_nn)) * np.nan
    ## Word Count
    train_word_n = sum([len(d.words) for d in full_corpus[train_ind]])
    # NOTE(review): test_word_n and test_ll are computed but never used below.
    test_word_n = sum([len(d.words) for d in full_corpus[test_ind]])
    ## Train LDA Model
    for epoch in tqdm(range(config.n_iter_lda), desc="LDA Training"):
        lda.train(1)
        train_inf, train_ll = lda.infer(full_corpus[train_ind],
                                        iter=config.n_sample)
        test_inf, test_ll = lda.infer(full_corpus[test_ind],
                                      iter=config.n_sample)
        # Column 0 holds the LDA training log-likelihood per word.
        likelihood[epoch, 0] = train_ll.sum() / train_word_n
        # Training documents first, then test documents.
        theta_lda[epoch] = np.vstack(
            flatten([[d.get_topic_dist() for d in inf]
                     for inf in [train_inf, test_inf]]))
        phi_lda[epoch] = np.vstack(
            [lda.get_topic_word_dist(t) for t in range(lda.k)])
    ## Train PLDA Model
    for epoch in tqdm(range(config.n_iter_plda), desc="PLDA Training"):
        plda.train(1)
        train_inf, train_ll = plda.infer(full_corpus[train_ind],
                                         iter=config.n_sample)
        test_inf, test_ll = plda.infer(full_corpus[test_ind],
                                       iter=config.n_sample)
        # Column 1 holds the PLDA training log-likelihood per word.
        likelihood[epoch, 1] = train_ll.sum() / train_word_n
        theta_plda[epoch] = np.vstack(
            flatten([[d.get_topic_dist() for d in inf]
                     for inf in [train_inf, test_inf]]))
        phi_plda[epoch] = np.vstack(
            [plda.get_topic_word_dist(t) for t in range(plda.k)])
    ## Plot Likelihood
    if args.make_plots:
        plt.figure(figsize=(10, 5.8))
        plt.plot(likelihood[:, 0], label="LDA")
        plt.plot(likelihood[:, 1], label="PLDA")
        plt.xlabel("Training Epoch", fontweight="bold")
        plt.ylabel("Log Likelihood Per Word", fontweight="bold")
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.show()
    ## Plot Traces for Phi
    if args.make_plots:
        fig, axes = plt.subplots(phi_lda.shape[1], 2, figsize=(10, 5.8))
        # One column of axes per model, one row per topic.
        for m, (mphi, mdl) in enumerate(
                zip([phi_lda, phi_plda], ["LDA", "PLDA"])):
            ax = axes[:, m]
            for k in range(mphi.shape[1]):
                ax[k].plot(mphi[:, k, :])
                ax[k].set_ylabel("Parameter Value", fontweight="bold")
                ax[k].spines["top"].set_visible(False)
                ax[k].spines["right"].set_visible(False)
            # After the loop, k is the last topic row: label the bottom axis.
            ax[k].set_xlabel("Training Epoch", fontweight="bold")
            ax[0].set_title(f"{mdl} $\\phi$ Trace", fontweight="bold")
        fig.tight_layout()
        plt.show()
    ## Plot Sample Traces for Theta
    if args.make_plots:
        fig, ax = plt.subplots(5, 2, sharex=False, figsize=(10, 5.8))
        # Five documents drawn without replacement, shown in index order.
        for d, doc in enumerate(
                sorted(np.random.choice(config.N, 5, replace=False))):
            ax[d, 0].plot(theta_lda[:, doc, :])
            ax[d, 1].plot(theta_plda[:, doc, :])
            for i in range(2):
                ax[d, i].spines["right"].set_visible(False)
                ax[d, i].spines["top"].set_visible(False)
                ax[d, i].set_title(f"Document {doc}",
                                   loc="left",
                                   fontstyle="italic")
                ax[d, i].set_ylabel("$\\theta$")
        for m, mdl in enumerate(["LDA", "PLDA"]):
            ax[-1, m].set_xlabel(f"{mdl} Training Epoch", fontweight="bold")
        fig.tight_layout()
        plt.show()
    ## Get Final Representations
    X_latent_lda = np.vstack([
        d.get_topic_dist() for d in lda.infer(
            full_corpus, iter=config.n_sample, together=False)[0]
    ])
    X_latent_plda = np.vstack([
        d.get_topic_dist() for d in plda.infer(
            full_corpus, iter=config.n_sample, together=False)[0]
    ])
    ## Isolate Latent Variables and Normalize
    # Keeps the last `plda.latent_topics` columns; no normalisation is applied
    # here despite the heading.
    X_latent_plda = X_latent_plda[:, -plda.latent_topics:]
    ## Fit Classifiers
    # Classifiers are fitted on training rows where D == 0 only.
    source_train_ind = sorted(set(train_ind) & set(np.where(D == 0)[0]))
    lr_lda = LogisticRegression()
    lr_lda.fit(X_latent_lda[source_train_ind], y[source_train_ind])
    lr_plda = LogisticRegression()
    lr_plda.fit(X_latent_plda[source_train_ind], y[source_train_ind])
    ## Make Test Predictions
    # Probabilities are computed for every document; test_ind is handed to
    # score_model together with them.
    y_test_lda = lr_lda.predict_proba(X_latent_lda)[:, 1]
    y_test_plda = lr_plda.predict_proba(X_latent_plda)[:, 1]
    ## Score Predictions
    scores = score_model(y, y_test_lda, y_test_plda, D, test_ind, True)
    if config.output_dir is not None and config.run_id is not None:
        with open(f"{config.output_dir}/{config.run_id}.scores.json",
                  "w") as the_file:
            json.dump(scores, the_file)
def create_model(k, alpha, eta):
    """Return a new LDA model and its name as ``(model, 'lda_<k>')``.

    The model is built with ``rm_top=20`` and the given ``k``, ``alpha``
    and ``eta``.
    """
    model = tp.LDAModel(k=k, rm_top=20, alpha=alpha, eta=eta)
    model_name = 'lda_{}'.format(k)
    return model, model_name
def load_model(input, model_name):
    """Load a saved LDA model from the path ``input + model_name``.

    Args:
        input: Path prefix, concatenated with ``model_name`` without a
            separator (the name shadows the builtin but is kept for callers).
        model_name: File name of the saved model.

    Returns:
        The loaded ``tp.LDAModel``.
    """
    model_path = '{}{}'.format(input, model_name)
    # `load` is called on the class, so no throwaway LDAModel is constructed.
    return tp.LDAModel.load(model_path)
def main():
    """
    Run the topic-model + classifier experiment end to end.

    Steps, in order: parse the command line and configuration, create the
    output directories, load/align/split/sample the source and target
    datasets (optionally isolating one cross-validation fold), build the
    corpora, train an LDA or PLDA model while periodically inferring
    document-topic distributions, write diagnostic plots, then fit logistic
    regression classifiers on the latent topic representations and cache
    their scores to ``classification/scores.csv``.

    Raises:
        ValueError: If the sampler configuration would yield no inferences.
    """
    # Local import: only needed for the configuration copy below.
    import shutil

    ###################
    ### Script Setup
    ###################
    ## Parse Command Line
    args = parse_arguments()
    ## Load Configuration
    config = Config(filepath=args.config)
    ## Check Sampler
    if not valid_sampler(config):
        raise ValueError("Configuration results in no inferences. Change burn-in or sample frequency.")
    ## Create Output Directories
    basedir = f"{config.output_dir}/" if args.fold is None else f"{config.output_dir}/fold-{args.fold}/".replace("//","/")
    dirs = ["topic_model/document_topic/","topic_model/topic_word/","classification/"]
    for d in dirs:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(f"{basedir}{d}", exist_ok=True)
    ## Cache Configuration (best effort: a failed copy must not abort the run)
    try:
        shutil.copy(args.config, f"{config.output_dir}/config.json")
    except OSError as err:
        LOGGER.warning(f"Unable to cache configuration file: {err}")
    ## Set Random Seed
    if config.random_seed is not None:
        np.random.seed(config.random_seed)
    ###################
    ### Data Preparation
    ###################
    ## Load Data
    LOGGER.info("Loading Processed Datasets")
    X_source, y_source, splits_source, filenames_source, users_source, terms_source = load_data(f"{DEPRESSION_DATA_DIR}{config.source}/")
    X_target, y_target, splits_target, filenames_target, users_target, terms_target = load_data(f"{DEPRESSION_DATA_DIR}{config.target}/")
    ## Align Vocabulary Spaces
    LOGGER.info("Aligning Vocabularies")
    X_source, X_target, vocab = align_data(X_source, X_target, terms_source, terms_target, config.vocab_alignment)
    ## Split Data
    LOGGER.info("Separating Datasets by Split")
    Xs_train, Xs_dev, Xs_test, ys_train, ys_dev, ys_test = split_data(X_source, y_source, splits_source)
    Xt_train, Xt_dev, Xt_test, yt_train, yt_dev, yt_test = split_data(X_target, y_target, splits_target)
    ## Sampling
    LOGGER.info("Sampling Source Data")
    Xs_train, ys_train = sample_data(Xs_train, ys_train, config.source_class_ratio.get("train"), config.source_sample_size.get("train"))
    Xs_dev, ys_dev = sample_data(Xs_dev, ys_dev, config.source_class_ratio.get("dev"), config.source_sample_size.get("dev"))
    Xs_test, ys_test = sample_data(Xs_test, ys_test, config.source_class_ratio.get("test"), config.source_sample_size.get("test"))
    LOGGER.info("Sampling Target Data")
    Xt_train, yt_train = sample_data(Xt_train, yt_train, config.target_class_ratio.get("train"), config.target_sample_size.get("train"))
    Xt_dev, yt_dev = sample_data(Xt_dev, yt_dev, config.target_class_ratio.get("dev"), config.target_sample_size.get("dev"))
    Xt_test, yt_test = sample_data(Xt_test, yt_test, config.target_class_ratio.get("test"), config.target_sample_size.get("test"))
    ## Cross Validation
    if args.fold is not None:
        LOGGER.info(f"Isolating K-Fold Data (Fold {args.fold})")
        ## Initialize Splitter
        splitter = StratifiedKFold(n_splits=args.k_folds, shuffle=True, random_state=config.random_seed)
        ## Merge Data
        Xs_all = sparse.vstack([Xs_train, Xs_dev])
        Xt_all = sparse.vstack([Xt_train, Xt_dev])
        ys_all = np.hstack([ys_train,ys_dev])
        yt_all = np.hstack([yt_train,yt_dev])
        ## Get Train and Dev Splits for the Fold (args.fold is 1-indexed)
        splits_source = list(splitter.split(Xs_all, ys_all))[args.fold-1]
        splits_target = list(splitter.split(Xt_all, yt_all))[args.fold-1]
        ## Isolate Relevant Data
        Xs_train, ys_train = Xs_all[splits_source[0]], ys_all[splits_source[0]]
        Xs_dev, ys_dev = Xs_all[splits_source[1]], ys_all[splits_source[1]]
        Xt_train, yt_train = Xt_all[splits_target[0]], yt_all[splits_target[0]]
        Xt_dev, yt_dev = Xt_all[splits_target[1]], yt_all[splits_target[1]]
    ###################
    ### Corpus Generation
    ###################
    ## Sample Topic Model Training Masks
    if config.topic_model_data.get("source") is not None:
        if config.topic_model_data.get("source") > Xs_train.shape[0]:
            LOGGER.warning("Requested Source Topic Model Train Size Greater than Available Data. Downsizing.")
            config.topic_model_data["source"] = Xs_train.shape[0]
        source_mask = sorted(np.random.choice(Xs_train.shape[0], size=config.topic_model_data.get("source"), replace=False))
    else:
        source_mask = list(range(Xs_train.shape[0]))
    if config.topic_model_data.get("target") is not None:
        if config.topic_model_data.get("target") > Xt_train.shape[0]:
            LOGGER.warning("Requested Target Topic Model Train Size Greater than Available Data. Downsizing.")
            config.topic_model_data["target"] = Xt_train.shape[0]
        target_mask = sorted(np.random.choice(Xt_train.shape[0], size=config.topic_model_data.get("target"), replace=False))
    else:
        target_mask = list(range(Xt_train.shape[0]))
    ## Initialize Corpus
    LOGGER.info("Generating Training Corpus (Topic-Model Learning)")
    train_corpus, train_missing = generate_corpus(Xs_train, label="source", mask=source_mask, num_jobs=args.num_jobs)
    train_corpus, train_missing = generate_corpus(Xt_train, label="target", mask=target_mask, corpus=train_corpus, missing=train_missing, num_jobs=args.num_jobs)
    LOGGER.info("Generating Training Corpus (Inference)")
    if config.topic_model_data.get("source") is None and config.topic_model_data.get("target") is None:
        # No subsampling was requested, so the learning corpus already covers all training data
        LOGGER.info("Using Training Corpus for Inference")
        train_corpus_infer = train_corpus
        train_missing_infer = train_missing
    else:
        train_corpus_infer, train_missing_infer = generate_corpus(Xs_train, label="source", missing={}, num_jobs=args.num_jobs)
        train_corpus_infer, train_missing_infer = generate_corpus(Xt_train, label="target", corpus=train_corpus_infer, missing=train_missing_infer, num_jobs=args.num_jobs)
    LOGGER.info("Generating Development Corpus (Inference)")
    development_corpus, dev_missing = generate_corpus(Xs_dev, label="source", missing={}, num_jobs=args.num_jobs)
    development_corpus, dev_missing = generate_corpus(Xt_dev, label="target", corpus=development_corpus, missing=dev_missing, num_jobs=args.num_jobs)
    if args.evaluate_test:
        LOGGER.info("Generating Test Corpus (Inference)")
        test_corpus, test_missing = generate_corpus(Xs_test, label="source", missing={}, num_jobs=args.num_jobs)
        test_corpus, test_missing = generate_corpus(Xt_test, label="target", corpus=test_corpus, missing=test_missing, num_jobs=args.num_jobs)
    ###################
    ### Topic Model (Training)
    ###################
    ## Initialize Model
    if config.use_plda:
        model = tp.PLDAModel(alpha=config.alpha,
                             eta=config.beta,
                             latent_topics=config.k_latent,
                             topics_per_label=config.k_per_label,
                             min_df=config.min_doc_freq,
                             rm_top=config.rm_top,
                             corpus=train_corpus,
                             seed=config.random_seed)
    else:
        model = tp.LDAModel(alpha=config.alpha,
                            eta=config.beta,
                            k=config.k_latent,
                            min_df=config.min_doc_freq,
                            rm_top=config.rm_top,
                            corpus=train_corpus,
                            seed=config.random_seed)
    ## Initialize Sampler
    model.train(1, workers=args.num_jobs)
    ## Corpus-Updated Parameters
    V = model.num_vocabs
    N = len(model.docs)
    N_train = len(train_corpus_infer)
    N_dev = len(development_corpus)
    N_test = len(test_corpus) if args.evaluate_test else 0
    K = model.k
    ## Gibbs Cache
    ll = np.zeros(config.n_iter)
    theta_train = []
    theta_dev = []
    theta_test = [] if args.evaluate_test else None
    if args.cache_parameters:
        # Full trace: one (K, V) / (N, K) slice per iteration
        phi = np.zeros((config.n_iter, K, V))
        theta = np.zeros((config.n_iter, N, K))
    else:
        # Only the final iteration's parameters are retained
        phi = np.zeros((K,V))
        theta = np.zeros((N, K))
    ## Train Model
    for epoch in tqdm(range(0, config.n_iter), desc="MCMC Iteration", file=sys.stdout):
        ## Run Sample Epoch
        model.train(1, workers=args.num_jobs)
        ## Examine Data Fit
        ll[epoch] = model.ll_per_word
        ## Cache Model Parameters
        if args.cache_parameters:
            phi[epoch] = np.vstack([model.get_topic_word_dist(i) for i in range(K)])
            theta[epoch] = np.vstack([d.get_topic_dist() for d in model.docs])
        elif epoch == (config.n_iter - 1):
            phi = np.vstack([model.get_topic_word_dist(i) for i in range(K)])
            theta = np.vstack([d.get_topic_dist() for d in model.docs])
        ## Make Inferences Regularly
        if (epoch + 1) >= config.n_burn and (epoch + 1) % config.infer_sample_rate == 0:
            ## Training Inference
            train_dist, _ = model.infer(train_corpus_infer, iter=config.n_sample, together=False)
            theta_train.append(np.vstack([t.get_topic_dist() for t in train_dist]))
            ## Development Inference
            dev_dist, _ = model.infer(development_corpus, iter=config.n_sample, together=False)
            theta_dev.append(np.vstack([d.get_topic_dist() for d in dev_dist]))
            ## Test Inference
            if args.evaluate_test:
                test_dist, _ = model.infer(test_corpus, iter=config.n_sample, together=False)
                theta_test.append(np.vstack([t.get_topic_dist() for t in test_dist]))
    ## Stack Inferences (first axis indexes the inference sample)
    theta_train = np.stack(theta_train)
    theta_dev = np.stack(theta_dev)
    if args.evaluate_test:
        theta_test = np.stack(theta_test)
    ## Cache Model Summary (context manager guarantees the file is flushed and closed)
    with open(f"{basedir}/topic_model/model_summary.txt","w") as summary_file:
        _ = model.summary(topic_word_top_n=20, file=summary_file)
    ################
    ### Topic Model Diagnostics
    ################
    ## Plot Likelihood
    fig, ax = plt.subplots(figsize=(10,5.8))
    ax.plot(ll)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.set_xlabel("MCMC Iteration", fontweight="bold")
    ax.set_ylabel("Log-Likelihood Per Word", fontweight="bold")
    fig.tight_layout()
    fig.savefig(f"{basedir}/topic_model/log_likelihood_train{args.plot_fmt}",dpi=300)
    plt.close(fig)
    ## Evaluate Topics
    for k in range(model.k):
        top_terms = [i[0] for i in model.get_topic_words(k, top_n=20)]
        if config.use_plda:
            LOGGER.info("{}: {}".format(which_plda_topic(k, model), ", ".join(top_terms)))
        else:
            LOGGER.info("{}: {}".format(k, ", ".join(top_terms)))
    ## Show Average Topic Distribution (Training Data)
    ## Diagnostic plots are best effort: failures are logged, never fatal.
    LOGGER.info("Plotting Average Topic Distributions")
    try:
        fig, ax = plot_average_topic_distribution(theta=theta_train,
                                                  model=model,
                                                  use_plda=config.use_plda,
                                                  n_burn=0)
        fig.savefig(f"{basedir}/topic_model/average_topic_distribution_train{args.plot_fmt}",dpi=300)
        plt.close(fig)
    except Exception as err:
        LOGGER.warning(f"Skipping average topic distribution plot (train): {err}")
    ## Show Average Topic Distribution (Development Data)
    try:
        fig, ax = plot_average_topic_distribution(theta=theta_dev,
                                                  model=model,
                                                  use_plda=config.use_plda,
                                                  n_burn=0)
        fig.savefig(f"{basedir}/topic_model/average_topic_distribution_development{args.plot_fmt}",dpi=300)
        plt.close(fig)
    except Exception as err:
        LOGGER.warning(f"Skipping average topic distribution plot (development): {err}")
    ## Show Trace for a Document Topic Distribution (Random Sample)
    if args.plot_document_topic:
        LOGGER.info("Plotting Sample of Document Topic Distributions")
        for doc_n in np.random.choice(theta_train.shape[1], 10):
            try:
                fig, ax = plot_document_topic_distribution(doc=doc_n, theta=theta)
                fig.savefig(f"{basedir}/topic_model/document_topic/train_{doc_n}{args.plot_fmt}",dpi=300)
                plt.close(fig)
            except Exception as err:
                LOGGER.warning(f"Skipping document topic plot for document {doc_n}: {err}")
    ## Show Trace for a Topic Word Distribution
    if args.plot_topic_word:
        LOGGER.info("Plotting Topic Word Distributions")
        for topic in tqdm(range(K), total=K, desc="Topic Word Distribution", file=sys.stdout):
            try:
                fig, ax = plot_topic_word_distribution(topic=topic,
                                                       phi=phi,
                                                       model=model,
                                                       use_plda=config.use_plda,
                                                       n_trace=30,
                                                       n_top=30,
                                                       n_burn=config.n_burn if config.n_burn < phi.shape[0] else -100)
                fig.savefig(f"{basedir}/topic_model/topic_word/topic_{topic}{args.plot_fmt}",dpi=300)
                plt.close(fig)
            except Exception as err:
                LOGGER.warning(f"Skipping topic word plot for topic {topic}: {err}")
    ################
    ### Depression Classifier Training
    ################
    LOGGER.info("Beginning Classifier Training Procedure")
    ## Isolate General Latent Representations (last k_latent topic columns)
    theta_train_latent = theta_train[:,:,-config.k_latent:]
    theta_dev_latent = theta_dev[:,:,-config.k_latent:]
    if args.evaluate_test:
        theta_test_latent = theta_test[:,:,-config.k_latent:]
    ## Get Ground Truth Labels (rows reported as missing by generate_corpus are dropped)
    y_train = np.array(
        [j for i, j in enumerate(ys_train) if i not in train_missing_infer.get("source")] + \
        [j for i, j in enumerate(yt_train) if i not in train_missing_infer.get("target")]
    )
    y_dev = np.array(
        [j for i, j in enumerate(ys_dev) if i not in dev_missing.get("source")] + \
        [j for i, j in enumerate(yt_dev) if i not in dev_missing.get("target")]
    )
    if args.evaluate_test:
        y_test = np.array(
            [j for i, j in enumerate(ys_test) if i not in test_missing.get("source")] + \
            [j for i, j in enumerate(yt_test) if i not in test_missing.get("target")]
        )
    ## Domain Masks (source rows come first, target rows follow)
    source_train_ind = list(range(Xs_train.shape[0] - len(train_missing_infer.get("source"))))
    target_train_ind = list(range(len(source_train_ind), y_train.shape[0]))
    source_dev_ind = list(range(Xs_dev.shape[0] - len(dev_missing.get("source"))))
    target_dev_ind = list(range(len(source_dev_ind), y_dev.shape[0]))
    if args.evaluate_test:
        source_test_ind = list(range(Xs_test.shape[0] - len(test_missing.get("source"))))
        target_test_ind = list(range(len(source_test_ind), y_test.shape[0]))
    ## Separate Training Labels
    y_train_s = y_train[source_train_ind]
    y_train_t = y_train[target_train_ind]
    y_dev_s = y_dev[source_dev_ind]
    y_dev_t = y_dev[target_dev_ind]
    if args.evaluate_test:
        y_test_s = y_test[source_test_ind]
        y_test_t = y_test[target_test_ind]
    ## Caching
    if args.cache_predictions:
        ## Labels
        _ = np.save(f"{basedir}/classification/labels.train.npy", y_train)
        _ = np.save(f"{basedir}/classification/labels.dev.npy", y_dev)
        ## Indices
        _ = np.save(f"{basedir}/classification/source.train.npy", source_train_ind)
        _ = np.save(f"{basedir}/classification/target.train.npy", target_train_ind)
        _ = np.save(f"{basedir}/classification/source.dev.npy", source_dev_ind)
        _ = np.save(f"{basedir}/classification/target.dev.npy", target_dev_ind)
        if args.evaluate_test:
            ## Labels
            _ = np.save(f"{basedir}/classification/labels.test.npy", y_test)
            ## Indices
            _ = np.save(f"{basedir}/classification/source.test.npy", source_test_ind)
            _ = np.save(f"{basedir}/classification/target.test.npy", target_test_ind)
    ## Cycle Through Types of Preprocessing, Training, and Inference
    all_scores = []
    for C in config.C:
        for average_representation in config.averaging:
            for norm in config.norm:
                LOGGER.info("Feature Set: Average Representation ({}), Norm ({}), Regularization ({})".format(average_representation, norm, C))
                if average_representation:
                    ## Average
                    X_train = theta_train_latent.mean(axis=0)
                    X_dev = theta_dev_latent.mean(axis=0)
                    if args.evaluate_test:
                        X_test = theta_test_latent.mean(axis=0)
                    ## Normalization (If Desired)
                    if norm:
                        X_train = normalize(X_train, norm=norm, axis=1)
                        X_dev = normalize(X_dev, norm=norm, axis=1)
                        if args.evaluate_test:
                            X_test = normalize(X_test, norm=norm, axis=1)
                    ## Reshape Data (restore a leading sample axis of length 1)
                    X_train = X_train.reshape((1,X_train.shape[0],X_train.shape[1]))
                    X_dev = X_dev.reshape((1,X_dev.shape[0], X_dev.shape[1]))
                    if args.evaluate_test:
                        X_test = X_test.reshape((1,X_test.shape[0], X_test.shape[1]))
                else:
                    ## Remove Burn In
                    X_train = theta_train_latent.copy()
                    X_dev = theta_dev_latent.copy()
                    if args.evaluate_test:
                        X_test = theta_test_latent.copy()
                    ## Normalization (If Desired)
                    if norm:
                        X_train = np.stack([normalize(x, norm=norm, axis=1) for x in X_train])
                        X_dev = np.stack([normalize(x, norm=norm, axis=1) for x in X_dev])
                        if args.evaluate_test:
                            X_test = np.stack([normalize(x, norm=norm, axis=1) for x in X_test])
                ## Training (one classifier per inference sample, fit on source rows only)
                models = []
                for x in tqdm(X_train, desc="Fitting Models", file=sys.stdout):
                    ## Fit Classifier
                    logit = LogisticRegression(C=C,
                                               random_state=42,
                                               max_iter=config.max_iter,
                                               solver='lbfgs')
                    logit.fit(x[source_train_ind], y_train[source_train_ind])
                    ## Get Predictions
                    models.append(logit)
                ## Inference
                y_pred_train = np.zeros((len(models), X_train.shape[0], y_train.shape[0]))
                y_pred_dev = np.zeros((len(models), X_dev.shape[0], y_dev.shape[0]))
                if args.evaluate_test:
                    y_pred_test = np.zeros((len(models), X_test.shape[0], y_test.shape[0]))
                for m, mdl in tqdm(enumerate(models), position=0, desc="Making Predictions", total=len(models), file=sys.stdout):
                    # The 1-D probability vector is broadcast across the second axis
                    y_pred_train[m] = mdl.predict_proba(X_train[m])[:,1]
                    y_pred_dev[m] = mdl.predict_proba(X_dev[m])[:,1]
                    if args.evaluate_test:
                        y_pred_test[m] = mdl.predict_proba(X_test[m])[:,1]
                ## Cache Predictions
                if args.cache_predictions:
                    ## Predictions
                    _ = np.save(f"{basedir}/classification/predictions.train.{C}.{average_representation}.{norm}.npy", y_pred_train)
                    _ = np.save(f"{basedir}/classification/predictions.dev.{C}.{average_representation}.{norm}.npy", y_pred_dev)
                    if args.evaluate_test:
                        # NOTE(review): the test predictions are written under a name ending in
                        # ".dev.npy", unlike the train/dev pattern above. Kept unchanged because
                        # downstream readers may rely on it -- confirm and rename if unintended.
                        _ = np.save(f"{basedir}/classification/predictions.{C}.{average_representation}.{norm}.dev.npy", y_pred_test)
                ## Learn Optimal Thresholds (Youden's J-Score)
                thresholds = {}
                for m, mdl_pred in tqdm(enumerate(y_pred_dev), total=y_pred_dev.shape[0], desc="Learning Binarization Thresholds", file=sys.stdout):
                    for l, latent_pred in enumerate(mdl_pred):
                        for d, dind in enumerate([source_dev_ind, target_dev_ind]):
                            if args.learn_threshold:
                                d_l_pred = latent_pred[dind]
                                d_l_true = y_dev[dind]
                                if len(d_l_pred) == 0:
                                    continue
                                fpr, tpr, t = metrics.roc_curve(d_l_true, d_l_pred, drop_intermediate=False)
                                j_scores = tpr - fpr
                                j_ordered = sorted(zip(j_scores, t))
                                j_opt_thresh = j_ordered[-1][1]
                                thresholds[(m,l,d)] = j_opt_thresh
                            else:
                                thresholds[(m,l,d)] = 0.5
                ## ROC Curves
                LOGGER.info("Plotting ROC/AUC and Scoring Training/Development Predictions")
                auc_scores = [[[],[]],[[],[]]]
                fig, ax = plt.subplots(2, 2, figsize=(10,5.8), sharex=True, sharey=True)
                for m, mdl_pred in tqdm(enumerate(y_pred_train), total=y_pred_train.shape[0], desc="Train Scoring", file=sys.stdout):
                    for l, latent_pred in enumerate(mdl_pred):
                        for d, dind in enumerate([source_train_ind, target_train_ind]):
                            d_l_pred = latent_pred[dind]
                            d_l_true = y_train[dind]
                            if len(d_l_pred) == 0:
                                continue
                            tpr, fpr, dl_scores = get_scores(d_l_true, d_l_pred, threshold=thresholds[(m,l,d)])
                            auc_scores[d][0].append(dl_scores.get("auc",0))
                            dl_scores.update({"model_n":m,"domain":"source" if d == 0 else "target","group":"train","threshold":thresholds[(m,l,d)]})
                            dl_scores.update({"norm":norm, "is_average_representation":average_representation, "C":C})
                            all_scores.append(dl_scores)
                            ax[d][0].plot(tpr, fpr, alpha=0.01 if not average_representation else .8, color="navy", linewidth=0.5 if not average_representation else 1)
                for m, mdl_pred in tqdm(enumerate(y_pred_dev), total=y_pred_dev.shape[0], desc="Development Scoring", file=sys.stdout):
                    for l, latent_pred in enumerate(mdl_pred):
                        for d, dind in enumerate([source_dev_ind, target_dev_ind]):
                            d_l_pred = latent_pred[dind]
                            d_l_true = y_dev[dind]
                            if len(d_l_pred) == 0:
                                continue
                            tpr, fpr, dl_scores = get_scores(d_l_true, d_l_pred, threshold=thresholds[(m,l,d)])
                            dl_scores.update({"model_n":m,"domain":"source" if d == 0 else "target","group":"development","threshold":thresholds[(m,l,d)]})
                            dl_scores.update({"norm":norm, "is_average_representation":average_representation,"C":C})
                            all_scores.append(dl_scores)
                            auc_scores[d][1].append(dl_scores.get("auc",0))
                            ax[d][1].plot(tpr, fpr, alpha=0.01 if not average_representation else .8, color="navy", linewidth=0.5 if not average_representation else 1)
                if args.evaluate_test:
                    for m, mdl_pred in tqdm(enumerate(y_pred_test), total=y_pred_test.shape[0], desc="Test Scoring", file=sys.stdout):
                        for l, latent_pred in enumerate(mdl_pred):
                            for d, dind in enumerate([source_test_ind, target_test_ind]):
                                d_l_pred = latent_pred[dind]
                                d_l_true = y_test[dind]
                                if len(d_l_pred) == 0:
                                    continue
                                tpr, fpr, dl_scores = get_scores(d_l_true, d_l_pred, threshold=thresholds[(m,l,d)])
                                dl_scores.update({"model_n":m,"domain":"source" if d == 0 else "target","group":"test","threshold":thresholds[(m,l,d)]})
                                dl_scores.update({"norm":norm, "is_average_representation":average_representation,"C":C})
                                all_scores.append(dl_scores)
                for i, domain in enumerate(["Source","Target"]):
                    ax[-1,i].set_xlabel("True Positive Rate", fontweight="bold")
                    ax[i, 0].set_ylabel("False Positive Rate", fontweight="bold")
                    for j, group in enumerate(["Train","Development"]):
                        ax[i,j].plot([0,1],[0,1],color="black",linestyle="--")
                        ax[i,j].spines["top"].set_visible(False)
                        ax[i,j].spines["right"].set_visible(False)
                        ax[i,j].set_title(f"{domain} {group}", fontweight="bold")
                        if len(auc_scores[i][j]) > 0:
                            # Empty proxy line so the mean AUC appears in the legend
                            ax[i,j].plot([],[],color="navy",label="Mean AUC: {:.3f}".format(np.mean(auc_scores[i][j])))
                            ax[i,j].legend(loc="lower right")
                        ax[i,j].set_xlim(0,1)
                        ax[i,j].set_ylim(0,1)
                fig.tight_layout()
                fig.savefig(f"{basedir}/classification/roc_auc_{average_representation}_{norm}_{C}{args.plot_fmt}",dpi=300)
                plt.close(fig)
    ## Format Scores
    LOGGER.info("Caching Scores")
    all_scores_df = pd.DataFrame(all_scores)
    if args.fold is not None:
        all_scores_df["fold"] = args.fold
    all_scores_df.to_csv(f"{basedir}/classification/scores.csv",index=False)
    ## Script Complete
    LOGGER.info("Done!")
''' extracted_words.append(word) cohesion_score = { word: score.cohesion_forward for word, score in words.items() } tokenizer = MaxScoreTokenizer(scores=cohesion_score) #=================LDA trian strat======================== #Generate LDAModel #k = the number of topic #alpha = ? #eta = ? #min_cf = min frequency model = tp.LDAModel(k=10, alpha=0.1, eta=0.01, min_cf=5) for i in raw_chat: model.add_doc(tokenizer.tokenize(i)) #check the number of words, vocabulary #prepare the train model.train(0) print('Total docs:', len(model.docs)) print('Total words:', model.num_words) print('Vocab size:', model.num_vocabs) #200times training for i in range(200): print('Iteration {}\tLL per word: {}'.format(i, model.ll_per_word)) model.train(1)
vocab = [vocab[v] for v in vocab_mask] ## Generate Corpus corpus, missing = generate_corpus(X_source, X_target, vocab, source=True, target=True) ## Initialize LDA Model n_iter = 1000 n_burn = 250 model = tp.LDAModel(alpha=0.01, eta=0.01, k=50, min_df=min_user_freq, rm_top=250, corpus=corpus, seed=42) ## Initialize Sampler model.train(1, workers=8) ## Corpus Parameters V = model.num_vocabs N = len(model.docs) K = model.k ## Gibbs Cache ll = np.zeros(n_iter) phi = np.zeros((n_iter, K, V))
def train_model(self, dataset, hyperparameters=None, top_words=10):
    """Train a tomotopy LDA model on ``dataset`` and collect its outputs.

    Args:
        dataset: Object providing ``get_corpus()``,
            ``get_partitioned_corpus(use_validation=False)`` and
            ``get_vocabulary()``.
        hyperparameters: Optional dict passed to
            ``self.set_default_hyperparameters``; an empty dict is used
            when ``None``.
        top_words: Number of top words extracted for each topic.

    Returns:
        dict with the keys ``'topics'`` (list of top-word lists, one per
        topic), ``'topic-word-matrix'`` (topics x vocabulary, columns in
        ``dataset.get_vocabulary()`` order), ``'topic-document-matrix'``
        (topics x training documents) and, when a test partition is used,
        ``'test-topic-document-matrix'``.
    """
    if hyperparameters is None:
        hyperparameters = dict()
    self.set_default_hyperparameters(hyperparameters)

    if self.use_partitions:
        x_train, x_test = dataset.get_partitioned_corpus(
            use_validation=False)
    else:
        x_train = dataset.get_corpus()
        x_test = None

    lda = tp.LDAModel(k=self.hyperparameters['num_topics'],
                      alpha=self.hyperparameters['alpha'],
                      eta=self.hyperparameters['eta'])
    for document in x_train:
        lda.add_doc(document)
    lda.train(self.hyperparameters['max_iters'])

    # topic word distribution matrix
    topic_word_matrix = np.stack([
        lda.get_topic_word_dist(k, normalize=True) for k in range(lda.k)
    ])
    # topic document distribution matrix
    topic_document_matrix = np.stack([
        doc.get_topic_dist() for doc in lda.docs
    ])

    # Materialise the vocabularies once: the previous code rebuilt
    # list(lda.used_vocabs) for every dataset word, which is quadratic.
    dataset_vocab = list(dataset.get_vocabulary())
    used_vocab = list(lda.used_vocabs)
    used_vocab_set = set(used_vocab)

    # Dataset words the model did not keep get an all-zero column so the
    # matrix covers the full dataset vocabulary.
    additional_words = [
        item for item in dataset_vocab if item not in used_vocab_set
    ]
    num_additional_words = len(additional_words)
    if num_additional_words > 0:
        topic_word_matrix = np.concatenate(
            (topic_word_matrix,
             np.zeros((topic_word_matrix.shape[0], num_additional_words),
                      dtype=float)),
            axis=1)

    # Reorder the columns to follow the dataset vocabulary order.
    final_vocab = used_vocab + additional_words
    vocab2id = {w: i for i, w in enumerate(final_vocab)}
    sorted_indexes = [vocab2id[w] for w in dataset_vocab]
    topic_word_matrix = topic_word_matrix[:, sorted_indexes]

    # topics extraction: honour ``top_words`` (it was previously ignored and
    # the library default was always used).
    topic_w = [
        [word[0] for word in lda.get_topic_words(k, top_n=top_words)]
        for k in range(lda.k)
    ]

    # Output model on the Train Set
    info = {}
    info['topics'] = topic_w
    info['topic-word-matrix'] = topic_word_matrix
    info['topic-document-matrix'] = topic_document_matrix.T

    # Inference on the test set
    if x_test is not None:
        doc_inst = [lda.make_doc(i) for i in x_test]
        topic_dist, _ = lda.infer(doc_inst)  # topic document distribution
        info['test-topic-document-matrix'] = np.asarray(topic_dist).T

    return info
def lda_param_checker(tw=tp.TermWeight.IDF,
                      min_cf_0=0, min_cf_f=1, min_cf_s=1,
                      min_df_0=0, min_df_f=1, min_df_s=1,
                      rm_top_0=0, rm_top_f=1, rm_top_s=1,
                      k_0=2, k_f=12, k_s=3,
                      alpha_0=-1, alpha_f=0, alpha_s=1,
                      eta_0=0, eta_f=1, eta_s=1,
                      seed=101, corpus=None, burn=100, train=1001,
                      word_list=None, card_count=30,
                      to_excel=False, fname='param_checking.xlsx'):
    """
    Grid-search LDA parameters and tabulate fit and coherence statistics.

    Every ``*_0`` / ``*_f`` / ``*_s`` triple is passed to ``range``, so the
    end value ``*_f`` is exclusive.

    Parameters
        tw: Union[int, TermWeight]
            Term weighting scheme, see
            https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
            the default is inverse document frequency, which means that cards
            that appear in almost all decks are weighted lower than cards that
            appear in very few decks.
        min_cf_0, min_cf_f, min_cf_s: int
            Start, end and step of the minimum card collection frequency.
        min_df_0, min_df_f, min_df_s: int
            Start, end and step of the minimum deck (document) frequency.
        rm_top_0, rm_top_f, rm_top_s: int
            Start, end and step of the number of top cards to exclude.
        k_0, k_f, k_s: int
            Start, end and step of the number of topics.
        alpha_0, alpha_f, alpha_s: int
            Start, end and step of the alpha hyperparameter expressed as a
            power of ten, i.e. alpha = 10^(alpha_0).
        eta_0, eta_f, eta_s: int
            Start, end and step of the eta hyperparameter expressed as a
            power of ten, i.e. eta = 10^(eta_0).
        seed: int
            Random seed handed to every model.
        corpus: tomotopy Corpus
            Documents to be added into each model. The method will not
            function without a corpus.
        burn: int
            Value assigned to the model's ``burn_in`` attribute.
        train: int
            Number of iterations to train over; training runs in chunks of
            100 iterations and the log-likelihood is sampled after each chunk.
        word_list: list of lists of strings
            Collection of decklists with each card name represented as a
            string.
        card_count: int
            Number of cards used to evaluate card coherence.
        to_excel: boolean
            Output the resulting DataFrame to an Excel spreadsheet?
        fname: string ending in '.xlsx'
            If to_excel is True, filename of the resulting spreadsheet.

    :return: DataFrame with one row per parameter combination. Columns:
        the parameters themselves ('tw', 'Min. f_col', 'Min. f_doc',
        'Top n Terms Removed', 'k', 'alpha', 'eta'), 'Avg. LL',
        'LL Std. Dev.', 'LL CV' (Std. Dev. / Average), 'Perplexity', one
        'Top <n> Coherence' column per topic, then 'Average Coherence',
        'Coherence Std Dev' and 'Coherence CV'.
    """
    base_columns = [
        'tw', 'Min. f_col', 'Min. f_doc', 'Top n Terms Removed', 'k',
        'alpha', 'eta', 'Avg. LL', 'LL Std. Dev.', 'LL CV', 'Perplexity'
    ]
    rows = []
    average_coherences = []
    coh_std_dev = []
    coh_cv = []
    # Widest model seen so far; decides how many per-topic columns exist.
    # Tracking the maximum (instead of reading the last model's k) keeps the
    # header valid for any iteration order and for an empty grid.
    max_topics = 0

    for cf in range(min_cf_0, min_cf_f, min_cf_s):
        print("Collection Frequency = " + str(cf))
        for doc_freq in range(min_df_0, min_df_f, min_df_s):
            print("Document Frequency = " + str(doc_freq))
            for rm in range(rm_top_0, rm_top_f, rm_top_s):
                print("Remove Top " + str(rm) + " Words")
                for k in range(k_0, k_f, k_s):
                    print(str(k) + " Topics")
                    for a in range(alpha_0, alpha_f, alpha_s):
                        print("alpha = " + str(10**a))
                        for e in range(eta_0, eta_f, eta_s):
                            print("eta = " + str(10**e))
                            ll_list = []
                            lda = tp.LDAModel(tw=tw, min_cf=cf,
                                              min_df=doc_freq, rm_top=rm,
                                              k=k, alpha=10**a, eta=10**e,
                                              seed=seed, corpus=corpus)
                            lda.burn_in = burn
                            lda.train(0)
                            for _ in range(0, train, 100):
                                lda.train(100)
                                ll_list.append(lda.ll_per_word)
                            max_topics = max(max_topics, lda.k)

                            # Population statistics of the sampled LL values
                            lda_mean = sum(ll_list) / len(ll_list)
                            lda_variance = sum(
                                (x - lda_mean)**2 for x in ll_list
                            ) / len(ll_list)
                            lda_std_dev = lda_variance**0.5
                            lda_cv = lda_std_dev / lda_mean

                            # The helper below was designed for HDP but is
                            # believed to be usable with LDA as well.
                            lda_topics = get_lda_topics(lda, card_count)
                            results_list = [
                                str(tw), cf, doc_freq, rm, k, 10**a, 10**e,
                                lda_mean, lda_std_dev, lda_cv, lda.perplexity
                            ]
                            topic_coherences = eval_coherence_by_topic(
                                lda, deck_lists=word_list)
                            results_list.extend(topic_coherences)

                            average_coh = eval_coherence(lda_topics, word_list)
                            average_coherences.append(average_coh)
                            coh_variance = sum(
                                (x - average_coh)**2 for x in topic_coherences
                            ) / len(topic_coherences)
                            # Standard deviation is the square ROOT of the
                            # variance (this used to square it).
                            coh_std = coh_variance**0.5
                            coh_std_dev.append(coh_std)
                            coh_cv.append(coh_std / average_coh)
                            rows.append(results_list)

    columns = base_columns + [
        'Top ' + str(num_top) + ' Coherence' for num_top in range(max_topics)
    ]
    results = pd.DataFrame(data=rows, columns=columns)
    results['Average Coherence'] = average_coherences
    results['Coherence Std Dev'] = coh_std_dev
    results['Coherence CV'] = coh_cv
    if to_excel:
        # ``encoding`` is no longer accepted by DataFrame.to_excel in
        # pandas >= 2.0, so it is not passed.
        results.to_excel(fname)
    return results
import tomotopy as tp

# Minimal end-to-end example: fit a 2-topic LDA model on three tiny
# pre-tokenised documents and print the topics of the first document.
documents = (
    ["this", "is", "a", "test"],
    ["another", "document"],
    ["a", "new", "document"],
)

model = tp.LDAModel(k=2, seed=42)
for tokens in documents:
    model.add_doc(tokens)

model.train(100)
print(model.docs[0].get_topics())
sample_ai_ids = set(choice(list(ai_ids), 30000, replace=False)) reference_exc = set(references_ids) - sample_ai_ids citation_exc = set(pre_2021_papers) - sample_ai_ids selected_ids = sample_ai_ids | reference_exc | citation_exc # Removing empty documents selected_corpus = { k: v for k, v in tok.items() if k in selected_ids and len(v) > 0 } # + # Train a topic model and extract the topic mix mdl = tp.LDAModel(k=300) for key, doc in list(selected_corpus.items()): mdl.add_doc(doc) for i in range(0, 150, 10): mdl.train(10) print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word)) # for k in range(mdl.k): # print('Top 10 words of topic #{}'.format(k)) # print(mdl.get_topic_words(k, top_n=10)) mdl.summary() # Calculate topic distributions by category and compare with AI: are they significantly different? # We could check this by comparing the means of the log of