    def test_newt_topic(self):
        '''new t + topic k sampling'''
        w = 2
        fk = HDP.hdp_cpp.fk_cpp(w, self.n_kv)
        pval = HDP.posterior_k_new_t(self.topics, self.m_k, fk, self.gamma, self.V)
        self.assertAlmostEqual(np.sum(pval / pval.sum()), 1)
        self.assertFalse(any(pval < 0))
    def test_rm_xji(self):
        '''Test that x_ji is removed from all structures'''
        j, word, tbl, topic = 0, 0, 1, 1
        doc_j = copy.deepcopy(self.doc_j)
        njtw = copy.deepcopy(self.njtw)
        n_kv = copy.deepcopy(self.n_kv)
        m_k = copy.deepcopy(self.m_k)
        d, n_dict, n_kv2, m_k2, topic_idx2 = HDP.remove_xji(
            j, doc_j, self.topics, word, tbl, njtw, n_kv, m_k)
        self.assertEqual(d['n_jt'][tbl], self.doc_j['n_jt'][tbl] - 1)
        self.assertEqual(n_dict[j][tbl][word], self.njtw[j][tbl][word] - 1)
        self.assertEqual(n_kv2[word, topic], self.n_kv[word, topic] - 1)
    def test_rm_Xvec(self):
        '''rm Xvec conditional distribution'''
        m_k = self.m_k.copy()
        doc_j = copy.deepcopy(self.doc_j)
        topic_idx = self.topics.copy()
        tbl = 3
        k = doc_j['k_jt'][tbl]
        doc_j, topic_idx, m_k = HDP.remove_Xvec_ji(tbl, doc_j, topic_idx, m_k)
        self.assertNotIn(k, topic_idx)
        self.assertEqual(m_k[k], self.m_k[k] - 1)
        self.assertEqual(doc_j['k_jt'][tbl], 0)
    def test_new_table(self):
        '''new table is added'''
        doc_j = copy.deepcopy(self.doc_j)
        k = 2
        njtw = copy.deepcopy(self.njtw)
        t_idx, doc_j, njtw = HDP.new_table(self.j, k, doc_j, njtw)
        self.assertEqual(len(doc_j['n_jt']), len(self.doc_j['n_jt']) + 1)
        self.assertEqual(len(doc_j['k_jt']), len(self.doc_j['k_jt']) + 1)
        self.assertEqual(len(njtw[self.j]), len(self.njtw[self.j]) + 1)
        self.assertIn(t_idx, doc_j['t_j'])
        self.assertEqual(doc_j['k_jt'][t_idx], k)
    def test_remove_tbl(self):
        '''Remove empty tables and topics'''
        doc_j = copy.deepcopy(self.doc_j)
        topic_idx = copy.deepcopy(self.topics)
        m_k = self.m_k.copy()
        tbl = 3
        k = doc_j['k_jt'][tbl]
        d, m_k2, topic_idx2 = HDP.remove_table(doc_j, tbl, topic_idx, m_k)
        self.assertNotIn(tbl, d['t_j'])
        self.assertEqual(m_k2[k], self.m_k[k] - 1)
        # topic k is removed once no tables serve it anymore
        self.assertNotIn(k, topic_idx2)
    def test_new_topic(self):
        '''new topic is added'''
        topic_idx = self.topics.copy()
        m_k = self.m_k.copy()
        n_kv = self.n_kv.copy()
        k_idx, topic_idx, m_k, n_kv = HDP.new_topic(
            topic_idx, m_k, n_kv, self.beta, self.V)
        self.assertEqual(len(topic_idx), len(self.topics) + 1)
        self.assertIn(k_idx, topic_idx)
        self.assertEqual(m_k[k_idx], 0)
        self.assertEqual(n_kv[:, k_idx].sum(), 2.5)
        self.assertGreater(n_kv.shape[1], self.n_kv.shape[1])
    def test_rearrange_cnts(self):
        '''check that counts are added to new k'''
        doc_j = copy.deepcopy(self.doc_j)
        n_kv = copy.deepcopy(self.n_kv)
        new_k = 1
        tbl = 3
        k = doc_j['k_jt'][tbl]
        doc_j, n_kv = HDP.rearranging_k_counts(
            self.j, tbl, new_k, doc_j, self.njtw, n_kv)
        self.assertEqual(np.sum(n_kv[2:4, new_k]), np.sum(self.n_kv[2:4, k]))
        self.assertEqual(np.sum(n_kv[2:4, new_k]), np.sum(n_kv[2:4, k]) + 2)
        self.assertEqual(doc_j['k_jt'][tbl], new_k)
        self.assertNotEqual(doc_j['k_jt'][tbl], k)
    def test_assign_table(self):
        '''word is assigned to a table'''
        doc_j = copy.deepcopy(self.doc_j)
        njtw = copy.deepcopy(self.njtw)
        n_kv = self.n_kv.copy()
        i, w = 4, 4
        new_t = 1
        doc_j, n_kv, njtw = HDP.assign_to_table(
            self.j, i, w, new_t, doc_j, n_kv, njtw)
        self.assertEqual(doc_j['t_ji'][i], new_t)
        self.assertEqual(doc_j['n_jt'][new_t], self.doc_j['n_jt'][new_t] + 1)
        self.assertEqual(n_kv[w, 1], 2.5)
        self.assertEqual(njtw[self.j][new_t][w], 1)
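# Illustrative only (not part of the original test class): one plausible shape for
# the fixtures the assertions above rely on. Every concrete value here is
# hypothetical; the real setUp would have to provide self.doc_j, self.njtw,
# self.n_kv, self.m_k, self.topics, self.beta, self.gamma, self.V and self.j with
# these kinds of types and shapes.
def _example_fixture():
    """Build example HDP bookkeeping structures of the shapes the tests expect."""
    V = 5                                         # vocabulary size
    n_kv = np.full((V, 3), 0.5)                   # word-by-topic pseudo-counts
    doc_j = {'t_j': [1, 2, 3],                    # active tables in the document
             'k_jt': [0, 1, 2, 2],                # topic served at each table
             'n_jt': [0, 3, 2, 1],                # customers seated at each table
             't_ji': [1, 1, 2, 3, 1]}             # per-word table assignments
    njtw = {0: [{}, {0: 2, 1: 1}, {2: 1, 3: 1}, {4: 1}]}  # per-table word counts
    m_k = np.array([0, 2, 1])                     # number of tables serving each topic
    topics = [1, 2]                               # currently active topic ids
    return doc_j, njtw, n_kv, m_k, topics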
import cPickle as pickle
import scipy.sparse

with open(sys.path[0] + '\\dict.txt', 'r') as f:
    vocab_list = [s[:-1] for s in f.readlines()]
vectorizer = CountVectorizer(vocabulary=vocab_list)
V = len(vectorizer.vocabulary)

n_topics = int(sys.argv[1])
n_topics_per_doc = int(sys.argv[2])
batch_size = int(sys.argv[3])
n_iter = int(sys.argv[4])
kappa = float(sys.argv[5]) if len(sys.argv) > 5 else 0.51
D = batch_size * n_iter  # is this reasonable?
max_retrieve = 64  # largest number of articles that are queried together in 1 function call

hdp = HDP(n_topics, n_topics_per_doc, D, V, 1., 0.01, 100., 1, kappa)

elbo_lst = []
scrape_time = 0.
examples = []
log_likelihoods = []
start_time_loop = time.time()
for t in range(n_iter):
    print '====================BATCH %d====================' % t
    sys.stdout.flush()
    articlenames = []
    n_requested = 0
    mats = []
    while n_requested < batch_size:
        request_size = min(batch_size - n_requested, max_retrieve)
        start_time = time.time()
graph = Graph(f_net, 'edge list', directed=False, weighted=False,
              memory_control=True)

# generate corpus
print('generating training/testing corpus...')
train_corpus = Corpus()
train_corpus.generate_corpus_from_graph_using_random_walk(
    graph, avg_l, D, 'deterministic+random')
test_corpus = Corpus()
test_corpus.generate_corpus_from_graph_using_random_walk(graph, avg_l, 3000)

# stochastic variational inference
hdp = HDP(T, K, D, graph.n, eta, alpha, gamma, kappa, tau, scale, adding_noise)
log_file = open(f_log, "w")
log_file.write(
    "iteration time doc.count score word.count unseen.score unseen.word.count\n")

max_iter_per_epoch = np.ceil(D / batchsize)
total_doc_count = 0
total_time = 0
doc_seen = set()

print("stochastic variational inference...")
for epoch in range(epochs):
    iter = 0
    printProgress(iter, max_iter_per_epoch,
                  prefix='epoch %s' % int(epoch + 1),
def main():
    """Run the project"""
    start_time = time.time()
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    save_path = os.path.join(args.output, 'saves')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(args.output, 'task1')):
        os.makedirs(os.path.join(args.output, 'task1'))
    if not os.path.exists(os.path.join(args.output, 'task2')):
        os.makedirs(os.path.join(args.output, 'task2'))

    print('Loading words...')
    corpus = Corpus(args.start_corpus, save_path, args.end_corpus,
                    args.floor, args.window_size)

    print('Setting up initial partition...')
    for i in corpus.docs:
        i.init_partition(args.alpha)
    hdp = HDP(corpus.vocab_size, save_path, alpha=args.alpha, gamma=args.gamma)
    hdp.init_partition(corpus.docs)
    print('Done')

    it = 0
    print(f'Running Gibbs sampling for {args.max_iters} iterations...')
    while it < args.max_iters:
        for j in corpus.docs:
            for i in range(len(j.words)):
                hdp.sample_table(j, i, corpus.collocations[j.words[i]])
        it += 1
        corpus.save()
        print(f'Iteration {it}/{args.max_iters}')
    for i in hdp.senses:
        i /= i.sum()
    print('Done')

    print('Generating scores for word senses...')
    words = dict()
    for j in corpus.docs:
        for i, p in enumerate(j.partition):
            origin = j.category
            sense = j.topic_to_global_idx[i]
            for w in p:
                if corpus.idx_to_word[w] in words:
                    if origin == 'reference':
                        words[corpus.idx_to_word[w]].senses[sense][0] += 1
                    else:
                        words[corpus.idx_to_word[w]].senses[sense][1] += 1
                else:
                    word = Word(corpus.idx_to_word[w], w, hdp.senses.shape[0])
                    if origin == 'reference':
                        word.senses[sense][0] += 1
                    else:
                        word.senses[sense][1] += 1
                    words[word.word] = word
    print('Done.')

    if args.semeval_mode:
        targets = utils.get_targets(args.targets)
        results = []
        for i in range(len(targets)):
            t = targets[i][0]
            pos = targets[i][1]
            recombine = t + '_' + pos
            word = words[recombine]
            scores = word.senses[~np.all(word.senses == 0, axis=1)]
            dist_1 = scores[:, 0]
            dist_2 = scores[:, 1]
            jensenshannon = dist.jensenshannon(dist_1, dist_2)
            results.append((recombine, jensenshannon))
        with open(os.path.join(os.path.join(args.output, 'task1'),
                               'english.txt'), 'w') as f:
            for i in results:
                recombine = i[0]
                score = i[1]
                different = 1 if score > args.threshold else 0
                f.write(f'{recombine} {different}\n')
        with open(os.path.join(os.path.join(args.output, 'task2'),
                               'english.txt'), 'w') as f:
            for i in results:
                recombine = i[0]
                jensenshannon = i[1]
                f.write(f'{recombine} {jensenshannon:.4f}\n')
    else:
        for k, v in words.items():
            words[k] = v.calculate()
        top = sorted(words, key=words.get, reverse=True)[:args.top_k]
        with open(os.path.join(args.output, 'out.txt'), 'w') as f:
            f.write(f'Top {args.top_k} most differing words:\n')
            f.write('\n'.join(top))

    end_time = time.time()
    print(f'Ran project in {end_time - start_time} seconds')
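# Illustrative only: a minimal, self-contained example of the Jensen-Shannon
# comparison used in semeval mode above. The sense counts are made up;
# scipy.spatial.distance.jensenshannon normalizes its two inputs to probability
# distributions itself, so raw counts can be passed directly.
def _jensenshannon_example():
    # local imports so the sketch stays self-contained
    import numpy as np
    from scipy.spatial import distance as dist
    reference_counts = np.array([12., 3., 0., 5.])  # sense counts in the reference corpus
    focus_counts = np.array([2., 9., 6., 1.])       # sense counts in the focus corpus
    score = dist.jensenshannon(reference_counts, focus_counts)
    print(f'Jensen-Shannon distance: {score:.4f}')  # larger -> more sense change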
'''
This module runs the topic models.

Created on Apr 20, 2017

@author: maltaweel
'''
from lda import LDA
from hdp import HDP
import os

# get the current working path
os.chdir("../")
pn = os.path.abspath('../')

# iterate and try a range of numbers for the number of topics
for i in range(10, 100, 10):
    for j in range(10, 40, 10):
        hdp = HDP()
        results = hdp.retrieveText(pn)
        hdp.applyModel(results, i, j)
        hdp.printResults(i, j)

# iterate and try a range of numbers for the number of topics
# for i in range(10, 100, 10):
#     for j in range(10, 40, 10):
#         for k in range(20, 50, 10):
#             lda = LDA()
#             results = lda.retrieveText(pn)
#             lda.applyModel(results, i, j, k)
#             lda.printResults(i, j, k)
        v = dct[key]
        tn = key.split(":")[0]
        kt = key.split(":")[1]
        writer.writerow({
            'Topic': str(tn),
            'Term': str(kt.encode("utf-8")),
            'Value': str(v)
        })

# lee_train_file = test_directories()
# train_texts = list(build_texts(lee_train_file))
# bigram = gensim.models.Phrases(train_texts)

hdp = HDP()
pn = os.path.abspath(__file__)
pn = pn.split("src")[0]
results = hdp.retrieveText(pn)

bigram = gensim.models.Phrases(results)

# train_texts = process_texts(train_texts)
train_texts = process_texts(results)
preProcsText(results)

dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

for i in range(10, 100, 10):