Example no. 1
    def test_newt_topic(self):
        '''new t + topic k sampling'''
        w = 2
        fk = HDP.hdp_cpp.fk_cpp(w, self.n_kv)

        pval = HDP.posterior_k_new_t(self.topics, self.m_k, fk, self.gamma,
                                     self.V)

        self.assertAlmostEqual(np.sum(pval / pval.sum()), 1)
        self.assertFalse(any(pval < 0))
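In the standard Chinese-restaurant-franchise sampler, the posterior over the topic served by a brand-new table is proportional to m_k * f_k(w) for an existing topic k and to gamma * f_new(w) for a fresh topic, where f_new(w) reduces to 1/V for a single word under a symmetric prior; this is presumably why the helper takes m_k, the per-topic likelihood vector fk, gamma and V. The assertions above only check that the returned vector is non-negative and normalizable.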
Example no. 2
    def test_rm_xji(self):
        '''Test x_ji gets removed on all structures'''
        j, word, tbl, topic = 0, 0, 1, 1
        doc_j = copy.deepcopy(self.doc_j)
        njtw = copy.deepcopy(self.njtw)
        n_kv = copy.deepcopy(self.n_kv)
        m_k = copy.deepcopy(self.m_k)
        d, n_dict, n_kv2, m_k2, topic_idx2 = HDP.remove_xji(
            j, doc_j, self.topics, word, tbl, njtw, n_kv, m_k)

        self.assertEqual(d['n_jt'][tbl], self.doc_j['n_jt'][tbl] - 1)
        self.assertEqual(n_dict[j][tbl][word], self.njtw[j][tbl][word] - 1)
        self.assertEqual(n_kv2[word, topic], self.n_kv[word, topic] - 1)
Example no. 3
    def test_rm_Xvec(self):
        '''rm Xvec conditional distribution'''
        m_k = self.m_k.copy()
        doc_j = copy.deepcopy(self.doc_j)
        topic_idx = self.topics.copy()
        tbl = 3
        k = doc_j['k_jt'][tbl]

        doc_j, topic_idx, m_k = HDP.remove_Xvec_ji(tbl, doc_j, topic_idx, m_k)

        self.assertNotIn(k, topic_idx)
        self.assertEqual(m_k[k], self.m_k[k] - 1)
        self.assertEqual(doc_j['k_jt'][tbl], 0)
Example no. 4
    def test_new_table(self):
        '''new table is added'''
        doc_j = copy.deepcopy(self.doc_j)
        k = 2
        njtw = copy.deepcopy(self.njtw)

        t_idx, doc_j, njtw = HDP.new_table(self.j, k, doc_j, njtw)

        self.assertEqual(len(doc_j['n_jt']), len(self.doc_j['n_jt']) + 1)
        self.assertEqual(len(doc_j['k_jt']), len(self.doc_j['k_jt']) + 1)
        self.assertEqual(len(njtw[self.j]), len(self.njtw[self.j]) + 1)
        self.assertIn(t_idx, doc_j['t_j'])
        self.assertEqual(doc_j['k_jt'][t_idx], k)
Example no. 5
    def test_remove_tbl(self):
        '''Remove empty tables and topics'''
        doc_j = copy.deepcopy(self.doc_j)
        topic_idx = copy.deepcopy(self.topics)
        m_k = self.m_k.copy()
        tbl = 3
        k = doc_j['k_jt'][tbl]

        d, m_k2, topic_idx2 = HDP.remove_table(doc_j, tbl, topic_idx, m_k)

        self.assertNotIn(tbl, d['t_j'])
        self.assertEqual(m_k2[k], self.m_k[k] - 1)
        self.assertNotIn(k, topic_idx2)  # topic k is dropped once no table serves it
Example no. 6
    def test_new_topic(self):
        '''new topic is added'''
        topic_idx = self.topics.copy()
        m_k = self.m_k.copy()
        n_kv = self.n_kv.copy()

        k_idx, topic_idx, m_k, n_kv = HDP.new_topic(topic_idx, m_k, n_kv,
                                                    self.beta, self.V)

        self.assertEqual(len(topic_idx), len(self.topics) + 1)
        self.assertIn(k_idx, topic_idx)
        self.assertEqual(m_k[k_idx], 0)
        self.assertEqual(n_kv[:, k_idx].sum(), 2.5)
        self.assertGreater(n_kv.shape[1], self.n_kv.shape[1])
Example no. 7
    def test_rearrange_cnts(self):
        '''check that counts are added to new k'''
        doc_j = copy.deepcopy(self.doc_j)
        n_kv = copy.deepcopy(self.n_kv)
        new_k = 1
        tbl = 3
        k = doc_j['k_jt'][tbl]

        doc_j, n_kv = HDP.rearranging_k_counts(self.j, tbl, new_k, doc_j,
                                               self.njtw, n_kv)

        self.assertEqual(np.sum(n_kv[2:4, new_k]), np.sum(self.n_kv[2:4, k]))
        self.assertEqual(np.sum(n_kv[2:4, new_k]), np.sum(n_kv[2:4, k]) + 2)
        self.assertEqual(doc_j['k_jt'][tbl], new_k)
        self.assertNotEqual(doc_j['k_jt'][tbl], k)
Example no. 8
    def test_assign_table(self):
        '''word is assigned to table'''
        doc_j = copy.deepcopy(self.doc_j)
        njtw = copy.deepcopy(self.njtw)
        n_kv = self.n_kv.copy()
        i, w = 4, 4
        new_t = 1

        doc_j, n_kv, njtw = HDP.assign_to_table(self.j, i, w, new_t, doc_j,
                                                n_kv, njtw)

        self.assertEqual(doc_j['t_ji'][i], new_t)
        self.assertEqual(doc_j['n_jt'][new_t], self.doc_j['n_jt'][new_t] + 1)
        self.assertEqual(n_kv[w, 1], 2.5)
        self.assertEqual(njtw[self.j][new_t][w], 1)
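Examples no. 1 through 8 are unittest methods that share a common fixture: self.doc_j, self.njtw, self.n_kv, self.m_k, self.topics, self.j and the hyperparameters self.gamma, self.beta and self.V. A minimal sketch of the kind of setUp() they assume follows; the class name, array shapes and numeric values are illustrative assumptions and are not taken from the original test suite.

import copy
import unittest

import numpy as np

import HDP


class HDPSamplerTest(unittest.TestCase):
    '''Illustrative fixture for the sampling tests above (values assumed).'''

    def setUp(self):
        self.j = 0                            # document index used by the tests
        self.V = 5                            # vocabulary size (assumed)
        self.gamma = 1.0                      # top-level concentration (assumed)
        self.beta = 0.5                       # topic-word smoothing (assumed)
        self.topics = [1, 2]                  # indices of currently active topics
        self.m_k = np.array([0, 2, 1])        # number of tables serving each topic
        self.n_kv = np.full((self.V, 3), self.beta)  # topic-word counts, shape (V, K);
                                              # the real fixture adds observed counts on top
        self.doc_j = {'t_j': [1, 2, 3],       # active tables in document j
                      'n_jt': [0, 2, 1, 1],   # number of words at each table
                      'k_jt': [0, 1, 2, 2],   # topic served at each table
                      't_ji': [1, 1, 2, 3, 0]}  # table assignment of each word
        self.njtw = {self.j: [{}, {0: 1, 1: 1}, {2: 1}, {3: 1}]}  # per-table word counts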
Example no. 9
import sys
import time

import cPickle as pickle
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer

from hdp import HDP  # assumed source of the HDP class, as in the later examples

with open(sys.path[0] + '\\dict.txt', 'r') as f:
    vocab_list = [s[:-1] for s in f.readlines()]
vectorizer = CountVectorizer(vocabulary=vocab_list)

V = len(vectorizer.vocabulary)
n_topics = int(sys.argv[1])
n_topics_per_doc = int(sys.argv[2])
batch_size = int(sys.argv[3])
n_iter = int(sys.argv[4])
kappa = float(sys.argv[5]) if len(sys.argv) > 5 else 0.51
D = batch_size * n_iter  # is this reasonable?
max_retrieve = 64  # largest number of articles that are queried together in 1 function call
hdp = HDP(n_topics, n_topics_per_doc, D, V, 1., 0.01, 100., 1, kappa)

elbo_lst = []
scrape_time = 0.
examples = []
log_likelihoods = []
start_time_loop = time.time()
for t in range(n_iter):
    print '====================BATCH %d====================' % t
    sys.stdout.flush()
    articlenames = []
    n_requested = 0
    mats = []
    while n_requested < batch_size:
        request_size = min(batch_size - n_requested, max_retrieve)
        start_time = time.time()
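        # Illustrative continuation (not part of the original snippet): fetch a
        # batch of raw article texts and turn it into a sparse term-count
        # matrix with the vectorizer built above.  fetch_articles() is a
        # hypothetical stand-in for the real scraping call.
        docs = fetch_articles(request_size)
        scrape_time += time.time() - start_time
        mats.append(vectorizer.transform(docs))  # scipy.sparse matrix, shape (request_size, V)
        n_requested += request_size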
Example no. 10
graph = Graph(f_net,
              'edge list',
              directed=False,
              weighted=False,
              memory_control=True)

# generate corpus
print('generating training/testing corpus...')
train_corpus = Corpus()
train_corpus.generate_corpus_from_graph_using_random_walk(
    graph, avg_l, D, 'deterministic+random')
test_corpus = Corpus()
test_corpus.generate_corpus_from_graph_using_random_walk(graph, avg_l, 3000)

# stochastic variational inference
hdp = HDP(T, K, D, graph.n, eta, alpha, gamma, kappa, tau, scale, adding_noise)
log_file = open(f_log, "w")
log_file.write(
    "iteration time doc.count score word.count unseen.score unseen.word.count\n"
)
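# In the usual online-HDP formulation (Wang, Paisley & Blei, 2011) the
# kappa/tau/scale arguments above set the step size of the stochastic
# natural-gradient updates, rho_t = scale * (tau + t) ** (-kappa), with
# kappa in (0.5, 1] required for convergence.  This is the standard reading
# of these parameters, not something taken from this particular script.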

max_iter_per_epoch = np.ceil(D / batchsize)
total_doc_count = 0
total_time = 0
doc_seen = set()
print("stochastic variational inference...")
for epoch in range(epochs):
    iter = 0
    printProgress(iter,
                  max_iter_per_epoch,
                  prefix='epoch %s' % int(epoch + 1),
Example no. 11
def main():
    """Run the project"""
    start_time = time.time()

    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    save_path = os.path.join(args.output, 'saves')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(args.output, 'task1')):
        os.makedirs(os.path.join(args.output, 'task1'))
    if not os.path.exists(os.path.join(args.output, 'task2')):
        os.makedirs(os.path.join(args.output, 'task2'))

    print('Loading words...')
    corpus = Corpus(args.start_corpus, save_path,
                    args.end_corpus, args.floor, args.window_size)
    print('Setting up initial partition...')
    for i in corpus.docs:
        i.init_partition(args.alpha)

    hdp = HDP(corpus.vocab_size, save_path,
              alpha=args.alpha, gamma=args.gamma)
    hdp.init_partition(corpus.docs)
    print('Done')
    it = 0
    print(f'Running Gibbs sampling for {args.max_iters} iterations...')
    while it < args.max_iters:
        for j in corpus.docs:
            for i in range(len(j.words)):
                hdp.sample_table(j, i, corpus.collocations[j.words[i]])
        it += 1
        corpus.save()
        print(f'Iteration {it}/{args.max_iters}')
    for i in hdp.senses:
        i /= i.sum()
    print('Done')
    print('Generating scores for word senses...')
    words = dict()
    for j in corpus.docs:
        for i, p in enumerate(j.partition):
            origin = j.category
            sense = j.topic_to_global_idx[i]
            for w in p:
                if corpus.idx_to_word[w] in words:
                    if origin == 'reference':
                        words[corpus.idx_to_word[w]].senses[sense][0] += 1
                    else:
                        words[corpus.idx_to_word[w]].senses[sense][1] += 1
                else:
                    word = Word(corpus.idx_to_word[w], w, hdp.senses.shape[0])
                    if origin == 'reference':
                        word.senses[sense][0] += 1
                    else:
                        word.senses[sense][1] += 1
                    words[word.word] = word
    print('Done.')
    if args.semeval_mode:
        targets = utils.get_targets(args.targets)
        results = []
        for i in range(len(targets)):
            t = targets[i][0]
            pos = targets[i][1]
            recombine = t + '_' + pos
            word = words[recombine]
            scores = word.senses[~np.all(word.senses == 0, axis=1)]

            dist_1 = scores[:, 0]
            dist_2 = scores[:, 1]
            jensenshannon = dist.jensenshannon(
                dist_1, dist_2)
            results.append((recombine, jensenshannon))

        with open(os.path.join(args.output, 'task1', 'english.txt'),
                  'w') as f:
            for i in results:
                recombine = i[0]
                score = i[1]
                different = 1 if score > args.threshold else 0
                f.write(f'{recombine} {different}\n')

        with open(os.path.join(args.output, 'task2', 'english.txt'), 'w') as f:
            for i in results:
                recombine = i[0]
                jensenshannon = i[1]
                f.write(f'{recombine} {jensenshannon:.4f}\n')

    else:
        for k, v in words.items():
            words[k] = v.calculate()
        top = sorted(words, key=words.get, reverse=True)[:args.top_k]
        with open(os.path.join(args.output, 'out.txt'), 'w') as f:
            f.write(f'Top {args.top_k} most differing words:\n')
            f.write('\n'.join(top))
    end_time = time.time()
    print(f'Ran project in {end_time - start_time} seconds')
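Example no. 11 reads its configuration from a module-level args object. A minimal sketch of the argument parser it implies is given below; only the option names come from the code above, while the defaults, types and help strings are assumptions.

import argparse

parser = argparse.ArgumentParser(description='HDP-based word sense change detection')
parser.add_argument('--start_corpus', help='path to the reference corpus')
parser.add_argument('--end_corpus', help='path to the focus corpus')
parser.add_argument('--output', default='output', help='directory for saves and results')
parser.add_argument('--floor', type=int, default=1, help='minimum word frequency to keep')
parser.add_argument('--window_size', type=int, default=10, help='context window size')
parser.add_argument('--alpha', type=float, default=1.0, help='document-level concentration')
parser.add_argument('--gamma', type=float, default=1.0, help='corpus-level concentration')
parser.add_argument('--max_iters', type=int, default=100, help='Gibbs sampling iterations')
parser.add_argument('--top_k', type=int, default=20, help='number of words to report')
parser.add_argument('--threshold', type=float, default=0.5,
                    help='Jensen-Shannon distance above which senses count as different')
parser.add_argument('--semeval_mode', action='store_true', help='write SemEval task1/task2 files')
parser.add_argument('--targets', help='path to the SemEval target word list')
args = parser.parse_args()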
Example no. 12
'''
This module runs the topic models.

Created on Apr 20, 2017

@author: maltaweel
'''

from lda import LDA
from hdp import HDP
import os

# move up one directory and resolve the path of the directory above it
os.chdir("../")
pn = os.path.abspath('../')

# iterate over a range of values for the number of topics
for i in range(10, 100, 10):
    for j in range(10, 40, 10):
        hdp = HDP()
        results = hdp.retrieveText(pn)
        hdp.applyModel(results, i, j)
        hdp.printResults(i, j)

# the same parameter sweep for the LDA model (currently disabled)
#for i in range(10,100,10):
#    for j in range(10,40,10):
#        for k in range(20,50,10):
#            lda=LDA()
#            results=lda.retrieveText(pn)
#            lda.applyModel(results, i,j,k)
#            lda.printResults(i,j,k)
Example no. 13
            v = dct[key]
            tn = key.split(":")[0]
            kt = key.split(":")[1]
            writer.writerow({
                'Topic': str(tn),
                'Term': str(kt.encode("utf-8")),
                'Value': str(v)
            })


#lee_train_file=test_directories()
#train_texts = list(build_texts(lee_train_file))

#bigram = gensim.models.Phrases(train_texts)

hdp = HDP()
pn = os.path.abspath(__file__)
pn = pn.split("src")[0]
results = hdp.retrieveText(pn)

bigram = gensim.models.Phrases(results)
#train_texts = process_texts(train_texts)

train_texts = process_texts(results)

preProcsText(results)

dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

for i in range(10, 100, 10):