Пример #1
0
def create_dat_files(corpus, title):
    """Write the ``-seq.dat`` and ``-mult.dat`` files for a time-sliced corpus.

    Both files are placed in ``DAT_FOLDER + title + '/'`` (created through
    ``sys_utils.ensure_dir``) and are named ``<title>-seq.dat`` and
    ``<title>-mult.dat``.

    * seq file: the first line is the number of time slices, followed by one
      line per slice holding the number of documents in that slice.
    * mult file: one line per document, in slice order, holding the number
      of distinct terms followed by space-separated ``term:count`` pairs.

    Args:
        corpus: indexable sequence of time slices; each slice is an indexable
            sequence of documents, and each document an iterable of terms.
            Terms are formatted with ``%d``, so they must be numeric ids.
        title: corpus name, used for both the sub-folder and the file names.
    """
    n_slices = len(corpus)

    out_dir = DAT_FOLDER + title + '/'
    sys_utils.ensure_dir(out_dir)
    seq_path = out_dir + '%s-seq.dat' % title
    mult_path = out_dir + '%s-mult.dat' % title

    with io.open(seq_path, 'w') as seq_out, io.open(mult_path, 'w') as mult_out:
        # Header line: total number of time slices.
        seq_out.write('%d\n' % n_slices)
        for slice_ix in range(n_slices):
            timeslice = corpus[slice_ix]
            n_docs = len(timeslice)
            seq_out.write('%d\n' % n_docs)
            for doc_ix in range(n_docs):
                # Bag-of-words counts for this single document.
                term_counts = Counter(timeslice[doc_ix])
                mult_out.write(str(len(term_counts)))
                for term, n_occurrences in term_counts.items():
                    mult_out.write(' %d:%d' % (term, n_occurrences))
                mult_out.write('\n')
Пример #2
0
# Lookup derived from ixs2sites (defined elsewhere); handed to the snapshot plots below.
sites2ixs = read_dats.sites2ixs(ixs2sites)

# Load the fitted LDA results; only `etas` is used by the plotting loop below.
etas, phis = read_dats.lda2mats(
    'results/lda/twitter/',
    'dat_files/twitter/twitter-mult.dat',
    times,
    D,
    K,
    V,
)

# idea: color each topic at time t > 1 the same as the most similar topic at time t=1
# todo: order sites consistently by similarity

# Plot titles, one per time step (index-aligned with the first 10 entries of `times`).
time_names = [
    'August 2013', 'September 2013', 'October 2013', 'November 2013', 'December 2013',
    'January 2014', 'February 2014', 'March 2014', 'April 2014', 'May 2014',
]

# Render one PNG per (time step, topic) pair, grouped into one folder per topic.
for step, (year, month) in zip(range(10), times):
    print(step)
    snapshot_title = time_names[step]
    for topic_id in range(10):
        sys_utils.ensure_dir('figs/lda/topic%d' % topic_id)
        filename = 'figs/lda/topic%d/topic%d_%d_%02d.png' % (topic_id, topic_id, year, month)
        viz_map_dynamics.create_topics_snapshot(
            etas,
            sites2ixs,
            topic_id,
            step,
            title=snapshot_title,
            filename=filename,
        )

# for topic in range(10):
#     viz_map_dynamics.create_topics_animation(etas, sites2ixs, 'vids/topic%d.mp4' % topic, topic, time_names)


Пример #3
0
    'April 2014',
    'May 2014',
]

# (year, month) pairs: August 2013 through May 2014, ten entries in total.
times = [(2013, month) for month in range(8, 13)] + [(2014, month) for month in range(1, 6)]

# Render one PNG per (time step, topic) pair, grouped into one folder per topic.
for step, (year, month) in zip(range(10), times):
    print(step)
    for topic_id in range(10):
        sys_utils.ensure_dir('figs/dtm/topic%d' % topic_id)
        filename = 'figs/dtm/topic%d/topic%d_%d_%02d.png' % (topic_id, topic_id, year, month)
        viz_map_dynamics.create_topics_snapshot(
            etas, sites2ixs, topic_id, step, title=titles[step], filename=filename
        )

# for topic in range(10):
#     viz_map_dynamics.create_topics_animation(etas, sites2ixs, 'vids/dtm/topic%d.mp4' % topic, topic, titles)
Пример #4
0
import numpy as np
import os
from utils import sys_utils

K = 10  # number of topics passed to the binary via --ntopics
T = 12
FOLDER = '/Users/chandlersquires/Dropbox (MIT)/School/_Spring2018/6882/Project/code'

GET_BLEI_RESULTS = True
if GET_BLEI_RESULTS:
    # `./main` is invoked by relative path, so run from the dtm source folder.
    os.chdir('/Users/chandlersquires/Desktop/dtm-master/dtm/')
    os.system('pwd')
    # Fit one model per simulated corpus, numbered 1 through 99.
    for corpus_id in range(1, 100):
        outfolder = '%s/results/blei/sim_corpora/corpus%d' % (FOLDER, corpus_id)
        sys_utils.ensure_dir(outfolder + '/')
        print(outfolder)
        # Paths are wrapped in double quotes because FOLDER contains spaces
        # and parentheses.
        command = ''.join([
            './main --ntopics=%d' % K,
            ' --mode=fit --rng_seed=0 --initialize_lda=true',
            ' --corpus_prefix="%s/dat_files/sim_corpora/corpus%d/corpus%d"' % (
                FOLDER, corpus_id, corpus_id),
            ' --outname="%s"' % outfolder,
            ' --top_chain_var=0.005 --alpha=0.01',
            ' --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --lda_max_em_iter=10',
        ])
        os.system(command)
    # Switch to the project folder once every fit has been launched.
    os.chdir(FOLDER)

# for i in range(100):
#     for k in range(K):
#         topic_fn = 'topic-%03d-var-e-log-prob.dat' % k
#         log_word_dist = np.loadtxt(topic_fn).reshape([-1, T])
#         word_dist = np.exp(log_word_dist)
Пример #5
0
from dtm import DynamicTopicModel
import itertools as itr
from datetime import datetime
import json
from utils import strings, sys_utils
import pickle
import yaml
import numpy as np

FOLDER = 'results/trials2/'
sys_utils.ensure_dir(FOLDER)
FILENAME = FOLDER + 'sample3.p'


# Build the corpus: one time slice (a list of document titles) per pair of
# consecutive month starts in `dates`.
corpus = []
dates = list(itr.product(range(2013, 2014), range(1, 13)))


for start, end in zip(dates, dates[1:]):
    d1 = datetime(*start, 1)
    d2 = datetime(*end, 1)
    # The file name embeds str(datetime) for both ends of the interval.
    # Use a context manager so the handle is closed as soon as it is parsed.
    with open('data/trials_data/docs_%s_%s.json' % (d1, d2)) as docs_file:
        docs = json.load(docs_file)
    timeslice = [doc['title'] for doc in docs]
    corpus.append(timeslice)

corpus_tokenized, int2word, word_counts = strings.tokenize_corpus(corpus)

# Persist the tokenization artefacts; each file is closed (and therefore
# flushed) before the next one is written.
with open(FOLDER + 'corpus.json', 'w') as corpus_file:
    json.dump(corpus_tokenized, corpus_file, indent=2)
with open(FOLDER + 'word_counts.json', 'w') as counts_file:
    json.dump(dict(word_counts), counts_file, indent=2)
with open(FOLDER + 'int2word.yaml', 'w') as vocab_file:
    yaml.dump(int2word, vocab_file, indent=2)
V = len(int2word)  # number of entries in int2word
# samples = pickle.load(open(FILENAME, 'rb'))