def create_dat_files(corpus, title):
    """Write a corpus to disk in the LDA-C/DTM input format.

    Produces two files under ``DAT_FOLDER/<title>/``:

    * ``<title>-seq.dat`` — the number of timeslices, then the number of
      documents in each timeslice, one integer per line.
    * ``<title>-mult.dat`` — one line per document:
      ``<n_unique_terms> <word_id>:<count> ...``.

    Parameters
    ----------
    corpus : list of timeslices; each timeslice is a list of documents,
        each document an iterable of integer word ids.
    title : str, used both as the folder name and the file-name prefix.
    """
    num_timeslices = len(corpus)
    folder = DAT_FOLDER + title + '/'
    sys_utils.ensure_dir(folder)
    seq_filename = folder + '%s-seq.dat' % title
    mult_filename = folder + '%s-mult.dat' % title
    with io.open(seq_filename, 'w') as seq_file, io.open(mult_filename, 'w') as mult_file:
        seq_file.write('%d\n' % num_timeslices)
        for t in range(num_timeslices):
            # BUG FIX: the original did `d = len(corpus[t])` and then
            # `for d in range(d)`, shadowing the document count with the
            # loop index. Use a distinct name for the count.
            num_docs = len(corpus[t])
            seq_file.write('%d\n' % num_docs)
            for d in range(num_docs):
                # Term frequencies for this document.
                word_counts = Counter(corpus[t][d])
                mult_file.write(str(len(word_counts)))
                for w, count in word_counts.items():
                    mult_file.write(' %d:%d' % (w, count))
                mult_file.write('\n')
# Invert the index->site mapping and load the fitted LDA matrices.
sites2ixs = read_dats.sites2ixs(ixs2sites)
etas, phis = read_dats.lda2mats('results/lda/twitter/', 'dat_files/twitter/twitter-mult.dat', times, D, K, V)

# idea: color each topic at time t > 1 the same as the most similar topic at time t=1
# todo: order sites consistently by similarity

time_names = [
    'August 2013',
    'September 2013',
    'October 2013',
    'November 2013',
    'December 2013',
    'January 2014',
    'February 2014',
    'March 2014',
    'April 2014',
    'May 2014',
]

# One snapshot image per (topic, timeslice); zip caps the loop at 10 slices.
for t, (year, month) in zip(range(10), times):
    print(t)
    for topic in range(10):
        topic_folder = 'figs/lda/topic%d' % topic
        sys_utils.ensure_dir(topic_folder)
        filename = 'figs/lda/topic%d/topic%d_%d_%02d.png' % (topic, topic, year, month)
        viz_map_dynamics.create_topics_snapshot(etas, sites2ixs, topic, t, title=time_names[t], filename=filename)

# for topic in range(10):
#     viz_map_dynamics.create_topics_animation(etas, sites2ixs, 'vids/topic%d.mp4' % topic, topic, time_names)
    'April 2014', 'May 2014',
]
# (year, month) pairs, one per timeslice, aligned with the titles above.
times = [
    (2013, 8), (2013, 9), (2013, 10), (2013, 11), (2013, 12),
    (2014, 1), (2014, 2), (2014, 3), (2014, 4), (2014, 5),
]
# Render one snapshot image per (topic, timeslice) under figs/dtm/.
# NOTE(review): presumably a map visualization of topic weights per site —
# create_topics_snapshot's semantics aren't visible here; confirm.
for t, (y, m) in zip(range(10), times):
    print(t)  # progress indicator
    for topic in range(10):
        sys_utils.ensure_dir('figs/dtm/topic%d' % topic)
        filename = 'figs/dtm/topic%d/topic%d_%d_%02d.png' % (topic, topic, y, m)
        viz_map_dynamics.create_topics_snapshot(etas, sites2ixs, topic, t, title=titles[t], filename=filename)
# for topic in range(10):
#     viz_map_dynamics.create_topics_animation(etas, sites2ixs, 'vids/dtm/topic%d.mp4' % topic, topic, titles)
import numpy as np
import os

from utils import sys_utils

K = 10
T = 12
FOLDER = '/Users/chandlersquires/Dropbox (MIT)/School/_Spring2018/6882/Project/code'
GET_BLEI_RESULTS = True

if GET_BLEI_RESULTS:
    # Blei's dtm binary must be run from its own directory.
    os.chdir('/Users/chandlersquires/Desktop/dtm-master/dtm/')
    os.system('pwd')
    for i in range(1, 100):
        outfolder = '%s/results/blei/sim_corpora/corpus%d' % (FOLDER, i)
        sys_utils.ensure_dir(outfolder + '/')
        print(outfolder)
        # Assemble the dtm invocation; fragments joined by single spaces
        # reproduce the exact command line.
        cmd = ' '.join([
            './main --ntopics=%d' % K,
            '--mode=fit --rng_seed=0 --initialize_lda=true',
            '--corpus_prefix="%s/dat_files/sim_corpora/corpus%d/corpus%d"' % (FOLDER, i, i),
            '--outname="%s"' % outfolder,
            '--top_chain_var=0.005 --alpha=0.01',
            '--lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --lda_max_em_iter=10',
        ])
        os.system(cmd)
    os.chdir(FOLDER)

# for i in range(100):
#     for k in range(K):
#         topic_fn = 'topic-%03d-var-e-log-prob.dat' % k
#         log_word_dist = np.loadtxt(topic_fn).reshape([-1, T])
#         word_dist = np.exp(log_word_dist)
from dtm import DynamicTopicModel
import itertools as itr
from datetime import datetime
import json
from utils import strings, sys_utils
import pickle
import yaml
import numpy as np

FOLDER = 'results/trials2/'
sys_utils.ensure_dir(FOLDER)
FILENAME = FOLDER + 'sample3.p'

# Build a corpus of clinical-trial titles, one timeslice per month of 2013.
corpus = []
dates = list(itr.product(range(2013, 2014), range(1, 13)))
for i in range(len(dates) - 1):
    d1 = datetime(*dates[i], 1)
    d2 = datetime(*dates[i + 1], 1)
    # NOTE(review): %s renders full datetimes ("2013-01-01 00:00:00") into
    # the path — presumably the files were written with the same naming.
    # FIX: close the file handle instead of leaking it via json.load(open(...)).
    with open('data/trials_data/docs_%s_%s.json' % (d1, d2)) as f:
        docs = json.load(f)
    timeslice = [doc['title'] for doc in docs]
    corpus.append(timeslice)

corpus_tokenized, int2word, word_counts = strings.tokenize_corpus(corpus)

# FIX: use context managers so the output files are flushed and closed
# deterministically (the originals were opened and never closed).
with open(FOLDER + 'corpus.json', 'w') as f:
    json.dump(corpus_tokenized, f, indent=2)
with open(FOLDER + 'word_counts.json', 'w') as f:
    json.dump(dict(word_counts), f, indent=2)
with open(FOLDER + 'int2word.yaml', 'w') as f:
    yaml.dump(int2word, f, indent=2)

# Vocabulary size.
V = len(int2word)

# samples = pickle.load(open(FILENAME, 'rb'))