def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using batch VB for LDA. """ # The number of documents to analyze each iteration batchsize = 8 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # The size of window L = 30 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) if (len(sys.argv) >= 3): L = int(sys.argv[2]) if (len(sys.argv) >= 4): batchsize = int(sys.argv[3]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=0.5, eta=0.5, rho = 10^-3 olda = batchldavb.batchLDA(vocab, K, D, 0.5, 0.5, 1e-2, -1, L) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to batch LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = batchldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using batch VB for LDA. """ # The number of documents to analyze each iteration batchsize = 8 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # The size of window L = 30 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) if (len(sys.argv) >= 3): L = int(sys.argv[2]) if (len(sys.argv) >= 4): batchsize = int(sys.argv[3]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=0.5, eta=0.5, rho = 10^-3 olda = batchldavb.batchLDA(vocab, K, D, 0.5, 0.5, 1e-2, -1, L) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to batch LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = batchldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main(): """ This function fits Wikipedia articles to the two-hidden-layer model in an online style. """ # The number of documents to analyze in each iteration batchsize = 32 # The estimated total number of documents D = 5.13e6 # The number of topics K1 = 30 K2 = 3 eta0 = 1 / np.float(K1) eta1 = 1 / np.float(K2) eta2 = 1 / np.float(K2) # The total number of iterations if (len(sys.argv) < 2): M = 100 else: M = int(sys.argv[1]) vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm. model = online_wiki_functions.Online_two_hidden_layers(vocab, K1, K2, D, eta0, eta1, eta2, 256, 0.6) for iteration in range(0, M): # Download wikipedia articles randomly. (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Compute the held-out perplexity and fit them to the deep LDA model. bound = model.update_lambda_docs(docset) print '%d: held-out perplexity estimate = %f' % \ (iteration, bound) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. #if (iteration % 10 == 0): # numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) # numpy.savetxt('gamma-%d.dat' % iteration, gamma) return bound
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() # The number of documents to analyze each iteration D = 3.3e6 batchsize = 10 # The number of topics K = 100 # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(3): # slaves # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration+1, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. gammas = comm.gather(gamma, root = 0) lambdas = comm.gather(olda._lambda, root = 0) if rank == 0: gamma_result = numpy.vstack((x for x in gammas)) lambda_result = numpy.vstack((x for x in lambdas)) numpy.savetxt('lambda_parallel.dat', olda._lambda) numpy.savetxt('gamma_parallel.dat', gamma)
import numpy

import onlineldavb
import wikirandom


def main(num_batches, K):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.

    Arguments:
    - num_batches: the number of batches to take;
                   corpus_size = num_batches * batchsize
    - K: the number of topics, determined from stdin
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run for num_batches mini-batches. (Feel free to interrupt sooner
    # than this.)
    for iteration in range(0, num_batches):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
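# A hedged invocation sketch (this guard is an assumption, not part of the
# original snippet): the docstring says K is determined from stdin, so both
# values are read with raw_input() here.
if __name__ == '__main__':
    num_batches = int(raw_input('number of batches: '))
    K = int(raw_input('number of topics: '))
    main(num_batches, K)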
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound))
import wikirandom


def getArticles(nr=5):
    """
    Downloads a bunch of random Wikipedia articles.
    """
    (docs, names) = wikirandom.get_random_wikipedia_articles(nr)
    return (docs, names)
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 64 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D/batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Add terms and topics to the DB db.init() db.add_terms(vocab) db.add_topics(K) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, documentstoanalyze): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda(docset) # Compute an estimate of held-out perplexity (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) # Arrays for adding batches of data to the DB doc_array = [] doc_term_array = [] for d in range(len(articlenames)): doc_array.append((articlenames[d], docset[d])) # Add a batch of docs to the DB; this is the one DB task that is not in # the separate DB write thread since later tasks depend on having doc ids. # Since writes take so long, this also balaces the two threads time-wise. doc_ids = db.add_docs(doc_array) doc_topic_array = [] for d in range(len(gamma)): doc_size = len(docset[d]) for k in range(len(gamma[d])): doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size)) db.add_doc_topics(doc_topic_array) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) topic_terms_array =[] for topic in range(len(olda._lambda)): lambda_sum = sum(olda._lambda[topic]) for term in range(len(olda._lambda[topic])): topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum)) db.update_topic_terms(K, topic_terms_array) gc.collect() # probably not necesary, but precautionary for long runs db.print_task_update() db.increment_batch_count() # The DB thread ends only when it has both run out of tasks and it has been # signaled that it will not be recieving any more tasks db.signal_end()
def main(): """ Downloads and analyzes a bunch of random Wikipedia articles using online VB for LDA. """ # The number of documents to analyze each iteration batchsize = 100 # The total number of documents in Wikipedia D = 3.3e6 # The number of topics K = 100 rho_t_vector = [] perplexity_vector = [] time_vector = [] time1_vector = [] # How many documents to look at if (len(sys.argv) < 2): documentstoanalyze = int(D / batchsize) else: documentstoanalyze = int(sys.argv[1]) # Our vocabulary vocab = file('./dictnostops.txt').readlines() W = len(vocab) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 kappa = 0.7 olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., kappa) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) t1 = time.time() for iteration in tqdm(range(0, documentstoanalyze)): # Download some articles (docset, articlenames) = \ wikirandom.get_random_wikipedia_articles(batchsize) # Give them to online LDA (gamma, bound) = olda.update_lambda_docs(docset) # Compute an estimate of held-out perplexity t = time.time() (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab) perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts))) print '%d: rho_t = %f, held-out perplexity estimate = %f' % \ (iteration, olda._rhot, numpy.exp(-perwordbound)) t2 = time.time() time_vector.append(t2 - t1) if len(time1_vector) == 0: time1_vector.append(t2 - t) else: time1_vector.append(time1_vector[-1] + t2 - t) rho_t_vector.append(olda._rhot) perplexity_vector.append(perwordbound) # Save lambda, the parameters to the variational distributions # over topics, and gamma, the parameters to the variational # distributions over topic weights for the articles analyzed in # the last iteration. if (iteration % 10 == 0): numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda) numpy.savetxt('gamma-%d.dat' % iteration, gamma) numpy.savetxt('time_%.1f_%d' % (kappa, batchsize), numpy.array(time_vector)) numpy.savetxt('rho_%.1f_%d' % (kappa, batchsize), numpy.array(rho_t_vector)) numpy.savetxt('perplexity_%.1f_%d' % (kappa, batchsize), numpy.array(perplexity_vector)) numpy.savetxt('time1_%.1f_%d' % (kappa, batchsize), numpy.array(time1_vector))
import os
import random
import cPickle

import numpy as n

import wikirandom


# Fragment: the tail of the pre-processing helper. The signature is taken
# from the call site below; the code that maps each document i to its list
# of in-vocabulary words (wordsInVocab) is not shown in this fragment.
def enforceDocumentMaxLength(docset, maxLen, vocabFilename,
                             resampleShortDocuments):
    for i in range(0, len(docset)):
        # ... (tokenization of document i into wordsInVocab is not shown)
        print len(wordsInVocab)
        # Check the length of the document and determine whether to
        # bootstrap-resample the words.
        if len(wordsInVocab) > maxLen or resampleShortDocuments:
            adjustedWordsInVocab = []
            print 'resampling to length ' + str(maxLen)
            for j in range(0, maxLen):
                # random sampling WITH replacement
                adjustedWordsInVocab.append(random.choice(wordsInVocab))
            wordsInVocab = adjustedWordsInVocab
        # Create the final space-separated pre-processed document.
        docset[i] = ' '.join(wordsInVocab)
    return docset


for i in range(0, length_seed[0]):
    seednum = seednummat[i]
    print seednum
    n.random.seed(int(seednum))

    # Download some articles. Some pre-processing is needed so that each
    # document has less than the maximum length N.
    (docset, articlenames) = wikirandom.get_random_wikipedia_articles(int(D))

    print 'enforcing document length requirement for privacy'
    # JF: ensure that all documents are no longer than maxLen
    docset = enforceDocumentMaxLength(docset, maxLen, vocabFilename,
                                      resampleShortDocuments)

    # Save the file
    #the_filename = Data_PATH+'wiki_docs_seednum=%s' %(seednum)
    the_filename = os.path.join(Data_PATH, 'wiki_docs_seednum=%s' % (seednum))
    with open(the_filename, 'wb') as f:
        cPickle.dump(docset, f)
import sys
import time

import scipy.sparse

from wikirandom import get_random_wikipedia_articles


elbo_lst = []
scrape_time = 0.
examples = []
log_likelihoods = []
start_time_loop = time.time()
for t in range(n_iter):
    print '====================BATCH %d====================' % t
    sys.stdout.flush()
    articlenames = []
    n_requested = 0
    mats = []
    # Scrape articles in chunks of at most max_retrieve until a full batch
    # has been collected, vectorizing each chunk as it arrives.
    while n_requested < batch_size:
        request_size = min(batch_size - n_requested, max_retrieve)
        start_time = time.time()
        articles_temp, articlenames_temp = get_random_wikipedia_articles(
            request_size)
        sys.stdout.flush()
        end_time = time.time()
        scrape_time += end_time - start_time
        mat_temp = vectorizer.transform(articles_temp)
        mats.append(mat_temp)
        articlenames.extend(articlenames_temp)
        n_requested += request_size
        del articles_temp, articlenames_temp

    #mat = vectorizer.transform(articles)
    mat = scipy.sparse.vstack(tuple(mats), format='csr')
    # Keep only the documents that contain more than one token.
    mat = mat[filter(lambda d: mat[d].sum() > 1, range(mat.shape[0]))]
import sys

import numpy

import onlineldavb
import wikirandom


def main():
    # Unpack the input arguments.
    # seednum = 1
    # documentstoanalyze = 2000
    # batchsize = 1000
    # priv = 0
    # epsilon = 1
    # comp = 2
    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])       # 1 is the private version, 0 the nonprivate version
    epsilon = float(sys.argv[5])  # total privacy budget
    comp = int(sys.argv[6])       # 0 conventional, 1 advanced, 2 CDP composition

    # The number of topics
    K = 100
    # D = 1000000
    D = 5000000
    nu = batchsize / float(D)  # sampling rate

    numpy.random.seed(seednum)
    print('seednum %s mini-batchsize %s and number of iter %s' %
          (seednum, batchsize, documentstoanalyze))

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    gamma_noise = 0  # will use Laplace noise all the time

    # The per-iteration privacy budget under the chosen composition method.
    if comp == 2:
        # budget = numpy.sqrt(epsilon/float(documentstoanalyze))
        # budget = numpy.sqrt(epsilon*D/float(2*batchsize))
        budget = numpy.sqrt(2 * epsilon) / float(
            2 * nu * numpy.sqrt(documentstoanalyze))
    elif comp == 1:
        delta = 0.000001
        budget = epsilon / float(
            4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
    else:
        # budget = epsilon/float(documentstoanalyze)
        budget = epsilon / float(2 * documentstoanalyze * nu)

    if priv:
        print('private version')

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7,
                                 priv, budget, gamma_noise)

    # Loading a pre-scraped corpus is disabled; articles are downloaded on
    # the fly instead.
    # the_filename = Data_PATH+'wiki_data'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)  # load all the documents
    # docset = []
    # for whichdoc in range(1, 21):
    #     the_filename = Data_PATH+'wikidata_seednum=_%s' %(whichdoc)
    #     with open(the_filename, 'rb') as f:
    #         docset1 = cPickle.load(f)
    #     docset = docset + docset1
    #     print "docset %s is loaded" %(whichdoc)
    # print "docset all loaded"

    perplexity = numpy.zeros(documentstoanalyze)
    # D_test = 10000

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # Subsampling from a pre-loaded corpus is disabled as well:
        # rand_perm_nums = numpy.random.permutation(len(docset))
        # idx_minibatch = rand_perm_nums[0:batchsize]
        # docsubset = list(docset[i] for i in idx_minibatch)

        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d: rho_t = %f, held-out perplexity estimate = %f' %
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # Training-perplexity variant on the pre-loaded subset, disabled:
        # (gamma, bound) = olda.update_lambda_docs(docsubset)
        # (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        # perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        # print '%d: rho_t = %f, training perplexity estimate = %f' % \
        #     (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Test-perplexity computation, disabled:
        # idx_test = rand_perm_nums[batchsize+1:batchsize+1+D_test]
        # doctest = list(docset[i] for i in idx_test)
        # (gamma_test, ss) = olda.do_e_step_docs(doctest)
        # Estimate held-out likelihood for current values of lambda:
        # bound_test = olda.approx_bound_docs(doctest, gamma_test)
        # (wordids, wordcts_test) = onlineldavb.parse_doc_list(doctest, olda._vocab)
        # perwordbound_test = bound_test*D_test / float(D*sum(map(sum, wordcts_test)))
        # perword_test_log_likelihood = bound_test / float(sum(map(sum, wordcts_test)))
        # print '%d: rho_t = %f, test perplexity estimate = %f' % \
        #     (iteration, olda._rhot, perword_test_log_likelihood)

        perplexity[iteration] = numpy.exp(-perwordbound)

    # Save the perplexity trace under a name encoding the run's settings.
    if priv:
        # Alternative naming, disabled:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp)
        method = 'private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])
    else:
        method = 'Nonprivate_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])
    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # Save lambda and gamma.
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
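# A hedged helper (an illustration, not part of the original script)
# collecting the three per-iteration budget formulas from main() above;
# T is the number of iterations (documentstoanalyze) and nu = batchsize / D
# is the subsampling rate:
def per_iteration_budget(epsilon, T, nu, comp, delta=0.000001):
    if comp == 2:
        # CDP composition
        return numpy.sqrt(2 * epsilon) / float(2 * nu * numpy.sqrt(T))
    elif comp == 1:
        # advanced composition
        return epsilon / float(
            4 * nu * numpy.sqrt(2 * T * numpy.log(1 / delta)))
    else:
        # conventional composition
        return epsilon / float(2 * T * nu)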
import time

import numpy
import matplotlib.pyplot as plt

import onlineldavb
import wikirandom


def main(batchnumber=3.3e4):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64 * 8
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    documentstoanalyze = int(batchnumber)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Record the time used for training.
    start = time.time()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    perplexity_plot = list()
    perplexity = []
    time_track = list()
    for iteration in range(1, documentstoanalyze + 1):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        bound = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        perwordbound = bound * len(docset) / (D * sum(map(sum, olda._wordcts)))
        tmp = numpy.exp(-perwordbound)
        # Clip upward jumps in the plotted perplexity to 50 per iteration.
        if iteration == 1:
            perplexity = tmp
        elif (tmp - perplexity) > 50:
            perplexity = perplexity + 50
        else:
            perplexity = tmp
        perplexity_plot.append(perplexity)
        time_track.append(time.time() - start)
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        numpy.savetxt('lambda.dat', olda._lambda)

    # Print the time taken and save it to a file.
    end = time.time()
    time_track_file = open("time_track.txt", "w")
    for item in time_track:
        time_track_file.write("%s\n" % item)
    time_track_file.close()
    print "time taken for training %f" % (end - start)

    perplexity_file = open("perplexity.txt", "w")
    for per in perplexity_plot:
        perplexity_file.write("%s\n" % per)
    perplexity_file.close()

    # Plot perplexity against iterations and against wall-clock time.
    plt.figure(1)
    plt.plot(range(len(perplexity_plot)), perplexity_plot, 'g')
    plt.xlabel('Number of Iterations')
    plt.ylabel('Perplexity')
    #plt.show()
    #plt.pause(100)
    plt.savefig("perplexity%s.png" % batchnumber)

    plt.figure(2)
    plt.plot(time_track, perplexity_plot, 'g')
    plt.xlabel('Time in seconds')
    plt.ylabel('Perplexity')
    #plt.show()
    #plt.pause(100)
    plt.savefig("time_track%s.png" % batchnumber)
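# A hedged usage sketch (assumption: the script is run directly); a small
# batchnumber gives a quick smoke test of the training loop and the plots:
if __name__ == '__main__':
    main(batchnumber=10)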