def main():
    options = parse_args()

    # parameter set 2
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.training_iterations > 0
    training_iterations = options.training_iterations

    # parameter set 3
    alpha_alpha = 1.0 / number_of_topics
    if options.alpha_alpha > 0:
        alpha_alpha = options.alpha_alpha
    # assert options.default_correlation_prior > 0
    # default_correlation_prior = options.default_correlation_prior
    # assert options.positive_correlation_prior > 0
    # positive_correlation_prior = options.positive_correlation_prior
    # assert options.negative_correlation_prior > 0
    # negative_correlation_prior = options.negative_correlation_prior

    # parameter set 4
    # disable_alpha_theta_update = options.disable_alpha_theta_update
    inference_mode = options.inference_mode
    # update_hyperparameter = options.update_hyperparameter

    # parameter set 5
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    # assert options.corpus_name != None
    assert options.input_directory != None
    assert options.output_directory != None
    assert options.tree_name != None

    tree_name = options.tree_name

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Documents
    train_docs_path = os.path.join(input_directory, 'train.dat')
    input_doc_stream = open(train_docs_path, 'r')
    train_docs = []
    for line in input_doc_stream:
        train_docs.append(line.strip().lower())
    print "successfully load all training docs from %s..." % (os.path.abspath(train_docs_path))

    # Vocabulary
    vocabulary_path = os.path.join(input_directory, 'voc.dat')
    input_voc_stream = open(vocabulary_path, 'r')
    vocab = []
    for line in input_voc_stream:
        vocab.append(line.strip().lower().split()[0])
    vocab = list(set(vocab))
    print "successfully load all the words from %s..." % (os.path.abspath(vocabulary_path))

    '''
    # create output directory
    now = datetime.datetime.now()
    output_directory += now.strftime("%y%b%d-%H%M%S") + ""
    output_directory += "-prior_tree-K%d-I%d-a%g-S%d-%s-%s-%s/" \
        % (number_of_topics, training_iterations, alpha_alpha, snapshot_interval, tree_name, inference_mode, update_hyperparameter)
    '''

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%m%d-%H%M%S") + ""
    suffix += "-%s" % ("lda")
    suffix += "-I%d" % (training_iterations)
    suffix += "-S%d" % (snapshot_interval)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-aa%f" % (alpha_alpha)
    # suffix += "-ab%f" % (alpha_beta)
    suffix += "-im%d" % (inference_mode)
    # suffix += "-%s" % (resample_topics)
    # suffix += "-%s" % (hash_oov_words)
    suffix += "/"

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("tree_name=" + str(tree_name) + "\n")
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (training_iterations))
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n")
    # options_output_file.write("default_correlation_prior=" + str(default_correlation_prior) + "\n")
    # options_output_file.write("positive_correlation_prior=" + str(positive_correlation_prior) + "\n")
    # options_output_file.write("negative_correlation_prior=" + str(negative_correlation_prior) + "\n")
    # parameter set 4
    options_output_file.write("inference_mode=%d\n" % (inference_mode))
    # options_output_file.write("update_hyperparameter=%s\n" % (update_hyperparameter))
    # parameter set 5
    # options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "tree prior file=" + str(tree_name)
    # parameter set 2
    print "training_iterations=%d" % (training_iterations)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    # print "default_correlation_prior=" + str(default_correlation_prior)
    # print "positive_correlation_prior=" + str(positive_correlation_prior)
    # print "negative_correlation_prior=" + str(negative_correlation_prior)
    # parameter set 4
    print "inference_mode=%d" % (inference_mode)
    # print "update_hyperparameter=%s" % (update_hyperparameter)
    # parameter set 5
    # print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    if inference_mode == 0:
        import hybrid
        lda_inferencer = hybrid.Hybrid()
        # lda_inferencer = hybrid.Hybrid(update_hyperparameter)
        # import hybrid.parse_data as parse_data
    elif inference_mode == 1:
        # import monte_carlo
        # lda_inferencer = monte_carlo.MonteCarlo()
        sys.stderr.write("warning: monte carlo inference method is not implemented yet...\n")
        return
    elif inference_mode == 2:
        # from prior.tree import variational_bayes
        # lda_inferencer = variational_bayes.VariationalBayes()
        import variational_bayes
        lda_inferencer = variational_bayes.VariationalBayes()
        # lda_inferencer = variational_bayes.VariationalBayes(update_hyperparameter)
        # from variational_bayes import parse_data
    else:
        sys.stderr.write("error: unrecognized inference mode %d...\n" % (inference_mode))
        return

    # initialize tree
    import priortree
    prior_tree = priortree.VocabTreePrior()
    # from vb.prior.tree.priortree import VocabTreePrior
    # prior_tree = VocabTreePrior()
    # prior_tree._initialize(input_directory + "tree.wn.*", vocab, default_correlation_prior, positive_correlation_prior, negative_correlation_prior)
    prior_tree._initialize(os.path.join(input_directory, tree_name + ".wn.*"),
                           os.path.join(input_directory, tree_name + ".hyperparams"),
                           vocab)

    lda_inferencer._initialize(train_docs, vocab, prior_tree, number_of_topics, alpha_alpha)

    for iteration in xrange(training_iterations):
        lda_inferencer.learning()

        if (lda_inferencer._counter % snapshot_interval == 0):
            lda_inferencer.export_beta(os.path.join(output_directory, 'exp_beta-' + str(lda_inferencer._counter)))
            model_snapshot_path = os.path.join(output_directory, 'model-' + str(lda_inferencer._counter))
            cPickle.dump(lda_inferencer, open(model_snapshot_path, 'wb'))

    model_snapshot_path = os.path.join(output_directory, 'model-' + str(lda_inferencer._counter))
    cPickle.dump(lda_inferencer, open(model_snapshot_path, 'wb'))
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_documents > 0
    number_of_documents = options.number_of_documents
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.truncation_level > 0
    truncation_level = options.truncation_level

    # parameter set 3
    assert options.vocab_prune_interval > 0
    vocab_prune_interval = options.vocab_prune_interval
    snapshot_interval = vocab_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    # assert options.batch_size > 0
    batch_size = options.batch_size
    # assert number_of_documents % batch_size == 0
    online_iterations = number_of_documents / batch_size
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 4
    assert options.tau >= 0
    tau = options.tau
    # assert options.kappa >= 0.5 and options.kappa <= 1
    assert options.kappa >= 0 and options.kappa <= 1
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    assert options.alpha_beta > 0
    alpha_beta = options.alpha_beta

    # parameter set 5
    # heldout_data = options.heldout_data

    # parameter set 1
    assert options.corpus_name != None
    assert options.input_directory != None
    assert options.output_directory != None

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    input_directory = os.path.join(input_directory, corpus_name)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-T%d" % (truncation_level)
    suffix += "-P%d" % (vocab_prune_interval)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ab%g" % (alpha_beta)
    suffix += "/"
    '''
    suffix += "-D%d-K%d-T%d-P%d-S%d-B%d-O%d-t%d-k%g-at%g-ab%g/" % (number_of_documents, number_of_topics, truncation_level, vocab_prune_interval, snapshot_interval, batch_size, online_iterations, tau, kappa, alpha_theta, alpha_beta)
    '''

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    dictionary_file = options.dictionary
    if dictionary_file != None:
        dictionary_file = dictionary_file.strip()

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    options_output_file.write("truncation_level=" + str(truncation_level) + "\n")
    # parameter set 3
    options_output_file.write("vocab_prune_interval=" + str(vocab_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) + "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 5
    # options_output_file.write("heldout_data=" + str(heldout_data) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    print "truncation_level=" + str(truncation_level)
    # parameter set 3
    print "vocab_prune_interval=" + str(vocab_prune_interval)
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 5
    # print "heldout_data=" + str(heldout_data)
    print "========== ========== ========== ========== =========="

    # Vocabulary
    # input_file = open(input_directory + 'voc.dat', 'r')
    # Seed the vocabulary
    vocab = ['team']

    # Documents
    train_docs = []
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in input_file:
        train_docs.append(line.strip())
    print "successfully load all training documents..."

    import hybrid
    olda = hybrid.Hybrid(3, 20, dictionary_file)
    olda._initialize(vocab, number_of_topics, number_of_documents, batch_size, truncation_level,
                     alpha_theta, alpha_beta, tau, kappa, vocab_prune_interval, True)
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 100)

    document_topic_distribution = None

    # Run until we've seen number_of_documents documents. (Feel free to interrupt *much* sooner than this.)
    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) % len(train_docs):
                                (batch_size * (iteration + 1) - 1) % len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % (number_of_documents),
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()

        batch_gamma = olda.learning(docset)

        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(os.path.join(output_directory, 'exp_beta-' + str(olda._counter)), 50)

        if document_topic_distribution == None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack((document_topic_distribution, batch_gamma))

        clock = time.time() - clock
        print "vocabulary size = %s" % (olda._truncation_size)
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (olda._counter, clock, olda._epsilon)

    gamma_path = os.path.join(output_directory, "gamma.txt")
    numpy.savetxt(gamma_path, document_topic_distribution)
def main():
    options = parse_args()

    # parameter set 1
    # assert options.corpus_name != None
    assert options.input_directory != None
    assert options.output_directory != None

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    assert options.grammar_file != None
    grammar_file = options.grammar_file
    assert os.path.exists(grammar_file)

    # Documents
    train_docs = []
    input_stream = open(os.path.join(input_directory, 'train.dat'), 'r')
    for line in input_stream:
        train_docs.append(line.strip())
    input_stream.close()
    print("successfully load all training documents...")

    # parameter set 2
    if options.number_of_documents > 0:
        number_of_documents = options.number_of_documents
    else:
        number_of_documents = len(train_docs)
    if options.batch_size > 0:
        batch_size = options.batch_size
    else:
        batch_size = number_of_documents
    # assert number_of_documents % batch_size == 0
    training_iterations = number_of_documents / batch_size
    if options.training_iterations > 0:
        training_iterations = options.training_iterations
    # training_iterations = int(math.ceil(1.0 * number_of_documents / batch_size))
    # multiprocesses = options.multiprocesses
    assert options.number_of_processes >= 0
    number_of_processes = options.number_of_processes

    # parameter set 3
    assert options.grammaton_prune_interval > 0
    grammaton_prune_interval = options.grammaton_prune_interval
    snapshot_interval = grammaton_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    assert options.tau >= 0
    tau = options.tau
    # assert options.kappa >= 0.5 and options.kappa <= 1
    assert options.kappa >= 0 and options.kappa <= 1
    kappa = options.kappa
    if batch_size <= 0:
        print("warning: running in batch mode...")
        kappa = 0

    # read in adaptor grammars
    desired_truncation_level = {}
    alpha_pi = {}
    beta_pi = {}

    grammar_rules = []
    adapted_non_terminals = set()
    # for line in codecs.open(grammar_file, 'r', encoding='utf-8'):
    for line in open(grammar_file, 'r'):
        line = line.strip()
        if line.startswith("%"):
            continue
        if line.startswith("@"):
            tokens = line.split()
            assert len(tokens) == 5
            adapted_non_terminal = nltk.Nonterminal(tokens[1])
            adapted_non_terminals.add(adapted_non_terminal)
            desired_truncation_level[adapted_non_terminal] = int(tokens[2])
            alpha_pi[adapted_non_terminal] = float(tokens[3])
            beta_pi[adapted_non_terminal] = float(tokens[4])
            continue
        grammar_rules.append(line)
    grammar_rules = "\n".join(grammar_rules)

    # Warning: if you are using nltk 2.x, please use parse_grammar()
    # from nltk.grammar import parse_grammar, standard_nonterm_parser
    # start, productions = parse_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    from nltk.grammar import read_grammar, standard_nonterm_parser
    start, productions = read_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    print("start, productions: ", start, productions)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    # desired_truncation_level_string = "".join(["%s%d" % (symbol, desired_truncation_level[symbol]) for symbol in desired_truncation_level])
    # alpha_pi_string = "".join(["%s%d" % (symbol, alpha_pi[symbol]) for symbol in alpha_pi])
    # beta_pi_string = "".join(["%s%d" % (symbol, beta_pi[symbol]) for symbol in beta_pi])
    # output_directory += "-" + str(now.microsecond) + "/"
    suffix += "-D%d-P%d-S%d-B%d-O%d-t%d-k%g-G%s/" % (
        number_of_documents,
        # number_of_topics,
        grammaton_prune_interval,
        snapshot_interval,
        batch_size,
        training_iterations,
        tau,
        kappa,
        # alpha_theta,
        # alpha_pi_string,
        # beta_pi_string,
        # desired_truncation_level_string,
        os.path.basename(grammar_file))

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("grammar_file=" + str(grammar_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_processes=" + str(number_of_processes) + "\n")
    # options_output_file.write("multiprocesses=" + str(multiprocesses) + "\n")
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("training_iterations=" + str(training_iterations) + "\n")
    # parameter set 3
    options_output_file.write("grammaton_prune_interval=" + str(grammaton_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    # parameter set 4
    # options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_pi=%s\n" % alpha_pi)
    options_output_file.write("beta_pi=%s\n" % beta_pi)
    options_output_file.write("desired_truncation_level=%s\n" % desired_truncation_level)
    # parameter set 5
    # options_output_file.write("heldout_data=" + str(heldout_data) + "\n")
    options_output_file.close()

    print("========== ========== ========== ========== ==========")
    # parameter set 1
    print("output_directory=" + output_directory)
    print("input_directory=" + input_directory)
    print("corpus_name=" + corpus_name)
    print("grammar_file=" + str(grammar_file))
    # parameter set 2
    print("number_of_documents=" + str(number_of_documents))
    print("batch_size=" + str(batch_size))
    print("training_iterations=" + str(training_iterations))
    print("number_of_processes=" + str(number_of_processes))
    # print("multiprocesses=" + str(multiprocesses))
    # parameter set 3
    print("grammaton_prune_interval=" + str(grammaton_prune_interval))
    print("snapshot_interval=" + str(snapshot_interval))
    print("tau=" + str(tau))
    print("kappa=" + str(kappa))
    # parameter set 4
    # print("alpha_theta=" + str(alpha_theta))
    print("alpha_pi=%s" % alpha_pi)
    print("beta_pi=%s" % beta_pi)
    print("desired_truncation_level=%s" % desired_truncation_level)
    # parameter set 5
    # print("heldout_data=" + str(heldout_data))
    print("========== ========== ========== ========== ==========")

    import hybrid
    print("passing productions: ", productions)
    adagram_inferencer = hybrid.Hybrid(start, productions, adapted_non_terminals)

    adagram_inferencer._initialize(number_of_documents, batch_size, tau, kappa,
                                   alpha_pi, beta_pi, None, desired_truncation_level,
                                   grammaton_prune_interval)

    '''
    clock_iteration = time.time()
    clock_e_step, clock_m_step = adagram_inferencer.seed(train_docs)
    clock_iteration = time.time() - clock_iteration
    print('E-step, M-step and Seed take %g, %g and %g seconds respectively...' % (clock_e_step, clock_m_step, clock_iteration))
    '''

    # adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-0"))
    # adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-0"))

    random.shuffle(train_docs)

    training_clock = time.time()
    snapshot_clock = time.time()
    for iteration in range(int(training_iterations)):
        start_index = batch_size * iteration
        end_index = batch_size * (iteration + 1)
        if start_index / number_of_documents < end_index / number_of_documents:
            # train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents):] + train_docs[:(batch_size * (iteration + 1)) % (number_of_documents)]
            train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents):]
            random.shuffle(train_docs)
            train_doc_set += train_docs[:(batch_size * (iteration + 1)) % (number_of_documents)]
        else:
            train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents):
                                       (batch_size * (iteration + 1)) % number_of_documents]

        clock_iteration = time.time()
        # print("processing document:", train_doc_set)
        clock_e_step, clock_m_step = adagram_inferencer.learning(train_doc_set, number_of_processes)

        if (iteration + 1) % snapshot_interval == 0:
            # pickle_file = open(os.path.join(output_directory, "model-%d" % (adagram_inferencer._counter + 1)), 'wb')
            # pickle.dump(adagram_inferencer, pickle_file)
            # pickle_file.close()
            adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-" + str(iteration + 1)))
            # adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str(iteration + 1)))

        if (iteration + 1) % 1000 == 0:
            snapshot_clock = time.time() - snapshot_clock
            print('Processing 1000 mini-batches take %g seconds...' % (snapshot_clock))
            snapshot_clock = time.time()

        clock_iteration = time.time() - clock_iteration
        print('E-step, M-step and iteration %d take %g, %g and %g seconds respectively...' % (
            adagram_inferencer._counter, clock_e_step, clock_m_step, clock_iteration))

    adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-" + str(adagram_inferencer._counter + 1)))
    # adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str(iteration + 1)))

    pickle_file = open(os.path.join(output_directory, "model-%d" % (iteration + 1)), 'wb')
    pickle.dump(adagram_inferencer, pickle_file)
    pickle_file.close()

    training_clock = time.time() - training_clock
    print('Training finished in %g seconds...' % (training_clock))
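# A hypothetical grammar file for the loader above (illustrative only, not taken
# from the original repository), showing the conventions the parsing loop expects:
# a line starting with '%' is a comment, a line of the form
# '@ <non-terminal> <truncation> <alpha_pi> <beta_pi>' declares an adapted
# non-terminal with its truncation level and PYP hyperparameters, and every other
# line is passed through to nltk.grammar.read_grammar() as a CFG production.
# The symbols and numeric values below are made up for the example.
_EXAMPLE_GRAMMAR_FILE = """\
% toy word-segmentation grammar (illustrative)
@ Word 1000 0.01 100
Sentence -> Word Sentence
Sentence -> Word
Word -> Chars
Chars -> Char Chars
Chars -> Char
Char -> 'a'
Char -> 'b'
"""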
    MSE_error[trajIdx, 5] = np.std(np.multiply(outDR, outDR)) / np.sqrt(numTrials)
    MSE_error[trajIdx, 6] = np.std(np.multiply(outWDR, outWDR)) / np.sqrt(numTrials)
    MSE_error[trajIdx, 7] = np.std(np.multiply(outMagic, outMagic)) / np.sqrt(numTrials)

    # Let's now write the MSE results to a file
    # np.savetxt(filename + '_MSE.txt', MSE)
    # np.savetxt(filename + '_MSE_error.txt', MSE_error)
    np.savetxt('condition_number_hybrid.txt', cond_tot)


if __name__ == '__main__':
    print 'Running 50 trials for the given MDP model...'
    trueHorizon = False
    delta = 0.1

    # env = modelfail.ModelFail()
    # print 'Generating data for ModelFail MDP...'
    # compute_data(env, 1, 2, 'out_ModelFail', 15, delta)

    # env = modelwin.ModelWin()
    # print 'Generating data for ModelWin MDP...'
    # compute_data(env, 1, 2, 'out_MSE_ModelWin', 15, delta)

    env = hybrid.Hybrid()
    print 'Generating data for Hybrid Domain MDP...'
    compute_data(env, 1, 2, 'out_HybridDomain', 13, delta)

    # env = gridworld.Gridworld(trueHorizon)
    # print 'Generating data for Gridworld MDP p4 p5...'
    # compute_data(env, 4, 5, 'out_GridWorld_p4p5', 11, delta)

    print 'Done...'
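# Illustrative sketch (not part of the original script) of the statistic filled
# into MSE_error above: outDR, outWDR and outMagic appear to hold per-trial
# estimator errors, so each column stores std(x**2) / sqrt(numTrials), i.e. the
# standard error of the squared errors used as an error bar on the MSE.
import numpy as np

def _standard_error_of_squares(per_trial_errors):
    # per_trial_errors: 1-D array of errors from repeated trials (hypothetical input)
    squared = np.multiply(per_trial_errors, per_trial_errors)
    return np.std(squared) / np.sqrt(len(per_trial_errors))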
def main():
    options = parse_args()

    # parameter set 2
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.training_iterations > 0
    training_iterations = options.training_iterations
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 4
    # disable_alpha_theta_update = options.disable_alpha_theta_update
    inference_mode = options.inference_mode

    # parameter set 1
    # assert options.corpus_name != None
    assert options.input_directory != None
    assert options.output_directory != None

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Documents
    train_docs_path = os.path.join(input_directory, 'train.dat')
    input_doc_stream = open(train_docs_path, 'r')
    train_docs = []
    for line in input_doc_stream:
        train_docs.append(line.strip().lower())
    print "successfully load all training docs from %s..." % (os.path.abspath(train_docs_path))

    # Vocabulary
    vocabulary_path = os.path.join(input_directory, 'voc.dat')
    input_voc_stream = open(vocabulary_path, 'r')
    vocab = []
    for line in input_voc_stream:
        vocab.append(line.strip().lower().split()[0])
    vocab = list(set(vocab))
    print "successfully load all the words from %s..." % (os.path.abspath(vocabulary_path))

    # parameter set 3
    alpha_alpha = 1.0 / number_of_topics
    if options.alpha_alpha > 0:
        alpha_alpha = options.alpha_alpha
    alpha_beta = options.alpha_beta
    if alpha_beta <= 0:
        alpha_beta = 1.0 / len(vocab)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%m%d-%H%M%S") + ""
    suffix += "-%s" % ("lda")
    suffix += "-I%d" % (training_iterations)
    suffix += "-S%d" % (snapshot_interval)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-aa%f" % (alpha_alpha)
    suffix += "-ab%f" % (alpha_beta)
    suffix += "-im%d" % (inference_mode)
    # suffix += "-%s" % (resample_topics)
    # suffix += "-%s" % (hash_oov_words)
    suffix += "/"

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # dict_file = options.dictionary
    # if dict_file != None:
    #     dict_file = dict_file.strip()

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    # options_output_file.write("vocabulary_path=" + str(dict_file) + "\n")
    # parameter set 2
    options_output_file.write("training_iterations=%d\n" % (training_iterations))
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 4
    options_output_file.write("inference_mode=%d\n" % (inference_mode))
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    # print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "training_iterations=%d" % (training_iterations)
    print "snapshot_interval=" + str(snapshot_interval)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 4
    print "inference_mode=%d" % (inference_mode)
    print "========== ========== ========== ========== =========="

    if inference_mode == 0:
        import hybrid
        lda_inferencer = hybrid.Hybrid()
    elif inference_mode == 1:
        import monte_carlo
        lda_inferencer = monte_carlo.MonteCarlo()
    elif inference_mode == 2:
        import variational_bayes
        lda_inferencer = variational_bayes.VariationalBayes()
    else:
        sys.stderr.write("error: unrecognized inference mode %d...\n" % (inference_mode))
        return

    lda_inferencer._initialize(train_docs, vocab, number_of_topics, alpha_alpha, alpha_beta)

    for iteration in xrange(training_iterations):
        lda_inferencer.learning()

        if (lda_inferencer._counter % snapshot_interval == 0):
            lda_inferencer.export_beta(os.path.join(output_directory, 'exp_beta-' + str(lda_inferencer._counter)))

    model_snapshot_path = os.path.join(output_directory, 'model-' + str(lda_inferencer._counter))
    cPickle.dump(lda_inferencer, open(model_snapshot_path, 'wb'))
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.number_of_iterations > 0
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    alpha = 1.0 / number_of_topics
    if options.alpha > 0:
        alpha = options.alpha
    assert options.eta > 0
    eta = options.eta

    # parameter set 4
    # disable_alpha_theta_update = options.disable_alpha_theta_update
    # inference_type = options.hybrid_mode
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    assert options.corpus_name != None
    assert options.input_directory != None
    assert options.output_directory != None

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    if not input_directory.endswith('/'):
        input_directory += '/'
    input_directory += corpus_name + '/'

    output_directory = options.output_directory
    if not output_directory.endswith('/'):
        output_directory += '/'
    output_directory += corpus_name + '/'

    # create output directory
    now = datetime.datetime.now()
    output_directory += now.strftime("%y%b%d-%H%M%S") + ""
    output_directory += "-hybrid-K%d-I%d-a%g-e%g-S%d/" \
        % (number_of_topics, number_of_iterations, alpha, eta, snapshot_interval)
    os.mkdir(os.path.abspath(output_directory))

    # dict_file = options.dictionary
    # if dict_file != None:
    #     dict_file = dict_file.strip()

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    # options_output_file.write("dictionary_file=" + str(dict_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (number_of_iterations))
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n")
    options_output_file.write("eta=" + str(eta) + "\n")
    # parameter set 4
    # options_output_file.write("inference_type=%s\n" % (inference_type))
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    # print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "eta=" + str(eta)
    # parameter set 4
    # print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents, type_to_index, index_to_type = parse_data(input_directory + 'doc.dat', input_directory + 'voc.dat')
    print "successfully load all training documents..."

    import hybrid
    lda_inference = hybrid.Hybrid()
    lda_inference._initialize(documents, type_to_index, index_to_type, number_of_topics, alpha, eta)

    for iteration in xrange(number_of_iterations):
        lda_inference.learn()

        if (lda_inference._counter % snapshot_interval == 0):
            lda_inference.export_topic_term_distribution(output_directory + 'exp_beta-' + str(lda_inference._counter))
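# The launcher above calls parse_data() without showing its definition. The sketch
# below is an assumed implementation, consistent only with the call signature
# (documents, type_to_index, index_to_type = parse_data(doc_path, voc_path)) and
# with how the other launchers in this repository read 'doc.dat' and 'voc.dat';
# the real function may differ, and the name is marked as a sketch.
def _parse_data_sketch(doc_path, voc_path):
    # build the type <-> index maps from the vocabulary file (first token per line)
    type_to_index = {}
    index_to_type = {}
    for line in open(voc_path, 'r'):
        word_type = line.strip().lower().split()[0]
        if word_type in type_to_index:
            continue
        index = len(type_to_index)
        type_to_index[word_type] = index
        index_to_type[index] = word_type

    # map every document to a list of word indices, dropping out-of-vocabulary tokens
    documents = []
    for line in open(doc_path, 'r'):
        tokens = [type_to_index[token] for token in line.strip().lower().split()
                  if token in type_to_index]
        documents.append(tokens)

    return documents, type_to_index, index_to_type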
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_documents > 0
    number_of_documents = options.number_of_documents
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics

    # parameter set 3
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval
    # assert options.batch_size > 0
    batch_size = options.batch_size
    # assert number_of_documents % batch_size == 0
    online_iterations = number_of_documents / batch_size
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 5
    hybrid_mode = options.hybrid_mode
    hash_oov_words = options.hash_oov_words

    # parameter set 1
    assert options.corpus_name != None
    assert options.input_directory != None
    assert options.output_directory != None

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    # if not input_directory.endswith('/'):
    #     input_directory += '/'
    input_directory = os.path.join(input_directory, corpus_name)
    # input_directory += corpus_name + '/'

    output_directory = options.output_directory
    # if not output_directory.endswith('/'):
    #     output_directory += '/'
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    # output_directory += corpus_name + '/'
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Documents
    train_docs = []
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in input_file:
        train_docs.append(line.strip())
    print "successfully load all training documents..."

    # Vocabulary
    dictionary_file = options.dictionary
    if dictionary_file == None:
        dictionary_file = os.path.join(input_directory, 'voc.dat')
    input_file = open(dictionary_file, 'r')
    vocab = []
    for line in input_file:
        vocab.append(line.strip().split()[0])
    vocab = list(set(vocab))
    print "successfully load all the words from %s..." % (dictionary_file)

    # parameter set 4
    assert options.tau >= 0
    tau = options.tau
    # assert options.kappa >= 0.5 and options.kappa <= 1
    assert options.kappa >= 0 and options.kappa <= 1
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    alpha_eta = 1.0 / len(vocab)
    if options.alpha_eta > 0:
        alpha_eta = options.alpha_eta

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    suffix += "-%s" % ("fixvoc")
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ae%g" % (alpha_eta)
    suffix += "-%s" % (hybrid_mode)
    suffix += "-%s" % (hash_oov_words)
    suffix += "/"

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) + "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_eta=" + str(alpha_eta) + "\n")
    # parameter set 5
    options_output_file.write("hybrid_mode=" + str(hybrid_mode) + "\n")
    options_output_file.write("hash_oov_words=%s\n" % hash_oov_words)
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_eta=" + str(alpha_eta)
    # parameter set 5
    print "hybrid_mode=" + str(hybrid_mode)
    print "hash_oov_words=%s" % (hash_oov_words)
    print "========== ========== ========== ========== =========="

    if hybrid_mode:
        import hybrid
        olda = hybrid.Hybrid(hash_oov_words)
    else:
        import variational
        olda = variational.Variational(hash_oov_words)

    olda._initialize(vocab, number_of_documents, number_of_topics, alpha_theta, alpha_eta, tau, kappa)
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 50)

    document_topic_distribution = None
    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) % len(train_docs):
                                (batch_size * (iteration + 1) - 1) % len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % (number_of_documents),
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()

        batch_gamma, elbo = olda.learning(docset)

        if document_topic_distribution == None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack((document_topic_distribution, batch_gamma))

        clock = time.time() - clock
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (olda._counter, clock, olda._epsilon)

        # Save lambda, the parameters to the variational distributions over topics,
        # and batch_gamma, the parameters to the variational distributions over topic
        # weights for the articles analyzed in the last iteration.
        # if ((olda._counter + 1) % snapshot_interval == 0):
        #     olda.export_beta(output_directory + 'exp_beta-' + str(olda._counter + 1))
        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(os.path.join(output_directory, 'exp_beta-' + str(olda._counter)), 50)

    gamma_path = os.path.join(output_directory, 'gamma.txt')
    numpy.savetxt(gamma_path, document_topic_distribution)
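# Illustrative sketch (not part of the original launchers): the online launchers
# above pass tau and kappa to the inferencer and print olda._epsilon each
# iteration. In standard online variational LDA (Hoffman et al.) the per-batch
# step size follows rho_t = (tau + t) ** (-kappa), and kappa = 0, the batch-mode
# fallback used above, makes the step size constant. The helper below only
# sketches that schedule; the actual Hybrid/Variational implementation is not
# shown in this repository excerpt, and it assumes tau + counter > 0.
def _step_size_sketch(counter, tau, kappa):
    # counter: number of mini-batches processed so far (hypothetical argument)
    return pow(tau + counter, -kappa)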