# Shared imports assumed by all launcher scripts in this file (Python 2).
import datetime
import os
import time

import numpy


def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_documents > 0
    number_of_documents = options.number_of_documents
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.truncation_level > 0
    truncation_level = options.truncation_level

    # parameter set 3
    assert options.vocab_prune_interval > 0
    vocab_prune_interval = options.vocab_prune_interval
    snapshot_interval = vocab_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    #assert(options.batch_size>0);
    batch_size = options.batch_size
    #assert(number_of_documents % batch_size==0);
    if batch_size > 0:
        online_iterations = number_of_documents / batch_size
    else:
        # guard against a non-positive batch_size (batch mode); the division
        # above would otherwise fail or go negative
        online_iterations = 1
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 4
    assert options.tau >= 0
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert 0 <= options.kappa <= 1
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    assert options.alpha_beta > 0
    alpha_beta = options.alpha_beta

    # parameter set 5
    #heldout_data = options.heldout_data;

    # parameter set 1
    assert options.corpus_name is not None
    assert options.input_directory is not None
    assert options.output_directory is not None

    corpus_name = options.corpus_name
    input_directory = os.path.join(options.input_directory, corpus_name)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory, named after the timestamp and all settings
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S")
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-T%d" % (truncation_level)
    suffix += "-P%d" % (vocab_prune_interval)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ab%g" % (alpha_beta)
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    dictionary_file = options.dictionary
    if dictionary_file is not None:
        dictionary_file = dictionary_file.strip()

    # store all the options to a file
    options_output_file = open(os.path.join(output_directory, "option.txt"), 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    options_output_file.write("truncation_level=" + str(truncation_level) + "\n")
    # parameter set 3
    options_output_file.write("vocab_prune_interval=" + str(vocab_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) + "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    print "truncation_level=" + str(truncation_level)
    # parameter set 3
    print "vocab_prune_interval=" + str(vocab_prune_interval)
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 5
    #print "heldout_data=" + str(heldout_data)
    print "========== ========== ========== ========== =========="

    # Vocabulary: seed it with a single word; the model grows it online.
    vocab = ['team']

    # Documents
    train_docs = []
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in input_file:
        train_docs.append(line.strip())
    input_file.close()
    print "successfully loaded all training documents..."

    import hybrid
    olda = hybrid.Hybrid(3, 20, dictionary_file)
    olda._initialize(vocab, number_of_topics, number_of_documents, batch_size,
                     truncation_level, alpha_theta, alpha_beta, tau, kappa,
                     vocab_prune_interval, True)
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 100)

    document_topic_distribution = None
    # Run until we've seen number_of_documents documents.
    # (Feel free to interrupt *much* sooner than this.)
    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) % len(train_docs):
                                (batch_size * (iteration + 1) - 1) % len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % number_of_documents,
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()
        batch_gamma = olda.learning(docset)
        if olda._counter % snapshot_interval == 0:
            olda.export_beta(os.path.join(output_directory,
                                          'exp_beta-' + str(olda._counter)), 50)
        if document_topic_distribution is None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack(
                (document_topic_distribution, batch_gamma))
        clock = time.time() - clock

        print "vocabulary size = %s" % (olda._truncation_size)
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (
            olda._counter, clock, olda._epsilon)

    gamma_path = os.path.join(output_directory, "gamma.txt")
    numpy.savetxt(gamma_path, document_topic_distribution)
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.number_of_iterations > 0
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    alpha = 1.0 / number_of_topics
    if options.alpha > 0:
        alpha = options.alpha
    assert options.beta > 0
    beta = options.beta

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    assert options.corpus_name is not None
    assert options.input_directory is not None
    assert options.output_directory is not None

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    if not input_directory.endswith('/'):
        input_directory += '/'
    input_directory += corpus_name + '/'

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    if not output_directory.endswith('/'):
        output_directory += '/'
    output_directory += corpus_name + '/'
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory
    now = datetime.datetime.now()
    output_directory += now.strftime("%y%b%d-%H%M%S")
    #output_directory += "-" + str(now.microsecond) + "/";
    output_directory += "-cgs-K%d-I%d-a%g-b%g-S%d/" \
        % (number_of_topics, number_of_iterations, alpha, beta, snapshot_interval)
    os.mkdir(os.path.abspath(output_directory))

    #dict_file = options.dictionary;
    #if dict_file != None:
    #    dict_file = dict_file.strip();

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iterations=%d\n" % (number_of_iterations))
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n")
    options_output_file.write("beta=" + str(beta) + "\n")
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "beta=" + str(beta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents, type_to_index, index_to_type = parse_data(
        input_directory + 'doc.dat', input_directory + 'voc.dat')
    print "successfully loaded all training documents..."

    import cgs
    lda_inference = cgs.CollapsedGibbsSampling()
    lda_inference._initialize(documents, type_to_index, index_to_type,
                              number_of_topics, alpha, beta)

    for iteration in xrange(number_of_iterations):
        lda_inference.sample()
        if lda_inference._counter % snapshot_interval == 0:
            lda_inference.export_topic_term_distribution(
                output_directory + 'exp_beta-' + str(lda_inference._counter))
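# `parse_data` is referenced above but defined elsewhere in the original
# sources. A minimal sketch, assuming voc.dat holds one word per line and
# doc.dat one whitespace-tokenized document per line; only the two-argument
# call signature and the returned triple (documents, type_to_index,
# index_to_type) are taken from the caller above, the rest is illustrative.
def parse_data(document_file, vocabulary_file):
    type_to_index = {}
    index_to_type = {}
    for line in open(vocabulary_file, 'r'):
        word = line.strip().split()[0]
        if word not in type_to_index:
            type_to_index[word] = len(type_to_index)
            index_to_type[type_to_index[word]] = word
    documents = []
    for line in open(document_file, 'r'):
        # keep only tokens that appear in the vocabulary
        documents.append([type_to_index[token]
                          for token in line.strip().split()
                          if token in type_to_index])
    return documents, type_to_index, index_to_type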
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_clusters > 0
    number_of_clusters = options.number_of_clusters
    assert options.number_of_iterations > 0
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    alpha_alpha = 1.0 / number_of_clusters
    if options.alpha_alpha > 0:
        alpha_alpha = options.alpha_alpha
    assert options.alpha_beta > 0
    alpha_beta = options.alpha_beta

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    #assert(options.dataset_name!=None);
    assert options.input_directory is not None
    assert options.output_directory is not None

    input_directory = options.input_directory.rstrip("/")
    dataset_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, dataset_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory, named after the timestamp and all settings
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S")
    suffix += "-naive_bayes_new"
    suffix += "-K%d" % (number_of_clusters)
    suffix += "-I%d" % (number_of_iterations)
    suffix += "-a%g" % (alpha_alpha)
    suffix += "-b%g" % (alpha_beta)
    suffix += "-S%d" % (snapshot_interval)
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(os.path.join(output_directory, "option.txt"), 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("dataset_name=" + dataset_name + "\n")
    # parameter set 2
    options_output_file.write("number_of_iterations=%d\n" % (number_of_iterations))
    options_output_file.write("number_of_clusters=" + str(number_of_clusters) + "\n")
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "dataset_name=" + dataset_name
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    print "number_of_clusters=" + str(number_of_clusters)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents = parse_data(os.path.join(input_directory, 'data.dat'))
    print "successfully loaded all training documents..."

    from naive_bayes_new import monte_carlo
    naive_bayes = monte_carlo.MonteCarlo()
    naive_bayes._initialize(documents, number_of_clusters, alpha_alpha, alpha_beta)

    for iteration in xrange(number_of_iterations):
        naive_bayes.learning()
        if naive_bayes._counter % snapshot_interval == 0:
            naive_bayes.export_model_snapshot(output_directory, input_directory)
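# `parse_data` is referenced by both naive Bayes launchers but defined
# elsewhere in the original sources. A minimal sketch, assuming data.dat holds
# one whitespace-tokenized document per line; only the single-argument call
# signature and the returned list of documents are taken from the callers.
def parse_data(data_file):
    documents = []
    for line in open(data_file, 'r'):
        documents.append(line.strip().split())
    return documents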
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    initial_number_of_clusters = options.initial_number_of_clusters
    if initial_number_of_clusters <= 0:
        initial_number_of_clusters = 10
    assert options.number_of_iterations > 0
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    assert options.alpha_alpha > 0
    alpha_alpha = options.alpha_alpha
    assert options.alpha_beta > 0
    alpha_beta = options.alpha_beta

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    #assert(options.dataset_name!=None);
    assert options.input_directory is not None
    assert options.output_directory is not None

    input_directory = options.input_directory.rstrip("/")
    dataset_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, dataset_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory, named after the timestamp and all settings
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S")
    suffix += "-naive_bayes_dp"
    #suffix += "-K%d" % (initial_number_of_clusters)
    suffix += "-I%d" % (number_of_iterations)
    suffix += "-a%g" % (alpha_alpha)
    suffix += "-b%g" % (alpha_beta)
    suffix += "-S%d" % (snapshot_interval)
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(os.path.join(output_directory, "option.txt"), 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("dataset_name=" + dataset_name + "\n")
    # parameter set 2
    options_output_file.write("number_of_iterations=%d\n" % (number_of_iterations))
    #options_output_file.write("initial_number_of_clusters=" + str(initial_number_of_clusters) + "\n");
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 4
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "dataset_name=" + dataset_name
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    #print "initial_number_of_clusters=" + str(initial_number_of_clusters)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 4
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents = parse_data(os.path.join(input_directory, 'data.dat'))
    print "successfully loaded all training documents..."

    from naive_bayes_dp import monte_carlo
    naive_bayes = monte_carlo.MonteCarlo()
    naive_bayes._initialize(documents, alpha_alpha, alpha_beta,
                            initial_number_of_clusters)

    for iteration in xrange(number_of_iterations):
        naive_bayes.learning()
        if naive_bayes._counter % snapshot_interval == 0:
            naive_bayes.export_model_snapshot(output_directory, input_directory)
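# A hypothetical command line for the DP naive Bayes launcher above. The
# script name and flag spellings are assumptions inferred from the option
# attributes the launcher reads, not taken from the original documentation:
#
#   python launch_naive_bayes_dp.py \
#       --input_directory=./data/my_dataset \
#       --output_directory=./output \
#       --number_of_iterations=100 \
#       --alpha_alpha=1.0 \
#       --alpha_beta=0.01 \
#       --snapshot_interval=10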
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.number_of_iterations > 0
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    alpha = 1.0 / number_of_topics
    if options.alpha > 0:
        alpha = options.alpha
    #assert options.default_correlation_prior>0;
    #default_correlation_prior = options.default_correlation_prior;
    #assert options.positive_correlation_prior>0;
    #positive_correlation_prior = options.positive_correlation_prior;
    #assert options.negative_correlation_prior>0;
    #negative_correlation_prior = options.negative_correlation_prior;

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    hybrid_mode = options.hybrid_mode
    update_hyperparameter = options.update_hyperparameter

    # parameter set 5
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    assert options.corpus_name is not None
    assert options.input_directory is not None
    assert options.output_directory is not None
    assert options.tree_name is not None

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    if not input_directory.endswith('/'):
        input_directory += '/'
    input_directory += corpus_name + '/'

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    if not output_directory.endswith('/'):
        output_directory += '/'
    output_directory += corpus_name + '/'
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    tree_name = options.tree_name.strip()

    # create output directory
    now = datetime.datetime.now()
    output_directory += now.strftime("%y%b%d-%H%M%S")
    output_directory += "-prior_tree-K%d-I%d-a%g-S%d-%s-%s-%s/" \
        % (number_of_topics, number_of_iterations, alpha, snapshot_interval,
           tree_name, hybrid_mode, update_hyperparameter)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("tree_name=" + str(tree_name) + "\n")
    # parameter set 2
    options_output_file.write("number_of_iterations=%d\n" % (number_of_iterations))
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n")
    #options_output_file.write("default_correlation_prior=" + str(default_correlation_prior) + "\n");
    #options_output_file.write("positive_correlation_prior=" + str(positive_correlation_prior) + "\n");
    #options_output_file.write("negative_correlation_prior=" + str(negative_correlation_prior) + "\n");
    # parameter set 4
    options_output_file.write("hybrid_mode=%s\n" % (hybrid_mode))
    options_output_file.write("update_hyperparameter=%s\n" % (update_hyperparameter))
    # parameter set 5
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "tree prior file=" + str(tree_name)
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    #print "default_correlation_prior=" + str(default_correlation_prior)
    #print "positive_correlation_prior=" + str(positive_correlation_prior)
    #print "negative_correlation_prior=" + str(negative_correlation_prior)
    # parameter set 4
    print "hybrid_mode=%s" % (hybrid_mode)
    print "update_hyperparameter=%s" % (update_hyperparameter)
    # parameter set 5
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    if hybrid_mode:
        import hybrid
        lda_inference = hybrid.Hybrid(update_hyperparameter)
        # `import hybrid.parse_data as parse_data` only works when parse_data
        # is a submodule; import the name from the package instead.
        from hybrid import parse_data
    else:
        import uvb
        lda_inference = uvb.UncollapsedVariationalBayes(update_hyperparameter)
        from uvb import parse_data

    documents, type_to_index, index_to_type, vocabulary = parse_data(
        input_directory + 'doc.dat', input_directory + 'voc.dat')
    print "successfully loaded all training documents..."

    # initialize the tree prior over the vocabulary
    import priortree
    prior_tree = priortree.PriorTree()
    #prior_tree._initialize(input_directory+"tree.wn.*", vocabulary, default_correlation_prior, positive_correlation_prior, negative_correlation_prior);
    prior_tree.initialize(input_directory + tree_name + ".wn.*",
                          input_directory + tree_name + ".hyperparams",
                          vocabulary)

    lda_inference._initialize(documents, prior_tree, type_to_index,
                              index_to_type, number_of_topics, alpha)

    for iteration in xrange(number_of_iterations):
        lda_inference.train()
        if lda_inference._counter % snapshot_interval == 0:
            lda_inference.export_topic_term_distribution(
                output_directory + 'exp_beta-' + str(lda_inference._counter))
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics
    assert options.number_of_iterations > 0
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    alpha = 1.0 / number_of_topics
    if options.alpha > 0:
        alpha = options.alpha
    assert options.eta > 0
    eta = options.eta

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval

    # parameter set 1
    assert options.corpus_name is not None
    assert options.input_directory is not None
    assert options.output_directory is not None

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    if not input_directory.endswith('/'):
        input_directory += '/'
    input_directory += corpus_name + '/'

    output_directory = options.output_directory
    if not output_directory.endswith('/'):
        output_directory += '/'
    output_directory += corpus_name + '/'

    # create output directory
    now = datetime.datetime.now()
    output_directory += now.strftime("%y%b%d-%H%M%S")
    output_directory += "-hybrid-K%d-I%d-a%g-e%g-S%d/" \
        % (number_of_topics, number_of_iterations, alpha, eta, snapshot_interval)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    # parameter set 2
    options_output_file.write("number_of_iterations=%d\n" % (number_of_iterations))
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n")
    options_output_file.write("eta=" + str(eta) + "\n")
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "eta=" + str(eta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents, type_to_index, index_to_type = parse_data(
        input_directory + 'doc.dat', input_directory + 'voc.dat')
    print "successfully loaded all training documents..."

    import hybrid
    lda_inference = hybrid.Hybrid()
    lda_inference._initialize(documents, type_to_index, index_to_type,
                              number_of_topics, alpha, eta)

    for iteration in xrange(number_of_iterations):
        lda_inference.learn()
        if lda_inference._counter % snapshot_interval == 0:
            lda_inference.export_topic_term_distribution(
                output_directory + 'exp_beta-' + str(lda_inference._counter))
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert options.number_of_documents > 0
    number_of_documents = options.number_of_documents
    assert options.number_of_topics > 0
    number_of_topics = options.number_of_topics

    # parameter set 3
    assert options.snapshot_interval > 0
    snapshot_interval = options.snapshot_interval
    #assert(options.batch_size>0);
    batch_size = options.batch_size
    #assert(number_of_documents % batch_size==0);
    if batch_size > 0:
        online_iterations = number_of_documents / batch_size
    else:
        # guard against a non-positive batch_size (batch mode); the division
        # above would otherwise fail or go negative
        online_iterations = 1
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 5
    hybrid_mode = options.hybrid_mode
    hash_oov_words = options.hash_oov_words

    # parameter set 1
    assert options.corpus_name is not None
    assert options.input_directory is not None
    assert options.output_directory is not None

    corpus_name = options.corpus_name
    input_directory = os.path.join(options.input_directory, corpus_name)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Documents
    train_docs = []
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in input_file:
        train_docs.append(line.strip())
    input_file.close()
    print "successfully loaded all training documents..."

    # Vocabulary
    dictionary_file = options.dictionary
    if dictionary_file is None:
        dictionary_file = os.path.join(input_directory, 'voc.dat')
    input_file = open(dictionary_file, 'r')
    vocab = []
    for line in input_file:
        vocab.append(line.strip().split()[0])
    input_file.close()
    vocab = list(set(vocab))
    print "successfully loaded all the words from %s..." % (dictionary_file)

    # parameter set 4
    assert options.tau >= 0
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert 0 <= options.kappa <= 1
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    alpha_eta = 1.0 / len(vocab)
    if options.alpha_eta > 0:
        alpha_eta = options.alpha_eta

    # create output directory, named after the timestamp and all settings
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S")
    suffix += "-%s" % ("fixvoc")
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ae%g" % (alpha_eta)
    suffix += "-%s" % (hybrid_mode)
    suffix += "-%s" % (hash_oov_words)
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(os.path.join(output_directory, "option.txt"), 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n")
    # parameter set 3
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) + "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_eta=" + str(alpha_eta) + "\n")
    # parameter set 5
    options_output_file.write("hybrid_mode=" + str(hybrid_mode) + "\n")
    options_output_file.write("hash_oov_words=%s\n" % hash_oov_words)
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_eta=" + str(alpha_eta)
    # parameter set 5
    print "hybrid_mode=" + str(hybrid_mode)
    print "hash_oov_words=%s" % (hash_oov_words)
    print "========== ========== ========== ========== =========="

    if hybrid_mode:
        import hybrid
        olda = hybrid.Hybrid(hash_oov_words)
    else:
        import variational
        olda = variational.Variational(hash_oov_words)
    olda._initialize(vocab, number_of_documents, number_of_topics,
                     alpha_theta, alpha_eta, tau, kappa)
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 50)

    document_topic_distribution = None
    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) % len(train_docs):
                                (batch_size * (iteration + 1) - 1) % len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % number_of_documents,
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()
        batch_gamma, elbo = olda.learning(docset)
        if document_topic_distribution is None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack(
                (document_topic_distribution, batch_gamma))
        clock = time.time() - clock
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (
            olda._counter, clock, olda._epsilon)

        # Save lambda, the parameters to the variational distributions over
        # topics, and batch_gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in the
        # last iteration.
        #if ((olda._counter+1) % snapshot_interval == 0):
        #    olda.export_beta(output_directory + 'exp_beta-' + str(olda._counter+1));
        if olda._counter % snapshot_interval == 0:
            olda.export_beta(os.path.join(output_directory,
                                          'exp_beta-' + str(olda._counter)), 50)

    gamma_path = os.path.join(output_directory, 'gamma.txt')
    numpy.savetxt(gamma_path, document_topic_distribution)
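# ---------------------------------------------------------------------------
# The batch-selection slice used by the online loops above is easy to misread,
# so here is a tiny self-contained demonstration of its behavior (the numbers
# are illustrative). Note the caveat: when a batch would wrap past the end of
# the corpus, the slice comes back empty rather than wrapped -- e.g. with 10
# documents and batch_size=4, iteration 2 computes train_docs[8:2], which is
# empty. Choose batch_size relative to the corpus size accordingly.
def _demo_batch_slice(num_docs=10, batch_size=4, iterations=3):
    train_docs = ["doc-%d" % i for i in range(num_docs)]
    for iteration in range(iterations):
        start = (batch_size * iteration) % len(train_docs)
        stop = (batch_size * (iteration + 1) - 1) % len(train_docs) + 1
        # iteration 0 -> [0:4], iteration 1 -> [4:8], iteration 2 -> [8:2] (empty)
        print iteration, start, stop, train_docs[start:stop]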