Example #1

# imports used by this example
import datetime
import os
import time

import numpy

def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert (options.number_of_documents > 0)
    number_of_documents = options.number_of_documents
    assert (options.number_of_topics > 0)
    number_of_topics = options.number_of_topics
    assert (options.truncation_level > 0)
    truncation_level = options.truncation_level

    # parameter set 3
    assert (options.vocab_prune_interval > 0)
    vocab_prune_interval = options.vocab_prune_interval
    snapshot_interval = vocab_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    #assert(options.batch_size>0);
    batch_size = options.batch_size
    #assert(number_of_documents % batch_size==0);
    online_iterations = number_of_documents / batch_size
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 4
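    # tau and kappa presumably define the online learning-rate decay (as in
    # online variational LDA); kappa = 0 disables the decay, which is exactly
    # what the batch-mode branch below enforces.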
    assert (options.tau >= 0)
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    assert (options.alpha_beta > 0)
    alpha_beta = options.alpha_beta

    # parameter set 5
    #heldout_data = options.heldout_data;

    # parameter set 1
    assert (options.corpus_name != None)
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    input_directory = os.path.join(input_directory, corpus_name)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-T%d" % (truncation_level)
    suffix += "-P%d" % (vocab_prune_interval)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ab%g" % (alpha_beta)
    suffix += "/"
    '''
    suffix += "-D%d-K%d-T%d-P%d-S%d-B%d-O%d-t%d-k%g-at%g-ab%g/" % (number_of_documents,
                                                                   number_of_topics,
                                                                   truncation_level,
                                                                   vocab_prune_interval,
                                                                   snapshot_interval,
                                                                   batch_size,
                                                                   online_iterations,
                                                                   tau,
                                                                   kappa,
                                                                   alpha_theta,
                                                                   alpha_beta);
    '''
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    dictionary_file = options.dictionary
    if dictionary_file != None:
        dictionary_file = dictionary_file.strip()

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" +
                              str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) +
                              "\n")
    options_output_file.write("truncation_level=" + str(truncation_level) +
                              "\n")
    # parameter set 3
    options_output_file.write("vocab_prune_interval=" +
                              str(vocab_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) +
                              "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    print "truncation_level=" + str(truncation_level)
    # parameter set 3
    print "vocab_prune_interval=" + str(vocab_prune_interval)
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 5
    #print "heldout_data=" + str(heldout_data)
    print "========== ========== ========== ========== =========="

    # Vocabulary
    #file = open(input_directory+'voc.dat', 'r');
    # Seed the vocabulary
    vocab = ['team']

    # Documents
    train_docs = []
    file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in file:
        train_docs.append(line.strip())
    print "successfully load all training documents..."

    import hybrid
    olda = hybrid.Hybrid(3, 20, dictionary_file)

    olda._initialize(vocab, number_of_topics, number_of_documents, batch_size,
                     truncation_level, alpha_theta, alpha_beta, tau, kappa,
                     vocab_prune_interval, True)
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 100)

    document_topic_distribution = None

    # Run until we've seen number_of_documents documents. (Feel free to interrupt *much* sooner than this.)
    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
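            # take the next batch_size documents, with both slice indices
            # computed modulo the corpus size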
            docset = train_docs[(batch_size * iteration) %
                                len(train_docs):(batch_size *
                                                 (iteration + 1) - 1) %
                                len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % (number_of_documents),
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()

        batch_gamma = olda.learning(docset)

        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(
                os.path.join(output_directory,
                             'exp_beta-' + str(olda._counter)), 50)

        if document_topic_distribution == None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack(
                (document_topic_distribution, batch_gamma))

        clock = time.time() - clock
        print "vocabulary size = %s" % (olda._truncation_size)
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (
            olda._counter, clock, olda._epsilon)

    gamma_path = os.path.join(output_directory, "gamma.txt")
    numpy.savetxt(gamma_path, document_topic_distribution)
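
The example relies on an option_parser module that is not shown. Below is a minimal sketch of what such a module could look like, built on optparse; the flag names, defaults, and types are inferred from the attributes the example reads and are assumptions rather than the original interface.

# option_parser.py -- hypothetical reconstruction, not the original module
import optparse

def parse_args():
    parser = optparse.OptionParser()
    # parameter set 1
    parser.add_option("--input_directory", type="string", default=None)
    parser.add_option("--output_directory", type="string", default=None)
    parser.add_option("--corpus_name", type="string", default=None)
    parser.add_option("--dictionary", type="string", default=None)
    # parameter set 2
    parser.add_option("--number_of_documents", type="int", default=-1)
    parser.add_option("--number_of_topics", type="int", default=-1)
    parser.add_option("--truncation_level", type="int", default=-1)
    # parameter set 3
    parser.add_option("--vocab_prune_interval", type="int", default=-1)
    parser.add_option("--snapshot_interval", type="int", default=-1)
    parser.add_option("--batch_size", type="int", default=-1)
    parser.add_option("--online_iterations", type="int", default=-1)
    # parameter set 4
    parser.add_option("--tau", type="float", default=1.0)
    parser.add_option("--kappa", type="float", default=0.5)
    parser.add_option("--alpha_theta", type="float", default=-1.0)
    parser.add_option("--alpha_beta", type="float", default=1.0)
    options, arguments = parser.parse_args()
    return options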
Example #2

# imports used by this example
import datetime
import os

def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert (options.number_of_topics > 0)
    number_of_topics = options.number_of_topics
    assert (options.number_of_iterations > 0)
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    alpha = 1.0 / number_of_topics
    if options.alpha > 0:
        alpha = options.alpha
    assert (options.beta > 0)
    beta = options.beta

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert (options.snapshot_interval > 0)
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval

    # parameter set 1
    assert (options.corpus_name != None)
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    if not input_directory.endswith('/'):
        input_directory += '/'
    input_directory += corpus_name + '/'

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    if not output_directory.endswith('/'):
        output_directory += '/'
    output_directory += corpus_name + '/'
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory
    now = datetime.datetime.now()
    output_directory += now.strftime("%y%b%d-%H%M%S") + ""
    #output_directory += "-" + str(now.microsecond) + "/";
    output_directory += "-cgs-K%d-I%d-a%g-b%g-S%d/" \
                        % (number_of_topics,
                           number_of_iterations,
                           alpha,
                           beta,
                           snapshot_interval)

    os.mkdir(os.path.abspath(output_directory))

    #dict_file = options.dictionary;
    #if dict_file != None:
    #dict_file = dict_file.strip();

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" %
                              (number_of_iterations))
    options_output_file.write("number_of_topics=" + str(number_of_topics) +
                              "\n")
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n")
    options_output_file.write("beta=" + str(beta) + "\n")
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "beta=" + str(beta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents, type_to_index, index_to_type = parse_data(
        input_directory + 'doc.dat', input_directory + 'voc.dat')
    print "successfully load all training documents..."

    import cgs
    lda_inference = cgs.CollapsedGibbsSampling()
    lda_inference._initialize(documents, type_to_index, index_to_type,
                              number_of_topics, alpha, beta)

    for iteration in xrange(number_of_iterations):
        lda_inference.sample()

        if (lda_inference._counter % snapshot_interval == 0):
            lda_inference.export_topic_term_distribution(
                output_directory + 'exp_beta-' + str(lda_inference._counter))
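
parse_data is defined elsewhere in the original module. A rough sketch, assuming voc.dat lists one word per line (first column) and doc.dat holds one whitespace-tokenised document per line, could be:

# hypothetical reconstruction of the parse_data helper used above
def parse_data(document_file, vocabulary_file):
    type_to_index = {}
    index_to_type = {}
    for line in open(vocabulary_file, 'r'):
        tokens = line.strip().split()
        if not tokens or tokens[0] in type_to_index:
            continue
        index_to_type[len(type_to_index)] = tokens[0]
        type_to_index[tokens[0]] = len(type_to_index)

    documents = []
    for line in open(document_file, 'r'):
        # keep in-vocabulary tokens only, mapped to their integer ids
        documents.append([type_to_index[token]
                          for token in line.strip().split()
                          if token in type_to_index])

    return documents, type_to_index, index_to_type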
Example #3

# imports used by this example
import datetime
import os

def main():
    import option_parser;
    options = option_parser.parse_args();

    # parameter set 2
    assert(options.number_of_topics>0);
    number_of_topics = options.number_of_topics;
    assert(options.number_of_iterations>0);
    number_of_iterations = options.number_of_iterations;

    # parameter set 3
    alpha = 1.0/number_of_topics;
    if options.alpha>0:
        alpha=options.alpha;
    assert(options.beta>0);
    beta = options.beta;
    
    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert(options.snapshot_interval>0);
    if options.snapshot_interval>0:
        snapshot_interval=options.snapshot_interval;
    
    # parameter set 1
    assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);

    corpus_name = options.corpus_name;

    input_directory = options.input_directory;
    if not input_directory.endswith('/'):
        input_directory += '/';
    input_directory += corpus_name+'/';
        
    output_directory = options.output_directory;
    if not output_directory.endswith('/'):
        output_directory += '/';
    output_directory += corpus_name+'/';
     
    # create output directory
    now = datetime.datetime.now();
    output_directory += now.strftime("%y%b%d-%H%M%S")+"";
    #output_directory += "-" + str(now.microsecond) + "/";
    output_directory += "-cgs-K%d-I%d-a%g-b%g-S%d/" \
                        % (number_of_topics,
                           number_of_iterations,
                           alpha,
                           beta,
                           snapshot_interval);

    os.mkdir(os.path.abspath(output_directory));
    
    #dict_file = options.dictionary;
    #if dict_file != None:
        #dict_file = dict_file.strip();
        
    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (number_of_iterations));
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n");
    options_output_file.write("beta=" + str(beta) + "\n");
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" %(number_of_iterations);
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "beta=" + str(beta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval);
    print "========== ========== ========== ========== =========="

    documents, type_to_index, index_to_type = parse_data(input_directory+'doc.dat', input_directory+'voc.dat');
    print "successfully load all training documents..."

    import cgs;
    lda_inference = cgs.CollapsedGibbsSampling()
    lda_inference._initialize(documents, type_to_index, index_to_type, number_of_topics, alpha, beta);
    
    for iteration in xrange(number_of_iterations):
        lda_inference.sample();
        
        if (lda_inference._counter % snapshot_interval == 0):
            lda_inference.export_topic_term_distribution(output_directory + 'exp_beta-' + str(lda_inference._counter));
Example #4

# imports used by this example
import datetime
import os

def main():
    import option_parser;
    options = option_parser.parse_args();
    
    # parameter set 2
    assert(options.number_of_clusters>0);
    number_of_clusters = options.number_of_clusters;
    assert(options.number_of_iterations>0);
    number_of_iterations = options.number_of_iterations;
    
    # parameter set 3
    alpha_alpha = 1.0/number_of_clusters;
    if options.alpha_alpha>0:
        alpha_alpha=options.alpha_alpha;
    assert(options.alpha_beta>0);
    alpha_beta = options.alpha_beta;
    
    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert(options.snapshot_interval>0);
    if options.snapshot_interval>0:
        snapshot_interval=options.snapshot_interval;
    
    # parameter set 1
    #assert(options.dataset_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);
    
    input_directory = options.input_directory;
    input_directory = input_directory.rstrip("/");
    dataset_name = os.path.basename(input_directory);
    
    output_directory = options.output_directory;
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    output_directory = os.path.join(output_directory, dataset_name);
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
        
    # create output directory
    now = datetime.datetime.now();
    suffix = now.strftime("%y%b%d-%H%M%S");
    #output_directory += "-" + str(now.microsecond) + "/";
    suffix += "-naive_bayes_new"
    suffix += "-K%d" % (number_of_clusters)
    suffix += "-I%d" % (number_of_iterations)
    suffix += "-a%g" % (alpha_alpha)
    suffix += "-b%g" % (alpha_beta)
    suffix += "-S%d" % (snapshot_interval)
    
    output_directory = os.path.join(output_directory, suffix);
    os.mkdir(os.path.abspath(output_directory));
    
    # store all the options to a file
    options_output_file = open(os.path.join(output_directory, "option.txt"), 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("dataset_name=" + dataset_name + "\n");
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (number_of_iterations));
    options_output_file.write("number_of_clusters=" + str(number_of_clusters) + "\n");
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n");
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n");
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");
    
    options_output_file.close()
    
    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "dataset_name=" + dataset_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" %(number_of_iterations);
    print "number_of_clusters=" + str(number_of_clusters)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval);
    print "========== ========== ========== ========== =========="
    
    documents = parse_data(os.path.join(input_directory, 'data.dat'));
    print "successfully load all training documents..."
    
    from naive_bayes_new import monte_carlo
    naive_bayes = monte_carlo.MonteCarlo()
    naive_bayes._initialize(documents, number_of_clusters, alpha_alpha, alpha_beta);
    
    for iteration in xrange(number_of_iterations):
        naive_bayes.learning();
        
        if (naive_bayes._counter % snapshot_interval == 0):
            naive_bayes.export_model_snapshot(output_directory, input_directory);
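
Here parse_data takes a single data.dat file and returns only the documents. A plausible minimal version, assuming one whitespace-tokenised document per line, might be:

# hypothetical reconstruction: one tokenised document per line in data.dat
def parse_data(data_file):
    documents = []
    for line in open(data_file, 'r'):
        tokens = line.strip().split()
        if tokens:
            documents.append(tokens)
    return documents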
Example #5

# imports used by this example
import datetime
import os

def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    initial_number_of_clusters = options.initial_number_of_clusters
    if initial_number_of_clusters <= 0:
        initial_number_of_clusters = 10
    assert (options.number_of_iterations > 0)
    number_of_iterations = options.number_of_iterations

    # parameter set 3
    assert options.alpha_alpha > 0
    alpha_alpha = options.alpha_alpha
    assert (options.alpha_beta > 0)
    alpha_beta = options.alpha_beta

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert (options.snapshot_interval > 0)
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval

    # parameter set 1
    #assert(options.dataset_name!=None);
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    dataset_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, dataset_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S")
    #output_directory += "-" + str(now.microsecond) + "/";
    suffix += "-naive_bayes_dp"
    #suffix += "-K%d" % (initial_number_of_clusters)
    suffix += "-I%d" % (number_of_iterations)
    suffix += "-a%g" % (alpha_alpha)
    suffix += "-b%g" % (alpha_beta)
    suffix += "-S%d" % (snapshot_interval)

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(os.path.join(output_directory, "option.txt"),
                               'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("dataset_name=" + dataset_name + "\n")
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" %
                              (number_of_iterations))
    #options_output_file.write("initial_number_of_clusters=" + str(initial_number_of_clusters) + "\n");
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "dataset_name=" + dataset_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" % (number_of_iterations)
    #print "initial_number_of_clusters=" + str(initial_number_of_clusters)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval)
    print "========== ========== ========== ========== =========="

    documents = parse_data(os.path.join(input_directory, 'data.dat'))
    print "successfully load all training documents..."

    from naive_bayes_dp import monte_carlo
    naive_bayes = monte_carlo.MonteCarlo()
    naive_bayes._initialize(documents, alpha_alpha, alpha_beta,
                            initial_number_of_clusters)

    for iteration in xrange(number_of_iterations):
        naive_bayes.learning()

        if (naive_bayes._counter % snapshot_interval == 0):
            naive_bayes.export_model_snapshot(output_directory,
                                              input_directory)
Example #6

# imports used by this example
import datetime
import os

def main():
    import option_parser;
    options = option_parser.parse_args();

    # parameter set 2
    assert(options.number_of_topics>0);
    number_of_topics = options.number_of_topics;
    assert(options.number_of_iterations>0);
    number_of_iterations = options.number_of_iterations;

    # parameter set 3
    alpha = 1.0/number_of_topics;
    if options.alpha>0:
        alpha=options.alpha;
    
    #assert options.default_correlation_prior>0;
    #default_correlation_prior = options.default_correlation_prior;
    #assert options.positive_correlation_prior>0;
    #positive_correlation_prior = options.positive_correlation_prior;
    #assert options.negative_correlation_prior>0;
    #negative_correlation_prior = options.negative_correlation_prior;
    
    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    hybrid_mode = options.hybrid_mode;
    update_hyperparameter = options.update_hyperparameter;
    
    # parameter set 5
    assert(options.snapshot_interval>0);
    if options.snapshot_interval>0:
        snapshot_interval=options.snapshot_interval;
    
    # parameter set 1
    assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);
    assert(options.tree_name!=None);

    corpus_name = options.corpus_name;

    input_directory = options.input_directory;
    if not input_directory.endswith('/'):
        input_directory += '/';
    input_directory += corpus_name+'/';
        
    output_directory = options.output_directory;
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    if not output_directory.endswith('/'):
        output_directory += '/';
    output_directory += corpus_name+'/';
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);

    tree_name = options.tree_name.strip();

    # create output directory
    now = datetime.datetime.now();
    output_directory += now.strftime("%y%b%d-%H%M%S")+"";
    output_directory += "-prior_tree-K%d-I%d-a%g-S%d-%s-%s-%s/" \
                        % (number_of_topics,
                           number_of_iterations,
                           alpha,
                           snapshot_interval,
                           tree_name,
                           hybrid_mode,
                           update_hyperparameter);

    #output_directory += "-prior_tree_uvb-K%d-I%d-a%g-dcp%g-pcp%g-ncp%g-S%d/" \
                        #% (number_of_topics,
                           #number_of_iterations,
                           #alpha,
                           #default_correlation_prior,
                           #positive_correlation_prior,
                           #negative_correlation_prior,
                           #snapshot_interval);

    os.mkdir(os.path.abspath(output_directory));

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    options_output_file.write("tree_name=" + str(tree_name) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (number_of_iterations));
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n");
    #options_output_file.write("default_correlation_prior=" + str(default_correlation_prior) + "\n");
    #options_output_file.write("positive_correlation_prior=" + str(positive_correlation_prior) + "\n");
    #options_output_file.write("negative_correlation_prior=" + str(negative_correlation_prior) + "\n");
    # parameter set 4
    options_output_file.write("hybrid_mode=%s\n" % (hybrid_mode));
    options_output_file.write("update_hyperparameter=%s\n" % (update_hyperparameter));
    # parameter set 5
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "tree prior file=" + str(tree_name)
    # parameter set 2
    print "number_of_iterations=%d" %(number_of_iterations);
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    #print "default_correlation_prior=" + str(default_correlation_prior)
    #print "positive_correlation_prior=" + str(positive_correlation_prior)
    #print "negative_correlation_prior=" + str(negative_correlation_prior)
    # parameter set 4
    print "hybrid_mode=%s" % (hybrid_mode)
    print "update_hyperparameter=%s" % (update_hyperparameter);
    # parameter set 5
    print "snapshot_interval=" + str(snapshot_interval);
    print "========== ========== ========== ========== =========="

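    # choose the inference backend: hybrid inference or uncollapsed variational
    # Bayes, both constructed with the update_hyperparameter flag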
    if hybrid_mode:
        import hybrid;
        lda_inference = hybrid.Hybrid(update_hyperparameter);
        from hybrid import parse_data
    else:
        import uvb;
        lda_inference = uvb.UncollapsedVariationalBayes(update_hyperparameter);
        from uvb import parse_data
        
    documents, type_to_index, index_to_type, vocabulary = parse_data(input_directory+'doc.dat', input_directory+'voc.dat');
    print "successfully load all training documents..."

    # initialize tree
    import priortree
    prior_tree = priortree.PriorTree();
    #from vb.prior.tree.priortree import PriorTree;
    #prior_tree = PriorTree();
    #prior_tree._initialize(input_directory+"tree.wn.*", vocabulary, default_correlation_prior, positive_correlation_prior, negative_correlation_prior);
    prior_tree.initialize(input_directory+tree_name+".wn.*", input_directory+tree_name+".hyperparams", vocabulary)

    lda_inference._initialize(documents, prior_tree, type_to_index, index_to_type, number_of_topics, alpha);
    
    for iteration in xrange(number_of_iterations):
        lda_inference.train();
        
        if (lda_inference._counter % snapshot_interval == 0):
            lda_inference.export_topic_term_distribution(output_directory + 'exp_beta-' + str(lda_inference._counter));
Example #7

# imports used by this example
import datetime
import os
import time

import numpy

def main():
    import option_parser;
    options = option_parser.parse_args();

    # parameter set 2
    assert(options.number_of_documents>0);
    number_of_documents = options.number_of_documents;
    assert(options.number_of_topics>0);
    number_of_topics = options.number_of_topics;

    # parameter set 3
    assert(options.snapshot_interval>0);
    snapshot_interval=options.snapshot_interval;
    #assert(options.batch_size>0);
    batch_size = options.batch_size;
    #assert(number_of_documents % batch_size==0);
    online_iterations=number_of_documents/batch_size;
    if options.online_iterations>0:
        online_iterations=options.online_iterations;
    
    # parameter set 5
    hybrid_mode = options.hybrid_mode;
    hash_oov_words = options.hash_oov_words;

    # parameter set 1
    assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);

    corpus_name = options.corpus_name;

    input_directory = options.input_directory;
    #if not input_directory.endswith('/'):
        #input_directory += '/';
    input_directory = os.path.join(input_directory, corpus_name);
    #input_directory += corpus_name+'/';
        
    output_directory = options.output_directory;
    #if not output_directory.endswith('/'):
        #output_directory += '/';
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    #output_directory += corpus_name+'/';
    output_directory = os.path.join(output_directory, corpus_name);
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    
    # Documents
    train_docs = [];
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r');
    for line in input_file:
        train_docs.append(line.strip());
    print "successfully load all training documents..."
    
    # Vocabulary
    dictionary_file = options.dictionary;
    if dictionary_file==None:
        dictionary_file = os.path.join(input_directory, 'voc.dat');
    input_file = open(dictionary_file, 'r');
    vocab = [];
    for line in input_file:
        vocab.append(line.strip().split()[0]);
    vocab = list(set(vocab));
    print "successfully load all the words from %s..." % (dictionary_file);        
        
    # parameter set 4
    assert(options.tau>=0);
    tau = options.tau;
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert(options.kappa>=0 and options.kappa<=1);
    kappa = options.kappa;
    if batch_size<=0:
        print "warning: running in batch mode..."
        kappa = 0;
    alpha_theta = 1.0/number_of_topics;
    if options.alpha_theta>0:
        alpha_theta=options.alpha_theta;
    alpha_eta = 1.0/len(vocab);
    if options.alpha_eta>0:
        alpha_eta=options.alpha_eta
     
    # create output directory
    now = datetime.datetime.now();
    suffix = now.strftime("%y%b%d-%H%M%S")+"";
    suffix += "-%s" % ("fixvoc");
    suffix += "-D%d" % (number_of_documents);
    suffix += "-K%d" % (number_of_topics)
    suffix += "-I%d" % (snapshot_interval);
    suffix += "-B%d" % (batch_size);
    suffix += "-O%d" % (online_iterations);
    suffix += "-t%d" % (tau);
    suffix += "-k%g" % (kappa);
    suffix += "-at%g" % (alpha_theta);
    suffix += "-ae%g" % (alpha_eta);
    suffix += "-%s" % (hybrid_mode);
    suffix += "-%s" % (hash_oov_words);
    suffix += "/";
    
    output_directory = os.path.join(output_directory, suffix);
    
    os.mkdir(os.path.abspath(output_directory));
        
    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n");
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    # parameter set 3
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");
    options_output_file.write("batch_size=" + str(batch_size) + "\n");
    options_output_file.write("online_iterations=" + str(online_iterations) + "\n");
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n");
    options_output_file.write("kappa=" + str(kappa) + "\n");
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n");
    options_output_file.write("alpha_eta=" + str(alpha_eta) + "\n");
    # parameter set 5
    options_output_file.write("hybrid_mode=" + str(hybrid_mode) + "\n");
    options_output_file.write("hash_oov_words=%s\n" % hash_oov_words);
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "snapshot_interval=" + str(snapshot_interval);
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_eta=" + str(alpha_eta)
    # parameter set 5
    print "hybrid_mode=" + str(hybrid_mode)
    print "hash_oov_words=%s" % (hash_oov_words)
    print "========== ========== ========== ========== =========="

    if hybrid_mode:
        import hybrid;
        olda = hybrid.Hybrid(hash_oov_words);
    else:
        import variational;
        olda = variational.Variational(hash_oov_words);
        
    olda._initialize(vocab, number_of_documents, number_of_topics, alpha_theta, alpha_eta, tau, kappa);

    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 50);

    document_topic_distribution = None;

    for iteration in xrange(online_iterations):
        if batch_size<=0:
            docset = train_docs;
        else:
            docset = train_docs[(batch_size * iteration) % len(train_docs) : (batch_size * (iteration+1) - 1) % len(train_docs) + 1];
            print "select documents from %d to %d" % ((batch_size * iteration) % (number_of_documents), (batch_size * (iteration+1) - 1) % number_of_documents + 1)

        clock = time.time();
        
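        # one online update over the current mini-batch; batch_gamma holds the
        # per-document variational parameters and elbo is presumably the batch
        # objective reported by the model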
        batch_gamma, elbo = olda.learning(docset)

        if document_topic_distribution==None:
            document_topic_distribution = batch_gamma;
        else:
            document_topic_distribution = numpy.vstack((document_topic_distribution, batch_gamma));
            
        clock = time.time()-clock;
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (olda._counter, clock, olda._epsilon);

        # Save lambda, the parameters to the variational distributions over topics, and batch_gamma, the parameters to the variational distributions over topic weights for the articles analyzed in the last iteration.
        #if ((olda._counter+1) % snapshot_interval == 0):
            #olda.export_beta(output_directory + 'exp_beta-' + str(olda._counter+1));
        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(os.path.join(output_directory, 'exp_beta-' + str(olda._counter)), 50);
    
    gamma_path = os.path.join(output_directory, 'gamma.txt');
    numpy.savetxt(gamma_path, document_topic_distribution);
Example #8

# imports used by this example
import datetime
import os
import time

import numpy

def main():
    import option_parser;
    options = option_parser.parse_args();

    # parameter set 2
    assert(options.number_of_documents>0);
    number_of_documents = options.number_of_documents;
    assert(options.number_of_topics>0);
    number_of_topics = options.number_of_topics;
    assert(options.truncation_level>0);
    truncation_level = options.truncation_level;

    # parameter set 3
    assert(options.vocab_prune_interval>0);
    vocab_prune_interval = options.vocab_prune_interval;
    snapshot_interval = vocab_prune_interval;
    if options.snapshot_interval>0:
        snapshot_interval=options.snapshot_interval;
    #assert(options.batch_size>0);
    batch_size = options.batch_size;
    #assert(number_of_documents % batch_size==0);
    online_iterations=number_of_documents/batch_size;
    if options.online_iterations>0:
        online_iterations=options.online_iterations;

    # parameter set 4
    assert(options.tau>=0);
    tau = options.tau;
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert(options.kappa>=0 and options.kappa<=1);
    kappa = options.kappa;
    if batch_size<=0:
        print "warning: running in batch mode..."
        kappa = 0;
    alpha_theta = 1.0/number_of_topics;
    if options.alpha_theta>0:
        alpha_theta=options.alpha_theta;
    assert(options.alpha_beta>0);
    alpha_beta = options.alpha_beta;
    
    # parameter set 5
    #heldout_data = options.heldout_data;

    # parameter set 1
    assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);

    corpus_name = options.corpus_name;

    input_directory = options.input_directory;
    input_directory = os.path.join(input_directory, corpus_name);
        
    output_directory = options.output_directory;
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    output_directory = os.path.join(output_directory, corpus_name);
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
     
    # create output directory
    now = datetime.datetime.now();
    suffix = now.strftime("%y%b%d-%H%M%S")+"";
    suffix += "-D%d" % (number_of_documents);
    suffix += "-K%d" % (number_of_topics)
    suffix += "-T%d" % (truncation_level);
    suffix += "-P%d" % (vocab_prune_interval);
    suffix += "-I%d" % (snapshot_interval);
    suffix += "-B%d" % (batch_size);
    suffix += "-O%d" % (online_iterations);
    suffix += "-t%d" % (tau);
    suffix += "-k%g" % (kappa);
    suffix += "-at%g" % (alpha_theta);
    suffix += "-ab%g" % (alpha_beta);
    suffix += "/";
    '''
    suffix += "-D%d-K%d-T%d-P%d-S%d-B%d-O%d-t%d-k%g-at%g-ab%g/" % (number_of_documents,
                                                                   number_of_topics,
                                                                   truncation_level,
                                                                   vocab_prune_interval,
                                                                   snapshot_interval,
                                                                   batch_size,
                                                                   online_iterations,
                                                                   tau,
                                                                   kappa,
                                                                   alpha_theta,
                                                                   alpha_beta);
    '''
    output_directory = os.path.join(output_directory, suffix);
    os.mkdir(os.path.abspath(output_directory));
    
    dictionary_file = options.dictionary;
    if dictionary_file != None:
        dictionary_file = dictionary_file.strip();
        
    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n");
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    options_output_file.write("truncation_level=" + str(truncation_level) + "\n");
    # parameter set 3
    options_output_file.write("vocab_prune_interval=" + str(vocab_prune_interval) + "\n");
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");
    options_output_file.write("batch_size=" + str(batch_size) + "\n");
    options_output_file.write("online_iterations=" + str(online_iterations) + "\n");
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n");
    options_output_file.write("kappa=" + str(kappa) + "\n");
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n");
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n");
    # parameter set 5    
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()
    
    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    print "truncation_level=" + str(truncation_level)
    # parameter set 3
    print "vocab_prune_interval=" + str(vocab_prune_interval)
    print "snapshot_interval=" + str(snapshot_interval);
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 5
    #print "heldout_data=" + str(heldout_data)
    print "========== ========== ========== ========== =========="

    # Vocabulary
    #file = open(input_directory+'voc.dat', 'r');
    # Seed the vocabulary
    vocab = ['team'];

    # Documents
    train_docs = [];
    file = open(os.path.join(input_directory, 'doc.dat'), 'r');
    for line in file:
        train_docs.append(line.strip());
    print "successfully load all training documents..."

    import hybrid;
    olda = hybrid.Hybrid(3, 20, dictionary_file)

    olda._initialize(vocab,
                     number_of_topics,
                     number_of_documents,
                     batch_size,
                     truncation_level,
                     alpha_theta,
                     alpha_beta,
                     tau,
                     kappa,
                     vocab_prune_interval,
                     True
                     );
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 100);
    
    document_topic_distribution = None;

    # Run until we've seen number_of_documents documents. (Feel free to interrupt *much* sooner than this.)
    for iteration in xrange(online_iterations):
        if batch_size<=0:
            docset = train_docs;
        else:
            docset = train_docs[(batch_size * iteration) % len(train_docs) : (batch_size * (iteration+1) - 1) % len(train_docs) + 1];
            print "select documents from %d to %d" % ((batch_size * iteration) % (number_of_documents), (batch_size * (iteration+1) - 1) % number_of_documents + 1)

        clock = time.time();

        batch_gamma = olda.learning(docset);
        
        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(os.path.join(output_directory, 'exp_beta-' + str(olda._counter)), 50);

        if document_topic_distribution==None:
            document_topic_distribution = batch_gamma;
        else:
            document_topic_distribution = numpy.vstack((document_topic_distribution, batch_gamma));
        
        clock = time.time()-clock;
        print "vocabulary size = %s" % (olda._truncation_size);
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (olda._counter, clock, olda._epsilon);

    gamma_path = os.path.join(output_directory, "gamma.txt");
    numpy.savetxt(gamma_path, document_topic_distribution);

Example #9

# imports used by this example
import datetime
import os

def main():
    import option_parser;
    options = option_parser.parse_args();

    # parameter set 2
    assert(options.number_of_topics>0);
    number_of_topics = options.number_of_topics;
    assert(options.number_of_iterations>0);
    number_of_iterations = options.number_of_iterations;

    # parameter set 3
    alpha = 1.0/number_of_topics;
    if options.alpha>0:
        alpha=options.alpha;
    assert(options.eta>0);
    eta = options.eta;
    
    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert(options.snapshot_interval>0);
    if options.snapshot_interval>0:
        snapshot_interval=options.snapshot_interval;
    
    # parameter set 1
    assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);

    corpus_name = options.corpus_name;

    input_directory = options.input_directory;
    if not input_directory.endswith('/'):
        input_directory += '/';
    input_directory += corpus_name+'/';
        
    output_directory = options.output_directory;
    if not output_directory.endswith('/'):
        output_directory += '/';
    output_directory += corpus_name+'/';
     
    # create output directory
    now = datetime.datetime.now();
    output_directory += now.strftime("%y%b%d-%H%M%S")+"";
    output_directory += "-hybrid-K%d-I%d-a%g-e%g-S%d/" \
                        % (number_of_topics,
                           number_of_iterations,
                           alpha,
                           eta,
                           snapshot_interval);

    os.mkdir(os.path.abspath(output_directory));
    
    #dict_file = options.dictionary;
    #if dict_file != None:
        #dict_file = dict_file.strip();
        
    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (number_of_iterations));
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n");
    options_output_file.write("eta=" + str(eta) + "\n");
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" %(number_of_iterations);
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "eta=" + str(eta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval);
    print "========== ========== ========== ========== =========="

    documents, type_to_index, index_to_type = parse_data(input_directory+'doc.dat', input_directory+'voc.dat');
    print "successfully load all training documents..."

    import hybrid;
    lda_inference = hybrid.Hybrid();
    lda_inference._initialize(documents, type_to_index, index_to_type, number_of_topics, alpha, eta);
    
    for iteration in xrange(number_of_iterations):
        lda_inference.learn();
        
        if (lda_inference._counter % snapshot_interval == 0):
            lda_inference.export_topic_term_distribution(output_directory + 'exp_beta-' + str(lda_inference._counter));
Example #10

# imports used by this example
import datetime
import os
import time

import numpy

def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert (options.number_of_documents > 0)
    number_of_documents = options.number_of_documents
    assert (options.number_of_topics > 0)
    number_of_topics = options.number_of_topics

    # parameter set 3
    assert (options.snapshot_interval > 0)
    snapshot_interval = options.snapshot_interval
    #assert(options.batch_size>0);
    batch_size = options.batch_size
    #assert(number_of_documents % batch_size==0);
    online_iterations = number_of_documents / batch_size
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 5
    hybrid_mode = options.hybrid_mode
    hash_oov_words = options.hash_oov_words

    # parameter set 1
    assert (options.corpus_name != None)
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    #if not input_directory.endswith('/'):
    #input_directory += '/';
    input_directory = os.path.join(input_directory, corpus_name)
    #input_directory += corpus_name+'/';

    output_directory = options.output_directory
    #if not output_directory.endswith('/'):
    #output_directory += '/';
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    #output_directory += corpus_name+'/';
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Documents
    train_docs = []
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in input_file:
        train_docs.append(line.strip())
    print "successfully load all training documents..."

    # Vocabulary
    dictionary_file = options.dictionary
    if dictionary_file == None:
        dictionary_file = os.path.join(input_directory, 'voc.dat')
    input_file = open(dictionary_file, 'r')
    vocab = []
    for line in input_file:
        vocab.append(line.strip().split()[0])
    vocab = list(set(vocab))
    print "successfully load all the words from %s..." % (dictionary_file)

    # parameter set 4
    assert (options.tau >= 0)
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    alpha_eta = 1.0 / len(vocab)
    if options.alpha_eta > 0:
        alpha_eta = options.alpha_eta

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    suffix += "-%s" % ("fixvoc")
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ae%g" % (alpha_eta)
    suffix += "-%s" % (hybrid_mode)
    suffix += "-%s" % (hash_oov_words)
    suffix += "/"

    output_directory = os.path.join(output_directory, suffix)

    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" +
                              str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) +
                              "\n")
    # parameter set 3
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) +
                              "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_eta=" + str(alpha_eta) + "\n")
    # parameter set 5
    options_output_file.write("hybrid_mode=" + str(hybrid_mode) + "\n")
    options_output_file.write("hash_oov_words=%s\n" % hash_oov_words)
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_eta=" + str(alpha_eta)
    # parameter set 5
    print "hybrid_mode=" + str(hybrid_mode)
    print "hash_oov_words=%s" % (hash_oov_words)
    print "========== ========== ========== ========== =========="

    if hybrid_mode:
        import hybrid
        olda = hybrid.Hybrid(hash_oov_words)
    else:
        import variational
        olda = variational.Variational(hash_oov_words)

    olda._initialize(vocab, number_of_documents, number_of_topics, alpha_theta,
                     alpha_eta, tau, kappa)

    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 50)

    document_topic_distribution = None

    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) %
                                len(train_docs):(batch_size *
                                                 (iteration + 1) - 1) %
                                len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % (number_of_documents),
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()

        batch_gamma, elbo = olda.learning(docset)

        if document_topic_distribution == None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack(
                (document_topic_distribution, batch_gamma))

        clock = time.time() - clock
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (
            olda._counter, clock, olda._epsilon)

        # Save lambda, the parameters to the variational distributions over topics, and batch_gamma, the parameters to the variational distributions over topic weights for the articles analyzed in the last iteration.
        #if ((olda._counter+1) % snapshot_interval == 0):
        #olda.export_beta(output_directory + 'exp_beta-' + str(olda._counter+1));
        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(
                os.path.join(output_directory,
                             'exp_beta-' + str(olda._counter)), 50)

    gamma_path = os.path.join(output_directory, 'gamma.txt')
    numpy.savetxt(gamma_path, document_topic_distribution)