Example #1
def main():
    options = parse_args();

    # parameter set 2
    assert(options.number_of_topics > 0);
    number_of_topics = options.number_of_topics;
    assert(options.training_iterations > 0);
    training_iterations = options.training_iterations;

    # parameter set 3
    alpha_alpha = 1.0 / number_of_topics;
    if options.alpha_alpha > 0:
        alpha_alpha = options.alpha_alpha;
    
    # assert options.default_correlation_prior>0;
    # default_correlation_prior = options.default_correlation_prior;
    # assert options.positive_correlation_prior>0;
    # positive_correlation_prior = options.positive_correlation_prior;
    # assert options.negative_correlation_prior>0;
    # negative_correlation_prior = options.negative_correlation_prior;
    
    # parameter set 4
    # disable_alpha_theta_update = options.disable_alpha_theta_update;
    inference_mode = options.inference_mode;
    #update_hyperparameter = options.update_hyperparameter;
    
    # parameter set 5
    assert(options.snapshot_interval > 0);
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval;
    
    # parameter set 1
    # assert(options.corpus_name!=None);
    assert(options.input_directory != None);
    assert(options.output_directory != None);
    
    assert(options.tree_name != None);
    tree_name = options.tree_name;
    
    input_directory = options.input_directory;
    input_directory = input_directory.rstrip("/");
    corpus_name = os.path.basename(input_directory);
    
    output_directory = options.output_directory;
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    output_directory = os.path.join(output_directory, corpus_name);
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    
    # Document
    train_docs_path = os.path.join(input_directory, 'train.dat')
    input_doc_stream = open(train_docs_path, 'r');
    train_docs = [];
    for line in input_doc_stream:
        train_docs.append(line.strip().lower());
    print "successfully load all training docs from %s..." % (os.path.abspath(train_docs_path));
    
    # Vocabulary
    vocabulary_path = os.path.join(input_directory, 'voc.dat');
    input_voc_stream = open(vocabulary_path, 'r');
    vocab = [];
    for line in input_voc_stream:
        vocab.append(line.strip().lower().split()[0]);
    vocab = list(set(vocab));
    print "successfully load all the words from %s..." % (os.path.abspath(vocabulary_path));

    '''
    # create output directory
    now = datetime.datetime.now();
    output_directory += now.strftime("%y%b%d-%H%M%S") + "";
    output_directory += "-prior_tree-K%d-I%d-a%g-S%d-%s-%s-%s/" \
                        % (number_of_topics,
                           training_iterations,
                           alpha_alpha,
                           snapshot_interval,
                           tree_name,
                           inference_mode,
                           update_hyperparameter);
    '''

    # create output directory
    now = datetime.datetime.now();
    suffix = now.strftime("%y%m%d-%H%M%S") + "";
    suffix += "-%s" % ("lda");
    suffix += "-I%d" % (training_iterations);
    suffix += "-S%d" % (snapshot_interval);
    suffix += "-K%d" % (number_of_topics);
    suffix += "-aa%f" % (alpha_alpha);
    #suffix += "-ab%f" % (alpha_beta);
    suffix += "-im%d" % (inference_mode);
    # suffix += "-%s" % (resample_topics);
    # suffix += "-%s" % (hash_oov_words);
    suffix += "/";
    
    output_directory = os.path.join(output_directory, suffix);
    os.mkdir(os.path.abspath(output_directory));

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    options_output_file.write("tree_name=" + str(tree_name) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (training_iterations));
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n");
    # options_output_file.write("default_correlation_prior=" + str(default_correlation_prior) + "\n");
    # options_output_file.write("positive_correlation_prior=" + str(positive_correlation_prior) + "\n");
    # options_output_file.write("negative_correlation_prior=" + str(negative_correlation_prior) + "\n");
    # parameter set 4
    options_output_file.write("inference_mode=%d\n" % (inference_mode));
    #options_output_file.write("update_hyperparameter=%s\n" % (update_hyperparameter));
    # parameter set 5
    #options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "tree prior file=" + str(tree_name)
    # parameter set 2
    print "training_iterations=%d" % (training_iterations);
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    # print "default_correlation_prior=" + str(default_correlation_prior)
    # print "positive_correlation_prior=" + str(positive_correlation_prior)
    # print "negative_correlation_prior=" + str(negative_correlation_prior)
    # parameter set 4
    print "inference_mode=%d" % (inference_mode)
    #print "update_hyperparameter=%s" % (update_hyperparameter);
    # parameter set 5
    #print "snapshot_interval=" + str(snapshot_interval);
    print "========== ========== ========== ========== =========="

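    # select the inferencer from inference_mode: 0 = hybrid, 1 = Monte Carlo
    # (not implemented here), 2 = variational Bayes; any other value aborts.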
    if inference_mode==0:
        import hybrid;
        lda_inferencer = hybrid.Hybrid();
        #lda_inferencer = hybrid.Hybrid(update_hyperparameter);
        #import hybrid.parse_data as parse_data
    elif inference_mode==1:
        #import monte_carlo
        #lda_inferencer = monte_carlo.MonteCarlo();
        sys.stderr.write("warning: monte carlo inference method is not implemented yet...\n");
        pass
    elif inference_mode==2:
        #from prior.tree import variational_bayes
        #lda_inferencer = variational_bayes.VariationalBayes();
        import variational_bayes
        lda_inferencer = variational_bayes.VariationalBayes();
        #lda_inferencer = variational_bayes.VariationalBayes(update_hyperparameter);
        #from variational_bayes import parse_data
    else:
        sys.stderr.write("error: unrecognized inference mode %d...\n" % (inference_mode));
        return;
    
    # initialize tree
    import priortree
    prior_tree = priortree.VocabTreePrior();
    # from vb.prior.tree.priortree import VocabTreePrior;
    # prior_tree = VocabTreePrior();
    # prior_tree._initialize(input_directory+"tree.wn.*", vocab, default_correlation_prior, positive_correlation_prior, negative_correlation_prior);
    prior_tree._initialize(os.path.join(input_directory, tree_name + ".wn.*"), os.path.join(input_directory, tree_name + ".hyperparams"), vocab)

    lda_inferencer._initialize(train_docs, vocab, prior_tree, number_of_topics, alpha_alpha);
    
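    # run the inferencer for training_iterations sweeps, exporting exp_beta and
    # pickling a model snapshot every snapshot_interval iterations.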
    for iteration in xrange(training_iterations):
        lda_inferencer.learning();
        
        if (lda_inferencer._counter % snapshot_interval == 0):
            lda_inferencer.export_beta(os.path.join(output_directory, 'exp_beta-' + str(lda_inferencer._counter)));
            model_snapshot_path = os.path.join(output_directory, 'model-' + str(lda_inferencer._counter));
            cPickle.dump(lda_inferencer, open(model_snapshot_path, 'wb'));
    
    model_snapshot_path = os.path.join(output_directory, 'model-' + str(lda_inferencer._counter));
    cPickle.dump(lda_inferencer, open(model_snapshot_path, 'wb'));
Example #2
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert (options.number_of_documents > 0)
    number_of_documents = options.number_of_documents
    assert (options.number_of_topics > 0)
    number_of_topics = options.number_of_topics
    assert (options.truncation_level > 0)
    truncation_level = options.truncation_level

    # parameter set 3
    assert (options.vocab_prune_interval > 0)
    vocab_prune_interval = options.vocab_prune_interval
    snapshot_interval = vocab_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    #assert(options.batch_size>0);
    batch_size = options.batch_size
    #assert(number_of_documents % batch_size==0);
    online_iterations = number_of_documents / batch_size
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 4
    assert (options.tau >= 0)
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    assert (options.alpha_beta > 0)
    alpha_beta = options.alpha_beta

    # parameter set 5
    #heldout_data = options.heldout_data;

    # parameter set 1
    assert (options.corpus_name != None)
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    input_directory = os.path.join(input_directory, corpus_name)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-T%d" % (truncation_level)
    suffix += "-P%d" % (vocab_prune_interval)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ab%g" % (alpha_beta)
    suffix += "/"
    '''
    suffix += "-D%d-K%d-T%d-P%d-S%d-B%d-O%d-t%d-k%g-at%g-ab%g/" % (number_of_documents,
                                                                   number_of_topics,
                                                                   truncation_level,
                                                                   vocab_prune_interval,
                                                                   snapshot_interval,
                                                                   batch_size,
                                                                   online_iterations,
                                                                   tau,
                                                                   kappa,
                                                                   alpha_theta,
                                                                   alpha_beta);
    '''
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    dictionary_file = options.dictionary
    if dictionary_file != None:
        dictionary_file = dictionary_file.strip()

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" +
                              str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) +
                              "\n")
    options_output_file.write("truncation_level=" + str(truncation_level) +
                              "\n")
    # parameter set 3
    options_output_file.write("vocab_prune_interval=" +
                              str(vocab_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) +
                              "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    print "truncation_level=" + str(truncation_level)
    # parameter set 3
    print "vocab_prune_interval=" + str(vocab_prune_interval)
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 5
    #print "heldout_data=" + str(heldout_data)
    print "========== ========== ========== ========== =========="

    # Vocabulary
    #file = open(input_directory+'voc.dat', 'r');
    # Seed the vocabulary
    vocab = ['team']

    # Documents
    train_docs = []
    file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in file:
        train_docs.append(line.strip())
    print "successfully load all training documents..."

    import hybrid
    olda = hybrid.Hybrid(3, 20, dictionary_file)

    olda._initialize(vocab, number_of_topics, number_of_documents, batch_size,
                     truncation_level, alpha_theta, alpha_beta, tau, kappa,
                     vocab_prune_interval, True)
    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 100)

    document_topic_distribution = None

    # Run until we've seen number_of_documents documents. (Feel free to interrupt *much* sooner than this.)
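    # Each pass slices a batch_size window out of train_docs with modular
    # indexing, so the stream wraps around the corpus; batch_size <= 0 switches
    # to batch mode and every pass uses the full document set.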
    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) %
                                len(train_docs):(batch_size *
                                                 (iteration + 1) - 1) %
                                len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % (number_of_documents),
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()

        batch_gamma = olda.learning(docset)

        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(
                os.path.join(output_directory,
                             'exp_beta-' + str(olda._counter)), 50)

        if document_topic_distribution is None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack(
                (document_topic_distribution, batch_gamma))

        clock = time.time() - clock
        print "vocabulary size = %s" % (olda._truncation_size)
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (
            olda._counter, clock, olda._epsilon)

    gamma_path = os.path.join(output_directory, "gamma.txt")
    numpy.savetxt(gamma_path, document_topic_distribution)
Example #3
def main():
    options = parse_args()

    # parameter set 1
    #assert(options.corpus_name!=None);
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    assert (options.grammar_file != None)
    grammar_file = options.grammar_file
    assert (os.path.exists(grammar_file))

    # Documents
    train_docs = []
    input_stream = open(os.path.join(input_directory, 'train.dat'), 'r')
    for line in input_stream:
        train_docs.append(line.strip())
    input_stream.close()
    print("successfully load all training documents...")

    # parameter set 2
    if options.number_of_documents > 0:
        number_of_documents = options.number_of_documents
    else:
        number_of_documents = len(train_docs)
    if options.batch_size > 0:
        batch_size = options.batch_size
    else:
        batch_size = number_of_documents
    #assert(number_of_documents % batch_size==0);
    training_iterations = number_of_documents // batch_size
    if options.training_iterations > 0:
        training_iterations = options.training_iterations
    #training_iterations=int(math.ceil(1.0*number_of_documents/batch_size));
    #multiprocesses = options.multiprocesses;
    assert (options.number_of_processes >= 0)
    number_of_processes = options.number_of_processes

    # parameter set 3
    assert (options.grammaton_prune_interval > 0)
    grammaton_prune_interval = options.grammaton_prune_interval
    snapshot_interval = grammaton_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    assert (options.tau >= 0)
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print("warning: running in batch mode...")
        kappa = 0

    # read in adaptor grammars
    desired_truncation_level = {}
    alpha_pi = {}
    beta_pi = {}

    grammar_rules = []
    adapted_non_terminals = set()
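    # grammar file format: lines starting with "%" are comments; lines starting
    # with "@" declare an adapted non-terminal followed by its truncation level
    # and its alpha_pi / beta_pi hyperparameters; all remaining lines are
    # grammar rules.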
    #for line in codecs.open(grammar_file, 'r', encoding='utf-8'):
    for line in open(grammar_file, 'r'):
        line = line.strip()
        if line.startswith("%"):
            continue
        if line.startswith("@"):
            tokens = line.split()
            assert (len(tokens) == 5)
            adapted_non_terminal = nltk.Nonterminal(tokens[1])
            adapted_non_terminals.add(adapted_non_terminal)
            desired_truncation_level[adapted_non_terminal] = int(tokens[2])
            alpha_pi[adapted_non_terminal] = float(tokens[3])
            beta_pi[adapted_non_terminal] = float(tokens[4])
            continue
        grammar_rules.append(line)
    grammar_rules = "\n".join(grammar_rules)

    # Warning: if you are using nltk 2.x, please use parse_grammar()
    #from nltk.grammar import parse_grammar, standard_nonterm_parser
    #start, productions = parse_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    from nltk.grammar import read_grammar, standard_nonterm_parser
    start, productions = read_grammar(grammar_rules,
                                      standard_nonterm_parser,
                                      probabilistic=False)
    print("start, productions: ", start, productions)
    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    #desired_truncation_level_string = "".join(["%s%d" % (symbol, desired_truncation_level[symbol]) for symbol in desired_truncation_level]);
    #alpha_pi_string = "".join(["%s%d" % (symbol, alpha_pi[symbol]) for symbol in alpha_pi]);
    #beta_pi_string = "".join(["%s%d" % (symbol, beta_pi[symbol]) for symbol in beta_pi]);
    #output_directory += "-" + str(now.microsecond) + "/";
    suffix += "-D%d-P%d-S%d-B%d-O%d-t%d-k%g-G%s/" % (
        number_of_documents,
        #number_of_topics,
        grammaton_prune_interval,
        snapshot_interval,
        batch_size,
        training_iterations,
        tau,
        kappa,
        #alpha_theta,
        #alpha_pi_string,
        #beta_pi_string,
        #desired_truncation_level_string,
        os.path.basename(grammar_file))

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("grammar_file=" + str(grammar_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_processes=" +
                              str(number_of_processes) + "\n")
    #options_output_file.write("multiprocesses=" + str(multiprocesses) + "\n");
    options_output_file.write("number_of_documents=" +
                              str(number_of_documents) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("training_iterations=" +
                              str(training_iterations) + "\n")

    # parameter set 3
    options_output_file.write("grammaton_prune_interval=" +
                              str(grammaton_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")

    # parameter set 4
    #options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n");
    options_output_file.write("alpha_pi=%s\n" % alpha_pi)
    options_output_file.write("beta_pi=%s\n" % beta_pi)
    options_output_file.write("desired_truncation_level=%s\n" %
                              desired_truncation_level)
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print("========== ========== ========== ========== ==========")
    # parameter set 1
    print("output_directory=" + output_directory)
    print("input_directory=" + input_directory)
    print("corpus_name=" + corpus_name)
    print("grammar_file=" + str(grammar_file))

    # parameter set 2
    print("number_of_documents=" + str(number_of_documents))
    print("batch_size=" + str(batch_size))
    print("training_iterations=" + str(training_iterations))
    print("number_of_processes=" + str(number_of_processes))
    #print("multiprocesses=" + str(multiprocesses)

    # parameter set 3
    print("grammaton_prune_interval=" + str(grammaton_prune_interval))
    print("snapshot_interval=" + str(snapshot_interval))
    print("tau=" + str(tau))
    print("kappa=" + str(kappa))

    # parameter set 4
    #print("alpha_theta=" + str(alpha_theta)
    print("alpha_pi=%s" % alpha_pi)
    print("beta_pi=%s" % beta_pi)
    print("desired_truncation_level=%s" % desired_truncation_level)
    # parameter set 5
    #print("heldout_data=" + str(heldout_data)
    print("========== ========== ========== ========== ==========")

    import hybrid
    print("passing prodcutions = : ", productions)
    adagram_inferencer = hybrid.Hybrid(start, productions,
                                       adapted_non_terminals)

    adagram_inferencer._initialize(number_of_documents, batch_size, tau, kappa,
                                   alpha_pi, beta_pi, None,
                                   desired_truncation_level,
                                   grammaton_prune_interval)
    '''
    clock_iteration = time.time();
    clock_e_step, clock_m_step = adagram_inferencer.seed(train_docs);
    clock_iteration = time.time()-clock_iteration;
    print('E-step, M-step and Seed take %g, %g and %g seconds respectively...' % (clock_e_step, clock_m_step, clock_iteration))
    '''

    #adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-0"))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-0"))

    random.shuffle(train_docs)
    training_clock = time.time()
    snapshot_clock = time.time()
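    # stream over train_docs in mini-batches; whenever a window would run past
    # the end of the corpus, take the remaining tail, reshuffle the corpus, and
    # wrap around to fill the batch from the beginning.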
    for iteration in range(int(training_iterations)):
        start_index = batch_size * iteration
        end_index = batch_size * (iteration + 1)
        if start_index // number_of_documents < end_index // number_of_documents:
            #train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents) :] + train_docs[: (batch_size * (iteration+1)) % (number_of_documents)];
            train_doc_set = train_docs[(batch_size * iteration) %
                                       (number_of_documents):]
            random.shuffle(train_docs)
            train_doc_set += train_docs[:(batch_size * (iteration + 1)) %
                                        (number_of_documents)]
        else:
            train_doc_set = train_docs[(batch_size * iteration) %
                                       (number_of_documents):
                                       (batch_size *
                                        (iteration + 1)) % number_of_documents]

        clock_iteration = time.time()
        #print("processing document:", train_doc_set
        clock_e_step, clock_m_step = adagram_inferencer.learning(
            train_doc_set, number_of_processes)

        if (iteration + 1) % snapshot_interval == 0:
            #pickle_file = open(os.path.join(output_directory, "model-%d" % (adagram_inferencer._counter+1)), 'wb');
            #pickle.dump(adagram_inferencer, pickle_file);
            #pickle_file.close();
            adagram_inferencer.export_adaptor_grammar(
                os.path.join(output_directory, "infag-" + str(
                    (iteration + 1))))
            #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str((iteration+1))))

        if (iteration + 1) % 1000 == 0:
            snapshot_clock = time.time() - snapshot_clock
            print('Processing 1000 mini-batches take %g seconds...' %
                  (snapshot_clock))
            snapshot_clock = time.time()

        clock_iteration = time.time() - clock_iteration
        print(
            'E-step, M-step and iteration %d take %g, %g and %g seconds respectively...'
            % (adagram_inferencer._counter, clock_e_step, clock_m_step,
               clock_iteration))

    adagram_inferencer.export_adaptor_grammar(
        os.path.join(output_directory,
                     "infag-" + str(adagram_inferencer._counter + 1)))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str((iteration+1))))

    pickle_file = open(
        os.path.join(output_directory, "model-%d" % (iteration + 1)), 'wb')
    pickle.dump(adagram_inferencer, pickle_file)
    pickle_file.close()

    training_clock = time.time() - training_clock
    print('Training finished in %g seconds...' % (training_clock))
Example #4
		MSE_error[trajIdx, 5] = np.std(np.multiply(outDR, outDR))/np.sqrt(numTrials)
		MSE_error[trajIdx, 6] = np.std(np.multiply(outWDR, outWDR))/np.sqrt(numTrials)
		MSE_error[trajIdx, 7] = np.std(np.multiply(outMagic, outMagic))/np.sqrt(numTrials)
	# Let's now write the MSE results to a file
	# np.savetxt(filename + '_MSE.txt', MSE)
	# np.savetxt(filename+'_MSE_error.txt', MSE_error)
	np.savetxt('condition_number_hybrid.txt', cond_tot)

if __name__ == '__main__':
	print 'Running 50 trials for the given MDP model...'

	trueHorizon = False
	delta = 0.1

	# env = modelfail.ModelFail()
	# print 'Generating data for ModelFail MDP...'
	# compute_data(env, 1, 2, 'out_ModelFail', 15, delta)

	# env = modelwin.ModelWin()
	# print 'Generating data for ModelWin MDP...'
	# compute_data(env, 1, 2, 'out_MSE_ModelWin', 15, delta)

	env = hybrid.Hybrid()
	print 'Generating data for Hybrid Domain MDP...'
	compute_data(env, 1, 2, 'out_HybridDomain', 13, delta)

	# env = gridworld.Gridworld(trueHorizon)
	# print 'Generating data for Gridworld MDP p4 p5...'
	# compute_data(env, 4, 5, 'out_GridWorld_p4p5', 11, delta)

	print 'Done...'
Example #5
def main():
    options = parse_args()

    # parameter set 2
    assert (options.number_of_topics > 0)
    number_of_topics = options.number_of_topics
    assert (options.training_iterations > 0)
    training_iterations = options.training_iterations
    assert (options.snapshot_interval > 0)
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval

    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    inference_mode = options.inference_mode

    # parameter set 1
    #assert(options.corpus_name!=None);
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Document
    train_docs_path = os.path.join(input_directory, 'train.dat')
    input_doc_stream = open(train_docs_path, 'r')
    train_docs = []
    for line in input_doc_stream:
        train_docs.append(line.strip().lower())
    print "successfully load all training docs from %s..." % (
        os.path.abspath(train_docs_path))

    # Vocabulary
    vocabulary_path = os.path.join(input_directory, 'voc.dat')
    input_voc_stream = open(vocabulary_path, 'r')
    vocab = []
    for line in input_voc_stream:
        vocab.append(line.strip().lower().split()[0])
    vocab = list(set(vocab))
    print "successfully load all the words from %s..." % (
        os.path.abspath(vocabulary_path))

    # parameter set 3
    alpha_alpha = 1.0 / number_of_topics
    if options.alpha_alpha > 0:
        alpha_alpha = options.alpha_alpha
    alpha_beta = options.alpha_beta
    if alpha_beta <= 0:
        alpha_beta = 1.0 / len(vocab)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%m%d-%H%M%S") + ""
    suffix += "-%s" % ("lda")
    suffix += "-I%d" % (training_iterations)
    suffix += "-S%d" % (snapshot_interval)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-aa%f" % (alpha_alpha)
    suffix += "-ab%f" % (alpha_beta)
    suffix += "-im%d" % (inference_mode)
    # suffix += "-%s" % (resample_topics);
    # suffix += "-%s" % (hash_oov_words);
    suffix += "/"

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    #dict_file = options.dictionary;
    #if dict_file != None:
    #dict_file = dict_file.strip();

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    #options_output_file.write("vocabulary_path=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("training_iterations=%d\n" %
                              (training_iterations))
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) +
                              "\n")
    # parameter set 3
    options_output_file.write("alpha_alpha=" + str(alpha_alpha) + "\n")
    options_output_file.write("alpha_beta=" + str(alpha_beta) + "\n")
    # parameter set 4
    options_output_file.write("inference_mode=%d\n" % (inference_mode))
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "training_iterations=%d" % (training_iterations)
    print "snapshot_interval=" + str(snapshot_interval)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha_alpha=" + str(alpha_alpha)
    print "alpha_beta=" + str(alpha_beta)
    # parameter set 4
    print "inference_mode=%d" % (inference_mode)
    print "========== ========== ========== ========== =========="

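    # pick the inference algorithm: 0 = hybrid, 1 = Monte Carlo sampling,
    # 2 = variational Bayes; any other value is rejected.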
    if inference_mode == 0:
        import hybrid
        lda_inferencer = hybrid.Hybrid()
    elif inference_mode == 1:
        import monte_carlo
        lda_inferencer = monte_carlo.MonteCarlo()
    elif inference_mode == 2:
        import variational_bayes
        lda_inferencer = variational_bayes.VariationalBayes()
    else:
        sys.stderr.write("error: unrecognized inference mode %d...\n" %
                         (inference_mode))
        return

    lda_inferencer._initialize(train_docs, vocab, number_of_topics,
                               alpha_alpha, alpha_beta)

    for iteration in xrange(training_iterations):
        lda_inferencer.learning()

        if (lda_inferencer._counter % snapshot_interval == 0):
            lda_inferencer.export_beta(output_directory + 'exp_beta-' +
                                       str(lda_inferencer._counter))

    model_snapshot_path = os.path.join(output_directory,
                                       'model-' + str(lda_inferencer._counter))
    cPickle.dump(lda_inferencer, open(model_snapshot_path, 'wb'))
Example #6
def main():
    import option_parser;
    options = option_parser.parse_args();

    # parameter set 2
    assert(options.number_of_topics>0);
    number_of_topics = options.number_of_topics;
    assert(options.number_of_iterations>0);
    number_of_iterations = options.number_of_iterations;

    # parameter set 3
    alpha = 1.0/number_of_topics;
    if options.alpha>0:
        alpha=options.alpha;
    assert(options.eta>0);
    eta = options.eta;
    
    # parameter set 4
    #disable_alpha_theta_update = options.disable_alpha_theta_update;
    #inference_type = options.hybrid_mode;
    assert(options.snapshot_interval>0);
    if options.snapshot_interval>0:
        snapshot_interval=options.snapshot_interval;
    
    # parameter set 1
    assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);

    corpus_name = options.corpus_name;

    input_directory = options.input_directory;
    if not input_directory.endswith('/'):
        input_directory += '/';
    input_directory += corpus_name+'/';
        
    output_directory = options.output_directory;
    if not output_directory.endswith('/'):
        output_directory += '/';
    output_directory += corpus_name+'/';
     
    # create output directory
    now = datetime.datetime.now();
    output_directory += now.strftime("%y%b%d-%H%M%S")+"";
    output_directory += "-hybrid-K%d-I%d-a%g-e%g-S%d/" \
                        % (number_of_topics,
                           number_of_iterations,
                           alpha,
                           eta,
                           snapshot_interval);

    os.mkdir(os.path.abspath(output_directory));
    
    #dict_file = options.dictionary;
    #if dict_file != None:
        #dict_file = dict_file.strip();
        
    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    #options_output_file.write("dictionary_file=" + str(dict_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_iteration=%d\n" % (number_of_iterations));
    options_output_file.write("number_of_topics=" + str(number_of_topics) + "\n");
    # parameter set 3
    options_output_file.write("alpha=" + str(alpha) + "\n");
    options_output_file.write("eta=" + str(eta) + "\n");
    # parameter set 4
    #options_output_file.write("inference_type=%s\n" % (inference_type));
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");

    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    #print "dictionary file=" + str(dict_file)
    # parameter set 2
    print "number_of_iterations=%d" %(number_of_iterations);
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "alpha=" + str(alpha)
    print "eta=" + str(eta)
    # parameter set 4
    #print "inference_type=%s" % (inference_type)
    print "snapshot_interval=" + str(snapshot_interval);
    print "========== ========== ========== ========== =========="

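    # parse_data loads doc.dat and voc.dat and returns the documents together
    # with the type-to-index and index-to-type vocabulary maps.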
    documents, type_to_index, index_to_type = parse_data(input_directory+'doc.dat', input_directory+'voc.dat');
    print "successfully load all training documents..."

    import hybrid;
    lda_inference = hybrid.Hybrid();
    lda_inference._initialize(documents, type_to_index, index_to_type, number_of_topics, alpha, eta);
    
    for iteration in xrange(number_of_iterations):
        lda_inference.learn();
        
        if (lda_inference._counter % snapshot_interval == 0):
            lda_inference.export_topic_term_distribution(output_directory + 'exp_beta-' + str(lda_inference._counter));
Example #7
def main():
    import option_parser
    options = option_parser.parse_args()

    # parameter set 2
    assert (options.number_of_documents > 0)
    number_of_documents = options.number_of_documents
    assert (options.number_of_topics > 0)
    number_of_topics = options.number_of_topics

    # parameter set 3
    assert (options.snapshot_interval > 0)
    snapshot_interval = options.snapshot_interval
    #assert(options.batch_size>0);
    batch_size = options.batch_size
    #assert(number_of_documents % batch_size==0);
    online_iterations = number_of_documents / batch_size
    if options.online_iterations > 0:
        online_iterations = options.online_iterations

    # parameter set 5
    hybrid_mode = options.hybrid_mode
    hash_oov_words = options.hash_oov_words

    # parameter set 1
    assert (options.corpus_name != None)
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    corpus_name = options.corpus_name

    input_directory = options.input_directory
    #if not input_directory.endswith('/'):
    #input_directory += '/';
    input_directory = os.path.join(input_directory, corpus_name)
    #input_directory += corpus_name+'/';

    output_directory = options.output_directory
    #if not output_directory.endswith('/'):
    #output_directory += '/';
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    #output_directory += corpus_name+'/';
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    # Documents
    train_docs = []
    input_file = open(os.path.join(input_directory, 'doc.dat'), 'r')
    for line in input_file:
        train_docs.append(line.strip())
    print "successfully load all training documents..."

    # Vocabulary
    dictionary_file = options.dictionary
    if dictionary_file == None:
        dictionary_file = os.path.join(input_directory, 'voc.dat')
    input_file = open(dictionary_file, 'r')
    vocab = []
    for line in input_file:
        vocab.append(line.strip().split()[0])
    vocab = list(set(vocab))
    print "successfully load all the words from %s..." % (dictionary_file)

    # parameter set 4
    assert (options.tau >= 0)
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print "warning: running in batch mode..."
        kappa = 0
    alpha_theta = 1.0 / number_of_topics
    if options.alpha_theta > 0:
        alpha_theta = options.alpha_theta
    alpha_eta = 1.0 / len(vocab)
    if options.alpha_eta > 0:
        alpha_eta = options.alpha_eta

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    suffix += "-%s" % ("fixvoc")
    suffix += "-D%d" % (number_of_documents)
    suffix += "-K%d" % (number_of_topics)
    suffix += "-I%d" % (snapshot_interval)
    suffix += "-B%d" % (batch_size)
    suffix += "-O%d" % (online_iterations)
    suffix += "-t%d" % (tau)
    suffix += "-k%g" % (kappa)
    suffix += "-at%g" % (alpha_theta)
    suffix += "-ae%g" % (alpha_eta)
    suffix += "-%s" % (hybrid_mode)
    suffix += "-%s" % (hash_oov_words)
    suffix += "/"

    output_directory = os.path.join(output_directory, suffix)

    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("dictionary_file=" + str(dictionary_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_documents=" +
                              str(number_of_documents) + "\n")
    options_output_file.write("number_of_topics=" + str(number_of_topics) +
                              "\n")
    # parameter set 3
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("online_iterations=" + str(online_iterations) +
                              "\n")
    # parameter set 4
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_eta=" + str(alpha_eta) + "\n")
    # parameter set 5
    options_output_file.write("hybrid_mode=" + str(hybrid_mode) + "\n")
    options_output_file.write("hash_oov_words=%s\n" % hash_oov_words)
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "dictionary_file=" + str(dictionary_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "number_of_topics=" + str(number_of_topics)
    # parameter set 3
    print "snapshot_interval=" + str(snapshot_interval)
    print "batch_size=" + str(batch_size)
    print "online_iterations=" + str(online_iterations)
    # parameter set 4
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    print "alpha_theta=" + str(alpha_theta)
    print "alpha_eta=" + str(alpha_eta)
    # parameter set 5
    print "hybrid_mode=" + str(hybrid_mode)
    print "hash_oov_words=%s" % (hash_oov_words)
    print "========== ========== ========== ========== =========="

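    # hybrid_mode selects the inferencer (hybrid.Hybrid vs.
    # variational.Variational); both take the hash_oov_words flag.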
    if hybrid_mode:
        import hybrid
        olda = hybrid.Hybrid(hash_oov_words)
    else:
        import variational
        olda = variational.Variational(hash_oov_words)

    olda._initialize(vocab, number_of_documents, number_of_topics, alpha_theta,
                     alpha_eta, tau, kappa)

    olda.export_beta(os.path.join(output_directory, 'exp_beta-0'), 50)

    document_topic_distribution = None
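    # batch_gamma (per-document topic weights) from each update is stacked
    # row-wise into document_topic_distribution and written to gamma.txt after
    # training.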

    for iteration in xrange(online_iterations):
        if batch_size <= 0:
            docset = train_docs
        else:
            docset = train_docs[(batch_size * iteration) %
                                len(train_docs):(batch_size *
                                                 (iteration + 1) - 1) %
                                len(train_docs) + 1]
            print "select documents from %d to %d" % (
                (batch_size * iteration) % (number_of_documents),
                (batch_size * (iteration + 1) - 1) % number_of_documents + 1)

        clock = time.time()

        batch_gamma, elbo = olda.learning(docset)

        if document_topic_distribution is None:
            document_topic_distribution = batch_gamma
        else:
            document_topic_distribution = numpy.vstack(
                (document_topic_distribution, batch_gamma))

        clock = time.time() - clock
        print 'training iteration %d finished in %f seconds: epsilon = %f' % (
            olda._counter, clock, olda._epsilon)

        # Save lambda, the parameters to the variational distributions over topics, and batch_gamma, the parameters to the variational distributions over topic weights for the articles analyzed in the last iteration.
        #if ((olda._counter+1) % snapshot_interval == 0):
        #olda.export_beta(output_directory + 'exp_beta-' + str(olda._counter+1));
        if (olda._counter % snapshot_interval == 0):
            olda.export_beta(
                os.path.join(output_directory,
                             'exp_beta-' + str(olda._counter)), 50)

    gamma_path = os.path.join(output_directory, 'gamma.txt')
    numpy.savetxt(gamma_path, document_topic_distribution)