def gen_sent_on_topic(idxvocab, vocabxid, start_symbol, end_symbol, cf):
    """Generate sentences conditioned on each learned topic and write them
    to the file named by args.gen_sent_on_topic.

    Relies on module-level globals: tm (topic model), sess (TF session),
    args, topn, gen_temps, gen_num, initializer and LM.

    idxvocab: sequence mapping word id -> word string.
    vocabxid: mapping word string -> word id.
    start_symbol / end_symbol: sentence boundary tokens (strings).
    cf: config object (topic_number, lm_sent_len, ...).
    """
    topics, _ = tm.get_topics(sess, topn=topn)
    # reuse=True shares weights with the trained model; batch_size/num_steps
    # of 1 for token-by-token generation
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf,
            reuse_conv_variables=True)

    # "w" (not "wb") for consistency with gen_sent_on_doc; codecs handles the
    # utf-8 encoding. try/finally so the handle is closed even on failure.
    output = codecs.open(args.gen_sent_on_topic, "w", "utf-8")
    try:
        for t in range(cf.topic_number):
            output.write("\n" + "=" * 100 + "\n")
            output.write("Topic " + str(t) + ":\n")
            output.write(" ".join([idxvocab[item] for item in topics[t]]) + "\n\n")

            # temperature 0 => greedy argmax decoding
            output.write("\nSentence generation (greedy; argmax):" + "\n")
            s = mgen.generate_on_topic(sess, t, vocabxid[start_symbol], 0,
                                       cf.lm_sent_len + 10, vocabxid[end_symbol])
            output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")

            for temp in gen_temps:
                output.write("\nSentence generation (random; temperature = " +
                             str(temp) + "):\n")
                for i in range(gen_num):
                    s = mgen.generate_on_topic(sess, t, vocabxid[start_symbol], temp,
                                               cf.lm_sent_len + 10, vocabxid[end_symbol])
                    output.write("[" + str(i) + "] " +
                                 " ".join([idxvocab[item] for item in s]) + "\n")
    finally:
        output.close()
# Example #2
def gen_sent_on_doc(docs, tags, idxvocab, vocabxid, start_symbol, end_symbol, cf):
    """Generate sentences conditioned on each input document and write the
    document's representative topics/words plus generated sentences to the
    file named by args.gen_sent_on_doc.

    Relies on module-level globals: tm (topic model), sess (TF session),
    args, topn, gen_temps, gen_num, pad_symbol, initializer, LM and
    get_batch_doc.

    docs / tags: document word-id sequences and tag sequences.
    idxvocab: sequence mapping word id -> word string.
    vocabxid: mapping word string -> word id.
    start_symbol / end_symbol: sentence boundary tokens (strings).
    cf: config object (doc_len, tag_len, lm_sent_len, ...).
    """
    topics, _ = tm.get_topics(sess, topn=topn)
    topics = [" ".join([idxvocab[w] for w in t]) for t in topics]
    # Raw document text for display; close the input handle promptly instead
    # of leaking it (original never closed it).
    input_f = codecs.open(args.input_doc, "r", "utf-8")
    try:
        doc_text = [item.replace("\t", "\n") for item in input_f.readlines()]
    finally:
        input_f.close()
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf,
            reuse_conv_variables=True)

    output = codecs.open(args.gen_sent_on_doc, "w", "utf-8")
    try:
        for d in range(len(docs)):
            output.write("\n" + "=" * 100 + "\n")
            output.write("Doc " + str(d) + ":\n")
            output.write(doc_text[d])

            doc, _, _, t, _ = get_batch_doc(docs, None, tags, d, cf.doc_len, cf.tag_len, 1, vocabxid[pad_symbol])
            best_topics, best_words = mgen.get_topics_on_doc(sess, doc, t, topn)

            output.write("\nRepresentative topics:\n")
            output.write("\n".join([("[%.3f] %s: %s" % (item[1], str(item[0]).zfill(3), topics[item[0]]))
                for item in best_topics]) + "\n")

            output.write("\nRepresentative words:\n")
            output.write("\n".join([("[%.3f] %s" % (item[1], idxvocab[item[0]])) for item in best_words]) + "\n")

            # temperature 0 => greedy argmax decoding
            output.write("\nSentence generation (greedy; argmax):" + "\n")
            s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], 0, cf.lm_sent_len + 10, vocabxid[end_symbol])
            output.write("[0] " + " ".join([idxvocab[item] for item in s]) + "\n")

            for temp in gen_temps:
                output.write("\nSentence generation (random; temperature = " + str(temp) + "):\n")

                # range (not xrange) for consistency with gen_sent_on_topic
                for i in range(gen_num):
                    s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], temp, cf.lm_sent_len + 10,
                        vocabxid[end_symbol])
                    output.write("[" + str(i) + "] " + " ".join([idxvocab[item] for item in s]) + "\n")
    finally:
        output.close()
                                 "model.ckpt"))
                print "\tNew valid performance > prev valid performance: restoring previous parameters..."

    #print top-N words from topics
    if cf.topic_number > 0:
        print "\nTopics\n======"
        topics, entropy = tm_train.get_topics(sess, topn=20)
        for ti, t in enumerate(topics):
            print "Topic", ti, "[", ("%.2f" % entropy[ti]), "] :", " ".join(
                [idxvocab[item] for item in t])

    #generate some random sentences
    if cf.rnn_hidden_size > 0:
        print "\nRandom Generated Sentences\n=========================="
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf, \
                reuse_conv_variables=True)
        for temp in [1.0, 0.75, 0.5]:
            print "\nTemperature =", temp
            for _ in xrange(10):
                #select a random topic
                if cf.topic_number > 0:
                    topic = random.randint(0, cf.topic_number - 1)
                    print "\tTopic", topic, ":",
                else:
                    topic = -1
                    print "\t",

                s = mgen.generate_on_topic(sess, topic, vocabxid[start_symbol], temp, cf.lm_sent_len+10, \
                    vocabxid[end_symbol])
                s = [idxvocab[item] for item in s]
                print " ".join(s)