Example #1
def gen_sent_on_doc(docs, tags, idxvocab, vocabxid, start_symbol, end_symbol, cf):
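    # Relies on module-level names from the surrounding script (e.g. sess, tm, args,
    # topn, gen_temps, gen_num, pad_symbol, initializer, LM), so it is not self-contained.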
    topics, _ = tm.get_topics(sess, topn=topn)
    topics = [ " ".join([idxvocab[w] for w in t]) for t in topics ]
    doc_text = [ item.replace("\t", "\n") for item in codecs.open(args.input_doc, "r", "utf-8").readlines() ]
    output = codecs.open(args.gen_sent_on_doc, "w", "utf-8")
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mgen = LM(is_training=False, vocab_size=len(idxvocab), batch_size=1, num_steps=1, config=cf, \
            reuse_conv_variables=True)

    for d in range(len(docs)):
        output.write("\n" + "="*100 + "\n")
        output.write("Doc " +  str(d) +":\n")
        output.write(doc_text[d])

        doc, _, _, t, _ = get_batch_doc(docs, None, tags, d, cf.doc_len, cf.tag_len, 1, vocabxid[pad_symbol])
        best_topics, best_words = mgen.get_topics_on_doc(sess, doc, t, topn)
        
        output.write("\nRepresentative topics:\n")
        output.write("\n".join([ ("[%.3f] %s: %s" % (item[1],str(item[0]).zfill(3),topics[item[0]])) \
            for item in best_topics ]) + "\n")

        output.write("\nRepresentative words:\n")
        output.write("\n".join([ ("[%.3f] %s" % (item[1], idxvocab[item[0]])) for item in best_words ]) + "\n")

        output.write("\nSentence generation (greedy; argmax):" + "\n")
        s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], 0, cf.lm_sent_len+10, vocabxid[end_symbol])
        output.write("[0] " + " ".join([ idxvocab[item] for item in s ]) + "\n")

        for temp in gen_temps:
            output.write("\nSentence generation (random; temperature = " + str(temp) + "):\n")

            for i in xrange(gen_num):
                s = mgen.generate_on_doc(sess, doc, t, vocabxid[start_symbol], temp, cf.lm_sent_len+10, \
                    vocabxid[end_symbol])
                output.write("[" + str(i) + "] " + " ".join([ idxvocab[item] for item in s ]) + "\n")
Example #2
def compute_dt_dist(docs, labels, tags, model, max_len, batch_size, pad_id,
                    idxvocab, output_file):
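    # Relies on module-level names from the surrounding script (e.g. sess, cf, debug, matutils).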
    #generate batches
    num_batches = int(math.ceil(float(len(docs)) / batch_size))
    dt_dist = []
    docid = 0
    for i in xrange(num_batches):
        x, _, _, t, s = get_batch_doc(docs, labels, tags, i, max_len,
                                      cf.tag_len, batch_size, pad_id)
        attention, mean_topic = sess.run([model.attention, model.mean_topic], {
            model.doc: x,
            model.tag: t
        })
        dt_dist.extend(attention[:s])

        if debug:
            for si in xrange(s):
                d = x[si]
                print "\n\nDoc", docid, "=", " ".join(
                    [idxvocab[item] for item in d if (item != pad_id)])
                sorted_dist = matutils.argsort(attention[si], reverse=True)
                for ti in sorted_dist:
                    print "Topic", ti, "=", attention[si][ti]
                docid += 1

    np.save(open(output_file, "wb"), dt_dist)  # .npy data is binary, so open the handle in "wb" mode
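compute_dt_dist stores one topic (attention) distribution per document as a NumPy array. Below is a minimal sketch of reading the saved file back and listing the strongest topics per document; the file name is illustrative.

import numpy as np

dt_dist = np.load("doc_topic_dist.npy")          # shape: (num_docs, num_topics)
for docid, dist in enumerate(dt_dist):
    top_topics = np.argsort(dist)[::-1][:5]      # five highest-weight topics
    summary = ", ".join("topic %d (%.3f)" % (ti, dist[ti]) for ti in top_topics)
    print("Doc %d: %s" % (docid, summary))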
Example #3
def run_epoch_doc(docs, labels, tags, tm, pad_id, cf):
    batches = int(math.ceil(float(len(docs))/cf.batch_size))
    accs = []
    for b in xrange(batches):
        d, y, m, t, num_docs = get_batch_doc(docs, labels, tags, b, cf.doc_len, cf.tag_len, cf.batch_size, pad_id)
        prob = sess.run(tm.sup_probs, {tm.doc:d, tm.label:y, tm.sup_mask: m, tm.tag: t})
        pred = np.argmax(prob, axis=1)
        accs.extend(pred[:num_docs] == y[:num_docs])

    print "\ntest classification accuracy = %.3f" % np.mean(accs)
def run_epoch(sents, docs, labels, tags, models, is_training):

    ####unsupervised topic and language model training####

    #generate the batches
    tm_num_batches, lm_num_batches = int(math.ceil(float(len(sents[0]))/cf.batch_size)), \
        int(math.ceil(float(len(sents[1]))/cf.batch_size))
    batch_ids = [(item, 0) for item in range(tm_num_batches)] + \
        [(item, 1) for item in range(lm_num_batches)]
    seq_lens = (cf.tm_sent_len, cf.lm_sent_len)
    #shuffle batches and sentences
    random.shuffle(batch_ids)
    random.shuffle(sents[0])
    random.shuffle(sents[1])

    #set training and cost ops for topic and language model training
    tm_cost_ops = (tf.no_op(), tf.no_op(), tf.no_op(), tf.no_op())
    lm_cost_ops = (tf.no_op(), tf.no_op(), tf.no_op(), tf.no_op())
    if models[0] is not None:
        tm_cost_ops = (models[0].tm_cost,
                       (models[0].tm_train_op if is_training else tf.no_op()),
                       tf.no_op(), tf.no_op())
    if models[1] is not None:
        lm_cost_ops = (tf.no_op(), tf.no_op(), models[1].lm_cost,
                       (models[1].lm_train_op if is_training else tf.no_op()))
    cost_ops = (tm_cost_ops, lm_cost_ops)

    start_time = time.time()
    lm_costs, tm_costs, lm_words, tm_words = 0.0, 0.0, 0.0, 0.0
    for bi, (b, model_id) in enumerate(batch_ids):
        tm_costs, tm_words, lm_costs, lm_words = fetch_batch_and_train(sents[model_id], docs[model_id], tags, \
            models[model_id], seq_lens[model_id], b, (tm_costs, tm_words, lm_costs, lm_words), cost_ops[model_id])

        #print progress
        output_string = "%d/%d: tm ppl = %.3f; lm ppl = %.3f; word/sec = %.1f" % \
            (bi+1, len(batch_ids), np.exp(tm_costs/max(tm_words, 1.0)), np.exp(lm_costs/max(lm_words, 1.0)),  \
            float(tm_words + lm_words)/(time.time()-start_time))
        print_progress(bi, len(batch_ids), is_training, output_string)

    ####supervised classification training####

    if labels is not None:
        #randomise the batches
        batch_ids = range(int(math.ceil(float(len(docs[0])) / cf.batch_size)))
        random.shuffle(batch_ids)

        start_time = time.time()
        costs, accs = 0.0, []
        for bi, b in enumerate(batch_ids):
            d, y, m, t, num_docs = get_batch_doc(docs[0], labels, tags, b,
                                                 cf.doc_len, cf.tag_len,
                                                 cf.batch_size, 0)
            cost, prob, _ = sess.run([models[0].sup_cost, models[0].sup_probs, \
                (models[0].sup_train_op if is_training else tf.no_op())], \
                {models[0].doc:d, models[0].label:y, models[0].sup_mask: m, models[0].tag: t})
            costs += cost * cf.batch_size  #keep track of full cost
            pred = np.argmax(prob, axis=1)
            accs.extend(pred[:num_docs] == y[:num_docs])

            #print progress
            output_string = "%d/%d: sup loss = %.3f; sup acc = %.3f; doc/sec = %.1f" % \
                (bi+1, len(batch_ids), costs/((bi+1)*cf.batch_size), np.mean(accs), \
                (bi+1)*cf.batch_size/(time.time()-start_time))
            print_progress(bi, len(batch_ids), is_training, output_string)
    else:
        accs = None

    return -np.mean(accs) if accs is not None else np.exp(lm_costs / max(lm_words, 1.0))
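run_epoch returns negative classification accuracy when labels are available and language-model perplexity otherwise, so a lower return value is better in both cases. A minimal sketch of the perplexity computation it reports, with illustrative accumulated totals:

import numpy as np

lm_costs, lm_words = 10250.0, 2048.0          # illustrative accumulated cost and word count
ppl = np.exp(lm_costs / max(lm_words, 1.0))   # per-word perplexity (lower is better)
print("lm ppl = %.3f" % ppl)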