예제 #1
0
def predict(sess, cnn, test, alphabet, batch_size, q_len, a_len):
    """Run the model's ``score12`` op over ``test`` and return the scores.

    Every batch yielded by ``batch_gen_with_single`` is fed to the pair-wise
    placeholders with the same arrays on the positive and the negative side
    (``data[1]``, ``data[2]``, ``data[3]`` and ``data[5]`` are each fed
    twice); only ``cnn.score12`` is fetched.

    Args:
        sess: Session on which ``sess.run`` is called once per batch.
        cnn: Model exposing the placeholders fed below and ``score12``.
        test: Data to score. It is forwarded to ``get_overlap_dict`` and
            ``batch_gen_with_single``; ``len(test)`` bounds the result.
        alphabet: Forwarded unchanged to the two helpers.
        batch_size: Forwarded unchanged to ``batch_gen_with_single``.
        q_len: Forwarded unchanged to the two helpers.
        a_len: Forwarded unchanged to the two helpers.

    Returns:
        ``np.array`` built from the first ``len(test)`` collected scores.

    Side effects:
        Pickles ``attention`` into ``attention.file`` in the current
        working directory, overwriting any previous content.
    """
    scores = []
    d = get_overlap_dict(test, alphabet, q_len, a_len)
    for data in batch_gen_with_single(test,
                                      alphabet,
                                      batch_size,
                                      q_len,
                                      a_len,
                                      overlap_dict=d):
        feed_dict = {
            cnn.question: data[0],
            cnn.answer: data[1],
            cnn.answer_negative: data[1],
            cnn.q_pos_overlap: data[2],
            cnn.q_neg_overlap: data[2],
            cnn.a_pos_overlap: data[3],
            cnn.a_neg_overlap: data[3],
            cnn.q_position: data[4],
            cnn.a_pos_position: data[5],
            cnn.a_neg_position: data[5]
        }

        score = sess.run(cnn.score12, feed_dict)
        scores.extend(score)

    # NOTE(review): `attention` is never assigned inside this function, so it
    # has to exist at module level; if it does not, this raises NameError
    # after all batches were scored - confirm where it is defined.
    # The file is opened in binary mode (what pickle expects) and inside a
    # `with` block so the handle is closed even if pickling fails.
    with open('attention.file', 'wb') as attention_file:
        pickle.dump(attention, attention_file)

    # Keep at most len(test) entries; presumably the batch generator pads the
    # last batch - TODO confirm against batch_gen_with_single.
    return np.array(scores[:len(test)])
예제 #2
0
def predict(sess, cnn, test, alphabet, batch_size, q_len, a_len, step, type):
    """Score ``test`` with ``cnn.score12`` and dump every score to a text file.

    The answer-side arrays of each batch are fed to both the positive and
    the negative placeholders of the pair-wise model.

    Args:
        sess: Session on which ``sess.run`` is called once per batch.
        cnn: Model exposing the placeholders fed below and ``score12``.
        test: Data to score; forwarded to the helpers, and ``len(test)``
            bounds the returned array.
        alphabet: Forwarded unchanged to the two helpers.
        batch_size: Forwarded unchanged to ``batch_gen_with_single``.
        q_len: Forwarded unchanged to the two helpers.
        a_len: Forwarded unchanged to the two helpers.
        step: Integer formatted into the output file name with ``%d``.
        type: String concatenated into the output file name.

    Returns:
        ``np.array`` built from the first ``len(test)`` collected scores.

    Side effects:
        Writes all collected scores (not only the first ``len(test)``), one
        per line, to ``data_file + '_' + type + '_score_<step>.txt'``.
        ``data_file`` is not defined in this function and must be in scope.
    """
    overlap_lookup = get_overlap_dict(test, alphabet, q_len, a_len)
    batches = batch_gen_with_single(test,
                                    alphabet,
                                    batch_size,
                                    q_len,
                                    a_len,
                                    overlap_dict=overlap_lookup)

    collected = []
    for batch in batches:
        questions, answers = batch[0], batch[1]
        q_overlap, a_overlap = batch[2], batch[3]
        q_position, a_position = batch[4], batch[5]

        # Same answer-side inputs on the positive and the negative branch.
        inputs = {
            cnn.question: questions,
            cnn.q_position: q_position,
            cnn.q_pos_overlap: q_overlap,
            cnn.q_neg_overlap: q_overlap,
            cnn.answer: answers,
            cnn.answer_negative: answers,
            cnn.a_pos_overlap: a_overlap,
            cnn.a_neg_overlap: a_overlap,
            cnn.a_pos_position: a_position,
            cnn.a_neg_position: a_position,
        }
        collected.extend(sess.run(cnn.score12, inputs))

    out_path = data_file + '_' + type + ("_score_%d.txt" % step)
    with open(out_path, 'w') as ff:
        ff.write('\n'.join(map(str, collected)))

    return np.array(collected[:len(test)])
예제 #3
0
def predict(sess, cnn, test, alphabet, batch_size, q_len, a_len):
    """Score ``test`` batch by batch with ``cnn.scores``.

    Args:
        sess: Session on which ``sess.run`` is called once per batch.
        cnn: Model exposing the placeholders fed below and ``scores``.
        test: Data to score; forwarded to the helpers, and ``len(test)``
            bounds the returned array.
        alphabet: Forwarded unchanged to the two helpers.
        batch_size: Forwarded unchanged to ``batch_gen_with_single``.
        q_len: Forwarded unchanged to the two helpers.
        a_len: Forwarded unchanged to the two helpers.

    Returns:
        ``np.array`` built from the first ``len(test)`` collected scores.
    """
    overlap_lookup = get_overlap_dict(test, alphabet, q_len, a_len)
    batches = batch_gen_with_single(test,
                                    alphabet,
                                    batch_size,
                                    q_len,
                                    a_len,
                                    overlap_dict=overlap_lookup)

    collected = []
    for batch in batches:
        # Batch layout, as implied by the placeholders each field is fed to:
        # 0/1 question and answer, 2/3 their overlaps, 4/5 their positions.
        inputs = {
            cnn.question: batch[0],
            cnn.q_overlap: batch[2],
            cnn.q_position: batch[4],
            cnn.answer: batch[1],
            cnn.a_overlap: batch[3],
            cnn.a_position: batch[5],
        }
        batch_scores = sess.run(cnn.scores, inputs)
        collected.extend(batch_scores)

    return np.array(collected[:len(test)])
예제 #4
0
def test_pair_wise(dns=FLAGS.dns):
    """Train the pair-wise ``QA_RNN_extend`` model and score train/dev/test.

    The function loads the data set named by ``FLAGS.data``, builds the
    model, trains for ``FLAGS.num_epochs`` epochs, evaluates dev and test
    after every epoch, checkpoints whenever the first element of the dev
    result beats the best value so far (starting at 0.65), and finally
    scores all three splits and writes the scores to ``*.score`` files.

    Args:
        dns: When truthy, each epoch trains on batches built from
            ``dns_sample``; otherwise on ``batch_gen_with_pair_overlap``
            batches that also feed the overlap and position placeholders.
            Defaults to the value of ``FLAGS.dns`` at definition time.

    Side effects:
        Writes the log file ``precision``, checkpoints under ``runs/``,
        TensorBoard summaries under ``log_dir`` and three ``.score`` files
        in the working directory; adds a ``predicted`` column to the three
        loaded data frames.
    """
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)

    # Sequence lengths are taken from the training split only.
    q_max_sent_length = max(
        len(tokens) for tokens in train['question'].str.split())
    a_max_sent_length = max(
        len(tokens) for tokens in train['answer'].str.split())
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    print('alphabet: {}'.format(len(alphabet)))

    # One format for the per-batch progress line of both training modes.
    batch_line = ("{}: step {}, loss {:g}, acc {:g} ,"
                  "positive {:g},negative {:g}")

    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)

            print("start build model")
            cnn = QA_RNN_extend(max_input_left=q_max_sent_length,
                                max_input_right=a_max_sent_length,
                                batch_size=FLAGS.batch_size,
                                vocab_size=len(alphabet),
                                embedding_size=FLAGS.embedding_dim,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                dropout_keep_prob=FLAGS.dropout_keep_prob,
                                embeddings=embeddings,
                                l2_reg_lambda=FLAGS.l2_reg_lambda,
                                overlap_needed=FLAGS.overlap_needed,
                                learning_rate=FLAGS.learning_rate,
                                trainable=FLAGS.trainable,
                                extend_feature_dim=FLAGS.extend_feature_dim,
                                pooling=FLAGS.pooling,
                                position_needed=FLAGS.position_needed,
                                conv=FLAGS.conv)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            # Created but never written to in this function.
            test_writer = tf.summary.FileWriter(log_dir + '/test')
            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            map_max = 0.65
            # Tracks whether out_dir holds a checkpoint from this run, so the
            # restore after training is only attempted when one exists.
            checkpoint_saved = False
            for i in range(FLAGS.num_epochs):
                # Honour the `dns` argument (it used to be ignored in favour
                # of reading FLAGS.dns again).
                if dns:
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                    print('load dns datas')
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2]
                        }
                        _, step, loss, accuracy, score12, score13 = sess.run([
                            cnn.train_op, cnn.global_step, cnn.loss,
                            cnn.accuracy, cnn.score12, cnn.score13
                        ], feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        print(batch_line.format(time_str, step, loss,
                                                accuracy, np.mean(score12),
                                                np.mean(score13)))
                else:
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_pair_overlap(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length,
                        fresh=FLAGS.fresh,
                        overlap_dict=d)
                    print("load data")
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2],
                            cnn.q_pos_overlap: data[3],
                            cnn.q_neg_overlap: data[4],
                            cnn.a_pos_overlap: data[5],
                            cnn.a_neg_overlap: data[6],
                            cnn.q_position: data[7],
                            cnn.a_pos_position: data[8],
                            cnn.a_neg_position: data[9]
                        }
                        _, summary, step, loss, accuracy, score12, score13 = sess.run(
                            [
                                cnn.train_op, cnn.merged, cnn.global_step,
                                cnn.loss, cnn.accuracy, cnn.score12,
                                cnn.score13
                            ], feed_dict)
                        # NOTE(review): the epoch index is used as summary
                        # step, so every batch of an epoch is recorded at the
                        # same step - confirm whether `step` was intended.
                        train_writer.add_summary(summary, i)
                        time_str = datetime.datetime.now().isoformat()
                        print(batch_line.format(time_str, step, loss,
                                                accuracy, np.mean(score12),
                                                np.mean(score13)))

                # Evaluate dev and test after every epoch.
                predicted_dev = predict(sess, cnn, dev, alphabet,
                                        FLAGS.batch_size,
                                        q_max_sent_length,
                                        a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted_dev)
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size,
                                         q_max_sent_length,
                                         a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test)

                print("{}:epoch:dev map mrr {}".format(i, map_mrr_dev))
                print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: map_dev{}-------map_mrr_test{}".format(
                    i, map_mrr_dev[0], map_mrr_test)
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    save_path = saver.save(sess, out_dir)
                    checkpoint_saved = True
                    print("Model saved in file:  {}".format(save_path))

                log.write(line + '\n')
                log.flush()

            print('train over')
            if checkpoint_saved:
                saver.restore(sess, out_dir)
            else:
                # No epoch beat the initial threshold, so nothing was written
                # to out_dir; restoring would fail. Score the weights from
                # the last epoch instead.
                print('no checkpoint saved (dev score never exceeded {}); '
                      'evaluating final weights'.format(map_max))

            # Score each split, store the scores on the frame, dump them to
            # disk and evaluate - in the order train, dev, test.
            final_runs = (
                (train, 'train.QApair.TJU_IR_QA2017_train.score'),
                (dev, 'train.QApair.TJU_IR_QA2017_dev.score'),
                (test, 'train.QApair.TJU_IR_QA2017.score'),
            )
            final_results = []
            for frame, score_file in final_runs:
                predicted = predict(sess, cnn, frame, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                frame['predicted'] = predicted
                frame['predicted'].to_csv(score_file, index=False, sep='\t')
                final_results.append(
                    evaluation.evaluationBypandas(frame, predicted))
            map_mrr_train, map_mrr_dev, map_mrr_test = final_results

            print('map_mrr train {}'.format(map_mrr_train))
            print('map_mrr dev {}'.format(map_mrr_dev))
            print('map_mrr test {}'.format(map_mrr_test))
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')

            # The return value is discarded; presumably this call is kept for
            # a side effect of `predict` - TODO confirm.
            predict(sess, cnn, train[:100], alphabet, 20, q_max_sent_length,
                    a_max_sent_length)

            # Flush pending summary events to disk and release the files.
            train_writer.close()
            test_writer.close()
예제 #5
0
def test_point_wise():
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    train = train.fillna('')
    test = test.fillna('')
    dev = dev.fillna('')
    # submit = submit.fillna('')
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    # train = train[:1000]
    # test = test[:1000]
    # dev = dev[:1000]
    # submit = dev[:100]
    print 'train question unique:{}'.format(len(train['question'].unique()))
    print 'train length', len(train)
    print 'test length', len(test)
    print 'dev length', len(dev)

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=True)
    print 'alphabet:', len(alphabet)
    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            # session_conf = tf.ConfigProto(
            #     allow_soft_placement=FLAGS.allow_soft_placement,
            #     log_device_placement=FLAGS.log_device_placement)

            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            cnn = QA(max_input_left=q_max_sent_length,
                     max_input_right=a_max_sent_length,
                     vocab_size=len(alphabet),
                     embedding_size=FLAGS.embedding_dim,
                     batch_size=FLAGS.batch_size,
                     embeddings=embeddings,
                     dropout_keep_prob=FLAGS.dropout_keep_prob,
                     filter_sizes=list(map(int,
                                           FLAGS.filter_sizes.split(","))),
                     num_filters=FLAGS.num_filters,
                     l2_reg_lambda=FLAGS.l2_reg_lambda,
                     is_Embedding_Needed=True,
                     trainable=FLAGS.trainable,
                     overlap_needed=FLAGS.overlap_needed,
                     position_needed=FLAGS.position_needed,
                     pooling=FLAGS.pooling,
                     extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step, 100, 0.96)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # seq_process(train, alphabet)
            # seq_process(test, alphabet)
            map_max = 0.65
            for i in range(30):
                d = get_overlap_dict(train,
                                     alphabet,
                                     q_len=q_max_sent_length,
                                     a_len=a_max_sent_length)
                datas = batch_gen_with_point_wise(train,
                                                  alphabet,
                                                  FLAGS.batch_size,
                                                  overlap_dict=d,
                                                  q_len=q_max_sent_length,
                                                  a_len=a_max_sent_length)
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.input_y: data[2],
                        cnn.q_overlap: data[3],
                        cnn.a_overlap: data[4],
                        cnn.q_position: data[5],
                        cnn.a_position: data[6]
                    }
                    _, step, loss, accuracy, pred, scores, see = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.predictions, cnn.scores, cnn.see
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}  ".format(
                        time_str, step, loss, accuracy))

                    # print loss

                # predicted = predict(sess,cnn,train,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                # map_mrr_train = evaluation.evaluationBypandas(train,predicted[:,-1])
                predicted = predict(sess, cnn, dev, alphabet, FLAGS.batch_size,
                                    q_max_sent_length, a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted[:, -1])
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test[:, -1])
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    timeStamp = time.strftime("%Y%m%d%H%M%S",
                                              time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                        map_mrr_dev[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    print "Model saved in file: ", save_path
                # predicted = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                # map_mrr_dev = evaluation.evaluationBypandas(dev,predicted[:,-1])
                # map_mrr_train = evaluation.evaluationBypandas(train,predicted_train[:,-1])
                # print evaluation.evaluationBypandas(train,predicted_train[:,-1])
                # print "{}:train epoch:map mrr {}".format(i,map_mrr_train)
                print "{}:dev epoch:map mrr {}".format(i, map_mrr_dev)
                print "{}:test epoch:map mrr {}".format(i, map_mrr_test)
                # line = " {}:epoch: map_train{}----map_test{}----map_dev{}".format(i,map_mrr_train[0],map_mrr_test[0],map_mrr_dev[0])
                line = " {}:epoch: map_dev{}----map_test{}".format(
                    i, map_mrr_dev[0], map_mrr_test[0])
                log.write(line + '\n')
                log.flush()
            log.close()