Example #1
def train():
    train_x, train_y, words_dict, labels_dict, seqlen_all = data_helper.load(
        "train.txt", voc_size, seqlen)
    test_x, test_y, seqlen_test = data_helper.load_test_data(
        "test_filter_2.txt", seqlen, words_dict, labels_dict)
    model = bilstm_text(voc_size, batch_size, seqlen, n_class,
                        embedding_size, learn_rate)
    op_pred = model.pred
    op_loss = model.loss
    op_train = model.train_op
    op_acc = model.acc
    sess = tf.Session()
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated
    sess.run(init)
    epochs = 50
    cnt = 0

    for epoch in range(epochs):
        batchs = data_helper.get_batch(64, train_x, train_y, seqlen_all)
        for batch_x, batch_y, batch_len in batchs:
            _, train_acc = sess.run([op_train, op_acc],
                                    feed_dict={model.inputs: batch_x,
                                               model.outputs: batch_y,
                                               model.seqlen_hdr: batch_len})
            print("{0} epoch {1} iters acc = {2}".format(epoch, cnt, train_acc))
            if cnt % 50 == 0:
                tmp_pred = sess.run(op_pred,
                                    feed_dict={model.inputs: batch_x,
                                               model.outputs: batch_y,
                                               model.seqlen_hdr: batch_len})
                print(tmp_pred)
                test(model, test_x, test_y, seqlen_test)
            cnt += 1
        print("---------test----------------")
        test(model, test_x, test_y, seqlen_test)
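data_helper.get_batch is used throughout these examples but never shown; a minimal sketch consistent with how it is called (yielding shuffled (batch_x, batch_y, batch_len) tuples for one pass over the data) could look like this, where the shuffling and array handling are assumptions:

import numpy as np

def get_batch(batch_size, x, y, seq_lens):
    # Hypothetical helper: yield shuffled (batch_x, batch_y, batch_len)
    # tuples covering the data set once.
    x, y, seq_lens = np.asarray(x), np.asarray(y), np.asarray(seq_lens)
    order = np.random.permutation(len(x))
    for start in range(0, len(x), batch_size):
        idx = order[start:start + batch_size]
        yield x[idx], y[idx], seq_lens[idx]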
Example #2
def test():
    imgs, tags = data_helper.load()
    model = Model()

    saver = tf.train.Saver()
    sess = tf.Session()
    with sess.as_default():
        # Initialize before restoring: running the initializer after
        # saver.restore() would overwrite the restored weights.
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, tf.train.latest_checkpoint('./'))

        # Testing
        imgs, tags = data_helper.load_test()
        for i in range(len(imgs)):
            feed_dict = {
                model.x: np.asarray([np.expand_dims(imgs[i], -1)]),
                model.y: np.asarray([data_helper.indexTo01(tags[i])])
            }
            _output = sess.run(model.output, feed_dict=feed_dict)
            print("tag: ", tags[i], "\toutput: ", _output)
Example #3
print("pool flat:",pool_flat)
#full connect layers
h_drop = tf.nn.dropout(pool_flat,keep_prob)

full_W = tf.Variable(tf.truncated_normal([4,n_class],stddev=0.1 ,dtype=tf.float32))
full_B = tf.Variable(tf.constant(0.1,dtype=tf.float32))

outputs = tf.nn.softmax(tf.matmul(h_drop,full_W)+full_B)
pred = tf.argmax(outputs,1)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=outputs,labels=labels))
acc = tf.reduce_mean(tf.cast(tf.equal(pred,tf.argmax(labels,1)),tf.float32))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())

train_x, train_y, words_dict, labels_dict, all_len = data_helper.load(
    "data/train.txt", 1000, s_limit_len)
test_x, test_y, testlen = data_helper.load_test_data(
    "data/test_filter_2.txt", s_limit_len, words_dict, labels_dict)

def test(sess, acc, pred, test_x, test_y):
    y_pred, acc_test = sess.run([pred, acc],
                                feed_dict={inputs: test_x,
                                           labels: test_y,
                                           keep_prob: 1.0})
    y_true = np.argmax(test_y, 1)  # plain numpy; no need to run a TF op

    print(metrics.classification_report(y_true, y_pred))



for epoch in range(1000):
    iter = 0
    test(sess, acc, pred, test_x, test_y)
    batchs = data_helper.get_batch(64, train_x, train_y, all_len)
    for batch_x, batch_y, batch_len in batchs:
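(The listing is cut off inside this loop; the body presumably runs the training op on each batch, along the lines of this sketch, where the 0.5 keep probability is an assumption:)

        _, train_acc = sess.run([train_op, acc],
                                feed_dict={inputs: batch_x,
                                           labels: batch_y,
                                           keep_prob: 0.5})  # assumed train-time keep prob
        iter += 1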
Example #4
from ml_model import SVM, RandomForest, GradientBoostingDecisionTree
from dl_model import DNN
import numpy as np
import data_helper

train_csv_path = './TrainingData.csv'
valid_csv_path = './ValidationData.csv'

if __name__ == '__main__':
    # Load data
    train_x, train_y, valid_x, valid_y, test_x, test_y = \
        data_helper.load(train_csv_path, valid_csv_path)

    # Training
    svm_model = SVM()
    svm_model.fit(train_x, train_y)
    rf_model = RandomForest()
    rf_model.fit(train_x, train_y)
    gbdt_model = GradientBoostingDecisionTree()
    gbdt_model.fit(train_x, train_y)
    dnn_model = DNN()
    dnn_model.fit(train_x, train_y)

    # Print testing result
    print('SVM error: ', svm_model.error(test_x, test_y))
    print('Random forest error: ', rf_model.error(test_x, test_y))
    print('Gradient boosting decision tree error: ',
          gbdt_model.error(test_x, test_y))
    print('DNN error: ', dnn_model.error(test_x, test_y))
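The ml_model classes are not included in the listing; a minimal sketch of the fit/error interface they evidently share, backed here by scikit-learn purely for illustration:

from sklearn.svm import SVC

class SVM:
    # Hypothetical stand-in for ml_model.SVM with the same interface;
    # the SVC backend is an assumption for illustration only.
    def __init__(self, **kwargs):
        self.clf = SVC(**kwargs)

    def fit(self, train_x, train_y):
        self.clf.fit(train_x, train_y)

    def error(self, test_x, test_y):
        # error rate = 1 - accuracy
        return 1.0 - self.clf.score(test_x, test_y)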
Example #5
def dev_point_wise():
    if FLAGS.data in ['TREC', 'sst2', 'aclImdb', 'sst5']:
        train, dev, test = load_trec_sst2(FLAGS.data)
    else:
        train, dev = load(FLAGS.data)

    # +1 accounts for the appended end-of-sentence token
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split())) + 1
    print(q_max_sent_length)
    print(len(train))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    num_train_steps = len(
        train['question'].unique()) // FLAGS.batch_size * FLAGS.num_epochs

    print('dev length', len(dev))
    if FLAGS.data in ['TREC', 'sst2', 'aclImdb', 'sst5']:
        alphabet, embeddings = prepare(FLAGS.data, [train, dev, test],
                                       max_sent_length=q_max_sent_length,
                                       dim=FLAGS.embedding_dim,
                                       is_embedding_needed=True,
                                       fresh=False)
    else:
        alphabet, embeddings = prepare(FLAGS.data, [train, dev],
                                       max_sent_length=q_max_sent_length,
                                       dim=FLAGS.embedding_dim,
                                       is_embedding_needed=True,
                                       fresh=False)
    print('alphabet:', len(alphabet))
    print('word embeddings:', np.shape(embeddings))

    if FLAGS.data == 'TREC':
        max_input_right = 6
    elif FLAGS.data == 'sst5':
        max_input_right = 5
    else:
        max_input_right = 2

    model_dict = {
        'T5_PE': T5_PE,
        'TPE_reduce': TPE_reduce,
        'Non_PE': Non_PE,
        'Soft_T5_PE': Soft_T5_PE,
        'Soft_T5_PE_NoB': Soft_T5_PE_NoB,
        'T5_PE_NoB': T5_PE_NoB
    }

    ckpt_path = 'model_save-3/'+FLAGS.data+'_'+ \
                        '_'.join([FLAGS.model_name,str(FLAGS.embedding_dim),
                            'L'+str(bert_config.num_hidden_layers),
                            'H'+ str(bert_config.num_attention_heads),
                            str(FLAGS.training_nums),
                            str(FLAGS.transformer_ret_pooling),
                            FLAGS.trail,
                            str(FLAGS.t5_bucket),
                            str(FLAGS.learning_rate)])

    if not FLAGS.is_Embedding_Needed:
        ckpt_path += '_random'

    if FLAGS.batch_size != 64:
        ckpt_path = ckpt_path + '_bz-' + str(FLAGS.batch_size)

    if 'Soft_T5' in FLAGS.model_name:
        bert_config.bucket_slop_min = FLAGS.bucket_slop_min
        bert_config.bucket_slop_max = FLAGS.bucket_slop_max
        bert_config.l1_width = FLAGS.l1_width
        bert_config.l2_width = FLAGS.l2_width
        bert_config.stddev = FLAGS.stddev

        ckpt_path = ckpt_path + '_' + str(bert_config.bucket_slop_min)
        ckpt_path = ckpt_path + '_' + str(bert_config.bucket_slop_max)
        ckpt_path = ckpt_path + '_' + str(bert_config.l1_width)
        ckpt_path = ckpt_path + '_' + str(bert_config.l2_width)
        ckpt_path = ckpt_path + '_' + str(bert_config.stddev)

    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)

    if 'concat' in FLAGS.model_name:
        bert_config.hidden_size = 2 * FLAGS.embedding_dim
        bert_config.intermediate_size = bert_config.hidden_size * 2
        hidden_num = 2 * FLAGS.embedding_dim
    else:
        bert_config.hidden_size = FLAGS.embedding_dim
        bert_config.intermediate_size = bert_config.hidden_size * 2
        hidden_num = FLAGS.embedding_dim

    data_file = ckpt_path + '/test'

    if FLAGS.is_training:
        precision = data_file + '_precise'
    else:
        precision = data_file + '_precise_test'

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True

        sess = tf.Session(config=session_conf)
        now = int(time.time())
        timeArray = time.localtime(now)
        timeStamp1 = time.strftime("%Y%m%d%H%M%S", timeArray)
        timeDay = time.strftime("%Y%m%d", timeArray)
        print(timeStamp1)

        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')

            writer = tf.summary.FileWriter(ckpt_path)

            #model instantiation
            model = model_dict[FLAGS.model_name](
                dataset=FLAGS.data,
                max_input_left=q_max_sent_length,
                max_input_right=max_input_right,
                vocab_size=len(alphabet),
                embeddings=embeddings,
                embedding_size=FLAGS.embedding_dim,
                batch_size=FLAGS.batch_size,
                l2_reg_lambda=1.0,
                is_Embedding_Needed=FLAGS.is_Embedding_Needed,
                hidden_num=hidden_num,
                extend_feature_dim=FLAGS.extend_feature_dim,
                bert_config=bert_config,
                transformer_ret_pooling=FLAGS.transformer_ret_pooling,
                t5_bucket=FLAGS.t5_bucket,
                is_training=True)

            model.build_graph()

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(model.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            sess.run(tf.global_variables_initializer())

            all_trainable_vars = tf.reduce_sum(
                [tf.reduce_prod(v.shape) for v in tf.trainable_variables()])

            print('tf.trainable_variables():', tf.trainable_variables())
            print('all_trainable_vars:', sess.run(all_trainable_vars))
            # one initializer run and one Saver are sufficient
            saver = tf.train.Saver(max_to_keep=5)

            if not FLAGS.is_training:
                print("starting load", tf.train.latest_checkpoint(ckpt_path))
                saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))

                predicted_test, _ = predict(sess, model, test, alphabet,
                                            FLAGS.batch_size,
                                            q_max_sent_length)
                predicted_label = np.argmax(predicted_test, 1)
                test_acc = accuracy_score(test['flag'], predicted_label)
                print('test_acc:', test_acc)
                #pickle.dump(test_acc,open(ret_file,'wb'))
                exit(0)
            else:
                acc_max, loss_min = 0.0000, 100000
                acc_test = 0.000
                early_stop = 20
                patience = 0
                early_stop_flags = False
                train_flag = 'Full'
                if FLAGS.training_nums != 20000:
                    train_flag = 'Small'

                idx = 0
                for i in range(FLAGS.num_epochs):
                    datas = batch_gen_with_point_wise(train,
                                                      alphabet,
                                                      FLAGS.batch_size,
                                                      q_len=q_max_sent_length,
                                                      flag=train_flag)

                    for data in datas:
                        idx += 1
                        feed_dict = {
                            model.question:
                            data[0],
                            model.input_y:
                            data[1],
                            model.q_position:
                            data[2],
                            model.input_dropout_prob:
                            bert_config.input_dropout_prob,
                            model.hidden_dropout_prob:
                            bert_config.hidden_dropout_prob,
                            model.attention_probs_dropout_prob:
                            bert_config.attention_probs_dropout_prob
                        }

                        _, step, l2_loss, loss, accuracy = sess.run([
                            train_op, global_step, model.l2_loss, model.loss,
                            model.accuracy
                        ], feed_dict)
                        time_str = datetime.datetime.now().isoformat()

                        loss_sum = tf.Summary(value=[
                            tf.Summary.Value(tag="model/loss",
                                             simple_value=loss),
                        ])
                        writer.add_summary(loss_sum, step)

                        acc_sum = tf.Summary(value=[
                            tf.Summary.Value(tag="model/accuracy",
                                             simple_value=accuracy),
                        ])
                        writer.add_summary(acc_sum, step)

                        if idx % 50 == 0:
                            print(
                                "{}: step {}, l2_loss {:g}, loss {:g}, acc {:g}"
                                .format(time_str, step, l2_loss, loss,
                                        accuracy))

                    if True:  # dev evaluation runs unconditionally every epoch
                        if True:
                            predicted_dev, dev_loss = predict(
                                sess, model, dev, alphabet, FLAGS.batch_size,
                                q_max_sent_length)

                            predicted_label = np.argmax(predicted_dev, 1)
                            print(predicted_label[:10])
                            print(dev['flag'][:10])
                            acc_dev = accuracy_score(dev['flag'],
                                                     predicted_label)

                            dev_acc_sum = tf.Summary(value=[
                                tf.Summary.Value(tag="dev/accuracy",
                                                 simple_value=acc_dev),
                            ])
                            writer.add_summary(dev_acc_sum, step)

                            curr_step = step
                            if acc_dev > acc_max:  # and dev_loss < loss_min:
                                save_path = os.path.join(
                                    ckpt_path,
                                    "model_{}.ckpt".format(curr_step))

                                saver.save(sess,
                                           save_path,
                                           write_meta_graph=True,
                                           write_state=True)

                                acc_max = acc_dev
                                loss_min = dev_loss

                                if FLAGS.data in [
                                        'sst2', 'TREC', 'sst5', 'aclImdb'
                                ]:
                                    predicted_test, _ = predict(
                                        sess, model, test, alphabet,
                                        FLAGS.batch_size, q_max_sent_length)
                                    predicted_label = np.argmax(
                                        predicted_test, 1)
                                    acc_test = accuracy_score(
                                        test['flag'], predicted_label)

                                    #save the test performance at the best dev score...
                                    pickle.dump(
                                        acc_test,
                                        open(ckpt_path + '/test_ret.p', 'wb'))

                                if 'T5_PE' in FLAGS.model_name:
                                    t5_att_bias = sess.run(
                                        model.single_t5_att_bias, feed_dict)
                                    param = {
                                        'right': t5_att_bias[0, :, 0],
                                        'left': t5_att_bias[0, :, -1]
                                    }

                                    pickle.dump(
                                        param,
                                        open(ckpt_path + '/' + 't5_att_bias.p',
                                             'wb'))

                                if 'Soft_T5_PE' == FLAGS.model_name:
                                    soft_t5_alpha, soft_t5_beta = sess.run([
                                        model.soft_t5_alpha, model.soft_t5_beta
                                    ])

                                    params = {
                                        'soft_t5_alpha': soft_t5_alpha,
                                        'soft_t5_beta': soft_t5_beta
                                    }
                                    pickle.dump(
                                        params,
                                        open(ckpt_path + '/' + 'params.p',
                                             'wb'))

                                if 'Soft_T5_PE_NoB' == FLAGS.model_name:
                                    soft_t5_rd_bucket_mat_val = sess.run(
                                        model.soft_t5_rd_bucket_mat)
                                    params = {
                                        'soft_t5_rd_bucket_mat':
                                        soft_t5_rd_bucket_mat_val
                                    }
                                    pickle.dump(
                                        params,
                                        open(ckpt_path + '/' + 'params.p',
                                             'wb'))
                                patience = 0
                            else:
                                patience += 1
                                if patience > early_stop:
                                    early_stop_flags = True
                                    break

                            print("{}:dev epoch:loss {}".format(i, loss_min))
                            print("{}:dev epoch:acc {}".format(i, acc_max))
                            if FLAGS.data in [
                                    'sst2', 'TREC', 'sst5', 'aclImdb'
                            ]:
                                print("{}:test epoch:acc {}".format(
                                    i, acc_test))
                            line2 = " {}:test epoch: acc{}".format(i, acc_test)
                            log.write(line2 + '\n')
                            log.flush()
                        if early_stop_flags:
                            break

                acc_flod.append(acc_max)
            # `log` is closed automatically when the `with` block exits
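The loop above implements patience-based early stopping keyed on dev accuracy; distilled to its core, the pattern is (all names below are stand-ins, not from the original):

best_acc, patience, early_stop = 0.0, 0, 20
for epoch in range(num_epochs):
    acc_dev = evaluate_dev()      # stand-in for the predict/accuracy_score step
    if acc_dev > best_acc:
        best_acc, patience = acc_dev, 0
        save_checkpoint()         # stand-in for saver.save(...)
    else:
        patience += 1
        if patience > early_stop:
            break                 # no dev improvement for 20 straight epochs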
Example #6
import data_helper
from data_helper import log_time_delta, getLogger

logger = getLogger()

args = Singleton().get_rnn_flag()
#args = Singleton().get_8008_flag()

args._parse_flags()
opts = dict()
logger.info("\nParameters:")
for attr, value in sorted(args.__flags.items()):
    logger.info(("{}={}".format(attr.upper(), value)))
    opts[attr] = value

train, test, dev = data_helper.load(args.data, filter=args.clean)

q_max_sent_length = max(map(lambda x: len(x), train['question'].str.split()))
a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))

alphabet = data_helper.get_alphabet([train, test, dev], dataset=args.data)
logger.info('the number of words :%d ' % len(alphabet))

if args.data == "quora" or args.data == "8008":
    print("cn embedding")
    embedding = data_helper.get_embedding(alphabet,
                                          dim=200,
                                          language="cn",
                                          dataset=args.data)
    train_data_loader = data_helper.getBatch48008
else:
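(The listing is cut off at this else branch; by analogy with Example #7 below, the default path presumably loads the English embedding, roughly:)

    embedding = data_helper.get_embedding(alphabet, dataset=args.data)  # assumed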
Example #7
def main(args):
    args._parse_flags()
    print("\nParameters:")
    for attr, value in sorted(args.__flags.items()):
        print(("{}={}".format(attr.upper(), value)))
    log_dir = 'log/' + timeDay
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    data_file = log_dir + '/test_' + args.data + timeStamp
    precision = data_file + 'precise'
    print('load data ...........')
    train, test, dev = data_helper.load(args.data, filter=args.clean)

    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))

    alphabet = data_helper.get_alphabet([train, test, dev])
    print('the number of words', len(alphabet))

    print('get embedding')
    if args.data == "quora":
        embedding = data_helper.get_embedding(alphabet, language="cn")
    else:
        embedding = data_helper.get_embedding(alphabet)

    with tf.Graph().as_default(), tf.device("/gpu:" + str(args.gpu)):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = args.allow_soft_placement
        session_conf.log_device_placement = args.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        model = QA_CNN_extend(max_input_left=q_max_sent_length,
                              max_input_right=a_max_sent_length,
                              batch_size=args.batch_size,
                              vocab_size=len(alphabet),
                              embedding_size=args.embedding_dim,
                              filter_sizes=list(
                                  map(int, args.filter_sizes.split(","))),
                              num_filters=args.num_filters,
                              hidden_size=args.hidden_size,
                              dropout_keep_prob=args.dropout_keep_prob,
                              embeddings=embedding,
                              l2_reg_lambda=args.l2_reg_lambda,
                              trainable=args.trainable,
                              pooling=args.pooling,
                              conv=args.conv)

        model.build_graph()

        sess.run(tf.global_variables_initializer())

        def train_step(model, sess, batch):
            for data in batch:
                feed_dict = {
                    model.question: data[0],
                    model.answer: data[1],
                    model.answer_negative: data[2],
                    model.q_mask: data[3],
                    model.a_mask: data[4],
                    model.a_neg_mask: data[5]
                }
                _, summary, step, loss, accuracy, score12, score13, see = sess.run(
                    [
                        model.train_op, model.merged, model.global_step,
                        model.loss, model.accuracy, model.score12,
                        model.score13, model.see
                    ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print(
                    "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}"
                    .format(time_str, step, loss, accuracy, np.mean(score12),
                            np.mean(score13)))

        def predict(model, sess, batch, test):
            scores = []
            for data in batch:
                feed_dict = {
                    model.question: data[0],
                    model.answer: data[1],
                    model.q_mask: data[2],
                    model.a_mask: data[3]
                }
                score = sess.run(model.score12, feed_dict)
                scores.extend(score)

            return np.array(scores[:len(test)])

        for i in range(args.num_epoches):
            datas = data_helper.get_mini_batch(train, alphabet,
                                               args.batch_size)
            train_step(model, sess, datas)
            test_datas = data_helper.get_mini_batch_test(
                test, alphabet, args.batch_size)

            predicted_test = predict(model, sess, test_datas, test)
            print(len(predicted_test))
            print(len(test))
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            print('map_mrr test', map_mrr_test)
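score12 and score13 above are the model's scores for the positive and the negative answer; pairwise training of this kind typically minimizes a margin (hinge) loss over the score pair. A sketch of such a loss, for illustration only (it is not shown in the listing, and `margin` is an assumed hyperparameter):

# Hypothetical pairwise hinge loss: push the positive score above the
# negative score by at least `margin`.
losses = tf.maximum(0.0, margin - score12 + score13)
loss = tf.reduce_mean(losses)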
Example #8
import csv
import os
import time

import matplotlib.pyplot as plt

import data_helper
# EncoderDNN is provided by the project's model module (import not shown)

# np.random.seed(423453)
base_dir = os.getcwd()
train_csv_path = os.path.join(base_dir, 'trainingData.csv')
test_csv_path = os.path.join(base_dir, 'validationData.csv')

if __name__ == '__main__':

    results = []
    for i in range(10):
        # Load data
        train_x, train_y, valid_x, valid_y, test_x, test_y = \
            data_helper.load(train_csv_path, test_csv_path)
        # Training
        encode_dnn_model = EncoderDNN()
        start = time.time()
        # tbCallBack=keras.callbacks.TensorBoard(log_dir='./Graph',
        #                                        histogram_freq=1,
        #                                        write_graph=True,
        #                                        write_images=True)

        encode_dnn_model.fit(train_x,
                             train_y,
                             valid_x=valid_x,
                             valid_y=valid_y)  #,tensorbd=tbCallBack)
        end = time.time()

        floor_right = encode_dnn_model.error(test_x, test_y)
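(The listing is cut off here; `results` is initialized above but never used in the visible part. A natural continuation, assumed rather than recovered, would record each run:)

        print('run {}: fit took {:.1f}s, floor error {}'.format(
            i, end - start, floor_right))
        results.append(floor_right)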
Example #9
import tensorflow as tf
import numpy as np
import data_helper
from model import Model

loss_list = []


def record():
    with open('loss.txt', 'w') as f:
        for _loss in loss_list:
            f.write(str(_loss) + '\n')


if __name__ == '__main__':
    imgs, tags = data_helper.load()
    model = Model()
    sess = tf.Session()
    with sess.as_default():
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(0.01)
        grad_and_vars = optimizer.compute_gradients(model.loss)
        train_op = optimizer.apply_gradients(grad_and_vars,
                                             global_step=global_step)

        # tf.scalar_summary, tf.merge_summary, tf.train.SummaryWriter and
        # tf.initialize_all_variables are pre-1.0 names; these are the
        # TF 1.x equivalents.
        loss_summary = tf.summary.scalar("loss", model.loss)
        train_summary_op = tf.summary.merge([loss_summary])
        train_summary_writer = tf.summary.FileWriter('summary/train',
                                                     sess.graph)

        sess.run(tf.global_variables_initializer())
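The snippet ends right after initialization; a hypothetical training loop wiring the pieces together might look as follows (the step count, single-batch slicing, and feed keys are assumptions, with inputs shaped as in Example #2):

        # Hypothetical continuation: train, log summaries, dump losses.
        batch_x = np.asarray([np.expand_dims(img, -1) for img in imgs[:64]])
        batch_y = np.asarray([data_helper.indexTo01(t) for t in tags[:64]])
        for step in range(1000):
            _, loss_val, summary = sess.run(
                [train_op, model.loss, train_summary_op],
                feed_dict={model.x: batch_x, model.y: batch_y})
            loss_list.append(loss_val)
            train_summary_writer.add_summary(summary, step)
        record()  # write the collected losses to loss.txt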
Example #10
import tensorflow as tf
import numpy as np
import data_helper
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as birnn
voc_size = 10000
batch_size = 64
seqlen = 35
learn_rate = 0.05
n_class = 2
embedding_size = 100
train_x, train_y, words_dict, labels_dict, seqlen_all = data_helper.load(
    "train.txt", voc_size, seqlen)
one_hot_label = tf.one_hot(train_y, n_class)
test_x, test_y, seqlen_test = data_helper.load_test_data(
    "test_filter_2.txt", seqlen, words_dict, labels_dict)
# seqlen_all = np.array(seqlen_all)*10
inputs = tf.placeholder(tf.int64, [None, seqlen], name="seq_inputs")
outputs = tf.placeholder(tf.int64, [None, 2], name="outputs")
seqlen_hdr = tf.placeholder(tf.int64, [None])
W_embedding = tf.Variable(tf.random_uniform(shape=[voc_size, embedding_size]))
embedding = tf.nn.embedding_lookup(W_embedding, inputs)
# print("embding",embedding)
#embedding shape(35,100)
fwcell = LSTMCell(embedding_size)
bwcell = LSTMCell(embedding_size)
# sequence_length must hold the per-example lengths for the batch, passed as a tensor
out_bilstm, final_state = birnn(fwcell,
                                bwcell,
                                inputs=embedding,
                                sequence_length=seqlen_hdr,
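(The call is cut off here. bidirectional_dynamic_rnn returns an (output_fw, output_bw) pair, so the usual continuation, assumed rather than recovered from the original, closes the call with a dtype and concatenates the two directions:)

                                dtype=tf.float32)
# out_bilstm is (output_fw, output_bw); join them along the feature axis.
out_fw, out_bw = out_bilstm
rnn_outputs = tf.concat([out_fw, out_bw], axis=-1)  # (batch, seqlen, 2*embedding_size)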
Example #11
import os

import data_helper

base_dir = os.getcwd()

train_csv_path = os.path.join(base_dir, 'UTS_training.csv')
test_csv_path = os.path.join(base_dir, 'UTS_test.csv')

# train_csv_path = os.path.join(base_dir,'trainingData.csv')
# test_csv_path=os.path.join(base_dir,'validationData.csv')
# valid_csv_path=os.path.join(base_dir,'AllValuationData.csv')
# train_csv_path=os.path.join(base_dir,'arrAllTrainingData.csv')

log_dir = 'DEEPLEARNING_MODEL_log.txt'

if __name__ == '__main__':
    # Load data
    # (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = data_helper.load_data_all(train_csv_path, valid_csv_path,test_csv_path)
    train_x, train_y, valid_x, valid_y, test_x, test_y = data_helper.load(
        train_csv_path, test_csv_path)

    # patience=[i for i in range(1,50,2)]
    # patience=[21,]

    # B=[i for i in np.linspace(3.0,3.1,2)]
    # for b in B:
    # for p in patience:

    # dropout = [i for i in np.linspace(0.4, 0.7, 4)]
    dropout = [1]
    for dp in dropout:

        # save_picture_dir = "picfhfg"
Example #12
def test_pair_wise(dns=FLAGS.dns):
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    test = test.reindex(np.random.permutation(test.index))

    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print('q_max_sent_length:{} a_max_sent_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    # alphabet,embeddings = prepare_300([train,test,dev])
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            print("start build model")
            cnn = QA_CNN_quantum_extend(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                overlap_needed=FLAGS.overlap_needed,
                learning_rate=FLAGS.learning_rate,
                trainable=FLAGS.trainable,
                extend_feature_dim=FLAGS.extend_feature_dim,
                pooling=FLAGS.pooling,
                position_needed=FLAGS.position_needed,
                conv=FLAGS.conv,
                margin=FLAGS.margin)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + '/test')
            # Initialize all variables
            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            # saver.restore(sess, 'runs/20170910/20170910154937/wiki')
            map_max = 0.65
            for i in range(FLAGS.num_epochs):

                datas = batch_gen_with_pair(train,
                                            alphabet,
                                            FLAGS.batch_size,
                                            q_len=q_max_sent_length,
                                            a_len=a_max_sent_length,
                                            fresh=FLAGS.fresh,
                                            overlap_dict=None)
                print("load data")
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2],
                        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, summary, step, loss, accuracy, score12, score13, see = sess.run(
                        [
                            cnn.train_op, cnn.merged, cnn.global_step,
                            cnn.loss, cnn.accuracy, cnn.score12, cnn.score13,
                            cnn.see
                        ], feed_dict)

                    train_writer.add_summary(summary, i)
                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}"
                        .format(time_str, step, loss, accuracy,
                                np.mean(score12), np.mean(score13)))
                    line = "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}".format(
                        time_str, step, loss, accuracy, np.mean(score12),
                        np.mean(score13))
                    # print loss
                if i % 1 == 0:  # evaluate every epoch
                    predicted_dev = predict(sess, cnn, dev, alphabet,
                                            FLAGS.batch_size,
                                            q_max_sent_length,
                                            a_max_sent_length)
                    map_mrr_dev = evaluation.evaluationBypandas(
                        dev, predicted_dev)
                    predicted_test = predict(sess, cnn, test, alphabet,
                                             FLAGS.batch_size,
                                             q_max_sent_length,
                                             a_max_sent_length)
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted_test)

                    precise_test = evaluation.precision(test, predicted_test)

                    print("test precise : {}".format(precise_test))
                    print("{}:epoch:dev map mrr {}".format(i, map_mrr_dev))
                    print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                    line = " {}:epoch: precise: {}--- map_dev{}-------map_mrr_test{}".format(
                        i, precise_test, map_mrr_dev[0], map_mrr_test)
                    if map_mrr_dev[0] > map_max:
                        map_max = map_mrr_dev[0]

                        save_path = saver.save(sess, out_dir)
                        print("Model saved in file: ", save_path)

                log.write(line + '\n')
                log.flush()
            print('train over')
            saver.restore(sess, out_dir)
            predicted = predict(sess, cnn, train, alphabet, FLAGS.batch_size,
                                q_max_sent_length, a_max_sent_length)
            train['predicted'] = predicted
            map_mrr_train = evaluation.evaluationBypandas(train, predicted)
            predicted_dev = predict(sess, cnn, dev, alphabet, FLAGS.batch_size,
                                    q_max_sent_length, a_max_sent_length)
            dev['predicted'] = predicted_dev
            map_mrr_dev = evaluation.evaluationBypandas(dev, predicted_dev)
            predicted_test = predict(sess, cnn, test, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     a_max_sent_length)
            test['predicted'] = predicted_test
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            ap = evaluation.get_ap(test, predicted_test)
            ap.to_csv('ap_score_qlm_wiki', header=None, sep='\t')
            print('map_mrr train', map_mrr_train)
            print('map_mrr dev', map_mrr_dev)
            print('map_mrr test', map_mrr_test)
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')
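evaluation.evaluationBypandas is not part of the listing; a common pandas implementation of per-question (MAP, MRR), sketched here under the assumption of a 0/1 'flag' relevance column as used in the examples above:

import numpy as np
import pandas as pd

def evaluationBypandas(df, predicted):
    # Sketch: average precision and reciprocal rank per question,
    # then averaged; assumes a 0/1 'flag' relevance column.
    df = df.copy()
    df['score'] = list(predicted)

    def ap_rr(group):
        rel = group.sort_values('score', ascending=False)['flag'].values
        if rel.sum() == 0:
            return pd.Series({'ap': 0.0, 'rr': 0.0})
        prec = rel.cumsum() / (np.arange(len(rel)) + 1.0)
        return pd.Series({'ap': (prec * rel).sum() / rel.sum(),
                          'rr': 1.0 / (np.argmax(rel > 0) + 1)})

    per_q = df.groupby('question').apply(ap_rr)
    return per_q['ap'].mean(), per_q['rr'].mean()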