import os

import numpy as np
import tensorflow as tf

# FLAGS, filter_sizes and the project helpers (load_vocabulary, load_test_data,
# DualBilstmCnnModel, ...) are assumed to be defined elsewhere in this module.

def predict_bilstm(inpath,tokenize_style,ckpt_dir,model_name,name_scope,graph):
    logits_result=None
    with graph.as_default():
        vocabulary_word2index, vocabulary_index2label= load_vocabulary(FLAGS.traning_data_path,FLAGS.vocab_size,
                                                              name_scope=name_scope,tokenize_style=tokenize_style)
        vocab_size = len(vocabulary_word2index)
        print(model_name+".vocab_size:",vocab_size)
        num_classes=len(vocabulary_index2label)
        print("num_classes:",num_classes)
        lineno_list, X1, X2=load_test_data(inpath, vocabulary_word2index, FLAGS.sentence_len, tokenize_style=tokenize_style)
        #2.create session.
        config=tf.ConfigProto()
        config.gpu_options.allow_growth=True
        with tf.Session(config=config) as sess:
            #Instantiate Model
            model=DualBilstmCnnModel(filter_sizes,FLAGS.num_filters,num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                            FLAGS.decay_rate,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training,model=model_name,
                                       similiarity_strategy=FLAGS.similiarity_strategy,top_k=FLAGS.top_k,max_pooling_style=FLAGS.max_pooling_style)
            #Initialize Saver
            saver=tf.train.Saver()
            if os.path.exists(ckpt_dir+"checkpoint"):
                print(model_name+".Restoring Variables from Checkpoint.")
                saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
            else:
                print(model_name+".Not able to find Checkpoint. Going to stop now...")
                raise ValueError("checkpoint not found in "+ckpt_dir)
            #3.feed data & predict
            number_of_test_data=len(X1)
            print(model_name+".number_of_test_data:",number_of_test_data)
            batch_size=FLAGS.batch_size
            iteration=0
            divide_equally=(number_of_test_data%batch_size==0)
            steps=0
            if divide_equally:
                steps=int(number_of_test_data/batch_size)
            else:
                steps=int(number_of_test_data/batch_size)+1

            print("steps:",steps)
            start=0
            end=0
            logits_result=np.zeros((number_of_test_data,len(vocabulary_index2label)))
            for i in range(steps):
                print("i:",i)
                start=i*batch_size
                if i!=steps-1 or divide_equally:
                    end=(i+1)*batch_size
                else:
                    end=number_of_test_data # last batch may be smaller than batch_size
                print("start:",start,";end:",end)
                feed_dict = {model.input_x1: X1[start:end],model.input_x2: X2[start:end],
                             model.dropout_keep_prob: FLAGS.dropout_keep_prob,
                             model.iter: iteration,model.tst: not FLAGS.is_training}
                logits_batch=sess.run(model.logits,feed_dict) #[batch_size,num_classes]
                logits_result[start:end]=logits_batch

        print("logits_result:",logits_result)
        return logits_result,lineno_list,vocabulary_index2label
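
# Hedged usage sketch (not part of the original example): the per-model graph
# argument lets several checkpoints be restored side by side, e.g. to average
# their logits. The paths, model names and scopes below are hypothetical.
def predict_bilstm_ensemble(inpath):
    logits_a, lineno_list, index2label = predict_bilstm(
        inpath, "word", "./ckpt_bilstm/", "dual_bilstm", "bilstm_scope", tf.Graph())
    logits_b, _, _ = predict_bilstm(
        inpath, "word", "./ckpt_cnn/", "dual_cnn", "cnn_scope", tf.Graph())
    avg_logits = (logits_a + logits_b) / 2.0  # simple logit averaging
    labels = [index2label[int(np.argmax(row))] for row in avg_logits]
    return lineno_list, labels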
Example #2
def predict_bilstm(inpath, outpath):
    vocabulary_word2index, vocabulary_index2label= load_vocabulary(FLAGS.traning_data_path,FLAGS.vocab_size,
                                                          name_scope=FLAGS.name_scope,tokenize_style=FLAGS.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:",vocab_size)
    num_classes=len(vocabulary_index2label)
    print("num_classes:",num_classes)
    lineno_list, X1, X2=load_test_data(inpath, vocabulary_word2index, FLAGS.sentence_len, tokenize_style=FLAGS.tokenize_style)
    #2.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        textCNN=DualBilstmCnnModel(filter_sizes,FLAGS.num_filters,num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                        FLAGS.decay_rate,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.hidden_size,FLAGS.is_training,model=FLAGS.model,
                                   similiarity_strategy=FLAGS.similiarity_strategy,top_k=FLAGS.top_k,max_pooling_style=FLAGS.max_pooling_style)
        #Initialize Saver
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Not able to find Checkpoint. Going to stop now...")
            raise ValueError("checkpoint not found in "+FLAGS.ckpt_dir)
        #3.feed data & predict
        number_of_test_data=len(X1)
        print("number_of_test_data:",number_of_test_data)
        batch_size=FLAGS.batch_size
        iteration=0
        file_object=open(outpath, 'a')
        divide_equally=(number_of_test_data%batch_size==0)
        steps=0
        if divide_equally:
            steps=int(number_of_test_data/batch_size)
        else:
            steps=int(number_of_test_data/batch_size)+1

        print("steps:",steps)
        start=0
        end=0
        for i in range(steps):
            print("i:",i)
            start=i*batch_size
            if i!=steps-1 or divide_equally:
                end=(i+1)*batch_size
            else:
                end=number_of_test_data # last batch may be smaller than batch_size
            print("start:",start,";end:",end)
            feed_dict = {textCNN.input_x1: X1[start:end],textCNN.input_x2: X2[start:end],
                         textCNN.dropout_keep_prob: FLAGS.dropout_keep_prob,
                         textCNN.iter: iteration,textCNN.tst: not FLAGS.is_training}
            logits=sess.run(textCNN.logits,feed_dict) #[batch_size,num_classes]
            label_list=get_label_by_logits(logits,vocabulary_index2label)
            write_predict_result(lineno_list[start:end],label_list,file_object)
        file_object.close()
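
# The two helpers used above are not shown in this snippet; the minimal sketch
# below only illustrates the assumed behaviour (arg-max over the logits, one
# "lineno<TAB>label" row per prediction) and is not the project's own code.
def get_label_by_logits(logits, vocabulary_index2label):
    return [vocabulary_index2label[int(np.argmax(row))] for row in logits]

def write_predict_result(lineno_list, label_list, file_object):
    for lineno, label in zip(lineno_list, label_list):
        file_object.write(str(lineno) + "\t" + label + "\n")
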
def main(_):
    #if FLAGS.use_pingyin:
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        FLAGS.traning_data_path,
        FLAGS.vocab_size,
        name_scope=FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    train, valid, test, true_label_percent = load_data(
        FLAGS.traning_data_path,
        vocabulary_word2index,
        vocabulary_label2index,
        FLAGS.sentence_len,
        FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    trainX1, trainX2, trainBlueScores, trainY = train
    validX1, validX2, validBlueScores, validY = valid
    testX1, testX2, testBlueScores, testY = test
    length_data_mining_features = len(trainBlueScores[0])
    print("length_data_mining_features:", length_data_mining_features)
    #print some message for debug purpose
    print("model_name:", FLAGS.model_name, ";length of training data:",
          len(trainX1), ";length of validation data:", len(testX1),
          ";true_label_percent:", true_label_percent, ";tokenize_style:",
          FLAGS.tokenize_style, ";vocabulary size:", vocab_size)
    print("train_x1:", trainX1[0], ";train_x2:", trainX2[0])
    print("data mining features.length:", len(trainBlueScores[0]),
          "data_mining_features:", trainBlueScores[0], ";train_y:", trainY[0])
    #2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        textCNN = DualBilstmCnnModel(
            filter_sizes,
            FLAGS.num_filters,
            num_classes,
            FLAGS.learning_rate,
            FLAGS.batch_size,
            FLAGS.decay_steps,
            FLAGS.decay_rate,
            FLAGS.sentence_len,
            vocab_size,
            FLAGS.embed_size,
            FLAGS.hidden_size,
            FLAGS.is_training,
            model=FLAGS.model_name,
            similiarity_strategy=FLAGS.similiarity_strategy,
            top_k=FLAGS.top_k,
            max_pooling_style=FLAGS.max_pooling_style,
            length_data_mining_features=length_data_mining_features)
        #Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            if FLAGS.decay_lr_flag:
                #trainX1, trainX2, trainY = shuffle_data(trainX1, trainX2, trainY)
                for i in range(2):  # decay learning rate if necessary.
                    print(i, "Going to decay learning rate by half.")
                    sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if not os.path.exists(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)

            if FLAGS.use_pretrained_embedding:  #load pre-trained word embedding
                print("===>>>going to use pretrained word embeddings...")
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN,
                                                 FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        #3.feed data & training
        number_of_training_data = len(trainX1)
        batch_size = FLAGS.batch_size
        iteration = 0
        best_acc = 0.60
        best_f1_score = 0.20
        weights_dict = init_weights_dict(
            vocabulary_label2index)  #init weights dict.
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            print("Auto.Going to shuffle data")
            trainX1, trainX2, trainBlueScores, trainY = shuffle_data(
                trainX1, trainX2, trainBlueScores, trainY)
            loss, eval_acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                input_x1, input_x2, input_bluescores, input_y = generate_batch_training_data(
                    trainX1, trainX2, trainBlueScores, trainY,
                    number_of_training_data, batch_size)
                #input_x1=trainX1[start:end]
                #input_x2=trainX2[start:end]
                #input_bluescores=trainBlueScores[start:end]
                #input_y=trainY[start:end]
                weights = get_weights_for_current_batch(input_y, weights_dict)

                feed_dict = {
                    textCNN.input_x1: input_x1,
                    textCNN.input_x2: input_x2,
                    textCNN.input_bluescores: input_bluescores,
                    textCNN.input_y: input_y,
                    textCNN.weights: np.array(weights),
                    textCNN.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                curr_loss, curr_acc, lr, _, _ = sess.run([
                    textCNN.loss_val, textCNN.accuracy, textCNN.learning_rate,
                    textCNN.update_ema, textCNN.train_op
                ], feed_dict)
                loss, eval_acc, counter = loss + curr_loss, eval_acc + curr_acc, counter + 1
                if counter % 100 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\tLearning rate:%.5f"
                        % (epoch, counter, loss / float(counter),
                           eval_acc / float(counter), lr))
                #middle checkpoint
                #if start!=0 and start%(500*FLAGS.batch_size)==0: # eval every 3000 steps.
                #eval_loss, acc,f1_score, precision, recall,_ = do_eval(sess, textCNN, validX1, validX2, validY,iteration)
                #print("【Validation】Epoch %d Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f" % (epoch, acc,eval_loss, f1_score, precision, recall))
                # save model to checkpoint
                #save_path = FLAGS.ckpt_dir + "model.ckpt"
                #saver.save(sess, save_path, global_step=epoch)
            #epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))

            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_accc, f1_scoree, precision, recall, weights_label = do_eval(
                    sess, textCNN, validX1, validX2, validBlueScores, validY,
                    iteration, vocabulary_index2word)
                weights_dict = get_weights_label_as_standard_dict(
                    weights_label)
                print("label accuracy(used for label weight):==========>>>>",
                      weights_dict)
                print(
                    "【Validation】Epoch %d\t Loss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, eval_accc, f1_scoree, precision,
                       recall))
                #save model to checkpoint
                if eval_accc * 1.05 > best_acc and f1_scoree > best_f1_score:
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    print("going to save model. eval_f1_score:", f1_scoree,
                          ";previous best f1 score:", best_f1_score,
                          ";eval_acc", str(eval_accc), ";previous best_acc:",
                          str(best_acc))
                    saver.save(sess, save_path, global_step=epoch)
                    best_acc = eval_accc
                    best_f1_score = f1_scoree

                if FLAGS.decay_lr_flag and (epoch != 0 and
                                            (epoch == 1 or epoch == 3
                                             or epoch == 5 or epoch == 8)):
                    #TODO print("Auto.Restoring Variables from Checkpoint.")
                    #TODO saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))

                    for i in range(2):  # decay learning rate if necessary.
                        print(i, "Going to decay learning rate by half.")
                        sess.run(textCNN.learning_rate_decay_half_op)

        # 5.Finally, evaluate on the test set and report the test accuracy
        test_loss, acc_t, f1_score_t, precision, recall, weights_label = do_eval(
            sess, textCNN, testX1, testX2, testBlueScores, testY, iteration,
            vocabulary_index2word)
        print(
            "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:"
            % (test_loss, acc_t, f1_score_t, precision, recall))
    pass
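
# Hedged entry-point sketch (assumed, not shown in the original snippet): with
# tf.app.flags defining FLAGS and main(_) above, a TensorFlow 1.x script of this
# kind is normally started like this.
if __name__ == "__main__":
    tf.app.run()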