def main():
    setup_logging()
    logger = logging.getLogger(__name__)
    # Build word and tag vocabularies from the dataset (words lowercased).
    processing_word = get_processing_word(lowercase=True)
    dataset = Dataset('./data/test.txt', processing_word=processing_word)
    create_vocabulary([dataset], './data/words.txt', './data/tags.txt')
    # Build the character vocabulary from the raw (unprocessed) text.
    dataset = Dataset('./data/test.txt')
    create_char_vocabulary([dataset], './data/chars.txt')
def main(_):
    # 1. Load data (X: list of token ids, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_vocabulary(
            word2vec_model_path=FLAGS.word2vec_model_path,
            name_scope="transformer_classification")
        vocab_size = len(vocabulary_word2index)
        print("transformer.vocab_size:", vocab_size)
        train, test, _ = load_data_multilabel_new(
            vocabulary_word2index, training_data_path=FLAGS.training_data_path)
        compare_train_data = WikiQA(word2vec=Word2Vec(),
                                    max_len=FLAGS.max_len_compare)
        compare_train_data.open_file(mode="train")
        compare_test_data = WikiQA(word2vec=Word2Vec(),
                                   max_len=FLAGS.max_len_compare)
        compare_test_data.open_file(mode="valid")
        trainX, trainY = train
        testX, testY = test
        # Pad every sequence with zeros up to the fixed sequence length.
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)
    # 2. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate the model.
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate,
                            FLAGS.batch_size, FLAGS.decay_steps,
                            FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model,
                            FLAGS.d_k, FLAGS.d_v, FLAGS.h, FLAGS.num_layer,
                            FLAGS.is_training, compare_train_data.num_features,
                            di=50, s=compare_train_data.max_len, w=4,
                            l2_reg=0.0004, l2_lambda=FLAGS.l2_lambda)
        print("=" * 50)
        print("List of Variables:")
        for v in tf.trainable_variables():
            print(v.name)
        print("=" * 50)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embeddings
                assign_pretrained_word_embedding(
                    sess, vocabulary_index2word, vocab_size, model,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)
        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)
        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            compare_train_data.reset_index()
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                batch_x1, batch_x2, _, batch_features = compare_train_data.next_batch(
                    batch_size=end - start)
                feed_dict = {
                    model.input_x: trainX[start:end],
                    model.dropout_keep_prob: 0.9,
                    model.x1: batch_x1,
                    model.x2: batch_x2,
                    model.features: batch_features
                }
                feed_dict[model.input_y_label] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [model.loss_val, model.accuracy, model.train_op],
                    feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print(
                        "transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))
                ################## VALIDATION PART ##################
                if FLAGS.batch_size != 0 and (
                        start % (FLAGS.validate_step * FLAGS.batch_size) == 0):
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY,
                                                  compare_test_data, batch_size)
                    print(
                        "transformer.classification.validation.part. previous_eval_loss:",
                        previous_eval_loss, ";current_eval_loss:", eval_loss)
                    if eval_loss > previous_eval_loss:  # if loss is not decreasing
                        # reduce the learning rate by a factor of 0.5
                        print(
                            "transformer.classification.==>validation.part.going to reduce the learning rate."
                        )
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print(
                            "transformer.classification==>validation.part.learning_rate1:",
                            learning_rate1, " ;learning_rate2:", learning_rate2)
                        # print("HierAtten==>Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                    else:  # loss is decreasing
                        if eval_loss < best_eval_loss:
                            print(
                                "transformer.classification==>going to save the model.eval_loss:",
                                eval_loss, ";best_eval_loss:", best_eval_loss)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                    compare_test_data.reset_index()
                #####################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
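# --- Hedged sketch (not from the original source): the validation block above
# implements a reduce-learning-rate-on-plateau policy. The same idea in
# isolation, with hypothetical callables halve_lr / save_checkpoint standing in
# for model.learning_rate_decay_half_op and saver.save:
def reduce_lr_on_plateau(eval_loss, state, halve_lr, save_checkpoint):
    if eval_loss > state['previous_eval_loss']:
        halve_lr()  # validation loss stopped decreasing: halve the learning rate
    elif eval_loss < state['best_eval_loss']:
        save_checkpoint()  # new best validation loss: persist the model
        state['best_eval_loss'] = eval_loss
    state['previous_eval_loss'] = eval_loss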
def main(_):
    # if FLAGS.use_pingyin:
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        FLAGS.traning_data_path, FLAGS.vocab_size, name_scope=FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    train, valid, test, true_label_percent = load_data(
        FLAGS.traning_data_path, vocabulary_word2index,
        vocabulary_label2index, FLAGS.sentence_len, FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    trainX1, trainX2, trainBlueScores, trainY = train
    validX1, validX2, validBlueScores, validY = valid
    testX1, testX2, testBlueScores, testY = test
    length_data_mining_features = len(trainBlueScores[0])
    print("length_data_mining_features:", length_data_mining_features)
    # print some message for debug purpose
    print("model_name:", FLAGS.model_name, ";length of training data:",
          len(trainX1), ";length of validation data:", len(testX1),
          ";true_label_percent:", true_label_percent, ";tokenize_style:",
          FLAGS.tokenize_style, ";vocabulary size:", vocab_size)
    print("train_x1:", trainX1[0], ";train_x2:", trainX2[0])
    print("data mining features.length:", len(trainBlueScores[0]),
          "data_mining_features:", trainBlueScores[0], ";train_y:", trainY[0])
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = DualBilstmCnnModel(
            filter_sizes, FLAGS.num_filters, num_classes, FLAGS.learning_rate,
            FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,
            FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
            FLAGS.hidden_size, FLAGS.is_training, model=FLAGS.model_name,
            similiarity_strategy=FLAGS.similiarity_strategy, top_k=FLAGS.top_k,
            max_pooling_style=FLAGS.max_pooling_style,
            length_data_mining_features=length_data_mining_features)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            if FLAGS.decay_lr_flag:
                # trainX1, trainX2, trainY = shuffle_data(trainX1, trainX2, trainY)
                for i in range(2):  # decay learning rate if necessary.
                    print(i, "Going to decay learning rate by half.")
                    sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if not os.path.exists(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            if FLAGS.use_pretrained_embedding:  # load pre-trained word embedding
                print("===>>>going to use pretrained word embeddings...")
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN,
                                                 FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX1)
        batch_size = FLAGS.batch_size
        iteration = 0
        best_acc = 0.60
        best_f1_score = 0.20
        weights_dict = init_weights_dict(vocabulary_label2index)  # init weights dict.
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            print("Auto.Going to shuffle data")
            trainX1, trainX2, trainBlueScores, trainY = shuffle_data(
                trainX1, trainX2, trainBlueScores, trainY)
            loss, eval_acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                input_x1, input_x2, input_bluescores, input_y = generate_batch_training_data(
                    trainX1, trainX2, trainBlueScores, trainY,
                    number_of_training_data, batch_size)
                # input_x1 = trainX1[start:end]
                # input_x2 = trainX2[start:end]
                # input_bluescores = trainBlueScores[start:end]
                # input_y = trainY[start:end]
                weights = get_weights_for_current_batch(input_y, weights_dict)
                feed_dict = {
                    textCNN.input_x1: input_x1,
                    textCNN.input_x2: input_x2,
                    textCNN.input_bluescores: input_bluescores,
                    textCNN.input_y: input_y,
                    textCNN.weights: np.array(weights),
                    textCNN.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                curr_loss, curr_acc, lr, _, _ = sess.run([
                    textCNN.loss_val, textCNN.accuracy, textCNN.learning_rate,
                    textCNN.update_ema, textCNN.train_op
                ], feed_dict)
                loss, eval_acc, counter = loss + curr_loss, eval_acc + curr_acc, counter + 1
                if counter % 100 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\tLearning rate:%.5f"
                        % (epoch, counter, loss / float(counter),
                           eval_acc / float(counter), lr))
                # middle checkpoint (disabled):
                # if start != 0 and start % (500 * FLAGS.batch_size) == 0:  # eval every 500 batches.
                #     eval_loss, acc, f1_score, precision, recall, _ = do_eval(sess, textCNN, validX1, validX2, validY, iteration)
                #     print("[Validation] Epoch %d Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f" % (epoch, eval_loss, acc, f1_score, precision, recall))
                #     # save model to checkpoint
                #     save_path = FLAGS.ckpt_dir + "model.ckpt"
                #     saver.save(sess, save_path, global_step=epoch)
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_accc, f1_scoree, precision, recall, weights_label = do_eval(
                    sess, textCNN, validX1, validX2, validBlueScores, validY,
                    iteration, vocabulary_index2word)
                weights_dict = get_weights_label_as_standard_dict(weights_label)
                print("label accuracy(used for label weight):==========>>>>", weights_dict)
                print(
                    "[Validation] Epoch %d\tLoss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, eval_accc, f1_scoree, precision, recall))
                # save model to checkpoint on improvement
                if eval_accc * 1.05 > best_acc and f1_scoree > best_f1_score:
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    print("going to save model. eval_f1_score:", f1_scoree,
                          ";previous best f1 score:", best_f1_score,
                          ";eval_acc", str(eval_accc), ";previous best_acc:",
                          str(best_acc))
                    saver.save(sess, save_path, global_step=epoch)
                    best_acc = eval_accc
                    best_f1_score = f1_scoree
            if FLAGS.decay_lr_flag and (epoch != 0 and
                                        (epoch == 1 or epoch == 3 or
                                         epoch == 5 or epoch == 8)):
                # TODO print("Auto.Restoring Variables from Checkpoint.")
                # TODO saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
                for i in range(2):  # decay learning rate if necessary.
print(i, "Going to decay learning rate by half.") sess.run(textCNN.learning_rate_decay_half_op) # 5.最后在测试集上做测试,并报告测试准确率 Test test_loss, acc_t, f1_score_t, precision, recall, weights_label = do_eval( sess, textCNN, testX1, testX2, testBlueScores, testY, iteration, vocabulary_index2word) print( "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:" % (test_loss, acc_t, f1_score_t, precision, recall)) pass
def main(_):
    training_data_path = '/Users/liyangyang/Downloads/bdci/train.txt'
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = \
        data_util.create_vocabulary(training_data_path, 17259, name_scope='cnn')
    vocab_size = len(vocabulary_word2index) + 1
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    print(vocabulary_index2label)
    train, test = data_util.load_data_multilabel(training_data_path,
                                                 vocabulary_word2index,
                                                 vocabulary_label2index, 200)
    trainX, trainY = train
    testX, testY = test
    # trainX = trainX[0:8000]
    # trainY = trainY[0:8000]
    # testX = testX[0:500]
    # testY = testY[0:500]
    # print some message for debug purpose
    print("length of training data:", len(trainX),
          ";length of validation data:", len(testX))
    print("trainX.shape", np.array(trainX).shape)
    print("trainY.shape", np.array(trainY).shape)
    print("trainX[1]:", trainX[1])
    print("trainY[1]:", trainY[1])
    print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            # for i in range(3):  # decay learning rate if necessary.
            #     print(i, "Going to decay learning rate by half.")
            #     sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.dropout_keep_prob: 0.5,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, lr, curr_acc, _ = sess.run([
                    textCNN.loss_val, textCNN.learning_rate, textCNN.accuracy,
                    textCNN.train_op
                ], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 2 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter), lr,
                           acc / float(counter)))
                ########################################################################
                # Mid-epoch evaluation (disabled):
                # if start % (2000 * FLAGS.batch_size) == 0:  # eval every 2000 batches.
                # eval_loss, f1_score, precision, recall = do_eval(sess, textCNN, testX, testY, iteration)
                # print("Epoch %d Validation Loss:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f" % (
                #     epoch, eval_loss, f1_score, precision, recall))
                # # save model to checkpoint
                # save_path = FLAGS.ckpt_dir + "model.ckpt"
                # saver.save(sess, save_path, global_step=epoch)
                ########################################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
                eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY,
                                              iteration, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
        # 5. Finally, evaluate on the test set and report the test loss.
        eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY, iteration,
                                      batch_size)
        print("Test Loss:%.3f" % (eval_loss))
    pass
def main(_):
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        FLAGS.traning_data_path, FLAGS.vocab_size, name_scope=FLAGS.name_scope)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    train, test = load_data_multilabel(FLAGS.traning_data_path,
                                       vocabulary_word2index,
                                       vocabulary_label2index,
                                       FLAGS.sentence_len)
    trainX, trainY = train
    testX, testY = test
    # print some message for debug purpose
    print("length of training data:", len(trainX),
          ";length of validation data:", len(testX))
    print("trainX[0]:", trainX[0])
    print("trainY[0]:", trainY[0])
    train_y_short = get_target_label_short(trainY[0])
    print("train_y_short:", train_y_short)
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            # for i in range(3):  # decay learning rate if necessary.
            #     print(i, "Going to decay learning rate by half.")
            #     sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN,
                                                 FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, counter = 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.dropout_keep_prob: 0.5,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, lr, _, _ = sess.run([
                    textCNN.loss_val, textCNN.learning_rate,
                    textCNN.update_ema, textCNN.train_op
                ], feed_dict)
                loss, counter = loss + curr_loss, counter + 1
                if counter % 50 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f"
                        % (epoch, counter, loss / float(counter), lr))
                ########################################################################
                if start % (2000 * FLAGS.batch_size) == 0:  # eval every 2000 batches.
                    eval_loss, f1_score, precision, recall = do_eval(
                        sess, textCNN, testX, testY, iteration)
                    print(
                        "Epoch %d Validation Loss:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                        % (epoch, eval_loss, f1_score, precision, recall))
                    # save model to checkpoint
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    saver.save(sess, save_path, global_step=epoch)
                ########################################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, f1_score, precision, recall = do_eval(
                    sess, textCNN, testX, testY, iteration)
                print(
                    "Epoch %d Validation Loss:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, f1_score, precision, recall))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5. Finally, evaluate on the test set and report the test loss.
        test_loss, _, _, _ = do_eval(sess, textCNN, testX, testY, iteration)
        print("Test Loss:%.3f" % (test_loss))
    pass
def main(_):
    # 1. Load data (X: list of token ids, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        # vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab'
        # print("end padding & transform to one hot...")
        # x_train, y = data_helpers.load_data_and_labels(FLAGS.data_file)
        # # vocab_processor = learn.preprocessing.VocabularyProcessor(2000, min_frequency=2)
        # # x = np.array(list(vocab_processor.fit_transform(x_train)))
        # # vocab_processor.save(vocab_processor_path)
        # vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path)
        # x = np.array(list(vocab_processor.transform(x_train)))
        # trainX = x[:100000]
        # testX = x[100000:]
        # trainY = y[:100000]
        # testY = y[100000:]
        # vocab_size = len(vocab_processor.vocabulary_)
        # print('vocab_size', vocab_size)
        # print("trainX[0]:", trainX[0]); print("trainY[0]:", trainY[0])
        # # Converting labels to binary vectors
        # print("end padding & transform to one hot...")
        training_data_path = '/Users/liyangyang/Downloads/dwb/new_data/train_set.txt'
        vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = \
            data_util.create_vocabulary(training_data_path, 345325, name_scope='cnn')
        vocab_size = len(vocabulary_word2index) + 1
        print("cnn_model.vocab_size:", vocab_size)
        num_classes = len(vocabulary_index2label)
        print("num_classes:", num_classes)
        print(vocabulary_index2label)
        train, test = data_util.load_data_multilabel(training_data_path,
                                                     vocabulary_word2index,
                                                     vocabulary_label2index,
                                                     5000)
        trainX, trainY = train
        testX, testY = test
        # Use a small subset for quick debugging.
        trainX = trainX[0:1000]
        trainY = trainY[0:1000]
        testX = testX[0:500]
        testY = testY[0:500]
        # print some message for debug purpose
        print("length of training data:", len(trainX),
              ";length of validation data:", len(testX))
        print("trainX.shape", np.array(trainX).shape)
        print("trainY.shape", np.array(trainY).shape)
        print("trainX[1]:", trainX[1])
        print("trainY[1]:", trainY[1])
        print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textRNN = TextRNN(FLAGS.num_classes, FLAGS.learning_rate,
                          FLAGS.batch_size, FLAGS.decay_steps,
                          FLAGS.decay_rate, FLAGS.sequence_length, vocab_size,
                          FLAGS.embed_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint for rnn model.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textRNN)
        curr_epoch = sess.run(textRNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    # print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [textRNN.loss_val, textRNN.accuracy, textRNN.train_op],
                    feed_dict={
                        textRNN.input_x: trainX[start:end],
                        textRNN.input_y: trainY[start:end],
                        textRNN.dropout_keep_prob: 1
                    })
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 1 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textRNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textRNN, testX, testY,
                                              batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5. Finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, textRNN, testX, testY, batch_size)
    pass
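# --- Hedged sketch (not from the original source): do_eval is defined
# elsewhere; a minimal version consistent with the (loss, accuracy) calls
# above, assuming the model exposes loss_val / accuracy and single-label
# inputs (the real helper may differ):
def do_eval_sketch(sess, model, evalX, evalY, batch_size):
    n = len(evalX)
    total_loss, total_acc, batches = 0.0, 0.0, 0
    for start, end in zip(range(0, n, batch_size),
                          range(batch_size, n, batch_size)):
        curr_loss, curr_acc = sess.run(
            [model.loss_val, model.accuracy],
            feed_dict={model.input_x: evalX[start:end],
                       model.input_y: evalY[start:end],
                       model.dropout_keep_prob: 1.0})  # no dropout at eval time
        total_loss, total_acc, batches = total_loss + curr_loss, total_acc + curr_acc, batches + 1
    return total_loss / float(batches), total_acc / float(batches)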
tf.app.flags.DEFINE_boolean("is_training_flag", True,
                            "is training.true:training,false:testing/inference")
tf.app.flags.DEFINE_integer("num_epochs", 15, "number of epochs to run.")
tf.app.flags.DEFINE_integer("validate_every", 1,
                            "Validate every validate_every epochs.")  # run validation every validate_every epochs
tf.app.flags.DEFINE_boolean("use_embedding", False,
                            "whether to use embedding or not.")
tf.app.flags.DEFINE_integer("num_filters", 128, "number of filters")  # 256--->512
tf.app.flags.DEFINE_string("word2vec_model_path", "word2vec-title-desc.bin",
                           "word2vec's vocabulary and vectors")
tf.app.flags.DEFINE_string("name_scope", "cnn", "name scope value.")
tf.app.flags.DEFINE_boolean("multi_label_flag", False,
                            "use multi label or single label.")
filter_sizes = [6, 7, 8]

# NOTE: the restore/loading fragment below assumes `sess` and `saver` are
# already defined in the enclosing scope (it was lifted from a larger script).
print("Restoring Variables from Checkpoint.")
saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
trainX, trainY, testX, testY = None, None, None, None
vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, _ = create_vocabulary(
    FLAGS.traning_data_path, FLAGS.vocab_size, name_scope=FLAGS.name_scope)
vocab_size = len(vocabulary_word2index)
print("cnn_model.vocab_size:", vocab_size)
num_classes = len(vocabulary_label2index)
print("num_classes:", num_classes)
# num_examples, FLAGS.sentence_len = trainX.shape
# print("num_examples of training:", num_examples, ";sentence_len:", FLAGS.sentence_len)
train, test = load_data_multilabel(FLAGS.traning_data_path,
                                   vocabulary_word2index,
                                   vocabulary_label2index,
                                   FLAGS.sentence_len)
trainX, trainY = train
testX, testY = test
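# --- Hedged sketch (not from the original source): how the DEFINE_* flags
# above are typically consumed in TF1.x. tf.app.flags parses argv once, the
# parsed values are exposed on the FLAGS object, and tf.app.run() invokes
# main(_) with any remaining arguments:
FLAGS = tf.app.flags.FLAGS

if __name__ == "__main__":
    tf.app.run()  # parses the flags, then calls main(_)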
def main(_):
    '''Main function: preprocess the data, then iterate over batches to train the model.'''
    print("Data preprocessing stage: ......")
    trainX, trainY, testX, testY = None, None, None, None
    # Load the word-to-index vocabulary.
    vocabulary_word2index, vocabulary_index2word = create_vocabulary(
        word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn")
    vocab_size = len(vocabulary_word2index)
    # Label indices.
    vocabulary_word2index_label, vocabulary_index2word_label = create_vocabulary_label(
        vocabulary_label=FLAGS.training_data_path, name_scope="cnn")
    # Convert the text to vector form and split into train/test sets.
    train, test, _ = load_data_multilabel_new(
        vocabulary_word2index, vocabulary_word2index_label,
        training_data_path=FLAGS.training_data_path)
    trainX, trainY = train
    testX, testY = test
    # Pad short sentences with zeros.
    trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0)
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0)
    print("Data preprocessing finished.....")
    print("Creating session.......")
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        textCNN = TextCNN(filter_size, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.sequence_length, vocab_size, FLAGS.embed_size,
                          FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.is_decay,
                          FLAGS.is_dropout, FLAGS.is_l2)
        # Merge summaries for TensorBoard.
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter(FLAGS.tensorboard_dir, sess.graph)
        # Initialize the saver.
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):  # check whether a saved model exists
            print("Restoring variables from checkpoint")
            # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))  # automatically picks the latest checkpoint
        else:
            print("Initializing variables")
            sess.run(tf.global_variables_initializer())  # initialize all variables
        if FLAGS.use_embedding:
            # Load pre-trained word embeddings.
            assign_pretrained_word_embedding(
                sess, vocabulary_index2word, vocab_size, textCNN,
                word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # Split the training data into batches.
        num_train_data = len(trainX)
        batch_size = FLAGS.batch_size
        index = 0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, num_train_data, batch_size),
                                  range(batch_size, num_train_data, batch_size)):
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.input_y: trainY[start:end],
                    textCNN.dropout_keep_prob: 0.9
                }
                curr_loss, curr_acc, logits, _ = sess.run(
                    [textCNN.loss_val, textCNN.accuracy, textCNN.logits,
                     textCNN.train_op], feed_dict)
                index += 1
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 100 == 0:
                    rs = sess.run(merged, feed_dict)  # record the merged summaries
                    writer.add_summary(rs, index)
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f\tGlobal Step %d"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter), sess.run(textCNN.global_step)))
                    # print("Train Logits{}".format(logits))
            # Increment the epoch counter (note: building this assign op inside
            # the loop adds a node to the graph every epoch; creating it once
            # outside the loop would be cheaper).
            epoch_increment = tf.assign(textCNN.epoch_step,
                                        tf.add(textCNN.epoch_step, tf.constant(1)))
            sess.run(epoch_increment)
            # Validation.
            print("Epoch: {}".format(epoch))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY, batch_size)
                print("Epoch: {}\tValidation loss: {}\tAccuracy: {}".format(
                    epoch, eval_loss, eval_acc))
                # Save the model.
                # save_path = FLAGS.ckpt_dir + "model.ckpt"
                # saver.save(sess, save_path, global_step=epoch)
        print("Computing loss and accuracy on the test set.....")
        test_loss, test_acc = do_eval(sess, textCNN, testX, testY, batch_size)
        print("Test loss: {}\tAccuracy: {}".format(test_loss, test_acc))
def main(_):
    vocab_word2index, accusation_label2index, articles_label2index = create_vocabulary(
        FLAGS.data_path, FLAGS.traning_data_file, FLAGS.vocab_size,
        name_scope=FLAGS.name_scope, test_mode=FLAGS.test_mode)
    deathpenalty_label2index = {True: 1, False: 0}
    lifeimprisonment_label2index = {True: 1, False: 0}
    vocab_size = len(vocab_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    accusation_num_classes = len(accusation_label2index)
    article_num_classes = len(articles_label2index)
    deathpenalty_num_classes = len(deathpenalty_label2index)
    lifeimprisonment_num_classes = len(lifeimprisonment_label2index)
    print("accusation_num_classes:", accusation_num_classes)
    print("article_num_classes:", article_num_classes)
    train, valid, test = load_data_multilabel(
        FLAGS.traning_data_file, FLAGS.valid_data_file, FLAGS.test_data_path,
        vocab_word2index, accusation_label2index, articles_label2index,
        deathpenalty_label2index, lifeimprisonment_label2index,
        FLAGS.sentence_len, name_scope=FLAGS.name_scope,
        test_mode=FLAGS.test_mode)
    train_X, train_Y_accusation, train_Y_article, train_Y_deathpenalty, train_Y_lifeimprisonment, train_Y_imprisonment = train
    valid_X, valid_Y_accusation, valid_Y_article, valid_Y_deathpenalty, valid_Y_lifeimprisonment, valid_Y_imprisonment = valid
    test_X, test_Y_accusation, test_Y_article, test_Y_deathpenalty, test_Y_lifeimprisonment, test_Y_imprisonment = test
    # print some message for debug purpose
    print("length of training data:", len(train_X), ";valid data:",
          len(valid_X), ";test data:", len(test_X))
    print("trainX_[0]:", train_X[0])
    train_Y_accusation_short = get_target_label_short(train_Y_accusation[0])
    train_Y_article_short = get_target_label_short(train_Y_article[0])
    print("train_Y_accusation_short:", train_Y_accusation_short,
          ";train_Y_article_short:", train_Y_article_short)
    print("train_Y_deathpenalty:", train_Y_deathpenalty[0],
          ";train_Y_lifeimprisonment:", train_Y_lifeimprisonment[0],
          ";train_Y_imprisonment:", train_Y_imprisonment[0])
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        model = HierarchicalAttention(
            accusation_num_classes, article_num_classes,
            deathpenalty_num_classes, lifeimprisonment_num_classes,
            FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
            FLAGS.decay_rate, FLAGS.sentence_len, FLAGS.num_sentences,
            vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            for i in range(2):  # decay learning rate if necessary.
print(i, "Going to decay learning rate by half.") sess.run(model.learning_rate_decay_half_op) else: print('Initializing Variables') sess.run(tf.global_variables_initializer()) if FLAGS.use_embedding: #load pre-trained word embedding vocabulary_index2word = { index: word for word, index in vocab_word2index.items() } assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, FLAGS.word2vec_model_path) curr_epoch = sess.run(model.epoch_step) #3.feed data & training number_of_training_data = len(train_X) batch_size = FLAGS.batch_size iteration = 0 for epoch in range(curr_epoch, FLAGS.num_epochs): loss_total, counter = 0.0, 0 for start, end in zip( range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)): iteration = iteration + 1 if epoch == 0 and counter == 0: print("trainX[start:end]:", train_X[start:end], "train_X.shape:", train_X.shape) feed_dict = { model.input_x: train_X[start:end], model.input_y_accusation: train_Y_accusation[start:end], model.input_y_article: train_Y_article[start:end], model.input_y_deathpenalty: train_Y_deathpenalty[start:end], model.input_y_lifeimprisonment: train_Y_lifeimprisonment[start:end], model.input_y_imprisonment: train_Y_imprisonment[start:end], model.dropout_keep_prob: FLAGS.keep_dropout_rate } #model.iter: iteration,model.tst: not FLAGS.is_training current_loss, lr, loss_accusation, loss_article, loss_deathpenalty, loss_lifeimprisonment, loss_imprisonment, l2_loss, _ = sess.run( [ model.loss_val, model.learning_rate, model.loss_accusation, model.loss_article, model.loss_deathpenalty, model.loss_lifeimprisonment, model.loss_imprisonment, model.l2_loss, model.train_op ], feed_dict) #model.update_ema loss_total, counter = loss_total + current_loss, counter + 1 if counter % 100 == 0: print( "Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f" % (epoch, counter, float(loss_total) / float(counter), lr)) if counter % 400 == 0: print( "Loss_accusation:%.3f\tLoss_article:%.3f\tLoss_deathpenalty:%.3f\tLoss_lifeimprisonment:%.3f\tLoss_imprisonment:%.3f\tL2_loss:%.3f\tCurrent_loss:%.3f\t" % (loss_accusation, loss_article, loss_deathpenalty, loss_lifeimprisonment, loss_imprisonment, l2_loss, current_loss)) ######################################################################################################## if start != 0 and start % ( 1000 * FLAGS.batch_size) == 0: # eval every 400 steps. loss, f1_macro_accasation, f1_micro_accasation, f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death, f1_a_life, f1_i_life, score_penalty = \ do_eval(sess, model, valid,iteration,accusation_num_classes,article_num_classes) accasation_score = ( (f1_macro_accasation + f1_micro_accasation) / 2.0) * 100.0 article_score = ( (f1_a_article + f1_i_aritcle) / 2.0) * 100.0 score_all = accasation_score + article_score + score_penalty print( "Epoch %d ValidLoss:%.3f\tMacro_f1_accasation:%.3f\tMicro_f1_accsastion:%.3f\tMacro_f1_article:%.3f Micro_f1_article:%.3f Macro_f1_deathpenalty:%.3f\t" "Micro_f1_deathpenalty:%.3f\tMacro_f1_lifeimprisonment:%.3f\tMicro_f1_lifeimprisonment:%.3f\t" % (epoch, loss, f1_macro_accasation, f1_micro_accasation, f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death, f1_a_life, f1_i_life)) print("1.Accasation Score:", accasation_score, ";2.Article Score:", article_score, ";3.Penalty Score:", score_penalty, ";Score ALL:", score_all) # save model to checkpoint #save_path = FLAGS.ckpt_dir + "model.ckpt" #TODO temp remove==>only save checkpoint for each epoch once. 
                    # saver.save(sess, save_path, global_step=epoch)
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                loss, f1_macro_accasation, f1_micro_accasation, f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death, f1_a_life, f1_i_life, score_penalty = \
                    do_eval(sess, model, valid, iteration,
                            accusation_num_classes, article_num_classes)
                accasation_score = ((f1_macro_accasation + f1_micro_accasation) / 2.0) * 100.0
                article_score = ((f1_a_article + f1_i_aritcle) / 2.0) * 100.0
                score_all = accasation_score + article_score + score_penalty
                print()
                print(
                    "Epoch %d ValidLoss:%.3f\tMacro_f1_accasation:%.3f\tMicro_f1_accsastion:%.3f\tMacro_f1_article:%.3f\tMicro_f1_article:%.3f\tMacro_f1_deathpenalty:%.3f\t"
                    "Micro_f1_deathpenalty:%.3f\tMacro_f1_lifeimprisonment:%.3f\tMicro_f1_lifeimprisonment:%.3f\t"
                    % (epoch, loss, f1_macro_accasation, f1_micro_accasation,
                       f1_a_article, f1_i_aritcle, f1_a_death, f1_i_death,
                       f1_a_life, f1_i_life))
                print("===>1.Accasation Score:", accasation_score,
                      ";2.Article Score:", article_score,
                      ";3.Penalty Score:", score_penalty,
                      ";Score ALL:", score_all)
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5. Finally, evaluate on the test set and report test metrics (disabled):
        # test_loss, macrof1, microf1 = do_eval(sess, textCNN, testX, testY, iteration)
        # print("Test Loss:%.3f\tMacro f1:%.3f\tMicro f1:%.3f" % (test_loss, macrof1, microf1))
        print("training completed...")
    pass
def main(_):
    # 1. Load data with the vocabulary of words and labels.
    compare_test_data = WikiQA(word2vec=Word2Vec(),
                               max_len=FLAGS.max_compare_len)
    compare_test_data.open_file(mode="test")
    vocabulary_word2index, vocabulary_index2word = create_vocabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="transformer_classification")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("transformer_classification.vocab_size:", vocab_size)
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    print("list of total questions:", len(questionid_question_lists))
    test = load_data_predict(vocabulary_word2index, questionid_question_lists)
    print("list of total questions2:", len(test))
    testX = []
    question_id_list = []
    for item in test:
        question_id, question_string_list = item
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. Data preprocessing: sequence padding.
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    testY2 = load_data_predict_y(FLAGS.predict_source_file_y)
    print("list of total questions3:", len(testX2))
    print("end padding...")
    # 3. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. Instantiate Model.
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate,
                            len(testX2), FLAGS.decay_steps, FLAGS.decay_rate,
                            FLAGS.sequence_length, vocab_size,
                            FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k,
                            FLAGS.d_v, FLAGS.h, FLAGS.num_layer,
                            FLAGS.is_training, compare_test_data.num_features,
                            di=50, s=compare_test_data.max_len, w=4,
                            l2_reg=0.0004, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5. Feed data to get the logits.
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_x1, batch_x2, _, batch_features = compare_test_data.next_batch(
            batch_size=number_of_training_data)
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'w',
                                            'utf-8')
        logits = sess.run(model.return_logits,
                          feed_dict={
                              model.input_x: testX2,
                              model.input_y_label: testY2,
                              model.dropout_keep_prob: 1,
                              model.x1: batch_x1,
                              model.x2: batch_x2,
                              model.features: batch_features
                          })  # logits: [batch_size, self.num_classes]
        # 6. Group predictions by question id and compute MAP / MRR.
        answers = {}
        MAP, MRR = 0.0, 0.0
        total = len(logits)
        for i in range(total):
            # prob = logits[i][1] - logits[i][0]
            prob = logits[i][1]
            if question_id_list[i] in answers:
                answers[question_id_list[i]].append((testX[i], testY2[i], prob))
            else:
                answers[question_id_list[i]] = [(testX[i], testY2[i], prob)]
            predict_target_file_f.write(str(logits[i]) + "\n")
        for i in answers.keys():
            p, AP = 0, 0.0
            MRR_check = False
            # Rank this question's candidates by predicted probability.
            answers[i] = sorted(answers[i], key=lambda x: x[-1], reverse=True)
            for idx, (s, label, prob) in enumerate(answers[i]):
                if label == 1:
                    if not MRR_check:
                        MRR += 1.0 / (idx + 1)
                        MRR_check = True
                    p += 1
                    AP += float(p) / (idx + 1)
            AP /= p  # assumes every question has at least one positive candidate
            MAP += AP
        total_q = len(answers.keys())
        MAP /= total_q
        MRR /= total_q
        print("MAP", MAP, ",MRR", MRR)
        predict_target_file_f.close()
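# --- Hedged sketch (not from the original source): the MAP/MRR computation
# above, distilled into a standalone helper for checking the metric logic.
# ranked_labels_per_question holds each question's gold labels sorted by
# predicted score, best first; every question is assumed to have at least one
# positive label:
def mean_ap_and_mrr(ranked_labels_per_question):
    MAP, MRR = 0.0, 0.0
    for ranked_labels in ranked_labels_per_question:
        p, AP, first_hit_seen = 0, 0.0, False
        for idx, label in enumerate(ranked_labels):
            if label == 1:
                if not first_hit_seen:
                    MRR += 1.0 / (idx + 1)  # reciprocal rank of the first hit
                    first_hit_seen = True
                p += 1
                AP += float(p) / (idx + 1)  # precision at this hit
        AP /= p
        MAP += AP
    n = len(ranked_labels_per_question)
    return MAP / n, MRR / n

# Example: one question whose only positive candidate is ranked second:
# mean_ap_and_mrr([[0, 1, 0]]) returns (0.5, 0.5).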