import os
import pickle

import tensorflow as tf
# NOTE: dp, utils, Alphabet, and the create_feed_Dict_* / viterbi_decode /
# predictAccuracyAndWrite helpers are project-local and assumed to be in scope.


def test_step_report(logger, session, PadZeroBegin, max_length, test_path,
                     dropout_keep_prob, step, out_dir, char_alphabet,
                     label_alphabet, word_alphabet, word_column, label_column,
                     char_embedd_dim, max_char_per_word):
    # read test data
    graph = tf.get_default_graph()
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = dp.read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)
    logger.info("Padding test text and lables ...")
    word_index_sentences_test_pad,test_seq_length = utils.padSequence(word_index_sentences_test,max_length, beginZero=PadZeroBegin)
    label_index_sentences_test_pad,_= utils.padSequence(label_index_sentences_test,max_length, beginZero=PadZeroBegin)
    logger.info("Creating character set FROM test set ...")
    char_index_test,_= dp.generate_character_data(word_sentences_test, 
                                char_alphabet=char_alphabet, setType="Test")

    logger.info("Padding Test set ...")
    char_index_test_pad = dp.construct_padded_char(
        char_index_test, char_alphabet, max_sent_length=max_length,
        max_char_per_word=max_char_per_word)
    logger.debug(type(char_index_test_pad))
    logger.debug(type(word_index_sentences_test_pad))
    
    feed_dict = create_feed_Dict_Eval(
        graph, PadZeroBegin=PadZeroBegin, max_length=max_length,
        x_batch=word_index_sentences_test_pad, act_seq_lengths=test_seq_length,
        dropout_keep_prob=dropout_keep_prob, char_batch=char_index_test_pad)
    logit_op = graph.get_tensor_by_name('output/logits:0')
    transition_params_op = graph.get_tensor_by_name('transitions:0')
    logits, transition_params = session.run(
        [logit_op, transition_params_op], feed_dict)
    viterbi_decode(logits=logits, transition_params=transition_params,
                   seq_length=test_seq_length,
                   x_batch=word_index_sentences_test_pad,
                   word_alphabet=word_alphabet, label_alphabet=label_alphabet,
                   prefix_filename="test", beginZero=PadZeroBegin)
    return
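
# NOTE: the viterbi_decode helper called above is defined elsewhere in this
# repo. As a reference only, a minimal sketch of per-sentence CRF decoding
# using the TF 1.x API tf.contrib.crf.viterbi_decode; the function name and
# the end-padding assumption are ours, not the repo's:
def viterbi_decode_sketch(logits, transition_params, seq_length):
    """Decode the best tag path per sentence, trimmed to its true length."""
    decoded = []
    for score, length in zip(logits, seq_length):
        # score: [max_length, num_tags]; keep only the real (unpadded) steps
        viterbi_seq, _ = tf.contrib.crf.viterbi_decode(
            score[:length], transition_params)
        decoded.append(viterbi_seq)
    return decoded
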
def test_step(logger, session, BiLSTM, PadZeroBegin, max_length, test_path,
              dropout_keep_prob, step, out_dir, char_alphabet, label_alphabet,
              word_alphabet, word_column, label_column, char_embedd_dim,
              max_char_per_word):
    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = dp.read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)
    logger.info("Padding test text and lables ...")
    word_index_sentences_test_pad,test_seq_length = utils.padSequence(word_index_sentences_test,max_length, beginZero=PadZeroBegin)
    label_index_sentences_test_pad,_= utils.padSequence(label_index_sentences_test,max_length, beginZero=PadZeroBegin)
    logger.info("Creating character set FROM test set ...")
    char_index_test,_= dp.generate_character_data(word_sentences_test, 
                                char_alphabet=char_alphabet, setType="Test")

    logger.info("Padding Test set ...")
    char_index_test_pad = dp.construct_padded_char(
        char_index_test, char_alphabet, max_sent_length=max_length,
        max_char_per_word=max_char_per_word)
    
    # test summaries
    #test_summary_op = tf.summary.merge([loss_summary])
    #test_summary_dir = os.path.join(out_dir, "summaries", "test")
    #test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)

    feed_dict = create_feed_Dict_Test(
        BiLSTM, PadZeroBegin=PadZeroBegin, max_length=max_length,
        x_batch=word_index_sentences_test_pad,
        y_batch=label_index_sentences_test_pad,
        act_seq_lengths=test_seq_length, dropout_keep_prob=dropout_keep_prob,
        char_batch=char_index_test_pad)
    # logits is a list of numpy.ndarray; transition_params is an ndarray
    (logits, transition_params, embedded_char, embedded_words,
     char_pool_flat, input_x_test) = session.run(
        [BiLSTM.logits, BiLSTM.transition_params, BiLSTM.W_char,
         BiLSTM.W_word, BiLSTM.char_pool_flat, BiLSTM.input_x], feed_dict)
    
    accuracy, accuracy_low_classes = predictAccuracyAndWrite(
        logits, transition_params, test_seq_length,
        label_index_sentences_test_pad, step, word_index_sentences_test_pad,
        word_alphabet, label_alphabet, prefix_filename="test",
        beginZero=PadZeroBegin)

    #test_summary_writer.add_summary(summaries, step)
    print("step {},  accuracy on test set {:g}, accuracy for classes except Others: {:g}".format(step,accuracy,accuracy_low_classes))

    checkpoint_dir_test = os.path.abspath(os.path.join(out_dir, "checkpoints_test"))

    if not os.path.exists(checkpoint_dir_test):
        os.makedirs(checkpoint_dir_test)
    fname_data = "input_x_test_" + str(step) + ".pkl"
    fname_conv_out = "char_pool_flat_" + str(step) + ".pkl"
    fname_seqLength = "act_seq_len_" + str(step) + ".pkl"
    fname_embedded_char = "embedded_char_" + str(step) + ".pkl"
    fname_embedded_words = "embedded_words_" + str(step) + ".pkl"
    # Use context managers so every pickle file is flushed and closed.
    for fname, obj in [(fname_data, input_x_test),
                       (fname_conv_out, char_pool_flat),
                       (fname_seqLength, test_seq_length),
                       (fname_embedded_char, embedded_char),
                       (fname_embedded_words, embedded_words)]:
        with open(os.path.join(checkpoint_dir_test, fname), 'wb') as f:
            pickle.dump(obj, f)
    print("Saved test data checkpoint to {}\n".format(checkpoint_dir_test))
    return accuracy,accuracy_low_classes
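
# NOTE: predictAccuracyAndWrite is defined elsewhere in this repo. As a
# reference only, a minimal numpy sketch of the masked token-level accuracy it
# presumably computes (padding positions excluded via the true sequence
# lengths; the end-padding assumption and all names here are ours):
import numpy as np

def masked_token_accuracy_sketch(pred_tags, gold_tags, seq_lengths):
    """Token accuracy over the real (unpadded) positions of each sentence."""
    correct = total = 0
    for pred, gold, length in zip(pred_tags, gold_tags, seq_lengths):
        correct += int(np.sum(np.asarray(pred[:length]) == np.asarray(gold[:length])))
        total += int(length)
    return correct / max(total, 1)
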

# Example 3: training/dev data preparation (separate snippet).
logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))
# get maximum length : this is mainly for padding.
max_length_train = utils.get_max_length(word_sentences_train)
max_length_dev = utils.get_max_length(word_sentences_dev)
#max_length_test = utils.get_max_length(word_sentences_test)
max_length = min(dp.MAX_LENGTH, max(max_length_train, max_length_dev))
logger.info("Maximum length (i.e max words ) of training set is %d" %
            max_length_train)
logger.info("Maximum length (i.e max words ) of dev set is %d" %
            max_length_dev)
#logger.info("Maximum length (i.e max words ) of test set is %d" % max_length_test)
logger.info("Maximum length (i.e max words ) used for training is %d" %
            max_length)

logger.info("Padding training text and lables ...")
word_index_sentences_train_pad, train_seq_length = utils.padSequence(
    word_index_sentences_train, max_length, beginZero=FLAGS.PadZeroBegin)
label_index_sentences_train_pad, _ = utils.padSequence(
    label_index_sentences_train, max_length, beginZero=FLAGS.PadZeroBegin)

logger.info("Padding dev text and lables ...")
word_index_sentences_dev_pad, dev_seq_length = utils.padSequence(
    word_index_sentences_dev, max_length, beginZero=FLAGS.PadZeroBegin)
label_index_sentences_dev_pad, _ = utils.padSequence(
    label_index_sentences_dev, max_length, beginZero=FLAGS.PadZeroBegin)

logger.info("Creating character set FROM training set ...")
char_alphabet = Alphabet('character')
char_index_train, max_char_per_word_train = dp.generate_character_data(
    word_sentences_train, char_alphabet=char_alphabet, setType="Train")
# Close the character alphabet. We close it because the character embedding
# table is going to be randomly initialized.
char_alphabet.close()
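
# NOTE: utils.padSequence is project code not shown here. As a reference only,
# a minimal numpy sketch of the padding contract used above (index 0 is the pad
# symbol, beginZero selects pre- vs. post-padding, and the function returns the
# padded matrix plus each sentence's true length; all names here are ours):
import numpy as np

def pad_sequence_sketch(index_sentences, max_length, beginZero=True):
    padded = np.zeros((len(index_sentences), max_length), dtype=np.int32)
    lengths = []
    for i, sent in enumerate(index_sentences):
        sent = sent[:max_length]  # truncate overlong sentences
        lengths.append(len(sent))
        if beginZero:
            padded[i, max_length - len(sent):] = sent  # pad at the beginning
        else:
            padded[i, :len(sent)] = sent  # pad at the end
    return padded, np.asarray(lengths, dtype=np.int32)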