Example No. 1
    def input_fn(self, mode):
        """Build the tf.data.Dataset of (features, labels) for the given mode (e.g. 'train')."""
        infile = self._DATA_PATH[mode]
        data_generator = None
        # Either it is a CSV file with the original text, in which case we first embed it with the BERT encoder
        if infile.endswith('.csv'):
            data = pd.read_csv(infile)
            data.sort_values(by=['case_id', 'candidate_id'],
                             inplace=True,
                             ascending=[True, True])
            data['embeddings'] = [x for x in BertEncoder.encode(data)]

            def data_generator_from_dataframe():
                # Yield one (features_dict, labels) pair per case; each row of
                # the chunk is one candidate for that case.
                for case_id in data['case_id'].unique():
                    chunk = data[data['case_id'] == case_id]
                    vecs = chunk['embeddings']
                    labels = chunk['candidate_is_noticed'].to_numpy()
                    matrix = np.expand_dims(np.vstack(vecs), axis=-1)
                    features_dict = {
                        str(k + 1): matrix[:, k]
                        for k in range(TFRanker._NUM_FEATURES)
                    }
                    yield features_dict, labels

            data_generator = data_generator_from_dataframe

        # Or it is a LIBSVM-format file, for which we have a reader.
        # Use the modified reader: the original one was shuffling the input data!
        # Generate these files with text_2_libsvm.py.
        if infile.endswith('.libsvm'):
            data_generator = libsvm_generator(infile, TFRanker._NUM_FEATURES,
                                              TFRanker._LIST_SIZE)
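
        # Both branches are expected to yield (features_dict, labels) pairs with
        # the same structure: features_dict maps '1'..str(_NUM_FEATURES) to arrays
        # of shape [_LIST_SIZE, 1], and labels has shape [_LIST_SIZE] (see the
        # commented-out from_generator signature below).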

        #
        # dataset = tf.data.Dataset.from_generator(
        #     data_generator,
        #     output_types=({str(k): tf.float32 for k in range(1,TFRanker._NUM_FEATURES+1)}, tf.float32),
        #     output_shapes=({str(k): tf.TensorShape([TFRanker._LIST_SIZE, 1]) for k in range(1,TFRanker._NUM_FEATURES+1)}, tf.TensorShape([TFRanker._LIST_SIZE]))
        # )

        # The datasets are small, so load everything into memory once.
        all_data = list(data_generator())
        X = {}
        for i in range(TFRanker._NUM_FEATURES):
            X[str(i + 1)] = np.stack([x[0][str(i + 1)] for x in all_data],
                                     axis=0)
        Y = np.stack([x[1] for x in all_data])
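        # Stacking assumes every case has the same number of candidates
        # (_LIST_SIZE): X[str(i + 1)] ends up with shape [n_cases, _LIST_SIZE, 1]
        # and Y with shape [n_cases, _LIST_SIZE].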

        dataset = tf.data.Dataset.from_tensor_slices((X, Y))

        if mode == 'train':
            dataset = dataset.shuffle(300).repeat().batch(self._BATCH_SIZE)
        else:
            dataset = dataset.batch(self._BATCH_SIZE)

        # Queue up a number of batches on the CPU side
        dataset = dataset.prefetch(8)

        # # Queue up batches asynchronously onto the GPU
        # # As long as there is a pool of batches CPU side a GPU prefetch of 1 is sufficient.
        # gpu = [x.name for x in device_lib.list_local_devices() if x.device_type == 'GPU']
        # if len(gpu) == 1:
        #     dataset.apply(tf.data.experimental.prefetch_to_device(gpu[0], buffer_size=8))

        #return dataset.make_one_shot_iterator().get_next()
        return dataset
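
    # A minimal sketch of how this input_fn could be wired into an estimator
    # (illustrative only; the rest of the TFRanker class, including its
    # model_fn, is not shown in this snippet):
    #
    #     ranker = TFRanker()
    #     estimator = tf.estimator.Estimator(model_fn=ranker.model_fn)
    #     estimator.train(input_fn=lambda: ranker.input_fn('train'), steps=1000)
    #     estimator.evaluate(input_fn=lambda: ranker.input_fn('eval'))
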
def test():
    with open('config_judge.json', encoding='utf-8') as infile:
        config = json.load(infile)

    int2bool = {1: True, 0: False}
    loop_index = 0  # single evaluation pass here, so a fixed index for the result file name

    sentiment_words_path = config["train_config"]["SENTIMENT_WORDS_PATH"]
    batch_size = config["train_config"]["BATCH_SIZE"]
    is_loop = int2bool[config["train_config"]["Is_LOOP"]]
    is_sort = int2bool[config["train_config"]["IS_SORT"]]
    dropout_rate = config["train_config"]["DROPOUT_RATE"]
    nb_classes = config["train_config"]["NB_CLASSES"]
    attention_dim = config["train_config"]["ATTENTION_DIM"]
    nb_hops = config["train_config"]["NB_HOPS"]
    drop_template_path = config["train_config"]["DROP_JUDGE_TEMPLATE_PATH"]
    use_bert = int2bool[config["train_config"]["USE_BERT"]]
    optimizer = config["train_config"]["OPTIMIZER"]
    learning_rate = config["train_config"]["LEARNING_RATE"]
    grad_clipper = config["train_config"]["GRAD_CLIPPER"]
    drop_judge_dev_path = config["train_config"]["DROP_JUDGE_DEV_PATH"]
    best_path = config["train_config"]["BEST_PATH"]
    question2targets_path = config["train_config"]["QUESTION2TARGETS_PATH"]
    use_extra_feature = config["train_config"]["USE_EXTRA_FEATURE"]
    ner_dict_size = config["train_config"]["NER_DICT_SIZE"]
    pos_dict_size = config["train_config"]["POS_DICT_SIZE"]
    extra_feature_dim = config["train_config"]["EXTRA_FEATURE_DIM"]
    ner_dict_path = config["train_config"]["NER_DICT_PATH"]
    pos_dict_path = config["train_config"]["POS_DICT_PATH"]
    rnn_dim = config["train_config"]["RNN_DIM"]
    lambda_l2 = config["train_config"]["LAMBDA_L2"]
    ans_max_len = config["train_config"]["ANS_MAX_LEN"]
    que_max_len = config["train_config"]["QUE_MAX_LEN"]
    sentiment_polarity_multiple = config["train_config"]["POLARITY_MULTIPLE"]
    use_w2v = True
    if use_bert:
        use_w2v = False

    char_voc_path = config["w2v_config"]["CHAR_VOC_PATH"]
    char_embedding_matrix_path = config["w2v_config"][
        "CHAR_EMBEDDING_MATRIX_PATH"]
    word_voc_path = config["w2v_config"]["WORD_VOC_PATH"]
    word_embedding_matrix_path = config["w2v_config"][
        "WORD_EMBEDDING_MATRIX_PATH"]

    bert_model_path = config["bert_config"]["BERT_MODEL_PATH"]
    bert_config_file = config["bert_config"]["CONFIG_FILE"]
    bert_checkpoint_path = config["bert_config"]["INIT_CHECKPOINT"]
    bert_voc_path = config["bert_config"]["VOC_FILE"]
    sen2id_path = config["bert_config"]["SEN2ID_PATH"]

    judge_samples, _, _ = read_file(drop_judge_dev_path)
    judge_template, _, _ = read_file(drop_template_path)
    max_sequence_len = max(
        max([len(sample['question']) for sample in judge_samples]),
        max([len(sample['answer']) for sample in judge_samples]))
    with open(char_voc_path, 'rb') as infile:
        char_voc = pickle.load(infile)
    with open(word_voc_path, 'rb') as infile:
        word_voc = pickle.load(infile)

    bert_encoder = BertEncoder(model_root=bert_model_path,
                               bert_config_file=bert_config_file,
                               init_checkpoint=bert_checkpoint_path,
                               vocab_file=bert_voc_path,
                               max_sequence_len=max_sequence_len,
                               embedding_batch=3,
                               embedding_matrix_path=None,
                               sen2id_path=sen2id_path,
                               vec_dim=768)

    instances_judge_dev = make_instances(judge_samples,
                                         char_voc,
                                         word_voc,
                                         sentiment_words_path,
                                         ner_dict_path=ner_dict_path,
                                         pos_dict_path=pos_dict_path,
                                         use_extra_feature=use_extra_feature,
                                         question2targets=question2targets,
                                         is_training=False,
                                         need_augment=False)

    instances_judge_dev_with_match_result(instances_judge_dev)

    data_stream_judge_dev = DataStream(instances=instances_judge_dev,
                                       is_shuffle=False,
                                       is_loop=is_loop,
                                       batch_size=batch_size,
                                       ans_max_len=ans_max_len,
                                       que_max_len=que_max_len,
                                       use_bert=use_bert,
                                       bert_encoder=bert_encoder,
                                       is_sort=is_sort)
    with tf.Graph().as_default():
        with tf.variable_scope("Model",
                               reuse=False,
                               initializer=tf.glorot_uniform_initializer()):
            answer_understander_dev = AnswerUnderstander(
                use_bert=use_bert,
                use_w2v=use_w2v,
                rnn_unit='lstm',
                dropout_rate=dropout_rate,
                optimizer=optimizer,
                learning_rate=learning_rate,
                grad_clipper=grad_clipper,
                global_step=None,
                attention_dim=attention_dim,
                nb_hops=nb_hops,
                rnn_dim=rnn_dim,
                lambda_l2=lambda_l2,
                is_training=False,
                sentiment_polarity_multiple=sentiment_polarity_multiple,
                nb_classes=nb_classes,
                use_extra_feature=use_extra_feature,
                ner_dict_size=ner_dict_size,
                pos_dict_size=pos_dict_size,
                extra_feature_dim=extra_feature_dim,
                ans_max_len=ans_max_len,
                que_max_len=que_max_len,
                char_w2v_embedding_matrix_path=char_embedding_matrix_path,
                word_w2v_embedding_matrix_path=word_embedding_matrix_path)
        saver = tf.train.Saver()
        sess = tf.Session()
        initializer = tf.global_variables_initializer()
        sess.run(initializer)
        saver.restore(sess, best_path)
        judge_acc = evaluation(sess, answer_understander_dev,
                               data_stream_judge_dev,
                               'result_{}.txt'.format(loop_index))
        print("the final judge accuracy:{}".format(judge_acc))
        return judge_acc
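
# Hypothetical entry point for running the evaluation directly; assumes
# config_judge.json and the referenced model/vocabulary files are available.
if __name__ == '__main__':
    test()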
Example No. 4
import pandas as pd

# BertEncoder (the project's BERT embedding wrapper) is assumed to be imported
# from the project code base.

INPUT_TRAIN = 'data/text/train_summarized_200.csv'
INPUT_EVAL = 'data/text/eval_summarized_200.csv'
INPUT_TEST = 'data/text/test_summarized_200.csv'

OUTPUT = {
    INPUT_TRAIN: 'data/libsvm/train_features.libsvm',
    INPUT_EVAL: 'data/libsvm/eval_features.libsvm',
    INPUT_TEST: 'data/libsvm/test_features.libsvm'
}

INPUTS = [INPUT_TEST]

for input_path in INPUTS:
    data = pd.read_csv(input_path)
    data.sort_values(by=['case_id', 'candidate_id'],
                     inplace=True,
                     ascending=[True, True])
    data['embeddings'] = [x for x in BertEncoder.encode(data)]

    with open(OUTPUT[input_path], 'w') as output:
        for _, sample in data.iterrows():
            vec = sample['embeddings']
            label = int(sample['candidate_is_noticed'])
            case_id = sample['case_id']
            candidate_id = sample['candidate_id']
            features = ' '.join(
                ['{}:{}'.format(i + 1, v) for i, v in enumerate(vec)])
            output.write('{} qid:{} {} cid:{}\n'.format(
                label, case_id, features, candidate_id))
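
# Each emitted line follows an extended LIBSVM ranking format, roughly:
#   <label> qid:<case_id> 1:<v_1> 2:<v_2> ... N:<v_N> cid:<candidate_id>
# where N is the embedding dimension (typically 768 for BERT-base vectors) and
# <label> is the 0/1 value of candidate_is_noticed.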