Example #1
def init_data_loader(self, config, query_map_file):
    # bert vocab file path
    vocab_file_path = os.path.join(
        config.get("bert_pretrained_model_path"), config.get("vocab_file"))
    # role slot list file path
    slot_file = os.path.join(
        event_config.get("slot_list_root_path"),
        event_config.get("bert_slot_complete_file_name_role"))
    # event schema file path
    schema_file = os.path.join(event_config.get("data_dir"),
                               event_config.get("event_schema"))
    # query_map_file = os.path.join(event_config.get(
    #         "slot_list_root_path"), event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file,
                                      schema_file, query_map_file)
    return data_loader
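For reference, the query_map_file argument can be built from the same config used for the other paths, mirroring the commented-out lines above and the script in Example #4; a minimal sketch, assuming the config keys shown there and a hypothetical trainer instance of the enclosing class:

import os
from configs.event_config import event_config

# assemble the query map path from the config, then hand it to the loader
query_map_file = os.path.join(event_config.get("slot_list_root_path"),
                              event_config.get("query_map_file"))
# data_loader = trainer.init_data_loader(event_config, query_map_file)  # trainer is hypothetical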
Example #2
def gen_type_classification_data():
    """
    generate event type classification data of index_type_fold_data_{}
    """
    # bert vocab file path
    vocab_file_path = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("vocab_file"))
    # bert config file path
    bert_config_file = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_config_path"))
    # event type list file path
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    data_loader = EventTypeClassificationPrepare(vocab_file_path, 512,
                                                 event_type_file)
    # train file
    train_file = os.path.join(event_config.get("data_dir"),
                              event_config.get("event_data_file_train"))
    # eval file
    eval_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_eval"))
    data_loader.k_fold_split_data(train_file, eval_file, True)
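The index_type_fold_data_{} directories named in the docstring are what the classification training code reads back (see Example #7); a minimal sketch of inspecting one generated fold, assuming the default layout used there:

import numpy as np

fold = 0
# arrays written by k_fold_split_data and consumed by run_event_classification (Example #7)
token_ids = np.load("data/index_type_fold_data_{}/token_ids_train.npy".format(fold), allow_pickle=True)
labels = np.load("data/index_type_fold_data_{}/labels_train.npy".format(fold), allow_pickle=True)
print(len(token_ids), len(labels))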
Example #3
def gen_type_classification_data():
    """
    generate event type classification data of index_type_fold_data_{}
    """
    # bert vocab file path
    # chinese_roberta_wwm_ext_L-12_H-1024_A-12_large/vocab.txt
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"), event_config.get("vocab_file"))
    # event type list file path
    # slot_pattern/vocab_all_event_type_label_map.txt
    event_type_file = os.path.join(event_config.get("slot_list_root_path"), event_config.get("event_type_file"))
    data_loader = EventTypeClassificationPrepare(vocab_file_path, 512, event_type_file)
    # train file
    # data/train.json
    train_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_train"))
    # eval file
    # data/dev.json
    eval_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_eval"))
    data_loader.k_fold_split_data(train_file, eval_file, True)
Example #4
import os
import numpy as np
from data_processing.event_prepare_data import EventTypeClassificationPrepare, EventRolePrepareMRC
from configs.event_config import event_config

if __name__ == "__main__":
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"), event_config.get("vocab_file"))
    # bert_config_file = os.path.join(event_config.get("bert_pretrained_model_path"), event_config.get("bert_config_path"))
    event_type_file = os.path.join(event_config.get("slot_list_root_path"), event_config.get("event_type_file"))
    # data_loader =EventTypeClassificationPrepare(vocab_file_path,512,event_type_file)
    # train_file = os.path.join(event_config.get("data_dir"),event_config.get("event_data_file_train"))
    # eval_file = os.path.join(event_config.get("data_dir"),event_config.get("event_data_file_eval"))
    # train_data_list,train_label_list,train_token_type_id_list,dev_data_list,dev_label_list,dev_token_type_id_list = data_loader._read_json_file(train_file,eval_file,is_train=True)
    slot_file = os.path.join(event_config.get("slot_list_root_path"),
                             event_config.get("bert_slot_complete_file_name_role"))
    schema_file = os.path.join(event_config.get("data_dir"), event_config.get("event_schema"))
    query_map_file = os.path.join(event_config.get("slot_list_root_path"), event_config.get("query_map_file"))

    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file, schema_file, query_map_file)
    train_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_train"))
    eval_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_eval"))
    # data_list,label_start_list,label_end_list,query_len_list,token_type_id_list
    # train_datas, train_labels_start,train_labels_end,train_query_lens,train_token_type_id_list,dev_datas, dev_labels_start,dev_labels_end,dev_query_lens,dev_token_type_id_list = data_loader._read_json_file(train_file,eval_file,True)
    # dev_datas, dev_labels_start,dev_labels_end,dev_query_lens,dev_token_type_id_list = data_loader._read_json_file(eval_file,None,False)
    # train_datas, train_labels_start,train_labels_end,train_query_lens,train_token_type_id_list,dev_datas, dev_labels_start,dev_labels_end,dev_query_lens,dev_token_type_id_list = data_loader._merge_ee_and_re_datas(train_file,eval_file,"relation_extraction/data/train_data.json","relation_extraction/data/dev_data.json")
    data_loader.k_fold_split_data(train_file, eval_file, True)
    # import numpy as np
    # train_query_lens = np.load("data/fold_data_{}/query_lens_train.npy".format(0),allow_pickle=True)
    # print(train_query_lens[0])

    # re_train_file = "relation_extraction/data/train_data.json"
Example #5
def run_event_verify_role_mrc(args):
    """
    retro reader 第二阶段的精度模块,同时训练两个任务,role抽取和问题是否可以回答
    :param args:
    :return:
    """
    model_base_dir = event_config.get(args.model_checkpoint_dir).format(
        args.fold_index)
    pb_model_dir = event_config.get(args.model_pb_dir).format(args.fold_index)
    vocab_file_path = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("vocab_file"))
    bert_config_file = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_config_path"))
    slot_file = os.path.join(
        event_config.get("slot_list_root_path"),
        event_config.get("bert_slot_complete_file_name_role"))
    schema_file = os.path.join(event_config.get("data_dir"),
                               event_config.get("event_schema"))
    query_map_file = os.path.join(event_config.get("slot_list_root_path"),
                                  event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path, 512, slot_file,
                                      schema_file, query_map_file)
    # train_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_train"))
    # eval_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_eval"))
    # data_list,label_start_list,label_end_list,query_len_list,token_type_id_list
    # train_datas, train_labels_start,train_labels_end,train_query_lens,train_token_type_id_list,dev_datas, dev_labels_start,dev_labels_end,dev_query_lens,dev_token_type_id_list = data_loader._read_json_file(train_file,eval_file,True)
    # dev_datas, dev_labels_start,dev_labels_end,dev_query_lens,dev_token_type_id_list = data_loader._read_json_file(eval_file,None,False)
    # train_datas, train_labels_start,train_labels_end,train_query_lens,train_token_type_id_list,dev_datas, dev_labels_start,dev_labels_end,dev_query_lens,dev_token_type_id_list = data_loader._merge_ee_and_re_datas(train_file,eval_file,"relation_extraction/data/train_data.json","relation_extraction/data/dev_data.json")
    train_has_answer_label_list = []
    dev_has_answer_label_list = []
    train_datas = np.load(
        "data/verify_neg_fold_data_{}/token_ids_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    # train_has_answer_label_list = np.load("data/verify_neg_fold_data_{}/has_answer_train.npy".format(args.fold_index),allow_pickle=True)
    train_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_datas = np.load(
        "data/verify_neg_fold_data_{}/token_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    # dev_has_answer_label_list = np.load("data/verify_neg_fold_data_{}/has_answer_dev.npy".format(args.fold_index),allow_pickle=True)
    dev_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_query_lens = np.load(
        "data/verify_neg_fold_data_{}/query_lens_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_query_lens = np.load(
        "data/verify_neg_fold_data_{}/query_lens_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_end_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_end_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_end_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_end_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_samples_nums = len(train_datas)
    for i in range(train_samples_nums):
        if sum(train_start_labels[i]) == 0:
            train_has_answer_label_list.append(0)
        else:
            train_has_answer_label_list.append(1)

    train_has_answer_label_list = np.array(
        train_has_answer_label_list).reshape((train_samples_nums, 1))
    dev_samples_nums = len(dev_datas)
    for i in range(dev_samples_nums):
        if sum(dev_start_labels[i]) == 0:
            dev_has_answer_label_list.append(0)
        else:
            dev_has_answer_label_list.append(1)
    dev_has_answer_label_list = np.array(dev_has_answer_label_list).reshape(
        (dev_samples_nums, 1))

    if train_samples_nums % args.train_batch_size != 0:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size) + 1
    else:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size)
    # each_epoch_steps = int(data_loader.train_samples_nums/args.train_batch_size)+1
    logger.info('*****train_set sample nums:{}'.format(train_samples_nums))
    logger.info('*****dev_set sample nums:{}'.format(dev_samples_nums))
    logger.info('*****train each epoch steps:{}'.format(each_epoch_steps))
    train_steps_nums = each_epoch_steps * args.epochs
    # train_steps_nums = each_epoch_steps * args.epochs // hvd.size()
    logger.info('*****train_total_steps:{}'.format(train_steps_nums))
    decay_steps = args.decay_epoch * each_epoch_steps
    logger.info('*****train decay steps:{}'.format(decay_steps))
    # dropout_prob is the dropout probability
    params = {
        "dropout_prob": args.dropout_prob,
        "num_labels": 2,
        "rnn_size": args.rnn_units,
        "num_layers": args.num_layers,
        "hidden_units": args.hidden_units,
        "decay_steps": decay_steps,
        "train_steps": train_steps_nums,
        "num_warmup_steps": int(train_steps_nums * 0.1)
    }
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.gpu_nums)
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(
        model_dir=model_base_dir,
        save_summary_steps=each_epoch_steps,
        save_checkpoints_steps=each_epoch_steps,
        session_config=config_tf,
        keep_checkpoint_max=3,
        # train_distribute=dist_strategy
    )
    bert_init_checkpoints = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_init_checkpoints"))
    # init_checkpoints = "output/model/merge_usingtype_roberta_traindev_event_role_bert_mrc_model_desmodified_lowercase/checkpoint/model.ckpt-1218868"
    model_fn = event_verify_mrc_model_fn_builder(bert_config_file,
                                                 bert_init_checkpoints, args)
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)
    if args.do_train:
        train_input_fn = lambda: event_input_verfify_mrc_fn(
            train_datas,
            train_start_labels,
            train_end_labels,
            train_token_type_id_list,
            train_query_lens,
            train_has_answer_label_list,
            is_training=True,
            is_testing=False,
            args=args)
        eval_input_fn = lambda: event_input_verfify_mrc_fn(
            dev_datas,
            dev_start_labels,
            dev_end_labels,
            dev_token_type_id_list,
            dev_query_lens,
            dev_has_answer_label_list,
            is_training=False,
            is_testing=False,
            args=args)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps_nums)
        exporter = tf.estimator.BestExporter(
            exports_to_keep=1,
            serving_input_receiver_fn=bert_mrc_serving_input_receiver_fn)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          exporters=[exporter],
                                          throttle_secs=0)
        # for _ in range(args.epochs):

        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        # "bert_ce_model_pb"
        estimator.export_saved_model(pb_model_dir,
                                     bert_mrc_serving_input_receiver_fn)
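The args object is only read for a handful of attributes here; a hypothetical argparse setup covering the ones this function touches (attribute names are taken from the args.* accesses above, the defaults are illustrative only):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model_checkpoint_dir", type=str)  # event_config key for the checkpoint dir template
parser.add_argument("--model_pb_dir", type=str)          # event_config key for the exported SavedModel dir
parser.add_argument("--fold_index", type=int, default=0)
parser.add_argument("--train_batch_size", type=int, default=32)
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--decay_epoch", type=int, default=1)
parser.add_argument("--dropout_prob", type=float, default=0.1)
parser.add_argument("--rnn_units", type=int, default=128)
parser.add_argument("--num_layers", type=int, default=1)
parser.add_argument("--hidden_units", type=int, default=128)
parser.add_argument("--do_train", action="store_true")
args = parser.parse_args()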
Example #6
def run_event_binclassification(args):
    """
    retroreader中的eav模块,即第一遍阅读模块,预测该问题是否有回答
    :param args:
    :return:
    """
    model_base_dir = event_config.get(args.model_checkpoint_dir).format(
        args.fold_index)
    pb_model_dir = event_config.get(args.model_pb_dir).format(args.fold_index)
    print(model_base_dir)
    print(pb_model_dir)
    vocab_file_path = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("vocab_file"))
    bert_config_file = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_config_path"))
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    # data_loader =EventTypeClassificationPrepare(vocab_file_path,512,event_type_file)
    # train_file = os.path.join(event_config.get("data_dir"),event_config.get("event_data_file_train"))
    # eval_file = os.path.join(event_config.get("data_dir"),event_config.get("event_data_file_eval"))
    # train_data_list,train_label_list,train_token_type_id_list,dev_data_list,dev_label_list,dev_token_type_id_list = data_loader._read_json_file(train_file,eval_file,is_train=True)
    train_data_list = np.load(
        "data/verify_neg_fold_data_{}/token_ids_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    # train_label_list = np.load("data/verify_neg_fold_data_{}/has_answer_train.npy".format(args.fold_index),allow_pickle=True)
    train_label_list = []
    train_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_start_labels = np.load(
        "data/verify_neg_fold_data_{}/labels_start_dev.npy".format(
            args.fold_index),
        allow_pickle=True)

    train_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_data_list = np.load(
        "data/verify_neg_fold_data_{}/token_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    # dev_label_list = np.load("data/verify_neg_fold_data_{}/has_answer_dev.npy".format(args.fold_index),allow_pickle=True)
    dev_label_list = []
    dev_token_type_id_list = np.load(
        "data/verify_neg_fold_data_{}/token_type_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)

    # dev_datas,dev_token_type_ids,dev_labels = data_loader._read_json_file(eval_file)
    train_samples_nums = len(train_data_list)
    for i in range(train_samples_nums):
        if sum(train_start_labels[i]) == 0:
            train_label_list.append(0)
        else:
            train_label_list.append(1)
    train_label_list = np.array(train_label_list).reshape(
        (train_samples_nums, 1))
    dev_samples_nums = len(dev_data_list)
    for i in range(dev_samples_nums):
        if sum(dev_start_labels[i]) == 0:
            dev_label_list.append(0)
        else:
            dev_label_list.append(1)
    dev_label_list = np.array(dev_label_list).reshape((dev_samples_nums, 1))
    if train_samples_nums % args.train_batch_size != 0:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size) + 1
    else:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size)
    # each_epoch_steps = int(data_loader.train_samples_nums/args.train_batch_size)+1
    logger.info('*****train_set sample nums:{}'.format(train_samples_nums))
    logger.info('*****train each epoch steps:{}'.format(each_epoch_steps))
    train_steps_nums = each_epoch_steps * args.epochs
    # train_steps_nums = each_epoch_steps * args.epochs // hvd.size()
    logger.info('*****train_total_steps:{}'.format(train_steps_nums))
    decay_steps = args.decay_epoch * each_epoch_steps
    logger.info('*****train decay steps:{}'.format(decay_steps))
    # dropout_prob is the dropout probability
    params = {
        "dropout_prob": args.dropout_prob,
        "num_labels": 1,
        "rnn_size": args.rnn_units,
        "num_layers": args.num_layers,
        "hidden_units": args.hidden_units,
        "decay_steps": decay_steps,
        "class_weight": 1
    }
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.gpu_nums)
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    # "bert_ce_model_dir"
    # mirrored_strategy = tf.distribute.MirroredStrategy()
    # config_tf.gpu_options.visible_device_list = str(hvd.local_rank())
    # checkpoint_path = os.path.join(bert_config.get(args.model_checkpoint_dir), str(hvd.rank()))

    run_config = tf.estimator.RunConfig(
        model_dir=model_base_dir,
        save_summary_steps=train_steps_nums + 10,
        save_checkpoints_steps=each_epoch_steps,
        session_config=config_tf,
        keep_checkpoint_max=1,
        # train_distribute=dist_strategy
    )
    bert_init_checkpoints = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_init_checkpoints"))
    model_fn = bert_binaryclassification_model_fn_builder(
        bert_config_file, bert_init_checkpoints, args)
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    if args.do_train:
        # train_input_fn = lambda: data_loader.create_dataset(is_training=True,is_testing=False, args=args)
        # eval_input_fn = lambda: data_loader.create_dataset(is_training=False,is_testing=False,args=args)
        # train_X,train_Y = np.load(data_loader.train_X_path,allow_pickle=True),np.load(data_loader.train_Y_path,allow_pickle=True)

        # train_input_fn = lambda :event_class_input_bert_fn(train_data_list,token_type_ids=train_token_type_id_list,label_map_len=data_loader.labels_map_len,
        #                                                  is_training=True,is_testing=False,args=args,input_Ys=train_label_list)

        train_input_fn = lambda: event_binclass_input_bert_fn(
            train_data_list,
            token_type_ids=train_token_type_id_list,
            label_map_len=1,
            is_training=True,
            is_testing=False,
            args=args,
            input_Ys=train_label_list)
        # eval_X,eval_Y = np.load(data_loader.valid_X_path,allow_pickle=True),np.load(data_loader.valid_Y_path,allow_pickle=True)

        # eval_input_fn = lambda: event_class_input_bert_fn(dev_data_list,token_type_ids=dev_token_type_id_list,label_map_len=data_loader.labels_map_len,
        #                                                 is_training=False,is_testing=False,args=args,input_Ys=dev_label_list)
        eval_input_fn = lambda: event_binclass_input_bert_fn(
            dev_data_list,
            token_type_ids=dev_token_type_id_list,
            label_map_len=1,
            is_training=False,
            is_testing=False,
            args=args,
            input_Ys=dev_label_list)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps_nums)
        exporter = tf.estimator.BestExporter(
            exports_to_keep=1,
            serving_input_receiver_fn=bert_event_bin_serving_input_receiver_fn)
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          throttle_secs=0,
                                          exporters=[exporter])
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        # "bert_ce_model_pb"
        estimator.export_saved_model(pb_model_dir,
                                     bert_event_bin_serving_input_receiver_fn)
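The loops above derive a binary has-answer label from each row of start labels (a row that sums to zero has no answer span); an equivalent, more compact form, assuming the rows are the variable-length lists loaded with allow_pickle=True:

import numpy as np

def has_answer_labels(start_label_rows):
    # 1 if the row contains at least one start position, else 0
    labels = [int(sum(row) > 0) for row in start_label_rows]
    return np.array(labels).reshape((len(labels), 1))

# train_label_list = has_answer_labels(train_start_labels)
# dev_label_list = has_answer_labels(dev_start_labels)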
Example #7
def run_event_classification(args):
    """
    事件类型分析,多标签二分类问题,借鉴NL2SQL预测column的方法
    :param args:
    :return:
    """
    model_base_dir = event_config.get(args.model_checkpoint_dir).format(
        args.fold_index)
    pb_model_dir = event_config.get(args.model_pb_dir).format(args.fold_index)
    # print(model_base_dir)
    # print(pb_model_dir)
    vocab_file_path = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("vocab_file"))
    bert_config_file = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_config_path"))
    event_type_file = os.path.join(event_config.get("slot_list_root_path"),
                                   event_config.get("event_type_file"))
    data_loader = EventTypeClassificationPrepare(vocab_file_path, 512,
                                                 event_type_file)
    # train_data_list,train_label_list,train_token_type_id_list,dev_data_list,dev_label_list,dev_token_type_id_list = data_loader._read_json_file(train_file,eval_file,is_train=True)
    train_data_list = np.load(
        "data/index_type_fold_data_{}/token_ids_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_label_list = np.load(
        "data/index_type_fold_data_{}/labels_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_token_type_id_list = np.load(
        "data/index_type_fold_data_{}/token_type_ids_train.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_type_index_ids_list = np.load(
        "data/index_type_fold_data_{}/type_index_in_token_ids_train.npy".
        format(args.fold_index),
        allow_pickle=True)
    dev_data_list = np.load(
        "data/index_type_fold_data_{}/token_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_label_list = np.load(
        "data/index_type_fold_data_{}/labels_dev.npy".format(args.fold_index),
        allow_pickle=True)
    dev_token_type_id_list = np.load(
        "data/index_type_fold_data_{}/token_type_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    dev_type_index_ids_list = np.load(
        "data/index_type_fold_data_{}/type_index_in_token_ids_dev.npy".format(
            args.fold_index),
        allow_pickle=True)
    train_labels = np.array(train_label_list)
    # print(train_labels.shape)
    a = np.sum(train_labels, axis=0)
    a = [max(a) / ele for ele in a]
    class_weight = np.array(a)
    class_weight = np.reshape(class_weight, (1, 65))
    # print(class_weight)
    # dev_datas,dev_token_type_ids,dev_labels = data_loader._read_json_file(eval_file)
    train_samples_nums = len(train_data_list)
    dev_samples_nums = len(dev_data_list)
    if train_samples_nums % args.train_batch_size != 0:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size) + 1
    else:
        each_epoch_steps = int(train_samples_nums / args.train_batch_size)
    # each_epoch_steps = int(data_loader.train_samples_nums/args.train_batch_size)+1
    logger.info('*****train_set sample nums:{}'.format(train_samples_nums))
    logger.info('*****train each epoch steps:{}'.format(each_epoch_steps))
    train_steps_nums = each_epoch_steps * args.epochs
    # train_steps_nums = each_epoch_steps * args.epochs // hvd.size()
    logger.info('*****train_total_steps:{}'.format(train_steps_nums))
    decay_steps = args.decay_epoch * each_epoch_steps
    logger.info('*****train decay steps:{}'.format(decay_steps))
    # dropout_prob is the dropout probability
    params = {
        "dropout_prob": args.dropout_prob,
        "num_labels": data_loader.labels_map_len,
        "rnn_size": args.rnn_units,
        "num_layers": args.num_layers,
        "hidden_units": args.hidden_units,
        "decay_steps": decay_steps,
        "class_weight": class_weight
    }
    # dist_strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.gpu_nums)
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True

    run_config = tf.estimator.RunConfig(
        model_dir=model_base_dir,
        save_summary_steps=train_steps_nums + 10,
        save_checkpoints_steps=each_epoch_steps,
        session_config=config_tf,
        keep_checkpoint_max=1,
        # train_distribute=dist_strategy
    )
    bert_init_checkpoints = os.path.join(
        event_config.get("bert_pretrained_model_path"),
        event_config.get("bert_init_checkpoints"))
    model_fn = bert_classification_model_fn_builder(bert_config_file,
                                                    bert_init_checkpoints,
                                                    args)
    estimator = tf.estimator.Estimator(model_fn,
                                       params=params,
                                       config=run_config)

    if args.do_train:

        train_input_fn = lambda: event_index_class_input_bert_fn(
            train_data_list,
            token_type_ids=train_token_type_id_list,
            type_index_ids_list=train_type_index_ids_list,
            label_map_len=data_loader.labels_map_len,
            is_training=True,
            is_testing=False,
            args=args,
            input_Ys=train_label_list)
        # eval_X,eval_Y = np.load(data_loader.valid_X_path,allow_pickle=True),np.load(data_loader.valid_Y_path,allow_pickle=True)

        # eval_input_fn = lambda: event_class_input_bert_fn(dev_data_list,token_type_ids=dev_token_type_id_list,label_map_len=data_loader.labels_map_len,
        #                                                 is_training=False,is_testing=False,args=args,input_Ys=dev_label_list)
        eval_input_fn = lambda: event_index_class_input_bert_fn(
            dev_data_list,
            token_type_ids=dev_token_type_id_list,
            type_index_ids_list=dev_type_index_ids_list,
            label_map_len=data_loader.labels_map_len,
            is_training=False,
            is_testing=False,
            args=args,
            input_Ys=dev_label_list)
        train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                            max_steps=train_steps_nums)
        exporter = tf.estimator.BestExporter(
            exports_to_keep=1,
            serving_input_receiver_fn=bert_event_type_serving_input_receiver_fn
        )
        eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                          throttle_secs=0,
                                          exporters=[exporter])
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        # "bert_ce_model_pb"
        estimator.export_saved_model(
            pb_model_dir, bert_event_type_serving_input_receiver_fn)
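The class_weight computation above scales each of the 65 event-type columns by max(count)/count, so rarer types get proportionally larger weights; a small worked sketch on a toy multi-hot label matrix:

import numpy as np

# toy multi-hot label matrix: 4 samples, 3 event types
toy_labels = np.array([[1, 0, 1],
                       [1, 0, 0],
                       [1, 1, 0],
                       [1, 0, 0]])
counts = np.sum(toy_labels, axis=0)                        # [4, 1, 2]
weights = np.array([max(counts) / ele for ele in counts])  # [1.0, 4.0, 2.0]
print(weights.reshape((1, -1)))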
Example #8
def parse_kfold_verify(args):

    if (args.gpus is not None):
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    # test_file = "data/test1.json"
    # Path of test dataset json file
    test_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_test"))
    # Path of the multi-label text classification saved model
    class_type_model_path = event_config.get(args.event_type_model_path)
    event_schema_file = os.path.join(event_config.get("data_dir"),
                                     event_config.get("event_schema"))
    event_schema_dict = parse_event_schema(event_schema_file)
    # multi-label event type classifier
    fp_type = fastPredictTypeClassification(class_type_model_path,
                                            event_config)
    # parse json file to get id and text
    id_list, text_list = fp_type.parse_test_json(test_file)
    #
    kfold_type_result_list = []  # per-fold predictions (65 event-type probabilities each)
    event_type_result_list = []  # final predicted event type names
    for k in range(1):
        predict_fn = fp_type.load_models_kfold(class_type_model_path.format(k))
        cur_fold_event_type_probs = fp_type.predict_for_all_prob(
            predict_fn, text_list)
        kfold_type_result_list.append(cur_fold_event_type_probs)

    for i in range(len(text_list)):
        cur_sample_event_type_buffer = [
            ele[i] for ele in kfold_type_result_list
        ]
        cur_sample_event_type_prob = np.array(
            cur_sample_event_type_buffer).reshape((-1, 65))
        avg_result = np.mean(cur_sample_event_type_prob, axis=0)
        event_label_ids = np.argwhere(avg_result > 0.5)
        event_cur_type_strs = [
            fp_type.data_loader.id2labels_map.get(ele[0])
            for ele in event_label_ids
        ]
        event_type_result_list.append(event_cur_type_strs)

    # Path of the Answerable Verification model that predicts whether a query is answerable:
    # the first-stage (sketchy reading) answerable verifier,
    # i.e. the External Front Verifier.
    exterinal_av_model_path = "output/model/final_verify_cls_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    # verify_av_model_path_old = event_config.get(args.event_verfifyav_model_path)
    verify_av_model_path_old = "output/model/verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    # The second-stage (intensive reading) answerable verifier,
    # i.e. the Internal Front Verifier.
    interbal_av_model_path = "output/model/final_verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"

    fp_cls_old = fastPredictCls(exterinal_av_model_path, event_config,
                                "data/slot_pattern/slot_descrip_old")
    # fp_cls_new = fastPredictCls(cls_model_path, event_config, "data/slot_pattern/slot_descrip")
    fp_answerable_verifier = fastPredictCls(exterinal_av_model_path,
                                            event_config,
                                            "data/slot_pattern/slot_descrip")

    kfold_eav_hasa_result = []
    kfold_start_result = []
    kfold_end_result = []
    kfold_hasa_result = []

    for k in range(1):

        # predict_fn_cls_new = fp_answerable_verifier.load_models_kfold(external_av_model_path.format(k))
        # sketchy-reading (external verifier) predict fn
        predict_fn_ex_av = fp_answerable_verifier.load_models_kfold(
            exterinal_av_model_path.format(k))
        # predict_fn_av = fp_cls_new.load_models_kfold(verify_av_model_path_new.format(k))
        # intensive-reading (internal verifier) predict fn
        predict_fn_in_av = fp_answerable_verifier.load_models_kfold(
            interbal_av_model_path.format(k))

        cur_fold_eav_probs_result = {}
        cur_fold_av_start_probs_result = {}
        cur_fold_av_end_probs_result = {}
        cur_fold_av_has_answer_probs_result = {}

        for sample_id, event_type_res, text in zip(id_list,
                                                   event_type_result_list,
                                                   text_list):

            if event_type_res is None or len(event_type_res) == 0:
                # submit_result.append({"id": sample_id, "event_list": []})
                cur_fold_eav_probs_result.update({sample_id: []})
                continue
            for cur_event_type in event_type_res:
                cur_event_type = cur_event_type.strip()
                if cur_event_type is None or cur_event_type == "":
                    continue
                corresponding_role_type_list = event_schema_dict.get(
                    cur_event_type)
                cur_event_type_answerable_probs_result = []
                cur_event_av_start_probs_result = []
                cur_event_av_end_probs_result = []
                cur_event_av_hasanswer_probs_result = []
                for cur_role_type in corresponding_role_type_list:

                    has_answer_probs = None
                    label_prob = None
                    start_probs = None
                    end_probs = None

                    cur_query_word = fp_answerable_verifier.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    query_token_ids, query_token_len, token_type_ids_, token_mapping_new = fp_answerable_verifier.data_loader.trans_single_data_for_test(
                        text, cur_query_word, 512)

                    #############################################################################
                    ## External Answerable Verification: predict the answerable probability
                    eav_probs = fp_answerable_verifier.predict_single_sample_prob(
                        predict_fn_ex_av, query_token_ids, query_token_len,
                        token_type_ids_)
                    #############################################################################
                    # Internal Answerable Verification: predict start/end labels and the answerable probability
                    role_start_ids, role_end_ids, role_start_probs, role_end_probs, iav_probs = fp_cls_old.predict_single_sample_av_prob(
                        predict_fn_in_av, query_token_ids, query_token_len,
                        token_type_ids_)

                    cur_event_type_answerable_probs_result.append(eav_probs)
                    cur_event_av_hasanswer_probs_result.append(iav_probs)
                    cur_event_av_start_probs_result.append(role_start_probs)
                    cur_event_av_end_probs_result.append(role_end_probs)

                cur_fold_eav_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_type_answerable_probs_result
                })
                cur_fold_av_start_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_av_start_probs_result
                })
                cur_fold_av_end_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_av_end_probs_result
                })
                cur_fold_av_has_answer_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_av_hasanswer_probs_result
                })
        kfold_eav_hasa_result.append(cur_fold_eav_probs_result)
        kfold_start_result.append(cur_fold_av_start_probs_result)
        kfold_end_result.append(cur_fold_av_end_probs_result)
        kfold_hasa_result.append(cur_fold_av_has_answer_probs_result)

    submit_result = []

    for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                               text_list):
        event_list = []
        if event_type_res is None or len(event_type_res) == 0:
            submit_result.append({"id": sample_id, "event_list": []})
            continue
        for cur_event_type in event_type_res:
            cur_event_type = cur_event_type.strip()
            if cur_event_type is None or cur_event_type == "":
                continue
            corresponding_role_type_list = event_schema_dict.get(
                cur_event_type)
            find_key = sample_id + "-" + cur_event_type
            fold_cls_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_eav_hasa_result
            ]
            fold_start_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_start_result
            ]
            fold_end_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_end_result
            ]
            fold_has_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_hasa_result
            ]
            for index, cur_role_type in enumerate(
                    corresponding_role_type_list):
                cur_eav_fold_probs = [
                    probs[index] for probs in fold_cls_probs_cur_sample
                ]

                cur_iav_hasa_fold_probs = [
                    probs[index] for probs in fold_has_probs_cur_sample
                ]

                cur_eav_fold_probs = np.array(cur_eav_fold_probs).reshape(
                    (-1, 1))
                cls_avg_result = np.mean(cur_eav_fold_probs, axis=0)

                cur_iav_hasa_fold_probs = np.array(
                    cur_iav_hasa_fold_probs).reshape((-1, 1))
                has_avg_result = np.mean(cur_iav_hasa_fold_probs, axis=0)

                ######
                # EAV * 0.5 + IAV * 0.5
                final_probs_hasa = 0.5 * (cls_avg_result) + 0.5 * (
                    has_avg_result)

                if final_probs_hasa > 0.4:

                    cur_query_word = fp_answerable_verifier.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = fp_answerable_verifier.data_loader.trans_single_data_for_test(
                        text, cur_query_word, 512)

                    token_len = len(token_ids)

                    cur_start_fold_probs = [
                        probs[index] for probs in fold_start_probs_cur_sample
                    ]
                    cur_end_fold_probs = [
                        probs[index] for probs in fold_end_probs_cur_sample
                    ]

                    cur_start_fold_probs = np.array(
                        cur_start_fold_probs).reshape((-1, token_len, 2))
                    cur_end_fold_probs = np.array(cur_end_fold_probs).reshape(
                        (-1, token_len, 2))
                    start_avg_result = np.mean(cur_start_fold_probs, axis=0)
                    end_avg_result = np.mean(cur_end_fold_probs, axis=0)

                    text_start_probs = start_avg_result[query_len:-1, 1]
                    text_end_probs = end_avg_result[query_len:-1, 1]

                    pos_start_probs = (text_start_probs)
                    pos_end_probs = (text_end_probs)

                    start_ids = (pos_start_probs > 0.4).astype(int)

                    end_ids = (pos_end_probs > 0.4).astype(int)
                    token_mapping = token_mapping[1:-1]

                    entity_list, span_start_end_tuple_list = fp_answerable_verifier.extract_entity_from_start_end_ids(
                        text=text,
                        start_ids=start_ids,
                        end_ids=end_ids,
                        token_mapping=token_mapping)

                    for entity in entity_list:
                        if len(entity) > 1:
                            event_list.append({
                                "event_type":
                                cur_event_type,
                                "arguments": [{
                                    "role": cur_role_type,
                                    "argument": entity
                                }]
                            })
        submit_result.append({"id": sample_id, "event_list": event_list})

    with codecs.open(args.submit_result, 'w', 'utf-8') as fw:
        for dict_result in submit_result:
            write_str = json.dumps(dict_result, ensure_ascii=False)
            fw.write(write_str)
            fw.write("\n")

    print("finish")
Example #9
def parse_kfold_verfify(args):
    # test_file = os.path.join(event_config.get("data_dir"), event_config.get("event_data_file_test"))
    test_file = "data/test2.json"
    class_type_model_path = event_config.get(args.event_type_model_path)
    event_schema_file = os.path.join(event_config.get("data_dir"),
                                     event_config.get("event_schema"))
    event_schema_dict = parse_event_schema(event_schema_file)
    fp_type = fastPredictTypeClassification(class_type_model_path,
                                            event_config)
    id_list, text_list = fp_type.parse_test_json(test_file)
    kfold_type_result_list = []
    event_type_result_list = []
    # for k in range(6):
    #     predict_fn = fp_type.load_models_kfold(class_type_model_path.format(k))
    #     cur_fold_event_type_probs = fp_type.predict_for_all_prob(predict_fn,text_list)
    #     kfold_type_result_list.append(cur_fold_event_type_probs)

    # for i in range(len(text_list)):
    #     cur_sample_event_type_buffer = [ele[i] for ele in kfold_type_result_list]
    #     cur_sample_event_type_prob = np.array(cur_sample_event_type_buffer).reshape((6,65))
    #     avg_result = np.mean(cur_sample_event_type_prob,axis=0)
    #     event_label_ids = np.argwhere(avg_result > 0.5)
    #     event_cur_type_strs = [fp_type.data_loader.id2labels_map.get(
    #             ele[0]) for ele in event_label_ids]
    #     event_type_result_list.append(event_cur_type_strs)

    # with codecs.open("test2_kfold_new_final_event_type.txt", 'w', 'utf-8') as fw:
    #     for event_type_result in event_type_result_list:
    #         write_line = ",".join(event_type_result)
    #         fw.write(write_line)
    #         fw.write("\n")

    event_type_result_list = []
    with codecs.open("test2_kfold_new_final_event_type.txt", 'r',
                     'utf-8') as fr:
        for line in fr:
            line = line.strip("\n")
            event_list_cur = line.split(",")
            event_type_result_list.append(event_list_cur)
    # cls_model_path = event_config.get(args.event_cls_model_path)
    cls_model_path = "output/model/verify_cls_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    cls_model_path_new = "output/model/final_verify_cls_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    # verify_av_model_path_old = event_config.get(args.event_verfifyav_model_path)
    verify_av_model_path_old = "output/model/verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    verify_av_model_path_new = "output/model/final_verify_avmrc_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/saved_model"
    fp_cls_old = fastPredictCls(cls_model_path, event_config,
                                "data/slot_pattern/slot_descrip_old")
    fp_cls_new = fastPredictCls(cls_model_path, event_config,
                                "data/slot_pattern/slot_descrip")
    kfold_cls_result = []
    kfold_start_result = []
    kfold_end_result = []
    kfold_hasa_result = []
    for k in range(6):

        predict_fn = fp_cls_old.load_models_kfold(cls_model_path.format(k))
        predict_fn_cls_new = fp_cls_new.load_models_kfold(
            cls_model_path_new.format(k))
        predict_fn_av = fp_cls_new.load_models_kfold(
            verify_av_model_path_new.format(k))
        predict_fn_av_old = fp_cls_old.load_models_kfold(
            verify_av_model_path_old.format(k))
        cur_fold_cls_probs_result = {}
        cur_fold_av_start_probs_result = {}
        cur_fold_av_end_probs_result = {}
        cur_fold_av_has_answer_probs_result = {}
        for sample_id, event_type_res, text in zip(id_list,
                                                   event_type_result_list,
                                                   text_list):

            if event_type_res is None or len(event_type_res) == 0:
                # submit_result.append({"id": sample_id, "event_list": []})
                cur_fold_cls_probs_result.update({sample_id: []})
                continue
            for cur_event_type in event_type_res:
                cur_event_type = cur_event_type.strip()
                if cur_event_type is None or cur_event_type == "":
                    continue
                corresponding_role_type_list = event_schema_dict.get(
                    cur_event_type)
                cur_event_type_cls_probs_result = []
                cur_event_av_start_probs_result = []
                cur_event_av_end_probs_result = []
                cur_event_av_hasanswer_probs_result = []
                for cur_role_type in corresponding_role_type_list:
                    cur_query_word_old = fp_cls_old.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = fp_cls_old.data_loader.trans_single_data_for_test(
                        text, cur_query_word_old, 512)
                    label_prob = fp_cls_old.predict_single_sample_prob(
                        predict_fn, token_ids, query_len, token_type_ids)

                    start_ids, end_ids, start_probs, end_probs, has_answer_probs = fp_cls_old.predict_single_sample_av_prob(
                        predict_fn_av_old, token_ids, query_len,
                        token_type_ids)
                    # cur_event_av_start_probs_result.append(start_probs)
                    # cur_event_av_end_probs_result.append(end_probs)
                    # new
                    cur_query_word_new = fp_cls_new.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids_new, query_len_new, token_type_ids_new, token_mapping_new = fp_cls_new.data_loader.trans_single_data_for_test(
                        text, cur_query_word_new, 512)
                    label_prob_new = fp_cls_new.predict_single_sample_prob(
                        predict_fn_cls_new, token_ids_new, query_len_new,
                        token_type_ids_new)
                    start_ids_new, end_ids_new, start_probs_new, end_probs_new, has_answer_probs_new = fp_cls_old.predict_single_sample_av_prob(
                        predict_fn_av, token_ids_new, query_len_new,
                        token_type_ids_new)
                    cur_event_av_hasanswer_probs_result.append(
                        (has_answer_probs, has_answer_probs_new))
                    cur_event_type_cls_probs_result.append(
                        (label_prob, label_prob_new))
                    cur_event_av_start_probs_result.append(
                        (start_probs, start_probs_new))
                    cur_event_av_end_probs_result.append(
                        (end_probs, end_probs_new))
                cur_fold_cls_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_type_cls_probs_result
                })
                cur_fold_av_start_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_av_start_probs_result
                })
                cur_fold_av_end_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_av_end_probs_result
                })
                cur_fold_av_has_answer_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    cur_event_av_hasanswer_probs_result
                })
        kfold_cls_result.append(cur_fold_cls_probs_result)
        kfold_start_result.append(cur_fold_av_start_probs_result)
        kfold_end_result.append(cur_fold_av_end_probs_result)
        kfold_hasa_result.append(cur_fold_av_has_answer_probs_result)

    submit_result = []
    for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                               text_list):
        event_list = []
        if event_type_res is None or len(event_type_res) == 0:
            submit_result.append({"id": sample_id, "event_list": []})
            continue
        for cur_event_type in event_type_res:
            cur_event_type = cur_event_type.strip()
            if cur_event_type is None or cur_event_type == "":
                continue
            corresponding_role_type_list = event_schema_dict.get(
                cur_event_type)
            find_key = sample_id + "-" + cur_event_type
            fold_cls_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_cls_result
            ]
            fold_start_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_start_result
            ]
            fold_end_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_end_result
            ]
            fold_has_probs_cur_sample = [
                ele.get(find_key) for ele in kfold_hasa_result
            ]
            for index, cur_role_type in enumerate(
                    corresponding_role_type_list):
                cur_cls_fold_probs = [
                    probs[index] for probs in fold_cls_probs_cur_sample
                ]
                cur_cls_fold_probs_old = []
                cur_cls_fold_probs_new = []
                cur_hasa_fold_probs = [
                    probs[index] for probs in fold_has_probs_cur_sample
                ]
                cur_hasa_fold_probs_old = []
                cur_hasa_fold_probs_new = []
                for k in range(len(cur_cls_fold_probs)):
                    cur_cls_fold_probs_old.append(cur_cls_fold_probs[k][0])
                    cur_cls_fold_probs_new.append(cur_cls_fold_probs[k][1])
                    cur_hasa_fold_probs_old.append(cur_hasa_fold_probs[k][0])
                    cur_hasa_fold_probs_new.append(cur_hasa_fold_probs[k][1])

                cur_cls_fold_probs_old = np.array(
                    cur_cls_fold_probs_old).reshape((6, 1))
                cls_avg_result_old = np.mean(cur_cls_fold_probs_old, axis=0)

                cur_cls_fold_probs_new = np.array(
                    cur_cls_fold_probs_new).reshape((6, 1))
                cls_avg_result_new = np.mean(cur_cls_fold_probs_new, axis=0)

                cur_hasa_fold_probs_old = np.array(
                    cur_hasa_fold_probs_old).reshape((6, 1))
                has_avg_result_old = np.mean(cur_hasa_fold_probs_old, axis=0)

                cur_hasa_fold_probs_new = np.array(
                    cur_hasa_fold_probs_new).reshape((6, 1))
                has_avg_result_new = np.mean(cur_hasa_fold_probs_new, axis=0)

                # cur_hasa_fold_probs = np.array(cur_hasa_fold_probs).reshape((6,1))
                # has_avg_result = np.mean(cur_hasa_fold_probs,axis=0)
                final_probs_hasa = 0.5 * (
                    cls_avg_result_old + cls_avg_result_new) / 2 + 0.5 * (
                        has_avg_result_old + has_avg_result_new) / 2

                if final_probs_hasa > 0.4:
                    cur_query_word = fp_cls_new.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = fp_cls_new.data_loader.trans_single_data_for_test(
                        text, cur_query_word, 512)

                    cur_query_word_old = fp_cls_old.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids_old, query_len_old, token_type_ids_old, token_mapping_old = fp_cls_old.data_loader.trans_single_data_for_test(
                        text, cur_query_word_old, 512)

                    token_len = len(token_ids)
                    token_len_old = len(token_ids_old)
                    cur_start_fold_probs = [
                        probs[index] for probs in fold_start_probs_cur_sample
                    ]
                    cur_end_fold_probs = [
                        probs[index] for probs in fold_end_probs_cur_sample
                    ]
                    cur_start_fold_probs_old = []
                    cur_start_fold_probs_new = []
                    cur_end_fold_probs_old = []
                    cur_end_fold_probs_new = []

                    for k in range(len(cur_start_fold_probs)):
                        cur_start_fold_probs_old.append(
                            cur_start_fold_probs[k][0])
                        cur_start_fold_probs_new.append(
                            cur_start_fold_probs[k][1])
                        cur_end_fold_probs_old.append(cur_end_fold_probs[k][0])
                        cur_end_fold_probs_new.append(cur_end_fold_probs[k][1])
                    # cur_start_fold_probs_old = [probs[index] for probs in fold_start_probs_cur_sample]
                    # cur_end_fold_probs_old = [probs[index] for probs in fold_end_probs_cur_sample]
                    cur_start_fold_probs_old = np.array(
                        cur_start_fold_probs_old).reshape(
                            (6, token_len_old, 2))
                    cur_end_fold_probs_old = np.array(
                        cur_end_fold_probs_old).reshape((6, token_len_old, 2))
                    start_avg_result_old = np.mean(cur_start_fold_probs_old,
                                                   axis=0)
                    end_avg_result_old = np.mean(cur_end_fold_probs_old,
                                                 axis=0)

                    pos_start_probs_old = start_avg_result_old[:, 1]
                    pos_end_probs_old = end_avg_result_old[:, 1]
                    text_start_probs_old = pos_start_probs_old[
                        query_len_old:-1]
                    text_end_probs_old = pos_end_probs_old[query_len_old:-1]

                    cur_start_fold_probs_new = np.array(
                        cur_start_fold_probs_new).reshape((6, token_len, 2))
                    cur_end_fold_probs_new = np.array(
                        cur_end_fold_probs_new).reshape((6, token_len, 2))
                    start_avg_result_new = np.mean(cur_start_fold_probs_new,
                                                   axis=0)
                    end_avg_result_new = np.mean(cur_end_fold_probs_new,
                                                 axis=0)

                    pos_start_probs_new = start_avg_result_new[:, 1]
                    pos_end_probs_new = end_avg_result_new[:, 1]
                    text_start_probs_new = pos_start_probs_new[query_len:-1]
                    text_end_probs_new = pos_end_probs_new[query_len:-1]

                    pos_start_probs = (text_start_probs_old +
                                       text_start_probs_new) / 2
                    pos_end_probs = (text_end_probs_old +
                                     text_end_probs_new) / 2

                    start_ids = (pos_start_probs > 0.4).astype(int)
                    # end_ids = np.argmax(end_avg_result,axis=-1)
                    end_ids = (pos_end_probs > 0.4).astype(int)
                    token_mapping = token_mapping[1:-1]
                    # start_ids = start_ids[query_len:-1]

                    # end_ids = end_ids[query_len:-1]

                    entity_list, span_start_end_tuple_list = fp_cls_old.extract_entity_from_start_end_ids(
                        text=text,
                        start_ids=start_ids,
                        end_ids=end_ids,
                        token_mapping=token_mapping)
                    # if len(entity_list) == 0:
                    #     score_has_answer = 0.0
                    # else:
                    #     span_score = [text_start_probs[ele[0]]+text_end_probs[ele[1]] for ele in span_start_end_tuple_list]
                    #     score_has_answer = max(span_score)
                    # score_no_answer = 0.5*(max(pos_start_probs[0:query_len])+max(pos_end_probs[0:query_len]))+0.5*final_probs_hasa
                    # diff_score = score_has_answer - score_no_answer
                    for entity in entity_list:
                        if len(entity) > 1:
                            event_list.append({
                                "event_type":
                                cur_event_type,
                                "arguments": [{
                                    "role": cur_role_type,
                                    "argument": entity
                                }]
                            })
        submit_result.append({"id": sample_id, "event_list": event_list})

    with codecs.open(args.submit_result, 'w', 'utf-8') as fw:
        for dict_result in submit_result:
            write_str = json.dumps(dict_result, ensure_ascii=False)
            fw.write(write_str)
            fw.write("\n")
Пример #10
0
def parse_kfold(args):
    test_file = os.path.join(event_config.get("data_dir"),
                             event_config.get("event_data_file_test"))
    class_type_model_path = event_config.get(args.event_type_model_path)
    event_schema_file = os.path.join(event_config.get("data_dir"),
                                     event_config.get("event_schema"))
    event_schema_dict = parse_event_schema(event_schema_file)
    fp_type = fastPredictTypeClassification(class_type_model_path,
                                            event_config)
    id_list, text_list = fp_type.parse_test_json(test_file)
    kfold_type_result_list = []
    event_type_result_list = []
    for k in range(6):
        predict_fn = fp_type.load_models_kfold(class_type_model_path.format(k))
        cur_fold_event_type_probs = fp_type.predict_for_all_prob(
            predict_fn, text_list)
        kfold_type_result_list.append(cur_fold_event_type_probs)

    for i in range(len(text_list)):
        cur_sample_event_type_buffer = [
            ele[i] for ele in kfold_type_result_list
        ]
        # 6 folds x 65 event-type labels
        cur_sample_event_type_prob = np.array(
            cur_sample_event_type_buffer).reshape((6, 65))
        avg_result = np.mean(cur_sample_event_type_prob, axis=0)
        # multi-label decision: keep every type whose averaged probability exceeds 0.45
        event_label_ids = np.argwhere(avg_result > 0.45)
        event_cur_type_strs = [
            fp_type.data_loader.id2labels_map.get(ele[0])
            for ele in event_label_ids
        ]
        event_type_result_list.append(event_cur_type_strs)

    # event_type_result_list = fp_type.predict_for_all((text_list))
    # event_type_result_list = []
    # with codecs.open("new_final_event_type.txt", 'r', 'utf-8') as fr:
    #     for line in fr:
    #         line = line.strip("\n")
    #         event_list_cur = line.split(",")
    #         event_type_result_list.append(event_list_cur)
    role_model_path = event_config.get(args.model_role_pb_dir)
    # alternate checkpoint path, only referenced by the commented-out fallback below
    role_model_path_use_best = "output/model/re_lr_fold_{}_usingtype_roberta_large_traindev_event_role_bert_mrc_model_desmodified_lowercase/checkpoint/export/best_exporter"
    fp_role_mrc = fastPredictMRC(role_model_path, event_config, "role")
    id_list, text_list = fp_role_mrc.parse_test_json(test_file)
    submit_result = []
    # index = 0
    kfold_result = []
    # only a single role-model fold is loaded here; the commented lines kept the
    # option of switching selected folds to the best_exporter checkpoints
    for k in range(1):
        # if k in [0,3,5]:
        predict_fn = fp_role_mrc.load_models(role_model_path.format(k))
        # else:
        #     predict_fn = fp_role_mrc.load_models(role_model_path_use_best.format(k))
        cur_fold_probs_result = {}
        for sample_id, event_type_res, text in zip(id_list,
                                                   event_type_result_list,
                                                   text_list):
            if event_type_res is None or len(event_type_res) == 0:
                # submit_result.append({"id": sample_id, "event_list": []})
                cur_fold_probs_result.update({sample_id: []})
                continue

            for cur_event_type in event_type_res:
                cur_event_type = cur_event_type.strip()
                if cur_event_type is None or cur_event_type == "":
                    continue
                corresponding_role_type_list = event_schema_dict.get(
                    cur_event_type)
                event_type_probs_result = []
                for cur_role_type in corresponding_role_type_list:
                    cur_query_word = fp_role_mrc.data_loader.gen_query_for_each_sample(
                        cur_event_type, cur_role_type)
                    token_ids, query_len, token_type_ids, token_mapping = fp_role_mrc.data_loader.trans_single_data_for_test(
                        text, cur_query_word, 512)

                    pred_ids, pred_probs = fp_role_mrc.predict_single_sample(
                        predict_fn, token_ids, query_len, token_type_ids)
                    event_type_probs_result.append(pred_probs)
                cur_fold_probs_result.update({
                    sample_id + "-" + cur_event_type:
                    event_type_probs_result
                })
        kfold_result.append(cur_fold_probs_result)

    for sample_id, event_type_res, text in zip(id_list, event_type_result_list,
                                               text_list):
        event_list = []
        if event_type_res is None or len(event_type_res) == 0:
            submit_result.append({"id": sample_id, "event_list": []})
            continue
        for cur_event_type in event_type_res:
            cur_event_type = cur_event_type.strip()
            if cur_event_type is None or cur_event_type == "":
                continue
            corresponding_role_type_list = event_schema_dict.get(
                cur_event_type)
            find_key = sample_id + "-" + cur_event_type
            fold_probs_cur_sample = [ele.get(find_key) for ele in kfold_result]
            for index, cur_role_type in enumerate(
                    corresponding_role_type_list):
                cur_query_word = fp_role_mrc.data_loader.gen_query_for_each_sample(
                    cur_event_type, cur_role_type)
                token_ids, query_len, token_type_ids, token_mapping = fp_role_mrc.data_loader.trans_single_data_for_test(
                    text, cur_query_word, 512)
                cur_role_fold_probs = [
                    probs[index] for probs in fold_probs_cur_sample
                ]
                # cur_role_fold_probs_array = np.vstack(cur_role_fold_probs)
                token_len = len(token_ids)
                cur_role_fold_probs_array = np.array(
                    cur_role_fold_probs).reshape((1, token_len, 3))
                avg_result = np.mean(cur_role_fold_probs_array, axis=0)
                pred_ids = np.argmax(avg_result, axis=-1)
                token_mapping = token_mapping[1:-1]
                pred_ids = pred_ids[query_len:-1]
                entity_list = extract_entity_span_from_muliclass(
                    text, pred_ids, token_mapping)
                for entity in entity_list:
                    event_list.append({
                        "event_type":
                        cur_event_type,
                        "arguments": [{
                            "role": cur_role_type,
                            "argument": entity
                        }]
                    })
        submit_result.append({"id": sample_id, "event_list": event_list})

    with codecs.open(args.submit_result, 'w', 'utf-8') as fw:
        for dict_result in submit_result:
            write_str = json.dumps(dict_result, ensure_ascii=False)
            fw.write(write_str)
            fw.write("\n")
Пример #11
0
def gen_role_class_data():
    """
    generate role mrc data for verify_neg_fold_data_{}
    """
    # bert vocab file path
    vocab_file_path = os.path.join(event_config.get("bert_pretrained_model_path"), event_config.get("vocab_file"))
    # event role slot list file path
    # slot_pattern/vocab_all_slot_label_noBI_map.txt
    slot_file = os.path.join(event_config.get("slot_list_root_path"),event_config.get("bert_slot_complete_file_name_role"))
    # schema file path
    schema_file = os.path.join(event_config.get("data_dir"), event_config.get("event_schema"))
    # query map file path
    # data/slot_descrip
    query_file = os.path.join(event_config.get("slot_list_root_path"),event_config.get("query_map_file"))
    data_loader = EventRolePrepareMRC(vocab_file_path,512,slot_file,schema_file,query_file)
    train_file = os.path.join(event_config.get("data_dir"),event_config.get("event_data_file_train"))
    eval_file = os.path.join(event_config.get("data_dir"),event_config.get("event_data_file_eval"))
    data_loader.k_fold_split_data(train_file,eval_file,True)
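
All of the examples above read their paths from event_config. The dictionary below only illustrates the shape that configuration needs for gen_role_class_data: the keys are exactly the ones looked up in the function, while the values are placeholders (two of them are taken from the relative paths mentioned in the function's comments) and may differ from the repository's real configs/event_config.

# Illustrative shape of event_config for gen_role_class_data; values are placeholders.
event_config_example = {
    "bert_pretrained_model_path": "pretrained_bert/",                       # placeholder
    "vocab_file": "vocab.txt",                                               # placeholder
    "slot_list_root_path": "slot_pattern/",                                  # placeholder
    "bert_slot_complete_file_name_role": "vocab_all_slot_label_noBI_map.txt",  # from the comment above
    "data_dir": "data/",                                                     # placeholder
    "event_schema": "event_schema.json",                                     # placeholder
    "query_map_file": "slot_descrip",                                        # from the comment above
    "event_data_file_train": "train.json",                                   # placeholder
    "event_data_file_eval": "dev.json",                                      # placeholder
}
# With such entries in place, gen_role_class_data() can be called with no arguments.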