示例#1
0
def load_data(config):
    """Build train/dev/test dataloaders for the dataset named by ``config.data_sign``.

    Selects the matching dataset processor, constructs the tokenizer and an
    ``MRCNERDataLoader``, and returns the three dataloaders together with the
    training step count and label list.

    Raises:
        ValueError: if ``config.data_sign`` names no known dataset.
    """

    print("-*-" * 10)
    print(f"current data_sign: {config.data_sign}")

    def _select_processor(sign):
        # Instantiate lazily: only the processor matching `sign` is built.
        if sign == "conll03":
            return Conll03Processor()
        if sign == "zh_msra":
            return MSRAProcessor()
        if sign == "zh_onto":
            return Onto4ZhProcessor()
        if sign == "en_onto":
            return Onto5EngProcessor()
        if sign == "genia":
            return GeniaProcessor()
        if sign == "ace2004":
            return ACE2004Processor()
        if sign == "ace2005":
            return ACE2005Processor()
        if sign == "resume":
            return ResumeZhProcessor()
        raise ValueError("Please Notice that your data_sign DO NOT exits !!!!!")

    data_processor = _select_processor(config.data_sign)
    label_list = data_processor.get_labels()
    # NOTE(review): lower-casing is hard-coded here while sibling loaders read
    # config.do_lower_case — confirm this is intentional.
    tokenizer = BertTokenizer4Tagger.from_pretrained(config.bert_model, do_lower_case=True)

    loaders = MRCNERDataLoader(config, data_processor, label_list, tokenizer,
                               mode="train", allow_impossible=True)
    train_dataloader = loaders.get_dataloader(data_sign="train")
    dev_dataloader = loaders.get_dataloader(data_sign="dev")
    test_dataloader = loaders.get_dataloader(data_sign="test")
    num_train_steps = loaders.get_num_train_epochs()

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
def load_data(config, logger):
    """Build the test dataloader and label list for evaluation.

    Picks the dataset processor matching ``config.data_sign``, loads the
    tokenizer, and returns ``(test_dataloader, label_list)``.

    Raises:
        ValueError: if ``config.data_sign`` names no known dataset.
    """

    logger.info("-*-" * 10)
    logger.info(f"current data_sign: {config.data_sign}")

    # Thunks delay instantiation until the matching sign is looked up, so
    # only the selected processor class is ever constructed.
    processor_builders = {
        "conll03": lambda: Conll03Processor(),
        "zh_msra": lambda: MSRAProcessor(),
        "zh_onto": lambda: Onto4ZhProcessor(),
        "en_onto": lambda: Onto5EngProcessor(),
        "genia": lambda: GeniaProcessor(),
        "ace2004": lambda: ACE2004Processor(),
        "ace2005": lambda: ACE2005Processor(),
        "resume": lambda: ResumeZhProcessor(),
    }
    if config.data_sign not in processor_builders:
        raise ValueError("Please Notice that your data_sign DO NOT exits !!!!!")
    data_processor = processor_builders[config.data_sign]()

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(
        config.bert_model, do_lower_case=config.do_lower_case)

    loaders = MRCNERDataLoader(config, data_processor, label_list, tokenizer,
                               mode="test", allow_impossible=True)
    test_dataloader = loaders.get_dataloader(
        data_sign="test", num_data_processor=config.num_data_processor, logger=logger)

    return test_dataloader, label_list
示例#3
0
def predictions(start_logits, end_logits, path, index, bert_model):
    """Decode predicted entity start/end positions for one test example.

    Args:
        start_logits / end_logits: arrays of 0/1 start- and end-position
            indicators (``.numpy()`` is called on them), indexed by ``index``
            along the first axis.
        path: JSON file containing the list of test examples.
        index: position of the example in the file.
            NOTE(review): the same ``index`` is reused as the row index into
            the logit arrays — confirm the two indexings really coincide.
        bert_model: model name/dir used to build the tokenizer.

    Returns:
        ``(entity_label, start_pos_pred, end_pos_pred, start_pos, end_pos)``
        where predicted positions are shifted by the packed query length so
        they are relative to the context, not the [CLS] query [SEP] input.
    """
    # Only the JSON read needs the file handle; close it before the heavier
    # tokenizer loading and array processing (the original held it open for
    # the whole function).
    with open(path, "r") as f:
        test_data = json.load(f)

    test_dict = test_data[index]
    start_pos = test_dict["start_position"]
    end_pos = test_dict["end_position"]
    entity = test_dict["entity_label"]

    tokenizer = BertTokenizer4Tagger.from_pretrained(bert_model, do_lower_case=True)
    query_tokens = tokenizer.tokenize(test_dict["query"])
    # Offset of the context inside the model input: query tokens plus the
    # two special tokens surrounding it.
    n = len(query_tokens) + 2

    start_logit_list = start_logits.numpy()
    end_logit_list = end_logits.numpy()
    start_pos_pred = [idx - n for idx, bit in enumerate(start_logit_list[index]) if bit == 1]
    end_pos_pred = [idx - n for idx, bit in enumerate(end_logit_list[index]) if bit == 1]

    return entity, start_pos_pred, end_pos_pred, start_pos, end_pos
def load_data(config, logger):
    """Assemble train/dev/test dataloaders plus training-step bookkeeping.

    Selects the dataset processor from ``config.data_sign``, builds the
    tokenizer and ``MRCNERDataLoader``, logs a short training summary, and
    returns the three dataloaders, the number of training steps, and the
    label list. In debug mode the dev split doubles as the training split.

    Raises:
        ValueError: if ``config.data_sign`` names no known dataset.
    """

    logger.info("-*-" * 10)
    logger.info(f"current data_sign: {config.data_sign}")

    def _processor_for(sign):
        # Instantiate lazily: only the processor matching `sign` is built.
        if sign == "conll03":
            return Conll03Processor()
        if sign == "zh_msra":
            return MSRAProcessor()
        if sign == "zh_onto":
            return Onto4ZhProcessor()
        if sign == "en_onto":
            return Onto5EngProcessor()
        if sign == "genia":
            return GeniaProcessor()
        if sign == "ace2004":
            return ACE2004Processor()
        if sign == "ace2005":
            return ACE2005Processor()
        if sign == "resume":
            return ResumeZhProcessor()
        if sign == "en_wnut_20_wlp":
            return WlpWnut20Processor()
        raise ValueError("Please Notice that your data_sign DO NOT exits !!!!!")

    data_processor = _processor_for(config.data_sign)
    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(config.bert_model)

    loaders = MRCNERDataLoader(config, data_processor, label_list,
                               tokenizer, mode="train", allow_impossible=True, )  # entity_scheme=config.entity_scheme)

    def _get(split):
        # One place to keep the shared dataloader keyword arguments.
        return loaders.get_dataloader(data_sign=split,
                                      num_data_processor=config.num_data_processor,
                                      logger=logger)

    if config.debug:
        logger.info("%=" * 20)
        logger.info("=" * 10 + " DEBUG MODE " + "=" * 10)
        # Debug runs train on the (smaller) dev split.
        train_dataloader = _get("dev")
    else:
        train_dataloader = _get("train")
    dev_dataloader = _get("dev")
    test_dataloader = _get("test")

    train_instances = loaders.get_train_instance()
    num_train_steps = len(train_dataloader) // config.gradient_accumulation_steps * config.num_train_epochs
    per_gpu_train_batch_size = config.train_batch_size // config.n_gpu

    logger.info("****** Running Training ******")
    logger.info(f"Number of Training Data: {train_instances}")
    logger.info(f"Train Epoch {config.num_train_epochs}; Total Train Steps: {num_train_steps}; Warmup Train Steps: {config.warmup_steps}")
    logger.info(f"Per GPU Train Batch Size: {per_gpu_train_batch_size}")

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
def main():
    """Entry point: parse arguments and pre-convert the train/dev/test
    splits of the selected dataset into feature (binary) files."""
    arg_configs = collect_arguments()

    # Lambdas delay construction so only the selected processor is built.
    processor_factory = {
        "conll03": lambda: Conll03Processor(),
        "zh_msra": lambda: MSRAProcessor(),
        "zh_onto": lambda: Onto4ZhProcessor(),
        "en_onto": lambda: Onto5EngProcessor(),
        "genia": lambda: GeniaProcessor(),
        "ace2004": lambda: ACE2004Processor(),
        "ace2005": lambda: ACE2005Processor(),
        "resume": lambda: ResumeZhProcessor(),
    }
    if arg_configs.data_sign not in processor_factory:
        raise ValueError(
            "Please Notice that your data_sign DO NOT exits !!!!!")
    data_processor = processor_factory[arg_configs.data_sign]()

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(
        arg_configs.bert_model, do_lower_case=arg_configs.do_lower_case)

    loaders = MRCNERDataLoader(
        arg_configs,
        data_processor,
        label_list,
        tokenizer,
        mode="transform_binary_files",
        allow_impossible=arg_configs.allow_impossible)
    print(f"||| Number of Data Processor is : {arg_configs.num_data_processor}")
    # Conversion is run for its on-disk side effects; the returned feature
    # lists were unused in the original and are not kept.
    for split in ("train", "dev", "test"):
        loaders.convert_examples_to_features(
            data_sign=split, num_data_processor=arg_configs.num_data_processor)
示例#6
0
def run_analysis_for_input_length(arg_configs):
    """Print sub-token length statistics for the train/dev/test MRC-NER files
    under ``arg_configs.data_dir``, clipped at ``arg_configs.clip_length``."""
    tokenizer = BertTokenizer4Tagger.from_pretrained(
        arg_configs.bert_model_dir, do_lower_case=arg_configs.do_lower_case)
    print("%=%" * 15)
    print("data_dir", "--->", arg_configs.data_dir)
    print("bert_model_dir", "--->", arg_configs.bert_model_dir)
    print("clip_length", "--->", arg_configs.clip_length)

    for split in ("train", "dev", "test"):
        print("===" * 15)
        print("*** *** *** " * 5, split, "*** *** *** " * 5)

        input_file_path = os.path.join(arg_configs.data_dir, f"mrc-ner.{split}")
        # Each instance is a dict carrying at least the "query" and
        # "context" fields consumed by the tokenization helper.
        with open(input_file_path, "r") as handle:
            data_instances = json.load(handle)
        summary = tokenize_input_sequence_to_subtokens(
            data_instances, tokenizer, arg_configs.clip_length)
        for stat_name, stat_value in summary.items():
            print(stat_name, "---> ", stat_value)