def load_data(config):
    print("-*-" * 10)
    print("current data_sign: {}".format(config.data_sign))

    # map the dataset identifier onto its processor
    if config.data_sign == "conll03":
        data_processor = Conll03Processor()
    elif config.data_sign == "zh_msra":
        data_processor = MSRAProcessor()
    elif config.data_sign == "zh_onto":
        data_processor = Onto4ZhProcessor()
    elif config.data_sign == "en_onto":
        data_processor = Onto5EngProcessor()
    elif config.data_sign == "genia":
        data_processor = GeniaProcessor()
    elif config.data_sign == "ace2004":
        data_processor = ACE2004Processor()
    elif config.data_sign == "ace2005":
        data_processor = ACE2005Processor()
    elif config.data_sign == "resume":
        data_processor = ResumeZhProcessor()
    else:
        raise ValueError("data_sign {} does not exist".format(config.data_sign))

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(config.bert_model, do_lower_case=True)

    dataset_loaders = MRCNERDataLoader(config, data_processor, label_list, tokenizer,
                                       mode="train", allow_impossible=True)
    train_dataloader = dataset_loaders.get_dataloader(data_sign="train")
    dev_dataloader = dataset_loaders.get_dataloader(data_sign="dev")
    test_dataloader = dataset_loaders.get_dataloader(data_sign="test")
    num_train_steps = dataset_loaders.get_num_train_epochs()

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
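# The data_sign -> processor dispatch above is repeated, with minor
# variations, in every entry point below. A minimal sketch of a dict-based
# registry that would collapse those chains; PROCESSOR_REGISTRY and
# get_data_processor are illustrative names, not part of the original code,
# and the sketch assumes the same processor classes are in scope.
PROCESSOR_REGISTRY = {
    "conll03": Conll03Processor,
    "zh_msra": MSRAProcessor,
    "zh_onto": Onto4ZhProcessor,
    "en_onto": Onto5EngProcessor,
    "genia": GeniaProcessor,
    "ace2004": ACE2004Processor,
    "ace2005": ACE2005Processor,
    "resume": ResumeZhProcessor,
}

def get_data_processor(data_sign):
    try:
        return PROCESSOR_REGISTRY[data_sign]()
    except KeyError:
        raise ValueError("data_sign {} does not exist; expected one of {}".format(
            data_sign, sorted(PROCESSOR_REGISTRY)))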
def load_data(config, logger):
    logger.info("-*-" * 10)
    logger.info(f"current data_sign: {config.data_sign}")

    if config.data_sign == "conll03":
        data_processor = Conll03Processor()
    elif config.data_sign == "zh_msra":
        data_processor = MSRAProcessor()
    elif config.data_sign == "zh_onto":
        data_processor = Onto4ZhProcessor()
    elif config.data_sign == "en_onto":
        data_processor = Onto5EngProcessor()
    elif config.data_sign == "genia":
        data_processor = GeniaProcessor()
    elif config.data_sign == "ace2004":
        data_processor = ACE2004Processor()
    elif config.data_sign == "ace2005":
        data_processor = ACE2005Processor()
    elif config.data_sign == "resume":
        data_processor = ResumeZhProcessor()
    else:
        raise ValueError(f"data_sign {config.data_sign} does not exist")

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(config.bert_model,
                                                     do_lower_case=config.do_lower_case)

    # evaluation-only variant: build just the test split
    dataset_loaders = MRCNERDataLoader(config, data_processor, label_list, tokenizer,
                                       mode="test", allow_impossible=True)
    test_dataloader = dataset_loaders.get_dataloader(data_sign="test",
                                                     num_data_processor=config.num_data_processor,
                                                     logger=logger)

    return test_dataloader, label_list
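# Hypothetical call site for the evaluation-time loader above. The repo's
# real config object carries more fields (data paths, sequence lengths,
# batch sizes); argparse.Namespace here only illustrates the call shape,
# and the model name is a placeholder.
import logging
from argparse import Namespace

eval_logger = logging.getLogger(__name__)
eval_config = Namespace(data_sign="conll03", bert_model="bert-base-uncased",
                        do_lower_case=True, num_data_processor=1)
test_dataloader, label_list = load_data(eval_config, eval_logger)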
def predictions(start_logits, end_logits, path, index, bert_model):
    with open(path, "r") as f:
        test_data = json.load(f)

    test_dict = test_data[index]
    query_item = test_dict["query"]
    start_pos = test_dict["start_position"]
    end_pos = test_dict["end_position"]
    context_item = test_dict["context"]
    entity = test_dict["entity_label"]

    tokenizer = BertTokenizer4Tagger.from_pretrained(bert_model, do_lower_case=True)
    query_tokens = tokenizer.tokenize(query_item)
    # offset of the context inside the encoded sequence: [CLS] + query + [SEP]
    n = len(query_tokens) + 2

    start_pos_pred = []
    end_pos_pred = []
    start_logit_list = start_logits.numpy()
    end_logit_list = end_logits.numpy()

    # positions flagged 1 are predicted span boundaries; shift them back to
    # context-relative indices by subtracting the query offset
    for idx, bit in enumerate(start_logit_list[index]):
        if bit == 1:
            start_pos_pred.append(idx - n)
    for idx, bit in enumerate(end_logit_list[index]):
        if bit == 1:
            end_pos_pred.append(idx - n)

    return entity, start_pos_pred, end_pos_pred, start_pos, end_pos
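# Minimal worked example of the offset arithmetic in `predictions`
# (hypothetical numbers; assumes the usual [CLS] query [SEP] context packing):
query_tokens = ["what", "is", "the", "person", "?"]  # 5 query sub-tokens
n = len(query_tokens) + 2                            # +2 for [CLS] and [SEP] -> 7
encoded_start_index = 9                              # flagged position in the packed sequence
context_start_index = encoded_start_index - n        # 9 - 7 = 2, the third context sub-token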
def load_data(config, logger):
    logger.info("-*-" * 10)
    logger.info(f"current data_sign: {config.data_sign}")

    if config.data_sign == "conll03":
        data_processor = Conll03Processor()
    elif config.data_sign == "zh_msra":
        data_processor = MSRAProcessor()
    elif config.data_sign == "zh_onto":
        data_processor = Onto4ZhProcessor()
    elif config.data_sign == "en_onto":
        data_processor = Onto5EngProcessor()
    elif config.data_sign == "genia":
        data_processor = GeniaProcessor()
    elif config.data_sign == "ace2004":
        data_processor = ACE2004Processor()
    elif config.data_sign == "ace2005":
        data_processor = ACE2005Processor()
    elif config.data_sign == "resume":
        data_processor = ResumeZhProcessor()
    elif config.data_sign == "en_wnut_20_wlp":
        data_processor = WlpWnut20Processor()
    else:
        raise ValueError(f"data_sign {config.data_sign} does not exist")

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(config.bert_model)

    dataset_loaders = MRCNERDataLoader(config, data_processor, label_list, tokenizer,
                                       mode="train", allow_impossible=True)
    # entity_scheme=config.entity_scheme was an additional kwarg, currently disabled

    if config.debug:
        logger.info("%=" * 20)
        logger.info("=" * 10 + " DEBUG MODE " + "=" * 10)
        # debug mode trains on the (smaller) dev split for fast iteration
        train_dataloader = dataset_loaders.get_dataloader(data_sign="dev",
                                                          num_data_processor=config.num_data_processor,
                                                          logger=logger)
    else:
        train_dataloader = dataset_loaders.get_dataloader(data_sign="train",
                                                          num_data_processor=config.num_data_processor,
                                                          logger=logger)

    dev_dataloader = dataset_loaders.get_dataloader(data_sign="dev",
                                                    num_data_processor=config.num_data_processor,
                                                    logger=logger)
    test_dataloader = dataset_loaders.get_dataloader(data_sign="test",
                                                     num_data_processor=config.num_data_processor,
                                                     logger=logger)

    train_instances = dataset_loaders.get_train_instance()
    # one optimizer update per `gradient_accumulation_steps` batches
    num_train_steps = len(train_dataloader) // config.gradient_accumulation_steps * config.num_train_epochs
    per_gpu_train_batch_size = config.train_batch_size // config.n_gpu

    logger.info("****** Running Training ******")
    logger.info(f"Number of Training Data: {train_instances}")
    logger.info(f"Train Epoch {config.num_train_epochs}; Total Train Steps: {num_train_steps}; Warmup Train Steps: {config.warmup_steps}")
    logger.info(f"Per GPU Train Batch Size: {per_gpu_train_batch_size}")

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
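# Worked example of the step arithmetic above, with hypothetical values:
# len(train_dataloader) == 1000 batches, gradient_accumulation_steps == 4 and
# num_train_epochs == 3 give num_train_steps = 1000 // 4 * 3 = 750 optimizer
# updates; train_batch_size == 32 spread over n_gpu == 4 gives
# per_gpu_train_batch_size == 8.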
def main():
    arg_configs = collect_arguments()

    if arg_configs.data_sign == "conll03":
        data_processor = Conll03Processor()
    elif arg_configs.data_sign == "zh_msra":
        data_processor = MSRAProcessor()
    elif arg_configs.data_sign == "zh_onto":
        data_processor = Onto4ZhProcessor()
    elif arg_configs.data_sign == "en_onto":
        data_processor = Onto5EngProcessor()
    elif arg_configs.data_sign == "genia":
        data_processor = GeniaProcessor()
    elif arg_configs.data_sign == "ace2004":
        data_processor = ACE2004Processor()
    elif arg_configs.data_sign == "ace2005":
        data_processor = ACE2005Processor()
    elif arg_configs.data_sign == "resume":
        data_processor = ResumeZhProcessor()
    else:
        raise ValueError("data_sign {} does not exist".format(arg_configs.data_sign))

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer4Tagger.from_pretrained(
        arg_configs.bert_model, do_lower_case=arg_configs.do_lower_case)

    # offline pass: tokenize every split once and cache the features as binary files
    dataset_loaders = MRCNERDataLoader(
        arg_configs, data_processor, label_list, tokenizer,
        mode="transform_binary_files", allow_impossible=arg_configs.allow_impossible)

    print("||| Number of data processors: {}".format(arg_configs.num_data_processor))
    train_features = dataset_loaders.convert_examples_to_features(
        data_sign="train", num_data_processor=arg_configs.num_data_processor)
    dev_features = dataset_loaders.convert_examples_to_features(
        data_sign="dev", num_data_processor=arg_configs.num_data_processor)
    test_features = dataset_loaders.convert_examples_to_features(
        data_sign="test", num_data_processor=arg_configs.num_data_processor)
def run_analysis_for_input_length(arg_configs):
    tokenizer = BertTokenizer4Tagger.from_pretrained(arg_configs.bert_model_dir,
                                                     do_lower_case=arg_configs.do_lower_case)

    print("%=%" * 15)
    print("data_dir", "--->", arg_configs.data_dir)
    print("bert_model_dir", "--->", arg_configs.bert_model_dir)
    print("clip_length", "--->", arg_configs.clip_length)

    for data_type in ["train", "dev", "test"]:
        print("===" * 15)
        print("*** *** *** " * 5, data_type, "*** *** *** " * 5)
        input_file_path = os.path.join(arg_configs.data_dir, "mrc-ner.{}".format(data_type))
        with open(input_file_path, "r") as f:
            data_instances = json.load(f)
        # data_instances is a list of dicts; each element carries the keys
        # query, context, start_position, end_position and entity_label
        summary_of_input_data = tokenize_input_sequence_to_subtokens(data_instances, tokenizer,
                                                                     arg_configs.clip_length)
        for s_k, s_v in summary_of_input_data.items():
            print(s_k, "---> ", s_v)
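# `tokenize_input_sequence_to_subtokens` is defined elsewhere in the repo.
# A minimal sketch of the kind of length statistics such a pass could report;
# the function name and summary keys below are assumptions, and the instance
# keys follow the mrc-ner JSON format read above.
def summarize_subtoken_lengths(data_instances, tokenizer, clip_length):
    # [CLS] query [SEP] context [SEP] -> three special tokens per instance
    lengths = [len(tokenizer.tokenize(item["query"]))
               + len(tokenizer.tokenize(item["context"])) + 3
               for item in data_instances]
    return {
        "num_instances": len(lengths),
        "max_subtokens": max(lengths),
        "avg_subtokens": sum(lengths) / len(lengths),
        "num_clipped": sum(1 for length in lengths if length > clip_length),
    }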