# Shared imports for the predict() variants below. Repo-local helpers
# (logger, ProgressBar, SeqEntityScore, get_entities, json_to_text, collate_fn,
# load_and_cache_examples, bert_extract_item, bert_extract_predict_item) are
# assumed to be in scope. Each block below is a standalone variant of
# predict() from a different task script.
import json
import os
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm


def predict(args, model, tokenizer, prefix=""):
    metric = SeqEntityScore(args.id2label, markup=args.markup)
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    f_results = []
    output_predict_file = os.path.join(pred_output_dir, prefix, "crf_test_prediction.json")
    test_iterator = tqdm(test_dataloader, desc="Predicting", disable=args.local_rank not in [-1, 0])
    if isinstance(model, nn.DataParallel):
        model = model.module
    test_loss = 0.0
    nb_test_steps = 0
    for step, batch in enumerate(test_iterator):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                "input_lens": batch[4],
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
            tmp_test_loss, logits = outputs[:2]
            if args.n_gpu > 1:
                # fix: the mean was previously assigned to an unused variable
                tmp_test_loss = tmp_test_loss.mean()
            tags = model.crf.decode(logits, inputs['attention_mask'])
        tags = tags.squeeze(0).cpu().numpy().tolist()
        # Save the test-set predictions to a file
        preds = tags[0][1:-1]  # strip [CLS] and [SEP]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds])
        json_d['entities'] = label_entities
        f_results.append(json_d)
        test_loss += tmp_test_loss.item()
        nb_test_steps += 1
        out_label_ids = inputs['labels'].cpu().numpy().tolist()
        input_lens = inputs['input_lens'].cpu().numpy().tolist()
        for i, label in enumerate(out_label_ids):
            temp_1 = []  # gold label path
            temp_2 = []  # predicted label path
            for j, m in enumerate(label):
                if j == 0:
                    continue  # skip [CLS]
                elif j == input_lens[i] - 1:  # [SEP] closes the sequence
                    metric.update(pred_paths=[temp_2], label_paths=[temp_1])
                    break
                else:
                    temp_1.append(args.id2label[out_label_ids[i][j]])
                    temp_2.append(args.id2label[tags[i][j]])
    # Log the final test-set results
    logger.info("\n")
    test_loss = test_loss / nb_test_steps
    test_info, entity_info = metric.result()
    results = {f'{key}': value for key, value in test_info.items()}
    results['loss'] = test_loss
    logger.info("***** Test results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    logger.info("\n")
    logger.info("***** Entity results %s *****", prefix)
    for key in sorted(entity_info.keys()):
        logger.info("******* %s results ********", key)
        info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
        logger.info(info)
    logger.info("\n")
    with open(output_predict_file, "w") as writer:
        for record in f_results:
            writer.write(json.dumps(record) + '\n')
    if args.task_name == 'cluener':
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        # fix: pair texts with the per-example predictions (f_results), not
        # with the metrics dict that `results` now holds
        for x, y in zip(test_text, f_results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag, start, end = subject[0], subject[1], subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
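# Illustration (values invented): one JSON line per example goes to
# crf_test_prediction.json, e.g.
#   {"id": 0, "tag_seq": "B-name I-name O O", "entities": [["name", 0, 1]]}
# and the matching CLUENER submission record nests tag -> surface form -> offsets:
#   {"id": "1", "label": {"name": {"张三": [[0, 1]]}}}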
def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": None,
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
            logits = outputs[0]
            tags = model.crf.decode(logits, inputs['attention_mask'])
            tags = tags.squeeze(0).cpu().numpy().tolist()
        preds = tags[0][1:-1]  # strip [CLS] and [SEP]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    logger.info("\n")
    with open(output_predict_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    if args.task_name == 'cluener':
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag, start, end = subject[0], subject[1], subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
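# A minimal driver sketch for the CRF variant above. `build_args`,
# `load_tokenizer`, and `load_model` are hypothetical stand-ins for this
# repo's argument parsing and checkpoint loading:
#
#     args = build_args()
#     tokenizer = load_tokenizer(args)
#     model = load_model(args).to(args.device)
#     predict(args, model, tokenizer, prefix="")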
def predict(args, model, tokenizer, prefix="", batch_size=64):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset, texts = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", batch_size)  # fix: previously logged 1
    results = []
    start = time.time()
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_predict.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": None,
                "end_positions": None,
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
        start_logits, end_logits = outputs[:2]
        R = bert_extract_predict_item(start_logits, end_logits)
        if R:
            label_entities = []
            for x in R:
                label_entity = []
                if x and len(x) > 0:
                    for d in x:
                        if len(d) > 0:
                            label_entity.append([args.id2label[d[0]], d[1], d[2]])
                label_entities.append(label_entity)
        else:
            # fix: label_entities was undefined when no spans were extracted;
            # keep one empty list per example so the indexing below stays aligned
            label_entities = [[] for _ in range(batch[0].size(0))]
        for idx in range(step * batch_size, min(step * batch_size + batch_size, len(texts))):
            json_d = {}
            json_d['id'] = idx
            json_d['text'] = texts[idx]
            json_d['entities'] = label_entities[idx - step * batch_size]
            results.append(json_d)
        pbar(step)
    logger.info("\n")
    logger.info("time cost = %.2fs", time.time() - start)
    with open(output_predict_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    if args.task_name == "cluener":
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag, start, end = subject[0], subject[1], subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
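# Assumption (the helper is not defined in this section):
# bert_extract_predict_item is treated as a batched bert_extract_item that
# returns one list of (label_id, start, end) triples per example, e.g. for a
# batch of two where only the first example contains a span:
#   R == [[(1, 0, 1)], []]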
def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_submit_file = os.path.join(pred_output_dir, prefix, "cluener_predict.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": None,
                "end_positions": None,
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
        start_logits, end_logits = outputs[:2]
        R = bert_extract_item(start_logits, end_logits)
        if R:
            label_entities = [[args.id2label[x[0]], x[1], x[2]] for x in R]
        else:
            label_entities = []
        json_d = {}
        json_d['id'] = step
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    print(" ")
    # This variant writes only the CLUENER submission file, not a separate
    # per-example prediction file.
    test_text = []
    with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
        for line in fr:
            test_text.append(json.loads(line))
    test_submit = []
    for x, y in zip(test_text, results):
        json_d = {}
        json_d['id'] = x['id']
        json_d['label'] = {}
        entities = y['entities']
        words = list(x['text'])
        if len(entities) != 0:
            for subject in entities:
                tag, start, end = subject[0], subject[1], subject[2]
                word = "".join(words[start:end + 1])
                if tag in json_d['label']:
                    if word in json_d['label'][tag]:
                        json_d['label'][tag][word].append([start, end])
                    else:
                        json_d['label'][tag][word] = [[start, end]]
                else:
                    json_d['label'][tag] = {}
                    json_d['label'][tag][word] = [[start, end]]
        test_submit.append(json_d)
    json_to_text(output_submit_file, test_submit)
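# For reference, a sketch of how bert_extract_item can pair start/end logits
# into spans for a batch of one (greedy nearest-end matching). This mirrors
# the usual span-pointer decoding pattern and is an assumption, not
# necessarily this repo's exact implementation:
def bert_extract_item_sketch(start_logits, end_logits):
    spans = []
    # argmax over the label dimension; drop the [CLS]/[SEP] positions
    start_pred = torch.argmax(start_logits, -1).cpu().numpy()[0][1:-1]
    end_pred = torch.argmax(end_logits, -1).cpu().numpy()[0][1:-1]
    for i, s_l in enumerate(start_pred):
        if s_l == 0:
            continue  # label 0 is "O": no entity starts here
        for j, e_l in enumerate(end_pred[i:]):
            if s_l == e_l:  # close the span at the first matching end label
                spans.append((s_l, i, i + j))
                break
    return spans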
def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        # Batch layout:
        #   0-5:  t_all_input_ids, t_all_input_mask, t_all_segment_ids,
        #         t_all_lens, t_all_orig_to_tok_index, t_all_word_lens
        #   6-11: a_all_input_ids, a_all_input_mask, a_all_segment_ids,
        #         a_all_lens, a_all_orig_to_tok_index, a_all_word_lens
        #   12:   all_label_ids
        with torch.no_grad():
            inputs = {
                't_input_ids': batch[0],
                'a_input_ids': batch[6],
                't_orig_to_tok_index': batch[4],
                'a_orig_to_tok_index': batch[10],
                't_attention_mask': batch[1],
                'a_attention_mask': batch[7],
                'labels': None,
                't_input_lens': batch[3],
                'a_input_lens': batch[9],
                't_word_lens': batch[5],
                'a_word_lens': batch[11],
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["t_token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
                inputs["a_token_type_ids"] = batch[8] if args.model_type in ["bert", "xlnet"] else None
            outputs = model(**inputs)
            logits = outputs[0]
            preds, _ = model.crf._obtain_labels(logits, args.id2label, inputs['t_word_lens'])
        preds = preds[0][1:-1]  # strip [CLS] and [SEP]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(preds)
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    print(" ")
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")  # fix: was misspelled "output_predic_file"
    output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
    with open(output_predict_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    test_text = []
    with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
        for line in fr:
            test_text.append(json.loads(line))
    test_submit = []
    for x, y in zip(test_text, results):
        json_d = {}
        json_d['id'] = x['id']
        json_d['label'] = {}
        entities = y['entities']
        words = list(x['text'])
        if len(entities) != 0:
            for subject in entities:
                tag, start, end = subject[0], subject[1], subject[2]
                word = "".join(words[start:end + 1])
                if tag in json_d['label']:
                    if word in json_d['label'][tag]:
                        json_d['label'][tag][word].append([start, end])
                    else:
                        json_d['label'][tag][word] = [[start, end]]
                else:
                    json_d['label'][tag] = {}
                    json_d['label'][tag][word] = [[start, end]]
        test_submit.append(json_d)
    json_to_text(output_submit_file, test_submit)
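# Note on the variant above: crf._obtain_labels is assumed to return label
# strings directly (hence tag_seq = " ".join(preds) with no id2label lookup),
# unlike the crf.decode variants earlier, which return label ids. Illustrative:
#   preds == ['B-name', 'I-name', 'O']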