def __init__(self, y_true, y_pred):
    self.f1 = f1_score(y_true, y_pred)
    self.acuracia = accuracy_score(y_true, y_pred)
    self.precisao = precision_score(y_true, y_pred)
    self.recall = recall_score(y_true, y_pred)
    self.relatorio_classificacao = classification_report(y_true, y_pred)
def evaluate(args, model, UniDataSet, task): _, dataset, _ = UniDataSet.load_single_dataset( task, batch_size=args.mini_batch_size, mode="dev") task_id = UniDataSet.task_map[task] label_list = UniDataSet.labels_list[task_id] if torch.cuda.device_count() > 0: eval_batch_size = torch.cuda.device_count() * args.mini_batch_size else: eval_batch_size = args.mini_batch_size eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size) logger.info(" *** Runing {} evaluation ***".format(task)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", eval_batch_size) nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "task_id": task_id } outputs = model(**inputs) if args.do_alpha: alpha = outputs[0] outputs = outputs[1:] if type(model.classifier_list[task_id] ) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id] ) == HummingbirdLSTMBiAffineDecoder: # do parsing logits_arc = outputs[0] logits_label = outputs[1] else: logits = outputs[0] nb_eval_steps += 1 if preds is None: # print("preds", logits.shape) if type(model.classifier_list[task_id] ) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id] ) == HummingbirdLSTMBiAffineDecoder: preds_arc = logits_arc.detach().cpu().numpy() preds_label = logits_label.detach().cpu().numpy() out_head_ids = batch[4].detach().cpu().numpy() out_label_ids = batch[3].detach().cpu().numpy() else: preds = logits.detach().cpu().numpy() out_label_ids = batch[3].detach().cpu().numpy() else: if type(model.classifier_list[task_id] ) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id] ) == HummingbirdLSTMBiAffineDecoder: preds_arc = np.append(preds_arc, logits_arc.detach().cpu().numpy(), axis=0) preds_label = np.append(preds_label, logits_label.detach().cpu().numpy(), axis=0) out_head_ids = np.append(out_head_ids, batch[4].detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, batch[3].detach().cpu().numpy(), axis=0) else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, batch[3].detach().cpu().numpy(), axis=0) if type(model.classifier_list[task_id]) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id]) == HummingbirdLSTMBiAffineDecoder: preds_arc = np.argmax(preds_arc, axis=2) preds_label = np.argmax(preds_label, axis=2) else: preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(label_list)} print(label_map) if type(model.classifier_list[task_id]) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id]) == HummingbirdLSTMBiAffineDecoder: pad_token_label_id = -100 out_head_list = [[] for _ in range(out_head_ids.shape[0])] preds_arc_list = [[] for _ in range(out_head_ids.shape[0])] out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_label_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_head_ids.shape[0]): for j in range(out_head_ids.shape[1]): if out_head_ids[i, j] != pad_token_label_id: out_head_list[i].append(str(out_head_ids[i][j])) preds_arc_list[i].append(str(preds_arc[i][j])) for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) 
preds_label_list[i].append(label_map[preds_label[i][j]]) else: out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != -100: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) if task == "ONTO_NER" or task == "NER": for i in range(len(preds_list)): for j in range(len(preds_list[i])): preds_list[i][j] = preds_list[i][j].split("-")[-1] for i in range(len(out_label_list)): for j in range(len(out_label_list[i])): out_label_list[i][j] = out_label_list[i][j].split("-")[-1] results = {} if type(model.classifier_list[task_id]) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id]) == HummingbirdLSTMBiAffineDecoder: results["uas"] = accuracy_score(out_head_list, preds_arc_list) results["las"] = las_score(out_label_list, out_head_list, preds_label_list, preds_arc_list) else: results["a"] = accuracy_score(out_label_list, preds_list) results["p"] = precision_score(out_label_list, preds_list) results["r"] = recall_score(out_label_list, preds_list) results["f"] = f1_score(out_label_list, preds_list) logger.info("*** {} Evaluate results ***".format(task)) for key in sorted(results.keys()): logger.info(" %s = %s ", key, str(results[key])) # print(results) if type(model.classifier_list[task_id]) == DeepBiAffineDecoderV2 or type( model.classifier_list[task_id]) == HummingbirdLSTMBiAffineDecoder: print("sample results") print("preds head", preds_arc_list[0]) print("true head", out_head_list[0]) print("preds label", preds_label_list[0]) print("true label", out_label_list[0]) else: print("predict_sample") print("predict_list", preds_list[0]) print("out_label_list", out_label_list[0]) return results
def evaluate(args, model, tokenizer, label_list, pad_token_label_id): eval_output_dir = args.output_dir if not os.path.exists(eval_output_dir): os.makedirs(eval_output_dir) eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='dev') args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) ## span_labels = [] for label in label_list: label = label.split('-')[-1] if label not in span_labels: span_labels.append(label) span_map = {i: label for i, label in enumerate(span_labels)} # Eval logger.info("***** Running evaluation %s *****") logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 true_labels = [] predict_labels = [] model.eval() pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating") for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = {"input_ids": batch[0], "attention_mask": batch[1], "start_positions": batch[5], "end_positions": batch[6]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) tmp_eval_loss, start_logits, end_logits = outputs[:3] if args.n_gpu > 1: tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 start_preds = start_logits.detach().cpu().numpy() # [64, 128, 5] end_preds = end_logits.detach().cpu().numpy() start_preds = np.argmax(start_preds, axis=2) # [64, 128] end_preds = np.argmax(end_preds, axis=2) start_preds_list = [] end_preds_list = [] batch_true_labels = batch[4].squeeze(0).cpu().numpy().tolist() for index, input_length in enumerate(batch[3]): # batch[3] 每句长度 start_preds_list.append([span_map[j] for j in start_preds[index][:input_length]][1:-1]) end_preds_list.append([span_map[j] for j in end_preds[index][:input_length]][1:-1]) batch_true = [args.id2label.get(i) for i in batch_true_labels[index][:input_length]][1:-1] true_labels.append(batch_true) batch_predict_labels = convert_span_to_bio(start_preds_list, end_preds_list) predict_labels.extend(batch_predict_labels) pbar(step) logger.info("\n") logger.info("average eval_loss: %s", str(eval_loss/nb_eval_steps)) logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels))) logger.info("p: %s", str(precision_score(true_labels, predict_labels))) logger.info("r: %s", str(recall_score(true_labels, predict_labels))) logger.info("f1: %s", str(f1_score(true_labels, predict_labels))) logger.info("classification report: ") logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, #64, 256 type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval or not.") parser.add_argument("--eval_on", default="dev", help="Whether to run eval on the dev set or test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = AlbertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = AlbertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name) model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = 
int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # Load a trained model and vocabulary that you have fine-tuned model = Ner.from_pretrained(args.output_dir) tokenizer = AlbertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): if args.eval_on == "dev": eval_examples = processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = 
label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map): y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) report = classification_report(y_true, y_pred, digits=4) accuracy = accuracy_score(y_true, y_pred) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report) with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") logger.info("\n%s", accuracy) writer.write(str(accuracy))
def test(
    self,
    data: DataLoader,
    all_ids: list,
    tag2code: dict,
    code2tag: dict,
) -> (dict, pd.DataFrame):
    eval_loss = 0.
    eval_steps, eval_examples = 0, 0
    eval_ids, eval_tokens, eval_predictions, eval_labels = [], [], [], []
    self.model.eval()
    for batch in data:
        batch_ids, batch_tokens, batch_masks, batch_tags = tuple(
            t.to(self.device) for t in batch)
        with torch.no_grad():
            outputs = self.model(batch_tokens,
                                 attention_mask=batch_masks,
                                 labels=batch_tags)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_tags.to('cpu').numpy()
        toks = batch_tokens.to('cpu').numpy()
        sentence_ids = batch_ids.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        toks = [
            self.tokenizer.convert_ids_to_tokens(sentence) for sentence in toks
        ]
        eval_tokens.extend(toks)
        eval_predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        eval_labels.extend(label_ids)
        eval_ids.extend(sentence_ids)
        eval_examples += batch_tokens.size(0)
        eval_steps += 1
    eval_loss = eval_loss / eval_steps
    flatten = lambda x: [j for i in x for j in i]
    predicted_tags, valid_tags, tokens, sentence_ids = self.translate(
        eval_predictions, eval_labels, eval_tokens, eval_ids, tag2code,
        code2tag, all_ids)
    # for st, sp, sv, vi in zip(tokens, predicted_tags, valid_tags, sentence_ids):
    #     for t, p, v, i in zip(st, sp, sv, vi):
    #         logger.info(f"row = {t}, {p}, {v}, {i}")
    predicted_data = pd.DataFrame(
        data={
            'sentence_id': flatten(sentence_ids),
            'tokens': flatten(tokens),
            'predicted_tag': flatten(predicted_tags),
            'valid_tag': flatten(valid_tags),
        })
    if len([
            tag for sent in valid_tags for tag in sent
            if tag[:2] in ['B-', 'I-']
    ]) == 0:
        # Guard against an evaluation set with no entities at all, which would
        # otherwise make the entity-level seqeval scores undefined.
        valid_tags.append(["O"])
        predicted_tags.append(["B-ORG"])
    scores = {
        "loss": eval_loss,
        "acc": accuracy_score(valid_tags, predicted_tags),
        "f1": f1_score(valid_tags, predicted_tags),
        "p": precision_score(valid_tags, predicted_tags),
        "r": recall_score(valid_tags, predicted_tags),
        "report": classification_report(valid_tags, predicted_tags),
    }
    return scores, predicted_data
def evaluate(args, model, tokenizer, label_list, pad_token_label_id): eval_output_dir = args.output_dir if not os.path.exists(eval_output_dir): os.makedirs(eval_output_dir) eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='dev') args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn) # Eval logger.info("***** Running evaluation %s *****") logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 true_labels = [] predict_labels = [] model.eval() pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating") for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], 'input_lens': batch[4] } if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] batch_predict_labels = model.crf.decode(logits, inputs['attention_mask']) if args.n_gpu > 1: tmp_eval_loss = tmp_eval_loss.mean( ) # mean() to average on multi-gpu parallel evaluating eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist() pbar(step) for index, input_length in enumerate(batch[4]): batch_true = [ args.id2label.get(i) for i in batch_true_labels[index][:input_length] ][1:-1] batch_predict = [ args.id2label.get(i) for i in batch_predict_labels[index][:input_length] ][1:-1] true_labels.append(batch_true) predict_labels.append(batch_predict) logger.info("\n") logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps)) logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels))) logger.info("p: %s", str(precision_score(true_labels, predict_labels))) logger.info("r: %s", str(recall_score(true_labels, predict_labels))) logger.info("f1: %s", str(f1_score(true_labels, predict_labels))) logger.info("classification report: ") logger.info( str( classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
true_labels.extend(val_batch_labels)
tmp_eval_accuracy = flat_accuracy(val_batch_labels, val_batch_preds)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += b_input_ids.size(0)
nb_eval_steps += 1

# Evaluate loss, acc, conf. matrix, and class. report on devset
pred_tags = [[tag2name[i] for i in predictions]]
valid_tags = [[tag2name[i] for i in true_labels]]
cl_report = classification_report(valid_tags, pred_tags)
eval_loss = eval_loss / nb_eval_steps
tmp_accuracy = accuracy_score(valid_tags, pred_tags)

if tmp_accuracy > dev_best_acc:
    dev_best_acc = tmp_accuracy
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
    output_config_file = os.path.join(bert_out_address, "config.json")
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)

# Report metrics
f1 = f1_score(valid_tags, pred_tags)
if f1 > dev_best_f1:
    dev_best_f1 = f1
def evaluate(y_true, y_pred):
    print("accuracy: {:.2}".format(accuracy_score(y_true, y_pred)))
    print("precision: {:.2}".format(precision_score(y_true, y_pred)))
    print("recall: {:.2}".format(recall_score(y_true, y_pred)))
    print("f1: {:.2}".format(f1_score(y_true, y_pred)))
    print(classification_report(y_true, y_pred))
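# A minimal usage sketch for the evaluate() helper above, assuming the seqeval
# metrics it calls are imported as shown here; the tag sequences below are
# illustrative only, not taken from any dataset.
from seqeval.metrics import (accuracy_score, classification_report, f1_score,
                             precision_score, recall_score)

y_true = [['B-PER', 'I-PER', 'O', 'B-LOC'], ['O', 'B-ORG', 'I-ORG']]
y_pred = [['B-PER', 'I-PER', 'O', 'O'], ['O', 'B-ORG', 'I-ORG']]
evaluate(y_true, y_pred)  # prints accuracy, precision, recall, f1 and a report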
def calculate_metrics(pred_tags, gt_tags):
    # The seqeval metrics expect (y_true, y_pred); pass the gold tags first so
    # that precision and recall are not swapped.
    f1 = f1_score(gt_tags, pred_tags) * 100
    ppv = precision_score(gt_tags, pred_tags) * 100
    sen = recall_score(gt_tags, pred_tags) * 100
    acc = accuracy_score(gt_tags, pred_tags) * 100
    return {'f1': f1, 'precision': ppv, 'recall': sen, 'accuracy': acc}
# Accumulate the validation loss.
total_eval_loss += loss.item()

# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
val_pred.append(np.argmax(logits, axis=2).flatten().tolist())
val_lab.append(label_ids.flatten().tolist())

# Report the final accuracy for this validation run.
val_pred_tag = get_label_name(val_pred)
val_lab_tag = get_label_name(val_lab)
avg_val_accuracy = accuracy_score(y_pred=val_pred_tag, y_true=val_lab_tag)
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
val_class_report = classification_report(y_pred=val_pred_tag, y_true=val_lab_tag)
print("====class_report:", val_class_report)

# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(val_data_loader)

# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))

# Record all statistics from this epoch.
def iterate_batches(self, epoch, n_epoch, iterator, train, mode): ''' iterates through batchs in an epoch epoch: current epoch n_epoch: total epochs iterator: the iterator to be used for fetching batches train: switch for whether or not to train this epoch mode: string that just labels the epoch in the output ''' # initialize lists for batch losses and metrics batch_loss = [] batch_accuracy = [] batch_f1 = [] # initialize batch range batch_range = tqdm(iterator, desc='') for batch in batch_range: # fetch texts, characters, and tags from batch text = batch.text.to(self.device) char = batch.char.to(self.device) tag = batch.tag.to(self.device) # zero out prior gradients for training if train: self.optimizer.zero_grad() # output depends on whether conditional random field is used for prediction/loss if self.model.use_crf: prediction, loss = self.model(text, char, tag) else: logit = self.model(text, char, tag) loss = self.criterion(logit.view(-1, logit.shape[-1]), tag.view(-1)) logit = logit.detach().cpu().numpy() prediction = [list(p) for p in np.argmax(logit, axis=2)] # send the true tags to python list on the cpu true = list(tag.to('cpu').numpy()) # put the prediction tags and valid tags into a nested list form for the scoring metrics prediction_tags = [[self.data.tag_field.vocab.itos[ii] for ii, jj in zip(i, j) if self.data.tag_field.vocab.itos[jj] != self.data.pad_token] for i, j in zip(prediction, true)] valid_tags = [[self.data.tag_field.vocab.itos[ii] for ii in i if self.data.tag_field.vocab.itos[ii] != self.data.pad_token] for i in true] # calculate the accuracy and f1 scores accuracy = accuracy_score(valid_tags, prediction_tags) f1 = f1_score(valid_tags, prediction_tags) # append to the lists batch_loss.append(loss.item()) batch_accuracy.append(accuracy) batch_f1.append(f1) # backpropagate the gradients and step the optimizer forward if train: loss.backward() self.optimizer.step() # calculate means across the batches so far means = (np.mean(batch_loss), np.mean(batch_accuracy), np.mean(batch_f1)) # display progress batch_range.set_description('| epoch: {:d}/{} | {} | loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f} |'.format(epoch+1, n_epoch, mode, *means)) # return the batch losses and metrics return batch_loss, batch_accuracy, batch_f1
def testing(self, test_episodes, label_map): self.bert.load_state_dict( torch.load( 'MetaLearningForNER/saved_models/SupervisedLearner-stable.h5')) map_to_label = {v: k for k, v in label_map.items()} # episode_accuracies, episode_precisions, episode_recalls, episode_f1s = [], [], [], [] all_true_labels = [] all_predictions = [] for episode_id, episode in enumerate(tqdm(test_episodes)): batch_x, batch_len, batch_y = next(iter(episode.support_loader)) support_repr, _, support_labels = self.vectorize( batch_x, batch_len, batch_y) support_repr = support_repr.reshape( support_repr.shape[0] * support_repr.shape[1], -1) support_labels = support_labels.view(-1) support_repr = support_repr[support_labels != -1].cpu().numpy() support_labels = support_labels[support_labels != -1].cpu().numpy() batch_x, batch_len, batch_y = next(iter(episode.query_loader)) query_repr, _, true_labels = self.vectorize( batch_x, batch_len, batch_y) query_bs, query_seqlen = query_repr.shape[0], query_repr.shape[1] query_repr = query_repr.reshape( query_repr.shape[0] * query_repr.shape[1], -1) true_labels = true_labels.view(-1) # query_repr = query_repr[true_labels != -1].cpu().numpy() query_repr = query_repr.cpu().numpy() # true_labels = true_labels[true_labels != -1].cpu().numpy() true_labels = true_labels.cpu().numpy() dist = cdist(query_repr, support_repr, metric='cosine') nearest_neighbor = np.argmin(dist, axis=1) predictions = support_labels[nearest_neighbor] true_labels = true_labels.reshape(query_bs, query_seqlen) predictions = predictions.reshape(query_bs, query_seqlen) seq_true_labels, seq_predictions = [], [] for i in range(len(true_labels)): true_i = true_labels[i] pred_i = predictions[i] seq_predictions.append( [map_to_label[val] for val in pred_i[true_i != -1]]) seq_true_labels.append( [map_to_label[val] for val in true_i[true_i != -1]]) all_predictions.extend(seq_predictions) all_true_labels.extend(seq_true_labels) accuracy = accuracy_score(seq_true_labels, seq_predictions) precision = precision_score(seq_true_labels, seq_predictions) recall = recall_score(seq_true_labels, seq_predictions) f1 = f1_score(seq_true_labels, seq_predictions) # logger.info('Episode {}/{}, task {} [query set]: Accuracy = {:.5f}, precision = {:.5f}, ' # 'recall = {:.5f}, F1 score = {:.5f}'.format(episode_id + 1, len(test_episodes), episode.task_id, # accuracy, precision, recall, f1)) # episode_accuracies.append(accuracy) # episode_precisions.append(precision) # episode_recalls.append(recall) # episode_f1s.append(f1_score) accuracy = accuracy_score(all_true_labels, all_predictions) precision = precision_score(all_true_labels, all_predictions) recall = recall_score(all_true_labels, all_predictions) f1 = f1_score(all_true_labels, all_predictions) logger.info( 'Avg meta-testing metrics: Accuracy = {:.5f}, precision = {:.5f}, recall = {:.5f}, ' 'F1 score = {:.5f}'.format(accuracy, precision, recall, f1)) return f1
def compute_metrics(p: EvalPrediction) -> Dict:
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {"acc": accuracy_score(out_label_list, preds_list)}
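# A minimal sketch of an align_predictions() helper compatible with the
# compute_metrics() above, modeled on the other evaluate() functions in this
# file. The label_map dict and the -100 padding label id are assumptions here,
# not taken from the original code.
from typing import Dict, List, Tuple

import numpy as np

label_map: Dict[int, str] = {0: "O", 1: "B-PER", 2: "I-PER"}  # placeholder mapping


def align_predictions(predictions: np.ndarray,
                      label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    preds_list: List[List[str]] = [[] for _ in range(batch_size)]
    out_label_list: List[List[str]] = [[] for _ in range(batch_size)]
    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != -100:  # skip ignored / padded positions
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])
    return preds_list, out_label_list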
def get_predictions(model, dataloader, device): eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] model.eval() predictions, hit, miss, err, total = [], 0, 0, 0, 0 with torch.no_grad(): for _, *data in dataloader: if next(model.parameters()).is_cuda: data = [t.to(device) for t in data if t is not None] tokens_tensors, segments_tensors, masks_tensors, labels = data outputs = model( input_ids=tokens_tensors, token_type_ids=None, # token_type_ids=segments_tensors, attention_mask=masks_tensors) logits = outputs[0] logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() # Get NER true result labels = labels.to('cpu').numpy() # Only predict the real word, mark=0, will not calculate masks_tensors = masks_tensors.to('cpu').numpy() # Compare the valuable predict result for i, mask in enumerate(masks_tensors): # Real one temp_1 = [] # Predict one temp_2 = [] for j, m in enumerate(mask): # Mark=0, meaning its a pad word, dont compare if m: if tag2name[labels[i][j]] != "X" \ and tag2name[labels[i][j]] != "[CLS]" \ and tag2name[labels[i][j]] != "[SEP]" : # Exclude the X label temp_1.append(tag2name[labels[i][j]]) temp_2.append(tag2name[logits[i][j]]) else: break y_true.append(temp_1) y_pred.append(temp_2) mtag = lambda labs: [tag2idx['I-per'] \ if l == tag2idx['B-per'] else l for l in labs] aseq = set([tuple(i for i,value in it) \ for key,it in itertools.groupby( enumerate(mtag(labels[i])), key=operator.itemgetter(1)) \ if key == tag2idx['I-per']]) pseq = set([tuple(i for i,value in it) \ for key,it in itertools.groupby( enumerate(mtag(logits[i])), key=operator.itemgetter(1)) \ if key == tag2idx['I-per']]) total += len(aseq) hit += len(pseq & aseq) miss += len(aseq - pseq) err += len(pseq - aseq) ##predictions.append(pseq) print("f1 socre: %f" % (f1_score(y_true, y_pred))) print("Accuracy score: %f" % (accuracy_score(y_true, y_pred))) print("Name score hit: {} / {} = {}".format(hit, total, hit / total)) print("Name score miss: {} / {} = {}".format(miss, total, miss / total)) print("Name score error: {} / {} = {}".format(err, total, err / total)) return None, accuracy_score(y_true, y_pred)
# Not storing gradient for memory
with torch.no_grad():
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)
logits = outputs[1].detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
eval_loss += outputs[0].mean().item()
predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
true_labels.extend(label_ids)

eval_loss = eval_loss / len(valid_dataloader)
validation_loss_values.append(eval_loss)
pred_tags = [
    tag_values[p_i] for p, l in zip(predictions, true_labels)
    for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"
]
valid_tags = [
    tag_values[l_i] for l in true_labels for l_i in l
    if tag_values[l_i] != "PAD"
]
print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
torch.save(model.state_dict(), "Criteria.pth")
def evaluate(args, model, UniDataSet, task): _, dataset, _ = UniDataSet.load_single_dataset( task, batch_size=args.mini_batch_size, mode="dev") task_id = UniDataSet.task_map[task] label_list = UniDataSet.labels_list[task_id] if torch.cuda.device_count() > 0: eval_batch_size = torch.cuda.device_count() * args.mini_batch_size else: eval_batch_size = args.mini_batch_size eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size) logger.info(" *** Runing {} evaluation ***".format(task)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", eval_batch_size) nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "task_id": task_id } outputs = model(**inputs) if args.do_alpha: alpha = outputs[0] outputs = outputs[1:] logits = outputs[0] nb_eval_steps += 1 if preds is None: # print("preds", logits.shape) preds = logits.detach().cpu().numpy() out_label_ids = batch[3].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, batch[3].detach().cpu().numpy(), axis=0) preds = np.argmax(preds, axis=2) if len(label_list) == 0: pass else: label_map = {i: label for i, label in enumerate(label_list)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != -100: if len(label_list) == 0: out_label_list[i].append(str(out_label_ids[i][j])) preds_list[i].append(str(preds[i][j])) else: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) if task == "ONTO_NER" or task == "NER": for i in range(len(preds_list)): for j in range(len(preds_list[i])): preds_list[i][j] = preds_list[i][j].split("-")[-1] for i in range(len(out_label_list)): for j in range(len(out_label_list[i])): out_label_list[i][j] = out_label_list[i][j].split("-")[-1] results = {} results["a"] = accuracy_score(out_label_list, preds_list) results["p"] = precision_score(out_label_list, preds_list) results["r"] = recall_score(out_label_list, preds_list) results["f"] = f1_score(out_label_list, preds_list) logger.info("*** {} Evaluate results ***".format(task)) for key in sorted(results.keys()): logger.info(" %s = %s ", key, str(results[key])) # print(results) print("predict_sample") print("predict_list", preds_list[0]) print("out_label_list", out_label_list[0]) # write the results to text with open("results-v2.txt", "w+", encoding="utf-8") as f: for line in preds_list: line = " ".join(line) + "\n" f.write(line) return results
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from sklearn import metrics

y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
          ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
          ['B-PER', 'I-PER', 'O']]

labels = [3, 3, 3, 1, 2, 2, 3, 1, 2, 3]
predictions = [3, 3, 1, 2, 2, 2, 3, 1, 2, 3]
p, r, f, sup = metrics.precision_recall_fscore_support(labels,
                                                       predictions,
                                                       average='macro')

print(precision_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
print(accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))
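# Several evaluation functions in this file also call classification_report
# with mode='strict' and scheme=IOB2; a minimal sketch of that call on the
# y_true / y_pred defined above, assuming seqeval >= 1.x. Strict mode only
# credits entities whose type and boundaries match exactly under the IOB2
# scheme.
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

print(classification_report(y_true, y_pred, mode='strict', scheme=IOB2))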
epochs = 8
model = create_model(number_of_words_total, number_of_tags, len_max,
                     embedding_size, lstm_units, dropout, recurrent_dropout)
history, pred_labels, real_labels = run_model(model, X_train, y_train, X_test,
                                              y_test, indices_to_tag, epochs)

with open("../BiLTSM_CRF.log", "a") as file:
    file.write(
        "\n##############################################################\n\n")
    file.write(f"Dataset: {dataset}\n")
    file.write(
        f"Embedding size: {embedding_size} | Dropout: {dropout} | "
        f"Recurrent dropout: {recurrent_dropout} | Epochs: {epochs} | "
        f"LSTM units: {lstm_units} | Train: train, eng_a | Test: eng_b\n")
    file.write("Accuracy: {:.2%}\n".format(
        accuracy_score(real_labels, pred_labels)))
    file.write("F1-score: {:.2%}\n\n".format(
        f1_score(real_labels, pred_labels)))
    file.write(classification_report(real_labels, pred_labels))
    report = flat_classification_report(y_pred=pred_labels,
                                        y_true=real_labels,
                                        labels=tags_without_O)
    file.write(report)
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""): pred_output_dir = args.output_dir if not os.path.exists(pred_output_dir): os.makedirs(pred_output_dir) test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='test') # Note that DistributedSampler samples randomly test_sampler = SequentialSampler(test_dataset) test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn) # 每次只有一条数据 # Eval logger.info("***** Running prediction %s *****", prefix) logger.info(" Num examples = %d", len(test_dataset)) logger.info(" Batch size = %d", 1) results = [] # 全部测试结果 error_results = [] # 预测错误结果 true_labels = [] # 真实标签 predict_labels = [] # 预测标签 output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt") error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt") pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting") if isinstance(model, torch.nn.DataParallel): # 多GPU训练 model = model.module for step, batch in enumerate(test_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": None, 'input_lens': batch[4] } if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) logits = outputs[0] batch_predict_labels = model.crf.decode(logits, inputs['attention_mask']) batch_predict_labels = batch_predict_labels[0][ 1:-1] # [CLS]XXXX[SEP] 每次只有一条数据 batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1] input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1] sent = "" ifError = False for input_id, pre, lab in zip(input_ids, batch_predict_labels, batch_true_labels): sent += " ".join([ tokenizer.ids_to_tokens[input_id], args.id2label[lab], args.id2label[pre] ]) + "\n" if args.id2label[lab] != args.id2label[pre]: ifError = True sent += "\n" results.append(sent) if ifError: error_results.append(sent) ifError = False pbar(step) # 计算测试集 acc, recall, f1 batch_true = [args.id2label.get(i) for i in batch_true_labels] batch_predict = [args.id2label.get(i) for i in batch_predict_labels] assert len(batch_true) == len(batch_predict) true_labels.append(batch_true) predict_labels.append(batch_predict) logger.info("\n测试集结果统计:") logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels))) logger.info("p: %s", str(precision_score(true_labels, predict_labels))) logger.info("r: %s", str(recall_score(true_labels, predict_labels))) logger.info("f1: %s", str(f1_score(true_labels, predict_labels))) logger.info("classification report: ") logger.info( str( classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2))) logger.info("\n") with open(output_predict_file, "w", encoding="utf-8") as writer: for record in results: writer.write(record) with open(error_predict_file, "w", encoding="utf-8") as writer: for record in error_results: writer.write(record)
for c in cur:
    if c == 0:
        aa.append('O')
    else:
        aa.append(ind2label[c])
# print(aa)
depad_pred.append(aa)

# _ = input("Type something to test this out: ")
print("============================================================")
flat_pred = [item for sublist in depad_pred for item in sublist]
flat_y = [item for sublist in y for item in sublist]
flat_X = [item for sublist in X for item in sublist]
print(flat_pred[:20])
# print(flat_y[:20])
print(flat_X[:20])
pred_idx = [label2ind[c] for c in flat_pred]
y_idx = [label2ind[c] for c in flat_y]
# print(pred_idx[:100])
# print(y_idx[:100])
print(f1_score(flat_y, flat_pred))
print(accuracy_score(flat_y, flat_pred))
print(classification_report(flat_y, flat_pred))
a = input("here we are ....")
def evaluate(args, model, UniDataSet, label_list, task):
    eval_dataset = UniDataSet.load_single_dataset(task, "dev")
    task_id = UniDataSet.task_map[task]
    label_list = UniDataSet.labels_list[task_id]
    if torch.cuda.device_count() > 0:
        eval_batch_size = torch.cuda.device_count() * args.mini_batch_size
    else:
        eval_batch_size = args.per_gpu_eval_batch_size
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)
    logger.info(" *** Running {} evaluation ***".format(task))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", eval_batch_size)
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "task_id": task_id
            }
            outputs = model(**inputs)
            if args.do_alpha:
                alpha = outputs[0]
                outputs = outputs[1:]
            _, logits = outputs[:2]
        nb_eval_steps += 1
        # The label ids are the fourth tensor in each batch, as in the other
        # evaluate() variants in this file.
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = batch[3].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids,
                                      batch[3].detach().cpu().numpy(),
                                      axis=0)
    preds = np.argmax(preds, axis=2)
    label_map = {i: label for i, label in enumerate(label_list)}
    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]
    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != -100:  # -100 marks padded positions
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])
    results = {}
    results["a"] = accuracy_score(out_label_list, preds_list)
    results["p"] = precision_score(out_label_list, preds_list)
    results["r"] = recall_score(out_label_list, preds_list)
    results["f"] = f1_score(out_label_list, preds_list)
    return results
def rcml_main(args): logger.info('KB-ALBERT 중요 정보 추출기 동작') if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processor = NerProcessor() converter = convert_examples_to_features_ner label_list = processor.get_labels(args.data_dir) label_map = {i: label for i, label in enumerate(label_list)} num_labels = len(label_list) tokenizer = KbAlbertCharTokenizer.from_pretrained(args.bert_model_path) train_sen_examples = None eval_sen_examples = None test_sen_examples = None num_train_optimization_steps = None if args.do_train: train_sen_examples = processor.get_train_examples(args.data_dir) eval_sen_examples = processor.get_dev_examples(args.data_dir) train_sen_features = converter(train_sen_examples, label_list, args.max_seq_length, tokenizer) eval_sen_features = converter(eval_sen_examples, label_list, args.max_seq_length, tokenizer) num_train_optimization_steps = int( len(train_sen_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.do_test: if args.do_prototype: test_sen_examples = processor.get_prototype_examples(args.data_dir) else: test_sen_examples = processor.get_test_examples(args.data_dir) test_sen_features = converter(test_sen_examples, label_list, args.max_seq_length, tokenizer) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) config = AlbertConfig.from_pretrained(args.config_file_name, num_labels=num_labels, id2label=label_map) if args.do_train: model = AlbertForTokenClassification.from_pretrained( args.bert_model_path, config=config) elif args.do_test: model = torch.load( os.path.join(args.bert_model_path, args.bert_model_name)) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.do_train: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) ##train_model global_step = 0 if args.do_train: # model.unfreeze_bert_encoder() if len(train_sen_features) == 0: logger.info( "The number of train_features is zero. Please check the tokenization. " ) sys.exit() logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_sen_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_sen_input_ids = torch.tensor( [f.input_ids for f in train_sen_features], dtype=torch.long) train_sen_input_mask = torch.tensor( [f.input_mask for f in train_sen_features], dtype=torch.long) train_sen_segment_ids = torch.tensor( [f.segment_ids for f in train_sen_features], dtype=torch.long) train_sen_label_ids = torch.tensor( [f.label_id for f in train_sen_features], dtype=torch.long) eval_sen_input_ids = torch.tensor( [f.input_ids for f in eval_sen_features], dtype=torch.long) eval_sen_input_mask = torch.tensor( [f.input_mask for f in eval_sen_features], dtype=torch.long) eval_sen_segment_ids = torch.tensor( [f.segment_ids for f in eval_sen_features], dtype=torch.long) eval_sen_label_ids = torch.tensor( [f.label_id for f in eval_sen_features], dtype=torch.long) train_sen_data = TensorDataset(train_sen_input_ids, train_sen_input_mask, train_sen_segment_ids, train_sen_label_ids) eval_sen_data = TensorDataset(eval_sen_input_ids, eval_sen_input_mask, eval_sen_segment_ids, eval_sen_label_ids) train_sen_dataloader = DataLoader( train_sen_data, batch_size=args.train_batch_size, worker_init_fn=lambda _: np.random.seed()) eval_sen_dataloader = DataLoader(eval_sen_data, batch_size=args.train_batch_size) train_loss_values, valid_loss_values = [], [] train_acc, valid_acc = [], [] train_f1, valid_f1 = [], [] for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() total_loss = 0 tr_predicted_labels, tr_target_labels = list(), list() for step, train_sen_batch in enumerate( tqdm(train_sen_dataloader, total=len(train_sen_dataloader), desc="Iteration")): train_sen_batch = tuple(t.to(device) for t in train_sen_batch) sen_input_ids, sen_input_mask, sen_segment_ids, train_sen_label_ids = train_sen_batch output = model(input_ids=sen_input_ids, attention_mask=sen_input_mask, position_ids=None, token_type_ids=sen_segment_ids, labels=train_sen_label_ids) loss = output[0] loss.backward() total_loss += loss.item() logits = output[1].detach().cpu().numpy() label_ids = train_sen_label_ids.to('cpu').numpy() tr_predicted_labels.extend( [list(p) for p in np.argmax(logits, axis=2)]) tr_target_labels.extend(label_ids) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: 
param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 tr_loss = total_loss / len(train_sen_dataloader) train_loss_values.append(tr_loss) tr_pred_tags = [ label_list[p_i] for p, l in zip(tr_predicted_labels, tr_target_labels) for p_i, l_i in zip(p, l) if label_list[l_i] != "PAD" ] tr_target_tags = [ label_list[l_i] for l in tr_target_labels for l_i in l if label_list[l_i] != "PAD" ] acc = accuracy_score(tr_pred_tags, tr_target_tags) f1 = f1_score(tr_pred_tags, tr_target_tags) train_acc.append(acc) train_f1.append(f1) logger.info('') logger.info( '################### epoch ################### : {}'.format( epoch + 1)) logger.info( '################### train loss ###################: {}'. format(tr_loss)) logger.info( '################### train accuracy ###############: {}'. format(acc)) logger.info( '################### train f1 score ###############: {}'. format(f1)) eval_loss = 0 ev_predicted_labels, ev_target_labels = list(), list() for eval_sen_batch in eval_sen_dataloader: eval_sen_batch = tuple(t.to(device) for t in eval_sen_batch) eval_sen_input_ids, eval_sen_input_mask, eval_sen_segment_ids, eval_label_ids = eval_sen_batch with torch.no_grad(): model.eval() output = model(input_ids=eval_sen_input_ids, attention_mask=eval_sen_input_mask, position_ids=None, token_type_ids=eval_sen_segment_ids, labels=eval_label_ids) logits = output[1].detach().cpu().numpy() label_ids = eval_label_ids.to('cpu').numpy() eval_loss += output[0].mean().item() ev_predicted_labels.extend( [list(p) for p in np.argmax(logits, axis=2)]) ev_target_labels.extend(label_ids) ev_loss = eval_loss / len(eval_sen_dataloader) valid_loss_values.append(ev_loss) ev_pred_tags = [ label_list[p_i] for p, l in zip(ev_predicted_labels, ev_target_labels) for p_i, l_i in zip(p, l) if label_list[l_i] != "PAD" ] ev_target_tags = [ label_list[l_i] for l in ev_target_labels for l_i in l if label_list[l_i] != "PAD" ] acc = accuracy_score(ev_pred_tags, ev_target_tags) f1 = f1_score(ev_pred_tags, ev_target_tags) valid_acc.append(acc) valid_f1.append(f1) logger.info('') logger.info( '################### valid loss ###################: {}'. format(ev_loss)) logger.info( '################### valid accuracy ###############: {}'. format(acc)) logger.info( '################### valid f1 score ###############: {}'. 
format(f1)) model_to_save = model.module if hasattr(model, 'module') else model if (epoch + 1) % 5 == 0: torch.save(model_to_save.state_dict(), './model/eval_model/{}_epoch.bin'.format(epoch + 1)) torch.save(model, './model/eval_model/{}_epoch.pt'.format(epoch + 1)) save_training_result = train_loss_values, train_acc, train_f1, valid_loss_values, valid_acc, valid_f1 with open('./output_dir/training_history.pkl', 'wb') as f: pickle.dump(save_training_result, f) if args.do_test: # logger.info("***** Running prediction *****") # logger.info(" Num examples = %d", len(test_sen_examples)) # logger.info(" Batch size = %d", args.eval_batch_size) test_sen_input_ids = torch.tensor( [f.input_ids for f in test_sen_features], dtype=torch.long) test_sen_input_mask = torch.tensor( [f.input_mask for f in test_sen_features], dtype=torch.long) test_sen_segment_ids = torch.tensor( [f.segment_ids for f in test_sen_features], dtype=torch.long) test_sen_label_ids = torch.tensor( [f.label_id for f in test_sen_features], dtype=torch.long) test_sen_data = TensorDataset(test_sen_input_ids, test_sen_input_mask, test_sen_segment_ids, test_sen_label_ids) # Run prediction for full data test_sen_dataloader = DataLoader(test_sen_data, batch_size=args.eval_batch_size) all_labels = None te_predicted_labels, te_target_labels = list(), list() for test_sen_batch in tqdm(test_sen_dataloader, total=len(test_sen_dataloader), desc='Prediction'): test_sen_batch = tuple(t.to(device) for t in test_sen_batch) test_sen_input_ids, test_sen_input_mask, test_sen_segment_ids, test_label_ids = test_sen_batch with torch.no_grad(): model.eval() output = model(input_ids=test_sen_input_ids, attention_mask=test_sen_input_mask, position_ids=None, token_type_ids=test_sen_segment_ids) logits = output[0].detach().cpu().numpy() label_ids = test_label_ids.to('cpu').numpy() te_predicted_labels.extend( [list(p) for p in np.argmax(logits, axis=2)]) te_target_labels.extend(label_ids) te_pred_tags = [ label_list[p_i] for p, l in zip(te_predicted_labels, te_target_labels) for p_i, l_i in zip(p, l) if label_list[l_i] != "PAD" ] te_target_tags = [ label_list[l_i] for l in te_target_labels for l_i in l if label_list[l_i] != "PAD" ] if all_labels is None: all_labels = label_ids else: all_labels = np.concatenate((all_labels, label_ids), axis=0) acc = accuracy_score(te_pred_tags, te_target_tags) f1 = f1_score(te_pred_tags, te_target_tags) # logger.info('################### test accuracy ###############: {}'.format(acc)) # logger.info('################### test f1 score ###############: {}'.format(f1)) # tokenized_testcase = [[tokenizer.tokenize(str(j)) for j in input_example.text_a] for input_example in test_sen_examples] tokenized_testcase = [ tokenizer.tokenize(str(i.text_a)) for i in test_sen_examples ] # input_data = [{'id': input_example.guid, 'text': input_example.text_a} for input_example in test_sen_examples] real_text = pd.DataFrame(tokenized_testcase) pred_text = pd.DataFrame(te_predicted_labels) pred_text.to_excel('./output_dir/output_ner_pred.xlsx') real_text.to_excel('./output_dir/output_ner_tokenized.xlsx')
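# A minimal follow-up sketch (not part of the original script) for inspecting the training
# history pickled above; it assumes the tuple layout written to
# ./output_dir/training_history.pkl and that matplotlib is installed.
import pickle
import matplotlib.pyplot as plt

with open('./output_dir/training_history.pkl', 'rb') as f:
    (train_loss_values, train_acc, train_f1,
     valid_loss_values, valid_acc, valid_f1) = pickle.load(f)

epochs = range(1, len(train_loss_values) + 1)
plt.plot(epochs, train_loss_values, label='train loss')
plt.plot(epochs, valid_loss_values, label='valid loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.savefig('./output_dir/loss_curve.png')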
def main():
    # Imports assumed by this snippet; seqeval is the likely metrics backend, since the
    # scores below are computed over lists of BIO tag sequences rather than flat arrays.
    import argparse
    from seqeval import metrics

    parser = argparse.ArgumentParser()
    parser.add_argument("--true_data_file", default=None, type=str, required=True,
                        help="The data file containing the true labels.")
    parser.add_argument("--pred_data_file", default=None, type=str, required=True,
                        help="The data file containing the predicted labels.")
    args = parser.parse_args()

    true_label = []
    words = []
    pred_label = []
    true_label_set = set()
    pred_label_set = set()

    sentence = []
    label = []
    with open(args.true_data_file, 'r') as true_file:
        for line in true_file:
            line = line.strip()
            if line:
                word, tag = line.split()
                if tag != 'O':
                    true_label_set.add(tag[2:])
                    tag = tag.replace('---', '+')
                else:
                    true_label_set.add(tag)
                sentence.append(word)
                label.append(tag)
            else:
                true_label.append(label)
                label = []
    if label:  # flush the last sentence if the file has no trailing blank line
        true_label.append(label)
        label = []
    print(len(true_label))

    sentence = []
    prev_raw_tag = None
    with open(args.pred_data_file, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if line:
                word, tag = line.split()
                pred_label_set.add(tag)
                if tag != 'O':
                    tag = tag.replace('---', '+')
                    # Compare against the previous *raw* tag: the stored label already
                    # carries its B-/I- prefix, so comparing against label[-1] (as the
                    # original did) would always differ and mark every token as B-.
                    if prev_raw_tag != tag:
                        prev_raw_tag = tag
                        tag = 'B-' + tag
                    else:
                        tag = 'I-' + tag
                else:
                    prev_raw_tag = None
                sentence.append(word)
                label.append(tag)
            else:
                pred_label.append(label)
                label = []
                prev_raw_tag = None
    if label:
        pred_label.append(label)

    assert len(true_label) == len(pred_label)
    print('f1: %f' % (metrics.f1_score(true_label, pred_label)))
    print('precision: %f' % (metrics.precision_score(true_label, pred_label)))
    print('recall: %f' % (metrics.recall_score(true_label, pred_label)))
    print('acc: %f' % (metrics.accuracy_score(true_label, pred_label)))
    print(metrics.classification_report(true_label, pred_label))
    print('true set: ', true_label_set)
    print('pred set: ', pred_label_set)
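# A small, self-contained illustration (hypothetical data) of the BIO prefixing done in
# main() above: consecutive identical raw tags become a B-/I- span, and 'O' breaks a span.
raw_tags = ['O', 'PER', 'PER', 'O', 'ORG']
bio_tags, prev = [], None
for t in raw_tags:
    if t == 'O':
        bio_tags.append('O')
        prev = None
    elif t != prev:
        bio_tags.append('B-' + t)
        prev = t
    else:
        bio_tags.append('I-' + t)
print(bio_tags)  # ['O', 'B-PER', 'I-PER', 'O', 'B-ORG']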
predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) true_labels.extend(label_ids) eval_loss = eval_loss / len(valid_dataloader) validation_loss_values.append(eval_loss) print("Validation loss: {}".format(eval_loss)) pred_tags = [ tag_values[p_i] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD" ] valid_tags = [ tag_values[l_i] for l in true_labels for l_i in l if tag_values[l_i] != "PAD" ] print("Validation Accuracy: {}".format( accuracy_score(pred_tags, valid_tags))) #print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags))) print() # save the model to disk import joblib filename = 'finalized_model.sav' joblib.dump(model, filename) model = joblib.load(filename) model.to(device) test_sentence = """ Ousted WeWork founder Adam Neumann lists his Manhattan penthouse for $37.5 million. """ tokenized_sentence = tokenizer.encode(test_sentence) input_ids = torch.tensor([tokenized_sentence]).cuda() with torch.no_grad(): output = model(input_ids)
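# A hedged sketch of how the raw output above could be decoded back into tags. It assumes
# output[0] holds the token-classification logits (tuple-style return of older transformers
# releases when no labels are passed) and reuses the tag_values and tokenizer objects from
# this snippet; WordPiece pieces starting with "##" are merged back onto the previous token.
label_indices = np.argmax(output[0].detach().cpu().numpy(), axis=2)
tokens = tokenizer.convert_ids_to_tokens(input_ids.cpu().numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))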
def ensemble(models, eval_examples, eval_dataset, step, args): device = args.device logger.info("Predicting...") logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.predict_batch_size) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_dataset) else: eval_sampler = DistributedSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.predict_batch_size) for model in models: model.eval() all_labels = [] all_predictions = [] logger.info("Start evaluating") for input_ids, input_mask, labels in tqdm(eval_dataloader, desc="Evaluating", disable=None): if len(all_predictions) % 1000 == 0: logger.info("Processing example: %d" % (len(all_predictions))) input_ids = input_ids.to(device) lengths = input_mask.sum(dim=-1) # batch_size input_mask = input_mask.to(device) with torch.no_grad(): logits_list = [model(input_ids, input_mask)[0] for model in models] logits = sum(logits_list) / len(logits_list) predictions = logits.argmax(dim=-1) #batch_size * length for i in range(len(labels)): length = lengths[i] eval_label = [ id2label_dict[k] for k in labels[i][1:length - 1].tolist() ] eval_prediction = [ id2label_dict[k] for k in predictions[i].cpu()[1:length - 1].tolist() ] assert len(eval_label) == len(eval_prediction) all_labels.append(eval_label) all_predictions.append(eval_prediction) for model in models: model.train() #eval f1 = f1_score(all_labels, all_predictions) * 100 precision = precision_score(all_labels, all_predictions) * 100 accuracy = accuracy_score(all_labels, all_predictions) * 100 report = classification_report(all_labels, all_predictions) logger.info("Eval results:") logger.info(f"\nF1 : {f1:.3f}\nP : {precision:.3f}\nAcc: {accuracy:.3f}") logger.info(f"\n{report}") output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: writer.write( f"Step: {step}\nF1: {f1:.3f}\nP: {precision:.3f}\nAcc: {accuracy:.3f}" ) logger.info("Write predictions...") output_prediction_file = os.path.join(args.output_dir, "predictions_%d.json" % step) write_predictions(eval_examples, all_labels, all_predictions, output_prediction_file)
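# A tiny, standalone illustration of the ensembling step above: per-token logits from
# several models are averaged before the argmax, i.e. a uniform soft vote over the models.
# Shapes and values here are made up.
import torch

logits_list = [
    torch.tensor([[[2.0, 0.5], [0.1, 1.5]]]),  # model 1: batch=1, length=2, num_labels=2
    torch.tensor([[[0.5, 1.0], [0.3, 0.9]]]),  # model 2
]
avg_logits = sum(logits_list) / len(logits_list)
predictions = avg_logits.argmax(dim=-1)
print(predictions)  # tensor([[0, 1]])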
#Performance Measures-----------------> #BOOK1 print("entity_predictions") entity_pred = [] for X in B_2[6350:6750]: if X.ent_type_ == "GPE" or X.ent_type_ == "PERSON" or X.ent_type_ == "ORG"or X.ent_type_ == "FAC" or X.ent_type_ == "LOC": entity_pred.append('B-'+X.ent_type_) elif X.ent_type_=="": entity_pred.append('O') print(entity_pred) entity_pred = [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-PERSON', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O'], ['B-FAC', 'I-FAC', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O','O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON','O', 'O'], ['B-PERSON', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O']] entity_true = [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-PERSON','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O'], ['B-PERSON', 'O', 'O','O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON','I-PERSON', 'O', 'O'], ['B-PERSON', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O']] f1_score(entity_true, entity_pred) accuracy_score(entity_true, entity_pred) #BOOK2 entity_pred2 = [] for X in S2[7050:7500]: if X.ent_type_ == "GPE" or X.ent_type_ == "PERSON" or X.ent_type_ == "ORG" or X.ent_type_ == "FAC" or X.ent_type_ == "LOC": entity_pred2.append(X.ent_type_) elif X.ent_type_=="": entity_pred2.append('O') print(entity_pred2) entity_pred2 =['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'GPE', 'GPE', 'O', 'O', 'O', 'O', 'O', 
'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] entity_pred2 = [['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O','O'], ['B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'O', 'O'], ['B-ORG', 'I-ORG','I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-GPE', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'O', 'O'], ['B-PERSON','I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O'], ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O']] entity_true2 = [['O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON','I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O','O', 'O'], ['B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'I-PERSON', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'O', 'O'],['B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON','I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O'], ['B-GPE', 'O', 'O', 'O', 'O'], ['B-PERSON','I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['B-LOC', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON','I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON','I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O']] f1_score(entity_true2, entity_pred2) accuracy_score(entity_true2, entity_pred2)
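# A hedged sketch of how BIO tags could be derived directly from spaCy tokens, assuming
# B_2 / S2 above are iterables of spaCy Token objects. Token.ent_iob_ already distinguishes
# B/I/O, so the prefix does not have to be reconstructed by hand as in the loops above;
# spacy_bio_tags is a hypothetical helper name, not part of the original code.
KEEP_TYPES = {"GPE", "PERSON", "ORG", "FAC", "LOC"}

def spacy_bio_tags(tokens, keep=KEEP_TYPES):
    tags = []
    for tok in tokens:
        if tok.ent_iob_ == "O" or tok.ent_type_ not in keep:
            tags.append("O")
        else:
            tags.append("{}-{}".format(tok.ent_iob_, tok.ent_type_))
    return tags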
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix="", is_test=False): eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, data_file=mode, is_test=is_test) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation %s *****", prefix) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet"] else None ) # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] if args.n_gpu > 1: tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=1) label_map = {i: label for i, label in enumerate(labels)} out_label_list = [] preds_list = [] # import ipdb; ipdb.set_trace() for i in range(out_label_ids.shape[0]): if out_label_ids[i] != pad_token_label_id: out_label_list.append(label_map[out_label_ids[i]]) preds_list.append(label_map[preds[i]]) results = { "loss": eval_loss, "accuracy": accuracy_score(out_label_list, preds_list), } logger.info("***** Eval results %s *****", prefix) for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) return results, preds_list
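# A brief, made-up illustration of why this evaluate() takes argmax over axis=1 while the
# token-level variant below uses axis=2: here the logits are one row per sequence
# (batch, num_labels); there they are one row per token (batch, seq_len, num_labels).
import numpy as np

sentence_logits = np.array([[0.2, 1.3], [2.1, 0.4]])   # (batch=2, num_labels=2)
token_logits = np.array([[[0.2, 1.3], [2.1, 0.4]]])    # (batch=1, seq_len=2, num_labels=2)
print(np.argmax(sentence_logits, axis=1))  # one label per sentence -> [1 0]
print(np.argmax(token_logits, axis=2))     # one label per token    -> [[1 0]]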
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""): eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler( eval_dataset) if args.local_rank == -1 else DistributedSampler( eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # Eval! logger.info("***** Running evaluation %s *****", prefix) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None, # XLM and RoBERTa don"t use segment_ids "labels": batch[3] } outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) results = { "loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), "accuracy": accuracy_score(out_label_list, preds_list) } logger.info("***** Eval results %s *****", prefix) for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) return results, preds_list
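# A small, made-up example of the padded-label filtering used in evaluate() above:
# positions equal to pad_token_label_id are dropped, so each row collapses to the tags of
# its real (non-padding) tokens before the seqeval metrics are computed.
import numpy as np

pad_token_label_id = -100
label_map = {0: "O", 1: "B-PER", 2: "I-PER"}
out_label_ids = np.array([[1, 2, 0, -100, -100],
                          [0, 1, -100, -100, -100]])
preds = np.array([[1, 2, 0, 0, 0],
                  [0, 0, 1, 2, 0]])

out_label_list = [[] for _ in range(out_label_ids.shape[0])]
preds_list = [[] for _ in range(out_label_ids.shape[0])]
for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != pad_token_label_id:
            out_label_list[i].append(label_map[out_label_ids[i, j]])
            preds_list[i].append(label_map[preds[i, j]])
print(out_label_list)  # [['B-PER', 'I-PER', 'O'], ['O', 'B-PER']]
print(preds_list)      # [['B-PER', 'I-PER', 'O'], ['O', 'O']]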
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list,
                                           pad_token_label_id, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1,
                                 collate_fn=collate_fn)

    # Collect the span label set (entity types without their B-/I- prefixes)
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info(" Num examples = %d", len(test_dataset))
    logger.info(" Batch size = %d", 1)
    results = []          # all test predictions
    error_results = []    # examples with at least one wrong prediction
    true_labels = []      # gold labels
    predict_labels = []   # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, torch.nn.DataParallel):  # unwrap the multi-GPU wrapper
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        tmp_eval_loss, start_logits, end_logits = outputs[:3]
        start_preds = start_logits.detach().cpu().numpy()
        end_preds = end_logits.detach().cpu().numpy()
        start_preds = np.argmax(start_preds, axis=2)
        end_preds = np.argmax(end_preds, axis=2)
        start_preds_list = [span_map[j] for j in start_preds[0][1:-1]]
        end_preds_list = [span_map[j] for j in end_preds[0][1:-1]]
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        batch_true_labels = [args.id2label.get(i) for i in batch_true_labels]
        true_labels.append(batch_true_labels)
        batch_predict_labels = convert_span_to_bio([start_preds_list], [end_preds_list])
        predict_labels.extend(batch_predict_labels)
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]
        sent = ""
        ifError = False
        for input_id, pre, lab in zip(input_ids, batch_predict_labels[0], batch_true_labels):
            sent += " ".join([tokenizer.ids_to_tokens[input_id], lab, pre]) + "\n"
            if lab != pre:
                ifError = True
        sent += "\n"
        results.append(sent)
        if ifError:
            error_results.append(sent)
            ifError = False
        pbar(step)

    # Compute test-set accuracy, precision, recall and F1
    logger.info("\nTest set results:")
    logger.info("accuracy: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")
    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)
    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
# (fragment: body of the evaluation loops; i indexes sentences, j indexes tokens,
#  m is the attention-mask value at position j)
# Mask = 0 means it's a pad token, so don't compare it
if m:
    # Exclude the X, [CLS] and [SEP] labels
    if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]":
        temp_1.append(tag2name[label_ids[i][j]])
        temp_2.append(tag2name[logits[i][j]])
else:
    break
y_true.append(temp_1)
y_pred.append(temp_2)

print("f1 score: %f" % (f1_score(y_true, y_pred)))
print("Accuracy score: %f" % (accuracy_score(y_true, y_pred)))
# Get the acc, recall, F1 result report
report = classification_report(y_true, y_pred, digits=4)

bert_out_address = 'bert'
# Save the report into a file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s" % (report))
    print("f1 score: %f" % (f1_score(y_true, y_pred)))
    print("Accuracy score: %f" % (accuracy_score(y_true, y_pred)))
    writer.write("f1 score:\n")
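# A hedged, standalone example of the metric calls used above, assuming they come from
# seqeval (they are applied to lists of tag sequences, not flat label arrays): f1_score is
# entity-level (exact span match), while accuracy_score is token-level.
from seqeval.metrics import accuracy_score, classification_report, f1_score

y_true = [["O", "B-PER", "I-PER", "O", "B-ORG"]]
y_pred = [["O", "B-PER", "O", "O", "B-ORG"]]

print("f1 score: %f" % f1_score(y_true, y_pred))              # 0.5: one of the two entities matches exactly
print("Accuracy score: %f" % accuracy_score(y_true, y_pred))  # 0.8: 4 of 5 tokens are correct
print(classification_report(y_true, y_pred, digits=4))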