def __call__(self, inputs, outputs):
    words, label_gts, label_preds = normalized_words_labels_preds(
        inputs, outputs, self.tokenizer)
    # seqeval metrics expect (y_true, y_pred), so pass the gold labels first.
    acc = accuracy_score(label_gts, label_preds)
    f1 = f1_score(label_gts, label_preds)
    precision = precision_score(label_gts, label_preds)
    recall = recall_score(label_gts, label_preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
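# A minimal usage sketch (not from the original sources; tags and values are
# illustrative only). It shows the input format the functions in this
# collection assume: lists of tag sequences, passed to seqeval in
# (y_true, y_pred) order.
from seqeval.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]

print(precision_score(y_true, y_pred))  # 1.0  (the one predicted entity is correct)
print(recall_score(y_true, y_pred))     # 0.5  (one of two gold entities was found)
print(f1_score(y_true, y_pred))         # ~0.667
print(accuracy_score(y_true, y_pred))   # 0.8  (4 of 5 tags match)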
def acc_and_f1(preds, labels):
    acc = accuracy_score(y_true=labels, y_pred=preds)
    f1 = f1_score(y_true=labels, y_pred=preds)
    p = precision_score(y_true=labels, y_pred=preds)
    r = recall_score(y_true=labels, y_pred=preds)
    report = classification_report(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
        "precision": p,
        "recall": r,
        # "report": report,
    }
def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
    """Reduces aggregated logs over validation steps."""
    label_class = aggregated_logs['label_class']
    predict_class = aggregated_logs['predict_class']
    return {
        'f1': seqeval_metrics.f1_score(label_class, predict_class),
        'precision': seqeval_metrics.precision_score(label_class, predict_class),
        'recall': seqeval_metrics.recall_score(label_class, predict_class),
        'accuracy': seqeval_metrics.accuracy_score(label_class, predict_class),
    }
def results_as_entities(out_label_list, preds_list, results_abs_path):
    results = {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
        "accuracy": accuracy_score(out_label_list, preds_list),
        "report": classification_report(out_label_list, preds_list, digits=2)
    }
    with open(results_abs_path, "w") as writer:
        for key in sorted(results.keys()):
            writer.write("{} = {}\n".format(key, str(results[key])))
    return results
def infer(self):
    num_test_batch = int(self.test_size / self.batch_size) + 1
    test_feed_dict = {
        self.char_input: self.test_char_idx,
        self.seq_len: self.test_len,
        self.label: self.test_Y
    }
    modelpath = f"./ner-model-{self.test_type}/"
    modelName = f"ner-{self.test_type}.ckpt"
    best_f1_score = 0.
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        ## Load pretrained model
        ckpt = tf.train.get_checkpoint_state(modelpath)
        if (ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)):
            saver.restore(sess, modelpath + modelName)
            print("Model loaded!")
        start_time = datetime.datetime.now()
        test_f1_list, test_recall_list, test_precision_list = [], [], []
        print(self.data.label2idx)
        sess.run(self.test_init_op, feed_dict=test_feed_dict)
        y_true_list, y_pred_list, test_loss = [], [], 0.
        for step in range(num_test_batch):
            loss = sess.run(self.loss, feed_dict={self.dropout: 0.0})
            y_true, y_pred = self.predict(sess)
            y_true_list += y_true
            y_pred_list += y_pred
            test_loss += loss / num_test_batch
        test_f1_score = f1_score(y_true_list, y_pred_list)
        test_recall_score = recall_score(y_true_list, y_pred_list)
        test_precision_score = precision_score(y_true_list, y_pred_list)
        print("[test] loss {:.6f} f1 {:.4f} recall {:.4f} precision {:.4f}".
              format(test_loss, test_f1_score, test_recall_score,
                     test_precision_score))
        print(classification_report(y_true_list, y_pred_list, digits=4))
        elapsed_time = datetime.datetime.now() - start_time
        print("{}".format(elapsed_time))
def eval_seq_scores(y_true, y_pred):
    """Performs sequence evaluation on slot labels.

    Args:
        y_true: true slot labels
        y_pred: predicted slot labels

    Returns:
        scores: dict containing the evaluation scores: f1, accuracy, precision, recall
    """
    scores = dict()
    scores['f1'] = f1_score(y_true, y_pred)
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['precision'] = precision_score(y_true, y_pred)
    scores['recall'] = recall_score(y_true, y_pred)
    return scores
def metrics_fn(logits, labels):
    preds = np.argmax(logits, axis=-1)
    label_names = self.get_labels()
    y_true = []
    y_pred = []
    for pred, label in zip(preds, labels):
        y_true.append([label_names[l] for l in label if l >= 0])
        y_pred.append(
            [label_names[p] for p, l in zip(pred, label) if l >= 0])
    return OrderedDict(
        accuracy=seq_metrics.accuracy_score(y_true, y_pred),
        f1=seq_metrics.f1_score(y_true, y_pred),
        precision=seq_metrics.precision_score(y_true, y_pred),
        recall=seq_metrics.recall_score(y_true, y_pred))
def calculate_token_class_metrics(pred_toks, targ_toks, metric_key):
    if metric_key == "accuracy":
        return seq_metrics.accuracy_score(targ_toks, pred_toks)
    if metric_key == "precision":
        return seq_metrics.precision_score(targ_toks, pred_toks)
    if metric_key == "recall":
        return seq_metrics.recall_score(targ_toks, pred_toks)
    if metric_key == "f1":
        return seq_metrics.f1_score(targ_toks, pred_toks)
    if metric_key == "classification_report":
        return seq_metrics.classification_report(targ_toks, pred_toks)
def compute_f1(predictions, correct, idx2Label):
    label_pred = []
    for sentence in predictions:
        label_pred.append([idx2Label[element] for element in sentence])
    label_correct = []
    for sentence in correct:
        label_correct.append([idx2Label[element] for element in sentence])
    print(classification_report(label_correct, label_pred))
    # print("predictions ", len(label_pred))
    # print("correct labels ", len(label_correct))
    return (precision_score(label_correct, label_pred),
            recall_score(label_correct, label_pred),
            f1_score(label_correct, label_pred))
def score(self, y_true, y_pred):
    """Calculate sequence-labelling scores.

    Args:
        y_true (list): true sequences.
        y_pred (list): predicted sequences.

    Returns:
        tuple of (f1, precision, recall).
    """
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    if self._digits:
        self._logger.info(
            classification_report(y_true, y_pred, digits=self._digits))
    return f1, precision, recall
def model_evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    label_map = {i: label for i, label in enumerate(config.class_list)}
    criterion = FocalLoss(gamma=2, alpha=1)
    with torch.no_grad():
        for i, (input_ids, attention_mask, token_type_ids,
                labels) in enumerate(data_iter):
            input_ids = torch.tensor(input_ids).type(
                torch.LongTensor).to(config.device)
            attention_mask = torch.tensor(attention_mask).type(
                torch.LongTensor).to(config.device)
            token_type_ids = torch.tensor(token_type_ids).type(
                torch.LongTensor).to(config.device)
            labels = torch.tensor(labels).type(
                torch.LongTensor).to(config.device)
            outputs = model(input_ids, attention_mask, token_type_ids)
            active_loss = attention_mask.view(-1) == 1
            active_logits = outputs.view(-1, config.num_labels)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            # loss = F.cross_entropy(active_logits, active_labels)
            loss = criterion(active_logits, active_labels)
            loss_total += loss
            active_labels = active_labels.data.cpu().numpy()
            predic = torch.max(active_logits.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, active_labels)
            predict_all = np.append(predict_all, predic)
    true_label = [label_map[key] for key in labels_all]
    predict_label = [label_map[key] for key in predict_all]
    acc = metrics.accuracy_score(labels_all, predict_all)
    precision = precision_score(true_label, predict_label)
    recall = recall_score(true_label, predict_label)
    f1 = f1_score(true_label, predict_label)
    if test:
        report = classification_report(true_label, predict_label, digits=4)
        confusion = metrics.confusion_matrix(true_label, predict_label)
        return (acc, precision, recall, f1, loss_total / len(data_iter),
                report, confusion)
    return acc, precision, recall, f1, loss_total / len(data_iter)
def get_metric(
    self, reset: bool
) -> Union[float, Tuple[float, ...], Dict[str, float], Dict[str, List[float]]]:
    if not reset:
        return dict()
    if not self._predictions:
        return dict()
    metrics = {
        'seqeval_precision': precision_score(self._gold_labels,
                                             self._predictions),
        'seqeval_recall': recall_score(self._gold_labels, self._predictions),
        'seqeval_f1_score': f1_score(self._gold_labels, self._predictions)
    }
    self.reset()
    return metrics
def test_by_ground_truth(self):
    with open(self.file_name) as f:
        output = subprocess.check_output(['perl', 'conlleval.pl'],
                                         stdin=f).decode('utf-8')
    acc_true, p_true, r_true, f1_true = self.parse_conlleval_output(output)
    acc_pred = accuracy_score(self.y_true, self.y_pred)
    p_pred = precision_score(self.y_true, self.y_pred)
    r_pred = recall_score(self.y_true, self.y_pred)
    f1_pred = f1_score(self.y_true, self.y_pred)
    self.assertLess(abs(acc_pred - acc_true), 1e-4)
    self.assertLess(abs(p_pred - p_true), 1e-4)
    self.assertLess(abs(r_pred - r_true), 1e-4)
    self.assertLess(abs(f1_pred - f1_true), 1e-4)
def calculate_report(self, y, y_, transform=True):
    '''
    calculating F1, P, R
    :param y: golden label, list
    :param y_: model output, list
    :return:
    '''
    if transform:
        for i in range(len(y)):
            for j in range(len(y[i])):
                y[i][j] = self.voc_i2s[y[i][j]]
        for i in range(len(y_)):
            for j in range(len(y_[i])):
                y_[i][j] = self.voc_i2s[y_[i][j]]
    return precision_score(y, y_), recall_score(y, y_), f1_score(y, y_)
def pos_evaluate_word_PRF(y_pred, y):
    # dict = {'E': 2, 'S': 3, 'B': 0, 'I': 1}
    y_word = []
    y_pos = []
    y_pred_word = []
    y_pred_pos = []
    for y_label, y_pred_label in zip(y, y_pred):
        y_word.append(y_label[0])
        y_pos.append(y_label[2:])
        y_pred_word.append(y_pred_label[0])
        y_pred_pos.append(y_pred_label[2:])
    word_cor_num = 0
    pos_cor_num = 0
    yp_wordnum = y_pred_word.count('E') + y_pred_word.count('S')
    yt_wordnum = y_word.count('E') + y_word.count('S')
    start = 0
    for i in range(len(y_word)):
        if y_word[i] == 'E' or y_word[i] == 'S':
            word_flag = True
            pos_flag = True
            for j in range(start, i + 1):
                if y_word[j] != y_pred_word[j]:
                    word_flag = False
                    pos_flag = False
                    break
                if y_pos[j] != y_pred_pos[j]:
                    pos_flag = False
            if word_flag:
                word_cor_num += 1
            if pos_flag:
                pos_cor_num += 1
            start = i + 1
    wP = word_cor_num / float(yp_wordnum) if yp_wordnum > 0 else -1
    wR = word_cor_num / float(yt_wordnum) if yt_wordnum > 0 else -1
    wF = 2 * wP * wR / (wP + wR)
    # pP = pos_cor_num / float(yp_wordnum) if yp_wordnum > 0 else -1
    # pR = pos_cor_num / float(yt_wordnum) if yt_wordnum > 0 else -1
    # pF = 2 * pP * pR / (pP + pR)
    pP = precision_score(y, y_pred)
    pR = recall_score(y, y_pred)
    pF = f1_score(y, y_pred)
    return (wP, wR, wF), (pP, pR, pF)
def get_results(self, y_true, y_pred):
    result_med = collections.namedtuple(
        'result_med',
        ['precision', 'recall', 'f1_score', 'accuracy'])
    # Instantiate the namedtuple with the computed scores.
    return result_med(
        precision=precision_score(y_true, y_pred),
        recall=recall_score(y_true, y_pred),
        f1_score=f1_score(y_true, y_pred),
        accuracy=accuracy_score(y_true, y_pred))
def run(filename):
    print('------------------ %s ---------------' % filename)
    with open(filename, 'r') as f:
        obj = json.load(f)
    y_true = obj['gold_results']
    y_pred = obj['pred_results']
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    df = pd.DataFrame({'precision': [pre], 'recall': [rec], 'f1': [f1]})
    print(df)
    report = classification_report(y_true, y_pred)
    print(report)
def on_epoch_end(self, epoch, logs=None):
    pred_probs = self.model.predict(self.valid_features)
    if self.preprocessor.use_bert:
        pred_probs = pred_probs[:, 1:-1, :]  # remove [CLS] and [SEP]
    y_pred = self.preprocessor.label_decode(pred_probs,
                                            self.get_lengths(pred_probs))
    r = metrics.recall_score(self.valid_labels, y_pred)
    p = metrics.precision_score(self.valid_labels, y_pred)
    f1 = metrics.f1_score(self.valid_labels, y_pred)
    logs['val_r'] = r
    logs['val_p'] = p
    logs['val_f1'] = f1
    print('Epoch {}: val_r: {}, val_p: {}, val_f1: {}'.format(
        epoch + 1, r, p, f1))
    print(metrics.classification_report(self.valid_labels, y_pred))
def compute_metrics(self) -> Dict:
    # TODO: Oddly, when zero_division is not passed (i.e. when the default
    # value 0 is used), this computation is much slower; needs further analysis.
    logger.info(
        f"\n{classification_report(self.out_label_list, self.preds_list, zero_division=0)}"
    )
    return {
        "precision": precision_score(self.out_label_list, self.preds_list,
                                     zero_division=0),
        "recall": recall_score(self.out_label_list, self.preds_list,
                               zero_division=0),
        "f1": f1_score(self.out_label_list, self.preds_list, zero_division=0),
    }
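# Hedged sketch (made-up tags), assuming a seqeval version that accepts the
# zero_division argument, as the method above does: zero_division=0 returns
# 0.0 instead of emitting a warning when there are no predicted entities.
from seqeval.metrics import precision_score

gold = [['B-PER', 'O']]
pred = [['O', 'O']]  # no predicted entities
print(precision_score(gold, pred, zero_division=0))  # 0.0, no division warning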
def evaluate_results(net, test_loader, pad_id, cuda):
    logger.info("Evaluating test samples...")
    acc = 0
    out_labels = []
    true_labels = []
    net.eval()
    with torch.no_grad():
        for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
            x, e1_e2_start, labels, _, _, _ = data
            attention_mask = (x != pad_id).float()
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()
            if cuda:
                x = x.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()
            classification_logits = net(
                x,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                Q=None,
                e1_e2_start=e1_e2_start,
            )
            accuracy, (o, l) = evaluate_(classification_logits, labels,
                                         ignore_idx=-1)
            out_labels.append([str(i) for i in o])
            true_labels.append([str(i) for i in l])
            acc += accuracy
    accuracy = acc / (i + 1)
    results = {
        "accuracy": accuracy,
        "precision": precision_score(true_labels, out_labels),
        "recall": recall_score(true_labels, out_labels),
        "f1": f1_score(true_labels, out_labels),
    }
    logger.info("***** Eval results *****")
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))
    return results
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
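# Hedged illustration (assumed shapes and a toy label_list, not from the
# original sources) of how the -100 masking above converts padded id matrices
# into the list-of-lists form that seqeval consumes.
import numpy as np

label_list = ['O', 'B-PER', 'I-PER']
logits = np.array([[[2.0, 0.1, 0.1], [0.1, 3.0, 0.2], [0.3, 0.2, 0.1]]])  # (1, 3, 3)
labels = np.array([[0, 1, -100]])  # -100 marks a special or padded token

preds = np.argmax(logits, axis=2)
true_predictions = [[label_list[p] for (p, l) in zip(pred, lab) if l != -100]
                    for pred, lab in zip(preds, labels)]
true_labels = [[label_list[l] for (p, l) in zip(pred, lab) if l != -100]
               for pred, lab in zip(preds, labels)]
print(true_predictions)  # [['O', 'B-PER']]
print(true_labels)       # [['O', 'B-PER']]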
def get_metrics(pred_list: list, label_list: list) -> dict:
    """Compute evaluation metrics for sequence labelling.

    :param pred_list: list, label sequences predicted by the model
    :param label_list: list, gold (human-annotated) label sequences
    :return: dict containing the evaluation metrics
    """
    if len(pred_list) == 0:
        return {
            "accuracy": 0,
            "precision": 0,
            "recall": 0,
            "f1": 0
        }
    return {
        "accuracy": "{:.5f}".format(metrics.accuracy_score(label_list, pred_list)),
        "precision": "{:.5f}".format(metrics.precision_score(label_list, pred_list)),
        "recall": "{:.5f}".format(metrics.recall_score(label_list, pred_list)),
        "f1": "{:.5f}".format(metrics.f1_score(label_list, pred_list))
    }
def compute_metrics(predictions, label_ids, label_map):
    preds = np.argmax(predictions, axis=-1)
    batch_size, seq_len = preds.shape
    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]
    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
def compute(self, p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
        [self.id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [self.id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }
def evaluate_nn(gold_labels, predictions, id2label):
    """
    Parameters
    ----------
    gold_labels : numpy.ndarray
        Gold labels for all tokens (or sentences if seq2label)
    predictions : numpy.ndarray
        Respective predicted labels for all tokens (or sentences if seq2label)
    id2label : dict
        Dictionary that maps from ids to real labels.

    Returns
    -------
    acc : float
        Accuracy of predicted labels.
    f1_e : float
        F1 score of predicted labels, adapted for the BIO scheme (seqeval module)
    """
    acc = (predictions == gold_labels).sum() / len(predictions)  # token level
    # precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    #     gold_labels, predictions, average='micro')

    # seqeval needs IOB2 labels, convert ids back to label strings
    preds = [id2label[x] for x in predictions]
    golds = [id2label[x] for x in gold_labels]

    # entity level
    precision_e = precision_score(golds, preds)
    recall_e = recall_score(golds, preds)
    f1_e = f1_score(golds, preds)

    print(' \t Accuracy: {:.3f}'.format(acc))
    # print(' sklearn \t Prec: {:.2f} \t Recall: {:.2f} \t F1: {:.2f}'.format(
    #     precision, recall, f1))
    print(' \t Precision: {:.2f} \t Recall: {:.2f} \t F1: {:.2f}'.format(
        precision_e, recall_e, f1_e))
    return acc, f1_e
def xzk_eval_model(self, dataloader, name=None):
    with torch.no_grad():
        metrics = np.asarray([0, 0, 0], dtype=int)
        all_true_y_label = list()
        all_pred_y_label = list()
        print("testing")
        for batch, data in tqdm(enumerate(dataloader)):
            insts = data[-1]
            data = [x.to(self.device) for x in data[0:-1]]
            (token_id_seq, data_length, char_seq_tensor, char_seq_len, masks,
             label_seq) = data
            sequence_loss, logits = self.model(token_id_seq, data_length,
                                               char_seq_tensor, char_seq_len,
                                               masks, label_seq)
            batch_max_scores, pred_ids = self.model.decode(logits, data_length)
            metrics += evaluate_batch_insts(insts, pred_ids, label_seq,
                                            data_length,
                                            self.config.idx2labels,
                                            self.config.use_crf_layer)
            for i in insts:
                all_pred_y_label.append(i.prediction)
                all_true_y_label.append(i.output)
        p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
        precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
        recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
        fscore = 2.0 * precision * recall / (
            precision + recall) if precision != 0 or recall != 0 else 0
        print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
              (name, precision, recall, fscore), flush=True)
        p = precision_score(all_true_y_label, all_pred_y_label)
        r = recall_score(all_true_y_label, all_pred_y_label)
        f1 = f1_score(all_true_y_label, all_pred_y_label)
        print("Precision: %.2f, Recall: %.2f, F1: %.2f" % (p, r, f1),
              flush=True)
        print('acc', accuracy_score(all_true_y_label, all_pred_y_label),
              flush=True)
        print(classification_report(all_true_y_label, all_pred_y_label))
    return precision, recall, fscore
def student_metrics(predictions, label_ids, student_to_teacher_map):
    label_map = {i: label for i, label in enumerate(['B', 'I', 'O'])}
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]
    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != -100:
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(
                    label_map[student_to_teacher_map[preds[i][j]]])
    metrics = {
        "accuracy_score": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
    return metrics
def f1_pre_rec(labels, preds, is_ner=True):
    if is_ner:
        return {
            "precision": seqeval_metrics.precision_score(labels, preds,
                                                          suffix=True),
            "recall": seqeval_metrics.recall_score(labels, preds, suffix=True),
            "f1": seqeval_metrics.f1_score(labels, preds, suffix=True),
        }
    else:
        return {
            "precision": sklearn_metrics.precision_score(labels, preds,
                                                          average="macro"),
            "recall": sklearn_metrics.recall_score(labels, preds,
                                                   average="macro"),
            "f1": sklearn_metrics.f1_score(labels, preds, average="macro"),
        }
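# Hedged usage sketch (illustrative tags only): with suffix=True, seqeval
# parses suffix-style tags such as "PER-B"/"PER-I" instead of "B-PER"/"I-PER".
from seqeval import metrics as seqeval_metrics

labels = [['PER-B', 'PER-I', 'O']]
preds = [['PER-B', 'PER-I', 'O']]
print(seqeval_metrics.f1_score(labels, preds, suffix=True))  # 1.0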
def _compute(self, predictions, references, suffix=False):
    true_entities = set(get_entities(references, suffix))
    pred_entities = set(get_entities(predictions, suffix))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    scores = {}
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))
    for type_name, true_entities in d1.items():
        scores[type_name] = {}
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)
        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0
        scores[type_name]["precision"] = p
        scores[type_name]["recall"] = r
        scores[type_name]["f1"] = f1
        scores[type_name]["number"] = nb_true
    scores["overall_precision"] = precision_score(y_true=references,
                                                  y_pred=predictions,
                                                  suffix=suffix)
    scores["overall_recall"] = recall_score(y_true=references,
                                            y_pred=predictions,
                                            suffix=suffix)
    scores["overall_f1"] = f1_score(y_true=references,
                                    y_pred=predictions,
                                    suffix=suffix)
    scores["overall_accuracy"] = accuracy_score(y_true=references,
                                                y_pred=predictions)
    return scores
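# Hedged illustration (example tags are made up) of the span tuples the
# per-type loop above relies on: seqeval's get_entities returns
# (type, start_index, end_index) triples.
from seqeval.metrics.sequence_labeling import get_entities

tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
print(get_entities(tags))  # [('PER', 0, 1), ('LOC', 3, 3)]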
def __test(self, model: PreTrainedModel,
           data: DataLoader) -> (float, float, float, float, float, str):
    eval_loss = 0.
    eval_steps, eval_examples = 0, 0
    tokens, eval_predictions, eval_labels = [], [], []
    model.eval()
    for batch in tqdm(data):
        batch_tokens, batch_masks, batch_tags = tuple(
            t.to(self.device) for t in batch)
        with torch.no_grad():
            outputs = model(batch_tokens,
                            attention_mask=batch_masks,
                            labels=batch_tags)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_tags.to('cpu').numpy()
        toks = batch_tokens.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        batch_toks = [
            self.tokenizer.convert_ids_to_tokens(sentence)
            for sentence in toks
        ]
        tokens.extend(batch_toks)
        eval_predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        eval_labels.extend(label_ids)
        eval_examples += batch_tokens.size(0)
        eval_steps += 1
    eval_loss = eval_loss / eval_steps
    predicted_tags, valid_tags, tokens = self.translate(
        eval_predictions, eval_labels, tokens)
    score_acc = accuracy_score(valid_tags, predicted_tags)
    score_f1 = f1_score(valid_tags, predicted_tags)
    score_p = precision_score(valid_tags, predicted_tags)
    score_r = recall_score(valid_tags, predicted_tags)
    report = classification_report(valid_tags, predicted_tags)
    return eval_loss, score_acc, score_f1, score_p, score_r, report