def evaluate_ner_seq_eval(self, batch_ner_labels, batch_ner_predictions,
                          labels: List[str], partition, head_identifier):
    # Map label indices to tag strings, renaming 'NP' chunk tags to the head's
    # entity type and collapsing the special 'BERT_TOKEN' label to 'O'.
    id2label = {}
    entity_labels = labels
    for idx, label in enumerate(entity_labels):
        if label.endswith('NP'):
            label = label[:2] + head_identifier.split('_')[-1]
        elif label == 'BERT_TOKEN':
            label = 'O'
        id2label[idx] = label

    ner_ground_truth = [[id2label[idx] for idx in seq] for seq in batch_ner_labels]
    ner_predictions = [[id2label[idx] for idx in seq] for seq in batch_ner_predictions]

    # Get results (lenient/default matching)
    default_results = classification_report(y_true=ner_ground_truth,
                                            y_pred=ner_predictions,
                                            output_dict=True,
                                            digits=3,
                                            mode='default',
                                            scheme=IOB2)
    default_results['performance'] = performance_measure(y_true=ner_ground_truth,
                                                          y_pred=ner_predictions)
    default_results = {
        metric_group1: {metric: float(value) for metric, value in metric_group2.items()}
        for metric_group1, metric_group2 in default_results.items()
    }

    # Get results (strict IOB2 matching)
    strict_results = classification_report(y_true=ner_ground_truth,
                                           y_pred=ner_predictions,
                                           output_dict=True,
                                           digits=3,
                                           mode='strict',
                                           scheme=IOB2)
    strict_results['performance'] = performance_measure(y_true=ner_ground_truth,
                                                        y_pred=ner_predictions)
    strict_results = {
        metric_group1: {metric: float(value) for metric, value in metric_group2.items()}
        for metric_group1, metric_group2 in strict_results.items()
    }

    mlflow.log_dict(dict(lenient=default_results, strict=strict_results),
                    f"{partition}/{self.epoch}/{head_identifier}.json")
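# A toy sketch (not part of the original module) of how a nested lenient/strict results
# dict like the one built above can be written as a JSON artifact. It assumes MLflow >= 1.12,
# where mlflow.log_dict is available; the run, numbers, and artifact path are placeholders.
import mlflow

def log_toy_eval_artifact():
    results = {
        'lenient': {'micro avg': {'precision': 0.80, 'recall': 0.75, 'f1-score': 0.77}},
        'strict': {'micro avg': {'precision': 0.70, 'recall': 0.66, 'f1-score': 0.68}},
    }
    with mlflow.start_run():
        # Stored as <partition>/<epoch>/<head>.json under the run's artifacts.
        mlflow.log_dict(results, "dev/0/example_head.json")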
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("predictions")
    parser.add_argument("input_df_pkl")
    args = parser.parse_args()

    test_df = pd.read_pickle(args.input_df_pkl)

    # Flatten predictions: one whitespace-separated tag sequence per line.
    y_pred = []
    with open(args.predictions) as fp:
        for line in fp:
            y_pred += line.strip().split()

    # Flatten the gold labels stored as per-sentence lists in the dataframe column.
    y_true = reduce(lambda acc, l: acc + l, test_df.entity_type, [])

    y_true = pd.Series(y_true)
    y_pred = pd.Series(y_pred)
    print(len(y_true))
    print(len(y_pred))

    print("Token level")
    eval_d = ner_report(y_true, y_pred, mode="token", return_dict=True)
    eval_d = dict(eval_d["PATHWAY"])
    with open("pathway_metrics_token.json", "w") as fp:
        json.dump(eval_d, fp)
        fp.write("\n")
    pprint(eval_d)

    print("Entity level")
    y_pred_corr = pd.Series(correct_iob(y_pred))
    eval_d = ner_report(y_true, y_pred_corr, mode="entity", return_dict=True)
    eval_d = dict(eval_d["PATHWAY"])
    with open("pathway_metrics_entity.json", "w") as fp:
        json.dump(eval_d, fp)
        fp.write("\n")
    pprint(eval_d)

    print("Seqeval")
    # Re-read the labels and predictions as per-sentence sequences for seqeval.
    y_true = list(test_df.entity_type)
    y_pred = []
    with open(args.predictions) as fp:
        for line in fp:
            y_pred.append(line.strip().split())

    from collections import Counter
    c = Counter()
    for x in y_true:
        c.update(x)
    print(c)

    # Token-level accuracy computed by hand, then cross-checked against seqeval.
    total = 0
    for s1, s2 in zip(y_true, y_pred):
        total += sum(t1 == t2 for t1, t2 in zip(s1, s2))
    acc = total / sum(len(s) for s in y_true)
    print("acc:", acc)
    print("acc_score:", accuracy_score(y_true, y_pred))

    print(classification_report(y_true, y_pred, scheme=IOB2, mode="strict"))
    print(performance_measure(y_true, y_pred))
def test_performance_measure(self):
    y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'],
              ['B-PER', 'I-PER', 'O', 'B-PER']]
    y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'],
              ['B-PER', 'I-PER', 'O', 'B-MISC']]
    performance_dict = performance_measure(y_true, y_pred)
    self.assertDictEqual(performance_dict,
                         {'FN': 1, 'FP': 3, 'TN': 4, 'TP': 3})
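# A minimal, self-contained sketch (not part of the original test suite) showing how
# performance_measure relates to the strict IOB2 classification_report on toy data.
# The tag names are illustrative; mode/scheme arguments assume a recent seqeval release.
from seqeval.metrics import classification_report, performance_measure
from seqeval.scheme import IOB2

def demo_performance_vs_report():
    y_true = [['O', 'B-PER', 'I-PER', 'O', 'B-ORG']]
    y_pred = [['O', 'B-PER', 'O', 'O', 'B-ORG']]
    # Token-level confusion counts over all tags (O vs. non-O).
    print(performance_measure(y_true, y_pred))
    # Entity-level scores; mode='strict' with scheme=IOB2 only credits exact span matches,
    # so the truncated PER span counts as an error even though its B- token is correct.
    print(classification_report(y_true, y_pred, mode='strict', scheme=IOB2))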
def train_epoch(model, criterion, optimizer, data, tag2idx, idx2tag, device, scheduler):
    epoch_loss = 0
    epoch_metrics = FMeasureStorage()

    model.train()
    for batch in data:
        tokens = batch[0].to(device)
        tags = batch[1].to(device)
        batch_element_length = len(tags[0])

        predictions = model(tokens)
        predictions = predictions.view(-1, predictions.shape[-1])

        # Mask padding positions so they are mapped to the criterion's ignore_index.
        tags_mask = tags != tag2idx['<pad>']
        tags_mask = tags_mask.view(-1)
        labels = torch.where(tags_mask, tags.view(-1),
                             torch.tensor(criterion.ignore_index).type_as(tags))

        loss = criterion(predictions, labels)

        predictions = predictions.argmax(dim=1)
        predictions = predictions.cpu().numpy()
        labels = labels.cpu().numpy()

        # clear <PAD>, CLS and SEP tags from both labels and predictions
        clear_labels, clear_predictions = clear_tags(labels, predictions, idx2tag,
                                                     tag2idx, batch_element_length)

        # Accumulate per-batch TP/FP/FN/TN counts (bare `+` relies on the
        # operator overload in FMeasureStorage).
        iteration_result = performance_measure(clear_labels, clear_predictions)
        epoch_metrics + iteration_result
        epoch_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        if scheduler:
            scheduler.step()

        torch.cuda.empty_cache()

    epoch_f1_score, epoch_precision, epoch_recall = epoch_metrics.report()
    print('Train Loss = {:.5f}, F1-score = {:.3%}, Precision = {:.3%}, Recall = {:.3%}'
          .format(epoch_loss / len(data), epoch_f1_score, epoch_precision, epoch_recall))
def test_epoch(model, criterion, data, tag2idx, idx2tag, device):
    name = '[Final] Test :'
    epoch_loss = 0
    epoch_metrics = FMeasureStorage()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(data)) as progress_bar:
            for batch in data:
                tokens = batch[0].to(device)
                tags = batch[1].to(device)
                batch_element_length = len(tags[0])

                predictions, _ = model(tokens)
                predictions = predictions.view(-1, predictions.shape[-1])

                tags_mask = tags != tag2idx['<PAD>']
                tags_mask = tags_mask.view(-1)
                labels = torch.where(tags_mask, tags.view(-1),
                                     torch.tensor(criterion.ignore_index).type_as(tags))

                loss = criterion(predictions, labels)

                predictions = predictions.argmax(dim=1)
                predictions = predictions.cpu().numpy()
                labels = labels.cpu().numpy()

                # clear <PAD>, CLS and SEP tags from both labels and predictions
                clear_labels, clear_predictions = clear_tags(labels, predictions,
                                                             idx2tag, tag2idx,
                                                             batch_element_length)
                iteration_result = performance_measure(clear_labels, clear_predictions)
                epoch_metrics + iteration_result
                epoch_loss += loss.item()

                progress_bar.update()
                progress_bar.set_description(
                    '{:>5s} Loss = {:.5f}, F1-score = {:.2%}'.format(name, loss.item(), 0))

            epoch_f1_score, epoch_precision, epoch_recall = epoch_metrics.report()
            progress_bar.set_description(
                '{:>5s} Loss = {:.5f}, F1-score = {:.2%}'.format(
                    name, epoch_loss / len(data), epoch_f1_score))
def test_epoch(model, criterion, data, tag2idx, idx2tag, device):
    epoch_loss = 0
    epoch_metrics = FMeasureStorage()

    model.eval()
    with torch.no_grad():
        for batch in data:
            tokens = batch[0].to(device)
            tags = batch[1].to(device)
            batch_element_length = len(tags[0])

            predictions = model(tokens)
            predictions = predictions.view(-1, predictions.shape[-1])

            tags_mask = tags != tag2idx['<pad>']
            tags_mask = tags_mask.view(-1)
            labels = torch.where(tags_mask, tags.view(-1),
                                 torch.tensor(criterion.ignore_index).type_as(tags))

            loss = criterion(predictions, labels)

            predictions = predictions.argmax(dim=1)
            predictions = predictions.cpu().numpy()
            labels = labels.cpu().numpy()

            # clear <PAD>, CLS and SEP tags from both labels and predictions
            clear_labels, clear_predictions = clear_tags(labels, predictions, idx2tag,
                                                         tag2idx, batch_element_length)

            iteration_result = performance_measure(clear_labels, clear_predictions)
            epoch_metrics + iteration_result
            epoch_loss += loss.item()

    epoch_f1_score, epoch_precision, epoch_recall = epoch_metrics.report()
    print('Test Loss = {:.5f}, F1-score = {:.3%}, Precision = {:.3%}, Recall = {:.3%}'
          .format(epoch_loss / len(data), epoch_f1_score, epoch_precision, epoch_recall))
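# The FMeasureStorage used in the train/test loops above is project-specific and its
# implementation is not shown here. The class below is a hypothetical sketch of such an
# accumulator, assuming it only needs to sum the TP/FP/FN/TN dicts returned by
# seqeval.metrics.performance_measure and report micro-averaged F1/precision/recall
# in the order the loops unpack from .report().
class FMeasureStorageSketch:
    def __init__(self):
        self.counts = {'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0}

    def __add__(self, batch_counts):
        # Accumulate in place so a bare `storage + batch_counts` statement has an effect.
        for key in self.counts:
            self.counts[key] += batch_counts.get(key, 0)
        return self

    def report(self):
        tp, fp, fn = self.counts['TP'], self.counts['FP'], self.counts['FN']
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1 = (2 * precision * recall / (precision + recall)
              if precision + recall else 0.0)
        return f1, precision, recall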
def evaluate(self, X, y, sentences=None):
    """
    Evaluate the model on the given data and print token- and entity-level metrics.

    Parameters
    ----------
    X: Evaluation data.
    y: Evaluation data labels.
    sentences: Evaluation data sentences, used to print the wrong predictions.
    """
    y_pred = self.predict(X)

    # Flatten the tag sequences into integer class ids for the token-level metrics;
    # tags outside the mapping are skipped, as in the original label handling.
    label2idx = {'O': 0, 'B-ASPECT': 1, 'I-ASPECT': 2,
                 'B-SENTIMENT': 3, 'I-SENTIMENT': 4}
    y_true = [label2idx[label] for seq in y for label in seq if label in label2idx]
    y_pred2 = [label2idx[label] for seq in y_pred for label in seq if label in label2idx]

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred2))
    print()
    print("Precision:")
    print(precision_score(y_true, y_pred2, average=None))
    print()
    print("Recall:")
    print(recall_score(y_true, y_pred2, average=None))
    print()
    print("F1-score:")
    print(f1_score(y_true, y_pred2, average=None))
    print()

    # Entity-level metrics on the original tag sequences (seqeval).
    print("Report (entity level):")
    print(classification_report(y, y_pred))
    print(performance_measure(y, y_pred))

    if sentences is not None:
        self.get_wrong_predictions(y, y_pred, sentences)
def eval_ner(self, dataset_id, corr_labels, pred_labels, train_data, test_data):
    # compute entity-level metrics
    metrics = classification_report(corr_labels, pred_labels, digits=4)
    metrics = pd.read_csv(StringIO(metrics), sep=' {2,}', engine='python') * 100

    # sort the labels alphabetically and rename columns
    metrics.sort_index(inplace=True)
    metrics.rename(columns={'precision': 'Prec', 'recall': 'Rec', 'f1-score': 'F1'},
                   inplace=True)

    # append the prefix B- to all tags in order to compute token-level metrics
    corr_labels_t = [['B' + ent[1:] if ent[0] in ('B', 'I') else 'B-' + ent
                      for ent in sent] for sent in corr_labels]
    pred_labels_t = [['B' + ent[1:] if ent[0] in ('B', 'I') else 'B-' + ent
                      for ent in sent] for sent in pred_labels]

    # compute token-level metrics
    metrics_t = classification_report(corr_labels_t, pred_labels_t, digits=4)
    metrics_t = pd.read_csv(StringIO(metrics_t), sep=' {2,}', engine='python') * 100

    # sort the labels alphabetically and rename columns
    metrics_t.sort_index(inplace=True)
    metrics_t.rename(columns={'precision': 'Prec', 'recall': 'Rec', 'f1-score': 'F1'},
                     inplace=True)

    # compute performance metrics
    perf = performance_measure(corr_labels, pred_labels)
    tp, tn, fp, fn = perf['TP'], perf['TN'], perf['FP'], perf['FN']

    # compute entity- and token-level accuracy
    ent_acc = round((tp + tn) / (tp + tn + fp + fn) * 100, 2)
    tok_acc = round(accuracy_score(corr_labels, pred_labels) * 100, 2)

    # obtain overall Prec, Rec and F1 for entity- and token-level
    ent_avg = metrics.loc['micro avg'].drop('support')
    tok_avg = metrics_t.loc['macro avg'].drop('support')
    ent_avg = pd.concat([pd.Series({'Acc': ent_acc}), ent_avg])
    tok_avg = pd.concat([pd.Series({'Acc': tok_acc}), tok_avg])
    metrics_avg = pd.concat([ent_avg, tok_avg], keys=['Entity Spans', 'Tokens'])
    self.ner_metrics[dataset_id] = metrics_avg

    # obtain F1 score at the entity- and token-level per entity type
    ent_f1 = metrics['F1'].drop(['micro avg', 'macro avg'])
    tok_f1 = metrics_t['F1'].drop(['O', 'micro avg', 'macro avg'])
    metrics_f1 = pd.concat([ent_f1, tok_f1], keys=['Entity Spans', 'Tokens'])
    self.ent_type_f1[dataset_id] = metrics_f1
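# A sketch of an alternative to parsing the text report with pd.read_csv as done above:
# seqeval's classification_report(..., output_dict=True) returns nested dicts that convert
# directly into a DataFrame. The helper name and the percentage scaling are illustrative,
# not part of the original class.
import pandas as pd
from seqeval.metrics import classification_report

def report_as_dataframe(corr_labels, pred_labels):
    report = classification_report(corr_labels, pred_labels, output_dict=True)
    # Rows are entity types plus the avg rows; columns are precision/recall/f1-score/support.
    df = pd.DataFrame(report).T
    df[['precision', 'recall', 'f1-score']] *= 100
    return df.rename(columns={'precision': 'Prec', 'recall': 'Rec', 'f1-score': 'F1'})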