def evaluate(key_lines, sys_lines, metrics, NP_only, remove_nested, keep_singletons, min_span):
    doc_coref_infos = get_coref_infos(key_lines, sys_lines, NP_only,
                                      remove_nested, keep_singletons, min_span)

    output_scores = {}
    conll = 0
    conll_subparts_num = 0

    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(doc_coref_infos, metric, beta=1)
        if name in ["muc", "bcub", "ceafe"]:
            conll += f1
            conll_subparts_num += 1
        output_scores.update({
            f"{name}/recall": recall,
            f"{name}/precision": precision,
            f"{name}/f1": f1,
        })
        # lazy %-style formatting, as the logging module expects
        logger.info(
            "%s Recall: %.2f  Precision: %.2f  F1: %.2f",
            name.ljust(10), recall * 100, precision * 100, f1 * 100,
        )

    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        logger.info("CoNLL score: %.2f", conll)
        output_scores.update({"conll_score": conll})

    return output_scores
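# Usage sketch for the dict-returning variant above -- an illustration, not
# part of the original: the metric callables are assumed to come from coval's
# evaluator module, and 'key.conll'/'sys.conll' are placeholder file names.
from coval.eval import evaluator

metrics = [('muc', evaluator.muc),
           ('bcub', evaluator.b_cubed),
           ('ceafe', evaluator.ceafe)]
with open('key.conll') as key_f, open('sys.conll') as sys_f:
    scores = evaluate(key_f.read().splitlines(), sys_f.read().splitlines(),
                      metrics, NP_only=False, remove_nested=False,
                      keep_singletons=True, min_span=False)
# flat score keys, e.g. scores['muc/f1'] or scores['bcub/recall'];
# scores['conll_score'] is present once all three CoNLL submetrics ran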
def evaluate(key_directory, sys_directory, metrics, keep_singletons, keep_non_referring, use_MIN):
    doc_coref_infos, doc_non_referring_infos = reader.get_coref_infos(
        key_directory, sys_directory, keep_singletons, keep_non_referring, use_MIN)

    conll = 0
    conll_subparts_num = 0

    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(doc_coref_infos, metric, beta=1)
        if name in ["muc", "bcub", "ceafe"]:
            conll += f1
            conll_subparts_num += 1
        print(name)
        print('Recall: %.2f' % (recall * 100),
              ' Precision: %.2f' % (precision * 100),
              ' F1: %.2f' % (f1 * 100))

    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        print('CoNLL score: %.2f' % conll)

    if keep_non_referring:
        recall, precision, f1 = evaluate_non_referrings(doc_non_referring_infos)
        print('============================================')
        print('Non-referring markable identification scores:')
        print('Recall: %.2f' % (recall * 100),
              ' Precision: %.2f' % (precision * 100),
              ' F1: %.2f' % (f1 * 100))
def evaluate(key_file, sys_file, metrics, keep_singletons, keep_split_antecedent,
             keep_bridging, keep_non_referring, only_split_antecedent,
             evaluate_discourse_deixis, use_MIN):
    doc_coref_infos, doc_non_referring_infos, doc_bridging_infos = reader.get_coref_infos(
        key_file, sys_file, keep_singletons, keep_split_antecedent, keep_bridging,
        keep_non_referring, evaluate_discourse_deixis, use_MIN)

    conll = 0
    conll_subparts_num = 0

    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(
            doc_coref_infos, metric, beta=1,
            only_split_antecedent=only_split_antecedent)
        if name in ["muc", "bcub", "ceafe"]:
            conll += f1
            conll_subparts_num += 1
        print(name)
        print('Recall: %.2f' % (recall * 100),
              ' Precision: %.2f' % (precision * 100),
              ' F1: %.2f' % (f1 * 100))

    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        print('CoNLL score: %.2f' % conll)

    if keep_non_referring:
        recall, precision, f1 = evaluate_non_referrings(doc_non_referring_infos)
        print('============================================')
        print('Non-referring markable identification scores:')
        print('Recall: %.2f' % (recall * 100),
              ' Precision: %.2f' % (precision * 100),
              ' F1: %.2f' % (f1 * 100))

    if keep_bridging:
        score_ar, score_fbm, score_fbe = evaluator.evaluate_bridgings(doc_bridging_infos)
        recall_ar, precision_ar, f1_ar = score_ar
        recall_fbm, precision_fbm, f1_fbm = score_fbm
        recall_fbe, precision_fbe, f1_fbe = score_fbe
        print('============================================')
        print('Bridging anaphora recognition scores:')
        print('Recall: %.2f' % (recall_ar * 100),
              ' Precision: %.2f' % (precision_ar * 100),
              ' F1: %.2f' % (f1_ar * 100))
        print('Full bridging scores (Markable Level):')
        print('Recall: %.2f' % (recall_fbm * 100),
              ' Precision: %.2f' % (precision_fbm * 100),
              ' F1: %.2f' % (f1_fbm * 100))
        print('Full bridging scores (Entity Level):')
        print('Recall: %.2f' % (recall_fbe * 100),
              ' Precision: %.2f' % (precision_fbe * 100),
              ' F1: %.2f' % (f1_fbe * 100))
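# Usage sketch for the Universal-Anaphora-style variant above (illustrative,
# not from the original: flag values are examples, the file names are
# placeholders, and the metric callables are assumed to come from the same
# evaluator module the function body references).
ua_metrics = [('muc', evaluator.muc),
              ('bcub', evaluator.b_cubed),
              ('ceafe', evaluator.ceafe)]
evaluate('key.ua', 'sys.ua', ua_metrics,
         keep_singletons=True,
         keep_split_antecedent=True,
         keep_bridging=True,
         keep_non_referring=True,
         only_split_antecedent=False,
         evaluate_discourse_deixis=False,
         use_MIN=False)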
def coref_evaluate(key_file, sys_file, args):
    metrics = [('mentions', evaluator.mentions), ('muc', evaluator.muc),
               ('bcub', evaluator.b_cubed), ('ceafe', evaluator.ceafe),
               ('lea', evaluator.lea)]
    NP_only, remove_nested, keep_singletons, min_span = False, False, True, False

    doc_coref_infos = reader.get_coref_infos(key_file, sys_file, NP_only,
                                             remove_nested, keep_singletons,
                                             min_span, mode=args.mode)

    results = {}
    for name, metric in metrics:
        try:
            recall, precision, f1 = evaluator.evaluate_documents(
                doc_coref_infos, metric, beta=1)
        except Exception:
            # sentinel scores for metrics that fail on this input
            recall = precision = f1 = -10
        results[name] = {
            'recall': recall * 100,
            'precision': precision * 100,
            'f1': f1 * 100,
        }
        if args.mode == 'testing':
            print(name.ljust(10),
                  'Recall: %.2f' % (recall * 100),
                  ' Precision: %.2f' % (precision * 100),
                  ' F1: %.2f' % (f1 * 100))

    # unweighted average over the three CoNLL submetrics
    for key in ['recall', 'precision', 'f1']:
        results['avg_{}'.format(key)] = (results['muc'][key] +
                                         results['bcub'][key] +
                                         results['ceafe'][key]) / 3
    return results
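# Usage sketch (illustrative, not from the original): coref_evaluate expects
# an argparse-style namespace carrying a 'mode' attribute; 'testing'
# additionally prints per-metric scores.
import argparse

args = argparse.Namespace(mode='testing')
results = coref_evaluate('key.conll', 'sys.conll', args)
print(results['avg_f1'])  # mean F1 over MUC, B-cubed and CEAF-e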
def evaluate(key_file, sys_file, metrics, NP_only, remove_nested, keep_singletons, min_span):
    doc_coref_infos = reader.get_coref_infos(key_file, sys_file, NP_only,
                                             remove_nested, keep_singletons, min_span)

    conll = 0
    conll_subparts_num = 0

    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(doc_coref_infos, metric, beta=1)
        if name in ["muc", "bcub", "ceafe"]:
            conll += f1
            conll_subparts_num += 1
        print(name.ljust(10),
              'Recall: %.2f' % (recall * 100),
              ' Precision: %.2f' % (precision * 100),
              ' F1: %.2f' % (f1 * 100))

    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        print('CoNLL score: %.2f' % conll)
def evaluate(key_file, sys_file, metrics, NP_only, remove_nested, keep_singletons, min_span):
    doc_coref_infos = reader.get_coref_infos(key_file, sys_file, NP_only,
                                             remove_nested, keep_singletons, min_span)

    conll = 0
    conll_subparts_num = 0

    print(' recall precision F1')
    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(doc_coref_infos, metric, beta=1)
        if name in ('muc', 'bcub', 'ceafe'):
            conll += f1
            conll_subparts_num += 1
        print('%s %6.2f %6.2f %6.2f' % (name.ljust(8), recall * 100,
                                        precision * 100, f1 * 100))

    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        print('CoNLL score: %6.2f' % conll)
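# Driver sketch for the two CoNLL-style variants above (illustrative, not
# from the original: the reader/evaluator modules are assumed to follow
# coval's package layout, and the file names are placeholders).
from coval.conll import reader
from coval.eval import evaluator

metrics = [('mentions', evaluator.mentions),
           ('muc', evaluator.muc),
           ('bcub', evaluator.b_cubed),
           ('ceafe', evaluator.ceafe),
           ('lea', evaluator.lea)]
evaluate('key.conll', 'sys.conll', metrics,
         NP_only=False, remove_nested=False,
         keep_singletons=True, min_span=False)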