def combine_span_and_cluster_file(span_file, cluster_file):
    """Attach predicted coreference clusters to each document in a span file."""
    spans = load_jsonl(span_file)
    clusters = {item['doc_id']: item for item in load_jsonl(cluster_file)}
    for doc in spans:
        if doc['doc_id'] not in clusters:
            continue
        if 'clusters' in clusters[doc['doc_id']]:
            doc['coref'] = clusters[doc['doc_id']]['clusters']
        else:
            merge_method_subrelations(clusters[doc['doc_id']])
            doc['coref'] = {
                x: v for x, v in clusters[doc['doc_id']]['coref'].items() if len(v) > 0
            }
        # Drop any gold relation fields carried over from the input.
        if 'n_ary_relations' in doc:
            del doc['n_ary_relations']
        if 'method_subrelations' in doc:
            del doc['method_subrelations']

    annotations_to_jsonl(spans, 'tmp_relation_42424242.jsonl')
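# Usage sketch (file paths are hypothetical): merge span predictions with
# cluster predictions so a downstream step can read a single JSONL file.
#
#   combine_span_and_cluster_file("predictions/spans.jsonl",
#                                 "predictions/clusters.jsonl")
#
# The combined annotations are written to 'tmp_relation_42424242.jsonl'.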
def main(args):
    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(args.clusters_file))
    for doc_id, doc in predicted_salient_clusters.items():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_relations = convert_to_dict(load_jsonl(args.relations_file))

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = clustering_metrics(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )

    for n in [2, 4]:
        all_metrics = []
        for types in combinations(used_entities, n):
            for doc in gold_data:
                predicted_data = predicted_relations[doc["doc_id"]]
                mapping = predicted_cluster_to_gold_cluster_map[doc["doc_id"]]

                # Map each predicted cluster name to its matched gold cluster,
                # keeping only relations the model predicted as positive.
                relations = list(set(
                    tuple(mapping.get(v, v) for v in x[0])
                    for x in predicted_data["predicted_relations"]
                    if x[2] == 1
                ))
                relations = [dict(zip(used_entities, x)) for x in relations]
                relations = set(tuple((t, x[t]) for t in types) for x in relations)

                gold_relations = [tuple((t, x[t]) for t in types) for x in doc['n_ary_relations']]
                gold_relations = set(x for x in gold_relations if has_all_mentions(doc, x))

                matched = relations & gold_relations
                metrics = {
                    "p": len(matched) / (len(relations) + 1e-7),
                    "r": len(matched) / (len(gold_relations) + 1e-7),
                }
                metrics["f1"] = 2 * metrics["p"] * metrics["r"] / (metrics["p"] + metrics["r"] + 1e-7)

                if len(gold_relations) > 0:
                    all_metrics.append(metrics)

        all_metrics = pd.DataFrame(all_metrics)
        print(f"Relation Metrics n={n}")
        rln_metrics = all_metrics.describe().loc['mean'][['p', 'r', 'f1']]
        print(rln_metrics)
        # Written once per n, so the file ends up holding the n=4 metrics.
        rln_metrics.to_json(os.environ["DECODING_METRICS_OUTFP"])
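# The loop above serializes the mean p/r/f1 to the file named by the
# DECODING_METRICS_OUTFP environment variable. A hypothetical invocation
# (script name and paths assumed):
#
#   DECODING_METRICS_OUTFP=/tmp/decoding_metrics.json \
#       python evaluate_relations.py --gold-file ... --ner-file ... \
#       --clusters-file ... --relations-file ...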
def main(args):
    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_salient_mentions = convert_to_dict(load_jsonl(args.salient_mentions_file))
    salent_mentions_metrics(gold_data, predicted_salient_mentions)

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(args.clusters_file))
    for doc_id, doc in predicted_salient_clusters.items():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = clustering_metrics(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )
def prepare_data(gold_file, ner_file, clusters_file, relations_file):
    """Load gold and predicted files and align predicted clusters with gold clusters."""
    gold_data = load_jsonl(gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(ner_file))
    predicted_salient_clusters = convert_to_dict(load_jsonl(clusters_file))
    for doc_id, doc in predicted_salient_clusters.items():
        if 'clusters' not in doc:
            merge_method_subrelations(doc)
            doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}

    predicted_relations = convert_to_dict(load_jsonl(relations_file))

    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)
    get_types_of_clusters(predicted_ner, predicted_salient_clusters)
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_cluster_to_gold_cluster_map = match_predicted_clusters_with_gold(
        gold_data, predicted_salient_clusters, predicted_span_to_gold_span_map
    )
    return (gold_data, predicted_ner, predicted_salient_clusters,
            predicted_relations, predicted_cluster_to_gold_cluster_map)
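# Sketch of how a caller might unpack prepare_data (paths hypothetical):
#
#   (gold_data, predicted_ner, predicted_salient_clusters,
#    predicted_relations, cluster_map) = prepare_data(
#        "gold.jsonl", "ner.jsonl", "clusters.jsonl", "relations.jsonl")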
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a", help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a", help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b", help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b", help="Salient mentions from system B")
    parser.add_argument("--edge-degree-direction", default="both", choices=["both", "out", "in"], type=str)
    parser.add_argument("--num-buckets", default=6, type=int)
    args = parser.parse_args()

    bucketed_eval_comparison = {}

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)

    doc_ids = [doc["doc_id"] for doc in gold_data]
    doc_buckets = bucket_documents_by_graph_degree(doc_ids, num_buckets=args.num_buckets)

    for bucket_name, bucket_docs in doc_buckets:
        gold_data_in_bucket = [doc for doc in gold_data if doc["doc_id"] in bucket_docs]
        print("\n")
        print(f"bucket: {bucket_name}, contains {len(gold_data_in_bucket)} documents")

        predicted_salient_mentions_a = convert_to_dict(load_jsonl(args.salient_mentions_file_a))
        preds_a, labels_a = salent_mentions_metrics(gold_data_in_bucket, predicted_salient_mentions_a)
        predicted_salient_mentions_b = convert_to_dict(load_jsonl(args.salient_mentions_file_b))
        preds_b, labels_b = salent_mentions_metrics(gold_data_in_bucket, predicted_salient_mentions_b)

        assert labels_a == labels_b
        gold_mentions = labels_a

        print("Paired Bootstrap Comparison of System A and System B on salient mention metric:")
        assert len(preds_a) == len(preds_b)
        assert len(preds_a) == len(gold_mentions)
        sys1_mention = list(preds_a)
        sys2_mention = list(preds_b)
        assert len(sys1_mention) == len(sys2_mention)
        sys1_summary, sys2_summary, p_value_lose, p_value_win = eval_with_paired_bootstrap(
            gold_mentions, sys1_mention, sys2_mention,
            num_samples=1000, sample_ratio=0.5,
            eval_type='f1', return_results=True)
        bucketed_eval_comparison[str(bucket_name)] = {
            "base": [list(sys1_summary), p_value_lose],
            "diff": [list(sys2_summary), p_value_win]
        }

        predicted_salient_clusters_a = convert_to_dict(load_jsonl(args.clusters_file_a))
        predicted_salient_clusters_b = convert_to_dict(load_jsonl(args.clusters_file_b))
        get_types_of_clusters(convert_to_dict(gold_data_in_bucket), convert_to_dict(gold_data_in_bucket))

        filenames = [args.salient_mentions_file_a, args.salient_mentions_file_b]
        for i, predicted_salient_clusters in enumerate(
                [predicted_salient_clusters_a, predicted_salient_clusters_b]):
            print(f"\nMetrics for {filenames[i]}")
            for doc_id, doc in predicted_salient_clusters.items():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

        _, all_metrics_a = clustering_metrics(gold_data_in_bucket, predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        _, all_metrics_b = clustering_metrics(gold_data_in_bucket, predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)

        print("Paired Bootstrap Comparison of System A and System B on salient cluster metric:")
        # The bootstrap script expects a list of gold values, but here the "system"
        # values are already comparisons with gold, so just pass in a list of Nones
        # to satisfy the input.
        sys1_cluster = list(all_metrics_a["f1"])
        sys2_cluster = list(all_metrics_b["f1"])
        assert len(sys1_cluster) == len(sys2_cluster)
        gold = [None for _ in sys1_cluster]
        # Each bootstrap sample draws 50 items.
        eval_with_paired_bootstrap(gold, sys1_cluster, sys2_cluster,
                                   num_samples=1000, sample_ratio=0.76, eval_type='avg')

    print(f"Bucket evaluations (diff):\n{json.dumps(bucketed_eval_comparison, indent=2)}")
    draw_box_plot_with_error_bars(
        bucketed_eval_comparison,
        fname=f"/tmp/bucketed_salient_mention_eval_comparison_n_{args.num_buckets}.png")
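# Hypothetical invocation (script name assumed; the flags match the parser above):
#
#   python bucketed_comparison.py \
#       --gold-file test.jsonl --ner-file ner.jsonl \
#       --clusters-file-a a_clusters.jsonl --salient-mentions-file-a a_mentions.jsonl \
#       --clusters-file-b b_clusters.jsonl --salient-mentions-file-b b_mentions.jsonl \
#       --num-buckets 6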
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-files-a", help="Cluster predictions from system A", nargs='+', type=str)
    parser.add_argument("--salient-mentions-files-a", help="Salient mentions from system A", nargs='+', type=str)
    parser.add_argument("--clusters-files-b", help="Cluster predictions from system B", nargs='+', type=str)
    parser.add_argument("--salient-mentions-files-b", help="Salient mentions from system B", nargs='+', type=str)
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)

    salient_mention_predictions_a = []
    gold_mentions = None
    salient_mentions_files_a = list(args.salient_mentions_files_a)
    salient_mentions_files_b = list(args.salient_mentions_files_b)
    for salient_file_a in salient_mentions_files_a:
        predicted_salient_mentions_a = convert_to_dict(load_jsonl(salient_file_a))
        preds_a, labels_a = salent_mentions_metrics(gold_data, predicted_salient_mentions_a)
        if gold_mentions is None:
            gold_mentions = labels_a
        else:
            assert gold_mentions == labels_a, "all runs must share the same gold mention labels"
        assert len(gold_mentions) == len(labels_a)
        salient_mention_predictions_a.append(preds_a)

    print("\n")
    salient_mention_predictions_b = []
    for salient_file_b in salient_mentions_files_b:
        predicted_salient_mentions_b = convert_to_dict(load_jsonl(salient_file_b))
        preds_b, labels_b = salent_mentions_metrics(gold_data, predicted_salient_mentions_b)
        assert gold_mentions == labels_b
        assert len(gold_mentions) == len(preds_b)
        salient_mention_predictions_b.append(preds_b)

    for metric_type in ["f1", "precision", "recall"]:
        print(f"Paired Bootstrap Comparison of System A and System B on salient mention metric: {metric_type}")
        sys1_mention_list = list(salient_mention_predictions_a)
        sys2_mention_list = list(salient_mention_predictions_b)
        eval_with_hierarchical_paired_bootstrap(gold_mentions, sys1_mention_list, sys2_mention_list,
                                                num_samples=1000, sample_ratio=0.5,
                                                eval_type=metric_type)

    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))
    predicted_salient_clusters_a_list = [convert_to_dict(load_jsonl(x)) for x in args.clusters_files_a]
    predicted_salient_clusters_b_list = [convert_to_dict(load_jsonl(x)) for x in args.clusters_files_b]

    for clusters_set in [predicted_salient_clusters_a_list, predicted_salient_clusters_b_list]:
        for predicted_salient_clusters in clusters_set:
            for doc_id, doc in predicted_salient_clusters.items():
                if 'clusters' not in doc:
                    merge_method_subrelations(doc)
                    doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}
            get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    all_metrics_a_list = []
    preds_len = None
    for predicted_salient_clusters_a in predicted_salient_clusters_a_list:
        _, all_metrics_a = clustering_metrics(gold_data, predicted_salient_clusters_a,
                                              predicted_span_to_gold_span_map)
        all_metrics_a_list.append(all_metrics_a)
        if preds_len is None:
            preds_len = len(all_metrics_a)
        else:
            assert preds_len == len(all_metrics_a)

    all_metrics_b_list = []
    for predicted_salient_clusters_b in predicted_salient_clusters_b_list:
        _, all_metrics_b = clustering_metrics(gold_data, predicted_salient_clusters_b,
                                              predicted_span_to_gold_span_map)
        all_metrics_b_list.append(all_metrics_b)
        assert preds_len == len(all_metrics_b)

    print("\n")
    for metric_type in ["f1", "p", "r"]:
        print(f"Paired Bootstrap Comparison of System A and System B on salient cluster metric: {metric_type}")
        # The bootstrap script expects a list of gold values, but here the "system"
        # values are already comparisons with gold, so just pass in a list of Nones
        # to satisfy the input.
        sys1_cluster_list = [list(metrics_a[metric_type]) for metrics_a in all_metrics_a_list]
        sys2_cluster_list = [list(metrics_b[metric_type]) for metrics_b in all_metrics_b_list]
        gold = [None for _ in sys1_cluster_list[0]]
        eval_with_hierarchical_paired_bootstrap(gold, sys1_cluster_list, sys2_cluster_list,
                                                num_samples=5000, sample_ratio=0.5,
                                                eval_type='avg')
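# Hypothetical invocation: the hierarchical bootstrap takes one prediction file
# per run (e.g. per random seed) for each system, via the nargs='+' flags above:
#
#   python hierarchical_comparison.py --gold-file test.jsonl --ner-file ner.jsonl \
#       --clusters-files-a a_seed1.jsonl a_seed2.jsonl \
#       --salient-mentions-files-a a_mentions_seed1.jsonl a_mentions_seed2.jsonl \
#       --clusters-files-b b_seed1.jsonl b_seed2.jsonl \
#       --salient-mentions-files-b b_mentions_seed1.jsonl b_mentions_seed2.jsonl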
def evaluate(predicted_data, gold_data):
    """Score predicted NER spans and binary relations against gold annotations."""
    p, r, f1 = 0, 0, 0
    gold_data = {x['doc_id']: x for x in gold_data}
    all_metrics = []

    for doc in predicted_data:
        predicted_doc = predicted_data[doc]
        gold_doc = gold_data[doc]
        merge_method_subrelations(gold_doc)
        gold_doc["clusters"] = gold_doc["coref"]

        gold_spans = [tuple(x) for x in gold_doc['ner']]
        predicted_spans = [tuple(x) for x in predicted_doc['ner']]

        # Typed NER metrics, macro-averaged over entity types and documents.
        for t in used_entities:
            typed_gold_spans = set(x for x in gold_spans if x[2] == t)
            typed_predicted_spans = set(x for x in predicted_spans if x[2] == t)
            matched = len(typed_gold_spans & typed_predicted_spans)
            tp = matched / (len(typed_predicted_spans) + 1e-7)
            tr = matched / (len(typed_gold_spans) + 1e-7)
            tf1 = 2 * tp * tr / (tp + tr + 1e-7)
            p += tp / (len(used_entities) * len(predicted_data))
            r += tr / (len(used_entities) * len(predicted_data))
            f1 += tf1 / (len(used_entities) * len(predicted_data))

        clusters = gold_doc['coref']
        span_to_cluster = {}
        for c, spans in clusters.items():
            for span in spans:
                span_to_cluster[tuple(span)] = c

        # Align each predicted span with a gold span (> 50% overlap), inheriting
        # the gold span's type and cluster where a match exists.
        predicted_span_to_gold = {}
        for i, (s, e, t) in enumerate(predicted_spans):
            span = (s, e)
            predicted_span_to_gold[span] = (t, span, str(i))
            for sg, eg, tg in gold_spans:
                span_g = (sg, eg)
                if span_match(span, span_g) > 0.5:
                    predicted_span_to_gold[span] = (tg, span_g,
                                                    span_to_cluster.get(span_g, str(i)))
                    break

        for types in combinations(used_entities, 2):
            gold_relations = [tuple((t, x[t]) for t in types) for x in gold_doc['n_ary_relations']]
            gold_relations = set(x for x in gold_relations if has_all_mentions(gold_doc, x))

            predicted_relations = []
            for s1, s2 in predicted_doc['relations']:
                if s1 in predicted_span_to_gold and s2 in predicted_span_to_gold:
                    t1, span_1, c_1 = predicted_span_to_gold[s1]
                    t2, span_2, c_2 = predicted_span_to_gold[s2]
                    if t1 in types and t2 in types and t1 != t2:
                        rel = {t1: c_1, t2: c_2}
                        predicted_relations.append(tuple((t, rel[t]) for t in types))

            predicted_relations = set(predicted_relations)
            matched = predicted_relations & gold_relations
            metrics = {
                "p": len(matched) / (len(predicted_relations) + 1e-7),
                "r": len(matched) / (len(gold_relations) + 1e-7),
            }
            metrics["f1"] = 2 * metrics["p"] * metrics["r"] / (metrics["p"] + metrics["r"] + 1e-7)

            if len(gold_relations) > 0:
                all_metrics.append(metrics)

    print(p, r, f1)
    all_metrics = pd.DataFrame(all_metrics)
    print("Relation Metrics n=2")
    print(all_metrics.describe().loc['mean'][['p', 'r', 'f1']])
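# Illustrative shape of the inputs evaluate() assumes (values hypothetical):
# predicted docs carry typed NER spans and span-pair relations; gold docs
# additionally carry 'coref' clusters and 'n_ary_relations'.
#
#   predicted_doc = {
#       "ner": [(0, 2, "Method"), (10, 12, "Task")],  # (start, end, type)
#       "relations": [((0, 2), (10, 12))],            # pairs of predicted spans
#   }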
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold-file")
    parser.add_argument("--ner-file")
    parser.add_argument("--clusters-file-a", help="Cluster predictions from system A")
    parser.add_argument("--salient-mentions-file-a", help="Salient mentions from system A")
    parser.add_argument("--clusters-file-b", help="Cluster predictions from system B")
    parser.add_argument("--salient-mentions-file-b", help="Salient mentions from system B")
    args = parser.parse_args()

    gold_data = load_jsonl(args.gold_file)
    for d in gold_data:
        merge_method_subrelations(d)
        d["clusters"] = d["coref"]

    predicted_ner = convert_to_dict(load_jsonl(args.ner_file))
    predicted_span_to_gold_span_map: Dict[str, Dict[tuple, tuple]] = ner_metrics(gold_data, predicted_ner)

    predicted_salient_mentions_a = convert_to_dict(load_jsonl(args.salient_mentions_file_a))
    preds_a, labels_a = salent_mentions_metrics(gold_data, predicted_salient_mentions_a)
    predicted_salient_mentions_b = convert_to_dict(load_jsonl(args.salient_mentions_file_b))
    preds_b, labels_b = salent_mentions_metrics(gold_data, predicted_salient_mentions_b)

    assert labels_a == labels_b
    gold_mentions = labels_a

    print("Paired Bootstrap Comparison of System A and System B on salient mention metric:")
    assert len(preds_a) == len(preds_b)
    assert len(preds_a) == len(gold_mentions)
    sys1_mention = list(preds_a)
    sys2_mention = list(preds_b)
    assert len(sys1_mention) == len(sys2_mention)
    eval_with_paired_bootstrap(gold_mentions, sys1_mention, sys2_mention,
                               num_samples=1000, sample_ratio=0.5, eval_type='f1')

    predicted_salient_clusters_a = convert_to_dict(load_jsonl(args.clusters_file_a))
    predicted_salient_clusters_b = convert_to_dict(load_jsonl(args.clusters_file_b))
    get_types_of_clusters(convert_to_dict(gold_data), convert_to_dict(gold_data))

    filenames = [args.salient_mentions_file_a, args.salient_mentions_file_b]
    for i, predicted_salient_clusters in enumerate(
            [predicted_salient_clusters_a, predicted_salient_clusters_b]):
        print(f"\nMetrics for {filenames[i]}")
        for doc_id, doc in predicted_salient_clusters.items():
            if 'clusters' not in doc:
                merge_method_subrelations(doc)
                doc['clusters'] = {x: v for x, v in doc['coref'].items() if len(v) > 0}
        get_types_of_clusters(predicted_ner, predicted_salient_clusters)

    _, all_metrics_a = clustering_metrics(gold_data, predicted_salient_clusters_a,
                                          predicted_span_to_gold_span_map)
    _, all_metrics_b = clustering_metrics(gold_data, predicted_salient_clusters_b,
                                          predicted_span_to_gold_span_map)

    print("Paired Bootstrap Comparison of System A and System B on salient cluster metric:")
    # The bootstrap script expects a list of gold values, but here the "system"
    # values are already comparisons with gold, so just pass in a list of Nones
    # to satisfy the input.
    sys1_cluster = list(all_metrics_a["f1"])
    sys2_cluster = list(all_metrics_b["f1"])
    assert len(sys1_cluster) == len(sys2_cluster)
    gold = [None for _ in sys1_cluster]
    # Each bootstrap sample draws 50 items.
    eval_with_paired_bootstrap(gold, sys1_cluster, sys2_cluster,
                               num_samples=1000, sample_ratio=0.76, eval_type='avg')
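# Conceptual sketch of the paired bootstrap used above (not the project's
# implementation): resample the same indices for both systems and count how
# often each one wins on the mean score, as with the 'avg' eval type.
import random

def paired_bootstrap_sketch(scores_a, scores_b, num_samples=1000, sample_ratio=0.5):
    assert len(scores_a) == len(scores_b)
    n = max(1, int(len(scores_a) * sample_ratio))
    wins_a = wins_b = 0
    for _ in range(num_samples):
        # Draw the same indices for both systems so the comparison is paired.
        idx = [random.randrange(len(scores_a)) for _ in range(n)]
        mean_a = sum(scores_a[i] for i in idx) / n
        mean_b = sum(scores_b[i] for i in idx) / n
        if mean_a > mean_b:
            wins_a += 1
        elif mean_b > mean_a:
            wins_b += 1
    # Fraction of resamples each system wins; the losing fraction serves as a
    # bootstrap p-value for the comparison.
    return wins_a / num_samples, wins_b / num_samples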