def normalized_decomposition(self, decomposition: Decomposition, verbose: bool = False) -> Decomposition:
    norm_g = self.normalize_graph(graph=decomposition.to_graph(), verbose=verbose)
    return Decomposition.from_graph(graph=norm_g)
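# Usage sketch (illustrative, assuming a `pred` Decomposition constructed
# elsewhere): `normalized_decomposition` is a convenience wrapper that
# round-trips through the graph representation, so these two forms should
# be equivalent:
#
#     scorer = NormalizedGraphMatchScorer()
#     norm = scorer.normalized_decomposition(pred)
#     # ...is the same as...
#     norm = Decomposition.from_graph(graph=scorer.normalize_graph(graph=pred.to_graph()))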
def evaluate(ids, questions, decompositions, golds, metadata,
             output_path_base, metrics=None):
    """Score predicted decompositions against golds.

    Computes the requested subset of metrics (all of them when `metrics` is
    None), prints per-example and aggregate scores, optionally writes the
    evaluation output and mean scores under `output_path_base`, and, when
    `metadata` is given, also reports mean scores broken down by source
    dataset and by number of gold decomposition steps.
    """
    decompositions_str = [d.to_string() for d in decompositions]
    golds_str = [g.to_string() for g in golds]

    # calculating exact match scores
    exact_match = get_exact_match(decompositions_str, golds_str) \
        if (metrics is None) or 'exact_match' in metrics else None

    # evaluate using SARI
    sari = get_sari_score(decompositions_str, golds_str, questions) \
        if (metrics is None) or 'sari' in metrics else None

    # evaluate using sequence matcher
    match_ratio = get_match_ratio(decompositions_str, golds_str) \
        if (metrics is None) or 'match' in metrics else None
    structural_match_ratio = get_structural_match_ratio(decompositions_str, golds_str) \
        if (metrics is None) or 'structural_match' in metrics else None

    # evaluate using graph distances
    graph_scorer = GraphMatchScorer()
    decomposition_graphs = [d.to_graph() for d in decompositions]
    gold_graphs = [g.to_graph() for g in golds]

    ged_scores = graph_scorer.get_edit_distance_match_scores(decomposition_graphs, gold_graphs)
    # structural_ged_scores = graph_scorer.get_edit_distance_match_scores(decomposition_graphs, gold_graphs,
    #                                                                     structure_only=True)
    # ged_plus_scores = get_ged_plus_scores(decomposition_graphs, gold_graphs,
    #                                       exclude_thr=5, num_processes=num_processes)

    # calculate normalized match scores
    normalize_scorer = NormalizedGraphMatchScorer()

    def try_invoke(func, graph, default=None):
        # normalization is best-effort: fall back to `default` on any failure
        try:
            return func(graph)
        except Exception:
            return default

    decomposition_norm_graphs = [try_invoke(normalize_scorer.normalize_graph, g, default=g)
                                 for g in decomposition_graphs]
    decomposition_norm_str = [try_invoke(lambda x: Decomposition.from_graph(x).to_string(), g)
                              for g in decomposition_norm_graphs]
    gold_norm_graphs = [try_invoke(normalize_scorer.normalize_graph, g, default=g)
                        for g in gold_graphs]
    gold_norm_str = [try_invoke(lambda x: Decomposition.from_graph(x).to_string(), g)
                     for g in gold_norm_graphs]

    normalized_exact_match = skip_none(get_exact_match, decomposition_norm_str, gold_norm_str) \
        if (metrics is None) or 'normalized_exact_match' in metrics else None
    normalized_sari = skip_none(get_sari_score, decomposition_norm_str, gold_norm_str, questions) \
        if (metrics is None) or 'normalized_sari' in metrics else None
    normalized_match_ratio = skip_none(get_match_ratio, decomposition_norm_str, gold_norm_str) \
        if (metrics is None) or 'normalized_match' in metrics else None
    normalized_structural_match_ratio = skip_none(get_structural_match_ratio, decomposition_norm_str, gold_norm_str) \
        if (metrics is None) or 'normalized_structural_match' in metrics else None

    evaluation_dict = {
        "id": ids,
        "question": questions,
        "gold": golds_str,
        "prediction": decompositions_str,
        "exact_match": exact_match,
        "match": match_ratio,
        "structural_match": structural_match_ratio,
        "sari": sari,
        "ged": ged_scores,
        # "structural_ged": structural_ged_scores,
        # "ged_plus": ged_plus_scores,
        "normalized_exact_match": normalized_exact_match,
        "normalized_match": normalized_match_ratio,
        "normalized_structural_match": normalized_structural_match_ratio,
        "normalized_sari": normalized_sari,
    }
    # drop metrics that were not computed
    evaluation_dict = {k: v for k, v in evaluation_dict.items() if v is not None}
    num_examples = len(questions)
    print_first_example_scores(evaluation_dict, min(5, num_examples))
    mean_scores = print_score_stats(evaluation_dict)

    if output_path_base:
        write_evaluation_output(output_path_base, num_examples, **evaluation_dict)
        # addition: also write the mean scores json
        write_evaluation_results(mean_scores, output_path_base)

    if metadata is not None:
        # metadata = metadata[metadata["question_text"].isin(evaluation_dict["question"])]
        metadata = metadata[metadata["question_id"].isin(evaluation_dict["id"])]
        metadata["dataset"] = metadata["question_id"].apply(lambda x: x.split("_")[0])
        metadata["num_steps"] = metadata["decomposition"].apply(lambda x: len(x.split(";")))
        score_keys = [key for key in evaluation_dict
                      if key not in ["id", "question", "gold", "prediction"]]
        for key in score_keys:
            metadata[key] = evaluation_dict[key]

        # mean scores grouped by source dataset and by gold decomposition length
        for agg_field in ["dataset", "num_steps"]:
            df = metadata[[agg_field] + score_keys].groupby(agg_field).agg("mean")
            print(df.round(decimals=3))

    return mean_scores
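# Usage sketch (the parsing helper below is hypothetical, not part of this
# module): `evaluate` takes parallel lists of ids/questions and Decomposition
# objects, plus an optional pandas DataFrame with "question_id",
# "question_text" and "decomposition" columns for the per-dataset /
# per-length breakdowns.
#
#     import pandas as pd
#
#     metadata = pd.read_csv("dev.csv")  # question_id, question_text, decomposition, ...
#     golds = [parse_decomposition(s) for s in metadata["decomposition"]]  # hypothetical parser
#     preds = [parse_decomposition(s) for s in predictions]                # hypothetical parser
#     mean_scores = evaluate(ids=metadata["question_id"].tolist(),
#                            questions=metadata["question_text"].tolist(),
#                            decompositions=preds,
#                            golds=golds,
#                            metadata=metadata,
#                            output_path_base="out/eval",
#                            metrics=["exact_match", "normalized_exact_match"])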