def normalized_decomposition(self,
                             decomposition: Decomposition,
                             verbose: bool = False) -> Decomposition:
    """Return a Decomposition rebuilt from the normalized form of its graph."""
    norm_g = self.normalize_graph(graph=decomposition.to_graph(),
                                  verbose=verbose)
    return Decomposition.from_graph(graph=norm_g)


def evaluate(ids, questions, decompositions, golds, metadata,
             output_path_base,
             metrics=None):
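    """Evaluate predicted decompositions against gold decompositions.

    Args (parallel lists, one entry per example):
        ids: example identifiers.
        questions: source question strings.
        decompositions: predicted Decomposition objects.
        golds: gold Decomposition objects.
        metadata: optional pandas DataFrame with (at least) 'question_id' and
            'decomposition' columns, used for per-dataset/per-length breakdowns.
        output_path_base: if truthy, evaluation outputs are written under this path.
        metrics: optional collection of metric names; None enables all metrics.

    Returns:
        A dict mapping metric names to their mean scores.
    """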
    decompositions_str = [d.to_string() for d in decompositions]
    golds_str = [g.to_string() for g in golds]

    # calculate exact match scores
    exact_match = get_exact_match(decompositions_str, golds_str) \
        if (metrics is None) or 'exact_match' in metrics else None

    # evaluate using SARI
    sari = get_sari_score(decompositions_str, golds_str, questions) \
        if (metrics is None) or 'sari' in metrics else None

    # evaluate using sequence matcher
    match_ratio = get_match_ratio(decompositions_str, golds_str) \
        if (metrics is None) or 'match' in metrics else None
    structural_match_ratio = get_structural_match_ratio(decompositions_str, golds_str) \
        if (metrics is None) or 'structural_match' in metrics else None

    # evaluate using graph distances
    graph_scorer = GraphMatchScorer()
    decomposition_graphs = [d.to_graph() for d in decompositions]
    gold_graphs = [g.to_graph() for g in golds]

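    # note: unlike the metrics above, GED is computed unconditionally (not gated by `metrics`)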
    ged_scores = graph_scorer.get_edit_distance_match_scores(decomposition_graphs, gold_graphs)

    # structural_ged_scores = graph_scorer.get_edit_distance_match_scores(decomposition_graphs, gold_graphs,
    #                                                                     structure_only=True)
    # ged_plus_scores = get_ged_plus_scores(decomposition_graphs, gold_graphs,
    #                                       exclude_thr=5, num_processes=num_processes)

    # calculate normalized match scores
    normalize_scorer = NormalizedGraphMatchScorer()

    def try_invoke(func, graph, default=None):
        """Call `func(graph)`, returning `default` if it raises (e.g. on a malformed graph)."""
        try:
            return func(graph)
        except Exception:
            return default

    decomposition_norm_graphs = [try_invoke(normalize_scorer.normalize_graph, g, default=g)
                                 for g in decomposition_graphs]
    decomposition_norm_str = [try_invoke(lambda x: Decomposition.from_graph(x).to_string(), g)
                              for g in decomposition_norm_graphs]
    gold_norm_graphs = [try_invoke(normalize_scorer.normalize_graph, g, default=g) for g in gold_graphs]
    gold_norm_str = [try_invoke(lambda x: Decomposition.from_graph(x).to_string(), g) for g in gold_norm_graphs]

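    # skip_none (defined elsewhere in this module) is assumed to drop prediction/gold
    # pairs whose normalized string is None (normalization failed) before scoring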
    normalized_exact_match = skip_none(get_exact_match, decomposition_norm_str, gold_norm_str) \
        if (metrics is None) or 'normalized_exact_match' in metrics else None
    normalized_sari = skip_none(get_sari_score, decomposition_norm_str, gold_norm_str, questions) \
        if (metrics is None) or 'normalized_sari' in metrics else None
    normalized_match_ratio = skip_none(get_match_ratio, decomposition_norm_str, gold_norm_str) \
        if (metrics is None) or 'normalized_match' in metrics else None
    normalized_structural_match_ratio = skip_none(get_structural_match_ratio, decomposition_norm_str, gold_norm_str) \
        if (metrics is None) or 'normalized_structural_match' in metrics else None

    evaluation_dict = {
        "id": ids,
        "question": questions,
        "gold": golds_str,
        "prediction": decompositions_str,
        "exact_match": exact_match,
        "match": match_ratio,
        "structural_match": structural_match_ratio,
        "sari": sari,
        "ged": ged_scores,
        # "structural_ged": structural_ged_scores,
        # "ged_plus": ged_plus_scores,

        "normalized_exact_match": normalized_exact_match,
        "normalized_match": normalized_match_ratio,
        "normalized_structural_match": normalized_structural_match_ratio,
        "normalized_sari": normalized_sari,
    }
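    # drop metrics that were disabled (left as None) so they are not reported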
    evaluation_dict = {k: v for k, v in evaluation_dict.items() if v is not None}
    num_examples = len(questions)
    print_first_example_scores(evaluation_dict, min(5, num_examples))
    mean_scores = print_score_stats(evaluation_dict)

    if output_path_base:
        write_evaluation_output(output_path_base, num_examples, **evaluation_dict)
        # also write the aggregated mean scores as JSON
        write_evaluation_results(mean_scores, output_path_base)

    if metadata is not None:
        #metadata = metadata[metadata["question_text"].isin(evaluation_dict["question"])]
        # filter to evaluated examples; copy to avoid mutating the caller's frame
        metadata = metadata[metadata['question_id'].isin(evaluation_dict['id'])].copy()
        metadata["dataset"] = metadata["question_id"].apply(lambda x: x.split("_")[0])
        metadata["num_steps"] = metadata["decomposition"].apply(lambda x: len(x.split(";")))
        score_keys = [key for key in evaluation_dict if key not in ["id", "question", "gold", "prediction"]]
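        # assumes the filtered metadata has one row per evaluated example, in the same order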
        for key in score_keys:
            metadata[key] = evaluation_dict[key]

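        # report mean scores broken down by source dataset and by number of gold steps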
        for agg_field in ["dataset", "num_steps"]:
            df = metadata[[agg_field] + score_keys].groupby(agg_field).agg("mean")
            print(df.round(decimals=3))

    return mean_scores
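

# Illustrative usage sketch (an assumption, not part of the original module):
# `load_examples` below is a hypothetical helper returning parallel lists of ids,
# question strings, and predicted/gold Decomposition objects; only `evaluate`
# itself is taken from this file.
#
#   ids, questions, preds, golds = load_examples("dev_predictions.json")
#   mean_scores = evaluate(ids, questions, preds, golds, metadata=None,
#                          output_path_base=None,
#                          metrics=["exact_match", "normalized_exact_match"])
#   print(mean_scores)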