from itertools import combinations
from typing import Dict, List, Tuple

import pandas as pd
from tqdm import tqdm

# Metrics, BinaryClassificationMetrics, Role, Argument, Response and the helpers used
# below (get_predicate_idx_label, get_n_predicates, get_sent_map, yield_paired_predicates,
# build_all_qa_pairs, fill_answer, find_matches, align_by_argument, count_arguments,
# count_labeled_arg_matches) are assumed to be defined or imported elsewhere in this package.


def evaluate_inter_generator_agreement(annot_df: pd.DataFrame, verbose: bool = False) -> float:
    cols = ['qasrl_id', get_predicate_idx_label(annot_df)]
    # number of distinct generators (workers) per predicate (currently unused)
    n_gen = annot_df.groupby(cols).worker_id.transform(pd.Series.nunique)
    workers = annot_df.worker_id.unique().tolist()
    n_workers = len(workers)
    annot_df = annot_df.copy()
    n_predicates = annot_df[cols].drop_duplicates().shape[0]
    if verbose:
        print("n_workers: ", n_workers)
        print("n_predicates: ", n_predicates)
        print("metric\tworker_1\tworker_2\tprec\trecall\tf1")
    total_arg_metric = Metrics.empty()
    total_larg_metric = Metrics.empty()
    total_role_metric = Metrics.empty()
    total_nomIdent_metric: BinaryClassificationMetrics = BinaryClassificationMetrics.empty()
    # accumulate agreement over every unordered pair of workers (generators)
    for w1, w2 in combinations(workers, r=2):
        w1_df = annot_df[annot_df.worker_id == w1].copy()
        w2_df = annot_df[annot_df.worker_id == w2].copy()
        # compute agreement measures
        arg_metrics, labeled_arg_metrics, role_metrics, nom_ident_metrics, _ = \
            eval_datasets(w1_df, w2_df)
        if verbose:
            print(f"\nComparing {w1} to {w2}: [p,r,f1]")
            merged_df = pd.merge(w1_df, w2_df, on='key')
            print(f"Number of shared predicates: {get_n_predicates(merged_df)}")
            print(f"ARG:\t{arg_metrics}")
            print(f"Labeled ARG:\t{labeled_arg_metrics}")
            print(f"ROLE:\t{role_metrics}")
            print(f"NOM_IDENT:\t{w1}\t{w2}\t{nom_ident_metrics.prec():.3f}\t"
                  f"{nom_ident_metrics.recall():.3f}\t{nom_ident_metrics.f1():.3f}")
            print(f"NOM_IDENT accuracy: {nom_ident_metrics.accuracy():.3f}, "
                  f"{int(nom_ident_metrics.errors())} mismatches out of "
                  f"{nom_ident_metrics.instances()} predicates.")
        total_arg_metric += arg_metrics
        total_larg_metric += labeled_arg_metrics
        total_role_metric += role_metrics
        total_nomIdent_metric += nom_ident_metrics
    print("\nOverall pairwise agreement:")
    print(f"arg-f1 \t {total_arg_metric.f1():.4f}")
    print(f"labeled-arg-f1 \t {total_larg_metric.f1():.4f}")
    print(f"role-f1 \t {total_role_metric.f1():.4f}")
    print(f"is-verbal-accuracy \t {total_nomIdent_metric.accuracy():.4f} "
          f"for {total_nomIdent_metric.instances()} pairwise comparisons.")
    return total_arg_metric.f1()
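# Minimal usage sketch for the agreement entry point above (not part of the original
# module). The CSV path and pd.read_csv call are hypothetical placeholders for however
# annotations are stored in this project; the loaded DataFrame is assumed to follow the
# schema that eval_datasets / yield_paired_predicates expect (worker_id, qasrl_id,
# the predicate-index column, etc.).
def _example_inter_generator_agreement_run(annotation_csv_path: str) -> float:
    """Illustrative only: load an annotation file and report pairwise generator agreement."""
    annot_df = pd.read_csv(annotation_csv_path)  # hypothetical storage format
    return evaluate_inter_generator_agreement(annot_df, verbose=True)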
def eval_labeled_arguments(grt_roles: List[Role], sys_roles: List[Role],
                           sys_to_grt: Dict[Argument, Argument]) -> Metrics:
    """ LA metric - Labeled Argument match - spans overlap and questions are equivalent. """
    tp_arg_count = count_labeled_arg_matches(grt_roles, sys_roles, sys_to_grt)
    fp_arg_count = count_arguments(sys_roles) - tp_arg_count
    fn_arg_count = count_arguments(grt_roles) - tp_arg_count
    return Metrics(tp_arg_count, fp_arg_count, fn_arg_count)
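# Illustrative sketch only - this is NOT the project's Metrics class, which is defined
# elsewhere in the package. Based on how Metrics is used in this file, it is assumed to
# hold tp/fp/fn counts, support empty construction and addition (micro-averaging across
# predicates and worker pairs), and expose prec/recall/f1.
from dataclasses import dataclass


@dataclass
class _MetricsSketch:
    true_positive: int = 0
    false_positive: int = 0
    false_negative: int = 0

    @classmethod
    def empty(cls) -> "_MetricsSketch":
        return cls(0, 0, 0)

    def __add__(self, other: "_MetricsSketch") -> "_MetricsSketch":
        # element-wise sum of counts, i.e. micro-averaged accumulation
        return _MetricsSketch(self.true_positive + other.true_positive,
                              self.false_positive + other.false_positive,
                              self.false_negative + other.false_negative)

    def prec(self) -> float:
        denom = self.true_positive + self.false_positive
        return self.true_positive / denom if denom else 0.0

    def recall(self) -> float:
        denom = self.true_positive + self.false_negative
        return self.true_positive / denom if denom else 0.0

    def f1(self) -> float:
        p, r = self.prec(), self.recall()
        return 2 * p * r / (p + r) if (p + r) else 0.0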
def eval_datasets(sys_df, grt_df, sent_map=None) \
        -> Tuple[Metrics, Metrics, Metrics, BinaryClassificationMetrics, pd.DataFrame]:
    if not sent_map:
        annot_df = pd.concat([
            sys_df[['qasrl_id', 'sentence']],
            grt_df[['qasrl_id', 'sentence']]
        ])
        sent_map = get_sent_map(annot_df)
    arg_metrics = Metrics.empty()
    labeled_arg_metrics = Metrics.empty()
    role_metrics = Metrics.empty()
    is_nom_counts = BinaryClassificationMetrics.empty()
    all_matchings = []
    for key, sys_response, grt_response in tqdm(yield_paired_predicates(sys_df, grt_df),
                                                leave=False):
        qasrl_id, target_idx = key
        tokens = sent_map[qasrl_id]
        # evaluate a single predicate: argument, labeled-argument, role and is-verbal metrics
        local_arg_metric, local_labeled_arg_metric, local_role_metric, local_is_nom_metric, sys_to_grt = \
            evaluate_response(sys_response, grt_response)
        arg_metrics += local_arg_metric
        labeled_arg_metrics += local_labeled_arg_metric
        role_metrics += local_role_metric
        is_nom_counts += local_is_nom_metric
        # collect the aligned (and unaligned) QA pairs of this predicate for error analysis
        all_args = build_all_qa_pairs(sys_response.roles, grt_response.roles, sys_to_grt)
        all_args['qasrl_id'] = qasrl_id
        all_args['target_idx'] = target_idx
        all_args['grt_arg_text'] = all_args.grt_arg.apply(fill_answer, tokens=tokens)
        all_args['sys_arg_text'] = all_args.sys_arg.apply(fill_answer, tokens=tokens)
        all_matchings.append(all_args)
    # when all_matchings is empty, return an empty DataFrame
    if not all_matchings:
        all_matchings = pd.DataFrame()
    else:
        all_matchings = pd.concat(all_matchings)
        all_matchings = all_matchings[[
            'grt_arg_text', 'sys_arg_text', 'grt_role', 'sys_role',
            'grt_arg', 'sys_arg', 'qasrl_id', 'target_idx'
        ]]
    return arg_metrics, labeled_arg_metrics, role_metrics, is_nom_counts, all_matchings
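# Usage sketch (illustrative only, not part of the original module): comparing a system
# prediction file against a ground-truth file with eval_datasets. The file paths and
# pd.read_csv calls are hypothetical placeholders for however system / gold annotations
# are serialized in this project.
def _example_system_vs_gold_run(sys_path: str, grt_path: str) -> pd.DataFrame:
    sys_df = pd.read_csv(sys_path)  # hypothetical system-output file
    grt_df = pd.read_csv(grt_path)  # hypothetical ground-truth file
    arg, labeled_arg, role, is_verbal, matchings = eval_datasets(sys_df, grt_df)
    print(f"arg-f1 \t {arg.f1():.4f}")
    print(f"labeled-arg-f1 \t {labeled_arg.f1():.4f}")
    print(f"role-f1 \t {role.f1():.4f}")
    print(f"is-verbal-accuracy \t {is_verbal.accuracy():.4f}")
    return matchings  # per-argument alignment table, useful for error analysis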
def evaluate_response(sys_response: Response, grt_response: Response):
    sys_roles: List[Role] = sys_response.roles
    grt_roles: List[Role] = grt_response.roles
    sys_to_grt = find_matches(sys_response.all_args(), grt_response.all_args())
    is_nom_metrics = BinaryClassificationMetrics.simple_boolean_decision(sys_response.is_verbal,
                                                                         grt_response.is_verbal)
    # todo decide on evaluation of roles where is_verbal mismatch - should the roles be
    #  included in the role_count metric?
    # Currently excluding these mismatches from the arg & roles metrics
    if is_nom_metrics.errors() == 0:
        arg_metrics = eval_arguments(grt_roles, sys_roles, sys_to_grt)
        labeled_arg_metrics = eval_labeled_arguments(grt_roles, sys_roles, sys_to_grt)
        role_metrics = eval_roles(grt_roles, sys_roles, sys_to_grt)
    else:
        arg_metrics = Metrics.empty()
        labeled_arg_metrics = Metrics.empty()
        role_metrics = Metrics.empty()
    return arg_metrics, labeled_arg_metrics, role_metrics, is_nom_metrics, sys_to_grt
def eval_roles(grt_roles: List[Role], sys_roles: List[Role],
               sys_to_grt: Dict[Argument, Argument]) -> Metrics:
    alignment = align_by_argument(grt_roles, sys_roles, sys_to_grt)
    tp, fp, fn = 0, 0, 0
    # a ground-truth role counts as a true positive iff it aligns to exactly one system role
    for grt_role in grt_roles:
        if alignment.has_single_alignment(grt_role, is_grt=True):
            tp += 1
        else:
            fn += 1
    # a system role without a single aligned ground-truth role is a false positive
    for sys_role in sys_roles:
        if not alignment.has_single_alignment(sys_role, is_grt=False):
            fp += 1
    return Metrics(tp, fp, fn)
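# Worked example of the role counting above (illustrative numbers only): suppose an
# annotation pair shares a predicate with 3 ground-truth roles and 4 system roles, and
# the argument-based alignment pairs up exactly 2 roles one-to-one. Then tp = 2 (aligned
# grt roles), fn = 3 - 2 = 1 (unaligned grt roles), fp = 4 - 2 = 2 (unaligned sys roles),
# giving precision 2/4 = 0.5, recall 2/3 ≈ 0.667 and F1 ≈ 0.571.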
def eval_arguments(grt_roles: List[Role], sys_roles: List[Role],
                   sys_to_grt: Dict[Argument, Argument]) -> Metrics:
    """ Unlabeled argument match - spans overlap, regardless of question equivalence. """
    # every system argument matched to a ground-truth argument is a true positive
    tp_arg_count = len(sys_to_grt)
    fp_arg_count = count_arguments(sys_roles) - tp_arg_count
    fn_arg_count = count_arguments(grt_roles) - tp_arg_count
    return Metrics(tp_arg_count, fp_arg_count, fn_arg_count)