Example #1
def evaluate_inter_generator_agreement(annot_df: pd.DataFrame,
                                       verbose: bool = False) -> float:
    """ Compute pairwise inter-generator agreement over every pair of workers.

    Returns the overall (micro-averaged) unlabeled-argument F1 across all worker pairs. """
    annot_df = annot_df.copy()
    cols = ['qasrl_id', get_predicate_idx_label(annot_df)]
    workers = annot_df.worker_id.unique().tolist()
    n_workers = len(workers)
    n_predicates = annot_df[cols].drop_duplicates().shape[0]
    if verbose:
        print("n_workers: ", n_workers)
        print("n_predicates: ", n_predicates)
        print("metric\tworker_1\tworker_2\tprec\trecall\tf1")

    total_arg_metric = Metrics.empty()
    total_larg_metric = Metrics.empty()
    total_role_metric = Metrics.empty()
    total_nomIdent_metric: BinaryClassificationMetrics = BinaryClassificationMetrics.empty()
    for w1, w2 in combinations(workers, r=2):
        w1_df = annot_df[annot_df.worker_id == w1].copy()
        w2_df = annot_df[annot_df.worker_id == w2].copy()
        # compute agreement measures
        arg_metrics, labeled_arg_metrics, role_metrics, nom_ident_metrics, _ = \
            eval_datasets(w1_df, w2_df)
        if verbose:
            print(f"\nComparing  {w1}   to   {w2}:   [p,r,f1]")
            merged_df = pd.merge(w1_df, w2_df, on='key')
            print(
                f"Number of shared predicates: {get_n_predicates(merged_df)}")
            print(f"ARG:\t{arg_metrics}")
            print(f"Labeled ARG:\t{labeled_arg_metrics}")
            print(f"ROLE:\t{role_metrics}")
            print(
                f"NOM_IDENT:\t{w1}\t{w2}\t{nom_ident_metrics.prec():.3f}\t{nom_ident_metrics.recall():.3f}\t{nom_ident_metrics.f1():.3f}"
            )
            print(
                f"NOM_IDENT accuracy: {nom_ident_metrics.accuracy():.3f}, {int(nom_ident_metrics.errors())} mismatches out of {nom_ident_metrics.instances()} predicates."
            )
        total_arg_metric += arg_metrics
        total_larg_metric += labeled_arg_metrics
        total_role_metric += role_metrics
        total_nomIdent_metric += nom_ident_metrics

    print(f"\nOverall pairwise agreement:")
    print(f"arg-f1 \t {total_arg_metric.f1():.4f}")
    print(f"labeled-arg-f1 \t {total_larg_metric.f1():.4f}")
    print(f"role-f1 \t {total_role_metric.f1():.4f}")
    print(
        f"is-verbal-accuracy \t {total_nomIdent_metric.accuracy():.4f}    for {total_nomIdent_metric.instances()} pairwise comparisons."
    )
    return total_arg_metric.f1()
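
A note on the Metrics type accumulated above: its implementation is not shown on this page. Purely as an illustration, a minimal sketch consistent with how it is used in these examples (constructed from tp/fp/fn counts, combined with +=, reporting prec(), recall() and f1()) could look like the code below; the project's real class may differ.

from dataclasses import dataclass

@dataclass
class Metrics:
    # Illustrative sketch only - not the project's actual Metrics class.
    true_positive: int = 0
    false_positive: int = 0
    false_negative: int = 0

    @classmethod
    def empty(cls) -> "Metrics":
        return cls(0, 0, 0)

    def __add__(self, other: "Metrics") -> "Metrics":
        return Metrics(self.true_positive + other.true_positive,
                       self.false_positive + other.false_positive,
                       self.false_negative + other.false_negative)

    def prec(self) -> float:
        denom = self.true_positive + self.false_positive
        return self.true_positive / denom if denom else 0.0

    def recall(self) -> float:
        denom = self.true_positive + self.false_negative
        return self.true_positive / denom if denom else 0.0

    def f1(self) -> float:
        p, r = self.prec(), self.recall()
        return 2 * p * r / (p + r) if (p + r) else 0.0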
Example #2
def eval_labeled_arguments(grt_roles: List[Role], sys_roles: List[Role],
                           sys_to_grt: Dict[Argument, Argument]) -> Metrics:
    """ LA metric - Labeled Argument match - spans overlap and questions are equivalent. """
    tp_arg_count = count_labeled_arg_matches(grt_roles, sys_roles, sys_to_grt)
    fp_arg_count = count_arguments(sys_roles) - tp_arg_count
    fn_arg_count = count_arguments(grt_roles) - tp_arg_count
    return Metrics(tp_arg_count, fp_arg_count, fn_arg_count)
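
To make the counting concrete, a hypothetical tally: suppose the system produced 5 labeled arguments for a predicate, 3 of which overlap a ground-truth span and carry an equivalent question, while the ground truth holds 4 arguments. The counts and resulting scores would be:

# Hypothetical counts, for illustration only:
tp = 3                       # labeled argument matches
fp = 5 - tp                  # 2 system arguments without a labeled match
fn = 4 - tp                  # 1 ground-truth argument left unmatched
precision = tp / (tp + fp)   # 0.6
recall = tp / (tp + fn)      # 0.75
f1 = 2 * precision * recall / (precision + recall)   # ~0.667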
Example #3
def eval_datasets(sys_df, grt_df, sent_map=None) \
        -> Tuple[Metrics, Metrics, Metrics, BinaryClassificationMetrics, pd.DataFrame]:
    """ Evaluate system annotations against ground-truth annotations, predicate by predicate.

    Returns aggregated argument, labeled-argument and role metrics, the is-verbal
    classification counts, and a DataFrame with all argument alignments. """
    if not sent_map:
        annot_df = pd.concat([
            sys_df[['qasrl_id', 'sentence']], grt_df[['qasrl_id', 'sentence']]
        ])
        sent_map = get_sent_map(annot_df)
    arg_metrics = Metrics.empty()
    labeled_arg_metrics = Metrics.empty()
    role_metrics = Metrics.empty()
    is_nom_counts = BinaryClassificationMetrics.empty()
    all_matchings = []
    for key, sys_response, grt_response in tqdm(
            yield_paired_predicates(sys_df, grt_df), leave=False):
        qasrl_id, target_idx = key
        tokens = sent_map[qasrl_id]
        local_arg_metric, local_labeled_arg_metric, local_role_metric, local_is_nom_metric, sys_to_grt = \
            evaluate_response(sys_response, grt_response)
        arg_metrics += local_arg_metric
        labeled_arg_metrics += local_labeled_arg_metric
        role_metrics += local_role_metric
        is_nom_counts += local_is_nom_metric
        all_args = build_all_qa_pairs(sys_response.roles, grt_response.roles,
                                      sys_to_grt)
        all_args['qasrl_id'] = qasrl_id
        all_args['target_idx'] = target_idx
        all_args['grt_arg_text'] = all_args.grt_arg.apply(fill_answer,
                                                          tokens=tokens)
        all_args['sys_arg_text'] = all_args.sys_arg.apply(fill_answer,
                                                          tokens=tokens)
        all_matchings.append(all_args)

    # when all_matchings is empty, return an empty DataFrame
    if not all_matchings:
        all_matchings = pd.DataFrame()
    else:
        all_matchings = pd.concat(all_matchings)
        all_matchings = all_matchings[[
            'grt_arg_text', 'sys_arg_text', 'grt_role', 'sys_role', 'grt_arg',
            'sys_arg', 'qasrl_id', 'target_idx'
        ]]

    return arg_metrics, labeled_arg_metrics, role_metrics, is_nom_counts, all_matchings
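
A minimal usage sketch for eval_datasets, assuming two annotation files already in the column layout used above (qasrl_id, sentence, and the QA-SRL annotation fields); the file names here are hypothetical:

import pandas as pd

sys_df = pd.read_csv("system_annotations.csv")   # hypothetical path
grt_df = pd.read_csv("gold_annotations.csv")     # hypothetical path

arg_m, labeled_arg_m, role_m, is_nom_m, matchings = eval_datasets(sys_df, grt_df)
print(f"arg-f1: {arg_m.f1():.4f}\tlabeled-arg-f1: {labeled_arg_m.f1():.4f}\trole-f1: {role_m.f1():.4f}")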
Example #4
def evaluate_response(sys_response: Response,
                      grt_response: Response
                      ) -> Tuple[Metrics, Metrics, Metrics, BinaryClassificationMetrics, Dict[Argument, Argument]]:
    """ Evaluate a single system response against the ground-truth response for one predicate. """
    sys_roles: List[Role] = sys_response.roles
    grt_roles: List[Role] = grt_response.roles
    sys_to_grt = find_matches(sys_response.all_args(), grt_response.all_args())

    is_nom_metrics = BinaryClassificationMetrics.simple_boolean_decision(
        sys_response.is_verbal, grt_response.is_verbal)

    # TODO: decide how to evaluate roles when is_verbal mismatches - should the roles be included in the role_count metric?
    # Currently such mismatches are excluded from the arg & role metrics.
    if is_nom_metrics.errors() == 0:
        arg_metrics = eval_arguments(grt_roles, sys_roles, sys_to_grt)
        labeled_arg_metrics = eval_labeled_arguments(grt_roles, sys_roles, sys_to_grt)
        role_metrics = eval_roles(grt_roles, sys_roles, sys_to_grt)
    else:
        arg_metrics = Metrics.empty()
        labeled_arg_metrics = Metrics.empty()
        role_metrics = Metrics.empty()

    return arg_metrics, labeled_arg_metrics, role_metrics, is_nom_metrics, sys_to_grt
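
BinaryClassificationMetrics.simple_boolean_decision is not shown on this page. Judging only from how its result is used here (errors(), instances(), accuracy(), and accumulation with +=), a plausible sketch is a one-instance confusion-matrix counter; this is an assumption, not the project's actual implementation:

class BinaryClassificationMetrics:
    # Illustrative assumption only - the real class may differ.
    # prec(), recall() and f1(), used in Example #1, are omitted here for brevity.
    def __init__(self, tp=0, fp=0, fn=0, tn=0):
        self.tp, self.fp, self.fn, self.tn = tp, fp, fn, tn

    @classmethod
    def empty(cls) -> "BinaryClassificationMetrics":
        return cls()

    @classmethod
    def simple_boolean_decision(cls, predicted: bool, reference: bool):
        # Exactly one confusion-matrix cell is 1 for a single prediction/reference pair.
        return cls(tp=int(predicted and reference),
                   fp=int(predicted and not reference),
                   fn=int(not predicted and reference),
                   tn=int(not predicted and not reference))

    def __add__(self, other):
        return BinaryClassificationMetrics(self.tp + other.tp, self.fp + other.fp,
                                           self.fn + other.fn, self.tn + other.tn)

    def instances(self) -> int:
        return self.tp + self.fp + self.fn + self.tn

    def errors(self) -> int:
        return self.fp + self.fn

    def accuracy(self) -> float:
        n = self.instances()
        return (self.tp + self.tn) / n if n else 0.0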
Example #5
def eval_roles(grt_roles: List[Role], sys_roles: List[Role],
               sys_to_grt: Dict[Argument, Argument]) -> Metrics:
    """ Role metric - a ground-truth role counts as matched only if it aligns to a single system role;
    unmatched system roles count as false positives. """
    alignment = align_by_argument(grt_roles, sys_roles, sys_to_grt)
    tp, fp, fn = 0, 0, 0
    for grt_role in grt_roles:
        if alignment.has_single_alignment(grt_role, is_grt=True):
            tp += 1
        else:
            fn += 1
    for sys_role in sys_roles:
        if not alignment.has_single_alignment(sys_role, is_grt=False):
            fp += 1
    return Metrics(tp, fp, fn)
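
To make the asymmetric counting concrete, a hypothetical tally (not from real data): with 3 ground-truth roles, 2 of which align to exactly one system role, and 4 system roles of which only those same 2 have a single alignment, the loops above would yield:

# tp = 2 ground-truth roles with a single aligned system role
# fn = 1 remaining ground-truth role
# fp = 2 system roles without a single alignment
role_metrics = Metrics(2, 2, 1)     # Metrics(tp, fp, fn)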
Example #6
def eval_arguments(grt_roles: List[Role], sys_roles: List[Role],
                   sys_to_grt: Dict[Argument, Argument]) -> Metrics:
    """ Unlabeled argument match - spans overlap, regardless of whether the questions are equivalent. """
    tp_arg_count = len(sys_to_grt)
    fp_arg_count = count_arguments(sys_roles) - tp_arg_count
    fn_arg_count = count_arguments(grt_roles) - tp_arg_count
    return Metrics(tp_arg_count, fp_arg_count, fn_arg_count)
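
Since eval_arguments counts every aligned argument pair as a true positive, while eval_labeled_arguments (Example #2) additionally requires the questions to be equivalent, the labeled score can never exceed the unlabeled one for the same alignment. A quick hypothetical check, using the Metrics sketch from above:

# 4 span matches, of which only 2 also agree on the question (hypothetical numbers).
ua = Metrics(4, 1, 2)   # 5 system arguments, 6 ground-truth arguments
la = Metrics(2, 3, 4)   # the 2 question mismatches move from tp into fp and fn
assert la.f1() <= ua.f1()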