def evaluate(token_dir, coref_out, all_attribute_combinations,
             token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps for the next pending document.

    Reads one gold/system document pair, aligns system mentions to gold
    mentions greedily by span overlap, records mention-level scores (overall
    and per attribute combination), optionally runs script (temporal) and
    coreference evaluation, and writes diagnostic output.

    :param token_dir: Directory containing token files (token-based mode).
    :param coref_out: Coreference output path; coreference evaluation is
        prepared only when this is not None.
    :param all_attribute_combinations: Attribute combinations to score.
    :param token_offset_fields: Field indices of token offsets in token files.
    :param token_file_ext: Extension of the token files.
    :param diff_out: Open diff/diagnostic file handle, or None to skip.
    :return: True if a document was evaluated, False when no documents remain.
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (
            s_mention_lines, s_relation_lines), doc_id, system_id = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        # logger.warn is deprecated; logger.warning is the supported name.
        logger.warning(
            "[%s] does not contain gold standard mentions. Document level F score will not be valid, but the micro "
            "score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(
            token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the lines and save them as a table from id to content.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    remaining_sys_ids = set()
    num_system_mentions = 0
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)
        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        num_system_mentions += 1

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        remaining_sys_ids.add(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    if num_system_mentions != len(remaining_sys_ids):
        # A repeated id collapses in the set, so the counts diverge.
        logger.warning(
            "Duplicated mention id for doc %s, one of them is randomly removed." % doc_id)

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store list of mappings with the score as a priority queue. Score is
    # stored using negative for easy sorting (heapq is a min-heap).
    all_gold_system_mapping_scores = []

    # Debug purpose printing.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _, _) in enumerate(system_mention_table):
        if print_score_matrix:
            print("%d %s" % (system_index, sys_mention_id))
        for index, (gold_spans, gold_attributes, gold_mention_id, _, _) in enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning(
                    "Found empty span gold standard at doc : %s, mention : %s" % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning(
                    "Found empty span system at doc : %s, mention : %s" % (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                sys.stdout.write("%.1f " % overlap)

            if overlap > 0:
                # Maintaining a max heap based on overlap score.
                heapq.heappush(all_gold_system_mapping_scores,
                               (-overlap, system_index, index))
        if print_score_matrix:
            # Bug fix: the original had a bare `print` (a Python-2 statement
            # leftover), which is a no-op expression in Python 3.
            print()

    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, greedy_all_attribute_mapping = get_tp_greedy(
        all_gold_system_mapping_scores, all_attribute_combinations, gold_mention_table,
        system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # Here if you change the mapping used, you will see what's wrong on
        # different levels!
        # write_gold_and_system_mappings(doc_id, system_id, greedy_all_attribute_mapping[0], gold_mention_table,
        #                                system_mention_table, diff_out)
        write_gold_and_system_mappings(system_id, greedy_mention_only_mapping,
                                       gold_mention_table, system_mention_table, diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions and the partial scores are considered as false
    # positive.
    fp = len(remaining_sys_ids) - greedy_tp

    # Bug fix: materialize the zip so the stored scores survive repeated
    # iteration under Python 3 (zip objects are one-shot iterators).
    EvalState.doc_mention_scores.append(
        (greedy_tp, fp, list(zip(greedy_attribute_tps, attribute_based_fps)),
         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping, we currently select the mapping based on
    # mention type. This means that in order to get coreference right, your
    # mention type should also be right. This can be changed by changing
    # Config.coref_criteria settings.
    mention_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            mention_mapping = greedy_all_attribute_mapping[attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]" % (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        mention_mapping = greedy_mention_only_mapping

    if mention_mapping is None:
        # In case when we don't do attribute scoring.
        mention_mapping = greedy_mention_only_mapping

    # Evaluate how the performance of each type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    gold_directed_relations, gold_corefs = utils.parse_relation_lines(
        g_relation_lines, remaining_gold_ids)
    sys_directed_relations, sys_corefs = utils.parse_relation_lines(
        s_relation_lines, remaining_sys_ids)

    if Config.script_result_dir:
        seq_eval = TemporalEval(mention_mapping, gold_mention_table, gold_directed_relations,
                                system_mention_table, sys_directed_relations, gold_corefs, sys_corefs)

        if not Config.no_script_validation:
            if not seq_eval.validate_gold():
                logger.error("The gold edges cannot form a valid script graph.")
                utils.exit_on_fail()

            if not seq_eval.validate_sys():
                logger.error("The system edges cannot form a valid script graph.")
                utils.exit_on_fail()

        seq_eval.write_time_ml(doc_id)

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text, gold_id_2_text)

        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(
            gold_corefs, sys_corefs, gold_mention_table, system_mention_table,
            mention_mapping, MutableConfig.coref_mention_threshold)

        # If we are selecting among multiple mappings, it is easy to write in
        # our file.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'

        # Bug fix: the original leaked both file handles (never closed),
        # risking unflushed CoNLL output; `with` closes them deterministically.
        with open(Config.conll_gold_file, write_mode) as g_conll_out, \
                open(Config.conll_sys_file, write_mode) as s_conll_out:
            g_conll_out.writelines(gold_conll_lines)
            s_conll_out.writelines(sys_conll_lines)

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs,
                                         gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True