def main():
    parser = argparse.ArgumentParser(
        description="Event mention scorer, provides support for Event Nugget scoring, Event Coreference and "
                    "Event Sequencing scoring.")
    parser.add_argument("-g", "--gold", help="Gold standard file", required=True)
    parser.add_argument("-s", "--system", help="System output file", required=True)
    parser.add_argument("-d", "--comparison_output",
                        help="Compare and help show the difference between system and gold")
    parser.add_argument("-o", "--output",
                        help="Optional evaluation result redirect; put the eval result to a file")
    parser.add_argument("-c", "--coref",
                        help="Eval coreference result output; requires the reference CoNLL coref scorer "
                             "in the same folder as this scorer")
    parser.add_argument("-a", "--sequencing",
                        help="Eval event sequencing result output (After and Subevent)")
    parser.add_argument("-nv", "--no_script_validation", help="Whether to turn off script validation",
                        action="store_true")
    parser.add_argument("-t", "--token_path",
                        help="Path to the directory containing the token mapping files; only used in token mode.")
    parser.add_argument("-m", "--coref_mapping", type=int,
                        help="Which mapping will be used to perform coreference mapping.")
    parser.add_argument("-of", "--offset_field",
                        help="A pair of integers indicating which columns to read the offsets from in the "
                             "token mapping file; index starts at 0, default value is %s"
                             % Config.default_token_offset_fields)
    parser.add_argument("-te", "--token_table_extension",
                        help="Any extension appended after the docid of token table files. Default is [%s]; "
                             "only used in token mode." % Config.default_token_file_ext)
    parser.add_argument("-ct", "--coreference_threshold", type=float,
                        help="Threshold for coreference mention mapping")
    parser.add_argument("-b", "--debug", help="Turn debug mode on", action="store_true")
    # parser.add_argument("--eval_mode", choices=["char", "token"], default="char",
    #                     help="Use Span or Token mode. The Span mode will take a span as range [start:end], "
    #                          "while the Token mode considers each token as a single id.")
    parser.add_argument("-wl", "--type_white_list", type=argparse.FileType('r'),
                        help="Provide a file where each line lists a mention type and subtype pair to be "
                             "evaluated. Types that are not in this white list will be ignored.")
    parser.add_argument("-dn", "--doc_id_to_eval", help="Provide one single doc id to evaluate.")
    parser.set_defaults(debug=False)
    args = parser.parse_args()

    if args.debug:
        stream_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.debug("Entered debug mode.")
    else:
        stream_handler.setLevel(logging.INFO)
        logger.setLevel(logging.INFO)

    if args.type_white_list is not None:
        logger.info("Only the following types in the white list will be evaluated.")
        EvalState.white_listed_types = set()
        for line in args.type_white_list:
            logger.info(line.strip())
            EvalState.white_listed_types.add(canonicalize_string(line))

    if args.output is not None:
        out_path = args.output
        utils.create_parent_dir(out_path)
        mention_eval_out = open(out_path, 'w')
        logger.info("Evaluation output will be saved at %s" % out_path)
    else:
        mention_eval_out = sys.stdout
        logger.info("Evaluation output at standard out.")

    if os.path.isfile(args.gold):
        gf = open(args.gold)
    else:
        logger.error("Cannot find gold standard file at " + args.gold)
        sys.exit(1)

    if args.coref is not None:
        Config.conll_out = args.coref
        Config.conll_gold_file = args.coref + "_gold.conll"
        Config.conll_sys_file = args.coref + "_sys.conll"
        logger.info("CoNLL script output will be output at " + Config.conll_out)
        logger.info("Gold and system CoNLL files will be generated at " + Config.conll_gold_file
                    + " and " + Config.conll_sys_file)

    if args.sequencing is not None:
        Config.script_result_dir = args.sequencing
        logger.info("Temporal files will be output at " + Config.script_result_dir)
        utils.supermakedirs(Config.script_result_dir)
        logger.info("Will evaluate link type: %s." % ",".join(Config.script_types))
        for t in Config.script_types:
            utils.supermakedirs(os.path.join(Config.script_result_dir, t))
        utils.remove_file_by_extension(Config.script_result_dir, ".tml")

    if args.no_script_validation:
        Config.no_script_validation = True

    if os.path.isfile(args.system):
        sf = open(args.system)
    else:
        logger.error("Cannot find system file at " + args.system)
        sys.exit(1)

    if args.coref_mapping is not None:
        if args.coref_mapping < 4:
            Config.coref_criteria = Config.possible_coref_mapping[args.coref_mapping]
        else:
            logger.error("Possible mappings are 0: Span only; 1: Mention Type; 2: Realis; 3: Type and Realis")
            utils.terminate_with_error("Must provide a mapping between 0 and 3")
    else:
        Config.coref_criteria = Config.possible_coref_mapping[1]

    diff_out = None
    if args.comparison_output is not None:
        diff_out_path = args.comparison_output
        utils.create_parent_dir(diff_out_path)
        diff_out = open(diff_out_path, 'w')

    token_dir = "."
    if args.token_path is not None:
        MutableConfig.eval_mode = EvalMethod.Token
        logger.info("Eval mode is set to token.")
        if os.path.isdir(args.token_path):
            logger.debug("Will search token files in " + args.token_path)
            token_dir = args.token_path
        else:
            logger.debug("Cannot find given token directory at [%s], "
                         "will try searching the current directory instead." % args.token_path)
    else:
        MutableConfig.eval_mode = EvalMethod.Char

    token_offset_fields = Config.default_token_offset_fields
    if args.offset_field is not None:
        try:
            token_offset_fields = [int(x) for x in args.offset_field.split(",")]
        except ValueError:
            logger.error("Token offset argument should be two integers separated by a comma, e.g. 2,3")

    if args.coreference_threshold is not None:
        MutableConfig.coref_mention_threshold = args.coreference_threshold

    # Read all documents.
    read_all_doc(gf, sf, args.doc_id_to_eval)

    # Take all attribute combinations, which will be used to produce scores.
    attribute_comb = get_attr_combinations(Config.attribute_names)

    logger.info("Coreference mentions need to match %s before consideration" % Config.coref_criteria[0][1])

    logger.debug("Token directory is: %s" % token_dir)

    while True:
        if not evaluate(token_dir, args.coref, attribute_comb, token_offset_fields,
                        args.token_table_extension, diff_out):
            break

    # Run the CoNLL script on the combined files, which are concatenated from the best alignments of all documents.
    if args.coref is not None:
        logger.debug("Running coreference script for the final scores.")
        ConllEvaluator.run_conll_script(Config.conll_gold_file, Config.conll_sys_file, Config.conll_out)
        # Get the CoNLL scores from the output.
        EvalState.overall_coref_scores = ConllEvaluator.get_conll_scores(Config.conll_out)

    # Run the TimeML evaluation script.
    if Config.script_result_dir:
        TemporalEval.eval_time_ml()

    print_eval_results(mention_eval_out, attribute_comb)

    # Clean up, close files.
    close_if_not_none(diff_out)

    logger.info("Evaluation Done.")

    return 0
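
# Example invocations (the script and file names below are hypothetical, for illustration only):
#
#   Character (span) mode, saving the mention scores to a file:
#       python scorer.py -g gold.tbf -s system.tbf -o eval_result.txt
#
#   Token mode, additionally producing coreference and sequencing evaluation:
#       python scorer.py -g gold.tbf -s system.tbf -t token_maps/ -c coref_out -a seq_out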
def evaluate(token_dir, coref_out, all_attribute_combinations, token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps.
    :param token_dir: Directory that contains the token mapping files.
    :param coref_out: Path prefix for the CoNLL coreference output.
    :param all_attribute_combinations: All attribute combinations to be scored.
    :param token_offset_fields: Columns to read the token offsets from.
    :param token_file_ext: Extension of the token table files.
    :param diff_out: File handle for the gold/system comparison output, or None.
    :return: True if a document was evaluated, False when no documents remain.
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (s_mention_lines, s_relation_lines), doc_id, system_id \
            = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        logger.warning("[%s] does not contain gold standard mentions. Document level F score will not be valid, "
                       "but the micro score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the lines and save them as a table from id to content.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    sys_mention_ids = []
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)

        # If the parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        # if len(sys_spans) == 0:
        #     # Temporarily ignoring empty mentions.
        #     continue

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        sys_mention_ids.append(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    remaining_sys_ids = set(sys_mention_ids)
    if not len(sys_mention_ids) == len(remaining_sys_ids):
        logger.error("Duplicated mention id for doc %s" % doc_id)
        return False

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If the parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store the candidate mappings in a priority queue. Scores are negated so Python's min-heap pops the
    # highest-overlap pair first.
    all_gold_system_mapping_scores = []

    # Debug purpose printing.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _, _) in enumerate(system_mention_table):
        if print_score_matrix:
            sys.stdout.write("%d %s " % (system_index, sys_mention_id))
        for index, (gold_spans, gold_attributes, gold_mention_id, _, _) in enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning("Found empty gold standard span at doc : %s, mention : %s"
                               % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning("Found empty system span at doc : %s, mention : %s" % (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                sys.stdout.write("%.1f " % overlap)

            if overlap > 0:
                # Maintain a max heap based on the overlap score.
                heapq.heappush(all_gold_system_mapping_scores, (-overlap, system_index, index))
        if print_score_matrix:
            sys.stdout.write("\n")

    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, greedy_all_attribute_mapping = get_tp_greedy(
        all_gold_system_mapping_scores, all_attribute_combinations, gold_mention_table,
        system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # If you change the mapping used here, you will see the differences at another attribute level.
        # write_gold_and_system_mappings(doc_id, system_id, greedy_all_attribute_mapping[0], gold_mention_table,
        #                                system_mention_table, diff_out)
        write_gold_and_system_mappings(doc_id, system_id, greedy_mention_only_mapping, gold_mention_table,
                                       system_mention_table, diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions and the partial scores are counted as false positives.
    fp = len(sys_mention_ids) - greedy_tp

    EvalState.doc_mention_scores.append((greedy_tp, fp, zip(greedy_attribute_tps, attribute_based_fps),
                                         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping. We currently select the mapping based on mention type, which means that in
    # order to get coreference right, the mention type should also be right. This can be changed via the
    # Config.coref_criteria settings.
    coref_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            coref_mapping = greedy_all_attribute_mapping[attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]" % (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        coref_mapping = greedy_mention_only_mapping

    # Evaluate the performance of each type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    # Parse relations.
    g_relations = [parse_relation(l) for l in g_relation_lines]
    s_relations = [parse_relation(l) for l in s_relation_lines]

    if EvalState.white_listed_types:
        g_relations = filter_relations(g_relations, remaining_gold_ids)
        s_relations = filter_relations(s_relations, remaining_sys_ids)

    if coref_mapping is None:
        # In case we don't do attribute scoring.
        coref_mapping = greedy_mention_only_mapping

    # Evaluate After links.
    gold_afters = [after for after in g_relations if after[0] == Config.after_relation_name]
    sys_afters = [after for after in s_relations if after[0] == Config.after_relation_name]

    after_eval = TemporalEval(doc_id, coref_mapping, gold_mention_table, gold_afters,
                              system_mention_table, sys_afters)
    after_eval.write_time_ml()

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        gold_corefs = [coref for coref in g_relations if coref[0] == Config.coreference_relation_name]
        sys_corefs = [coref for coref in s_relations if coref[0] == Config.coreference_relation_name]

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text, gold_id_2_text)
        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(
            gold_corefs, sys_corefs, gold_mention_table, system_mention_table, coref_mapping,
            MutableConfig.coref_mention_threshold)

        # The write flag can only be claimed once, so the first document truncates the combined files and
        # every following document appends to them.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'

        g_conll_out = open(Config.conll_gold_file, write_mode)
        s_conll_out = open(Config.conll_sys_file, write_mode)
        g_conll_out.writelines(gold_conll_lines)
        s_conll_out.writelines(sys_conll_lines)
        g_conll_out.close()
        s_conll_out.close()

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs, gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True
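
# A minimal, self-contained sketch of the negated-score heap idiom used in evaluate() above.
# Python's heapq module implements a min-heap, so pushing (-overlap, ...) makes the smallest
# stored key correspond to the largest overlap, and candidate pairs pop best-first for the
# greedy matcher. The names below are illustrative only, not part of the scorer.
def pop_best_pairs(scored_pairs):
    """Yield (overlap, system_index, gold_index) tuples from best to worst overlap."""
    heap = [(-overlap, sys_idx, gold_idx) for overlap, sys_idx, gold_idx in scored_pairs]
    heapq.heapify(heap)
    while heap:
        neg_overlap, sys_idx, gold_idx = heapq.heappop(heap)
        yield -neg_overlap, sys_idx, gold_idx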
def evaluate(token_dir, coref_out, all_attribute_combinations, token_offset_fields, token_file_ext, diff_out):
    """
    Conduct the main evaluation steps.
    :param token_dir: Directory that contains the token mapping files.
    :param coref_out: Path prefix for the CoNLL coreference output.
    :param all_attribute_combinations: All attribute combinations to be scored.
    :param token_offset_fields: Columns to read the token offsets from.
    :param token_file_ext: Extension of the token table files.
    :param diff_out: File handle for the gold/system comparison output, or None.
    :return: True if a document was evaluated, False when no documents remain.
    """
    if EvalState.has_next_doc():
        res, (g_mention_lines, g_relation_lines), (s_mention_lines, s_relation_lines), doc_id, system_id \
            = get_next_doc()
    else:
        return False

    logger.info("Evaluating Document %s" % doc_id)

    if len(g_mention_lines) == 0:
        logger.warning("[%s] does not contain gold standard mentions. Document level F score will not be valid, "
                       "but the micro score will be fine." % doc_id)

    invisible_ids = []
    if MutableConfig.eval_mode == EvalMethod.Token:
        invisible_ids, id2token, id2span = read_token_ids(token_dir, doc_id, token_file_ext, token_offset_fields)

    # Parse the lines and save them as a table from id to content.
    system_mention_table = []
    gold_mention_table = []

    # Save the raw text for visualization.
    sys_id_2_text = {}
    gold_id_2_text = {}

    logger.debug("Reading gold and response mentions.")

    remaining_sys_ids = set()
    num_system_mentions = 0
    for sl in s_mention_lines:
        parse_result = parse_line(sl, invisible_ids)

        # If the parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        num_system_mentions += 1

        sys_attributes = parse_result[1]
        sys_mention_id = parse_result[2]
        text = parse_result[4]

        system_mention_table.append(parse_result)
        EvalState.all_possible_types.add(sys_attributes[0])
        remaining_sys_ids.add(sys_mention_id)
        sys_id_2_text[sys_mention_id] = text

    if not num_system_mentions == len(remaining_sys_ids):
        logger.warning("Duplicated mention ids found for doc %s; one of each pair is randomly removed." % doc_id)

    remaining_gold_ids = set()
    for gl in g_mention_lines:
        parse_result = parse_line(gl, invisible_ids)

        # If the parse result is rejected, we ignore this line.
        if not parse_result:
            continue

        gold_attributes = parse_result[1]
        gold_mention_id = parse_result[2]
        text = parse_result[4]

        gold_mention_table.append(parse_result)
        EvalState.all_possible_types.add(gold_attributes[0])
        gold_id_2_text[gold_mention_id] = text
        remaining_gold_ids.add(gold_mention_id)

    num_system_predictions = len(system_mention_table)
    num_gold_predictions = len(gold_mention_table)

    # Store the candidate mappings in a priority queue. Scores are negated so Python's min-heap pops the
    # highest-overlap pair first.
    all_gold_system_mapping_scores = []

    # Debug purpose printing.
    print_score_matrix = False

    logger.debug("Computing overlap scores.")
    for system_index, (sys_spans, sys_attributes, sys_mention_id, _, _) in enumerate(system_mention_table):
        if print_score_matrix:
            print("%d %s" % (system_index, sys_mention_id))
        for index, (gold_spans, gold_attributes, gold_mention_id, _, _) in enumerate(gold_mention_table):
            if len(gold_spans) == 0:
                logger.warning("Found empty gold standard span at doc : %s, mention : %s"
                               % (doc_id, gold_mention_id))
            if len(sys_spans) == 0:
                logger.warning("Found empty system span at doc : %s, mention : %s" % (doc_id, sys_mention_id))

            overlap = compute_overlap_score(gold_spans, sys_spans)

            if print_score_matrix:
                sys.stdout.write("%.1f " % overlap)

            if overlap > 0:
                # Maintain a max heap based on the overlap score.
                heapq.heappush(all_gold_system_mapping_scores, (-overlap, system_index, index))
        if print_score_matrix:
            sys.stdout.write("\n")

    greedy_tp, greedy_attribute_tps, greedy_mention_only_mapping, greedy_all_attribute_mapping = get_tp_greedy(
        all_gold_system_mapping_scores, all_attribute_combinations, gold_mention_table,
        system_mention_table, doc_id)

    write_if_provided(diff_out, Config.bod_marker + " " + doc_id + "\n")
    if diff_out is not None:
        # If you change the mapping used here, you will see the differences at another attribute level.
        # write_gold_and_system_mappings(doc_id, system_id, greedy_all_attribute_mapping[0], gold_mention_table,
        #                                system_mention_table, diff_out)
        write_gold_and_system_mappings(system_id, greedy_mention_only_mapping, gold_mention_table,
                                       system_mention_table, diff_out)

    attribute_based_fps = [0.0] * len(all_attribute_combinations)
    for attribute_comb_index, abtp in enumerate(greedy_attribute_tps):
        attribute_based_fps[attribute_comb_index] = num_system_predictions - abtp

    # Unmapped system mentions and the partial scores are counted as false positives.
    fp = len(remaining_sys_ids) - greedy_tp

    EvalState.doc_mention_scores.append((greedy_tp, fp, zip(greedy_attribute_tps, attribute_based_fps),
                                         num_gold_predictions, num_system_predictions, doc_id))

    # Select a computed mapping. We currently select the mapping based on mention type, which means that in
    # order to get coreference right, the mention type should also be right. This can be changed via the
    # Config.coref_criteria settings.
    mention_mapping = None
    type_mapping = None
    for attribute_comb_index, attribute_comb in enumerate(all_attribute_combinations):
        if attribute_comb == Config.coref_criteria:
            mention_mapping = greedy_all_attribute_mapping[attribute_comb_index]
            logger.debug("Select mapping that matches criteria [%s]" % (Config.coref_criteria[0][1]))
        if attribute_comb[0][1] == "mention_type":
            type_mapping = greedy_all_attribute_mapping[attribute_comb_index]

    if Config.coref_criteria == "span_only":
        mention_mapping = greedy_mention_only_mapping

    if mention_mapping is None:
        # In case we don't do attribute scoring.
        mention_mapping = greedy_mention_only_mapping

    # Evaluate the performance of each type.
    per_type_eval(system_mention_table, gold_mention_table, type_mapping)

    gold_directed_relations, gold_corefs = utils.parse_relation_lines(g_relation_lines, remaining_gold_ids)
    sys_directed_relations, sys_corefs = utils.parse_relation_lines(s_relation_lines, remaining_sys_ids)

    if Config.script_result_dir:
        seq_eval = TemporalEval(mention_mapping, gold_mention_table, gold_directed_relations,
                                system_mention_table, sys_directed_relations, gold_corefs, sys_corefs)

        if not Config.no_script_validation:
            if not seq_eval.validate_gold():
                logger.error("The gold edges cannot form a valid script graph.")
                utils.exit_on_fail()

            if not seq_eval.validate_sys():
                logger.error("The system edges cannot form a valid script graph.")
                utils.exit_on_fail()

        seq_eval.write_time_ml(doc_id)

    # Evaluate coreference links.
    if coref_out is not None:
        logger.debug("Start preparing coreference files.")

        # Prepare CoNLL style coreference input for this document.
        conll_converter = ConllEvaluator(doc_id, system_id, sys_id_2_text, gold_id_2_text)
        gold_conll_lines, sys_conll_lines = conll_converter.prepare_conll_lines(
            gold_corefs, sys_corefs, gold_mention_table, system_mention_table, mention_mapping,
            MutableConfig.coref_mention_threshold)

        # The write flag can only be claimed once, so the first document truncates the combined files and
        # every following document appends to them.
        write_mode = 'w' if EvalState.claim_write_flag() else 'a'

        g_conll_out = open(Config.conll_gold_file, write_mode)
        s_conll_out = open(Config.conll_sys_file, write_mode)
        g_conll_out.writelines(gold_conll_lines)
        s_conll_out.writelines(sys_conll_lines)
        g_conll_out.close()
        s_conll_out.close()

        if diff_out is not None:
            write_gold_and_system_corefs(diff_out, gold_corefs, sys_corefs, gold_id_2_text, sys_id_2_text)

    write_if_provided(diff_out, Config.eod_marker + " " + "\n")

    return True
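
# A hedged sketch of how the per-document tallies appended to EvalState.doc_mention_scores above
# would typically roll up into micro-averaged precision/recall/F1. The scorer's actual aggregation
# lives in print_eval_results (not shown in this excerpt); this helper only illustrates the
# arithmetic and is not called anywhere in the scorer.
def micro_f1_sketch(doc_mention_scores):
    """Compute micro P/R/F1 from (tp, fp, attr_scores, num_gold, num_sys, doc_id) tuples."""
    total_tp = sum(score[0] for score in doc_mention_scores)
    total_fp = sum(score[1] for score in doc_mention_scores)
    total_gold = sum(score[3] for score in doc_mention_scores)
    precision = float(total_tp) / (total_tp + total_fp) if total_tp + total_fp > 0 else 0.0
    recall = float(total_tp) / total_gold if total_gold > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1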
def main():
    parser = argparse.ArgumentParser(
        description="Event mention scorer, which conducts token based scoring; system and gold standard "
                    "files should follow the token-based format.")
    parser.add_argument("-g", "--gold", help="Gold standard file", required=True)
    parser.add_argument("-s", "--system", help="System output file", required=True)
    parser.add_argument("-d", "--comparison_output",
                        help="Compare and help show the difference between system and gold")
    parser.add_argument("-o", "--output",
                        help="Optional evaluation result redirect; put the eval result to a file")
    parser.add_argument("-c", "--coref",
                        help="Eval coreference result output; requires the reference CoNLL coref scorer "
                             "in the same folder as this scorer")
    parser.add_argument("-a", "--sequencing",
                        help="Eval event sequencing result output (After and Subevent)")
    parser.add_argument("-t", "--token_path",
                        help="Path to the directory containing the token mapping files")
    parser.add_argument("-m", "--coref_mapping", type=int,
                        help="Which mapping will be used to perform coreference mapping.")
    parser.add_argument("-of", "--offset_field",
                        help="A pair of integers indicating which columns to read the offsets from in the "
                             "token mapping file; index starts at 0, default value is %s"
                             % Config.default_token_offset_fields)
    parser.add_argument("-te", "--token_table_extension",
                        help="Any extension appended after the docid of token table files. Default is [%s]"
                             % Config.default_token_file_ext)
    parser.add_argument("-ct", "--coreference_threshold", type=float,
                        help="Threshold for coreference mention mapping")
    parser.add_argument("-b", "--debug", help="Turn debug mode on", action="store_true")
    parser.add_argument("--eval_mode", choices=["char", "token"], default="char",
                        help="Use Span Overlap or Token Overlap mode. The Span Overlap mode takes a span as "
                             "the range [start:end], while the Token Overlap mode considers each token as a "
                             "single id.")
    parser.add_argument("-wl", "--type_white_list", type=argparse.FileType('r'),
                        help="Provide a file where each line lists a mention type and subtype pair to be "
                             "evaluated. Types that are not in this white list will be ignored.")
    parser.add_argument("-dn", "--doc_id_to_eval", help="Provide one single doc id to evaluate.")
    parser.set_defaults(debug=False)
    args = parser.parse_args()

    if args.debug:
        stream_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.debug("Entered debug mode.")
    else:
        stream_handler.setLevel(logging.INFO)
        logger.setLevel(logging.INFO)

    if args.type_white_list is not None:
        logger.info("Only the following types in the white list will be evaluated.")
        EvalState.white_listed_types = set()
        for line in args.type_white_list:
            logger.info(line.strip())
            EvalState.white_listed_types.add(canonicalize_string(line))

    if args.eval_mode == "char":
        MutableConfig.eval_mode = EvalMethod.Char
    else:
        MutableConfig.eval_mode = EvalMethod.Token

    if args.output is not None:
        out_path = args.output
        utils.create_parent_dir(out_path)
        mention_eval_out = open(out_path, 'w')
        logger.info("Evaluation output will be saved at %s" % out_path)
    else:
        mention_eval_out = sys.stdout
        logger.info("Evaluation output at standard out.")

    if os.path.isfile(args.gold):
        gf = open(args.gold)
    else:
        logger.error("Cannot find gold standard file at " + args.gold)
        sys.exit(1)

    if args.coref is not None:
        Config.conll_out = args.coref
        Config.conll_gold_file = args.coref + "_gold.conll"
        Config.conll_sys_file = args.coref + "_sys.conll"
        logger.info("CoNLL script output will be output at " + Config.conll_out)
        logger.info("Gold and system CoNLL files will be generated at " + Config.conll_gold_file
                    + " and " + Config.conll_sys_file)

    # if os.path.exists(Config.conll_tmp_marker):
    #     # Clean up the directory to avoid scoring errors.
    #     remove_conll_tmp()
    # supermakedirs(Config.conll_tmp_marker)

    if args.sequencing is not None:
        Config.temporal_result_dir = args.sequencing
        utils.supermakedirs(os.path.join(Config.temporal_result_dir, Config.temporal_gold_dir))
        utils.supermakedirs(os.path.join(Config.temporal_result_dir, Config.temporal_sys_dir))

    if os.path.isfile(args.system):
        sf = open(args.system)
    else:
        logger.error("Cannot find system file at " + args.system)
        sys.exit(1)

    if args.coref_mapping is not None:
        if args.coref_mapping < 4:
            Config.coref_criteria = Config.possible_coref_mapping[args.coref_mapping]
        else:
            logger.error("Possible mappings are 0: Span only; 1: Mention Type; 2: Realis; 3: Type and Realis")
            utils.terminate_with_error("Must provide a mapping between 0 and 3")
    else:
        Config.coref_criteria = Config.possible_coref_mapping[1]

    diff_out = None
    if args.comparison_output is not None:
        diff_out_path = args.comparison_output
        utils.create_parent_dir(diff_out_path)
        diff_out = open(diff_out_path, 'w')

    token_dir = "."
    if args.token_path is not None:
        if os.path.isdir(args.token_path):
            logger.debug("Will search token files in " + args.token_path)
            token_dir = args.token_path
        else:
            logger.debug("Cannot find given token directory at [%s], "
                         "will try searching the current directory instead." % args.token_path)
    elif MutableConfig.eval_mode == EvalMethod.Token:
        # The original guard compared args.eval_mode (a string) against EvalMethod.Token and fired when a
        # token path *was* supplied; the error message indicates the intent is to require -t in token mode.
        utils.terminate_with_error("Token table (-t) must be provided in token mode")

    token_offset_fields = Config.default_token_offset_fields
    if args.offset_field is not None:
        try:
            token_offset_fields = [int(x) for x in args.offset_field.split(",")]
        except ValueError:
            logger.error("Token offset argument should be two integers separated by a comma, e.g. 2,3")

    if args.coreference_threshold is not None:
        MutableConfig.coref_mention_threshold = args.coreference_threshold

    # Read all documents.
    read_all_doc(gf, sf, args.doc_id_to_eval)

    # Take all attribute combinations, which will be used to produce scores.
    attribute_comb = get_attr_combinations(Config.attribute_names)

    logger.info("Coreference mentions need to match %s before consideration" % Config.coref_criteria[0][1])

    while True:
        if not evaluate(token_dir, args.coref, attribute_comb, token_offset_fields,
                        args.token_table_extension, diff_out):
            break

    # Run the CoNLL script on the combined files, which are concatenated from the best alignments of all documents.
    if args.coref is not None:
        logger.debug("Running coreference script for the final scores.")
        ConllEvaluator.run_conll_script(Config.conll_gold_file, Config.conll_sys_file, Config.conll_out)
        # Get the CoNLL scores from the output.
        EvalState.overall_coref_scores = ConllEvaluator.get_conll_scores(Config.conll_out)

    # Run the TimeML evaluation script.
    TemporalEval.eval_time_ml()

    print_eval_results(mention_eval_out, attribute_comb)

    # Clean up, close files.
    close_if_not_none(diff_out)

    logger.info("Evaluation Done.")
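
# The module's entry point is not shown in this excerpt; a conventional guard (an assumption here,
# not necessarily the original file's exact wording) would be:
if __name__ == "__main__":
    sys.exit(main())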