def main():
    opt_parser = cmd_line_parser()
    (options, args) = opt_parser.parse_args()
    if len(args) > 0:
        opt_parser.error("all arguments must be flagged")
    if (options.guessed is None) or (options.ref is None) or (options.db_filename is None):
        opt_parser.error("missing arguments. type --help for help.")
    if options.pid is not None and options.from_xids is not None:
        opt_parser.error("inconsistent parameters. you can't have both pid and from_xids parameters.")
    keys = [options.guessed, options.ref]
    if options.from_xids:
        xmls = ucca_db.get_by_xids(options.db_filename, options.host, keys)
    else:
        xmls = ucca_db.get_xml_trees(options.db_filename, options.host, options.pid, keys)
    guessed, ref = [convert.from_site(x) for x in xmls]
    if options.units or options.fscore or options.errors:
        evaluate(guessed, ref, units=options.units, fscore=options.fscore,
                 errors=options.errors, verbose=True)
def main(args):
    keys = [args.guessed, args.ref]
    xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \
        api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys)
    guessed, ref = [convert.from_site(x) for x in xmls]
    if args.units or args.fscore or args.errors:
        evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors,
                 constructions=args.constructions, verbose=True)
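For orientation, the core API shared by all of these snippets can be exercised directly; a minimal sketch, assuming two ucca Passage objects are already loaded (the helper name labeled_f1 is illustrative, but evaluate() and Scores.average_f1() are exactly the calls used throughout this section):

from ucca import evaluation

def labeled_f1(guessed, ref):
    # evaluate() returns a Scores object; average_f1() summarizes labeled F1.
    scores = evaluation.evaluate(guessed, ref, units=False, errors=False, verbose=False)
    return scores.average_f1()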
def main(filenames, write, **kwargs):
    uploader = TaskUploader(**kwargs)
    downloader = TaskDownloader(**kwargs)
    scores = []
    try:
        for pattern in filenames:
            matched = glob(pattern)
            if not matched:
                raise IOError("Not found: " + pattern)
            for ref in read_files_and_dirs(matched):
                print("Converting passage " + ref.ID + "... ", end="")
                task = uploader.upload_task(ref)
                guessed = downloader.download_task(task["id"], write=write, **kwargs)
                score = evaluate(guessed, ref, **kwargs)
                print("F1=%.3f" % score.average_f1())
                scores.append(score)
    except HTTPError as e:
        try:
            raise ValueError(e.response.json()) from e
        except JSONDecodeError:
            raise ValueError(e.response.text) from e
    print()
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
def main(args):
    guessed, ref = [ioutil.read_files_and_dirs((x,)) for x in (args.guessed, args.ref)]
    guessed = match_by_id(guessed, ref)
    results = []
    for g, r in zip(guessed, ref):
        if len(guessed) > 1:
            sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "..."))
            sys.stdout.flush()
            if args.verbose:
                print()
        result = evaluation.evaluate(g, r, constructions=args.constructions,
                                     units=args.units, fscore=args.fscore, errors=args.errors,
                                     verbose=args.verbose or len(guessed) == 1,
                                     normalize=args.normalize)
        if args.verbose:
            print("Average labeled F1 score: %.3f\n" % result.average_f1())
        results.append(result)
    summarize(args, results)
def main(args):
    guessed, ref, ref_yield_tags = [None if x is None else ioutil.read_files_and_dirs((x,))
                                    for x in (args.guessed, args.ref, args.ref_yield_tags)]
    if args.match_by_id:
        guessed = match_by_id(guessed, ref)
        ref_yield_tags = match_by_id(ref_yield_tags, ref)
    results = []
    eval_type = evaluation.UNLABELED if args.unlabeled else evaluation.LABELED
    verbose = args.verbose or len(guessed) == 1
    for g, r, ryt in zip(guessed, ref, ref_yield_tags or repeat(None)):
        if len(guessed) > 1:
            print("Evaluating %s%s" % (g.ID, ":" if args.verbose else "..."), end="\r", flush=True)
            if args.verbose:
                print()
        result = evaluation.evaluate(g, r, constructions=args.constructions,
                                     units=args.units, fscore=args.fscore, errors=args.errors,
                                     verbose=verbose, normalize=args.normalize, ref_yield_tags=ryt,
                                     eval_type=evaluation.UNLABELED if args.unlabeled else None)
        if verbose:
            if args.errors:
                result.print_confusion_matrix(as_table=args.as_table)
            if not args.quiet:
                print_f1(result, eval_type)
        results.append(result)
    summarize(args, results, eval_type=eval_type)
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true",
                           help="remove multiple parents to get a tree")
    args = argparser.parse_args()
    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            ref = file2passage(filename)
            try:
                guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
                scores.append(evaluate(guessed, ref, fscore=True, verbose=False,
                                       units=False, errors=False))
            except Exception as e:
                raise ValueError("Error evaluating conversion of %s" % filename) from e
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
    sys.exit(0)
def main(args):
    guessed, ref, ref_yield_tags = [None if x is None else ioutil.read_files_and_dirs((x,))
                                    for x in (args.guessed, args.ref, args.ref_yield_tags)]
    if args.match_by_id:
        guessed = match_by_id(guessed, ref)
        ref_yield_tags = match_by_id(ref_yield_tags, ref)
    results = []
    for g, r, ryt in zip(guessed, ref, ref_yield_tags or repeat(None)):
        if len(guessed) > 1:
            sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "..."))
            sys.stdout.flush()
            if args.verbose:
                print()
        result = evaluation.evaluate(g, r, constructions=args.constructions,
                                     units=args.units, fscore=args.fscore, errors=args.errors,
                                     verbose=args.verbose or len(guessed) == 1,
                                     normalize=args.normalize, ref_yield_tags=ryt,
                                     eval_type=evaluation.UNLABELED if args.unlabeled else None)
        if args.verbose:
            print_f1(result, args.unlabeled)
        results.append(result)
    summarize(args, results)
def evaluate_passage(guessed_passage, ref_passage):
    score = evaluation.evaluate(guessed_passage, ref_passage,
                                constructions=Config().args.constructions,
                                verbose=Config().args.verbose and guessed_passage is not None)
    print("F1=%.3f" % score.average_f1(), flush=True)
    return score
def evaluate_passage(guessed_passage, ref_passage):
    score = evaluation.evaluate(guessed_passage, ref_passage,
                                verbose=Config().verbose and guessed_passage is not None,
                                units=False, errors=False)
    print("F1=%.3f" % score.average_unlabeled_f1(), flush=True)
    return score
def test_evaluate_self(create, units, errors, normalize):
    p = create()
    scores = evaluate(p, p, units=units, errors=errors, normalize=normalize)
    assert 1.0 == scores.average_f1()
    for eval_type, results in sorted(scores.evaluators.items()):
        for construction, stats in results.results.items():
            assert 1.0 == stats.f1, (eval_type, construction)
            assert 1.0 == stats.p, (eval_type, construction)
            assert 1.0 == stats.r, (eval_type, construction)
    check_primary_remote(scores, 1.0)
def test_evaluate(create1, create2, f1, units, errors):
    p1 = create1()
    p2 = create2()
    validation_errors_before = [list(validate(p, linkage=False)) for p in (p1, p2)]
    scores = evaluate(p1, p2, units=units, errors=errors)
    validation_errors_after = [list(validate(p, linkage=False)) for p in (p1, p2)]
    for before, after in zip(validation_errors_before, validation_errors_after):
        if not before:
            assert not after
    check_primary_remote(scores, f1)
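check_primary_remote is called by these tests but not defined in this section; a plausible sketch of what it checks, assuming the PRIMARY and REMOTE construction keys from ucca.constructions (PRIMARY appears in the confusion-matrix snippet further below; REMOTE is an assumption):

def check_primary_remote(scores, f1):
    # Hypothetical helper: every evaluator should report the expected F1
    # for both the primary and the remote edge categories, when present.
    for eval_type, results in sorted(scores.evaluators.items()):
        for construction in (PRIMARY, REMOTE):
            stats = results.results.get(construction)
            if stats is not None:
                assert f1 == stats.f1, (eval_type, construction)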
def main(): argparser = argparse.ArgumentParser(description=desc) argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate") argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, help="input file format") argparser.add_argument("-T", "--tree", action="store_true", help="remove multiple parents to get a tree") argparser.add_argument( "-s", "--strict", action="store_true", help="stop immediately if failed to convert or evaluate a file") argparser.add_argument( "-v", "--verbose", action="store_true", help="print evaluation results for each file separately") args = argparser.parse_args() converter1 = convert.TO_FORMAT[args.format] converter2 = convert.FROM_FORMAT[args.format] scores = [] for pattern in args.filenames: filenames = glob.glob(pattern) if not filenames: raise IOError("Not found: " + pattern) for filename in filenames: sys.stdout.write("\rConverting %s" % filename) sys.stdout.flush() ref = file2passage(filename) try: guessed = next( converter2(converter1(ref, tree=args.tree), ref.ID)) scores.append(evaluate(guessed, ref, verbose=args.verbose)) except Exception as e: if args.strict: raise ValueError("Error evaluating conversion of %s" % filename) from e else: print("Error evaluating conversion of %s: %s" % (filename, e), file=sys.stderr) print() if args.verbose and len(scores) > 1: print("Aggregated scores:") Scores.aggregate(scores).print() sys.exit(0)
def train_test(self, model_type, compare=True):
    passages = [self.passage]
    scores = []
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        p = Parser(model_file="test_files/%s" % model_type, model_type=model_type)
        p.train(passages if mode == "train" else None)
        guess, ref = zip(*list(p.parse(passages)))
        print()
        self.assertSequenceEqual(ref, passages)
        score = evaluation.Scores.aggregate(
            [evaluation.evaluate(g, r, verbose=False, units=False, errors=False)
             for g, r in zip(guess, ref)])
        scores.append(score.average_f1())
    if compare:
        self.assertEqual(*scores)
    print("-- average labeled f1: %.3f, %.3f" % tuple(scores))
def main(task_ids, by_filename=False, validate=None, log=None, **kwargs): kwargs["write"] = False if by_filename: task_ids_from_file = [] for filename in task_ids: with open(filename, 'r') as f: task_ids_from_file += zip( *list(map(str.split, filter(None, map(str.strip, f))))) task_ids = task_ids_from_file else: task_ids = [[task_id] for task_id in task_ids] assert len(task_ids) == 2, "Got %d lists of task IDs instead of two" % len( task_ids) downloader = TaskDownloader(**kwargs) scores = [] validate_h = open(validate, "w", encoding="utf-8") if validate else None log_h = open(log, "w", encoding="utf-8") if log else None if log: fields = ["guessed", "ref"] + Scores.field_titles( eval_type=LABELED) + Scores.field_titles(eval_type=UNLABELED) print(*fields, file=log_h, sep="\t", flush=True) for task_id_pair in tqdm(list(zip(*task_ids)), unit=" tasks", desc="Evaluating"): passage_pair = [] for task_id in task_id_pair: passage, *_ = downloader.download_task(task_id, validate=validate_h, **kwargs) passage_pair.append(passage) score = evaluate(*passage_pair, **kwargs) if log: fields = list(task_id_pair) + score.fields( eval_type=LABELED) + score.fields(eval_type=UNLABELED) print(*fields, file=log_h, sep="\t", flush=True) scores.append(score) if validate: validate_h.close() if log: log_h.close() print() if len(scores) > 1: print("Aggregated scores:") Scores.aggregate(scores).print()
def main(args):
    guessed, ref = [ioutil.read_files_and_dirs((x,), converters=FROM_FORMAT)
                    for x in (args.guessed, args.ref)]
    if len(guessed) != len(ref):
        raise ValueError("Number of passages to compare does not match: %d != %d" %
                         (len(guessed), len(ref)))
    if len(guessed) > 1:
        guessed_by_id = {g.ID: g for g in tqdm(guessed, desc="Reading " + args.guessed, unit=" passages")}
        try:
            guessed = [guessed_by_id[p.ID] for p in tqdm(ref, desc="Reading " + args.ref, unit=" passages")]
        except KeyError as e:
            raise ValueError("Passage IDs do not match") from e
    results = [evaluate(g, r, errors=True)
               for g, r in zip(tqdm(guessed, desc="Evaluating", unit=" passages"), ref)]
    confusion_matrix = Scores.aggregate(results).evaluators[LABELED].results[PRIMARY].errors.most_common()
    label_map = {}
    for (g, r), _ in confusion_matrix:
        g, *_ = g.partition("|")
        prefix, *_ = g.partition(":")
        if not any(l.startswith(prefix) for l in label_map):  # drop suffix for most common label
            g = prefix
        if g not in label_map:
            label_map[g], *_ = r.partition("|")
    with open(args.out_file, "w", encoding="utf-8") as f:
        csv.writer(f).writerows(tqdm(sorted(label_map.items()),
                                     desc="Writing " + args.out_file, unit=" rows"))
def main(args):
    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for ref in get_passages_with_progress_bar(args.filenames, desc="Converting"):
        try:
            guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
            scores.append(evaluate(guessed, ref, verbose=args.verbose))
        except Exception as e:
            if args.strict:
                raise ValueError("Error evaluating conversion of %s" % ref.ID) from e
            else:
                with tqdm.external_write_mode():
                    print("Error evaluating conversion of %s: %s" % (ref.ID, e), file=sys.stderr)
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
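The convert-and-score round trip used in several of these mains condenses to one helper; a sketch, assuming fmt is a valid convert.CONVERTERS key and that the to/from converters follow the call pattern seen above:

def roundtrip_f1(ref, fmt, tree=False):
    # Convert the reference out to the target format, parse it back,
    # and score the reconstruction against the original passage.
    to_fmt, from_fmt = convert.TO_FORMAT[fmt], convert.FROM_FORMAT[fmt]
    guessed = next(from_fmt(to_fmt(ref, tree=tree), ref.ID))
    return evaluate(guessed, ref, verbose=False).average_f1()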
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true",
                           help="remove multiple parents to get a tree")
    args = argparser.parse_args()
    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            ref = file2passage(filename)
            guessed = next(converter2(converter1(ref), ref.ID))
            scores.append(evaluate(guessed, ref, fscore=True, verbose=True,
                                   units=False, errors=False))
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
    sys.exit(0)
def test_evaluate(create1, create2, f1, units, errors):
    scores = evaluate(create1(), create2(), units=units, errors=errors)
    check_primary_remote(scores, f1)
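The create1/create2/f1 parameters of these test functions suggest pytest parametrization over passage-factory pairs; a hypothetical wiring (make_passage, make_edited_passage, and the 0.875 expectation are placeholders, not values from the original suite):

import pytest

@pytest.mark.parametrize("units", [True, False])
@pytest.mark.parametrize("errors", [True, False])
@pytest.mark.parametrize("create1, create2, f1", [
    (make_passage, make_passage, 1.0),           # identical passages: perfect score
    (make_passage, make_edited_passage, 0.875),  # placeholder expectation
])
def test_evaluate(create1, create2, f1, units, errors):
    scores = evaluate(create1(), create2(), units=units, errors=errors)
    check_primary_remote(scores, f1)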
    sys.stdout.flush()
    guessed_by_id[g.ID] = g
ids = [p.ID for p in ref]
try:
    guessed = [guessed_by_id[i] for i in ids]
except KeyError as e:
    raise ValueError("Passage IDs do not match") from e
results = []
for g, r in zip(guessed, ref):
    if len(guessed) > 1:
        sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "..."))
        sys.stdout.flush()
        if args.verbose:
            print()
    result = evaluation.evaluate(g, r, constructions=args.constructions,
                                 units=args.units, fscore=args.fscore, errors=args.errors,
                                 verbose=args.verbose or len(guessed) == 1,
                                 normalize=args.normalize)
    if args.verbose:
        print("Average labeled F1 score: %.3f\n" % result.average_f1())
    results.append(result)
summary = evaluation.Scores.aggregate(results)
if len(results) > 1:
    if args.verbose:
        print("Aggregated scores:")
    else:
        print(end="\r")
    if not args.quiet:
        summary.print()
if not args.quiet:
    print("Average labeled F1 score: %.3f" % summary.average_f1())
elif mode == 'refinement':
    convert_refinement_to_concat(passage)
    convert_refinement_to_concat(ref)
passage_vanilla = get_vanilla_ucca(passage)
passage_snacs = get_snacs_ucca(passage)
passage_refined = get_refined_ucca(passage)
ref_vanilla = get_vanilla_ucca(ref)
ref_snacs = get_snacs_ucca(ref)
ref_refined = get_refined_ucca(ref)
# print(passage_snacs)
# print(ref_snacs)
integrated_results.append(evaluation.evaluate(passage, ref, constructions=('Non-preterm',)))
vanilla_results.append(evaluation.evaluate(passage_vanilla, ref_vanilla, constructions=('Non-preterm',)))
snacs_results.append(evaluation.evaluate(passage_snacs, ref_snacs,
                                         constructions=('Non-preterm', 'SNACS', 'hastags')))
refined_results.append(evaluation.evaluate(passage_refined, ref_refined,
def __call__(self, dataset_label, predicted_tree, gold_tree):
    score = evaluation.evaluate(predicted_tree, gold_tree)
    if dataset_label not in self.scores:
        self.scores[dataset_label] = []
    self.scores[dataset_label].append(score)
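The __call__ above reads like a per-dataset scoring callback; a minimal sketch of an enclosing class, assuming names for the class and its report method (only __call__'s logic comes from the snippet), with setdefault replacing the explicit membership test:

class EvaluationCallback:
    def __init__(self):
        self.scores = {}  # dataset label -> list of Scores

    def __call__(self, dataset_label, predicted_tree, gold_tree):
        score = evaluation.evaluate(predicted_tree, gold_tree)
        self.scores.setdefault(dataset_label, []).append(score)

    def report(self):
        # Aggregate per dataset, as the other snippets do with Scores.aggregate().
        for label, scores in sorted(self.scores.items()):
            print("%s: F1=%.3f" % (label, evaluation.Scores.aggregate(scores).average_f1()))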
from argparse import ArgumentParser

from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

################
#     MAIN     #
################

if __name__ == "__main__":
    argparser = ArgumentParser(description="Compare two UCCA passages.")
    argparser.add_argument("guessed", help="xml/pickle file name for the guessed annotation")
    argparser.add_argument("ref", help="xml/pickle file name for the reference annotation")
    argparser.add_argument("--units", "-u", dest="units", action="store_true",
                           help="the units the annotations have in common, and those each has separately")
    argparser.add_argument("--fscore", "-f", dest="fscore", action="store_true",
                           help="outputs the traditional P,R,F instead of the scene structure evaluation")
    argparser.add_argument("--errors", "-e", dest="errors", action="store_true",
                           help="prints the error distribution according to its frequency")
    args = argparser.parse_args()
    if not (args.units or args.fscore or args.errors):
        argparser.error("At least one of -u, -f or -e is required.")
    guessed, ref = [file2passage(x) for x in (args.guessed, args.ref)]
    evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors, verbose=True)
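Programmatically, the same comparison the script above performs takes only a few lines (the file paths are placeholders):

from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

guessed = file2passage("guessed.xml")  # placeholder path
ref = file2passage("ref.xml")          # placeholder path
evaluate(guessed, ref, fscore=True, verbose=True)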
ref_vanilla = get_vanilla_ucca(ref)
ref_snacs, ref_refined = get_snacs_refined_ucca(ref)
# ref_refined, edges_refined = get_refined_ucca(ref)
# with open('edges_snacs.tsv', 'a') as f:
#     for e, ts in sorted(edges_snacs, key=lambda x: str(x[0])):
#         print(name, e, ts, sep='\t', file=f)
# with open('edges_refined.tsv', 'a') as f:
#     for e, ts in sorted(edges_refined, key=lambda x: str(x[0])):
#         print(name, e, ts, sep='\t', file=f)
# print(passage_snacs)
# print(ref_snacs)
integrated_results.append(evaluation.evaluate(passage_full, ref_full,
                                              constructions=('Non-preterm', 'SNACS'), normalize=False))
vanilla_results.append(evaluation.evaluate(passage_vanilla, ref_vanilla,
                                           constructions=('Non-preterm', 'SNACS', 'has_gold_SNACS',
                                                          'has_gold_SNACS_sibling', 'has_gold_SNACS_or_sibling',
                                                          'scenes', 'scene_children', 'scenes_and_scene_children'),
                                           normalize=True))
snacs_results.append(evaluation.evaluate(passage_snacs, ref_snacs,
                                         constructions=('Non-preterm', 'SNACS', 'has_tags',), normalize=False))
refined_results.append(evaluation.evaluate(passage_refined, ref_refined,
                                           constructions=('Non-preterm', 'has_tags',), normalize=False))
print('UCCA SNACS')
integ_aggr = evaluation.Scores.aggregate(integrated_results)
integ_aggr.print()
# integ_aggr.print_confusion_matrix()
print('\n\nUCCA')
"the units the annotations have in common, and those each has separately" ) argparser.add_argument( "--fscore", "-f", dest="fscore", action="store_true", help= "outputs the traditional P,R,F instead of the scene structure evaluation" ) argparser.add_argument( "--errors", "-e", dest="errors", action="store_true", help="prints the error distribution according to its frequency") args = argparser.parse_args() if not (args.units or args.fscore or args.errors): argparser.error("At least one of -u, -f or -e is required.") guessed, ref = [file2passage(x) for x in (args.guessed, args.ref)] if args.units or args.fscore or args.errors: evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors, verbose=True)