def validate_passage(self, passage):
    """Optionally normalize a passage, then validate it.

    Returns a (passage ID, list of errors) pair; in strict mode the
    errors are also printed immediately.
    """
    if self.normalization:
        normalize(passage, extra=self.extra)
    found = list(validate(passage, linkage=self.linkage))
    if self.strict:
        print_errors(passage.ID, found)
    return passage.ID, found
def main(args):
    """Split each input passage into sentences and write one file per sentence.

    Uses a pre-built Splitter when a sentences file is given; otherwise falls
    back to automatic sentence splitting. Reports splitter sentences that
    matched no passage at the end.
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    extension = ".pickle" if args.binary else ".xml"
    index = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(passage, remarks=args.remarks, lang=args.lang,
                                        ids=map(str, count(index)) if args.enumerate else None)
        for sentence in sentences:
            index += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        unmatched = [s for n, s in enumerate(splitter.sentences) if n not in splitter.matched_indices]
        print("Unmatched sentences:", *unmatched, sep="\n")
def validate(passage, normalization=False, extra_normalization=False, ucca_validation=False, output_format=None,
             **kwargs):
    """Yield validation error messages for a passage.

    :param passage: passage to validate (optionally normalized in place first)
    :param normalization: whether to normalize the passage before validating
    :param extra_normalization: apply extra normalization rules when normalizing
    :param ucca_validation: use UCCA-specific validations instead of generic ones
    :param output_format: fallback format name used to select constraints when the
        passage itself does not declare one in passage.extra["format"]
    :raises ValueError: if no constraints are defined for the resolved format
    """
    del kwargs
    if normalization:
        normalize(passage, extra=extra_normalization)
    if ucca_validation:
        yield from ucca_validations.validate(passage)
    else:  # Generic validations depending on format-specific constraints
        # The passage's own declared format takes precedence over output_format,
        # so report the format that was actually looked up on failure --
        # previously the message always named output_format, which could be
        # wrong (or None) when passage.extra["format"] was the missing key.
        passage_format = passage.extra.get("format", output_format)
        try:
            constraints = CONSTRAINTS[passage_format]()
        except KeyError as e:
            raise ValueError("No validations defined for '%s' format" % passage_format) from e
        yield from detect_cycles(passage)
        l0 = passage.layer(layer0.LAYER_ID)
        l1 = passage.layer(layer1.LAYER_ID)
        for terminal in l0.all:
            yield from check_orphan_terminals(constraints, terminal)
            yield from check_root_terminal_children(constraints, l1, terminal)
            yield from check_multiple_incoming(constraints, terminal)
        yield from check_top_level_allowed(constraints, l1)
        for node in l1.all:
            yield from check_multigraph(constraints, node)
            yield from check_implicit_children(constraints, node)
            yield from check_multiple_incoming(constraints, node)
            yield from check_top_level_only(constraints, l1, node)
            yield from check_required_outgoing(constraints, node)
            yield from check_tag_rules(constraints, node)
def finish(self, status, display=True, write=False, accuracies=None):
    """Finalize processing of one item: flush classifier state, optionally
    create/write/verify/evaluate the output passage, and report status.

    :param status: status string to display (possibly extended with F1 info)
    :param display: whether to print the summary line
    :param write: whether to write the output passage to file(s)
    :param accuracies: optional dict to record per-passage action accuracy into
    :return: tuple of (output passage,) plus evaluation results if evaluating
    """
    self.model.classifier.finished_item(self.training)
    for model in self.models[1:]:
        model.classifier.finished_item(renew=False)  # So that dynet.renew_cg happens only once
    # Build the output passage when not training, or always when verifying
    if not self.training or self.config.args.verify:
        self.out = self.state.create_passage(verify=self.config.args.verify, format=self.out_format)
    if write:
        # Write in every requested format (falling back to the passage's own format)
        for out_format in self.config.args.formats or [self.out_format]:
            if self.config.args.normalize and out_format == "ucca":
                normalize(self.out)
            ioutil.write_passage(self.out, output_format=out_format, binary=out_format == "pickle",
                                 outdir=self.config.args.outdir, prefix=self.config.args.prefix,
                                 converter=get_output_converter(out_format), verbose=self.config.args.verbose,
                                 append=self.config.args.join, basename=self.config.args.join)
    if self.oracle and self.config.args.verify:
        self.verify(self.out, self.passage)
    ret = (self.out,)
    if self.evaluation:
        ret += (self.evaluate(self.evaluation),)
        status = "%-14s %s F1=%.3f" % (status, self.eval_type, self.f1)
    if display:
        self.config.print("%s%.3fs %s" % (self.accuracy_str, self.duration, status), level=1)
    if accuracies is not None:
        # Fraction of correct actions; 0 when no actions were taken
        accuracies[self.passage.ID] = self.correct_action_count / self.action_count if self.action_count else 0
    return ret
def submit_tasks(self, filename, log_file, **kwargs):
    """Submit every annotation/review task whose ID is listed in `filename`.

    Each task is converted to a passage, normalized and validated; only tasks
    with no validation errors are submitted. Outcomes are appended to
    `log_file` as tab-separated lines.

    :param filename: path to a file with one task ID per line
    :param log_file: path of the log file to (over)write
    """
    del kwargs
    with open(filename) as f:
        task_ids = list(f.readlines())
    # Open the log via a context manager so it is closed (and its buffer
    # flushed) even if an unexpected exception escapes the loop -- the
    # original leaked the file handle.
    with open(log_file, 'w') as log:
        for task_id in task_ids:
            try:
                task_id = task_id.strip()
                task = self.get_user_task(int(task_id))
                if task['type'] not in ['ANNOTATION', 'REVIEW']:
                    print(task_id, "NOT AN ANNOTATION/REVIEW TASK", file=log, sep="\t", flush=True)
                    continue
                try:
                    passage = next(iter(convert.from_json(task)))
                except ValueError as e:
                    raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
                # validate the task
                normalization.normalize(passage)
                validation_errors = list(validation.validate(passage, linkage=False))
                if len(validation_errors) == 0:
                    self.submit_task(**task)
                    print(task_id, "SUBMITTED", file=log, sep="\t", flush=True)
                else:
                    for error in validation_errors:
                        print(task_id, error, file=log, sep="\t", flush=True)
            except requests.exceptions.HTTPError as e:
                print(task_id, "HTTP Request Error: "+str(e), file=log, sep="\t", flush=True)
def main(args):
    """Convert passages, applying label mapping, optional normalization,
    language tagging and validation; exits with status 1 on the first
    passage that fails validation.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    passages = iter_passages(args.filenames, desc="Converting", input_format=args.input_format,
                             prefix=args.prefix, mark_aux=args.mark_aux, annotate=args.annotate,
                             wikification=args.wikification, label_map_file=args.label_map,
                             output_format=args.output_format)
    for passage in passages:
        map_labels(passage, args.label_map)
        if args.normalize and args.output_format != "txt":
            normalize(passage, extra=args.extra_normalization)
        if args.lang:
            passage.attrib["lang"] = args.lang
        write_passage(passage, **vars(args))
        if not args.validate:
            continue
        try:
            problems = list(validate(passage, ucca_validation=args.ucca_validation,
                                     output_format=args.output_format))
        except ValueError:
            continue
        if problems:
            print_errors(problems, passage.ID)
            sys.exit(1)
def submit_tasks(self, filename, log_file, **kwargs):
    """Submit every annotation/review task whose ID is listed in `filename`.

    Each task is converted to a passage, normalized and validated; only tasks
    with no validation errors are submitted. Outcomes are appended to
    `log_file` as tab-separated lines.

    :param filename: path to a file with one task ID per line
    :param log_file: path of the log file to (over)write
    """
    del kwargs
    with open(filename) as f:
        task_ids = list(f.readlines())
    # Open the log via a context manager so it is closed (and its buffer
    # flushed) even if an unexpected exception escapes the loop -- the
    # original leaked the file handle.
    with open(log_file, 'w') as log:
        for task_id in task_ids:
            try:
                task_id = task_id.strip()
                task = self.get_user_task(int(task_id))
                if task['type'] not in ['ANNOTATION', 'REVIEW']:
                    print(task_id, "NOT AN ANNOTATION/REVIEW TASK", file=log, sep="\t", flush=True)
                    continue
                try:
                    passage = convert.from_json(task)
                except ValueError as e:
                    raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
                # validate the task
                normalization.normalize(passage)
                validation_errors = list(validation.validate(passage, linkage=False))
                if len(validation_errors) == 0:
                    self.submit_task(**task)
                    print(task_id, "SUBMITTED", file=log, sep="\t", flush=True)
                else:
                    for error in validation_errors:
                        print(task_id, error, file=log, sep="\t", flush=True)
            except requests.exceptions.HTTPError as e:
                print(task_id, "HTTP Request Error: "+str(e), file=log, sep="\t", flush=True)
def evaluate(guessed, ref, converter=None, verbose=False, constructions=DEFAULT,
             units=False, fscore=True, errors=False, normalize=True, eval_type=None, ref_yield_tags=None,
             **kwargs):
    """
    Compare two passages and return the requested diagnostics and scores, optionally printing them.
    NOTE: with normalize=True (the default) this is destructive -- both passages are modified in place.

    :param guessed: Passage object to evaluate
    :param ref: reference Passage object to compare to
    :param converter: optional function applied to both passages before evaluation
    :param verbose: whether to print the results
    :param constructions: names of construction types to include in the evaluation
    :param units: whether to evaluate common units
    :param fscore: whether to compute precision, recall and f1 score
    :param errors: whether to print the mistakes
    :param normalize: flatten centers and move common functions to root before evaluation - modifies passages
    :param eval_type: specific evaluation type(s) to limit to
    :param ref_yield_tags: reference passage for fine-grained evaluation
    :return: Scores object
    """
    del kwargs
    if converter is not None:
        guessed, ref = converter(guessed), converter(ref)
    if normalize:  # FIXME clone passages to avoid modifying the original ones
        normalization.normalize(guessed)  # flatten Cs inside Cs
        normalization.normalize(ref)
        move_functions(guessed, ref)  # move common Fs to be under the root, FIXME should be before normalize
    if isinstance(eval_type, str):
        eval_type = [eval_type]
    evaluator = Evaluator(verbose, constructions, units, fscore, errors)
    return Scores((et, evaluator.get_scores(guessed, ref, et, r=ref_yield_tags))
                  for et in (eval_type or EVAL_TYPES))
def main(args):
    """Parse each spec's passages, optionally writing and evaluating the results."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        collected = []
        if not args.verbose:
            label = spec.out_dir if spec.out_dir != "." else spec.lang
            spec.passages = tqdm(spec.passages, unit=" passages", desc="Parsing " + label)
        for passage, parsed in parse(spec.passages, spec.lang, spec.udpipe, args.verbose):
            map_labels(parsed, args.label_map)
            normalize(parsed, extra=True)
            if args.write:
                write_passage(parsed, args)
            if args.evaluate:
                evaluator = EVALUATORS.get(args.output_format)
                converter = TO_FORMAT.get(args.output_format)
                if converter is not None:
                    passage = converter(passage)
                    parsed = converter(parsed)
                if evaluator is not None:
                    collected.append(evaluator.evaluate(parsed, passage, verbose=args.verbose > 1))
        if collected:
            Scores(collected).print()
def evaluate(guessed, ref, converter=None, verbose=False, constructions=DEFAULT,
             units=False, fscore=True, errors=False, normalize=True, eval_type=None, ref_yield_tags=None,
             **kwargs):
    """
    Compare two passages and return the requested diagnostics and scores, optionally printing them.
    NOTE: with normalize=True (the default) this is destructive -- both passages are modified in place.

    :param guessed: Passage object to evaluate
    :param ref: reference Passage object to compare to
    :param converter: optional function applied to both passages before evaluation
    :param verbose: whether to print the results
    :param constructions: names of construction types to include in the evaluation
    :param units: whether to evaluate common units
    :param fscore: whether to compute precision, recall and f1 score
    :param errors: whether to print the mistakes
    :param normalize: flatten centers and move common functions to root before evaluation - modifies passages
    :param eval_type: specific evaluation type to limit to
    :param ref_yield_tags: reference passage for fine-grained evaluation
    :return: Scores object
    """
    del kwargs
    if converter is not None:
        guessed, ref = converter(guessed), converter(ref)
    if normalize:
        normalization.normalize(guessed)  # flatten Cs inside Cs
        normalization.normalize(ref)
        move_functions(guessed, ref)  # move common Fs to be under the root
    evaluator = Evaluator(verbose, constructions, units, fscore, errors)
    types = [eval_type] if eval_type else EVAL_TYPES
    return Scores((et, evaluator.get_scores(guessed, ref, et, r=ref_yield_tags)) for et in types)
def download_task(self, task_id, normalize=False, write=True, validate=None, binary=None, log=None, out_dir=None,
                  prefix=None, by_external_id=False, verbose=False, write_valid_only=False, **kwargs):
    """Download one task, convert it to a passage, and optionally normalize,
    log, validate and write it.

    :param task_id: ID of the task to fetch
    :param normalize: whether to normalize the converted passage
    :param write: whether to write the passage to a file
    :param validate: if truthy, a writable file object to print validation errors to
    :param binary: write the passage in binary (pickle) format
    :param log: optional writable file object for a tab-separated metadata line
    :param out_dir: output directory for the written passage
    :param prefix: filename prefix for the written passage
    :param by_external_id: pass through to from_json conversion
    :param verbose: pass through to write_passage
    :param write_valid_only: skip writing if any validation error is found
    :return: tuple of (passage, task_id, user_id)
    :raises ValueError: on JSON conversion failure or normalization failure
    """
    del kwargs
    task = self.get_user_task(task_id)
    user_id = task["user"]["id"]
    try:
        passage = from_json(task, by_external_id=by_external_id)
    except ValueError as e:
        raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
    if normalize:
        try:
            normalization.normalize(passage)
        except AssertionError as e:
            # Normalization asserts internal consistency; surface it as ValueError with context
            raise ValueError("Failed normalizing task %s:\n%s" % (task_id, json.dumps(task))) from e
    if log:
        print(passage.ID, task_id, user_id, task["user_comment"], task["created_at"], task["updated_at"],
              file=log, sep="\t", flush=True)
    ret = passage, task_id, user_id
    if validate or write_valid_only:
        for error in validation.validate(passage, linkage=False):
            if validate:
                print(passage.ID, task_id, user_id, error, file=validate, sep="\t", flush=True)
            if write_valid_only:
                # First error found: return without writing the (invalid) passage.
                # NOTE(review): with validate also set, only this first error gets printed -- confirm intended.
                return ret
    if write:
        write_passage(passage, binary=binary, outdir=out_dir, prefix=prefix, verbose=verbose)
    return ret
def normalize_and_compare(unnormalized, normalized, extra=False):
    """Build passages from both factories, normalize the first, and assert it
    then equals the second (and differed beforehand when the factories differ).
    """
    before = unnormalized()
    expected = normalized()
    if unnormalized != normalized:
        assert not before.equals(expected), \
            "Unnormalized and normalized passage: %s == %s" % (str(before), str(expected))
    normalize(before, extra=extra)
    assert before.equals(expected), "Normalized passage: %s != %s" % (str(before), str(expected))
def normalize_and_compare(unnormalized, normalized, extra=False):
    """Check that normalizing the first factory's passage yields the second's.

    When the two factories differ, also checks the passages are unequal
    before normalization.
    """
    candidate, reference = unnormalized(), normalized()
    if unnormalized != normalized:
        message = "Unnormalized and normalized passage: %s == %s" % (str(candidate), str(reference))
        assert not candidate.equals(reference), message
    normalize(candidate, extra=extra)
    assert candidate.equals(reference), "Normalized passage: %s != %s" % (str(candidate), str(reference))
def main(args):
    """Fix tokenization in every passage, logging changes to a CSV file and
    writing out each passage that was actually modified.
    """
    os.makedirs(args.outdir, exist_ok=True)
    lexicon = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as log:
        writer = csv.writer(log)
        for passage in get_passages_with_progress_bar(args.filenames, "Fixing tokenization"):
            fixed = fix_tokenization(passage, lexicon, lang=args.lang, cw=writer)
            if fixed is None:
                continue
            log.flush()
            normalize(fixed)
            write_passage(fixed, outdir=args.outdir, binary=args.binary,
                          prefix=args.prefix, verbose=args.verbose)
def main(args):
    """Normalize each input passage and write it back out."""
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames, desc="Normalizing", converters={}):
        normalize(passage, extra=args.extra)
        write_passage(passage, outdir=args.outdir, prefix=args.prefix,
                      binary=args.binary, verbose=False)
def main(args):
    """Split each input passage into sentences and write one file per sentence."""
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    extension = ".pickle" if args.binary else ".xml"
    index = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(passage, remarks=args.remarks, lang=args.lang,
                                        ids=map(str, count(index)) if args.enumerate else None)
        for sentence in sentences:
            index += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + extension)
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
def validate_passage(self, passage):
    """Optionally normalize a passage, then validate it.

    The identifier returned (and printed in strict mode) is the passage ID
    extended with the user and annotation IDs when present in the passage
    attributes.
    """
    if self.normalization:
        normalize(passage, extra=self.extra)
    found = list(validate(passage, linkage=self.linkage, multigraph=self.multigraph))
    identifier = passage.ID
    for key in ("userID", "annotationID"):
        value = passage.attrib.get(key)
        if value:
            identifier += " " + value
    if self.strict:
        print_errors(identifier, found)
    return identifier, found
def main(args):
    """Convert passages, applying label mapping, optional normalization,
    language tagging and validation; exits with status 1 on the first
    passage that fails validation.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    options = vars(args)
    for passage in iter_passages(args.filenames, desc="Converting", **options):
        map_labels(passage, args.label_map)
        if args.normalize and args.output_format != "txt":
            normalize(passage, extra=args.extra_normalization)
        if args.lang:
            passage.attrib["lang"] = args.lang
        write_passage(passage, **options)
        if not args.validate:
            continue
        try:
            problems = list(validate(passage, **options))
        except ValueError:
            continue
        if problems:
            print_errors(problems, passage.ID)
            sys.exit(1)
def main(args):
    """Round-trip convert each input file (e.g. AMR) through a passage
    representation, optionally writing intermediate files, and evaluate the
    conversion against the original; prints aggregated scores at the end.
    """
    scores = []
    for pattern in args.filenames:
        filenames = glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            print("\rConverting '%s'" % filename, end="")
            if args.out_dir or args.verbose:
                print(flush=True)
            basename, ext = os.path.splitext(os.path.basename(filename))
            # Choose converter/evaluator by file extension, defaulting to "amr"
            passage_format = ext.lstrip(".")
            converters = CONVERTERS.get(passage_format, CONVERTERS["amr"])
            evaluator = EVALUATORS.get(passage_format, EVALUATORS["amr"]).evaluate
            with open(filename, encoding="utf-8") as f:
                # converters[0] parses the file; yields (passage, original, id) triples
                for passage, ref, passage_id in converters[0](f, passage_id=basename, return_original=True):
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        outfile = "%s/%s.xml" % (args.out_dir, passage.ID)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    # converters[1] converts the passage back to the original format
                    try:
                        guessed = converters[1](passage, wikification=args.wikification, use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s" % (filename, passage_format)) from e
                    if args.out_dir:
                        outfile = "%s/%s%s" % (args.out_dir, passage.ID, ext)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    # Score the round-tripped output against the original
                    try:
                        s = evaluator(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    scores.append(s)
                    if args.verbose:
                        print(passage_id)
                        s.print()
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores(scores).print()
def main(args):
    """Split each input passage into sentences, write one file per sentence,
    and finally report splitter sentences that matched no passage.
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    extension = ".pickle" if args.binary else ".xml"
    index = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(passage, remarks=args.remarks, lang=args.lang,
                                        ids=map(str, count(index)) if args.enumerate else None)
        for sentence in sentences:
            index += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        unmatched = [s for n, s in enumerate(splitter.sentences) if n not in splitter.matched_indices]
        print("", "Unmatched sentences:", *unmatched, sep="\n")
def main(args):
    """Split each input passage into paragraphs and write one file per paragraph."""
    os.makedirs(args.outdir, exist_ok=True)
    extension = ".pickle" if args.binary else ".xml"
    index = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        paragraphs = split2paragraphs(passage, remarks=args.remarks, lang=args.lang,
                                      ids=map(str, count(index)) if args.enumerate else None)
        for paragraph in paragraphs:
            index += 1
            outfile = os.path.join(args.outdir, args.prefix + paragraph.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
def main(args):
    """Convert passages with label mapping, optional normalization and language
    tagging; exits with status 1 on the first passage that fails validation.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    passages = iter_passages(args.filenames, desc="Converting", input_format=args.input_format,
                             prefix=args.prefix, split=args.split, mark_aux=args.mark_aux,
                             annotate=args.annotate)
    for passage in passages:
        map_labels(passage, args.label_map)
        if args.normalize:
            normalize(passage, extra=args.extra_normalization)
        if args.lang:
            passage.attrib["lang"] = args.lang
        write_passage(passage, args)
        if not args.validate:
            continue
        problems = list(validate(passage, ucca_validation=args.ucca_validation,
                                 output_format=args.output_format))
        if problems:
            print_errors(problems, passage.ID)
            sys.exit(1)
def main(args):
    """Round-trip convert each input file through a passage representation,
    track per-file F1 in the progress bar, optionally write intermediate
    files, and print aggregated scores at the end.
    """
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
    scores = []
    for pattern in args.filenames:
        # Fall back to the literal pattern if it matches no files
        for filename in glob(pattern) or [pattern]:
            file_scores = []
            basename, ext = os.path.splitext(os.path.basename(filename))
            # Choose converter/evaluator by extension; "txt" defers to the --format flag
            passage_format = ext.lstrip(".")
            if passage_format == "txt":
                passage_format = args.format
            in_converter, out_converter = CONVERTERS.get(passage_format, CONVERTERS[args.format])
            evaluate = EVALUATORS.get(passage_format, EVALUATORS[args.format])
            with open(filename, encoding="utf-8") as f:
                t = tqdm(in_converter(f, passage_id=basename, return_original=True), unit=" passages",
                         desc=("Converting '%s'" % filename) +
                              ((", writing to '%s'" % args.out_dir) if args.out_dir else ""))
                for passage, ref, passage_id in t:
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        outfile = os.path.join(args.out_dir, passage.ID + ".xml")
                        if args.verbose:
                            with ioutil.external_write_mode():
                                print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    # Convert the passage back to the original format
                    try:
                        guessed = out_converter(passage, wikification=args.wikification, use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s" % (filename, passage_format)) from e
                    if args.out_dir:
                        outfile = os.path.join(args.out_dir, passage.ID + ext)
                        if args.verbose:
                            with ioutil.external_write_mode():
                                print("Writing '%s'..."
                                      % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    # Score the round-tripped output against the original
                    try:
                        s = evaluate(guessed, ref, verbose=args.verbose > 1, units=args.units)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    file_scores.append(s)
                    if args.verbose:
                        with ioutil.external_write_mode():
                            print(passage_id)
                            s.print()
                    # Show the running average F1 for this file in the progress bar
                    t.set_postfix(F1="%.2f" % (100.0 * Scores(file_scores).average_f1()))
            scores += file_scores
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores(scores).print()
argparser.add_argument("passages", nargs="+", help="Passages in any format") argparser.add_argument("--tikz", action="store_true", help="print tikz code rather than showing plots") argparser.add_argument("--out-dir", help="directory to save figures in (otherwise displayed immediately)") group = argparser.add_mutually_exclusive_group() group.add_argument("--no-normalize", action="store_false", dest="normalize", help="do not normalize passage") group.add_argument("-e", "--extra-normalization", action="store_true", help="more normalization rules") argparser.add_argument("--label-map", help="CSV file specifying mapping of input edge labels to output edge labels") argparser.add_argument("-i", "--node-ids", action="store_true", help="print tikz code rather than showing plots") args = argparser.parse_args() if args.out_dir: os.makedirs(args.out_dir, exist_ok=True) for passage in get_passages_with_progress_bar(args.passages, desc="Visualizing", converters=FROM_FORMAT): map_labels(passage, args.label_map) if args.normalize: normalize(passage, extra=args.extra_normalization) if args.tikz: tikz = visualization.tikz(passage, node_ids=args.node_ids) if args.out_dir: with open(os.path.join(args.out_dir, passage.ID + ".tikz.txt"), "w") as f: print(tikz, file=f) else: with tqdm.external_write_mode(): print(tikz) else: plt.figure(figsize=(19, 10)) visualization.draw(passage, node_ids=args.node_ids) if args.out_dir: plt.savefig(os.path.join(args.out_dir, passage.ID + ".png")) else: mng = plt.get_current_fig_manager()