def test_annotate_all(self):
    passages = [convert.from_standard(TestUtil.load_xml("test_files/standard3.xml")),
                TestUtil.create_passage(), TestUtil.create_crossing_passage(),
                TestUtil.create_discontiguous(), TestUtil.create_multi_passage()]
    # First pass annotates in-place with the default (extra) representation
    list(textutil.annotate_all(passages))
    # Second pass annotates as arrays; as_tuples=True passes each (passage, passage) pair through
    for passage, compare in textutil.annotate_all(((p, p) for p in passages),
                                                  as_array=True, as_tuples=True):
        assert passage is compare
        # Check both the passage itself and a round-trip through the standard XML format
        for p in passage, convert.from_standard(convert.to_standard(passage)):
            self.assertTrue(textutil.is_annotated(p, as_array=True),
                            "Passage %s is not annotated" % passage.ID)
            self.assertTrue(textutil.is_annotated(p, as_array=False),
                            "Passage %s is not annotated" % passage.ID)
            for terminal in p.layer(layer0.LAYER_ID).all:
                for attr in textutil.Attr:
                    self.assertIn(attr.key, terminal.extra,
                                  "Terminal %s in passage %s has no %s" % (terminal, passage.ID, attr.name))
                self.assertIsNotNone(terminal.tok,
                                     "Terminal %s in passage %s has no annotation" % (terminal, passage.ID))
                self.assertEqual(len(terminal.tok), len(textutil.Attr))
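# A minimal sketch (an assumption, not from the source) of the PASSAGES tuple of
# passage factories that the parametrized test below iterates over; the factory
# names are borrowed from the unittest variant above.
PASSAGES = (TestUtil.create_passage, TestUtil.create_crossing_passage,
            TestUtil.create_discontiguous, TestUtil.create_multi_passage)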
# The bare (as_array, convert_and_back) parameters indicate pytest parametrization;
# the decorators below are assumed from that usage.
@pytest.mark.parametrize("as_array", (True, False))
@pytest.mark.parametrize("convert_and_back", (True, False))
def test_annotate_all(as_array, convert_and_back):
    passages = [create() for create in PASSAGES]
    list(textutil.annotate_all(passages))
    for passage, compare in textutil.annotate_all(((p, p) for p in passages),
                                                  as_array=as_array, as_tuples=True):
        assert passage is compare
        # Optionally round-trip through the standard XML format before checking
        p = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if as_array:
                assert terminal.tok is not None, \
                    "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
                assert len(terminal.tok) == len(textutil.Attr)
            else:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
def parse_sentence(self, sentence):
    # tupa.parse writes progress information to stdout, so silence it temporarily
    reg_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w', encoding='UTF-8')
    parsed_passage = None
    try:
        # '= +1' in the original reset the counter to 1 on every call; increment instead
        TupaParser.__passage_counter += 1
        passage_id = TupaParser.__passage_counter
        # from_text converts the sentence into a UCCA structure.
        # annotate_all annotates the structure with information from the spaCy parse.
        # annotate_all returns a generator that yields a single object, hence the call to next.
        unparsed_passage = next(annotate_all(from_text(sentence, passage_id, one_per_line=True)))
        # The tupa.parse class's parse method expects a list of unparsed passages. We also need to set
        # the 'evaluate' argument to True, otherwise we get incorrect results (Ofir Arviv advised as such).
        # The parse method also returns a generator, hence the need to call next.
        # The actual object returned is a tuple of the parsed passage and an internal score object. We're
        # not interested in the score, so we just extract the parsed passage.
        parsed_passage_and_score = next(self.__parser.parse([unparsed_passage], evaluate=True))
        internal_parsed_passage = parsed_passage_and_score[0]
        parsed_passage = TupaParser.__get_ucca_parsed_passage_from_passage(internal_parsed_passage)
    finally:
        sys.stdout = reg_stdout
    return parsed_passage
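# Minimal usage sketch for parse_sentence above. The TupaParser constructor
# argument (a pretrained model path) is an assumption, not from the source.
parser = TupaParser("models/ucca-bilstm")  # hypothetical model path
passage = parser.parse_sentence("The fox jumped over the dog.")
print(passage)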
def parse_spacy(passages, lang, verbose=False):
    for passage, in annotate_all(zip(passages), as_array=True, as_tuples=True, lang=lang, verbose=verbose):
        terminals = sorted(passage.layer(layer0.LAYER_ID).all, key=operator.attrgetter("position"))
        # Index 0 is a dummy root node; real tokens start at position 1
        dep_nodes = [ConlluConverter.Node()] + \
                    [ConlluConverter.Node(t.position, terminal=t,
                                          token=ConlluConverter.Token(t.text, t.tag)) for t in terminals]
        for dep_node in dep_nodes[1:]:
            dep_node.token.paragraph = dep_node.terminal.paragraph
            head = Attr.HEAD(dep_node.terminal.tok[Attr.HEAD.value])
            if head:
                head += dep_node.position  # the annotated head is an offset relative to this token
            rel = Attr.DEP(dep_node.terminal.tok[Attr.DEP.value], lang=passage.attrib.get("lang", lang))
            assert head is not None and rel is not None, \
                "head=%r, rel=%r for token %d in:\n%s" % (head, rel, dep_node.position,
                                                          " ".join(map(str, terminals)))
            edge = ConlluConverter.Edge(head, rel, remote=False)
            dep_node.terminal = None
            edge.link_head(dep_nodes)
            dep_node.add_edges([edge])
        parsed = ConlluConverter().build_passage(dep_nodes, passage.ID)
        yield passage, parsed
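# Usage sketch for parse_spacy above: it yields (original, parsed) passage pairs.
# The read_files_and_dirs call and the input directory are assumptions for illustration.
from ucca.ioutil import read_files_and_dirs
for passage, parsed in parse_spacy(read_files_and_dirs(["passages/"]), lang="en"):
    print(passage.ID, "->", parsed.ID)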
def main(args):
    if not args.as_array and not args.as_extra:
        args.as_extra = True
    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        kwargs = dict(as_array=args.as_array, as_extra=args.as_extra, verbose=args.verbose, lang=spec.lang)
        passages = spec.passages
        if spec.conllu:
            passages = copy_annotation(passages, spec.conllu, by_id=args.by_id, **kwargs)
        elif spec.udpipe:
            passages = annotate_udpipe(passages, spec.udpipe, **kwargs)
        elif spec.stanfordnlp:
            passages = annotate_stanfordnlp(passages, spec.stanfordnlp, **kwargs)
        for passage in annotate_all(passages if args.verbose else
                                    tqdm(passages, unit=" passages", desc="Annotating " + spec.out_dir),
                                    replace=spec.conllu or not (spec.udpipe or spec.stanfordnlp), **kwargs):
            if passage.extra.get("format") == "amr" and args.as_array:
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
def main(args):
    words = args.word or []
    categories = list(args.category or ())
    dependencies = list(args.dependency or ())
    if args.case_insensitive:
        words = list(map(str.lower, words))
    for spec in read_specs(args, converters=FROM_FORMAT):
        if args.dependency:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe) if spec.udpipe else \
                annotate_all(spec.passages, as_array=True, replace=not spec.udpipe, lang=spec.lang)
        t = tqdm(spec.passages, unit=" passages", desc="Finding")
        if words:
            t.set_postfix(words=",".join(words))
        if categories:
            t.set_postfix(categories=",".join(categories))
        if dependencies:
            t.set_postfix(dependencies=",".join(dependencies))
        found = 0
        filename = os.path.join(spec.out_dir, "_".join(words + categories + dependencies) + ".txt")
        with open(filename, "w", encoding="utf-8") as f:
            for passage in t:
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    word = terminal.text
                    if args.case_insensitive:
                        word = word.lower()
                    if (not words or word in words) and (not categories or parent.ftag in categories) and \
                            (not dependencies or get_annotation(terminal, spec.udpipe) in dependencies):
                        print(passage.ID, parent.fparent, file=f)
                        found += 1
                        t.set_postfix(found=found)
        print("Wrote '%s'" % filename)
def main(args):
    for spec in read_specs(args, converters=FROM_FORMAT):
        spec.passages = annotate_udpipe(spec.passages, spec.udpipe) if spec.udpipe else \
            annotate_all(spec.passages, as_array=True, replace=not spec.udpipe, lang=spec.lang)
        filename = os.path.join(spec.out_dir, "find.db")
        with sqlite3.connect(filename) as conn:
            c = conn.cursor()
            c.execute("DROP TABLE IF EXISTS terminals")  # a plain DROP TABLE fails on a fresh database
            c.execute("CREATE TABLE terminals (pid, tid, text, ftag, fparent, dep)")
            c.execute("CREATE INDEX idx_terminals_pid ON terminals (pid)")
            c.execute("CREATE INDEX idx_terminals_text ON terminals (text)")
            c.execute("CREATE INDEX idx_terminals_ftag ON terminals (ftag)")
            c.execute("CREATE INDEX idx_terminals_dep ON terminals (dep)")
            for passage in tqdm(spec.passages, unit=" passages", desc="Creating " + filename):
                rows = []
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    rows.append((passage.ID, terminal.ID, terminal.text, parent.ftag,
                                 str(parent.fparent), get_annotation(terminal, spec.udpipe)))
                c.executemany("INSERT INTO terminals VALUES (?,?,?,?,?,?)", rows)
                conn.commit()
def main(args):
    for passage in annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"),
                                replace=True, as_array=args.as_array, verbose=args.verbose):
        assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
def from_format(self, lines, passage_id, return_original=False, save_original=True, remove_cycles=True,
                **kwargs):
    self.passage_id = passage_id
    self.return_original = return_original
    self.save_original = save_original
    self.remove_cycles = remove_cycles
    self.extensions = [l for l in EXTENSIONS if kwargs.get(l)]
    self.excluded = {i for l, r in EXTENSIONS.items() if l not in self.extensions for i in r}
    for passage, amr, amr_id in textutil.annotate_all(self._init_passages(self._amr_generator(lines)),
                                                      as_array=True, as_tuples=True):
        yield self._build_passage(passage, amr, amr_id)
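# Usage sketch for from_format above, assuming it is a method of an AMR converter
# class (AmrConverter here) and that amrs.txt holds AMRs in PENMAN notation; both
# the class name and the file name are assumptions for illustration.
with open("amrs.txt", encoding="utf-8") as f:
    for passage in AmrConverter().from_format(f, passage_id="amr"):
        print(passage.ID)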
def parse(self, passages, display=True, write=False):
    passages, total = generate_and_len(single_to_iter(passages))
    if self.config.args.ignore_case:
        passages = to_lower_case(passages)
    pr_width = len(str(total))
    id_width = 1
    passages = self.add_progress_bar(ThreadedGenerator(textutil.annotate_all(
        passages, as_array=True, lang=self.config.args.lang, verbose=self.config.args.verbose > 2,
        vocab=self.model.config.vocab(lang=self.config.args.lang)), queue_maxsize=100), display=display)
    for i, passage in enumerate(passages, start=1):
        parser = PassageParser(passage, self.config, self.models, self.training, self.evaluation)
        if self.config.args.verbose and display:
            progress = "%3d%% %*d/%d" % (i / total * 100, pr_width, i, total) \
                if total and i <= total else "%d" % i
            id_width = max(id_width, len(str(passage.ID)))
            print("%s %2s %-6s %-*s" % (progress, parser.lang, parser.in_format, id_width, passage.ID),
                  end=self.config.line_end)
        else:
            passages.set_description()
            postfix = {parser.lang + " " + parser.in_format: passage.ID}
            if display:
                postfix["|t/s|"] = self.tokens_per_second()
                if self.correct_action_count:
                    postfix["|a|"] = percents_str(self.correct_action_count, self.action_count,
                                                  fraction=False)
                if self.correct_label_count:
                    postfix["|l|"] = percents_str(self.correct_label_count, self.label_count,
                                                  fraction=False)
                if self.evaluation and self.num_passages:
                    postfix["|F1|"] = self.f1 / self.num_passages
            passages.set_postfix(**postfix)
        self.seen_per_format[parser.in_format] += 1
        if self.training and self.config.args.max_training_per_format and \
                self.seen_per_format[parser.in_format] > self.config.args.max_training_per_format:
            self.config.print("skipped", level=1)
            continue
        assert not (self.training and parser.in_format == "text"), "Cannot train on unannotated plain text"
        yield parser.parse(display=display, write=write)
        self.update_counts(parser)
    if self.num_passages and display:
        self.summary()
def main(args):
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe, as_array=args.as_array,
                                            verbose=args.verbose)
        elif spec.conllu:
            spec.passages = copy_annotation(spec.passages, spec.conllu, as_array=args.as_array,
                                            verbose=args.verbose)
        for passage in annotate_all(spec.passages if args.verbose else
                                    tqdm(spec.passages, unit=" passages", desc="Annotating " + spec.out_dir),
                                    as_array=args.as_array, replace=not spec.udpipe, lang=spec.lang,
                                    verbose=args.verbose):
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
def main(args):
    textutil.BATCH_SIZE = 1
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(("rule", "passage", "terminal", "pos", "before", "after"))
        for passage in annotate_all(get_passages_with_progress_bar(args.passages, desc="Converting"),
                                    verbose=args.verbose):
            convert_passage(passage, report_writer=writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            f.flush()
    print("Wrote '%s'" % args.outfile)
def main(args):
    for passages, out_dir, lang in read_specs(args):
        for passage in annotate_all(passages if args.verbose else
                                    tqdm(passages, unit=" passages", desc="Annotating " + out_dir),
                                    as_array=args.as_array, replace=True, lang=lang, verbose=args.verbose):
            write_passage(passage, outdir=out_dir, verbose=args.verbose, binary=args.binary)
def parse(self, passages, display=True, write=False, accuracies=None):
    passages, total = generate_and_len(single_to_iter(passages))
    if self.config.args.ignore_case:
        passages = to_lower_case(passages)
    pr_width = len(str(total))
    id_width = 1
    passages = self.add_progress_bar(textutil.annotate_all(
        passages, as_array=True, as_extra=False, lang=self.config.args.lang,
        verbose=self.config.args.verbose > 2,
        vocab=self.model.config.vocab(lang=self.config.args.lang)), display=display)
    for i, passage in enumerate(passages, start=1):
        parser = PassageParser(passage, self.config, self.models, self.training, self.evaluation)
        if self.config.args.verbose and display:
            progress = "%3d%% %*d/%d" % (i / total * 100, pr_width, i, total) \
                if total and i <= total else "%d" % i
            id_width = max(id_width, len(str(passage.ID)))
            print("%s %2s %-6s %-*s" % (progress, parser.lang, parser.in_format, id_width, passage.ID),
                  end=self.config.line_end)
        else:
            passages.set_description()
            postfix = {parser.lang + " " + parser.in_format: passage.ID}
            if display:
                postfix["|t/s|"] = self.tokens_per_second()
                if self.correct_action_count:
                    postfix["|a|"] = percents_str(self.correct_action_count, self.action_count,
                                                  fraction=False)
                if self.correct_label_count:
                    postfix["|l|"] = percents_str(self.correct_label_count, self.label_count,
                                                  fraction=False)
                if self.evaluation and self.num_passages:
                    postfix["|F1|"] = self.f1 / self.num_passages
            passages.set_postfix(**postfix)
        self.seen_per_format[parser.in_format] += 1
        if self.training and self.config.args.max_training_per_format and \
                self.seen_per_format[parser.in_format] > self.config.args.max_training_per_format:
            self.config.print("skipped", level=1)
            continue
        assert not (self.training and parser.in_format == "text"), "Cannot train on unannotated plain text"
        yield parser.parse(display=display, write=write, accuracies=accuracies)
        self.update_counts(parser)
    if self.num_passages and display:
        self.summary()
def from_format(self, lines, passage_id, return_original=False, save_original=True, remove_cycles=True,
                wikification=True, placeholders=True, **kwargs):
    self.passage_id = passage_id
    self.return_original = return_original
    self.save_original = save_original
    self.remove_cycles = remove_cycles
    self.wikification = wikification
    self.placeholders = placeholders
    self.set_extensions(**kwargs)
    passages = self._init_passages(self._amr_generator(lines), **kwargs)
    if placeholders:
        passages = textutil.annotate_all(passages, as_array=True, as_tuples=True)
    for passage, graph in passages:
        yield self._build_passage(passage, graph)