def main(args):
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
        if not args.tikz:
            import matplotlib
            matplotlib.use('Agg')
    to_stdout = (args.tikz or args.standoff) and not args.out_dir
    t = args.passages
    t = get_passages(t) if to_stdout else get_passages_with_progress_bar(t, desc="Visualizing")
    if args.sentences:
        t = (sentence for passage in t for sentence in split2sentences(passage))
    for passage in t:
        if args.tikz:
            print_text(args, visualization.tikz(passage), passage.ID + ".tikz.txt")
        elif args.standoff:
            print_text(args, visualization.standoff(passage), passage.ID + ".ann")
        else:
            import matplotlib.pyplot as plt
            width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27
            plt.figure(passage.ID, figsize=(width, width * 10 / 19))
            visualization.draw(passage, node_ids=args.node_ids)
            if args.out_dir:
                plt.savefig(os.path.join(args.out_dir, passage.ID + "." + args.format))
                plt.close()
            else:
                plt.show()

def copy_annotation(passages, conllu, by_id=False, as_array=True, as_extra=True, verbose=False, lang=None):
    conllu_sentences = {annotated.ID: annotated for annotated in get_passages_with_progress_bar(
        conllu, converters=CONVERTERS, desc="Reading '%s'" % conllu)} \
        if by_id else get_passages(conllu, converters=CONVERTERS)
    for passage in passages:
        try:
            annotated = conllu_sentences[passage.ID] if by_id else next(conllu_sentences)
        except (KeyError, StopIteration) as e:
            raise ValueError("Missing annotation for passage ID '%s', by_id=%s" % (passage.ID, by_id)) from e
        if verbose:
            with external_write_mode():
                print("Reading annotation from '%s'" % annotated.ID)
        if as_array:
            passage.layer(layer0.LAYER_ID).docs()[:] = annotated.layer(layer0.LAYER_ID).docs()
        if as_extra:
            for terminal, annotated_terminal in zip(passage.layer(layer0.LAYER_ID).all,
                                                    annotated.layer(layer0.LAYER_ID).all):
                copy_tok_to_extra(annotated_terminal, terminal, lang=lang)
        yield passage

def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    if args.join:
        out_file = os.path.join(args.outdir, args.join)
        with open(out_file, "w", encoding="utf-8") as f:
            for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting"):
                write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
        print("Wrote '%s'." % out_file)
    else:  # one file per passage
        for pattern in args.filenames:
            for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"):
                passage = file2passage(filename)
                basename = os.path.splitext(os.path.basename(filename))[0]
                with open(os.path.join(args.outdir, basename + ".txt"), "w", encoding="utf-8") as f:
                    write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)

def main(args):
    matchers = [CandidateMatcher(line) for line in
                tqdm(list(gen_lines(args.text)), desc="Indexing " + args.text, unit=" lines")]
    out = open(args.out, "w", encoding="utf-8") if args.out else sys.stdout
    for p in get_passages_with_progress_bar(args.filenames, desc="Matching", converters={}):
        match_passage_text(p, matchers, out)
    if args.out:  # only close a file we opened ourselves, never sys.stdout
        out.close()

def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("Unmatched sentences:",
              *[s for i, s in enumerate(splitter.sentences) if i not in splitter.matched_indices],
              sep="\n")

def main(args): df = pd.DataFrame(index=args.directories, columns=["sentences", "tokens", "nodes", "discontinuous", "reentrant", "implicit", "edges", "primary", "remote"]) df.fillna(0, inplace=True) for i, directory in enumerate(args.directories): row = df.loc[directory] for passage in get_passages_with_progress_bar(directory, desc=directory): l1 = passage.layer(layer1.LAYER_ID) non_terminals = [n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1] edges = {e for n in non_terminals for e in n} remote_counter = Counter(e.attrib.get("remote", False) for e in edges) row["sentences"] += 1 row["tokens"] += len(passage.layer(layer0.LAYER_ID).all) row["nodes"] += len(non_terminals) row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous) row["reentrant"] += sum(1 for n in non_terminals if any(e.attrib.get("remote") for e in n.incoming)) row["edges"] += len(edges) row["primary"] += remote_counter[False] row["remote"] += remote_counter[True] row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit")) # Change to percentages df["discontinuous"] *= 100. / df["nodes"] df["reentrant"] *= 100. / df["nodes"] df["implicit"] *= 100. / df["nodes"] df["primary"] *= 100. / df["edges"] df["remote"] *= 100. / df["edges"] # Print if args.outfile: df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n") print("Saved to " + args.outfile) else: with pd.option_context("display.max_rows", None, "display.max_columns", None): print(df.T)
def main(args):
    matchers = [CandidateMatcher(spelling) for line in
                tqdm(list(gen_lines(args.text)), desc="Indexing " + args.text, unit=" lines")
                for spelling in alternative_spellings(line)]
    out = open(args.out, "w", encoding="utf-8") if args.out else sys.stdout
    for p in get_passages_with_progress_bar(args.filenames, desc="Matching", converters={}):
        match_passage_text(p, matchers, out)
    if args.out:  # only close a file we opened ourselves, never sys.stdout
        out.close()

def main(filename, input_filenames, outdir):
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as f:
        pairs = [line.strip().split() for line in f]
    old_to_new_id = {old_id: new_id for new_id, old_id in pairs}
    for passage in get_passages_with_progress_bar(input_filenames, desc="Renaming"):
        passage._ID = old_to_new_id[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)

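# Illustrative note (an assumption, not taken from the source): main() above expects `filename`
# to be a whitespace-separated mapping file with the *new* ID first on each line, since each
# line is unpacked as `new_id, old_id`. A made-up example file could look like:
#
#   504001  123456
#   504002  123457
#
# i.e. passage 123456 would be renamed to 504001 before being written to `outdir`.
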
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".xml")
        with open(site_filename, "w", encoding="utf-8") as f:
            print(tostring(convert.to_site(passage)).decode(), file=f)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % site_filename)

def main(args):
    for passage in annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"),
                                replace=True, as_array=args.as_array, verbose=args.verbose):
        assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)

def main(args): os.makedirs(args.outdir, exist_ok=True) with open(args.outfile, "w", encoding="utf-8", newline="") as f: writer = csv.writer(f) writer.writerow(("rule", "passage", "edge", "before", "after")) for passage in get_passages_with_progress_bar(args.passages, desc="Converting"): convert_passage(passage, report_writer=writer) write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose) f.flush() print("Wrote '%s'" % args.outfile)
def main(args): os.makedirs(args.outdir, exist_ok=True) with open(args.outfile, "w", encoding="utf-8", newline="") as f: writer = csv.writer(f) writer.writerow(("rule", "passage", "terminal", "before", "after")) for passage in get_passages_with_progress_bar(args.passages, desc="Converting"): convert_passage(passage, lang=passage.attrib.get("lang", args.lang), report_writer=writer) write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose) f.flush() print("Wrote '%s'" % args.outfile)
def read(fp, text=None, prefix=None):
    parent = Path(fp.name).parent
    paths = {parent / file.strip() for file in fp}
    for passage in get_passages_with_progress_bar(map(str, paths), desc="Analyzing"):
        try:
            graph = passage2graph(passage, text, prefix)
        except Exception as exception:
            print(exception)
            continue
        yield graph

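# A minimal usage sketch for read() above (an assumption, not part of the source): it expects an
# open handle over an index file whose lines are passage paths relative to that file. The index
# filename below is made up for illustration.
def example_read_usage(index_path="passages.txt"):
    with open(index_path, encoding="utf-8") as fp:
        for graph in read(fp, text=None, prefix=None):
            print(graph)  # each yielded item is whatever passage2graph() returns
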
def main(args): print( "id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont," "edges,primary,remote,linkage,parents,children,mult-parents") data = [] for passage in get_passages_with_progress_bar(args.filenames): terminals = passage.layer(layer0.LAYER_ID).all non_terminals = [ n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1" ] non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage] linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages edges = {e for n in non_terminals for e in n} remote = [e for e in edges if e.attrib.get("remote")] linkage_edges = [e for n in linkage_nodes for e in n] fields = ( int(passage.ID), 1, len({t.paragraph for t in terminals}), len(break2sentences(passage)), len(terminals) + len(non_terminals), len(terminals), len(non_terminals), len([n for n in non_linkage if n.attrib.get("implicit")]), len(linkage_nodes), len([ n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous ]), len(edges), len(edges) - len(remote) - len(linkage_edges), len(remote), len(linkage_edges), sum( len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage), sum(len(n.children) for n in non_linkage), len([ n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1 ]), ) if not args.summary: with tqdm.external_write_mode(): print(",".join("%d" % f for f in fields)) data.append(fields) data = np.array(data, dtype=int) if args.outfile: np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t") if args.summary: print(",".join("%d" % f for f in data.sum(axis=0)))
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    words_set = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as outfile:
        cw = csv.writer(outfile)
        for passage in get_passages_with_progress_bar(args.filenames, "Fixing tokenization"):
            fixed = fix_tokenization(passage, words_set, lang=args.lang, cw=cw)
            if fixed is not None:
                outfile.flush()
                normalize(fixed)
                write_passage(fixed, outdir=args.outdir, binary=args.binary,
                              prefix=args.prefix, verbose=args.verbose)

def from_xml(xml_files):
    # Note: relies on module-level `args` and `plt` being available in the enclosing script
    passages = get_passages_with_progress_bar(xml_files, desc="Visualizing")
    for passage in passages:
        width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27
        plt.figure(figsize=(width, width * 10 / 19))
        draw_from_native(passage)
        plt.savefig(os.path.join(args.out_dir, passage.ID + ".png"))
        plt.close()

def main(args):
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    for p in get_passages_with_progress_bar(args.filenames, desc="Normalizing", converters={}):
        normalize(p, extra=args.extra)
        write_passage(p, outdir=args.outdir, prefix=args.prefix, binary=args.binary, verbose=False)

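# A hypothetical argparse wiring consistent with the args.* attributes read by main() above
# (argument names mirror those accesses; flags, help strings and defaults are assumptions):
def _parse_normalize_args():
    import argparse
    argparser = argparse.ArgumentParser(description="Normalize UCCA passages")
    argparser.add_argument("filenames", nargs="+", help="passage files or directories")
    argparser.add_argument("-o", "--outdir", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-b", "--binary", action="store_true", help="write pickle instead of XML")
    argparser.add_argument("-e", "--extra", action="store_true", help="apply extra normalization rules")
    return argparser.parse_args()
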
def main(args):
    textutil.BATCH_SIZE = 1
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(("rule", "passage", "terminal", "pos", "before", "after"))
        for passage in annotate_all(get_passages_with_progress_bar(args.passages, desc="Converting"),
                                    verbose=args.verbose):
            convert_passage(passage, report_writer=writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            f.flush()
    print("Wrote '%s'" % args.outfile)

def main(args):
    errors = ((p.ID, list(validate(p)))
              for p in get_passages_with_progress_bar(args.filenames, desc="Validating"))
    errors = {k: v for k, v in errors if v}
    if errors:
        id_len = max(map(len, errors))
        for passage_id, es in sorted(errors.items()):
            for i, e in enumerate(es):
                print("%-*s|%s" % (id_len, "" if i else passage_id, e))
        sys.exit(1)
    else:
        print("No errors found.")

def main(args):
    validator = Validator(args.normalize, args.extra, linkage=args.linkage, strict=args.strict)
    with Pool(10) as pool:
        errors = pool.map(validator.validate_passage,
                          get_passages_with_progress_bar(args.filenames, desc="Validating", converters={}))
    errors = dict((k, v) for k, v in errors if v)
    if errors:
        if not args.strict:
            id_len = max(map(len, errors))
            for passage_id, es in sorted(errors.items()):
                print_errors(passage_id, es, id_len)
        sys.exit(1)
    else:
        print("No errors found.")

def main(args): df = pd.DataFrame(index=args.directories, columns=[ "sentences", "tokens", "nodes", "discontinuous", "reentrant", "implicit", "edges", "primary", "remote" ]) df.fillna(0, inplace=True) for i, directory in enumerate(args.directories): row = df.loc[directory] for passage in get_passages_with_progress_bar(directory, desc=directory): l1 = passage.layer(layer1.LAYER_ID) non_terminals = [ n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1 ] edges = {e for n in non_terminals for e in n} remote_counter = Counter( e.attrib.get("remote", False) for e in edges) row["sentences"] += 1 row["tokens"] += len(passage.layer(layer0.LAYER_ID).all) row["nodes"] += len(non_terminals) row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous) row["reentrant"] += sum(1 for n in non_terminals if any( e.attrib.get("remote") for e in n.incoming)) row["edges"] += len(edges) row["primary"] += remote_counter[False] row["remote"] += remote_counter[True] row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit")) # Change to percentages df["discontinuous"] *= 100. / df["nodes"] df["reentrant"] *= 100. / df["nodes"] df["implicit"] *= 100. / df["nodes"] df["primary"] *= 100. / df["edges"] df["remote"] *= 100. / df["edges"] # Print if args.outfile: df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n") print("Saved to " + args.outfile) else: with pd.option_context("display.max_rows", None, "display.max_columns", None): print(df.T)
def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)

def main(args):
    out = args.direction == "out"
    roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items()
                if isinstance(tag, str) and not name.startswith('__'))
    for passage in get_passages_with_progress_bar([args.directory]):
        for node in passage.layer(layer1.LAYER_ID).all:
            counts = Counter(edge.tag for edge in (node if out else node.incoming))
            roles.difference_update(tag for tag, count in counts.items() if count > 1)
    lines = "\n".join(sorted(roles))
    print(lines)
    if args.outfile:
        with open(args.outfile, "w", encoding="utf-8") as f:
            print(lines, file=f)

def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        extracted = constructions.extract_edges(passage, constructions=args.constructions,
                                                verbose=args.verbose)
        if any(extracted.values()):
            with tqdm.external_write_mode():
                if not args.verbose:
                    print("%s:" % passage.ID)
                for construction, edges in extracted.items():
                    if edges:
                        print(" %s:" % construction.description)
                        for edge in edges:
                            print("  %s [%s %s]" % (edge, edge.tag, edge.child))
                print()

def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        c2es = OrderedDict((c, [candidate.edge for candidate in candidates])
                           for c, candidates in extract_candidates(
                               passage, constructions=args.constructions, verbose=args.verbose).items()
                           if candidates)
        if any(c2es.values()):
            with external_write_mode():
                if not args.verbose:
                    print("%s:" % passage.ID)
                for construction, edges in c2es.items():
                    if edges:
                        print(" %s:" % construction.description)
                        for edge in edges:
                            print("  %s [%s %s]" % (edge, edge.tag, edge.child))
                print()

def main(args):
    order = None
    if args.sentences:
        with open(args.sentences, encoding="utf-8") as f:
            order = dict(map(reversed, enumerate(map(str.strip, f))))
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in split(passage, order) if order else split2sentences(
                passage, remarks=args.remarks, lang=args.lang):
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with tqdm.external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            passage2file(sentence, outfile, args.binary)

def main(args): os.makedirs(args.outdir, exist_ok=True) if args.join: out_file = os.path.join(args.outdir, args.join) with open(out_file, "w", encoding="utf-8") as f: for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting"): write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id) print("Wrote '%s'." % out_file) else: # one file per passage for pattern in args.filenames: for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"): passage = file2passage(filename) basename = os.path.splitext(os.path.basename(filename))[0] with open(os.path.join(args.outdir, basename + ".txt"), "w", encoding="utf-8") as f: write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
def main(args):
    validator = Validator(args.normalize, args.extra, linkage=args.linkage,
                          multigraph=args.multigraph, strict=args.strict)
    with Pool(10) as pool:
        errors = pool.map(validator.validate_passage,
                          get_passages_with_progress_bar(args.filenames, desc="Validating", converters={}))
    errors = dict((k, v) for k, v in errors if v)
    if errors:
        if not args.strict:
            id_len = max(map(len, errors))
            for passage_id, es in sorted(errors.items()):
                print_errors(passage_id, es, id_len)
        sys.exit(1)
    else:
        print("No errors found.")

def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        i = 0
        for sen in t:
            # print('sentence %d\n\n%s\n%s' % (i, convert.to_text(sen), convert.to_sequence(sen)))
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            i += 1
            compounds = []
            for node in sen.nodes:
                if sen.nodes[node].layer.ID == '0':  # terminal node
                    find_id = ''
                    l = sen.nodes[node]
                    if l.parents[0].ftag == 'C':
                        if l.parents[0].ID not in compounds:
                            compounds.append(l.parents[0].ID)
                            tmp_c = []
                            for n in l.parents[0].children:
                                tmp_c.append(n.text)
                            # print('Word: %s\nWord ID: %s' % (tmp_c, l.parents[0].ID))
                            find_id = l.parents[0].ID
                            path = []
                            path.append(' '.join(tmp_c))
                            path = find_path(sen.nodes[find_id], path)
                            print(' '.join(path))
                            # for j in path:
                            #     print(j)
                            print('-------')
                    else:
                        # print('Word: %s\nWord ID: %s' % (l.text, l.ID))
                        find_id = l.ID
                        path = []
                        path = find_path(sen.nodes[find_id], path)
                        print(' '.join(path))
                        # for j in path:
                        #     print(j)
                        print('-------')
            print('------------------------------------------------------------------')

def main(args):
    filenames = list(args.passages)
    if args.filenames:
        with open(args.filenames, encoding="utf-8") as f:
            filenames += list(filter(None, map(str.strip, f)))
    with open(args.out, "w", encoding="utf-8") as f:
        for passage in get_passages_with_progress_bar(filenames):
            out = upload_passage(convert.to_site(passage), verbose=args.verbose,
                                 site_filename=passage.ID + "_site_upload.xml" if args.write_site else None,
                                 db_name=args.db_name, host_name=args.host_name,
                                 new_pid=passage.ID, new_prid=args.project_id, username=args.username)
            print(passage.ID, out, file=f)
            if args.verbose:
                print("Uploaded passage %s with xid=%s" % (passage.ID, out))
    if CONNECTION is not None:
        CONNECTION.commit()
    print("Wrote '%s'" % args.out)

def main(args): splitter = Splitter.read_file(args.sentences, enum=args.enumerate, suffix_format=args.suffix_format, suffix_start=args.suffix_start) os.makedirs(args.outdir, exist_ok=True) i = 0 for passage in get_passages_with_progress_bar(args.filenames, "Splitting"): for sentence in splitter.split(passage) if splitter else split2sentences( passage, remarks=args.remarks, lang=args.lang, ids=map(str, count(i)) if args.enumerate else None): i += 1 outfile = os.path.join(args.outdir, args.prefix + sentence.ID + (".pickle" if args.binary else ".xml")) if args.verbose: with external_write_mode(): print(sentence, file=sys.stderr) print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr) if args.normalize: normalize(sentence) passage2file(sentence, outfile, binary=args.binary) if splitter and len(splitter.matched_indices) < len(splitter.sentences): print("", "Unmatched sentences:", *[s for i, s in enumerate(splitter.sentences) if i not in splitter.matched_indices], sep="\n")
def main(args):
    histograms = defaultdict(Counter)
    for passage in get_passages_with_progress_bar(args.filenames):
        for node in passage.layer(layer1.LAYER_ID).all:
            if node.ID != "1.1":  # Exclude the root node
                histograms["parents"][clip(node.incoming, 3)] += 1
                histograms["children"][clip(node.outgoing, 7)] += 1
    for label, counter in histograms.items():
        handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout
        handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
        if handle is not sys.stdout:
            handle.close()
        try:
            plot_histogram(counter, label, plot=args.plot)
            plot_pie(counter, label, plot=args.plot)
        except Exception:  # plotting is best-effort; skip on failure
            pass

def upload_tasks(self, filenames, log=None, **kwargs): del kwargs log_h = open(log, "w", encoding="utf-8") if log else None try: for pattern in filenames: filenames = sorted(glob(pattern)) if not filenames: raise IOError("Not found: " + pattern) for passage in get_passages_with_progress_bar(filenames, desc="Uploading"): logging.debug("Uploading passage %s" % passage.ID) task = self.upload_task(passage, log=log_h) logging.debug("Submitted task %d" % task["id"]) yield task except HTTPError as e: try: raise ValueError(e.response.json()["detail"]) from e except JSONDecodeError: raise ValueError(e.response.text) from e finally: if log: log_h.close()
def upload_tasks(self, filenames, log=None, **kwargs): del kwargs log_h = open(log, "w", encoding="utf-8") if log else None try: for pattern in filenames: filenames = sorted(glob(pattern)) if not filenames: raise IOError("Not found: " + pattern) for passage in get_passages_with_progress_bar( filenames, desc="Uploading"): logging.debug("Uploading passage %s" % passage.ID) task = self.upload_task(passage, log=log_h) logging.debug("Submitted task %d" % task["id"]) yield task except HTTPError as e: try: raise ValueError(e.response.json()["detail"]) from e except JSONDecodeError: raise ValueError(e.response.text) from e finally: if log: log_h.close()
def upload_tasks(self, filenames, log=None, submit=True, existing_ids=None, **kwargs):
    del kwargs
    log_h = open(log, "w", encoding="utf-8") if log else None
    if existing_ids:
        with open(existing_ids, "r", encoding="utf-8") as ids_h:
            ids = {old_passage_id: (passage_id, tok_id, ann_id)
                   for (old_passage_id, passage_id, tok_id, ann_id) in map(str.split, ids_h)}
    else:
        ids = None
    try:
        for pattern in filenames:
            filenames = sorted(glob(pattern))
            if not filenames:
                raise IOError("Not found: " + pattern)
            for passage in get_passages_with_progress_bar(filenames, desc="Uploading"):
                logging.debug("Uploading passage %s" % passage.ID)
                task = self.upload_task(passage, log=log_h, submit=submit, ids=ids)
                logging.debug("Submitted task %d" % task["id"])
                yield task
    except HTTPError as e:
        try:
            raise ValueError(e.response.json()["detail"]) from e
        except JSONDecodeError:
            raise ValueError(e.response.text) from e
    finally:
        if log:
            log_h.close()

def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for paragraph in split2paragraphs(passage, remarks=args.remarks, lang=args.lang,
                                          ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + paragraph.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)

def main(args): for passage in annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"), replace=True, as_array=args.as_array, verbose=args.verbose): assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        # passage = convert.from_standard(elem)
        # print("Linearised\n------\n")
        # print(convert.to_sequence(passage))
        words = {}
        xmltoconll(passage)
        t = split2sentences(passage)
        i = 0
        for sen in t:
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            i += 1
        while True:  # interactive loop: look up a word and print its path
            word = input('\nType the word below\n\n')
            for node in passage.nodes:
                t = passage.nodes[node]
                if re.match(rf'\b{word}\b', t.text, re.IGNORECASE):
                    # print('Word: %s\nWord ID: %s' % (t.text, t.ID))
                    # ans = input('\nDo you want to continue with word Id : %s' % t.ID)
                    path = []
                    path = find_path(passage.nodes[t.ID], path)
                    break
            print(' '.join(path))

def main(args):
    # Print each sentence's UCCA graph as a textual tree of edges from the root node
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        sen_no = 0
        for sen in t:
            # print('sentence %d\n\n%s\n%s' % (sen_no, convert.to_text(sen), convert.to_sequence(sen)))
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))
            root = sen.nodes['1.1']
            first = 1
            tab_len = {}
            tab_len[0] = len('1.1')
            for i in root.children:
                print('\n')
                path = []
                level = 1
                path.append((i.ftag, i.ID, level, False))
                path = find_children(i, path, level)
                end = 0
                if first:
                    pstr = root.ID
                    first = 0
                else:
                    for k in range(0, tab_len[0]):
                        pstr = pstr + ' '
                for j in path:
                    if j == 'End':
                        print(pstr)
                        pstr = ''
                        end = 1
                        continue
                    rel = j[0]
                    nd = j[1]
                    tab = int(j[2])
                    remote = j[3]
                    if end:
                        q_mark = 0
                        for k in range(0, tab_len[tab - 1]):
                            if k == tab_len[q_mark]:
                                pstr = pstr + '.'
                                q_mark += 1
                            else:
                                pstr = pstr + ' '
                        end = 0
                    if rel in descr:
                        rel_desc = rel + ':' + descr[rel]
                    else:
                        rel_desc = rel
                    if remote:
                        pstr = pstr + '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr = pstr + '|-->(' + rel_desc + ')-->' + nd
                    tab_len[tab] = len(pstr)
            print('-----------------------------------\n')
            sen_no += 1
