def parse_txt(
    parser: SpacyBISTParser,
    txt_path: Union[str, PathLike],
    out_dir: Union[str, PathLike, None] = None,
    show_tok: bool = True,
    show_doc: bool = True,
):
    """Parse raw documents in the form of lines in a text file.

    This is a generator: nothing is read, parsed or written until the
    result is iterated. Each line of ``txt_path`` is treated as one document.

    Args:
        parser (SpacyBISTParser): Parser whose ``parse`` method is called on
            every line (with the trailing newline removed).
        txt_path (str or PathLike): UTF-8 text file, one document per line.
        out_dir (str or PathLike, optional): If specified (and truthy), each
            parsed document is also written to ``<out_dir>/<n>.json``, where
            ``n`` is the 1-based line number. The directory is created if it
            does not exist yet.
        show_tok (bool, optional): Specifies whether to include token text in
            output.
        show_doc (bool, optional): Specifies whether to include document text
            in output.

    Yields:
        CoreNLPDoc: the annotated document.
    """
    with open(txt_path, encoding="utf-8") as f:
        out_root = None
        if out_dir:
            print("Writing parsed documents to {}".format(out_dir))
            # Resolve the output directory once and make sure it exists, so
            # the first write does not fail with FileNotFoundError.
            out_root = Path(out_dir)
            out_root.mkdir(parents=True, exist_ok=True)

        progress = tqdm(f, total=line_count(txt_path), file=sys.stdout)
        for i, doc_text in enumerate(progress):
            parsed_doc = parser.parse(doc_text.rstrip("\n"), show_tok, show_doc)

            if out_root is not None:
                # Output files are numbered from 1, following line order.
                out_path = out_root / (str(i + 1) + ".json")
                with open(out_path, "w", encoding="utf-8") as doc_file:
                    doc_file.write(parsed_doc.pretty_json())

            yield parsed_doc
def _iterate_docs(data: PathLike) -> tuple:
    """Yield ``(identifier, text)`` pairs for every document found in ``data``.

    This is a generator with two modes:

    * ``data`` is a directory: the items produced by ``walk_directory(data)``
      are collected into a list and yielded one by one (presumably as
      ``(file name, document text)`` pairs -- confirm against
      ``walk_directory``).
    * otherwise ``data`` is opened as a UTF-8 text file and every line is
      yielded together with its 1-based line number as a string. Lines are
      yielded as read, i.e. including any trailing newline.

    Both modes report progress through ``tqdm``.
    """
    if not isdir(data):
        # Single file: one document per line, identified by its line number.
        with open(data, encoding='utf-8') as lines:
            numbered = enumerate(lines, start=1)
            for number, line in tqdm(numbered, total=line_count(data)):
                yield str(number), line
        return

    # Directory: materialise the listing first so tqdm knows the total.
    entries = list(walk_directory(data))
    for name, text in tqdm(entries):
        yield name, text