def setUp(self):
    self.resources: Resources = Resources()
    self.resources.update(redirects={})
    self.data_dir: str = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "../../../../../data_samples/dbpedia",
        )
    )
    self.output_dir = tempfile.TemporaryDirectory()
    self.raw_output: str = os.path.join(self.output_dir.name, "raw")

    pl = Pipeline[DataPack](self.resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": self.raw_output,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(os.path.join(self.data_dir, "nif_context.tql"))
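# A companion test sketch for the setUp above. It assumes WikiArticleWriter
# writes its index as "article.idx" under the configured output_dir, which is
# the index name used elsewhere in this codebase; treat both the test name
# and that default as assumptions, not part of the original suite.
def test_raw_output_written(self):
    index_path = os.path.join(self.raw_output, "article.idx")
    self.assertTrue(os.path.exists(index_path))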
def add_wiki_info(reader: PackReader, resources: Resources,
                  input_path: str, input_pack_path: str,
                  output_path: str, prompt_name: str,
                  skip_existing=True):
    pl = Pipeline[DataPack](resources)

    if skip_existing and os.path.exists(output_path):
        print_progress(f'\n{output_path} exists, skipping {prompt_name}', '\n')
        return

    pl.set_reader(reader, config={
        'pack_index': os.path.join(input_pack_path, 'article.idx'),
        'pack_dir': input_pack_path,
    })
    pl.add(
        WikiArticleWriter(),
        config={
            'output_dir': output_path,
            'zip_pack': True,
            'drop_record': True,
        },
    )

    print_progress(f'Start running the {prompt_name} pipeline.', '\n')
    pl.run(input_path)
    print_progress(f'Done collecting {prompt_name}.', '\n')
def write_results(pl: Pipeline, output_path: str, input_data: str):
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(input_data)
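# A minimal usage sketch for write_results above, reusing the reader setup
# seen in this codebase. The demo function name, the output directory, and
# the input file path are illustrative assumptions.
def demo_write_results():
    resources = Resources()
    resources.update(redirects={})
    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    # Hypothetical output directory and NIF context input file.
    write_results(pl, "/tmp/wiki_raw", "nif_context.tql")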
def complete_and_tokens():
    # Define paths
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    pack_output = os.path.join(pack_dir, "nif_raw_struct_links_token")

    # Store which documents are processed, keeping the input and output
    # structure similar.
    pack_input_index = os.path.join(pack_input, "article.idx")
    pack_output_index = os.path.join(pack_output, "article.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "complete_tokenize.log"),
    )

    pipeline = Pipeline(loaded_resource).set_reader(
        DirPackReader(),
        config={
            "suffix": ".json.gz",
            "zip_pack": True,
        },
    # ).add(
    #     WikiEntityCompletion()
    ).add(
        WikiAddTitle()
    ).add(
        SpacyProcessor(),
        config={
            "processors": ["sentence", "tokenize"],
        },
    ).add(
        SubwordTokenizer(),
        config={
            "tokenizer_configs": {
                "pretrained_model_name": "bert-base-uncased"
            },
            "token_source": "ft.onto.base_ontology.Token",
        },
    ).add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "serialize_method": "jsonpickle",
        },
    ).add(ProgressPrinter())

    pipeline.run(pack_input)
def add_wiki_info(
    reader: PackReader,
    resources: Resources,
    input_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    skip_existing=True,
    overwrite=False,
    input_index_file_name: str = "article.idx",
    output_index_file_name: str = "article.idx",
):
    pl = Pipeline[DataPack](resources)
    out_index_path = os.path.join(output_path, output_index_file_name)

    if skip_existing and os.path.exists(out_index_path):
        print_progress(f"\n{output_path} exists, skipping {prompt_name}", "\n")
        return

    pl.set_reader(
        reader,
        config={
            "pack_index": os.path.join(input_pack_path, input_index_file_name),
            "pack_dir": input_pack_path,
        },
    )
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "output_index_file": output_index_file_name,
            "overwrite": overwrite,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(input_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")
def read_wiki_text(
    nif_context: str,
    output_dir: str,
    resources: Resources,
    skip_existing: bool = False,
):
    if skip_existing and os.path.exists(output_dir):
        print_progress(f"\n{output_dir} exists, skipping reading text", "\n")
        return

    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_dir,
            "zip_pack": True,
            "drop_record": True,
        },
    )

    print_progress("Start running wiki text pipeline.", "\n")
    pl.run(nif_context)
    print_progress("Done collecting wiki text.", "\n")
def add_wiki_info(
    reader: WikiPackReader,
    resources: Resources,
    wiki_info_data_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    use_input_index=False,
    skip_existing=True,
    resume_from_last=False,
    input_index_file_path: Optional[str] = "article.idx",
    output_index_file_name: Optional[str] = "article.idx",
):
    """
    Add wiki resources into the data pack.

    Args:
        reader: The info reader that loads the data pack.
        resources: The resources object that should contain the redirects.
        wiki_info_data_path: The path containing the wiki data.
        input_pack_path: The initial data pack path.
        output_path: The resulting output path.
        prompt_name: A name to show during processing.
        use_input_index: Whether to use the input index to determine the
            output path.
        skip_existing: Whether to skip this function if the output index
            already exists.
        resume_from_last: Whether to resume from the last end point. At most
            one of this and `skip_existing` can be true.
        input_index_file_path: The full file path to the input index.
        output_index_file_name: The file path to write the output index,
            relative to `output_path`.

    Returns:
        None
    """
    pl = Pipeline[DataPack](resources)

    if resume_from_last and skip_existing:
        raise ValueError(
            "resume_from_last and skip_existing cannot both be true."
        )

    out_index_path = os.path.join(output_path, output_index_file_name)
    if skip_existing and os.path.exists(out_index_path):
        print_progress(
            f"\n{out_index_path} exists, skipping {prompt_name}", "\n"
        )
        return

    if resume_from_last:
        if not os.path.exists(out_index_path):
            raise ValueError(
                f"Configured to resume, but path {out_index_path} does "
                f"not exist."
            )
        print_progress(f"\nWill resume from {out_index_path}", "\n")

        pl.set_reader(
            reader,
            config={
                "pack_index": input_index_file_path,
                "pack_dir": input_pack_path,
                "resume_index": out_index_path,
            },
        )
    else:
        pl.set_reader(
            reader,
            config={
                "pack_index": input_index_file_path,
                "pack_dir": input_pack_path,
            },
        )

    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "use_input_index": use_input_index,
            "input_index_file": input_index_file_path,
            "output_index_file": output_index_file_name,
            "append_to_index": resume_from_last,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(wiki_info_data_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")
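# A usage sketch for the add_wiki_info variant above, wiring in the category
# reader used later in this file. The demo function name and the directory
# layout (base_dir, pack_dir, the "nif_raw" input packs) are illustrative
# assumptions, not values taken from this codebase.
def demo_add_category_info(resources: Resources, base_dir: str, pack_dir: str):
    pack_input = os.path.join(pack_dir, "nif_raw")
    add_wiki_info(
        reader=WikiCategoryReader(),
        resources=resources,
        wiki_info_data_path=os.path.join(
            base_dir, "article_categories_en.tql.bz2"
        ),
        input_pack_path=pack_input,
        output_path=os.path.join(pack_dir, "category"),
        prompt_name="category info",
        use_input_index=True,
        input_index_file_path=os.path.join(pack_input, "article.idx"),
    )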
pack_output = os.path.join(pack_dir, "category")

# Index of the input articles.
pack_input_index = os.path.join(pack_input, "article.idx")
# Store which documents have category.
pack_output_index = os.path.join(pack_output, "category.idx")

logging.basicConfig(
    format="%(asctime)s - %(message)s",
    level=logging.INFO,
    filename=os.path.join(pack_dir, "category.log"),
)

Pipeline(resources).set_reader(
    WikiCategoryReader(),
    config={
        "pack_index": pack_input_index,
        "pack_dir": pack_input,
    },
).add(
    WikiArticleWriter(),
    config={
        "output_dir": pack_output,
        "zip_pack": True,
        "drop_record": True,
        "input_index_file": pack_input_index,
        "output_index_file": pack_output_index,
        "use_input_index": True,
        "overwrite": True,
    },
).run(os.path.join(base_dir, "article_categories_en.tql.bz2"))