def _collect(  # type: ignore
    self, nif_context: str
) -> Iterator[Dict[str, str]]:
    str_data: Dict[str, str] = {}
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

            fragment = get_resource_fragment(v)
            if (
                nif_type
                and nif_type == "context"
                and fragment is not None
                and fragment == "isString"
            ):
                str_data["text"] = o.toPython()

                doc_name: Optional[str] = get_resource_name(s)
                old_id: Optional[str] = get_resource_attribute(
                    c.identifier, "oldid"
                )
                if doc_name is not None and old_id is not None:
                    str_data["doc_name"] = doc_name
                    str_data["oldid"] = old_id
                    yield str_data

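# The resource helpers used above (`get_resource_fragment`,
# `get_resource_attribute`, `get_resource_name`) are not shown in this file.
# Below is a minimal sketch of what they might do, assuming DBpedia NIF URIs
# such as "http://dbpedia.org/resource/Animal_Farm?dbpv=2016-10&nif=context"
# and predicate URIs ending in "#isString"; the real forte helpers may differ.
from typing import Optional
from urllib.parse import parse_qs, unquote, urlparse


def _sketch_get_resource_fragment(uri) -> Optional[str]:
    # The fragment is the part after "#", e.g. "isString" or "Context".
    fragment = urlparse(str(uri)).fragment
    return fragment if fragment else None


def _sketch_get_resource_attribute(uri, key: str) -> Optional[str]:
    # Attributes such as "nif" or "oldid" are read from the URI query string.
    values = parse_qs(urlparse(str(uri)).query).get(key)
    return values[0] if values else None


def _sketch_get_resource_name(uri) -> Optional[str]:
    # The resource name is taken to be the last path segment, URL-decoded.
    path = urlparse(str(uri)).path
    return unquote(path.rsplit("/", 1)[-1]) if path else None
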
def _parse_pack(
    self, collection: Tuple[str, List[state_type]]
) -> Iterator[DataPack]:
    resource_name, statements = collection

    if resource_name in self._redirects:
        resource_name = self._redirects[resource_name]

    if resource_name in self._pack_index:
        print_progress(
            f"Handling resource [{resource_name}] in {self.component_name}"
        )

        pack_path = os.path.join(
            self._pack_dir, self._pack_index[resource_name]
        )

        if os.path.exists(pack_path):
            pack: DataPack = DataPack.deserialize(
                pack_path,
                self.configs.serialize_method,
                self.configs.zip_pack,
            )
            self.add_wiki_info(pack, statements)
            yield pack
    else:
        logging.info("Resource %s pack not found.", resource_name)

def _collect(  # type: ignore
    self, nif_path: str
) -> Iterator[Tuple[str, List[state_type]]]:
    skipped = 0
    for _, statements in ContextGroupedNIFReader(nif_path):
        name = get_resource_name(statements[0][0])
        if name is not None:
            if name not in self._resume_index:
                yield name, statements
            else:
                skipped += 1
                print_progress(
                    f"Skipped {skipped} documents", terminal_only=True
                )

def _collect(self, nif_context: str  # type: ignore
             ) -> Iterator[Dict[str, str]]:
    str_data: Dict[str, str] = {}
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

            if nif_type and nif_type == "context" and get_resource_fragment(
                    v) == 'isString':
                str_data['text'] = o.toPython()
                str_data['doc_name'] = get_resource_name(s)
                str_data['oldid'] = get_resource_attribute(
                    c.identifier, 'oldid')
                yield str_data

def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    # A mapping from the name of the page to the path on the disk.
    self._pack_index = read_index(configs.pack_index)
    self._pack_dir = configs.pack_dir

    if self.configs.resume_index:
        self._resume_index = read_index(configs.resume_index)
        print_progress(
            f"Loaded {len(self._resume_index)} existing files.", "\n"
        )

    if self.resources.contains("redirects"):
        self._redirects = self.resources.get("redirects")
        print_progress(f"{len(self._redirects)} redirects loaded.", "\n")
    else:
        raise ResourceError("Redirects not provided from resources.")

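# `read_index` is not defined in this file. A minimal sketch of what it could
# look like, assuming each index is a tab-separated file of
# "<article name>\t<relative pack path>" lines (the fix-up script further
# below splits index lines on "\t" in the same way); the real helper may
# differ.
from typing import Dict


def _sketch_read_index(index_path: str) -> Dict[str, str]:
    index: Dict[str, str] = {}
    with open(index_path, encoding="utf-8") as index_file:
        for line in index_file:
            line = line.strip()
            if not line:
                continue
            name, path = line.split("\t", 1)
            index[name] = path
    return index
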
def add_wiki_info(reader: PackReader, resources: Resources, input_path: str,
                  input_pack_path: str, output_path: str, prompt_name: str,
                  skip_existing=True):
    pl = Pipeline[DataPack](resources)

    if skip_existing and os.path.exists(output_path):
        print_progress(f'\n{output_path} exists, skipping {prompt_name}',
                       '\n')
        return

    pl.set_reader(reader, config={
        'pack_index': os.path.join(input_pack_path, 'article.idx'),
        'pack_dir': input_pack_path,
    })

    pl.add(
        WikiArticleWriter(),
        config={
            'output_dir': output_path,
            'zip_pack': True,
            'drop_record': True,
        },
    )

    print_progress(f'Start running the {prompt_name} pipeline.', '\n')
    pl.run(input_path)
    print_progress(f'Done collecting {prompt_name}.', '\n')

def _parse_pack(
        self, collection: Tuple[str, List[state_type]]) -> Iterator[DataPack]:
    resource_name, statements = collection

    if resource_name in self._redirects:
        resource_name = self._redirects[resource_name]

    if resource_name in self._pack_index:
        print_progress(
            f"Handling resource [{resource_name}] in {self.component_name}"
        )
        pack_path = os.path.join(self._pack_dir,
                                 self._pack_index[resource_name])

        # `smart_open` can handle the `gz` files.
        if os.path.exists(pack_path):
            with open(pack_path) as pack_file:
                pack: DataPack = DataPack.deserialize(pack_file.read())
                self.add_wiki_info(pack, statements)
                yield pack
    else:
        logging.info("Resource %s pack not found.", resource_name)

def load_from_nif(link_file, output_file):
    linkings = {}
    bilinks = []

    num_articles = 0
    num_bilinks = 0

    start_time = timeit.default_timer()
    with open(output_file, "w") as out:
        for _, statements in ContextGroupedNIFReader(link_file):
            num_articles += 1
            for nif_range, rel, info in statements:
                r = get_resource_fragment(rel)
                if r is not None and r == "taIdentRef":
                    src_name = get_resource_name(nif_range)
                    target_name = get_resource_name(info)

                    if src_name == target_name:
                        continue

                    if linkings.get(target_name, None) == src_name:
                        bilinks.append((src_name, target_name))
                        linkings.pop(target_name)
                        num_bilinks += 1
                        out.write(f"{src_name}\t{target_name}\n")
                        out.flush()
                    else:
                        linkings[src_name] = target_name

            elapsed = timeit.default_timer() - start_time
            print_progress(
                f"{num_bilinks} bi-links found in {num_articles} articles "
                f"after {datetime.timedelta(seconds=elapsed)}, speed is "
                f"{num_articles / elapsed:.2f} (packs/second)."
            )

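# A small worked example of the bi-link bookkeeping used above, on plain
# tuples instead of NIF statements: a pair is only reported once the reverse
# direction has also been seen. The names here are illustrative only.
def _bilink_example():
    links = [("A", "B"), ("B", "A"), ("A", "C"), ("C", "D")]
    pending = {}
    bilinks = []
    for src, tgt in links:
        if pending.get(tgt) == src:
            # The reverse edge was seen earlier, so this is a bi-link.
            bilinks.append((src, tgt))
            pending.pop(tgt)
        else:
            pending[src] = tgt
    return bilinks  # [('B', 'A')]: "A" -> "B" and "B" -> "A" pair up.
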
def add_wiki_info(
    reader: PackReader,
    resources: Resources,
    input_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    skip_existing=True,
    overwrite=False,
    input_index_file_name: str = "article.idx",
    output_index_file_name: str = "article.idx",
):
    pl = Pipeline[DataPack](resources)
    out_index_path = os.path.join(output_path, output_index_file_name)

    if skip_existing and os.path.exists(out_index_path):
        print_progress(f"\n{output_path} exists, skipping {prompt_name}", "\n")
        return

    pl.set_reader(
        reader,
        config={
            "pack_index": os.path.join(
                input_pack_path, input_index_file_name
            ),
            "pack_dir": input_pack_path,
        },
    )

    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "output_index_file": output_index_file_name,
            "overwrite": overwrite,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(input_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")

def read_wiki_text(nif_context: str, output_dir: str, resources: Resources,
                   skip_existing: bool = False):
    if skip_existing and os.path.exists(output_dir):
        print_progress(f'\n{output_dir} exists, skipping reading text', '\n')
        return

    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            'output_dir': output_dir,
            'zip_pack': True,
            'drop_record': True,
        },
    )
    print_progress('Start running wiki text pipeline.', '\n')
    pl.run(nif_context)
    print_progress('Done collecting wiki text.', '\n')

def read_wiki_text(
    nif_context: str,
    output_dir: str,
    resources: Resources,
    skip_existing: bool = False,
):
    if skip_existing and os.path.exists(output_dir):
        print_progress(f"\n{output_dir} exists, skipping reading text", "\n")
        return

    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_dir,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    print_progress("Start running wiki text pipeline.", "\n")
    pl.run(nif_context)
    print_progress("Done collecting wiki text.", "\n")

def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    # The datasets are read in a few steps.

    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        redirect_map = pickle.load(open(redirect_pickle, 'rb'))
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped infobox properties, writing directly to the previous
    #     directory.
    property_dir = link_dir
    add_wiki_info(WikiPropertyReader(), resources, info_boxs_properties,
                  link_dir, property_dir, 'info_box_properties',
                  skip_existing=True, overwrite=True,
                  output_index_file_name='properties.idx')
    print_progress("Done reading wikipedia info-boxes properties.", '\n')

    # 4.2 Add mapped literals, writing directly to the previous directory.
    literal_dir = property_dir
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_literals,
                  property_dir, literal_dir, 'literals',
                  skip_existing=True, overwrite=True,
                  output_index_file_name='literals.idx')
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    # 4.3 Add mapped objects, writing directly to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_objects,
                  literal_dir, mapping_dir, 'objects',
                  skip_existing=True, overwrite=True,
                  output_index_file_name='objects.idx')
    print_progress("Done reading wikipedia info-boxes objects.", '\n')

def add_wiki_info(
    reader: WikiPackReader,
    resources: Resources,
    wiki_info_data_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    use_input_index=False,
    skip_existing=True,
    resume_from_last=False,
    input_index_file_path: Optional[str] = "article.idx",
    output_index_file_name: Optional[str] = "article.idx",
):
    """
    Add wiki resources into the data packs.

    Args:
        reader: The info reader that loads the data pack.
        resources: The resources object that should contain the redirects.
        wiki_info_data_path: The path containing the wiki data.
        input_pack_path: The initial data pack path.
        output_path: The resulting output path.
        prompt_name: A name to show during processing.
        use_input_index: Whether to use the input index to determine the
            output path.
        skip_existing: Whether to skip this function if the folder exists.
        resume_from_last: Whether to resume from the last end point; at most
            one of this and `skip_existing` can be true.
        input_index_file_path: The full file path to the input index.
        output_index_file_name: The file path to write the output index,
            relative to `output_path`.

    Returns:
        None
    """
    pl = Pipeline[DataPack](resources)

    if resume_from_last and skip_existing:
        raise ValueError(
            "resume_from_last and skip_existing cannot both be true."
        )

    out_index_path = os.path.join(output_path, output_index_file_name)
    if skip_existing and os.path.exists(out_index_path):
        print_progress(
            f"\n{out_index_path} exists, skipping {prompt_name}", "\n"
        )
        return

    if resume_from_last:
        if not os.path.exists(out_index_path):
            raise ValueError(
                f"Configured to resume but path "
                f"{out_index_path} does not exist."
            )
        print_progress(
            f"\nWill resume from the last point recorded in "
            f"{out_index_path}", "\n"
        )

        pl.set_reader(
            reader,
            config={
                "pack_index": input_index_file_path,
                "pack_dir": input_pack_path,
                "resume_index": out_index_path,
            },
        )
    else:
        pl.set_reader(
            reader,
            config={
                "pack_index": input_index_file_path,
                "pack_dir": input_pack_path,
            },
        )

    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "use_input_index": use_input_index,
            "input_index_file": input_index_file_path,
            "output_index_file": output_index_file_name,
            "append_to_index": resume_from_last,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(wiki_info_data_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")

def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    categories: str,
    base_output_path: str,
    resume_existing: bool,
):
    # Whether to skip the whole step.
    if resume_existing:
        skip_existing = False
    else:
        skip_existing = True

    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress("Loading redirects", "\n")
    redirect_map: Dict[str, str] = cache_redirects(
        base_output_path, redirects
    )

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", "\n")

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, "nif_raw")
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", "\n")

    # Use the same index structure for all writers.
    main_index = os.path.join(raw_pack_dir, "article.idx")

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + "_struct"
    add_wiki_info(
        WikiStructReader(),
        resources,
        nif_page_structure,
        raw_pack_dir,
        struct_dir,
        "page_structures",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia structures.", "\n")

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + "_links"
    add_wiki_info(
        WikiAnchorReader(),
        resources,
        nif_text_links,
        struct_dir,
        link_dir,
        "anchor_links",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia anchors.", "\n")

    # 4. The following steps add info boxes:
    # 4.1 Add un-mapped infobox properties, writing directly to the previous
    #     directory.
    property_dir = link_dir
    add_wiki_info(
        WikiPropertyReader(),
        resources,
        info_boxs_properties,
        link_dir,
        property_dir,
        "info_box_properties",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="properties.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes properties.", "\n")

    # 4.2 Add mapped literals, writing directly to the previous directory.
    literal_dir = property_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_literals,
        property_dir,
        literal_dir,
        "literals",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="literals.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes literals.", "\n")

    # 4.3 Add mapped objects, writing directly to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_objects,
        literal_dir,
        mapping_dir,
        "objects",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="objects.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes objects.", "\n")

    # 4.4 Add categories, writing directly to the previous directory.
    category_dir = mapping_dir
    add_wiki_info(
        WikiCategoryReader(),
        resources,
        categories,
        mapping_dir,
        category_dir,
        "categories",
        use_input_index=True,
        skip_existing=True,
        resume_from_last=resume_existing,
        output_index_file_name="categories.idx",
        input_index_file_path=main_index,
    )

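# `cache_redirects` is not shown in this file. A minimal sketch of what it
# might do, assuming it simply wraps the inline pickle-caching logic used by
# the other main() variants in this file around `load_redirects`; the real
# helper may differ in naming and error handling.
def _sketch_cache_redirects(
    base_output_path: str, redirects: str
) -> Dict[str, str]:
    redirect_pickle = os.path.join(base_output_path, "redirects.pickle")
    if os.path.exists(redirect_pickle):
        # Reuse the cached mapping instead of re-parsing the redirect dump.
        with open(redirect_pickle, "rb") as pickle_f:
            return pickle.load(pickle_f)
    redirect_map: Dict[str, str] = load_redirects(redirects)
    with open(redirect_pickle, "wb") as pickle_f:
        pickle.dump(redirect_map, pickle_f)
    return redirect_map
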
        true_path_lookup[article_name] = article_path

print(f"Loaded {len(true_path_lookup)} true article paths.")

with open(target_index) as ti:
    doc_count = 0
    removed_count = 0
    fixed_count = 0
    for line in ti:
        article_name, article_path = line.strip().split("\t")
        true_path = true_path_lookup[article_name]
        doc_count += 1

        import pdb

        if true_path == article_path:
            print_progress(
                f"Fixing category from {true_path} ({doc_count}"
                f"/{removed_count}/{fixed_count})"
            )
            fixed_count += 1
        else:
            print_progress(
                f"Removing extra file {article_path}, will keep "
                f"{true_path} ({doc_count}/{removed_count}/{fixed_count})"
            )
            removed_count += 1
            print(
                f"Removing extra file {article_path}, will keep "
                f"{true_path} ({doc_count}/{removed_count}/{fixed_count})"
            )
            pdb.set_trace()

def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        redirect_map = pickle.load(open(redirect_pickle, 'rb'))
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add the rest of the wiki page structures.
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    property_dir = link_dir + '_property'
    add_wiki_info(WikiPropertyReader(), resources, info_boxs_properties,
                  link_dir, property_dir, 'info_box_properties', True)
    print_progress("Done reading wikipedia info-boxes.", '\n')

    literal_dir = property_dir + '_literals'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_literals,
                  property_dir, literal_dir, 'literals', True)
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    mapping_dir = literal_dir + '_objects'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_objects,
                  literal_dir, mapping_dir, 'objects', True)
    print_progress("Done reading wikipedia info-boxes objects.", '\n')