def _parse_pack(
        self, collection: Tuple[str, Dict[str, List[state_type]]]
) -> Iterator[DataPack]:
    # `collection` pairs a resource name with its info box statements,
    # grouped under the 'literals', 'objects' and 'properties' keys.
    resource_name, info_box_data = collection

    # Follow Wikipedia redirects so the info box is attached to the
    # canonical article.
    if resource_name in self.redirects:
        resource_name = self.redirects[resource_name]

    if resource_name in self.pack_index:
        print_progress(f'Add infobox to resource: [{resource_name}]')

        pack_path = os.path.join(
            self.pack_dir, self.pack_index[resource_name])

        if os.path.exists(pack_path):
            with open(pack_path) as pack_file:
                # Load the raw article pack and attach the info box
                # entries to it.
                pack = data_utils.deserialize(
                    self._pack_manager, pack_file.read())

                add_info_boxes(pack, info_box_data['literals'])
                add_info_boxes(pack, info_box_data['objects'])
                add_property(pack, info_box_data['properties'])
                yield pack
    else:
        print_notice(
            f"Resource {resource_name} is not in the raw packs.")
        self.logger.warning(
            "Resource %s is not in the raw packs.", resource_name)
def _collect(self, nif_context: str  # type: ignore
             ) -> Iterator[Tuple[Dict[str, str],
                                 Dict[str, List[state_type]]]]:
    str_data: Dict[str, str] = {}
    node_data: Dict[str, List[state_type]] = {}

    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia context: [{c.identifier}]')

            # Only 'isString' statements on a NIF context node carry the
            # article text; everything else is skipped.
            if nif_type == "context" and get_resource_fragment(
                    v) == 'isString':
                str_data['text'] = o.toPython()
                str_data['doc_name'] = get_resource_name(s)
                str_data['oldid'] = get_resource_attribute(
                    c.identifier, 'oldid')

                # Collect the page structure and text link statements
                # that belong to the same context.
                node_data['struct'] = self.struct_reader.get(c)
                node_data['links'] = self.link_reader.get(c)

                yield str_data, node_data
    print(' ..Done')
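# Illustrative sketch of one (str_data, node_data) pair yielded by `_collect`
# above. The field values are hypothetical; the exact contents of 'struct'
# and 'links' depend on what `struct_reader` and `link_reader` return for
# the context:
#
#   str_data = {
#       'text': 'Anarchism is a political philosophy ...',
#       'doc_name': 'Anarchism',
#       'oldid': '744318951',
#   }
#   node_data = {
#       'struct': [...],  # page structure statements for this context
#       'links': [...],   # text link statements for this context
#   }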
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs: str, output_path: str):
    # Make sure the output directory exists before writing the redirect
    # cache into it.
    os.makedirs(output_path, exist_ok=True)

    # Load redirects, using a cached pickle if one exists.
    print_progress('Loading redirects', '\n')
    logging.info("Loading redirects")

    redirect_pickle = os.path.join(output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    print_progress('\nDone loading redirects.', '\n')
    logging.info("Done loading.")

    # The datasets are read in two steps.
    raw_pack_dir = os.path.join(output_path, 'nif_raw')

    # First, we create the NIF reader that reads the NIF data in order.
    nif_pl = Pipeline[DataPack]()
    nif_pl.resource.update(redirects=redirect_map)

    nif_pl.set_reader(DBpediaWikiReader(), config=Config(
        {
            'redirect_path': redirects,
            'nif_page_structure': nif_page_structure,
            'nif_text_links': nif_text_links,
        },
        DBpediaWikiReader.default_configs()
    ))

    nif_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': raw_pack_dir,
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    nif_pl.initialize()
    logging.info('Start running the DBpedia text pipeline.')
    print_progress('Start running the DBpedia text pipeline.', '\n')
    nif_pl.run(nif_context)

    # Second, we add info boxes to the packs with NIF.
    ib_pl = Pipeline[DataPack]()
    ib_pl.resource.update(redirects=redirect_map)
    ib_pl.set_reader(DBpediaInfoBoxReader(), config=Config(
        {
            'pack_index': os.path.join(raw_pack_dir, 'article.idx'),
            'pack_dir': raw_pack_dir,
            'mapping_literals': mapping_literals,
            'mapping_objects': mapping_objects,
            'reading_log': os.path.join(output_path, 'infobox.log'),
        },
        DBpediaInfoBoxReader.default_configs()
    ))

    ib_pl.add(WikiArticleWriter(), config=Config(
        {
            'output_dir': os.path.join(output_path, 'nif_info_box'),
            'zip_pack': True,
        },
        WikiArticleWriter.default_configs()
    ))

    # Now we initialize and run the info box pipeline, mirroring the
    # initialize-then-run sequence of the text pipeline above.
    ib_pl.initialize()
    ib_pl.run(info_boxs)
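# A minimal driver sketch for `main` above, assuming the usual DBpedia/NIF
# dump layout. All file and directory names below are hypothetical
# placeholders; point them at your local copies of the dumps.
if __name__ == '__main__':
    base_dir = 'dbpedia_dumps'  # hypothetical directory holding the dumps
    main(
        nif_context=os.path.join(base_dir, 'nif_context_en.tql.bz2'),
        nif_page_structure=os.path.join(
            base_dir, 'nif_page_structure_en.tql.bz2'),
        mapping_literals=os.path.join(
            base_dir, 'mappingbased_literals_en.tql.bz2'),
        mapping_objects=os.path.join(
            base_dir, 'mappingbased_objects_en.tql.bz2'),
        nif_text_links=os.path.join(base_dir, 'nif_text_links_en.tql.bz2'),
        redirects=os.path.join(base_dir, 'redirects_en.tql.bz2'),
        info_boxs=os.path.join(base_dir, 'infobox_properties_en.tql.bz2'),
        output_path='wiki_output',  # packs and logs are written here
    )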