def _collect(  # type: ignore
    self, nif_context: str
) -> Iterator[Dict[str, str]]:
    """Iterate the NIF context dataset and yield one record per document.

    Args:
        nif_context: Path (or URL) of the NIF context file handed to
            ``NIFParser``.

    Yields:
        A dict containing the document text under ``"text"`` and, when
        both can be resolved, the document name under ``"doc_name"`` and
        the Wikipedia revision id under ``"oldid"``.
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

            fragment = get_resource_fragment(v)
            # Only `isString` triples on a `context` resource carry the
            # document body.
            if nif_type == "context" and fragment == "isString":
                # Build a fresh dict for every document. The original
                # reused a single dict across yields, so "doc_name" and
                # "oldid" from a previous document could leak into a
                # later record whose own lookups failed.
                str_data: Dict[str, str] = {"text": o.toPython()}

                doc_name: Optional[str] = get_resource_name(s)
                old_id: Optional[str] = get_resource_attribute(
                    c.identifier, "oldid"
                )
                # The declared value type is str, so unresolved (None)
                # metadata is omitted rather than stored.
                if doc_name is not None and old_id is not None:
                    str_data["doc_name"] = doc_name
                    str_data["oldid"] = old_id
                yield str_data
def add_wiki_info(self, pack: DataPack, statements: List):
    """Annotate `pack` with Wikipedia structure spans from NIF statements.

    Args:
        pack: The data pack whose text the structure ranges refer to.
        statements: Iterable of ``(nif_range, rel, struct_type)`` RDF
            triples describing section/paragraph/title spans.
    """
    for nif_range, rel, struct_type in statements:
        r = get_resource_fragment(rel)
        if r != 'type':
            continue

        range_ = get_resource_attribute(nif_range, 'char')
        # Guard against unresolvable char ranges: the link readers in
        # this file skip them, and calling split() on None would crash.
        if range_ is None:
            continue
        begin, end = (int(d) for d in range_.split(','))

        if end > len(pack.text):
            # Some nif dataset are off by a bit, mostly when there
            # are new line characters, we cannot correct them.
            # but we need to make sure they don't go longer than
            # the text.
            logging.info("NIF Structure end is %d by %s, "
                         "clipped to fit with the text.",
                         end, nif_range)
            end = len(pack.text)

        if end <= begin:
            logging.info(
                "Provided struct [%d:%d] is invalid.", begin, end)
            continue

        struct_ = get_resource_fragment(struct_type)
        if struct_ == 'Section':
            WikiSection(pack, begin, end)
        elif struct_ == 'Paragraph':
            WikiParagraph(pack, begin, end)
        elif struct_ == 'Title':
            WikiTitle(pack, begin, end)
        else:
            logging.warning("Unknown struct type: %s", struct_type)
def _collect(self, nif_context: str  # type: ignore
             ) -> Iterator[Dict[str, str]]:
    """Iterate the NIF context dataset and yield one record per document.

    Args:
        nif_context: Path (or URL) of the NIF context file handed to
            ``NIFParser``.

    Yields:
        A dict containing the document text under ``"text"`` and, when
        both can be resolved, ``"doc_name"`` and ``"oldid"``.
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

            if nif_type == "context" and get_resource_fragment(
                    v) == 'isString':
                # Fresh dict per document so values from a previous
                # document cannot leak into a later record.
                str_data: Dict[str, str] = {'text': o.toPython()}

                doc_name = get_resource_name(s)
                old_id = get_resource_attribute(
                    c.identifier, 'oldid')
                # Both lookups may return None; the declared value type
                # is str, so unresolved metadata is omitted instead of
                # storing None.
                if doc_name is not None and old_id is not None:
                    str_data['doc_name'] = doc_name
                    str_data['oldid'] = old_id
                yield str_data
def add_wiki_info(self, pack: DataPack, statements: List):
    """Annotate `pack` with Wikipedia anchor (link) spans from NIF statements.

    Statements about the same character range are grouped first, then one
    ``WikiAnchor`` is created per range that resolves to a link target.

    Args:
        pack: The data pack whose text the anchor ranges refer to.
        statements: Iterable of ``(nif_range, rel, info)`` RDF triples.
    """
    # Group all properties (type, taIdentRef, ...) by their char range.
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]
    ] = defaultdict(dict)
    for nif_range, rel, info in statements:
        range_ = get_resource_attribute(nif_range, "char")
        r = get_resource_fragment(rel)
        if range_ is not None and r is not None:
            link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(",")]

        if end > len(pack.text):
            # Some nif dataset are off by a bit, mostly when there are
            # new line characters, we cannot correct them.
            # but we need to make sure they don't go longer than the
            # text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.",
                end,
            )
            end = len(pack.text)

        if end <= begin:
            # Fixed log message: the closing bracket belongs after the
            # span, not at the end of the sentence.
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        for info_key, info_value in link_infos.items():
            info_value = str(info_value)

            if info_key == "type":
                anchor_type = get_resource_fragment(info_value)
                if (
                    not anchor_type == "Phrase"
                    and not anchor_type == "Word"
                ):
                    logging.warning("Unknown anchor type: %s", info_value)

            if info_key == "taIdentRef":
                target_page_name = get_resource_name(info_value)
                # Follow a redirect, if one is recorded for this page.
                if (
                    target_page_name is not None
                    and target_page_name in self._redirects
                ):
                    target_page_name = self._redirects[target_page_name]

                if target_page_name is not None:
                    # Only create anchor with proper link.
                    anchor = WikiAnchor(pack, begin, end)
                    anchor.target_page_name = target_page_name
                    # If it is an DBpedia resource, the domain will be
                    # truncated, otherwise it will stay the same, meaning
                    # it is an external link.
                    anchor.is_external = target_page_name == str(info_value)
def add_wiki_info(self, pack: DataPack, statements: List):
    """Annotate `pack` with Wikipedia anchor (link) spans from NIF statements.

    Statements about the same character range are grouped first, then one
    ``WikiAnchor`` is created per range that resolves to a link target.

    Args:
        pack: The data pack whose text the anchor ranges refer to.
        statements: Iterable of ``(nif_range, rel, info)`` RDF triples.
    """
    # Group all properties (type, taIdentRef, ...) by their char range.
    link_grouped: DefaultDict[str,
                              Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        if range_ is not None and r is not None:
            link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]

        if end > len(pack.text):
            # Some nif dataset are off by a bit, mostly when there are
            # new line characters, we cannot correct them.
            # but we need to make sure they don't go longer than the
            # text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = len(pack.text)

        if end <= begin:
            # Fixed log message: the closing bracket belongs after the
            # span, not at the end of the sentence.
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        target_page_name = None
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if (not anchor_type == 'Phrase'
                        and not anchor_type == 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                # Follow a redirect, if one is recorded for this page.
                if (target_page_name is not None
                        and target_page_name in self._redirects):
                    target_page_name = self._redirects[target_page_name]

        # Create the anchor only once a link target is known; the
        # original built it up front, leaving dangling anchors (with no
        # or None target_page_name) whenever taIdentRef was absent.
        if target_page_name is not None:
            anchor = WikiAnchor(pack, begin, end)
            anchor.target_page_name = target_page_name