def add_struct(pack: DataPack, struct_statements: List):
    """Instantiate Wiki structure annotations described by NIF statements.

    Only statements whose relation fragment is ``type`` are considered. The
    character span comes from the ``char`` attribute of the NIF range
    resource (a ``"begin,end"`` string), and the fragment of the statement's
    object decides which annotation class is instantiated with
    ``(pack, begin, end)``.

    Args:
        pack: Data pack handed to the annotation constructors; the length of
            its ``text`` is the upper bound for span ends.
        struct_statements: Iterable of ``(nif_range, rel, struct_type)``
            triples.
    """
    entry_types = {
        'Section': WikiSection,
        'Paragraph': WikiParagraph,
        'Title': WikiTitle,
    }

    for nif_range, rel, struct_type in struct_statements:
        if get_resource_fragment(rel) != 'type':
            continue

        char_span = get_resource_attribute(nif_range, 'char')
        begin, end = [int(offset) for offset in char_span.split(',')]

        text_length = len(pack.text)
        if end > text_length:
            # Offsets in some NIF datasets are slightly off (mostly around
            # new line characters). They cannot be corrected here, but the
            # span must never run past the end of the text.
            logging.info(
                "NIF Structure end is %d by %s, "
                "clipped to fit with the text.", end, nif_range)
            end = text_length

        if end <= begin:
            # Nothing left to annotate for an empty or inverted span.
            logging.info("Provided struct [%d:%d] is invalid.", begin, end)
            continue

        entry_type = entry_types.get(get_resource_fragment(struct_type))
        if entry_type is None:
            logging.warning("Unknown struct type: %s", struct_type)
        else:
            entry_type(pack, begin, end)
def _collect(
        self, nif_context: str  # type: ignore
) -> Iterator[Tuple[Dict[str, str], Dict[str, List[state_type]]]]:
    """Yield one ``(str_data, node_data)`` pair per NIF context string.

    Iterates over every statement group produced by
    ``NIFParser(nif_context)``. A pair is yielded for each statement whose
    subject has the ``nif`` attribute ``"context"`` and whose predicate
    fragment is ``isString``. Progress is reported through
    ``print_progress`` for every statement visited, and ``' ..Done'`` is
    printed once the parser is exhausted.

    Args:
        nif_context: Value handed to ``NIFParser``.
            NOTE(review): presumably the path of the NIF context dump;
            confirm against ``NIFParser``.

    Yields:
        A tuple ``(str_data, node_data)`` where ``str_data`` holds the keys
        ``text``, ``doc_name`` and ``oldid`` and ``node_data`` holds the
        keys ``struct`` and ``links`` (looked up in ``self.struct_reader``
        and ``self.link_reader`` by the statement's context ``c``).
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia context: [{c.identifier}]')

            if (nif_type == "context"
                    and get_resource_fragment(v) == 'isString'):
                # Build fresh dicts for every document. Re-yielding one
                # shared, mutated dict would make all results buffered by a
                # consumer alias the most recent document.
                str_data: Dict[str, str] = {
                    'text': o.toPython(),
                    'doc_name': get_resource_name(s),
                    'oldid': get_resource_attribute(c.identifier, 'oldid'),
                }
                node_data: Dict[str, List[state_type]] = {
                    'struct': self.struct_reader.get(c),
                    'links': self.link_reader.get(c),
                }
                yield str_data, node_data
    print(' ..Done')
def add_struct(pack: DataPack, struct_statements: List):
    """Add Wiki structure annotations described by NIF statements to a pack.

    Only statements whose relation fragment is ``type`` are considered. The
    character span comes from the ``char`` attribute of the NIF range
    resource (a ``"begin,end"`` string). The span end is clipped to the
    length of ``pack.text`` and empty or inverted spans are skipped, so no
    annotation reaching outside the text is added.

    Args:
        pack: Data pack that receives the annotations via ``add_entry``.
        struct_statements: Iterable of ``(nif_range, rel, struct_type)``
            triples.
    """
    entry_types = {
        'Section': WikiSection,
        'Paragraph': WikiParagraph,
        'Title': WikiTitle,
    }

    for nif_range, rel, struct_type in struct_statements:
        if get_resource_fragment(rel) != 'type':
            continue

        range_ = get_resource_attribute(nif_range, 'char')
        begin, end = [int(d) for d in range_.split(',')]

        text_length = len(pack.text)
        if end > text_length:
            # Offsets in NIF data can overshoot the text slightly; they
            # cannot be corrected here, but the span must not run past the
            # end of the text.
            logging.info(
                "NIF Structure end is %d by %s, "
                "clipped to fit with the text.", end, nif_range)
            end = text_length

        if end <= begin:
            logging.info("Provided struct [%d:%d] is invalid.", begin, end)
            continue

        entry_type = entry_types.get(get_resource_fragment(struct_type))
        if entry_type is None:
            logging.warning("Unknown struct type: %s", struct_type)
            continue

        pack.add_entry(entry_type(pack, begin, end))
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Add one ``WikiAnchor`` per distinct character range to ``pack``.

    Statements are first grouped by the ``char`` attribute of their NIF
    range (a ``"begin,end"`` string), keyed by relation fragment; a later
    statement with the same range and relation overwrites an earlier one.
    For each range the span end is clipped to the length of ``pack.text``
    and empty or inverted spans are skipped. The anchor's target page name
    is taken from the ``taIdentRef`` statement, after resolving it through
    ``redirects``.

    Args:
        pack: Data pack that receives the anchors via ``add_entry``.
        text_link_statements: Iterable of ``(nif_range, rel, info)``
            triples.
        redirects: Mapping from a page name to the page name that should
            replace it; names missing from the mapping are used unchanged.
    """
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    text_length = len(pack.text)
    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]

        if end > text_length:
            # Offsets in NIF data can overshoot the text slightly; they
            # cannot be corrected here, but the span must not run past the
            # end of the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = text_length

        if end <= begin:
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                # Follow a redirect when one is known for this page.
                target_page_name = redirects.get(
                    target_page_name, target_page_name)
                anchor.set_target_page_name(target_page_name)
        pack.add_entry(anchor)
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Create one ``WikiAnchor`` per distinct, valid character range.

    Statements are first grouped by the ``char`` attribute of their NIF
    range (a ``"begin,end"`` string), keyed by relation fragment; a later
    statement with the same range and relation overwrites an earlier one.
    For each range the span end is clipped to the length of ``pack.text``
    and empty or inverted spans are skipped. The anchor's
    ``target_page_name`` is taken from the ``taIdentRef`` statement, after
    resolving it through ``redirects``.

    Args:
        pack: Data pack handed to the ``WikiAnchor`` constructor; the length
            of its ``text`` is the upper bound for span ends.
        text_link_statements: Iterable of ``(nif_range, rel, info)``
            triples.
        redirects: Mapping from a page name to the page name that should
            replace it; names missing from the mapping are used unchanged.
    """
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are off by a bit, mostly when there are new
            # line characters. They cannot be corrected here, but the span
            # must not go longer than the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = text_length

        if end <= begin:
            # The closing bracket belongs right after the span, not at the
            # end of the message.
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                # Follow a redirect when one is known for this page.
                anchor.target_page_name = redirects.get(
                    target_page_name, target_page_name)