示例#1
0
def add_struct(pack: DataPack, struct_statements: List):
    """Create Wikipedia structure entries on ``pack`` from NIF statements.

    Only statements whose relation fragment is ``'type'`` are used. The
    character range is read from the ``char`` attribute of the statement
    subject and parsed as ``"begin,end"``. An end offset beyond the length
    of ``pack.text`` is clipped to that length; a range that is empty or
    reversed (after clipping) is logged and skipped.

    Args:
        pack: The data pack the new entries are created on.
        struct_statements: A list of ``(nif_range, rel, struct_type)``
            triples.
    """
    # Fragment of the structure type -> entry class to instantiate.
    entry_classes = {
        'Section': WikiSection,
        'Paragraph': WikiParagraph,
        'Title': WikiTitle,
    }

    for nif_range, rel, struct_type in struct_statements:
        if get_resource_fragment(rel) != 'type':
            continue

        char_range = get_resource_attribute(nif_range, 'char')
        begin, end = (int(offset) for offset in char_range.split(','))

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are slightly off (mostly around new line
            # characters). That cannot be corrected here, but the span must
            # never run past the end of the text.
            logging.info(
                "NIF Structure end is %d by %s, "
                "clipped to fit with the text.", end, nif_range)
            end = text_length

        if end <= begin:
            logging.info("Provided struct [%d:%d] is invalid.", begin, end)
            continue

        entry_class = entry_classes.get(get_resource_fragment(struct_type))
        if entry_class is None:
            logging.warning("Unknown struct type: %s", struct_type)
        else:
            entry_class(pack, begin, end)
示例#2
0
    def _collect(
        self,
        nif_context: str  # type: ignore
    ) -> Iterator[Tuple[Dict[str, str], Dict[str, List[state_type]]]]:
        """Yield the text and associated statements of each NIF context.

        Iterates over the statement groups produced by
        ``NIFParser(nif_context)``. For every statement whose subject has
        the ``nif`` attribute ``"context"`` and whose predicate fragment is
        ``'isString'``, one pair of dictionaries is yielded.

        Args:
            nif_context: The value handed to ``NIFParser`` to read the
                context statements from.

        Yields:
            A ``(str_data, node_data)`` pair. ``str_data`` holds the keys
            ``'text'``, ``'doc_name'`` and ``'oldid'``; ``node_data`` holds
            the keys ``'struct'`` and ``'links'``, filled from
            ``self.struct_reader`` and ``self.link_reader`` for the
            statement's context. Each pair consists of newly created
            dictionaries, so callers may keep them after advancing the
            generator.
        """
        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f'Collecting DBpedia context: [{c.identifier}]')

                if (nif_type != "context"
                        or get_resource_fragment(v) != 'isString'):
                    continue

                # Build fresh dictionaries for every document: re-using one
                # shared pair would overwrite items the consumer still
                # holds from earlier iterations.
                str_data: Dict[str, str] = {
                    'text': o.toPython(),
                    'doc_name': get_resource_name(s),
                    'oldid': get_resource_attribute(c.identifier, 'oldid'),
                }
                node_data: Dict[str, List[state_type]] = {
                    'struct': self.struct_reader.get(c),
                    'links': self.link_reader.get(c),
                }

                yield str_data, node_data
        print(' ..Done')
def add_struct(pack: DataPack, struct_statements: List):
    """Add Wikipedia structure entries to ``pack`` from NIF statements.

    Only statements whose relation fragment is ``'type'`` are used. The
    character range is read from the ``char`` attribute of the statement
    subject and parsed as ``"begin,end"``. An end offset beyond the length
    of ``pack.text`` is clipped to that length, and a range that is empty
    or reversed (after clipping) is logged and skipped instead of being
    added as an entry.

    Args:
        pack: The data pack the entries are added to.
        struct_statements: A list of ``(nif_range, rel, struct_type)``
            triples.
    """
    # Fragment of the structure type -> entry class to instantiate.
    entry_classes = {
        'Section': WikiSection,
        'Paragraph': WikiParagraph,
        'Title': WikiTitle,
    }

    for nif_range, rel, struct_type in struct_statements:
        if get_resource_fragment(rel) != 'type':
            continue

        range_ = get_resource_attribute(nif_range, 'char')
        begin, end = [int(d) for d in range_.split(',')]

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are slightly off (mostly around new line
            # characters). That cannot be corrected here, but the span must
            # never run past the end of the text.
            logging.info(
                "NIF Structure end is %d by %s, "
                "clipped to fit with the text.", end, nif_range)
            end = text_length

        if end <= begin:
            logging.info("Provided struct [%d:%d] is invalid.", begin, end)
            continue

        entry_class = entry_classes.get(get_resource_fragment(struct_type))
        if entry_class is None:
            logging.warning("Unknown struct type: %s", struct_type)
            continue

        pack.add_entry(entry_class(pack, begin, end))
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Add a ``WikiAnchor`` entry to ``pack`` for each linked text range.

    The statements are first grouped by the ``char`` attribute of their
    subject, so that all information about one range ends up on a single
    anchor. Each range is parsed as ``"begin,end"``. An end offset beyond
    the length of ``pack.text`` is clipped to that length, and a range that
    is empty or reversed (after clipping) is logged and skipped.

    Args:
        pack: The data pack the anchors are added to.
        text_link_statements: ``(nif_range, rel, info)`` triples describing
            the links.
        redirects: Maps a page name to the page name it redirects to. A
            link target found in this mapping is replaced by its value.
    """
    link_grouped: DefaultDict[str, Dict[str,
                                        rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are slightly off (mostly around new line
            # characters). That cannot be corrected here, but the span must
            # never run past the end of the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = text_length

        if end <= begin:
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                # Follow the redirect when one is known for this page.
                anchor.set_target_page_name(
                    redirects.get(target_page_name, target_page_name))
        pack.add_entry(anchor)
示例#5
0
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Create a ``WikiAnchor`` on ``pack`` for each linked text range.

    The statements are first grouped by the ``char`` attribute of their
    subject, so that all information about one range ends up on a single
    anchor. Each range is parsed as ``"begin,end"``. An end offset beyond
    the length of ``pack.text`` is clipped to that length, and a range that
    is empty or reversed (after clipping) is logged and skipped.

    Args:
        pack: The data pack the anchors are created on.
        text_link_statements: ``(nif_range, rel, info)`` triples describing
            the links.
        redirects: Maps a page name to the page name it redirects to. A
            link target found in this mapping is replaced by its value.
    """
    link_grouped: DefaultDict[str, Dict[str,
                                        rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are slightly off (mostly around new line
            # characters). That cannot be corrected here, but the span must
            # never run past the end of the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = text_length

        if end <= begin:
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                # Follow the redirect when one is known for this page.
                anchor.target_page_name = redirects.get(
                    target_page_name, target_page_name)