示例#1
0
    def _collect(  # type: ignore
        self, nif_context: str
    ) -> Iterator[Dict[str, str]]:
        """Yield one text record for every usable NIF context statement.

        The statements produced by ``NIFParser(nif_context)`` are scanned
        for entries whose ``nif`` attribute is ``"context"`` and whose
        relation fragment is ``"isString"``. A record is only produced when
        both the document name and the ``oldid`` attribute can be resolved.

        Args:
            nif_context: Value handed to ``NIFParser`` (presumably the path
                of the NIF context file -- confirm against the caller).

        Yields:
            A new dictionary per record with the keys ``"text"``,
            ``"doc_name"`` and ``"oldid"``.
        """
        for context_statements in NIFParser(nif_context):
            # Each statement unpacks to four items; presumably the
            # subject, predicate, object and context of an RDF quad.
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

                fragment = get_resource_fragment(v)
                if nif_type != "context" or fragment != "isString":
                    continue

                doc_name: Optional[str] = get_resource_name(s)
                old_id: Optional[str] = get_resource_attribute(
                    c.identifier, "oldid"
                )
                if doc_name is None or old_id is None:
                    # Without both identifiers the record is unusable.
                    continue

                # Build a fresh dict for every record: re-yielding one
                # shared, mutated dict would make all records stored by a
                # consumer alias the most recent one.
                yield {
                    "text": o.toPython(),
                    "doc_name": doc_name,
                    "oldid": old_id,
                }
示例#2
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Add section, paragraph and title annotations to ``pack``.

        Only statements whose relation fragment is ``'type'`` are used. The
        character range is read from the ``char`` attribute of the NIF
        range resource, clipped to the length of ``pack.text``, and turned
        into a ``WikiSection``, ``WikiParagraph`` or ``WikiTitle``
        depending on the fragment of the structure type.

        Args:
            pack: The data pack that receives the new annotations.
            statements: Iterable of ``(nif_range, rel, struct_type)``
                triples.
        """
        for nif_range, rel, struct_type in statements:
            if get_resource_fragment(rel) != 'type':
                continue

            range_ = get_resource_attribute(nif_range, 'char')
            if range_ is None:
                # No character range on this resource: there is nothing to
                # anchor the structure to, so skip it rather than failing
                # on ``None.split``.
                logging.info(
                    "NIF Structure %s has no char range, skipped.",
                    nif_range)
                continue

            begin, end = [int(d) for d in range_.split(',')]

            text_length = len(pack.text)
            if end > text_length:
                # Some nif dataset are off by a bit, mostly when there
                # are new line characters, we cannot correct them.
                # but we need to make sure they don't go longer than
                # the text.
                logging.info("NIF Structure end is %d by %s, "
                             "clipped to fit with the text.", end,
                             nif_range)
                end = text_length

            if end <= begin:
                logging.info(
                    "Provided struct [%d:%d] is invalid.", begin, end)
                continue

            struct_ = get_resource_fragment(struct_type)

            if struct_ == 'Section':
                WikiSection(pack, begin, end)
            elif struct_ == 'Paragraph':
                WikiParagraph(pack, begin, end)
            elif struct_ == 'Title':
                WikiTitle(pack, begin, end)
            else:
                logging.warning("Unknown struct type: %s", struct_type)
示例#3
0
    def _collect(self, nif_context: str  # type: ignore
                 ) -> Iterator[Dict[str, str]]:
        """Yield one text record for every usable NIF context statement.

        The statements produced by ``NIFParser(nif_context)`` are scanned
        for entries whose ``nif`` attribute is ``"context"`` and whose
        relation fragment is ``'isString'``.

        Args:
            nif_context: Value handed to ``NIFParser`` (presumably the path
                of the NIF context file -- confirm against the caller).

        Yields:
            A new dictionary per record with the keys ``'text'``,
            ``'doc_name'`` and ``'oldid'``. Records for which the document
            name or the ``oldid`` attribute resolves to ``None`` are
            skipped, so every yielded value is a string as the return
            annotation promises.
        """
        for context_statements in NIFParser(nif_context):
            # Each statement unpacks to four items; presumably the
            # subject, predicate, object and context of an RDF quad.
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

                if nif_type != "context":
                    continue
                if get_resource_fragment(v) != 'isString':
                    continue

                doc_name = get_resource_name(s)
                old_id = get_resource_attribute(c.identifier, 'oldid')
                if doc_name is None or old_id is None:
                    # Without both identifiers the record is unusable.
                    continue

                # Build a fresh dict for every record: re-yielding one
                # shared, mutated dict would make all records stored by a
                # consumer alias the most recent one.
                yield {
                    'text': o.toPython(),
                    'doc_name': doc_name,
                    'oldid': old_id,
                }
示例#4
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Add ``WikiAnchor`` annotations for the links found in statements.

        Statements are first grouped by the ``char`` range attribute of
        their NIF range resource, so that all information about one anchor
        is handled together. Each range is clipped to the length of
        ``pack.text``; empty or inverted ranges are skipped. An anchor is
        created only for a ``taIdentRef`` entry whose target page name can
        be resolved; the name is mapped through ``self._redirects`` when it
        appears there.

        Args:
            pack: The data pack that receives the new annotations.
            statements: Iterable of ``(nif_range, rel, info)`` triples.
        """
        link_grouped: DefaultDict[
            str, Dict[str, rdflib.term.Node]
        ] = defaultdict(dict)
        for nif_range, rel, info in statements:
            range_ = get_resource_attribute(nif_range, "char")
            r = get_resource_fragment(rel)
            if range_ is not None and r is not None:
                link_grouped[range_][r] = info

        text_length = len(pack.text)
        for range_, link_infos in link_grouped.items():
            begin, end = [int(d) for d in range_.split(",")]

            if end > text_length:
                # Some nif dataset are off by a bit, mostly when there are
                # new line characters, we cannot correct them.
                # but we need to make sure they don't go longer than the
                # text.
                logging.info(
                    "Provided anchor end is %d, "
                    "clipped to fit with the text.",
                    end,
                )
                end = text_length

            if end <= begin:
                logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
                continue

            for info_key, info_node in link_infos.items():
                info_value = str(info_node)
                if info_key == "type":
                    # The type is only validated; it does not influence
                    # whether an anchor is created.
                    anchor_type = get_resource_fragment(info_value)
                    if anchor_type not in ("Phrase", "Word"):
                        logging.warning("Unknown anchor type: %s", info_value)
                elif info_key == "taIdentRef":
                    target_page_name = get_resource_name(info_value)
                    if target_page_name is None:
                        # Only create anchor with proper link.
                        continue

                    if target_page_name in self._redirects:
                        target_page_name = self._redirects[target_page_name]

                    anchor = WikiAnchor(pack, begin, end)
                    anchor.target_page_name = target_page_name
                    # If it is an DBpedia resource, the domain will be
                    # truncated, otherwise it will stay the same, meaning
                    # it is an external link.
                    anchor.is_external = target_page_name == info_value
示例#5
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Add ``WikiAnchor`` annotations for the links found in statements.

        Statements are first grouped by the ``char`` range attribute of
        their NIF range resource, so that all information about one anchor
        is handled together. Each range is clipped to the length of
        ``pack.text``; empty or inverted ranges are skipped. One anchor is
        created per remaining range, and its ``target_page_name`` is set
        from the ``taIdentRef`` entry when one is present (mapped through
        ``self._redirects`` when the name appears there).

        Args:
            pack: The data pack that receives the new annotations.
            statements: Iterable of ``(nif_range, rel, info)`` triples.
        """
        link_grouped: DefaultDict[str,
                                  Dict[str,
                                       rdflib.term.Node]] = defaultdict(dict)
        for nif_range, rel, info in statements:
            range_ = get_resource_attribute(nif_range, 'char')
            r = get_resource_fragment(rel)
            if range_ is not None and r is not None:
                link_grouped[range_][r] = info

        text_length = len(pack.text)
        for range_, link_infos in link_grouped.items():
            begin, end = [int(d) for d in range_.split(',')]

            if end > text_length:
                # Some nif dataset are off by a bit, mostly when there are
                # new line characters, we cannot correct them.
                # but we need to make sure they don't go longer than the
                # text.
                logging.info(
                    "Provided anchor end is %d, "
                    "clipped to fit with the text.", end)
                end = text_length

            if end <= begin:
                logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
                continue

            # The anchor is created for every valid range, even when no
            # 'taIdentRef' entry follows.
            anchor = WikiAnchor(pack, begin, end)
            for info_key, info_value in link_infos.items():
                if info_key == 'type':
                    # The type is only validated; it does not influence
                    # the anchor itself.
                    anchor_type = get_resource_fragment(info_value)
                    if anchor_type not in ('Phrase', 'Word'):
                        logging.warning("Unknown anchor type: %s", info_value)
                elif info_key == 'taIdentRef':
                    target_page_name = get_resource_name(info_value)
                    if (target_page_name is not None
                            and target_page_name in self._redirects):
                        target_page_name = self._redirects[target_page_name]
                    # NOTE(review): this may store None if the name cannot
                    # be resolved -- confirm that consumers handle it.
                    anchor.target_page_name = target_page_name