Пример #1
0
    def test_grouped_nif_reader(self):
        """Check the predicate fragments grouped under each resource name.

        Reads ``nif_context.tql`` from ``self.data_dir`` through
        ``ContextGroupedNIFReader`` and, for every statement, records the
        fragment of its predicate under the resource name of its subject,
        in the order the statements are produced.
        """
        p = os.path.join(self.data_dir, "nif_context.tql")
        parsed = {}
        for _, statements in ContextGroupedNIFReader(p):
            for s, v, _ in statements:
                fragment = get_resource_fragment(v)
                # Resolve the resource name once per statement; setdefault
                # creates the list the first time a name is seen.
                parsed.setdefault(get_resource_name(s), []).append(fragment)
        expected = {
            "Animalia_(book)": [
                "type",
                "beginIndex",
                "endIndex",
                "sourceUrl",
                "isString",
                "predLang",
            ],
            "List_of_Atlas_Shrugged_characters": [
                "type",
                "beginIndex",
                "endIndex",
                "sourceUrl",
            ],
        }
        self.assertEqual(parsed, expected)
Пример #2
0
 def add_wiki_info(self, pack: DataPack, statements: List[state_type]):
     """Add a ``WikiCategory`` entry to ``pack`` for each named object.

     For every ``(subject, predicate, object)`` statement the resource
     name of the object is looked up; statements for which no name is
     returned are ignored.

     Args:
         pack: The data pack that receives the category entries.
         statements: Triples whose third element is the category
             resource.
     """
     for _subject, _predicate, obj in statements:
         category_name = get_resource_name(obj)
         if category_name is None:
             # Nothing to record for this statement.
             continue

         category = WikiCategory(pack)
         category.values.append(category_name)
         pack.add_entry(category)
Пример #3
0
    def _collect(  # type: ignore
        self, nif_context: str
    ) -> Iterator[Dict[str, str]]:
        """Yield one record per document text found by ``NIFParser``.

        A statement is used only when its subject's ``nif`` attribute is
        ``"context"`` and its predicate fragment is ``"isString"``.

        Args:
            nif_context: The value handed to ``NIFParser`` (presumably the
                path of the NIF context file -- confirm against callers).

        Yields:
            A new dict per document with the keys ``"text"``,
            ``"doc_name"`` and ``"oldid"``. Statements for which either
            the document name or the ``oldid`` cannot be resolved are
            skipped.
        """
        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

                fragment = get_resource_fragment(v)
                if nif_type != "context" or fragment != "isString":
                    continue

                text = o.toPython()
                doc_name: Optional[str] = get_resource_name(s)
                old_id: Optional[str] = get_resource_attribute(
                    c.identifier, "oldid"
                )
                if doc_name is None or old_id is None:
                    continue

                # Build a fresh dict for every document: yielding one shared
                # dict would overwrite records a consumer has kept.
                yield {"text": text, "doc_name": doc_name, "oldid": old_id}
Пример #4
0
 def add_wiki_info(self, pack: DataPack, statements: List):
     """Create a ``WikiInfoBoxProperty`` in ``pack`` for every statement.

     The entry key is the Python value of the statement's predicate and
     the entry value is the resource name of its object.

     Args:
         pack: The data pack the entries are constructed with.
         statements: ``(subject, predicate, object)`` triples.
     """
     for _subject, predicate, obj in statements:
         key, value = predicate.toPython(), get_resource_name(obj)
         # NOTE(review): ``value`` is stored as returned, even if the
         # lookup yields None -- confirm that is intended.
         prop = WikiInfoBoxProperty(pack)
         prop.key = key
         prop.value = value
Пример #5
0
 def add_wiki_info(self, pack: DataPack, info_box_statements: List):
     """Create a ``WikiInfoBoxMapped`` in ``pack`` per named object.

     Statements whose object has no resource name are ignored. For the
     others, the entry key is the Python value of the predicate and the
     entry value is the object's resource name.

     Args:
         pack: The data pack the entries are constructed with.
         info_box_statements: ``(subject, predicate, object)`` triples.
     """
     for _subject, predicate, obj in info_box_statements:
         value = get_resource_name(obj)
         if value is None:
             continue

         mapped = WikiInfoBoxMapped(pack)
         mapped.key = predicate.toPython()
         mapped.value = value
Пример #6
0
 def _collect(  # type: ignore
         self, nif_path: str) -> Iterator[Tuple[str, Dict[
             str, List[state_type]]]]:
     """Yield ``(name, statements)`` for each group read from ``nif_path``.

     The name is the resource name of the subject of the group's first
     statement; groups for which no name is returned are skipped.

     Args:
         nif_path: The value handed to ``ContextGroupedNIFReader``.
     """
     for _context, statements in ContextGroupedNIFReader(nif_path):
         first_subject = statements[0][0]
         doc_name = get_resource_name(first_subject)
         if doc_name is None:
             continue
         yield doc_name, statements
Пример #7
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Create ``WikiAnchor`` entries in ``pack`` from link statements.

        Statements are first grouped by the ``char`` attribute of their
        subject, a ``"begin,end"`` pair of integers; within a group each
        predicate fragment maps to its object. For every group with a
        usable span, a ``taIdentRef`` entry whose page name resolves
        (after applying ``self._redirects``) produces one anchor.

        Args:
            pack: The data pack that receives the anchors; the length of
                its ``text`` bounds the anchor end offset.
            statements: ``(subject, predicate, object)`` triples.
        """
        link_grouped: DefaultDict[
            str, Dict[str, rdflib.term.Node]
        ] = defaultdict(dict)
        for nif_range, rel, info in statements:
            range_ = get_resource_attribute(nif_range, "char")
            r = get_resource_fragment(rel)
            if range_ is not None and r is not None:
                link_grouped[range_][r] = info

        for range_, link_infos in link_grouped.items():
            begin, end = [int(d) for d in range_.split(",")]

            if end > len(pack.text):
                # Some nif dataset are off by a bit, mostly when there are
                # new line characters, we cannot correct them.
                # but we need to make sure they don't go longer than the
                # text.
                logging.info(
                    "Provided anchor end is %d, "
                    "clipped to fit with the text.",
                    end,
                )
                end = len(pack.text)

            if end <= begin:
                # Empty or reversed span (possibly after clipping).
                logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
                continue

            for info_key, info_node in link_infos.items():
                info_value = str(info_node)
                if info_key == "type":
                    anchor_type = get_resource_fragment(info_value)
                    if anchor_type not in ("Phrase", "Word"):
                        logging.warning("Unknown anchor type: %s", info_value)
                elif info_key == "taIdentRef":
                    target_page_name = get_resource_name(info_value)
                    if (
                        target_page_name is not None
                        and target_page_name in self._redirects
                    ):
                        target_page_name = self._redirects[target_page_name]

                    if target_page_name is not None:
                        # Only create anchor with proper link.
                        anchor = WikiAnchor(pack, begin, end)
                        anchor.target_page_name = target_page_name
                        # If it is an DBpedia resource, the domain will be
                        # truncated, otherwise it will stay the same, meaning
                        # it is an external link.
                        anchor.is_external = target_page_name == info_value
Пример #8
0
    def test_grouped_nif_reader(self):
        """Check the predicate fragments grouped under each resource name.

        Reads ``nif_context.tql`` from ``self.data_dir`` through
        ``ContextGroupedNIFReader`` and, for every statement, records the
        fragment of its predicate under the resource name of its subject,
        in the order the statements are produced.
        """
        p = os.path.join(self.data_dir, 'nif_context.tql')
        parsed = {}
        for _, statements in ContextGroupedNIFReader(p):
            for s, v, _ in statements:
                r = get_resource_fragment(v)
                # One name lookup per statement; setdefault creates the
                # list the first time a name is seen.
                parsed.setdefault(get_resource_name(s), []).append(r)
        expected = {
            'Animalia_(book)': [
                'type', 'beginIndex', 'endIndex', 'sourceUrl', 'isString',
                'predLang'
            ],
            'List_of_Atlas_Shrugged_characters':
            ['type', 'beginIndex', 'endIndex', 'sourceUrl']
        }
        self.assertEqual(parsed, expected)
Пример #9
0
 def _collect(  # type: ignore
     self, nif_path: str
 ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
     """Yield ``(name, statements)`` for groups not in the resume index.

     The name is the resource name of the subject of the group's first
     statement. Groups without a name are dropped silently; groups whose
     name is in ``self._resume_index`` are counted and reported through
     ``print_progress`` instead of being yielded.

     Args:
         nif_path: The value handed to ``ContextGroupedNIFReader``.
     """
     num_skipped = 0
     for _context, statements in ContextGroupedNIFReader(nif_path):
         name = get_resource_name(statements[0][0])
         if name is None:
             continue

         if name in self._resume_index:
             num_skipped += 1
             print_progress(
                 f"Skipped {num_skipped} documents", terminal_only=True
             )
             continue

         yield name, statements
Пример #10
0
def load_from_nif(link_file, output_file):
    """Find bidirectional links in a NIF link file and write them out.

    Every statement whose predicate fragment is ``taIdentRef`` is read as
    a link from the subject's resource name to the object's resource
    name. When the reverse link has been recorded earlier, the pair is
    written to ``output_file`` as ``source<TAB>target``; otherwise the
    link is remembered for later matching. Progress is reported after
    every statement group.

    Args:
        link_file: The value handed to ``ContextGroupedNIFReader``.
        output_file: Path of the text file to (over)write; it is written
            as UTF-8.
    """
    # NOTE(review): only the most recent target is remembered per source,
    # so earlier links of the same source cannot be matched any more --
    # confirm that this is acceptable.
    linkings = {}

    num_articles = 0
    num_bilinks = 0

    start_time = timeit.default_timer()
    with open(output_file, "w", encoding="utf-8") as out:
        for _, statements in ContextGroupedNIFReader(link_file):
            num_articles += 1

            for nif_range, rel, info in statements:
                if get_resource_fragment(rel) != "taIdentRef":
                    continue

                src_name = get_resource_name(nif_range)
                target_name = get_resource_name(info)

                # An unresolved name cannot take part in a link. Without
                # this guard a None source would "match" a target that was
                # never recorded and the pop below would raise KeyError.
                if src_name is None or target_name is None:
                    continue

                if src_name == target_name:
                    continue

                if linkings.get(target_name) == src_name:
                    linkings.pop(target_name)
                    num_bilinks += 1
                    out.write(f"{src_name}\t{target_name}\n")
                    out.flush()
                else:
                    linkings[src_name] = target_name

            elapsed = timeit.default_timer() - start_time
            # The timer may not have advanced yet on the first group.
            speed = num_articles / elapsed if elapsed > 0 else 0.0
            print_progress(
                f"{num_bilinks} bi-links found in {num_articles} after "
                f"{datetime.timedelta(seconds=elapsed)}, speed is "
                f"{speed:.2f} (packs/second)."
            )
Пример #11
0
    def _collect(self, nif_context: str  # type: ignore
                 ) -> Iterator[Dict[str, str]]:
        """Yield one record per document text found by ``NIFParser``.

        A statement is used only when its subject's ``nif`` attribute is
        ``"context"`` and its predicate fragment is ``'isString'``.

        Args:
            nif_context: The value handed to ``NIFParser`` (presumably the
                path of the NIF context file -- confirm against callers).

        Yields:
            A new dict per document with the keys ``'text'``,
            ``'doc_name'`` and ``'oldid'``. Statements for which the
            document name or the ``oldid`` cannot be resolved are skipped,
            so every yielded value is a string as the annotation states.
        """
        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

                if nif_type != "context":
                    continue
                if get_resource_fragment(v) != 'isString':
                    continue

                text = o.toPython()
                doc_name = get_resource_name(s)
                old_id = get_resource_attribute(c.identifier, 'oldid')
                if doc_name is None or old_id is None:
                    continue

                # Build a fresh dict for every document: yielding one shared
                # dict would overwrite records a consumer has kept.
                yield {'text': text, 'doc_name': doc_name, 'oldid': old_id}
Пример #12
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Create ``WikiAnchor`` entries in ``pack`` from link statements.

        Statements are first grouped by the ``char`` attribute of their
        subject, a ``'begin,end'`` pair of integers; within a group each
        predicate fragment maps to its object. Every group with a usable
        span produces one anchor, whose ``target_page_name`` is set from
        the group's ``taIdentRef`` entry after applying
        ``self._redirects``.

        Args:
            pack: The data pack that receives the anchors; the length of
                its ``text`` bounds the anchor end offset.
            statements: ``(subject, predicate, object)`` triples.
        """
        link_grouped: DefaultDict[str,
                                  Dict[str,
                                       rdflib.term.Node]] = defaultdict(dict)
        for nif_range, rel, info in statements:
            range_ = get_resource_attribute(nif_range, 'char')
            r = get_resource_fragment(rel)
            if range_ is not None and r is not None:
                link_grouped[range_][r] = info

        for range_, link_infos in link_grouped.items():
            begin, end = [int(d) for d in range_.split(',')]

            if end > len(pack.text):
                # Some nif dataset are off by a bit, mostly when there are
                # new line characters, we cannot correct them.
                # but we need to make sure they don't go longer than the
                # text.
                logging.info(
                    "Provided anchor end is %d, "
                    "clipped to fit with the text.", end)
                end = len(pack.text)

            if end <= begin:
                # Empty or reversed span (possibly after clipping).
                logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
                continue

            # NOTE(review): the anchor is created even when the group has no
            # resolvable taIdentRef -- confirm that this is intended.
            anchor = WikiAnchor(pack, begin, end)
            for info_key, info_value in link_infos.items():
                if info_key == 'type':
                    anchor_type = get_resource_fragment(info_value)
                    if anchor_type not in ('Phrase', 'Word'):
                        logging.warning("Unknown anchor type: %s", info_value)
                elif info_key == 'taIdentRef':
                    target_page_name = get_resource_name(info_value)
                    if (target_page_name is not None
                            and target_page_name in self._redirects):
                        target_page_name = self._redirects[target_page_name]
                    anchor.target_page_name = target_page_name
Пример #13
0
    def test_nif_parser(self):
        """Check the tuples derived from every statement of the sample file.

        Each statement read from ``nif_page_structure.tql`` is reduced to
        ``(context base, predicate fragment, resource name, subject URL
        without parameters)``. Every expected tuple shares the same three
        URL/name values and differs only in the fragment, so the expected
        list is built from the ordered fragments.
        """
        p = os.path.join(self.data_dir, "nif_page_structure.tql")

        parsed = []
        for statements in NIFParser(p):
            for s, v, _, c in statements:
                row = (
                    context_base(c),
                    get_resource_fragment(v),
                    get_resource_name(s),
                    strip_url_params(s),
                )
                parsed.append(row)

        wiki_url = "http://en.wikipedia.org/wiki/Animalia_(book)"
        page_name = "Animalia_(book)"
        resource_url = "http://dbpedia.org/resource/Animalia_(book)"

        # Predicate fragments in the order the parser is expected to
        # produce them.
        fragments = [
            "type",
            "notation",
            "beginIndex",
            "endIndex",
            "referenceContext",
            "superString",
            "hasSection",
            "firstSection",
            "lastSection",
            "type",
            "beginIndex",
            "endIndex",
            "referenceContext",
            "superString",
            "hasParagraph",
            "lastParagraph",
            "type",
            "notation",
            "beginIndex",
            "endIndex",
            "referenceContext",
            "superString",
            "hasSection",
            "firstSection",
            "type",
            "referenceContext",
            "beginIndex",
            "endIndex",
        ]
        expected = [
            (wiki_url, fragment, page_name, resource_url)
            for fragment in fragments
        ]
        self.assertEqual(parsed, expected)
Пример #14
0
    def test_nif_parser(self):
        """Check the tuples derived from every statement of the sample file.

        Each statement read from ``nif_page_structure.tql`` is reduced to
        ``(context base, predicate fragment, resource name, subject URL
        without parameters)`` and the resulting list is compared, in
        order, with the expected one.
        """
        p = os.path.join(self.data_dir, 'nif_page_structure.tql')

        parsed = [
            (context_base(c), get_resource_fragment(v),
             get_resource_name(s), strip_url_params(s))
            for statements in NIFParser(p)
            for s, v, _, c in statements
        ]

        wiki = 'http://en.wikipedia.org/wiki/Animalia_(book)'
        name = 'Animalia_(book)'
        resource = 'http://dbpedia.org/resource/Animalia_(book)'

        # All expected tuples share the three values above; only the
        # fragment varies. The rows are split at each 'type' fragment
        # purely for readability.
        fragment_runs = [
            ['type', 'notation', 'beginIndex', 'endIndex',
             'referenceContext', 'superString', 'hasSection',
             'firstSection', 'lastSection'],
            ['type', 'beginIndex', 'endIndex', 'referenceContext',
             'superString', 'hasParagraph', 'lastParagraph'],
            ['type', 'notation', 'beginIndex', 'endIndex',
             'referenceContext', 'superString', 'hasSection',
             'firstSection'],
            ['type', 'referenceContext', 'beginIndex', 'endIndex'],
        ]
        expected = [
            (wiki, fragment, name, resource)
            for run in fragment_runs
            for fragment in run
        ]
        self.assertEqual(parsed, expected)
Пример #15
0
 def _collect(self, nif_path: str  # type: ignore
              ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
     """Yield ``(name, statements)`` for each group read from ``nif_path``.

     The name is the resource name of the subject of the group's first
     statement. Groups for which no name is returned are skipped, so the
     first element of every yielded pair is a string as the annotation
     states.

     Args:
         nif_path: The value handed to ``ContextGroupedNIFReader``.
     """
     for _, statements in ContextGroupedNIFReader(nif_path):
         name = get_resource_name(statements[0][0])
         if name is not None:
             yield name, statements