Example #1
0
    def test_grouped_nif_reader(self):
        """Read a small ``.tql`` fixture through ``ContextGroupedNIFReader``
        and check that the predicate fragments are grouped per resource name.

        For every (subject, verb, object) triple, the verb's resource
        fragment is appended to the list keyed by the subject's resource
        name; the result must match the hand-written ``expected`` mapping.
        """
        p = os.path.join(self.data_dir, "nif_context.tql")
        parsed = {}
        # The context yielded by the reader is not needed here.
        for _, statements in ContextGroupedNIFReader(p):
            for s, v, _ in statements:
                r = get_resource_fragment(v)
                # setdefault replaces the try/except-KeyError grouping idiom
                # and avoids computing get_resource_name(s) twice.
                parsed.setdefault(get_resource_name(s), []).append(r)
        expected = {
            "Animalia_(book)": [
                "type",
                "beginIndex",
                "endIndex",
                "sourceUrl",
                "isString",
                "predLang",
            ],
            "List_of_Atlas_Shrugged_characters": [
                "type",
                "beginIndex",
                "endIndex",
                "sourceUrl",
            ],
        }
        self.assertEqual(parsed, expected)
Example #2
0
 def _collect(  # type: ignore
         self, nif_path: str) -> Iterator[Tuple[str, Dict[
             str, List[state_type]]]]:
     """Iterate the NIF file and yield ``(resource_name, statements)``
     pairs, skipping any group whose name cannot be resolved.
     """
     for _, grouped_statements in ContextGroupedNIFReader(nif_path):
         # The group's name comes from the subject of its first statement.
         doc_name = get_resource_name(grouped_statements[0][0])
         if doc_name is None:
             continue
         yield doc_name, grouped_statements
Example #3
0
 def _collect(  # type: ignore
     self, nif_path: str
 ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
     """Iterate the NIF file and yield ``(resource_name, statements)``
     pairs, skipping unnamed groups and documents already present in
     ``self._resume_index`` (reporting a running skip count for the latter).
     """
     skipped = 0
     for _, grouped_statements in ContextGroupedNIFReader(nif_path):
         # The group's name comes from the subject of its first statement.
         doc_name = get_resource_name(grouped_statements[0][0])
         if doc_name is None:
             continue
         if doc_name in self._resume_index:
             # Already processed in a previous run; count and report.
             skipped += 1
             print_progress(
                 f"Skipped {skipped} documents", terminal_only=True
             )
             continue
         yield doc_name, grouped_statements
Example #4
0
    def test_grouped_nif_reader(self):
        """Read a small ``.tql`` fixture through ``ContextGroupedNIFReader``
        and check that the predicate fragments are grouped per resource name.

        For every (subject, verb, object) triple, the verb's resource
        fragment is appended to the list keyed by the subject's resource
        name; the result must match the hand-written ``expected`` mapping.
        """
        p = os.path.join(self.data_dir, 'nif_context.tql')
        parsed = {}
        # The context yielded by the reader is not needed here.
        for _, statements in ContextGroupedNIFReader(p):
            for s, v, _ in statements:
                r = get_resource_fragment(v)
                # setdefault replaces the try/except-KeyError grouping idiom
                # and avoids computing get_resource_name(s) twice.
                parsed.setdefault(get_resource_name(s), []).append(r)
        expected = {
            'Animalia_(book)': [
                'type', 'beginIndex', 'endIndex', 'sourceUrl', 'isString',
                'predLang'
            ],
            'List_of_Atlas_Shrugged_characters':
            ['type', 'beginIndex', 'endIndex', 'sourceUrl']
        }
        self.assertEqual(parsed, expected)
Example #5
0
def load_from_nif(link_file, output_file):
    """Scan a NIF link file for reciprocal (bidirectional) links and write
    each pair found to *output_file* as a tab-separated line.

    A one-directional link ``src -> tgt`` is remembered in ``linkings``;
    when the reverse link ``tgt -> src`` is later seen, the pair counts as
    a bi-link, is removed from the pending map, and is flushed to disk
    immediately. Progress is reported continuously via ``print_progress``.
    """
    linkings = {}
    bilinks = []

    num_articles = 0
    num_bilinks = 0

    start_time = timeit.default_timer()
    with open(output_file, "w") as out:
        for _, statements in ContextGroupedNIFReader(link_file):
            num_articles += 1

            for nif_range, rel, info in statements:
                # Only "taIdentRef" relations describe entity links;
                # a None fragment also fails this comparison and is skipped.
                if get_resource_fragment(rel) != "taIdentRef":
                    continue

                src_name = get_resource_name(nif_range)
                target_name = get_resource_name(info)

                # Ignore self-links.
                if src_name == target_name:
                    continue

                if linkings.get(target_name) != src_name:
                    # First direction seen; remember and wait for reverse.
                    linkings[src_name] = target_name
                    continue

                # Reverse direction already recorded: a bi-link.
                bilinks.append((src_name, target_name))
                linkings.pop(target_name)
                num_bilinks += 1
                out.write(f"{src_name}\t{target_name}\n")
                out.flush()

            elapsed = timeit.default_timer() - start_time
            print_progress(
                f"{num_bilinks} bi-links found in {num_articles} after "
                f"{datetime.timedelta(seconds=elapsed)}, speed is "
                f"{num_articles / elapsed:.2f} (packs/second)."
            )
Example #6
0
 def _collect(self, nif_path: str  # type: ignore
              ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
     """Iterate the NIF file and yield ``(resource_name, statements)``
     pairs for every context group.

     NOTE(review): unlike similar collectors, this variant does not filter
     out groups whose resource name resolves to None — confirm callers
     tolerate a None key.
     """
     for _, grouped_statements in ContextGroupedNIFReader(nif_path):
         doc_name = get_resource_name(grouped_statements[0][0])
         yield doc_name, grouped_statements