def test_grouped_nif_reader(self):
    p = os.path.join(self.data_dir, "nif_context.tql")
    parsed = {}
    for context, statements in ContextGroupedNIFReader(p):
        for statement in statements:
            s, v, o = statement
            r = get_resource_fragment(v)
            n = get_resource_name(s)
            try:
                parsed[n].append(r)
            except KeyError:
                parsed[n] = [r]
    expected = {
        "Animalia_(book)": [
            "type",
            "beginIndex",
            "endIndex",
            "sourceUrl",
            "isString",
            "predLang",
        ],
        "List_of_Atlas_Shrugged_characters": [
            "type",
            "beginIndex",
            "endIndex",
            "sourceUrl",
        ],
    }
    self.assertEqual(parsed, expected)
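# The expected values above hint at what the two URI helpers return:
# get_resource_fragment takes the part after '#' in a predicate IRI
# (e.g. '...nif-core#beginIndex' -> 'beginIndex'), and get_resource_name
# takes the last path segment of a resource IRI. A minimal sketch,
# assuming DBpedia-style IRIs; the library's actual implementations
# may handle more cases:

from typing import Optional
from urllib.parse import urlparse


def get_resource_fragment_sketch(url: str) -> Optional[str]:
    # '...nlp2rdf/ontologies/nif-core#beginIndex' -> 'beginIndex'
    fragment = urlparse(url).fragment
    return fragment if fragment else None


def get_resource_name_sketch(url: str) -> Optional[str]:
    # 'http://dbpedia.org/resource/Animalia_(book)?dbpv=2016-10&nif=context'
    # -> 'Animalia_(book)'; the query string and fragment are ignored.
    name = urlparse(url).path.rsplit("/", 1)[-1]
    return name or None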
def _collect(  # type: ignore
    self, nif_path: str
) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
    for _, statements in ContextGroupedNIFReader(nif_path):
        name = get_resource_name(statements[0][0])
        if name is not None:
            yield name, statements
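# _collect relies on two properties of ContextGroupedNIFReader: it yields
# one (context, statements) pair per document, and statements[0][0] is a
# subject IRI from which the document name can be recovered. A rough,
# illustrative sketch of such a reader over a .tql (N-Quads) dump,
# assuming each context's statements sit on consecutive lines; a real
# reader should use a proper N-Quads parser instead of this naive split:

from itertools import groupby
from typing import Iterator, List, Tuple


def context_grouped_nif_reader_sketch(
    path: str,
) -> Iterator[Tuple[str, List[Tuple[str, str, str]]]]:
    def parse(line: str) -> Tuple[str, Tuple[str, str, str]]:
        # '<s> <p> <o> <c> .' -> (context, (subject, predicate, object));
        # the object may be a literal containing spaces, so split around it.
        body = line.rstrip().rstrip(".").rstrip()
        rest, context = body.rsplit(" ", 1)
        s, p, o = rest.split(" ", 2)
        return context.strip("<>"), (s.strip("<>"), p.strip("<>"), o)

    with open(path, encoding="utf-8") as f:
        quads = (parse(ln) for ln in f if ln.strip() and not ln.startswith("#"))
        for context, group in groupby(quads, key=lambda q: q[0]):
            yield context, [statement for _, statement in group]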
def _collect(  # type: ignore
    self, nif_path: str
) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
    skipped = 0
    for _, statements in ContextGroupedNIFReader(nif_path):
        name = get_resource_name(statements[0][0])
        if name is not None:
            if name not in self._resume_index:
                yield name, statements
            else:
                skipped += 1
                print_progress(
                    f"Skipped {skipped} documents", terminal_only=True
                )
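# This variant assumes self._resume_index is a set-like record of document
# names already processed, so an interrupted run can be resumed. One
# hypothetical way such an index could be rebuilt, assuming prior progress
# was logged as one resource name per line (the file name is a placeholder):

def load_resume_index_sketch(progress_path: str) -> set:
    try:
        with open(progress_path, encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        # First run: nothing to skip.
        return set()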
def test_grouped_nif_reader(self):
    p = os.path.join(self.data_dir, 'nif_context.tql')
    parsed = {}
    for context, statements in ContextGroupedNIFReader(p):
        for statement in statements:
            s, v, o = statement
            r = get_resource_fragment(v)
            try:
                parsed[get_resource_name(s)].append(r)
            except KeyError:
                parsed[get_resource_name(s)] = [r]
    expected = {
        'Animalia_(book)': [
            'type', 'beginIndex', 'endIndex',
            'sourceUrl', 'isString', 'predLang'
        ],
        'List_of_Atlas_Shrugged_characters': [
            'type', 'beginIndex', 'endIndex', 'sourceUrl'
        ]
    }
    self.assertEqual(parsed, expected)
def load_from_nif(link_file, output_file):
    linkings = {}
    bilinks = []
    num_articles = 0
    num_bilinks = 0
    start_time = timeit.default_timer()

    with open(output_file, "w") as out:
        for _, statements in ContextGroupedNIFReader(link_file):
            num_articles += 1
            for nif_range, rel, info in statements:
                r = get_resource_fragment(rel)
                if r is not None and r == "taIdentRef":
                    src_name = get_resource_name(nif_range)
                    target_name = get_resource_name(info)
                    # Ignore self links.
                    if src_name == target_name:
                        continue
                    if linkings.get(target_name, None) == src_name:
                        # The reverse direction was seen earlier: this is a
                        # confirmed bi-link, so write it out immediately.
                        bilinks.append((src_name, target_name))
                        linkings.pop(target_name)
                        num_bilinks += 1
                        out.write(f"{src_name}\t{target_name}\n")
                        out.flush()
                    else:
                        # Remember the one-directional link and wait for
                        # its reverse to show up.
                        linkings[src_name] = target_name
            elapsed = timeit.default_timer() - start_time
            print_progress(
                f"{num_bilinks} bi-links found in {num_articles} articles "
                f"after {datetime.timedelta(seconds=elapsed)}, speed is "
                f"{num_articles / elapsed:.2f} (packs/second)."
            )
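# 'taIdentRef' is the ITS RDF predicate that NIF text-link datasets use to
# point an anchor at its target resource, so the input here would be a
# DBpedia NIF text-links dump. A hypothetical invocation (both file names
# are placeholders):
#
#     load_from_nif("nif-text-links_en.tql", "bilinks.tsv")
#
# The output file receives one tab-separated "source target" pair per
# confirmed bi-directional link, flushed as soon as it is found.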
def _collect(self, nif_path: str  # type: ignore
             ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
    for _, statements in ContextGroupedNIFReader(nif_path):
        yield get_resource_name(statements[0][0]), statements