Example #1
    def _collect(  # type: ignore
        self, nif_context: str
    ) -> Iterator[Dict[str, str]]:
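        # Scan the NIF statements; for a "context" node whose predicate
        # fragment is `isString`, the object carries the full page text.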
        str_data: Dict[str, str] = {}

        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

                fragment = get_resource_fragment(v)
                if (
                    nif_type
                    and nif_type == "context"
                    and fragment is not None
                    and fragment == "isString"
                ):
                    str_data["text"] = o.toPython()
                    doc_name: Optional[str] = get_resource_name(s)
                    old_id: Optional[str] = get_resource_attribute(
                        c.identifier, "oldid"
                    )
                    if doc_name is not None and old_id is not None:
                        str_data["doc_name"] = doc_name
                        str_data["oldid"] = old_id
                        yield str_data
Example #2
    def _parse_pack(
        self, collection: Tuple[str, List[state_type]]
    ) -> Iterator[DataPack]:
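        # Resolve redirects, then load the serialized pack for this resource
        # and attach the collected wiki statements before yielding it.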
        resource_name, statements = collection
        if resource_name in self._redirects:
            resource_name = self._redirects[resource_name]

        if resource_name in self._pack_index:
            print_progress(
                f"Handling resource [{resource_name}] in {self.component_name}"
            )
            pack_path = os.path.join(
                self._pack_dir, self._pack_index[resource_name]
            )

            if os.path.exists(pack_path):
                pack: DataPack = DataPack.deserialize(
                    pack_path,
                    self.configs.serialize_method,
                    self.configs.zip_pack,
                )
                self.add_wiki_info(pack, statements)
                yield pack
        else:
            logging.info("Resource %s pack not found.", resource_name)
Example #3
    def _collect(  # type: ignore
        self, nif_path: str
    ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
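        # Group NIF statements by context document; skip any whose name is
        # already recorded in the resume index.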
        skipped = 0
        for _, statements in ContextGroupedNIFReader(nif_path):
            name = get_resource_name(statements[0][0])
            if name is not None:
                if name not in self._resume_index:
                    yield name, statements
                else:
                    skipped += 1
                    print_progress(
                        f"Skipped {skipped} documents", terminal_only=True
                    )
Example #4
    def _collect(self, nif_context: str  # type: ignore
                 ) -> Iterator[Dict[str, str]]:
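        # Same NIF scan as above, without None checks on the resource name
        # and revision id.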
        str_data: Dict[str, str] = {}

        for context_statements in NIFParser(nif_context):
            for s, v, o, c in context_statements:
                nif_type = get_resource_attribute(s, "nif")
                print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

                if nif_type and nif_type == "context" and get_resource_fragment(
                        v) == 'isString':
                    str_data['text'] = o.toPython()
                    str_data['doc_name'] = get_resource_name(s)
                    str_data['oldid'] = get_resource_attribute(
                        c.identifier, 'oldid')

                    yield str_data
Example #5
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        # A mapping from the name of the page to the path on the disk.
        self._pack_index = read_index(configs.pack_index)
        self._pack_dir = configs.pack_dir

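        # Optionally load an index of already-processed packs so a resumed
        # run can skip them.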
        if self.configs.resume_index:
            self._resume_index = read_index(configs.resume_index)
            print_progress(
                f"Loaded {len(self._resume_index)} existing files.", "\n"
            )

        if self.resources.contains("redirects"):
            self._redirects = self.resources.get("redirects")
            print_progress(f"{len(self._redirects)} redirects loaded.", "\n")
        else:
            raise ResourceError("Redirects not provided from resources.")
Example #6
def add_wiki_info(reader: PackReader,
                  resources: Resources,
                  input_path: str,
                  input_pack_path: str,
                  output_path: str,
                  prompt_name: str,
                  skip_existing=True):
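    # Two-stage pipeline: the reader loads the existing packs and the writer
    # serializes the enriched articles into `output_path`.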
    pl = Pipeline[DataPack](resources)

    if skip_existing and os.path.exists(output_path):
        print_progress(f'\n{output_path} exists, skipping {prompt_name}', '\n')
        return

    pl.set_reader(reader,
                  config={
                      'pack_index': os.path.join(input_pack_path,
                                                 'article.idx'),
                      'pack_dir': input_pack_path,
                  })

    pl.add(
        WikiArticleWriter(),
        config={
            'output_dir': output_path,
            'zip_pack': True,
            'drop_record': True,
        },
    )

    print_progress(f'Start running the {prompt_name} pipeline.', '\n')
    pl.run(input_path)
    print_progress(f'Done collecting {prompt_name}.', '\n')
Example #7
    def _parse_pack(
            self, collection: Tuple[str,
                                    List[state_type]]) -> Iterator[DataPack]:
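        # Variant that reads the serialized pack from disk as text and
        # deserializes it from the file contents.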
        resource_name, statements = collection
        if resource_name in self._redirects:
            resource_name = self._redirects[resource_name]

        if resource_name in self._pack_index:
            print_progress(
                f"Handling resource [{resource_name}] in {self.component_name}"
            )
            pack_path = os.path.join(self._pack_dir,
                                     self._pack_index[resource_name])

            # `smart_open` can handle the `gz` files.
            if os.path.exists(pack_path):
                with open(pack_path) as pack_file:
                    pack: DataPack = DataPack.deserialize(pack_file.read())
                    self.add_wiki_info(pack, statements)
                    yield pack
        else:
            logging.info("Resource %s pack not found.", resource_name)
Example #8
def load_from_nif(link_file, output_file):
    linkings = {}
    bilinks = []

    num_articles = 0
    num_bilinks = 0

    start_time = timeit.default_timer()
    with open(output_file, "w") as out:
        for _, statements in ContextGroupedNIFReader(link_file):
            num_articles += 1

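            # A bi-link exists when page A links to B and B links back to A:
            # keep one direction in `linkings` and emit the pair once the
            # reverse direction shows up.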
            for nif_range, rel, info in statements:
                r = get_resource_fragment(rel)
                if r is not None and r == "taIdentRef":
                    src_name = get_resource_name(nif_range)
                    target_name = get_resource_name(info)

                    if src_name == target_name:
                        continue

                    if linkings.get(target_name, None) == src_name:
                        bilinks.append((src_name, target_name))
                        linkings.pop(target_name)
                        num_bilinks += 1
                        out.write(f"{src_name}\t{target_name}\n")
                        out.flush()
                    else:
                        linkings[src_name] = target_name

            elapsed = timeit.default_timer() - start_time
            print_progress(
                f"{num_bilinks} bi-links found in {num_articles} after "
                f"{datetime.timedelta(seconds=elapsed)}, speed is "
                f"{num_articles / elapsed:.2f} (packs/second)."
            )
Example #9
def add_wiki_info(
    reader: PackReader,
    resources: Resources,
    input_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    skip_existing=True,
    overwrite=False,
    input_index_file_name: str = "article.idx",
    output_index_file_name: str = "article.idx",
):
    pl = Pipeline[DataPack](resources)

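    # Skip the whole step when a previous run already produced the output
    # index.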
    out_index_path = os.path.join(output_path, output_index_file_name)
    if skip_existing and os.path.exists(out_index_path):
        print_progress(f"\n{output_path} exist, skipping {prompt_name}", "\n")
        return

    pl.set_reader(
        reader,
        config={
            "pack_index": os.path.join(input_pack_path, input_index_file_name),
            "pack_dir": input_pack_path,
        },
    )

    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "output_index_file": output_index_file_name,
            "overwrite": overwrite,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(input_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")
Example #10
def read_wiki_text(nif_context: str,
                   output_dir: str,
                   resources: Resources,
                   skip_existing: bool = False):
    if skip_existing and os.path.exists(output_dir):
        print_progress(f'\n{output_dir} exists, skipping reading text', '\n')
        return

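    # Read the NIF context dump and write one serialized pack per article.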
    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            'output_dir': output_dir,
            'zip_pack': True,
            'drop_record': True,
        },
    )
    print_progress('Start running wiki text pipeline.', '\n')
    pl.run(nif_context)
    print_progress('Done collecting wiki text.', '\n')
Example #11
def read_wiki_text(
    nif_context: str,
    output_dir: str,
    resources: Resources,
    skip_existing: bool = False,
):
    if skip_existing and os.path.exists(output_dir):
        print_progress(f"\n{output_dir} exist, skipping reading text", "\n")
        return

    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_dir,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    print_progress("Start running wiki text pipeline.", "\n")
    pl.run(nif_context)
    print_progress("Done collecting wiki text.", "\n")
Example #12
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    # The datasets are read in a few steps.
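    # Each step consumes the packs written by the previous one; the later
    # info-box steps write back into the same directory.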
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    # 4 The following steps add info boxes:
    # 4.1 Add un-mapped infobox properties; we write to the previous directory.
    property_dir = link_dir
    add_wiki_info(WikiPropertyReader(),
                  resources,
                  info_boxs_properties,
                  link_dir,
                  property_dir,
                  'info_box_properties',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='properties.idx')
    print_progress("Done reading wikipedia info-boxes properties.", '\n')

    # 4.2 Add mapped literals; we write to the previous directory.
    literal_dir = property_dir
    add_wiki_info(WikiInfoBoxReader(),
                  resources,
                  mapping_literals,
                  property_dir,
                  literal_dir,
                  'literals',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='literals.idx')
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    # 4.3 Add mapped objects; we write to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(WikiInfoBoxReader(),
                  resources,
                  mapping_objects,
                  literal_dir,
                  mapping_dir,
                  'objects',
                  skip_existing=True,
                  overwrite=True,
                  output_index_file_name='objects.idx')
    print_progress("Done reading wikipedia info-boxes objects.", '\n')
Example #13
def add_wiki_info(
    reader: WikiPackReader,
    resources: Resources,
    wiki_info_data_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    use_input_index=False,
    skip_existing=True,
    resume_from_last=False,
    input_index_file_path: Optional[str] = "article.idx",
    output_index_file_name: Optional[str] = "article.idx",
):
    """
    Add wiki resource into the data pack.

    Args:
        reader: The info reader that loads the data pack.
        resources: The resources object that should contain the redirects.
        wiki_info_data_path: The path containing the wiki data.
        input_pack_path: The initial data pack path.
        output_path: The resulting output path.
        prompt_name: a name to show during processing.
        use_input_index: whether to use the input index to determine the
          output path.
        skip_existing: whether to skip this function if the folder exists.
        resume_from_last: whether to resume from the last end point; at most
          one of this and `skip_existing` can be true.
        input_index_file_path: the full file path to the input index.
        output_index_file_name: the file path to write the output index,
            this is relative to `output_path`.
    """
    pl = Pipeline[DataPack](resources)

    if resume_from_last and skip_existing:
        raise ValueError(
            "resume_from_last and skip_existing cannot both be true."
        )

    out_index_path = os.path.join(output_path, output_index_file_name)
    if skip_existing and os.path.exists(out_index_path):
        print_progress(
            f"\n{out_index_path} exists, skipping {prompt_name}", "\n"
        )
        return

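    # In resume mode, point the reader at the existing output index so it
    # skips documents that were already written.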
    if resume_from_last:
        if not os.path.exists(out_index_path):
            raise ValueError(
                f"Configured to resume but path {out_index_path} "
                f"does not exist."
            )

        print_progress(f"\nWill resume from {out_index_path}", "\n")
        pl.set_reader(
            reader,
            config={
                "pack_index": input_index_file_path,
                "pack_dir": input_pack_path,
                "resume_index": out_index_path,
            },
        )
    else:
        pl.set_reader(
            reader,
            config={
                "pack_index": input_index_file_path,
                "pack_dir": input_pack_path,
            },
        )

    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "use_input_index": use_input_index,
            "input_index_file": input_index_file_path,
            "output_index_file": output_index_file_name,
            "append_to_index": resume_from_last,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(wiki_info_data_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")
Example #14
def main(
    nif_context: str,
    nif_page_structure: str,
    mapping_literals: str,
    mapping_objects: str,
    nif_text_links: str,
    redirects: str,
    info_boxs_properties: str,
    categories: str,
    base_output_path: str,
    resume_existing: bool,
):
    # Resuming implies the steps should not be skipped wholesale.
    skip_existing = not resume_existing

    # The datasets are read in a few steps.
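    # The structure and link steps chain new directories; the info-box and
    # category steps write back in place, each with its own index file.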
    # 0. Load redirects between wikipedia pages.
    print_progress("Loading redirects", "\n")

    redirect_map: Dict[str, str] = cache_redirects(base_output_path, redirects)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", "\n")

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, "nif_raw")
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", "\n")

    # Use the same index structure for all writers.
    main_index = os.path.join(raw_pack_dir, "article.idx")

    # 2. Add wiki page structures, create a new directory for it.
    struct_dir = raw_pack_dir + "_struct"
    add_wiki_info(
        WikiStructReader(),
        resources,
        nif_page_structure,
        raw_pack_dir,
        struct_dir,
        "page_structures",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia structures.", "\n")

    # 3. Add wiki links, create a new directory for it.
    link_dir = struct_dir + "_links"
    add_wiki_info(
        WikiAnchorReader(),
        resources,
        nif_text_links,
        struct_dir,
        link_dir,
        "anchor_links",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia anchors.", "\n")

    # 4 The following steps add info boxes:
    # 4.1 Add un-mapped infobox properties; we write to the previous directory.
    property_dir = link_dir
    add_wiki_info(
        WikiPropertyReader(),
        resources,
        info_boxs_properties,
        link_dir,
        property_dir,
        "info_box_properties",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        output_index_file_name="properties.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes properties.", "\n")

    # 4.2 Add mapped literals; we write to the previous directory.
    literal_dir = property_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_literals,
        property_dir,
        literal_dir,
        "literals",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        output_index_file_name="literals.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes literals.", "\n")

    # 4.3 Add mapped objects; we write to the previous directory.
    mapping_dir = literal_dir
    add_wiki_info(
        WikiInfoBoxReader(),
        resources,
        mapping_objects,
        literal_dir,
        mapping_dir,
        "objects",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        output_index_file_name="objects.idx",
        input_index_file_path=main_index,
    )
    print_progress("Done reading wikipedia info-boxes objects.", "\n")

    # 4.4 Add categories; write to the previous directory.
    category_dir = mapping_dir
    add_wiki_info(
        WikiCategoryReader(),
        resources,
        categories,
        mapping_dir,
        category_dir,
        "categories",
        use_input_index=True,
        skip_existing=skip_existing,
        resume_from_last=resume_existing,
        output_index_file_name="categories.idx",
        input_index_file_path=main_index,
    )
Example #15
            true_path_lookup[article_name] = article_path

    print(f"Loaded {len(true_path_lookup)} true article paths.")

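    # Walk the target index: entries whose recorded path matches the true
    # path are counted as fixed; mismatches are flagged for removal.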
    with open(target_index) as ti:
        doc_count = 0
        removed_count = 0
        fixed_count = 0

        for line in ti:
            article_name, article_path = line.strip().split("\t")
            true_path = true_path_lookup[article_name]
            doc_count += 1

            if true_path == article_path:
                print_progress(f"Fixing category from {true_path} ({doc_count}"
                               f"/{removed_count}/{fixed_count})")
                fixed_count += 1
            else:
                print_progress(
                    f"Removing extra file {article_path}, will keep "
                    f"{true_path} ({doc_count}/{removed_count}/{fixed_count})")
                removed_count += 1
Example #16
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs_properties: str, base_output_path: str):
    # The datasets are read in a few steps.
    # 0. Load redirects between wikipedia pages.
    print_progress('Loading redirects', '\n')
    redirect_pickle = os.path.join(base_output_path, 'redirects.pickle')

    redirect_map: Dict[str, str]
    if os.path.exists(redirect_pickle):
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map = pickle.load(pickle_f)
    else:
        redirect_map = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)

    resources: Resources = Resources()
    resources.update(redirects=redirect_map)
    print_progress("Done loading.", '\n')

    # 1. Read the wiki text.
    raw_pack_dir = os.path.join(base_output_path, 'nif_raw')
    read_wiki_text(nif_context, raw_pack_dir, resources, True)
    print_progress("Done reading wikipedia text.", '\n')

    # 2. Add the rest of wiki page structures:
    struct_dir = raw_pack_dir + '_struct'
    add_wiki_info(WikiStructReader(), resources, nif_page_structure,
                  raw_pack_dir, struct_dir, 'page_structures', True)
    print_progress("Done reading wikipedia structures.", '\n')

    link_dir = struct_dir + '_links'
    add_wiki_info(WikiAnchorReader(), resources, nif_text_links, struct_dir,
                  link_dir, 'anchor_links', True)
    print_progress("Done reading wikipedia anchors.", '\n')

    property_dir = link_dir + '_property'
    add_wiki_info(WikiPropertyReader(), resources, info_boxs_properties,
                  link_dir, property_dir, 'info_box_properties', True)
    print_progress("Done reading wikipedia info-boxes.", '\n')

    literal_dir = property_dir + '_literals'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_literals,
                  property_dir, literal_dir, 'literals', True)
    print_progress("Done reading wikipedia info-boxes literals.", '\n')

    mapping_dir = literal_dir + '_objects'
    add_wiki_info(WikiInfoBoxReader(), resources, mapping_objects, literal_dir,
                  mapping_dir, 'objects', True)
    print_progress("Done reading wikipedia info-boxes objects.", '\n')