示例#1
0
    def setUp(self):
        """Prepare resources, sample-data paths, and run the raw-pack pipeline."""
        # Shared resources with an empty redirect table.
        self.resources: Resources = Resources()
        self.resources.update(redirects={})

        # Locate the DBpedia sample data shipped with the repository.
        sample_root = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "../../../../../data_samples/dbpedia",
        )
        self.data_dir: str = os.path.abspath(sample_root)

        # Temporary workspace; "raw" receives the serialized article packs.
        self.output_dir = tempfile.TemporaryDirectory()
        self.raw_output: str = os.path.join(self.output_dir.name, "raw")

        # Read the NIF context dump and write zipped packs with records dropped.
        pipeline = Pipeline[DataPack](self.resources)
        pipeline.set_reader(DBpediaWikiReader())
        pipeline.add(
            WikiArticleWriter(),
            config={
                "output_dir": self.raw_output,
                "zip_pack": True,
                "drop_record": True,
            },
        )
        pipeline.run(os.path.join(self.data_dir, "nif_context.tql"))
示例#2
0
def add_wiki_info(reader: PackReader,
                  resources: Resources,
                  input_path: str,
                  input_pack_path: str,
                  output_path: str,
                  prompt_name: str,
                  skip_existing=True):
    """Read indexed packs with `reader` and serialize them to `output_path`."""
    pl = Pipeline[DataPack](resources)

    # Treat an existing output directory as an already-completed step.
    if skip_existing and os.path.exists(output_path):
        print_progress(f'\n{output_path} exist, skipping {prompt_name}', '\n')
        return

    reader_config = {
        'pack_index': os.path.join(input_pack_path, 'article.idx'),
        'pack_dir': input_pack_path,
    }
    pl.set_reader(reader, config=reader_config)

    writer_config = {
        'output_dir': output_path,
        'zip_pack': True,
        'drop_record': True,
    }
    pl.add(WikiArticleWriter(), config=writer_config)

    print_progress(f'Start running the {prompt_name} pipeline.', '\n')
    pl.run(input_path)
    print_progress(f'Done collecting {prompt_name}.', '\n')
示例#3
0
def write_results(pl: Pipeline, output_path: str, input_data: str):
    """Attach a zipped, record-free article writer to `pl` and run it."""
    writer_config = {
        "output_dir": output_path,
        "zip_pack": True,
        "drop_record": True,
    }
    pl.add(WikiArticleWriter(), config=writer_config)
    pl.run(input_data)
示例#4
0
def complete_and_tokens():
    """Tokenize the structured/linked packs and write them to a new pack dir."""
    # Define paths.
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    pack_output = os.path.join(pack_dir, "nif_raw_struct_links_token")
    # Record which documents are processed; keep the input and output
    # directory structure similar.
    pack_input_index = os.path.join(pack_input, "article.idx")
    pack_output_index = os.path.join(pack_output, "article.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "complete_tokenize.log"),
    )

    # Build the pipeline step by step instead of one long method chain.
    pipeline = Pipeline(loaded_resource)
    pipeline.set_reader(
        DirPackReader(),
        config={
            "suffix": ".json.gz",
            "zip_pack": True,
        },
    )
    # NOTE: a WikiEntityCompletion() step was present here but disabled.
    pipeline.add(WikiAddTitle())
    pipeline.add(
        SpacyProcessor(),
        config={
            "processors": ["sentence", "tokenize"],
        },
    )
    pipeline.add(
        SubwordTokenizer(),
        config={
            "tokenizer_configs": {
                "pretrained_model_name": "bert-base-uncased"
            },
            "token_source": "ft.onto.base_ontology.Token",
        },
    )
    pipeline.add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "serialize_method": "jsonpickle"
        },
    )
    pipeline.add(ProgressPrinter())
    pipeline.run(pack_input)
def add_wiki_info(
    reader: PackReader,
    resources: Resources,
    input_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    skip_existing=True,
    overwrite=False,
    input_index_file_name: str = "article.idx",
    output_index_file_name: str = "article.idx",
):
    """Read indexed packs and write them out, producing a fresh output index."""
    pl = Pipeline[DataPack](resources)

    # An existing output index means this step has already been completed.
    out_index_path = os.path.join(output_path, output_index_file_name)
    if skip_existing and os.path.exists(out_index_path):
        print_progress(f"\n{output_path} exist, skipping {prompt_name}", "\n")
        return

    reader_config = {
        "pack_index": os.path.join(input_pack_path, input_index_file_name),
        "pack_dir": input_pack_path,
    }
    pl.set_reader(reader, config=reader_config)

    writer_config = {
        "output_dir": output_path,
        "zip_pack": True,
        "drop_record": True,
        "output_index_file": output_index_file_name,
        "overwrite": overwrite,
    }
    pl.add(WikiArticleWriter(), config=writer_config)

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(input_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")
示例#6
0
def read_wiki_text(nif_context: str,
                   output_dir: str,
                   resources: Resources,
                   skip_existing: bool = False):
    """Parse the NIF context dump into serialized text packs in `output_dir`."""
    # Treat an existing output directory as a completed run.
    if skip_existing and os.path.exists(output_dir):
        print_progress(f'\n{output_dir} exist, skipping reading text', '\n')
        return

    pl = Pipeline[DataPack](resources)
    pl.set_reader(DBpediaWikiReader())
    writer_config = {
        'output_dir': output_dir,
        'zip_pack': True,
        'drop_record': True,
    }
    pl.add(WikiArticleWriter(), config=writer_config)

    print_progress('Start running wiki text pipeline.', '\n')
    pl.run(nif_context)
    print_progress('Done collecting wiki text.', '\n')
示例#7
0
def read_wiki_text(
    nif_context: str,
    output_dir: str,
    resources: Resources,
    skip_existing: bool = False,
):
    """Convert the NIF context file into zipped wiki-text packs."""
    if skip_existing and os.path.exists(output_dir):
        # Nothing to do: a previous run already produced this directory.
        print_progress(f"\n{output_dir} exist, skipping reading text", "\n")
        return

    pipeline = Pipeline[DataPack](resources)
    pipeline.set_reader(DBpediaWikiReader())
    pipeline.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_dir,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    print_progress("Start running wiki text pipeline.", "\n")
    pipeline.run(nif_context)
    print_progress("Done collecting wiki text.", "\n")
示例#8
0
def add_wiki_info(
    reader: WikiPackReader,
    resources: Resources,
    wiki_info_data_path: str,
    input_pack_path: str,
    output_path: str,
    prompt_name: str,
    use_input_index=False,
    skip_existing=True,
    resume_from_last=False,
    input_index_file_path: Optional[str] = "article.idx",
    output_index_file_name: Optional[str] = "article.idx",
):
    """
    Add wiki resource into the data pack.

    Args:
        reader: The info reader that loads the data pack.
        resources: The resources object that should contain the redirects.
        wiki_info_data_path: The path containing the wiki data.
        input_pack_path: The initial data pack path.
        output_path: The resulting output path.
        prompt_name: a name to show during processing.
        use_input_index: whether to use the input index to determine the
          output path.
        skip_existing: whether to skip this function if the folder exists.
        resume_from_last: whether to resume from last end point, at most one
          can be true between this and `skip_existing`
        input_index_file_path: the full file path to the input index.
        output_index_file_name: the file path to write the output index,
            this is relative to `output_path`.

    Raises:
        ValueError: if `resume_from_last` and `skip_existing` are both true,
            or if resuming is requested but the output index does not exist.
    """
    # Validate before doing any work, so a bad flag combination fails fast.
    if resume_from_last and skip_existing:
        raise ValueError(
            "resume_from_last and skip_existing cannot both be " "true."
        )

    out_index_path = os.path.join(output_path, output_index_file_name)
    if skip_existing and os.path.exists(out_index_path):
        print_progress(
            f"\n{out_index_path} exist, skipping {prompt_name}", "\n"
        )
        return

    pl = Pipeline[DataPack](resources)

    # Build the reader config once; only the resume case adds `resume_index`.
    # (Previously the whole config was duplicated in both branches.)
    reader_config = {
        "pack_index": input_index_file_path,
        "pack_dir": input_pack_path,
    }
    if resume_from_last:
        if not os.path.exists(out_index_path):
            raise ValueError(f"Configured to do resume but path "
                             f"{out_index_path} does not exists.")

        print_progress(
            f"\nWill resume from last from {out_index_path}", "\n"
        )
        reader_config["resume_index"] = out_index_path

    pl.set_reader(reader, config=reader_config)

    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
            "use_input_index": use_input_index,
            "input_index_file": input_index_file_path,
            "output_index_file": output_index_file_name,
            # Append so a resumed run keeps the entries already written.
            "append_to_index": resume_from_last,
        },
    )

    print_progress(f"Start running the {prompt_name} pipeline.", "\n")
    pl.run(wiki_info_data_path)
    print_progress(f"Done collecting {prompt_name}.", "\n")
示例#9
0
    pack_output = os.path.join(pack_dir, "category")
    # Index of the input article packs to read from.
    pack_input_index = os.path.join(pack_input, "article.idx")
    # Store which documents have category.
    pack_output_index = os.path.join(pack_output, "category.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "category.log"),
    )

    # Read the packs listed in the input index, run them through the
    # category reader, and write zipped packs (records dropped) keyed by
    # the same index entries as the input; existing output is overwritten.
    Pipeline(resources).set_reader(
        WikiCategoryReader(),
        config={
            "pack_index": pack_input_index,
            "pack_dir": pack_input,
        },
    ).add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "overwrite": True,
        },
    ).run(os.path.join(base_dir, "article_categories_en.tql.bz2"))