Exemplo n.º 1
0
def test_snakemake_word_frequency():
    """Run the ``word_frequency`` workflow target and check its output file.

    A fresh work folder is created at ``./tests/output/work_folder`` and
    populated through ``setup_parlaclarin_repository`` and
    ``setup_work_folder_for_tagging_with_stanza``. The workflow in
    ``workflow/Snakefile`` is then executed with ``word_frequency`` as its
    only target, configured by ``./tests/test_data/test_config_output.yml``.

    The test passes when the Snakemake run reports success and
    ``riksdagen-corpus-term-frequencies.pkl`` exists in the work folder.
    """
    test_protocols: List[str] = [
        'prot-1936--ak--8.xml',
        'prot-197778--160.xml',
    ]

    workdir = aj("./tests/output/work_folder")
    config_filename = aj("./tests/test_data/test_config_output.yml")

    # Start from an empty work folder so that files left over from an
    # earlier run cannot satisfy the final existence check.
    rmtree(workdir, ignore_errors=True)
    makedirs(workdir, exist_ok=True)
    makedirs(jj(workdir, "logs"), exist_ok=True)

    setup_parlaclarin_repository(test_protocols, workdir, "riksdagen-corpus")
    setup_work_folder_for_tagging_with_stanza(workdir)

    snakefile = jj('workflow', 'Snakefile')

    success = snakemake.snakemake(
        snakefile,
        config=dict(config_filename=config_filename, processes=4),
        debug=True,
        keep_target_files=True,
        cores=1,
        verbose=True,
        targets=['word_frequency'],
    )

    # Check the workflow result itself (as test_snakemake_execute does), so a
    # failed run is reported as such rather than only as a missing file.
    assert success

    assert isfile(jj(workdir, "riksdagen-corpus-term-frequencies.pkl"))
Exemplo n.º 2
0
def test_snakemake_execute():
    """Run the workflow and verify one zip archive per source protocol.

    The annotated folder named by ``./tests/test_data/test_config.yml`` is
    removed and recreated, then ``workflow/Snakefile`` is executed without
    explicit targets. Afterwards every ``prot*.xml`` found recursively under
    ``riksdagen-corpus/corpus`` in the configured data folder must have a
    matching ``<name>.zip`` inside the annotated folder, in a sub-folder
    named after the second dash-separated part of the document name.
    """
    config_filename = aj("./tests/test_data/test_config.yml")
    cfg: Config = load_typed_config(config_name=config_filename)
    snakefile = jj('workflow', 'Snakefile')

    # Begin with an empty annotated folder.
    rmtree(cfg.annotated_folder, ignore_errors=True)
    makedirs(cfg.annotated_folder, exist_ok=True)

    workflow_ok = snakemake.snakemake(
        snakefile,
        config={"config_filename": config_filename},
        debug=True,
        keep_target_files=True,
        cores=1,
        verbose=True,
    )
    assert workflow_ok

    protocol_pattern: str = jj(cfg.data_folder,
                               'riksdagen-corpus/corpus/**/prot*.xml')

    for source_path in glob.glob(protocol_pattern, recursive=True):
        document_name: str = strip_path_and_extension(source_path)

        # The second dash-separated token of the name selects the sub-folder.
        subfolder: str = document_name.split('-')[1]
        target_dir: str = jj(cfg.annotated_folder, subfolder)
        expected_archive: str = jj(target_dir, f"{document_name}.zip")

        assert isfile(expected_archive)
def run_tag_protocol_xml():
    """Tag the fake 1958 protocol and check that the zip output exists.

    A ``StanzaTagger`` is obtained from ``TaggerRegistry`` using settings
    read from ``./tests/test_data/test_config.yml`` (with ``use_gpu=False``).
    ``tests/test_data/fake/prot-1958-fake.xml`` is then passed to
    ``pyriksprot.tag_protocol_xml`` with ``storage_format="json"``, and the
    function asserts that ``tests/output/prot-1958-fake.zip`` is a file.
    """
    cfg: Config = load_typed_config(aj("./tests/test_data/test_config.yml"))

    # Dehyphenation options from the config, extended with the path of the
    # word-frequency file.
    dehyphen_options = dict(
        word_frequency_filename=cfg.word_frequency.fullname,
        **cfg.dehyphen.opts,
    )

    tagger: pyriksprot.ITagger = TaggerRegistry.get(
        tagger_cls=StanzaTagger,
        model=cfg.stanza_dir,
        dehyphen_opts=dehyphen_options,
        use_gpu=False,
    )

    source_xml: str = jj("tests", "test_data", "fake", "prot-1958-fake.xml")
    target_zip: str = jj("tests", "output", "prot-1958-fake.zip")

    pyriksprot.tag_protocol_xml(
        source_xml,
        target_zip,
        tagger,
        storage_format="json",
    )

    assert os.path.isfile(target_zip)
def run_snakemake():
    """Rebuild the test work folder and run the workflow on six protocols.

    ``./tests/test_data/work_folder`` is removed and set up again through
    ``setup_working_folder`` with a fixed list of protocol file names. The
    workflow in ``workflow/Snakefile`` is then executed without explicit
    targets, configured by ``./tests/test_data/test_config.yml``. The result
    of the Snakemake call is not inspected and nothing is returned.
    """
    workdir = aj("./tests/test_data/work_folder")

    protocol_names: List[str] = [
        'prot-1936--ak--8.xml',
        'prot-1961--ak--5.xml',
        'prot-1961--fk--6.xml',
        'prot-198687--11.xml',
        'prot-200405--7.xml',
        'prot-197778--160.xml',
    ]

    # Recreate the work folder from scratch.
    rmtree(workdir, ignore_errors=True)
    setup_working_folder(root_path=workdir, test_protocols=protocol_names)

    snakefile = jj('workflow', 'Snakefile')
    workflow_config = {
        "config_filename": aj("./tests/test_data/test_config.yml"),
    }

    snakemake.snakemake(
        snakefile,
        config=workflow_config,
        debug=True,
        keep_target_files=True,
        cores=1,
        verbose=True,
    )
Exemplo n.º 5
0
def test_tagger_registry_get():
    """Check that two identical registry requests yield the same tagger.

    ``TaggerRegistry.get`` is called twice with the same arguments, built
    from ``./tests/test_data/test_config.yml``. The first result must be a
    ``StanzaTagger`` and the second must be the very same object.
    """
    cfg: Config = load_typed_config(aj("./tests/test_data/test_config.yml"))

    # One shared set of keyword arguments, so both requests are identical.
    request = dict(
        tagger_cls=StanzaTagger,
        model=cfg.stanza_dir,
        dehyphen_opts=dict(
            word_frequency_filename=cfg.word_frequency.fullname,
            **cfg.dehyphen.opts,
        ),
        use_gpu=False,
    )

    first: ITagger = TaggerRegistry.get(**request)
    assert isinstance(first, StanzaTagger)

    second: ITagger = TaggerRegistry.get(**request)
    assert second is first