def test_snakemake_word_frequency():
    """Run the `word_frequency` target of the workflow and verify that the term frequency file is created."""
    test_protocols: List[str] = [
        'prot-1936--ak--8.xml',
        'prot-197778--160.xml',
    ]
    workdir = aj("./tests/output/work_folder")
    config_filename = aj("./tests/test_data/test_config_output.yml")

    rmtree(workdir, ignore_errors=True)
    makedirs(workdir, exist_ok=True)
    makedirs(jj(workdir, "logs"), exist_ok=True)

    setup_parlaclarin_repository(test_protocols, workdir, "riksdagen-corpus")
    setup_work_folder_for_tagging_with_stanza(workdir)

    snakefile = jj('workflow', 'Snakefile')

    snakemake.snakemake(
        snakefile,
        config=dict(config_filename=config_filename, processes=4),
        debug=True,
        # workdir=workdir,
        keep_target_files=True,
        cores=1,
        verbose=True,
        targets=['word_frequency'],
    )

    assert isfile(jj(workdir, "riksdagen-corpus-term-frequencies.pkl"))
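# A minimal sketch (not part of the test above) for manually inspecting the term
# frequency artefact that `test_snakemake_word_frequency` asserts on. The helper
# name and the assumption that the pickle holds a dict-like word-to-count mapping
# are hypothetical; only the file path comes from the test itself.
def peek_word_frequencies(workdir: str = "./tests/output/work_folder") -> None:
    import pickle

    with open(jj(workdir, "riksdagen-corpus-term-frequencies.pkl"), "rb") as fp:
        frequencies = pickle.load(fp)

    # Assuming a Counter/dict-like object; print a handful of entries as a sanity check.
    for token, count in list(frequencies.items())[:10]:
        print(f"{token}\t{count}")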
def test_snakemake_execute():
    """Run the full Snakefile and verify that every source protocol has an annotated zip target."""
    config_filename = aj("./tests/test_data/test_config.yml")
    cfg: Config = load_typed_config(config_name=config_filename)
    snakefile = jj('workflow', 'Snakefile')

    rmtree(cfg.annotated_folder, ignore_errors=True)
    makedirs(cfg.annotated_folder, exist_ok=True)

    success = snakemake.snakemake(
        snakefile,
        config=dict(config_filename=config_filename),
        debug=True,
        # workdir=workdir,
        keep_target_files=True,
        cores=1,
        verbose=True,
    )

    assert success

    source_files: List[str] = glob.glob(
        jj(cfg.data_folder, 'riksdagen-corpus/corpus/**/prot*.xml'), recursive=True
    )

    for filename in source_files:
        document_name: str = strip_path_and_extension(filename)
        # Annotated files are stored in sub-folders named after the year part of the document name.
        target_dir: str = jj(cfg.annotated_folder, document_name.split('-')[1])
        assert isfile(jj(target_dir, f"{document_name}.zip"))
def run_tag_protocol_xml():
    """Tag a single (fake) protocol XML with Stanza and store the result as a zipped JSON archive."""
    config_filename: str = aj("./tests/test_data/test_config.yml")
    cfg: Config = load_typed_config(config_filename)

    tagger: pyriksprot.ITagger = TaggerRegistry.get(
        tagger_cls=StanzaTagger,
        model=cfg.stanza_dir,
        dehyphen_opts=dict(word_frequency_filename=cfg.word_frequency.fullname, **cfg.dehyphen.opts),
        use_gpu=False,
    )

    input_filename: str = jj("tests", "test_data", "fake", "prot-1958-fake.xml")
    output_filename: str = jj("tests", "output", "prot-1958-fake.zip")

    pyriksprot.tag_protocol_xml(
        input_filename,
        output_filename,
        tagger,
        storage_format="json",
    )

    assert os.path.isfile(output_filename)
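# A minimal sketch (not part of the test harness) for peeking inside the zipped
# output produced by `run_tag_protocol_xml`. With storage_format="json" the
# archive is expected to contain JSON payloads, but the exact member names and
# payload layout are assumptions about pyriksprot's writer.
def peek_tagged_archive(path: str = jj("tests", "output", "prot-1958-fake.zip")) -> None:
    import json
    import zipfile

    with zipfile.ZipFile(path) as zf:
        for member in zf.namelist():
            print(member)
            if member.endswith(".json"):
                payload = json.loads(zf.read(member))
                # Print only a shallow summary; the structure is not asserted here.
                print(type(payload).__name__)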
def run_snakemake():
    """Run the workflow against a small set of test protocols in a freshly set up work folder."""
    test_protocols: List[str] = [
        'prot-1936--ak--8.xml',
        'prot-1961--ak--5.xml',
        'prot-1961--fk--6.xml',
        'prot-198687--11.xml',
        'prot-200405--7.xml',
        'prot-197778--160.xml',
    ]
    workdir = aj("./tests/test_data/work_folder")

    rmtree(workdir, ignore_errors=True)
    setup_working_folder(root_path=workdir, test_protocols=test_protocols)

    snakemake.snakemake(
        jj('workflow', 'Snakefile'),
        config=dict(config_filename=aj("./tests/test_data/test_config.yml")),
        debug=True,
        keep_target_files=True,
        cores=1,
        verbose=True,
    )
def test_tagger_registry_get():
    """TaggerRegistry.get should cache the tagger: a second call with the same arguments returns the same instance."""
    config_filename: str = aj("./tests/test_data/test_config.yml")
    cfg: Config = load_typed_config(config_filename)
    dehyphen_opts = dict(word_frequency_filename=cfg.word_frequency.fullname, **cfg.dehyphen.opts)

    tagger: ITagger = TaggerRegistry.get(
        tagger_cls=StanzaTagger,
        model=cfg.stanza_dir,
        dehyphen_opts=dehyphen_opts,
        use_gpu=False,
    )
    assert isinstance(tagger, StanzaTagger)

    tagger2: ITagger = TaggerRegistry.get(
        tagger_cls=StanzaTagger,
        model=cfg.stanza_dir,
        dehyphen_opts=dehyphen_opts,
        use_gpu=False,
    )
    assert tagger2 is tagger