def test_tknz(parameters, test_input: str, expected: List[str]):
    r"""Check that whitespace tokenization yields the expected tokens.

    A ``WsTknzr`` is constructed from the ``is_uncased``, ``max_vocab`` and
    ``min_count`` entries of ``parameters``; tokenizing ``test_input`` with it
    must give exactly ``expected``.
    """
    # Pull out only the constructor options, in the order they are passed.
    tknzr_kwargs = {
        key: parameters[key]
        for key in ('is_uncased', 'max_vocab', 'min_count')
    }
    actual = WsTknzr(**tknzr_kwargs).tknz(test_input)
    assert actual == expected
def test_ws_tknzr(capsys, ws_tknzr: WsTknzr, exp_name: str, seed: int) -> None:
    """Check tokenize script output against :py:class:`lmp.tknzr.WsTknzr`.

    Runs ``lmp.script.tknz_txt.main`` on a fixed text, then asserts that the
    string form of ``ws_tknzr``'s tokenization of that same text appears in
    the captured standard output.
    """
    txt = 'a b c'
    script_args = ['--exp_name', exp_name, '--seed', str(seed), '--txt', txt]
    lmp.script.tknz_txt.main(argv=script_args)

    # Grab the script's stdout before tokenizing locally for the comparison.
    stdout = capsys.readouterr().out
    expected_repr = str(ws_tknzr.tknz(txt=txt))
    assert expected_repr in stdout