예제 #1
0
def test_tknz(parameters, test_input: str, expected: List[str]):
    r"""Check that ``WsTknzr.tknz`` turns ``test_input`` into ``expected``.

    The tokenizer is constructed from the ``is_uncased``, ``max_vocab`` and
    ``min_count`` entries of ``parameters``.
    """
    # Collect the constructor arguments first so the tokenizer can be built
    # with a single keyword expansion.
    tknzr_kwargs = {
        key: parameters[key]
        for key in ('is_uncased', 'max_vocab', 'min_count')
    }
    tokens = WsTknzr(**tknzr_kwargs).tknz(test_input)
    assert tokens == expected
def test_ws_tknzr(capsys, ws_tknzr: WsTknzr, exp_name: str, seed: int) -> None:
  """Check the tokenize script's output against :py:class:`lmp.tknzr.WsTknzr`.

  Runs ``lmp.script.tknz_txt.main`` with the given experiment name and seed on
  a fixed sample text, then asserts that the string form of
  ``ws_tknzr.tknz(txt=...)`` for the same text appears in the captured stdout.
  """
  txt = 'a b c'
  argv = ['--exp_name', exp_name, '--seed', str(seed), '--txt', txt]
  lmp.script.tknz_txt.main(argv=argv)

  # Read the captured output before tokenizing directly, so only the script's
  # own output is inspected.
  stdout = capsys.readouterr().out
  expected_repr = str(ws_tknzr.tknz(txt=txt))
  assert expected_repr in stdout