示例#1
0
def test_load_result(
        ws_tknzr: WsTknzr,
        exp_name: str,
        file_path: str,
):
    r"""Check that a saved tokenizer is restored unchanged by ``load``.

    ``WsTknzr.load`` is first called with a series of non-``str`` values for
    ``exp_name``; every call must raise ``TypeError`` carrying the expected
    message.  Afterwards the tokenizer is saved and loaded back, and the
    attribute dictionaries of both instances are compared.

    ``file_path`` is not read by the test body.
    """
    type_err_msg = '`exp_name` must be an instance of `str`'

    # Test case: Type mismatched.
    non_str_values = (
        False, True, 0, 1, 0.0, 0.1, 1.0, (), [], {}, set(), None, ...,
        NotImplemented,
    )

    for candidate in non_str_values:
        with pytest.raises(TypeError) as err:
            WsTknzr.load(exp_name=candidate)

        assert type_err_msg in str(err.value)

    # Test case: Valid input (save followed by load).
    ws_tknzr.save(exp_name)
    restored = WsTknzr.load(exp_name)

    assert ws_tknzr.__dict__ == restored.__dict__
示例#2
0
def test_tknz(parameters, test_input: str, expected: List[str]):
    r"""Check that ``tknz`` turns ``test_input`` into ``expected``.

    The tokenizer is configured from the ``is_uncased``, ``max_vocab`` and
    ``min_count`` entries of ``parameters``.
    """
    ctor_keys = ('is_uncased', 'max_vocab', 'min_count')
    tokenizer = WsTknzr(**{key: parameters[key] for key in ctor_keys})

    actual = tokenizer.tknz(test_input)
    assert actual == expected
示例#3
0
def test_dtknz(parameters, test_input: List[str], expected: str):
    r"""Check that ``dtknz`` joins ``test_input`` back into ``expected``.

    The tokenizer is configured from the ``is_uncased``, ``max_vocab`` and
    ``min_count`` entries of ``parameters``.
    """
    is_uncased = parameters['is_uncased']
    max_vocab = parameters['max_vocab']
    min_count = parameters['min_count']

    tokenizer = WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
    )
    actual = tokenizer.dtknz(test_input)

    assert actual == expected
示例#4
0
def test_config_file_exist(
        ws_tknzr: WsTknzr,
        exp_name: str,
        file_path: str,
):
    r"""Check that saving the tokenizer leaves something at ``file_path``."""
    ws_tknzr.save(exp_name)

    # `file_path` must exist once `save` has returned.
    was_written = os.path.exists(file_path)
    assert was_written
def test_dec(parameters, test_input: List[int], expected: str):
    r"""Check that ``dec`` converts ``test_input`` into ``expected``.

    The tokenizer is configured from the ``is_uncased``, ``max_vocab``,
    ``min_count`` and ``tk2id`` entries of ``parameters``; the ``rm_sp_tks``
    entry is forwarded to ``dec``.
    """
    ctor_keys = ('is_uncased', 'max_vocab', 'min_count', 'tk2id')
    tokenizer = WsTknzr(**{key: parameters[key] for key in ctor_keys})

    decoded = tokenizer.dec(test_input, rm_sp_tks=parameters['rm_sp_tks'])

    assert decoded == expected
示例#6
0
def test_config_file_format(
        ws_tknzr: WsTknzr,
        exp_name: str,
        file_path: str,
):
    r"""Check that the saved configuration can be parsed as JSON.

    The file at ``file_path`` is read as UTF-8 text after ``save`` has been
    called.  ``json.load`` raises if the content is not valid JSON.
    """
    ws_tknzr.save(exp_name)

    with open(file_path, 'r', encoding='utf-8') as cfg_file:
        parsed_cfg = json.load(cfg_file)

    # NOTE: the parsed content must also be truthy, so an empty JSON
    # document would fail here as well.
    assert parsed_cfg
def test_min_count():
    r"""``min_count`` must be an integer larger than ``0``.

    Three groups of values are passed to the ``WsTknzr`` constructor while
    the remaining arguments stay fixed:

    - non-``int`` values, which must raise ``TypeError``;
    - ``-1`` and ``0``, which must raise ``ValueError``;
    - ``1`` through ``9``, which must be readable back from ``min_count``.
    """

    def make_tknzr(min_count):
        # Only `min_count` varies between the cases below.
        return WsTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=min_count,
            tk2id=None,
        )

    # Test case: Type mismatched.
    type_err_msg = '`min_count` must be an instance of `int`'
    non_int_values = (
        -1.0, 0.0, 1.0, '', (), [], {}, set(), None, ..., NotImplemented,
    )

    for value in non_int_values:
        with pytest.raises(TypeError) as err:
            make_tknzr(value)

        assert type_err_msg in str(err.value)

    # Test case: Invalid value.
    value_err_msg = '`min_count` must be larger than `0`'

    for value in (-1, 0):
        with pytest.raises(ValueError) as err:
            make_tknzr(value)

        assert value_err_msg in str(err.value)

    # Test case: Correct input.
    for value in range(1, 10):
        assert make_tknzr(value).min_count == value
def test_max_vocab():
    r"""``max_vocab`` must be an integer larger than or equal to ``-1``.

    Non-``int`` values must raise ``TypeError``, ``-2`` must raise
    ``ValueError``, and every integer from ``-1`` through ``9`` must be
    readable back from the ``max_vocab`` attribute.
    """
    # Constructor arguments shared by every case; only `max_vocab` varies.
    fixed_kwargs = dict(is_uncased=True, min_count=1, tk2id=None)

    # Test case: Type mismatched.
    type_err_msg = '`max_vocab` must be an instance of `int`'
    non_int_values = (
        -1.0, 0.0, 1.0, '', (), [], {}, set(), None, ..., NotImplemented,
    )

    for value in non_int_values:
        with pytest.raises(TypeError) as err:
            WsTknzr(max_vocab=value, **fixed_kwargs)

        assert type_err_msg in str(err.value)

    # Test case: Invalid value.
    value_err_msg = '`max_vocab` must be larger than or equal to `-1`'

    with pytest.raises(ValueError) as err:
        WsTknzr(max_vocab=-2, **fixed_kwargs)

    assert value_err_msg in str(err.value)

    # Test case: Correct input.
    for value in range(-1, 10):
        tokenizer = WsTknzr(max_vocab=value, **fixed_kwargs)
        assert tokenizer.max_vocab == value
def test_lower_case(is_uncased: bool, cased_txt: Dict[str, str]):
    r"""Check the effect of ``is_uncased`` on ``norm``.

    When the tokenizer reports ``is_uncased`` as true, normalizing
    ``cased_txt['input']`` must give ``cased_txt['output']``; otherwise it
    must give ``cased_txt['input']`` back.
    """
    tokenizer = WsTknzr(
        is_uncased=is_uncased,
        max_vocab=-1,
        min_count=1,
        tk2id=None,
    )

    # Pick which entry of `cased_txt` the normalized text is compared to.
    expected_key = 'output' if tokenizer.is_uncased else 'input'

    assert tokenizer.norm(cased_txt['input']) == cased_txt[expected_key]
def test_slow_tensor_dset(max_seq_len: int) -> None:
  """Wrap a dataset in ``SlowTensorDset`` and inspect every sample.

  The wrapper must have the same length as the wrapped dataset, and each
  sample must be a ``torch.Tensor`` of size ``[max_seq_len]`` that is equal to
  the sample obtained by indexing at the same position.
  """
  tokenizer = WsTknzr(is_uncased=True, max_vocab=-1, min_count=10)
  tokenizer.build_vocab(batch_txt=['a', 'b', 'c'])

  src_dset = WikiText2Dset(ver='valid')
  tensor_dset = lmp.util.dset.SlowTensorDset(
    dset=src_dset,
    max_seq_len=max_seq_len,
    tknzr=tokenizer,
  )

  assert isinstance(tensor_dset, lmp.util.dset.SlowTensorDset)
  assert len(tensor_dset) == len(src_dset)

  # Iterate and index side by side so both access paths are exercised.
  for pos, sample in enumerate(tensor_dset):
    assert isinstance(sample, torch.Tensor), 'Each sample in the tensor dataset must be tensor.'
    assert sample.size() == torch.Size([max_seq_len]), 'Each sample in the tensor dataset must have same length.'
    assert torch.all(tensor_dset[pos] == sample), 'Support ``__getitem__`` and ``__iter__``.'
def test_enc(parameters, test_input: str, expected: List[int]):
    r"""Check that ``enc`` converts ``test_input`` into ``expected``.

    The tokenizer is configured from the ``is_uncased``, ``max_vocab``,
    ``min_count`` and ``tk2id`` entries of ``parameters``.  Unless
    ``parameters['max_seq_len']`` is ``-1``, the encoded output must have
    exactly that many elements.
    """
    ctor_keys = ('is_uncased', 'max_vocab', 'min_count', 'tk2id')
    tokenizer = WsTknzr(**{key: parameters[key] for key in ctor_keys})

    encoded = tokenizer.enc(test_input, max_seq_len=parameters['max_seq_len'])

    assert encoded == expected

    # The length check is skipped when `max_seq_len` is `-1`.
    if parameters['max_seq_len'] != -1:
        assert len(encoded) == parameters['max_seq_len']
def test_build_vocab(
    parameters,
    test_input: Sequence[str],
    expected: Dict[str, int],
):
    r"""Check that ``build_vocab`` leaves ``tk2id`` equal to ``expected``.

    The tokenizer is configured from the ``is_uncased``, ``max_vocab``,
    ``min_count`` and ``tk2id`` entries of ``parameters`` before
    ``build_vocab`` is called on ``test_input``.
    """
    is_uncased = parameters['is_uncased']
    max_vocab = parameters['max_vocab']
    min_count = parameters['min_count']
    init_tk2id = parameters['tk2id']

    tokenizer = WsTknzr(
        is_uncased=is_uncased,
        max_vocab=max_vocab,
        min_count=min_count,
        tk2id=init_tk2id,
    )
    tokenizer.build_vocab(test_input)

    assert tokenizer.tk2id == expected
def test_batch_enc(
    parameters,
    test_input: List[str],
    expected: List[List[int]],
):
    r"""Check that ``batch_enc`` converts ``test_input`` into ``expected``.

    The tokenizer is configured from the ``is_uncased``, ``max_vocab``,
    ``min_count`` and ``tk2id`` entries of ``parameters``.  Unless
    ``parameters['max_seq_len']`` is ``-1``, every encoded sequence must have
    exactly that many elements.
    """
    ctor_keys = ('is_uncased', 'max_vocab', 'min_count', 'tk2id')
    tokenizer = WsTknzr(**{key: parameters[key] for key in ctor_keys})

    batch_ids = tokenizer.batch_enc(
        test_input,
        max_seq_len=parameters['max_seq_len'],
    )

    assert batch_ids == expected

    # The length check is skipped when `max_seq_len` is `-1`.
    if parameters['max_seq_len'] != -1:
        for ids in batch_ids:
            assert len(ids) == parameters['max_seq_len']
def test_is_uncased():
    r"""``is_uncased`` must be an instance of `bool`.

    Non-``bool`` values must raise ``TypeError``; ``False`` and ``True`` must
    be readable back from the ``is_uncased`` attribute.
    """

    def make_tknzr(is_uncased):
        # Only `is_uncased` varies between the cases below.
        return WsTknzr(
            is_uncased=is_uncased,
            max_vocab=-1,
            min_count=1,
            tk2id=None,
        )

    # Test case: Type mismatched.
    type_err_msg = '`is_uncased` must be an instance of `bool`'
    non_bool_values = (
        0, 1, -1, 0.1, '', (), [], {}, set(), None, ..., NotImplemented,
    )

    for value in non_bool_values:
        with pytest.raises(TypeError) as err:
            make_tknzr(value)

        assert type_err_msg in str(err.value)

    # Test case: Correct input.
    for value in (False, True):
        assert make_tknzr(value).is_uncased == value
def test_ws_tknzr(capsys, ws_tknzr: WsTknzr, exp_name: str, seed: int) -> None:
  """Check that the tokenize script prints what ``ws_tknzr.tknz`` returns.

  ``lmp.script.tknz_txt.main`` is run with ``--exp_name``, ``--seed`` and
  ``--txt`` arguments; the string form of ``ws_tknzr.tknz(txt=...)`` must then
  appear in the captured standard output.
  """
  txt = 'a b c'
  cli_args = ['--exp_name', exp_name, '--seed', str(seed), '--txt', txt]

  lmp.script.tknz_txt.main(argv=cli_args)

  stdout_text = capsys.readouterr().out
  expected_repr = str(ws_tknzr.tknz(txt=txt))
  assert expected_repr in stdout_text
def ws_tknzr() -> WsTknzr:
    r"""Build the whitespace tokenizer shared by the tests.

    The vocabulary holds the four special tokens defined on ``WsTknzr``
    (mapped to their class-level ids) followed by ``'a'``, ``'b'`` and
    ``'c'`` mapped to ``4``, ``5`` and ``6``.
    """
    # Special tokens first, in the same order as their class attributes are
    # listed here.
    vocab = {
        WsTknzr.bos_tk: WsTknzr.bos_tkid,
        WsTknzr.eos_tk: WsTknzr.eos_tkid,
        WsTknzr.pad_tk: WsTknzr.pad_tkid,
        WsTknzr.unk_tk: WsTknzr.unk_tkid,
    }
    # Ordinary tokens afterwards.
    vocab.update({'a': 4, 'b': 5, 'c': 6})

    return WsTknzr(is_uncased=True, max_vocab=-1, min_count=1, tk2id=vocab)
示例#17
0
def test_ws_tknzr(
  exp_name: str,
  is_uncased: bool,
  max_vocab: int,
  min_count: int,
  tknzr_file_path: str,
) -> None:
  """Check that a tokenizer survives a save / load round trip.

  A ``WsTknzr`` is built, given a small vocabulary and saved through
  ``lmp.util.tknzr.save``.  ``tknzr_file_path`` must exist afterwards, and the
  instance returned by ``lmp.util.tknzr.load`` must be a ``WsTknzr`` whose
  configuration and lookup tables equal those of the saved instance.
  """
  saved = WsTknzr(is_uncased=is_uncased, max_vocab=max_vocab, min_count=min_count)
  saved.build_vocab(batch_txt=['a', 'b', 'c'])
  lmp.util.tknzr.save(exp_name=exp_name, tknzr=saved)
  assert os.path.exists(tknzr_file_path)

  restored = lmp.util.tknzr.load(exp_name=exp_name)
  assert isinstance(restored, WsTknzr)

  # Compare attribute by attribute, in a fixed order.
  for attr in ('is_uncased', 'max_vocab', 'min_count', 'tk2id', 'id2tk'):
    assert getattr(restored, attr) == getattr(saved, attr)
def test_vocab_size(
    parameters,
    expected: int,
):
    r"""``WsTknzr.vocab_size`` is an instance property.

    The value read from ``vocab_size`` must be an ``int`` and must equal
    ``expected``.  The tokenizer is configured from the ``is_uncased``,
    ``max_vocab``, ``min_count`` and ``tk2id`` entries of ``parameters``.
    """
    ctor_keys = ('is_uncased', 'max_vocab', 'min_count', 'tk2id')
    tokenizer = WsTknzr(**{key: parameters[key] for key in ctor_keys})

    # Type first, then value.
    assert isinstance(tokenizer.vocab_size, int)
    assert tokenizer.vocab_size == expected
def test_tk2id():
    r"""``tk2id`` must be a dictionary which maps `str` to `int`.

    The constructor is exercised with:

    - non-``dict`` values, which must raise ``TypeError``;
    - a dictionary with a non-``str`` key, which must raise ``TypeError``;
    - a dictionary with a non-``int`` value, which must raise ``TypeError``;
    - a dictionary with a negative value, which must raise ``ValueError``;
    - a well-formed dictionary, which must be readable back from ``tk2id``;
    - ``None``, after which ``tk2id`` must hold only the four special tokens
      defined on ``WsTknzr``.
    """

    def make_tknzr(tk2id):
        # Only `tk2id` varies between the cases below.
        return WsTknzr(
            is_uncased=True,
            max_vocab=-1,
            min_count=1,
            tk2id=tk2id,
        )

    # Test case: Type mismatched (not a dictionary at all).
    not_dict_msg = '`tk2id` must be an instance of `dict`'
    non_dict_values = (
        False, True, -1, 0, 1, -1.0, 0.1, '', (), [], set(), ...,
        NotImplemented,
    )

    for value in non_dict_values:
        with pytest.raises(TypeError) as err:
            make_tknzr(value)

        assert not_dict_msg in str(err.value)

    # Test cases: malformed dictionaries.  Each entry is the offending
    # dictionary, the exception type it must raise and the text the error
    # message must contain.  Wrong key / value types come first, the invalid
    # value last.
    malformed_cases = (
        (
            {1: 1},
            TypeError,
            'All keys in `tk2id` must be instances of `str`',
        ),
        (
            {'a': 'a'},
            TypeError,
            'All values in `tk2id` must be instances of `int`',
        ),
        (
            {'a': -1},
            ValueError,
            'All values in `tk2id` must be non-negative integers',
        ),
    )

    for bad_dict, exc_type, msg in malformed_cases:
        with pytest.raises(exc_type) as err:
            make_tknzr(bad_dict)

        assert msg in str(err.value)

    # Test case: Correct input.
    valid_tk2id = {'a': 1, 'b': 2}
    assert make_tknzr(valid_tk2id).tk2id == valid_tk2id

    # Test case: Default value.
    default_tk2id = {
        WsTknzr.bos_tk: WsTknzr.bos_tkid,
        WsTknzr.eos_tk: WsTknzr.eos_tkid,
        WsTknzr.pad_tk: WsTknzr.pad_tkid,
        WsTknzr.unk_tk: WsTknzr.unk_tkid,
    }
    assert make_tknzr(None).tk2id == default_tk2id
def test_nfkc(ws_tknzr: WsTknzr, non_nfkc_txt: Dict[str, str]):
    r"""Check NFKC handling of ``norm``.

    Normalizing ``non_nfkc_txt['input']`` must give
    ``non_nfkc_txt['output']``.
    """
    normalized = ws_tknzr.norm(non_nfkc_txt['input'])

    assert normalized == non_nfkc_txt['output']
示例#21
0
def ws_tknzr(exp_name: str, request, tknzr_file_path: None) -> WsTknzr:
    """Build, save and return a small whitespace tokenizer.

    The tokenizer gets its vocabulary from the texts ``'a'``, ``'b'`` and
    ``'c'`` and is stored under ``exp_name`` via ``lmp.util.tknzr.save``.

    ``request`` and ``tknzr_file_path`` are not read by the body.
    NOTE(review): ``tknzr_file_path`` is annotated ``None`` here -- confirm
    against the definition of that fixture.
    """
    vocab_src = ['a', 'b', 'c']

    tokenizer = WsTknzr(is_uncased=True, max_vocab=-1, min_count=0)
    tokenizer.build_vocab(batch_txt=vocab_src)

    # Persist before handing the instance out.
    lmp.util.tknzr.save(exp_name=exp_name, tknzr=tokenizer)
    return tokenizer
def test_collapse_whitespace(ws_tknzr: WsTknzr, cws_txt: Dict[str, str]):
    r"""Check whitespace collapsing of ``norm``.

    Normalizing ``cws_txt['input']`` must give ``cws_txt['output']``.
    """
    normalized = ws_tknzr.norm(cws_txt['input'])

    assert normalized == cws_txt['output']
def test_strip_whitespace(ws_tknzr: WsTknzr, htws_txt: Dict[str, str]):
    r"""Check whitespace stripping of ``norm``.

    Normalizing ``htws_txt['input']`` must give ``htws_txt['output']``.
    """
    normalized = ws_tknzr.norm(htws_txt['input'])

    assert normalized == htws_txt['output']