Example #1
def test_sentencepiece_tokenizer():
    # Fetch a small pretrained BPE SentencePiece model used only by this test.
    url_format = 'https://apache-mxnet.s3-accelerate.amazonaws.com/gluon/dataset/vocab/{}'
    filename = 'test-0690baed.bpe'
    download(url_format.format(filename), path=os.path.join('tests', 'data', filename))
    tokenizer = t.SentencepieceTokenizer(os.path.join('tests', 'data', filename))
    detokenizer = t.SentencepieceDetokenizer(os.path.join('tests', 'data', filename))
    text = "Introducing Gluon: An Easy-to-Use Programming Interface for Flexible Deep Learning."
    try:
        # Tokenize into subword pieces, then reassemble the original string.
        ret = tokenizer(text)
        detext = detokenizer(ret)
    except ImportError:
        warnings.warn("Sentencepiece not installed, skip test_sentencepiece_tokenizer().")
        return
    assert isinstance(ret, list)
    assert all(token in tokenizer.tokens for token in ret)
    assert len(ret) > 0
    assert text == detext
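
The listing above is a test body taken out of context, so the module-level setup it relies on is not shown. Below is a minimal sketch of the imports both examples appear to assume; the alias 't' for GluonNLP's transforms module and the source of 'download' are assumptions, not part of the original listing.

# Assumed setup for the examples in this section (not shown in the original listing).
import os
import warnings

from mxnet.gluon.utils import download   # fetches the test model file over HTTP
import gluonnlp.data.transforms as t     # assumption: 't' aliases GluonNLP's transforms module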
Example #2
def test_sentencepiece_tokenizer_subword_regularization():
    # Fetch a small pretrained unigram SentencePiece model used only by this test.
    url_format = 'https://apache-mxnet.s3-accelerate.amazonaws.com/gluon/dataset/vocab/{}'
    filename = 'test-31c8ed7b.uni'
    download(url_format.format(filename), path=os.path.join('tests', 'data', filename))
    # The extra arguments (-1, 0.1) turn on sampling: -1 draws from all segmentation
    # candidates and 0.1 is the smoothing parameter, i.e. subword regularization.
    tokenizer = t.SentencepieceTokenizer(os.path.join('tests', 'data', filename),
                                         -1, 0.1)
    detokenizer = t.SentencepieceDetokenizer(os.path.join('tests', 'data', filename))
    text = "Introducing Gluon: An Easy-to-Use Programming Interface for Flexible Deep Learning."
    try:
        # Tokenize the same sentence repeatedly; sampling should yield varying segmentations.
        reg_ret = [tokenizer(text) for _ in range(10)]
        detext = detokenizer(reg_ret[0])
    except ImportError:
        warnings.warn("Sentencepiece not installed, skip test_sentencepiece_tokenizer().")
        return
    assert text == detext
    assert any(ret != reg_ret[0] for ret in reg_ret[1:])
    assert all(token in tokenizer.tokens for ret in reg_ret for token in ret)
    assert all(detokenizer(ret) == detext for ret in reg_ret)
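
For reference, the sampling exercised in Example #2 comes from the sentencepiece package itself, which the GluonNLP wrapper delegates to. The sketch below shows the same idea with sentencepiece used directly; it assumes the unigram model file from Example #2 has already been downloaded, and it is an illustration rather than part of the original test.

# Minimal sketch of subword regularization with the sentencepiece package directly.
import sentencepiece as spm

model_path = 'tests/data/test-31c8ed7b.uni'   # assumes the model from Example #2 was downloaded
sp = spm.SentencePieceProcessor()
sp.Load(model_path)

text = "Introducing Gluon: An Easy-to-Use Programming Interface for Flexible Deep Learning."
# nbest_size=-1 samples from all segmentation hypotheses; alpha=0.1 smooths the distribution.
samples = [sp.SampleEncodeAsPieces(text, -1, 0.1) for _ in range(5)]
print(samples[0])                    # one sampled segmentation into subword pieces
print(sp.DecodePieces(samples[0]))   # detokenizes back to the original sentence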