Example #1
# Imports assumed from ESPnet's espnet2.text package.
from pathlib import Path
from typing import Iterable, Union

from typeguard import check_argument_types

from espnet2.text.abs_tokenizer import AbsTokenizer
from espnet2.text.char_tokenizer import CharTokenizer
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.sentencepiece_tokenizer import SentencepiecesTokenizer
from espnet2.text.word_tokenizer import WordTokenizer


def build_tokenizer(
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: str = None,
    g2p_type: str = None,
) -> AbsTokenizer:
    """A helper function to instantiate Tokenizer"""
    assert check_argument_types()
    if token_type == "bpe":
        if bpemodel is None:
            raise ValueError('bpemodel is required if token_type = "bpe"')

        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel)

    elif token_type == "word":
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        else:
            return WordTokenizer(delimiter=delimiter)

    elif token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    elif token_type == "phn":
        if g2p_type is None:
            raise ValueError("g2p_type is required if token_type=phn")
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )
    else:
        raise ValueError(f"token_mode must be one of bpe, word, char or phn: "
                         f"{token_type}")
Example #2
@pytest.fixture(params=[None, " "])  # pytest fixture; the delimiter params are illustrative
def word_tokenizer(request):
    return WordTokenizer(delimiter=request.param)
Example #3
def test_tokens2text(word_tokenizer: WordTokenizer):
    assert word_tokenizer.tokens2text(
        "Hello World!!".split()) == "Hello World!!"
Example #4
def test_text2tokens(word_tokenizer: WordTokenizer):
    assert word_tokenizer.text2tokens("Hello World!! Ummm") == [
        "Hello",
        "World!!",
        "Ummm",
    ]
Example #5
def test_Text2Words_tokens2text(word_converter: WordTokenizer):
    assert word_converter.tokens2text("Hello World!!".split()) == "Hello World!!"
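The tests above exercise the plain word tokenizer; the word branch of build_tokenizer can also drop non-linguistic symbols. A minimal sketch, assuming the symbols are passed as an iterable of strings (the symbol names are illustrative):

tokenizer = WordTokenizer(
    delimiter=None,
    non_linguistic_symbols=["<noise>", "<laugh>"],  # illustrative symbol set
    remove_non_linguistic_symbols=True,
)
# Tokens listed in non_linguistic_symbols are filtered out of the output.
assert tokenizer.text2tokens("<noise> Hello World!!") == ["Hello", "World!!"]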