def build_tokenizer(
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: str = None,
    g2p_type: str = None,
) -> AbsTokenizer:
    """Instantiate the tokenizer implementation matching ``token_type``.

    Args:
        token_type: One of "bpe", "word", "char" or "phn".
        bpemodel: SentencePiece model (required when token_type="bpe").
        non_linguistic_symbols: Symbols treated as non-linguistic
            (used by the word/char/phn tokenizers).
        remove_non_linguistic_symbols: Strip those symbols from the output.
            Not supported for token_type="bpe".
        space_symbol: Symbol standing in for whitespace (char/phn only).
        delimiter: Word delimiter (word only).
        g2p_type: Grapheme-to-phoneme backend (required when token_type="phn").

    Returns:
        A concrete AbsTokenizer for the requested token type.

    Raises:
        ValueError: If a required option is missing or token_type is unknown.
        RuntimeError: If an unsupported option combination is requested.
    """
    assert check_argument_types()
    if token_type == "bpe":
        if bpemodel is None:
            raise ValueError('bpemodel is required if token_type = "bpe"')
        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel)

    elif token_type == "word":
        # The remove flag is honored only when symbols were actually given;
        # otherwise fall back to a plain delimiter-based tokenizer.
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        else:
            return WordTokenizer(delimiter=delimiter)

    elif token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    elif token_type == "phn":
        if g2p_type is None:
            raise ValueError("g2p_type is required if token_type=phn")
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    else:
        # BUG FIX: the message previously named a nonexistent "token_mode";
        # the parameter is token_type, so report the real name.
        raise ValueError(
            f"token_type must be one of bpe, word, char or phn: {token_type}"
        )
def word_tokenizer(request):
    """Fixture body: build a WordTokenizer using the parametrized delimiter."""
    delimiter = request.param
    return WordTokenizer(delimiter=delimiter)
def test_tokens2text(word_tokenizer: WordTokenizer):
    """Joining word tokens back together reproduces the original text."""
    tokens = "Hello World!!".split()
    result = word_tokenizer.tokens2text(tokens)
    assert result == "Hello World!!"
def test_text2tokens(word_tokenizer: WordTokenizer):
    """Whitespace-delimited text splits into its individual words."""
    expected = ["Hello", "World!!", "Ummm"]
    assert word_tokenizer.text2tokens("Hello World!! Ummm") == expected
def test_Text2Words_tokens2text(word_tokenizer: WordTokenizer):
    """tokens2text joins word tokens back into the original sentence.

    NOTE(review): the parameter was previously ``word_converter``; pytest
    resolves fixtures by parameter name and no fixture of that name is
    defined in this file (only ``word_tokenizer``), so the test would fail
    with a missing-fixture error. Renamed to use the existing fixture —
    confirm no ``word_converter`` fixture exists elsewhere (e.g. conftest).
    """
    assert word_tokenizer.tokens2text("Hello World!!".split()) == "Hello World!!"