def test_can_modify(self):
    decoder = BPEDecoder(suffix="123")
    assert decoder.suffix == "123"

    # Modify these
    decoder.suffix = "</w>"
    assert decoder.suffix == "</w>"
def test_decoding(self):
    decoder = BPEDecoder()
    assert (
        decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
        == "My name is John"
    )
    decoder = BPEDecoder(suffix="_")
    assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"
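# The test above relies on BPEDecoder turning each end-of-word suffix back
# into a word boundary. A rough pure-Python equivalent of that behaviour
# (a sketch for illustration only, not the library's actual implementation):

def naive_bpe_decode(tokens, suffix="</w>"):
    # Concatenate the pieces, then treat every suffix occurrence as a space.
    return "".join(tokens).replace(suffix, " ").strip()

assert naive_bpe_decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]) == "My name is John"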
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                unk_token=unk_token,
                end_of_word_suffix=suffix,
            )
        )
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    # OpenAI normalization is the same as Bert
    normalizers += [BertNormalizer()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = BPEDecoder(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
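# A minimal usage sketch for the constructor above. Hedged assumptions: the
# enclosing class name `CharBPETokenizer` and the vocab/merges file paths are
# not confirmed by this excerpt; `encode`/`decode` are the usual methods of
# the library's implementation base class.
#
#     tokenizer = CharBPETokenizer("vocab.json", "merges.txt", suffix="</w>")
#     encoding = tokenizer.encode("My name is John")
#     # Sub-word tokens carry the end-of-word suffix, e.g. "Jo", "hn</w>";
#     # the BPEDecoder set on the tokenizer strips it back out on decode.
#     text = tokenizer.decode(encoding.ids)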
def test_instantiate(self):
    assert BPEDecoder() is not None
    assert BPEDecoder(suffix="_") is not None
    assert isinstance(BPEDecoder(), Decoder)
    assert isinstance(BPEDecoder(), BPEDecoder)
    assert isinstance(pickle.loads(pickle.dumps(BPEDecoder())), BPEDecoder)