Example #1
    def test_can_modify(self):
        decoder = BPEDecoder(suffix="123")

        assert decoder.suffix == "123"

        # Modify the suffix after construction
        decoder.suffix = "</w>"
        assert decoder.suffix == "</w>"
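A quick illustration of why this setter matters: the suffix is what `decode` treats as the end-of-word marker. A minimal sketch, assuming the reassigned suffix takes effect for later `decode` calls (the test above only checks the attribute value):

    from tokenizers.decoders import BPEDecoder

    decoder = BPEDecoder(suffix="123")
    decoder.suffix = "</w>"
    # With the reassigned suffix, "</w>" marks word boundaries during decoding
    assert decoder.decode(["Hel", "lo</w>", "world</w>"]) == "Hello world"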
Example #2
    def test_decoding(self):
        decoder = BPEDecoder()
        assert decoder.decode(
            ["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]
        ) == "My name is John"
        decoder = BPEDecoder(suffix="_")
        assert decoder.decode(
            ["My_", "na", "me_", "is_", "Jo", "hn_"]
        ) == "My name is John"
Example #3
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: Optional[str] = "<unk>",
        suffix: Optional[str] = "</w>",
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
    ):
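        # Assumed imports for this snippet (an older, ~0.7-era tokenizers API
        # with BPE.from_files / BPE.empty; not shown in the original excerpt):
        #   from tokenizers import Tokenizer
        #   from tokenizers.models import BPE
        #   from tokenizers.normalizers import BertNormalizer, Sequence, unicode_normalizer_from_str
        #   from tokenizers.pre_tokenizers import BertPreTokenizer
        #   from tokenizers.decoders import BPEDecoder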
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        # OpenAI normalization is the same as Bert
        normalizers += [BertNormalizer()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
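A hypothetical usage sketch. The class name is an assumption (the excerpt never names the class; in the tokenizers library this constructor matches CharBPETokenizer), and the file paths are placeholders:

    # Hypothetical: "CharBPETokenizer" and the paths below are assumptions
    tokenizer = CharBPETokenizer("vocab.json", "merges.txt", unicode_normalizer="nfkc")
    output = tokenizer.encode("My name is John")
    print(output.tokens)  # sub-word pieces, each word ending in "</w>"
    # BertNormalizer lowercases by default, so round-tripping yields lowercase text
    print(tokenizer.decode(output.ids))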
Example #4
    def test_instantiate(self):
        assert BPEDecoder() is not None
        assert BPEDecoder(suffix="_") is not None
        assert isinstance(BPEDecoder(), Decoder)
        assert isinstance(BPEDecoder(), BPEDecoder)
        assert isinstance(pickle.loads(pickle.dumps(BPEDecoder())), BPEDecoder)
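The pickle assertion above only checks that the type survives the round trip. A minimal sketch, assuming the configured suffix is preserved through serialization as well (presumably alongside `import pickle` and `from tokenizers.decoders import Decoder, BPEDecoder` in the test file):

    import pickle
    from tokenizers.decoders import BPEDecoder

    restored = pickle.loads(pickle.dumps(BPEDecoder(suffix="_")))
    # Assumption: the suffix round-trips with the object
    assert restored.suffix == "_"
    assert restored.decode(["Hi_", "there_"]) == "Hi there"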