Example #1
from tokenizers import pre_tokenizers

def pre_tokenizer(self, replacement, add_prefix_space):
    # Collapse runs of whitespace first, then apply SentencePiece-style
    # Metaspace replacement to each fragment.
    return pre_tokenizers.Sequence([
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Metaspace(replacement=replacement,
                                 add_prefix_space=add_prefix_space),
    ])
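
To see what this composition buys, run the pre-tokenizer on a string with repeated spaces. The sketch below is illustrative and assumes a `tokenizers` release that still accepts `add_prefix_space` (newer releases use `prepend_scheme` instead); because `WhitespaceSplit` runs first, runs of whitespace are collapsed before `Metaspace` prepends its `▁` marker.

from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
])
# The double space disappears before Metaspace runs, so no stray
# "▁" fragment is produced (exact offset tuples vary by version):
print(pre_tok.pre_tokenize_str("Hello  world"))
# tokens: ['▁Hello', '▁world']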
Example #2
from tokenizers import pre_tokenizers

def pre_tokenizer(self, replacement, add_prefix_space):
    # Metaspace alone, without a preceding whitespace splitter.
    return pre_tokenizers.Metaspace(replacement=replacement,
                                    add_prefix_space=add_prefix_space)
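
Without the leading `WhitespaceSplit`, `Metaspace` maps every whitespace character to the replacement symbol, so consecutive spaces survive as separate `▁` pieces. A minimal sketch of the difference, under the same API assumption as above:

from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
# Each of the two spaces becomes its own "▁", leaving a lone "▁"
# piece between the words:
print(pre_tok.pre_tokenize_str("Hello  world"))
# tokens: ['▁Hello', '▁', '▁world']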
Example #3
from typing import Dict, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers
from tokenizers.implementations.base_tokenizer import BaseTokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC


class SentencePieceBPETokenizer(BaseTokenizer):  # class name assumed, inferred from the "model" entry below
    def __init__(
        self,
        vocab: Union[str, Dict[str, int]],
        merges: Union[str, List[Tuple[str, str]]],
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        sep_token: str = "</s>",
        cls_token: str = "<s>",
        pad_token: str = "<pad>",
        unk_token: str = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        normalize: bool = True,
    ):
        # fuse_unk merges consecutive unknown tokens into a single <unk>.
        # The original snippet accepted dropout but never passed it to the
        # model; forward it here so BPE-dropout actually takes effect.
        bpe = BPE(
            vocab=vocab,
            merges=merges,
            unk_token=unk_token,
            dropout=dropout,
            fuse_unk=True,
        )

        tokenizer = Tokenizer(bpe)

        # SentencePiece-style pre-tokenization: whitespace becomes the
        # visible "▁" marker before the BPE model sees the text.
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        # The matching decoder turns "▁" back into spaces when mapping
        # ids to text.
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        if normalize:
            # NFKC folds compatibility characters (e.g. full-width forms)
            # into their canonical equivalents.
            tokenizer.normalizer = NFKC()

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
        # Wrap the special tokens so that neither side is stripped when
        # they are matched in text.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False)
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False)
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False)

        self.add_special_tokens([
            bos_token,
            eos_token,
            sep_token,
            cls_token,
            unk_token,
            pad_token,
        ])
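
A minimal usage sketch, with an invented toy vocab/merges pair and the class name assumed above (`SentencePieceBPETokenizer`): the two merges reassemble the pre-tokenized characters into the single piece `▁he`.

# Toy vocab and merges, invented purely for illustration.
vocab = {"<unk>": 0, "▁": 1, "h": 2, "e": 3, "he": 4, "▁he": 5}
merges = [("h", "e"), ("▁", "he")]

tok = SentencePieceBPETokenizer(vocab=vocab, merges=merges)

# add_prefix_space turns "he" into "▁he", which the merges above
# assemble back into one known piece.
enc = tok.encode("he")
print(enc.tokens)  # ['▁he']
print(enc.ids)     # [5]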