def __init__(
    self,
    vocab_file: Optional[str] = None,
    add_special_tokens: bool = True,
    unk_token: str = "[UNK]",
    sep_token: str = "[SEP]",
    cls_token: str = "[CLS]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    """Assemble a WordPiece tokenizer with BERT normalizer and pre-tokenizer.

    Args:
        vocab_file: Vocabulary path handed to ``WordPiece.from_files``. When
            ``None`` an empty WordPiece model is used and no post-processor
            is installed.
        add_special_tokens: When true and ``vocab_file`` is given, install a
            ``BertProcessing`` post-processor built from ``sep_token`` and
            ``cls_token``.
        unk_token: Unknown token, forwarded to the WordPiece model.
        sep_token: Separator token.
        cls_token: Classifier token.
        clean_text: Forwarded to ``BertNormalizer``.
        handle_chinese_chars: Forwarded to ``BertNormalizer``.
        strip_accents: Forwarded to ``BertNormalizer``.
        lowercase: Forwarded to ``BertNormalizer``.
        wordpieces_prefix: Prefix forwarded to the WordPiece decoder.

    Raises:
        TypeError: If the post-processor is requested and ``sep_token`` or
            ``cls_token`` has no id in the vocabulary.
    """
    if vocab_file is not None:
        tokenizer = Tokenizer(
            WordPiece.from_files(vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # The post-processor needs the ids of both tokens, so it can only be
    # built when a vocabulary was actually loaded.
    if add_special_tokens and vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(sep_token)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_token)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (sep_token, sep_token_id), (cls_token, cls_token_id))

    # The attribute is ``decoder`` (singular); ``decoders`` is the module
    # that provides the WordPiece decoder class.
    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "add_special_tokens": add_special_tokens,
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    """Assemble a WordPiece tokenizer with BERT normalizer and pre-tokenizer.

    Each special token is registered with the tokenizer only when it already
    has an id in the model's vocabulary. When ``vocab_file`` is given, a
    ``BertProcessing`` post-processor is built from the separator and
    classifier tokens.

    Args:
        vocab_file: Vocabulary path handed to ``WordPiece.from_files``; an
            empty WordPiece model is used when ``None``.
        unk_token: Unknown token, forwarded (as ``str``) to the model.
        sep_token: Separator token.
        cls_token: Classifier token.
        pad_token: Padding token.
        mask_token: Mask token.
        clean_text: Forwarded to ``BertNormalizer``.
        handle_chinese_chars: Forwarded to ``BertNormalizer``.
        strip_accents: Forwarded to ``BertNormalizer``.
        lowercase: Forwarded to ``BertNormalizer``.
        wordpieces_prefix: Prefix forwarded to the WordPiece decoder.

    Raises:
        TypeError: If ``vocab_file`` is given and ``sep_token`` or
            ``cls_token`` has no id in the vocabulary.
    """
    if vocab_file is None:
        model = WordPiece.empty()
    else:
        model = WordPiece.from_files(vocab_file, unk_token=str(unk_token))
    tokenizer = Tokenizer(model)

    # Tell the tokenizer about each special token that the vocab contains.
    for candidate in (unk_token, sep_token, cls_token, pad_token, mask_token):
        text = str(candidate)
        if tokenizer.token_to_id(text) is not None:
            tokenizer.add_special_tokens([text])

    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab_file is not None:
        sep_text = str(sep_token)
        sep_token_id = tokenizer.token_to_id(sep_text)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")

        cls_text = str(cls_token)
        cls_token_id = tokenizer.token_to_id(cls_text)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        sep_pair = (sep_text, sep_token_id)
        cls_pair = (cls_text, cls_token_id)
        tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    config = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, config)
def test_instantiate(self, bert_files):
    """Both WordPiece constructors must produce a ``Model`` instance."""
    empty_model = WordPiece.empty()
    assert isinstance(empty_model, Model)

    vocab_path = bert_files["vocab"]
    loaded_model = WordPiece.from_files(vocab_path)
    assert isinstance(loaded_model, Model)