def get_recurrent_tokenizer(vocab, max_context_tokens, unk_token, pad_token, device="cpu"): """ Return a tokenizer to be used with recurrent-based models """ question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token)) question_tokenizer.normalizer = Sequence( [StripAccents(), Lowercase(), Strip()]) question_tokenizer.pre_tokenizer = PreSequence( [Whitespace(), Punctuation()]) question_tokenizer.enable_padding(direction="right", pad_id=vocab[pad_token], pad_type_id=1, pad_token=pad_token) context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token)) context_tokenizer.normalizer = Sequence( [StripAccents(), Lowercase(), Strip()]) context_tokenizer.pre_tokenizer = PreSequence( [Whitespace(), Punctuation()]) context_tokenizer.enable_padding( direction="right", pad_id=vocab[pad_token], pad_type_id=1, pad_token=pad_token, ) context_tokenizer.enable_truncation(max_context_tokens) return RecurrentSquadTokenizer(question_tokenizer, context_tokenizer, device=device)
def __init__(self, vocab_size=25000, min_freq=5, lang="en", files=[None, None]) -> None: """ Args: vocab_size: (int) min_freq: minimum frequency lang: files: (List[str]) ["vocab.json", "merge.txt"] """ super(BPETokenizer, self).__init__() self.tokenizer = Tokenizer(BPE(files[0], files[1])) self.lang = lang self.trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=min_freq, special_tokens=["[PAD]", "[SEP]"], initial_alphabet=ByteLevel.alphabet()) # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()]) # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers self.tokenizer.pre_tokenizer = ByteLevel() self.tokenizer.decoder = ByteLevelDecoder()
def main() -> None: args = parse_args() special_tokens = list(SPECIAL_TOKENS) if args.reserved < len(special_tokens): raise AssertionError( f"number of reserved tokens should be more than number of f{len(special_tokens)}") for i in range(len(special_tokens), args.reserved): special_tokens.append(f"[unused{i:03d}]") all_filenames = get_all_filenames(args.input) # "C:\Users\demianmedich\data\wiki\20191120.en\pp_cased/" tokenizer = Tokenizer(get_model(args.model)) tokenizer.normalizer = normalizers.Sequence([ NFKC(), StripAccents(), Lowercase() ]) tokenizer.pre_tokenizer = Whitespace() trainer = WordPieceTrainer( vocab_size=args.vocab_size, special_tokens=special_tokens) tokenizer.train(trainer, all_filenames) model_files = tokenizer.model.save() sys.exit(0)
def __init__( self, load_from: str = None, vocab_size: int = 10000, max_example_len: int = 128, batch_size: int = 16, num_stopwords: int = 250, mask_output_len: int = 4, ): self.char_dict: Dict[str, int] = {} self.char_rev: Dict[int, str] = {} self.token_dict: Dict[str, int] = {} self.token_rev: Dict[str, int] = {} self.vocab_size = vocab_size self.max_example_len = max_example_len self.batch_size = batch_size self.num_stopwords = num_stopwords self.mask_output_len = mask_output_len self.tokenizer_fit = False self.tokenizer = Tokenizer(BPE(unk_token="[UNK]")) self.tokenizer.pre_tokenizer = Whitespace() self.tokenizer.normalizer = Sequence( [NFD(), Lowercase(), StripAccents()]) self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"], vocab_size=self.vocab_size) if load_from: self._load(load_from)
def get_daily_dialog_tokenizer(tokenizer_location=None): ''' Get the daily dialog tokenizer. Trains a new one if no location is provided :param tokenizer_location: Json containing information about the tokenizer. :return: ''' if tokenizer_location: tokenizer = Tokenizer.from_file(tokenizer_location, ) tokenizer.enable_padding() return tokenizer else: dataset_train = datasets.load_dataset("daily_dialog", split="train", ) utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train] trainer = WordPieceTrainer( vocab_size = 2048, special_tokens = token_utils.special_tokens.values() ) custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"], )) custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()]) custom_tokenizer.pre_tokenizer = Whitespace() custom_tokenizer.train_from_iterator(utterances, trainer, ) custom_tokenizer.enable_padding() # Write every dialogue to file location = './daily_dialog/' custom_tokenizer.save(location + "tokenizer.json") return custom_tokenizer
def __init__( self, path_src, path_tgt, path_tokenizer, path_root: Optional[str] = '', ): self.path_src = path_root + path_src self.path_tgt = path_root + path_tgt self.len = 0 self.max_len = 512 self.tokenizer = Tokenizer( BPE( path_root + path_tokenizer + 'vocab.json', path_root + path_tokenizer + 'merges.txt', )) self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()]) with open(self.path_src, 'r+') as f: lines_src = f.readlines() with open(self.path_tgt, 'r+') as f: lines_tgt = f.readlines() self.len = len(lines_src) self.example = list(zip(lines_src, lines_tgt))
def train_tokenizer(langs, dataset, vocab_size): """Train a tokenizer on given list of languages. Reserves a special token for each language which is [LANG] where LANG is the language tag. These are assigned to tokens 5, 6, ..., len(langs) + 4. """ # Byte-pair encoding tokenizer = Tokenizer(BPE(unk_token='[UNK]')) # trainer lang_tokens = ['[' + lang + ']' for lang in langs] special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens trainer = BpeTrainer( special_tokens=special_tokens, vocab_size=vocab_size) # normalise and pre tokenize tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()]) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel() tokenizer.decoder = decoders.ByteLevel() # create iterator and train iterator = _MultilingualIterator(dataset, langs) tokenizer.train_from_iterator(iterator, trainer) # post process start/end tokens tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1", special_tokens=[ ("[CLS]", tokenizer.token_to_id("[CLS]")), ("[SEP]", tokenizer.token_to_id("[SEP]")), ], ) return tokenizer
def test_normalize(self): tokenizer = Tokenizer(BPE()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) tokenizer.normalizer = Lowercase() output = tokenizer.normalize("My Name Is John") assert output == "my name is john"
def train(): """Source: https://huggingface.co/docs/tokenizers/pipeline""" base = os.environ['DATA_ROOT'] corpus_path = base + 'MimicIII/Encounters/Text/' bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) # input to tokenizer.encode() goes through this pipeline: # normalization, pre-tokenization, model, post-processing bert_tokenizer.normalizer = normalizers.Sequence( [NFD(), Lowercase(), StripAccents()]) bert_tokenizer.pre_tokenizer = Whitespace() bert_tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1", special_tokens=[("[CLS]", 1), ("[SEP]", 2)]) files = [str(file) for file in Path(corpus_path).glob('*.txt')] trainer = WordPieceTrainer( vocab_size=30522, show_progress=True, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) bert_tokenizer.train(files, trainer) os.mkdir('./Tokenizer') bert_tokenizer.save("Tokenizer/tokenizer.json")
def __init__( self, vocab: Optional[Union[str, Dict[str, int]]] = None, merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None, add_prefix_space: bool = False, lowercase: bool = False, dropout: Optional[float] = None, unicode_normalizer: Optional[str] = None, continuing_subword_prefix: Optional[str] = None, end_of_word_suffix: Optional[str] = None, trim_offsets: bool = False, ): if vocab is not None and merges is not None: tokenizer = Tokenizer( BPE( vocab, merges, dropout=dropout, continuing_subword_prefix=continuing_subword_prefix or "", end_of_word_suffix=end_of_word_suffix or "", )) else: tokenizer = Tokenizer(BPE()) # Check for Unicode normalization first (before everything else) normalizers = [] if unicode_normalizer: normalizers += [unicode_normalizer_from_str(unicode_normalizer)] if lowercase: normalizers += [Lowercase()] # Create the normalizer structure if len(normalizers) > 0: if len(normalizers) > 1: tokenizer.normalizer = Sequence(normalizers) else: tokenizer.normalizer = normalizers[0] tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel( trim_offsets=trim_offsets) parameters = { "model": "ByteLevelBPE", "add_prefix_space": add_prefix_space, "lowercase": lowercase, "dropout": dropout, "unicode_normalizer": unicode_normalizer, "continuing_subword_prefix": continuing_subword_prefix, "end_of_word_suffix": end_of_word_suffix, "trim_offsets": trim_offsets, } super().__init__(tokenizer, parameters)
def __init__( self, vocab_file, delimiter, lowercase, unk_token, eos_token, add_eos=False, add_double_eos=False, normalization: Optional[str] = None, ): try: tokenizer = WordLevel(vocab_file, unk_token=unk_token) tokenizer = Tokenizer(tokenizer) except Exception: raise ValueError( "Unable to parse file {}. Unknown format. " "If you tried to load a model saved through TransfoXLTokenizer," "please note they are not compatible.".format(vocab_file)) # Create the correct normalization path normalizer = [] # Include unicode normalization if normalization: normalizer += [unicode_normalizer_from_str(normalization)] # Include case normalization if lowercase: normalizer += [Lowercase()] # Strip normalizer at the end normalizer += [Strip(left=True, right=True)] if len(normalizer) > 0: tokenizer.normalizer = Sequence( normalizer) if len(normalizer) > 1 else normalizer[0] # Setup the splitter tokenizer.pre_tokenizer = CharDelimiterSplit( delimiter) if delimiter else WhitespaceSplit() if add_double_eos: tokenizer.post_processor = BertProcessing( (eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token))) parameters = { "model": "TransfoXLModel", "add_eos": add_eos, "add_double_eos": add_double_eos, "unk_token": unk_token, "eos_token": eos_token, "delimiter": delimiter, "lowercase": lowercase, } super().__init__(tokenizer, parameters)
def __init__( self, vocab_file: Optional[str] = None, merges_file: Optional[str] = None, add_prefix_space: bool = False, lowercase: bool = False, dropout: Optional[float] = None, unicode_normalizer: Optional[str] = None, continuing_subword_prefix: Optional[str] = None, end_of_word_suffix: Optional[str] = None, ): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( BPE.from_files( vocab_file, merges_file, dropout=dropout, continuing_subword_prefix=continuing_subword_prefix or "", end_of_word_suffix=end_of_word_suffix or "", )) else: tokenizer = Tokenizer(BPE.empty()) # Check for Unicode normalization first (before everything else) normalizers = [] if unicode_normalizer: normalizers += [unicode_normalizer_from_str(unicode_normalizer)] if lowercase: normalizers += [Lowercase()] # Create the normalizer structure if len(normalizers) > 0: if len(normalizers) > 1: tokenizer.normalizer = Sequence(normalizers) else: tokenizer.normalizer = normalizers[0] tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.ByteLevel() parameters = { "model": "ByteLevelBPE", "add_prefix_space": add_prefix_space, "lowercase": lowercase, "dropout": dropout, "unicode_normalizer": unicode_normalizer, "continuing_subword_prefix": continuing_subword_prefix, "end_of_word_suffix": end_of_word_suffix, } super().__init__(tokenizer, parameters)
def normalizer(self, proto): normalizers = [Replace("``", '"'), Replace("''", '"')] if not self.original_tokenizer.keep_accents: normalizers.append(NFKD()) normalizers.append(StripAccents()) if self.original_tokenizer.do_lower_case: normalizers.append(Lowercase()) precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap normalizers.append(Precompiled(precompiled_charsmap)) return Sequence(normalizers)
def tokenizer_pipeline(): """ specific pipeline for Cebuano Corpus tokenization - Uses a Byte pair encoding (BPE) tokenizer """ tokenizer = Tokenizer(BPE()) # string normalization tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()]) tokenizer.pre_tokenizer = ByteLevel() tokenizer.decoder = ByteLevelDecoder() return tokenizer
def __init__( self, vocab_file: Optional[str] = None, unk_token: Union[str, AddedToken] = "[UNK]", pad_token: Union[str, AddedToken] = "[PAD]", mask_token: Union[str, AddedToken] = "[MASK]", lowercase: bool = False, unicode_normalizer: Optional[str] = None, ): if vocab_file is not None: logging.info(f"Initiating tokenizer at {vocab_file}") tokenizer = Tokenizer( WordLevel(vocab=vocab_file, unk_token=unk_token)) else: tokenizer = Tokenizer(WordLevel(unk_token=unk_token)) # Let the tokenizer know about special tokens if they are part of the vocab if tokenizer.token_to_id(str(unk_token)) is not None: tokenizer.add_special_tokens([str(unk_token)]) if tokenizer.token_to_id(str(pad_token)) is not None: tokenizer.add_special_tokens([str(pad_token)]) if tokenizer.token_to_id(str(mask_token)) is not None: tokenizer.add_special_tokens([str(mask_token)]) # Check for Unicode normalization first (before everything else) normalizers = [] if unicode_normalizer: normalizers += [unicode_normalizer_from_str(unicode_normalizer)] if lowercase: normalizers += [Lowercase()] # Create the normalizer structure if len(normalizers) > 0: if len(normalizers) > 1: tokenizer.normalizer = Sequence(normalizers) else: tokenizer.normalizer = normalizers[0] tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit() parameters = { "model": "WordLevel", "unk_token": unk_token, "pad_token": pad_token, "mask_token": mask_token, "lowercase": lowercase, "unicode_normalizer": unicode_normalizer, } super().__init__(tokenizer, parameters)
def __init__( self, vocab_file, delimiter, lowercase, unk_token, eos_token, add_eos=False, add_double_eos=False, normalization: Optional[str] = None, ): tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token) tokenizer = Tokenizer(tokenizer) # Create the correct normalization path normalizer = [] # Include unicode normalization if normalization: normalizer += [unicode_normalizer_from_str(normalization)] # Include case normalization if lowercase: normalizer += [Lowercase()] if len(normalizer) > 0: tokenizer.normalizer = Sequence( normalizer) if len(normalizer) > 1 else normalizer[0] # Setup the splitter tokenizer.pre_tokenizer = CharDelimiterSplit( delimiter) if delimiter else WhitespaceSplit() if add_double_eos: tokenizer.post_processor = BertProcessing( (eos_token, tokenizer.token_to_id(eos_token)), (eos_token, tokenizer.token_to_id(eos_token))) parameters = { "model": "TransfoXLModel", "add_eos": add_eos, "add_double_eos": add_double_eos, "unk_token": unk_token, "eos_token": eos_token, "delimiter": delimiter, "lowercase": lowercase, } super().__init__(tokenizer, parameters)
def _prepare_pipeline(self): self.tokenizer.normalizer = normalizers.Sequence( [NFD(), Lowercase(), StripAccents()]) self.tokenizer.pre_tokenizer = Whitespace() self.tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1", special_tokens=[ ("[CLS]", 1), ("[SEP]", 2), ], ) self.tokenizer.enable_padding( pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"), pad_token="[PAD]")
def main(args): # from tokenizers import BertWordPieceTokenizer from tokenizers import Tokenizer from tokenizers.models import WordPiece bert_tokenizer = Tokenizer(WordPiece()) # bert_tokenizer = Tokenizer(MBartTokenizer()) from tokenizers import normalizers from tokenizers.normalizers import Lowercase, NFD, StripAccents bert_tokenizer.normalizer = normalizers.Sequence( [NFD(), Lowercase(), StripAccents()]) from tokenizers.pre_tokenizers import Whitespace bert_tokenizer.pre_tokenizer = Whitespace() # from tokenizers.processors import TemplateProcessing # # bert_tokenizer.post_processor = TemplateProcessing( # single="[CLS] $A [SEP]", # pair="[CLS] $A [SEP] $B:1 [SEP]:1", # special_tokens=[ # ("[CLS]", 1), # ("[SEP]", 2), # ], # ) from tokenizers.trainers import WordPieceTrainer trainer = WordPieceTrainer( vocab_size=10000, special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"] # "[SEP]", "[PAD]", "[MASK]"] ) files = glob.glob(args.text_raw_files_pattern) bert_tokenizer.train(trainer, files) os.makedirs(args.output_dir, exist_ok=True) model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr") bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]") bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
def configure_tokenizers(self, padding, truncation, max_length, lower): # Settings pad_length = None if padding in {True, "longest"}: pass elif padding in {"max_length"}: pad_length = max_length elif padding in {False, "do_not_pad"}: pass else: raise ValueError("Unknown padding type") # SRC tokenizer tok_normalizers = [NFD(), Strip()] if lower: tok_normalizers += [Lowercase()] self.tokenizer = Tokenizer(tok_model()) # unk_token=... not working self.tokenizer.add_special_tokens(self.special_tokens) self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence( [WhitespaceSplit()]) self.tokenizer.normalizer = normalizers.Sequence( tok_normalizers) # StripAccents requires NFD self.tokenizer.decoder = tok_decoder() # Define template (Needed for the sos/eos tokens) basic_template = TemplateProcessing( single=f"{self.SOS_WORD} $A {self.EOS_WORD}", pair= f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}", special_tokens=[ (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)), (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)) ], ) self.tokenizer.post_processor = basic_template if padding: self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id( self.PAD_WORD), pad_token=self.PAD_WORD, length=pad_length) if truncation: self.tokenizer.enable_truncation(max_length, stride=0, strategy='longest_first')
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer: bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()]) bert_tokenizer.pre_tokenizer = Whitespace() bert_tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1", special_tokens=[ ("[CLS]", 1), ("[SEP]", 2), ], ) trainer = WordPieceTrainer( vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] ) bert_tokenizer.train_from_iterator(sentences, trainer=trainer) if serialize_path: bert_tokenizer.save(serialize_path) return bert_tokenizer
def train_wordpiece_bert(): """ Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html """ from tokenizers.models import WordPiece bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) from tokenizers import normalizers from tokenizers.normalizers import Lowercase, NFD, StripAccents bert_tokenizer.normalizer = normalizers.Sequence( [NFD(), Lowercase(), StripAccents()]) from tokenizers.pre_tokenizers import Whitespace bert_tokenizer.pre_tokenizer = Whitespace() from tokenizers.processors import TemplateProcessing bert_tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1", special_tokens=[ ("[CLS]", 1), ("[SEP]", 2), ], ) bert_tokenizer.decoder = decoders.WordPiece() from tokenizers.trainers import WordPieceTrainer trainer = WordPieceTrainer( vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) files = [ DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a for a in ["test", "train", "valid"] ] bert_tokenizer.train(files, trainer) bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json') return bert_tokenizer
def __init__(self, tokenizer: PreTrainedTokenizerFast, cased: bool, target_vocab_size: int): """ Args: tokenizer: A Rust-based 🤗 Tokenizer cased: If False, ignore uppercases in corpus target_vocab_size: Size of augmented vocabulary Raises: ValueError: If :obj:`target_vocab_size` is larger or equal to the existing vocabulary of :obj:`tokenizer` RuntimeError: If :obj:`tokenizer` uses an unsupported tokenization model """ if target_vocab_size <= tokenizer.vocab_size: raise ValueError( f"Ensure that `target_vocab_size` is larger than tokenizer's vocab size." ) self.tokenizer = tokenizer self.cased = cased self.target_vocab_size = target_vocab_size self.model_cls: Type[ BaseTokenizer] = tokenizer.backend_tokenizer.model.__class__ # Instantiate rust tokenizer rust_tokenizer = Tokenizer(self.model_cls()) if not cased: rust_tokenizer.normalizer = Lowercase() rust_tokenizer.pre_tokenizer = Whitespace() self.rust_tokenizer = rust_tokenizer # Instantiate the appropriate Trainer based on `self.model` (i.e. BPE, WordPiece, etc) trainer_cls = self.supported_trainers.get(self.model_cls, None) if trainer_cls is None: raise RuntimeError(f"{self.model_cls} is not supported") self.trainer = trainer_cls( vocab_size=self.target_vocab_size, special_tokens=list(self.tokenizer.special_tokens_map.values()), )
def test_can_make_sequences(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = Sequence([Lowercase(), Strip()]) output = tokenizer.normalize(" HELLO ") assert output == "hello"
# name = mention + concept names = unique([mentions + list(concepts)], verbose=False) - stop_words # names = unique([mentions + list(concepts)], verbose=False) print(f"Unique names: {len(names)}\n") name_words = {n: " ".join(split_to_words(n)) for n in names} with open(f"{proc_path}/names.txt", "w") as f: f.write("\n".join(list(name_words.values()))) # f.write("\n".join(words)) tokenizer = Tokenizer(BPE()) tokenizer.normalizer = Sequence([ # NFKC(), Lowercase() ]) tokenizer.pre_tokenizer = ByteLevel() tokenizer.decoder = ByteLevelDecoder() trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True) tokenizer.train(trainer, [f"{proc_path}/names.txt"]) print("Trained vocab size: {}".format(tokenizer.get_vocab_size())) tokenizer.model.save(proc_path) tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json', f'{proc_path}/merges.txt') with open(f"{proc_path}/vocab.json", "r") as f: bpe_vocab = json.load(f)
def test_lowercase(self): normalizer = Lowercase() output = normalizer.normalize_str("HELLO") assert output == "hello"
def test_instantiate(self): assert isinstance(Lowercase(), Normalizer) assert isinstance(Lowercase(), Lowercase) assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)
def test_can_make_sequences(self): normalizer = Sequence([Lowercase(), Strip()]) output = normalizer.normalize_str(" HELLO ") assert output == "hello"
def prepare_tokenizer(self): tokenizer = Tokenizer(WordPiece(unk_token='[UNK]')) tokenizer.pre_tokenizer = Whitespace() tokenizer.normalizer = Sequence([Lowercase(), NFKC()]) return tokenizer
from tokenizers.models import BPE, WordPiece, Unigram from tokenizers.normalizers import Lowercase from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer TRAIN_DATA_PATH = 'data/data_fusion_train.parquet' OUTPUT_PATH = 'data/tokenizers/' # Prepare data train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name']) item_names = train.item_name.drop_duplicates().tolist() # WordPiece tokenizer tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()]) tokenizer.normalizer = Lowercase() trainer = WordPieceTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=70000) tokenizer.train_from_iterator(item_names, trainer) tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json')) # BPE tokenizer tokenizer = Tokenizer(BPE(unk_token="[UNK]")) tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()]) tokenizer.normalizer = Lowercase() trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=60000)
def test_lowercase(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = Lowercase() output = tokenizer.normalize("HELLO") assert output == "hello"