def test_encode_add_special_tokens(self, roberta_files):
    """Check that ``encode`` honours the ``add_special_tokens`` flag."""
    text = "My name is John"
    word_pieces = ["ĠMy", "Ġname", "Ġis", "ĠJohn"]

    # The construction below must raise a deprecation warning, otherwise
    # ``pytest.deprecated_call`` fails the test.
    with pytest.deprecated_call():
        model = BPE(roberta_files["vocab"], roberta_files["merges"])
        tokenizer = Tokenizer(model)

    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    # (token, id) pairs, passed to the post-processor in this order.
    end_marker = ("</s>", tokenizer.token_to_id("</s>"))
    start_marker = ("<s>", tokenizer.token_to_id("<s>"))
    tokenizer.post_processor = RobertaProcessing(end_marker, start_marker)

    # Can encode with special tokens
    with_specials = tokenizer.encode(text, add_special_tokens=True)
    assert with_specials.tokens == ["<s>", *word_pieces, "</s>"]

    # Can encode without special tokens
    without_specials = tokenizer.encode(text, add_special_tokens=False)
    assert without_specials.tokens == word_pieces
def __init__(self, vocab_size=25000, min_freq=5, lang="en", files=None) -> None:
    """
    Set up a byte-level BPE tokenizer together with its trainer.

    Args:
        vocab_size: (int) vocabulary size handed to the BPE trainer
        min_freq: minimum frequency, handed to the trainer as ``min_frequency``
        lang: stored unchanged on the instance as ``self.lang``
        files: (List[str]) ["vocab.json", "merge.txt"]; the first two entries
            are passed to ``BPE`` in that order. When omitted, ``None`` is
            passed for both.
    """
    super(BPETokenizer, self).__init__()

    # A None sentinel replaces the former mutable list default, which was
    # one shared object across every call of this constructor.
    if files is None:
        files = (None, None)
    vocab_file, merges_file = files[0], files[1]

    self.tokenizer = Tokenizer(BPE(vocab_file, merges_file))
    self.lang = lang
    self.trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=["[PAD]", "[SEP]"],
        initial_alphabet=ByteLevel.alphabet(),
    )

    # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
    self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()
def __init__(self):
    """Create a BPE tokenizer with NFKC normalization and byte-level pre-tokenizer/decoder."""
    bpe_tokenizer = Tokenizer(BPE())
    bpe_tokenizer.normalizer = Sequence([NFKC()])
    bpe_tokenizer.pre_tokenizer = ByteLevel()
    bpe_tokenizer.decoder = ByteLevelDecoder()
    self.tokenizer = bpe_tokenizer
def test_can_modify(self):
    """``add_prefix_space`` can be read back and reassigned after construction."""
    pretok = ByteLevel(add_prefix_space=False)

    # Identity checks instead of ``== False`` / ``== True``: they also pin the
    # attribute to an actual bool rather than any value comparing equal to one.
    assert pretok.add_prefix_space is False

    # Modify these
    pretok.add_prefix_space = True
    assert pretok.add_prefix_space is True
def load_or_train_tokenizer(file_paths, tokenizer_mode_path):
    """
    Tries to load saved text tokenizer
    If there is none, trains the new tokenizer and saves it

    Args:
        file_paths: training files handed to ``Tokenizer.train``.
        tokenizer_mode_path: directory holding the saved tokenizer model; it is
            created and filled when it does not exist yet.

    Returns:
        The ``GPT2Tokenizer`` loaded from ``tokenizer_mode_path`` with the
        eos/bos/unk/pad/mask special tokens registered.
    """
    if not os.path.exists(tokenizer_mode_path):
        print('Tokenizer model not found, training one')

        # Imported lazily: only needed on the training path.
        from tokenizers.models import BPE
        from tokenizers import Tokenizer
        from tokenizers.decoders import ByteLevel as ByteLevelDecoder
        from tokenizers.normalizers import NFKC, Sequence
        from tokenizers.pre_tokenizers import ByteLevel
        from tokenizers.trainers import BpeTrainer

        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Sequence([
            NFKC()
        ])
        tokenizer.pre_tokenizer = ByteLevel()
        tokenizer.decoder = ByteLevelDecoder()

        trainer = BpeTrainer(
            vocab_size=50000,
            show_progress=True,
            # Was misspelled ``inital_alphabet``, so the byte-level alphabet
            # never reached the trainer.
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>"
            ]
        )
        tokenizer.train(file_paths, trainer)

        # exist_ok avoids a second existence check and the race between
        # checking and creating the directory.
        os.makedirs(tokenizer_mode_path, exist_ok=True)
        tokenizer.model.save(tokenizer_mode_path, None)

    print('Loading trained tokenizer model')

    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_mode_path)
    tokenizer.add_special_tokens({
        'eos_token': '</s>',
        'bos_token': '<s>',
        'unk_token': '<unk>',
        'pad_token': '<pad>',
        'mask_token': '<mask>'
    })

    return tokenizer
def tokenizer_pipeline():
    """
    Build the tokenization pipeline for the Cebuano corpus.

    The pipeline is a Byte pair encoding (BPE) tokenizer with NFD
    normalization, accent stripping and lowercasing, plus byte-level
    pre-tokenization and decoding.
    """
    # string normalization
    normalization_steps = [NFD(), StripAccents(), Lowercase()]

    pipeline = Tokenizer(BPE())
    pipeline.normalizer = Sequence(normalization_steps)
    pipeline.pre_tokenizer = ByteLevel()
    pipeline.decoder = ByteLevelDecoder()
    return pipeline
def test_instantiate(self):
    """ByteLevel builds with or without ``add_prefix_space`` and survives pickling."""
    for kwargs in ({}, {"add_prefix_space": True}, {"add_prefix_space": False}):
        assert ByteLevel(**kwargs) is not None

    instance = ByteLevel()
    assert isinstance(instance, PreTokenizer)
    assert isinstance(instance, ByteLevel)

    # Pickle round trip must give back the same type.
    round_tripped = pickle.loads(pickle.dumps(ByteLevel()))
    assert isinstance(round_tripped, ByteLevel)
# names = unique([mentions + list(concepts)], verbose=False)
print(f"Unique names: {len(names)}\n")

# Map every name to its space-joined word form; these strings are the
# training corpus written to names.txt below.
name_words = {n: " ".join(split_to_words(n)) for n in names}
names_file = f"{proc_path}/names.txt"

# Explicit UTF-8 so the corpus does not depend on the platform's default
# encoding (assumes the tokenizers library reads training files as UTF-8).
with open(names_file, "w", encoding="utf-8") as f:
    f.write("\n".join(name_words.values()))
    # f.write("\n".join(words))

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
# Keyword arguments: the positional order of ``train`` differs between
# tokenizers releases (trainer-first in older ones, files-first in newer).
tokenizer.train(files=[names_file], trainer=trainer)
print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

# Save the trained model into proc_path, then reload it from the
# vocab.json / merges.txt files in that directory.
tokenizer.model.save(proc_path)
tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json', f'{proc_path}/merges.txt')

# The byte-level vocab contains non-ASCII symbols, so read it as UTF-8.
with open(f"{proc_path}/vocab.json", "r", encoding="utf-8") as f:
    bpe_vocab = json.load(f)

# Inverse of the loaded mapping (value -> key).
bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    Load or train the tokenizer for ``dataset`` using tokenization ``style``.

    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists

    Args:
        dataset: must be one of VALID_DATASETS.
        style: must be one of VALID_TOKENIZATIONS; 'BPE' is handled explicitly,
            anything else is required to be 'WordLevel'.
        force_retrain: when true, always train and overwrite the saved file.

    Returns:
        ``(tokenizer, vocab)``; ``vocab`` is currently always ``None`` (see the
        TODO at the end of the function).
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)
    tokenizer_on_disk = os.path.isfile(tpath_expected)

    if tokenizer_on_disk and not force_retrain:
        tokenizer = Tokenizer.from_file(tpath_expected)
    else:
        # Report the real reason for training; previously "does not exist" was
        # printed even when the file existed and a retrain was forced.
        if tokenizer_on_disk:
            print('%s tokenizer file exists but force_retrain is set; training new tokenizer' % tpath_expected)
        else:
            print('%s tokenizer file does not exist; training new tokenizer' % tpath_expected)

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if its the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(special_tokens=special_tokens)
            tokenizer.pre_tokenizer = ByteLevel()
            decoder = decoders.ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(special_tokens=special_tokens)
            tokenizer.pre_tokenizer = Whitespace()
            decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # The training call depends only on the dataset, so it is shared by
        # both styles; the decoder is attached after training, as before.
        if dataset == 'arxiv':
            tokenizer.train_from_iterator(datafiles, trainer=trainer)
        else:
            tokenizer.train(datafiles, trainer=trainer)
        tokenizer.decoder = decoder

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
def test_instantiate(self):
    """ByteLevel can be constructed with default and explicit ``add_prefix_space``."""
    candidates = [
        ByteLevel(),
        ByteLevel(add_prefix_space=True),
        ByteLevel(add_prefix_space=False),
    ]
    assert all(candidate is not None for candidate in candidates)
    assert isinstance(ByteLevel(), PreTokenizer)
def test_manual_reload(self):
    """A ByteLevel rebuilt from its JSON-decoded state is still a ByteLevel."""
    original = ByteLevel()
    serialized_state = original.__getstate__()
    constructor_kwargs = json.loads(serialized_state)
    rebuilt = ByteLevel(**constructor_kwargs)
    assert isinstance(rebuilt, ByteLevel)