def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    subset = self._create_subset_file(afm, corpus)

    # Create WordPiece model with a normalizer and pre-tokenizer. Note that
    # BERT-specific normalizer and pre-tokenizer are used in this model.
    tokenizer = Tokenizer(WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Train tokenizer model with the subset of the corpus.
    trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=self.limit_alphabet,
                               special_tokens=[self.unk_token] + self.special_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train([subset.name], trainer=trainer)

    # Save trained vocabulary to an auxiliary output file.
    vocab = afm.create()
    tokenizer.model.save(os.path.dirname(vocab.name))
    os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'), vocab.name)

    return vocab
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""
    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # Input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing.
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.mkdir('./Tokenizer')
    bert_tokenizer.save("Tokenizer/tokenizer.json")
def get_tokenizer(args):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    if os.path.isdir(args.tokenizer_dir):
        vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
        merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    else:
        os.makedirs(args.tokenizer_dir)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)
    return tokenizer
def test_cannot_train_different_model(self):
    tokenizer = Tokenizer(models.BPE())
    trainer = trainers.UnigramTrainer(show_progress=False)
    with pytest.raises(Exception, match="UnigramTrainer can only train a Unigram"):
        tokenizer.train([], trainer)
class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([NFKC()])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=50000,
                             show_progress=True,
                             initial_alphabet=ByteLevel.alphabet(),
                             special_tokens=[
                                 "<s>",
                                 "<pad>",
                                 "</s>",
                                 "<unk>",
                                 "<mask>",
                                 "<company>",
                                 "<label>",
                                 "<category>",
                                 "<review>",
                             ])
        self.tokenizer.train(paths, trainer=trainer)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)
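# A minimal usage sketch for the BPE_token helper above; the corpus path and
# the output directory are hypothetical placeholders.
if __name__ == "__main__":
    bpe = BPE_token()
    bpe.bpe_train(["data/reviews_train.txt"])   # hypothetical training corpus
    bpe.save_tokenizer("tokenizer_out")         # writes vocab.json / merges.txt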
def main() -> None:
    args = parse_args()

    special_tokens = list(SPECIAL_TOKENS)
    if args.reserved < len(special_tokens):
        raise AssertionError(
            f"number of reserved tokens should be at least the number of "
            f"special tokens ({len(special_tokens)})")
    for i in range(len(special_tokens), args.reserved):
        special_tokens.append(f"[unused{i:03d}]")

    all_filenames = get_all_filenames(args.input)
    # "C:\Users\demianmedich\data\wiki\20191120.en\pp_cased/"

    tokenizer = Tokenizer(get_model(args.model))
    tokenizer.normalizer = normalizers.Sequence([
        NFKC(), StripAccents(), Lowercase()
    ])
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=args.vocab_size, special_tokens=special_tokens)
    tokenizer.train(all_filenames, trainer=trainer)

    model_files = tokenizer.model.save()
    sys.exit(0)
def generate_tokenizer(equations, output, vocab_size):
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    trainer = BpeTrainer(special_tokens=["[PAD]", "[BOS]", "[EOS]"],
                         vocab_size=vocab_size,
                         show_progress=True)
    tokenizer.train(equations, trainer=trainer)

    tokenizer.save(path=output, pretty=False)
def __create_tokenizer(self, files):
    # Create, train and save the tokenizer.
    print("Preparing tokenizer...")
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = WhitespaceSplit()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.train(files=files, trainer=trainer)
    return tokenizer
def load_or_train_tokenizer(file_paths, tokenizer_mode_path):
    '''
    Tries to load saved text tokenizer
    If there is none, trains the new tokenizer and saves it
    '''
    if not os.path.exists(tokenizer_mode_path):
        print('Tokenizer model not found, training one')

        from tokenizers.models import BPE
        from tokenizers import Tokenizer
        from tokenizers.decoders import ByteLevel as ByteLevelDecoder
        from tokenizers.normalizers import NFKC, Sequence
        from tokenizers.pre_tokenizers import ByteLevel
        from tokenizers.trainers import BpeTrainer

        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Sequence([
            NFKC()
        ])
        tokenizer.pre_tokenizer = ByteLevel()
        tokenizer.decoder = ByteLevelDecoder()

        trainer = BpeTrainer(
            vocab_size=50000,
            show_progress=True,
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>"
            ]
        )
        tokenizer.train(file_paths, trainer)

        if not os.path.exists(tokenizer_mode_path):
            os.makedirs(tokenizer_mode_path)
        tokenizer.model.save(tokenizer_mode_path, None)

    print('Loading trained tokenizer model')
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_mode_path)
    tokenizer.add_special_tokens({
        'eos_token': '</s>',
        'bos_token': '<s>',
        'unk_token': '<unk>',
        'pad_token': '<pad>',
        'mask_token': '<mask>'
    })
    return tokenizer
class BPETokenizer(object):
    def __init__(self,
                 vocab_size=25000,
                 min_freq=5,
                 lang="en",
                 files=[None, None]) -> None:
        """
        Args:
            vocab_size: (int)
            min_freq: minimum frequency
            lang:
            files: (List[str]) ["vocab.json", "merge.txt"]
        """
        super(BPETokenizer, self).__init__()
        self.tokenizer = Tokenizer(BPE(files[0], files[1]))
        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def train(self, files=None) -> None:
        if files is None:
            # `files` looks like: ["test.txt", "train.txt", "valid.txt"]
            files = [
                f"data/wikitext-103-raw/wiki.{split}.raw"
                for split in ["test", "train", "valid"]
            ]
        self.tokenizer.train(files, self.trainer)

    def save(self) -> None:
        self.tokenizer.model.save(f"data/tokenizer/{self.lang}")

    def encode(self, input: Union[str, List[str], Tuple[str]]) -> Encoding:
        return self.tokenizer.encode(input)

    def decode(self, input: Encoding) -> str:
        # Note: type(input) == Encoding
        return self.tokenizer.decode(input.ids)
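# A minimal usage sketch for the BPETokenizer wrapper above, assuming the
# default wikitext-103-raw files referenced in train() are available locally.
if __name__ == "__main__":
    import os

    bpe_tok = BPETokenizer(vocab_size=25000, min_freq=5, lang="en")
    bpe_tok.train()
    os.makedirs("data/tokenizer/en", exist_ok=True)  # create the output directory before save()
    bpe_tok.save()
    enc = bpe_tok.encode("hello world")
    print(enc.tokens, bpe_tok.decode(enc))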
def test_continuing_prefix_trainer_mismatch(self):
    UNK = "[UNK]"
    special_tokens = [UNK]
    tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
    trainer = trainers.BpeTrainer(special_tokens=special_tokens)
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
    )
    tokenizer.train(files=["data/big.txt"], trainer=trainer)
    tokenizer.save("data/tokenizer.json")
    tokenizer.from_file("data/tokenizer.json")
def test_train_with_special_tokens(self):
    filename = "tests/data/dummy-unigram-special_tokens-train.txt"
    with open(filename, "w") as f:
        f.write(
            """
[CLS] The Zen of Python, by Tim Peters [SEP]
[CLS] Beautiful is better than ugly. [SEP]
[CLS] Explicit is better than implicit. [SEP]
[CLS] Simple is better than complex. [SEP]
[CLS] Complex is better than complicated. [SEP]
[CLS] Flat is better than nested. [SEP]
[CLS] Sparse is better than dense. [SEP]
[CLS] Readability counts. [SEP]
[CLS] Special cases aren't special enough to break the rules. [SEP]
[CLS] Although practicality beats purity. [SEP]
[CLS] Errors should never pass silently. [SEP]
[CLS] Unless explicitly silenced. [SEP]
[CLS] In the face of ambiguity, refuse the temptation to guess. [SEP]
[CLS] There should be one-- and preferably only one --obvious way to do it. [SEP]
[CLS] Although that way may not be obvious at first unless you're Dutch. [SEP]
[CLS] Now is better than never. [SEP]
[CLS] Although never is often better than *right* now. [SEP]
[CLS] If the implementation is hard to explain, it's a bad idea. [SEP]
[CLS] If the implementation is easy to explain, it may be a good idea. [SEP]
[CLS] Namespaces are one honking great idea -- let's do more of those! [SEP]
"""
        )

    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(
        show_progress=False, special_tokens=["[PAD]", "[SEP]", "[CLS]"], unk_token="[UNK]"
    )
    tokenizer.train([filename], trainer=trainer)

    assert tokenizer.encode("[CLS] This is a test [SEP]").tokens == [
        "[CLS]",
        " T",
        "h",
        "i",
        "s",
        " is ",
        "a",
        " ",
        "te",
        "s",
        "t ",
        "[SEP]",
    ]
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"]
        # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(files, trainer=trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")
    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train([filename], trainer=trainer)

    os.remove(filename)
    return tokenizer
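# A minimal usage sketch for create_tokenizer() above, on a tiny in-memory corpus.
tok = create_tokenizer(['hello world', 'hello there'])
print(tok.get_vocab_size())
print(tok.encode('hello world').tokens)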
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()

    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)

    if save_tokenizer:
        tokenizer.save(tokenizer_filename)
    return tokenizer
def test_train_parallelism_with_custom_pretokenizer(self, train_files):
    class GoodCustomPretok:
        def split(self, n, normalized):
            # Here we just test that we can return a List[NormalizedString], it
            # does not really make sense to return twice the same otherwise
            return [normalized, normalized]

        def pre_tokenize(self, pretok):
            pretok.split(self.split)

    custom = pre_tokenizers.PreTokenizer.custom(GoodCustomPretok())
    bpe_tokenizer = Tokenizer(models.BPE())
    bpe_tokenizer.normalizer = normalizers.Lowercase()
    bpe_tokenizer.pre_tokenizer = custom

    if "TOKENIZERS_PARALLELISM" in os.environ:
        del os.environ["TOKENIZERS_PARALLELISM"]

    trainer = trainers.BpeTrainer(special_tokens=["<unk>"], show_progress=False)
    bpe_tokenizer.train([train_files["small"]], trainer=trainer)
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)

    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')
    return bert_tokenizer
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
class LitTokenizer:
    def __init__(self,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 lower=False,
                 lang=None):
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD,
            self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=
            f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD))
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(
                self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length,
                                             stride=0,
                                             strategy='longest_first')

    def load_vocab(self, vocab, merges):
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size,
                              min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        pad_idx = self.special_tokens.index(self.PAD_WORD)

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        d = {}
        for k in keys:
            # Collect same-type items (list of IDs, list of masks,...)
            d[k] = [x[k] for x in examples]

            # Get max length (value to pad)
            max_length = max([x.shape[-1] for x in d[k]])

            # Apply padding
            for i, x in enumerate(examples):
                unpadded_t = x[k]
                if k == "ids":
                    tmp = torch.full((max_length, ),
                                     fill_value=pad_idx,
                                     device=unpadded_t.device)  # All padding
                elif k == "attention_mask":
                    tmp = torch.full(
                        (max_length, ), fill_value=0,
                        device=unpadded_t.device)  # No attention mask
                else:
                    raise TypeError("Unknown key")
                tmp[:unpadded_t.shape[-1]] = unpadded_t
                d[k][i] = tmp
        return d

    def encode(self, x):
        return self.tokenizer.encode(x)

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            assert len(x.shape) == 2
            x = x.detach().cpu().numpy()
        return [self.tokenizer.decode(x_i) for x_i in x]
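# A minimal usage sketch for LitTokenizer above. It assumes the module-level
# tok_model / tok_trainer / tok_decoder aliases are bound to concrete
# tokenizers components (e.g. BPE, BpeTrainer and a matching decoder), and
# corpus.txt is a hypothetical training file.
lit_tok = LitTokenizer(padding=True, truncation=True, max_length=128, lower=True)
lit_tok.train_vocab(["corpus.txt"], vocab_size=32000, min_frequency=3)
print(lit_tok.get_vocab_size())
print(lit_tok.encode("a small example sentence").tokens)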
name_words = {n: " ".join(split_to_words(n)) for n in names}

with open(f"{proc_path}/names.txt", "w") as f:
    f.write("\n".join(list(name_words.values())))
    # f.write("\n".join(words))

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train([f"{proc_path}/names.txt"], trainer=trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)
tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json',
                                f'{proc_path}/merges.txt')

with open(f"{proc_path}/vocab.json", "r") as f:
    bpe_vocab = json.load(f)

bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")
default="bpe-bytelevel", type=str, help="The name of the output vocab files") args = parser.parse_args() files = glob.glob(args.files) if not files: print(f"File does not exist: {args.files}") exit(1) # Initialize an empty tokenizer tokenizer = Tokenizer(models.BPE.empty()) # Customize pre-tokenization and decoding tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False) tokenizer.decoder = decoders.ByteLevel.new() # And then train trainer = trainers.BpeTrainer.new( vocab_size=50000, min_frequency=2, show_progress=True, special_tokens=[ "<s>", "<pad>", "</s>" ], initial_alphabet=pre_tokenizers.ByteLevel.alphabet() ) tokenizer.train(trainer, files) # Save the files tokenizer.model.save(args.out, args.name)
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"

path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True,
                     initial_alphabet=ByteLevel.alphabet(),
                     min_frequency=2,
                     special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>", ])
tokenizer.train([path_train_src, path_train_tgt], trainer=trainer)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(path_data)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
#from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()

files = ['./processed/processed_wiki_ko.txt']
tokenizer.train(files, trainer)
tokenizer.save("wiki_tokenizer.json")
def train_tokenizer(
        input_file: str,
        vocab_file: str,
        temporary: str,
        subset_size: int = 512000000,
        vocab_size: int = 8000,
        limit_alphabet: int = 6000,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Train **WordPiece** tokenizer and save trained subword vocabulary.

    Note:
        Since tokenizers_ reads whole file data in training, this function
        could cause memory errors if `input_file` is too large. Under the
        assumption that `input_file` is shuffled randomly, a subset of the
        input corpus will be used in training.

    Caution:
        The subset of the input corpus is saved in the `temporary` directory.
        Please be careful not to delete the file while executing this
        function.

    Arguments:
        input_file (str): Input file path.
        vocab_file (str): Output vocabulary file path.
        temporary (str): Temporary directory where the subset of corpus would
            be saved.
        subset_size (int): The maximum number of lines in the subset.
        vocab_size (int): The number of subwords in the vocabulary.
        limit_alphabet (int): The maximum number of alphabets in vocabulary.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.

    .. _tokenizers: https://github.com/huggingface/tokenizers
    """
    # Create **WordPiece** model and add normalizer and pre-tokenizer.
    # BERT-specific normalizer and pre-tokenizer are used.
    tokenizer = Tokenizer(models.WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Split the head of input corpus file and save in `temporary` directory.
    subset_file = random_filename(temporary)
    _split_subset_from_file(input_file, subset_file, subset_size)

    # Train the model with the split subset of the corpus.
    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=limit_alphabet,
                               special_tokens=[unk_token] + control_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train([subset_file], trainer=trainer)

    # Save trained subword vocabulary in `temporary` directory and rename to
    # `vocab_file`.
    tokenizer.model.save(temporary)
    os.rename(os.path.join(temporary, 'vocab.txt'), vocab_file)

    # Remove temporary subset corpus.
    os.remove(subset_file)
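# A minimal usage sketch for train_tokenizer() above; all paths are
# hypothetical placeholders and the temporary directory is assumed to exist.
train_tokenizer(input_file='corpus.txt',
                vocab_file='vocab.txt',
                temporary='tmp',
                vocab_size=8000,
                control_tokens=['[CLS]', '[SEP]', '[MASK]', '[PAD]'])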
    for line in tqdm(fin):
        dp = json.loads(line.strip())
        for d in dp:
            if "value" in d:
                if "," in d["value"]:
                    print('Not cleaned up')

# Extract value/types from trees and store in comma separated raw file (all_raw.json)
with open("output/all_new_trees.json") as fin, open("output/all_raw.json", "w") as fout:
    for i, line in enumerate(tqdm(fin)):
        dp = json.loads(line)
        token_list = []
        for d in dp:
            if "value" in d:
                token_list.append(d["value"])
            elif "type" in d:
                token_list.append(d["type"])
        raw = ",".join(token_list)
        print(json.dumps(raw), file=fout)

# Train tokenizer on raw file
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=",")
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]"])
tokenizer.train(["output/all_raw.json"], trainer)
tokenizer.save("output/tokenizer.json")
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)
t.save("code_tokenizer.json")
from transformers import BertTokenizerFast
from transformers import BertConfig
import ipdb
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()
# trainer = trainers.BpeTrainer(
# A WordLevel model is trained with a WordLevelTrainer.
trainer = trainers.WordLevelTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train([uid_task_id_sequence_path], trainer=trainer)

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# tokenizer.save_model("tmp")
tokenizer.model.save('data/bert_and_tokenizer', 'uid_task_id')
# tokenizer = ByteLevelBPETokenizer(
#     "./tmp/vocab.json",
#     "./tmp/merges.txt",
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if train:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        #  - init Tokenizer using algo
        #  - specify algo specific trainer
        #  - specify any pre-processing of text (will affect decoding)
        #    see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        #  - different training calls if its the arxiv dataset or wikitext
        #    see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()

        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece(
            )  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
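# A minimal usage sketch for train_tokenizer_vocab() above; 'wikitext' stands
# in for whatever entries VALID_DATASETS actually contains.
tokenizer, vocab = train_tokenizer_vocab('wikitext', style='BPE', force_retrain=False)
print(tokenizer.get_vocab_size())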
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#train-a-new-tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)  # TODO True
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
tokenizer.train(['bar'], trainer=trainer)

encoded = tokenizer.encode(seq)
print(encoded.tokens)

# TODO: Use a clustered set of proteins like UniRef50
# -- https://www.uniprot.org/help/uniref
# TODO: Use an LSTM to train on sequences, then freeze early layers and add
# classification backend, retrain.

# https://github.com/huggingface/tokenizers/tree/master/bindings/python
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#provided-tokenizers
from tokenizers import CharBPETokenizer
tokenizer = CharBPETokenizer(bert_normalizer=False)
tokenizer.train(['./bar'], vocab_size=1000, min_frequency=2)
# tokenizer.encode(seq).tokens
                for s in g:
                    f.write(s)
                    f.write("\n\n")
        elif args.file_type == 'txt':
            shutil.copyfile(str(arch), str(fp))

    data_files = glob(str(out_path / "*.txt"))
    data_files = random.sample(data_files, int(0.2 * len(data_files)))
    assert len(data_files) > 0, 'No data files found'

    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(vocab_size=args.vocab_size,
                                  min_frequency=2,
                                  special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train(data_files, trainer=trainer)

    # And Save it
    tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
    tokenizer.save(str(tokenizer_path), pretty=True)
    print(f'tokenizer saved at {str(tokenizer_path)}')
    return tokenizer_path