import os

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers
from tokenizers.normalizers import NFKC, Replace, Sequence


def get_tokenizer(args):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    if os.path.isdir(args.tokenizer_dir):
        # Reuse a previously trained model if one exists on disk.
        vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
        merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    else:
        # Otherwise train a fresh BPE model on the dataset splits and save it.
        os.makedirs(args.tokenizer_dir)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)
    return tokenizer
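# Usage sketch for get_tokenizer above: the Namespace fields mirror the attributes the
# function reads (tokenizer_dir, data_dir, vocab_size); the concrete values are
# illustrative assumptions, not part of the original code.
from argparse import Namespace

args = Namespace(tokenizer_dir='data/tokenizer', data_dir='data', vocab_size=32_000)
tokenizer = get_tokenizer(args)

# Round-trip a sample sentence through the byte-level BPE pipeline.
encoding = tokenizer.encode("Hello world!")
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))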
def get_tokenizer(self, tokenizer_dir):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    vocab_fn = os.path.join(tokenizer_dir, 'vocab.json')
    merge_fn = os.path.join(tokenizer_dir, 'merges.txt')
    tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    tokenizer.add_special_tokens(['[UNK]', '[PAD]', '[BOS]', '[EOS]'])
    return tokenizer
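# Usage sketch for the loader above. `loader` stands in for whatever object owns this
# get_tokenizer method (an assumption, since the class itself is not shown); the rest
# uses standard Tokenizer calls to turn the registered special tokens into padding and
# truncation settings.
tokenizer = loader.get_tokenizer('data/tokenizer')
pad_id = tokenizer.token_to_id('[PAD]')
tokenizer.enable_padding(pad_id=pad_id, pad_token='[PAD]')
tokenizer.enable_truncation(max_length=512)
batch = tokenizer.encode_batch(["a short sentence", "a somewhat longer second sentence"])
print([enc.ids for enc in batch])  # all id lists padded to the same length (<= 512)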
import glob
import os


def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    # Current tokenizers API: files first, trainer as keyword.
    bert_tokenizer.train(files, trainer=trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")
    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
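# Sketch: reload the tokenizer serialized by main() above and encode a sample sentence.
# Tokenizer.from_file restores the whole pipeline (normalizer, pre-tokenizer, model)
# from the single JSON file; the output directory below is a hypothetical stand-in for
# args.output_dir.
from tokenizers import Tokenizer

reloaded = Tokenizer.from_file("output/bert-tokenizer-kr.json")
encoding = reloaded.encode("안녕하세요, 토크나이저 테스트입니다.")
print(encoding.tokens)
print(encoding.ids)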
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
# Current tokenizers API: files first, trainer as keyword.
tokenizer.train([f"{proc_path}/names.txt"], trainer=trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)
tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json', f'{proc_path}/merges.txt')

# Build a character-level map from the saved BPE vocabulary (ids shifted by 1 so that
# 0 can serve as padding).
with open(f"{proc_path}/vocab.json", "r") as f:
    bpe_vocab = json.load(f)

bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")

MAX_LEN_OF_WORD = max([len(w) for w in bpe_vocab])
print(f"Max length of word: {MAX_LEN_OF_WORD}\n")

if ZERO_PAD:
    word_map = {
        k: [char_map[c] for c in k] + [0] * (MAX_LEN_OF_WORD - len(k))
tokenizer.train(["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"], trainer=trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

# Et voilà ! You trained your very first tokenizer from scratch using tokenizers.
# Of course, this covers only the basics, and you may want to have a look at the
# add_special_tokens or special_tokens parameters on the Trainer class, but the
# overall process should be very similar.

# You will see the generated files in the output.
tokenizer.model.save('/Volumes/750GB-HDD/root/Question-Answering/pyData')

# Let's tokenize a simple input
tokenizer.model = BPE.from_file(pyData + 'vocab.json', pyData + 'merges.txt')
encoding = tokenizer.encode("This is a simple input to be tokenized")
print("Encoded string: {}".format(encoding.tokens))

decoded = tokenizer.decode(encoding.ids)
print("Decoded string: {}".format(decoded))

# Getting started with transformers
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

torch.set_grad_enabled(False)

# Store the model we want to use
MODEL_NAME = "bert-base-cased"
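# Sketch continuing the transformers quick start above: load the pretrained tokenizer
# and model named by MODEL_NAME and run one forward pass. Standard AutoTokenizer /
# AutoModel usage; the sample sentence is an arbitrary choice.
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

inputs = hf_tokenizer("This is a simple input to be tokenized", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)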
import fire
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(files, trainer=trainer)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
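# Usage sketch (assumptions: the script above is saved as train_wordpiece.py and the
# corpus lives at data/corpus.txt). python-fire exposes the train() parameters as flags:
#
#   python train_wordpiece.py --dataset_path=data/corpus.txt --vocab_size=30000 --min_frequency=3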
print("Trained vocab size: {}".format(tokenizer.get_vocab_size())) # %% SAVE_PATH = Path('tokenizers') PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model' if not PATH.exists(): PATH.mkdir(parents=True, exist_ok=True) # %% # 保存模型 tokenizer.model.save(str(PATH)) # %% # 在需要时重新载入使用(可与transformers无缝衔接配合使用) # 注意,实践中这里需要按训练时的情况重新构建好tokenizer再载入model tokenizer.model = BPE(vocab=str(PATH / 'vocab.json'), merges=str(PATH / 'merges.txt')) # %% # 编码/解码 encoded = \ tokenizer.encode("This is a simple input to be tokenized.") print("Encoded string: {}".format(encoded.tokens)) decoded = \ tokenizer.decode(encoded.ids) print("Decoded string: {}".format(decoded)) # %% from tokenizers import ByteLevelBPETokenizer # tokenizer提供了一些经典tokenization算法的高级封装 # 譬如可以用`ByteLevelBPETokenizer`简单地重写上面的内容
import fire
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(files, trainer=trainer)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = BPE.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}/tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
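# Sketch: reload the serialized tokenizer written by train() above (using its default
# output_dir) and check that the special tokens survived the round trip.
from tokenizers import Tokenizer

reloaded = Tokenizer.from_file('data/tokenizer/tokenizer.json')
print(reloaded.get_vocab_size())
print(reloaded.token_to_id('[UNK]'), reloaded.token_to_id('[PAD]'))
print(reloaded.encode("hello tokenizer world").tokens)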