def train_tokenizer(args):
    """Train a tokenizer.

    Arguments:
        args {dictionary} -- arguments object
    """
    # Tokenizer train
    morpheme_func = None
    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer_type: one of ["bbpe", "cbpe", "wp"] (default "bbpe")
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = ByteLevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        # CharDelimiterSplit must be instantiated with a delimiter; a space is assumed here
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace()
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    # pass the configured trainer explicitly, otherwise default training parameters are used
    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func), trainer)
    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")

    # Korean sample sentence: "Hello, this is a test. The clouds float in the sky and we are here."
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)
    for line in datasets:
        print(line)
        break
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on a given list of languages.

    Reserves a special token for each language of the form [LANG], where LANG
    is the language tag. These are assigned to token ids 5, 6, ..., len(langs) + 4.
    """
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    lang_tokens = ['[' + lang + ']' for lang in langs]
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens
    trainer = BpeTrainer(
        special_tokens=special_tokens,
        vocab_size=vocab_size)

    # normalise and pre-tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # create iterator and train
    iterator = _MultilingualIterator(dataset, langs)
    tokenizer.train_from_iterator(iterator, trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
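# The snippet above relies on a `_MultilingualIterator` helper that is not shown.
# A minimal sketch of what such an iterator could look like, assuming `dataset`
# maps each language tag to an iterable of raw sentences and that every line is
# prefixed with its [LANG] token (hypothetical; the original helper may differ):
class _MultilingualIterator:
    def __init__(self, dataset, langs):
        self.dataset = dataset
        self.langs = langs

    def __iter__(self):
        # the interleaving strategy is an assumption; languages are yielded one after another
        for lang in self.langs:
            for sentence in self.dataset[lang]:
                yield f"[{lang}] {sentence}"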
def __init__(self, vocab_size=25000, min_freq=5, lang="en",
             files=[None, None]) -> None:
    """
    Args:
        vocab_size: (int) target vocabulary size
        min_freq: (int) minimum token frequency
        lang: (str) language code
        files: (List[str]) ["vocab.json", "merge.txt"]
    """
    super(BPETokenizer, self).__init__()
    self.tokenizer = Tokenizer(BPE(files[0], files[1]))
    self.lang = lang
    self.trainer = BpeTrainer(vocab_size=vocab_size,
                              min_frequency=min_freq,
                              special_tokens=["[PAD]", "[SEP]"],
                              initial_alphabet=ByteLevel.alphabet())
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
    self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()
def bpe_train(self, paths):
    trainer = BpeTrainer(
        vocab_size=50000,
        show_progress=True,
        initial_alphabet=ByteLevel.alphabet(),
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    self.tokenizer.train(paths, trainer)
def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 30000,
    min_frequency: int = 2,
    special_tokens: List[str] = ["<unk>"],
    limit_alphabet: int = 1000,
    initial_alphabet: List[str] = [],
    suffix: Optional[str] = "</w>",
    show_progress: bool = True,
):
    """Train the model using the given files."""
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=special_tokens,
        limit_alphabet=limit_alphabet,
        initial_alphabet=initial_alphabet,
        end_of_word_suffix=suffix,
        show_progress=show_progress,
    )
    if isinstance(files, str):
        files = [files]
    self._tokenizer.train(trainer, files)
def __init__(
    self,
    load_from: str = None,
    vocab_size: int = 10000,
    max_example_len: int = 128,
    batch_size: int = 16,
    num_stopwords: int = 250,
    mask_output_len: int = 4,
):
    self.char_dict: Dict[str, int] = {}
    self.char_rev: Dict[int, str] = {}
    self.token_dict: Dict[str, int] = {}
    self.token_rev: Dict[str, int] = {}
    self.vocab_size = vocab_size
    self.max_example_len = max_example_len
    self.batch_size = batch_size
    self.num_stopwords = num_stopwords
    self.mask_output_len = mask_output_len
    self.tokenizer_fit = False
    self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                  vocab_size=self.vocab_size)
    if load_from:
        self._load(load_from)
def train_tokenizer(lang, dataset, vocab_size):
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    trainer = BpeTrainer(
        special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
        vocab_size=vocab_size)

    # pre tokenizer with whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # train
    tokenizer.train_from_iterator(dataset[lang], trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
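# Hypothetical usage sketch (not part of the original source): a tiny in-memory
# dataset shows the effect of the [CLS]/[SEP] post-processor after training.
toy_dataset = {"en": ["hello world", "machine translation is fun"] * 50}
toy_tokenizer = train_tokenizer("en", toy_dataset, vocab_size=300)
encoded = toy_tokenizer.encode("hello world")
print(encoded.tokens)  # e.g. ['[CLS]', 'hello', 'world', '[SEP]']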
def prepare_trainer(self):
    return BpeTrainer(vocab_size=30000,
                      show_progress=True,
                      min_frequency=2,
                      special_tokens=[
                          "<s>",
                          "<pad>",
                          "</s>",
                          "<unk>",
                          "<mask>",
                      ])
def generate_tokenizer(equations, output, vocab_size):
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    trainer = BpeTrainer(special_tokens=["[PAD]", "[BOS]", "[EOS]"],
                         vocab_size=vocab_size,
                         show_progress=True)
    tokenizer.train(trainer, equations)
    tokenizer.save(path=output, pretty=False)
def load_or_train_tokenizer(file_paths, tokenizer_mode_path):
    '''
    Tries to load the saved text tokenizer.
    If there is none, trains a new tokenizer and saves it.
    '''
    if not os.path.exists(tokenizer_mode_path):
        print('Tokenizer model not found, training one')

        from tokenizers.models import BPE
        from tokenizers import Tokenizer
        from tokenizers.decoders import ByteLevel as ByteLevelDecoder
        from tokenizers.normalizers import NFKC, Sequence
        from tokenizers.pre_tokenizers import ByteLevel
        from tokenizers.trainers import BpeTrainer

        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Sequence([
            NFKC()
        ])
        tokenizer.pre_tokenizer = ByteLevel()
        tokenizer.decoder = ByteLevelDecoder()

        trainer = BpeTrainer(
            vocab_size=50000,
            show_progress=True,
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>"
            ]
        )
        tokenizer.train(file_paths, trainer)

        if not os.path.exists(tokenizer_mode_path):
            os.makedirs(tokenizer_mode_path)
        tokenizer.model.save(tokenizer_mode_path, None)

    print('Loading trained tokenizer model')
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_mode_path)
    tokenizer.add_special_tokens({
        'eos_token': '</s>',
        'bos_token': '<s>',
        'unk_token': '<unk>',
        'pad_token': '<pad>',
        'mask_token': '<mask>'
    })
    return tokenizer
def bpe_train(self, paths):
    trainer = BpeTrainer(vocab_size=50000,
                         show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=[
                             "<s>",
                             "<pad>",
                             "</s>",
                             "<unk>",
                             "<mask>",
                             "<company>",
                             "<label>",
                             "<category>",
                             "<review>",
                         ])
    self.tokenizer.train(trainer, paths)
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)
    if save_tokenizer:
        tokenizer.save(tokenizer_filename)
    return tokenizer
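# Hypothetical usage sketch (not from the original source): write two tiny
# sample files matching the default paths, train a small vocabulary, then
# reload the saved tokenizer.
with open("asr.txt", "w") as f:
    f.write("hello world\n" * 20)
with open("ttx.txt", "w") as f:
    f.write("hello there world\n" * 20)
tok = create_train_bpe_tokenizer(300, tokenizer_filename="tokenizer-test.json")
print(Tokenizer.from_file("tokenizer-test.json").encode("hello world").tokens)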
def get_tokenizer_trainer():
    # START init_tokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    tokenizer = Tokenizer(BPE())
    # END init_tokenizer
    # START init_trainer
    from tokenizers.trainers import BpeTrainer

    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    # END init_trainer
    # START init_pretok
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer.pre_tokenizer = Whitespace()
    # END init_pretok
    return tokenizer, trainer
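# Hypothetical usage sketch (not from the original source): train the returned
# tokenizer on a tiny in-memory corpus and inspect one encoding.
tokenizer, trainer = get_tokenizer_trainer()
tokenizer.train_from_iterator(["a small sample corpus", "another sample line"], trainer)
print(tokenizer.encode("a sample line").tokens)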
def build_new_vocab():
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()
    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"
    with open(files) as f:
        file = json.load(f)
    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])
    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = [k for k, v in tokenizer.get_vocab().items()]
    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"

path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())

tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True,
                     initial_alphabet=ByteLevel.alphabet(),
                     min_frequency=2,
                     special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"])

tokenizer.train(trainer, [path_train_src, path_train_tgt])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(path_data)
def tokenizer_pipeline():
    tokenizer = Tokenizer(BPE())
    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer


if __name__ == "__main__":
    # preparing corpus for wiki
    en_vocab_size = 50257
    wiki_txt = load_text_file_json('text/AA/wiki_00.json', 'text')
    write_text_file(wiki_txt, 'wiki-corpus.txt')
    corpus_files = {
        'wiki-corpus': 'wiki-corpus.txt',
        'oscar-corpus': 'shuff-dedup/ceb/ceb_dedup.txt'
    }
    # define a trainer for the tokenizer
    trainer = BpeTrainer(vocab_size=en_vocab_size,
                         show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=['<|endoftext|>', '<pad>'])
    for corpus, path in corpus_files.items():
        tokenizer = tokenizer_pipeline()
        tokenizer.train([path], trainer)
        tokenizer.save(f'model/{corpus}-tokenizer.json')
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
# from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# tokenizer.pre_tokenizer = Whitespace()

files = ['./processed/processed_wiki_ko.txt']
tokenizer.train(files, trainer)
tokenizer.save("wiki_tokenizer.json")
# WordPiece tokenizer (tokenizer initialized above)
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'bpe_60k.json'))

# Unigram tokenizer
tokenizer = Tokenizer(Unigram())
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = UnigramTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=50000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'unigram_50k.json'))
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Our tokenizer also needs a pre-tokenizer responsible for converting the input
# to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# And finally, let's plug in a decoder so we can recover the original text from
# a tokenized input.
tokenizer.decoder = ByteLevelDecoder()

from tokenizers.trainers import BpeTrainer

# We initialize our trainer, giving it the details about the vocabulary we want
# to generate.
trainer = BpeTrainer(vocab_size=25000, show_progress=True,
                     initial_alphabet=ByteLevel.alphabet())
tokenizer.train(trainer, ["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

# Et voilà! You trained your very first tokenizer from scratch using tokenizers.
# Of course, this covers only the basics, and you may want to have a look at the
# add_special_tokens or special_tokens parameters on the Trainer class, but the
# overall process should be very similar.

# You will see the generated files in the output.
tokenizer.model.save('/Volumes/750GB-HDD/root/Question-Answering/pyData')
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer in the tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' % tpath_expected)

    if train:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        #  - init Tokenizer using the algo
        #  - specify an algo-specific trainer
        #  - specify any pre-processing of text (will affect decoding)
        #    see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        #  - different training calls if it's the arxiv dataset or wikitext
        #    see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
parser.add_argument('--languages', help='dataset languages to tokenize',
                    type=str, required=True)
parser.add_argument('--tokenizer-out', help='tokenizer output file',
                    type=str, required=True)
parser.add_argument('--special-tokens', type=str,
                    default="[UNK],[SEP],[PAD],[MASK],[ECHO],[TRANSLATE]")
args = parser.parse_args()

# translation_dataset = load_dataset(args.dataset, args.languages)
# translation_dataset.set_format(columns='translation')
translation_dataset = NewsCommentaryTranslationDataset()

tokenizer_file = args.tokenizer_out
special_tokens = args.special_tokens.split(",")

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=special_tokens)

all_translation_sentences = map(
    lambda x: [x['translation'][lang] for lang in x['translation'].keys()],
    translation_dataset)
tokenizer.train_from_iterator(all_translation_sentences, trainer=trainer)
tokenizer.save(tokenizer_file)
print(f"Unique names: {len(names)}\n") name_words = {n: " ".join(split_to_words(n)) for n in names} with open(f"{proc_path}/names.txt", "w") as f: f.write("\n".join(list(name_words.values()))) # f.write("\n".join(words)) tokenizer = Tokenizer(BPE()) tokenizer.normalizer = Sequence([ # NFKC(), Lowercase() ]) tokenizer.pre_tokenizer = ByteLevel() tokenizer.decoder = ByteLevelDecoder() trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True) tokenizer.train(trainer, [f"{proc_path}/names.txt"]) print("Trained vocab size: {}".format(tokenizer.get_vocab_size())) tokenizer.model.save(proc_path) tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json', f'{proc_path}/merges.txt') with open(f"{proc_path}/vocab.json", "r") as f: bpe_vocab = json.load(f) bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()} char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
if __name__ == "__main__": args = parser.parse_args() for f in ['ewe-fon', "ewe", "fon"]: # instantiate tokenizer tokenizer = Tokenizer(BPE(unk_token="[UNK]")) # splitting our inputs into words tokenizer.pre_tokenizer = Whitespace() # instantiate trainer trainer = BpeTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], min_frequency=2) # get files files = [os.path.join(args.data_dir, f"{f}-sentences.txt")] # train tokenizer tokenizer.train(files=files, trainer=trainer) # save tokenizer config file tokenizer.save(os.path.join(args.save_dir, f"tokenizer-{f}.json")) # load trained tokenizers for f in ['ewe-fon', "ewe", "fon"]: print(f'Using {f} tokenizer : \n') try:
import fire
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, Strip


def train(dataset_path, output_dir='data/tokenizer/', vocab_size=30_000, min_frequency=3):
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([Lowercase(), Strip()])
    files = [dataset_path]
    tokenizer.train(trainer, files)
    files = tokenizer.model.save(output_dir)
    tokenizer.model = BPE.from_file(*files, unk_token='[UNK]')
    tokenizer.save(f'{output_dir}/tokenizer.json')


if __name__ == '__main__':