def train_tokenizer(args):
    """Train a tokenizer.

    Arguments:
        args {dictionary} -- arguments object
    """
    # Tokenizer train
    morpheme_func = None
    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type: choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = ByteLevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        # CharDelimiterSplit must be instantiated with a delimiter character
        # (a space is assumed here).
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace()
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    # Pass the configured trainer explicitly; otherwise a default trainer is used.
    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func), trainer)
    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")

    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)
    for line in datasets:
        print(line)
        break
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    subset = self._create_subset_file(afm, corpus)

    # Create WordPiece model with a normalizer and pre-tokenizer. Note that
    # BERT-specific normalizer and pre-tokenizer are used in this model.
    tokenizer = Tokenizer(WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Train tokenizer model with subset of corpus.
    trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=self.limit_alphabet,
                               special_tokens=[self.unk_token] + self.special_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset.name])

    # Save trained vocabulary to an auxiliary output file.
    vocab = afm.create()
    tokenizer.model.save(os.path.dirname(vocab.name))
    os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
              vocab.name)

    return vocab
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.
    :param tokenizer_location: JSON file containing information about the tokenizer.
    :return:
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train")
        utterances = [
            special_tokens["sep_token"].join(dialogue["dialog"])
            for dialogue in dataset_train
        ]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            # The trainer expects a list of tokens, not a dict view.
            special_tokens=list(token_utils.special_tokens.values()),
        )

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))
        custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer)
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to file.
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""
    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # Input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing.
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.mkdir('./Tokenizer')
    bert_tokenizer.save("Tokenizer/tokenizer.json")
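A minimal usage sketch (not part of the original source): once train() has written "Tokenizer/tokenizer.json", the serialized tokenizer can be reloaded and applied without retraining. Only public tokenizers APIs are used; the sample sentence is illustrative.

from tokenizers import Tokenizer

reloaded = Tokenizer.from_file("Tokenizer/tokenizer.json")
encoding = reloaded.encode("patient admitted with chest pain")
print(encoding.tokens)                # word pieces wrapped in [CLS] ... [SEP]
print(encoding.ids)                   # integer ids fed to the model
print(reloaded.decode(encoding.ids))  # back to normalized text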
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))

    # Split each session's queries into two halves and join each half into a
    # single corpus line.
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'], as_index=False,
                    sort=False)['query_text'].agg(' '.join).drop(columns='separate_column')
    df.query_text.to_csv(corpus_file, header=False, index=False)

    # Dump the vocabulary from the saved tokenizer JSON to a plain vocab.txt.
    with open(token_file) as token_f:
        jdata = json.load(token_f)
    with open(vocab_file, "w") as fd:
        for k in jdata['model']['vocab'].keys():
            print(k, file=fd)
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(trainer, files)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train(trainer, [filename])

    os.remove(filename)
    return tokenizer
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
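A hedged usage sketch for the function above, assuming it is imported together with its tokenizers dependencies; the sample sentences and the small vocab_size are illustrative only.

sentences = ["the cat sat on the mat", "the dog chased the cat"]
tok = train_tokenizer(sentences, serialize_path="", vocab_size=100)

# Single sequences are wrapped as [CLS] ... [SEP] by the TemplateProcessing above.
single = tok.encode("the cat")
print(single.tokens)

# Sentence pairs additionally receive the "$B:1 [SEP]:1" segment with type id 1.
pair = tok.encode("the cat", "the dog")
print(pair.tokens)
print(pair.type_ids)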
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)

    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
# Sanity-check pass over the open file handle `fin`: flag any node values that
# still contain commas, since "," is used as the token delimiter below.
for line in tqdm(fin):
    dp = json.loads(line.strip())
    for d in dp:
        if "value" in d:
            if "," in d["value"]:
                print('Not cleaned up')

# Extract value/types from trees and store in comma separated raw file (all_raw.json)
with open("output/all_new_trees.json") as fin, open("output/all_raw.json", "w") as fout:
    for i, line in enumerate(tqdm(fin)):
        dp = json.loads(line)
        token_list = []
        for d in dp:
            if "value" in d:
                token_list.append(d["value"])
            elif "type" in d:
                token_list.append(d["type"])
        raw = ",".join(token_list)
        print(json.dumps(raw), file=fout)

# Train tokenizer on raw file
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=",")
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]"])
tokenizer.train(["output/all_raw.json"], trainer)
tokenizer.save("output/tokenizer.json")
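A brief sketch (assumption, not from the original script) of applying the tokenizer trained above to one comma-separated token sequence; the sample line is made up.

from tokenizers import Tokenizer

tok = Tokenizer.from_file("output/tokenizer.json")

# Each corpus line is a comma-joined sequence of tree node types/values, so the
# CharDelimiterSplit pre-tokenizer splits on "," before WordPiece is applied.
sample = "Module,FunctionDef,name,arguments,Return"
encoding = tok.encode(sample)
print(encoding.tokens)
print(encoding.ids)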
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'bpe_60k.json'))
def train_tokenizer(
        input_file: str,
        vocab_file: str,
        temporary: str,
        subset_size: int = 512000000,
        vocab_size: int = 8000,
        limit_alphabet: int = 6000,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Train **WordPiece** tokenizer and save trained subword vocabulary.

    Note:
        Since tokenizers_ reads whole file data in training, this function
        could run into memory errors if `input_file` is too large. Under the
        assumption that `input_file` is shuffled randomly, a subset of the
        input corpus will be used in training.

    Caution:
        The subset of the input corpus is saved in the `temporary` directory.
        Please be careful not to delete the file while executing this function.

    Arguments:
        input_file (str): Input file path.
        vocab_file (str): Output vocabulary file path.
        temporary (str): Temporary directory where the subset of corpus would
            be saved.
        subset_size (int): The maximum number of lines in the subset.
        vocab_size (int): The number of subwords in the vocabulary.
        limit_alphabet (int): The maximum number of alphabets in vocabulary.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.

    .. _tokenizers: https://github.com/huggingface/tokenizers
    """
    # Create **WordPiece** model and add normalizer and pre-tokenizer.
    # BERT-specific normalizer and pre-tokenizer are used.
    tokenizer = Tokenizer(models.WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Split the head of input corpus file and save in `temporary` directory.
    subset_file = random_filename(temporary)
    _split_subset_from_file(input_file, subset_file, subset_size)

    # Train the model with the split subset of the corpus.
    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=limit_alphabet,
                               special_tokens=[unk_token] + control_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset_file])

    # Save trained subword vocabulary in `temporary` directory and rename to
    # `vocab_file`.
    tokenizer.model.save(temporary)
    os.rename(os.path.join(temporary, 'vocab.txt'), vocab_file)

    # Remove temporary subset corpus.
    os.remove(subset_file)
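Because the function above exports a bare vocab.txt rather than a full tokenizer JSON, reusing it means rebuilding the pipeline around a fresh WordPiece model. A minimal sketch under that assumption, with the '<unk>' token matching the default unk_token above:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer

# Load the exported vocabulary into a WordPiece model and reattach the same
# BERT-style normalizer/pre-tokenizer used during training.
tokenizer = Tokenizer(WordPiece.from_file('vocab.txt', unk_token='<unk>'))
tokenizer.normalizer = BertNormalizer(strip_accents=False)
tokenizer.pre_tokenizer = BertPreTokenizer()

print(tokenizer.encode("hello world").tokens)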
def prepare_trainer(self):
    return WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        min_frequency=2,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
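The method above only constructs the trainer; a hedged sketch of how such a trainer is typically paired with a WordPiece tokenizer and an in-memory corpus (the tokenizer setup and toy corpus here are assumptions, not taken from the surrounding class):

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

trainer = WordPieceTrainer(
    vocab_size=30522,
    show_progress=True,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(["a toy corpus line", "another line"], trainer)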
from tokenizers.trainers import WordPieceTrainer
from tokenizers import Tokenizer
from tokenizers.models import BPE
import configs
import os

tokenizer = Tokenizer(BPE())
# Note: recent tokenizers releases require a BpeTrainer for a BPE model;
# pairing it with a WordPieceTrainer is only accepted by older versions.
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])


def main():
    tokenizer.train(trainer, [configs.data.raw_cut])
    tokenizer.save(os.path.join(configs.data.path, 'bpe.vocab'))
    print(f"save to {configs.data.path}")


def train_with_sentencepiece(vocab_size: int = 3000,
                             num_threads=2,
                             character_coverage=0.98):
    os.system(
        f"spm_train --input={configs.data.raw_cut} --model_prefix=spiece --model_type=bpe --character_coverage={character_coverage} --vocab_size={vocab_size} --num_threads={num_threads}"
    )
    os.system(f"mv spiece.model {configs.data.path}")


if __name__ == '__main__':
    train_with_sentencepiece()
import fire
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(trainer, files)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)