def train_BPE_tokenizer(self) -> None:
    """Train a byte-level BPE tokenizer and write its model files.

    Trains on ``./train.txt`` and ``./test.txt`` with a vocabulary size of
    10000 and ``[PAD]`` as the only special token, then saves the model
    files under ``nlpbook/bbpe``.
    """
    corpus_files = ['./train.txt', './test.txt']
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=corpus_files,
        vocab_size=10000,
        special_tokens=["[PAD]"],
    )
    bpe.save_model("nlpbook/bbpe")
def create_tokenizer(args):
    """Train a byte-level BPE tokenizer and store it next to a config file.

    Args:
        args: Object providing ``file`` (the training text file),
            ``vocab_size`` and ``store_files`` (the location handed to
            ``tokenizer.save`` and used as the folder of
            ``tokenizer_config.json``).
    """
    # Location for everything this function writes.
    directory = args.store_files

    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=[args.file],
        vocab_size=args.vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
    )

    # Persist the trained tokenizer, then a minimal config beside it.
    bpe.save(directory)
    config_path = "{}/tokenizer_config.json".format(directory)
    with open(config_path, 'w') as handle:
        json.dump({"max_len": 512}, handle)
def _create_train_files_and_regenerate_vocab():
    """Split ``train.txt`` into chunks and retrain the tokenizer on them.

    Steps, each one only attempted when the previous shell command reports
    ``ok``:

    1. ``split`` ``train.txt`` into 1,000,000-line pieces.
    2. Recreate the ``td`` directory and move the pieces into it.
    3. Train a byte-level BPE tokenizer on ``td/*.txt`` and save it into a
       freshly recreated ``codeBERT`` directory.
    """

    def _recreate_dir(name):
        # Remove a previous directory if there is one, then create it empty.
        try:
            shutil.rmtree(name)
        except FileNotFoundError:
            pass
        os.mkdir(name)

    print("pass")
    split_result = run("split -l1000000 train.txt --verbose")
    if not split_result.ok:
        return
    print("Train splits generated")

    _recreate_dir("td")
    move_result = run(
        "mv xaa td/xaa.txt | mv xab td/xbb.txt | mv xac td/xac.txt | mv xad td/xad.txt | mv xae td/xae.txt | mv xaf td/xaf.txt"
    )
    if not move_result.ok:
        return

    chunk_files = [str(chunk) for chunk in Path(".").glob("td/*.txt")]
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=chunk_files,
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )

    _recreate_dir("codeBERT")
    bpe.save("codeBERT")
def train_tokenizer(data_path, wiki_text_file_path):
    """Load the pretrained GPT-2 tokenizer and train a second one from text.

    The second tokenizer is a byte-level BPE model trained on
    ``wiki_text_file_path`` with the same vocabulary size as the pretrained
    one. It is saved under ``data_path / "BLBPE_tokenizer_es"`` and reloaded
    as a ``GPT2TokenizerFast``.

    Args:
        data_path: Base path supporting ``/`` joining and ``mkdir`` on the
            result (presumably a ``pathlib.Path`` -- confirm with callers).
        wiki_text_file_path: Training text file; converted with ``str``.

    Returns:
        Tuple ``(tokenizer_en, tokenizer_es)`` of the pretrained and the
        newly trained tokenizer.
    """
    # ToDo := Load if weights exists, else setup
    max_length = 1024

    pretrained = GPT2TokenizerFast.from_pretrained("gpt2")
    pretrained.pad_token = pretrained.eos_token

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=[str(wiki_text_file_path)],
        vocab_size=pretrained.vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN],
    )
    bpe.enable_truncation(max_length=max_length)

    # Write the model files, then reload them through the GPT-2 fast class.
    target_dir = data_path/"BLBPE_tokenizer_es"
    target_dir.mkdir(exist_ok=True, parents=True)
    bpe.save_model(str(target_dir))

    trained = GPT2TokenizerFast.from_pretrained(
        str(target_dir), pad_token=EOF_TOKEN
    )
    trained.model_max_length = max_length

    return pretrained, trained
def build_tokenizer(data_path, save_path):
    r"""
    Train a byte-level BPE tokenizer on a corpus and save it.

    Args:
        data_path (:obj:`str`):
            Path to the data corpus, passed to the trainer as ``files``
        save_path (:obj:`str`):
            Path handed to ``tokenizer.save`` for the trained tokenizer
    """
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=data_path,
        vocab_size=52000,
        min_frequency=2,
        special_tokens=special_tokens,
    )
    bpe.save(save_path)
def train_tokenizer(paths, vocab_size=21128, min_frequency=2):
    """
    Train a tokenizer and save it locally; this may take a long time on
    large data.

    Args:
        paths: Training text file path(s), passed to the trainer as ``files``
        vocab_size: Size of the vocabulary
        min_frequency: Minimum frequency passed to the trainer

    Returns:
        The trained tokenizer object; its model files are saved to ``data``
    """
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=paths,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=special_tokens,
    )

    # Save the tokenizer model files into the "data" directory.
    bpe.save_model("data")
    return bpe
def main():
    """Train a byte-level BPE tokenizer on all ``.txt`` files of a directory.

    Command line:
        text_dir: Directory searched recursively for ``*.txt`` files.
        -t / --tokenizer_path: Existing directory the model files are saved
            to (defaults to ``TOKENIZER_PATH``).

    If the tokenizer directory does not exist, a message is printed and
    nothing is trained. If it exists but ``text_dir`` was omitted, the
    parser reports a usage error instead of failing later with a
    ``TypeError`` from ``Path(None)``.
    """
    parser = ArgumentParser(description="Training tokenizer on text files.")
    parser.add_argument("text_dir", nargs="?",
                        help="Path to the directory containgin the text files (any .txt file).")
    parser.add_argument("-t", "--tokenizer_path", default=TOKENIZER_PATH,
                        help="Path to the saved trained tokenizer.")
    args = parser.parse_args()

    tokenizer_path = args.tokenizer_path
    if not Path(tokenizer_path).exists():
        print(f"{tokenizer_path} does not exists, will not be able to save tokenizer. Create dir first and re-run the command.")
        return

    text_dir = args.text_dir
    if text_dir is None:
        # The positional is optional for argparse (nargs="?"), but training
        # cannot proceed without it.
        parser.error("text_dir is required to train the tokenizer.")

    paths = [str(x) for x in Path(text_dir).glob("**/*.txt")]

    tokenizer = ByteLevelBPETokenizer()
    # NOTE(review): this assigns the ByteLevel class itself (not an instance)
    # to an attribute of the wrapper object -- confirm it has any effect.
    tokenizer.pre_tokenizer = ByteLevel

    tokenizer.train(
        files=paths,
        vocab_size=config.vocab_size,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",  # probably not needed if using ByteLevel pretokenization
            "<mask>",
        ],
    )
    tokenizer.save_model(tokenizer_path)
def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a lowercasing byte-level BPE tokenizer and save its model files.

    Args:
        input_file_path: Single training text file.
        vocab_size: Vocabulary size passed to the trainer.
        output_path: Directory passed to ``save_model``.

    Returns:
        None.
    """
    bpe = ByteLevelBPETokenizer(lowercase=True)
    bpe.enable_padding()
    bpe.train(
        [input_file_path],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=["PAD"],
    )
    bpe.save_model(output_path)
    return None
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Train a byte-level BPE tokenizer on text file(s) and save it.

    See: https://huggingface.co/blog/how-to-train

    :param files: path to file(s) to train the tokenizer on
    :param dropout: dropout value passed to the tokenizer constructor
    :param vocab_size: final vocabulary size; the trainer receives this
        value minus the number of added tokens
    :param min_frequency: minimum frequency passed to the trainer
    :param save_path: directory for the vocab/merges files (used only when
        ``serialize`` is False)
    :param added_tokens: tokens added to the tokenizer after training
    :param bos_token: beginning-of-string special token
    :param eos_token: end-of-string special token
    :param unk_token: unknown special token
    :param serialize: when True, save a single ``aitextgen.tokenizer.json``
        file instead of separate vocab and merges files
    """
    assert isinstance(files, (str, list)), "files must be a string or a list."
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    bpe = ByteLevelBPETokenizer(dropout=dropout)
    bpe.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )
    bpe.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = save_path if save_path != "" else "the current directory"

    if not serialize:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            "You will need both files to build the GPT2Tokenizer.")
        bpe.save_model(save_path, PREFIX)
        return

    logger.info(
        f"Saving {PREFIX}.tokenizer.json to {save_path_str}. "
        "You will need this file to build the GPT2Tokenizer.")
    bpe.save(f"{PREFIX}.tokenizer.json")
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    """Train a byte-level BPE tokenizer on the card files and save it.

    Args:
        files: Training text files.
        output_dir: Directory passed to ``save_model``.
    """
    all_special_tokens = SPECIAL_TOKENS + OTHER_TOKENS

    bpe = ByteLevelBPETokenizer()
    # NOTE(review): this sets an attribute on the wrapper object; confirm it
    # actually changes the pre-tokenizer used for training.
    bpe.pre_tokenizer = Whitespace()

    bpe.train(files=files, special_tokens=all_special_tokens)
    bpe.save_model(output_dir)
def main():
    """Command-line entry point: train a byte-level BPE tokenizer.

    Options:
        --train_data_file: A ``.txt`` file, or a directory that is searched
            recursively for ``*.txt`` files (required).
        --output_dir: Directory the model files are written to; created if
            missing (required).
        --vocab_size: Maximum vocabulary size (default 5000).
        --min_freq: Minimum number of occurrences (default 2).
    """
    cli = argparse.ArgumentParser()

    # Required parameters
    cli.add_argument(
        "--train_data_file",
        default=None,
        type=str,
        required=True,
        help="The input training data file or a path to a directory "
             "with multiple training data files.",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the tokenizer model will be written.",
    )

    # Optional parameters
    cli.add_argument("--vocab_size", default=5000, type=int,
                     help="Vocabulary maximum size, default 5000.")
    cli.add_argument("--min_freq", default=2, type=int,
                     help="Minimum number of occurrences, default 2")

    args = cli.parse_args()

    bpe = ByteLevelBPETokenizer()

    # A ".txt" argument is used as-is; anything else is treated as a
    # directory tree to search.
    training_input = os.path.abspath(args.train_data_file)
    if not args.train_data_file.endswith(".txt"):
        training_input = [
            str(found) for found in Path(training_input).glob("**/*.txt")
        ]

    bpe.train(
        files=training_input,
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    bpe.add_special_tokens(["<x>", "<z>"])

    # Save files to disk
    target_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    bpe.save_model(target_dir)
def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
    """
    Train a new byte-level BPE tokenizer on `train_files`.

    Args:
        - train_files: File or list of files used to train the tokenizer.
        - tokenizer_name: Only used in the completion log message.
        - output_dir (optional): Where the tokenizer is saved. Falls back to
          self.args['output_dir'] when not given.
        - use_trained_tokenizer (optional): When true, the reloaded tokenizer
          replaces self.tokenizer and the model's token embeddings are
          resized to match it.

    Returns: None
    """
    if not isinstance(train_files, list):
        train_files = [train_files]

    if not output_dir:
        output_dir = self.args["output_dir"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=train_files,
        vocab_size=self.args["vocab_size"],
        min_frequency=self.args["min_frequency"],
        special_tokens=self.args["special_tokens"],
    )

    os.makedirs(output_dir, exist_ok=True)
    bpe.save(output_dir)
    logger.info(" Training of {} tokenizer complete. Saved to {}.".format(
        tokenizer_name, output_dir))

    # Reload what was just saved through the tokenizer class registered for
    # the configured model type.
    _config_cls, _model_cls, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
    reloaded = tokenizer_class.from_pretrained(output_dir)

    if not use_trained_tokenizer:
        return

    self.tokenizer = reloaded
    self.args["tokenizer_name"] = output_dir
    try:
        # Use the wrapped ``module`` when the model exposes one.
        target_model = getattr(self.model, "module", self.model)
        target_model.resize_token_embeddings(len(self.tokenizer))
    except AttributeError:
        pass
def prepare_data(self):
    """Train and save the tokenizer unless its path already exists.

    Reads ``self.files``, ``self.max_vocab_size``, ``self.min_frequency``,
    ``self.special_tokens`` and ``self.tokenizer_name_or_path``.
    """
    target = Path(self.tokenizer_name_or_path)
    if target.exists():
        # Something is already stored at the target path; nothing to do.
        return

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        self.files,
        vocab_size=self.max_vocab_size,
        min_frequency=self.min_frequency,
        special_tokens=self.special_tokens,
    )

    target.mkdir(parents=True, exist_ok=True)
    bpe.save_model(self.tokenizer_name_or_path)
def build_tokenizer(file_paths, vocab_size, output_file="UNKNOWN_BERT_tokenizer"):
    """Train a byte-level BPE tokenizer and save it in the current directory.

    Args:
        file_paths: Training file path(s), passed to the trainer as ``files``.
        vocab_size: Vocabulary size passed to the trainer.
        output_file: Second argument to ``save_model`` (the file-name prefix).
    """
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=file_paths,
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
    )
    bpe.save_model(".", output_file)
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    prefix: str = "aitextgen",
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = True,
    trim_offsets: bool = True,
) -> None:
    """
    Train a byte-level BPE tokenizer on text file(s) and save it.

    See: https://huggingface.co/blog/how-to-train

    :param files: path to file(s) to train the tokenizer on
    :param dropout: dropout value passed to the tokenizer constructor
    :param vocab_size: final vocabulary size
    :param min_frequency: minimum frequency passed to the trainer
    :param prefix: file name prefix of the saved tokenizer
    :param save_path: directory for the vocab/merges files (used only when
        ``serialize`` is False)
    :param added_tokens: extra tokens appended to the special tokens
    :param bos_token: beginning-of-string special token
    :param eos_token: end-of-string special token
    :param unk_token: unknown special token
    :param serialize: when True, save a single ``{prefix}.tokenizer.json``
        file instead of separate vocab and merges files
    :param trim_offsets: passed to the tokenizer constructor
    """
    assert isinstance(files, (str, list)), "files must be a string or a list."
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    bpe = ByteLevelBPETokenizer(dropout=dropout, trim_offsets=trim_offsets)
    all_special_tokens = [bos_token, eos_token, unk_token] + added_tokens
    bpe.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=all_special_tokens,
    )

    if not serialize:
        bpe.save_model(save_path, prefix)
        return

    bpe.save(f"{prefix}.tokenizer.json")
def save_sentense_piece_model():
    """Train a byte-level BPE tokenizer on ``./data/**/*.txt`` and save it.

    Prints the list of training files, trains with a vocabulary size of
    32000, and calls ``tokenizer.save(".", "ko")``.
    """
    corpus_files = [str(found) for found in Path("./data/").glob("**/*.txt")]
    print(corpus_files)

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=corpus_files,
        vocab_size=32000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    bpe.save(".", "ko")
def tokenize(filename, vocab_size):
    """Train a byte-level BPE tokenizer on ``filename`` and return it.

    Args:
        filename: Training file path(s), passed to the trainer as ``files``.
        vocab_size: Vocabulary size passed to the trainer.

    Returns:
        The trained tokenizer.
    """
    # Only the end-of-text marker is registered; other candidates such as
    # '<bos>', '<eos>', '<unk>', '<pad>', '<mask>' are not.
    special_tokens = ['<|endoftext|>']

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=filename,
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
    )

    # NOTE(review): ``corpus`` is not defined in this function; presumably a
    # module-level name -- confirm.
    bpe.save(corpus)
    return bpe
def generate_tokenizer(args):
    """Train byte-level BPE tokenizers per language or on the combined data.

    For every name suffix (``_combined`` when ``args.combined`` is set,
    otherwise ``_<language>`` for each requested language) and every size in
    ``args.sizes``, trains on ``data/train<suffix>_<size>.txt`` and saves the
    model files into ``tokenizer<suffix>``.

    Args:
        args: Object providing ``languages`` (``"all"`` expands to the six
            built-in languages), ``combined`` and ``sizes``.
    """
    langs = args.languages
    if "all" in langs:
        langs = ["python", "java", "javascript", "go", "ruby", "php"]

    # One suffix per output directory / input file family.
    if args.combined:
        suffixes = ["_combined"]
    else:
        suffixes = ["_{}".format(language) for language in langs]

    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    for lang in suffixes:
        for size in args.sizes:
            training_files = list(glob("data/train{}_{}.txt".format(lang, size)))

            bpe = ByteLevelBPETokenizer(lowercase=False)
            bpe.train(
                files=training_files,
                vocab_size=32000,
                min_frequency=3,
                special_tokens=list(special_tokens),
            )

            out_dir = "tokenizer{}".format(lang)
            os.makedirs(out_dir, exist_ok=True)
            bpe.save_model(out_dir)
def save_sentense_piece_model():
    """Train and save tokenizers for the Korean and English corpora.

    The same ``ByteLevelBPETokenizer`` instance is trained first on the
    Korean files and saved with name ``ko``, then trained on the English
    files and saved with name ``en``; both are written via
    ``tokenizer.save("./create_spm", <name>)``.
    """
    corpora = (
        ("ko", ['./data/korean-english-park.dev.ko',
                './data/korean-english-park.train.ko']),
        ("en", ['./data/korean-english-park.dev.en',
                './data/korean-english-park.train.en']),
    )
    special_token = ["<pad>", "<bos>", "<eos>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    for name, corpus_files in corpora:
        bpe.train(
            files=corpus_files,
            vocab_size=32000,
            min_frequency=2,
            special_tokens=special_token,
        )
        bpe.save("./create_spm", name)
def train_tokenizer(input_path, output_path, vocab_size=10000):
    """Train a byte-level BPE tokenizer, attach a post-processor and save it.

    After training, a ``BertProcessing`` post-processor is installed, built
    from the ``</s>`` token and its id followed by the ``<s>`` token and its
    id.

    Args:
        input_path: Single training text file.
        output_path: Directory passed to ``save_model``.
        vocab_size: Vocabulary size passed to the trainer.

    Returns:
        The trained tokenizer.
    """
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=[input_path],
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "<s>", "</s>", "<unk>"],
    )

    end_pair = ("</s>", bpe.token_to_id("</s>"))
    start_pair = ("<s>", bpe.token_to_id("<s>"))
    bpe._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

    bpe.save_model(output_path)
    return bpe
def create_tokenizer(self):
    """Train a byte-level BPE tokenizer from the instance settings and save it.

    Reads ``self.files``, ``self.vocab_size``, ``self.min_frequency``,
    ``self.special_tokens`` and ``self.save_directory``; the save directory
    is created when it does not exist.

    Returns:
        The trained tokenizer.
    """
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=self.files,
        vocab_size=self.vocab_size,
        min_frequency=self.min_frequency,
        special_tokens=self.special_tokens,
    )

    # Single-argument join is kept as in the original code.
    out_dir = os.path.join(self.save_directory)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    bpe.save_model(out_dir)

    return bpe
def get_tokenizer(train_data, vocab_size):
    """
    Trains and returns a byte-level BPE tokenizer. If a cached tokenizer with
    these parameters exists it is loaded instead of training a new tokenizer.

    The cache key is the MD5 of the vocab size and the training file names
    (not their contents), and the cache lives in
    ``CACHE_DIR/tokenizer_<key>``.

    :param train_data: list of dataset files
    :param vocab_size: BPE vocab size
    :return: GPT2TokenizerFast with the requested parameters.
    :raises AssertionError: if ``vocab_size`` is below 257.
    """
    assert vocab_size >= 257, 'vocab size must cover all possible bytes and one special token'

    # calculate the name of the cached file
    digest = hashlib.md5()
    digest.update(str(vocab_size).encode())
    for file_name in train_data:
        digest.update(file_name.encode())
    cache_id = digest.hexdigest()

    cached_tokenizer_file = os.path.join(CACHE_DIR, 'tokenizer_{}'.format(cache_id))
    train_new_tokenizer = not os.path.exists(cached_tokenizer_file)

    if train_new_tokenizer:
        start = time.time()
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            train_data,
            vocab_size=vocab_size,
            special_tokens=['<|endoftext|>'],
            show_progress=False,
        )
        # Create the cache directory only after training succeeded, so a
        # failed training run does not leave an empty directory behind that
        # later calls would mistake for a valid cache entry.
        os.makedirs(cached_tokenizer_file, exist_ok=True)
        tokenizer.save_model(cached_tokenizer_file)
        # Pass the path as a logging argument: a "%" inside the path must not
        # be interpreted as a format directive.
        logger.info("Trained tokenizer %s [took %.3f s]",
                    cached_tokenizer_file, time.time() - start)

    start = time.time()
    tokenizer = GPT2TokenizerFast.from_pretrained(cached_tokenizer_file)
    tokenizer.cache_id = cache_id

    if not train_new_tokenizer:
        logger.info("Loaded tokenizer from %s [took %.3f s]",
                    cached_tokenizer_file, time.time() - start)

    return tokenizer
def save_tmp_tokenizer():
    """Train a small byte-level BPE tokenizer and save it to the temp path.

    Trains on ``dataset_path / 'oscar.eo.1000.txt'`` with a vocabulary size
    of 10000 and writes the model files to ``tokenizer_tmp_path``, creating
    that directory if needed. Both paths come from outside this function.
    """
    corpus_files = [str(dataset_path / 'oscar.eo.1000.txt')]
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=corpus_files,
        vocab_size=10_000,
        min_frequency=2,
        special_tokens=special_tokens,
    )

    # Save files to disk
    tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
    bpe.save_model(str(tokenizer_tmp_path))
def train_tokenizer(data_file_paths, vocab_size):
    """Train the tokenizer selected by ``tokenizer_type`` and return it.

    ``tokenizer_type`` is read from outside this function: ``'byte'``
    selects ``ByteLevelBPETokenizer``, ``'char'`` selects
    ``CharBPETokenizer``, anything else selects ``BertWordPieceTokenizer``
    (which also switches to BERT-style special tokens).

    Each trainer only receives the keyword arguments it supports:
    ``limit_alphabet`` is not sent to the byte-level trainer and
    ``wordpieces_prefix`` is only sent to the WordPiece trainer, so the
    byte and char types no longer fail with ``TypeError``.

    Args:
        data_file_paths: Training file path(s), passed as ``files``.
        vocab_size: Vocabulary size passed to the trainer.

    Returns:
        The trained tokenizer.
    """
    train_kwargs = {
        "files": data_file_paths,
        "vocab_size": vocab_size,
        "min_frequency": 2,
        "show_progress": True,
        "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    }

    if tokenizer_type == 'byte':
        t = ByteLevelBPETokenizer()
    elif tokenizer_type == 'char':
        t = CharBPETokenizer()
        train_kwargs["limit_alphabet"] = 1000
    else:
        t = BertWordPieceTokenizer()
        train_kwargs["special_tokens"] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        train_kwargs["limit_alphabet"] = 1000
        train_kwargs["wordpieces_prefix"] = "##"

    t.train(**train_kwargs)
    return t
def _bbpe(self):
    """Build a byte-level BPE tokenizer from ``self.conf`` and train it.

    Constructor options and training options are all read from
    ``self.conf``; the training files come from ``self.files``.

    Returns:
        The trained tokenizer.
    """
    conf = self.conf

    bpe = ByteLevelBPETokenizer(
        vocab=conf.vocab,
        merges=conf.merges,
        add_prefix_space=conf.add_prefix_space,
        lowercase=conf.lowercase,
        dropout=conf.dropout,
        unicode_normalizer=conf.unicode_normalizer,
        continuing_subword_prefix=conf.continuing_subword_prefix,
        end_of_word_suffix=conf.end_of_word_suffix,
        trim_offsets=conf.trim_offsets,
    )

    bpe.train(
        files=self.files,
        vocab_size=conf.vocab_size,
        min_frequency=conf.min_frequency,
        special_tokens=conf.bbpe_special_tokens,
    )

    return bpe
def main(args):
    """Train a byte-level BPE tokenizer and export it in three files.

    Writes ``<name>.json`` (the full serialized tokenizer),
    ``<name>-vocab.json`` (the vocabulary) and ``<name>-merges.txt`` (one
    merge per line).

    Args:
        args: Object providing ``input`` (training files separated by
            ``:``), ``vocab_size``, ``min_freq`` and ``name`` (output file
            prefix).
    """
    paths = args.input.split(":")

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
    )

    # Save files to disk
    tokenizer.save("{}.json".format(args.name), pretty=True)

    model_spec = json.loads(tokenizer.to_str())["model"]

    # json.dump escapes non-ASCII by default, but the encoding is pinned so
    # the output does not depend on the platform default.
    with open("{}-vocab.json".format(args.name), "w", encoding="utf-8") as fp:
        json.dump(model_spec["vocab"], fp, indent=4)

    # Merges may be serialized either as "a b" strings or as [a, b] pairs;
    # normalize both to the space-separated text form.
    merge_lines = [
        merge if isinstance(merge, str) else " ".join(merge)
        for merge in model_spec["merges"]
    ]
    # Byte-level BPE symbols are non-ASCII, so the merges file must be
    # written as UTF-8 rather than in the platform default encoding.
    with open("{}-merges.txt".format(args.name), "w", encoding="utf-8") as fp:
        fp.write("\n".join(merge_lines))
def get_french_vocab(model_name):
    """Train a byte-level BPE tokenizer on the French corpus and save it.

    The corpus is looked up at ``Datasets/corpora/fr/text`` three levels
    above the current working directory; every entry of every
    sub-directory there is used as a training file. After training, the
    tokens of a sample French sentence are printed.

    Args:
        model_name: Path passed to ``tokenizer.save``.
    """
    project_root = Path(os.getcwd()).parent.parent.parent
    corpus_root = os.path.join(project_root, "Datasets/corpora/fr/text")

    training_files = [
        os.path.join(corpus_root, sub_dir, entry)
        for sub_dir in os.listdir(corpus_root)
        for entry in os.listdir(os.path.join(corpus_root, sub_dir))
    ]

    bpe = ByteLevelBPETokenizer(add_prefix_space=True)
    # NOTE(review): this sets an attribute on the wrapper object; confirm it
    # actually changes the pre-tokenizer used for training.
    bpe.pre_tokenizer = Whitespace()

    bpe.train(
        training_files,
        vocab_size=20000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"],
    )

    sample = bpe.encode("c'est la meilleure des phrases françaises")
    print(sample.tokens)

    bpe.save(model_name)
def main(args):
    """Sample lines from the comment corpus and train a tokenizer on them.

    Reads ``corpus/comment/20190101_20200611_v2.txt``, writes a random
    subset of its lines to ``corpus/comment/sample.txt`` (seeded with 42 so
    the sample is reproducible), trains a byte-level BPE tokenizer on the
    sample and saves the model files into ``tokenizers``.

    Args:
        args: Object providing ``sample_rate`` (fraction of lines to keep),
            ``vocab_size`` and ``min_freq``.
    """
    # set the corpus
    random.seed(42)
    proj_dir = Path()
    tokenizers_dir = proj_dir / "tokenizers"
    tokenizers_dir.mkdir(parents=True, exist_ok=True)

    comment_dir = proj_dir / "corpus" / "comment"
    source_path = comment_dir / "20190101_20200611_v2.txt"
    sample_path = comment_dir / "sample.txt"

    # sampling source -- context managers guarantee both files are closed
    # (and the sample flushed) even when reading or writing fails.
    keep_threshold = 1 - args.sample_rate
    with open(source_path, mode="r", encoding="utf-8") as source_io, \
            open(sample_path, mode="w", encoding="utf-8") as sample_io:
        for line in source_io:
            if random.random() > keep_threshold:
                sample_io.write(line)

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)

    # Customize training
    tokenizer.train(
        files=str(sample_path),
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    )

    tokenizer.save_model(directory=str(tokenizers_dir))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Train a byte-level BPE tokenizer on the merged text and export it.

    Reads ``encoding``, ``text_token_size`` and ``end_token`` from
    ``exec_properties``, the ``model_dir`` and ``merged_text_dir`` artifacts
    from ``input_dict`` and the ``encoding_dir`` artifact from
    ``output_dict``.

    The tokenizer is trained once on every file found under the merged text
    directory. Its two model files are renamed to ``encoder.json`` and
    ``vocab.bpe`` inside the encoding directory, and the model's
    ``hparams.json`` is copied there with ``n_vocab`` set to the token size.
    """
    encoding = exec_properties["encoding"]
    text_token_size = exec_properties["text_token_size"]
    end_token = exec_properties["end_token"]
    model_dir = get_single_uri(input_dict["model_dir"])
    merged_text_dir = get_single_uri(input_dict["merged_text_dir"])
    encoding_dir = get_single_uri(output_dict["encoding_dir"])

    logging.info("encoding as: {}".format(encoding))
    logging.info("text token size: {}".format(text_token_size))
    logging.info("end token: {}".format(end_token))
    logging.info("model directory: {}".format(model_dir))
    logging.info("merged text directory: {}".format(merged_text_dir))
    logging.info("encoding directory: {}".format(encoding_dir))

    logging.info("Training BPE Tokenizer")
    tokenizer = ByteLevelBPETokenizer(lowercase=False, end_of_word_suffix=end_token)

    training_files = []
    for (dirpath, _, fnames) in os.walk(merged_text_dir):
        for fname in fnames:
            file_path = os.path.join(dirpath, fname)
            if os.path.isfile(file_path):
                logging.info("training on {}".format(file_path))
                training_files.append(file_path)

    # Train once on the complete file list: calling train() separately per
    # file retrains the model each time, so only the last file walked would
    # determine the vocabulary.
    if training_files:
        tokenizer.train(training_files, vocab_size=text_token_size)
    else:
        logging.warning("no training files found in {}".format(merged_text_dir))

    logging.info("Storing BPE Tokenizer")
    encoder_file, vocab_file = tokenizer.save_model(encoding_dir)
    os.rename(encoder_file, os.path.join(encoding_dir, "encoder.json"))
    os.rename(vocab_file, os.path.join(encoding_dir, "vocab.bpe"))

    # load hparams and store with new value
    with open(os.path.join(model_dir, 'hparams.json')) as f:
        hparams = json.load(f)
    hparams["n_vocab"] = text_token_size
    with open(os.path.join(encoding_dir, "hparams.json"), 'w') as json_file:
        json.dump(hparams, json_file)
def train():
    """Train a byte-level BPE tokenizer on the wiki data and save it.

    Trains on ``data/wiki_data.txt`` with a vocabulary size of 40000 and
    saves the model files to ``./tok_checkpoints`` under the name
    ``tokenizer_model``.
    """
    from tokenizers import ByteLevelBPETokenizer

    # Initialize a tokenizer
    bpe = ByteLevelBPETokenizer()

    # Customize training.
    # Example: <s> marks the start of a sentence, </s> the end, and <sep>
    # separates sub-sentences. The required special tokens are listed
    # explicitly; the tokenizer does not break them into subword tokens.
    corpus_files = ['data/wiki_data.txt']
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<sep>"]
    bpe.train(
        files=corpus_files,
        vocab_size=40000,
        min_frequency=2,
        special_tokens=special_tokens,
    )

    # Save tokenizer
    bpe.save_model("./tok_checkpoints", "tokenizer_model")