def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a simple byte-level BPE tokenizer and save it to output_path."""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    # enable_padding() defaults to pad_token="[PAD]"; pass pad_token="PAD"
    # so it matches the special token added during training below.
    GPTToken.enable_padding(pad_token="PAD")
    GPTToken.train([input_file_path],
                   vocab_size=vocab_size,
                   min_frequency=2,
                   special_tokens=["PAD"])
    GPTToken.save_model(output_path)
    return None
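# A minimal usage sketch for Tok_Train, assuming a hypothetical corpus file
# "corpus.txt" and an existing output directory "tok_out/"; save_model()
# writes tok_out/vocab.json and tok_out/merges.txt.
Tok_Train("corpus.txt", vocab_size=30_000, output_path="tok_out")
reloaded = ByteLevelBPETokenizer(
    "tok_out/vocab.json",
    "tok_out/merges.txt",
    lowercase=True,  # must match the lowercase=True used at training time
)
print(reloaded.encode("Hello world").tokens)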
def main():
    parser = ArgumentParser(description="Training tokenizer on text files.")
    parser.add_argument("text_dir", nargs="?",
                        help="Path to the directory containing the text files (any .txt file).")
    parser.add_argument("-t", "--tokenizer_path", default=TOKENIZER_PATH,
                        help="Path to the saved trained tokenizer.")
    args = parser.parse_args()
    text_dir = args.text_dir
    tokenizer_path = args.tokenizer_path

    if Path(tokenizer_path).exists():
        paths = [str(x) for x in Path(text_dir).glob("**/*.txt")]
        # ByteLevelBPETokenizer already applies ByteLevel pre-tokenization,
        # so no extra pre_tokenizer assignment is needed here.
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            files=paths,
            vocab_size=config.vocab_size,
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",  # probably not needed if using ByteLevel pretokenization
                "<mask>",
            ],
        )
        tokenizer.save_model(tokenizer_path)
    else:
        print(f"{tokenizer_path} does not exist, so the tokenizer cannot be saved. "
              "Create the directory first and re-run the command.")
def train_tokenizer(paths, vocab_size=21128, min_frequency=2):
    """
    Train a tokenizer and save it locally; this can be slow on large corpora.

    Args:
        paths: text files to train on
        vocab_size: size of the vocabulary
        min_frequency: tokens occurring fewer times than this are filtered out

    Returns:
        The tokenizer object; the vocabulary is saved to disk as a side effect.
    """
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    # Customize training
    tokenizer.train(files=paths,
                    vocab_size=vocab_size,
                    min_frequency=min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer.save_model("data")  # save the tokenizer (essentially its vocabulary and merges)
    return tokenizer
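# Note: save_model() does not create the target directory. A minimal guard,
# assuming the "data" directory used by train_tokenizer above and a
# hypothetical input file "corpus.txt":
import os

os.makedirs("data", exist_ok=True)
tokenizer = train_tokenizer(["corpus.txt"])
print(tokenizer.encode("hello world").tokens)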
def train_BPE_tokenizer(self) -> None:
    bytebpe_tokenizer = ByteLevelBPETokenizer()
    bytebpe_tokenizer.train(files=['./train.txt', './test.txt'],
                            vocab_size=10000,
                            special_tokens=["[PAD]"])
    bytebpe_tokenizer.save_model("nlpbook/bbpe")
def train_tokenizer(data_path, wiki_text_file_path):
    # ToDo := Load if weights exist, else set up
    tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer_en.pad_token = tokenizer_en.eos_token
    vocab_size = tokenizer_en.vocab_size
    max_length = 1024

    tokenizer_es = ByteLevelBPETokenizer()
    tokenizer_es.train(
        files=[str(wiki_text_file_path)],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN],
    )
    tokenizer_es.enable_truncation(max_length=max_length)
    tokenizer_es_path = data_path / "BLBPE_tokenizer_es"
    tokenizer_es_path.mkdir(exist_ok=True, parents=True)
    tokenizer_es.save_model(str(tokenizer_es_path))
    tokenizer_es = GPT2TokenizerFast.from_pretrained(
        str(tokenizer_es_path),
        pad_token=EOF_TOKEN,
    )
    tokenizer_es.model_max_length = max_length
    # tokenizer_es = ByteLevelBPETokenizer(
    #     vocab_file=str(tokenizer_es_path/"vocab.json"),
    #     merges_file=str(tokenizer_es_path/"merges.txt"),
    # )
    # tokenizer_es.enable_truncation(max_length=1024)
    # ToDo := is this necessary
    # tokenizer_en.pad_token = tokenizer_en.eos_token
    return tokenizer_en, tokenizer_es
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text(s), wrapping the tokenizers package.

    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)
    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )
    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. " +
            "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
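# A sketch of rebuilding a GPT-2 style tokenizer from the two files written
# by train_tokenizer(..., serialize=False) above; the "aitextgen" prefix and
# current-directory location follow the defaults in that function.
from transformers import GPT2TokenizerFast

gpt2_tokenizer = GPT2TokenizerFast(
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
)
print(gpt2_tokenizer.tokenize("hello world"))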
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    tokenizer = ByteLevelBPETokenizer()
    # The pre-tokenizer lives on the underlying Tokenizer object; assigning
    # to the wrapper's own attribute would have no effect on training.
    tokenizer._tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    tokenizer.save_model(output_dir)
def main():
    # Instantiate argument parser
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file",
        default=None,
        type=str,
        required=True,
        help="The input training data file or a path to a directory with multiple training data files.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the tokenizer model will be written.",
    )

    # Optional parameters
    parser.add_argument("--vocab_size",
                        default=5000,
                        type=int,
                        help="Vocabulary maximum size, default 5000.")
    parser.add_argument("--min_freq",
                        default=2,
                        type=int,
                        help="Minimum number of occurrences, default 2.")

    # Generate args
    args = parser.parse_args()

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Get training files
    paths = os.path.abspath(args.train_data_file)
    if not args.train_data_file.endswith(".txt"):
        paths = [str(x) for x in Path(paths).glob("**/*.txt")]

    # Customize training
    tokenizer.train(files=paths,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_freq,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer.add_special_tokens(["<x>", "<z>"])

    # Save files to disk
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    tokenizer.save_model(output_dir)
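# Hypothetical invocations of the script above (the script name is an
# assumption; the flags come from the argparse definitions):
#
#   python train_bpe_tokenizer.py --train_data_file corpus.txt --output_dir tok_out
#   python train_bpe_tokenizer.py --train_data_file corpus_dir --output_dir tok_out \
#       --vocab_size 8000 --min_freq 3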
def build_tokenizer(file_paths, vocab_size, output_file="UNKNOWN_BERT_tokenizer"):
    tokenizer = ByteLevelBPETokenizer()
    # tokenizer = SentencePieceBPETokenizer(vocab_file=None, unk_token="<unk>")
    tokenizer.train(files=file_paths,
                    vocab_size=vocab_size,
                    min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.save_model(".", output_file)
def prepare_data(self):
    if not Path(self.tokenizer_name_or_path).exists():
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(self.files,
                        vocab_size=self.max_vocab_size,
                        min_frequency=self.min_frequency,
                        special_tokens=self.special_tokens)
        Path(self.tokenizer_name_or_path).mkdir(parents=True, exist_ok=True)
        tokenizer.save_model(self.tokenizer_name_or_path)
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    prefix: str = "aitextgen",
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = True,
    trim_offsets: bool = True,
) -> None:
    """
    Trains a tokenizer on the given text(s), wrapping the tokenizers package.

    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param prefix: File name prefix of the final tokenizer
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout,
                                      trim_offsets=trim_offsets)
    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token] + added_tokens,
    )

    if serialize:
        tokenizer.save(f"{prefix}.tokenizer.json")
    else:
        tokenizer.save_model(save_path, prefix)
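# With serialize=True (the default above), the whole tokenizer is written to
# a single "aitextgen.tokenizer.json" file. A sketch of loading it back:
from tokenizers import Tokenizer

tok = Tokenizer.from_file("aitextgen.tokenizer.json")
print(tok.encode("hello world").tokens)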
def generate_tokenizer(args):
    langs = args.languages
    if "all" in langs:
        langs = ["python", "java", "javascript", "go", "ruby", "php"]
    if args.combined:
        for size in args.sizes:
            lang = "_combined"
            paths = list(glob("data/train{}_{}.txt".format(lang, size)))
            tokenizer = ByteLevelBPETokenizer(lowercase=False)
            tokenizer.train(
                files=paths,
                vocab_size=32000,
                min_frequency=3,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ],
            )
            os.makedirs("tokenizer{}".format(lang), exist_ok=True)
            tokenizer.save_model("tokenizer{}".format(lang))
    else:
        for language in langs:
            for size in args.sizes:
                lang = "_{}".format(language)
                paths = list(glob("data/train{}_{}.txt".format(lang, size)))
                tokenizer = ByteLevelBPETokenizer(lowercase=False)
                tokenizer.train(
                    files=paths,
                    vocab_size=32000,
                    min_frequency=3,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ],
                )
                os.makedirs("tokenizer{}".format(lang), exist_ok=True)
                tokenizer.save_model("tokenizer{}".format(lang))
def train_tokenizer(input_path, output_path, vocab_size=10000):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[input_path],
                    vocab_size=vocab_size,
                    special_tokens=["[PAD]", "<s>", "</s>", "<unk>"])
    # Wrap every encoded sequence with <s> ... </s>, RoBERTa-style.
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.save_model(output_path)
    return tokenizer
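# A quick sketch of the post-processor's effect for the
# train_tokenizer(input_path, output_path) variant above, assuming a
# hypothetical "corpus.txt" and an existing "tok_out" directory:
tok = train_tokenizer("corpus.txt", "tok_out")
print(tok.encode("hello world").tokens)
# BertProcessing wraps every sequence, so the output has the shape
# ['<s>', ..., '</s>'].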
def create_tokenizer(self):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=self.files,
                    vocab_size=self.vocab_size,
                    min_frequency=self.min_frequency,
                    special_tokens=self.special_tokens)
    vocab_path = os.path.join(self.save_directory)
    if not os.path.exists(vocab_path):
        os.makedirs(vocab_path)
    tokenizer.save_model(vocab_path)
    return tokenizer
def get_tokenizer(train_data, vocab_size):
    """
    Trains and returns a byte-level BPE tokenizer.

    If a cached tokenizer with these parameters exists it is loaded instead
    of training a new tokenizer.

    :param train_data: list of dataset files
    :param vocab_size: BPE vocab size
    :return: GPT2TokenizerFast with the requested parameters.
    """
    assert vocab_size >= 257, 'vocab size must cover all possible bytes and one special token'

    # calculate the name of the cached file
    m = hashlib.md5()
    m.update(str(vocab_size).encode())
    for file in train_data:
        m.update(file.encode())
    cache_id = m.hexdigest()
    cached_tokenizer_file = os.path.join(CACHE_DIR, 'tokenizer_{}'.format(cache_id))

    train_new_tokenizer = not os.path.exists(cached_tokenizer_file)
    if train_new_tokenizer:
        start = time.time()
        os.makedirs(cached_tokenizer_file)
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            train_data,
            vocab_size=vocab_size,
            special_tokens=['<|endoftext|>'],
            show_progress=False,
        )
        tokenizer.save_model(cached_tokenizer_file)
        logger.info(f"Trained tokenizer {cached_tokenizer_file} [took %.3f s]",
                    time.time() - start)

    start = time.time()
    tokenizer = GPT2TokenizerFast.from_pretrained(cached_tokenizer_file)
    tokenizer.cache_id = cache_id
    if not train_new_tokenizer:
        logger.info(
            f"Loaded tokenizer from {cached_tokenizer_file} [took %.3f s]",
            time.time() - start)
    return tokenizer
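# Usage sketch for get_tokenizer; "corpus.txt" is a hypothetical file, and
# CACHE_DIR must already exist as a module-level constant. The first call
# trains and caches; the second call with identical arguments loads the
# cached tokenizer, since the cache key is an MD5 over vocab_size plus the
# training file names.
tok = get_tokenizer(['corpus.txt'], vocab_size=500)
tok_again = get_tokenizer(['corpus.txt'], vocab_size=500)  # cache hit
assert tok.cache_id == tok_again.cache_id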
def save_tmp_tokenizer():
    paths = [str(dataset_path / 'oscar.eo.1000.txt')]

    # Initialize a tokenizer
    tokenizer_tmp = ByteLevelBPETokenizer()

    # Customize training
    tokenizer_tmp.train(files=paths,
                        vocab_size=10_000,
                        min_frequency=2,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])

    # Save files to disk
    tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
    tokenizer_tmp.save_model(str(tokenizer_tmp_path))
def main(args):
    # set the corpus
    random.seed(42)
    proj_dir = Path()
    tokenizers_dir = proj_dir / "tokenizers"

    if not tokenizers_dir.exists():
        tokenizers_dir.mkdir(parents=True)

    corpus_dir = proj_dir / "corpus"
    comment_dir = corpus_dir / "comment"
    source_path = comment_dir / "20190101_20200611_v2.txt"
    sample_path = comment_dir / "sample.txt"

    # sampling source; close both files only after the whole pass, otherwise
    # the first skipped line would close sample_io and break later writes
    source_io = open(source_path, mode="r", encoding="utf-8")
    sample_io = open(sample_path, mode="w", encoding="utf-8")

    for line in source_io:
        if random.random() > (1 - args.sample_rate):
            sample_io.write(line)

    sample_io.close()
    source_io.close()

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)

    # Customize training
    tokenizer.train(
        files=str(sample_path),
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    )
    tokenizer.save_model(directory=str(tokenizers_dir))
def train():
    from tokenizers import ByteLevelBPETokenizer

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training.
    # E.g., if <s> is needed as a start-of-sentence token, </s> as an
    # end-of-sentence token, and <sep> as a separator between sub-sentences,
    # list them as special tokens. Special tokens are never broken into
    # subword tokens by the tokenizer.
    paths = ['data/wiki_data.txt']
    tokenizer.train(
        files=paths,
        vocab_size=40000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<sep>"])

    # Save tokenizer
    tokenizer.save_model("./tok_checkpoints", "tokenizer_model")
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    encoding = exec_properties["encoding"]
    text_token_size = exec_properties["text_token_size"]
    end_token = exec_properties["end_token"]
    model_dir = get_single_uri(input_dict["model_dir"])
    merged_text_dir = get_single_uri(input_dict["merged_text_dir"])
    encoding_dir = get_single_uri(output_dict["encoding_dir"])

    logging.info("encoding as: {}".format(encoding))
    logging.info("text token size: {}".format(text_token_size))
    logging.info("end token: {}".format(end_token))
    logging.info("model directory: {}".format(model_dir))
    logging.info("merged text directory: {}".format(merged_text_dir))
    logging.info("encoding directory: {}".format(encoding_dir))

    logging.info("Training BPE Tokenizer")
    tokenizer = ByteLevelBPETokenizer(lowercase=False,
                                      end_of_word_suffix=end_token)
    # Collect all text files first and train once; calling train() per file
    # would retrain the tokenizer from scratch on each file, keeping only
    # the last one.
    file_paths = []
    for (dirpath, _, fnames) in os.walk(merged_text_dir):
        for fname in fnames:
            file_path = os.path.join(dirpath, fname)
            if os.path.isfile(file_path):
                logging.info("adding training file {}".format(file_path))
                file_paths.append(file_path)
    tokenizer.train(file_paths, vocab_size=text_token_size)

    logging.info("Storing BPE Tokenizer")
    # save_model() returns [vocab.json, merges.txt]; rename them to the
    # GPT-2 style encoder.json / vocab.bpe expected downstream.
    vocab_file, merges_file = tokenizer.save_model(encoding_dir)
    os.rename(vocab_file, os.path.join(encoding_dir, "encoder.json"))
    os.rename(merges_file, os.path.join(encoding_dir, "vocab.bpe"))

    # load hparams and store with new value
    with open(os.path.join(model_dir, 'hparams.json')) as f:
        hparams = json.load(f)
    hparams["n_vocab"] = text_token_size
    with open(os.path.join(encoding_dir, "hparams.json"), 'w') as json_file:
        json.dump(hparams, json_file)
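# The renamed files can still be reloaded directly, since
# ByteLevelBPETokenizer accepts explicit vocab/merges paths. A sketch,
# assuming an "encoding_dir/" produced by the component above; the
# end_of_word_suffix value is a stand-in and must match the training-time
# end_token:
reloaded = ByteLevelBPETokenizer(
    "encoding_dir/encoder.json",  # the vocab, under its GPT-2 style name
    "encoding_dir/vocab.bpe",     # the merges, under its GPT-2 style name
    lowercase=False,
    end_of_word_suffix="</w>",    # stand-in; use the same end_token as above
)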
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
tokenizer.train(files="kant.txt",
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
parser = argparse.ArgumentParser()
parser.add_argument('--name', type=str)
parser.add_argument('--vocab_size', type=int)
args = parser.parse_args()

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=args.vocab_size,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

model_path = f'{current_path}../models/{args.name}'
if not os.path.exists(model_path):
    os.mkdir(model_path)
tokenizer.save_model(model_path)

config = RobertaConfig(vocab_size=args.vocab_size)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path, max_len=512)
tokenizer.save_pretrained(model_path)
config.save_pretrained(model_path)
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model("EsperBERTo")

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
# use this when training BPE tokenizer from scratch
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = ['../../data/jw300.en-tw.tw', '../../data/asante_twi_bible.txt']  # dataset location

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training: the special tokens below are used for start, padding,
# end, unknown and mask respectively
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=[
        "[CLS]",
        "[PAD]",
        "[SEP]",
        "[UNK]",
        "[MASK]",
    ]
)

# Save files to disk - make sure these directories exist
tokenizer.save_model("distilbako-base-akuapem-twi-cased")  # akuapem
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = ['./dataset/oscar.eo.txt']

tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model("./bert-tokenizer")
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=dpath,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# !mkdir bert-model
tokenizer.save_model("bert-model2")

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./bert-model2/vocab.json",
    "./bert-model2/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

tokenizer.encode("Mi estas Julien.")
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# https://huggingface.co/blog/how-to-train
paths = [str(x) for x in Path("../results/").glob("**/*.txt")]

# utf-8 problem
paths = [p.encode('utf-8', 'replace').decode() for p in paths]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save_model(".", "latentbert")
paths = "CDLI_Data/Sumerian_monolingual_processed.txt" # Initialize a tokenizer tokenizer = ByteLevelBPETokenizer() # Customize training tokenizer.train(files=paths, vocab_size=52_000, min_frequency=1, special_tokens=[ "<s>", "<pad>", "</s>", "<unk>", "<mask>", ]) tokenizer.save_model("BERT/sumerianBERTo") tokenizer = ByteLevelBPETokenizer( "BERT/sumerianBERTo/vocab.json", "BERT/sumerianBERTo/merges.txt", ) tokenizer._tokenizer.post_processor = BertProcessing( ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) tokenizer.encode("dumu a-li2-wa-aq-rum") print(tokenizer.encode("dumu a-li2-wa-aq-rum").tokens) # Configuration
paths = ["../results_file_clean.txt"] # Initialize a tokenizer tokenizer = ByteLevelBPETokenizer() # Customize training tokenizer.train(files=paths, vocab_size=52000, min_frequency=2, special_tokens=[ "<s>", "<pad>", "</s>", "<unk>", "<mask>", ]) os.makedirs('roberta_we4lkd', exist_ok=True) tokenizer.save_model("roberta_we4lkd") from tokenizers.implementations import ByteLevelBPETokenizer from tokenizers.processors import BertProcessing tokenizer = ByteLevelBPETokenizer( "./roberta_we4lkd/vocab.json", "./roberta_we4lkd/merges.txt", ) tokenizer._tokenizer.post_processor = BertProcessing( ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512)
class WikiText2DataModule(pl.LightningDataModule):
    def __init__(self,
                 data_dir: str = 'data/wikitext-2',
                 train_batch_size: int = 64,
                 val_batch_size: int = 64,
                 dataloader_num_workers: int = 4,
                 seq_length: int = 64,
                 vocab_size=30000):
        super().__init__()
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)

    def prepare_data(self, *args, **kwargs):
        dataset = load_dataset("wikitext",
                               "wikitext-103-raw-v1",
                               split="train+test+validation")
        column_names = dataset.column_names

        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]["text"]

        if (not os.path.exists("data/wiki-vocab.json")) or (
                not os.path.exists("data/wiki-merges.txt")):
            print('TRAIN TOKENIZER')
            self.tokenizer.train_from_iterator(batch_iterator(),
                                               vocab_size=self.vocab_size)
            self.tokenizer.save_model("data/", "wiki")
        else:
            self.tokenizer = ByteLevelBPETokenizer("data/wiki-vocab.json",
                                                   "data/wiki-merges.txt",
                                                   add_prefix_space=True)

        dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

        def tokenize_function(examples):
            return {
                'input_ids':
                list(
                    map(lambda x: x.ids,
                        self.tokenizer.encode_batch(examples['text'])))
            }

        self.tokenized_dataset = dataset.map(tokenize_function,
                                             batched=True,
                                             remove_columns=column_names,
                                             num_proc=4)

    def setup(self, stage: Optional[str] = None):
        # datasets = load_dataset('text',
        #                         data_dir=self.data_dir,
        #                         data_files={'train': 'wiki.train.small.raw',
        #                                     'valid': 'wiki.valid.small.raw'})

        def group_text(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead if the
            # model supported it. Customize this part to your needs.
            total_length = (total_length // self.seq_length) * self.seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + self.seq_length]
                    for i in range(0, total_length, self.seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        lm_dataset = self.tokenized_dataset.map(group_text,
                                                batched=True,
                                                num_proc=4)
        self.train_dataset = lm_dataset['train']
        self.eval_dataset = lm_dataset['validation']
        self.test_dataset = lm_dataset['test']

    def collate_fn(self, features):
        batch = {}
        batch['input_ids'] = torch.tensor([f['input_ids'] for f in features],
                                          dtype=torch.long)
        batch['labels'] = batch['input_ids']
        return batch

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset,
                          batch_size=self.train_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.eval_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)
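# A standalone sketch of what group_text does, with seq_length = 4 and toy
# token ids (no datasets dependency):
def group_text_demo(examples, seq_length=4):
    concatenated = {k: sum(examples[k], []) for k in examples}
    total_length = (len(concatenated['input_ids']) // seq_length) * seq_length
    result = {
        k: [t[i:i + seq_length] for i in range(0, total_length, seq_length)]
        for k, t in concatenated.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

print(group_text_demo({'input_ids': [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}))
# -> {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]], 'labels': [[1, 2, 3, 4], [5, 6, 7, 8]]}
# The trailing token 9 is dropped as the "small remainder".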
                         [train_texts, valid_texts]):
    with open(f"{data_dir}/{path}", "w") as f:
        f.write(text)

paths = [str(x) for x in Path(f"{data_dir}/").glob("**/*.txt")]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model(tokenizer_dir)

tokenizer = ByteLevelBPETokenizer(
    "tokenizer/vocab.json",
    "tokenizer/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

config = T5Config(
    vocab_size=52_000,
    max_position_embeddings=514,