def main():
    # Train a byte-level BPE tokenizer on the project corpus and save it;
    # get_file() and config come from the surrounding project (not shown here).
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=get_file(),
        vocab_size=config.VOCAB_SIZE,
        min_frequency=config.MIN_FREQUENCY,
        special_tokens=config.SPECIAL_TOKENS,
    )
    tokenizer.save_model(config.TOKENIZER_PATH)
def train_tok(txt_dir, tokenizer_dir):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(
        files=txt_dir,
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )
    tokenizer.save_model(tokenizer_dir)
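# A minimal usage sketch (the paths are hypothetical, not from the original
# snippet): train_tok takes a text file, or list of files, and a directory
# into which save_model writes vocab.json and merges.txt; the directory
# must already exist.
import os

os.makedirs("tokenizer", exist_ok=True)
train_tok("data/corpus.txt", "tokenizer")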
# train tokenizer
_pretty_print("Training tokenizer")
bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train(
    [input_path, input_path_val],
    vocab_size=vocab_size,
    min_frequency=min_freq,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# save tokenizer
tok_path = os.path.join(output_path, "tokenizer")
os.makedirs(tok_path, exist_ok=True)
bpe_tokenizer.save_model(tok_path)

# load tokenizer with Roberta configuration
# (max_len is the older name for what newer transformers versions call
# model_max_length)
bpe_tokenizer = RobertaTokenizerFast.from_pretrained(tok_path, max_len=max_len)

# create data objects
dataset_gen = LineByLineTextDataset(
    tokenizer=bpe_tokenizer,
    file_path=input_path,
    block_size=block_size,
)
dataset_gen_val = LineByLineTextDataset(
    tokenizer=bpe_tokenizer,
    file_path=input_path_val,
    block_size=block_size,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bpe_tokenizer,
    mlm=True,
    mlm_probability=mlm_probability,
)
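# Hedged sketch of the natural next step (not shown in the original
# fragment): wire the datasets and collator above into a Trainer for
# masked-LM pretraining. The RobertaConfig values and the output directory
# are assumptions; the fragment's own variables are reused.
from transformers import RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments

model = RobertaForMaskedLM(
    config=RobertaConfig(
        vocab_size=vocab_size,                # must match the trained tokenizer
        max_position_embeddings=max_len + 2,  # assumed: room for <s> and </s>
    )
)

training_args = TrainingArguments(
    output_dir=os.path.join(output_path, "model"),  # hypothetical location
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_gen,
    eval_dataset=dataset_gen_val,
)
trainer.train()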
import os
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig

vocab_size = 52_000  # same value the other snippets use

paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte Level Tokenize
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk
os.makedirs("BR_BERTo", exist_ok=True)
tokenizer.save_model("BR_BERTo")

# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
# Wrap every encoding in <s> ... </s>, RoBERTa-style
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens)

# Model type
# --------------------------------------------------
config = RobertaConfig(
    # assumed values mirroring the BertConfig in the last snippet;
    # the original call was left unfinished here
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
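# A hedged follow-up (not part of the original snippet): instantiate the
# masked-LM model from the config above and report its size.
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)
print(f"Parameters: {model.num_parameters():,}")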
import os

from tokenizers import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="kant.txt",
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
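# A hedged smoke test (not part of the original snippet): build the
# masked-LM model from the BertConfig above and run one encoded sentence
# through it to check output shapes. Assumes torch is installed.
import torch

model = BertForMaskedLM(config=config)
ids = tokenizer.encode("For it is in reality vain to profess").ids
input_ids = torch.tensor([ids])
with torch.no_grad():
    outputs = model(input_ids=input_ids)
print(outputs.logits.shape)  # (1, sequence_length, vocab_size)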