def main():
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=get_file(),
        vocab_size=config.VOCAB_SIZE,
        min_frequency=config.MIN_FREQUENCY,
        special_tokens=config.SPECIAL_TOKENS,
    )
    tokenizer.save_model(config.TOKENIZER_PATH)
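# --- Illustrative reload sketch (not part of the original snippet) ---
# Assumes config.TOKENIZER_PATH is a directory: save_model() writes vocab.json
# and merges.txt there, and the ByteLevelBPETokenizer constructor reads them
# back. The helper name and the sample call below are hypothetical.
import os

from tokenizers import ByteLevelBPETokenizer


def load_trained_tokenizer(tokenizer_path):
    return ByteLevelBPETokenizer(
        os.path.join(tokenizer_path, "vocab.json"),
        os.path.join(tokenizer_path, "merges.txt"),
    )

# encoding = load_trained_tokenizer(config.TOKENIZER_PATH).encode("sample text")
# print(encoding.tokens)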
def pretrain_tokenization(self):
    paths = [str(x) for x in Path("handler/datadir/").glob("*-train.txt")]
    print(paths)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=paths,
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    # save_model(directory, prefix) writes vocab.json and merges.txt;
    # tokenizer.save() expects a single JSON file path in current tokenizers versions
    tokenizer.save_model(".", "danbert-small")
def _fit_tokenizer(
    path_to_text_file: Union[str, List[str]],
    tokenizer: ByteLevelBPETokenizer,
    vocabulary_size: int,
) -> None:
    tokenizer.train(
        path_to_text_file,
        vocabulary_size,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
def train_tok(txt_dir, tokenizer_dir):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(
        files=txt_dir,
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    tokenizer.save_model(tokenizer_dir)
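# --- Hypothetical smoke test for train_tok (not from the original) ---
# Writes a tiny corpus to a temp directory, trains, and checks that the
# special tokens received the first vocabulary ids (the trainer assigns them
# in list order). save_model() requires the output directory to exist.
import os
import tempfile

from tokenizers import ByteLevelBPETokenizer

with tempfile.TemporaryDirectory() as tmp:
    corpus = os.path.join(tmp, "corpus.txt")
    with open(corpus, "w") as f:
        f.write("hello world\n" * 100)
    out_dir = os.path.join(tmp, "tok")
    os.makedirs(out_dir, exist_ok=True)
    train_tok(corpus, out_dir)
    tok = ByteLevelBPETokenizer(os.path.join(out_dir, "vocab.json"),
                                os.path.join(out_dir, "merges.txt"))
    assert tok.token_to_id("<s>") == 0 and tok.token_to_id("<pad>") == 1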
def test_language_model_dataset_fit_tokenizer_should_call_the_train_method_of_bpe_tokenizer():
    # Given
    language_modeling_dataset = LanguageModelingDataset(1, 1)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train = MagicMock()
    language_modeling_dataset.set_tokenizer(tokenizer)

    # When
    language_modeling_dataset._fit_tokenizer(FAKE_PATH_FOR_TEST, tokenizer, 300)

    # Then
    tokenizer.train.assert_called_with(
        FAKE_PATH_FOR_TEST,
        300,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
                  num_attention_heads=6, num_hidden_layers=3, epochs=5,
                  batch_size=30, val_batch_size=60, eval_steps=50, **kwargs):
    # instantiate tokenizer
    bpe_tokenizer = ByteLevelBPETokenizer()

    # train tokenizer
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train(
        [input_path, input_path_val],
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )

    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    os.makedirs(tok_path, exist_ok=True)
    bpe_tokenizer.save_model(tok_path)

    # load tokenizer with Roberta configuration
    bpe_tokenizer = RobertaTokenizerFast.from_pretrained(tok_path, max_len=max_len)

    # create data objects
    dataset_gen = LineByLineTextDataset(tokenizer=bpe_tokenizer,
corpus_length = 6_993_330  # run `wc -l` on the corpus to check the line count
vocab_size = 150_000

# Dataset files
# --------------------------------------------------
paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte-level tokenizer
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Save files to disk
tokenizer.save_model("BR_BERTo")

# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
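# --- Illustrative check (not from the original) ---
# With BertProcessing attached as above, encode() should wrap every sequence
# in <s> ... </s>; the sample sentence below is made up.
encoding = tokenizer.encode("um exemplo de frase")
print(encoding.tokens[0], encoding.tokens[-1])  # expected: <s> </s>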
                    default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--data-dir', type=str,
                    default=os.environ['SM_CHANNEL_TRAINING'])
args = parser.parse_args()

paths = [str(x) for x in Path(args.data_dir).glob("**/*.txt")]
print("data files")
print(paths)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Need to save it to model dir for inference;
# save_model writes the vocab.json and merges.txt that are loaded below
tokenizer.save_model(args.model_dir)

tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                  os.path.join(args.model_dir, "merges.txt"))
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")))
tokenizer.enable_truncation(max_length=args.token_max_len)
print(tokenizer.encode("Nay, but speak not."))
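# --- Illustrative note (not from the original) ---
# encode() returns an Encoding object, so the print above shows its repr;
# the token strings and ids live on .tokens and .ids.
enc = tokenizer.encode("Nay, but speak not.")
print(enc.tokens)
print(enc.ids)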
import os

from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="kant.txt", vocab_size=52_000, min_frequency=2,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,