def initialize_model():
    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("device", device)

    ''' Create dataloaders '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'],
                                             config['simple_sentences_file'])
    train_data, val_data = torch.utils.data.random_split(
        train_dataset,
        [round(config["train_data_percentage"] * len(train_dataset)),
         round(config["val_data_percentage"] * len(train_dataset))])
    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"],
                                  num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"],
                                num_workers=config["num_of_workers"], pin_memory=True)

    ''' Create tokenizer '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    ''' Create model '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    model.train()
    trainer = model.to(device)

    ''' Create optimizer, loss, scheduler and TensorBoard writer '''
    loss_fun = nn.CrossEntropyLoss(ignore_index=config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr=config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)
    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
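To make the wiring above concrete, here is a minimal training-loop sketch that consumes the objects returned by initialize_model(). It is only an illustration under stated assumptions: the batch layout ((src, tgt) pairs of token-id tensors shaped [seq_len, batch]), the model's output shape, and the "num_epochs" config key are not shown in the snippet above.

# Sketch only -- assumes each batch is (src, tgt) with shape [seq_len, batch]
# and the model returns logits of shape [tgt_len, batch, vocab_size].
config, train_dl, val_dl, model, loss_fun, optimizer, writer, device, scheduler, tokenizer = initialize_model()

for epoch in range(config["num_epochs"]):          # "num_epochs" is an assumed config key
    for step, (src, tgt) in enumerate(train_dl):
        src, tgt = src.to(device), tgt.to(device)
        output = model(src, tgt[:-1])               # teacher forcing: drop the last target token
        loss = loss_fun(output.reshape(-1, output.shape[-1]), tgt[1:].reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        writer.add_scalar("train/loss", loss.item(), epoch * len(train_dl) + step)
    scheduler.step()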
def initialize_model():
    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("device", device)

    ''' Create tokenizer '''
    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])

    ''' Create model '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)

    ''' Load pretrained weights and switch to evaluation mode '''
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
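The inference-side initializer returns only the config, model, tokenizer and device, so decoding has to be done by the caller. Below is a hedged greedy-decoding sketch; the helper name split_and_reshape and the assumption that the model accepts [seq_len, batch] index tensors and returns [tgt_len, batch, vocab] logits are illustrative, not part of the code above.

# Sketch only -- model call signature and output shape are assumptions.
config, model, tokenizer, device = initialize_model()

def split_and_reshape(sentence, max_output_len=128):   # hypothetical helper name
    enc = tokenizer.encode(sentence)                    # padded/truncated to max_len (enabled above)
    src = torch.tensor(enc.ids, device=device).unsqueeze(1)   # [seq_len, 1]
    out_ids = [tokenizer.token_to_id("<s>")]
    with torch.no_grad():
        for _ in range(max_output_len):
            tgt = torch.tensor(out_ids, device=device).unsqueeze(1)
            logits = model(src, tgt)                    # assumed: [tgt_len, 1, vocab]
            next_id = logits[-1, 0].argmax().item()
            out_ids.append(next_id)
            if next_id == tokenizer.token_to_id("</s>"):
                break
    return tokenizer.decode(out_ids)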
    block_size=block_size
)
dataset_gen_val = LineByLineTextDataset(
    tokenizer=bpe_tokenizer,
    file_path=input_path_val,
    block_size=block_size
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bpe_tokenizer,
    mlm=True,
    mlm_probability=mlm_probability
)

# create model
config = LongformerConfig(
    attention_window=attention_window,
    sep_token_id=bpe_tokenizer.get_vocab()["</s>"],
    pad_token_id=bpe_tokenizer.get_vocab()["<pad>"],
    bos_token_id=bpe_tokenizer.get_vocab()["<s>"],
    eos_token_id=bpe_tokenizer.get_vocab()["</s>"],
    vocab_size=bpe_tokenizer.vocab_size,
    max_position_embeddings=max_len + 10,
    num_attention_heads=num_attention_heads,
    num_hidden_layers=num_hidden_layers,
    type_vocab_size=1
)
model = LongformerForMaskedLM(config=config)
_pretty_print(f"Number of model parameters : {model.num_parameters():,}")

model_path = os.path.join(output_path, "lm")
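With the datasets, collator and Longformer model in place, the usual next step is to hand them to the Hugging Face Trainer. The sketch below shows one plausible wiring; the variable dataset_gen_train stands in for the training split whose construction is truncated at the top of this fragment, and the epoch/batch-size values are placeholders rather than the project's actual settings.

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=3,                # placeholder value
    per_device_train_batch_size=8,     # placeholder value
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_gen_train,   # assumed name of the truncated training dataset above
    eval_dataset=dataset_gen_val,
)
trainer.train()
trainer.save_model(model_path)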