def val_dataloader(self):
    from data import IndicDataset, PadSequence

    # Pad source and target sequences with each tokenizer's pad token id.
    pad_sequence = PadSequence(self.tokenizers.src.pad_token_id,
                               self.tokenizers.tgt.pad_token_id)
    return DataLoader(IndicDataset(self.tokenizers.src, self.tokenizers.tgt,
                                   self.config.data, False),
                      batch_size=self.config.eval_size,
                      shuffle=False,
                      collate_fn=pad_sequence)
def __init__(self, config):
    super(LightModule, self).__init__()
    # Store the config so the dataloader hooks (val_dataloader, test_dataloader,
    # prepare_data) can read it as self.config.
    self.config = config
    init_seed()
    preproc_data()
    self.model, self.tokenizers = M.build_model(config)
    self.pad_sequence = PadSequence(self.tokenizers.src.pad_token_id,
                                    self.tokenizers.tgt.pad_token_id)
    print('init success')
def test_dataloader(self):
    from data import IndicDataset, PadSequence

    pad_sequence = PadSequence(self.tokenizers.src.pad_token_id,
                               self.tokenizers.tgt.pad_token_id)
    # batch_size=1: test examples are evaluated one at a time.
    return DataLoader(IndicDataset(self.tokenizers.src, self.tokenizers.tgt,
                                   self.config.data, False, True),
                      batch_size=1,
                      shuffle=False,
                      collate_fn=pad_sequence)
def __init__(self, config):
    super().__init__()
    # Assumes module-level imports: torch; BertTokenizer, BertConfig, BertModel,
    # BertForMaskedLM from transformers; EasyDict as ED; project-local PadSequence.
    src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    # hidden_size and intermediate_size are both w.r.t. all attention heads;
    # hidden_size must be divisible by num_attention_heads.
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)
    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)

    # Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size,
                                            config.hidden_size,
                                            padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size,
                                            config.hidden_size,
                                            padding_idx=tgt_tokenizer.pad_token_id)

    # Create encoder and decoder with their respective embeddings.
    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings.cuda())
    # decoder_config.add_cross_attention = True
    # decoder_config.is_decoder = True
    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings.cuda())

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    self.encoder = encoder
    self.decoder = decoder
    self.pad_sequence = PadSequence(tokenizers.src.pad_token_id,
                                    tokenizers.tgt.pad_token_id)
    self.tokenizers = tokenizers
    self.config = config
def gen_model_loaders(config):
    model, tokenizers = M.build_model(config)
    pad_sequence = PadSequence(tokenizers.src.pad_token_id,
                               tokenizers.tgt.pad_token_id)
    train_loader = DataLoader(IndicDataset(tokenizers.src, tokenizers.tgt, config.data, True),
                              batch_size=config.batch_size,
                              shuffle=False,
                              collate_fn=pad_sequence)
    eval_loader = DataLoader(IndicDataset(tokenizers.src, tokenizers.tgt, config.data, False),
                             batch_size=config.eval_size,
                             shuffle=False,
                             collate_fn=pad_sequence)
    return model, tokenizers, train_loader, eval_loader
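# Hedged usage sketch (not from the source): one way the loaders returned by
# gen_model_loaders(config) above could be sanity-checked. The batch layout is
# assumed here to be a (source_ids, target_ids) pair of padded LongTensors from
# the PadSequence collate function; treat that layout, and the check_loaders
# name itself, as assumptions rather than a documented contract.
def check_loaders(config):
    model, tokenizers, train_loader, eval_loader = gen_model_loaders(config)
    src_batch, tgt_batch = next(iter(train_loader))  # assumed (src, tgt) tuple
    print('src batch:', tuple(src_batch.shape), 'tgt batch:', tuple(tgt_batch.shape))
    print('pad ids:', tokenizers.src.pad_token_id, tokenizers.tgt.pad_token_id)
    return model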
def build_model(config):
    src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tgt_tokenizer.bos_token = '<s>'
    tgt_tokenizer.eos_token = '</s>'

    # hidden_size and intermediate_size are both w.r.t. all attention heads;
    # hidden_size must be divisible by num_attention_heads.
    encoder_config = BertConfig(vocab_size=src_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12)
    decoder_config = BertConfig(vocab_size=tgt_tokenizer.vocab_size,
                                hidden_size=config.hidden_size,
                                num_hidden_layers=config.num_hidden_layers,
                                num_attention_heads=config.num_attention_heads,
                                intermediate_size=config.intermediate_size,
                                hidden_act=config.hidden_act,
                                hidden_dropout_prob=config.dropout_prob,
                                attention_probs_dropout_prob=config.dropout_prob,
                                max_position_embeddings=512,
                                type_vocab_size=2,
                                initializer_range=0.02,
                                layer_norm_eps=1e-12,
                                is_decoder=False)

    # Create encoder and decoder embedding layers.
    encoder_embeddings = torch.nn.Embedding(src_tokenizer.vocab_size,
                                            config.hidden_size,
                                            padding_idx=src_tokenizer.pad_token_id)
    decoder_embeddings = torch.nn.Embedding(tgt_tokenizer.vocab_size,
                                            config.hidden_size,
                                            padding_idx=tgt_tokenizer.pad_token_id)

    encoder = BertModel(encoder_config)
    encoder.set_input_embeddings(encoder_embeddings)
    decoder = BertForMaskedLM(decoder_config)
    decoder.set_input_embeddings(decoder_embeddings)

    tokenizers = ED({'src': src_tokenizer, 'tgt': tgt_tokenizer})
    pad_sequence = PadSequence(tokenizers.src.pad_token_id,
                               tokenizers.tgt.pad_token_id)
    # model = TranslationModel(encoder, decoder)
    model = MyLightningModule(encoder, decoder, config, tokenizers, pad_sequence)
    # model.cuda()
    return model, tokenizers
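# Hedged smoke-test sketch (an assumption, not part of the source): runs one
# source and one target sentence through the encoder and decoder built by
# build_model(config) above, assuming the returned module exposes them as
# model.encoder / model.decoder (as in the __init__ shown earlier). The decoder
# is a plain BertForMaskedLM without cross-attention here, so the two stacks are
# exercised independently, just to confirm tensor shapes.
def smoke_test(config):
    model, tokenizers = build_model(config)
    src_ids = tokenizers.src.encode('ek udaharan vakya', return_tensors='pt')  # any source sentence
    tgt_ids = tokenizers.tgt.encode('an example sentence', return_tensors='pt')
    enc_hidden = model.encoder(src_ids)[0]   # (1, src_len, hidden_size)
    dec_logits = model.decoder(tgt_ids)[0]   # (1, tgt_len, tgt_vocab_size)
    print(enc_hidden.shape, dec_logits.shape)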
def prepare_data(self):
    self.pad_sequence = PadSequence(self.tokenizers.src.pad_token_id,
                                    self.tokenizers.tgt.pad_token_id)
def gen_model_loaders(config):
    encoder, decoder, tokenizers = build_enc_dec_tokenizers(config)
    pad_sequence = PadSequence(tokenizers.src.pad_token_id,
                               tokenizers.tgt.pad_token_id)
    return encoder, decoder, tokenizers, pad_sequence