import torch
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import BucketIterator  # torchtext.legacy.data on torchtext >= 0.9
from transformers import (BertConfig, EncoderDecoderConfig, EncoderDecoderModel,
                          EarlyStoppingCallback, Trainer, TrainingArguments)

# Project-local helpers assumed importable from the surrounding package:
# read_training_pipeline_params, get_dataset, split_data, _len_sort_key.
# `logger` is assumed to be a loguru-style logger (brace formatting with kwargs).


def train_model(config_path: str):
    writer = SummaryWriter()  # TensorBoard writer for optional logging
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)

    # Build the fields and split the raw parallel corpus.
    SRC, TRG, dataset = get_dataset(config.dataset_path, False)
    train_data, valid_data, test_data = split_data(
        dataset, **config.split_ration.__dict__)

    # Vocabularies are built on the training split only, to avoid leakage.
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )

    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    # Configure the decoder for causal generation with cross-attention
    # *before* instantiating the model, so the model is built only once.
    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=enc_dec_config)

    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()
    model.save_pretrained("bert2bert")
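# Hypothetical entry point (not from the original); the config path is an
# illustrative placeholder whose schema must match read_training_pipeline_params.
if __name__ == "__main__":
    train_model("configs/train_config.yaml")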
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel


def get_model(vocab_size=30000):
    """Build an untrained BERT2BERT encoder-decoder with a shared vocab size."""
    config_encoder = BertConfig()
    config_decoder = BertConfig()
    config_encoder.vocab_size = vocab_size
    config_decoder.vocab_size = vocab_size
    # The decoder must run causally and attend to the encoder states.
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)
    return model
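# Usage sketch (assumed, not from the original): a forward pass on random
# token ids. A real setup needs a tokenizer matching vocab_size, and pad/bos
# ids set on model.config before calling generate().
import torch

model = get_model(vocab_size=30000)
src = torch.randint(0, 30000, (2, 16))   # (batch, src_len) dummy source ids
tgt = torch.randint(0, 30000, (2, 12))   # (batch, tgt_len) dummy target ids
out = model(input_ids=src, decoder_input_ids=tgt, labels=tgt)
print(out.loss.item(), out.logits.shape)  # logits: (2, 12, 30000)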
def __init__(self):
    super().__init__()
    # 6-layer, 512-dim BERT encoder/decoder pair; vocab_size=21128 matches
    # the Chinese BERT vocabulary.
    encoder_config = BertConfig(num_hidden_layers=6, vocab_size=21128,
                                hidden_size=512, num_attention_heads=8)
    self.encoder = BertModel(encoder_config)
    decoder_config = BertConfig(num_hidden_layers=6, vocab_size=21128,
                                hidden_size=512, num_attention_heads=8)
    decoder_config.is_decoder = True
    # Added here (missing in the original, but present in the English twin
    # below): without cross-attention the decoder cannot see encoder states.
    decoder_config.add_cross_attention = True
    self.decoder = BertModel(decoder_config)
    # Project decoder hidden states back to vocabulary logits.
    self.linear = nn.Linear(512, 21128, bias=False)
def __init__(self):
    super().__init__()
    # Same architecture as the Chinese variant above, but with the English
    # BERT vocabulary (30522 tokens vs. 21128 for Chinese).
    encoder_config = BertConfig(num_hidden_layers=6, vocab_size=30522,
                                hidden_size=512, num_attention_heads=8)
    self.encoder = BertModel(encoder_config)
    decoder_config = BertConfig(num_hidden_layers=6, vocab_size=30522,
                                hidden_size=512, num_attention_heads=8)
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True
    self.decoder = BertModel(decoder_config)
    # Project decoder hidden states back to vocabulary logits.
    self.linear = nn.Linear(512, 30522, bias=False)
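# Hedged sketch of how these modules would compose in a forward pass; the
# method name and masking details are assumptions, not the original code.
def forward(self, src_ids, src_mask, tgt_ids, tgt_mask):
    # Encode the source sentence once.
    enc_out = self.encoder(input_ids=src_ids, attention_mask=src_mask)
    # The decoder attends to its own (causally masked) inputs and, via
    # cross-attention, to the encoder's hidden states.
    dec_out = self.decoder(
        input_ids=tgt_ids,
        attention_mask=tgt_mask,
        encoder_hidden_states=enc_out.last_hidden_state,
        encoder_attention_mask=src_mask,
    )
    # Map each decoder state to vocabulary logits.
    return self.linear(dec_out.last_hidden_state)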
def __init__(self, config, language_pretrained_model_path=None):
    super(VisualLinguisticBertDecoder, self).__init__(config)
    self.config = config

    # embeddings
    self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    self.end_embedding = nn.Embedding(1, config.hidden_size)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                              config.hidden_size)
    self.embedding_LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob)
    # for compatibility with RoBERTa
    self.position_padding_idx = config.position_padding_idx

    # visual transform: project visual features to the text hidden size
    self.visual_1x1_text = None
    self.visual_1x1_object = None
    if config.visual_size != config.hidden_size:
        self.visual_1x1_text = nn.Linear(config.visual_size, config.hidden_size)
        self.visual_1x1_object = nn.Linear(config.visual_size, config.hidden_size)
    if config.visual_ln:
        self.visual_ln_text = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.visual_ln_object = BertLayerNorm(config.hidden_size, eps=1e-12)
    else:
        visual_scale_text = nn.Parameter(
            torch.as_tensor(self.config.visual_scale_text_init,
                            dtype=torch.float),
            requires_grad=True)
        self.register_parameter('visual_scale_text', visual_scale_text)
        visual_scale_object = nn.Parameter(
            torch.as_tensor(self.config.visual_scale_object_init,
                            dtype=torch.float),
            requires_grad=True)
        self.register_parameter('visual_scale_object', visual_scale_object)

    # *********************************************
    # FM addition - set up decoder layer for MT
    # Initializing a BERT bert-base-uncased style configuration
    configuration = BertConfig()
    configuration.vocab_size = config.vocab_size
    # FM edit: reduce size - 12 layers don't fit on a single 12 GB GPU
    configuration.num_hidden_layers = 6
    configuration.is_decoder = True
    # Initializing a model from the bert-base-uncased style configuration
    self.decoder = BertModel(configuration)
    # *********************************************

    if self.config.with_pooler:
        self.pooler = BertPooler(config)

    # init weights
    self.apply(self.init_weights)
    if config.visual_ln:
        self.visual_ln_text.weight.data.fill_(
            self.config.visual_scale_text_init)
        self.visual_ln_object.weight.data.fill_(
            self.config.visual_scale_object_init)

    # load language pretrained model
    if language_pretrained_model_path is not None:
        self.load_language_pretrained_model(language_pretrained_model_path)

    # Optionally freeze word embeddings, keeping a small trainable table
    # for the special tokens only.
    if config.word_embedding_frozen:
        for p in self.word_embeddings.parameters():
            p.requires_grad = False
        self.special_word_embeddings = nn.Embedding(NUM_SPECIAL_WORDS,
                                                    config.hidden_size)
        self.special_word_embeddings.weight.data.copy_(
            self.word_embeddings.weight.data[:NUM_SPECIAL_WORDS])