def _init_models(self, params):
    """Build the full TF graph: encoder VAE, decoder VAE, and the latent-space
    Transformer that maps the encoder's code to the decoder's code.

    Args:
        params: dict of model hyper-parameters; NOTE this dict is mutated in
            place (``max_len`` / ``max_dec_len`` are overwritten per sub-model),
            so callers must not rely on its contents afterwards.

    Side effects:
        Sets ``self.encoder_model``, ``self.decoder_model``, ``self.transformer``,
        ``self.training_logits``, ``self.predicted_ids_op`` and ``self.attens``.
        Reads module-level ``args`` for sequence-length and mode flags.
    """
    # self.global_step = None
    with tf.variable_scope('encodervae'):
        encodervae_inputs = (self.x_enc_inp, self.x_dec_inp, self.x_dec_out, self.global_step)
        # Decoder side of the VAE consumes one extra token (GO/EOS shift).
        params['max_len'] = args.enc_max_len
        params['max_dec_len'] = args.enc_max_len + 1
        self.encoder_model = BaseVAE(params, encodervae_inputs, "encoder")
    with tf.variable_scope('decodervae'):
        decodervae_inputs = (self.y_enc_inp, self.y_dec_inp, self.y_dec_out, self.global_step)
        params['max_len'] = args.dec_max_len
        params['max_dec_len'] = args.dec_max_len + 1
        if args.isPointer:
            # Pointer mode: pass encoder outputs plus OOV bookkeeping so the
            # decoder can copy out-of-vocabulary source tokens.
            mask_oovs = self.encoder_model.dec_seq_len_mask
            self.decoder_model = BaseVAE(params, decodervae_inputs, "decoder", self.encoder_model.encoder_outputs, self.x_enc_inp_oovs, self.max_oovs, mask_oovs)
        elif args.isContext:
            # Context mode: decoder attends over encoder outputs, no copying.
            self.decoder_model = BaseVAE(params, decodervae_inputs, "decoder", self.encoder_model.encoder_outputs)
        else:
            self.decoder_model = BaseVAE(params, decodervae_inputs, "decoder")
    with tf.variable_scope('transformer'):
        self.transformer = Transformer(self.encoder_model, self.decoder_model, params['graph_type'], self.global_step)
    # Reuse the decoder VAE's decoding variables so the transformer's predicted
    # latent code is decoded with the same weights trained above.
    # NOTE(review): ``predition`` looks like a misspelling of ``prediction`` —
    # it must match the attribute name defined on Transformer; confirm there
    # before renaming.
    with tf.variable_scope('decodervae/decoding', reuse=True):
        self.training_logits = self.decoder_model._decoder_training(self.transformer.predition, reuse=True)
        self.predicted_ids_op, self.attens = self.decoder_model._decoder_inference(self.transformer.predition)
def get_model(input_size, output_size, config):
    """Construct a fresh Transformer for training.

    Args:
        input_size: source vocabulary size.
        output_size: target vocabulary size.
        config: namespace providing hidden_size, n_splits, n_layers, dropout.

    Returns:
        An untrained Transformer instance.
    """
    return Transformer(
        input_size,
        config.hidden_size,  # used directly; no separate word_vec_size needed
        output_size,
        n_splits=config.n_splits,        # attention heads
        n_enc_blocks=config.n_layers,    # encoder depth
        n_dec_blocks=config.n_layers,    # decoder depth
        dropout_p=config.dropout,
    )
def get_model(input_size, output_size, train_config):
    """Rebuild a Transformer with the training-time configuration and load
    pretrained weights for inference.

    NOTE(review): depends on a module-level ``saved_data`` mapping holding the
    checkpoint under key ``'model'`` — confirm it is populated before calling.

    Returns:
        The Transformer in eval mode with restored parameters.
    """
    transformer = Transformer(
        input_size,
        train_config.hidden_size,
        output_size,
        n_splits=train_config.n_splits,
        n_enc_blocks=train_config.n_layers,
        n_dec_blocks=train_config.n_layers,
        dropout_p=train_config.dropout,
    )
    transformer.load_state_dict(saved_data['model'])
    transformer.eval()  # disable dropout for inference
    return transformer
import torch


def data_gen(n_vocab, batch_size, n_batch, device):
    """Yield `n_batch` synthetic copy-task batches.

    Each sample is a length-10 sequence of random token ids in [2, n_vocab),
    with position 0 forced to 1 (start token) and the last two positions
    forced to 0 (padding); source and target are identical.
    """
    for _ in range(n_batch):
        seqs = torch.randint(2, n_vocab, [batch_size, 10])
        seqs[:, 0] = 1
        seqs[:, -2:] = 0
        seqs = seqs.to(device)
        yield Batch(seqs, seqs)


if __name__ == '__main__':
    n_vocab = 10
    model = Transformer(n_vocab)
    criterion = LabelSmoothing(n_vocab, 0.)  # smoothing=0: plain cross-entropy
    optimizer = scheduled_adam_optimizer(model)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    for epoch in range(10):
        print("Epoch: {}".format(epoch))
        data_iter = data_gen(n_vocab, 128, 10000, device)
        run_epoch(data_iter, model, criterion, optimizer)
    # Greedy-decoding setup: one source sequence and a target buffer seeded
    # with the start token.
    in_seq = torch.LongTensor([[1, 7, 5, 2, 3, 4, 5, 0]]).to(device)
    out_seq = torch.zeros([1, 20], dtype=torch.int64).to(device)
    out_seq[:, 0] = 1
    model.eval()
def run(model_dir, max_len, source_train_path, target_train_path,
        source_val_path, target_val_path, enc_max_vocab, dec_max_vocab,
        encoder_emb_size, decoder_emb_size, encoder_units, decoder_units,
        batch_size, epochs, learning_rate, decay_step, decay_percent,
        log_interval, save_interval, compare_interval):
    """Train a Transformer NMT model end-to-end using the (legacy) ignite
    ``Trainer`` / ``TrainingEvents`` API.

    Builds the dataset iterators and vocabularies, constructs the model,
    optimizer and LR schedule, then wires logging / checkpointing /
    prediction-printing hooks onto the trainer and runs for ``epochs`` epochs.
    """
    train_iter, val_iter, source_vocab, target_vocab = create_dataset(
        batch_size, enc_max_vocab, dec_max_vocab, source_train_path,
        target_train_path, source_val_path, target_val_path)
    transformer = Transformer(max_length=max_len,
                              enc_vocab=source_vocab,
                              dec_vocab=target_vocab,
                              enc_emb_size=encoder_emb_size,
                              dec_emb_size=decoder_emb_size,
                              enc_units=encoder_units,
                              dec_units=decoder_units)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(transformer.parameters(), lr=learning_rate)
    # Decay LR by `decay_percent` every `decay_step` scheduler steps.
    lr_decay = StepLR(opt, step_size=decay_step, gamma=decay_percent)
    if torch.cuda.is_available():
        transformer.cuda()
        loss_fn.cuda()

    def training_update_function(batch):
        # One optimization step over a single batch; returns data the logging
        # hooks consume: (softmaxed preds, loss value, targets).
        transformer.train()
        # NOTE(review): scheduler stepped once per *iteration*, not per epoch —
        # decay_step is therefore in iterations; confirm that is intended.
        lr_decay.step()
        opt.zero_grad()
        softmaxed_predictions, predictions = transformer(batch.src, batch.trg)
        # Flatten to (batch*seq_len, vocab) vs (batch*seq_len,) for CE loss.
        flattened_predictions = predictions.view(-1, len(target_vocab.itos))
        flattened_target = batch.trg.view(-1)
        loss = loss_fn(flattened_predictions, flattened_target)
        loss.backward()
        opt.step()
        # NOTE(review): `.data[0]` is the pre-0.4 PyTorch scalar idiom; on
        # modern PyTorch this should be `loss.item()` — confirm target version.
        return softmaxed_predictions.data, loss.data[0], batch.trg.data

    def validation_inference_function(batch):
        # Forward-only pass returning the scalar validation loss.
        transformer.eval()
        softmaxed_predictions, predictions = transformer(batch.src, batch.trg)
        flattened_predictions = predictions.view(-1, len(target_vocab.itos))
        flattened_target = batch.trg.view(-1)
        loss = loss_fn(flattened_predictions, flattened_target)
        return loss.data[0]

    trainer = Trainer(train_iter, training_update_function, val_iter,
                      validation_inference_function)
    # Resume from the latest checkpoint (if any) before training starts.
    trainer.add_event_handler(TrainingEvents.TRAINING_STARTED,
                              restore_checkpoint_hook(transformer, model_dir))
    # Log a 10-iteration moving average of the training loss every
    # `log_interval` iterations; history[1] extracts the loss element of the
    # (preds, loss, targets) tuple returned above.
    trainer.add_event_handler(TrainingEvents.TRAINING_ITERATION_COMPLETED,
                              log_training_simple_moving_average,
                              window_size=10,
                              metric_name="CrossEntropy",
                              should_log=lambda trainer: trainer.current_iteration % log_interval == 0,
                              history_transform=lambda history: history[1])
    # Periodic checkpointing.
    trainer.add_event_handler(TrainingEvents.TRAINING_ITERATION_COMPLETED,
                              save_checkpoint_hook(transformer, model_dir),
                              should_save=lambda trainer: trainer.current_iteration % save_interval == 0)
    # Periodically print decoded predictions for eyeballing quality.
    trainer.add_event_handler(TrainingEvents.TRAINING_ITERATION_COMPLETED,
                              print_current_prediction_hook(target_vocab),
                              should_print=lambda trainer: trainer.current_iteration % compare_interval == 0)
    trainer.add_event_handler(TrainingEvents.VALIDATION_COMPLETED,
                              log_validation_simple_moving_average,
                              window_size=10,
                              metric_name="CrossEntropy")
    # Always save a final checkpoint when training completes.
    trainer.add_event_handler(TrainingEvents.TRAINING_COMPLETED,
                              save_checkpoint_hook(transformer, model_dir),
                              should_save=lambda trainer: True)
    trainer.run(max_epochs=epochs, validate_every_epoch=True)
def test_optimizer(self):
    """Smoke test: a scheduled Adam optimizer can be constructed for a model."""
    net = Transformer(6)
    opt = scheduled_adam_optimizer(net)
def run(model_dir, max_len, source_train_path, target_train_path,
        source_val_path, target_val_path, enc_max_vocab, dec_max_vocab,
        encoder_emb_size, decoder_emb_size, encoder_units, decoder_units,
        batch_size, epochs, learning_rate, decay_step, decay_percent,
        val_interval, save_interval, compare_interval):
    """Train a Transformer NMT model using the modern ignite ``Engine`` API.

    Sets up data, model, optimizer and LR schedule, attaches running-average
    metrics, a progress bar, periodic validation and checkpointing, then runs
    training for ``epochs`` epochs. Validation results are appended to
    ``validation.log``.
    """
    logging.basicConfig(filename="validation.log",
                        filemode="w",
                        level=logging.INFO)
    train_iter, val_iter, source_vocab, target_vocab = create_dataset(
        batch_size, enc_max_vocab, dec_max_vocab, source_train_path,
        target_train_path, source_val_path, target_val_path)
    transformer = Transformer(max_length=max_len,
                              enc_vocab=source_vocab,
                              dec_vocab=target_vocab,
                              enc_emb_size=encoder_emb_size,
                              dec_emb_size=decoder_emb_size,
                              enc_units=encoder_units,
                              dec_units=decoder_units)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(transformer.parameters(), lr=learning_rate)
    # Decay LR by `decay_percent` every `decay_step` scheduler steps.
    lr_decay = StepLR(opt, step_size=decay_step, gamma=decay_percent)
    if torch.cuda.is_available():
        transformer.cuda()
        loss_fn.cuda()

    def training_step(engine, batch):
        # One optimization step; the returned float feeds the "train_loss"
        # RunningAverage metric.
        transformer.train()
        # NOTE(review): scheduler stepped once per iteration, not per epoch —
        # confirm decay_step is meant in iterations.
        lr_decay.step()
        opt.zero_grad()
        _, predictions = transformer(batch.src, batch.trg)
        # Flatten to (batch*seq_len, vocab) vs (batch*seq_len,) for CE loss.
        flattened_predictions = predictions.view(-1, len(target_vocab.itos))
        flattened_target = batch.trg.view(-1)
        loss = loss_fn(flattened_predictions, flattened_target)
        loss.backward()
        opt.step()
        return loss.cpu().item()

    def validation_step(engine, batch):
        # Forward-only pass; accumulates predictions/targets across batches
        # in engine.state.output so the validation hook can decode them.
        transformer.eval()
        with torch.no_grad():
            softmaxed_predictions, predictions = transformer(
                batch.src, batch.trg)
            flattened_predictions = predictions.view(-1,
                                                     len(target_vocab.itos))
            flattened_target = batch.trg.view(-1)
            loss = loss_fn(flattened_predictions, flattened_target)
            # NOTE(review): accumulating into engine.state.output relies on
            # ignite resetting output between runs — verify it does not grow
            # across validation epochs.
            if not engine.state.output:
                predictions = softmaxed_predictions.argmax(
                    -1).cpu().numpy().tolist()
                targets = batch.trg.cpu().numpy().tolist()
            else:
                predictions = engine.state.output[
                    "predictions"] + softmaxed_predictions.argmax(
                        -1).cpu().numpy().tolist()
                targets = engine.state.output["targets"] + batch.trg.cpu(
                ).numpy().tolist()
        return {
            "loss": loss.cpu().item(),
            "predictions": predictions,
            "targets": targets
        }

    trainer = Engine(training_step)
    evaluator = Engine(validation_step)
    # Keep the 10 most recent checkpoints, saved every `save_interval` events.
    checkpoint_handler = ModelCheckpoint(model_dir,
                                         "Transformer",
                                         save_interval=save_interval,
                                         n_saved=10,
                                         require_empty=False)
    # Average per-iteration wall time over each epoch.
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 step=Events.ITERATION_COMPLETED)

    # Attach training metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "train_loss")

    # Attach validation metrics
    RunningAverage(output_transform=lambda x: x["loss"]).attach(
        evaluator, "val_loss")

    pbar = ProgressBar()
    pbar.attach(trainer, ["train_loss"])

    # trainer.add_event_handler(Events.TRAINING_STARTED,
    #                           restore_checkpoint_hook(transformer, model_dir))
    # Run validation every `val_interval` training iterations and log results.
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              handler=validation_result_hook(
                                  evaluator,
                                  val_iter,
                                  target_vocab,
                                  val_interval,
                                  logger=logging.info))
    # Checkpoint model, optimizer and scheduler together under one key.
    trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED,
                              handler=checkpoint_handler,
                              to_save={
                                  "nmt": {
                                      "transformer": transformer,
                                      "opt": opt,
                                      "lr_decay": lr_decay
                                  }
                              })

    # Run the prediction
    trainer.run(train_iter, max_epochs=epochs)
from modules.transformer import Transformer, create_masks
import tensorflow as tf

if __name__ == '__main__':
    # Build a small Transformer and run a single forward pass on random data
    # as a shape sanity-check.
    sample_transformer = Transformer(num_layers=2,
                                     d_model=512,
                                     num_heads=8,
                                     dff=2048,
                                     input_size=50,
                                     output_size=512,
                                     pe_input=10000,
                                     pe_target=6000)

    # Batch of 3 token sequences (length 62, ids < 20) and dense targets.
    temp_input = tf.random.uniform((3, 62), maxval=20, dtype=tf.int32)
    temp_target = tf.random.uniform((3, 90, 512))

    masks = create_masks(temp_input, temp_target)
    enc_padding_mask, combined_mask, dec_padding_mask = masks

    outputs = sample_transformer(temp_input,
                                 temp_target,
                                 training=True,
                                 enc_padding_mask=enc_padding_mask,
                                 look_ahead_mask=combined_mask,
                                 dec_padding_mask=dec_padding_mask)
    prenet_output, stops, post_output, attention_weights = outputs

    print(post_output.shape)  # (batch_size, tar_seq_len, target_vocab_size)