import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.metrics import balanced_accuracy_score, f1_score


class KeplerModel(pl.LightningModule):

    def __init__(self):
        super(KeplerModel, self).__init__()

        # Initialize model parameters using config properties
        self.model = Encoder(config['seq_length'], config['hidden_size'],
                             config['output_dim'], config['n_layers'])

        # Initialize a cross-entropy loss criterion for training
        self.criterion = torch.nn.CrossEntropyLoss()

    # Define a forward pass of the model
    def forward(self, x, h):
        return self.model.forward(x, h)

    def training_step(self, batch, batch_idx):
        # Set model to training mode
        self.model.train()

        # Unpack data and labels from the batch
        x, y = batch

        # Reshape data into shape (batch_size, 1, seq_length)
        x = x.view(x.size(0), -1, x.size(1))

        # Initialize the hidden state for the forward pass
        h = self.model.init_hidden(x.size(0))

        # Zero out the model gradients to avoid accumulation
        self.model.zero_grad()

        # Forward pass through the model
        out, h = self.forward(x, h)

        # Calculate cross-entropy loss
        loss = self.criterion(out, y.long().squeeze())

        # Obtain predicted class labels
        y_hat = torch.max(out, 1)[1]

        # Compute the balanced accuracy (weighted by the number of examples in each class)
        accuracy = balanced_accuracy_score(y, y_hat)

        # Compute the weighted F1 score to account for class imbalance
        f1 = f1_score(y, y_hat, average='weighted')

        # Create a metrics object for TensorBoard logging
        tensorboard_logs = {
            'train_loss': loss.item(),
            'accuracy': accuracy,
            'f1': f1
        }

        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        # Set model to eval mode
        self.model.eval()

        # Unpack data and labels from the batch
        x, y = batch

        # Initialize the hidden state
        h = self.model.init_hidden(x.size(0))

        # Reshape data into shape (batch_size, 1, seq_length)
        x = x.view(x.size(0), -1, x.size(1))

        # Calculate the forward pass of the model
        out, h = self.forward(x, h)

        # Calculate cross-entropy loss
        loss = self.criterion(out, y.long().squeeze())

        # Calculate predicted class indices
        y_hat = torch.max(out, 1)[1]

        # Calculate balanced accuracy
        val_accuracy = torch.Tensor([balanced_accuracy_score(y, y_hat)])

        # Calculate weighted F1 score
        val_f1 = torch.Tensor([f1_score(y, y_hat, average='weighted')])

        # Create a metrics object
        metrics = {
            'val_loss': loss,
            'val_accuracy': val_accuracy,
            'val_f1': val_f1
        }

        return metrics

    def validation_end(self, outputs):
        # OPTIONAL: aggregate the per-batch validation metrics
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_accuracy'] for x in outputs]).mean()
        avg_f1 = torch.stack([x['val_f1'] for x in outputs]).mean()

        tensorboard_logs = {
            'val_loss': avg_loss,
            'val_acc': avg_acc,
            'val_f1': avg_f1
        }

        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

    @pl.data_loader
    def train_dataloader(self):
        # REQUIRED
        return DataLoader(KeplerDataset(mode="train"), batch_size=64, shuffle=True)

    @pl.data_loader
    def val_dataloader(self):
        # REQUIRED
        return DataLoader(KeplerDataset(mode="test"), batch_size=128, shuffle=True)
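# A minimal sketch of how this module might be trained, assuming `config`,
# `Encoder`, and `KeplerDataset` are defined elsewhere in the project and that
# the installed pytorch-lightning version supports the @pl.data_loader hooks
# used above. The lack of Trainer arguments is illustrative, not the project's
# actual settings.
if __name__ == '__main__':
    model = KeplerModel()
    trainer = pl.Trainer()
    # fit() pulls the train/val dataloaders from the hooks defined on the module
    trainer.fit(model)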
import os

import numpy as np
import torch
import torch.nn as nn


def train():
    model_path = config.get('model_path', './model/')
    log_step = config.get('log_step', 10)
    hidden_size = config.get('decoder_hidden_size', 512)
    num_epochs = config.get('num_epochs', 5)
    alpha_c = config.get('alpha_c', 1)

    # Create the model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # Used for calculating BLEU scores
    references = []
    hypotheses = []

    # Load vocabulary
    vocab = build_and_save_vocab()

    # Build the data loaders
    data_loader = get_loader('train')
    data_loader_valid = get_loader('validate')

    # Build the models, either from scratch or from a checkpoint
    if config.get('checkpoint') is None:
        epochs_since_improvement = config.get('epochs_since_improvement', 0)
        best_score = 0.
        encoder = Encoder(config.get('image_net')).to(device)
        decoder = Decoder(encoder.dim, len(vocab), hidden_size=hidden_size).to(device)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=config.get('encoder_lr', 1e-4))
        decoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, decoder.parameters()),
            lr=config.get('decoder_lr', 1e-3))
    else:
        checkpoint = torch.load(config.get('checkpoint'))
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_score = checkpoint['best_score']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']

    # Loss criterion
    criterion = nn.CrossEntropyLoss()

    # Train the models
    total_step = len(data_loader)
    for epoch in range(num_epochs):
        if epochs_since_improvement == 20:
            print('Reached the max epochs_since_improvement. Training is done.')
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.9)
            adjust_learning_rate(encoder_optimizer, 0.9)

        for i, (images, captions, lengths) in enumerate(data_loader):
            # Move the mini-batch to the training device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            features = encoder(images)
            prediction, alphas = decoder(features, captions)

            # Cross-entropy loss plus doubly stochastic attention regularization
            att_regularization = alpha_c * ((1 - alphas.sum(1)) ** 2).mean()
            loss = criterion(prediction.permute(0, 2, 1), captions) + att_regularization

            # Backward pass and optimization
            decoder_optimizer.zero_grad()
            encoder_optimizer.zero_grad()
            loss.backward()
            decoder_optimizer.step()
            encoder_optimizer.step()

            # Update running metrics, weighted by the total caption length
            total_caption_length = calculate_caption_lengths(vocab.word2idx, captions)
            acc1 = accuracy(prediction.permute(0, 2, 1), captions, 1)
            acc5 = accuracy(prediction.permute(0, 2, 1), captions, 5)
            losses.update(loss.item(), total_caption_length)
            top1.update(acc1, total_caption_length)
            top5.update(acc5, total_caption_length)

            # Print log info
            if i % log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, num_epochs, i, total_step, loss.item(), np.exp(loss.item())))
                print('Top 1 Accuracy {top1.val:.3f} ({top1.avg:.3f}), '
                      'Top 5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'
                      .format(top1=top1, top5=top5))

        # Validate at the end of every epoch
        valid_score = validate(data_loader_valid, encoder, decoder, criterion, vocab)

        # Track improvement for early stopping and learning-rate decay
        if valid_score > best_score:
            best_score = valid_score
            epochs_since_improvement = 0
        else:
            epochs_since_improvement += 1
            print('Epochs since last improvement: {}'.format(epochs_since_improvement))

        # Save a checkpoint every epoch
        state_dict = {
            'epoch': epoch,
            'epochs_since_improvement': epochs_since_improvement,
            'decoder': decoder,
            'decoder_optimizer': decoder_optimizer,
            'encoder': encoder,
            'encoder_optimizer': encoder_optimizer,
            'valid_score': valid_score,
            'best_score': best_score
        }
        filename = 'checkpoint.pth.tar'
        torch.save(state_dict, filename)

        print('Validate score %.3f' % valid_score)
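# The loop above relies on AverageMeter and adjust_learning_rate helpers that are
# defined elsewhere in the project. The following is a minimal sketch of typical
# implementations; it is an assumption and the project's own versions may differ.
class AverageMeter(object):
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self):
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, shrink_factor):
    """Multiplies the learning rate of every parameter group by shrink_factor."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor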
import itertools
import logging
import os
import time

import torch
import torch.optim as optim
from torch.utils.data import DataLoader


def train(train_ids, dev_ids, test_ids, dict_ids, enc_word_alphabet, enc_char_alphabet,
          dec_word_alphabet, dec_char_alphabet, position_alphabet, dictionary):

    # Build the encoder embeddings
    enc_word_emb = initialize_emb(config.get('word_emb'), enc_word_alphabet, opt.word_emb_dim)

    if position_alphabet is not None:
        pos_emb = initialize_emb(None, position_alphabet, opt.pos_emb_dim)
    else:
        pos_emb = None

    if opt.use_char:
        enc_char_emb = initialize_emb(config.get('char_emb'), enc_char_alphabet, opt.char_emb_dim)
    else:
        enc_char_emb = None

    encoder = Encoder(enc_word_emb, pos_emb, enc_char_emb)

    # Build the decoder and the training data loader
    if opt.method == 'cla':
        decoder = AttnNet(dictionary)
        train_loader = DataLoader(MyDataset(train_ids), opt.batch_size,
                                  shuffle=True, collate_fn=my_collate)
    else:
        dec_word_emb = initialize_emb(config.get('word_emb'), dec_word_alphabet, opt.word_emb_dim)
        if opt.use_char:
            dec_char_emb = initialize_emb(config.get('char_emb'), dec_char_alphabet, opt.char_emb_dim)
        else:
            dec_char_emb = None

        decoder = Decoder(dec_word_emb, dec_char_emb, dec_word_alphabet)

        if opt.batch_size != 1:
            raise RuntimeError("currently, only batch size 1 is supported")
        train_loader = DataLoader(MyDataset(train_ids), opt.batch_size,
                                  shuffle=True, collate_fn=my_collate_1)

    optimizer = optim.Adam(itertools.chain(encoder.parameters(), decoder.parameters()),
                           lr=opt.lr, weight_decay=opt.l2)

    # Optionally freeze the word embeddings
    if not opt.tune_wordemb:
        encoder.free_emb()
        decoder.free_emb()

    if opt.pretraining:
        # Pretrain on dictionary entries
        if opt.method == 'cla':
            dict_loader = DataLoader(MyDataset(dict_ids), opt.batch_size,
                                     shuffle=True, collate_fn=my_collate)
        else:
            dict_loader = DataLoader(MyDataset(dict_ids), opt.batch_size,
                                     shuffle=True, collate_fn=my_collate_1)

        logging.info("start dict pretraining ...")
        logging.info("dict pretraining datapoints: {}".format(len(dict_ids)))

        bad_counter = 0
        best_accuracy = 0

        for idx in range(9999):
            epoch_start = time.time()
            encoder.train()
            decoder.train()

            correct, total = 0, 0
            sum_loss = 0

            train_iter = iter(dict_loader)
            num_iter = len(dict_loader)

            for i in range(num_iter):
                if opt.method == 'cla':
                    enc_word_seq_tensor, enc_word_seq_lengths, enc_word_seq_recover, enc_mask, \
                        enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover, label_tensor = next(train_iter)

                    encoder_outputs, _ = encoder.forward_batch(
                        enc_word_seq_tensor, enc_word_seq_lengths,
                        enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover)

                    loss, total_this_batch, correct_this_batch = decoder.forward_train(
                        encoder_outputs, enc_word_seq_lengths, label_tensor)
                else:
                    enc_word_seq_tensor, enc_pos_tensor, \
                        enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover, dec_word_seq_tensor, \
                        label_tensor, dec_char_seq_tensor = next(train_iter)

                    encoder_outputs, encoder_hidden = encoder.forward(
                        enc_word_seq_tensor, enc_pos_tensor,
                        enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover)

                    loss, total_this_batch, correct_this_batch = decoder.forward_train(
                        encoder_outputs, encoder_hidden, dec_word_seq_tensor, label_tensor, dec_char_seq_tensor)

                sum_loss += loss.item()
                loss.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(encoder.parameters(), opt.gradient_clip)
                    torch.nn.utils.clip_grad_norm_(decoder.parameters(), opt.gradient_clip)
                optimizer.step()
                encoder.zero_grad()
                decoder.zero_grad()

                total += total_this_batch
                correct += correct_this_batch

            epoch_finish = time.time()
            accuracy = 100.0 * correct / total
            logging.info("epoch: %s pretraining finished. Time: %.2fs. loss: %.4f Accuracy %.2f" % (
                idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

            if accuracy > opt.expected_accuracy:
                logging.info("Exceed expected training accuracy, breaking ...")
                break

            if accuracy > best_accuracy:
                logging.info("Exceed previous best accuracy: %.2f" % best_accuracy)
                best_accuracy = accuracy
                bad_counter = 0
            else:
                bad_counter += 1

            if bad_counter >= opt.patience:
                logging.info('Pretraining Early Stop!')
                break

    # Main training loop
    best_dev_f = -10
    bad_counter = 0

    logging.info("start training ...")
    logging.info("training datapoints: {}".format(len(train_ids)))
    if dev_ids is not None and len(dev_ids) != 0:
        logging.info("dev datapoints: {}".format(len(dev_ids)))
    if test_ids is not None and len(test_ids) != 0:
        logging.info("test datapoints: {}".format(len(test_ids)))

    for idx in range(opt.iter):
        epoch_start = time.time()
        encoder.train()
        decoder.train()

        train_iter = iter(train_loader)
        num_iter = len(train_loader)

        sum_loss = 0
        correct, total = 0, 0

        for i in range(num_iter):
            if opt.method == 'cla':
                enc_word_seq_tensor, enc_word_seq_lengths, enc_word_seq_recover, enc_mask, \
                    enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover, label_tensor = next(train_iter)

                encoder_outputs, _ = encoder.forward_batch(
                    enc_word_seq_tensor, enc_word_seq_lengths,
                    enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover)

                loss, total_this_batch, correct_this_batch = decoder.forward_train(
                    encoder_outputs, enc_word_seq_lengths, label_tensor)
            else:
                enc_word_seq_tensor, enc_pos_tensor, \
                    enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover, dec_word_seq_tensor, \
                    label_tensor, dec_char_seq_tensor = next(train_iter)

                encoder_outputs, encoder_hidden = encoder.forward(
                    enc_word_seq_tensor, enc_pos_tensor,
                    enc_char_seq_tensor, enc_char_seq_lengths, enc_char_seq_recover)

                loss, total_this_batch, correct_this_batch = decoder.forward_train(
                    encoder_outputs, encoder_hidden, dec_word_seq_tensor, label_tensor, dec_char_seq_tensor)

            sum_loss += loss.item()
            loss.backward()

            if opt.gradient_clip > 0:
                torch.nn.utils.clip_grad_norm_(encoder.parameters(), opt.gradient_clip)
                torch.nn.utils.clip_grad_norm_(decoder.parameters(), opt.gradient_clip)
            optimizer.step()
            encoder.zero_grad()
            decoder.zero_grad()

            total += total_this_batch
            correct += correct_this_batch

        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f" % (
            idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        # Evaluate on the dev set (if available) to drive model selection
        if dev_ids is not None and len(dev_ids) != 0:
            if opt.method == 'cla':
                p, r, f = evaluate_cla(dev_ids, encoder, decoder, dictionary)
            else:
                p, r, f = evaluate(dev_ids, encoder, decoder, dec_word_alphabet, dec_char_alphabet, dictionary)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % best_dev_f)
            best_dev_f = f
            bad_counter = 0

            # Save the best models and alphabets
            torch.save(encoder, os.path.join(opt.output, "encoder.pkl"))
            torch.save(decoder, os.path.join(opt.output, "decoder.pkl"))
            torch.save(enc_word_alphabet, os.path.join(opt.output, "enc_word_alphabet.pkl"))
            torch.save(enc_char_alphabet, os.path.join(opt.output, "enc_char_alphabet.pkl"))
            torch.save(dec_word_alphabet, os.path.join(opt.output, "dec_word_alphabet.pkl"))
            torch.save(dec_char_alphabet, os.path.join(opt.output, "dec_char_alphabet.pkl"))
            torch.save(position_alphabet, os.path.join(opt.output, "position_alphabet.pkl"))

            # Report the test score for the new best model
            if test_ids is not None and len(test_ids) != 0:
                if opt.method == 'cla':
                    p, r, f = evaluate_cla(test_ids, encoder, decoder, dictionary)
                else:
                    p, r, f = evaluate(test_ids, encoder, decoder, dec_word_alphabet, dec_char_alphabet, dictionary)
                logging.info("Test: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            bad_counter += 1

        if bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")
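# Because the best encoder/decoder and the alphabets are saved above as whole
# objects with torch.save, they can be reloaded later with torch.load for
# evaluation. A minimal sketch (this helper is hypothetical, not part of the
# project), assuming the same class definitions are importable and that the
# output directory is the one used during training:
def load_saved_models(output_dir):
    encoder = torch.load(os.path.join(output_dir, "encoder.pkl"))
    decoder = torch.load(os.path.join(output_dir, "decoder.pkl"))
    dec_word_alphabet = torch.load(os.path.join(output_dir, "dec_word_alphabet.pkl"))
    dec_char_alphabet = torch.load(os.path.join(output_dir, "dec_char_alphabet.pkl"))
    # Switch to eval mode before running evaluate()/evaluate_cla()
    encoder.eval()
    decoder.eval()
    return encoder, decoder, dec_word_alphabet, dec_char_alphabet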