def main():
    # load vocabulary
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # build models
    encoder = EncoderCNN(300)
    decoder = FactoredLSTM(300, 512, 512, len(vocab))
    encoder.load_state_dict(torch.load('pretrained_models/encoder-15.pkl'))
    decoder.load_state_dict(torch.load('pretrained_models/decoder-15.pkl'))

    # prepare images
    transform = transforms.Compose([
        Rescale((224, 224)),
        transforms.ToTensor()
    ])
    img_names, img_list = load_sample_images('sample_images/', transform)
    image = to_var(img_list[30], volatile=True)

    # if torch.cuda.is_available():
    #     encoder = encoder.cuda()
    #     decoder = decoder.cuda()

    # forward
    features = encoder(image)
    output = decoder.sample(features, mode="factual")
    caption = [vocab.i2w[x] for x in output]

    print(img_names[30])
    print(caption)
def train_attention_captioner():
    print("Training The Attention Captioner ... ")

    # Create model directory
    if not os.path.exists(path_trained_model):
        os.makedirs(path_trained_model)

    # Image preprocessing: first resize the input image, then normalize for the pretrained ResNet
    transform = transforms.Compose([
        transforms.Resize((input_resnet_size, input_resnet_size),
                          interpolation=Image.ANTIALIAS),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load the pickled dictionary
    with open(dict_path, 'rb') as file:
        dictionary = pickle.load(file)

    # Build data loader
    data_loader = get_loader(imgs_path, data_caps, dictionary, transform,
                             BATCH_SIZE, shuffle=True, num_workers=2)

    # Build the models
    encoder = EncoderCNN(word_embedding_size).to(device)
    attn_decoder = AttnDecoderRNN(word_embedding_size, len(dictionary[0])).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(attn_decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=LEARN_RATE)

    word2idx = dictionary[0]

    # Initialize the decoder input and hidden state
    decoder_input = torch.tensor([[word2idx['START']]]).to(device)
    decoder_hidden = torch.zeros(word_embedding_size).to(device)

    total_steps = len(data_loader)
    for epoch in range(NUM_EPOCHS):
        for i, (images, captions, lengths) in enumerate(data_loader):
            print(images.size(), captions.size(), len(lengths))

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            features = encoder(images)
            decoder_output, decoder_hidden, attn_weights = attn_decoder(
                decoder_input, decoder_hidden, features)
def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
    super(PolicyNet, self).__init__()
    self.embed_size = embed_size
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.vocab = vocab
    self.CNNp = EncoderCNN(embed_size)
    self.RNNp = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu:', args.gpu)

    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # preprocess
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.random_crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # dataset
    coco_loader = get_dataloader(root=args.dataset_path,
                                 json_path=args.json_path,
                                 vocab=vocab,
                                 batch_size=args.batch_size,
                                 num_workers=args.num_workers,
                                 transform=preprocess,
                                 shuffle=False)

    # models
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size, args.num_layers).cuda()
    loss_cls = nn.CrossEntropyLoss().cuda()
    params = list(encoder.fc.parameters()) + list(encoder.bn1d.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # resume
    if args.resume:
        model_states = torch.load(os.path.join(args.save_model_path, 'model.ckpt'))
        print('checkpoint epoch: %d\tstep: %d' % (model_states['epoch'], model_states['step']))
        encoder.load_state_dict(model_states['encoder'])
        decoder.load_state_dict(model_states['decoder'])
        print('load successfully')

    # train
    total_step = len(coco_loader)
    print('total step in each epoch : ', total_step)

    encoder.fc.train(mode=True)
    encoder.bn1d.train(mode=True)
    encoder.encoder.eval()
    decoder.train(mode=True)

    input('ready')
    for cur_epoch in range(args.num_epochs):
        for cur_step, (image, caption, length) in enumerate(coco_loader):
            image = image.cuda()
            caption = caption.cuda()
            target = pack_padded_sequence(caption, length, batch_first=True)[0]

            out = decoder(encoder(image), caption, length)
            loss = loss_cls(out, target)

            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            optimizer.step()

            if (cur_step + 1) % args.print_step == 0:
                print('Epoch : %d/%d\tStep : %d/%d\tLoss : %.8f\tPerplexity : %.8f' % (
                    cur_epoch + 1, args.num_epochs, cur_step + 1, total_step,
                    loss.item(), np.exp(loss.item())))

            if (cur_step + 1) % args.save_model_step == 0:
                torch.save({'epoch': cur_epoch + 1,
                            'step': cur_step + 1,
                            'encoder': encoder.state_dict(),
                            'decoder': decoder.state_dict()},
                           os.path.join(args.save_model_path, 'model.ckpt'))
                print('model saved at E:%d\tS:%d' % (cur_epoch + 1, cur_step + 1))
def main():
    with open("data/vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    img_path = "data/flickr7k_images"
    cap_path = "data/factual_train.txt"
    styled_path = "data/humor/funny_train.txt"

    data_loader = get_data_loader(img_path, cap_path, vocab, 3)
    styled_data_loader = get_styled_data_loader(styled_path, vocab, 3)

    encoder = EncoderCNN(30)
    decoder = FactoredLSTM(30, 40, 40, len(vocab))

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # for i, (images, captions, lengths) in enumerate(data_loader):
    for i, (captions, lengths) in enumerate(styled_data_loader):
        # images = Variable(images, volatile=True)
        captions = Variable(captions.long())

        if torch.cuda.is_available():
            # images = images.cuda()
            captions = captions.cuda()

        # features = encoder(images)
        outputs = decoder(captions, features=None, mode="humorous")
        print(lengths - 1)
        print(outputs)
        print(captions[:, 1:])
        loss = masked_cross_entropy(outputs, captions[:, 1:].contiguous(), lengths - 1)
        print(loss)
        break
def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
    super(ValueNet, self).__init__()
    self.embed_size = embed_size
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.vocab = vocab
    self.CNNv = EncoderCNN(embed_size)
    self.RNNv = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
    self.fc1 = nn.utils.weight_norm(nn.Linear(embed_size * 2, embed_size))
    self.fc2 = nn.utils.weight_norm(nn.Linear(embed_size, embed_size))
    self.fc3 = nn.utils.weight_norm(nn.Linear(embed_size, 1))
    self.relu = nn.LeakyReLU(0.2, inplace=True)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)
def main(args): print("Process %s, running on %s: starting (%s)" % ( os.getpid(), os.name, time.asctime())) encoder = EncoderCNN() decoder = DecoderRNN() if torch.cuda.is_available() and args.gpu: encoder = encoder.cuda() decoder = decoder.cuda() encoder_trainables = [p for p in encoder.parameters() if p.requires_grad] decoder_trainables = [p for p in decoder.parameters() if p.requires_grad] params = encoder_trainables + decoder_trainables transform = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) data_loader = trainloader(transform=transform) optimizer = torch.optim.SGD(params=params, lr=args.lr, momentum=0.9)
def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu :', args.gpu)

    # preprocess
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])

    # vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # model
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size, args.num_layers).cuda()
    model_state = torch.load(args.checkpoint_path)
    encoder.load_state_dict(model_state['encoder'])
    decoder.load_state_dict(model_state['decoder'])
    print('load successfully at\tepoch:%d\tstep:%d' % (model_state['epoch'], model_state['step']))
    encoder.eval()
    decoder.eval()

    # image
    img = load_image(args.img_path, preprocess).cuda()
    outs = decoder.sample(encoder(img))
    outs = outs.cpu().numpy()
    print(outs)

    # caption
    caption = []
    for word_id in outs:
        word = vocab.idx2word[word_id]
        caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(caption)
    print(sentence)
    return image


# Image preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Load vocabulary wrapper
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

# Build models
encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
encoder = encoder.to(device)
decoder = decoder.to(device)

# Load the trained model parameters
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))

# Prepare an image
image = load_image(image_path, transform)
image_tensor = image.to(device)

# Generate a caption from the image
feature = encoder(image_tensor)
sampled_ids = decoder.sample(feature)
else:
    data_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),  # using ImageNet norms
                             (0.229, 0.224, 0.225))
    ])
    test_lines = read_lines(TOKEN_FILE_TEST)
    test_image_ids, test_cleaned_captions = parse_lines(test_lines)

    # load models
    encoder = EncoderCNN().to(device)
    decoder = torch.load("decoder.ckpt").to(device)
    encoder.eval()
    decoder.eval()  # generate caption in eval mode so batchnorm is not influenced

    #########################################################################
    #
    #        QUESTION 2.1 Generating predictions on test data
    #
    #########################################################################
    # TODO define decode_caption() function in utils.py
    image_id_candidate_reference = {}  # type: dict[str, dict[str, list[str]]]

    import os
    if os.path.exists("image_id_candidate_reference.pt"):
        image_id_candidate_reference = torch.load(
def script(args):
    transform = transforms.Compose([
        transforms.Resize(args.img_size),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])

    train_loader, vocab = get_loader(args.root_dir, args.train_tsv_path,
                                     args.image_path, transform,
                                     args.batch_size, args.shuffle,
                                     args.num_workers)
    vocab_size = len(vocab)
    print("vocab_size: ", vocab_size)

    val_loader, _ = get_loader(args.root_dir, args.val_tsv_path,
                               args.image_path, transform, args.batch_size,
                               args.shuffle, args.num_workers, vocab)

    encoderCNN = EncoderCNN().to(args.device)
    sentLSTM = SentenceLSTM(encoderCNN.enc_dim, args.sent_hidden_dim,
                            args.att_dim, args.sent_input_dim,
                            args.word_input_dim, args.int_stop_dim).to(args.device)
    wordLSTM = WordLSTM(args.word_input_dim, args.word_hidden_dim, vocab_size,
                        args.num_layers).to(args.device)

    criterion_stop = nn.CrossEntropyLoss().to(args.device)
    criterion_words = nn.CrossEntropyLoss().to(args.device)

    params_cnn = list(encoderCNN.parameters())
    params_lstm = list(sentLSTM.parameters()) + list(wordLSTM.parameters())

    optim_cnn = torch.optim.Adam(params=params_cnn, lr=args.learning_rate_cnn)
    optim_lstm = torch.optim.Adam(params=params_lstm, lr=args.learning_rate_lstm)

    total_step = len(train_loader)

    evaluate(args, val_loader, encoderCNN, sentLSTM, wordLSTM, vocab)

    for epoch in range(args.num_epochs):
        encoderCNN.train()
        sentLSTM.train()
        wordLSTM.train()

        for i, (images, captions, prob) in enumerate(train_loader):
            optim_cnn.zero_grad()
            optim_lstm.zero_grad()

            batch_size = images.shape[0]
            images = images.to(args.device)
            captions = captions.to(args.device)
            prob = prob.to(args.device)

            vis_enc_output = encoderCNN(images)
            topics, ps = sentLSTM(vis_enc_output, captions, args.device)

            loss_sent = criterion_stop(ps.view(-1, 2), prob.view(-1))

            loss_word = torch.tensor([0.0]).to(args.device)
            for j in range(captions.shape[1]):
                word_outputs = wordLSTM(topics[:, j, :], captions[:, j, :])
                loss_word += criterion_words(
                    word_outputs.contiguous().view(-1, vocab_size),
                    captions[:, j, :].contiguous().view(-1))

            loss = args.lambda_sent * loss_sent + args.lambda_word * loss_word

            loss.backward()
            optim_cnn.step()
            optim_lstm.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.num_epochs, i, total_step, loss.item()))

            ## Save the model checkpoints
            # if (i+1) % args.save_step == 0:
            #     torch.save(decoder.state_dict(), os.path.join(
            #         args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
            #     torch.save(encoder.state_dict(), os.path.join(
            #         args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))

        evaluate(args, val_loader, encoderCNN, sentLSTM, wordLSTM, vocab)
# Load Vocabulary Wrapper
with open('./data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# Build Dataset Loader
train_loader = get_loader(train_image_path, train_json_path, vocab, transform,
                          batch_size=batch_size, shuffle=True, num_workers=2)
total_step = len(train_loader)

# Build Models
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
encoder.cuda()
decoder.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

# Train the Decoder
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(train_loader):
        # Set mini-batch dataset
        images = Variable(images).cuda()
        captions = Variable(captions).cuda()
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
def main():
    cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = get_parser().parse_args()

    NUM_WORKERS = 4
    CROP_SIZE = 256
    NUM_PIXELS = 64
    ENCODER_SIZE = 2048
    learning_rate = args.lr
    start_epoch = 0
    max_BLEU = 0

    vocab = pickle.load(open('vocab.p', 'rb'))

    train_transform = transforms.Compose([
        transforms.RandomCrop(CROP_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.444, 0.421, 0.385), (0.285, 0.277, 0.286))
    ])
    val_transform = transforms.Compose([
        transforms.CenterCrop(CROP_SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.444, 0.421, 0.385), (0.285, 0.277, 0.286))
    ])

    train_loader = torch.utils.data.DataLoader(
        dataset=Custom_Flickr30k('../flickr30k-images',
                                 '../flickr30k-captions/results_20130124.token',
                                 vocab, transform=train_transform, train=True),
        batch_size=args.batch_size, shuffle=True, num_workers=NUM_WORKERS,
        collate_fn=collate_fn)
    val_loader = torch.utils.data.DataLoader(
        dataset=Custom_Flickr30k('../flickr30k-images',
                                 '../flickr30k-captions/results_20130124.token',
                                 vocab, transform=val_transform, train=False),
        batch_size=args.batch_size, shuffle=False, num_workers=NUM_WORKERS,
        collate_fn=collate_fn)

    # Initialize models
    encoder = EncoderCNN(args.fine_tune).to(device)
    decoder = DecoderRNNwithAttention(len(vocab), args.embed_size, args.hid_size,
                                      1, args.attn_size, ENCODER_SIZE,
                                      NUM_PIXELS, dropout=args.drop).to(device)

    # Initialize optimization
    criterion = torch.nn.CrossEntropyLoss()
    if args.fine_tune:
        params = list(encoder.parameters()) + list(decoder.parameters())
    else:
        params = list(decoder.parameters()) + list(
            encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            max_BLEU = checkpoint['max_BLEU']
            encoder.load_state_dict(checkpoint['encoder'])
            decoder.load_state_dict(checkpoint['decoder'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    XEntropy = AverageMeter()
    PPL = AverageMeter()

    # Save
    if not args.resume:
        file = open(f'{args.save}/results.txt', 'a')
        file.write('Loss,PPL,BLEU \n')
        file.close()

    for epoch in range(start_epoch, args.epoch):
        print('Epoch {}'.format(epoch + 1))
        print('training...')
        for i, (images, captions, lengths) in enumerate(train_loader):
            # Batch to device
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            encoder.train()
            decoder.train()

            features = encoder(images)
            predictions, attention_weights = decoder(features, captions, lengths)

            scores = pack_padded_sequence(predictions[:, :-1, :],
                                          torch.tensor(lengths) - 2,
                                          batch_first=True).cpu()
            targets = pack_padded_sequence(captions[:, 1:-1],
                                           torch.tensor(lengths) - 2,
                                           batch_first=True).cpu()
            loss = criterion(scores.data, targets.data)

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            XEntropy.update(loss.item(), len(lengths))
            PPL.update(np.exp(loss.item()), len(lengths))

        print('Train Perplexity = {}'.format(PPL.avg))

        if epoch % 50 == 0:
            learning_rate /= 5
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        print('validating...')
        curr_BLEU = bleu_eval(encoder, decoder, val_loader, args.batch_size, device)
        is_best = curr_BLEU > max_BLEU
        max_BLEU = max(curr_BLEU, max_BLEU)

        save_checkpoint({
            'epoch': epoch + 1,
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'max_BLEU': max_BLEU,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save)

        print('Validation BLEU = {}'.format(curr_BLEU))

        # Save
        file = open(f'{args.save}/results.txt', 'a')
        file.write('{},{},{} \n'.format(XEntropy.avg, PPL.avg, curr_BLEU))
        file.close()
def main(args):
    # defining torch configurations
    # torch.manual_seed(args.seed)
    # torch.cuda.manual_seed(args.seed)
    # torch.backends.cudnn.benchmark = True

    # extract weights from the weight matrices
    weights = np.load(args.file_name)

    # CUDA for PyTorch
    # if cuda:
    device = 3
    torch.cuda.set_device(device)
    # device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
    # use_cuda = torch.cuda.is_available()
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # defining dictionary and VQAFeatureDataset
    # transforms for the pretrained network (transform for ResNet now)
    train_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    validate_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dataset = VQADataset(image_root_dir=args.img_root_dir,
                               dictionary=dictionary,
                               dataroot=args.data_root_dir,
                               choice='train',
                               transform_set=train_transform)
    # eval_dataset = VQADataset(image_root_dir=args.img_root_dir, dictionary=dictionary,
    #                           dataroot=args.data_root_dir, choice='val',
    #                           transform_set=validate_transform)

    # model definition
    print('Loading the models')
    image_encoder = EncoderCNN(embed_size=args.img_feats).to(device)
    question_encoder = EncoderLSTM(hidden_size=args.num_hid,
                                   weights_matrix=weights,
                                   fc_size=args.q_embed,
                                   max_seq_length=args.max_sequence_length,
                                   batch_size=args.batch_size).to(device)
    fusion_network = FusionModule(qnetwork=question_encoder,
                                  img_network=image_encoder,
                                  fuse_embed_size=args.fuse_embed,
                                  input_fc_size=args.img_feats,
                                  class_size=args.num_class).to(device)
    # print(list(fusion_network.parameters()))
    print(fusion_network)
    # input()

    # Dataloader initialization
    train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True, num_workers=12)
    # eval_loader = DataLoader(eval_dataset, args.batch_size, shuffle=True, num_workers=1)

    # Loss and optimizer
    criterion = nn.NLLLoss()
    # params = list(image_encoder.linear.parameters()) + list(image_encoder.bn.parameters()) + \
    #          list(question_encoder.parameters()) + list(fusion_network.parameters())
    optimizer = torch.optim.Adam(fusion_network.parameters(), lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    step = 0
    # Training starts
    # print('Training Starting ......................')

    def evaluate_val(model, train_loader, criterion, device):
        loss = 0
        accuracy = 0
        with torch.no_grad():
            for image_sample, question_token, labels in iter(train_loader):
                image_sample, question_token, labels = image_sample.to(device), \
                    question_token.to(device), labels.to(device)
                output = model.forward(question_token, image_sample)
                loss += criterion(output, labels).item()
                ps = torch.exp(output)
                equality = (labels.data == ps.max(dim=1)[1])
                accuracy += equality.type(torch.FloatTensor).mean()
        return loss, accuracy

    file_train = open('train_loss_log.txt', 'a+')
    loss_save = []

    for epoch in range(args.epochs):
        running_loss = 0.0
        running_corrects = 0
        step = 0
        for data in tqdm(train_loader):
            image_samp, question_toks, labels = data
            image_samp = image_samp.to(device)
            question_toks = question_toks.to(device)
            labels = labels.to(device)

            class_outputs = fusion_network(question_toks, image_samp)
            _, preds = torch.max(class_outputs, 1)
            loss = criterion(class_outputs, labels)

            # question_encoder.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # print('Enter some key')
            # input()

            # statistics
            running_loss += loss.item() * image_samp.size(0)
            running_corrects += torch.sum(preds == labels.data)
            if step % 300 == 0:
                # optimizer.zero_grad()
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.epochs, step, total_step, loss.item()))
            step = step + 1

        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)
        print(epoch_loss)
        # loss_save.append(val_loss)

        val_loss, accuracy = evaluate_val(fusion_network, train_loader, criterion, device)
        string = 'Epoch {}:{} loss: {} \t'.format(epoch, args.epochs, running_loss)
        string += 'Accuracy : {}'.format(accuracy)
        file_train.write(string)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format('train', epoch_loss, epoch_acc))

    file_train.close()
def train_captioner():
    print("Training The Captioner ... ")

    # Create model directory
    if not os.path.exists(path_trained_model):
        os.makedirs(path_trained_model)

    # Image preprocessing: first resize the input image, then normalize for the pretrained ResNet
    transform = transforms.Compose([
        transforms.Resize((input_resnet_size, input_resnet_size),
                          interpolation=Image.ANTIALIAS),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Loading the dictionary (as binary data)
    with open(dict_path, 'rb') as file:
        dictionary = pickle.load(file)

    # Build data loader
    data_loader = get_loader(imgs_path, data_caps, dictionary, transform,
                             BATCH_SIZE, shuffle=True, num_workers=2)

    # Build the models
    encoder = EncoderCNN(word_embedding_size).to(device)
    decoder = DecoderRNN(word_embedding_size, lstm_output_size,
                         len(dictionary[0]), num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=LEARN_RATE)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(NUM_EPOCHS):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % 20 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, NUM_EPOCHS, i, total_step, loss.item()))

        # Save models after each epoch ...
        torch.save(decoder.state_dict(),
                   os.path.join(path_trained_model,
                                'captioner{}.ckpt'.format(epoch + 1)))
        torch.save(encoder.state_dict(),
                   os.path.join(path_trained_model,
                                'feature-extractor-{}.ckpt'.format(epoch + 1)))
def test_captioner(show_images=False):
    # Load vocabulary wrapper
    with open(dict_path, 'rb') as file:
        dictionary = pickle.load(file)

    # Build models
    encoder = EncoderCNN(word_embedding_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(word_embedding_size, lstm_output_size,
                         len(dictionary[0]), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder_model_path = os.path.join(
        path_trained_model, feature_gen_path + str(NUM_EPOCHS) + model_extension)
    decoder_model_path = os.path.join(
        path_trained_model, caption_gen_path + str(NUM_EPOCHS) + model_extension)
    encoder.load_state_dict(torch.load(encoder_model_path))
    print("Feature Extractor Model Loaded Successfully")
    decoder.load_state_dict(torch.load(decoder_model_path))
    print("Caption Generator Loaded Successfully")

    # Open the caption output file
    output_file = open(captions_save_path, 'w')

    for iter, data_img in enumerate(data_imgs):
        img_path = os.path.join(imgs_path, data_img['file_name'])  ### Change ###
        inp_img = Image.open(img_path)
        # The model expects a batch dimension ...
        image_tensor = transform(inp_img).unsqueeze(0).to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        idx2word = dictionary[1]

        # Convert word_ids to words
        sampled_caption = []
        for idx in sampled_ids:
            word = idx2word[idx]
            sampled_caption.append(word)
            if word == 'END':
                break
        sentence = ' '.join(sampled_caption)

        # Write the caption to the output file
        output_file.write(sentence + "\n")

        # Print out the image and the generated caption
        print("Caption: ", sentence)
        if show_images:
            image = cv.imread(img_path, cv.IMREAD_COLOR)
            window_name = "Sample Image with Caption as Overlay"
            cv.imshow(window_name, image)
            cv.displayOverlay(window_name, sentence)
            cv.waitKey(0)

        if iter == 10:
            return
vocab_path = 'vocab.pkl'

# Load vocabulary wrapper
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

data_loader = get_loader(image_dir, caption_path, vocab, transform,
                         batch_size, shuffle=True, num_workers=num_workers)

# Build models
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(
    encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)

# Train the models
total_step = len(data_loader)
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(data_loader):
        # Set mini-batch dataset
dataset_train = Flickr8k_Images(
    image_ids=image_ids,
    transform=data_transform,
)
train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

# device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EncoderCNN().to(device)

#########################################################################
#
#        QUESTION 1.2 Extracting image features
#
#########################################################################
features = []

# TODO loop through all image data, extracting features and saving them
# no gradients needed
with torch.no_grad():
    model.eval()
    for data in tqdm(train_loader):
        data = data.to(device)
        features.append(model(data))
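# Note (addition, not part of the original snippet): once the loop above has
# filled `features` with one tensor per batch, a minimal sketch for stacking
# and persisting them could look like the following; the output filename
# "features.pt" is an assumption, not something defined by this project.
# features = torch.cat(features, dim=0)      # (num_images, feature_dim)
# torch.save(features.cpu(), "features.pt")  # reload later with torch.load("features.pt")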
embed_size = decoder_input_params['embed_size']
hidden_size = decoder_input_params['hidden_size']
vocab_size = decoder_input_params['vocab_size']
num_layers = decoder_input_params['num_layers']

## Load Vocab
Obj = s3.get_object(Bucket=S3_BUCKET, Key=VOCAB_PATH)
bytestream = io.BytesIO(Obj['Body'].read())
decoder_vocab = pickle.load(bytestream)
print('decoder_vocab loaded')

# Load Encoder
Obj2 = s3.get_object(Bucket=S3_BUCKET, Key=ENC_PATH)
bytestream = io.BytesIO(Obj2['Body'].read())
encoder_model = EncoderCNN(embed_size)
encoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
print('Encoder loaded')

# Load Decoder
Obj3 = s3.get_object(Bucket=S3_BUCKET, Key=DEC_PATH)
bytestream = io.BytesIO(Obj3['Body'].read())
decoder_model = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
decoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
print('Decoder loaded')

# decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
# decoder.load_state_dict(torch.load(os.path.join(model_save_path, 'decoderdata.pkl')))

encoder_model.eval()
decoder_model.eval()
def train(n_epochs, train_loader, valid_loader, save_location_path,
          embed_size, hidden_size, vocab_size):
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    params = list(decoder.parameters()) + list(encoder.embed.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Make sure the first validation loss beats this value, and
    # save the model according to this comparison
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs + 1):
        # Keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        encoder.train()
        decoder.train()
        for data in train_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            images = images.to(device)
            captions = captions.to(device)

            decoder.zero_grad()
            encoder.zero_grad()

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.size(0)

        encoder.eval()
        decoder.eval()
        for data in valid_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            images = images.to(device)
            captions = captions.to(device)

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            valid_loss += loss.item() * images.size(0)

        # Average losses
        train_loss = train_loss / len(train_loader)
        valid_loss = valid_loss / len(valid_loader)

        print(f"Epoch: {epoch} \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}")

        # Save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(f"Validation loss decreased ({valid_loss_min} --> {valid_loss}). Saving model ...")
            torch.save(encoder.state_dict(),
                       save_location_path + f'/encoder{n_epochs}.pt')
            torch.save(decoder.state_dict(),
                       save_location_path + f'/decoder{n_epochs}.pt')
            valid_loss_min = valid_loss
def main(args):
    model_path = args.model_path
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    img_path = args.img_path
    factual_cap_path = args.factual_caption_path
    humorous_cap_path = args.humorous_caption_path

    # import data_loader
    data_loader = get_data_loader(img_path, factual_cap_path, vocab,
                                  args.caption_batch_size, shuffle=True)
    styled_data_loader = get_styled_data_loader(humorous_cap_path, vocab,
                                                args.language_batch_size,
                                                shuffle=True)

    # import models
    emb_dim = args.emb_dim
    hidden_dim = args.hidden_dim
    factored_dim = args.factored_dim
    vocab_size = len(vocab)
    encoder = EncoderCNN(emb_dim)
    decoder = FactoredLSTM(emb_dim, hidden_dim, factored_dim, vocab_size)

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # loss and optimizer
    criterion = masked_cross_entropy
    cap_params = list(decoder.parameters()) + list(encoder.A.parameters())
    lang_params = list(decoder.parameters())
    optimizer_cap = torch.optim.Adam(cap_params, lr=args.lr_caption)
    optimizer_lang = torch.optim.Adam(lang_params, lr=args.lr_language)

    # train
    total_cap_step = len(data_loader)
    total_lang_step = len(styled_data_loader)
    epoch_num = args.epoch_num
    for epoch in range(epoch_num):
        # caption
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = to_var(images, volatile=True)
            captions = to_var(captions.long())

            # forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(captions, features, mode="factual")
            loss = criterion(outputs[:, 1:, :].contiguous(),
                             captions[:, 1:].contiguous(), lengths - 1)
            loss.backward()
            optimizer_cap.step()

            # print log
            if i % args.log_step_caption == 0:
                print("Epoch [%d/%d], CAP, Step [%d/%d], Loss: %.4f"
                      % (epoch + 1, epoch_num, i, total_cap_step, loss.data.mean()))
                eval_outputs(outputs, vocab)

        # language
        for i, (captions, lengths) in enumerate(styled_data_loader):
            captions = to_var(captions.long())

            # forward, backward and optimize
            decoder.zero_grad()
            outputs = decoder(captions, mode='humorous')
            loss = criterion(outputs, captions[:, 1:].contiguous(), lengths - 1)
            loss.backward()
            optimizer_lang.step()

            # print log
            if i % args.log_step_language == 0:
                print("Epoch [%d/%d], LANG, Step [%d/%d], Loss: %.4f"
                      % (epoch + 1, epoch_num, i, total_lang_step, loss.data.mean()))

        # save models
        torch.save(decoder.state_dict(),
                   os.path.join(model_path, 'decoder-%d.pkl' % (epoch + 1,)))
        torch.save(encoder.state_dict(),
                   os.path.join(model_path, 'encoder-%d.pkl' % (epoch + 1,)))