def __init__(self, input_size, embedding_size, hidden_size, vocab_size, num_layer):
    super(Model, self).__init__()
    self.encoder = model.EncoderCNN(input_size, embedding_size)
    self.decoder = model.DecoderRNN(embedding_size, hidden_size, vocab_size, num_layer)
    self.criterion = nn.CrossEntropyLoss()
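
# A forward pass for this wrapper is not shown above. A minimal sketch of how
# the pieces typically compose, assuming DecoderRNN consumes image features
# plus teacher-forced captions and that targets are packed to match its
# output (both assumptions, not the original interface):
def forward(self, images, captions, lengths):
    features = self.encoder(images)                      # (B, embedding_size)
    outputs = self.decoder(features, captions, lengths)  # packed token scores
    targets = nn.utils.rnn.pack_padded_sequence(captions, lengths,
                                                batch_first=True)[0]
    return self.criterion(outputs, targets)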
def main(args):
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    with open(args.vocab_path, "rb") as f1, \
            open(args.batched_file_path, "rb") as f2:
        vocab = pickle.load(f1)
        batched_val_set = pickle.load(f2)
    coco_caps = COCO(args.caption_path)
    batched_val_loader = get_loader(args.image_dir, args.caption_path,
                                    batched_val_set, vocab, transform,
                                    shuffle=True, num_workers=3)
    encoder = model.EncoderCNN()
    decoder = model.DecoderRNN(512, 196, 512, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    checkpoint = torch.load(args.load_checkpoint)
    decoder.load_state_dict(checkpoint["state_dict"])
    checkpoint = None
    torch.cuda.empty_cache()
    for i, (images, captions, lengths, ids) in enumerate(batched_val_loader):
        if i == args.num_runs:
            break
        print("\nactual captions for batch " + str(i) + " are: ")
        annIds = coco_caps.getAnnIds(imgIds=ids)
        anns = coco_caps.loadAnns(annIds)
        for ann in anns:
            print(ann["caption"])
        images = to_var(images, volatile=True)
        captions = to_var(captions, volatile=True)
        features = encoder(images)
        results = decoder.sample(features, args.beam_size)
        print("predicted captions are: ")
        for result in results:
            candidate = [vocab(word_id) for word_id in result[1][:-1]]
            references = [nltk.tokenize.word_tokenize(ann["caption"].lower())
                          for ann in anns]
            score = bleu_score.sentence_bleu(references, candidate)
            print("probability: %5.4f, BLEU score: %5.4f, caption: %s"
                  % (result[0], score, caption_id_to_string(result[1], vocab)))
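
# The evaluation loop above relies on two helpers that are not shown. Minimal
# sketches follow, assuming pre-0.4 PyTorch Variables and that calling
# vocab(id) maps an id back to a token (as the loop above does); the names,
# signatures, and <start>/<end> markers are assumptions, not the original code.
from torch.autograd import Variable

def to_var(x, volatile=False):
    # Wrap a tensor in an autograd Variable, moving it to the GPU when one is
    # available; volatile=True disables gradient tracking (old PyTorch API).
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)

def caption_id_to_string(word_ids, vocab):
    # Map sampled word ids back to tokens and join them into a readable
    # caption, dropping the assumed <start>/<end> markers.
    words = [vocab(word_id) for word_id in word_ids]
    return " ".join(w for w in words if w not in ("<start>", "<end>"))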
with open('vocabSet.pkl', 'rb') as f:
    vocabularySet = pickle.load(f)
print("Loaded Vocabulary Set")
with open('vocabSet2.pkl', 'rb') as f:
    vocabularySet2 = pickle.load(f)
print("Loaded Reverse Vocabulary Set")

modelsPath = "LSTM4Models/"
imagesPath = "../data/val2014/"
captionsPath = "../data/annotations/captions_val.json"

cnnEn = model.EncoderCNN(wordEmbeddings).eval()
lstmDe = model.DecoderRNN(wordEmbeddings, lstmHiddenStates, len(vocabularySet), lstmLayers)
cnnEn = cnnEn.to(device)
lstmDe = lstmDe.to(device)

valData = COCO(captionsPath)

# Exploiting pycocotools to get insights about the data
print("Total Annotations: " + str(len(valData.anns.keys())))
print("Total Images: " + str(len(valData.imgs.keys())))

# Visualise
print(valData.imgToAnns[393212])
for (i, key) in enumerate(valData.imgToAnns.keys()):
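    # (The loop body is cut off in the original. A plausible continuation,
    # purely an assumption: walk the image -> annotations index and show the
    # captions attached to the first few images.)
    if i >= 3:
        break
    print(key, [ann['caption'] for ann in valData.imgToAnns[key]])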
batch_size = args.bs
imgh = args.imh
imgw = args.imw
embed_dim = args.embed_size
hidden_dim = args.nhid
attention_dim = args.attention_dim

transform = transforms.Compose([
    transforms.Resize((imgh, imgw)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

fine_tune_encoder = False
encoder = model.EncoderCNN().to(device)
encoder.fine_tune(fine_tune_encoder)
decoder = model.DecoderRNN(ntokens, embed_dim, hidden_dim, idx2word, word2idx).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
decoder_optimizer = t.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                 lr=decoder_lr)
encoder_optimizer = t.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                 lr=encoder_lr) if fine_tune_encoder else None

# def prepare_sequence(seq, to_ix):
#     idxs = [to_ix[w] for w in seq]
#     return t.tensor(idxs, dtype=t.long, device=device)

def batchify(data, bs):
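    # The original body is cut off here. A plausible reconstruction, modeled
    # on the batchify helper from the PyTorch word-language-model example
    # (an assumption, not the original code): assuming `data` is a flat 1-D
    # LongTensor of token ids, trim it so it divides evenly into `bs`
    # parallel streams, then reshape to (steps, bs).
    nbatch = data.size(0) // bs
    data = data.narrow(0, 0, nbatch * bs)
    data = data.view(bs, -1).t().contiguous()
    return data.to(device)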
## DataLoaders provide various ways to get batches of examples.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           collate_fn=dataload.collate_fn,
                                           **kwargs)
val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         collate_fn=dataload.collate_fn,
                                         **kwargs)

## Load the proper neural network model.
if args.model == 'Pretrained':
    model.encoder = model.EncoderCNN(args.embed_dim)
    model.decoder = model.DecoderRNN(embed_size=args.embed_dim,
                                     hidden_size=args.hidden_dim,
                                     vocab_size=vocab_size,
                                     num_layers=1,
                                     max_seq_length=10)
else:
    raise Exception('Unknown model {}'.format(args.model))

## The loss function: cross-entropy.
criterion = functional.cross_entropy

## Activate CUDA if specified and available.
if args.cuda:
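    # (assumed continuation, cut off in the original: move both halves of the
    # model to the GPU)
    model.encoder = model.encoder.cuda()
    model.decoder = model.decoder.cuda()

## `dataload.collate_fn` used above is not shown. A minimal sketch of a
## caption-padding collate function, assuming each dataset item is an
## (image_tensor, caption_tensor) pair -- the item structure is an assumption:
import torch

def collate_fn(batch):
    # Sort by caption length (descending) so the result can feed
    # pack_padded_sequence downstream.
    batch.sort(key=lambda pair: len(pair[1]), reverse=True)
    images, captions = zip(*batch)
    images = torch.stack(images, 0)  # (B, 3, H, W)
    lengths = [len(c) for c in captions]
    targets = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for i, cap in enumerate(captions):
        targets[i, :lengths[i]] = cap  # left-aligned, zero-padded
    return images, targets, lengths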
def main(args):
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    useCuda = not args.disable_cuda
    with open(args.vocab_path, 'rb') as f1, \
            open(args.batched_train_path, 'rb') as f2, \
            open(args.batched_val_path, 'rb') as f3:
        vocab = pickle.load(f1)
        batched_train_set = pickle.load(f2)
        batched_val_set = pickle.load(f3)
    batched_train_loader = get_loader(args.train_image_dir, args.train_caption_path,
                                      batched_train_set, vocab, transform,
                                      shuffle=True, num_workers=3)
    batched_val_loader = get_loader(args.val_image_dir, args.val_caption_path,
                                    batched_val_set, vocab, transform,
                                    shuffle=True, num_workers=1)
    random_val_loader = get_loader(args.val_image_dir, args.val_caption_path,
                                   batched_val_set, vocab, transform,
                                   shuffle=True, num_workers=1)
    encoder_cnn = model.EncoderCNN(args.is_normalized, useCuda=useCuda)
    decoder_rnn = model.DecoderRNN(args.embedding_dim, args.hidden_size, len(vocab),
                                   args.batch_size, dropout=args.dropout,
                                   useCuda=useCuda)
    if torch.cuda.is_available() and useCuda:
        decoder_rnn.cuda()
    loss_function = nn.NLLLoss()
    # loss_function = nn.CrossEntropyLoss()
    params = list(decoder_rnn.parameters())
    optimizer = optim.Adam(params, lr=args.encoder_lr)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1)
    output_train_file = open(args.output_train_name, 'w')
    output_val_file = open(args.output_val_name, 'w')
    start_epoch = 0
    save_name = file_namer.make_checkpoint_name(
        args.batch_size, args.min_occurrences, args.num_epochs, args.dropout,
        args.decoder_lr, args.encoder_lr, args.embedding_dim, args.hidden_size,
        args.grad_clip, args.is_normalized) \
        if args.load_checkpoint == "" else args.load_checkpoint
    checkpoint_name = file_namer.get_checkpoint(save_name)
    if checkpoint_name is not None:
        print("loading from checkpoint " + checkpoint_name)
        checkpoint = torch.load(checkpoint_name) if useCuda else torch.load(
            checkpoint_name, map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        decoder_rnn.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        args.load_checkpoint = checkpoint_name
        checkpoint = None
        torch.cuda.empty_cache()
    else:
        print("No existing checkpoints, starting from scratch")
        args.load_checkpoint = "No checkpoint found"
    full_return_index = mp.Value('i', 0)
    full_return_value = mp.Value('d', 0.0)
    full_val_processes = None
    for epoch in range(start_epoch, args.num_epochs):
        val_processes = None
        return_index = mp.Value('i', 0)
        return_value = mp.Value('d', 0.0)
        train_progress_bar = tqdm(iterable=batched_train_loader,
                                  desc='Epoch [%i/%i] (Train)' % (epoch, args.num_epochs))
        train_sum_loss = 0
        for i, (images, captions, _) in enumerate(train_progress_bar):
            train_sum_loss += trainer.train(encoder_cnn, decoder_rnn, loss_function,
                                            optimizer, images, captions,
                                            args.grad_clip, useCuda)
            train_progress_bar.set_postfix(loss=train_sum_loss / ((i % 100) + 1))
            if i % 100 == 0:
                output_train_file.write("%d, %5.4f\n" % (
                    epoch * len(batched_train_loader) + i,
                    train_sum_loss / 100 if i > 0 else train_sum_loss))
            if i % 1000 == 0:
                if val_processes is not None:
                    val_processes.join()
                    output_val_file.write("%d, %5.4f\n" %
                                          (return_index.value, return_value.value))
                val_processes = mp.Process(
                    target=validate,
                    args=(random_val_loader, encoder_cnn, decoder_rnn, loss_function,
                          useCuda, epoch * len(batched_train_loader) + i,
                          return_index, return_value))
                val_processes.start()
                train_sum_loss = 0
        if full_val_processes is not None:
            full_val_processes.join()
            # scheduler.step(full_return_value.value)
            output_val_file.write("End of Epoch\n%d, %5.4f\n" %
                                  (full_return_index.value, full_return_value.value))
        full_val_processes = mp.Process(
            target=validate_full,
            args=(batched_val_loader, encoder_cnn, decoder_rnn, loss_function,
                  useCuda, epoch, args.num_epochs, len(batched_train_loader),
                  full_return_index, full_return_value))
        full_val_processes.start()
        torch.save({'epoch': epoch + 1,
                    'state_dict': decoder_rnn.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   file_namer.make_checkpoint_name(
                       args.batch_size, args.min_occurrences, epoch + 1,
                       args.dropout, args.decoder_lr, args.encoder_lr,
                       args.embedding_dim, args.hidden_size, args.grad_clip,
                       args.is_normalized))
    if full_val_processes is not None:
        full_val_processes.join()
        output_val_file.write("End of Epoch\n%d, %5.4f\n" %
                              (full_return_index.value, full_return_value.value))
        full_val_processes = None
    output_train_file.close()
    output_val_file.close()
    if args.plot:
        args.train_files.append(args.output_train_name)
        args.val_files.append(args.output_val_name)
        plot(args)
        args.png_files = [args.plot_name]
    if args.send_email:
        args.txt_files = [args.output_train_name, args.output_val_name]
        f = open('arguments.txt', 'w')
        # arguments we don't want sent in the email
        ignore_args = ['user', 'password', 'to', 'plot_name', 'train_image_dir',
                       'val_image_dir', 'send_email', 'plot',
                       'train_caption_path', 'val_caption_path', 'png_files',
                       'txt_files', 'disable_cuda', 'body', 'output_train_name',
                       'output_val_name', 'show', 'subject', 'max_batched_set_size']
        for arg in sorted(vars(args)):
            if arg not in ignore_args:
                f.write("%s: %s\n" % (arg, getattr(args, arg)))
        f.close()
        if not args.body:
            args.body = 'arguments.txt'
        else:
            args.txt_files.append('arguments.txt')
        send_email(args)
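
# The `validate` worker handed to mp.Process above is defined elsewhere. A
# minimal sketch of what it might do, assuming the trainer module exposes an
# evaluate-style loss helper (`trainer.evaluate` and the batch cap are
# assumptions, not the original code):
def validate(loader, encoder_cnn, decoder_rnn, loss_function, useCuda,
             step, return_index, return_value):
    # Average the loss over a handful of validation batches and report the
    # result back through the shared mp.Value slots read by the main process.
    total, count = 0.0, 0
    for images, captions, _ in loader:
        total += trainer.evaluate(encoder_cnn, decoder_rnn, loss_function,
                                  images, captions, useCuda)
        count += 1
        if count == 10:  # keep the mid-training check cheap
            break
    return_index.value = step
    return_value.value = total / max(count, 1)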
                                           collate_fn=dataload.collate_fn,
                                           **kwargs)
val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         collate_fn=dataload.collate_fn,
                                         **kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          collate_fn=dataload.val_collate,
                                          **kwargs)

## Load the proper neural network model.
if args.model == 'Pretrained':
    model.encoder = model.EncoderCNN(10)
    model.decoder = model.DecoderRNN(encoder_dim=2048, decoder_dim=512,
                                     attention_dim=512, embed_size=512,
                                     hidden_size=args.hidden_dim,
                                     vocab_size=vocab_size, num_layers=1,
                                     max_seq_length=15)
# elif args.model == 'resnet_common':
#     model = models.resnetcommon.ResnetCommon(im_size, args.hidden_dim,
#                                              args.kernel_size, n_classes)
else:
    raise Exception('Unknown model {}'.format(args.model))
    vocabularySet2 = pickle.load(f)
print("Loaded Reverse Vocabulary Set")

modelPath = "models/"
imagesPath = "../data/images/"
captionsPath = "../data/annotations/captions_train.json"

# Hyperparameters - TUNABLE
lstmLayers = 3
lstmHiddenStates = 512
wordEmbeddings = 256
epochs = 5
batchSize = 64
learningRate = 0.001

cnn = model.EncoderCNN(wordEmbeddings).to(device)
lstm = model.DecoderRNN(wordEmbeddings, lstmHiddenStates, len(vocabularySet), lstmLayers).to(device)
criterion = torch.nn.CrossEntropyLoss()
parameters = list(lstm.parameters()) + list(cnn.linear.parameters()) + list(cnn.bn.parameters())
optimizer = torch.optim.Adam(parameters, lr=learningRate)

# Preprocessing of image data
transform = transforms.Compose([
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
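
# The optimizer above only updates the decoder plus the encoder's `linear`
# and `bn` layers, which implies an encoder shaped roughly like this sketch:
# a frozen pretrained CNN backbone with a trainable projection head. The
# ResNet-152 choice is an assumption, not the original code.
import torch
import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]   # drop the classification head
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():                    # backbone stays frozen
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        return self.bn(self.linear(features))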
with open("../ImageCaptioner/data/vocab/vocab_occurrence_5.pkl", 'rb') as f1,\ open("../ImageCaptioner/data/batched_data/val_batch_1.pkl", "rb") as f2: vocab = pickle.load(f1) batched_val_set = pickle.load(f2) coco_caps = COCO("../ImageCaptioner/data/annotations/captions_val2014.json") batched_val_loader = get_loader( "../ImageCaptioner/data/val2014", "../ImageCaptioner/data/annotations/captions_val2014.json", batched_val_set, vocab, transform, shuffle=True, num_workers=3) encoder = model.EncoderCNN() decoder = model.DecoderRNN(512, 196, 512, 512, len(vocab), 1) if torch.cuda.is_available(): encoder = encoder.cuda() decoder = decoder.cuda() checkpoint = torch.load( "noNorm/model_batch_100_dims_512x512_lr_0.0001/checkpoint_25.pt") decoder.load_state_dict(checkpoint['state_dict']) checkpoint = None torch.cuda.empty_cache() for i, (images, captions, lengths, ids) in enumerate(batched_val_loader): if i == 1: break print("actual captions are: ")
def main(args):
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    useCuda = not args.disable_cuda
    with open(args.vocab_path, 'rb') as vocab_path, \
            open(args.batched_train_path, 'rb') as batched_train_path, \
            open(args.batched_val_path, 'rb') as batched_val_path:
        vocab = pickle.load(vocab_path)
        batched_train_set = pickle.load(batched_train_path)
        batched_val_set = pickle.load(batched_val_path)
    batched_train_loader = get_loader(args.train_image_dir, args.train_caption_path,
                                      batched_train_set, vocab, transform,
                                      shuffle=True, num_workers=3)
    batched_val_loader = get_loader(args.val_image_dir, args.val_caption_path,
                                    batched_val_set, vocab, transform,
                                    shuffle=True, num_workers=1)
    batched_val_loader_full = get_loader(args.val_image_dir, args.val_caption_path,
                                         batched_val_set, vocab, transform,
                                         shuffle=True, num_workers=1)
    encoder_cnn = model.EncoderCNN()
    decoder_rnn = model.DecoderRNN(512, 196, args.embedding_dim, args.hidden_dim,
                                   len(vocab), args.num_layers, args.dropout,
                                   useCuda=useCuda)
    if torch.cuda.is_available() and useCuda:
        encoder_cnn.cuda()
        decoder_rnn.cuda()
    loss_function = nn.NLLLoss()
    params = list(decoder_rnn.parameters())
    optimizer = optim.Adam(params, lr=args.lr)
    output_train_file = open(
        args.output_dir + "/train_" + str(args.num_epochs) + ".txt", 'w')
    output_val_file = open(
        args.output_dir + "/val_" + str(args.num_epochs) + ".txt", 'w')
    start_epoch = 0
    if args.load_checkpoint is not None:
        checkpoint = torch.load(args.load_checkpoint) if useCuda else torch.load(
            args.load_checkpoint, map_location=lambda storage, loc: storage)
        print("loading from checkpoint " + str(args.load_checkpoint))
        start_epoch = checkpoint['epoch']
        decoder_rnn.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        checkpoint = None
        torch.cuda.empty_cache()
    for epoch in range(start_epoch, args.num_epochs):
        progress_bar = tqdm(iterable=batched_train_loader,
                            desc='Epoch [%i/%i] (Train)' % (epoch, args.num_epochs))
        train_sum_loss = 0
        for i, (images, captions, lengths, ids) in enumerate(progress_bar, 1):
            loss = train(images, captions, encoder_cnn, decoder_rnn,
                         loss_function, optimizer, args.grad_clip, useCuda)
            train_sum_loss += loss.data.select(0, 0)
            progress_bar.set_postfix(loss=train_sum_loss / ((i % 100) + 1))
            if i % 100 == 0:
                output_train_file.write("%d, %5.4f\n" %
                                        (epoch * len(batched_train_loader) + i,
                                         train_sum_loss / 100))
                train_sum_loss = 0
            if i % 1000 == 0:
                temp_loss = validate(batched_val_loader, encoder_cnn,
                                     decoder_rnn, loss_function, useCuda)
                output_val_file.write("%d, %5.4f\n" %
                                      (epoch * len(batched_train_loader) + i,
                                       temp_loss))
        # end of epoch
        output_train_file.write("%d, %5.4f\n" %
                                ((epoch + 1) * len(batched_train_loader),
                                 train_sum_loss / len(batched_train_loader) / 100))
        val_sum_loss = 0
        val_progress_bar = tqdm(iterable=batched_val_loader_full,
                                desc='Epoch [%i/%i] (Val)' % (epoch, args.num_epochs))
        for i, (images, captions, lengths, ids) in enumerate(val_progress_bar, 1):
            loss = evaluate(images, captions, encoder_cnn, decoder_rnn,
                            loss_function, optimizer, useCuda)
            val_sum_loss += loss.data.select(0, 0)
            val_progress_bar.set_postfix(loss=val_sum_loss / i)
        output_val_file.write("%d, %5.4f\n" %
                              ((epoch + 1) * len(batched_train_loader),
                               val_sum_loss / len(batched_val_loader_full)))
        torch.save({'epoch': epoch + 1,
                    'state_dict': decoder_rnn.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   args.output_dir + "/checkpoint_" + str(epoch + 1) + ".pt")
    output_train_file.close()
    output_val_file.close()
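
# The `train` step invoked in the loop above is defined elsewhere. A minimal
# sketch of a standard teacher-forced training step, assuming DecoderRNN
# returns per-token log-probabilities (NLLLoss is used above) and flattening
# outputs/targets together -- names and shapes are assumptions:
import torch.nn as nn

def train(images, captions, encoder_cnn, decoder_rnn, loss_function,
          optimizer, grad_clip, useCuda):
    optimizer.zero_grad()
    features = encoder_cnn(images)
    outputs = decoder_rnn(features, captions)       # (B, T, vocab) log-probs
    loss = loss_function(outputs.reshape(-1, outputs.size(-1)),
                         captions.reshape(-1))
    loss.backward()
    if grad_clip is not None:
        nn.utils.clip_grad_norm_(decoder_rnn.parameters(), grad_clip)
    optimizer.step()
    return loss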
def main():
    VOCAB = vocabulary.load_vocab(vocab_dict_path)
    print(len(VOCAB))
    dataset_raw = preprocessing.PreprocessedData(tagg_toy, im_toy, dataset_new_folder, nlp, VOCAB)
    # dataset_raw = preprocessing.PreprocessedData(tagged_files_paths, im_paths, dataset_new_folder, nlp, VOCAB)
    print("\nDataset lengths.... data / labels")
    print(len(dataset_raw.train_data), len(dataset_raw.labels))
    dataset_processed = dataset.ImageTextDataset(dataset_raw)
    print('\nLength of dataset')
    print(len(dataset_processed))

    U = utils.Utils()
    train_n, val_n = U.partition_numbers(.8, len(dataset_processed))
    train_set, val_set = torch.utils.data.random_split(dataset_processed, [train_n, val_n])
    print('\nTrainset {}, valset {}'.format(train_set, val_set))

    ########################### HYPERPARAMETERS ###########################
    num_workers = 8 if CUDA else 0
    batch_size = 32
    embedding_dim = 64
    num_hidden_nodes = 512
    size_of_vocab = len(VOCAB)
    num_output_nodes = size_of_vocab
    num_layers = 3
    bidirection = True
    dropout = 0
    nepochs = 20
    lr = 0.001
    weight_decay = 0.00001
    #######################################################################

    train_dataloader = DataLoader(train_set, batch_size=batch_size,
                                  collate_fn=dataset.collate, shuffle=True,
                                  num_workers=num_workers, drop_last=False)
    val_dataloader = DataLoader(val_set, batch_size=batch_size,
                                collate_fn=dataset.collate, shuffle=True,
                                num_workers=num_workers, drop_last=True)

    # Instantiate the encoder/decoder pair
    encoder = model.EncoderCNN(embedding_dim)
    decoder = model.DECODER(size_of_vocab, embedding_dim, num_hidden_nodes,
                            num_output_nodes, num_layers,
                            bidirectional=bidirection, dropout=dropout)

    # Criterion & optimizer (ignore_index=0 skips the padding token)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(decoder.parameters(), lr=lr, weight_decay=weight_decay)

    # Train
    train_losses, test_losses, train_perplexities, test_perplexities = train_test.run_epochs(
        encoder, decoder, optimizer, criterion, train_dataloader, val_dataloader, nepochs)

    # Generate
    generate.generate_labels(dataset_raw.train_data[0:20])
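
# `U.partition_numbers(.8, n)` above is not shown; presumably it converts a
# split fraction into the two integer lengths that random_split expects. A
# one-liner sketch (an assumption about its behavior, not the original code):
def partition_numbers(fraction, total):
    # e.g. partition_numbers(0.8, 100) -> (80, 20); the remainder goes to
    # the validation split so the lengths always sum to `total`.
    train_n = int(fraction * total)
    return train_n, total - train_n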