def main(): dir_photos = "./data/flickr8k/Flicker8k_photos/" file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt" jpg_files = ds.images_info(dir_photos) ann_dframe = ds.annots_info(file_annot, df=True) print( "Dataset overview\n-------------------------------------------------------------------------------------------------------------\n" ) print(ann_dframe) print( "\n-------------------------------------------------------------------------------------------------------------\n" ) ## Prepare captions print("Preparing caption data for images") word_count = ds.word_freq(ann_dframe) # print(word_count) ## Clean text print("Cleaning text ... ", end="") for i, cpt in enumerate(ann_dframe.caption.values): ann_dframe["caption"].iloc[i] = ds.clean_text(cpt) print("done.") print(ann_dframe) word_count = ds.word_freq(ann_dframe) # print(word_count) ## Add start and end sequence token ann_dframe_orig = copy(ann_dframe) ann_dfrm = ds.add_start_end_tokens(ann_dframe) print(ann_dfrm) vgg_net = vis.models.vgg16(pretrained="imagenet", progress=True) for p in vgg_net.parameters(): p.requires_grad = False ## Load model parameters from path # vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth')) ## Features in the last layer num_ftrs = vgg_net.classifier[-1].in_features print(num_ftrs) print(vgg_net) ## Remove the last classifier layer: Softmax, ReLU, Dropout vgg_net.classifier = vgg_net.classifier[:-1] # ## Net architecture # summary(vgg_net, input_size=(3, 224, 224)) print(vgg_net) # ## Features in the last layer # num_ftrs = vgg_net.classifier[-1].in_features # print(num_ftrs) # ## Read images with specified transforms print("Reading images ... ", end='') images = ds.read_image(jpg_files, dir_photos, normalize=True, resize=224, tensor=True) print("done.") # print(images.keys()) ## Get feature map for image tensor through VGG-16 img_featrs = OD() print("Gathering images' features from last conv layer ... ", end='') for i, jpg_name in enumerate(images.keys()): with torch.no_grad(): print(i, jpg_name) img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0)) print("done.") # print(img_featrs, img_featrs[jpg_name].size(), sep='\n') print(img_featrs.keys()) # Get features for images in our dataset from pretrained VGG-16 features = mdl.get_features(dir_photos, read=True, download=False) print(features) ## Prep image tensor print("Prepping image tensor ... ", end="") fnames = [] img_tns_list = [] cap_list = [] for i, jpg_name in enumerate(ann_dfrm.filename.values): if (i % 5) == 0: if jpg_name in img_featrs.keys(): fnames.append(jpg_name) img_tns_list.append(img_featrs[jpg_name]) cap_list.append(ann_dfrm.iloc[i]["caption"]) print("done.") print(len(img_tns_list), len(cap_list)) img_tns = torch.cat(img_tns_list) print(img_tns.shape) print( "Saving filenames list, image tensor list, captions tensor list ... ", end="") torch.save(fnames, 'fnames.pkl') torch.save(img_tns_list, 'image_tns_list.pkl') torch.save(cap_list, 'captions_list.pkl') print("done.") print("Loading fnames, image tensor list and captions tensor list ... 
", end="") fnames = torch.load('fnames.pkl') img_tns_list = torch.load('image_tns_list.pkl') img_tns = torch.cat(img_tns_list) cap_list = torch.load('captions_list.pkl') # print(len(fnames), cap_list) print("done.") cap_seq, vocab_size, cap_max_len, tokens = ds.tokenizer(cap_list) n_cap = len(cap_seq) vald_prop, test_prop = 0.2, 0.2 n_vald = int(n_cap * vald_prop) n_test = int(n_cap * test_prop) train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test) train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test) # train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test) print(len(train_cap), len(valid_cap), len(evaln_cap)) print(len(train_ims), len(valid_ims), len(evaln_ims)) # print(len(train_fnm), len(valid_fnm), len(evaln_fnm)) images_train, captions_train, target_caps_train = ds.prep_data( train_ims, train_cap, vocab_size, cap_max_len) images_valid, captions_valid, target_caps_valid = ds.prep_data( valid_ims, valid_cap, vocab_size, cap_max_len) ## Dataloader bs = 64 trainset = ds.Flickr8k(images_train, captions_train, target_caps_train) validset = ds.Flickr8k(images_valid, captions_valid, target_caps_valid) trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True) validloader = torch.utils.data.DataLoader(validset, batch_size=bs) # # ## Device: CPU or GPU? device = "cuda:0" if torch.cuda.is_available() else "cpu" print("Using " + device) ## Model model = mdl.CapNet(vocab_size, cap_max_len).to(device) criterion = nn.CrossEntropyLoss() ## Optimizer optimizer = optim.Adam(model.parameters(), lr=0.001) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1) max_n_epochs = 5 # ## Training print("Starting training ... ") epoch_train_loss, epoch_valid_loss = [], [] min_val_loss = 100 for epoch in range(1, max_n_epochs + 1): print("-------------------- Epoch: [%d / %d] ----------------------" % (epoch, max_n_epochs)) training_loss, validation_loss = 0.0, 0.0 ## Batch training for i, data in enumerate(trainloader): images, captions, target_caps = data[0].to(device), data[1].to( device), data[2].to(device) optimizer.zero_grad() out = model(images, captions.t()) loss = criterion(out, target_caps) loss.backward() optimizer.step() training_loss += loss.item() epoch_train_loss.append(training_loss / len(trainloader)) print("Training loss: %f" % (epoch_train_loss[-1]), end=" ") for i, data in enumerate(validloader): with torch.set_grad_enabled(False): images, captions, target_caps = data[0].to(device), data[1].to( device), data[2].to(device) out = model(images, captions.t()) loss = criterion(out, target_caps) validation_loss += loss.item() epoch_valid_loss.append(validation_loss / len(validloader)) print("Validation loss: %f" % (epoch_valid_loss[-1])) scheduler.step() if epoch_valid_loss[-1] < min_val_loss: print("Found best model.") best_model = deepcopy(model) plt.plot(list(range(max_n_epochs)), epoch_train_loss, label="Training loss") plt.plot(list(range(max_n_epochs)), epoch_valid_loss, label="Validation loss") plt.xlabel("Number of epochs") plt.ylabel("Loss") plt.title("Number of epochs vs loss") plt.legend() plt.show() ########### # Save model print("Saving best model ... ") torch.save(best_model, 'best_model.pkl')
def main(): dir_photos = "./data/flickr8k/Flicker8k_photos/" file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt" jpg_files = ds.images_info(dir_photos) ann_dframe = ds.annots_info(file_annot, df=True) print("Dataset overview\n-------------------------------------------------------------------------------------------------------------\n") print(ann_dframe) print("\n-------------------------------------------------------------------------------------------------------------\n") ## Prepare captions print("Preparing caption data for images") word_count = ds.word_freq(ann_dframe) # print(word_count) ## Clean text print("Cleaning text ... ", end="") for i, cpt in enumerate(ann_dframe.caption.values): ann_dframe["caption"].iloc[i] = ds.clean_text(cpt) print("done.") print(ann_dframe) word_count = ds.word_freq(ann_dframe) # print(word_count) ## Add start and end sequence token ann_dframe_orig = copy(ann_dframe) ann_dfrm = ds.add_start_end_tokens(ann_dframe) print(ann_dfrm) vgg_net = vis.models.vgg16(pretrained="imagenet", progress=True) for p in vgg_net.parameters(): p.requires_grad = False ## Load model parameters from path # vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth')) ## Features in the last layer num_ftrs = vgg_net.classifier[-1].in_features print(num_ftrs) print(vgg_net) ## Remove the last classifier layer: Softmax, ReLU, Dropout vgg_net.classifier = vgg_net.classifier[:-1] # ## Net architecture # summary(vgg_net, input_size=(3, 224, 224)) print(vgg_net) # ## Features in the last layer # num_ftrs = vgg_net.classifier[-1].in_features # print(num_ftrs) # ## Read images with specified transforms print("Reading images ... ", end='') images = ds.read_image(jpg_files, dir_photos, normalize=True, resize=224, tensor=True) print("done.") # print(images.keys()) ## Get feature map for image tensor through VGG-16 img_featrs = OD() print("Gathering images' features from last conv layer ... ", end='') for i, jpg_name in enumerate(images.keys()): with torch.no_grad(): print(i, jpg_name) img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0)) print("done.") # print(img_featrs, img_featrs[jpg_name].size(), sep='\n') print(img_featrs.keys()) # Get features for images in our dataset from pretrained VGG-16 features = mdl.get_features(dir_photos, read=True, download=False) print(features) ## Prep image tensor print("Prepping image tensor ... ", end="") fnames = [] img_tns_list = [] cap_list = [] for i, jpg_name in enumerate(ann_dfrm.filename.values): if (i % 5) == 0: if jpg_name in img_featrs.keys(): fnames.append(jpg_name) img_tns_list.append(img_featrs[jpg_name]) cap_list.append(ann_dfrm.iloc[i]["caption"]) print("done.") print(len(img_tns_list), len(cap_list)) img_tns = torch.cat(img_tns_list) print(img_tns.shape) print("Saving filenames list, image tensor list, captions tensor list ... ", end="") torch.save(fnames, 'fnames.pkl') torch.save(img_tns_list, 'image_tns_list.pkl') torch.save(cap_list, 'captions_list.pkl') print("done.") print("Loading fnames, image tensor list and captions tensor list ... 
", end="") fnames = torch.load('fnames.pkl') img_tns_list = torch.load('image_tns_list.pkl') img_tns = torch.cat(img_tns_list) cap_list = torch.load('captions_list.pkl') # print(len(fnames), cap_list) print("done.") cap_seq, vocab_size, cap_max_len, tokens = ds.tokenizer(cap_list) n_cap = len(cap_seq) vald_prop, test_prop = 0.2, 0.2 n_vald = int(n_cap * vald_prop) n_test = int(n_cap * test_prop) train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test) train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test) # train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test) print(len(train_cap), len(valid_cap), len(evaln_cap)) print(len(train_ims), len(valid_ims), len(evaln_ims)) device = "cuda:0" if torch.cuda.is_available() else "cpu" print("Using " + device) print("Loading model ...") model = torch.load('best_model.pkl') print(model) model.eval() # print(fnames) preds = [] for feat in evaln_ims: preds.append(predict_caption(model, feat, cap_max_len, tokens, device)) best_targets = [] for p, t in zip(preds, cap_list[:n_test]): pred = p.split(" ") targ = [t.split(" ")] z=sentence_bleu(targ, pred, weights=(1, 0, 0, 0)) if z > 0.50: print(p, t, z, sep='\n') print("\n") best_targets.append(t) print(best_targets) for cap in best_targets: rows = ann_dfrm.loc[ann_dfrm["caption"]==cap, "filename"] print(rows)
# from torchtext import data
import numpy as np
from collections import OrderedDict as OD
from copy import copy

# In[3]:

# dir_photos = "./data/flickr8k/Flicker8k_photos/"
# file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt"
# jpg_files = ds.images_info(dir_photos)
ann_dframe = ds.annots_info('Flickr8k.token.txt', df=True)
# print(ann_dframe)

## Prepare captions
word_count = ds.word_freq(ann_dframe)

# In[8]:

z = word_count.loc[1:20, ['word', 'count']]
z

# In[9]:

word_freq_dict = dict(zip(z['word'], z['count']))
word_freq_dict

# import matplotlib.pyplot as plt
# fig = plt.figure(figsize=(10, 10))
# plt.barh(list(word_freq_dict.keys()), word_freq_dict.values(), color='g')
# plt.xlabel('Word frequency')
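# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): one plausible
# implementation of ds.word_freq(), assumed here to return a DataFrame with
# 'word' and 'count' columns sorted by descending frequency, which is the
# shape the cells above index into.
# ---------------------------------------------------------------------------
from collections import Counter

import pandas as pd


def word_freq_sketch(ann_dframe):
    """Count word occurrences across all captions in the annotation frame."""
    counter = Counter()
    for caption in ann_dframe["caption"].values:
        counter.update(str(caption).split())
    return pd.DataFrame(counter.most_common(), columns=["word", "count"])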
def main(): dir_photos = "./data/Flickr8k/Flickr8k_Dataset/Flicker8k_Dataset/" file_annot = "./data/Flickr8k/Flickr8k_text/Flickr8k.token.txt" print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Get basic dataset info print("DATASET INFO") print( "---------------------------------------------------------------------------------------------------------\n" ) jpg_files = ds.images_info(dir_photos) print("Number of photos in Flickr8k: %d" % (len(jpg_files))) ann_dframe = ds.annots_info(file_annot, df=True) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Visualize data overview print("DATASET OVERVIEW") print( "---------------------------------------------------------------------------------------------------------\n" ) print(ann_dframe) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Prepare captions print("CURATE CAPTIONS") print( "---------------------------------------------------------------------------------------------------------\n" ) word_count = ds.word_freq(ann_dframe) # print(word_count) ## Clean text start = time.time() print("Cleaning text ... ", end="") for i, cpt in enumerate(ann_dframe.caption.values): ann_dframe["caption"].iloc[i] = ds.clean_text(cpt) print("done.") # print(ann_dframe) # word_count = ds.word_freq(ann_dframe) # print(word_count) ## Add start and end sequence token ann_dframe_orig = copy(ann_dframe) print("Adding start and end tokens ... ", end="") ann_dfrm = ds.add_start_end_tokens(ann_dframe) print("done.") elapsed = time.time() - start print("\nTime to preprocess {} captions: {:.2f} \ seconds".format(i, elapsed)) # print(ann_dfrm) print( "\n-------------------------------------------------------------------------------------------------------\n" ) # ## Read images with specified transforms print("READ IMAGES & EXTRACT FEATURES") print( "---------------------------------------------------------------------------------------------------------\n" ) mean = [0.485, 0.456, 0.406] stdv = [0.229, 0.224, 0.225] transforms = vis.transforms.Compose([ vis.transforms.Resize(256), vis.transforms.CenterCrop(224), vis.transforms.ToTensor(), vis.transforms.Normalize(mean=mean, std=stdv) ]) print("Reading images ... 
", end='') images = ds.read_image(dir_photos, transforms) print("done.") # Get feature maps for image tensor through VGG-16 features_dict, features_fname = mdl.get_features(images, download_wts=False, save=True, cuda=True) # print(features_dict) ## Load feature maps features_dict = torch.load(features_fname) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Prep image tensor print("PREP IMAGE TENSOR") print( "---------------------------------------------------------------------------------------------------------\n" ) ann_dfrm = ann_dfrm.loc[ann_dfrm["idx"].values == "0", :] print(ann_dfrm) ds.word_freq(ann_dfrm) fnames = [] img_tns_list = [] cap_list = [] for i, jpg_name in enumerate(ann_dfrm.filename.values): if jpg_name in features_dict.keys(): fnames.append(jpg_name) img_tns_list.append(features_dict[jpg_name]) cap_list.append(ann_dfrm.iloc[i]["caption"]) print(len(img_tns_list), len(cap_list)) img_tns = torch.cat(img_tns_list) print(img_tns.shape) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Text tokenize print("TEXT TOKENIZE") print( "---------------------------------------------------------------------------------------------------------\n" ) tokens, cap_seq, vocab_size, cap_max_len = ds.tokenizer(cap_list) print("Vocab size: ", vocab_size) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Dataset splits print("DATASET SPLIT") print( "---------------------------------------------------------------------------------------------------------\n" ) n_cap = len(cap_seq) vald_prop, test_prop = 0.2, 0.2 n_vald = int(n_cap * vald_prop) n_test = int(n_cap * test_prop) train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test) train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test) train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test) print(len(train_cap), len(valid_cap), len(evaln_cap)) print(len(train_ims), len(valid_ims), len(evaln_ims)) print(len(train_fnm), len(valid_fnm), len(evaln_fnm)) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Prep data for training and validation print("FINAL PREP FOR TRAINING & VALIDATION") print( "---------------------------------------------------------------------------------------------------------\n" ) images_train, captions_train, target_caps_train = ds.prep_data( train_ims, train_cap, cap_max_len) images_valid, captions_valid, target_caps_valid = ds.prep_data( valid_ims, valid_cap, cap_max_len) print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## TRAINING print("TRAINING") print( "---------------------------------------------------------------------------------------------------------\n" ) ## Hyperparameters bs = 64 lr = 0.001 lr_steps = 20 gamma = 0.1 max_n_epochs = 5 ## Dataloader print("DATALOADERS") trainset = ds.Flickr8k(images_train, captions_train, target_caps_train) validset = ds.Flickr8k(images_valid, captions_valid, target_caps_valid) trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True) validloader = torch.utils.data.DataLoader(validset, batch_size=bs) ## Device: CPU or GPU? 
print("DEVICE:", end=" ") device = "cuda:0" if torch.cuda.is_available() else "cpu" print("Using " + device) ## Model print("MODEL:") model = mdl.CapNet(vocab_size, cap_max_len).to(device) # Criterion criterion = nn.CrossEntropyLoss() ## Optimizer optimizer = optim.Adam(model.parameters(), lr=lr) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_steps, gamma=gamma) ## Training print("\nStarting training ... ") epoch_train_loss, epoch_valid_loss = [], [] min_val_loss = 100 for epoch in range(1, max_n_epochs + 1): print("-------------------- Epoch: [%d / %d] ----------------------" % (epoch, max_n_epochs)) training_loss, validation_loss = 0.0, 0.0 ## Batch training for i, data in enumerate(trainloader): tr_images, tr_captions, tr_target_caps = data[0].to( device), data[1].to(device), data[2].to(device) optimizer.zero_grad() tr_out = model(tr_images, tr_captions.t()) tr_loss = criterion(tr_out, tr_target_caps) tr_loss.backward() optimizer.step() training_loss += tr_loss.item() epoch_train_loss.append(training_loss / len(trainloader)) print("Training loss: %f" % (epoch_train_loss[-1]), end=" || ") for i, data in enumerate(validloader): with torch.set_grad_enabled(False): vl_images, vl_captions, vl_target_caps = data[0].to( device), data[1].to(device), data[2].to(device) vl_out = model(vl_images, vl_captions.t()) vl_loss = criterion(vl_out, vl_target_caps) validation_loss += vl_loss.item() epoch_valid_loss.append(validation_loss / len(validloader)) print("Validation loss: %f" % (epoch_valid_loss[-1])) scheduler.step(epoch=epoch) if epoch_valid_loss[-1] < min_val_loss: print("Found best model.") best_model = deepcopy(model) min_val_loss = epoch_valid_loss[-1] plt.plot(list(range(max_n_epochs)), epoch_train_loss, label="Training loss") plt.plot(list(range(max_n_epochs)), epoch_valid_loss, label="Validation loss") plt.xlabel("Number of epochs") plt.ylabel("Loss") plt.title("Number of epochs vs loss") plt.legend() plt.show() ## Save model print("Saving best model ... ") torch.save(best_model, 'best_model.pkl') print( "\n-------------------------------------------------------------------------------------------------------\n" ) ## Check output print("Loading model ...") model = torch.load('best_model.pkl') print(model) model.eval() preds = [] for feat in evaln_ims: preds.append(model.prediction(feat, tokens, device)) best_targets = [] bleu_scores = [] for p, t in zip(preds, cap_list[:n_test]): pred = p.split(" ") targ = [t.split(" ")] z = sentence_bleu(targ, pred, weights=(1, 0, 0, 0)) bleu_scores.append(z) if z > 0.50: print(p, t, z, sep='\n') print("\n") best_targets.append((p, t, z)) for i, tgt in enumerate(best_targets): print("{}: {}".format(i, tgt)) print("MEAN BLEU SCORE: %3f" % np.mean(bleu_scores))