Example #1
def find_bb():
    # Forward slashes avoid accidental backslash escape sequences in the path string
    im_path = 'EM/data/Other data/EM CA1 hippocampus region of brain/training_groundtruth.tif'
    im_path = root_path + im_path
    mask = read_image(im_path)
    find_boundingbox(mask[0])

    return mask
Example #2
def colorize(img_path, model, img_size, show_original=False):
    '''
    Colorize an image with the specified model. Since the human eye is much more sensitive to luminance
    (lightness changes) than to chrominance (color changes), we resize the image to the input size required
    by the model (e.g. 192x192), colorize it, and then assemble the final image in CIE LAB color space by
    combining the original image's lightness channel with the colorized image's AB channels resized back to
    the original image size.

    Args:
        img_path (string): JPEG image full path.
        model (Keras model): Model instance to use.
        img_size (tuple): Model input image size. Output image will keep the size of original image.
        show_original (bool): If True, concatenate the original image with the colorized image.
    '''
    orig_rgb   = read_image(img_path)
    orig_lab   = skimage.color.rgb2lab((orig_rgb + 1) / 2)
    input_rgb  = skimage.transform.resize(orig_rgb, img_size)
    input_gray = rgb_to_lab(input_rgb)[:, :, :1]
    input_gray = np.repeat(input_gray, 3, axis=-1)  # Repeat channel to keep input 3-dimensional
    output_rgb = model.predict(input_gray.reshape((1, *img_size, 3)))[0] / 2 + 0.5  # Colorize
    output_lab = skimage.color.rgb2lab(output_rgb)  # Convert colorized image to LAB
    output_lab = skimage.transform.resize(output_lab, (orig_rgb.shape[0], orig_rgb.shape[1]))  # Resize LAB to orig size
    final_lab  = np.zeros((orig_rgb.shape[0], orig_rgb.shape[1], 3))  # Final image in LAB space
    final_lab[:, :, 0] = orig_lab[:, :, 0]  # Original image lightness channel
    final_lab[:, :, 1:] = output_lab[:, :, 1:]  # Take colorized image AB channels
    final_rgb  = skimage.color.lab2rgb(final_lab)

    if show_original:
        final_rgb = np.concatenate(((orig_rgb + 1) / 2, final_rgb), axis=1)

    return np.rint(final_rgb * 255).astype(np.uint8)  # Rescale to (0,255)
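A minimal usage sketch for colorize() above; the model file name and the image path are hypothetical placeholders, not part of the original example:

# Hypothetical usage: load a trained colorization model and colorize one JPEG
import imageio
import keras

model = keras.models.load_model('colorizer_192.h5')   # assumed model file
out = colorize('photos/portrait.jpg', model, img_size=(192, 192), show_original=True)
imageio.imwrite('portrait_colorized.jpg', out)         # uint8 RGB array returned by colorize()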
Example #3
 def _parse_data(self, gt, pd):
     """
     gt: ground truth,
     pd: predicted data
     """
     # read data
     if isinstance(gt, str):
         gt = read_image(gt)
     if isinstance(pd, str):
         pd = read_image(pd)
     # uniting data shape
     if gt.shape != pd.shape:
         if self.shape_constrains:
             raise ValueError("the shape of ground truth is not consitent with predicted's")
         else:
             pd = augmentation.resize(pd, gt.shape)
     return gt, pd
Example #4
 def log_colorized_images(self, iter):
     ground_truth = []
     batch = []
     # Read images
     for path in self.paths:
         img_rgb = read_image(path, self.img_size)
         ground_truth.append(img_rgb)
         img_gray = rgb2gray(img_rgb).reshape(self.img_size + (1, ))
         batch.append(img_gray)
     # Predict color using trained model
     colorized = self.model.predict(np.repeat(batch, 3, axis=-1))
     # Concat ground truth and predicted images and log them to comet
     for i in range(len(colorized)):
         final = np.concatenate((ground_truth[i], colorized[i]), axis=1)
         final = np.rint(final * 127.5 + 127.5).astype(np.uint8)
         self.experiment.log_image(final,
                                   name=f'iter_{iter:06d}_image_{i:02d}')
Example #5
def get_features(dir, read=True, download=True):
    if read:
        if download:
            vgg_net = vis.models.vgg16(pretrained="imagenet", progress=True)
        else:
            ## Load model parameters from path
            vgg_net = vis.models.vgg16()
            vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth'))

        jpg_files = ds.images_info(dir)

        ## Disable requires_grad so no memory is spent on gradients during feature extraction
        for p in vgg_net.parameters():
            p.requires_grad = False

        ## Net architecture
        print(vgg_net)
        # summary(vgg_net, input_size=(3, 224, 224))
        ## Remove the final Linear classifier layer so the network outputs 4096-d features
        print("Removing softmax layer of VGG16 ... ")
        vgg_net.classifier = vgg_net.classifier[:-1]
        print(vgg_net)
        # summary(vgg_net, input_size=(3, 224, 224))

        ## Read images with specified transforms
        print("Reading images ... ", end='')
        images = ds.read_image(dir, normalize=True, resize=224, tensor=True)
        print("done.")
        # print(images.keys())
        ## Get feature map for image tensor through VGG-16
        img_featrs = OD()
        print("Gathering images' features from last conv layer ... ", end='')
        for i, jpg_name in enumerate(images.keys()):
            with torch.no_grad():
                print(i, jpg_name)
                img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0))
        print("done.")

        return img_featrs
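A short usage sketch for get_features() above, assuming the module-level imports it relies on (ds, vis, torch, OD) are available; the directory path is a placeholder:

# Hypothetical call: extract one 4096-d VGG-16 feature vector per image in a folder
feats = get_features('./data/photos/', read=True, download=False)
for name, tensor in feats.items():
    print(name, tuple(tensor.shape))  # e.g. ('img_001.jpg', (1, 4096))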
Example #6
def main():

    print("Loading the image")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    image_path = sys.argv[1]
    if image_path[0] != "/":
        image_path = dir_path + '/' + image_path
    image = np.array(
        [dataset.read_image(filename=image_path, image_size=IMG_SIZE)],
        dtype=np.uint8)

    print("Shapping the image for the model input")
    # The input to the network is of shape [None image_size image_size num_channels]. Hence we reshape.
    x_batch = image.reshape(1, IMG_SIZE, IMG_SIZE, NUM_CHANNELS)

    print("Please choose the model to use : ")
    les_meta_path = locate_files(extension=".meta",
                                 path=os.getcwd(),
                                 dbName="meta")
    for i, meta_path in enumerate(les_meta_path):
        print("\n\n" + str(i) + " : " + str(meta_path))
        info_txt_path = str('/'.join(meta_path.split("/")[:-1]) + "/info.txt")
        try:
            with open(info_txt_path, 'r') as f:
                for line in f:
                    print("\t" + str(line.replace("\n", "")))
                print("")
        except FileNotFoundError:
            print("// No info.txt \n")
    model_num = int(input(">> "))

    try:
        meta_path = les_meta_path[model_num]
        model_dir_path = '/'.join(meta_path.split("/")[:-1]) + "/"
    except (IndexError, TypeError):
        print("Wrong input")
        return -1

    print("Restoring the model", end="")
    sys.stdout.flush()
    sess = tf.Session()
    # Step-1: Recreate the network graph. At this step only graph is created.
    saver = tf.train.import_meta_graph(meta_path)
    # Step-2: Now let's load the weights saved using the restore method.
    saver.restore(sess, tf.train.latest_checkpoint(model_dir_path))
    graph = tf.get_default_graph()
    y_pred = graph.get_tensor_by_name("y_pred:0")
    print(" - Done")

    print("Feeding the image to the input")
    x = graph.get_tensor_by_name("x:0")
    y_true = graph.get_tensor_by_name("y_true:0")

    les_labels = []
    try:
        with open(model_dir_path + "labels.txt", 'r') as f:
            for line in f:
                label = line.replace("\n", "")
                if label != "":
                    les_labels.append(label)
    except Exception as e:
        les_labels = ['Bathroom', 'Bedroom', 'Kitchen', 'Living Room']
        print(
            "Error openning labels.txt. We are going to use default values : "
            + str(les_labels))
        print("***\n" + str(e) + "\n***")
    print("Using labels : " + str(les_labels))

    y_test_images = np.zeros((1, len(les_labels)))
    ### Creating the feed_dict that is required to be fed to calculate y_pred
    feed_dict_testing = {x: x_batch, y_true: y_test_images}
    result = sess.run(y_pred, feed_dict=feed_dict_testing)
    print(result[0])
    # result is an array of class probabilities, one per label
    print("Prediction : ")
    for i in range(len(result[0])):
        print("\t" + les_labels[i] + " : " +
              str('{0:f}'.format(round(result[0][i] * 100, 5))) + "%")
Example #7
    # Rename checkpoint keys by dropping their first dotted component (e.g. a 'module.' prefix)
    for k, v in org_dict['net'].items():
        temp['.'.join(k.split('.')[1:])] = v
    pretrained_model.load_state_dict(temp)

    val_transform = data_transform(False)
    val_data = TextImageSet(data_root, transform=val_transform, is_train=False)
    val_loader = DataLoader(val_data,
                            batch_size=1,
                            shuffle=False,
                            num_workers=6)

    for i, (data, labels_pro, img_path) in enumerate(val_loader):
        if i > 10:
            break
        img_path = img_path[0]
        original_image = read_image(img_path)
        original_image = original_image.resize((192, 64))
        file_name_to_export = img_path[img_path.rfind('/') +
                                       1:img_path.rfind('.')]

        # Grad cam
        grad_cam = GradCam(pretrained_model)
        # Generate cam mask single
        # cam = grad_cam.generate_cam(
        #     data, labels_pro, num_classes=len(cfg.alphabets))
        for l in range(3, 26):
            cam = grad_cam.generate_cam(data,
                                        labels_pro,
                                        num_classes=len(cfg.alphabets),
                                        nl=l)
            # Save mask
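The loop at the top of this fragment renames checkpoint keys before loading them into the model; a self-contained sketch of that idiom follows, with a hypothetical checkpoint path and an assumed model variable:

# Sketch: drop the first dotted component of every checkpoint key (typically a
# 'module.' prefix added by nn.DataParallel) so it matches the bare model's state_dict.
from collections import OrderedDict
import torch

org_dict = torch.load('checkpoint.pth', map_location='cpu')  # hypothetical path
temp = OrderedDict()
for k, v in org_dict['net'].items():
    temp['.'.join(k.split('.')[1:])] = v  # 'module.cnn.weight' -> 'cnn.weight'
pretrained_model.load_state_dict(temp)    # pretrained_model is assumed to be defined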
Example #8
def main():
    dir_photos = "./data/flickr8k/Flicker8k_photos/"
    file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt"

    jpg_files = ds.images_info(dir_photos)
    ann_dframe = ds.annots_info(file_annot, df=True)
    print(
        "Dataset overview\n-------------------------------------------------------------------------------------------------------------\n"
    )
    print(ann_dframe)
    print(
        "\n-------------------------------------------------------------------------------------------------------------\n"
    )

    ## Prepare captions
    print("Preparing caption data for images")
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)

    ## Clean text
    print("Cleaning text ... ", end="")
    for i, cpt in enumerate(ann_dframe.caption.values):
        ann_dframe["caption"].iloc[i] = ds.clean_text(cpt)
    print("done.")
    print(ann_dframe)
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)

    ## Add start and end sequence token
    ann_dframe_orig = copy(ann_dframe)
    ann_dfrm = ds.add_start_end_tokens(ann_dframe)
    print(ann_dfrm)

    vgg_net = vis.models.vgg16(pretrained="imagenet", progress=True)
    for p in vgg_net.parameters():
        p.requires_grad = False
    ## Load model parameters from path
    # vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth'))
    ## Features in the last layer
    num_ftrs = vgg_net.classifier[-1].in_features
    print(num_ftrs)
    print(vgg_net)
    ## Remove the final Linear classifier layer so the network outputs 4096-d features
    vgg_net.classifier = vgg_net.classifier[:-1]
    # ## Net architecture
    # summary(vgg_net, input_size=(3, 224, 224))
    print(vgg_net)
    # ## Features in the last layer
    # num_ftrs = vgg_net.classifier[-1].in_features
    # print(num_ftrs)
    #
    ## Read images with specified transforms
    print("Reading images ... ", end='')
    images = ds.read_image(jpg_files,
                           dir_photos,
                           normalize=True,
                           resize=224,
                           tensor=True)
    print("done.")
    # print(images.keys())
    ## Get feature map for image tensor through VGG-16
    img_featrs = OD()
    print("Gathering images' features from last conv layer ... ", end='')
    for i, jpg_name in enumerate(images.keys()):
        with torch.no_grad():
            print(i, jpg_name)
            img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0))
    print("done.")
    # print(img_featrs, img_featrs[jpg_name].size(), sep='\n')
    print(img_featrs.keys())

    # Get features for images in our dataset from pretrained VGG-16
    features = mdl.get_features(dir_photos, read=True, download=False)
    print(features)

    ## Prep image tensor
    print("Prepping image tensor ... ", end="")
    fnames = []
    img_tns_list = []
    cap_list = []
    for i, jpg_name in enumerate(ann_dfrm.filename.values):
        if (i % 5) == 0:
            if jpg_name in img_featrs.keys():
                fnames.append(jpg_name)
                img_tns_list.append(img_featrs[jpg_name])
                cap_list.append(ann_dfrm.iloc[i]["caption"])
    print("done.")
    print(len(img_tns_list), len(cap_list))
    img_tns = torch.cat(img_tns_list)
    print(img_tns.shape)
    print(
        "Saving filenames list, image tensor list, captions tensor list ... ",
        end="")
    torch.save(fnames, 'fnames.pkl')
    torch.save(img_tns_list, 'image_tns_list.pkl')
    torch.save(cap_list, 'captions_list.pkl')
    print("done.")

    print("Loading fnames, image tensor list and captions tensor list ... ",
          end="")
    fnames = torch.load('fnames.pkl')
    img_tns_list = torch.load('image_tns_list.pkl')
    img_tns = torch.cat(img_tns_list)
    cap_list = torch.load('captions_list.pkl')
    # print(len(fnames), cap_list)
    print("done.")

    cap_seq, vocab_size, cap_max_len, tokens = ds.tokenizer(cap_list)
    n_cap = len(cap_seq)
    vald_prop, test_prop = 0.2, 0.2
    n_vald = int(n_cap * vald_prop)
    n_test = int(n_cap * test_prop)

    train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test)
    train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test)
    # train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test)

    print(len(train_cap), len(valid_cap), len(evaln_cap))
    print(len(train_ims), len(valid_ims), len(evaln_ims))
    # print(len(train_fnm), len(valid_fnm), len(evaln_fnm))

    images_train, captions_train, target_caps_train = ds.prep_data(
        train_ims, train_cap, vocab_size, cap_max_len)
    images_valid, captions_valid, target_caps_valid = ds.prep_data(
        valid_ims, valid_cap, vocab_size, cap_max_len)

    ## Dataloader
    bs = 64
    trainset = ds.Flickr8k(images_train, captions_train, target_caps_train)
    validset = ds.Flickr8k(images_valid, captions_valid, target_caps_valid)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=bs,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(validset, batch_size=bs)

    #
    # ## Device: CPU or GPU?
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Using " + device)

    ## Model
    model = mdl.CapNet(vocab_size, cap_max_len).to(device)
    criterion = nn.CrossEntropyLoss()

    ## Optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
    max_n_epochs = 5

    # ## Training
    print("Starting training ... ")

    epoch_train_loss, epoch_valid_loss = [], []
    min_val_loss = 100
    for epoch in range(1, max_n_epochs + 1):
        print("-------------------- Epoch: [%d / %d] ----------------------" %
              (epoch, max_n_epochs))
        training_loss, validation_loss = 0.0, 0.0
        ## Batch training
        for i, data in enumerate(trainloader):
            images, captions, target_caps = data[0].to(device), data[1].to(
                device), data[2].to(device)
            optimizer.zero_grad()
            out = model(images, captions.t())
            loss = criterion(out, target_caps)
            loss.backward()
            optimizer.step()
            training_loss += loss.item()
        epoch_train_loss.append(training_loss / len(trainloader))
        print("Training loss: %f" % (epoch_train_loss[-1]), end=" ")
        for i, data in enumerate(validloader):
            with torch.set_grad_enabled(False):
                images, captions, target_caps = data[0].to(device), data[1].to(
                    device), data[2].to(device)
                out = model(images, captions.t())
                loss = criterion(out, target_caps)
                validation_loss += loss.item()
        epoch_valid_loss.append(validation_loss / len(validloader))
        print("Validation loss: %f" % (epoch_valid_loss[-1]))
        scheduler.step()

        if epoch_valid_loss[-1] < min_val_loss:
            print("Found best model.")
            best_model = deepcopy(model)
            min_val_loss = epoch_valid_loss[-1]  # Track the best validation loss so far

    plt.plot(list(range(max_n_epochs)),
             epoch_train_loss,
             label="Training loss")
    plt.plot(list(range(max_n_epochs)),
             epoch_valid_loss,
             label="Validation loss")
    plt.xlabel("Number of epochs")
    plt.ylabel("Loss")
    plt.title("Number of epochs vs loss")
    plt.legend()
    plt.show()

    ###########
    # Save model
    print("Saving best model ... ")
    torch.save(best_model, 'best_model.pkl')
Example #9
def main():
    dir_photos = "./data/flickr8k/Flicker8k_photos/"
    file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt"

    jpg_files = ds.images_info(dir_photos)
    ann_dframe = ds.annots_info(file_annot, df=True)
    print("Dataset overview\n-------------------------------------------------------------------------------------------------------------\n")
    print(ann_dframe)
    print("\n-------------------------------------------------------------------------------------------------------------\n")


    ## Prepare captions
    print("Preparing caption data for images")
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)


    ## Clean text
    print("Cleaning text ... ", end="")
    for i, cpt in enumerate(ann_dframe.caption.values):
        ann_dframe["caption"].iloc[i] = ds.clean_text(cpt)
    print("done.")
    print(ann_dframe)
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)

    ## Add start and end sequence token
    ann_dframe_orig = copy(ann_dframe)
    ann_dfrm = ds.add_start_end_tokens(ann_dframe)
    print(ann_dfrm)


    vgg_net = vis.models.vgg16(pretrained="imagenet", progress=True)
    for p in vgg_net.parameters():
        p.requires_grad = False
    ## Load model parameters from path
    # vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth'))
    ## Features in the last layer
    num_ftrs = vgg_net.classifier[-1].in_features
    print(num_ftrs)
    print(vgg_net)
    ## Remove the final Linear classifier layer so the network outputs 4096-d features
    vgg_net.classifier = vgg_net.classifier[:-1]
    # ## Net architecture
    # summary(vgg_net, input_size=(3, 224, 224))
    print(vgg_net)
    # ## Features in the last layer
    # num_ftrs = vgg_net.classifier[-1].in_features
    # print(num_ftrs)
    #
    ## Read images with specified transforms
    print("Reading images ... ", end='')
    images = ds.read_image(jpg_files, dir_photos, normalize=True, resize=224, tensor=True)
    print("done.")
    # print(images.keys())
    ## Get feature map for image tensor through VGG-16
    img_featrs = OD()
    print("Gathering images' features from last conv layer ... ", end='')
    for i, jpg_name in enumerate(images.keys()):
        with torch.no_grad():
            print(i, jpg_name)
            img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0))
    print("done.")
    # print(img_featrs, img_featrs[jpg_name].size(), sep='\n')
    print(img_featrs.keys())

    # Get features for images in our dataset from pretrained VGG-16
    features = mdl.get_features(dir_photos, read=True, download=False)
    print(features)
    
    ## Prep image tensor
    print("Prepping image tensor ... ", end="")
    fnames = []
    img_tns_list = []
    cap_list = []
    for i, jpg_name in enumerate(ann_dfrm.filename.values):
        if (i % 5) == 0:
            if jpg_name in img_featrs.keys():
                fnames.append(jpg_name)
                img_tns_list.append(img_featrs[jpg_name])
                cap_list.append(ann_dfrm.iloc[i]["caption"])
    print("done.")
    print(len(img_tns_list), len(cap_list))
    img_tns = torch.cat(img_tns_list)
    print(img_tns.shape)
    print("Saving filenames list, image tensor list, captions tensor list ... ", end="")
    torch.save(fnames, 'fnames.pkl')
    torch.save(img_tns_list, 'image_tns_list.pkl')
    torch.save(cap_list, 'captions_list.pkl')
    print("done.")
    
    print("Loading fnames, image tensor list and captions tensor list ... ", end="")
    fnames = torch.load('fnames.pkl')
    img_tns_list = torch.load('image_tns_list.pkl')
    img_tns = torch.cat(img_tns_list)
    cap_list = torch.load('captions_list.pkl')
    # print(len(fnames), cap_list)
    print("done.")

    cap_seq, vocab_size, cap_max_len, tokens = ds.tokenizer(cap_list)
    n_cap = len(cap_seq)
    vald_prop, test_prop = 0.2, 0.2
    n_vald = int(n_cap * vald_prop)
    n_test = int(n_cap * test_prop)

    train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test)
    train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test)
    # train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test)

    print(len(train_cap), len(valid_cap), len(evaln_cap))
    print(len(train_ims), len(valid_ims), len(evaln_ims))

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Using " + device)

    print("Loading model ...")
    model = torch.load('best_model.pkl')
    print(model)
    model.eval()

    # print(fnames)

    preds = []
    for feat in evaln_ims:
        preds.append(predict_caption(model, feat, cap_max_len, tokens, device))

    best_targets = []
    for p, t in zip(preds, cap_list[:n_test]):
        pred = p.split(" ")
        targ = [t.split(" ")]
        z = sentence_bleu(targ, pred, weights=(1, 0, 0, 0))
        if z > 0.50:
            print(p, t, z, sep='\n')
            print("\n")
            best_targets.append(t)
    print(best_targets)

    for cap in best_targets:
        rows = ann_dfrm.loc[ann_dfrm["caption"]==cap, "filename"]
        print(rows)
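For reference, the weights=(1, 0, 0, 0) argument used above reduces sentence_bleu to a unigram (BLEU-1) score; a minimal illustration with made-up sentences:

# BLEU-1 example: with equal-length sentences the score is simply the fraction
# of candidate words that appear in the reference (5 of 6 here).
from nltk.translate.bleu_score import sentence_bleu

reference = [["a", "dog", "runs", "in", "the", "grass"]]
candidate = ["a", "dog", "runs", "on", "the", "grass"]
print(sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))  # ~0.833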
Example #10
def main():
    dir_photos = "./data/Flickr8k/Flickr8k_Dataset/Flicker8k_Dataset/"
    file_annot = "./data/Flickr8k/Flickr8k_text/Flickr8k.token.txt"

    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Get basic dataset info
    print("DATASET INFO")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    jpg_files = ds.images_info(dir_photos)
    print("Number of photos in Flickr8k: %d" % (len(jpg_files)))
    ann_dframe = ds.annots_info(file_annot, df=True)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Visualize data overview
    print("DATASET OVERVIEW")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    print(ann_dframe)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Prepare captions
    print("CURATE CAPTIONS")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)

    ## Clean text
    start = time.time()
    print("Cleaning text ... ", end="")
    for i, cpt in enumerate(ann_dframe.caption.values):
        ann_dframe["caption"].iloc[i] = ds.clean_text(cpt)
    print("done.")
    # print(ann_dframe)
    # word_count = ds.word_freq(ann_dframe)
    # print(word_count)

    ## Add start and end sequence token
    ann_dframe_orig = copy(ann_dframe)
    print("Adding start and end tokens ... ", end="")
    ann_dfrm = ds.add_start_end_tokens(ann_dframe)
    print("done.")
    elapsed = time.time() - start
    print("\nTime to preprocess {} captions: {:.2f} \
            seconds".format(i, elapsed))
    # print(ann_dfrm)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    # ## Read images with specified transforms
    print("READ IMAGES & EXTRACT FEATURES")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    mean = [0.485, 0.456, 0.406]
    stdv = [0.229, 0.224, 0.225]
    transforms = vis.transforms.Compose([
        vis.transforms.Resize(256),
        vis.transforms.CenterCrop(224),
        vis.transforms.ToTensor(),
        vis.transforms.Normalize(mean=mean, std=stdv)
    ])
    print("Reading images ... ", end='')
    images = ds.read_image(dir_photos, transforms)
    print("done.")

    # Get feature maps for image tensor through VGG-16
    features_dict, features_fname = mdl.get_features(images,
                                                     download_wts=False,
                                                     save=True,
                                                     cuda=True)
    # print(features_dict)

    ## Load feature maps
    features_dict = torch.load(features_fname)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Prep image tensor
    print("PREP IMAGE TENSOR")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    ann_dfrm = ann_dfrm.loc[ann_dfrm["idx"].values == "0", :]
    print(ann_dfrm)
    ds.word_freq(ann_dfrm)
    fnames = []
    img_tns_list = []
    cap_list = []
    for i, jpg_name in enumerate(ann_dfrm.filename.values):
        if jpg_name in features_dict.keys():
            fnames.append(jpg_name)
            img_tns_list.append(features_dict[jpg_name])
            cap_list.append(ann_dfrm.iloc[i]["caption"])
    print(len(img_tns_list), len(cap_list))
    img_tns = torch.cat(img_tns_list)
    print(img_tns.shape)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Text tokenize
    print("TEXT TOKENIZE")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    tokens, cap_seq, vocab_size, cap_max_len = ds.tokenizer(cap_list)
    print("Vocab size: ", vocab_size)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Dataset splits
    print("DATASET SPLIT")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    n_cap = len(cap_seq)
    vald_prop, test_prop = 0.2, 0.2
    n_vald = int(n_cap * vald_prop)
    n_test = int(n_cap * test_prop)
    train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test)
    train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test)
    train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test)

    print(len(train_cap), len(valid_cap), len(evaln_cap))
    print(len(train_ims), len(valid_ims), len(evaln_ims))
    print(len(train_fnm), len(valid_fnm), len(evaln_fnm))
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Prep data for training and validation
    print("FINAL PREP FOR TRAINING & VALIDATION")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )
    images_train, captions_train, target_caps_train = ds.prep_data(
        train_ims, train_cap, cap_max_len)
    images_valid, captions_valid, target_caps_valid = ds.prep_data(
        valid_ims, valid_cap, cap_max_len)
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## TRAINING
    print("TRAINING")
    print(
        "---------------------------------------------------------------------------------------------------------\n"
    )

    ## Hyperparameters
    bs = 64
    lr = 0.001
    lr_steps = 20
    gamma = 0.1
    max_n_epochs = 5

    ## Dataloader
    print("DATALOADERS")
    trainset = ds.Flickr8k(images_train, captions_train, target_caps_train)
    validset = ds.Flickr8k(images_valid, captions_valid, target_caps_valid)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=bs,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(validset, batch_size=bs)

    ## Device: CPU or GPU?
    print("DEVICE:", end=" ")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Using " + device)

    ## Model
    print("MODEL:")
    model = mdl.CapNet(vocab_size, cap_max_len).to(device)

    # Criterion
    criterion = nn.CrossEntropyLoss()

    ## Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=lr_steps,
                                          gamma=gamma)

    ## Training
    print("\nStarting training ... ")

    epoch_train_loss, epoch_valid_loss = [], []
    min_val_loss = 100
    for epoch in range(1, max_n_epochs + 1):
        print("-------------------- Epoch: [%d / %d] ----------------------" %
              (epoch, max_n_epochs))
        training_loss, validation_loss = 0.0, 0.0
        ## Batch training
        for i, data in enumerate(trainloader):
            tr_images, tr_captions, tr_target_caps = data[0].to(
                device), data[1].to(device), data[2].to(device)
            optimizer.zero_grad()
            tr_out = model(tr_images, tr_captions.t())
            tr_loss = criterion(tr_out, tr_target_caps)
            tr_loss.backward()
            optimizer.step()
            training_loss += tr_loss.item()
        epoch_train_loss.append(training_loss / len(trainloader))
        print("Training loss: %f" % (epoch_train_loss[-1]), end=" || ")
        for i, data in enumerate(validloader):
            with torch.set_grad_enabled(False):
                vl_images, vl_captions, vl_target_caps = data[0].to(
                    device), data[1].to(device), data[2].to(device)
                vl_out = model(vl_images, vl_captions.t())
                vl_loss = criterion(vl_out, vl_target_caps)
                validation_loss += vl_loss.item()
        epoch_valid_loss.append(validation_loss / len(validloader))
        print("Validation loss: %f" % (epoch_valid_loss[-1]))
        scheduler.step(epoch=epoch)

        if epoch_valid_loss[-1] < min_val_loss:
            print("Found best model.")
            best_model = deepcopy(model)
            min_val_loss = epoch_valid_loss[-1]

    plt.plot(list(range(max_n_epochs)),
             epoch_train_loss,
             label="Training loss")
    plt.plot(list(range(max_n_epochs)),
             epoch_valid_loss,
             label="Validation loss")
    plt.xlabel("Number of epochs")
    plt.ylabel("Loss")
    plt.title("Number of epochs vs loss")
    plt.legend()
    plt.show()

    ## Save model
    print("Saving best model ... ")
    torch.save(best_model, 'best_model.pkl')
    print(
        "\n-------------------------------------------------------------------------------------------------------\n"
    )

    ## Check output
    print("Loading model ...")
    model = torch.load('best_model.pkl')
    print(model)

    model.eval()
    preds = []
    for feat in evaln_ims:
        preds.append(model.prediction(feat, tokens, device))

    best_targets = []
    bleu_scores = []
    for p, t in zip(preds, cap_list[:n_test]):
        pred = p.split(" ")
        targ = [t.split(" ")]
        z = sentence_bleu(targ, pred, weights=(1, 0, 0, 0))
        bleu_scores.append(z)
        if z > 0.50:
            print(p, t, z, sep='\n')
            print("\n")
            best_targets.append((p, t, z))
    for i, tgt in enumerate(best_targets):
        print("{}: {}".format(i, tgt))
    print("MEAN BLEU SCORE: %3f" % np.mean(bleu_scores))