# Assumed imports; the project-specific helpers (inception, vgg, squeezenet,
# run_forward_lstm, run_backward_lstm, run_fill_lstm, nn_search, get_one_hot,
# create_img_outfit, TextTransforms) come from the surrounding repository.
import json
import os

import h5py
import numpy as np
import torch


def main(model_name, model_type, feats_name, img_savepath, query_file, vocab_file, cuda):
    """Generate outfits for the given image/text queries and save them as images."""
    queries = json.load(open(query_file))
    vocab = json.load(open(vocab_file))

    data = h5py.File(feats_name, 'r')
    data_dict = dict()
    for fname, feat in zip(data['filenames'], data['features']):
        data_dict[fname] = feat

    if model_type == 'inception':
        model = inception(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'vgg':
        model = vgg(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'squeezenet':
        model = squeezenet(512, 512, 2480, batch_first=True, dropout=0.7)
    else:
        print("Please, specify a valid model type: inception, vgg, squeezenet"\
              "instead of %s" % model_type)
        return

    """Load the model weights."""
    if cuda:
        model = model.cuda()
    model.load_state_dict(torch.load(model_name))
    model.eval()

    txt_trf = TextTransforms()
    # pylint: disable=W0108
    txt_norm = lambda x: txt_trf.normalize(x)

    # First of all, add a zero vector to the features data for start/stop.
    data_dict['zeros'] = np.zeros_like(data['features'][0])
    zero_idx = list(data_dict.keys()).index('zeros')

    # dicts preserve insertion order (Python 3.7+), so the rows of
    # answers_feats line up with list(data_dict.keys()) and with zero_idx.
    answers_feats = torch.from_numpy(np.array(list(data_dict.values())))
    answers_feats = torch.nn.functional.normalize(answers_feats, p=2, dim=1)
    if cuda:
        answers_feats = answers_feats.cuda()

    for nq, query in enumerate(queries):
        # Now, generate outfit for one image (forward and backward prediction until start/stop):
        query_feats = torch.from_numpy(np.array([data_dict[q] for q in query['image_query']]))
        query_feats = torch.nn.functional.normalize(query_feats, p=2, dim=1)
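        # Features are L2-normalized, so the inner products computed below
        # (torch.mm against seq_feats) are cosine similarities, and the
        # argmax of a row is the nearest neighbour.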

        if cuda:
            query_feats = query_feats.cuda()

        first_prod = query_feats[0].unsqueeze(0)  # Start with the first image
        # Forward prediction
        forward_seq = run_forward_lstm(model, first_prod, answers_feats, data_dict, zero_idx, cuda)
        # Backward prediction
        backward_seq = run_backward_lstm(model, first_prod, answers_feats, data_dict, zero_idx, cuda)

        # Concatenate full sequence (forward + backward) generated by first product
        first_sequence = backward_seq + [query['image_query'][0]] + forward_seq
        seq_feats = torch.from_numpy(np.array([data_dict[im] for im in first_sequence]))
        seq_feats = torch.nn.functional.normalize(seq_feats, p=2, dim=1)
        if cuda:
            seq_feats = seq_feats.cuda()

        # If there are more images, substitute the nearest one by the query and recompute:
        if len(query['image_query']) >= 2:
            positions = [len(backward_seq)]  # Position of the first query in the sequence
            for i, img in enumerate(query['image_query'][1:]):
                # Find NN of the next item
                dists = torch.mm(query_feats[i + 1].unsqueeze(0), seq_feats.permute(1, 0))
                _, idx = torch.max(dists, 1)
                positions.append(int(idx[0]))  # store as a plain int so np.min/np.max work below

            start_pos = np.min(positions)
            end_pos = np.max(positions)
            if start_pos == positions[0]:
                start_feats = query_feats[0].unsqueeze(0)
                end_feats = query_feats[i + 1].unsqueeze(0)
                start_item = query['image_query'][0]
                end_item = query['image_query'][i + 1]
            else:
                # Otherwise treat the last matched query item as the start of
                # the span, so start_item/end_item are always defined.
                start_feats = query_feats[i + 1].unsqueeze(0)
                end_feats = query_feats[0].unsqueeze(0)
                start_item = query['image_query'][i + 1]
                end_item = query['image_query'][0]

            blanks = run_fill_lstm(model, start_feats, end_feats, end_pos - start_pos - 1,
                                   answers_feats, data_dict, zero_idx, cuda)
            sets = [start_item] + blanks + [end_item]
            sets_feats = torch.from_numpy(np.array([data_dict[im] for im in sets]))
            sets_feats = torch.nn.functional.normalize(sets_feats, p=2, dim=1)
            if cuda:
                sets_feats = sets_feats.cuda()

            # run bi LSTM again
            forward_seq = run_forward_lstm(model, sets_feats, answers_feats, data_dict, zero_idx, cuda)
            backward_seq = run_backward_lstm(model, sets_feats, answers_feats, data_dict, zero_idx, cuda)
            sets = backward_seq + sets + forward_seq
            positions = [len(backward_seq), len(sets) - len(forward_seq) - 1]

        else:
            sets = backward_seq + query['image_query'] + forward_seq
            positions = [len(backward_seq)]  # position of the single query item in the sequence

        if len(query['text_query']):
            text_query = txt_norm(query['text_query'])
            texts = torch.stack([get_one_hot(word, vocab) for word in text_query.split()])
            texts = torch.autograd.Variable(texts)
            if cuda:
                texts = texts.cuda()
            text_query_feat = model.textn(texts)
            text_query_feat = torch.mean(text_query_feat.view(len(text_query_feat), -1), 0)
            text_query_feat = torch.nn.functional.normalize(text_query_feat.unsqueeze(0), p=2, dim=1)

            # Replace every generated (non-query) item by its nearest
            # neighbour under the text query.
            sets_text = sets[:]
            for i, item in enumerate(sets):
                if item not in query['image_query']:
                    sets_text[i] = nn_search(item, text_query_feat, data_dict, answers_feats, cuda)

        create_img_outfit(sets, positions, os.path.join(img_savepath, "%d.jpg" % nq))
        if len(query['text_query']):
            create_img_outfit(sets_text, positions,
                              os.path.join(img_savepath, "%d_%s.jpg" % (nq, text_query)))
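# A hypothetical example of one entry in query_file, inferred from the code
# above (the field names are real, the values are placeholders):
#
#     {
#         "image_query": ["top_001.jpg", "skirt_042.jpg"],
#         "text_query": "red floral"
#     }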
# Example 2
# Assumed imports; ImageTransforms, PolyvoreDataset, collate_seq, LSTMLosses,
# SBContrastiveLoss, the model builders and the module-level TXT_TRF /
# TXT_TEST_VAL_TF transforms come from the surrounding repository.
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision


def config(net_params, data_params, opt_params, cuda_params):
    """Get parameters to configure the experiment and prepare the needed variables.

    Args:
        - net_params: list containing:
            model_type (str): backbone to use ('inception', 'vgg' or 'squeezenet')
            input_dim (int): input dimension for the LSTM
            hidden_dim (int): hidden dimension for the LSTM
            margin (float): margin for the contrastive loss
            vocab_size (int): size of the vocabulary
            load_path (str): path for loading weights (None by default)
            freeze (bool): whether to freeze the CNN layers
        - data_params: dictionary with keys:
            'img_dir': path to the directory where images are (string)
            'json_dir': path to the directory where jsons are (string)
            'json_files': names of the train, test and validation jsons (dictionary)
            'batch_size': batch_size (int)
            'batch_first': batch_first (bool) for the LSTM sequences
        - opt_params: dictionary with keys:
            'learning_rate': learning rate value (float)
            'weight_decay': weight decay value (float)
        - cuda_params: dictionary with keys:
            'cuda': (bool): whether to use GPU or not
            'multigpu': (list of int): indices of GPUs to use

    Returns:
        - model: pytorch model to train
        - dataloaders: data iterators for train, test and validation
        - optimizer: optimizer to train with
        - criterion: LSTM loss to train with
        - contrastive_criterion: contrastive loss to train with
    """
    model_type, input_dim, hidden_dim, margin, vocab_size, load_path, freeze = net_params

    if model_type == 'inception':
        model = inception(input_dim, hidden_dim, vocab_size, data_params['batch_first'],
                          dropout=0.7, freeze=freeze)
        img_size = 299
        img_trf = {'train': ImageTransforms(img_size + 6, 5, img_size, 0.5),
                   'test': ImageTransforms(img_size)}
        img_train_tf = lambda x: torchvision.transforms.ToTensor()(img_trf['train'].random_crop(
            img_trf['train'].random_rotation(img_trf['train'].random_horizontal_flip(
                img_trf['train'].resize(x)))))
        img_test_val_tf = lambda x: torchvision.transforms.ToTensor()(img_trf['test'].resize(x))
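        # Note: unlike the vgg and squeezenet branches below, this branch
        # applies no ImageNet mean/std normalization to the tensors.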

    elif model_type == 'vgg':
        model = vgg(input_dim, hidden_dim, vocab_size, data_params['batch_first'],
                    dropout=0.7, freeze=freeze)
        img_size = 224
        norm_trf = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])
        img_trf = {'train': ImageTransforms(img_size + 6, 5, img_size, 0.5),
                   'test': ImageTransforms(img_size)}
        img_train_tf = lambda x: norm_trf(torchvision.transforms.ToTensor()(img_trf['train'].random_crop(
            img_trf['train'].random_rotation(img_trf['train'].random_horizontal_flip(
                img_trf['train'].resize(x))))))
        img_test_val_tf = lambda x: norm_trf(torchvision.transforms.ToTensor()(img_trf['test'].resize(x)))

    elif model_type == 'squeezenet':
        model = squeezenet(input_dim, hidden_dim, vocab_size, data_params['batch_first'],
                           dropout=0.7, freeze=freeze)
        img_size = 227
        norm_trf = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])
        img_trf = {'train': ImageTransforms(img_size + 6, 5, img_size, 0.5),
                   'test': ImageTransforms(img_size)}
        img_train_tf = lambda x: norm_trf(torchvision.transforms.ToTensor()(img_trf['train'].random_crop(
            img_trf['train'].random_rotation(img_trf['train'].random_horizontal_flip(
                img_trf['train'].resize(x))))))
        img_test_val_tf = lambda x: norm_trf(torchvision.transforms.ToTensor()(img_trf['test'].resize(x)))

    else:
        print("Please, specify a valid model type: inception, vgg or squeezenet"\
              "instead of %s" % model_type)
        return

    txt_train_tf = lambda x: TXT_TRF.random_delete(TXT_TRF.normalize(x))

    img_transforms = {'train': img_train_tf,
                      'test': img_test_val_tf,
                      'val': img_test_val_tf}

    txt_transforms = {'train': txt_train_tf,
                      'test': TXT_TEST_VAL_TF,
                      'val': TXT_TEST_VAL_TF}

    if load_path is not None:
        print("Loading weights from %s" % load_path)
        model.load_state_dict(torch.load(load_path))
    if cuda_params['cuda']:
        print("Switching model to gpu")
        model.cuda()
    if cuda_params['multigpu']:
        # The broken "multgpu = ast.literal_eval(multigpu[0])" line referenced
        # an undefined name and its result was never used, so it is dropped;
        # device_ids comes straight from cuda_params['multigpu'].
        print("Switching model to multigpu")
        model.cuda()
        model = nn.DataParallel(model, device_ids=cuda_params['multigpu'])

    dataloaders = {x: torch.utils.data.DataLoader(
        PolyvoreDataset(os.path.join(data_params['json_dir'], data_params['json_files'][x]),
                        data_params['img_dir'],
                        img_transform=img_transforms[x], txt_transform=txt_transforms[x]),
        batch_size=data_params['batch_size'],
        shuffle=True, num_workers=24,
        collate_fn=collate_seq,
        pin_memory=True)
                   for x in ['train', 'test', 'val']}

    # Optimize only the layers with requires_grad = True, not the frozen layers:
    optimizer = optim.SGD(filter(lambda x: x.requires_grad, model.parameters()),
                          lr=opt_params['learning_rate'], weight_decay=opt_params['weight_decay'])
    criterion = LSTMLosses(data_params['batch_first'], cuda_params['cuda'])
    contrastive_criterion = SBContrastiveLoss(margin)

    return model, dataloaders, optimizer, criterion, contrastive_criterion
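# A minimal usage sketch (hypothetical values; the directory layout, file
# names and hyperparameters below are placeholders, not from the source):
#
#     net_params = ['vgg', 512, 512, 0.2, 2480, None, True]
#     data_params = {'img_dir': 'data/images',
#                    'json_dir': 'data/label',
#                    'json_files': {'train': 'train.json',
#                                   'test': 'test.json',
#                                   'val': 'valid.json'},
#                    'batch_size': 10,
#                    'batch_first': True}
#     opt_params = {'learning_rate': 0.2, 'weight_decay': 1e-4}
#     cuda_params = {'cuda': True, 'multigpu': []}
#     model, dataloaders, optimizer, criterion, contrastive_criterion = \
#         config(net_params, data_params, opt_params, cuda_params)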
# Example 3
# Assumed imports; the model builders, predict_single_direction,
# predict_multi_direction and create_img_fitb come from the surrounding
# repository.
import json
import os
import sys
import time

import h5py
import numpy as np
import torch


def main(model_name, model_type, feats_name, img_savepath, cuda):
    """Evaluate a model on the fill-in-the-blank (FITB) task and save result images."""
    outfit_file = 'data/label/fill_in_blank_test.json'
    fitb = json.load(open(outfit_file))

    data = h5py.File(feats_name, 'r')
    data_dict = dict()
    for fname, feat in zip(data['filenames'], data['features']):
        data_dict[fname] = feat

    if model_type == 'inception':
        model = inception(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'vgg':
        model = vgg(512, 512, 2480, batch_first=True, dropout=0.7)
    elif model_type == 'squeezenet':
        model = squeezenet(512, 512, 2480, batch_first=True, dropout=0.7)
    else:
        print("Please, specify a valid model type: inception, vgg or squeezenet"\
              "instead of %s" % model_type)
        return
    """Load the model weights."""
    if cuda:
        model = model.cuda()
    model.load_state_dict(torch.load(model_name))
    model.eval()

    scores = []

    tic = time.time()
    for i, outfit in enumerate(fitb):
        sys.stdout.write('Outfit %d/%d - %2.f secs remaining\r' %
                         (i, len(fitb),
                          (time.time() - tic) / (i + 1) * (len(fitb) - i)))
        sys.stdout.flush()
        question_feats = torch.from_numpy(
            np.array([data_dict[q] for q in outfit['question']]))
        question_feats = torch.nn.functional.normalize(question_feats,
                                                       p=2,
                                                       dim=1)

        answers_feats = torch.from_numpy(
            np.array([data_dict[a] for a in outfit['answers']]))
        answers_feats = torch.nn.functional.normalize(answers_feats,
                                                      p=2,
                                                      dim=1)

        if cuda:
            question_feats = question_feats.cuda()
            answers_feats = answers_feats.cuda()

        position = outfit['blank_position'] - 1
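        # 'blank_position' in the json is 1-indexed, hence the -1 above.
        # The bidirectional LSTM output concatenates the forward and backward
        # hidden states along dim 2, so out[..., :H] is the forward direction
        # and out[..., H:] the backward one, with H = out.size(2) // 2.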

        if position == 0:
            out, _ = model.lstm(
                torch.autograd.Variable(question_feats).unsqueeze(0))
            out = out.data
            bw_hidden = out[0, :question_feats.size(0),
                            out.size(2) // 2:][0].view(1, -1)
            pred = predict_single_direction(
                torch.autograd.Variable(bw_hidden),
                torch.autograd.Variable(answers_feats))

        elif position == len(question_feats):
            out, _ = model.lstm(
                torch.autograd.Variable(question_feats).unsqueeze(0))
            out = out.data
            fw_hidden = out[0, :question_feats.size(0), :out.size(2) //
                            2][-1].view(1, -1)
            pred = predict_single_direction(
                torch.autograd.Variable(fw_hidden),
                torch.autograd.Variable(answers_feats))

        else:
            prev = question_feats[:position]
            prev_out, _ = model.lstm(
                torch.autograd.Variable(prev).unsqueeze(0))
            prev_out = prev_out.data
            fw_hidden = prev_out[0, :prev.size(0), :prev_out.size(2) //
                                 2][-1].view(1, -1)

            post = question_feats[position:]
            post_out, _ = model.lstm(
                torch.autograd.Variable(post).unsqueeze(0))
            post_out = post_out.data
            bw_hidden = post_out[0, :post.size(0),
                                 post_out.size(2) // 2:][0].view(1, -1)

            pred = predict_multi_direction(
                torch.autograd.Variable(fw_hidden),
                torch.autograd.Variable(bw_hidden),
                torch.autograd.Variable(answers_feats))

        create_img_fitb(
            outfit, pred[0].data[0],
            os.path.join(img_savepath,
                         "%d_score%.2f.jpg" % (i, pred[1].data[0] * 100)))
        scores.append(pred)

    print("\n")
    # A predicted index of 0 counts as correct: the ground-truth item is the
    # first entry in each outfit's 'answers' list.
    acc = np.sum([s[0].data[0] == 0 for s in scores]) / float(len(scores))

    print("\033[0;31m\nModel: %s\033[0m" % model_name)
    print("\033[1;30mFITB accuracy for %d outfits: %.2f%%\033[0m" %
          (len(fitb), acc * 100))
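
# A hypothetical example of one entry in fill_in_blank_test.json, inferred
# from the code above (the field names are real, the values are placeholders):
#
#     {
#         "question": ["outfit_1/1.jpg", "outfit_1/2.jpg", "outfit_1/4.jpg"],
#         "answers": ["outfit_1/3.jpg", "outfit_9/2.jpg", "outfit_5/1.jpg", "outfit_7/4.jpg"],
#         "blank_position": 3
#     }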