def shard_xattn_t2i(images, captions, caplens, opt, tag_masks, shard_size=128):
    """
    Compute the pairwise t2i image-caption score matrix shard by shard.

    The inputs are cut into blocks of at most ``shard_size`` rows, each
    (image block, caption block) pair is scored with ``xattn_score_t2i`` on
    the GPU, and the result is copied into the matching slice of the output.

    Args:
        images: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        captions: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        caplens: per-caption sequence sliced in step with ``captions``.
        opt: options object; ``opt.pos`` decides whether the tag-mask slice is
            forwarded to ``xattn_score_t2i``.
        tag_masks: per-caption sequence sliced in step with ``captions``.
        shard_size: maximum number of images / captions per block.

    Returns:
        ``numpy.ndarray`` of shape ``(len(images), len(captions))``.
    """
    num_images, num_captions = len(images), len(captions)
    # Ceiling division: number of blocks needed along each axis.
    image_blocks = (num_images - 1) // shard_size + 1
    caption_blocks = (num_captions - 1) // shard_size + 1

    scores = np.zeros((num_images, num_captions))
    for row in range(image_blocks):
        row_lo = shard_size * row
        row_hi = min(row_lo + shard_size, num_images)
        for col in range(caption_blocks):
            sys.stdout.write('\r>> shard_xattn_t2i batch (%d,%d)' % (row, col))
            col_lo = shard_size * col
            col_hi = min(col_lo + shard_size, num_captions)

            image_batch = torch.from_numpy(images[row_lo:row_hi]).cuda()
            caption_batch = torch.from_numpy(captions[col_lo:col_hi]).cuda()
            length_batch = caplens[col_lo:col_hi]
            mask_batch = tag_masks[col_lo:col_hi]

            # The masks are only passed along when the `pos` option is set.
            extra_args = (mask_batch,) if opt.pos else ()
            block = xattn_score_t2i(image_batch, caption_batch, length_batch,
                                    opt, *extra_args)
            scores[row_lo:row_hi, col_lo:col_hi] = block.data.cpu().numpy()
    sys.stdout.write('\n')
    return scores
示例#2
0
def get_attn(img_embs, cap_embs, cap_lens, wanted_id, target_id, opt,
             t2i_switch, freqs):
    """Return the attention that ``xattn_score_t2i`` yields for prepared inputs.

    ``prepare_embeddings`` turns the raw embeddings, lengths, ids and
    ``t2i_switch`` into an (image, caption, length) triple; that triple is
    scored together with ``freqs`` and ``opt``. The similarity returned by the
    scorer is discarded; only the attention is handed back.
    """
    prepared = prepare_embeddings(img_embs, cap_embs, cap_lens, wanted_id,
                                  target_id, t2i_switch)
    image_part, caption_part, length_part = prepared

    _, attention = xattn_score_t2i(image_part, caption_part, length_part,
                                   freqs, opt)
    return attention
示例#3
0
def shard_xattn_t2i(images, captions, caplens, opt, shard_size=128):
    """
    Compute the pairwise t2i image-caption score matrix with locality sharding.

    Images and captions are processed in blocks of at most ``shard_size``
    rows; each block pair is scored by ``xattn_score_t2i`` on the GPU and
    written into the corresponding slice of the result.

    Args:
        images: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        captions: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        caplens: per-caption sequence sliced in step with ``captions``.
        opt: options object forwarded unchanged to ``xattn_score_t2i``.
        shard_size: maximum number of images / captions per block.

    Returns:
        ``numpy.ndarray`` of shape ``(len(images), len(captions))``.
    """
    # Floor division keeps the shard counts integral; true division (`/`)
    # produces a float on Python 3 and makes `range()` raise TypeError.
    n_im_shard = (len(images) - 1) // shard_size + 1
    n_cap_shard = (len(captions) - 1) // shard_size + 1

    d = np.zeros((len(images), len(captions)))
    for i in range(n_im_shard):
        im_start, im_end = shard_size * i, min(shard_size * (i + 1),
                                               len(images))
        for j in range(n_cap_shard):
            sys.stdout.write('\r>> shard_xattn_t2i batch (%d,%d)' % (i, j))
            cap_start, cap_end = shard_size * j, min(shard_size * (j + 1),
                                                     len(captions))
            im = torch.from_numpy(images[im_start:im_end]).cuda()
            s = torch.from_numpy(captions[cap_start:cap_end]).cuda()
            l = caplens[cap_start:cap_end]
            sim = xattn_score_t2i(im, s, l, opt)
            d[im_start:im_end, cap_start:cap_end] = sim.data.cpu().numpy()
    sys.stdout.write('\n')
    return d
示例#4
0
def create_attn(data_loader, positions, opt, model):
    """Gather one attention slice per batch produced by ``data_loader``.

    For every batch the embeddings are computed with ``model.forward_emb``
    and scored with ``xattn_score_i2t`` when ``opt.cross_attn == "i2t"``,
    otherwise with ``xattn_score_t2i``. From the returned attention, the slice
    ``attn[0][0, :, positions[i]]`` (``i`` being the batch index) is kept.

    Returns:
        list with one attention slice per batch, in iteration order.
    """
    collected = []

    # Scoring only; no gradients are needed.
    with torch.no_grad():
        for batch_idx, batch in enumerate(data_loader):
            images, captions, lengths, ids, freq_score, freqs = batch

            img_emb, cap_emb, cap_len = model.forward_emb(
                images, captions, lengths, volatile=True)

            # Select the attention direction requested by the options.
            if opt.cross_attn == "i2t":
                scorer = xattn_score_i2t
            else:
                scorer = xattn_score_t2i
            _, attn = scorer(img_emb, cap_emb, cap_len, freqs, opt)

            picked = attn[0][0, :, positions[batch_idx]]
            collected.append(picked)
    return collected
示例#5
0
def shard_xattn_t2i(images, captions, caplens, opt, shard_size=128):
    """
    Compute t2i scores of every image against the first caption, in shards.

    Only ``captions[0]`` is scored. Images are processed in blocks of at most
    ``shard_size`` rows; each block is scored by ``xattn_score_t2i`` on the
    GPU and written into the corresponding rows of the result.

    Args:
        images: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        captions: indexable container; only ``captions[0]`` is used and it is
            handed to ``torch.from_numpy``.
        caplens: forwarded unchanged to ``xattn_score_t2i``.
        opt: options object forwarded unchanged to ``xattn_score_t2i``.
        shard_size: maximum number of images per block.

    Returns:
        ``numpy.ndarray`` of shape ``(len(images), 1)``.
    """
    # Floor division keeps the shard count integral; true division (`/`)
    # produces a float on Python 3 and makes `range()` raise TypeError.
    n_im_shard = (len(images) - 1) // shard_size + 1

    d = np.zeros((len(images), 1))
    for i in range(n_im_shard):
        im_start, im_end = shard_size * i, min(shard_size * (i + 1),
                                               len(images))
        sys.stdout.write('\r>> shard_xattn_t2i batch (%d)' % i)
        # NOTE(review): `volatile=True` is only honoured by pre-0.4 PyTorch;
        # newer releases appear to ignore it with a warning. Confirm the
        # caller wraps this in `torch.no_grad()` if autograd must stay off.
        im = Variable(torch.from_numpy(images[im_start:im_end]),
                      volatile=True).cuda()
        s = Variable(torch.from_numpy(captions[0]), volatile=True).cuda()
        l = caplens
        sim = xattn_score_t2i(im, s, l, opt)
        d[im_start:im_end] = sim.data.cpu().numpy()
    sys.stdout.write('\n')
    return d
示例#6
0
def shard_xattn_t2i(images, captions, caplens, freqs, opt, shard_size=128):
    """
    Compute the pairwise t2i image-caption score matrix with locality sharding.

    The score matrix is filled one (image block, caption block) pair at a
    time instead of all at once. Each pair is scored by ``xattn_score_t2i``,
    which also returns attention items; these are accumulated in call order.
    Tensors are moved to the GPU when ``torch.cuda.is_available()``.

    Args:
        images: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        captions: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        caplens: per-caption sequence sliced in step with ``captions``.
        freqs: per-caption sequence sliced in step with ``captions``.
        opt: options object forwarded unchanged to ``xattn_score_t2i``.
        shard_size: maximum number of images / captions per block.

    Returns:
        Tuple ``(d, attention)``: ``d`` is a ``numpy.ndarray`` of shape
        ``(len(images), len(captions))``; ``attention`` is a list holding the
        attention items of every scored block pair, in scoring order.
    """
    # Integer ceiling division; avoids the float round-trip of `/` + `int()`.
    n_im_shard = (len(images) - 1) // shard_size + 1
    n_cap_shard = (len(captions) - 1) // shard_size + 1

    d = np.zeros((len(images), len(captions)))
    attention = []

    # Device availability does not change between blocks; query it once.
    use_cuda = torch.cuda.is_available()

    for i in range(n_im_shard):
        im_start, im_end = shard_size * i, min(shard_size * (i + 1),
                                               len(images))
        for j in range(n_cap_shard):
            sys.stdout.write('\r>> shard_xattn_t2i batch (%d,%d)' % (i, j))
            cap_start, cap_end = shard_size * j, min(shard_size * (j + 1),
                                                     len(captions))
            im = Variable(torch.from_numpy(images[im_start:im_end]))
            s = Variable(torch.from_numpy(captions[cap_start:cap_end]))
            if use_cuda:
                im, s = im.cuda(), s.cuda()
            l = caplens[cap_start:cap_end]
            f = freqs[cap_start:cap_end]

            sim, attn = xattn_score_t2i(im, s, l, f, opt)
            d[im_start:im_end, cap_start:cap_end] = sim.data.cpu().numpy()

            # In-place extend: re-concatenating with `+` copies the whole
            # accumulated list on every block (quadratic overall).
            # Assumes `attn` is a list, as the original `list + attn` required.
            attention.extend(attn)
    sys.stdout.write('\n')
    return d, attention
示例#7
0
def xattn_sim(images, captions, caplens, opt, shard_size=64):
    """
    Compute a blended t2i / i2t image-caption score matrix shard by shard.

    For each (image block, caption block) pair both ``xattn_score_t2i`` and
    ``xattn_score_i2t`` are evaluated on the GPU and mixed as
    ``opt.alpha * t2i + (1 - opt.alpha) * i2t``.

    Args:
        images: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        captions: array sliced along its first axis; each slice is handed to
            ``torch.from_numpy``.
        caplens: per-caption sequence sliced in step with ``captions``.
        opt: options object supplying ``lambda_softmax``, ``norm_func``,
            ``agg_func``, ``lambda_lse`` and ``alpha``.
        shard_size: maximum number of images / captions per block.

    Returns:
        ``numpy.ndarray`` of shape ``(len(images), len(captions))``.
    """
    num_images, num_captions = len(images), len(captions)
    # Ceiling division: number of blocks needed along each axis.
    image_blocks = (num_images - 1) // shard_size + 1
    caption_blocks = (num_captions - 1) // shard_size + 1

    scores = np.zeros((num_images, num_captions))
    for row in range(image_blocks):
        row_lo = shard_size * row
        row_hi = min(row_lo + shard_size, num_images)
        for col in range(caption_blocks):
            sys.stdout.write('\r>> xattn_sim batch (%d,%d)' % (row, col))
            col_lo = shard_size * col
            col_hi = min(col_lo + shard_size, num_captions)

            image_batch = torch.from_numpy(images[row_lo:row_hi]).cuda()
            caption_batch = torch.from_numpy(captions[col_lo:col_hi]).cuda()
            length_batch = caplens[col_lo:col_hi]

            t2i_scores = xattn_score_t2i(
                image_batch, caption_batch, length_batch,
                opt.lambda_softmax, opt.norm_func, opt.agg_func,
                opt.lambda_lse)
            i2t_scores = xattn_score_i2t(
                image_batch, caption_batch, length_batch,
                opt.lambda_softmax, opt.norm_func, opt.agg_func,
                opt.lambda_lse)

            # Convex mix of the two attention directions, weighted by alpha.
            blended = opt.alpha * t2i_scores + (1 - opt.alpha) * i2t_scores
            scores[row_lo:row_hi, col_lo:col_hi] = blended.data.cpu().numpy()
    sys.stdout.write('\n')
    return scores