Example #1
def word_similarity(img_features, words_emb, words_num):
    # -> 1 x nef x words_num
    word = words_emb[0, :, :words_num].unsqueeze(0).contiguous()
    # 1 x nef x ih*iw
    context = img_features
    """
        word(query): 1 x nef x words_num
        context: 1 x nef x ih x iw
        weiContext: 1 x nef x words_num
        attn: 1 x words_num x ih x iw
    """
    weiContext, attn = func_attention(word, context, cfg.GAMMA1)
    att_maps = attn[0].unsqueeze(0).contiguous()
    # --> batch_size x words_num x nef
    word = word.transpose(1, 2).contiguous()
    weiContext = weiContext.transpose(1, 2).contiguous()
    # --> batch_size*words_num x nef
    word = word.view(1 * words_num, -1)
    weiContext = weiContext.view(1 * words_num, -1)
    #
    # -->batch_size*words_num
    row_sim = cosine_similarity(word, weiContext)
    # --> batch_size x words_num
    row_sim = row_sim.view(1, words_num)

    # Eq. (10)
    row_sim.mul_(cfg.GAMMA2).exp_()
    row_sim = row_sim.sum(dim=1, keepdim=True)
    row_sim = -torch.log(row_sim)
    return row_sim, att_maps
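All three examples rely on two helpers that are defined elsewhere in the repository and are not shown on this page: func_attention (word-to-region attention) and cosine_similarity. Below is a minimal sketch of compatible implementations, inferred from the shape comments above; the softmax placement and the gamma1 sharpening follow the usual word-to-region attention formulation and are assumptions, so the repository's own versions may differ in detail.

import torch

def func_attention(query, context, gamma1):
    # query:   batch x nef x queryL   (word embeddings)
    # context: batch x nef x ih x iw  (image region features)
    batch_size, queryL = query.size(0), query.size(2)
    ih, iw = context.size(2), context.size(3)
    sourceL = ih * iw

    # batch x nef x sourceL, and its transpose batch x sourceL x nef
    context = context.view(batch_size, -1, sourceL)
    contextT = context.transpose(1, 2).contiguous()

    # region-word affinities: batch x sourceL x queryL, normalized over words
    attn = torch.softmax(torch.bmm(contextT, query), dim=2)
    # attention over regions for each word, sharpened by gamma1: batch x queryL x sourceL
    attn = torch.softmax(attn.transpose(1, 2).contiguous() * gamma1, dim=2)
    # word-attended region features: batch x nef x queryL
    weiContext = torch.bmm(context, attn.transpose(1, 2))
    return weiContext, attn.view(batch_size, queryL, ih, iw)

def cosine_similarity(x1, x2, dim=1, eps=1e-8):
    # row-wise cosine similarity between two (N x nef) matrices -> tensor of length N
    return torch.nn.functional.cosine_similarity(x1, x2, dim=dim, eps=eps)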
Example #2
def words_loss(img_features, words_emb, labels, cap_lens, class_ids,
               batch_size):
    """
        words_emb(query): batch x nef x seq_len
        img_features(context): batch x nef x 17 x 17
    """
    masks = []
    att_maps = []
    similarities = []
    cap_lens = cap_lens.data.tolist()
    for i in range(batch_size):
        if class_ids is not None:
            mask = (class_ids == class_ids[i]).astype(np.uint8)
            mask[i] = 0
            masks.append(mask.reshape((1, -1)))
        # Get the i-th text description
        words_num = cap_lens[i]
        # -> 1 x nef x words_num
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()
        # -> batch_size x nef x words_num
        word = word.repeat(batch_size, 1, 1)
        # batch x nef x 17*17
        context = img_features
        """
            word(query): batch x nef x words_num
            context: batch x nef x 17 x 17
            weiContext: batch x nef x words_num
            attn: batch x words_num x 17 x 17
        """
        weiContext, attn = func_attention(word, context,
                                          cfg.TRAIN.SMOOTH.GAMMA1)
        att_maps.append(attn[i].unsqueeze(0).contiguous())
        # --> batch_size x words_num x nef
        word = word.transpose(1, 2).contiguous()
        weiContext = weiContext.transpose(1, 2).contiguous()
        # --> batch_size*words_num x nef
        word = word.view(batch_size * words_num, -1)
        weiContext = weiContext.view(batch_size * words_num, -1)
        #
        # -->batch_size*words_num
        row_sim = cosine_similarity(word, weiContext)
        # --> batch_size x words_num
        row_sim = row_sim.view(batch_size, words_num)

        # Eq. (10)
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)

        # --> batch_size x 1
        # similarities(i, j): the similarity between the i-th image and the j-th text description
        similarities.append(row_sim)

    # batch_size x batch_size
    similarities = torch.cat(similarities, 1)
    if class_ids is not None:
        masks = np.concatenate(masks, 0)
        # masks: batch_size x batch_size
        # build a boolean mask; masked_fill_ expects a bool mask in newer PyTorch
        masks = torch.from_numpy(masks).bool()
        if cfg.CUDA:
            masks = masks.cuda()

    similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3
    if class_ids is not None:
        similarities.data.masked_fill_(masks, -float('inf'))
    similarities1 = similarities.transpose(0, 1)
    if labels is not None:
        loss0 = nn.CrossEntropyLoss()(similarities, labels)
        loss1 = nn.CrossEntropyLoss()(similarities1, labels)
    else:
        loss0, loss1 = None, None
    return loss0, loss1, att_maps
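A hypothetical call to words_loss is sketched below. It assumes the module-level cfg (cfg.TRAIN.SMOOTH.GAMMA1/GAMMA2/GAMMA3 and cfg.CUDA) has already been configured; the tensor shapes follow the docstring, nef being the shared embedding size, and labels are the diagonal indices as in typical image-text matching setups. All sizes are made up for illustration.

import numpy as np
import torch

batch_size, nef, seq_len = 4, 256, 12                 # hypothetical sizes
img_features = torch.randn(batch_size, nef, 17, 17)   # image region features
words_emb = torch.randn(batch_size, nef, seq_len)     # word embeddings
cap_lens = torch.tensor([12, 9, 7, 5])                # real caption lengths (<= seq_len)
class_ids = np.arange(batch_size)                     # distinct classes: nothing gets masked
labels = torch.arange(batch_size)                     # i-th image matches i-th caption

loss0, loss1, att_maps = words_loss(img_features, words_emb, labels,
                                    cap_lens, class_ids, batch_size)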
Example #3
def words_similarity(img_features, words_emb, labels, cap_lens, class_ids, batch_size):
    """
        words_emb(query): batch x nef x seq_len
        img_features(context): batch x nef x 17 x 17
    """
    masks = []
    att_maps = []
    similarities = []
    cap_lens = cap_lens.data.tolist()
    #print(cap_lens)
    #print(words_emb)

    for i in range(batch_size):

        # Get the i-th text description
        words_num = cap_lens[i]
        # -> 1 x nef x words_num
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()
        # -> batch_size x nef x words_num
        word = word.repeat(batch_size, 1, 1)
        # batch x nef x 17*17
        context = img_features
        """
            word(query): batch x nef x words_num
            context: batch x nef x 17 x 17
            weiContext: batch x nef x words_num
            attn: batch x words_num x 17 x 17
        """
        #weiContext, attn = func_attention(word, context, GAMMA1)
        weiContext, attn = func_attention(word, context, cfg.TRAIN.SMOOTH.GAMMA1)
        att_maps.append(attn[i].unsqueeze(0).contiguous())
        # --> batch_size x words_num x nef
        word = word.transpose(1, 2).contiguous()
        weiContext = weiContext.transpose(1, 2).contiguous()
        # --> batch_size*words_num x nef
        word = word.view(batch_size * words_num, -1)
        weiContext = weiContext.view(batch_size * words_num, -1)
        #
        # -->batch_size*words_num
        row_sim = cosine_similarity(word, weiContext)
        # --> batch_size x words_num
        row_sim = row_sim.view(batch_size, words_num)

        # Eq. (10)
        #row_sim.mul_(GAMMA2).exp_()
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)
        print(row_sim)
        #row_sim = row_sim.cpu().squeeze(0)
        #print(row_sim.item())

        # --> batch_size x 1
        # similarities(i, j): the similarity between the i-th image and the j-th text description
        #similarities.append(row_sim.item())
        similarities.append(row_sim)

    # batch_size x batch_size
    similarities = torch.cat(similarities, 1)
    #similarities = similarities.detach().cpu().numpy()

    similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3
    similarities1 = similarities.transpose(0, 1)
    if labels is not None:
        loss0 = nn.CrossEntropyLoss()(similarities, labels)
        loss1 = nn.CrossEntropyLoss()(similarities1, labels)
    else:
        loss0, loss1 = None, None
    
    # combined word-level loss; assumes labels is not None (otherwise loss0/loss1 are None)
    w_loss = (loss0 + loss1) * cfg.TRAIN.SMOOTH.LAMBDA
    print('w_loss = ', w_loss.item(), loss0.item(), loss1.item())

    # average similarity across the batch
    print(similarities)
    words_sim = similarities.detach().cpu().numpy()
    avg_sim = np.mean(words_sim, axis=0)
    print('similarities average(batch): ', avg_sim)
    
    #return avg_sim
    #return w_loss.item()
    return [w_loss.item(), loss0.item(), loss1.item()]
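Example #3 is essentially Example #2 without the class-based masking (class_ids is accepted but never used) and with extra logging; it returns plain floats, which makes it convenient for evaluation. A hypothetical evaluation-style call, again assuming cfg (including cfg.TRAIN.SMOOTH.LAMBDA) is configured in the defining module and using made-up sizes:

import torch

batch_size, nef = 4, 256                          # hypothetical sizes
img_features = torch.randn(batch_size, nef, 17, 17)
words_emb = torch.randn(batch_size, nef, 10)
cap_lens = torch.tensor([10, 8, 6, 4])
labels = torch.arange(batch_size)

with torch.no_grad():                             # scores only; no gradients needed
    w_loss, loss0, loss1 = words_similarity(img_features, words_emb, labels,
                                            cap_lens, None, batch_size)
print(w_loss, loss0, loss1)                       # three Python floats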