Example #1
    def forward(self, input, target, mask):
        # truncate to the same size
        target = target[:, :input.size(1)]
        # duplicate: pair every caption of an image with every caption's logits
        num_img = input.size(0) // self.seq_per_img
        input_per_image = input.chunk(num_img)
        input = torch.cat(
            [t.repeat(self.seq_per_img, 1, 1) for t in input_per_image], dim=0)
        target = torch.unsqueeze(target, 0)
        target = target.permute(1, 0, 2)
        target = target.repeat(1, self.seq_per_img, 1)
        target = to_contiguous(target).view(
            target.size(0) * target.size(1), target.size(2))
        mask = mask[:, :input.size(1)]
        mask = torch.unsqueeze(mask, 0)
        mask = mask.permute(1, 0, 2)
        mask = mask.repeat(1, self.seq_per_img, 1)
        mask = to_contiguous(mask).view(
            mask.size(0) * mask.size(1), mask.size(2))
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask = to_contiguous(mask).view(-1, 1)
        output = -input.gather(1, target) * mask
        output = torch.sum(output) / torch.sum(mask)
        return output, output
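
A standalone sketch of the chunk/repeat expansion above (toy sizes, plain tensors instead of the snippet's older-PyTorch Variables): every caption's targets end up paired with the logits of every caption from the same image.

import torch

# toy sizes (hypothetical): 2 images x 2 captions each, length 3, vocab 5
seq_per_img, num_img, L, V = 2, 2, 3, 5
inp = torch.randn(num_img * seq_per_img, L, V)
tgt = torch.randint(0, V, (num_img * seq_per_img, L))

# tile each image's logit block seq_per_img times (as in the snippet)
inp_rep = torch.cat([t.repeat(seq_per_img, 1, 1) for t in inp.chunk(num_img)], dim=0)
# tile each target row seq_per_img times, keeping image blocks aligned
tgt_rep = tgt.unsqueeze(1).repeat(1, seq_per_img, 1).reshape(-1, L)
print(inp_rep.shape, tgt_rep.shape)  # torch.Size([8, 3, 5]) torch.Size([8, 3])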
Example #2
    def forward(self, input, target, mask, sampling_ratios):
        # truncate to the same size
        ratios = sampling_ratios
        sampling_ratios = sampling_ratios.repeat(1, input.size(1))
        # self.opt.logger.debug('Importance scores shaped:', sampling_ratios)
        target = target[:, :input.size(1)]
        mask = mask[:, :input.size(1)]
        # print('Updated mask:', mask_)
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask = to_contiguous(mask).view(-1, 1)
        real_output = - input.gather(1, target) * mask
        real_output = torch.sum(real_output) / torch.sum(mask)
        output = - input.gather(1, target) * mask * sampling_ratios.view(-1, 1)
        if torch.sum(mask).data[0] > 0 and torch.sum(ratios).data[0] > 0:
            # self.opt.logger.debug('output without avg %s' % str(torch.sum(output).data))
            output = torch.sum(output) / torch.sum(mask) / torch.sum(ratios)
            self.opt.logger.warn('Averaging over the sampling scores and the seq length')
        else:
            self.opt.logger.warn("Smooth targets weights sum to 0")
            output = torch.sum(output)
            self.opt.logger.warn('Output loss without averaging: %s' % str(output.data[0]))
            output = real_output

        return real_output, self.alpha * output + (1 - self.alpha) * real_output
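
A tiny numeric sketch of the double normalization above (all values hypothetical): the masked, ratio-weighted NLL is divided once by the token count and once by the sum of the sampling ratios.

import torch

logp = torch.tensor([[-1.0, -2.0], [-0.5, -1.5]])   # log-probs of the targets
mask = torch.tensor([[1.0, 1.0], [1.0, 0.0]])
ratios = torch.tensor([[0.8], [0.2]])               # one importance ratio per caption

weighted = -logp * mask * ratios.repeat(1, 2)
loss = weighted.sum() / mask.sum() / ratios.sum()   # 2.5 / 3.0 / 1.0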
Example #3
def get_ml_loss(input, target, mask):
    """
    Compute the usual ML loss
    """
    input = to_contiguous(input).view(-1, input.size(2))
    target = to_contiguous(target).view(-1, 1)
    mask = to_contiguous(mask).view(-1, 1)
    ml_output = -input.gather(1, target) * mask
    if torch.sum(mask).data[0] > 0:
        ml_output = torch.sum(ml_output) / torch.sum(mask)
    else:
        raise ValueError("Mask shouldn't be all null")
    return ml_output
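
A quick usage sketch of this masked NLL (sizes hypothetical; plain tensors, with log_softmax standing in for the decoder output):

import torch
import torch.nn.functional as F

N, L, V = 2, 4, 10
logp = F.log_softmax(torch.randn(N, L, V), dim=2)  # decoder log-probabilities
target = torch.randint(0, V, (N, L))               # gold tokens
mask = (target > 0).float()                        # drop padding tokens (id 0)

nll = -logp.reshape(-1, V).gather(1, target.reshape(-1, 1)) * mask.reshape(-1, 1)
loss = nll.sum() / mask.sum()                      # same reduction as get_ml_loss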
Example #4
    def forward(self, input, target, mask, scores=None):
        # truncate to the same size
        seq_length = input.size(1)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        if self.scale_loss:
            row_scores = scores.repeat(1, seq_length)
            mask = torch.mul(mask, row_scores)
        ml_output = get_ml_loss(input, target, mask)
        # Get the similarities of the words in the batch (Vb, V)
        sim = self.Sim_Matrix[to_contiguous(target).view(-1, 1).squeeze().data]
        # print('raw sim:', sim)
        if self.clip_sim:
            # keep only the similarities larger than the margin
            # self.logger.warn('Clipping the sim')
            sim = sim * sim.ge(self.margin).float()
        # flatten the logits to (N * seq_length, V) before any gathering
        input = to_contiguous(input).view(-1, input.size(2))
        if self.limited:
            # self.logger.warn('Limiting smoothing to the gt vocab')
            indices_vocab = get_indices_vocab(target, self.seq_per_img)
            sim = sim.gather(1, indices_vocab)
            input = input.gather(1, indices_vocab)

        if self.tau_word:
            smooth_target = torch.exp(
                torch.mul(torch.add(sim, -1.), 1 / self.tau_word))
        else:
            # Do not exponentiate
            smooth_target = torch.add(sim, -1.)
        if self.smooth_remove_equal:
            smooth_target = smooth_target * sim.lt(1.0).float()
        # print('smooth_target:', smooth_target)
        # Store some stats about the sentences scores:
        scalars = smooth_target.data.cpu().numpy()[:]
        stats = {"word_mean": np.mean(scalars), "word_std": np.std(scalars)}
        # Format
        mask = to_contiguous(mask).view(-1, 1)
        mask = mask.repeat(1, sim.size(1))
        # print('in:', input.size(), 'mask:', mask.size(), 'smooth:', smooth_target.size())
        output = -input * mask * smooth_target

        if torch.sum(smooth_target * mask).data[0] > 0:
            output = torch.sum(output) / torch.sum(smooth_target * mask)
        else:
            self.logger.warn("Smooth targets weights sum to 0")
            output = torch.sum(output)

        return ml_output, self.alpha * output + (1 -
                                                 self.alpha) * ml_output, stats
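
A toy sketch of the word-level reward used above: with similarities in [0, 1], exp((sim - 1) / tau_word) leaves the gold word at weight 1 and decays the rest (tau and the similarity row are made up):

import torch

tau_word = 0.3
sim = torch.tensor([[1.00, 0.82, 0.40, 0.05]])  # similarity to the gold word
smooth = torch.exp((sim - 1.0) / tau_word)      # [1.000, 0.549, 0.135, 0.042]
no_gold = smooth * sim.lt(1.0).float()          # smooth_remove_equal: zero the gold word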
Example #5
    def forward(self, input, target, mask, scores=None):
        # truncate
        seq_length = input.size(1)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        if self.scale_loss:
            row_scores = scores.repeat(1, seq_length)
            mask = torch.mul(mask, row_scores)
        ml_output = get_ml_loss(input, target, mask)
        preds = torch.max(input, dim=2)[1].squeeze().cpu().data
        sent_scores = self.get_scores(preds, target)
        # Process scores:
        if self.tau_sent:
            sent_scores = np.exp(np.array(sent_scores) / self.tau_sent)
        else:
            sent_scores = np.array(sent_scores)
            if not np.sum(sent_scores):
                self.logger.warn('Adding +1 to the zero scores')
                sent_scores += 1
        # sent_scores from (N, 1) to (N, seq_length)
        self.logger.warn('Scores after processing: %s' % str(sent_scores))
        # Store some stats about the sentences scores:
        stats = {
            "sent_mean": np.mean(sent_scores),
            "sent_std": np.std(sent_scores)
        }
        sent_scores = np.repeat(sent_scores, seq_length)
        smooth_target = Variable(torch.from_numpy(sent_scores).view(
            -1, 1)).cuda().float()
        # substitute target with the prediction (aka sampling wrt p_\theta)
        preds = Variable(preds[:, :seq_length]).cuda()
        # Flatten
        preds = to_contiguous(preds).view(-1, 1)
        input = to_contiguous(input).view(-1, input.size(2))
        mask = to_contiguous(mask).view(-1, 1)
        output = -input.gather(1, preds) * mask * smooth_target
        if torch.sum(smooth_target * mask).data[0] > 0:
            output = torch.sum(output) / torch.sum(smooth_target * mask)
        else:
            self.logger.warn("Smooth targets weights sum to 0")
            self.logger.warn('Mask: %s' % str(torch.sum(mask).data[0]))
            self.logger.warn('Scores: %s' %
                             str(torch.sum(smooth_target).data[0]))
            # output = torch.sum(output)
            # zero surrogate on the right device so the mixture below still works
            output = Variable(torch.zeros(1)).cuda()

        return ml_output, self.alpha * output + (1 -
                                                 self.alpha) * ml_output, stats
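
A sketch of the sentence-score processing above (hypothetical scores): the per-caption scores are sharpened with the sentence temperature, then spread over the timesteps.

import numpy as np

seq_length, tau_sent = 4, 0.5
sent_scores = np.array([0.9, 0.1])           # e.g. one score per sampled caption
scaled = np.exp(sent_scores / tau_sent)      # sharpen with the sentence temperature
per_token = np.repeat(scaled, seq_length)    # (N,) -> (N * seq_length,)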
Example #6
    def forward(self, input, target, mask, scores=None):
        # truncate
        seq_length = input.size(1)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        if self.scale_loss:
            row_scores = scores.repeat(1, seq_length)
            mask = torch.mul(mask, row_scores)
        ml_output = get_ml_loss(input, target, mask)
        # Sentence level
        preds = torch.max(input, dim=2)[1].squeeze().cpu().data
        sent_scores = self.get_scores(preds, target)
        # Process scores:
        if self.tau_sent:
            sent_scores = np.exp(np.array(sent_scores) / self.tau_sent)
        else:
            sent_scores = np.array(sent_scores)
            if not np.sum(sent_scores):
                self.logger.warn('Adding +1 to the zero scores')
                sent_scores += 1

        # sent_scores from (N, 1) to (N, seq_length)
        self.logger.warn('Scores after processing: %s' % str(sent_scores))
        # Store some stats about the sentences scores:
        stats = {
            "sent_mean": np.mean(sent_scores),
            "sent_std": np.std(sent_scores)
        }

        sent_scores = np.repeat(sent_scores, seq_length)
        smooth_target = Variable(torch.from_numpy(sent_scores).view(
            -1, 1)).cuda().float()

        # Word level
        preds = Variable(preds[:, :input.size(1)]).cuda()
        preds = to_contiguous(preds).view(-1, 1)
        sim = self.Sim_Matrix[preds.squeeze().data]
        if self.tau_word:
            smooth_target_wl = torch.exp(
                torch.mul(torch.add(sim, -1.), 1 / self.tau_word))
        else:
            smooth_target_wl = torch.add(sim, -1.)

        scalars = smooth_target_wl.data.cpu().numpy()[:]
        stats["word_mean"] = np.mean(scalars)
        stats["word_std"] = np.std(scalars)

        # Flatten to (N * seq_length, -1) as in the word-level criterion above
        input = to_contiguous(input).view(-1, input.size(2))
        mask = to_contiguous(mask).view(-1, 1)
        mask_wl = mask.repeat(1, sim.size(1))
        # format the sentence scores
        smooth_target = smooth_target.repeat(1, sim.size(1))
        output_wl = -input * smooth_target_wl * mask_wl * smooth_target
        norm = torch.sum(smooth_target_wl * mask_wl * smooth_target)
        if norm.data[0] > 0:
            output = torch.sum(output_wl) / norm
        else:
            self.logger.warn("Smooth targets weights sum to 0")
            output = torch.sum(output_wl)
        return ml_output, self.alpha * output + (1 -
                                                 self.alpha) * ml_output, stats
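
Example #6 multiplies the two rewards; a one-line sketch of the combined weight per (token, vocab-word) cell, with hypothetical values shaped as in the snippet:

import torch

word_r = torch.tensor([[1.00, 0.55], [1.00, 0.14]])   # (tokens, V) word-level reward
sent_r = torch.tensor([[0.9], [0.2]])                 # (tokens, 1) sentence-level reward
combined = word_r * sent_r.repeat(1, word_r.size(1))  # weights -log p(word) elementwise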
Example #7
    def forward(self, input, seq, reward):
        input = utils.to_contiguous(input).view(-1)
        reward = utils.to_contiguous(reward).view(-1)
        mask = (seq > 0).float()
        # shift the mask right so the reward also covers the end-of-sequence step
        mask = utils.to_contiguous(
            torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]],
                      1)).view(-1)
        # pre-0.4 PyTorch needs the raw mask tensor wrapped in a Variable
        output = (- input.float() * reward.float() * Variable(mask)
                  ) if utils.under_0_4() else (- input.float() * reward.float() * mask)
        output = torch.sum(output.float()) / torch.sum(mask.float())

        return output
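
The mask shift above prepends a column of ones and drops the last column, so the reward is applied from the first step through the end-of-sequence token. A tiny check:

import torch

seq = torch.tensor([[5, 9, 0, 0]])       # one sampled sequence, 0 = pad
mask = (seq > 0).float()                 # tensor([[1., 1., 0., 0.]])
shifted = torch.cat([torch.ones(mask.size(0), 1), mask[:, :-1]], 1)
print(shifted)                           # tensor([[1., 1., 1., 0.]])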
Example #8
    def forward(self, input, target, mask):
        # truncate to the same size
        max_length = input.size(1)
        num_img = input.size(0) // self.seq_per_img
        target = target[:, :max_length]
        mask = mask[:, :max_length]
        mask_ = mask
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask = to_contiguous(mask).view(-1, 1)
        output = - input.gather(1, target) * mask
        real_output = torch.sum(output) / torch.sum(mask)
        # ------------------------------------------------
        # keep only the best-scoring caption of each image
        output = output.view(-1, max_length)
        sent_scores = output.sum(dim=1) / mask_.sum(dim=1)
        sent_scores_per_image = sent_scores.chunk(num_img)
        output = torch.sum(torch.stack([t.max() for t in sent_scores_per_image]))
        output = output / num_img
        return real_output, output
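
A sketch of the best-of-K selection above with made-up scores: captions are grouped by image and only each image's best (least negative) caption loss survives.

import torch

seq_per_img, num_img = 2, 2
sent_scores = torch.tensor([-1.2, -0.4, -2.0, -0.9])  # one score per caption
best = torch.stack([t.max() for t in sent_scores.chunk(num_img)])
loss = best.sum() / num_img                           # (-0.4 + -0.9) / 2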
Example #9
    def forward(self, input, target, mask, scores=None):
        """
        input : the decoder logits (N, seq_length, V)
        target : the ground truth labels (N, seq_length)
        mask : the ground truth mask to ignore UNK tokens (N, seq_length)
        scores: scalars to scale the loss of each sentence (N, 1)
        """
        # truncate to the same size
        seq_length = input.size(1)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        if self.scale_loss:
            row_scores = scores.repeat(1, seq_length)
            mask = torch.mul(mask, row_scores)
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask = to_contiguous(mask).view(-1, 1)
        output = - input.gather(1, target) * mask
        output = torch.sum(output) / torch.sum(mask)
        return output, output
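
A sketch of the scale_loss path (numbers hypothetical): scaling the mask by per-sentence scores weights both the numerator and the denominator, turning the loss into a score-weighted token average.

import torch

mask = torch.tensor([[1., 1., 1.], [1., 1., 0.]])
scores = torch.tensor([[2.0], [0.5]])                     # per-sentence weights
weighted_mask = mask * scores.repeat(1, mask.size(1))
nll = torch.tensor([[1., 1., 1.], [2., 2., 2.]])          # per-token NLL
loss = (nll * weighted_mask).sum() / weighted_mask.sum()  # 8.0 / 7.0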
Example #10
def get_indices_vocab(target, seq_per_img):
    seq_length = target.size(1)
    num_img = target.size(0) // seq_per_img
    vocab_per_image = target.chunk(num_img)
    vocab_per_image = [
        np.unique(to_contiguous(t).data.cpu().numpy()) for t in vocab_per_image
    ]
    max_vocab = max([len(t) for t in vocab_per_image])
    vocab_per_image = [
        np.pad(t, (0, max_vocab - len(t)), 'constant') for t in vocab_per_image
    ]
    indices_vocab = Variable(
        torch.cat([torch.from_numpy(t).repeat(seq_per_img * seq_length, 1)
                   for t in vocab_per_image], dim=0)).cuda()
    return indices_vocab
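
A toy run of get_indices_vocab (CPU only, hypothetical sizes): each image's token set is deduplicated, zero-padded to a common width, and tiled once per (caption, timestep) row.

import numpy as np
import torch

seq_per_img, seq_length = 2, 3
target = torch.tensor([[4, 7, 4], [7, 9, 0],    # image 0 captions
                       [2, 2, 3], [3, 0, 0]])   # image 1 captions
vocab = [np.unique(t.numpy()) for t in target.chunk(2)]  # [0 4 7 9], [0 2 3]
width = max(len(v) for v in vocab)
vocab = [np.pad(v, (0, width - len(v)), 'constant') for v in vocab]
idx = torch.cat([torch.from_numpy(v).repeat(seq_per_img * seq_length, 1)
                 for v in vocab], dim=0)                 # shape (12, 4)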
Example #11
    def forward(self, input, target, mask, sampling_ratios):
        seq_length = input.size(1)
        batch_size = input.size(0)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        num_img = batch_size // self.seq_per_img
        input_per_image = input.chunk(num_img)
        mask_per_image = mask.chunk(num_img)
        target_per_image = target.chunk(num_img)
        ratios_per_image = sampling_ratios.chunk(num_img)
        input_gt = torch.cat([t[:5] for t in input_per_image], dim=0)
        target_gt = torch.cat([t[:5] for t in target_per_image], dim=0)
        mask_gt = torch.cat([t[:5] for t in mask_per_image], dim=0)

        input_gen = torch.cat([t[5:] for t in input_per_image], dim=0)
        target_gen = torch.cat([t[5:] for t in target_per_image], dim=0)
        mask_gen = torch.cat([t[5:] for t in mask_per_image], dim=0)
        ratios_gen = torch.cat([t[5:] for t in ratios_per_image], dim=0)
        # print('Ratios GEN:', ratios_gen)

        # For the first 5 captions per image (gt) compute LM
        input_gt = to_contiguous(input_gt).view(-1, input_gt.size(2))
        target_gt = to_contiguous(target_gt).view(-1, 1)
        mask_gt = to_contiguous(mask_gt).view(-1, 1)
        output_gt = - input_gt.gather(1, target_gt) * mask_gt
        output_gt = torch.sum(output_gt) / torch.sum(mask_gt)

        # For the rest of the captions: importance sampling

        # spread each caption's ratio over its timesteps
        sampling_ratios = ratios_gen.repeat(1, input_gen.size(1))
        input_gen = to_contiguous(input_gen).view(-1, input_gen.size(2))
        target_gen = to_contiguous(target_gen).view(-1, 1)
        mask_gen = to_contiguous(mask_gen).view(-1, 1)
        if self.word_level:
            dist = self.Dist[target_gen.squeeze().data]
            smooth_target = torch.exp(torch.mul(torch.add(dist, -1.), 1/self.tau))
            mask_wl = mask_gen.repeat(1, dist.size(1))
            sampling_ratios_wl = sampling_ratios.view(-1,1).repeat(1, dist.size(1))
            output_wl = - input_gen * smooth_target * mask_wl * sampling_ratios_wl
            if torch.sum(smooth_target * mask_wl * sampling_ratios_wl).data[0] > 0:
                output_gen = torch.sum(output_wl) / torch.sum(smooth_target * mask_wl * sampling_ratios_wl)
            else:
                self.logger.warn("Smooth targets weights sum to 0")
                output_gen = torch.sum(output_wl)
        else:
            output_gen = - input_gen.gather(1, target_gen) * mask_gen * sampling_ratios.view(-1, 1)
            if torch.sum(mask_gen).data[0] > 0 and torch.sum(ratios_gen).data[0] > 0:
                # self.opt.logger.debug('output without avg %s' % str(torch.sum(output).data))
                output_gen = torch.sum(output_gen) / torch.sum(mask_gen) / torch.sum(ratios_gen)
                self.opt.logger.warn('Averaging over the sampling scores and the seq length')
            else:
                self.opt.logger.warn("Smooth targets weights sum to 0")
                output_gen = torch.sum(output_gen)
        return output_gt, self.alpha * output_gen + (1 - self.alpha) * output_gt
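
A sketch of the ground-truth / generated split above (sizes hypothetical; like the snippet, it hard-codes 5 ground-truth captions per image):

import torch

seq_per_img, num_img, L = 7, 2, 4     # 5 gt + 2 sampled captions per image
target = torch.arange(num_img * seq_per_img * L).view(-1, L)
per_image = target.chunk(num_img)
target_gt = torch.cat([t[:5] for t in per_image], dim=0)   # (10, 4)
target_gen = torch.cat([t[5:] for t in per_image], dim=0)  # (4, 4)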
Example #12
    def forward(self, input, target, mask, scores=None):
        # truncate
        N = input.size(0)
        seq_length = input.size(1)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        if self.scale_loss:
            row_scores = scores.repeat(1, seq_length)
            mask = torch.mul(mask, row_scores)
        ml_output = get_ml_loss(input, target, mask)
        # Sample a Hamming distance, then sample a prediction wrt the reward
        V = 30
        distrib = [
            binom(seq_length, e) *
            ((V - 1) * math.exp(-1 / self.tau))**(e - seq_length)
            for e in range(seq_length + 1)
        ]
        select = np.random.choice(a=np.arange(seq_length + 1),
                                  p=np.array(distrib) / sum(distrib))
        score = math.exp(-select / self.tau)
        self.logger.debug("exp-neg Hamming distances (d=%d): %.2e" %
                          (select, score))
        scores = np.ones((N, seq_length), dtype="float32") * score
        smooth_target = Variable(torch.from_numpy(scores).view(
            -1, 1)).cuda().float()
        refs = target.cpu().data.numpy()
        # Format preds by changing d=select tokens at random;
        # copy so the references are not mutated in place
        preds = refs.copy()
        change_index = np.random.randint(seq_length, size=(N, select))
        rows = np.arange(N).reshape(-1, 1).repeat(select, axis=1)
        select_index = np.random.randint(self.vocab_size, size=(N, select))
        preds[rows, change_index] = select_index
        preds = Variable(torch.from_numpy(preds)).cuda()
        preds = to_contiguous(preds).view(-1, 1)
        # flatten before gathering the sampled tokens
        input = to_contiguous(input).view(-1, input.size(2))
        mask = to_contiguous(mask).view(-1, 1)
        output = -input.gather(1, preds) * mask * smooth_target
        if torch.sum(smooth_target * mask).data[0] > 0:
            output = torch.sum(output) / torch.sum(smooth_target * mask)
        else:
            self.logger.warn("Smooth targets weights sum to 0")
            output = torch.sum(output)
        return ml_output, self.alpha * output + (1 - self.alpha) * ml_output
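
A standalone sketch of the distance sampling above, assuming binom is scipy.special.binom (toy L, V and tau): the unnormalized weight of each Hamming distance d mirrors the snippet's expression, and the drawn distance fixes the reward exp(-d / tau).

import math
import numpy as np
from scipy.special import binom

L, V, tau = 5, 30, 0.8
w = [binom(L, d) * ((V - 1) * math.exp(-1 / tau)) ** (d - L)
     for d in range(L + 1)]
p = np.array(w) / sum(w)                   # categorical distribution over distances
d = np.random.choice(np.arange(L + 1), p=p)
reward = math.exp(-d / tau)                # payoff for the perturbed sequence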
Example #13
    def forward(self, input, target, mask, pre_scores):
        # truncate to the same size
        input_ = input
        seq_length = input.size(1)
        target = target[:, :input.size(1)]
        target_ = target
        mask = mask[:, :input.size(1)]
        if self.less_confident:
            row_scores = pre_scores.repeat(1, input.size(1))
            mask_ = torch.mul(mask, row_scores)
        else:
            mask_ = mask
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask_ = to_contiguous(mask_).view(-1, 1)
        real_output = - input.gather(1, target) * mask_
        real_output = torch.sum(real_output) / torch.sum(mask_)

        #-------------------------------------------------------
        if self.alpha > 0:
            dist = self.Dist[target.squeeze().data]
            if self.version == "exp":
                smooth_target = torch.exp(torch.mul(torch.add(dist, -1.), 1/self.tau))
                # print('Smooth target:', smooth_target)
            elif self.version == "clip":
                indices_up = dist.ge(self.margin)
                smooth_target = dist * indices_up.float()
            elif self.version == "vocab":
                num_img = target_.size(0) // self.seq_per_img
                vocab_per_image = target_.chunk(num_img)
                vocab_per_image = [np.unique(to_contiguous(t).data.cpu().numpy())
                                   for t in vocab_per_image]
                max_vocab = max([len(t) for t in vocab_per_image])
                vocab_per_image = [np.pad(t, (0, max_vocab - len(t)), 'constant')
                                   for t in vocab_per_image]
                indices_vocab = Variable(
                    torch.cat([torch.from_numpy(t).repeat(self.seq_per_img * seq_length, 1)
                               for t in vocab_per_image], dim=0)).cuda()
                mask_ = mask_.repeat(1, indices_vocab.size(1))
                dist_vocab = dist.gather(1, indices_vocab)
                smooth_target = torch.exp(torch.mul(torch.add(dist_vocab, -1.),
                                                    1/self.tau))
                output = - input.gather(1, indices_vocab) * mask_ * smooth_target
                if self.isolate_gt:
                    indices_down = dist_vocab.lt(1.0)
                    smooth_target = smooth_target * indices_down.float()
                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            # case exp & clip
            if self.isolate_gt:
                indices_down = dist.lt(1.0)
                smooth_target = smooth_target * indices_down.float()
            # Deprecated
            if self.normalize:
                # Make sure that each row of smooth_target sums to 1:
                Z = torch.sum(smooth_target, 1).repeat(1, smooth_target.size(1))
                smooth_target = smooth_target / Z
            # // Deprecated
            mask_ = mask_.repeat(1, dist.size(1))
            output = - input * smooth_target * mask_
            if torch.sum(smooth_target * mask_).data[0] > 0:
                output = torch.sum(output) / torch.sum(smooth_target * mask_)
            else:
                self.logger.warn("Smooth targets weights sum to 0")
                output = torch.sum(output)
            return real_output, self.alpha * output + (1 - self.alpha) * real_output
        else:
            return real_output, real_output
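
A compact sketch of the three smoothing variants above on one similarity row (all numbers hypothetical):

import torch

tau, margin = 0.3, 0.6
dist = torch.tensor([[1.00, 0.82, 0.40]])  # similarities to the gold word
exp_t = torch.exp((dist - 1.0) / tau)      # version "exp": soft decay
clip_t = dist * dist.ge(margin).float()    # version "clip": drop weak similarities
iso = exp_t * dist.lt(1.0).float()         # isolate_gt: remove the gold word itself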
Example #14
    def forward(self, input, target, mask, pre_scores):
        # truncate to the same size
        input_ = input
        seq_length = input.size(1)
        target = target[:, :input.size(1)]
        target_ = target
        mask = mask[:, :input.size(1)]
        if self.less_confident:
            row_scores = pre_scores.repeat(1, input.size(1))
            mask_ = torch.mul(mask, row_scores)
        else:
            mask_ = mask
        input = to_contiguous(input).view(-1, input.size(2))
        target = to_contiguous(target).view(-1, 1)
        mask_ = to_contiguous(mask_).view(-1, 1)
        real_output = - input.gather(1, target) * mask_
        real_output = torch.sum(real_output) / torch.sum(mask_)

        #-------------------------------------------------------
        if self.alpha > 0:
            dist = self.Dist[target.squeeze().data]
            if self.version == 'glove-cider':
                cider_scorer = CiderScorer(n=4, sigma=6)
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                hypo = decode_sequence(self.loader_vocab, preds)  # candidate
                refs = decode_sequence(self.loader_vocab, target_.data)  # references
                num_img = target_.size(0) // self.seq_per_img
                for e, h in enumerate(hypo):
                    ix_start = e // self.seq_per_img * self.seq_per_img
                    ix_end = ix_start + 5  # self.seq_per_img
                    cider_scorer += (h, refs[ix_start : ix_end])
                (score, scores) = cider_scorer.compute_score()
                self.logger.debug("CIDEr score: %s" %  str(scores))
                #  scores = np.maximum(1 - np.repeat(scores, seq_length), 0)
                scores = np.minimum(np.repeat(scores, seq_length), 1)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()  # input is flattened; use seq_length
                preds = to_contiguous(preds).view(-1, 1)
                dist = self.Dist[preds.squeeze().data]
                smooth_target_wl = torch.exp(torch.mul(torch.add(dist, -1.), 1/self.tau))
                mask_wl = mask_.repeat(1, dist.size(1))
                smooth_target = smooth_target.repeat(1, dist.size(1))
                output_wl = - input * smooth_target_wl * mask_wl * smooth_target
                norm = torch.sum(smooth_target_wl * mask_wl * smooth_target)
                if norm.data[0] > 0:
                    output = torch.sum(output_wl) / norm
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output_wl)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif self.version == 'cider':
                cider_scorer = CiderScorer(n=4, sigma=6)
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                hypo = decode_sequence(self.loader_vocab, preds)  # candidate
                refs = decode_sequence(self.loader_vocab, target_.data)  # references
                num_img = target_.size(0) // self.seq_per_img
                for e, h in enumerate(hypo):
                    ix_start = e // self.seq_per_img * self.seq_per_img
                    ix_end = ix_start + 5  # self.seq_per_img
                    cider_scorer += (h, refs[ix_start : ix_end])
                (score, scores) = cider_scorer.compute_score()
                self.logger.debug("CIDEr score: %s" %  str(scores))
                #  scores = np.maximum(1 - np.repeat(scores, seq_length), 0)
                scores = np.minimum(np.repeat(scores, seq_length), 1)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)
                output = - input.gather(1, preds) * mask_ * smooth_target
                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output
            elif self.version == 'glove-cider-exp':
                cider_scorer = CiderScorer(n=4, sigma=6)
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                hypo = decode_sequence(self.loader_vocab, preds)  # candidate
                refs = decode_sequence(self.loader_vocab, target_.data)  # references
                num_img = target_.size(0) // self.seq_per_img
                for e, h in enumerate(hypo):
                    ix_start = e // self.seq_per_img * self.seq_per_img
                    ix_end = ix_start + 5  # self.seq_per_img
                    cider_scorer += (h, refs[ix_start : ix_end])
                (score, scores) = cider_scorer.compute_score()
                self.logger.debug("CIDEr score: %s" %  str(scores))
                scores = np.exp(np.array(scores) / self.tau_bis)
                scores = np.repeat(scores, seq_length)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)

                dist = self.Dist[preds.squeeze().data]
                smooth_target_wl = torch.exp(torch.mul(torch.add(dist, -1.), 1/self.tau))
                mask_wl = mask_.repeat(1, dist.size(1))
                smooth_target = smooth_target.repeat(1, dist.size(1))
                output_wl = - input * smooth_target_wl * mask_wl * smooth_target
                norm = torch.sum(smooth_target_wl * mask_wl * smooth_target)
                if norm.data[0] > 0:
                    output = torch.sum(output_wl) / norm
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output_wl)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif self.version == 'cider-exp':
                cider_scorer = CiderScorer(n=4, sigma=6)
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                hypo = decode_sequence(self.loader_vocab, preds)  # candidate
                refs = decode_sequence(self.loader_vocab, target_.data)  # references
                num_img = target_.size(0) // self.seq_per_img
                for e, h in enumerate(hypo):
                    ix_start = e // self.seq_per_img * self.seq_per_img
                    ix_end = ix_start + 5  # self.seq_per_img
                    cider_scorer += (h, refs[ix_start : ix_end])
                (score, scores) = cider_scorer.compute_score()
                self.logger.debug("CIDEr score: %s" %  str(scores))
                scores = np.exp(np.array(scores) / self.tau)
                scores = np.repeat(scores, seq_length)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)
                output = - input.gather(1, preds) * mask_ * smooth_target
                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif self.version == 'infersent':
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                hypo = decode_sequence(self.loader_vocab, preds)  # candidate
                refs = decode_sequence(self.loader_vocab, target_.data)  # references
                num_img = target_.size(0) // self.seq_per_img
                scores = []
                lr = len(refs)
                codes = self.infersent.encode(refs + hypo)
                refs = codes[:lr]
                hypo = codes[lr:]
                for e, h in enumerate(hypo):
                    ix_start = e // self.seq_per_img * self.seq_per_img
                    ix_end = ix_start + 5  # self.seq_per_img
                    scores.append(group_similarity(h, refs[ix_start : ix_end]))
                self.logger.debug("Infersent similairities: %s" %  str(scores))
                #  scores = np.maximum(1 - np.repeat(scores, seq_length), 0)
                scores = np.repeat(np.exp(np.array(scores) / self.tau), seq_length)
                self.logger.debug('Scaling with %s' % str(scores))
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)
                output = - input.gather(1, preds) * mask_ * smooth_target
                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif "bleu" in self.version:
                n = int(self.version[-1])
                # bleu_scorer = BleuScorer(n=n)
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                hypo = decode_sequence(self.loader_vocab, preds)  # candidate
                refs = decode_sequence(self.loader_vocab, target_.data)  # references
                num_img = target_.size(0) // self.seq_per_img
                scores = []
                for e, h in enumerate(hypo):
                    ix_start = e // self.seq_per_img * self.seq_per_img
                    ix_end = ix_start + 5  # self.seq_per_img
                    # bleu_scorer += (h, refs[ix_start : ix_end])
                    scores.append(sentence_bleu(h, ' '.join(refs[ix_start: ix_end]), order=n))
                # (score, scores) = bleu_scorer.compute_score()
                # scores = scores[-1]
                self.logger.debug("Bleu scores: %s" %  str(scores))
                #  scores = np.maximum(1 - np.repeat(scores, seq_length), 0)
                scores = np.minimum(np.repeat(scores, seq_length), 1)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)
                output = - input.gather(1, preds) * mask_ * smooth_target
                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif self.version == 'hamming': # here sampling with p instead of the reward q
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                refs = target_.cpu().data.numpy()
                num_img = target_.size(0) // self.seq_per_img
                # Hamming distances
                scores = np.array([hamming(u, v) for u, v in zip(preds.numpy(), refs)])
                #  scores = np.maximum(1 - np.repeat(scores, seq_length), 0)
                scores = np.exp(-1 * scores / self.tau)
                self.logger.debug("exp-neg Hamming distances: %s" %  str(scores))

                scores = np.repeat(scores, seq_length)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)
                #  output = - input.gather(1, target) * mask * smooth_target
                output = - input.gather(1, preds) * mask_ * smooth_target

                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif self.version == 'glove-hamming': # here sampling with p instead of the reward q
                preds = torch.max(input_, dim=2)[1].squeeze().cpu().data
                refs = target_.cpu().data.numpy()
                num_img = target_.size(0) // self.seq_per_img
                # Hamming distances
                scores = np.array([hamming(u, v) for u, v in zip(preds.numpy(), refs)])
                #  scores = np.maximum(1 - np.repeat(scores, seq_length), 0)
                scores = np.exp(-1 * scores / self.tau_bis)
                self.logger.debug("exp-neg Hamming distances: %s" %  str(scores))

                scores = np.repeat(scores, seq_length)
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                preds = Variable(preds[:, :seq_length]).cuda()
                preds = to_contiguous(preds).view(-1, 1)

                #  output = - input.gather(1, target) * mask * smooth_target

                dist = self.Dist[preds.squeeze().data]
                smooth_target_wl = torch.exp(torch.mul(torch.add(dist, -1.), 1/self.tau))
                mask_wl = mask_.repeat(1, dist.size(1))
                smooth_target = smooth_target.repeat(1, dist.size(1))
                output_wl = - input * smooth_target_wl * mask_wl * smooth_target
                norm = torch.sum(smooth_target_wl * mask_wl * smooth_target)
                if norm.data[0] > 0:
                    output = torch.sum(output_wl) / norm
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output_wl)

                return real_output, self.alpha * output + (1 - self.alpha) * real_output

            elif self.version == 'hamming-sample':
                # Sample a distance:
                V = 30
                N = input_.size(0)
                distrib = [binom(seq_length, e) *
                           ((V-1) * math.exp(-1/self.tau))**(e-seq_length)
                           for e in range(seq_length+1)]
                select = np.random.choice(a=np.arange(seq_length + 1),
                                          p=np.array(distrib) / sum(distrib))
                score = math.exp(-select / self.tau)
                self.logger.debug("exp-neg Hamming distances (d=%d): %.2e" %
                                  (select, score))
                scores = np.ones((N, seq_length), dtype="float32") * score
                smooth_target = Variable(torch.from_numpy(scores).view(-1, 1)).cuda().float()
                refs = target_.cpu().data.numpy()
                # Format preds by changing d=select tokens at random; copy so the
                # references are not mutated in place
                preds = refs.copy()
                change_index = np.random.randint(seq_length, size=(N, select))
                rows = np.arange(N).reshape(-1, 1).repeat(select, axis=1)
                select_index = np.random.randint(self.vocab_size, size=(N, select))
                preds[rows, change_index] = select_index
                preds = Variable(torch.from_numpy(preds)).cuda()
                preds = to_contiguous(preds).view(-1, 1)
                #  output = - input.gather(1, target) * mask * smooth_target
                output = - input.gather(1, preds) * mask_ * smooth_target

                if torch.sum(smooth_target * mask_).data[0] > 0:
                    output = torch.sum(output) / torch.sum(smooth_target * mask_)
                else:
                    self.logger.warn("Smooth targets weights sum to 0")
                    output = torch.sum(output)
                return real_output, self.alpha * output + (1 - self.alpha) * real_output
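
A sketch of the 'hamming' reward above, assuming hamming is scipy.spatial.distance.hamming, which returns the fraction of differing positions:

import numpy as np
from scipy.spatial.distance import hamming

tau = 0.8
preds = np.array([[5, 9, 9, 0], [1, 2, 3, 4]])
refs = np.array([[5, 9, 7, 0], [1, 2, 3, 4]])
d = np.array([hamming(u, v) for u, v in zip(preds, refs)])  # [0.25, 0.]
reward = np.exp(-d / tau)                 # closer samples get larger weights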
Example #15
    def forward(self, input, target, mask, scores=None):
        # truncate to the same size
        seq_length = input.size(1)
        target = target[:, :seq_length]
        mask = mask[:, :seq_length]
        if self.scale_loss:
            row_scores = scores.repeat(1, seq_length)
            mask = torch.mul(mask, row_scores)
        ml_output = get_ml_loss(input, target, mask, norm=self.normalize_batch)
        # Get the similarities of the words in the batch (NxL, V)
        indices = to_contiguous(target).view(-1, 1).squeeze().data
        sim = self.Sim_Matrix[indices]
        # print('raw sim:', sim)
        if self.clip_sim:
            # keep only the similarities larger than the margin
            # self.logger.warn('Clipping the sim')
            sim = sim * sim.ge(self.margin).float()
        # flatten the logits to (N * seq_length, V) before any gathering
        input = to_contiguous(input).view(-1, input.size(2))
        if self.limited:
            # self.logger.warn('Limiting smoothing to the gt vocab')
            indices_vocab = get_indices_vocab(target, self.seq_per_img)
            sim = sim.gather(1, indices_vocab)
            input = input.gather(1, indices_vocab)

        if self.tau_word:
            smooth_target = torch.exp(torch.mul(sim, 1 / self.tau_word))
        else:
            # Do not exponentiate
            smooth_target = sim
        # Normalize the word reward distribution:
        smooth_target = normalize_reward(smooth_target)

        if self.exact:
            delta = Variable(torch.eye(self.vocab_size)[indices.cpu()]).cuda()
            smooth_target = torch.mul(smooth_target, self.alpha) + torch.mul(
                delta, (1 - self.alpha))
            # print("Smooth:", smooth_target)

        # Store some stats about the sentences scores:
        scalars = smooth_target.data.cpu().numpy()
        # print("Reward multip:", scalars[0][:10])

        stats = {"word_mean": np.mean(scalars), "word_std": np.std(scalars)}

        # print('smooth_target:', smooth_target)
        # Format
        mask = to_contiguous(mask).view(-1, 1)
        # print('in:', input.size(), 'mask:', mask.size(), 'smooth:', smooth_target.size())
        output = -input * mask.repeat(1, sim.size(1)) * smooth_target

        if self.normalize_batch:
            if torch.sum(mask).data[0] > 0:
                output = torch.sum(output) / torch.sum(mask)
            else:
                self.logger.warn("Smooth targets weights sum to 0")
                output = torch.sum(output)
        else:
            output = torch.sum(output)

        if self.add_entropy:
            H = rows_entropy(smooth_target).unsqueeze(1)
            entropy = torch.sum(H * mask)
            if self.normalize_batch:
                entropy /= torch.sum(mask)
            # print('Entropy:', entropy.data[0])
            output += entropy

        if self.scale_wl:
            self.logger.warn('Scaling the pure WL RAML by %.3f' %
                             self.scale_wl)
            output = self.scale_wl * output
        output = self.alpha * output + (1 - self.alpha) * ml_output
        return ml_output, output, stats
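
normalize_reward and rows_entropy are helpers not shown in this listing; a plausible minimal version consistent with how they are used here, offered as an assumption rather than the original implementation:

import torch

def normalize_reward(smooth_target):
    # assumed behavior: turn every row into a distribution over the vocabulary
    return smooth_target / smooth_target.sum(dim=1, keepdim=True)

def rows_entropy(distrib):
    # assumed behavior: Shannon entropy of each row
    return -(distrib * torch.log(distrib + 1e-12)).sum(dim=1)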