Example #1
    def compute_sim(self, ltext, rtext):
        # encode both sides under one variable scope so they share encoder weights
        with tf.variable_scope('encode') as encode_scope:
            ltext_feature = self.lforward(ltext)
            encode_scope.reuse_variables()  # reuse the same encoder variables for the right side
            rtext_feature = self.rforward(rtext)
        # score the two feature batches with the module-level compute_sim
        score = compute_sim(ltext_feature, rtext_feature)
        return score
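The helper compute_sim that these methods call is not shown on this page. Below is a minimal sketch of what it could look like, assuming the features are already L2-normalized and the score is the cosine similarity (a row-wise dot product); the shapes and the keepdims choice are assumptions, not the original implementation.

import tensorflow as tf  # TensorFlow 1.x style API, matching the examples on this page

def compute_sim(lfeature, rfeature):
    """Hypothetical similarity: cosine score between two feature batches.

    Assumes both inputs are [batch_size, hidden_size] and already L2-normalized,
    so the row-wise dot product equals cosine similarity.
    """
    # keep a trailing dim of 1 so the result ([batch_size, 1]) can later be
    # concatenated with negative scores along axis 1, as in Example #3 below
    return tf.reduce_sum(lfeature * rfeature, axis=-1, keepdims=True)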
Example #2
    def compute_image_text_sim(self, normed_image_feature,
                               normed_text_feature):
        #[batch_size, hidden_size]
        if FLAGS.fix_image_embedding:
            normed_image_feature = tf.stop_gradient(normed_image_feature)

        if FLAGS.fix_text_embedding:
            # stop gradients not only through the internal text embedding but also through the MLP, so the final text embedding is fully fixed
            normed_text_feature = tf.stop_gradient(normed_text_feature)

        return compute_sim(normed_image_feature, normed_text_feature)
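To illustrate what the FLAGS.fix_image_embedding and FLAGS.fix_text_embedding switches do, here is a small sketch (not from the original source) showing that tf.stop_gradient blocks gradient flow into the frozen branch while the other branch still receives gradients, again written against the TensorFlow 1.x API.

import tensorflow as tf

# toy variables standing in for the image and text feature branches
image_feature = tf.get_variable('image_feature', shape=[2, 4])
text_feature = tf.get_variable('text_feature', shape=[2, 4])

frozen_image = tf.stop_gradient(image_feature)       # treated as a constant during backprop
score = tf.reduce_sum(frozen_image * text_feature)   # toy similarity score

grads = tf.gradients(score, [image_feature, text_feature])
# grads[0] is None: no gradient reaches the stopped image branch.
# grads[1] is a tensor: the text branch still trains.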
Example #3
    def build_graph(self, ltext, rtext, neg_ltext, neg_rtext):
        assert (neg_ltext is not None) or (neg_rtext is not None)
        with tf.variable_scope(self.scope) as scope:
            ltext_feature = self.lforward(ltext)
            scope.reuse_variables()  # rforward shares the same RNN/CNN encoder weights
            rtext_feature = self.rforward(rtext)
            pos_score = compute_sim(ltext_feature, rtext_feature)

            #scope.reuse_variables()

            neg_scores_list = []
            if neg_rtext is not None:
                num_negs = neg_rtext.get_shape()[1]
                for i in range(num_negs):
                    neg_rtext_feature_i = self.rforward(neg_rtext[:, i, :])
                    neg_scores_i = compute_sim(ltext_feature,
                                               neg_rtext_feature_i)
                    neg_scores_list.append(neg_scores_i)
            if neg_ltext is not None:
                num_negs = neg_ltext.get_shape()[1]
                for i in range(num_negs):
                    neg_ltext_feature_i = self.lforward(neg_ltext[:, i, :])
                    neg_scores_i = compute_sim(neg_ltext_feature_i,
                                               rtext_feature)
                    neg_scores_list.append(neg_scores_i)

            #[batch_size, num_negs]
            neg_scores = tf.concat(neg_scores_list, 1)
            #---------------rank loss
            #[batch_size, 1 + num_negs]
            scores = tf.concat([pos_score, neg_scores], 1)
            tf.add_to_collection('scores', scores)

            loss = pairwise_loss(pos_score, neg_scores)

        return loss
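pairwise_loss is also not defined on this page. A common choice for this kind of positive-versus-negatives setup is a max-margin (hinge) ranking loss; the sketch below is only an assumption about its behavior, and the margin value is made up.

import tensorflow as tf

def pairwise_loss(pos_score, neg_scores, margin=0.1):
    """Hypothetical max-margin ranking loss.

    pos_score:  [batch_size, 1]        similarity of the positive pair
    neg_scores: [batch_size, num_negs] similarities of the negative pairs
    """
    # hinge: penalize any negative that comes within `margin` of the positive;
    # pos_score broadcasts against neg_scores along the num_negs axis
    losses = tf.maximum(0., margin - pos_score + neg_scores)
    return tf.reduce_mean(losses)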
Example #4
  def build_graph(self, image_feature, text,
                  neg_image_feature=None, neg_text=None, 
                  exact_prob=False, exact_loss=False,
                  weights=None):
    
    scope = tf.get_variable_scope()
    if not FLAGS.showtell_noimage:
      with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
        attention_states, initial_state, image_emb = self.encode(image_feature)
        if image_emb is not None:
          assert not FLAGS.add_text_start, 'if use image emb as input then must not pad start mark before sentence'
        else:
          assert FLAGS.add_text_start, 'if not use image emb as input then must pad start mark before sentence'
    else:
      print('Language only mode!', file=sys.stderr)
      image_emb = tf.zeros([melt.get_batch_size(text), self.emb_dim])
      initial_state = None
      attention_states = None

    with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
      #will pad start in decoder.sequence_loss if FLAGS.image_as_init_state
      scores = self.decoder.sequence_loss(text,
                                          input=image_emb, 
                                          initial_state=initial_state, 
                                          attention_states=attention_states, 
                                          exact_prob=exact_prob,
                                          exact_loss=exact_loss,
                                          vocab_weights=self.idf_weights if self.is_training else None,
                                          weights=weights if self.is_training else None) 

      loss = scores 

      if FLAGS.reinforcement_learning and self.is_training:
        assert not FLAGS.image_as_init_state, 'not support im2txt style for reinforcement_learning now, not tested!'
        assert self.rl, 'need to set rl for reinforcement_learning'
        tf.get_variable_scope().reuse_variables()
        max_words = TEXT_MAX_WORDS 
        convert_unk = True
        # code borrowed from https://github.com/arieling/SelfCriticalSequenceTraining-tensorflow
        #scores is -(negative log loss)
        sampled_caption, sampled_loss = self.decoder.generate_sequence_multinomial(image_emb, 
                                          max_words=max_words, 
                                          #max_words=16,
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          convert_unk=convert_unk,
                                          #length_normalization_factor=0.,
                                          need_logprobs=True)  

        self.rl.sampled_caption = sampled_caption

        greedy_caption, _ = self.decoder.generate_sequence_greedy(image_emb, 
                                          max_words=max_words,
                                          #max_words=20, 
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          convert_unk=convert_unk,
                                          need_logprobs=False)

        self.rl.greedy_caption = greedy_caption

        ratio = FLAGS.reinforcement_ratio
        
        # this requires loss and sampled_loss to share the same shape:
        # [batch_size] or [batch_size * text_length]
        loss = ratio * (self.rl.rewards_feed - self.rl.baseline_feed) * sampled_loss + (1 - ratio) * loss

        #loss = -loss

      if not self.is_predict:
        loss = tf.reduce_mean(loss)

      #if not self.is_training and not self.is_predict: #evaluate mode
      if self.is_training:
        tf.add_to_collection('train_scores', scores)
      elif not self.is_predict:
        tf.add_to_collection('eval_scores', scores)

      if FLAGS.discriminant_loss_ratio > 0 and self.is_training:
        assert neg_text is not None
        tf.get_variable_scope().reuse_variables()
        max_words = TEXT_MAX_WORDS 
        convert_unk = True
        greedy_caption, _ = self.decoder.generate_sequence_greedy(image_emb, 
                                  max_words=max_words,
                                  #max_words=20, 
                                  initial_state=initial_state,
                                  attention_states=attention_states,
                                  convert_unk=convert_unk,
                                  need_logprobs=False)
        text_feature = self.encoder2.encode(text, self.emb)
        text_feature = normalize(text_feature)
        # neg_text = neg_text[:, 0, :]
        # neg_text_feature = self.encoder2.encode(neg_text, self.emb)
        # neg_text_feature = normalize(neg_text_feature)
        caption_feature = self.encoder2.encode(greedy_caption, self.emb)
        caption_feature = normalize(caption_feature)
        pos_score = compute_sim(caption_feature, text_feature)
        # neg_score = compute_sim(caption_feature, neg_text_feature)
        tf.add_to_collection('pos_score', pos_score)
        # tf.add_to_collection('neg_score', neg_score)
        # discriminant_loss = pairwise_loss(pos_score, neg_score)
        discriminant_loss = tf.reduce_mean((1. - pos_score) / 2.)
        # TODO: discriminant_loss is already a mean, so the reduced generation loss can be used directly and discriminant_loss * ratio added on top
        tf.add_to_collection('discriminant_loss', discriminant_loss)
        ratio = FLAGS.discriminant_loss_ratio
        tf.add_to_collection('gen_loss', loss)
        loss += ratio * discriminant_loss 

      if FLAGS.alignment_history and self.is_training:
        alignment_history = self.decoder.alignment_history
        tf.add_to_collection('alignment_history', alignment_history)

        if FLAGS.alignment_loss_ratio > 0: 
          lengths = self.decoder.final_sequence_lengths
          alignment_loss = self.calc_alignment_loss(alignment_history, lengths)
          tf.add_to_collection('alignment_loss', alignment_loss)
          # alignment_loss can be fairly large (e.g. around 4.1)
          ratio = FLAGS.alignment_loss_ratio
          #loss = (1 - ratio) * loss + ratio * alignment_loss
          loss += ratio * alignment_loss 

    self.main_loss = loss

    if self.is_predict:
      loss = tf.squeeze(loss)

    return loss
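For reference, the reinforcement_learning branch in this example mixes a self-critical policy-gradient term with the maximum-likelihood sequence loss. The sketch below just restates that mixing formula with placeholders standing in for self.rl.rewards_feed, self.rl.baseline_feed and FLAGS.reinforcement_ratio; the names and shapes are assumptions, not the original code.

import tensorflow as tf

ratio = 0.5                                        # stands in for FLAGS.reinforcement_ratio
mle_loss = tf.placeholder(tf.float32, [None])      # per-example sequence loss for the ground-truth text
sampled_loss = tf.placeholder(tf.float32, [None])  # sequence loss of the multinomially sampled caption
rewards = tf.placeholder(tf.float32, [None])       # reward of the sampled caption (e.g. CIDEr)
baseline = tf.placeholder(tf.float32, [None])      # reward of the greedy caption (self-critical baseline)

# the advantage weights the sampled-caption loss; the remainder is the ordinary MLE loss
advantage = rewards - baseline
mixed_loss = ratio * advantage * sampled_loss + (1. - ratio) * mle_loss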