def build_predict_text_graph(self,
                                 image,
                                 decode_method='greedy',
                                 beam_size=5,
                                 convert_unk=True):
        attention_states, initial_state, image_emb = self.encoder.encode(
            self.process(image))
        if FLAGS.image_as_init_state:
            #for im2txt one more step at first
            with tf.variable_scope(self.decoder.scope) as scope:
                batch_size = melt.get_batch_size(image_emb)
                zero_state = self.decoder.cell.zero_state(batch_size,
                                                          dtype=tf.float32)
                _, initial_state = self.decoder.cell(image_emb, zero_state)
                image_emb = self.decoder.get_start_embedding_input(batch_size)
        elif image_emb is None:
            #TODO check
            batch_size = melt.get_batch_size(image)
            image_emb = self.decoder.get_start_embedding_input(batch_size)

        max_words = TEXT_MAX_WORDS
        if decode_method == SeqDecodeMethod.greedy:
            return self.decoder.generate_sequence_greedy(
                image_emb,
                max_words=max_words,
                initial_state=initial_state,
                attention_states=attention_states,
                convert_unk=convert_unk)
        elif decode_method == SeqDecodeMethod.multinomal:
            return self.decoder.generate_sequence_multinomial(
                image_emb,
                max_words=max_words,
                initial_state=initial_state,
                attention_states=attention_states,
                convert_unk=convert_unk)
        else:
            if decode_method == SeqDecodeMethod.ingraph_beam:
                decode_func = self.decoder.generate_sequence_ingraph_beam
            elif decode_method == SeqDecodeMethod.outgraph_beam:
                decode_func = self.decoder.generate_sequence_outgraph_beam
            else:
                raise ValueError('not supported decode_method: %s' %
                                 decode_method)

            return decode_func(
                image_emb,
                max_words=max_words,
                initial_state=initial_state,
                beam_size=beam_size,
                convert_unk=convert_unk,
                attention_states=attention_states,
                length_normalization_factor=FLAGS.length_normalization_factor)
Example #2
    def build_image_words_sim_graph(self):
        with tf.variable_scope(self.scope):
            #[1, atten_size, emb_dim]
            image_feature = self.forward_image_feature(
                self.get_image_feature_feed())
            #[vocab_size, 1, emb_dim]
            word_feature, words = self.forward_word_feature()

            #image_feature_batch_size = melt.get_batch_size(image_feature)
            word_feature_batch_size = melt.get_batch_size(word_feature)

            #[vocab_size, atten_size, emb_dim]
            image_feature = tf.contrib.seq2seq.tile_batch(
                image_feature, word_feature_batch_size)
            #word_feature = tf.contrib.seq2seq.tile_batch(word_feature, image_feature_batch_size)

            #[vocab_size, 1]
            #score = self.compute_image_text_sim(image_feature, word_feature, words)
            score = self.compute_image_text_sim(image_feature, word_feature)

            score = tf.expand_dims(tf.squeeze(score), 0)
            #print(image_feature, word_feature, words, score)
            #[1, vocab_size]
            #score = tf.transpose(score, [1, 0])
            return score
Example #3
  def encode(self, seq, seq_len=None, output_method='all'):
    with tf.variable_scope(self.scope):
      num_filters = self.num_units
      seqs = [seq]
      batch_size = melt.get_batch_size(seq)
     
      kernel_sizes = [3, 5, 7, 9, 11, 13]
      #kernel_sizes = [3] * 7
      assert self.num_layers <= len(kernel_sizes)
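      # layer i uses kernel_sizes[i]; the outputs of all conv layers are concatenated at the end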

      for layer in range(self.num_layers):
        input_size_ = melt.get_shape(seq, -1) if layer == 0 else num_filters
        # dropout on the previous layer's output, then convolve the dropped-out tensor
        seq = melt.dropout(seqs[-1], self.keep_prob, self.is_train)
        seq = tf.layers.conv1d(seq, num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
        # mask = melt.dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
        #                   keep_prob=self.keep_prob, is_train=self.is_train, mode=None)
        #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=3, padding='same', activation=tf.nn.relu)
        #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
        
        # if self.is_train and self.keep_prob < 1:
        #   seq = tf.nn.dropout(seq, self.keep_prob)
        #seq = melt.layers.batch_norm(seq, self.is_train, name='layer_%d' % layer)
        seqs.append(seq)
      
      outputs = tf.concat(seqs[1:], 2)
      # no extra dropout on the concatenated outputs here; apply dropout outside the convnet
      # if self.is_train and self.keep_prob < 1:
      #   outputs = tf.nn.dropout(outputs, self.keep_prob)

      # compatible with rnn encoders that also return state
      return melt.rnn.encode_outputs(outputs, seq_len, output_method)
Example #4
    def __init__(self,
                 input,
                 max_steps,
                 initial_state,
                 beam_size=7,
                 done_token=0,
                 batch_size=None,
                 num_classes=None,
                 output_fn=None,
                 length_normalization_factor=0.,
                 topn=1,
                 need_softmax=True,
                 logprobs_history=False,
                 alignment_history=False,
                 fast_greedy=False):
        self.length_normalization_factor = length_normalization_factor
        self.topn = topn
        self.need_softmax = need_softmax
        self.beam_size = beam_size
        self.batch_size = batch_size
        if self.batch_size is None:
            self.batch_size = melt.get_batch_size(input)
        self.max_len = max_steps
        self.num_classes = num_classes
        self.done_token = done_token
        self.pad_token = 0

        self.output_fn = output_fn

        self.past_logprobs = None
        self.past_symbols = None

        self.past_step_logprobs = None

        self.fast_greedy = fast_greedy
        if self.fast_greedy:
            self.finished_beams = tf.zeros((self.batch_size, self.max_len),
                                           dtype=tf.int32)
            self.logprobs_finished_beams = tf.ones(
                (self.batch_size, ), dtype=tf.float32) * -float('inf')
        else:
            self.path_list = []
            self.logprobs_list = []
            self.step_logprobs_list = []
            self.alignments_path_list = []

        #the rnn_decoder loop needs one extra step: step i == 0 does not output words, word steps start from i == 1
        self.decoder_inputs = [None] * (self.max_len + 1)
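        # the first decoder input is replicated beam_size times so every beam starts from the same embedding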
        self.decoder_inputs[0] = tf.contrib.seq2seq.tile_batch(
            input, beam_size)

        self.initial_state = initial_state
        self.final_state = None

        self.log_probs_history = None
        self.alignment_history = None

        self.need_logprobs_history = logprobs_history
        self.need_alignment_history = alignment_history
Example #5
    def beam_search_step(self,
                         input,
                         state,
                         cell,
                         beam_size,
                         attention_construct_fn=None,
                         input_text=None):
        output, state = cell(input, state)

        if hasattr(state, 'alignments'):
            tf.add_to_collection('attention_alignments', state.alignments)
            tf.add_to_collection('beam_search_alignments',
                                 tf.get_collection('attention_alignments')[-1])

        #TODO: this step causes attention decoding to still need input_text fed at each step after initialization.
        #Does this cause attention_keys and attention_values to be recomputed (i.e. the encoding redone) each step?
        #Can we avoid this? There seems to be no better method;
        #if encoding is slow, attention_keys and attention_values could be fed at each step instead.
        if not FLAGS.decode_use_alignment:
            if FLAGS.gen_only:
                output_fn = self.output_fn
                logits = output_fn(output)
            else:
                indices = melt.batch_values_to_indices(tf.to_int32(input_text))
                batch_size = melt.get_batch_size(input)

                if FLAGS.copy_only:
                    output_fn_ = self.copy_output_fn
                else:
                    output_fn_ = self.gen_copy_output_fn
                output_fn = lambda cell_output, cell_state: output_fn_(
                    indices, batch_size, cell_output, cell_state)

                logits = output_fn(output, state)

            if FLAGS.gen_copy_switch and FLAGS.switch_after_softmax:
                logprobs = tf.log(logits)
            else:
                logprobs = tf.nn.log_softmax(logits)

            if FLAGS.decode_copy:
                logprobs = melt.gather_cols(logprobs, tf.to_int32(input_text))
        else:
            logits = state.alignments
            logits = logits[:, :tf.shape(input_text)[-1]]
            logprobs = tf.nn.log_softmax(logits)

        top_logprobs, top_ids = tf.nn.top_k(logprobs, beam_size)
        #------too slow: transferring large amounts of data between python and c++ costs a lot!
        #top_logprobs, top_ids = tf.nn.top_k(logprobs, self.vocab_size)

        if input_text is not None and FLAGS.decode_copy:
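            # with decode_copy, top_ids index positions within input_text (logprobs were
            # gathered over input_text columns above), so map them back to vocabulary ids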
            top_ids = tf.nn.embedding_lookup(input_text, top_ids)

        if hasattr(state, 'cell_state'):
            state = state.cell_state

        return output, state, top_logprobs, top_ids
Example #6
    def _encode(self, image):
        attention_states, initial_state, image_emb = self.encoder.encode(
            self.process(image))
        if FLAGS.image_as_init_state:
            #for im2txt one more step at first
            with tf.variable_scope(self.decoder.scope):
                batch_size = melt.get_batch_size(image_emb)
                zero_state = self.decoder.cell.zero_state(batch_size,
                                                          dtype=tf.float32)
                _, initial_state = self.decoder.cell(image_emb, zero_state)
                image_emb = self.decoder.get_start_embedding_input(batch_size)
        elif image_emb is None:
            #TODO check
            batch_size = melt.get_batch_size(image)
            image_emb = self.decoder.get_start_embedding_input(batch_size)

        image_emb = self._post_deal_image_embedding(image_emb, image)
        #attention_states, initial_state, image_emb = self._post_deal(attention_states, initial_state, image_emb)
        return attention_states, initial_state, image_emb
Example #7
    def generate_sequence_greedy(self,
                                 input,
                                 max_words,
                                 initial_state=None,
                                 attention_states=None,
                                 convert_unk=True,
                                 input_text=None,
                                 emb=None):
        """
    this one is using greedy search method
    for beam search using generate_sequence_by_beam_search with addditional params like beam_size
    """
        if emb is None:
            emb = self.emb

        batch_size = melt.get_batch_size(input)
        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        helper = melt.seq2seq.GreedyEmbeddingHelper(embedding=emb,
                                                    first_input=input,
                                                    end_token=self.end_id)

        if FLAGS.gen_only:
            output_fn = self.output_fn
        else:
            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn_ = self.copy_output_fn
            else:
                output_fn_ = self.gen_copy_output_fn
            output_fn = lambda cell_output, cell_state: output_fn_(
                indices, batch_size, cell_output, cell_state)

        my_decoder = melt.seq2seq.BasicDecoder(cell=cell,
                                               helper=helper,
                                               initial_state=state,
                                               vocab_size=self.vocab_size,
                                               output_fn=output_fn)

        outputs, _, _ = melt.seq2seq.dynamic_decode(
            my_decoder, maximum_iterations=max_words, scope=self.scope)
        generated_sequence = outputs.sample_id
        #------like beam search return sequence, score
        return generated_sequence, tf.zeros([
            batch_size,
        ])
  def encode(self, image_feature):
    if FLAGS.scene_model:
      if not hasattr(self.encoder, 'scene_feature'):
        self.encoder.scene_feature = self.scene.feature_feed
    attention_states, initial_state, image_emb = self.encoder.encode(self.process(image_feature))
    if FLAGS.image_as_init_state:
      #for im2txt: one more step at first; just for experiments, not used much
      with tf.variable_scope(self.decoder.scope) as scope:
        zero_state = self.decoder.cell.zero_state(batch_size=melt.get_batch_size(image_emb), dtype=tf.float32)
        _, initial_state = self.decoder.cell(image_emb, zero_state)
        image_emb = None 

    self.image_emb = image_emb
    image_emb = self._post_deal_image_embedding(image_emb, image_feature)
    #attention_states, initial_state, image_emb = self._post_deal(attention_states, initial_state, image_emb)
    return attention_states, initial_state, image_emb   
Example #9
    def build_predict_text_graph(self,
                                 input_text,
                                 decode_method=0,
                                 beam_size=5,
                                 convert_unk=True):
        with tf.variable_scope("encode"):
            encoder_output, state = self.encoder.encode(input_text)
            if not FLAGS.use_attention:
                encoder_output = None
        with tf.variable_scope("decode"):
            #---try to use static shape if possible
            batch_size = melt.get_batch_size(input_text)
            decoder_input = self.decoder.get_start_embedding_input(batch_size)
            max_words = FLAGS.decode_max_words if FLAGS.decode_max_words else TEXT_MAX_WORDS
            if decode_method == SeqDecodeMethod.greedy:
                input_text = self.encoder.sequence
                return self.decoder.generate_sequence_greedy(
                    decoder_input,
                    max_words=max_words,
                    initial_state=state,
                    attention_states=encoder_output,
                    convert_unk=convert_unk,
                    input_text=input_text)
            else:
                if decode_method == SeqDecodeMethod.beam:
                    decode_func = self.decoder.generate_sequence_beam
                elif decode_method == SeqDecodeMethod.beam_search:
                    decode_func = self.decoder.generate_sequence_beam_search
                else:
                    raise ValueError('not supported decode_method: %d' %
                                     decode_method)

                input_text, input_text_length = melt.pad(
                    input_text, end_id=self.encoder.end_id)
                #input_text = self.encoder.sequence
                #input_text_length = self.encoder.sequence_length
                return decode_func(decoder_input,
                                   max_words=max_words,
                                   initial_state=state,
                                   attention_states=encoder_output,
                                   beam_size=beam_size,
                                   convert_unk=convert_unk,
                                   length_normalization_factor=FLAGS.
                                   length_normalization_factor,
                                   input_text=input_text,
                                   input_text_length=input_text_length)
Example #10
    def build_graph(self,
                    image_feature,
                    text,
                    neg_image_feature=None,
                    neg_text=None,
                    exact_prob=False,
                    exact_loss=False):
        attention_states, initial_state, image_emb = self.encoder.encode(
            self.process(image_feature))

        if not FLAGS.image_as_init_state:
            #mostly go here
            scores = self.decoder.sequence_loss(
                text,
                input=image_emb,
                initial_state=initial_state,
                attention_states=attention_states,
                exact_prob=exact_prob,
                exact_loss=exact_loss)
        else:
            #for im2txt: one more step at first; just for experiments, not used much
            with tf.variable_scope(self.decoder.scope) as scope:
                zero_state = self.decoder.cell.zero_state(
                    batch_size=melt.get_batch_size(image_emb), dtype=tf.float32)
                _, initial_state = self.decoder.cell(image_emb, zero_state)
            #will pad start in decoder.sequence_loss
            scores = self.decoder.sequence_loss(
                text,
                input=None,
                initial_state=initial_state,
                attention_states=attention_states,
                exact_prob=exact_prob,
                exact_loss=exact_loss)

        if not self.is_training and not self.is_predict:  #evaluate mode
            tf.add_to_collection('scores', scores)

        if not self.is_predict:
            loss = tf.reduce_mean(scores)
        else:
            loss = scores

        return loss
    def build_graph(self,
                    image_feature,
                    text,
                    neg_image_feature=None,
                    neg_text=None,
                    exact_loss=False):

        image_emb = self.build_image_embeddings(image_feature)

        attention_states = None
        if FLAGS.show_atten_tell:
            image_emb, attention_states = self.init_attention(image_emb)

        if not FLAGS.image_as_init_state:
            scores = self.decoder.sequence_loss(
                text,
                input=image_emb,
                attention_states=attention_states,
                exact_loss=exact_loss)
        else:
            #for im2txt one more step at first
            with tf.variable_scope(self.decoder.scope) as scope:
                zero_state = self.decoder.cell.zero_state(
                    batch_size=melt.get_batch_size(image_emb),
                    dtype=tf.float32)
                _, initial_state = self.decoder.cell(image_emb, zero_state)
            #will pad start in decoder.sequence_loss
            scores = self.decoder.sequence_loss(
                text,
                initial_state=initial_state,
                attention_states=attention_states,
                exact_loss=exact_loss)

        if not self.is_training and not self.is_predict:  #evaluate mode
            tf.add_to_collection('scores', scores)

        if not self.is_predict:
            loss = tf.reduce_mean(scores)
        else:
            loss = scores

        return loss
Example #12
  def encode(self, image_features):
    image_emb = features2feature(image_features, is_training=self.is_training)
 
    image_features = tf.concat([image_features, tf.expand_dims(image_emb, 1)], 1)    
    image_embs = self.build_image_embeddings(image_features)
    image_emb = image_embs[:,-1]
    image_embs = image_embs[:,:-1]
    #128,64,512 64,512
    image_embs = tf.concat([image_embs, tf.tile(tf.expand_dims(self.pos_emb, 0), [melt.get_batch_size(image_embs), 1, 1])], 1)
    #to make it like rnn encoder outputs
    with tf.variable_scope("attention_embedding") as scope:
      encoder_output = tf.contrib.layers.fully_connected(
          inputs=image_embs,
          num_outputs=FLAGS.rnn_hidden_size,
          activation_fn=None,
          weights_initializer=self.initializer,
          biases_initializer=None,
          scope=scope)
    state = None
    #image_emb = tf.reduce_mean(image_embs, 1)

    return encoder_output, state, image_emb
  def build_graph(self, image_feature, text, 
                  neg_image_feature=None, neg_text=None, 
                  exact_prob=False, exact_loss=False,
                  weights=None):
    
    scope = tf.get_variable_scope()
    if not FLAGS.showtell_noimage:
      with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
        attention_states, initial_state, image_emb = self.encode(image_feature)
        if image_emb is not None:
          assert not FLAGS.add_text_start, 'if use image emb as input then must not pad start mark before sentence'
        else:
          assert FLAGS.add_text_start, 'if not use image emb as input then must pad start mark before sentence'
    else:
      print('Language only mode!', file=sys.stderr)
      image_emb = tf.zeros([melt.get_batch_size(text), self.emb_dim])
      initial_state = None
      attention_states = None

    with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
      #will pad start in decoder.sequence_loss if FLAGS.image_as_init_state
      scores = self.decoder.sequence_loss(text,
                                          input=image_emb, 
                                          initial_state=initial_state, 
                                          attention_states=attention_states, 
                                          exact_prob=exact_prob,
                                          exact_loss=exact_loss,
                                          vocab_weights=self.idf_weights if self.is_training else None,
                                          weights=weights if self.is_training else None) 

      loss = scores 

      if FLAGS.reinforcement_learning and self.is_training:
        assert not FLAGS.image_as_init_state, 'not support im2txt style for reinforcement_learning now, not tested!'
        assert self.rl, 'need to set rl for reinforcement_learning'
        tf.get_variable_scope().reuse_variables()
        max_words = TEXT_MAX_WORDS 
        convert_unk = True
        #code borrowed from https://github.com/arieling/SelfCriticalSequenceTraining-tensorflow
        #scores is -(negative log loss)
        sampled_caption, sampled_loss = self.decoder.generate_sequence_multinomial(image_emb, 
                                          max_words=max_words, 
                                          #max_words=16,
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          convert_unk=convert_unk,
                                          #length_normalization_factor=0.,
                                          need_logprobs=True)  

        self.rl.sampled_caption = sampled_caption

        greedy_caption, _ = self.decoder.generate_sequence_greedy(image_emb, 
                                          max_words=max_words,
                                          #max_words=20, 
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          convert_unk=convert_unk,
                                          need_logprobs=False)

        self.rl.greedy_caption = greedy_caption

        ratio = FLAGS.reinforcement_ratio
        
        #for this to work, loss and sampled_loss must have the same shape: [batch_size] or [batch_size * text_length]
        loss = ratio * (self.rl.rewards_feed - self.rl.baseline_feed) * sampled_loss + (1- ratio) * loss

        #loss = -loss

      if not self.is_predict:
        loss = tf.reduce_mean(loss)

      #if not self.is_training and not self.is_predict: #evaluate mode
      if self.is_training:
        tf.add_to_collection('train_scores', scores)
      elif not self.is_predict:
        tf.add_to_collection('eval_scores', scores)

      if FLAGS.discriminant_loss_ratio > 0 and self.is_training:
        assert neg_text is not None
        tf.get_variable_scope().reuse_variables()
        max_words = TEXT_MAX_WORDS 
        convert_unk = True
        greedy_caption, _ = self.decoder.generate_sequence_greedy(image_emb, 
                                  max_words=max_words,
                                  #max_words=20, 
                                  initial_state=initial_state,
                                  attention_states=attention_states,
                                  convert_unk=convert_unk,
                                  need_logprobs=False)
        text_feature = self.encoder2.encode(text, self.emb)
        text_feature = normalize(text_feature)
        # neg_text = neg_text[:, 0, :]
        # neg_text_feature = self.encoder2.encode(neg_text, self.emb)
        # neg_text_feature = normalize(neg_text_feature)
        caption_feature = self.encoder2.encode(greedy_caption, self.emb)
        caption_feature = normalize(caption_feature)
        pos_score = compute_sim(caption_feature, text_feature)
        # neg_score = compute_sim(caption_feature, neg_text_feature)
        tf.add_to_collection('pos_score', pos_score)
        # tf.add_to_collection('neg_score', neg_score)
        # discriminant_loss = pairwise_loss(pos_score, neg_score)
        discriminant_loss = tf.reduce_mean((1. - pos_score) / 2.)
        #TODO this is mean loss so can use reduced loss then add discriminant_loss * ratio
        tf.add_to_collection('discriminant_loss', discriminant_loss)
        ratio = FLAGS.discriminant_loss_ratio
        tf.add_to_collection('gen_loss', loss)
        loss += ratio * discriminant_loss 

      if FLAGS.alignment_history and self.is_training:
        alignment_history = self.decoder.alignment_history
        tf.add_to_collection('alignment_history', alignment_history)

        if FLAGS.alignment_loss_ratio > 0: 
          lengths = self.decoder.final_sequence_lengths
          alignment_loss = self.calc_alignment_loss(alignment_history, lengths)
          tf.add_to_collection('alignment_loss', alignment_loss)
          #alignment_loss might be 4.1 ..
          ratio = FLAGS.alignment_loss_ratio
          #loss = (1 - ratio) * loss + ratio * alignment_loss
          loss += ratio * alignment_loss 

    self.main_loss = loss

    if self.is_predict:
      loss = tf.squeeze(loss)

    return loss
  def dupimage_process(self, image_feature):
    processed_image_feature = self.image_process_fn(tf.slice(image_feature, [0], [1]))
    # TODO: verify the tile_batch below actually works as intended
    processed_image_feature = tf.contrib.seq2seq.tile_batch(processed_image_feature, melt.get_batch_size(image_feature))
    return processed_image_feature
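
# A standalone illustration, not part of the original project, of what
# tf.contrib.seq2seq.tile_batch does in dupimage_process above: every batch entry
# is repeated `multiplier` times along the batch axis, keeping the copies adjacent.
import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as sess:
  x = tf.constant([[1., 2.], [3., 4.]])        # a batch of 2 feature vectors
  tiled = tf.contrib.seq2seq.tile_batch(x, 3)  # -> shape [6, 2]
  print(sess.run(tiled))
  # [[1. 2.] [1. 2.] [1. 2.] [3. 4.] [3. 4.] [3. 4.]]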
Example #15
    def encode(self,
               inputs,
               seq_len,
               emb=None,
               concat_layers=True,
               output_method=OutputMethod.all):
        if emb is not None:
            inputs = tf.nn.embedding_lookup(emb, inputs)

        outputs = [inputs]
        keep_prob = self.keep_prob
        num_units = self.num_units
        is_train = self.is_train

        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            for layer in range(self.num_layers):
                input_size_ = melt.get_shape(
                    inputs, -1) if layer == 0 else 2 * num_units
                batch_size = melt.get_batch_size(inputs)
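                # dropout masks have shape [batch_size, 1, input_size], so the same mask is
                # broadcast over every timestep (variational/recurrent dropout); with
                # share_dropout the per-layer mask is cached and reused on later calls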
                with tf.variable_scope("fw_{}".format(layer)):
                    gru_fw = tf.contrib.rnn.GRUCell(num_units)
                    if not self.share_dropout:
                        mask_fw = dropout(tf.ones([batch_size, 1, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          is_train=is_train,
                                          mode=self.dropout_mode)
                    else:
                        if self.dropout_mask_fw[layer] is None:
                            mask_fw = dropout(
                                tf.ones([batch_size, 1, input_size_],
                                        dtype=tf.float32),
                                keep_prob=keep_prob,
                                is_train=is_train,
                                mode=self.dropout_mode)
                            self.dropout_mask_fw[layer] = mask_fw
                        else:
                            mask_fw = self.dropout_mask_fw[layer]
                    if self.train_init_state:
                        if self.init_fw[layer] is None:
                            self.init_fw[layer] = tf.tile(
                                tf.get_variable("init_state", [1, num_units],
                                                tf.float32,
                                                tf.zeros_initializer()),
                                [batch_size, 1])
                    out_fw, state = tf.nn.dynamic_rnn(
                        gru_fw,
                        outputs[-1] * mask_fw,
                        seq_len,
                        initial_state=self.init_fw[layer],
                        dtype=tf.float32)
                with tf.variable_scope("bw_{}".format(layer)):
                    gru_bw = tf.contrib.rnn.GRUCell(num_units)
                    if not self.share_dropout:
                        mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          is_train=is_train,
                                          mode=self.dropout_mode)
                    else:
                        if self.dropout_mask_bw[layer] is None:
                            mask_bw = dropout(
                                tf.ones([batch_size, 1, input_size_],
                                        dtype=tf.float32),
                                keep_prob=keep_prob,
                                is_train=is_train,
                                mode=self.dropout_mode)
                            self.dropout_mask_bw[layer] = mask_bw
                        else:
                            mask_bw = self.dropout_mask_bw[layer]
                    if self.train_init_state:
                        if self.init_bw[layer] is None:
                            self.init_bw[layer] = tf.tile(
                                tf.get_variable("init_state", [1, num_units],
                                                tf.float32,
                                                tf.zeros_initializer()),
                                [batch_size, 1])
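                    # backward pass: reverse each sequence up to its true length, run a
                    # forward GRU over it, then reverse the outputs back below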
                    inputs_bw = tf.reverse_sequence(outputs[-1] * mask_bw,
                                                    seq_lengths=seq_len,
                                                    seq_dim=1,
                                                    batch_dim=0)
                    out_bw, _ = tf.nn.dynamic_rnn(
                        gru_bw,
                        inputs_bw,
                        seq_len,
                        initial_state=self.init_bw[layer],
                        dtype=tf.float32)
                    out_bw = tf.reverse_sequence(out_bw,
                                                 seq_lengths=seq_len,
                                                 seq_dim=1,
                                                 batch_dim=0)
                outputs.append(tf.concat([out_fw, out_bw], axis=2))

        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
        else:
            res = outputs[-1]
        res = encode_outputs(res, seq_len, output_method=output_method)
        return res
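
# A minimal standalone sketch, not from the original project, of the tf.reverse_sequence
# trick used above for the backward GRU: each row is reversed only up to its own length,
# so padding positions stay in place.
import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as sess:
  seq = tf.constant([[1, 2, 3, 0],
                     [4, 5, 0, 0]])
  lens = tf.constant([3, 2])
  rev = tf.reverse_sequence(seq, seq_lengths=lens, seq_axis=1, batch_axis=0)
  print(sess.run(rev))
  # [[3 2 1 0]
  #  [5 4 0 0]]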
Example #16
    def encode(self,
               inputs,
               seq_len,
               emb=None,
               concat_layers=True,
               output_method=OutputMethod.all):
        if emb is not None:
            inputs = tf.nn.embedding_lookup(emb, inputs)

        outputs = [tf.transpose(inputs, [1, 0, 2])]
        #states = []
        keep_prob = self.keep_prob
        num_units = self.num_units
        is_train = self.is_train

        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            for layer in range(self.num_layers):
                input_size_ = melt.get_shape(
                    inputs, -1) if layer == 0 else 2 * num_units
                batch_size = melt.get_batch_size(inputs)

                with tf.variable_scope("fw_{}".format(layer)):
                    gru_fw = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1,
                                                           num_units=num_units)
                    if not self.share_dropout:
                        # mode is None because a mask defined this way is already recurrent (same mask every timestep)
                        mask_fw = dropout(tf.ones([1, batch_size, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          is_train=is_train,
                                          mode=None)
                    else:
                        if self.dropout_mask_fw[layer] is None:
                            mask_fw = dropout(
                                tf.ones([1, batch_size, input_size_],
                                        dtype=tf.float32),
                                keep_prob=keep_prob,
                                is_train=is_train,
                                mode=None)
                            self.dropout_mask_fw[layer] = mask_fw
                        else:
                            mask_fw = self.dropout_mask_fw[layer]
                    if self.train_init_state:
                        if self.init_fw[layer] is None:
                            self.init_fw[layer] = (tf.tile(
                                tf.get_variable("init_state",
                                                [1, 1, num_units], tf.float32,
                                                tf.zeros_initializer()),
                                [1, batch_size, 1]), )
                    out_fw, state_fw = gru_fw(outputs[-1] * mask_fw,
                                              self.init_fw[layer])

                with tf.variable_scope("bw_{}".format(layer)):
                    gru_bw = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1,
                                                           num_units=num_units)
                    if not self.share_dropout:
                        mask_bw = dropout(tf.ones([1, batch_size, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          is_train=is_train,
                                          mode=None)
                    else:
                        if self.dropout_mask_bw[layer] is None:
                            mask_bw = dropout(
                                tf.ones([1, batch_size, input_size_],
                                        dtype=tf.float32),
                                keep_prob=keep_prob,
                                is_train=is_train,
                                mode=None)
                            self.dropout_mask_bw[layer] = mask_bw
                        else:
                            mask_bw = self.dropout_mask_bw[layer]
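                    # inputs are time-major here ([time, batch, dim], transposed above for
                    # CudnnGRU), hence seq_dim=0 and batch_dim=1 when reversing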
                    inputs_bw = tf.reverse_sequence(outputs[-1] * mask_bw,
                                                    seq_lengths=seq_len,
                                                    seq_dim=0,
                                                    batch_dim=1)
                    if self.train_init_state:
                        if self.init_bw[layer] is None:
                            self.init_bw[layer] = (tf.tile(
                                tf.get_variable("init_state",
                                                [1, 1, num_units], tf.float32,
                                                tf.zeros_initializer()),
                                [1, batch_size, 1]), )
                    out_bw, state_bw = gru_bw(inputs_bw, self.init_bw[layer])
                    out_bw = tf.reverse_sequence(out_bw,
                                                 seq_lengths=seq_len,
                                                 seq_dim=0,
                                                 batch_dim=1)

                outputs.append(tf.concat([out_fw, out_bw], axis=2))
                #states.append(tf.concat([state_fw, state_bw], axis=-1))

        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
            #state = tf.concat(states, axis=-1)
        else:
            res = outputs[-1]
            #state = states[-1]

        res = tf.transpose(res, [1, 0, 2])
        #state = tf.squeeze(state)
        #state = tf.reshape(state, [-1, num_units * 2 * self.num_layers])
        #res = encode_outputs(res, output_method=output_method, sequence_length=seq_len, state=state)
        res = encode_outputs(res,
                             output_method=output_method,
                             sequence_length=seq_len)

        self.state = (state_fw, state_bw)
        return res
Example #17
    def generate_sequence_beam_search(self,
                                      input,
                                      max_words=None,
                                      initial_state=None,
                                      attention_states=None,
                                      beam_size=10,
                                      convert_unk=True,
                                      length_normalization_factor=0.,
                                      input_text=None,
                                      input_text_length=None,
                                      emb=None):
        """
    outgraph beam search, input should be one instance only batch_size=1
    max_words actually not used here... for it is determined outgraph..
    return top (path, score)
    TODO this is hacky, first step attention_state, input , state all size 1,
    then should be attention_state 1, input, state size is beam_size,
    also might be less then beam_size.. if not possible to find beam_size un done
    """
        if emb is None:
            emb = self.emb

        tf.add_to_collection('beam_search_beam_size', tf.constant(beam_size))
        if input_text is not None:
            if FLAGS.decode_copy:
                input_text = tf.squeeze(input_text)
                input_text_length = tf.to_int32(tf.squeeze(input_text_length))
                input_text = input_text[0:input_text_length]
                input_text, _ = tf.unique(input_text)
                input_text_length = tf.shape(input_text)[-1]
                #sort from small to large
                #input_text, _ = -tf.nn.top_k(-input_text, input_text_length)
                #TODO: maybe this should stay input_text_length, so more decode limiting can be done out of graph, e.g. using a trie!
                beam_size = tf.minimum(beam_size, input_text_length)
            elif FLAGS.decode_use_alignment:
                input_text = tf.squeeze(input_text)
                input_text_length = tf.to_int32(tf.squeeze(input_text_length))
                input_text = input_text[0:input_text_length]
                input_text_length = tf.shape(input_text)[-1]
                beam_size = tf.minimum(beam_size, input_text_length)
            else:
                if FLAGS.gen_only:
                    input_text = None

        batch_size = melt.get_batch_size(input)
        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(batch_size, tf.float32) \
            if initial_state is None else initial_state

        ##--TODO: hard, since variables need to be reused to be shared, otherwise ValueError:
        ##Variable seq2seq/main/decode/memory_layer/kernel already exists, disallowed. Did you mean to set reuse=True in VarScope?
        ##Another way to solve this is to always use tile_batch'ed attention_states and state; the first step would then choose only from the first beam.
        ##That still would not fully solve the problem, since the fed data might be smaller than beam_size, so keeping attention_states at size 1 is safer.
        #cell2 = self.prepare_attention(tf.contrib.seq2seq.tile_batch(attention_states, beam_size), reuse=True)

        first_state = state

        beam_search_step = functools.partial(self.beam_search_step,
                                             beam_size=beam_size)

        #since generate_sequence_greedy was hacked in before this, scope.reuse_variables can not be set here
        #NOTICE: in order to use the lstm, which lives in the .../rnn/ namespace, you must also add this scope so the variables are shared
        with tf.variable_scope(self.scope) as scope:
            initial_attention, initial_state, initial_logprobs, initial_ids = \
                  beam_search_step(input, state, cell, input_text=input_text)

            if attention_states is not None:
                tf.add_to_collection(
                    'beam_search_initial_alignments',
                    tf.get_collection('attention_alignments')[-1])

            scope.reuse_variables()
            # In inference mode, use concatenated states for convenient feeding and
            # fetching.
            state_is_tuple = len(initial_state) == 2

            if state_is_tuple:
                initial_state = tf.concat(initial_state,
                                          1,
                                          name="initial_state")
                state_size = sum(self.cell.state_size)
            else:
                state_size = self.cell.state_size

            #output is used only when use attention
            if attention_states is not None:
                initial_state = tf.concat([initial_state, initial_attention],
                                          1,
                                          name="initial_attention_state")
                state_size += self.cell.output_size

            tf.add_to_collection('beam_search_initial_state', initial_state)
            tf.add_to_collection('beam_search_initial_logprobs',
                                 initial_logprobs)
            tf.add_to_collection('beam_search_initial_ids', initial_ids)
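
            # The placeholders below let an out-of-graph driver step the decoder: at each
            # step it feeds the previously chosen token ids (input_feed) and the concatenated
            # RNN state (state_feed), then fetches beam_search_state / logprobs / ids.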

            input_feed = tf.placeholder(
                dtype=tf.int64,
                shape=[None],  # batch_size
                name="input_feed")
            tf.add_to_collection('beam_search_input_feed', input_feed)
            input = tf.nn.embedding_lookup(emb, input_feed)

            # Placeholder for feeding a batch of concatenated states.
            state_feed = tf.placeholder(dtype=tf.float32,
                                        shape=[None, state_size],
                                        name="state_feed")
            tf.add_to_collection('beam_search_state_feed', state_feed)

            if attention_states is not None:
                state, attention = tf.split(state_feed, [
                    state_size - self.cell.output_size, self.cell.output_size
                ],
                                            axis=1)
            else:
                state = state_feed

            if state_is_tuple:
                state = tf.split(state, num_or_size_splits=2, axis=1)

            if attention_states is not None:
                state_ = first_state.clone(cell_state=state,
                                           attention=attention)
            else:
                state_ = state

            #--TODO: this is not safe if attention_wrapper changes; note the batch size of attention_states is 1
            #--while the cell input and state have batch size beam_size
            #attention, state, top_logprobs, top_ids = beam_search_step(input, state_, cell2)

            if input_text is not None and not FLAGS.decode_copy:
                input_text = tf.contrib.seq2seq.tile_batch(
                    input_text, melt.get_batch_size(input))

            attention, state, top_logprobs, top_ids = beam_search_step(
                input, state_, cell, input_text=input_text)

            if state_is_tuple:
                # Concatenate the resulting state.
                state = tf.concat(state, 1, name="state")
            if attention_states is not None:
                state = tf.concat([state, attention],
                                  1,
                                  name="attention_state")

            tf.add_to_collection('beam_search_state', state)
            tf.add_to_collection('beam_search_logprobs', top_logprobs)
            tf.add_to_collection('beam_search_ids', top_ids)

            #dummy return, just to match the (path list, score list) return signature
            return tf.no_op(), tf.no_op()
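
# A hedged out-of-graph driver sketch, not part of the original project. It only follows
# the single best candidate at every step (so it is greedy, not a full beam search), just
# to illustrate the feed/fetch protocol of the collections registered above. `build_feed`
# is a hypothetical helper that supplies the model's own input placeholders (e.g. the
# image feature feed), which must be provided every step since attention is recomputed.
import tensorflow as tf

def outgraph_decode_sketch(sess, build_feed, max_steps=20, done_token=0):
  g = tf.get_default_graph()
  init_state = g.get_collection('beam_search_initial_state')[-1]
  init_ids = g.get_collection('beam_search_initial_ids')[-1]
  input_feed = g.get_collection('beam_search_input_feed')[-1]
  state_feed = g.get_collection('beam_search_state_feed')[-1]
  next_state = g.get_collection('beam_search_state')[-1]
  next_ids = g.get_collection('beam_search_ids')[-1]

  # first step: run the initial ops, which consume the model's normal inputs
  state, ids = sess.run([init_state, init_ids], feed_dict=build_feed())
  path = [int(ids[0][0])]
  for _ in range(max_steps):
    feed = build_feed()
    feed[input_feed] = [path[-1]]   # last chosen token id
    feed[state_feed] = state        # concatenated state from the previous step
    state, ids = sess.run([next_state, next_ids], feed_dict=feed)
    path.append(int(ids[0][0]))
    if path[-1] == done_token:
      break
  return path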
Example #18
    def call(self,
             x,
             sequence_length=None,
             mask_fws=None,
             mask_bws=None,
             concat_layers=None,
             output_method=None,
             training=False):

        concat_layers = concat_layers or self.concat_layers
        output_method = output_method or self.output_method

        if self.residual_connect:
            x = self.residual_linear(x)

        outputs = [x]

        #states = []
        keep_prob = self.keep_prob
        num_units = self.num_units
        batch_size = melt.get_batch_size(x)

        if sequence_length is None:
            len_ = melt.get_shape(x, 1)
            sequence_length = tf.ones([
                batch_size,
            ], dtype=tf.int64) * len_

        for layer in range(self.num_layers):
            input_size_ = melt.get_shape(x,
                                         -1) if layer == 0 else 2 * num_units

            gru_fw, gru_bw = self.gru_fws[layer], self.gru_bws[layer]

            if self.train_init_state:
                #init_fw = tf.tile(self.init_fw[layer], [batch_size, 1])
                #init_fw = tf.tile(self.init_fw_layer(layer), [batch_size, 1])
                init_fw = self.init_fw_layer(layer, batch_size)
                if self.cell == 'lstm':
                    init_fw = (init_fw, self.init_fw2_layer(layer, batch_size))
            else:
                init_fw = None

            if self.recurrent_dropout:
                if mask_fws is not None:
                    mask_fw = mask_fws[layer]
                else:
                    if not self.share_dropout:
                        mask_fw = dropout(tf.ones([batch_size, 1, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          training=training,
                                          mode=None)
                    else:
                        if self.dropout_mask_fw[layer] is None or (
                                tf.executing_eagerly() and batch_size !=
                                self.dropout_mask_fw[layer].shape[0]):
                            mask_fw = dropout(
                                tf.ones([batch_size, 1, input_size_],
                                        dtype=tf.float32),
                                keep_prob=keep_prob,
                                training=training,
                                mode=None)
                            self.dropout_mask_fw[layer] = mask_fw
                        else:
                            mask_fw = self.dropout_mask_fw[layer]

                inputs_fw = outputs[-1] * mask_fw
            else:
                inputs_fw = dropout(outputs[-1],
                                    keep_prob=keep_prob,
                                    training=training,
                                    mode=None)

            # https://stackoverflow.com/questions/48233400/lstm-initial-state-from-dense-layer
            # gru and lstm differ: lstm needs a tuple of states as its input state
            if self.cell == 'gru':
                out_fw, state_fw = gru_fw(inputs_fw, init_fw)
            else:
                out_fw, state_fw1, state_fw2 = gru_fw(inputs_fw, init_fw)
                state_fw = (state_fw1, state_fw2)

            if self.train_init_state:
                #init_bw = tf.tile(self.init_bw[layer], [batch_size, 1])
                #init_bw = tf.tile(self.init_bw_layer(layer), [batch_size, 1])
                init_bw = self.init_bw_layer(layer, batch_size)
                if self.cell == 'lstm':
                    init_bw = (init_bw, self.init_bw2_layer(layer, batch_size))
            else:
                init_bw = None

            if mask_bws is not None:
                mask_bw = mask_bws[layer]
            else:
                if not self.share_dropout:
                    mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                              dtype=tf.float32),
                                      keep_prob=keep_prob,
                                      training=training,
                                      mode=None)
                else:
                    if self.dropout_mask_bw[layer] is None or (
                            tf.executing_eagerly() and batch_size !=
                            self.dropout_mask_bw[layer].shape[0]):
                        mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                                  dtype=tf.float32),
                                          keep_prob=keep_prob,
                                          training=training,
                                          mode=None)
                        self.dropout_mask_bw[layer] = mask_bw
                    else:
                        mask_bw = self.dropout_mask_bw[layer]

            if self.recurrent_dropout:
                inputs_bw = outputs[-1] * mask_bw
            else:
                if self.bw_dropout:
                    inputs_bw = dropout(outputs[-1],
                                        keep_prob=keep_prob,
                                        training=training,
                                        mode=None)
                else:
                    inputs_bw = inputs_fw

            inputs_bw = tf.reverse_sequence(inputs_bw,
                                            seq_lengths=sequence_length,
                                            seq_axis=1,
                                            batch_axis=0)

            if self.cell == 'gru':
                out_bw, state_bw = gru_bw(inputs_bw, init_bw)
            else:
                out_bw, state_bw1, state_bw2 = gru_bw(inputs_bw, init_bw)
                state_bw = (state_bw1, state_bw2)

            out_bw = tf.reverse_sequence(out_bw,
                                         seq_lengths=sequence_length,
                                         seq_axis=1,
                                         batch_axis=0)

            outputs.append(tf.concat([out_fw, out_bw], axis=2))
            if self.residual_connect:
                outputs[-1] = self.batch_norm(outputs[-2] + outputs[-1])

        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
        else:
            res = outputs[-1]

        res = encode_outputs(res,
                             output_method=output_method,
                             sequence_length=sequence_length)

        self.state = (state_fw, state_bw)
        if not self.return_state:
            return res
        else:
            return res, self.state
Example #19
    def sequence_loss(self,
                      sequence,
                      initial_state=None,
                      attention_states=None,
                      input=None,
                      input_text=None,
                      exact_prob=False,
                      exact_loss=False,
                      emb=None):
        """
    for general seq2seq input is None, sequence will pad <GO>, inital_state is last state from encoder
    for img2text/showandtell input is image_embedding, inital_state is None/zero set
    TODO since exact_porb and exact_loss same value, may remove exact_prob
    NOTICE! assume sequence to be padded by zero and must have one instance full length(no zero!)
    """
        if emb is None:
            emb = self.emb

        is_training = self.is_training
        batch_size = melt.get_batch_size(sequence)

        sequence, sequence_length = melt.pad(sequence,
                                             start_id=self.get_start_id(),
                                             end_id=self.get_end_id())

        #[batch_size, num_steps - 1, emb_dim], remove last col
        inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

        if is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        #inputs[batch_size, num_steps, emb_dim] input([batch_size, emb_dim] -> [batch_size, 1, emb_dim]) before concat
        if input is not None:
            #used like showandtell where image_emb is as input, additional to sequence
            inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
        else:
            #common usage: input is None and the sequence itself is the input; note <GO> was already padded above by melt.pad
            sequence_length -= 1
            sequence = sequence[:, 1:]

        if self.is_predict:
            #---only need when predict, since train input already dynamic length, NOTICE this will improve speed a lot
            num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            sequence = sequence[:, :num_steps]
            inputs = inputs[:, :num_steps, :]

        tf.add_to_collection('sequence', sequence)
        tf.add_to_collection('sequence_length', sequence_length)

        #[batch_size, num_steps]
        targets = sequence
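        # inputs are one step behind targets (teacher forcing): the input at step t predicts targets[:, t]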

        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        if FLAGS.gen_only:
            #gen only mode
            #with the attention wrapper, dynamic_rnn can not be used if alignment_history=True; TODO: pointer_network in application seems ok, why?
            outputs, state = tf.nn.dynamic_rnn(cell,
                                               inputs,
                                               initial_state=state,
                                               sequence_length=sequence_length,
                                               dtype=tf.float32,
                                               scope=self.scope)

            #--------the code below works but is slower than dynamic_rnn (3.4 -> 3.1 batch/s)
            #helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            ##helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            #my_decoder = melt.seq2seq.BasicTrainingDecoder(
            ##my_decoder = tf.contrib.seq2seq.BasicDecoder(
            ##my_decoder = melt.seq2seq.BasicDecoder(
            #      cell=cell,
            #      helper=helper,
            #      initial_state=state)
            ##outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            ##outputs = outputs.rnn_output
        else:
            #---copy only or gen copy
            helper = melt.seq2seq.TrainingHelper(inputs,
                                                 tf.to_int32(sequence_length))

            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn = lambda cell_output, cell_state: self.copy_output_fn(
                    indices, batch_size, cell_output, cell_state)
            else:
                #gen_copy for now, without using the switch
                sampled_values = None
                if self.softmax_loss_function is not None:
                    sampled_values = tf.nn.log_uniform_candidate_sampler(
                        true_classes=tf.reshape(targets, [-1, 1]),
                        num_true=1,
                        num_sampled=self.num_sampled,
                        unique=True,
                        range_max=self.vocab_size)
                    #TODO: the sampled version's perf is OK here so no change for now, but in addition to sampled_values,
                    #sampled_w and sampled_b could also be pre-looked-up embeddings; the improvement may be small
                output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(
                    time, indices, targets, sampled_values, batch_size,
                    cell_output, cell_state)

            my_decoder = melt.seq2seq.BasicTrainingDecoder(
                cell=cell,
                helper=helper,
                initial_state=state,
                vocab_size=self.vocab_size,
                output_fn=output_fn)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)

        tf.add_to_collection('outputs', outputs)

        #TODO: hack here, add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
        softmax_loss_function = self.softmax_loss_function
        if self.is_predict and (exact_prob or exact_loss):
            softmax_loss_function = None

        if not FLAGS.gen_only:
            logits = outputs
            softmax_loss_function = None
        elif softmax_loss_function is not None:
            logits = outputs
        else:
            #[batch_size, num_steps, num_units] * [num_units, vocab_size]
            # -> logits [batch_size, num_steps, vocab_size] (if using exact_predict_loss)
            #or [batch_size * num_steps, vocab_size] by default (flatten=True)
            keep_dims = exact_prob or exact_loss
            logits = melt.batch_matmul_embedding(
                outputs, self.w, keep_dims=keep_dims) + self.v
            if not keep_dims:
                targets = tf.reshape(targets, [-1])

        tf.add_to_collection('logits', logits)

        #if input_text is not None:
        #  logits = outputs

        mask = tf.cast(tf.sign(targets), dtype=tf.float32)

        if FLAGS.gen_copy_switch:
            #TODO why does this need more GPU memory? ... don't save logits? just compute the loss in output_fn?
            #batch size 256
            #File "/home/gezi/mine/hasky/util/melt/seq2seq/loss.py", line 154, in body
            #step_logits = logits[:, i, :]
            #ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[256,21,33470]
            num_steps = tf.shape(targets)[1]

            loss = melt.seq2seq.exact_predict_loss(logits,
                                                   targets,
                                                   mask,
                                                   num_steps,
                                                   need_softmax=False,
                                                   need_average=True,
                                                   batch_size=batch_size)

            # loss = melt.seq2seq.sequence_loss_by_example(
            #     logits,
            #     targets,
            #     weights=mask)
        elif self.is_predict and exact_prob:
            #generate real prob for sequence
            #for a 100k-vocab textsum seq2seq, roughly 20 -> 4
            loss = melt.seq2seq.exact_predict_loss(logits,
                                                   targets,
                                                   mask,
                                                   num_steps,
                                                   batch_size=batch_size)
        elif self.is_predict and exact_loss:
            #force non-sampled softmax loss; the difference from exact_prob is that here we just use the cross-entropy error as the result, not the real probability of the sequence
            #NOTICE this uses a bit less time, 55 vs 57 (prob), with the same result as exact prob and exact score
            #but a 256 vocab sample uses only about 10ms
            loss = melt.seq2seq.sequence_loss_by_example(logits,
                                                         targets,
                                                         weights=mask)
        else:
            #loss [batch_size,]
            loss = melt.seq2seq.sequence_loss_by_example(
                logits,
                targets,
                weights=mask,
                softmax_loss_function=softmax_loss_function)

        #mainly for compatibility with [batch_size, num_losses]
        loss = tf.reshape(loss, [-1, 1])

        if self.is_predict:
            loss = self.normalize_length(loss, sequence_length, exact_prob)
            #loss = tf.squeeze(loss)  TODO: uncomment later once all models are rerun
        return loss
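
# A minimal standalone NumPy sketch of what the masked per-example loss above
# computes: targets are zero-padded, mask = sign(targets), the step-wise cross
# entropy is zeroed on padding and summed (or averaged) per example. The melt
# helpers (sequence_loss_by_example / exact_predict_loss) are assumed to follow
# this logic; this is a reading aid, not the library code.
import numpy as np

def masked_sequence_loss(logits, targets, average_across_timesteps=True):
    # logits: [batch_size, num_steps, vocab_size]; targets: [batch_size, num_steps], 0 = PAD
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    batch_size, num_steps = targets.shape
    step_nll = -np.log(probs[np.arange(batch_size)[:, None],
                             np.arange(num_steps)[None, :],
                             targets])                          # [batch_size, num_steps]
    mask = np.sign(targets).astype(np.float32)                   # 1 for real tokens, 0 for PAD
    loss = (step_nll * mask).sum(axis=1)                         # [batch_size]
    if average_across_timesteps:
        loss /= np.maximum(mask.sum(axis=1), 1.0)
    return loss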
Example #20
    def build_predict_text_graph(self,
                                 image,
                                 decode_method='greedy',
                                 beam_size=5,
                                 convert_unk=False,
                                 length_normalization_factor=None,
                                 max_words=None,
                                 logprobs_history=False,
                                 alignment_history=False):
        scope = tf.get_variable_scope()
        if not FLAGS.showtell_noimage:
            with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
                attention_states, initial_state, image_emb = self._encode(
                    image)
        else:
            image_emb = tf.zeros([melt.get_batch_size(image), self.emb_dim])
            initial_state = None
            attention_states = None

        with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
            # max_words = max_words or TEXT_MAX_WORDS
            max_words = max_words or FLAGS.decoder_max_words
            decode_func = None
            if decode_method == SeqDecodeMethod.greedy:
                decode_func = self.decoder.generate_sequence_greedy
            elif decode_method == SeqDecodeMethod.multinomal:
                decode_func = self.decoder.generate_sequence_multinomial
            if decode_func is not None:
                results = decode_func(
                    image_emb,
                    max_words=max_words,
                    initial_state=initial_state,
                    attention_states=attention_states,
                    convert_unk=convert_unk,
                    need_logprobs=FLAGS.greedy_decode_with_logprobs)
            else:
                if decode_method == SeqDecodeMethod.ingraph_beam:
                    decode_func = self.decoder.generate_sequence_ingraph_beam
                elif decode_method == SeqDecodeMethod.outgraph_beam:
                    decode_func = self.decoder.generate_sequence_outgraph_beam
                else:
                    raise ValueError('not supported decode_method: %s' %
                                     decode_method)

                results = decode_func(
                    image_emb,
                    max_words=max_words,
                    initial_state=initial_state,
                    beam_size=beam_size,
                    convert_unk=convert_unk,
                    attention_states=attention_states,
                    length_normalization_factor=length_normalization_factor
                    or FLAGS.length_normalization_factor,
                    logprobs_history=logprobs_history,
                    alignment_history=alignment_history)
            if logprobs_history:
                if self.decoder.log_probs_history is not None:
                    tf.add_to_collection('decoder_logprobs_history',
                                         self.decoder.log_probs_history)
            if alignment_history:
                if self.decoder.alignment_history is not None:
                    tf.add_to_collection('decoder_alignment_history',
                                         self.decoder.alignment_history)
        return results
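
# A small sketch, assuming the predict graph above was built with
# logprobs_history=True / alignment_history=True: the histories are published
# through the graph collections named in build_predict_text_graph, so an
# inference script can fetch them by key. `get_decode_histories` is a
# hypothetical helper, not part of this codebase.
import tensorflow as tf

def get_decode_histories(graph=None):
    graph = graph or tf.get_default_graph()
    logprobs = graph.get_collection('decoder_logprobs_history')
    alignments = graph.get_collection('decoder_alignment_history')
    # each is a (possibly empty) list of the tensors added via tf.add_to_collection above
    return logprobs, alignments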
Example #21
    def sequence_loss(self,
                      sequence,
                      initial_state=None,
                      attention_states=None,
                      input=None,
                      input_text=None,
                      exact_prob=False,
                      exact_loss=False,
                      emb=None):
        """
    for general seq2seq: input is None, the sequence will be padded with <GO>, and initial_state is the last state from the encoder
    for show-and-tell: input is the image_embedding and initial_state is None/zero; if using im2txt mode, set image_as_init_state=True, which behaves as above and needs the <GO> pad!
    TODO since exact_prob and exact_loss give the same value, exact_prob may be removed
    NOTICE! assumes the sequence is zero-padded and that at least one instance has full length (no zero!)
    """
        if emb is None:
            emb = self.emb

        is_training = self.is_training
        batch_size = melt.get_batch_size(sequence)

        sequence, sequence_length = melt.pad(sequence,
                                             start_id=self.get_start_id(),
                                             end_id=self.get_end_id())

        #[batch_size, num_steps - 1, emb_dim], remove last col
        inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

        if is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        #inputs: [batch_size, num_steps, emb_dim]; input is expanded from [batch_size, emb_dim] to [batch_size, 1, emb_dim] before the concat
        if input is not None:
            #used like show-and-tell, where image_emb is the first input in addition to the sequence
            inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
        else:
            #common usage: input is None and the sequence itself is the input; note <GO> was already padded above by melt.pad
            sequence_length -= 1
            sequence = sequence[:, 1:]

        if self.is_predict:
            #---only needed for prediction, since training input already has dynamic length; NOTICE this improves speed a lot
            num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            sequence = sequence[:, :num_steps]
            inputs = inputs[:, :num_steps, :]

        tf.add_to_collection('sequence', sequence)
        tf.add_to_collection('sequence_length', sequence_length)

        #[batch_size, num_steps]
        targets = sequence

        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        #TODO: hack here, add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
        softmax_loss_function = self.softmax_loss_function
        if self.is_predict and (exact_prob or exact_loss):
            softmax_loss_function = None

        scheduled_sampling_probability = FLAGS.scheduled_sampling_probability if self.is_training else 0.
        if FLAGS.gen_only:
            #gen only mode
            #for the attention wrapper, dynamic_rnn cannot be used if alignment_history=True; TODO pointer_network in applications seems OK, why?
            if scheduled_sampling_probability > 0.:
                helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs, tf.to_int32(sequence_length), emb,
                    tf.constant(FLAGS.scheduled_sampling_probability))
                #helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
                my_decoder = melt.seq2seq.BasicDecoder(
                    #my_decoder = tf.contrib.seq2seq.BasicDecoder(
                    #my_decoder = melt.seq2seq.BasicDecoder(
                    cell=cell,
                    helper=helper,
                    initial_state=state)
                outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                    my_decoder, scope=self.scope)
                #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
                outputs = outputs.rnn_output
            else:
                outputs, state = tf.nn.dynamic_rnn(
                    cell,
                    inputs,
                    initial_state=state,
                    sequence_length=sequence_length,
                    dtype=tf.float32,
                    scope=self.scope)

            #--------the code below also works but is slower than dynamic_rnn (3.4 -> 3.1 batch/s)
            #helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            ##helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            #my_decoder = melt.seq2seq.BasicTrainingDecoder(
            ##my_decoder = tf.contrib.seq2seq.BasicDecoder(
            ##my_decoder = melt.seq2seq.BasicDecoder(
            #      cell=cell,
            #      helper=helper,
            #      initial_state=state)
            ##outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            ##outputs = outputs.rnn_output
        else:
            #---copy only or gen copy
            if scheduled_sampling_probability > 0.:
                #not tested yet TODO
                helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs, tf.to_int32(sequence_length), emb,
                    tf.constant(FLAGS.scheduled_sampling_probability))
                Decoder_ = melt.seq2seq.BasicDecoder
            else:
                #as before
                helper = melt.seq2seq.TrainingHelper(
                    inputs, tf.to_int32(sequence_length))
                Decoder_ = melt.seq2seq.BasicTrainingDecoder

            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn = lambda cell_output, cell_state: self.copy_output_fn(
                    indices, batch_size, cell_output, cell_state)
            else:
                #gen_copy for now; not using the switch? or gen_copy plus switch?
                sampled_values = None
                #TODO CHECK: is this OK? why are train and predict not equal while score/exact score are the same? FIXME
                #first need to debug why score and exact score are the same; score should be the same as train! TODO
                #run sh ./inference/infrence-score.sh to reproduce
                #for now just set num_sampled = 0 to be safe; training here may also be incorrect FIXME
                if softmax_loss_function is not None:
                    sampled_values = tf.nn.log_uniform_candidate_sampler(
                        true_classes=tf.reshape(targets, [-1, 1]),
                        num_true=1,
                        num_sampled=self.num_sampled,
                        unique=True,
                        range_max=self.vocab_size)
                    #TODO: the sampled version's perf is OK here so no change for now, but in addition to sampled_values,
                    #sampled_w and sampled_b could also be pre-looked-up embeddings; the improvement may be small
                output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(
                    time, indices, targets, sampled_values, batch_size,
                    cell_output, cell_state)

            my_decoder = Decoder_(cell=cell,
                                  helper=helper,
                                  initial_state=state,
                                  vocab_size=self.vocab_size,
                                  output_fn=output_fn)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            if hasattr(outputs, 'rnn_output'):
                outputs = outputs.rnn_output

        tf.add_to_collection('outputs', outputs)

        if not FLAGS.gen_only:
            logits = outputs
            softmax_loss_function = None
        elif softmax_loss_function is not None:
            logits = outputs
        else:
            #--softmax_loss_function is None means num_sampled = 0 or exact_loss or exact_prob
            #[batch_size, num_steps, num_units] * [num_units, vocab_size]
            # -> logits [batch_size, num_steps, vocab_size] (if using exact_predict_loss)
            #or [batch_size * num_steps, vocab_size] by default (flatten=True)
            #this is fine for training [batch_size * num_steps] but not good for eval, since we want
            #the score of each instance, and it is also not good for predict
            #--------only training mode does not keep dims, but this is dangerous: a class calling rnn_decoder
            #must manually set rnn_decoder.is_training=False! TODO otherwise eval mode will show incorrect scores,
            #though the final model is not affected!
            keep_dims = exact_prob or exact_loss or (not self.is_training)
            logits = melt.batch_matmul_embedding(
                outputs, self.w, keep_dims=keep_dims) + self.v
            if not keep_dims:
                targets = tf.reshape(targets, [-1])

        tf.add_to_collection('logits', logits)

        mask = tf.cast(tf.sign(targets), dtype=tf.float32)

        if FLAGS.gen_copy_switch and FLAGS.switch_after_softmax:
            #TODO why does this need more GPU memory? ... don't save logits? just compute the loss in output_fn?
            #batch size 256
            #File "/home/gezi/mine/hasky/util/melt/seq2seq/loss.py", line 154, in body
            #step_logits = logits[:, i, :]
            #ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[256,21,33470]
            num_steps = tf.shape(targets)[1]

            loss = melt.seq2seq.exact_predict_loss(
                logits,
                targets,
                mask,
                num_steps,
                need_softmax=False,
                average_across_timesteps=not self.is_predict,
                batch_size=batch_size)
        elif self.is_predict and exact_prob:
            #generate real prob for sequence
            #for a 100k-vocab textsum seq2seq, roughly 20 -> 4
            loss = melt.seq2seq.exact_predict_loss(
                logits,
                targets,
                mask,
                num_steps,
                batch_size=batch_size,
                average_across_timesteps=False)
        elif self.is_predict and exact_loss:
            #force non-sampled softmax loss; the difference from exact_prob is that here we just use the cross-entropy error as the result, not the real probability of the sequence
            #NOTICE this uses a bit less time, 55 vs 57 (prob), with the same result as exact prob and exact score
            #but a 256 vocab sample uses only about 10ms
            loss = melt.seq2seq.sequence_loss_by_example(
                logits, targets, weights=mask, average_across_timesteps=False)
        else:
            #loss [batch_size,]
            loss = melt.seq2seq.sequence_loss_by_example(
                logits,
                targets,
                weights=mask,
                average_across_timesteps=not self.is_predict,  #training must average, otherwise long sentences get a large loss
                softmax_loss_function=softmax_loss_function)

        #mainly for compatibility with [batch_size, num_losses]; here it may be [batch_size * num_steps,] if is_training and not exact loss/prob
        loss = tf.reshape(loss, [-1, 1])

        self.ori_loss = loss
        if self.is_predict:
            #note: use avg_loss so as not to change the loss pointer; avg_loss equals average_across_timesteps=True when length_normalization_factor=1.0
            avg_loss = self.normalize_length(loss, sequence_length)
            return avg_loss

        #if not is_predict the loss is averaged per time step; otherwise it is not, but avg_loss averages it
        return loss
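
# A hedged sketch (NumPy, not melt's normalize_length) of the length
# normalization implied by the comments above: divide the per-example summed
# loss by sequence_length ** factor, so factor=1.0 matches averaging across
# timesteps and factor=0.0 leaves the summed loss unchanged.
import numpy as np

def normalize_length(summed_loss, sequence_length, factor=1.0):
    # summed_loss: [batch_size, 1]; sequence_length: [batch_size] valid steps per example
    norm = np.power(sequence_length.astype(np.float32), factor)[:, None]
    return summed_loss / np.maximum(norm, 1.0)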
Example #22
    def generate_sequence(self,
                          input,
                          max_words,
                          initial_state=None,
                          attention_states=None,
                          convert_unk=True,
                          input_text=None,
                          Helper=None,
                          emb=None):
        """
    this one uses the greedy search method
    for beam search use generate_sequence_by_beam_search with additional params like beam_size
    """
        if emb is None:
            emb = self.emb

        batch_size = melt.get_batch_size(input)
        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        need_logprobs = FLAGS.greedy_decode_with_logprobs
        if Helper is None:
            if not need_logprobs:
                helper = melt.seq2seq.GreedyEmbeddingHelper(
                    embedding=emb, first_input=input, end_token=self.end_id)
            else:
                helper = melt.seq2seq.LogProbsGreedyEmbeddingHelper(
                    embedding=emb,
                    first_input=input,
                    end_token=self.end_id,
                    need_softmax=self.need_softmax)
        else:
            helper = melt.seq2seq.MultinomialEmbeddingHelper(
                embedding=emb,
                first_input=input,
                end_token=self.end_id,
                need_softmax=self.need_softmax)

        if FLAGS.gen_only:
            output_fn = self.output_fn
        else:
            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn_ = self.copy_output_fn
            else:
                output_fn_ = self.gen_copy_output_fn
            output_fn = lambda cell_output, cell_state: output_fn_(
                indices, batch_size, cell_output, cell_state)

        Decoder = melt.seq2seq.BasicDecoder if not need_logprobs else melt.seq2seq.LogProbsDecoder
        my_decoder = Decoder(cell=cell,
                             helper=helper,
                             initial_state=state,
                             vocab_size=self.vocab_size,
                             output_fn=output_fn)

        outputs, final_state, sequence_length = melt.seq2seq.dynamic_decode(
            my_decoder,
            maximum_iterations=max_words,
            #MUST be set to True, otherwise tokens past the done/end token will not be zeroed out and will be summed up
            impute_finished=True,
            scope=self.scope)
        sequence = outputs.sample_id
        if not hasattr(final_state, 'log_probs'):
            score = tf.zeros([
                batch_size,
            ])
        else:
            score = self.normalize_length(final_state.log_probs,
                                          sequence_length,
                                          reshape=False)
            ##below can be verified to be the same
            # num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            # score2 = -melt.seq2seq.exact_predict_loss(outputs.rnn_output, sequence, tf.to_float(tf.sign(sequence)),
            #                                       num_steps, need_softmax=True, average_across_timesteps=False)
            # score2 = self.normalize_length(score2, sequence_length, reshape=False)
            # score -= score2

            #score = tf.exp(score)
            #score = tf.concat([tf.expand_dims(score, 1), outputs.log_probs], 1)
            if FLAGS.predict_use_prob:
                score = tf.exp(score)
            tf.add_to_collection('greedy_log_probs_list', outputs.log_probs)

        #------like beam search return sequence, score
        return sequence, score
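
# A standalone sketch of the greedy loop the helper/decoder above run in-graph:
# take the argmax token each step, accumulate its log-prob, feed its embedding
# back in, and stop at end_id. `step_fn` and its signature are assumptions for
# illustration; melt does this with GreedyEmbeddingHelper + dynamic_decode.
import numpy as np

def greedy_decode(step_fn, start_input, state, embedding, end_id, max_words):
    # step_fn(inp, state) -> (logits [vocab_size], new_state)
    tokens, log_prob, inp = [], 0.0, start_input
    for _ in range(max_words):
        logits, state = step_fn(inp, state)
        probs = np.exp(logits - logits.max())
        probs /= probs.sum()
        token = int(np.argmax(probs))
        tokens.append(token)
        log_prob += np.log(probs[token])
        if token == end_id:
            break
        inp = embedding[token]
    return tokens, log_prob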
Example #23
    def call(self,
             inputs,
             sequence_length,
             inputs2,
             sequence_length2,
             mask_fws,
             mask_bws,
             concat_layers=True,
             output_method=OutputMethod.all,
             training=False):

        outputs = [inputs]
        outputs2 = [inputs2]

        keep_prob = self.keep_prob
        num_units = self.num_units
        batch_size = melt.get_batch_size(inputs)

        for layer in range(self.num_layers):
            input_size_ = melt.get_shape(inputs,
                                         -1) if layer == 0 else 2 * num_units

            gru_fw, gru_bw = self.gru_fws[layer], self.gru_bws[layer]

            if self.train_init_state:
                init_fw = self.init_fw_layer(layer, batch_size)
            else:
                init_fw = None

            mask_fw = mask_fws[layer]
            out_fw, state_fw = gru_fw(outputs[-1] * mask_fw, init_fw)
            out_fw2, state_fw2 = gru_fw(outputs2[-1] * mask_fw, state_fw)

            mask_bw = mask_bws[layer]
            inputs_bw = tf.reverse_sequence(
                outputs[-1] * mask_bw,
                seq_lengths=sequence_length,
                seq_axis=1,
                batch_axis=0)
            inputs_bw2 = tf.reverse_sequence(
                outputs2[-1] * mask_bw,
                seq_lengths=sequence_length2,
                seq_axis=1,
                batch_axis=0)

            if self.train_init_state:
                init_bw = self.init_bw_layer(layer, batch_size)
            else:
                init_bw = None

            out_bw, state_bw = gru_bw(inputs_bw, init_bw)
            out_bw2, state_bw2 = gru_bw(inputs_bw2, state_bw)

            outputs.append(tf.concat([out_fw, out_bw], axis=2))
            outputs2.append(tf.concat([out_fw2, out_bw2], axis=2))

        if concat_layers:
            res = tf.concat(outputs[1:], axis=2)
            res2 = tf.concat(outputs2[1:], axis=2)
        else:
            res = outputs[-1]
            res2 = outputs2[-1]

        res = tf.concat([res, res2], axis=1)

        res = encode_outputs(res,
                             output_method=output_method,
                             sequence_length=sequence_length)

        self.state = (state_fw2, state_bw2)
        return res
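
# A minimal sketch of the reverse_sequence pattern used above (plain TF 1.x,
# no melt): reverse only the valid prefix of each row before feeding the
# backward GRU; seq_lengths holds the per-example number of valid steps.
import tensorflow as tf

def reverse_for_backward_pass(inputs, sequence_length):
    # inputs: [batch_size, num_steps, dim]; sequence_length: [batch_size]
    return tf.reverse_sequence(inputs,
                               seq_lengths=sequence_length,
                               seq_axis=1,
                               batch_axis=0)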
Example #24
    def generate_sequence_beam(self,
                               input,
                               max_words,
                               initial_state=None,
                               attention_states=None,
                               beam_size=5,
                               convert_unk=True,
                               length_normalization_factor=0.,
                               input_text=None,
                               input_text_length=None,
                               emb=None):
        """
    beam decode means in-graph beam search
    return top (path, score)
    """
        if emb is None:
            emb = self.emb

        def loop_function(i, prev, state, decoder):
            prev, state = decoder.take_step(i, prev, state)
            next_input = tf.nn.embedding_lookup(emb, prev)
            return next_input, state

        batch_size = melt.get_batch_size(input)

        if initial_state is not None:
            initial_state = nest.map_structure(
                lambda x: tf.contrib.seq2seq.tile_batch(x, beam_size),
                initial_state)
        if attention_states is None:
            cell = self.cell
        else:
            attention_states = tf.contrib.seq2seq.tile_batch(
                attention_states, beam_size)
            #print('tiled_attention_states', attention_states, 'tiled_initial_state', initial_state)
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None

        state = cell.zero_state(batch_size * beam_size, tf.float32) \
                  if initial_state is None else initial_state

        if FLAGS.gen_only:
            output_fn = self.output_fn
        else:
            input_text = tf.contrib.seq2seq.tile_batch(input_text, beam_size)
            batch_size = batch_size * beam_size
            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn_ = self.copy_output_fn
            else:
                output_fn_ = self.gen_copy_output_fn
            output_fn = lambda cell_output, cell_state: output_fn_(
                indices, batch_size, cell_output, cell_state)

        ##TODO to be safe make topn the same as beam size
        return melt.seq2seq.beam_decode(
            input,
            max_words,
            state,
            cell,
            loop_function,
            scope=self.scope,
            beam_size=beam_size,
            done_token=vocabulary.vocab.end_id(),
            output_fn=output_fn,
            length_normalization_factor=length_normalization_factor,
            topn=beam_size)
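
# A hedged sketch (NumPy, not melt.seq2seq.beam_decode) of how the
# length_normalization_factor passed above is typically applied when ranking
# finished beams: each hypothesis' summed log-prob is divided by
# length ** factor, so factor=0.0 ranks by raw summed log-prob while larger
# factors reduce the usual bias toward short hypotheses.
import numpy as np

def rank_beams(log_probs, lengths, length_normalization_factor=0.0):
    # log_probs: [num_beams] summed log-probabilities; lengths: [num_beams]
    norm = np.power(lengths.astype(np.float32), length_normalization_factor)
    scores = log_probs / np.maximum(norm, 1.0)
    order = np.argsort(-scores)          # best hypothesis first
    return order, scores[order]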