Example #1
0
def pointer_decoder(encoder_inputs_emb,
                    decoder_inputs,
                    initial_state,
                    attention_states,
                    cell,
                    feed_prev=True,
                    dtype=dtypes.float32,
                    scope=None):
    encoder_inputs = encoder_inputs_emb
    #attn_length = attention_states.get_shape()[1].value
    #attn_size = attention_states.get_shape()[2].value
    attn_length = shape(attention_states, 1)
    attn_size = shape(attention_states, 2)
    with tf.name_scope('attention_setup'):
        attnw = tf.get_variable("AttnW", [1, attn_size, attn_size])
        attention_states = tf.nn.conv1d(attention_states, attnw, 1, 'SAME')
        attnv = tf.get_variable("AttnV", [attn_size])
    sys.stdout = sys.stderr

    def attention_weight(output):
        y = _linear(output, attn_size, True)
        y = tf.reshape(y, [-1, 1, attn_size])
        # Calculate attention weights for every encoder input by taking the inner product between the weight vector (attnv) and the decoder's state combined with the encoder's output.
        attention_vectors = tf.nn.softmax(
            tf.reduce_sum(attnv * tf.tanh(y + attention_states), axis=2))
        return attention_vectors

    states = [initial_state]
    outputs = []
    pointed_idxs = []

    for i, d in enumerate(tf.unstack(decoder_inputs, axis=1)):
        with tf.name_scope('Decode_%d' % i):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            pointed_idx = d
            # In testing, the inputs to the decoder won't be used except for the first one.
            if feed_prev and i > 0:
                # Take the argmax; the pointed index is converted to one-hot below and used to gather the pointed encoder_inputs via multiply and reduce_sum.
                pointed_idx = tf.argmax(output, axis=1, output_type=tf.int32)
            pointed_idxs.append(pointed_idx)
            with tf.name_scope('copy_from_encoder_inputs'):
                pointed_idx = tf.reshape(
                    tf.one_hot(pointed_idx, depth=attn_length),
                    [-1, attn_length, 1])
                inp = tf.reduce_sum(encoder_inputs * pointed_idx, axis=1)
                inp = tf.stop_gradient(inp)
            output, state = cell(inp, states[-1])
            with tf.name_scope('attention_weight'):
                output = attention_weight(output)
            states.append(state)
            outputs.append(output)
    outputs = tf.stack(outputs, axis=1)
    states = tf.stack(states, axis=1)
    return outputs, states, pointed_idxs
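
# The copy step in pointer_decoder selects the pointed input embedding with purely
# differentiable ops (one-hot, multiply, reduce_sum). A minimal NumPy sketch of that
# selection with toy shapes (not part of the original code):
import numpy as np

batch_size, attn_length, emb_size = 2, 4, 3
encoder_inputs = np.arange(batch_size * attn_length * emb_size,
                           dtype=np.float32).reshape(batch_size, attn_length, emb_size)
pointed_idx = np.array([2, 0])  # one pointed position per batch element

# One-hot over input positions, shaped [batch, attn_length, 1] for broadcasting.
one_hot = np.eye(attn_length, dtype=np.float32)[pointed_idx][:, :, None]

# Multiplying and summing over the position axis picks out the pointed embedding.
inp = np.sum(encoder_inputs * one_hot, axis=1)  # [batch, emb_size]
assert np.allclose(inp, encoder_inputs[np.arange(batch_size), pointed_idx])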
Example #2
0
    def setup_placeholder(self, config):
        '''
    Prepare the tf.placeholders and their lengths.
    They are kept as instance variables.
    '''
        self.e_inputs_w_ph = tf.placeholder(tf.int32, [None, None],
                                            name="EncoderInputWords")
        self.e_inputs_c_ph = tf.placeholder(tf.int32, [None, None, None],
                                            name="EncoderInputChars")
        #self.d_outputs_ph = tf.placeholder(
        #  tf.int32, [None, None], name="DecoderOutput")
        self.d_outputs_ph = self.e_inputs_w_ph

        self.is_training = tf.placeholder(tf.bool, [], name='is_training')
        with tf.name_scope('keep_prob'):
            self.keep_prob = 1.0 - tf.to_float(
                self.is_training) * config.dropout_rate

        with tf.name_scope('batch_size'):
            self.batch_size = batch_size = shape(self.d_outputs_ph, 0)
        with tf.name_scope('start_tokens'):
            self.start_tokens = tf.tile(tf.constant([BOS_ID], dtype=tf.int32),
                                        [batch_size])
        with tf.name_scope('end_tokens'):
            self.end_token = PAD_ID
            end_tokens = tf.tile(tf.constant([self.end_token], dtype=tf.int32),
                                 [batch_size])
        # Count the length of each dialogue, utterance, (word).
        with tf.name_scope('utterance_length'):
            self.uttr_lengths = tf.count_nonzero(self.e_inputs_w_ph,
                                                 axis=1,
                                                 dtype=tf.int32)
        '''
    # Example of the decoder's inputs and outputs.
    Given an input ['how', 'are', 'you', '?'] to the decoder's placeholder:
    - decoder's input : ['_BOS', 'how', 'are', 'you', '?']
    - decoder's output (target) : ['how', 'are', 'you', '?', '_PAD']
    - target_length: 5
    - target_weights: [1, 1, 1, 1, 1]
    Here, the token _PAD behaves as EOS.
    '''
        with tf.name_scope('decoder_inputs'):
            self.decoder_inputs = tf.concat(
                [tf.expand_dims(self.start_tokens, 1), self.d_outputs_ph],
                axis=1)
        # The length of the decoder's inputs/outputs is increased by 1 because of the BOS or EOS token.
        with tf.name_scope('target_lengths'):
            self.target_length = tf.count_nonzero(
                self.d_outputs_ph, axis=1, dtype=tf.int32) + 1
        with tf.name_scope('target_weights'):
            self.target_weights = tf.sequence_mask(self.target_length,
                                                   dtype=tf.float32)

        with tf.name_scope('targets'):
            self.targets = tf.concat(
                [self.d_outputs_ph,
                 tf.expand_dims(end_tokens, 1)],
                axis=1)[:, :shape(self.target_weights, 1)]
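
# A NumPy illustration of the BOS/EOS bookkeeping described in the docstring above
# (BOS_ID and PAD_ID are the project's constants; the numeric ids here are assumed
# for illustration only):
import numpy as np

BOS_ID, PAD_ID = 1, 0
d_outputs = np.array([[7, 8, 9, 10, 0]], dtype=np.int32)  # "how are you ?" plus one pad

decoder_inputs = np.concatenate(
    [np.full((1, 1), BOS_ID, np.int32), d_outputs], axis=1)  # _BOS how are you ? _PAD
target_length = np.count_nonzero(d_outputs, axis=1) + 1      # 4 words + EOS -> [5]

max_len = target_length.max()
target_weights = (np.arange(max_len) < target_length[:, None]).astype(np.float32)  # [[1 1 1 1 1]]
targets = np.concatenate(
    [d_outputs, np.full((1, 1), PAD_ID, np.int32)], axis=1)[:, :max_len]  # how are you ? _PAD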
Example #3
0
 def encode(self, inputs, sequence_length):
     with tf.variable_scope(self.shared_scope or "CNNEncoder"):
         target_rank = 3  # [*, max_sequence_length, hidden_size]
         flattened_inputs, prev_shape = flatten(inputs, target_rank)
         flattened_aggregated_outputs = cnn(flattened_inputs,
                                            activation=self.activation)
         target_shape = prev_shape[:-2] + [
             shape(flattened_aggregated_outputs, -1)
         ]
         outputs = tf.reshape(flattened_aggregated_outputs, target_shape)
     outputs = tf.nn.dropout(outputs, self.keep_prob)
     return outputs, outputs
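
# flatten() and shape() are project helpers not shown here. The NumPy sketch below only
# illustrates the pattern the encoder relies on: collapse all leading dimensions so a
# rank-3 op can run, then restore them around the op's new feature size (toy shapes):
import numpy as np

inputs = np.zeros([2, 5, 7, 16], np.float32)  # e.g. [batch, num_uttr, max_seq_len, hidden]
target_rank = 3

prev_shape = list(inputs.shape)
flattened = inputs.reshape([-1] + prev_shape[-(target_rank - 1):])  # [10, 7, 16]

aggregated = flattened.mean(axis=1)  # stand-in for cnn(); [10, output_size]
target_shape = prev_shape[:-2] + [aggregated.shape[-1]]
outputs = aggregated.reshape(target_shape)  # [2, 5, output_size]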
Example #4
0
    def setup_decoder_states(self,
                             config,
                             encoder_outputs,
                             encoder_state,
                             scope=None):
        attention_states = encoder_outputs

        response_emb = tf.nn.embedding_lookup(self.w_embeddings,
                                              self.d_outputs_ph)
        response_lengths = tf.count_nonzero(self.d_outputs_ph,
                                            axis=1,
                                            dtype=tf.int32)
        print('encoder_state', encoder_state)
        print('encoder_outputs', encoder_outputs)

        _, h_future = self.uttr_encoder.encode(response_emb, response_lengths)
        print('h_future', h_future)

        def _get_distribution(state, output_size):
            h = state
            num_layers = 1
            for i in range(num_layers):
                with tf.variable_scope('linear%d' % i) as scope:
                    h = linear(h, output_size, scope=scope)
            with tf.variable_scope('Mean'):
                mean = linear(h, output_size, activation=None)
            with tf.variable_scope('Var'):
                var = linear(h, output_size, activation=tf.nn.softplus)
            return tfd.MultivariateNormalDiag(mean, var)

        output_size = shape(encoder_state, -1)
        with tf.variable_scope('Prior'):
            self.prior = _get_distribution(encoder_state, output_size)
        with tf.variable_scope('Posterior'):
            self.posterior = _get_distribution(
                tf.concat([encoder_state, h_future], axis=-1), output_size)

        train_decoder_state = tf.concat(
            [encoder_state, self.posterior.sample()], axis=-1)
        test_decoder_state = tf.concat(
            [encoder_state, self.prior.sample()], axis=-1)
        #train_decoder_state = encoder_state + self.posterior.sample()
        #test_decoder_state = encoder_state + self.prior.sample()
        print(train_decoder_state)
        print(test_decoder_state)
        return train_decoder_state, test_decoder_state, attention_states
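
# Note that tfd.MultivariateNormalDiag takes (loc, scale_diag), so the softplus output
# from the 'Var' branch above is used as a per-dimension scale rather than a variance.
# A minimal sampling sketch with toy shapes (the tensorflow_probability import is an
# assumption; the original may bind tfd differently):
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

batch_size, state_size = 4, 8
mean = tf.zeros([batch_size, state_size])
scale = tf.nn.softplus(tf.random_normal([batch_size, state_size]))  # strictly positive

dist = tfd.MultivariateNormalDiag(mean, scale)  # (loc, scale_diag)
z = dist.sample()                                # one latent vector per batch element
decoder_state = tf.concat([mean, z], axis=-1)    # mirrors concat([encoder_state, sample]) above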
Example #5
0
    def __init__(self, sess, conf, vocab):
        ModelBase.__init__(self, sess, conf)
        self.vocab = vocab
        input_max_len, output_max_len = None, conf.output_max_len
        self.is_training = tf.placeholder(tf.bool, [], name='is_training')
        with tf.name_scope('keep_prob'):
            self.keep_prob = 1.0 - tf.to_float(
                self.is_training) * conf.dropout_rate

        with tf.name_scope('EncoderInput'):
            self.e_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                              name="EncoderInput")

        with tf.name_scope('batch_size'):
            batch_size = shape(self.e_inputs_ph, 0)

        with tf.variable_scope('Embeddings') as scope:
            self.w_embeddings = self.initialize_embeddings(
                'Word',
                vocab.embeddings.shape,
                initializer=tf.constant_initializer(vocab.embeddings),
                trainable=conf.train_embedding)
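
# The keep_prob expression above means "drop out only while training": with
# dropout_rate = 0.2 it evaluates to 0.8 when is_training is True and to 1.0 when False.
# The same arithmetic in plain Python:
dropout_rate = 0.2
for is_training in (True, False):
    keep_prob = 1.0 - float(is_training) * dropout_rate
    print(is_training, keep_prob)  # True 0.8 / False 1.0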
Example #6
0
    def __init__(self, sess, conf, vocab):
        ModelBase.__init__(self, sess, conf)
        self.vocab = vocab
        input_max_len, output_max_len = None, conf.output_max_len
        self.is_training = tf.placeholder(tf.bool, [], name='is_training')
        with tf.name_scope('keep_prob'):
            self.keep_prob = 1.0 - tf.to_float(
                self.is_training) * conf.dropout_rate

        # <Sample input>
        # e_inputs: [1, 40, 44, 0, 0], d_outputs: [2, 0, 0] (target=44)
        with tf.name_scope('EncoderInput'):
            self.e_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                              name="EncoderInput")

        with tf.name_scope('batch_size'):
            batch_size = shape(self.e_inputs_ph, 0)

        with tf.variable_scope('Embeddings') as scope:
            w_embeddings = self.initialize_embeddings(
                'Word',
                vocab.embeddings.shape,
                initializer=tf.constant_initializer(vocab.embeddings),
                trainable=conf.train_embedding)

        with tf.variable_scope('WordEncoder') as scope:
            word_encoder = WordEncoder(conf,
                                       w_embeddings,
                                       self.keep_prob,
                                       shared_scope=scope)
            e_inputs_emb = word_encoder.encode([self.e_inputs_ph])

        with tf.variable_scope('SentEncoder') as scope:
            sent_encoder = SentenceEncoder(conf,
                                           self.keep_prob,
                                           shared_scope=scope)
            e_inputs_length = tf.count_nonzero(self.e_inputs_ph, axis=1)
            e_outputs, e_state = sent_encoder.encode(e_inputs_emb,
                                                     e_inputs_length)
            attention_states = e_outputs

        self.d_outputs_ph = []
        self.losses = []
        self.greedy_predictions = []
        self.copied_inputs = []
        for i, col_name in enumerate(conf.target_columns):
            with tf.name_scope('DecoderOutput%d' % i):
                d_outputs_ph = tf.placeholder(tf.int32, [None, output_max_len],
                                              name="DecoderOutput")

            ds_name = 'Decoder' if conf.share_decoder else 'Decoder%d' % i
            with tf.variable_scope(ds_name) as scope:
                d_cell = setup_cell(conf.cell_type,
                                    conf.rnn_size,
                                    conf.num_layers,
                                    keep_prob=self.keep_prob)
                teacher_forcing = conf.teacher_forcing if 'teacher_forcing' in conf else False
                d_outputs, predictions, copied_inputs = setup_decoder(
                    d_outputs_ph,
                    e_inputs_emb,
                    e_state,
                    attention_states,
                    d_cell,
                    batch_size,
                    output_max_len,
                    scope=scope,
                    teacher_forcing=teacher_forcing)
                self.copied_inputs.append(copied_inputs)
                d_outputs_length = tf.count_nonzero(d_outputs_ph,
                                                    axis=1,
                                                    name='outputs_length')
                with tf.name_scope('add_eos'):
                    targets = tf.concat([
                        d_outputs_ph,
                        tf.zeros([batch_size, 1], dtype=tf.int32)
                    ],
                                        axis=1)

                # The length of the outputs is also increased by 1 because of the EOS token.
                with tf.name_scope('output_weights'):
                    d_outputs_weights = tf.sequence_mask(
                        d_outputs_length + 1,
                        maxlen=shape(d_outputs_ph, 1) + 1,
                        dtype=tf.float32)
                with tf.name_scope('loss%d' % i):
                    loss = tf.contrib.seq2seq.sequence_loss(
                        d_outputs, targets, d_outputs_weights)
            self.d_outputs_ph.append(d_outputs_ph)
            self.losses.append(loss)
            self.greedy_predictions.append(predictions)
        with tf.name_scope('Loss'):
            self.loss = tf.reduce_mean(self.losses)
        self.updates = self.get_updates(self.loss)
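
# A hypothetical training-step sketch for this model: the placeholder names come from
# the constructor above, but `model`, `sess`, and the toy batches are assumptions.
import numpy as np

feed_dict = {
    model.e_inputs_ph: np.array([[1, 40, 44, 0, 0]], dtype=np.int32),
    model.is_training: True,
}
# One DecoderOutput placeholder (and one toy target batch) per target column.
for ph, batch in zip(model.d_outputs_ph, [np.array([[2, 0, 0]], dtype=np.int32)]):
    feed_dict[ph] = batch

_, loss = sess.run([model.updates, model.loss], feed_dict=feed_dict)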
Example #7
0
    def setup_decoder(self,
                      config,
                      train_decoder_state,
                      test_decoder_state,
                      embeddings,
                      encoder_input_lengths=None,
                      attention_states=None,
                      projection_layer=None,
                      scope=None):
        batch_size = self.batch_size
        decoder_inputs_emb = tf.nn.embedding_lookup(embeddings,
                                                    self.decoder_inputs)
        # TODO: For multilingual support, we want to set bias and trainable to False and make the embedding a constant.
        decoder_cell = setup_cell(config.decoder.cell_type,
                                  shape(train_decoder_state, -1),
                                  config.decoder.num_layers,
                                  keep_prob=self.keep_prob)
        if projection_layer is None:
            with tf.variable_scope('projection'):
                kernel = tf.transpose(embeddings, perm=[1, 0])
                projection_layer = SharedKernelDense(shape(embeddings, 0),
                                                     use_bias=False,
                                                     trainable=False,
                                                     shared_kernel=kernel)

        with tf.name_scope('Training'):
            train_decoder_cell = decoder_cell
            decoder_initial_state = train_decoder_state
            helper = tf.contrib.seq2seq.TrainingHelper(
                decoder_inputs_emb,
                sequence_length=self.target_length,
                time_major=False)

            decoder = tf.contrib.seq2seq.BasicDecoder(
                train_decoder_cell,
                helper,
                decoder_initial_state,
                output_layer=projection_layer)
            train_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder,
                impute_finished=True,
                maximum_iterations=tf.reduce_max(self.target_length),
                scope=scope)
            logits = train_decoder_outputs.rnn_output

        with tf.name_scope('Test'):
            beam_width = config.beam_width
            test_decoder_cell = decoder_cell
            decoder_initial_state = tf.contrib.seq2seq.tile_batch(
                test_decoder_state, multiplier=beam_width)

            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                test_decoder_cell,
                embeddings,
                self.start_tokens,
                self.end_token,
                decoder_initial_state,
                beam_width,
                output_layer=projection_layer,
                length_penalty_weight=config.length_penalty_weight)
            test_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder,
                impute_finished=False,
                maximum_iterations=config.utterance_max_len,
                scope=scope)
            predictions = test_decoder_outputs.predicted_ids
            #self.beam_scores = test_decoder_outputs.beam_search_decoder_output.scores
            # memo: The outputs are ordered by ascending beam_scores (which take negative values), and this seems to roughly match their accuracy?

        return logits, predictions
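
# SharedKernelDense is not shown here; judging from the transposed embeddings passed as
# its kernel, the projection ties the output weights to the embedding matrix. A hedged
# sketch of what that tied projection computes (names and shapes are illustrative):
import tensorflow as tf

vocab_size, emb_size, batch, time = 1000, 64, 2, 5
embeddings = tf.get_variable('emb', [vocab_size, emb_size])
decoder_outputs = tf.zeros([batch, time, emb_size])

# logits = decoder_outputs @ embeddings^T, with no separate trainable kernel or bias.
logits = tf.einsum('bte,ve->btv', decoder_outputs, embeddings)  # [batch, time, vocab_size]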
Example #8
0
def pointer_decoder(encoder_inputs_emb, decoder_inputs, initial_state, 
                    attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
  encoder_inputs = encoder_inputs_emb
  attn_length = shape(attention_states, 1)
  attn_size = shape(attention_states, 2)

  with tf.name_scope('attention_setup'):
    # Prepare the weights for attention calculation. We assume here that the sizes of attention_states (the encoder's outputs), the encoder's state, and the decoder's output are the same.
    attnw = tf.get_variable("AttnW1", [1, attn_size, attn_size])
    attnw2 = tf.get_variable("AttnW2", [attn_size, attn_size])
    attnv = tf.get_variable("AttnV", [attn_size])

    # Calculate W1 * attention_states in advance, since the encoder's outputs and state are unchanged while decoding.
    attention_states = tf.nn.conv1d(attention_states, attnw, 1, 'SAME')
  sys.stdout = sys.stderr

  def attention_weight(output):
    """
    Calculate attention weights for every encoder input by taking the inner product of the weight vector (attnv) with the combined and transformed encoder output and decoder state.

    output_probabilities[i] = softmax_i( V・tanh(W1・attention_state[i] + W2・decoder_output[t]) )
     - i: the index of an input word
     - t: current time-step in decoding
     - V: a tensor with the shape [attention_size]
     - W1: a tensor with the shape [attention_size, encoder's rnn_size]
     - W2: a tensor with the shape [attention_size, decoder's rnn_size]
    """
    y = tf.matmul(output, attnw2)
    y = tf.reshape(y, [-1, 1, attn_size])

    attention_vectors = tf.nn.softmax(tf.reduce_sum(attnv * tf.tanh(attention_states + y), axis=2))
    return attention_vectors

  states = [initial_state]
  outputs = []
  pointed_idxs = []
  with tf.name_scope('Decode_Timestep'):
    for i, d in enumerate(tf.unstack(decoder_inputs, axis=1)):
      with tf.name_scope('Decode_%d' % i):
        if i > 0:
          tf.get_variable_scope().reuse_variables()
        # The first input to the decoder is something like a _START (or simply a _PAD) token prepared to start decoding.
        pointed_idx = d

        # If feed_prev == True, the inputs to the decoder won't be used except for the first one; the model decides its next inputs by itself.
        if feed_prev and i > 0:
          # Take the argmax to decide which input index is most probable.
          pointed_idx = tf.argmax(output, axis=1, output_type=tf.int32)
        pointed_idxs.append(pointed_idx)
        with tf.name_scope('copy_from_encoder_inputs'):
          # Convert the pointed index into one-hot, and get the pointed encoder_inputs by multiplying and reduce_sum.
          pointed_idx = tf.reshape(tf.one_hot(pointed_idx, depth=attn_length), [-1, attn_length, 1]) 
          inp = tf.reduce_sum(encoder_inputs * pointed_idx, axis=1)

          # In the original paper, gradients shouldn't be propagated to the input embeddings through this copying; the embeddings should be updated only through the encoder.
          inp = tf.stop_gradient(inp)
        output, state = cell(inp, states[-1])

        # Calculate the output (and the next input) distribution 
        with tf.name_scope('attention_weight'):
          output = attention_weight(output)
        states.append(state)
        outputs.append(output)
  with tf.name_scope('outputs'):
    outputs = tf.stack(outputs, axis=1)
  with tf.name_scope('states'):
    states = tf.stack(states, axis=1)
  with tf.name_scope('pointed_idx'):
    pointed_idxs = tf.stack(pointed_idxs, axis=1)
  return outputs, states, pointed_idxs
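
# A NumPy check of the scoring formula in the docstring above (toy shapes; the softmax
# over input positions is what the code applies on top of the tanh scores):
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, attn_length, attn_size = 2, 4, 8
attention_states = np.random.randn(batch, attn_length, attn_size)  # W1 * encoder outputs (precomputed)
decoder_output = np.random.randn(batch, attn_size)
W2 = np.random.randn(attn_size, attn_size)
V = np.random.randn(attn_size)

y = (decoder_output @ W2)[:, None, :]                       # [batch, 1, attn_size]
scores = np.sum(V * np.tanh(attention_states + y), axis=2)  # [batch, attn_length]
attention_weights = softmax(scores, axis=1)                 # one probability per input position
assert np.allclose(attention_weights.sum(axis=1), 1.0)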
Example #9
0
    def setup_decoder(self,
                      config,
                      train_decoder_state,
                      test_decoder_state,
                      encoder_input_lengths=None,
                      attention_states=None,
                      projection_layer=None,
                      scope=None):
        batch_size = self.batch_size
        decoder_inputs_emb = tf.nn.embedding_lookup(self.w_embeddings,
                                                    self.decoder_inputs)
        # TODO: For multilingual support, we want to set bias and trainable to False and make the embedding a constant.
        decoder_cell = setup_cell(config.decoder.cell_type,
                                  shape(train_decoder_state, -1),
                                  config.decoder.num_layers,
                                  keep_prob=self.keep_prob)
        if projection_layer is None:
            with tf.variable_scope('projection'):
                projection_layer = tf.layers.Dense(config.w_vocab_size,
                                                   use_bias=True,
                                                   trainable=True)

        with tf.name_scope('Training'):
            if config.attention_type:
                assert attention_states is not None
                num_units = shape(attention_states, -1)
                attention = tf.contrib.seq2seq.LuongAttention(
                    num_units,
                    attention_states,
                    memory_sequence_length=encoder_input_lengths)
                train_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell, attention)
                decoder_initial_state = train_decoder_cell.zero_state(
                    batch_size,
                    tf.float32).clone(cell_state=train_decoder_state)
            else:
                train_decoder_cell = decoder_cell
                decoder_initial_state = train_decoder_state

            # When using attention mechanisms, encoder_state can't be copied directly into decoder_cell: initial_state must be an instance of AttentionWrapperState. (https://github.com/tensorflow/nmt/issues/205)

            helper = tf.contrib.seq2seq.TrainingHelper(
                decoder_inputs_emb,
                sequence_length=self.target_length,
                time_major=False)

            decoder = tf.contrib.seq2seq.BasicDecoder(
                train_decoder_cell,
                helper,
                decoder_initial_state,
                output_layer=projection_layer)
            train_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder,
                impute_finished=True,
                maximum_iterations=tf.reduce_max(self.target_length),
                scope=scope)
            logits = train_decoder_outputs.rnn_output

        with tf.name_scope('Test'):
            beam_width = config.beam_width
            if config.attention_type:
                num_units = shape(attention_states, -1)
                attention = tf.contrib.seq2seq.LuongAttention(
                    num_units,
                    tf.contrib.seq2seq.tile_batch(attention_states,
                                                  multiplier=beam_width),
                    memory_sequence_length=tf.contrib.seq2seq.tile_batch(
                        encoder_input_lengths, multiplier=beam_width))
                test_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell, attention)
                decoder_initial_state = test_decoder_cell.zero_state(
                    batch_size * beam_width,
                    tf.float32).clone(cell_state=tf.contrib.seq2seq.tile_batch(
                        test_decoder_state, multiplier=beam_width))
            else:
                test_decoder_cell = decoder_cell
                decoder_initial_state = tf.contrib.seq2seq.tile_batch(
                    test_decoder_state, multiplier=beam_width)

            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                test_decoder_cell,
                self.w_embeddings,
                self.start_tokens,
                self.end_token,
                decoder_initial_state,
                beam_width,
                output_layer=projection_layer,
                length_penalty_weight=config.length_penalty_weight)
            test_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder,
                impute_finished=False,
                maximum_iterations=config.utterance_max_len,
                scope=scope)
            predictions = test_decoder_outputs.predicted_ids
        return logits, predictions
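
# tile_batch repeats each minibatch entry beam_width times in place (entry 0 repeated
# beam_width times, then entry 1, ...), which is why the beam-search AttentionWrapper's
# zero_state is built with batch_size * beam_width. A NumPy analogue with toy values:
import numpy as np

beam_width = 3
encoder_state = np.array([[1., 1.], [2., 2.]])        # [batch_size=2, state_size=2]
tiled = np.repeat(encoder_state, beam_width, axis=0)  # -> shape [batch_size * beam_width, 2]
# [[1. 1.] [1. 1.] [1. 1.] [2. 2.] [2. 2.] [2. 2.]]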
Example #10
0
    def __init__(self, sess, config, vocab, encoder=None, is_training=None):
        PointerNetworkBase.__init__(self,
                                    sess,
                                    config,
                                    vocab,
                                    is_training=is_training)

        input_max_len, output_max_len = None, config.output_max_len

        # <Sample input>
        # e_inputs: [1, 40, 44, 0, 0], d_outputs: [2, 0, 0] (target=44)
        with tf.name_scope('EncoderInput'):
            self.e_inputs_ph = tf.placeholder(tf.int32, [None, input_max_len],
                                              name="EncoderInput")
            self.pos_inputs_ph = tf.placeholder(tf.int32,
                                                [None, input_max_len],
                                                name="EncoderInputPOS")
            self.wtype_inputs_ph = tf.placeholder(tf.int32,
                                                  [None, input_max_len],
                                                  name="EncoderInputWordType")

        with tf.name_scope('batch_size'):
            batch_size = shape(self.e_inputs_ph, 0)

        with tf.variable_scope('Embeddings') as scope:
            e_inputs_emb = []

            w_embeddings = self.initialize_embeddings(
                'Word',
                vocab.word.embeddings.shape,
                initializer=tf.constant_initializer(vocab.word.embeddings),
                trainable=config.train_embedding)

            e_inputs_emb.append(
                tf.nn.embedding_lookup(w_embeddings, self.e_inputs_ph))

            if self.use_pos:
                pos_embeddings = self.initialize_embeddings(
                    'POS', [vocab.pos.size, config.feature_size],
                    trainable=True)
                e_inputs_emb.append(
                    tf.nn.embedding_lookup(pos_embeddings, self.pos_inputs_ph))
            if self.use_wtype:
                wtype_embeddings = self.initialize_embeddings(
                    'Wtype', [vocab.wtype.size, config.feature_size],
                    trainable=True)
                e_inputs_emb.append(
                    tf.nn.embedding_lookup(wtype_embeddings,
                                           self.wtype_inputs_ph))

            e_inputs_emb = tf.concat(e_inputs_emb, axis=-1)
            e_inputs_emb = tf.nn.dropout(e_inputs_emb, self.keep_prob)

        with tf.variable_scope('SentEncoder') as scope:
            # If an encoder is not given, prepare a new one.
            if encoder is None:
                encoder_type = getattr(encoder_class, config.encoder_type)
                sent_encoder = encoder_type(config,
                                            self.keep_prob,
                                            shared_scope=scope)
            else:
                sent_encoder = encoder

            e_inputs_length = tf.count_nonzero(self.e_inputs_ph, axis=1)
            e_outputs, e_state = sent_encoder.encode(e_inputs_emb,
                                                     e_inputs_length)
            attention_states = e_outputs

        self.d_outputs_ph = []
        self.losses = []
        self.greedy_predictions = []
        self.copied_inputs = []
        for i, col_name in enumerate(self.target_columns):
            with tf.name_scope('DecoderOutput%d' % i):
                d_outputs_ph = tf.placeholder(tf.int32, [None, output_max_len],
                                              name="DecoderOutput")

            ds_name = 'Decoder' if config.share_decoder else 'Decoder%d' % i
            with tf.variable_scope(ds_name) as scope:
                d_cell = setup_cell(config.cell_type,
                                    config.rnn_size,
                                    config.num_layers,
                                    keep_prob=self.keep_prob)
                teacher_forcing = config.teacher_forcing if 'teacher_forcing' in config else False
                d_outputs, predictions, copied_inputs = setup_decoder(
                    d_outputs_ph,
                    e_inputs_emb,
                    e_state,
                    attention_states,
                    d_cell,
                    batch_size,
                    output_max_len,
                    scope=scope,
                    teacher_forcing=teacher_forcing)
                self.copied_inputs.append(copied_inputs)
                d_outputs_length = tf.count_nonzero(d_outputs_ph,
                                                    axis=1,
                                                    name='outputs_length')
                with tf.name_scope('add_eos'):
                    targets = tf.concat([
                        d_outputs_ph,
                        tf.zeros([batch_size, 1], dtype=tf.int32)
                    ],
                                        axis=1)

                # The length of the outputs is also increased by 1 because of the EOS token.
                with tf.name_scope('output_weights'):
                    d_outputs_weights = tf.sequence_mask(
                        d_outputs_length + 1,
                        maxlen=shape(d_outputs_ph, 1) + 1,
                        dtype=tf.float32)
                with tf.name_scope('loss%d' % i):
                    loss = tf.contrib.seq2seq.sequence_loss(
                        d_outputs, targets, d_outputs_weights)
            self.d_outputs_ph.append(d_outputs_ph)
            self.losses.append(loss)
            self.greedy_predictions.append(predictions)
        with tf.name_scope('Loss'):
            self.loss = tf.reduce_mean(self.losses)
        self.updates = self.get_updates(self.loss)