Example No. 1
 def get_cell(n_hidden):
     logging.info("Constructing cell of size={}".format(n_hidden))
     if use_lstm:
         logging.info("Using LSTM cells")
         if initializer:
             cell = rnn_cell.LSTMCell(n_hidden, initializer=initializer)
         else:
             # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
             cell = rnn_cell.BasicLSTMCell(n_hidden)
     else:
         logging.info("Using GRU cells")
         cell = rnn_cell.GRUCell(n_hidden)
     if not forward_only and use_lstm and keep_prob < 1:
         logging.info("Adding dropout wrapper around lstm cells")
         cell = rnn_cell.DropoutWrapper(cell,
                                        output_keep_prob=keep_prob)
     if encoder == "bidirectional":
         logging.info("Bidirectional model")
         if init_backward:
             logging.info(
                 "Use backward encoder state to initialize decoder state"
             )
         cell = BidirectionalRNNCell([cell] * 2)
     elif encoder == "bow":
         logging.info("BOW model")
         if num_layers > 1:
             logging.info("Model with %d layers for the decoder" %
                          num_layers)
             cell = BOWCell(rnn_cell.MultiRNNCell([cell] * num_layers))
         else:
             cell = BOWCell(cell)
     elif num_layers > 1:
         logging.info("Model with %d layers" % num_layers)
         cell = rnn_cell.MultiRNNCell([cell] * num_layers)
     return cell
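
Note that get_cell above reads use_lstm, initializer, keep_prob, encoder, init_backward and num_layers from its enclosing scope, and BidirectionalRNNCell / BOWCell are project-specific wrappers. Below is a minimal self-contained sketch of the same LSTM/GRU + dropout + stacking pattern, written against the TF 1.x cell API (tf.compat.v1 in current TensorFlow); the flag names here are illustrative, not part of the original code.

import tensorflow.compat.v1 as tf

def build_cell(n_hidden, use_lstm=True, keep_prob=1.0, num_layers=1, training=True):
    """Sketch of the cell-construction pattern used by get_cell() above."""
    def make_layer():
        # Create a fresh cell object per layer; later TF 1.x releases may reject
        # reusing a single cell instance the way the [cell] * num_layers idiom does.
        if use_lstm:
            cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
        else:
            cell = tf.nn.rnn_cell.GRUCell(n_hidden)
        if training and keep_prob < 1.0:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
        return cell

    if num_layers > 1:
        return tf.nn.rnn_cell.MultiRNNCell([make_layer() for _ in range(num_layers)])
    return make_layer()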
Example No. 2
    def __init__(self,
                 vocab_size,
                 buckets_or_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 use_lstm=True,
                 num_samples=512,
                 forward_only=False):
        """Create the model.  This constructor can be used to created an embedded or embedded-attention, bucketed or non-bucketed model made of single or multi-layer RNN cells. 

    Args:
      vocab_size: size of the vocabulary.
      buckets_or_sentence_length: 
        if using buckets:
          a list of pairs (I, O), where I specifies maximum input length
          that will be processed in that bucket, and O specifies maximum output
          length. Training instances that have inputs longer than I or outputs
          longer than O will be pushed to the next bucket and padded accordingly.
          We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
        else:
          the maximum number of words per sentence.
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      model_type: 'embedding_attention' builds the attention model; any other
        value builds the plain embedding model.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        # Need to determine if we're using buckets or not:
        if isinstance(buckets_or_sentence_length, list):
            self.buckets = buckets_or_sentence_length
        else:
            self.max_sentence_length = buckets_or_sentence_length

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # Summary variables. NOTE: added these.
        # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
        # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention (if applicable).
        if model_type == 'embedding_attention':

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)
        else:  # fall back to the plain embedding model (an unknown model_type could also raise an error here)

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_rnn_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        # NOTE: If the model is not bucketed, self.buckets is never set, so the
        # try blocks below raise an AttributeError and we fall back to building
        # a non-bucketed model.
        try:
            encoder_range = self.buckets[-1][0]
            decoder_range = self.buckets[-1][1]
        except AttributeError:
            encoder_range, decoder_range = self.max_sentence_length, self.max_sentence_length

        for i in xrange(encoder_range):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(decoder_range + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        try:
            if forward_only:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function)
                # If we use output projection, we need to project outputs for decoding.
                if output_projection is not None:
                    for b in xrange(len(self.buckets)):
                        self.outputs[b] = [
                            tf.nn.xw_plus_b(output, output_projection[0],
                                            output_projection[1])
                            for output in self.outputs[b]
                        ]
            else:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function)

        except AttributeError:
            if forward_only:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      True)
                self.losses = seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function)
                # Project outputs for decoding
                if output_projection is not None:
                    self.outputs = [
                        tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs
                    ]
            else:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      False)
                self.losses = (seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function))

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        self.params = params  # Hold onto this for Woz
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            try:
                for b in xrange(len(self.buckets)):
                    gradients = tf.gradients(self.losses[b], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))
            except AttributeError:
                gradients = tf.gradients(self.losses, params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms = norm
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
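
The bucketing behavior described in the docstring above (each training pair goes into the first bucket whose (I, O) limits fit it and is padded up to that bucket's sizes) can be illustrated with a small framework-free sketch; PAD_ID and the bucket list are illustrative values, not taken from the code above.

PAD_ID = 0  # illustrative padding id
BUCKETS = [(5, 10), (10, 15), (20, 25), (40, 50)]  # illustrative (I, O) pairs

def assign_and_pad(source_ids, target_ids, buckets=BUCKETS, pad_id=PAD_ID):
    """Pick the first bucket that fits the pair and pad both sides to its size."""
    for bucket_id, (enc_size, dec_size) in enumerate(buckets):
        if len(source_ids) <= enc_size and len(target_ids) <= dec_size:
            enc = source_ids + [pad_id] * (enc_size - len(source_ids))
            dec = target_ids + [pad_id] * (dec_size - len(target_ids))
            return bucket_id, enc, dec
    raise ValueError("pair is longer than the largest bucket")

# A 3-token source with a 6-token target lands in bucket 0, i.e. (5, 10).
bucket_id, enc, dec = assign_and_pad([4, 7, 9], [2, 4, 7, 9, 5, 3])
assert bucket_id == 0 and len(enc) == 5 and len(dec) == 10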
Example No. 3
    def __init__(self, enc_out, target_vocab_size, buckets, embedding_size, hidden_size,
                 num_layers, batch_size, use_lstm=False, num_samples=512, 
                 encoder="reverse", use_src_mask=False, maxout_layer=False, init_backward=False,
                 variable_prefix=None, init_const=False, use_bow_mask=False, initializer=None):
        super(TFSeq2SeqSingleStepDecodingGraph, self).__init__(buckets, batch_size)
        self.target_vocab_size = target_vocab_size
        self.num_heads = 1
    
        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
          with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                               reuse=True), tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [hidden_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
          output_projection = (w, b)
            
          def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
              labels = tf.reshape(labels, [-1, 1])
              return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                                self.target_vocab_size)
          softmax_loss_function = sampled_loss
        else:
          logging.info("Using maxout_layer=%d and full softmax loss" % maxout_layer)          
    
        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
          logging.info("Using LSTM cells of size={}".format(hidden_size))
          if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
          else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        else:
          logging.info("Using GRU cells of size={}".format(hidden_size))
          single_cell = rnn_cell.GRUCell(hidden_size)
        cell = single_cell

        if encoder == "bidirectional":
          logging.info("Bidirectional model")
          if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
          cell = BidirectionalRNNCell([single_cell] * 2)
        elif encoder == "bow":
          logging.info("BOW model")
          if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
          else:
            cell = BOWCell(single_cell)
        elif num_layers > 1:
          logging.info("Model with %d layers" % num_layers)
          cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    
        # List of placeholders deeper within the decoder (i.e. bucket dependent)
        self.enc_hidden = []
        self.enc_hidden_features = []
        self.enc_v = []
        self.dec_attns = []

        # Placeholder for last state
        if encoder == "bidirectional":
          if cell._cells[0]._state_is_tuple:
            dec_state_c = tf.placeholder(dtypes.float32, shape=[None, cell.fw_state_size/2], name="dec_state_c")
            dec_state_h = tf.placeholder(dtypes.float32, shape=[None, cell.fw_state_size/2], name="dec_state_h")
            self.dec_state = rnn_cell.LSTMStateTuple(dec_state_c, dec_state_h)
          else:
            self.dec_state = tf.placeholder(dtypes.float32, shape=[None, cell.fw_state_size], name="dec_state")
        elif encoder == "reverse" or encoder == "bow":
          if cell._state_is_tuple:
            dec_state_c = tf.placeholder(dtypes.float32, shape=[None, cell.state_size/2], name="dec_state_c")
            dec_state_h = tf.placeholder(dtypes.float32, shape=[None, cell.state_size/2], name="dec_state_h")
            self.dec_state = rnn_cell.LSTMStateTuple(dec_state_c, dec_state_h)
          else:
            self.dec_state = tf.placeholder(dtypes.float32, shape=[None, cell.state_size], name="dec_state")

        if use_src_mask:
          logging.info("Using source mask for decoder") 
          self.src_mask = tf.placeholder(dtypes.float32, shape=[None, None],
                                         name="src_mask")
        else:
          self.src_mask = None

        if use_bow_mask:
          logging.info("Using bow mask for output layer") 
          self.bow_mask = tf.placeholder(dtypes.float32, shape=[None, None],
                                         name="bow_mask")
        else:
          self.bow_mask = None          

        # placeholder to indicate whether we're at the start of the target sentence
        self.start = tf.placeholder(tf.bool, name="start")

        # The seq2seq function: we use embedding for the input and attention.
        scope = None
        if variable_prefix is not None:
          scope = variable_prefix+"/embedding_attention_seq2seq"
          logging.info("Using variable scope {}".format(scope))
        def seq2seq_f(bucket_enc_out, decoder_input):
            return self._tf_dec_embedding_attention_seq2seq(bucket_enc_out,
                decoder_input, self.dec_state, cell, target_vocab_size, embedding_size, 
                output_projection=output_projection, encoder=encoder, 
                src_mask=self.src_mask, maxout_layer=maxout_layer, init_backward=init_backward,
                start=self.start, scope=scope, init_const=init_const, bow_mask=self.bow_mask)
    
        self.dec_decoder_input = tf.placeholder(tf.int32, shape=[None],
                                                    name="dec_decoder_input")
        self.outputs = self._tf_dec_model_with_buckets(enc_out,
            self.dec_decoder_input, buckets, seq2seq_f)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            # self.outputs contains outputs, new_attns, new_state in flattened list
            for b in xrange(len(buckets)): 
                output = self.outputs[b][0]
                # The standard implementation (_extract_argmax_and_embed) would
                # simply project here:
                #   output = tf.nn.xw_plus_b(output, output_projection[0],
                #                            output_projection[1])
                # However, during decoding we have to normalize with a softmax
                # (and then take a log to produce logprobs). As nn.py notes for
                # sampled_softmax_loss: "This operation is for training only.
                # It is generally an underestimate of the full softmax loss.
                # At inference time, you can compute full softmax probabilities
                # with the expression
                # `tf.nn.softmax(tf.matmul(inputs, weights) + biases)`."
                # Note: tf.matmul(i, w) + b does the same as tf.nn.xw_plus_b(i, w, b).
                output = tf.log(tf.nn.softmax(
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])))
                self.outputs[b][0] = output
        else:
          logging.info("Apply full softmax")
          for b in xrange(len(buckets)):
            self.outputs[b][0] = tf.log(tf.nn.softmax(self.outputs[b][0]))
                
        # for update_buckets
        self.enc_out = enc_out
        self.seq2seq_f = seq2seq_f
        self.output_projection = output_projection
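
The comment block in the projection loop above boils down to: sampled softmax is a training-time approximation, so at decode time the decoder output is projected with the full (w, b), normalized with a softmax, and converted to log-probabilities. A small NumPy sketch of that transformation (shapes and values are illustrative):

import numpy as np

def decode_logprobs(hidden, w, b):
    """hidden: [batch, hidden_size], w: [hidden_size, vocab_size], b: [vocab_size]."""
    logits = hidden.dot(w) + b                   # same role as tf.nn.xw_plus_b
    logits -= logits.max(axis=1, keepdims=True)  # subtract max for numerical stability
    probs = np.exp(logits)
    probs /= probs.sum(axis=1, keepdims=True)    # tf.nn.softmax
    return np.log(probs)                         # tf.log(softmax(...)) as above

logprobs = decode_logprobs(np.random.randn(2, 8), np.random.randn(8, 100),
                           np.zeros(100))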
Example No. 4
    def __init__(self, source_vocab_size, buckets, embedding_size, hidden_size,
                 num_layers, batch_size, use_lstm=False, num_samples=512, 
                 encoder="reverse", use_sequence_length=False, init_backward=False,
                 variable_prefix=None, initializer=None):
        super(TFSeq2SeqEncodingGraph, self).__init__(buckets, batch_size)
        self.source_vocab_size = source_vocab_size
        self.num_heads = 1
    
        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
          logging.info("Using LSTM cells of size={}".format(hidden_size))
          if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
          else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        else:
          logging.info("Using GRU cells of size={}".format(hidden_size))
          single_cell = rnn_cell.GRUCell(hidden_size)
        cell = single_cell

        if encoder == "bidirectional":
          logging.info("Bidirectional model")
          if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
          cell = BidirectionalRNNCell([single_cell] * 2)
        elif encoder == "bow":
          logging.info("BOW model")
          if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
          else:
            cell = BOWCell(single_cell)
        elif num_layers > 1:
          logging.info("Model with %d layers" % num_layers)
          cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    
        # The seq2seq function: we use embedding for the input and attention.
        scope = None
        if variable_prefix is not None:
          scope = variable_prefix+"/embedding_attention_seq2seq"
          logging.info("Using variable scope {}".format(scope))    
        def seq2seq_f(encoder_inputs, bucket_length):
          return self._tf_enc_embedding_attention_seq2seq(encoder_inputs, cell, source_vocab_size, embedding_size, 
                                                          encoder=encoder, 
                                                          sequence_length=self.sequence_length,
                                                          bucket_length=bucket_length,
                                                          init_backward=init_backward,
                                                          bow_emb_size=hidden_size,
                                                          scope=scope)
    
        # Feeds for inputs.
        self.encoder_inputs = []
        self.sequence_lengths = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
          self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                    name="encoder{0}".format(i)))
        if use_sequence_length:
          logging.info("Using sequence length for encoder")                                            
          self.sequence_length = tf.placeholder(tf.int32, shape=[None], name="seq_len")          
        else:
          self.sequence_length = None
    
        self.outputs = self._tf_enc_model_with_buckets(self.encoder_inputs, buckets, seq2seq_f)
            
        # for update_buckets            
        self.seq2seq_f = seq2seq_f
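
The encoder placeholders above are time-major: encoder{i} holds the i-th token of every sentence in the batch, and seq_len carries the true (unpadded) source lengths. A hedged sketch of how a feed_dict for them could be assembled; `graph` stands for an instance of the class above, and pad_id plus the helper name are illustrative.

import numpy as np

def encoder_feed(graph, batch_source_ids, bucket_size, pad_id=0):
    """Pad a batch to bucket_size and feed it one time step per placeholder."""
    padded = [s + [pad_id] * (bucket_size - len(s)) for s in batch_source_ids]
    feed = {}
    for t in range(bucket_size):
        feed[graph.encoder_inputs[t]] = np.array([s[t] for s in padded],
                                                 dtype=np.int32)
    if graph.sequence_length is not None:
        feed[graph.sequence_length] = np.array(
            [len(s) for s in batch_source_ids], dtype=np.int32)
    return feed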
Example No. 5
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 embedding_size,
                 hidden_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 dtype=tf.float32,
                 opt_algorithm="sgd",
                 encoder="reverse",
                 use_sequence_length=False,
                 use_src_mask=False,
                 maxout_layer=False,
                 init_backward=False,
                 no_pad_symbol=False,
                 variable_prefix=None,
                 rename_variable_prefix=None,
                 init_const=False,
                 use_bow_mask=False,
                 max_to_keep=0,
                 keep_prob=1.0,
                 initializer=None,
                 legacy=False,
                 train_align=None):
        """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      embedding_size: size of the word embedding vectors.
      hidden_size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        with tf.variable_scope(variable_prefix or ""):
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False)
            self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.no_pad_symbol = no_pad_symbol

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w",
                                  [self.target_vocab_size, hidden_size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)
            logging.info("Using output projection of shape (%d, %d)" %
                         (hidden_size, self.target_vocab_size))
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(local_w_t, local_b,
                                               local_inputs, labels,
                                               num_samples,
                                               self.target_vocab_size), dtype)

            softmax_loss_function = sampled_loss
        else:
            logging.info("Using maxout_layer=%r and full softmax loss" %
                         maxout_layer)

        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
            logging.info("Using LSTM cells of size={}".format(hidden_size))
            if initializer:
                single_cell = rnn_cell.LSTMCell(hidden_size,
                                                initializer=initializer)
            else:
                # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
                single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        else:
            logging.info("Using GRU cells of size={}".format(hidden_size))
            single_cell = rnn_cell.GRUCell(hidden_size)
        cell = single_cell

        if encoder == "bidirectional":
            logging.info("Bidirectional model")
            if init_backward:
                logging.info(
                    "Use backward encoder state to initialize decoder state")
            cell = BidirectionalRNNCell([single_cell] * 2)
        elif encoder == "bow":
            logging.info("BOW model")
            if not forward_only and use_lstm and keep_prob < 1:
                logging.info("Adding dropout wrapper around lstm cells")
                single_cell = rnn_cell.DropoutWrapper(
                    single_cell, output_keep_prob=keep_prob)
            if num_layers > 1:
                logging.info("Model with %d layers for the decoder" %
                             num_layers)
                cell = BOWCell(
                    rnn_cell.MultiRNNCell([single_cell] * num_layers))
            else:
                cell = BOWCell(single_cell)
        elif num_layers > 1:
            logging.info("Model with %d layers" % num_layers)
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        logging.info("Embedding size={}".format(embedding_size))
        scope = None
        if variable_prefix is not None:
            scope = variable_prefix + "/embedding_attention_seq2seq"
            logging.info("Using variable scope {}".format(scope))

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode,
                      bucket_length):
            return embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=embedding_size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype,
                encoder=encoder,
                sequence_length=self.sequence_length,
                bucket_length=bucket_length,
                src_mask=self.src_mask,
                maxout_layer=maxout_layer,
                init_backward=init_backward,
                bow_emb_size=hidden_size,
                scope=scope,
                init_const=init_const,
                bow_mask=self.bow_mask,
                keep_prob=keep_prob,
                legacy=legacy)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        self.alignments = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))
        if train_align is not None and not forward_only:
            for i in xrange(self.batch_size):
                self.alignments.append(
                    tf.placeholder(tf.float32,
                                   shape=[None],
                                   name="align{0}".format(i)))

        if use_sequence_length:
            logging.info("Using sequence length for encoder")
            self.sequence_length = tf.placeholder(tf.int32,
                                                  shape=[None],
                                                  name="seq_len")
        else:
            self.sequence_length = None

        if use_src_mask:
            logging.info("Using source mask for decoder")
            self.src_mask = tf.placeholder(tf.float32,
                                           shape=[None, None],
                                           name="src_mask")
        else:
            self.src_mask = None

        if use_bow_mask:
            logging.info("Using bow mask for output layer")
            self.bow_mask = tf.placeholder(tf.float32,
                                           shape=[None, None],
                                           name="bow_mask")
        else:
            self.bow_mask = None

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, True, z),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    # This is similar to what is done in the loop function (where xw_plus_b is used instead of matmul).
                    # The loop function also takes the argmax, but the result is not saved, we pass the logits
                    # and take the argmax again in the vanilla decoder.
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, False, z),
                softmax_loss_function=softmax_loss_function,
                alignments=self.alignments)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            if opt_algorithm == "sgd":
                logging.info("Using optimizer GradientDescentOptimizer")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif opt_algorithm == "adagrad":
                print("Using optimizer AdagradOptimizer")
                lr = 3.0
                init_acc = 0.1
                opt = tf.train.AdagradOptimizer(lr, init_acc)
            elif opt_algorithm == "adadelta":
                print("Using optimizer AdadeltaOptimizer")
                rho = 0.95
                epsilon = 1e-6
                opt = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon)

            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        if variable_prefix:
            # save only the variables that belong to the prefix
            logging.info("Using variable prefix={}".format(variable_prefix))
            self.saver = tf.train.Saver(
                {
                    v.op.name: v
                    for v in tf.global_variables()
                    if v.op.name.startswith(variable_prefix)
                },
                max_to_keep=max_to_keep,
                write_version=saver_pb2.SaverDef.V1)
        else:
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=max_to_keep,
                                        write_version=saver_pb2.SaverDef.V1)

        if rename_variable_prefix:
            # create a saver that explicitly stores model variables with a prefix
            logging.info("Saving model with new prefix={}".format(
                rename_variable_prefix))
            self.saver_prefix = tf.train.Saver(
                {
                    v.op.name.replace(variable_prefix, rename_variable_prefix):
                    v
                    for v in tf.global_variables()
                },
                write_version=saver_pb2.SaverDef.V1)
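
The training branch at the end of Example No. 5 follows the standard pattern: take the gradients of each bucket's loss, clip them by global norm, and apply them with the chosen optimizer. A minimal sketch of a single such update, assuming the TF 1.x graph API (tf.compat.v1):

import tensorflow.compat.v1 as tf

def build_update(loss, params, learning_rate, max_gradient_norm, global_step):
    """Return (update_op, gradient_norm) for one bucket's loss."""
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    gradients = tf.gradients(loss, params)
    clipped, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
    update = opt.apply_gradients(zip(clipped, params), global_step=global_step)
    return update, norm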