Example #1
def get_word_chars(table, char_embedding, word_chars, char_lengths, word_size):
    # Excerpt of the helper nested inside build_graph (see Example #2); `self`,
    # `dropout_keep_prob`, `base_cell` and `is_training` are names from that
    # enclosing scope.
    word_chars = tf.reshape(word_chars, [-1, word_size, WORD_CHAR_SIZE])
    char_ids = table.lookup(word_chars)
    x = char_embedding(char_ids)  # [batch, word_size, word_char_size, char_dim]
    mask = tf.sequence_mask(char_lengths, WORD_CHAR_SIZE, dtype=tf.float32)
    mask = tf.expand_dims(mask, 3)  # [batch, word_size, word_char_size, 1]
    x = x * mask  # zero out the embeddings of padding characters
    x = tf.reshape(x, [-1, WORD_CHAR_SIZE, CHAR_DIM])   # [batch * word_size, word_char_size, char_dim]
    if self.args.word_char_type == 'cnn':
        filters = 16
        last_states = shallow_and_wide_cnn(x, filters, [1, 2, 3])
    else:
        length = tf.reshape(char_lengths, [-1])
        outputs, last_states = stack_bidirectional_dynamic_rnn(x, [CHAR_DIM],
                length, dropout_keep_prob=dropout_keep_prob,
                cell_wrapper=self.rnn_cell_wrapper,
                variational_recurrent=self.variational_recurrent,
                base_cell=base_cell,
                is_training=is_training)
    return tf.reshape(last_states, [-1, word_size, CHAR_DIM*2])  # [batch, word_size, char_dim*2]
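The masking idiom above is reusable on its own: tf.sequence_mask turns per-row lengths into a boolean grid, and a broadcasted multiply zeroes out the embeddings of padding positions. A minimal self-contained sketch of that pattern (TF 1.x; shapes and values are toy ones chosen for illustration):

import tensorflow as tf

MAX_LEN, EMBED_DIM = 5, 3                      # toy sizes, for illustration only
lengths = tf.constant([2, 4])                  # true length of each of 2 rows
x = tf.random_normal([2, MAX_LEN, EMBED_DIM])  # padded embeddings
mask = tf.sequence_mask(lengths, MAX_LEN, dtype=tf.float32)  # [2, MAX_LEN]
masked = x * tf.expand_dims(mask, 2)           # broadcast over EMBED_DIM

with tf.Session() as sess:
    print(sess.run(masked)[0, 2:])             # positions past length 2 are all zeros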
Example #2
  def build_graph(self, data_paths, batch_size, graph_mod):
    """Builds generic graph for training or eval."""
    tensors = GraphReferences()
    is_training = graph_mod == GraphMod.TRAIN
    tf.keras.backend.set_learning_phase(1 if is_training else 0)
    if data_paths:
      tensors.keys, tensors.examples = util.read_examples(
          data_paths,
          batch_size,
          shuffle=is_training,
          num_epochs=None if is_training else 2)
    else:
      tensors.examples = tf.placeholder(tf.string, name='input', shape=(None,))

    if graph_mod == GraphMod.PREDICT:
      inception_input, inception_embeddings = self.build_inception_graph()
      image_embeddings = inception_embeddings

      title_embeddings = tf.placeholder(tf.float32, shape=[None, TITLE_EMBEDDING_SIZE])
      title_words_count = tf.placeholder(tf.int64, shape=[None])
      content_embeddings = tf.placeholder(tf.float32, shape=[None, CONTENT_EMBEDDING_SIZE])
      content_words_count = tf.placeholder(tf.int64, shape=[None])

      title_word_chars = tf.placeholder(tf.string, shape=[None, TITLE_WORD_CHARS_SIZE])
      content_word_chars = tf.placeholder(tf.string, shape=[None, CONTENT_WORD_CHARS_SIZE])
      title_word_char_lengths = tf.placeholder(tf.int64, shape=[None, TITLE_WORD_SIZE])
      content_word_char_lengths = tf.placeholder(tf.int64, shape=[None, CONTENT_WORD_SIZE])

      category_ids = tf.placeholder(tf.int64, shape=[None])
      price = tf.placeholder(tf.int64, shape=[None])
      images_count = tf.placeholder(tf.int64, shape=[None])
      recent_articles_count = tf.placeholder(tf.int64, shape=[None])
      title_length = tf.placeholder(tf.int64, shape=[None])
      content_length = tf.placeholder(tf.int64, shape=[None])
      blocks_inline = tf.placeholder(tf.string, shape=[None])
      username_chars = tf.placeholder(tf.string, shape=[None, USERNAME_CHAR_SIZE])
      username_length = tf.placeholder(tf.int64, shape=[None])
      created_at_ts = tf.placeholder(tf.int64, shape=[None])
      offerable = tf.placeholder(tf.int64, shape=[None])

      tensors.input_image = inception_input
      tensors.input_title = title_embeddings
      tensors.input_title_words_count = title_words_count
      tensors.input_content = content_embeddings
      tensors.input_content_words_count = content_words_count
      tensors.input_category_id = category_ids
      tensors.input_price = price
      tensors.input_images_count = images_count
      tensors.input_recent_articles_count = recent_articles_count
      tensors.input_title_length = title_length
      tensors.input_content_length = content_length
      tensors.input_blocks_inline = blocks_inline
      tensors.input_username_chars = username_chars
      tensors.input_username_length = username_length
      tensors.input_created_at_ts = created_at_ts
      tensors.input_offerable = offerable
      tensors.input_title_word_chars = title_word_chars
      tensors.input_content_word_chars = content_word_chars
      tensors.input_title_word_char_lengths = title_word_char_lengths
      tensors.input_content_word_char_lengths = content_word_char_lengths

      username_chars = tf.reshape(username_chars, [-1, USERNAME_CHAR_SIZE])
    else:
      # For training and evaluation we assume data is preprocessed, so the
      # inputs are tf-examples.
      # Generate placeholders for examples.
      with tf.name_scope('inputs'):
        feature_map = {
            'id':
                tf.FixedLenFeature(
                    shape=[], dtype=tf.string, default_value=['']),
            # Some images may have no labels. For those, we assume a default
            # label. So the number of labels is label_count+1 for the default
            # label.
            'label':
                tf.FixedLenFeature(
                    shape=[1], dtype=tf.int64,
                    default_value=[self.label_count]),
            'embedding':
                tf.FixedLenFeature(
                    shape=[BOTTLENECK_TENSOR_SIZE], dtype=tf.float32),
            'title_embedding':
                tf.FixedLenFeature(
                    shape=[TITLE_EMBEDDING_SIZE], dtype=tf.float32),
            'title_words_count':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'content_embedding':
                tf.FixedLenFeature(
                    shape=[CONTENT_EMBEDDING_SIZE], dtype=tf.float32),
            'content_words_count':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'category_id':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'price':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'images_count':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'recent_articles_count':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'title_length':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'content_length':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'blocks_inline':
                tf.FixedLenFeature(shape=[], dtype=tf.string),
            'username_chars':
                tf.FixedLenFeature(shape=[USERNAME_CHAR_SIZE], dtype=tf.string),
            'username_length':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'created_at_ts':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'offerable':
                tf.FixedLenFeature(shape=[], dtype=tf.int64),
            'title_word_chars':
                tf.FixedLenFeature(shape=[TITLE_WORD_CHARS_SIZE], dtype=tf.string),
            'content_word_chars':
                tf.FixedLenFeature(shape=[CONTENT_WORD_CHARS_SIZE], dtype=tf.string),
            'title_word_char_lengths':
                tf.FixedLenFeature(shape=[TITLE_WORD_SIZE], dtype=tf.int64),
            'content_word_char_lengths':
                tf.FixedLenFeature(shape=[CONTENT_WORD_SIZE], dtype=tf.int64),
        }
        parsed = tf.parse_example(tensors.examples, features=feature_map)
        labels = tf.squeeze(parsed['label'])
        tensors.labels = labels
        tensors.ids = tf.squeeze(parsed['id'])
        image_embeddings = parsed['embedding']
        title_embeddings = parsed['title_embedding']
        title_words_count = parsed['title_words_count']
        content_embeddings = parsed['content_embedding']
        content_words_count = parsed['content_words_count']
        category_ids = parsed['category_id']
        price = parsed['price']
        images_count = parsed['images_count']
        recent_articles_count = parsed['recent_articles_count']
        title_length = parsed['title_length']
        content_length = parsed['content_length']
        blocks_inline = parsed['blocks_inline']
        username_chars = parsed['username_chars']
        username_length = parsed['username_length']
        created_at_ts = parsed['created_at_ts']
        offerable = parsed['offerable']
        title_word_chars = parsed['title_word_chars']
        content_word_chars = parsed['content_word_chars']
        title_word_char_lengths = parsed['title_word_char_lengths']
        content_word_char_lengths = parsed['content_word_char_lengths']

    dropout_keep_prob = self.dropout if is_training else None
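    # keep_prob=None turns the dropout() helper defined below into a no-op (eval mode).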
    if self.rnn_type == 'LSTM':
        if tf.test.gpu_device_name():
            base_cell = tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell
        else:
            base_cell = tf.contrib.rnn.BasicLSTMCell
    else:
        if tf.test.gpu_device_name():
            base_cell = tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell
        else:
            base_cell = tf.contrib.rnn.GRUCell

    def dropout(x, keep_prob):
        if keep_prob:
            return tf.nn.dropout(x, keep_prob)
        return x

    if self.args.l2_reg_scale > 0.:
        regularizer = tf.contrib.layers.l2_regularizer(self.args.l2_reg_scale)
    else:
        regularizer = None

    def dense(x, units):
        for unit in units:
            if self.activation == 'maxout':
                x = layers.fully_connected(x, unit, activation_fn=None,
                        weights_regularizer=regularizer)
                x = tf.contrib.layers.maxout(x, unit)
                x = tf.reshape(x, [-1, unit])
            elif self.activation == 'none':
                x = layers.fully_connected(x, unit,
                        weights_regularizer=regularizer,
                        normalizer_fn=tf.contrib.layers.batch_norm,
                        normalizer_params={'is_training': is_training})
            else:
                x = layers.fully_connected(x, unit, weights_regularizer=regularizer)
            x = dropout(x, dropout_keep_prob)
        return x

    def shallow_and_wide_cnn(inputs, filters, kernel_sizes):
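        # One conv branch per kernel size, each batch-normed, ReLU'd and
        # global-max-pooled, then all branches concatenated ("shallow and wide").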
        outputs = []
        for kernel_size in kernel_sizes:
            conv = tf.layers.conv1d(inputs, filters, kernel_size, padding="same",
                    kernel_regularizer=regularizer)
            conv = tf.layers.batch_normalization(conv, training=is_training)
            conv = tf.nn.relu(conv)
            conv = GlobalMaxPooling1D()(conv)
            outputs.append(conv)
        output = tf.concat(outputs, 1)
        return dropout(output, dropout_keep_prob)

    def get_word_chars(table, char_embedding, word_chars, char_lengths, word_size):
        word_chars = tf.reshape(word_chars, [-1, word_size, WORD_CHAR_SIZE])
        char_ids = table.lookup(word_chars)
        x = char_embedding(char_ids)
        mask = tf.sequence_mask(char_lengths, WORD_CHAR_SIZE, dtype=tf.float32)
        mask = tf.expand_dims(mask, 3)  # [batch, word_size, word_char_size, 1]
        x = x * mask
        x = tf.reshape(x, [-1, WORD_CHAR_SIZE, CHAR_DIM])   # [batch * word_size, word_char_size, char_dim]
        if self.args.word_char_type == 'cnn':
            filters = 16
            output = shallow_and_wide_cnn(x, filters, [1,2,3])
            last_states = output
        else:
            length = tf.reshape(char_lengths, [-1])
            outputs, last_states = stack_bidirectional_dynamic_rnn(x, [CHAR_DIM],
                    length, dropout_keep_prob=dropout_keep_prob,
                    cell_wrapper=self.rnn_cell_wrapper,
                    variational_recurrent=self.variational_recurrent,
                    base_cell=base_cell,
                    is_training=is_training)
        return tf.reshape(last_states, [-1, word_size, CHAR_DIM*2]) # [batch, word_size, char_dim*2]

    if self.args.word_char_type != 'none':
        with tf.variable_scope("word_chars", reuse=tf.AUTO_REUSE):
            table = tf.contrib.lookup.index_table_from_tensor(
                    mapping=tf.constant(self.text_chars),
                    default_value=len(self.text_chars))
            char_dict_size = len(self.text_chars) + 1 # add unknown char
            char_embedding = Embedding(char_dict_size, CHAR_DIM)
            title_word_chars = get_word_chars(table, char_embedding,
                    title_word_chars, title_word_char_lengths, TITLE_WORD_SIZE)
            content_word_chars = get_word_chars(table, char_embedding,
                    content_word_chars, content_word_char_lengths, CONTENT_WORD_SIZE)

    with tf.variable_scope("username"):
        table = tf.contrib.lookup.index_table_from_tensor(
                mapping=tf.constant(self.username_chars),
                default_value=len(self.username_chars))
        char_ids = table.lookup(username_chars)
        char_dict_size = len(self.username_chars) + 1 # add unknown char
        x = Embedding(char_dict_size, CHAR_DIM)(char_ids)
        mask = tf.sequence_mask(username_length, USERNAME_CHAR_SIZE, dtype=tf.float32)
        x = x * tf.expand_dims(mask, 2)

        if self.username_type == 'dense':
            username = tf.reshape(x, [-1, USERNAME_CHAR_SIZE * CHAR_DIM])
            username = dense(username, [30, 30])
        elif self.username_type == 'cnn':
            def conv_username(x, filters):
                k3 = tf.layers.conv1d(x, filters, 3)
                k3 = tf.nn.relu(k3)
                k3 = tf.layers.max_pooling1d(k3, 3, 3)
                k3 = tf.layers.conv1d(k3, filters, 3)
                k3 = tf.nn.relu(k3)

                k2 = tf.layers.conv1d(x, filters, 2)
                k2 = tf.nn.relu(k2)
                k2 = tf.layers.max_pooling1d(k2, 2, 2)
                k2 = tf.layers.conv1d(k2, filters, 2, strides=2)
                k2 = tf.nn.relu(k2)
                k2 = tf.layers.max_pooling1d(k2, 2, 2)

                k1 = tf.layers.conv1d(x, filters, 1)
                k1 = tf.nn.relu(k1)
                k1 = tf.layers.max_pooling1d(k1, 3, 3)
                k1 = tf.layers.conv1d(k1, filters, 2, strides=2)
                k1 = tf.nn.relu(k1)
                k1 = tf.layers.max_pooling1d(k1, 2, 2)

                x = tf.concat([k1, k2, k3], 2)
                x = tf.reshape(x, [-1, filters * 3])
                return tf.layers.batch_normalization(x, training=is_training)

            filters = 10
            #username = shallow_and_wide_cnn(x, filters, [1,2,3])
            username = conv_username(x, filters)
        elif self.username_type == 'rnn':
            outputs, last_states = stack_bidirectional_dynamic_rnn(x, [CHAR_DIM],
                    username_length, dropout_keep_prob=dropout_keep_prob,
                    cell_wrapper=self.rnn_cell_wrapper,
                    variational_recurrent=self.variational_recurrent,
                    base_cell=base_cell,
                    is_training=is_training)
            username = last_states
        elif self.username_type == 'none':
            username = None
        else:
            raise ValueError('Invalid username_type: %s' % self.username_type)


    with tf.variable_scope("user"):
        recent_articles_count = tf.minimum(recent_articles_count, 300)
        recent_articles_count = tf.expand_dims(recent_articles_count, 1)
        recent_articles_count = tf.to_int32(recent_articles_count)
        blocks = blocks_inline_to_matrix(blocks_inline)
        blocks = tf.minimum(blocks, 50)

        # `blocks` is currently excluded from the user features.
        user = tf.concat([recent_articles_count], 1)
        user = tf.cast(user, tf.float32)
        user = tf.layers.batch_normalization(user, training=is_training)
        user = dropout(user, dropout_keep_prob)

    with tf.variable_scope("category"):
        category_ids = tf.minimum(category_ids - 1, TOTAL_CATEGORIES_COUNT - 1)
        category = Embedding(TOTAL_CATEGORIES_COUNT, 10)(category_ids)
        category = dropout(category, dropout_keep_prob)

    with tf.variable_scope("continuous"):
        price = tf.minimum(price, 1000000000)
        title_length = tf.minimum(title_length, 100)
        content_length = tf.minimum(content_length, 3000)
        created_time = tf.mod(created_at_ts, DAY_TIME)
        day = tf.mod(tf.floordiv(created_at_ts, DAY_TIME), 7)

        continuous = tf.stack([price, images_count, title_length,
            content_length], 1)  # offerable, created_time, day currently excluded
        continuous = tf.cast(continuous, tf.float32)
        continuous = tf.concat([continuous, tf.square(continuous)], 1)
        continuous = tf.layers.batch_normalization(continuous, training=is_training)
        continuous = dropout(continuous, dropout_keep_prob)

    with tf.variable_scope("image"):
        image_embeddings = dense(image_embeddings, [256])

    with tf.variable_scope('bunch'):
      bunch = tf.concat([image_embeddings, category, continuous, user], 1)
      if self.username_type != 'none':
          bunch = tf.concat([bunch, username], 1)

    if self.args.word_char_type != 'none':
        word_dim = CHAR_WORD_DIM
    else:
        word_dim = WORD_DIM

    with tf.variable_scope('title'):
      initial_state = dense(bunch, [word_dim*2])
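      # Layer sizes double per stacked layer; the title RNN uses one fewer
      # layer than the content RNN below.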
      layer_sizes = [word_dim * (2**i) for i in range(max(1, self.rnn_layers_count-1))]
      title_embeddings = tf.reshape(title_embeddings, [-1, TITLE_WORD_SIZE, WORD_DIM])
      if self.args.word_char_type != 'none':
          title_embeddings = tf.concat([title_embeddings, title_word_chars], -1)
      title_outputs, title_last_states = stack_bidirectional_dynamic_rnn(title_embeddings, layer_sizes,
              title_words_count, initial_state=initial_state,
              cell_wrapper=self.rnn_cell_wrapper, variational_recurrent=self.variational_recurrent,
              base_cell=base_cell, dropout_keep_prob=dropout_keep_prob, is_training=is_training)

    with tf.variable_scope('content'):
      bunch = tf.concat([bunch, title_last_states], 1)
      initial_state = dense(bunch, [192, word_dim*2])

      layer_sizes = [word_dim * (2**i) for i in range(self.rnn_layers_count)]
      content_embeddings = tf.reshape(content_embeddings, [-1, CONTENT_WORD_SIZE, WORD_DIM])
      if self.args.word_char_type != 'none':
          content_embeddings = tf.concat([content_embeddings, content_word_chars], -1)
      content_outputs, content_last_states = stack_bidirectional_dynamic_rnn(content_embeddings, layer_sizes,
              content_words_count, initial_state=initial_state,
              cell_wrapper=self.rnn_cell_wrapper, variational_recurrent=self.variational_recurrent,
              base_cell=base_cell, dropout_keep_prob=dropout_keep_prob, is_training=is_training)

    with tf.variable_scope('final_ops'):
      hidden = tf.concat([bunch, content_last_states], 1)
      if self.final_layers_count > 0:
          hidden = dense(hidden, [192] + [64] * (self.final_layers_count-1))
      softmax, logits = self.add_final_training_ops(hidden, self.label_count)

    # Prediction is the index of the label with the highest score. We are
    # interested only in the top score.
    prediction = tf.argmax(logits, 1)
    tensors.predictions = [prediction, softmax]

    if graph_mod == GraphMod.PREDICT:
      return tensors

    def is_l2_var_name(name):
        for token in ['bias', 'table', 'BatchNorm']:
            if token in name:
                return False
        return True

    with tf.name_scope('evaluate'):
      loss_value = loss(logits, labels)
      #l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if is_l2_var_name(v.name) ])
      #loss_value += l2_loss * 0.001

    # Add to the Graph the Ops that calculate and apply gradients.
    if is_training:
      tensors.train, tensors.global_step = training(loss_value)
    else:
      tensors.global_step = tf.Variable(0, name='global_step', trainable=False)

    # Add means across all batches.
    loss_updates, loss_op = util.loss(loss_value)
    accuracy_updates, accuracy_op = util.accuracy(logits, labels)

    all_precision_op, all_precision_update = tf.metrics.precision(labels, prediction)
    all_recall_op, all_recall_update = tf.metrics.recall(labels, prediction)

    precision = {'ops': [], 'updates': []}
    recall = {'ops': [], 'updates': []}

    with tf.name_scope('metrics'):
        for i in range(self.label_count):
            op, update = tf.metrics.recall_at_k(labels, logits, 1, class_id=i)
            recall['ops'].append(op)
            recall['updates'].append(update)
            op, update = tf.metrics.precision_at_k(labels, logits, 1, class_id=i)
            precision['ops'].append(op)
            precision['updates'].append(update)

    if not is_training:
      tf.summary.scalar('accuracy', accuracy_op, family='general')
      tf.summary.scalar('loss', loss_op, family='general')
      tf.summary.scalar('precision', all_precision_op, family='general')
      tf.summary.scalar('recall', all_recall_op, family='general')
      for i in range(self.label_count):
          label_name = self.labels[i]
          tf.summary.scalar('%s' % label_name, recall['ops'][i], family='recall')
          tf.summary.scalar('%s' % label_name, precision['ops'][i], family='precision')

    tensors.metric_updates = loss_updates + accuracy_updates + \
            [all_precision_update, all_recall_update] + \
            recall['updates'] + precision['updates']
    tensors.metric_values = [loss_op, accuracy_op, all_precision_op, all_recall_op]
    return tensors
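In the training/eval branch, tensors.examples holds serialized tf.train.Example protos carrying the fields declared in feature_map. A hypothetical sketch of building one compatible record (only a few fields shown with made-up values; a real record must also populate every FixedLenFeature that has no default_value):

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'article-1'])),
    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[3])),
    'price': tf.train.Feature(int64_list=tf.train.Int64List(value=[15000])),
    'title_words_count': tf.train.Feature(int64_list=tf.train.Int64List(value=[7])),
}))
serialized = example.SerializeToString()  # one element of the tensors.examples batch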
Example #3
def multi_encoder(encoder_inputs, encoders, encoder_input_length, other_inputs=None, **kwargs):
    """
    Build multiple encoders according to the configuration in `encoders`, reading from `encoder_inputs`.
    The result is a list of the outputs produced by those encoders (for each time-step), and their final state.

    :param encoder_inputs: list of tensors of shape (batch_size, input_length), one tensor for each encoder.
    :param encoders: list of encoder configurations
    :param encoder_input_length: list of tensors of shape (batch_size,) (one tensor for each encoder)
    :return:
      encoder outputs: a list of tensors of shape (batch_size, input_length, encoder_cell_size), hidden states of the
        encoders.
      encoder state: concatenation of the final states of all encoders, tensor of shape (batch_size, sum_of_state_sizes)
      new_encoder_input_length: list of tensors of shape (batch_size,) with the true length of the encoder outputs.
        May be different than `encoder_input_length` because of maxout strides, and time pooling.
    """
    encoder_states = []
    encoder_outputs = []

    # create embeddings in the global scope (allows sharing between encoder and decoder)
    embedding_variables = []
    for encoder in encoders:
        if encoder.binary:
            embedding_variables.append(None)
            continue
        # inputs are token ids, which need to be mapped to vectors (embeddings)
        embedding_shape = [encoder.vocab_size, encoder.embedding_size]

        if encoder.embedding_initializer == 'sqrt3':
            initializer = tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3))
        else:
            initializer = None

        device = '/cpu:0' if encoder.embeddings_on_cpu else None
        # Embeddings can take a very large amount of memory, so storing them
        # in GPU memory can be impractical.
        with tf.device(device):
            embedding = get_variable('embedding_{}'.format(encoder.name), shape=embedding_shape,
                                     initializer=initializer)
        embedding_variables.append(embedding)

    new_encoder_input_length = []

    for i, encoder in enumerate(encoders):
        if encoder.use_lstm is False:
            encoder.cell_type = 'GRU'

        with tf.variable_scope('encoder_{}'.format(encoder.name)):
            encoder_inputs_ = encoder_inputs[i]
            encoder_input_length_ = encoder_input_length[i]

            def get_cell(input_size=None, reuse=False):
                if encoder.cell_type.lower() == 'lstm':
                    cell = CellWrapper(BasicLSTMCell(encoder.cell_size, reuse=reuse))
                elif encoder.cell_type.lower() == 'dropoutgru':
                    cell = DropoutGRUCell(encoder.cell_size, reuse=reuse, layer_norm=encoder.layer_norm,
                                          input_size=input_size, input_keep_prob=encoder.rnn_input_keep_prob,
                                          state_keep_prob=encoder.rnn_state_keep_prob)
                else:
                    cell = GRUCell(encoder.cell_size, reuse=reuse, layer_norm=encoder.layer_norm)

                if encoder.use_dropout and encoder.cell_type.lower() != 'dropoutgru':
                    cell = DropoutWrapper(cell, input_keep_prob=encoder.rnn_input_keep_prob,
                                          output_keep_prob=encoder.rnn_output_keep_prob,
                                          state_keep_prob=encoder.rnn_state_keep_prob,
                                          variational_recurrent=encoder.pervasive_dropout,
                                          dtype=tf.float32, input_size=input_size)
                return cell

            embedding = embedding_variables[i]

            batch_size = tf.shape(encoder_inputs_)[0]
            time_steps = tf.shape(encoder_inputs_)[1]

            if embedding is not None:
                flat_inputs = tf.reshape(encoder_inputs_, [tf.multiply(batch_size, time_steps)])
                flat_inputs = tf.nn.embedding_lookup(embedding, flat_inputs)
                encoder_inputs_ = tf.reshape(flat_inputs,
                                             tf.stack([batch_size, time_steps, flat_inputs.get_shape()[1].value]))

            if other_inputs is not None:
                encoder_inputs_ = tf.concat([encoder_inputs_, other_inputs], axis=2)

            if encoder.use_dropout:
                noise_shape = [1, time_steps, 1] if encoder.pervasive_dropout else [batch_size, time_steps, 1]
                encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.word_keep_prob,
                                                noise_shape=noise_shape)

                size = tf.shape(encoder_inputs_)[2]
                noise_shape = [1, 1, size] if encoder.pervasive_dropout else [batch_size, time_steps, size]
                encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.embedding_keep_prob,
                                                noise_shape=noise_shape)

            if encoder.input_layers:
                for j, layer_size in enumerate(encoder.input_layers):
                    if encoder.input_layer_activation is not None and encoder.input_layer_activation.lower() == 'relu':
                        activation = tf.nn.relu
                    else:
                        activation = tf.tanh

                    encoder_inputs_ = dense(encoder_inputs_, layer_size, activation=activation, use_bias=True,
                                            name='layer_{}'.format(j))
                    if encoder.use_dropout:
                        encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.input_layer_keep_prob)

            # Contrary to Theano's RNN implementation, states after the sequence length are zero
            # (while Theano repeats last state)
            inter_layer_keep_prob = None if not encoder.use_dropout else encoder.inter_layer_keep_prob

            parameters = dict(
                inputs=encoder_inputs_, sequence_length=encoder_input_length_,
                dtype=tf.float32, parallel_iterations=encoder.parallel_iterations
            )

            input_size = encoder_inputs_.get_shape()[2].value
            state_size = (encoder.cell_size * 2 if encoder.cell_type.lower() == 'lstm' else encoder.cell_size)

            def get_initial_state(name='initial_state'):
                if encoder.train_initial_states:
                    initial_state = get_variable(name, initializer=tf.zeros(state_size))
                    return tf.tile(tf.expand_dims(initial_state, axis=0), [batch_size, 1])
                else:
                    return None

            if encoder.bidir:
                rnn = lambda reuse: stack_bidirectional_dynamic_rnn(
                    cells_fw=[get_cell(input_size if j == 0 else 2 * encoder.cell_size, reuse=reuse)
                              for j in range(encoder.layers)],
                    cells_bw=[get_cell(input_size if j == 0 else 2 * encoder.cell_size, reuse=reuse)
                              for j in range(encoder.layers)],
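                    # note: the same initial-state tensor is shared by every layer below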
                    initial_states_fw=[get_initial_state('initial_state_fw')] * encoder.layers,
                    initial_states_bw=[get_initial_state('initial_state_bw')] * encoder.layers,
                    time_pooling=encoder.time_pooling, pooling_avg=encoder.pooling_avg,
                    **parameters)

                initializer = CellInitializer(encoder.cell_size) if encoder.orthogonal_init else None
                with tf.variable_scope(tf.get_variable_scope(), initializer=initializer):
                    try:
                        encoder_outputs_, _, encoder_states_ = rnn(reuse=False)
                    except ValueError:   # Multi-task scenario where we're reusing the same RNN parameters
                        encoder_outputs_, _, encoder_states_ = rnn(reuse=True)
            else:
                if encoder.time_pooling or encoder.final_state == 'concat_last':
                    raise NotImplementedError

                if encoder.layers > 1:
                    cell = MultiRNNCell([get_cell(input_size if j == 0 else encoder.cell_size)
                                         for j in range(encoder.layers)])
                    initial_state = (get_initial_state(),) * encoder.layers
                else:
                    cell = get_cell(input_size)
                    initial_state = get_initial_state()

                encoder_outputs_, encoder_states_ = auto_reuse(tf.nn.dynamic_rnn)(cell=cell,
                                                                                  initial_state=initial_state,
                                                                                  **parameters)

            last_backward = encoder_outputs_[:, 0, encoder.cell_size:]
            indices = tf.stack([tf.range(batch_size), encoder_input_length_ - 1], axis=1)
            last_forward = tf.gather_nd(encoder_outputs_[:, :, :encoder.cell_size], indices)
            last_forward.set_shape([None, encoder.cell_size])

            if encoder.final_state == 'concat_last': # concats last states of all backward layers (full LSTM states)
                encoder_state_ = tf.concat(encoder_states_, axis=1)
            elif encoder.final_state == 'average':
                mask = tf.sequence_mask(encoder_input_length_, maxlen=tf.shape(encoder_outputs_)[1], dtype=tf.float32)
                mask = tf.expand_dims(mask, axis=2)
                encoder_state_ = tf.reduce_sum(mask * encoder_outputs_, axis=1) / tf.reduce_sum(mask, axis=1)
            elif encoder.final_state == 'average_inputs':
                mask = tf.sequence_mask(encoder_input_length_, maxlen=tf.shape(encoder_inputs_)[1], dtype=tf.float32)
                mask = tf.expand_dims(mask, axis=2)
                encoder_state_ = tf.reduce_sum(mask * encoder_inputs_, axis=1) / tf.reduce_sum(mask, axis=1)
            elif encoder.bidir and encoder.final_state == 'last_both':
                encoder_state_ = tf.concat([last_forward, last_backward], axis=1)
            elif encoder.bidir and encoder.final_state != 'last_forward':   # last backward hidden state
                encoder_state_ = last_backward
            else:  # last forward hidden state
                encoder_state_ = last_forward

            if encoder.bidir and encoder.bidir_projection:
                encoder_outputs_ = dense(encoder_outputs_, encoder.cell_size, use_bias=False, name='bidir_projection')

            encoder_outputs.append(encoder_outputs_)
            encoder_states.append(encoder_state_)
            new_encoder_input_length.append(encoder_input_length_)

    encoder_state = tf.concat(encoder_states, 1)
    return encoder_outputs, encoder_state, new_encoder_input_length
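The `final_state == 'average'` branch above is ordinary masked mean pooling over time. A standalone sketch of just that computation (TF 1.x; toy shapes, values are illustrative):

import tensorflow as tf

outputs = tf.random_normal([2, 6, 4])        # [batch, time, cell_size]
lengths = tf.constant([3, 6])                # true sequence lengths

mask = tf.sequence_mask(lengths, maxlen=tf.shape(outputs)[1], dtype=tf.float32)
mask = tf.expand_dims(mask, axis=2)          # [batch, time, 1]
# Sum over the valid steps only, then divide by the count of valid steps.
state = tf.reduce_sum(mask * outputs, axis=1) / tf.reduce_sum(mask, axis=1)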