Example #1
    def cross_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor:
        assert self.cross_attention_sublayer is not None
        assert self.n_cross_att_heads is not None
        assert self.input_for_cross_attention is not None

        encoder_att_states = get_attention_states(
            self.input_for_cross_attention)
        encoder_att_mask = get_attention_mask(self.input_for_cross_attention)

        # Layer normalization
        normalized_queries = layer_norm(queries)

        encoder_context, _ = attention(
            queries=normalized_queries,
            keys=encoder_att_states,
            values=encoder_att_states,
            keys_mask=encoder_att_mask,
            num_heads=self.n_cross_att_heads,
            dropout_callback=lambda x: dropout(
                x, self.attention_dropout_keep_prob, self.train_mode),
            use_bias=self.use_att_transform_bias)

        # Apply dropout
        encoder_context = dropout(
            encoder_context, self.dropout_keep_prob, self.train_mode)

        # Add residual connections
        return encoder_context + queries
Example #2
    def self_attention_sublayer(
            self, prev_layer: TransformerLayer) -> tf.Tensor:
        """Create the decoder self-attention sublayer with output mask."""

        # Layer normalization
        normalized_states = layer_norm(prev_layer.temporal_states)

        # Run self-attention
        # TODO handle attention histories
        self_context, _ = attention(
            queries=normalized_states,
            keys=normalized_states,
            values=normalized_states,
            keys_mask=prev_layer.temporal_mask,
            num_heads=self.n_heads_self,
            masked=True,
            dropout_callback=lambda x: dropout(
                x, self.self_att_dropout_keep_prob, self.train_mode),
            use_bias=self.use_att_transform_bias)

        # Apply dropout
        self_context = dropout(
            self_context, self.dropout_keep_prob, self.train_mode)

        # Add residual connections
        return self_context + prev_layer.temporal_states
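Both sublayers above normalize their inputs with a layer_norm helper before attention. A minimal self-contained sketch, assuming standard layer normalization over the last dimension with a learned gain and bias (the actual helper in the source may differ):

    import tensorflow as tf

    def layer_norm(x: tf.Tensor, epsilon: float = 1e-6) -> tf.Tensor:
        """Normalize over the last dimension with learned gain and bias."""
        dim = x.get_shape()[-1].value
        gain = tf.get_variable(
            "gain", [dim], initializer=tf.ones_initializer())
        bias = tf.get_variable(
            "bias", [dim], initializer=tf.zeros_initializer())
        mean = tf.reduce_mean(x, axis=-1, keepdims=True)
        variance = tf.reduce_mean(
            tf.square(x - mean), axis=-1, keepdims=True)
        return gain * (x - mean) * tf.rsqrt(variance + epsilon) + bias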
Example #3
    def test_invalid_keep_prob(self):
        """Tests invalid dropout values"""

        var = tf.constant(np.arange(5))
        train_mode = tf.constant(True)

        for kprob in [-1, 2, 0]:
            with self.assertRaises(ValueError):
                dropout(var, kprob, train_mode)
Example #4
    def func(
            train_mode: tf.Tensor,
            rnn_size: int,
            encoders: List[TemporalStatefulWithOutput]) -> tf.Tensor:

        if len(encoders) != 1:
            raise ValueError("Exactly one encoder required for this type of "
                             "projection. {} given.".format(len(encoders)))
        encoder = encoders[0]

        # shape (batch, state_size)
        masked_sum = tf.reduce_sum(
            encoder.temporal_states
            * tf.expand_dims(encoder.temporal_mask, 2), 1)

        # shape (batch, 1)
        lengths = tf.reduce_sum(encoder.temporal_mask, 1, keepdims=True)

        means = masked_sum / lengths

        encoder_rnn_size = means.get_shape()[1].value

        # Use the orthogonal initializer only when the projection is square;
        # otherwise pass None to fall back to the default initializer.
        kernel_initializer = orthogonal_initializer()
        if encoder_rnn_size != rnn_size:
            kernel_initializer = None

        return dropout(
            tf.layers.dense(
                means, rnn_size, activation=tf.tanh,
                kernel_initializer=get_initializer(
                    "encoders_projection/kernel", kernel_initializer),
                name="encoders_projection"),
            dropout_keep_prob, train_mode)
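A quick sanity check of the masked mean above, using hypothetical toy values (not part of the source): the padded third time step contributes to neither the sum nor the length.

    import tensorflow as tf

    states = tf.constant([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])  # (1, 3, 2)
    mask = tf.constant([[1.0, 1.0, 0.0]])                         # (1, 3)

    masked_sum = tf.reduce_sum(states * tf.expand_dims(mask, 2), 1)
    lengths = tf.reduce_sum(mask, 1, keepdims=True)

    with tf.Session() as sess:
        print(sess.run(masked_sum / lengths))  # [[2. 3.]]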
Example #5
    def initial_state(self) -> tf.Tensor:
        """Compute initial decoder state.

        The part of the computation graph that computes
        the initial state of the decoder.
        """
        with tf.variable_scope("initial_state"):
            # pylint: disable=not-callable
            initial_state = dropout(
                self.encoder_projection(self.train_mode,
                                        self.rnn_size,
                                        self.encoders),
                self.dropout_keep_prob,
                self.train_mode)
            # pylint: enable=not-callable

            init_state_shape = initial_state.get_shape()

            # Broadcast the initial state to the whole batch if needed
            if len(init_state_shape) == 1:
                assert init_state_shape[0].value == self.rnn_size
                tiles = tf.tile(initial_state,
                                tf.expand_dims(self.batch_size, 0))
                initial_state = tf.reshape(tiles, [-1, self.rnn_size])

        return initial_state
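When the encoder projection returns a single vector of shape (rnn_size,), the tile-and-reshape above repeats it batch_size times, producing a (batch_size, rnn_size) initial state for the whole batch.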
Example #6
    def states(self) -> tf.Tensor:
        if self.hidden_dim is None:
            return self.concatenated_inputs
        states = tf.layers.dense(
            self.concatenated_inputs, self.hidden_dim, self.activation,
            name="hidden_layer")
        return dropout(states, self.dropout_keep_prob, self.train_mode)
Example #7
    def test_train_false(self):
        """Checks that dropout is not used when not training"""

        var = tf.ones([10000])
        s = tf.Session()

        dropped_var = dropout(var, 0.1, tf.constant(False))
        dropped_size = tf.reduce_sum(dropped_var)
        dsize = s.run(dropped_size)

        self.assertEqual(dsize, 10000)
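Together with Examples #3 and #15, this test pins down the contract of the dropout helper used throughout these examples: invalid keep probabilities raise ValueError eagerly, and outside train mode the input passes through unchanged. A minimal sketch consistent with those tests, assuming tf.cond gating (the actual implementation may differ):

    import tensorflow as tf

    def dropout(variable: tf.Tensor,
                keep_prob: float,
                train_mode: tf.Tensor) -> tf.Tensor:
        """Apply dropout in train mode only; validate keep_prob eagerly."""
        if keep_prob <= 0.0 or keep_prob > 1.0:
            raise ValueError(
                "keep_prob must be in (0, 1], got {}".format(keep_prob))
        if keep_prob == 1.0:
            # Nothing to drop; keep the graph free of no-op nodes.
            return variable
        # Choose between the dropped and identity branches at run time.
        return tf.cond(train_mode,
                       lambda: tf.nn.dropout(variable, keep_prob),
                       lambda: variable)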
Example #8
    def embedded_inputs(self) -> tf.Tensor:
        with tf.variable_scope("input_projection"):
            embedding_matrix = get_variable(
                "word_embeddings",
                [len(self.vocabulary), self.embedding_size],
                initializer=tf.variance_scaling_initializer(
                    mode="fan_avg", distribution="uniform"))
            return dropout(
                tf.nn.embedding_lookup(embedding_matrix, self.inputs),
                self.dropout_keep_prob,
                self.train_mode)
Example #9
    def attention(self,
                  query: tf.Tensor,
                  decoder_prev_state: tf.Tensor,
                  decoder_input: tf.Tensor,
                  loop_state: MultiHeadLoopState) -> Tuple[tf.Tensor,
                                                           MultiHeadLoopState]:
        """Run a multi-head attention getting context vector for a given query.

        This method is an API-wrapper for the global function 'attention'
        defined in this module. Transforms a query of shape(batch, query_size)
        to shape(batch, 1, query_size) and applies the attention function.
        Output context has shape(batch, 1, value_size) and weights
        have shape(batch, n_heads, 1, time(k)). The output is then processed
        to produce output vector of contexts and the following attention
        loop state.

        Arguments:
            query: Input query for the current decoding step
                of shape(batch, query_size).
            decoder_prev_state: Previous state of the decoder.
            decoder_input: Input to the RNN cell of the decoder.
            loop_state: Attention loop state.

        Returns:
            Vector of contexts and the following attention loop state.
        """

        context_3d, weights_4d = attention(
            queries=tf.expand_dims(query, 1),
            keys=self.attention_keys,
            values=self.attention_values,
            keys_mask=self.attention_mask,
            num_heads=self.n_heads,
            dropout_callback=lambda x: dropout(
                x, self.dropout_keep_prob, self.train_mode))

        # head_weights_3d: a head-wise list of tensors,
        # each of shape (batch, 1, 1, time(keys))
        head_weights_3d = tf.split(weights_4d, self.n_heads, axis=1)

        context = tf.squeeze(context_3d, axis=1)
        head_weights = [tf.squeeze(w, axis=[1, 2]) for w in head_weights_3d]

        next_contexts = tf.concat(
            [loop_state.contexts, tf.expand_dims(context, 0)], axis=0)
        next_head_weights = [
            tf.concat([loop_state.head_weights[i],
                       tf.expand_dims(head_weights[i], 0)], axis=0)
            for i in range(self.n_heads)]

        next_loop_state = MultiHeadLoopState(
            contexts=next_contexts,
            head_weights=next_head_weights)

        return context, next_loop_state
Example #10
    def encoder_inputs(self) -> tf.Tensor:
        inputs = self.input_sequence.temporal_states

        if self.target_space_id is not None:
            inputs += tf.reshape(self.target_modality_embedding, [1, 1, -1])

        length = tf.shape(inputs)[1]

        if self.use_positional_encoding:
            inputs += position_signal(self.model_dimension, length)

        return dropout(inputs, self.dropout_keep_prob, self.train_mode)
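Both this encoder input and the symbol embedding in Example #16 add a position_signal before dropout. A sketch assuming the sinusoidal timing signal of Vaswani et al. (2017) and an even model dimension; the real helper may be parameterized differently:

    import math
    import tensorflow as tf

    def position_signal(dimension: int, length: tf.Tensor) -> tf.Tensor:
        """Sinusoidal position encoding of shape (1, length, dimension)."""
        positions = tf.to_float(tf.range(length))
        num_timescales = dimension // 2
        log_increment = math.log(1.0e4) / max(num_timescales - 1, 1)
        inv_timescales = tf.exp(
            tf.to_float(tf.range(num_timescales)) * -log_increment)
        # Outer product, shape (length, num_timescales)
        scaled_time = (tf.expand_dims(positions, 1)
                       * tf.expand_dims(inv_timescales, 0))
        signal = tf.concat(
            [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        # Broadcastable against (batch, length, dimension) inputs
        return tf.expand_dims(signal, 0)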
Example #11
    def func(train_mode: tf.Tensor,
             rnn_size: int,
             encoders: List[Stateful]) -> tf.Tensor:

        if rnn_size is None:
            raise ValueError(
                "You must supply rnn_size for this type of encoder projection")

        en_concat = concat_encoder_projection(train_mode, None, encoders)

        return dropout(
            tf.layers.dense(en_concat, rnn_size, name="encoders_projection"),
            dropout_keep_prob, train_mode)
Example #12
    def feedforward_sublayer(self, layer_input: tf.Tensor) -> tf.Tensor:
        """Create the feed-forward network sublayer."""

        # Layer normalization
        normalized_input = layer_norm(layer_input)

        # Feed-forward network hidden layer + ReLU
        ff_hidden = tf.layers.dense(
            normalized_input, self.ff_hidden_size, activation=tf.nn.relu,
            name="hidden_state")

        # Apply dropout on the activations
        ff_hidden = dropout(ff_hidden, self.dropout_keep_prob, self.train_mode)

        # Feed-forward output projection
        ff_output = tf.layers.dense(ff_hidden, self.dimension, name="output")

        # Apply dropout on the output projection
        ff_output = dropout(ff_output, self.dropout_keep_prob, self.train_mode)

        # Add residual connections
        return ff_output + layer_input
Example #13
    def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]:
        layer_input = self.rnn_input  # type: tf.Tensor
        # pylint: disable=unsubscriptable-object
        layer_final = self.rnn_input[:, -1]
        # pylint: enable=unsubscriptable-object

        for i, rnn_spec in enumerate(self.rnn_specs):
            with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction),
                                   reuse=tf.AUTO_REUSE):

                if self.add_layer_norm:
                    layer_input = layer_norm(layer_input)

                layer_output, layer_final_output = rnn_layer(
                    layer_input, self.input_sequence.lengths, rnn_spec)

                layer_output = dropout(
                    layer_output, self.dropout_keep_prob, self.train_mode)
                layer_final_output = dropout(
                    layer_final_output, self.dropout_keep_prob,
                    self.train_mode)

                in_dim = layer_input.get_shape()[-1]
                out_dim = layer_output.get_shape()[-1]

                if self.add_residual and in_dim == out_dim:
                    layer_input += layer_output
                    layer_final += layer_final_output
                else:
                    # pylint: disable=redefined-variable-type
                    layer_input = layer_output
                    layer_final = layer_final_output
                    # pylint: enable=redefined-variable-type

        assert layer_final is not None
        if self.include_final_layer_norm:
            return layer_norm(layer_input), layer_norm(layer_final)
        return layer_input, layer_final
Example #14
    def input_plus_attention(self, *args) -> tf.Tensor:
        """Merge input and previous attentions.

        Input and previous attentions are merged into a single vector
        of the size of the embedding.
        """
        loop_state = LoopState(*args)
        feedables = loop_state.feedables
        emb_with_ctx = tf.concat(
            [feedables.embedded_input] + feedables.prev_contexts, 1)

        return dropout(
            tf.layers.dense(emb_with_ctx, self.embedding_size),
            self.dropout_keep_prob, self.train_mode)
Example #15
    def test_keep_prob(self):
        """Counts dropped items and compare with the expectation"""

        var = tf.ones([10000])
        s = tf.Session()

        for kprob in [0.1, 0.7]:
            dropped_var = dropout(var, kprob, tf.constant(True))
            dropped_size = tf.reduce_sum(
                tf.to_int32(tf.equal(dropped_var, 0.0)))

            dsize = s.run(dropped_size)

            expected_dropped_size = 10000 * (1 - kprob)

            self.assertTrue(np.isclose(expected_dropped_size, dsize, atol=500))
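With 10000 independent Bernoulli trials, the standard deviation of the dropped count is at most 50, so the atol=500 tolerance leaves a comfortable margin against flaky failures.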
Example #16
    def embed_input_symbol(self, inputs: tf.Tensor) -> tf.Tensor:
        embedded = tf.nn.embedding_lookup(self.embedding_matrix, inputs)

        if (self.embeddings_source is not None
                and self.embeddings_source.scale_embeddings_by_depth):

            # Pylint @property-related bug
            # pylint: disable=no-member
            embedding_size = self.embedding_matrix.shape.as_list()[-1]
            # pylint: enable=no-member

            embedded *= math.sqrt(embedding_size)

        length = tf.shape(inputs)[1]
        return dropout(embedded + position_signal(self.dimension, length),
                       self.dropout_keep_prob,
                       self.train_mode)
Example #17
    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        ctx_concat = tf.concat(ctx_tensors, 1)

        logit_rnn = tf.layers.dense(
            prev_state, output_size,
            kernel_initializer=get_initializer("rnn_state/kernel", None),
            name="rnn_state")

        logit_emb = tf.layers.dense(
            prev_output, output_size,
            kernel_initializer=get_initializer("prev_out/kernel", None),
            name="prev_out")

        logit_ctx = tf.layers.dense(
            ctx_concat, output_size,
            kernel_initializer=get_initializer("context/kernel", None),
            name="context")

        return dropout(activation_fn(logit_rnn + logit_emb + logit_ctx),
                       dropout_keep_prob, train_mode)
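This resembles a "deep output" readout: separate projections of the RNN state, the previous output embedding, and the concatenated attention contexts are summed, passed through activation_fn, and regularized with dropout.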
Example #18
    def logits(self) -> tf.Tensor:
        embeddings = self.embedded_sequence.embedding_matrix
        if not self.train_embeddings:
            embeddings = tf.stop_gradient(embeddings)

        states = self.states
        # pylint: disable=no-member
        states_dim = self.states.get_shape()[-1].value
        # pylint: enable=no-member
        embedding_dim = self.embedded_sequence.embedding_sizes[0]
        # pylint: disable=redefined-variable-type
        if states_dim != embedding_dim:
            states = tf.layers.dense(
                states, embedding_dim, name="project_for_embeddings")
            states = dropout(states, self.dropout_keep_prob, self.train_mode)
        # pylint: enable=redefined-variable-type

        reshaped_states = tf.reshape(states, [-1, embedding_dim])
        reshaped_logits = tf.matmul(
            reshaped_states, embeddings, transpose_b=True, name="logits")
        return tf.reshape(
            reshaped_logits, [self.batch_size, -1, len(self.vocabulary)])
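The output layer is tied to the input embeddings: logits come from a matmul with the (optionally gradient-stopped) embedding matrix, and the dense projection is inserted only to bridge a mismatch between the state and embedding dimensions.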
Example #19
    def callback(x: tf.Tensor) -> tf.Tensor:
        return dropout(x, prob, self.train_mode)
Example #20
    def _attention_tensor(self) -> tf.Tensor:
        return dropout(self.states, self.dropout_keep_prob, self.train_mode)
Example #21
    def rnn_input(self) -> tf.Tensor:
        return dropout(self.input_sequence.temporal_states,
                       self.dropout_keep_prob, self.train_mode)
Example #22
    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        state_out_ctx = tf.concat([prev_state, prev_output] + ctx_tensors, 1)
        return dropout(
            tf.layers.dense(
                state_out_ctx, output_size, activation=activation_fn),
            dropout_keep_prob, train_mode)
Example #23
    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
        state_out_ctx = tf.concat([prev_state, prev_output] + ctx_tensors, 1)
        return dropout(
            maxout(state_out_ctx, maxout_size),
            dropout_keep_prob, train_mode)
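Examples #22 and #23 differ only in the nonlinearity: a dense layer with activation_fn versus a maxout projection. A minimal sketch of a maxout helper, assuming the Goodfellow et al. (2013) formulation with a hypothetical pool of two pieces (the source's pool size is not shown):

    import tensorflow as tf

    def maxout(inputs: tf.Tensor, size: int, pool_size: int = 2) -> tf.Tensor:
        """Linear layer whose units take the max over pool_size pieces."""
        pieces = tf.layers.dense(inputs, size * pool_size, name="maxout")
        new_shape = tf.concat(
            [tf.shape(pieces)[:-1], [size, pool_size]], axis=0)
        return tf.reduce_max(tf.reshape(pieces, new_shape), axis=-1)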
Example #24
    def next_state(self, loop_state: LoopState) -> Tuple[tf.Tensor, Any, Any]:
        rnn_feedables = loop_state.feedables.other
        rnn_histories = loop_state.histories.other

        with tf.variable_scope(self.step_scope):
            rnn_input = self.input_projection(*loop_state)

            cell = self._get_rnn_cell()
            if self._rnn_cell_str in ["GRU", "NematusGRU"]:
                cell_output, next_state = cell(
                    rnn_input, rnn_feedables.prev_rnn_output)

                attns = [
                    a.attention(
                        cell_output, rnn_feedables.prev_rnn_output,
                        rnn_input, att_loop_state)
                    for a, att_loop_state in zip(
                        self.attentions,
                        rnn_histories.attention_histories)]
                if self.attentions:
                    contexts, att_loop_states = zip(*attns)
                else:
                    contexts, att_loop_states = [], []

                if self._conditional_gru:
                    cell_cond = self._get_conditional_gru_cell()
                    cond_input = tf.concat(contexts, -1)
                    cell_output, next_state = cell_cond(
                        cond_input, next_state, scope="cond_gru_2_cell")

            elif self._rnn_cell_str == "LSTM":
                prev_state = tf.contrib.rnn.LSTMStateTuple(
                    rnn_feedables.prev_rnn_state,
                    rnn_feedables.prev_rnn_output)
                cell_output, state = cell(rnn_input, prev_state)
                next_state = state.c
                attns = [
                    a.attention(
                        cell_output, rnn_feedables.prev_rnn_output,
                        rnn_input, att_loop_state)
                    for a, att_loop_state in zip(
                        self.attentions,
                        rnn_histories.attention_histories)]
                if self.attentions:
                    contexts, att_loop_states = zip(*attns)
                else:
                    contexts, att_loop_states = [], []
            else:
                raise ValueError("Unknown RNN cell.")

            # TODO: attention functions should apply dropout on output
            #       themselves before returning the tensors
            contexts = [dropout(ctx, self.dropout_keep_prob, self.train_mode)
                        for ctx in list(contexts)]
            cell_output = dropout(
                cell_output, self.dropout_keep_prob, self.train_mode)

            with tf.name_scope("rnn_output_projection"):
                if self.embedding_size != self.output_dimension:
                    raise ValueError(
                        "The dimension ({}) of the output projection must be "
                        "the same as the dimension of the input embedding "
                        "({})".format(self.output_dimension,
                                      self.embedding_size))
                # pylint: disable=not-callable
                output = self.output_projection(
                    cell_output, loop_state.feedables.embedded_input,
                    list(contexts), self.train_mode)
                # pylint: enable=not-callable

        new_feedables = RNNFeedables(
            prev_rnn_state=next_state,
            prev_rnn_output=cell_output,
            prev_contexts=list(contexts))

        new_histories = RNNHistories(
            rnn_outputs=append_tensor(rnn_histories.rnn_outputs, cell_output),
            attention_histories=list(att_loop_states))

        return (output, new_feedables, new_histories)
Example #25
    def embed_input_symbols(self, input_symbols: tf.Tensor) -> tf.Tensor:
        embedded_input = tf.nn.embedding_lookup(
            self.embedding_matrix, input_symbols)
        return dropout(embedded_input, self.dropout_keep_prob, self.train_mode)
Example #26
    def _attention_states_dropped(self) -> tf.Tensor:
        return dropout(get_attention_states(self.input_sequence),
                       self.dropout_keep_prob, self.train_mode)
Example #27
    def callback(x: tf.Tensor) -> tf.Tensor:
        return dropout(x, prob, self.train_mode)
Example #28
    def attention_states(self) -> tf.Tensor:
        return dropout(get_attention_states(self.encoder),
                       self.dropout_keep_prob,
                       self.train_mode)
Example #29
    def _logit_function(self, state: tf.Tensor) -> tf.Tensor:
        state = dropout(state, self.dropout_keep_prob, self.train_mode)
        return tf.matmul(state, self.decoding_w) + self.decoding_b
Example #30
    def embed_input_symbol(self, *args) -> tf.Tensor:
        loop_state = LoopState(*args)
        embedded_input = tf.nn.embedding_lookup(
            self.embedding_matrix, loop_state.feedables.input_symbol)

        return dropout(embedded_input, self.dropout_keep_prob, self.train_mode)
Example #31
    def rnn_input(self) -> tf.Tensor:
        return dropout(self.input_sequence.temporal_states,
                       self.dropout_keep_prob, self.train_mode)
Example #32
    def __init__(self,
                 name: str,
                 vocabulary: Vocabulary,
                 data_id: str,
                 embedding_size: int,
                 filters: List[Tuple[int, int]],
                 max_input_len: Optional[int] = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: Optional[str] = None,
                 load_checkpoint: Optional[str] = None) -> None:
        """Creates a new instance of the CNN sequence encoder.

        Based on: Yoon Kim: Convolutional Neural Networks for Sentence
        Classification (http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf)

        Arguments:
            vocabulary: Input vocabulary
            data_id: Identifier of the data series fed to this encoder
            name: A unique identifier for this encoder
            max_input_len: Maximum length of an encoded sequence
            embedding_size: The size of the embedding vector assigned
                to each word
            filters: Specification of CNN filters. It is a list of tuples
                specifying the filter size and number of channels.
            dropout_keep_prob: The dropout keep probability
                (default 1.0)
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)

        assert check_argument_types()

        self.vocabulary = vocabulary
        self.data_id = data_id
        self.max_input_len = max_input_len

        with tf.variable_scope(self.name):
            self.train_mode = tf.placeholder(tf.bool,
                                             shape=[],
                                             name="mode_placeholder")

            self.inputs = tf.placeholder(tf.int32,
                                         shape=[None, None],
                                         name="encoder_input")

            self._input_mask = tf.placeholder(tf.float32,
                                              shape=[None, None],
                                              name="encoder_padding")

            with tf.variable_scope("input_projection"):
                self.embedding_matrix = tf.get_variable(
                    "word_embeddings", [len(vocabulary), embedding_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                embedded_inputs = dropout(
                    tf.nn.embedding_lookup(self.embedding_matrix, self.inputs),
                    dropout_keep_prob, self.train_mode)

            pooled_outputs = []
            for filter_size, num_filters in filters:
                with tf.variable_scope("conv-maxpool-%s" % filter_size):
                    # Convolution Layer
                    filter_shape = [filter_size, embedding_size, num_filters]
                    w_filter = tf.get_variable(
                        "conv_W",
                        filter_shape,
                        initializer=tf.random_uniform_initializer(-0.5, 0.5))
                    b_filter = tf.get_variable(
                        "conv_bias", [num_filters],
                        initializer=tf.constant_initializer(0.0))
                    conv = tf.nn.conv1d(embedded_inputs,
                                        w_filter,
                                        stride=1,
                                        padding="VALID",
                                        name="conv")

                    # Apply nonlinearity
                    conv_relu = tf.nn.relu(tf.nn.bias_add(conv, b_filter))

                    # Max-pooling over the outputs
                    pooled = tf.reduce_max(conv_relu, 1)
                    pooled_outputs.append(pooled)

            # Combine all the pooled features
            self.encoded = tf.concat(pooled_outputs, axis=1)
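For each (filter_size, num_filters) pair, the VALID convolution yields shape (batch, time - filter_size + 1, num_filters); max-pooling over time reduces this to (batch, num_filters), so self.encoded has shape (batch, sum of all num_filters).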
Example #33
    def __init__(self,
                 name: str,
                 data_id: str,
                 rnn_size: int,
                 input_dimension: int,
                 max_input_len: Optional[int] = None,
                 dropout_keep_prob: float = 1.0,
                 attention_type: Optional[Any] = None,
                 save_checkpoint: Optional[str] = None,
                 load_checkpoint: Optional[str] = None) -> None:
        """Creates a new instance of the encoder.

        Arguments:
            data_id: Identifier of the data series fed to this encoder
            name: A unique identifier for this encoder
            rnn_size: The size of the encoder's hidden state. Note
                that the actual encoder output state size will be
                twice as long because it is the result of
                concatenation of forward and backward hidden states.

        Keyword arguments:
            dropout_keep_prob: The dropout keep probability
                (default 1.0)
            attention_type: The class that is used for creating
                attention mechanism (default None)
        """
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
        Attentive.__init__(self, attention_type)

        assert check_argument_types()

        self.data_id = data_id

        self.rnn_size = rnn_size
        self.max_input_len = max_input_len
        self.input_dimension = input_dimension
        self.dropout_keep_p = dropout_keep_prob

        log("Initializing RNN encoder, name: '{}'".format(self.name))

        with tf.variable_scope(self.name):
            self._create_input_placeholders()

            self._input_mask = tf.sequence_mask(self._input_lengths,
                                                dtype=tf.float32)

            fw_cell, bw_cell = self.rnn_cells()  # type: RNNCellTuple
            outputs_bidi_tup, encoded_tup = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                self.inputs,
                self._input_lengths,
                dtype=tf.float32)

            self.hidden_states = tf.concat(outputs_bidi_tup, 2)

            with tf.variable_scope("attention_tensor"):
                self.__attention_tensor = dropout(self.hidden_states,
                                                  self.dropout_keep_p,
                                                  self.train_mode)

            self.encoded = tf.concat(encoded_tup, 1)

        log("RNN encoder initialized")
Example #34
    def _attention_states_dropped(self) -> tf.Tensor:
        return dropout(get_attention_states(self.input_sequence),
                       self.dropout_keep_prob, self.train_mode)
Example #35
    def attention_states(self) -> tf.Tensor:
        return dropout(get_attention_states(self.encoder),
                       self.dropout_keep_prob, self.train_mode)
Example #36
    def __init__(self,
                 name: str,
                 data_id: str,
                 input_size: int,
                 rnn_layers: List[RNNSpecTuple],
                 max_input_len: Optional[int] = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: Optional[str] = None,
                 load_checkpoint: Optional[str] = None) -> None:
        """Create a new instance of the encoder.

        Arguments:
            data_id: Identifier of the data series fed to this encoder
            name: A unique identifier for this encoder
            rnn_layers: A list of tuples specifying the size and, optionally,
                the direction ('forward', 'backward' or 'bidirectional')
                and cell type ('GRU' or 'LSTM') of each RNN layer.

        Keyword arguments:
            dropout_keep_prob: The dropout keep probability
                (default 1.0)
        """
        check_argument_types()
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)

        self.data_id = data_id

        self._rnn_layers = [_make_rnn_spec(*r) for r in rnn_layers]
        self.max_input_len = max_input_len
        self.input_size = input_size
        self.dropout_keep_prob = dropout_keep_prob

        log("Initializing RNN encoder, name: '{}'".format(self.name))

        with self.use_scope():
            self._create_input_placeholders()

            self.states_mask = tf.sequence_mask(self._input_lengths,
                                                dtype=tf.float32)

            states = self.inputs
            states_reversed = False

            def reverse_states():
                nonlocal states, states_reversed
                states = tf.reverse_sequence(states,
                                             self._input_lengths,
                                             batch_dim=0,
                                             seq_dim=1)
                states_reversed = not states_reversed

            for i, layer in enumerate(self._rnn_layers):
                with tf.variable_scope("rnn_{}_{}".format(i, layer.direction)):
                    cell = _make_rnn_cell(layer)
                    if layer.direction == "bidirectional":
                        outputs_tup, encoded_tup = (
                            tf.nn.bidirectional_dynamic_rnn(
                                cell(),
                                cell(),
                                states,
                                self._input_lengths,
                                dtype=tf.float32))

                        if states_reversed:
                            # treat forward as backward and vice versa
                            outputs_tup = tuple(reversed(outputs_tup))
                            encoded_tup = tuple(reversed(encoded_tup))
                            states_reversed = False

                        states = tf.concat(outputs_tup, 2)
                        encoded = tf.concat(encoded_tup, 1)
                    elif layer.direction in ["forward", "backward"]:
                        should_be_reversed = (layer.direction == "backward")
                        if states_reversed != should_be_reversed:
                            reverse_states()

                        states, encoded = tf.nn.dynamic_rnn(
                            cell(),
                            states,
                            sequence_length=self._input_lengths,
                            dtype=tf.float32)
                    else:
                        raise ValueError("Unknown RNN direction {}".format(
                            layer.direction))

                if i < len(self._rnn_layers) - 1:
                    states = dropout(states, self.dropout_keep_prob,
                                     self.train_mode)

            if states_reversed:
                reverse_states()

            self.hidden_states = states
            self.encoded = encoded

        log("RNN encoder initialized")