Example #1
 def _build_kernel(self, clp_weights_init):
   """
   Builds the CLP kernel variable of shape (2, input_dim // 2, nr_of_filters).
   :param str|dict|float clp_weights_init: initializer spec, as understood by TFUtil.get_initializer
   :rtype: tf.Variable
   """
   from TFUtil import get_initializer
   input_placeholder = self.input_data.get_placeholder_as_batch_major()
   kernel_width = input_placeholder.shape[2].value // 2  # half of the input feature dim
   kernel_height = self._nr_of_filters
   with self.var_creation_scope():
     clp_weights_initializer = get_initializer(
       clp_weights_init, seed=self.network.random.randint(2 ** 31), eval_local_ns={"layer": self})
     clp_kernel = self.add_param(tf.get_variable(
       name="clp_kernel", shape=(2, kernel_width, kernel_height), dtype=tf.float32,
       initializer=clp_weights_initializer))
   return clp_kernel
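For context, the kernel built above has shape (2, kernel_width, kernel_height), where kernel_width is half of the input feature dimension; the two leading slices presumably hold the real and imaginary parts of a complex-valued linear projection (the rest of the layer is not shown here). Below is a minimal standalone sketch of the same shape logic in plain TF1, assuming hypothetical sizes n_in and nr_of_filters and a fixed glorot_uniform initializer in place of TFUtil.get_initializer:

import tensorflow as tf

n_in = 512          # hypothetical input feature dim; input is batch-major [B, T, n_in]
nr_of_filters = 32  # hypothetical number of CLP filters

kernel_width = n_in // 2  # feature dim split into real/imaginary halves
clp_kernel = tf.get_variable(
    name="clp_kernel",
    shape=(2, kernel_width, nr_of_filters),  # [real/imag, width, filters]
    dtype=tf.float32,
    initializer=tf.glorot_uniform_initializer(seed=42))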
Example #2
 def _build_kernel(self, clp_weights_init):
     from TFUtil import get_initializer
     input_placeholder = self.input_data.get_placeholder_as_batch_major()
     kernel_width = input_placeholder.shape[2].value // 2
     kernel_height = self._nr_of_filters
     with self.var_creation_scope():
         clp_weights_initializer = get_initializer(
             clp_weights_init,
             seed=self.network.random.randint(2**31),
             eval_local_ns={"layer": self})
         clp_kernel = self.add_param(
             tf.get_variable(name="clp_kernel",
                             shape=(2, kernel_width, kernel_height),
                             dtype=tf.float32,
                             initializer=clp_weights_initializer))
     return clp_kernel
    def __init__(self,
                 transducer_hidden_units,
                 n_out,
                 transducer_max_width,
                 input_block_size,
                 embedding_size,
                 e_symbol_index,
                 use_prev_state_as_start=False,
                 **kwargs):
        """
        Initialize the Neural Transducer.
        :param int transducer_hidden_units: Amount of units the transducer should have.
        :param int n_out: The size of the output layer, i.e. the size of the vocabulary including <E> symbol.
        :param int transducer_max_width: The max amount of outputs in one NT block (including the final <E> symbol)
        :param int input_block_size: Amount of inputs to use for each NT block.
        :param int embedding_size: Embedding dimension size.
        :param int e_symbol_index: Index of e symbol that is used in the NT block. 0 <= e_symbol_index < num_outputs
        :param bool use_prev_state_as_start: Whether to use the last state of the previous recurrent layer as the
        initial state of the transducer. NOTE: For this to work, make sure that
        previous_layer.hidden_units = previous_layer.n_out = transducer.transducer_hidden_units
        """

        super(NeuralTransducerLayer, self).__init__(**kwargs)

        # TODO: Build optimized version

        # Get embedding
        from TFUtil import get_initializer
        initializer = get_initializer('glorot_uniform',
                                      seed=self.network.random.randint(2**31),
                                      eval_local_ns={"layer": self})
        embeddings = self.add_param(
            tf.get_variable(
                name='nt_embedding',
                shape=[n_out, embedding_size],
                dtype=tf.float32,
                initializer=initializer),
            trainable=True, saveable=True)

        # Ensure encoder is time major
        encoder_outputs = self.input_data.get_placeholder_as_time_major()

        # Pad encoder outputs with zeros so that the time length is cleanly divisible by input_block_size
        batch_size = tf.shape(encoder_outputs)[1]
        time_length_to_append = input_block_size - tf.mod(
            tf.shape(encoder_outputs)[0],
            input_block_size)  # note: appends a full block if already divisible
        padding_tensor = tf.zeros(
            [time_length_to_append, batch_size,
             tf.shape(encoder_outputs)[2]],
            dtype=tf.float32)
        encoder_outputs = tf.concat([encoder_outputs, padding_tensor], axis=0)
        # Do assertions
        assert 0 <= e_symbol_index < n_out, 'NT: E symbol outside possible outputs!'

        # Get prev state as start state
        last_hidden = None
        if use_prev_state_as_start and isinstance(self.sources[0], RecLayer):
            # TODO: add better checking whether the settings are correct
            last_hidden_c = self.sources[0].get_last_hidden_state(
                '*')  # Get last c after all blocks
            last_hidden_h = encoder_outputs[
                input_block_size - 1]  # Get last hidden after the first block

            # Padding so that last hidden_c & _h are the same (this is needed for when using BiLSTM)
            c_shape = tf.shape(last_hidden_c)
            h_shape = tf.shape(last_hidden_h)
            padding = tf.zeros([c_shape[0], h_shape[1] - c_shape[1]])
            last_hidden_c = tf.concat([last_hidden_c, padding], axis=1)

            last_hidden = tf.stack([last_hidden_c, last_hidden_h], axis=0)

        # Note down data
        self.transducer_hidden_units = transducer_hidden_units
        self.num_outputs = n_out
        self.transducer_max_width = transducer_max_width
        self.input_block_size = input_block_size
        self.e_symbol_index = e_symbol_index

        # self.output.placeholder is of shape [transducer_max_width * amount_of_blocks, batch_size, n_out]
        self.output.placeholder = self.build_full_transducer(
            transducer_hidden_units=transducer_hidden_units,
            embeddings=embeddings,
            num_outputs=n_out,
            input_block_size=input_block_size,
            transducer_max_width=transducer_max_width,
            encoder_outputs=encoder_outputs,
            trans_hidden_init=last_hidden)

        # Set correct logit lengths
        output_size = self.round_vector_to_closest_input_block(
            vector=self.input_data.size_placeholder[0],
            input_block_size=input_block_size,
            transducer_max_width=transducer_max_width)

        # Set shaping info
        self.output.size_placeholder = {0: output_size}
        self.output.time_dim_axis = 0
        self.output.batch_dim_axis = 1

        # Add all trainable params
        with self.var_creation_scope() as scope:
            self._add_all_trainable_params(
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=scope.name))
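One detail of the padding arithmetic in __init__ above is worth spelling out: because time_length_to_append is computed as input_block_size - tf.mod(time, input_block_size), an input whose time length is already a multiple of input_block_size still gets a full extra block of zeros appended. A small worked example, assuming input_block_size = 5:

# time length 12: mod = 2, append 3 -> padded length 15 (3 blocks)
# time length 10: mod = 0, append 5 -> padded length 15 (3 blocks, one all-zero)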
    def build_full_transducer(self, transducer_hidden_units, embeddings,
                              num_outputs, input_block_size,
                              transducer_max_width, encoder_outputs,
                              trans_hidden_init):
        """
        Builds the complete transducer.
        :param int transducer_hidden_units:  Amount of units the transducer should have.
        :param tf.Variable embeddings: Variable with the reference to the embeddings.
        :param int num_outputs: The size of the output layer, i.e. the size of the vocabulary including <E> symbol.
        :param int input_block_size: Amount of inputs to use for each NT block.
        :param int transducer_max_width: The max amount of outputs in one NT block (including the final <E> symbol)
        :param tf.Tensor encoder_outputs: The outputs of the encoder, of shape [max_time, batch_size, encoder_hidden].
        :param tf.Tensor trans_hidden_init: The init state of the transducer. Needs to be of shape
        [2, batch_size, transducer_hidden_units]. trans_hidden_init[0] is the c vector of the LSTM,
        trans_hidden_init[1] the hidden (h) vector.
        :return: A reference to the tf.Tensor containing the logits.
        :rtype: tf.Tensor
        """

        with self.var_creation_scope():
            # Get meta variables
            batch_size = tf.shape(encoder_outputs)[1]
            if trans_hidden_init is None:
                trans_hidden_init = tf.zeros(
                    [2, batch_size, transducer_hidden_units], dtype=tf.float32)

            # Derived sizes: number of NT blocks after padding, per-block output
            # budgets, and dummy teacher-forcing targets (inference_mode = 1.0
            # means outputs are always sampled, never read from the targets).
            max_blocks = tf.to_int32(
                tf.shape(encoder_outputs)[0] / input_block_size)
            transducer_list_outputs = tf.ones(
                [max_blocks, batch_size],
                dtype=tf.int32) * transducer_max_width
            inference_mode = 1.0
            teacher_forcing_targets = tf.ones(
                [transducer_max_width * max_blocks, batch_size],
                dtype=tf.int32)

            # Process teacher forcing targets
            teacher_forcing_targets_emb = tf.nn.embedding_lookup(
                embeddings, teacher_forcing_targets)

            # Outputs
            outputs_ta = tf.TensorArray(dtype=tf.float32,
                                        size=max_blocks,
                                        infer_shape=False)
            init_state = (0, outputs_ta, trans_hidden_init, 0)

            # Init the transducer cell
            from TFUtil import get_initializer
            transducer_cell_initializer = get_initializer(
                'glorot_uniform',
                seed=self.network.random.randint(2**31),
                eval_local_ns={"layer": self})
            transducer_cell = tf.contrib.rnn.LSTMCell(
                transducer_hidden_units,
                initializer=transducer_cell_initializer)

            def cond(current_block, outputs_int, trans_hidden, total_output):
                return current_block < max_blocks

            def body(current_block, outputs_int, trans_hidden, total_output):

                # --------------------- TRANSDUCER --------------------------------------------------------------------
                # Each transducer block runs for the max transducer outputs in its respective block

                encoder_raw_outputs = encoder_outputs[
                    input_block_size * current_block:
                    input_block_size * (current_block + 1)]

                encoder_raw_outputs = tf.where(
                    tf.is_nan(encoder_raw_outputs),
                    tf.zeros_like(encoder_raw_outputs), encoder_raw_outputs)

                trans_hidden = tf.where(tf.is_nan(trans_hidden),
                                        tf.zeros_like(trans_hidden),
                                        trans_hidden)

                # Save/load the state as one tensor, use top encoder layer state as init if this is the first block
                trans_hidden_state = trans_hidden
                transducer_amount_outputs = transducer_list_outputs[
                    current_block]
                transducer_max_output = tf.reduce_max(
                    transducer_amount_outputs)

                # Model building
                helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs=teacher_forcing_targets_emb[
                        total_output:total_output +
                        transducer_max_output],  # Get the current target inputs
                    sequence_length=transducer_amount_outputs,
                    embedding=embeddings,
                    sampling_probability=inference_mode,
                    time_major=True)

                attention_states = tf.transpose(
                    encoder_raw_outputs,
                    [1, 0, 2])  # attention_states: [batch_size, max_time, num_enc_units]
                attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                    transducer_hidden_units, attention_states)

                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    transducer_cell,
                    attention_mechanism,
                    attention_layer_size=transducer_hidden_units)

                from tensorflow.python.layers import core as layers_core
                projection_layer = layers_core.Dense(num_outputs,
                                                     use_bias=False)

                # Build previous state
                trans_hidden_c, trans_hidden_h = tf.split(trans_hidden_state,
                                                          num_or_size_splits=2,
                                                          axis=0)
                trans_hidden_c = tf.reshape(
                    trans_hidden_c, shape=[-1, transducer_hidden_units])
                trans_hidden_h = tf.reshape(
                    trans_hidden_h, shape=[-1, transducer_hidden_units])
                from tensorflow.contrib.rnn import LSTMStateTuple
                trans_hidden_state_t = LSTMStateTuple(trans_hidden_c,
                                                      trans_hidden_h)

                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell,
                    helper,
                    decoder_cell.zero_state(
                        batch_size,
                        tf.float32).clone(cell_state=trans_hidden_state_t),
                    output_layer=projection_layer)
                outputs, transducer_hidden_state_new, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder,
                    output_time_major=True,
                    maximum_iterations=transducer_max_output)
                logits = outputs.rnn_output  # logits of shape [max_time, batch_size, vocab_size]

                # Repack the new LSTM state (c and h) into a single
                # [2, batch_size, transducer_hidden_units] tensor so that it can
                # be fed back in on the next iteration.
                transducer_hidden_state_new = tf.concat(
                    [transducer_hidden_state_new[0].c,
                     transducer_hidden_state_new[0].h],
                    axis=0)
                transducer_hidden_state_new = tf.reshape(
                    transducer_hidden_state_new,
                    shape=[2, -1, transducer_hidden_units])

                # Note the outputs
                outputs_int = outputs_int.write(current_block, logits)

                return (current_block + 1, outputs_int,
                        transducer_hidden_state_new,
                        total_output + transducer_max_output)

            _, outputs_final, _, _ = tf.while_loop(cond,
                                                   body,
                                                   init_state,
                                                   parallel_iterations=1)

            # Process outputs
            with tf.device('/cpu:0'):
                # Concatenate the per-block logits: [max_output_time, batch_size, num_outputs]
                logits = outputs_final.concat()

            # For loading the model later on
            logits = tf.identity(logits, name='logits')

        return logits
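Throughout build_full_transducer the LSTM state travels as a single packed tensor of shape [2, batch_size, transducer_hidden_units], with the c vector at index 0 and the h vector at index 1 (as stated in the __init__ docstring), and is converted to and from an LSTMStateTuple inside the loop body. A minimal standalone sketch of that round trip, with hypothetical sizes:

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMStateTuple

batch_size, units = 4, 8                    # hypothetical sizes
packed = tf.zeros([2, batch_size, units])   # packed[0] = c, packed[1] = h

# Unpack: the same split/reshape used at the top of body() above
c, h = tf.split(packed, num_or_size_splits=2, axis=0)
state = LSTMStateTuple(tf.reshape(c, [-1, units]),
                       tf.reshape(h, [-1, units]))

# Repack: the same concat/reshape applied after dynamic_decode
packed_again = tf.reshape(tf.concat([state.c, state.h], axis=0),
                          [2, -1, units])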