def _build_kernel(self, clp_weights_init):
    """
    Create the ``clp_kernel`` variable of this layer and register it as a layer parameter.

    The variable has shape ``(2, kernel_width, kernel_height)``, where ``kernel_width`` is half
    (integer division) of the static size of axis 2 of the batch-major input placeholder and
    ``kernel_height`` is ``self._nr_of_filters``.

    :param clp_weights_init: initializer specification, forwarded unchanged to ``TFUtil.get_initializer``
    :return: whatever ``self.add_param`` returns for the newly created variable
    """
    from TFUtil import get_initializer

    batch_major_input = self.input_data.get_placeholder_as_batch_major()
    # ``.value`` reads the size from the static shape, so axis 2 has to be known
    # when the graph is built.
    half_input_dim = batch_major_input.shape[2].value // 2
    num_filters = self._nr_of_filters

    with self.var_creation_scope():
        # The seed is drawn from the network-level random generator.
        init_seed = self.network.random.randint(2 ** 31)
        kernel_initializer = get_initializer(
            clp_weights_init, seed=init_seed, eval_local_ns={"layer": self})
        kernel_variable = tf.get_variable(
            name="clp_kernel",
            shape=(2, half_input_dim, num_filters),
            dtype=tf.float32,
            initializer=kernel_initializer)
        registered_kernel = self.add_param(kernel_variable)
    return registered_kernel
def _build_kernel(self, clp_weights_init):
    """
    Build the ``clp_kernel`` variable and add it to the parameters of this layer.

    Shape of the variable: ``(2, W, H)`` with ``W = input.shape[2] // 2`` (static size of axis 2
    of the batch-major input) and ``H = self._nr_of_filters``. The dtype is ``tf.float32``.

    :param clp_weights_init: initializer specification handed to ``TFUtil.get_initializer``
    :return: the result of ``self.add_param`` for the created variable
    """
    from TFUtil import get_initializer

    placeholder = self.input_data.get_placeholder_as_batch_major()
    # Static dimension lookup: axis 2 must have a known size at graph-construction time.
    width = placeholder.shape[2].value // 2
    height = self._nr_of_filters
    kernel_shape = (2, width, height)

    with self.var_creation_scope():
        weights_initializer = get_initializer(
            clp_weights_init,
            seed=self.network.random.randint(2 ** 31),  # seed comes from the network's RNG
            eval_local_ns={"layer": self})
        kernel = self.add_param(
            tf.get_variable(
                name="clp_kernel",
                shape=kernel_shape,
                dtype=tf.float32,
                initializer=weights_initializer))
    return kernel
def __init__(self, transducer_hidden_units, n_out, transducer_max_width, input_block_size,
             embedding_size, e_symbol_index, use_prev_state_as_start=False, **kwargs):
    """
    Initialize the Neural Transducer.

    :param int transducer_hidden_units: Amount of units the transducer should have.
    :param int n_out: The size of the output layer, i.e. the size of the vocabulary including <E> symbol.
    :param int transducer_max_width: The max amount of outputs in one NT block (including the final <E> symbol)
    :param int input_block_size: Amount of inputs to use for each NT block.
    :param int embedding_size: Embedding dimension size.
    :param int e_symbol_index: Index of e symbol that is used in the NT block. 0 <= e_symbol_index < num_outputs
    :param bool use_prev_state_as_start: Whether to use the last state of the previous recurrent layer as the
        initial state of the transducer. NOTE: For this to work, you have to watch out for:
        previous_layer.hidden_units = previous_layer.n_out = transducer.transducer_hidden_units
    """
    super(NeuralTransducerLayer, self).__init__(**kwargs)

    # TODO: Build optimized version

    # Embedding matrix of shape [n_out, embedding_size], seeded from the network RNG.
    from TFUtil import get_initializer
    embedding_seed = self.network.random.randint(2**31)
    embedding_initializer = get_initializer(
        'glorot_uniform', seed=embedding_seed, eval_local_ns={"layer": self})
    embedding_variable = tf.get_variable(
        shape=[n_out, embedding_size],
        dtype=tf.float32,
        initializer=embedding_initializer,
        name='nt_embedding')
    embeddings = self.add_param(embedding_variable, trainable=True, saveable=True)

    # Work on the time-major view of the encoder output.
    encoder_outputs = self.input_data.get_placeholder_as_time_major()

    # Pad the encoder outputs with zeros along the time axis so that the length is
    # cleanly divisible by input_block_size.
    # NOTE(review): when the length is already a multiple of input_block_size, the
    # expression below evaluates to input_block_size, i.e. one full block of zeros
    # is appended. Confirm whether that extra block is intended.
    batch_size = tf.shape(encoder_outputs)[1]
    frames_to_append = input_block_size - tf.mod(
        tf.shape(encoder_outputs)[0], input_block_size)
    feature_dim = tf.shape(encoder_outputs)[2]
    zero_frames = tf.zeros([frames_to_append, batch_size, feature_dim], dtype=tf.float32)
    encoder_outputs = tf.concat([encoder_outputs, zero_frames], axis=0)

    # Sanity check on the <E> symbol index.
    assert 0 <= e_symbol_index < n_out, 'NT: E symbol outside possible outputs!'

    # Optionally derive the initial transducer state from the first source layer.
    last_hidden = None
    if use_prev_state_as_start is True and isinstance(self.sources[0], RecLayer):
        # TODO: add better checking whether the settings are correct
        # c: last hidden state of the source layer (after all blocks).
        prev_c = self.sources[0].get_last_hidden_state('*')
        # h: encoder output at the last frame of the first block.
        prev_h = encoder_outputs[input_block_size - 1]

        # Zero-pad c along axis 1 so that c and h have the same size
        # (this is needed for when using BiLSTM).
        c_dims = tf.shape(prev_c)
        h_dims = tf.shape(prev_h)
        c_padding = tf.zeros([c_dims[0], h_dims[1] - c_dims[1]])
        prev_c = tf.concat([prev_c, c_padding], axis=1)

        last_hidden = tf.stack([prev_c, prev_h], axis=0)

    # Keep the configuration on the instance.
    self.transducer_hidden_units = transducer_hidden_units
    self.num_outputs = n_out
    self.transducer_max_width = transducer_max_width
    self.input_block_size = input_block_size
    self.e_symbol_index = e_symbol_index

    # self.output.placeholder is of shape [transducer_max_width * amount_of_blocks, batch_size, n_out]
    transducer_logits = self.build_full_transducer(
        transducer_hidden_units=transducer_hidden_units,
        embeddings=embeddings,
        num_outputs=n_out,
        input_block_size=input_block_size,
        transducer_max_width=transducer_max_width,
        encoder_outputs=encoder_outputs,
        trans_hidden_init=last_hidden)
    self.output.placeholder = transducer_logits

    # Sequence lengths of the logits, derived from the input sequence lengths.
    logit_lengths = self.round_vector_to_closest_input_block(
        vector=self.input_data.size_placeholder[0],
        input_block_size=input_block_size,
        transducer_max_width=transducer_max_width)

    # Shape metadata: the output is time-major.
    self.output.size_placeholder = {0: logit_lengths}
    self.output.time_dim_axis = 0
    self.output.batch_dim_axis = 1

    # Register every trainable variable found under this layer's variable scope.
    with self.var_creation_scope() as scope:
        scoped_trainables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        self._add_all_trainable_params(scoped_trainables)
def build_full_transducer(self, transducer_hidden_units, embeddings, num_outputs, input_block_size,
                          transducer_max_width, encoder_outputs, trans_hidden_init):
    """
    Builds the complete transducer.

    The encoder output is consumed in consecutive slices of ``input_block_size`` frames. For every
    slice, an attention decoder around a shared LSTM cell is run for ``transducer_max_width`` steps
    inside a ``tf.while_loop``; the per-block logits are concatenated along the time axis.

    :param int transducer_hidden_units: Amount of units the transducer should have.
    :param tf.Variable embeddings: Variable with the reference to the embeddings.
    :param int num_outputs: The size of the output layer, i.e. the size of the vocabulary including <E> symbol.
    :param int input_block_size: Amount of inputs to use for each NT block.
    :param int transducer_max_width: The max amount of outputs in one NT block (including the final <E> symbol)
    :param tf.tensor encoder_outputs: The outputs of the encoder in shape of [max_time, batch_size, encoder_hidden]
    :param tf.tensor trans_hidden_init: The init state of the transducer. Needs to be of shape
        [2, batch_size, transducer_hidden_units]. The trans_hidden_init[0] is the c vector of the lstm,
        trans_hidden_init[1] the hidden vector. If None, a zero state is used.
    :return: Returns a reference to the tf.tensor containing the logits.
    :rtype: tf.tensor
    """
    with self.var_creation_scope():
        # Meta variables.
        batch_size = tf.shape(encoder_outputs)[1]
        if trans_hidden_init is None:
            trans_hidden_init = tf.zeros(
                [2, batch_size, transducer_hidden_units], dtype=tf.float32)

        # Number of blocks along the time axis.
        max_blocks = tf.to_int32(tf.shape(encoder_outputs)[0] / input_block_size)
        # Every block emits transducer_max_width outputs for every batch entry.
        outputs_per_block = tf.ones(
            [max_blocks, batch_size], dtype=tf.int32) * transducer_max_width
        # Passed as sampling_probability to the scheduled-sampling helper below.
        # NOTE(review): presumably 1.0 means the decoder always feeds back its own
        # samples, so the constant targets below act only as placeholders; confirm.
        inference_mode = 1.0
        teacher_forcing_targets = tf.ones(
            [transducer_max_width * max_blocks, batch_size], dtype=tf.int32)

        # Embedded teacher forcing targets.
        teacher_forcing_targets_emb = tf.nn.embedding_lookup(
            embeddings, teacher_forcing_targets)

        # One TensorArray slot per block for the logits.
        logits_per_block = tf.TensorArray(
            dtype=tf.float32, size=max_blocks, infer_shape=False)
        # Loop state: (block index, logits array, transducer state, outputs consumed so far).
        loop_vars = (0, logits_per_block, trans_hidden_init, 0)

        # The LSTM cell shared by all blocks.
        from TFUtil import get_initializer
        cell_seed = self.network.random.randint(2**31)
        cell_initializer = get_initializer(
            'glorot_uniform', seed=cell_seed, eval_local_ns={"layer": self})
        transducer_cell = tf.contrib.rnn.LSTMCell(
            transducer_hidden_units, initializer=cell_initializer)

        def has_more_blocks(block_idx, logits_array, state, outputs_done):
            return block_idx < max_blocks

        def run_block(block_idx, logits_array, state, outputs_done):
            # --------------------- TRANSDUCER ------------------------------------------------------
            # Each transducer block runs for the max transducer outputs in its respective block.
            block_start = input_block_size * block_idx
            block_end = input_block_size * (block_idx + 1)
            block_encoder_out = encoder_outputs[block_start:block_end]
            # Replace NaNs by zeros in both the encoder slice and the incoming state.
            block_encoder_out = tf.where(
                tf.is_nan(block_encoder_out),
                tf.zeros_like(block_encoder_out),
                block_encoder_out)
            state = tf.where(tf.is_nan(state), tf.zeros_like(state), state)

            # The state is carried through the loop as one [2, batch, units] tensor.
            block_output_lengths = outputs_per_block[block_idx]
            block_max_outputs = tf.reduce_max(block_output_lengths)

            # Model building.
            helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                # Target inputs belonging to the current block.
                inputs=teacher_forcing_targets_emb[outputs_done:outputs_done + block_max_outputs],
                sequence_length=block_output_lengths,
                embedding=embeddings,
                sampling_probability=inference_mode,
                time_major=True)
            # attention_states: [batch_size, max_time, num_enc_units]
            attention_states = tf.transpose(block_encoder_out, [1, 0, 2])
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                transducer_hidden_units, attention_states)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                transducer_cell,
                attention_mechanism,
                attention_layer_size=transducer_hidden_units)
            from tensorflow.python.layers import core as layers_core
            projection_layer = layers_core.Dense(num_outputs, use_bias=False)

            # Unpack the carried state into the (c, h) pair the LSTM cell expects.
            state_c, state_h = tf.split(state, num_or_size_splits=2, axis=0)
            state_c = tf.reshape(state_c, shape=[-1, transducer_hidden_units])
            state_h = tf.reshape(state_h, shape=[-1, transducer_hidden_units])
            from tensorflow.contrib.rnn import LSTMStateTuple
            lstm_state = LSTMStateTuple(state_c, state_h)

            decoder_init_state = decoder_cell.zero_state(
                batch_size, tf.float32).clone(cell_state=lstm_state)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, helper, decoder_init_state, output_layer=projection_layer)
            decoder_outputs, decoder_final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder, output_time_major=True, maximum_iterations=block_max_outputs)
            # block_logits of shape [max_time, batch_size, vocab_size]
            block_logits = decoder_outputs.rnn_output

            # Pack the c and h of element 0 of the final decoder state back into one
            # tensor so that it can be fed into the next iteration.
            next_state = tf.concat(
                [decoder_final_state[0].c, decoder_final_state[0].h], axis=0)
            next_state = tf.reshape(
                next_state, shape=[2, -1, transducer_hidden_units])

            # Store the logits of this block.
            logits_array = logits_array.write(block_idx, block_logits)

            return (block_idx + 1, logits_array, next_state,
                    outputs_done + block_max_outputs)

        _, final_logits_array, _, _ = tf.while_loop(
            has_more_blocks, run_block, loop_vars, parallel_iterations=1)

        # Join the per-block logits; the concat op is pinned to the CPU.
        with tf.device('/cpu:0'):
            # And now it's [max_output_time, batch_size, num_outputs]
            logits = final_logits_array.concat()

        # Fixed op name, for loading the model later on.
        logits = tf.identity(logits, name='logits')

    return logits