def _get_outputs(self, inputs, input_seq_length, is_training):
    '''
    Create the variables and do the forward computation

    Args:
        inputs: the inputs to the neural network, this is a list of
            [batch_size x time x ...] tensors
        input_seq_length: The sequence lengths of the input utterances, this
            is a [batch_size] vector
        is_training: whether or not the network is in training mode

    Returns:
        - output, which is a [batch_size x time x ...] tensor
    '''

    kernel_size = list(map(int, self.conf['filters'].split(' ')))
    num_filters_1st_layer = int(self.conf['num_filters_1st_layer'])
    f_pool_rate = int(self.conf['f_pool_rate'])
    t_pool_rate = int(self.conf['t_pool_rate'])
    num_encoder_layers = int(self.conf['num_encoder_layers'])
    num_decoder_layers = num_encoder_layers
    num_centre_layers = int(self.conf['num_centre_layers'])
    layer_norm = self.conf['layer_norm'] == 'True'

    if 'activation_fn' in self.conf:
        if self.conf['activation_fn'] == 'tanh':
            activation_fn = tf.nn.tanh
        elif self.conf['activation_fn'] == 'relu':
            activation_fn = tf.nn.relu
        elif self.conf['activation_fn'] == 'sigmoid':
            activation_fn = tf.nn.sigmoid
        else:
            raise Exception('Undefined activation function: %s' %
                            self.conf['activation_fn'])
    else:
        activation_fn = tf.nn.relu

    # the encoder layers: the number of filters doubles with every layer,
    # max pooling is applied over time every t_pool_rate layers and over
    # frequency every f_pool_rate layers
    encoder_layers = []
    for l in range(num_encoder_layers):
        num_filters_l = num_filters_1st_layer * 2**l

        max_pool_filter = [1, 1]
        if np.mod(l + 1, t_pool_rate) == 0:
            max_pool_filter[0] = 2
        if np.mod(l + 1, f_pool_rate) == 0:
            max_pool_filter[1] = 2

        encoder_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size,
                         strides=(1, 1),
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm,
                         max_pool_filter=max_pool_filter))

    # the centre layers
    centre_layers = []
    for l in range(num_centre_layers):
        num_filters_l = num_filters_1st_layer * 2**num_encoder_layers

        centre_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size,
                         strides=(1, 1),
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm,
                         max_pool_filter=(1, 1)))

    # the decoder layers: mirror the encoder, undoing the pooling with
    # transposed convolutions whose strides equal the corresponding encoder
    # pooling factors
    decoder_layers = []
    for l in range(num_decoder_layers):
        corresponding_encoder_l = num_encoder_layers - 1 - l
        num_filters_l = encoder_layers[corresponding_encoder_l].num_filters
        strides = encoder_layers[corresponding_encoder_l].max_pool_filter

        decoder_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size,
                         strides=strides,
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm,
                         max_pool_filter=(1, 1),
                         transpose=True))

    # code not available for multiple inputs!!
    if len(inputs) > 1:
        raise Exception(
            'The implementation of DCNN expects 1 input and not %d' %
            len(inputs))
    else:
        inputs = inputs[0]

    # Convolutional layers expect input channels, making 1 here.
    inputs = tf.expand_dims(inputs, -1)

    with tf.variable_scope(self.scope):
        if is_training and float(self.conf['input_noise']) > 0:
            inputs = inputs + tf.random_normal(
                tf.shape(inputs),
                stddev=float(self.conf['input_noise']))

        logits = inputs

        with tf.variable_scope('encoder'):
            encoder_outputs = []
            for l in range(num_encoder_layers):
                with tf.variable_scope('layer_%s' % l):
                    logits = encoder_layers[l](logits)
                    encoder_outputs.append(logits)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

        with tf.variable_scope('centre'):
            for l in range(num_centre_layers):
                with tf.variable_scope('layer_%s' % l):
                    logits = centre_layers[l](logits)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

        with tf.variable_scope('decoder'):
            for l in range(num_decoder_layers):
                with tf.variable_scope('layer_%s' % l):
                    # skip connection: concatenate the output of the
                    # corresponding encoder layer over the channel axis
                    corresponding_encoder_l = num_encoder_layers - 1 - l
                    corresponding_encoder_output = encoder_outputs[
                        corresponding_encoder_l]

                    decoder_input = tf.concat(
                        [logits, corresponding_encoder_output], -1)

                    logits = decoder_layers[l](decoder_input)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

                    # get wanted output size
                    if corresponding_encoder_l == 0:
                        wanted_size = tf.shape(inputs)
                    else:
                        wanted_size = tf.shape(
                            encoder_outputs[corresponding_encoder_l - 1])
                    wanted_t_size = wanted_size[1]
                    wanted_f_size = wanted_size[2]

                    # get actual output size
                    output_size = tf.shape(logits)
                    output_t_size = output_size[1]
                    output_f_size = output_size[2]

                    # compensate for a potential size mismatch after the
                    # transposed convolution, by duplicating the last time
                    # and frequency slices
                    missing_t_size = wanted_t_size - output_t_size
                    missing_f_size = wanted_f_size - output_f_size

                    last_t_slice = tf.expand_dims(logits[:, -1, :, :], 1)
                    duplicate_logits = tf.tile(last_t_slice,
                                               [1, missing_t_size, 1, 1])
                    logits = tf.concat([logits, duplicate_logits], 1)

                    last_f_slice = tf.expand_dims(logits[:, :, -1, :], 2)
                    duplicate_logits = tf.tile(last_f_slice,
                                               [1, 1, missing_f_size, 1])
                    logits = tf.concat([logits, duplicate_logits], 2)

        output = logits

    return output
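# A minimal, hypothetical configuration sketch for the encoder/decoder
# network above: the keys are the ones _get_outputs reads from self.conf,
# the values are illustrative only. All values are strings, since the method
# parses them with int()/float().
_EXAMPLE_ENCODER_DECODER_CONF = {
    'filters': '3 3',                # kernel size: time x frequency
    'num_filters_1st_layer': '16',   # doubled after every encoder layer
    't_pool_rate': '2',              # max pool over time every 2nd layer
    'f_pool_rate': '2',              # max pool over frequency every 2nd layer
    'num_encoder_layers': '4',       # the decoder mirrors the encoder
    'num_centre_layers': '2',
    'layer_norm': 'True',
    'activation_fn': 'relu',         # optional, defaults to relu
    'input_noise': '0.1',            # stddev of Gaussian noise when training
    'dropout': '1',                  # keep probability; values < 1 raise
}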
def _get_outputs(self, inputs, input_seq_length, is_training):
    """
    Create the variables and do the forward computation

    Args:
        inputs: the inputs to the neural network, this is a list of
            [batch_size x time x ...] tensors
        input_seq_length: The sequence lengths of the input utterances, this
            is a [batch_size] vector
        is_training: whether or not the network is in training mode

    Returns:
        - output, which is a [batch_size x time x ...] tensor
    """

    if 'filters' in self.conf:
        kernel_size = list(map(int, self.conf['filters'].split(' ')))
    elif 'filter_size_t' in self.conf and 'filter_size_f' in self.conf:
        kernel_size_t = int(self.conf['filter_size_t'])
        kernel_size_f = int(self.conf['filter_size_f'])
        kernel_size = (kernel_size_t, kernel_size_f)
    else:
        raise ValueError('Kernel convolution size not specified.')

    f_stride = int(self.conf['f_stride'])
    t_stride = int(self.conf['t_stride'])

    num_layers = int(self.conf['num_layers'])
    num_filters_1st_layer = int(self.conf['num_filters_1st_layer'])
    if 'fac_per_layer' in self.conf:
        fac_per_layer = float(self.conf['fac_per_layer'])
    else:
        fac_per_layer = 1.0
    num_filters = [
        int(math.ceil(num_filters_1st_layer * (fac_per_layer**l)))
        for l in range(num_layers)
    ]

    layer_norm = self.conf['layer_norm'] == 'True'
    flat_freq = self.conf['flat_freq'] == 'True'

    if 'activation_fn' in self.conf:
        if self.conf['activation_fn'] == 'tanh':
            activation_fn = tf.nn.tanh
        elif self.conf['activation_fn'] == 'relu':
            activation_fn = tf.nn.relu
        elif self.conf['activation_fn'] == 'sigmoid':
            activation_fn = tf.nn.sigmoid
        else:
            raise Exception('Undefined activation function: %s' %
                            self.conf['activation_fn'])
    else:
        activation_fn = tf.nn.relu

    # the cnn layers
    cnn_layers = []
    for l in range(num_layers):
        num_filters_l = num_filters[l]
        cnn_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size,
                         strides=(t_stride, f_stride),
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm))

    # code not available for multiple inputs!!
    if len(inputs) > 1:
        raise Exception(
            'The implementation of DCNN expects 1 input and not %d' %
            len(inputs))
    else:
        inputs = inputs[0]

    if num_layers == 0:
        output = inputs
        return output

    # Convolutional layers expect input channels, making 1 here.
    inputs = tf.expand_dims(inputs, -1)

    with tf.variable_scope(self.scope):
        if is_training and float(self.conf['input_noise']) > 0:
            inputs = inputs + tf.random_normal(
                tf.shape(inputs),
                stddev=float(self.conf['input_noise']))

        logits = inputs

        with tf.variable_scope('cnn'):
            for l in range(num_layers):
                with tf.variable_scope('layer_%s' % l):
                    logits, _ = cnn_layers[l](logits)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

        if flat_freq:
            # flatten the frequency and channel dimensions into a single
            # feature dimension; the frequency and channel sizes have to be
            # statically known, the batch size may be dynamic
            shapes = logits.get_shape().as_list()
            logits = tf.reshape(
                logits,
                [tf.shape(logits)[0], -1, shapes[2] * shapes[3]])

        output = logits

    return output
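# A minimal, hypothetical configuration sketch for the strided CNN above.
# Either 'filters' or the 'filter_size_t'/'filter_size_f' pair must be
# present; the remaining keys are the ones read from self.conf, with
# illustrative string values.
_EXAMPLE_STRIDED_CNN_CONF = {
    'filter_size_t': '3',            # kernel size over time
    'filter_size_f': '3',            # kernel size over frequency
    't_stride': '1',                 # convolution stride over time
    'f_stride': '2',                 # convolution stride over frequency
    'num_layers': '4',
    'num_filters_1st_layer': '16',
    'fac_per_layer': '2',            # optional filter growth factor, default 1
    'layer_norm': 'True',
    'flat_freq': 'False',            # merge frequency and channel dimensions
    'input_noise': '0',
    'dropout': '1',
}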
def _get_outputs(self, inputs, input_seq_length, is_training):
    '''
    Create the variables and do the forward computation

    Args:
        inputs: the inputs to the neural network, this is a list of
            [batch_size x time x ...] tensors
        input_seq_length: The sequence lengths of the input utterances, this
            is a [batch_size] vector
        is_training: whether or not the network is in training mode

    Returns:
        - output, which is a [batch_size x time x ...] tensor
    '''

    kernel_size = list(map(int, self.conf['filters'].split(' ')))
    num_filters = int(self.conf['num_filters'])
    num_layers = int(self.conf['num_layers'])
    layer_norm = self.conf['layer_norm'] == 'True'

    if 'activation_fn' in self.conf:
        if self.conf['activation_fn'] == 'tanh':
            activation_fn = tf.nn.tanh
        elif self.conf['activation_fn'] == 'relu':
            activation_fn = tf.nn.relu
        elif self.conf['activation_fn'] == 'sigmoid':
            activation_fn = tf.nn.sigmoid
        else:
            raise Exception('Undefined activation function: %s' %
                            self.conf['activation_fn'])
    else:
        activation_fn = tf.nn.relu

    # the cnn layer, applied once per layer in its own variable scope
    cnn_layer = layer.Conv2D(num_filters=num_filters,
                             kernel_size=kernel_size,
                             strides=(1, 1),
                             padding='same',
                             activation_fn=activation_fn,
                             layer_norm=layer_norm)

    # code not available for multiple inputs!!
    if len(inputs) > 1:
        raise Exception(
            'The implementation of DCNN expects 1 input and not %d' %
            len(inputs))
    else:
        inputs = inputs[0]

    # Convolutional layers expect input channels, making 1 here.
    inputs = tf.expand_dims(inputs, -1)

    with tf.variable_scope(self.scope):
        if is_training and float(self.conf['input_noise']) > 0:
            inputs = inputs + tf.random_normal(
                tf.shape(inputs),
                stddev=float(self.conf['input_noise']))

        logits = inputs

        for l in range(num_layers):
            with tf.variable_scope('layer_%s' % l):
                logits = cnn_layer(logits)

                if is_training and float(self.conf['dropout']) < 1:
                    raise Exception('have to check whether dropout is '
                                    'implemented correctly')
                    # logits = tf.nn.dropout(
                    #     logits, float(self.conf['dropout']))

        output = logits

    return output
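# A minimal, hypothetical configuration sketch for the plain CNN above. Note
# that a single layer.Conv2D object is called once per layer, each time
# inside its own 'layer_%s' variable scope, so whether the layers end up
# sharing weights depends on how layer.Conv2D creates its variables.
_EXAMPLE_PLAIN_CNN_CONF = {
    'filters': '3 3',                # kernel size: time x frequency
    'num_filters': '16',             # same filter count in every layer
    'num_layers': '4',
    'layer_norm': 'False',
    'input_noise': '0',
    'dropout': '1',
}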
def _get_outputs(self, inputs, input_seq_length, is_training):
    """
    Create the variables and do the forward computation

    Args:
        inputs: the inputs to the neural network, this is a list of
            [batch_size x time x ...] tensors
        input_seq_length: The sequence lengths of the input utterances, this
            is a [batch_size] vector
        is_training: whether or not the network is in training mode

    Returns:
        - output, which is a [batch_size x time x ...] tensor
    """

    if 'filters' in self.conf:
        kernel_size_lay1 = list(map(int, self.conf['filters'].split(' ')))
    elif 'filter_size_t' in self.conf and 'filter_size_f' in self.conf:
        kernel_size_t_lay1 = int(self.conf['filter_size_t'])
        kernel_size_f_lay1 = int(self.conf['filter_size_f'])
        kernel_size_lay1 = [kernel_size_t_lay1, kernel_size_f_lay1]
    else:
        raise ValueError('Kernel convolution size not specified.')

    if 'filter_size_t_fac_after_pool' in self.conf and \
            'filter_size_f_fac_after_pool' in self.conf:
        kernel_size_t_fac_after_pool = float(
            self.conf['filter_size_t_fac_after_pool'])
        kernel_size_f_fac_after_pool = float(
            self.conf['filter_size_f_fac_after_pool'])
        kernel_fac_after_pool = [
            kernel_size_t_fac_after_pool, kernel_size_f_fac_after_pool
        ]
    else:
        kernel_fac_after_pool = [1, 1]

    f_pool_rate = int(self.conf['f_pool_rate'])
    t_pool_rate = int(self.conf['t_pool_rate'])
    num_encoder_layers = int(self.conf['num_encoder_layers'])
    num_decoder_layers = num_encoder_layers
    num_centre_layers = int(self.conf['num_centre_layers'])
    num_filters_1st_layer = int(self.conf['num_filters_1st_layer'])
    fac_per_layer = float(self.conf['fac_per_layer'])
    num_filters_enc = [
        int(math.ceil(num_filters_1st_layer * (fac_per_layer**l)))
        for l in range(num_encoder_layers)
    ]
    # the decoder mirrors the encoder filter counts, except for the last
    # layer, which maps to the requested number of output filters
    num_filters_dec = num_filters_enc[::-1]
    num_filters_dec = num_filters_dec[1:] + [
        int(self.conf['num_output_filters'])
    ]

    kernel_size_enc = []
    ideal_kernel_size_enc = [kernel_size_lay1]

    bypass = self.conf['bypass']
    layer_norm = self.conf['layer_norm'] == 'True'

    if 'activation_fn' in self.conf:
        if self.conf['activation_fn'] == 'tanh':
            activation_fn = tf.nn.tanh
        elif self.conf['activation_fn'] == 'relu':
            activation_fn = tf.nn.relu
        elif self.conf['activation_fn'] == 'sigmoid':
            activation_fn = tf.nn.sigmoid
        else:
            raise Exception('Undefined activation function: %s' %
                            self.conf['activation_fn'])
    else:
        activation_fn = tf.nn.relu

    # the encoder layers. The ideal (fractional) kernel sizes are tracked
    # separately from the rounded sizes that are actually used, so that the
    # scaling after every pooling layer does not accumulate rounding errors
    encoder_layers = []
    for l in range(num_encoder_layers):
        kernel_size_l = copy.deepcopy(ideal_kernel_size_enc[l])
        kernel_size_l_plus_1 = kernel_size_l
        kernel_size_l = [int(math.ceil(k)) for k in kernel_size_l]
        kernel_size_enc.append(kernel_size_l)

        num_filters_l = num_filters_enc[l]

        max_pool_filter = [1, 1]
        if np.mod(l + 1, t_pool_rate) == 0:
            max_pool_filter[0] = 2
            kernel_size_l_plus_1[0] = \
                kernel_size_l_plus_1[0] * kernel_fac_after_pool[0]
        if np.mod(l + 1, f_pool_rate) == 0:
            max_pool_filter[1] = 2
            kernel_size_l_plus_1[1] = \
                kernel_size_l_plus_1[1] * kernel_fac_after_pool[1]
        ideal_kernel_size_enc.append(kernel_size_l_plus_1)

        encoder_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size_l,
                         strides=(1, 1),
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm,
                         max_pool_filter=max_pool_filter))

    # the centre layers
    centre_layers = []
    for l in range(num_centre_layers):
        num_filters_l = num_filters_enc[-1]
        kernel_size_l = [
            int(math.ceil(k)) for k in ideal_kernel_size_enc[-1]
        ]

        centre_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size_l,
                         strides=(1, 1),
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm,
                         max_pool_filter=(1, 1)))

    # the decoder layers
    decoder_layers = []
    for l in range(num_decoder_layers):
        corresponding_encoder_l = num_encoder_layers - 1 - l
        num_filters_l = num_filters_dec[l]
        kernel_size_l = kernel_size_enc[corresponding_encoder_l]
        if bypass == 'unpool':
            # unpooling restores the resolution, so no strides are needed
            strides = [1, 1]
        else:
            strides = encoder_layers[
                corresponding_encoder_l].max_pool_filter

        decoder_layers.append(
            layer.Conv2D(num_filters=num_filters_l,
                         kernel_size=kernel_size_l,
                         strides=strides,
                         padding='same',
                         activation_fn=activation_fn,
                         layer_norm=layer_norm,
                         max_pool_filter=(1, 1),
                         transpose=True))

    # code not available for multiple inputs!!
    if len(inputs) > 1:
        raise Exception(
            'The implementation of DCNN expects 1 input and not %d' %
            len(inputs))
    else:
        inputs = inputs[0]

    if (num_encoder_layers + num_centre_layers + num_decoder_layers) == 0:
        output = inputs
        return output

    # Convolutional layers expect input channels, making 1 here.
    inputs = tf.expand_dims(inputs, -1)

    with tf.variable_scope(self.scope):
        if is_training and float(self.conf['input_noise']) > 0:
            inputs = inputs + tf.random_normal(
                tf.shape(inputs),
                stddev=float(self.conf['input_noise']))

        logits = inputs

        with tf.variable_scope('encoder'):
            encoder_outputs = []
            encoder_outputs_before_pool = []
            for l in range(num_encoder_layers):
                with tf.variable_scope('layer_%s' % l):
                    logits, outputs_before_pool = encoder_layers[l](logits)
                    encoder_outputs.append(logits)
                    encoder_outputs_before_pool.append(outputs_before_pool)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

        with tf.variable_scope('centre'):
            for l in range(num_centre_layers):
                with tf.variable_scope('layer_%s' % l):
                    logits, _ = centre_layers[l](logits)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

        with tf.variable_scope('decoder'):
            for l in range(num_decoder_layers):
                with tf.variable_scope('layer_%s' % l):
                    corresponding_encoder_l = num_encoder_layers - 1 - l
                    corresponding_encoder_output = encoder_outputs[
                        corresponding_encoder_l]
                    corresponding_encoder_output_before_pool = \
                        encoder_outputs_before_pool[corresponding_encoder_l]
                    corresponding_encoder_max_pool_filter = encoder_layers[
                        corresponding_encoder_l].max_pool_filter

                    if bypass == 'True' and (num_centre_layers > 0 or l > 0):
                        # don't use bypass for layer 0 if no centre layers
                        decoder_input = tf.concat(
                            [logits, corresponding_encoder_output], -1)
                    else:
                        decoder_input = logits

                    if bypass == 'unpool' and \
                            corresponding_encoder_max_pool_filter != [1, 1]:
                        decoder_input = layer.unpool(
                            pool_input=
                            corresponding_encoder_output_before_pool,
                            pool_output=corresponding_encoder_output,
                            unpool_input=decoder_input,
                            pool_kernel_size=
                            corresponding_encoder_max_pool_filter,
                            pool_stride=
                            corresponding_encoder_max_pool_filter,
                            padding='VALID')

                    logits, _ = decoder_layers[l](decoder_input)

                    if is_training and float(self.conf['dropout']) < 1:
                        raise Exception('have to check whether dropout is '
                                        'implemented correctly')
                        # logits = tf.nn.dropout(
                        #     logits, float(self.conf['dropout']))

                    # get wanted output size
                    if corresponding_encoder_l == 0:
                        wanted_size = tf.shape(inputs)
                    else:
                        wanted_size = tf.shape(
                            encoder_outputs[corresponding_encoder_l - 1])
                    wanted_t_size = wanted_size[1]
                    wanted_f_size = wanted_size[2]

                    # get actual output size
                    output_size = tf.shape(logits)
                    output_t_size = output_size[1]
                    output_f_size = output_size[2]

                    # compensate for a potential size mismatch, by
                    # duplicating the last time and frequency slices
                    missing_t_size = wanted_t_size - output_t_size
                    missing_f_size = wanted_f_size - output_f_size

                    last_t_slice = tf.expand_dims(logits[:, -1, :, :], 1)
                    duplicate_logits = tf.tile(last_t_slice,
                                               [1, missing_t_size, 1, 1])
                    logits = tf.concat([logits, duplicate_logits], 1)

                    last_f_slice = tf.expand_dims(logits[:, :, -1, :], 2)
                    duplicate_logits = tf.tile(last_f_slice,
                                               [1, 1, missing_f_size, 1])
                    logits = tf.concat([logits, duplicate_logits], 2)

        # set the part of the logits shape that is statically known: the
        # frequency dimension matches that of the inputs
        dyn_shape = logits.get_shape().as_list()
        dyn_shape[-2] = inputs.get_shape()[-2]
        logits.set_shape(dyn_shape)

        output = logits

    return output
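# A small, self-contained sketch of the duplicate-padding trick used in the
# decoders above: when a transposed convolution comes out short of the
# corresponding encoder size over time or frequency, the last slice is tiled
# to make up the difference. The helper name _pad_to_wanted_size is
# hypothetical; tf is assumed to be the TensorFlow 1.x module imported at
# the top of the file.
def _pad_to_wanted_size(logits, wanted_t_size, wanted_f_size):
    '''pad a [batch_size x time x freq x channels] tensor up to the wanted
    time and frequency sizes by duplicating its last slices'''

    # append copies of the last time step until the time size matches
    missing_t_size = wanted_t_size - tf.shape(logits)[1]
    last_t_slice = tf.expand_dims(logits[:, -1, :, :], 1)
    logits = tf.concat(
        [logits, tf.tile(last_t_slice, [1, missing_t_size, 1, 1])], 1)

    # append copies of the last frequency bin until the frequency size
    # matches
    missing_f_size = wanted_f_size - tf.shape(logits)[2]
    last_f_slice = tf.expand_dims(logits[:, :, -1, :], 2)
    logits = tf.concat(
        [logits, tf.tile(last_f_slice, [1, 1, missing_f_size, 1])], 2)

    return logits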