def AddFCLayer(self, prev_layer, index):
  """Parse expression and add Fully Connected Layer.

  Args:
    prev_layer: Input tensor.
    index: Position in model_str to start parsing

  Returns:
    Output tensor, end index in model_str.
  """
  pattern = re.compile(R'(F)(s|t|r|l|m)({\w+})?(\d+)')
  m = pattern.match(self.model_str, index)
  if m is None:
    return None, None
  fn = self._NonLinearity(m.group(2))
  name = self._GetLayerName(m.group(0), index, m.group(3))
  depth = int(m.group(4))
  input_depth = shapes.tensor_dim(prev_layer, 1) * shapes.tensor_dim(
      prev_layer, 2) * shapes.tensor_dim(prev_layer, 3)
  # The slim fully connected is actually a 1x1 conv, so we have to crush the
  # dimensions on input.
  # Everything except batch goes to depth, and therefore has to be known.
  shaped = tf.reshape(prev_layer, [-1, input_depth], name=name + '_reshape_in')
  output = slim.fully_connected(shaped, depth, activation_fn=fn, scope=name)
  # Width and height are collapsed to 1.
  self.reduction_factors[1] = None
  self.reduction_factors[2] = None
  return tf.reshape(
      output, [shapes.tensor_dim(prev_layer, 0), 1, 1, depth],
      name=name + '_reshape_out'), m.end()
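# Example (illustrative, not from the source): how the F-layer regex above splits a
# hypothetical spec element 'Fr{fc1}64' into nonlinearity code, optional layer name
# and output depth. The spec string and the name 'fc1' are assumptions for
# demonstration only.
import re

pattern = re.compile(R'(F)(s|t|r|l|m)({\w+})?(\d+)')
m = pattern.match('Fr{fc1}64', 0)
print(m.group(2), m.group(3), m.group(4))  # prints: r {fc1} 64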
def _AddOutputs(self, prev_layer, out_dims, out_func, num_classes):
  """Adds the output layer and loss function.

  Args:
    prev_layer: Output of last layer of main network.
    out_dims: Number of output dimensions, 0, 1 or 2.
    out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.
    num_classes: Number of outputs/size of last output dimension.
  """
  height_in = shapes.tensor_dim(prev_layer, dim=1)
  logits, outputs = self._AddOutputLayer(prev_layer, out_dims, out_func,
                                         num_classes)
  if self.mode == 'train':
    # Setup loss for training.
    self.loss = self._AddLossFunction(logits, height_in, out_dims, out_func)
    tf.scalar_summary('loss', self.loss, name='loss')
  elif out_dims == 0:
    # Be sure the labels match the output, even in eval mode.
    self.labels = tf.slice(self.labels, [0, 0], [-1, 1])
    self.labels = tf.reshape(self.labels, [-1])
  logging.info('Final output=%s', outputs)
  logging.info('Labels tensor=%s', self.labels)
  self.output = outputs
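# A minimal sketch (dummy values, not from the source) of the eval-mode label fixup
# for out_dims == 0 above: keep only the first label column, then flatten to [batch].
# numpy stands in for the tf.slice/tf.reshape calls to keep the illustration runnable.
import numpy as np

labels = np.array([[7, 0, 0], [3, 0, 0]])  # [batch=2, padded_length=3]
labels = labels[:, :1]                     # tf.slice(labels, [0, 0], [-1, 1]) -> [[7], [3]]
labels = labels.reshape([-1])              # tf.reshape(labels, [-1]) -> [7 3]
print(labels)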
def _AddOutputLayer(self, prev_layer, out_dims, out_func, num_classes):
  """Add the fully-connected logits and SoftMax/Logistic output Layer.

  Args:
    prev_layer: Output of last layer of main network.
    out_dims: Number of output dimensions, 0, 1 or 2.
    out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.
    num_classes: Number of outputs/size of last output dimension.

  Returns:
    logits: Pre-softmax/logistic fully-connected output shaped to out_dims.
    outputs: Post-softmax/logistic shaped to out_dims.

  Raises:
    ValueError: if syntax is incorrect.
  """
  # Optional bilinear interpolation of the input, left disabled:
  # prev_layer = tf.image.resize_images(
  #     prev_layer, [shapes.tensor_dim(prev_layer, dim=1),
  #                  shapes.tensor_dim(self.labels, dim=1)])
  # Reduce dimensionality appropriate to the output dimensions.
  batch_in = shapes.tensor_dim(prev_layer, dim=0)
  height_in = shapes.tensor_dim(prev_layer, dim=1)
  width_in = shapes.tensor_dim(prev_layer, dim=2)
  depth_in = shapes.tensor_dim(prev_layer, dim=3)
  if out_dims:
    # Combine any remaining height and width with batch and unpack after.
    shaped = tf.reshape(prev_layer, [-1, depth_in])
  else:
    # Everything except batch goes to depth, and therefore has to be known.
    shaped = tf.reshape(prev_layer, [-1, height_in * width_in * depth_in])
  logits = slim.fully_connected(shaped, num_classes, activation_fn=None)
  if out_func == 'l':
    raise ValueError('Logistic not yet supported!')
  else:
    output = tf.nn.softmax(logits)
  # Reshape to the desired output.
  if out_dims == 2:
    output_shape = [batch_in, height_in, width_in, num_classes]
  elif out_dims == 1:
    output_shape = [batch_in, height_in * width_in, num_classes]
  else:
    output_shape = [batch_in, num_classes]
  output = tf.reshape(output, output_shape, name='Output')
  logits = tf.reshape(logits, output_shape)
  return logits, output
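# A shape-only sketch (dummy sizes, not from the source) of the two pre-logits
# reshapes in _AddOutputLayer: out_dims > 0 keeps one feature vector per spatial
# position, while out_dims == 0 folds everything except batch into a single vector.
import numpy as np

batch, height, width, depth = 2, 4, 8, 16
x = np.zeros([batch, height, width, depth])
print(x.reshape([-1, depth]).shape)                   # (64, 16)  -> used when out_dims is 1 or 2
print(x.reshape([-1, height * width * depth]).shape)  # (2, 512)  -> used when out_dims is 0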
def AddBilinear(self, prev_layer, index):
  """Add a single bilinear resize layer.

  Args:
    prev_layer: Input tensor.
    index: Position in model_str to start parsing

  Returns:
    Output tensor, end index in model_str.
  """
  pattern = re.compile(R'(Bl)')
  m = pattern.match(self.model_str, index)
  if m is None:
    return None, None
  layer = tf.image.resize_images(prev_layer, [
      shapes.tensor_dim(prev_layer, dim=1),
      tf.cast(self.widths[0], tf.int32)
  ])
  self.reduction_factors[2] = 1.0
  return layer, m.end()
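# A tiny worked example (illustrative values; align_corners-style sampling, which
# differs from TensorFlow's default edge handling) of what bilinear resampling along
# the width dimension does: the height stays fixed, each row is stretched to a new width.
import numpy as np

row = np.array([0.0, 10.0, 20.0, 30.0])       # one image row, width 4
new_w = 7
xs = np.linspace(0, len(row) - 1, new_w)      # sample positions back in the old row
lo = np.floor(xs).astype(int)
hi = np.minimum(lo + 1, len(row) - 1)
frac = xs - lo
print((1 - frac) * row[lo] + frac * row[hi])  # linear interpolation between neighbours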
def _AddLossFunction(self, logits, height_in, out_dims, out_func):
  """Add the appropriate loss function.

  Args:
    logits: Pre-softmax/logistic fully-connected output shaped to out_dims.
    height_in: Height of logits before going into the softmax layer.
    out_dims: Number of output dimensions, 0, 1 or 2.
    out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.

  Returns:
    loss: That which is to be minimized.

  Raises:
    ValueError: if logistic is used.
  """
  if out_func == 'c':
    # Transpose batch to the middle.
    ctc_input = tf.transpose(logits, [1, 0, 2])
    # Compute the widths of each batch element from the input widths.
    widths = self.layers.GetLengths(dim=2, factor=height_in)
    cross_entropy = tf.nn.ctc_loss(self.sparse_labels, ctc_input, widths)
  elif out_func == 's':
    if out_dims == 2:
      self.labels = _PadLabels3d(logits, self.labels)
    elif out_dims == 1:
      self.labels = _PadLabels2d(
          shapes.tensor_dim(logits, dim=1), self.labels)
    else:
      self.labels = tf.slice(self.labels, [0, 0], [-1, 1])
      self.labels = tf.reshape(self.labels, [-1])
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=self.labels, name='xent')
  else:
    # TODO(rays) Labels need an extra dimension for logistic, so different
    # padding functions are needed, as well as a different loss function.
    raise ValueError('Logistic not yet supported!')
  return tf.reduce_sum(cross_entropy)
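# A hedged sketch (dummy numbers, not from the source) of the per-example value that
# tf.nn.sparse_softmax_cross_entropy_with_logits produces in the out_dims == 0 branch
# above, and the scalar that tf.reduce_sum turns it into.
import numpy as np

logits = np.array([[2.0, 0.5, 0.1],
                   [0.2, 1.5, 0.3]])   # [batch=2, num_classes=3]
labels = np.array([0, 1])              # already flattened to [batch]
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
xent = -log_probs[np.arange(len(labels)), labels]  # per-example cross entropy
print(xent, xent.sum())                # the sum is what the training loss minimizes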
def _LSTMLayer(self, prev_layer, direction, dim, summarize, depth, name):
  """Adds an LSTM layer with the given pre-parsed attributes.

  Always maps 4-D to 4-D regardless of summarize.

  Args:
    prev_layer: Input tensor.
    direction: 'forward' 'backward' or 'bidirectional'
    dim: 'x' or 'y', dimension to consider as time.
    summarize: True if we are to return only the last timestep.
    depth: Output depth.
    name: Some string naming the op.

  Returns:
    Output tensor.
  """
  # If the target dimension is y, we need to transpose.
  if dim == 'x':
    lengths = self.GetLengths(2, 1)
    inputs = prev_layer
  else:
    lengths = self.GetLengths(1, 1)
    inputs = tf.transpose(prev_layer, [0, 2, 1, 3], name=name + '_ytrans_in')
  input_batch = shapes.tensor_dim(inputs, 0)
  num_slices = shapes.tensor_dim(inputs, 1)
  num_steps = shapes.tensor_dim(inputs, 2)
  input_depth = shapes.tensor_dim(inputs, 3)
  # Reshape away the other dimension.
  inputs = tf.reshape(
      inputs, [-1, num_steps, input_depth], name=name + '_reshape_in')
  # We need to replicate the lengths by the size of the other dimension, and
  # any changes that have been made to the batch dimension.
  tile_factor = tf.to_float(input_batch * num_slices) / tf.to_float(
      tf.shape(lengths)[0])
  lengths = tf.tile(lengths, [tf.cast(tile_factor, tf.int32)])
  lengths = tf.cast(lengths, tf.int64)
  outputs = nn_ops.rnn_helper(
      inputs,
      lengths,
      cell_type='lstm',
      num_nodes=depth,
      direction=direction,
      name=name,
      stddev=0.1)
  # Output depth is doubled if bi-directional.
  if direction == 'bidirectional':
    output_depth = depth * 2
  else:
    output_depth = depth
  # Restore the other dimension.
  if summarize:
    outputs = tf.slice(
        outputs, [0, num_steps - 1, 0], [-1, 1, -1], name=name + '_sum_slice')
    outputs = tf.reshape(
        outputs, [input_batch, num_slices, 1, output_depth],
        name=name + '_reshape_out')
  else:
    outputs = tf.reshape(
        outputs, [input_batch, num_slices, num_steps, output_depth],
        name=name + '_reshape_out')
  if dim == 'y':
    outputs = tf.transpose(outputs, [0, 2, 1, 3], name=name + '_ytrans_out')
  return outputs
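# Illustrative numbers (not from the source) for the lengths bookkeeping in _LSTMLayer:
# after the reshape, the batch seen by the LSTM is input_batch * num_slices, so the
# per-image lengths vector is tiled until it has one entry per reshaped sequence.
import numpy as np

input_batch, num_slices = 2, 3           # 2 images, each contributing 3 row-sequences
lengths = np.array([40, 25])             # one length per original image
tile_factor = (input_batch * num_slices) // len(lengths)
lengths = np.tile(lengths, tile_factor)  # [40 25 40 25 40 25], like tf.tile(lengths, [3])
print(lengths, lengths.shape)            # shape (6,) == input_batch * num_slices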
def lstm_layer(inp,
               length=None,
               state=None,
               memory=None,
               num_nodes=None,
               backward=False,
               clip=50.0,
               reg_func=tf.nn.l2_loss,
               weight_reg=False,
               weight_collection="LSTMWeights",
               bias_reg=False,
               stddev=None,
               seed=None,
               decode=False,
               use_native_weights=False,
               name=None):
  """Adds ops for an LSTM layer.

  This adds ops for the following operations:

    input => (forward-LSTM|backward-LSTM) => output

  The direction of the LSTM is determined by `backward`. If it is false, the
  forward LSTM is used, the backward one otherwise.

  Args:
    inp: A 3-D tensor of shape [`batch_size`, `max_length`, `feature_dim`].
    length: A 1-D tensor of shape [`batch_size`] and type int64. Each element
      represents the length of the corresponding sequence in `inp`.
    state: If specified, uses it as the initial state.
    memory: If specified, uses it as the initial memory.
    num_nodes: The number of LSTM cells.
    backward: If true, reverses the `inp` before adding the ops. The output is
      also reversed so that the direction is the same as `inp`.
    clip: Value used to clip the cell values.
    reg_func: Function used for the weight regularization such as
      `tf.nn.l2_loss`.
    weight_reg: If true, regularize the filter weights with `reg_func`.
    weight_collection: Collection to add the weights to for regularization.
    bias_reg: If true, regularize the bias vector with `reg_func`.
    stddev: Standard deviation used to initialize the variables.
    seed: Seed used to initialize the variables.
    decode: If true, does not add ops which are not used for inference.
    use_native_weights: If true, uses weights in the same format as the native
      implementations.
    name: Name of the op.

  Returns:
    A tuple (out, mem): `out` is a 3-D tensor of shape
    [`batch_size`, `max_length`, `num_nodes`]; `mem` is the memory output of
    the underlying LSTM op.
  """
  with tf.variable_scope(name):
    if backward:
      if length is None:
        inp = tf.reverse(inp, [1])
      else:
        inp = tf.reverse_sequence(inp, length, 1, 0)
    num_prev = inp.get_shape()[2]
    if stddev:
      initializer = tf.truncated_normal_initializer(stddev=stddev, seed=seed)
    else:
      initializer = tf.uniform_unit_scaling_initializer(seed=seed)
    if use_native_weights:
      with tf.variable_scope("LSTMCell"):
        w = tf.get_variable(
            "W_0",
            shape=[num_prev + num_nodes, 4 * num_nodes],
            initializer=initializer,
            dtype=tf.float32)
      w_i_m = tf.slice(w, [0, 0], [num_prev, 4 * num_nodes], name="w_i_m")
      w_m_m = tf.reshape(
          tf.slice(w, [num_prev, 0], [num_nodes, 4 * num_nodes]),
          [num_nodes, 4, num_nodes],
          name="w_m_m")
    else:
      w_i_m = tf.get_variable("w_i_m", [num_prev, 4 * num_nodes],
                              initializer=initializer)
      w_m_m = tf.get_variable("w_m_m", [num_nodes, 4, num_nodes],
                              initializer=initializer)
    if not decode and weight_reg:
      tf.add_to_collection(weight_collection,
                           reg_func(w_i_m, name="w_i_m_reg"))
      tf.add_to_collection(weight_collection,
                           reg_func(w_m_m, name="w_m_m_reg"))
    batch_size = shapes.tensor_dim(inp, dim=0)
    num_frames = shapes.tensor_dim(inp, dim=1)
    prev = tf.reshape(inp, tf.stack([batch_size * num_frames, num_prev]))
    if use_native_weights:
      with tf.variable_scope("LSTMCell"):
        b = tf.get_variable(
            "B",
            shape=[4 * num_nodes],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)
      biases = tf.identity(b, name="biases")
    else:
      biases = tf.get_variable(
          "biases", [4 * num_nodes], initializer=tf.constant_initializer(0.0))
    if not decode and bias_reg:
      tf.add_to_collection(weight_collection,
                           reg_func(biases, name="biases_reg"))
    prev = tf.nn.xw_plus_b(prev, w_i_m, biases)
    prev = tf.reshape(prev, tf.stack([batch_size, num_frames, 4, num_nodes]))
    if state is None:
      state = tf.fill(tf.stack([batch_size, num_nodes]), 0.0)
    if memory is None:
      memory = tf.fill(tf.stack([batch_size, num_nodes]), 0.0)
    out, _, mem = rnn.variable_lstm(prev, state, memory, w_m_m, clip=clip)
    if backward:
      if length is None:
        out = tf.reverse(out, [1])
      else:
        out = tf.reverse_sequence(out, length, 1, 0)
  return out, mem
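# A shape-only sketch (dummy sizes, not from the source) of the use_native_weights
# split in lstm_layer above: the fused [num_prev + num_nodes, 4 * num_nodes] matrix
# W_0 is cut into the input-to-gates block w_i_m and the recurrent block w_m_m that
# is passed to rnn.variable_lstm as a [num_nodes, 4, num_nodes] tensor.
import numpy as np

num_prev, num_nodes = 32, 64
w = np.zeros([num_prev + num_nodes, 4 * num_nodes])
w_i_m = w[:num_prev, :]                                      # (32, 256)
w_m_m = w[num_prev:, :].reshape([num_nodes, 4, num_nodes])   # (64, 4, 64)
print(w_i_m.shape, w_m_m.shape)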