def batch_normalize_with_arguments(x, arguments):
  """Applies batch normalization to x as specified in arguments.

  Args:
    x: A Pretty Tensor.
    arguments: Either a boolean to batch_normalize or a
      BatchNormalizationArguments.

  Returns:
    x with batch normalization applied.
  """
  x = prettytensor.wrap(x)
  # Backwards compatibility.
  if isinstance(arguments, bool):
    if arguments:
      return x.batch_normalize()
    else:
      return x

  # pylint: disable=protected-access
  kwargs = arguments._asdict()
  defaults = prettytensor._defaults
  # pylint: enable=protected-access
  for arg in ('learned_moments_update_rate', 'variance_epsilon',
              'scale_after_normalization'):
    if kwargs.get(arg, None) is None:
      if arg in defaults:
        kwargs[arg] = defaults[arg]
      else:
        # Not specified and no default: let batch_normalize use its own.
        del kwargs[arg]
  return x.batch_normalize(**kwargs)
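# Illustrative usage sketch of both call forms accepted above. The boolean
# form simply toggles batch_normalize(); the namedtuple form only needs an
# _asdict() with the three fields checked in the loop, so a locally defined
# namedtuple stands in for BatchNormalizationArguments here. The placeholder
# shape and the epsilon value are assumptions, not library defaults.
def _batch_normalize_arguments_sketch():
  import collections
  images = prettytensor.wrap(tf.placeholder(tf.float32, [32, 28, 28, 3]))
  # Backwards-compatible boolean form.
  normalized = batch_normalize_with_arguments(images, True)
  # Namedtuple form: fields left as None fall back to the current defaults,
  # or to batch_normalize's own defaults if no default is registered.
  args = collections.namedtuple(
      'Args', ['learned_moments_update_rate', 'variance_epsilon',
               'scale_after_normalization'])(None, 0.001, True)
  normalized = batch_normalize_with_arguments(images, args)
  return normalized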
def lstm_cell(input_layer, states, num_units, bias=True, peephole=True,
              stddev=None, init=None):
  """Long short-term memory cell (LSTM).

  Args:
    input_layer: The input layer.
    states: The current state of the network, as
      [[batch, num_units], [batch, num_units]] (c, h).
    num_units: How big is the hidden state.
    bias: Whether or not to use a bias.
    peephole: Whether to use peephole connections as described in
      http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf
    stddev: Standard deviation for Gaussian initialization of parameters.
    init: A tf.*Initializer that is used to initialize the variables.

  Returns:
    A RecurrentResult.
  """
  # As a compound op, it needs to respect whether or not this is a sequential
  # builder.
  if input_layer.is_sequential_builder():
    layer = input_layer.as_layer()
  else:
    layer = input_layer
  c, h = [prettytensor.wrap(state, layer.bookkeeper) for state in states]

  activation_input = layer.fully_connected(4 * num_units,
                                           bias=bias,
                                           activation_fn=None,
                                           stddev=stddev,
                                           init=init)
  activation_h = h.fully_connected(4 * num_units,
                                   bias=False,
                                   activation_fn=None,
                                   stddev=stddev,
                                   init=init)
  activation = activation_input + activation_h

  # i = input_gate, j = new_input, f = forget_gate, o = output_gate
  split = activation.split(1, 4)
  i = split[0]
  j = split[1]
  f = split[2]
  if bias:
    # Biases of the forget gate are initialized to 1 in order to reduce the
    # scale of forgetting in the beginning of the training.
    f += 1.0
  o = split[3]
  if peephole:
    # TODO(eiderman): It would be worthwhile to determine the best
    # initialization.
    i += c.diagonal_matrix_mul(stddev=stddev, init=init)
    f += c.diagonal_matrix_mul(stddev=stddev, init=init)

  new_c = (c * f.apply(tf.sigmoid, name="f_gate") +
           i.apply(tf.sigmoid, name="i_gate") * j.apply(tf.tanh))
  if peephole:
    o += new_c.diagonal_matrix_mul(stddev=stddev, init=init)
  new_h = new_c.apply(tf.tanh) * o.apply(tf.sigmoid, name="o_gate")

  if input_layer.is_sequential_builder():
    new_h = input_layer.set_head(input_layer)
  return RecurrentResult(new_h, [new_c, new_h])
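# Illustrative single-step usage of the stddev/init signature above: a zero
# (c, h) state pair is fed in together with a batch of inputs, and the
# RecurrentResult is unpacked into the step output and the next state list.
# The shapes and the stddev value are assumptions, not library defaults.
def _lstm_cell_step_sketch():
  num_units = 128
  x = prettytensor.wrap(tf.placeholder(tf.float32, [32, 64]))
  states = [tf.zeros([32, num_units]), tf.zeros([32, num_units])]  # (c, h)
  output, new_states = lstm_cell(x, states, num_units, stddev=0.02)
  return output, new_states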
def gru_cell(input_layer,
             state,
             num_units,
             bias=tf.zeros_initializer(),
             weights=None,
             phase=prettytensor.Phase.train,
             parameter_modifier=parameters.identity):
  """Gated recurrent unit memory cell (GRU).

  Args:
    input_layer: The input layer.
    state: The current state of the network. For GRUs, this is a list with
      one element (tensor) of shape [batch, num_units].
    num_units: How big is the hidden state.
    bias: An initializer for the bias or a Tensor. No bias if set to None.
    weights: An initializer for weights or a Tensor.
    phase: The phase of graph construction. See `pt.Phase`.
    parameter_modifier: A function to modify parameters that is applied after
      creation and before use.

  Returns:
    A RecurrentResult.
  """
  # As a compound op, it needs to respect whether or not this is a sequential
  # builder.
  if input_layer.is_sequential_builder():
    layer = input_layer.as_layer()
  else:
    layer = input_layer

  # We start with a bias of 1.0 to not reset and not update.
  # NB: We compute activation_input and activation_state in two different ops,
  # instead of concatenating them, followed by one matrix multiplication. The
  # reason is that input has size [batch_size x input_size], while state has
  # [? x state_size], where the first dimension is 1 initially and will be
  # batch_size only after the first RNN computation. We thus cannot concatenate
  # input and state, and instead add the results of two fully connected ops,
  # which works thanks to broadcasting, independent of state's batch size.
  state = state[0]
  state_pt = prettytensor.wrap(state, layer.bookkeeper)
  activation_input = layer.fully_connected(
      2 * num_units,
      bias=None if bias is None else tf.constant_initializer(1.0),
      activation_fn=None,
      weights=weights,
      phase=phase,
      parameter_modifier=parameter_modifier)
  activation_state = state_pt.fully_connected(
      2 * num_units,
      bias=None,
      activation_fn=None,
      weights=weights,
      phase=phase,
      parameter_modifier=parameter_modifier)
  # Adds [batch_size x (2 * num_units)] + [? x (2 * num_units)].
  activation = activation_input + activation_state
  activation = activation.sigmoid()

  split = activation.split(1, 2)
  r = split[0]
  u = split[1]

  c = layer.concat(1, [r * state]).fully_connected(
      num_units,
      bias=bias,
      activation_fn=None,
      weights=weights,
      phase=phase,
      parameter_modifier=parameter_modifier).apply(tf.tanh)
  new_h = u * state + (1 - u) * c

  if input_layer.is_sequential_builder():
    new_h = input_layer.set_head(input_layer)
  return RecurrentResult(new_h, [new_h])
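# Illustrative single-step usage of gru_cell: unlike the LSTM, the state is a
# one-element list and the returned state list is simply [new_h]. The shapes
# and the zero initial state are assumptions.
def _gru_cell_step_sketch():
  num_units = 128
  x = prettytensor.wrap(tf.placeholder(tf.float32, [32, 64]))
  state = [tf.zeros([32, num_units])]
  output, new_state = gru_cell(x, state, num_units)
  return output, new_state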
def lstm_cell(input_layer,
              states,
              num_units,
              bias=tf.zeros_initializer(),
              peephole=True,
              weights=None,
              phase=prettytensor.Phase.train,
              parameter_modifier=parameters.identity):
  """Long short-term memory cell (LSTM).

  Args:
    input_layer: The input layer.
    states: The current state of the network, as
      [[batch, num_units], [batch, num_units]] (c, h).
    num_units: How big is the hidden state.
    bias: An initializer for the bias or a Tensor. No bias if set to None.
    peephole: Whether to use peephole connections as described in
      http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf
    weights: An initializer for weights or a Tensor.
    phase: The phase of graph construction. See `pt.Phase`.
    parameter_modifier: A function to modify parameters that is applied after
      creation and before use.

  Returns:
    A RecurrentResult.
  """
  # As a compound op, it needs to respect whether or not this is a sequential
  # builder.
  if input_layer.is_sequential_builder():
    layer = input_layer.as_layer()
  else:
    layer = input_layer
  c, h = [prettytensor.wrap(state, layer.bookkeeper) for state in states]

  activation_input = layer.fully_connected(
      4 * num_units,
      bias=bias,
      activation_fn=None,
      weights=weights,
      parameter_modifier=parameter_modifier,
      phase=phase)
  activation_h = h.fully_connected(4 * num_units,
                                   bias=None,
                                   activation_fn=None,
                                   weights=weights,
                                   parameter_modifier=parameter_modifier,
                                   phase=phase)
  activation = activation_input + activation_h

  # i = input_gate, j = new_input, f = forget_gate, o = output_gate
  split = activation.split(1, 4)
  i = split[0]
  j = split[1]
  f = split[2]
  if bias is not None:
    # Biases of the forget gate are initialized to 1 in order to reduce the
    # scale of forgetting in the beginning of the training.
    f += 1.
  o = split[3]
  if peephole:
    # TODO(eiderman): It would be worthwhile to determine the best
    # initialization.
    i += c.diagonal_matrix_mul(weights=weights,
                               parameter_modifier=parameter_modifier,
                               phase=phase)
    f += c.diagonal_matrix_mul(weights=weights,
                               parameter_modifier=parameter_modifier,
                               phase=phase)

  f_gate = f.apply(tf.sigmoid, name='f_gate')
  new_c = (c * f_gate +
           i.apply(tf.sigmoid, name='i_gate') * j.apply(tf.tanh))
  if peephole:
    o += new_c.diagonal_matrix_mul(weights=weights,
                                   parameter_modifier=parameter_modifier,
                                   phase=phase)
  new_h = new_c.apply(tf.tanh) * o.apply(tf.sigmoid, name='o_gate')

  if input_layer.is_sequential_builder():
    new_h = input_layer.set_head(input_layer)
  return RecurrentResult(new_h, [new_c, new_h])
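# Illustrative single-step usage of the initializer-based signature above: a
# truncated-normal weights initializer and an explicit construction phase are
# threaded through to the internal fully_connected and peephole parameters.
# The shapes and the stddev value are assumptions, not library defaults.
def _lstm_cell_initializer_sketch():
  num_units = 128
  x = prettytensor.wrap(tf.placeholder(tf.float32, [32, 64]))
  states = [tf.zeros([32, num_units]), tf.zeros([32, num_units])]  # (c, h)
  output, new_states = lstm_cell(
      x,
      states,
      num_units,
      weights=tf.truncated_normal_initializer(stddev=0.01),
      phase=prettytensor.Phase.train)
  return output, new_states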
def __init__(self, input_fn, replay_size, batch_size=None):
  """Creates a ReplayableQueue that takes data from `input_fn`.

  See also: `pt.train.ReplayableQueue.build_from_queue`.

  Note: the shapes of the inputs must be fully defined.

  Note: `input_fn` is a function instead of an input. This is because
  otherwise, if the input came from a queue, dependencies wouldn't be set up
  properly and the data would always be dequeued. If you are providing data
  from a queue, then pass in `lambda: q.dequeue_many(batch_size)`.

  Args:
    input_fn: A function of no arguments that returns the input as a tuple of
      `Tensors`.
    replay_size: The size of the replay queue.
    batch_size: If provided, use this as the batch size; otherwise infer it.
  Raises:
    ValueError: if `replay_size` is not divisible by `batch_size` or if the
      shapes on the input are wrong.
  """
  inputs = _make_tuple(input_fn())
  for x in inputs:
    x.get_shape().assert_is_fully_defined()
    if batch_size is not None:
      x.get_shape()[0].assert_is_compatible_with(batch_size)
    else:
      batch_size = x.get_shape()[0].value
  dtypes = [x.dtype for x in inputs]
  shapes = [x.get_shape()[1:] if x.get_shape() else () for x in inputs]

  if replay_size % batch_size != 0:
    raise ValueError('replay_size (%d) must be a multiple of batch size '
                     '(%d)' % (replay_size, batch_size))

  # Set up the flag that controls replay.
  self._replay_var = tf.get_variable(
      'replay',
      dtype=tf.bool,
      shape=[],
      initializer=tf.constant_initializer(False),
      trainable=False)
  self._set_replay_ph = tf.placeholder(dtype=tf.bool)
  self._set_replay = self._replay_var.assign(self._set_replay_ph)

  self._replay_queue = tf.FIFOQueue(replay_size, dtypes, shapes)

  # _fill_queue adds data to the queue and then returns whether it is full.
  with tf.control_dependencies([self._replay_queue.enqueue_many(inputs)]):
    self._fill_queue = tf.less(self._replay_queue.size(), replay_size)

  # Dequeue all the things!
  self._clear_queue = self._replay_queue.dequeue_many(
      self._replay_queue.size())

  def _pull_from_replay():
    data_tuple = _make_tuple(self._replay_queue.dequeue_many(batch_size))
    with tf.control_dependencies(
        [self._replay_queue.enqueue_many(data_tuple)]):
      return (tf.identity(data_tuple[0]),) + data_tuple[1:]

  def _pull_from_original():
    return _make_tuple(input_fn())

  self._output = prettytensor.wrap(
      tf.cond(self._replay_var, _pull_from_replay, _pull_from_original))
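# Illustrative construction sketch following the docstring's advice: when the
# data already lives in a queue, wrap dequeue_many in a lambda so the dequeue
# only runs when the replay queue actually pulls from it. The element shape,
# capacities and sizes are assumptions; replay_size (128) is a multiple of the
# inferred batch size (32), as required.
def _replayable_queue_sketch():
  source = tf.FIFOQueue(capacity=1024, dtypes=[tf.float32], shapes=[[64]])
  return ReplayableQueue(lambda: source.dequeue_many(32), replay_size=128)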