def batch_normalize_with_arguments(x, arguments):
  """Applies batch normalization to x as specified in arguments.

  Args:
    x: A Pretty Tensor.
    arguments: Either a boolean to batch_normalize or a
      BatchNormalizationArguments

  Returns:
    x with batch normalization applied.
  """
  x = prettytensor.wrap(x)
  # Backwards compatibility.
  if isinstance(arguments, bool):
    if arguments:
      return x.batch_normalize()
    else:
      return x

  # pylint: disable=protected-access
  kwargs = arguments._asdict()
  defaults = prettytensor._defaults
  # pylint: enable=protected-access
  for arg in ('learned_moments_update_rate', 'variance_epsilon',
              'scale_after_normalization'):
    if kwargs.get(arg, None) is None:
      if arg in defaults:
        kwargs[arg] = defaults[arg]
      else:
        del kwargs[arg]
  return x.batch_normalize(**kwargs)
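
For reference, a minimal call sketch. It assumes prettytensor.BatchNormalizationArguments is the namedtuple whose fields are the three names checked above; the placeholder input and the epsilon value are illustrative only.

import prettytensor
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 28, 28, 1])

# Boolean form: batch-normalize with the library defaults.
bn_default = batch_normalize_with_arguments(images, True)

# Structured form: override only variance_epsilon; fields left as None are
# filled in from prettytensor's defaults (or batch_normalize's own defaults).
args = prettytensor.BatchNormalizationArguments(
    learned_moments_update_rate=None,
    variance_epsilon=1e-5,
    scale_after_normalization=True)
bn_custom = batch_normalize_with_arguments(images, args)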
Example #3
def lstm_cell(input_layer,
              states,
              num_units,
              bias=True,
              peephole=True,
              stddev=None,
              init=None):
    """Long short-term memory cell (LSTM).

    Args:
      input_layer: The input layer.
      states: The current state of the network, as
        [[batch, num_units], [batch, num_units]] (c, h).
      num_units: How big is the hidden state.
      bias: Whether or not to use a bias.
      peephole: Whether to use peephole connections as described in
          http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf
      stddev: Standard deviation for Gaussian initialization of parameters.
      init: A tf.*Initializer that is used to initialize the variables.
    Returns:
      A RecurrentResult.
    """
    # As a compound op, it needs to respect whether or not this is a sequential
    # builder.
    if input_layer.is_sequential_builder():
        layer = input_layer.as_layer()
    else:
        layer = input_layer
    c, h = [prettytensor.wrap(state, layer.bookkeeper) for state in states]
    activation_input = layer.fully_connected(
        4 * num_units, bias=bias, activation_fn=None, stddev=stddev, init=init)
    activation_h = h.fully_connected(
        4 * num_units, bias=False, activation_fn=None, stddev=stddev, init=init)

    activation = activation_input + activation_h

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    split = activation.split(1, 4)
    i = split[0]
    j = split[1]
    f = split[2]
    if bias:
        # Biases of the forget gate are initialized to 1 in order to reduce the
        # scale of forgetting in the beginning of the training.
        f += 1.0
    o = split[3]
    if peephole:
        # TODO(eiderman): It would be worthwhile to determine the best initialization.
        i += c.diagonal_matrix_mul(stddev=stddev, init=init)
        f += c.diagonal_matrix_mul(stddev=stddev, init=init)

    new_c = (c * f.apply(tf.sigmoid, name="f_gate") +
             i.apply(tf.sigmoid, name="i_gate") * j.apply(tf.tanh))
    if peephole:
        o += new_c.diagonal_matrix_mul(stddev=stddev, init=init)

    new_h = new_c.apply(tf.tanh) * o.apply(tf.sigmoid, name="o_gate")

    if input_layer.is_sequential_builder():
        new_h = input_layer.set_head(new_h)
    return RecurrentResult(new_h, [new_c, new_h])
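
A rough sketch of unrolling this cell by hand over a few time steps. The batch size, input width, zero initial state, and the reliance on scope.reuse_variables() for weight sharing are illustrative assumptions; the result is unpacked positionally, matching the RecurrentResult(new_h, [new_c, new_h]) return above.

# Assumes the imports used throughout these examples (tensorflow as tf, prettytensor).
batch_size, input_size, num_units = 32, 128, 256
inputs = [tf.placeholder(tf.float32, [batch_size, input_size])
          for _ in range(5)]  # five unrolled time steps

# Start from an all-zero (c, h) state and thread each step's state forward.
states = [tf.zeros([batch_size, num_units]), tf.zeros([batch_size, num_units])]
outputs = []
with tf.variable_scope('lstm') as scope:
    for t, x_t in enumerate(inputs):
        if t > 0:
            scope.reuse_variables()  # share the cell's weights across steps
        result = lstm_cell(prettytensor.wrap(x_t), states, num_units)
        outputs.append(result[0])  # new_h
        states = result[1]         # [new_c, new_h]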
Example #4
def gru_cell(input_layer,
             state,
             num_units,
             bias=tf.zeros_initializer(),
             weights=None,
             phase=prettytensor.Phase.train,
             parameter_modifier=parameters.identity):
    """Gated recurrent unit memory cell (GRU).

    Args:
      input_layer: The input layer.
      state: The current state of the network. For GRUs, this is a list with
        one element (tensor) of shape [batch, num_units].
      num_units: How big is the hidden state.
      bias: An initializer for the bias or a Tensor. No bias if set to None.
      weights: An initializer for weights or a Tensor.
      phase: The phase of graph construction.  See `pt.Phase`.
      parameter_modifier: A function to modify parameters that is applied after
        creation and before use.
    Returns:
      A RecurrentResult.
    """
    # As a compound op, it needs to respect whether or not this is a sequential
    # builder.
    if input_layer.is_sequential_builder():
        layer = input_layer.as_layer()
    else:
        layer = input_layer
    # We start with a bias of 1.0 so that initially we neither reset nor update.
    # NB We compute activation_input and activation_state in two different ops,
    # instead of concatenating them, followed by one matrix multiplication. The
    # reason is that input has size [batch_size x input_size], while state has
    # [ ? x state_size ], where the first dimension is 1 initially and will be
    # batch_size only after the first RNN computation. We thus cannot concatenate
    # input and state, and instead add the results of two fully connected ops,
    # which works thanks to broadcasting, independent of state's batch size.
    state = state[0]
    state_pt = prettytensor.wrap(state, layer.bookkeeper)

    activation_input = layer.fully_connected(
        2 * num_units,
        bias=None if bias is None else tf.constant_initializer(1.0),
        activation_fn=None,
        weights=weights,
        phase=phase,
        parameter_modifier=parameter_modifier)
    activation_state = state_pt.fully_connected(
        2 * num_units,
        bias=None,
        activation_fn=None,
        weights=weights,
        phase=phase,
        parameter_modifier=parameter_modifier)

    # Adds batch_size x (2 * num_units) + ? x (2 * num_units) via broadcasting.
    activation = activation_input + activation_state
    activation = activation.sigmoid()

    split = activation.split(1, 2)
    r = split[0]
    u = split[1]

    c = layer.concat(1, [r * state]).fully_connected(
        num_units,
        bias=bias,
        activation_fn=None,
        weights=weights,
        phase=phase,
        parameter_modifier=parameter_modifier).apply(tf.tanh)
    new_h = u * state + (1 - u) * c
    if input_layer.is_sequential_builder():
        new_h = input_layer.set_head(new_h)
    return RecurrentResult(new_h, [new_h])
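
A single-step call sketch for the GRU; the shapes and the zero initial state are illustrative, and the result is unpacked positionally, matching the RecurrentResult(new_h, [new_h]) return above.

# Assumes the imports used throughout these examples (tensorflow as tf, prettytensor, parameters).
batch_size, input_size, num_units = 32, 128, 256
x = prettytensor.wrap(tf.placeholder(tf.float32, [batch_size, input_size]))
state = [tf.zeros([batch_size, num_units])]  # GRU state is a one-element list

result = gru_cell(x, state, num_units)
new_output = result[0]  # the new hidden state, as a Pretty Tensor
new_state = result[1]   # [new_h], ready to be fed into the next step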
Example #5
def lstm_cell(input_layer,
              states,
              num_units,
              bias=tf.zeros_initializer(),
              peephole=True,
              weights=None,
              phase=prettytensor.Phase.train,
              parameter_modifier=parameters.identity):
    """Long short-term memory cell (LSTM).

    Args:
      input_layer: The input layer.
      states: The current state of the network, as
        [[batch, num_units], [batch, num_units]] (c, h).
      num_units: How big is the hidden state.
      bias: An initializer for the bias or a Tensor. No bias if set to None.
      peephole: Whether to use peephole connections as described in
          http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf
      weights: An initializer for weights or a Tensor.
      phase: The phase of graph construction.  See `pt.Phase`.
      parameter_modifier: A function to modify parameters that is applied after
        creation and before use.
    Returns:
      A RecurrentResult.
    """
    # As a compound op, it needs to respect whether or not this is a sequential
    # builder.
    if input_layer.is_sequential_builder():
        layer = input_layer.as_layer()
    else:
        layer = input_layer
    c, h = [prettytensor.wrap(state, layer.bookkeeper) for state in states]
    activation_input = layer.fully_connected(
        4 * num_units,
        bias=bias,
        activation_fn=None,
        weights=weights,
        parameter_modifier=parameter_modifier,
        phase=phase)
    activation_h = h.fully_connected(4 * num_units,
                                     bias=None,
                                     activation_fn=None,
                                     weights=weights,
                                     parameter_modifier=parameter_modifier,
                                     phase=phase)

    activation = activation_input + activation_h

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    split = activation.split(1, 4)
    i = split[0]
    j = split[1]
    f = split[2]
    if bias is not None:
        # Biases of the forget gate are initialized to 1 in order to reduce the
        # scale of forgetting in the beginning of the training.
        f += 1.
    o = split[3]
    if peephole:
        # TODO(eiderman): It would be worthwhile to determine the best initialization.
        i += c.diagonal_matrix_mul(weights=weights,
                                   parameter_modifier=parameter_modifier,
                                   phase=phase)
        f += c.diagonal_matrix_mul(weights=weights,
                                   parameter_modifier=parameter_modifier,
                                   phase=phase)

    f_gate = f.apply(tf.sigmoid, name='f_gate')
    new_c = (c * f_gate +
             i.apply(tf.sigmoid, name='i_gate') * j.apply(tf.tanh))
    if peephole:
        o += new_c.diagonal_matrix_mul(weights=weights,
                                       parameter_modifier=parameter_modifier,
                                       phase=phase)

    new_h = new_c.apply(tf.tanh) * o.apply(tf.sigmoid, name='o_gate')

    if input_layer.is_sequential_builder():
        new_h = input_layer.set_head(new_h)
    return RecurrentResult(new_h, [new_c, new_h])
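
This variant takes initializers (or Tensors) instead of stddev/init. A minimal call sketch under the same assumed shapes as above; the truncated-normal weights initializer is only an example choice, not a recommendation from the library.

# Assumes the imports used throughout these examples (tensorflow as tf, prettytensor, parameters).
x = prettytensor.wrap(tf.placeholder(tf.float32, [32, 128]))
states = [tf.zeros([32, 256]), tf.zeros([32, 256])]  # (c, h)

result = lstm_cell(
    x, states, 256,
    bias=tf.zeros_initializer(),  # pass None to drop biases entirely
    weights=tf.truncated_normal_initializer(stddev=0.01),
    phase=prettytensor.Phase.train,
    parameter_modifier=parameters.identity)
new_c, new_h = result[1]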
Example #7
  def __init__(self, input_fn, replay_size, batch_size=None):
    """Creates a ReplayableQueue that takes data from `input_fn`.

    See also: `pt.train.ReplayableQueue.build_from_queue`.

    Note: the shapes of the inputs must be fully defined.

    Note: `input_fn` is a function instead of an input. This is because
      otherwise if the input came from a queue, dependencies wouldn't be set up
      properly and the data would always be dequeued. If you are providing data
      from a queue, then pass in `lambda: q.dequeue_many(batch_size)`.

    Args:
      input_fn: A function of no arguments that returns the input as a tuple of
        `Tensors`.
      replay_size: The size of the replay queue.
      batch_size: If provided, use this as the batch size; otherwise infer it.

    Raises:
      ValueError: if `replay_size` is not divisible by `batch_size` or if the
        shapes on the input are wrong.
    """
    inputs = _make_tuple(input_fn())

    for x in inputs:
      x.get_shape().assert_is_fully_defined()
      if batch_size is not None:
        x.get_shape()[0].assert_is_compatible_with(batch_size)
      else:
        batch_size = x.get_shape()[0].value

    dtypes = [x.dtype for x in inputs]
    shapes = [x.get_shape()[1:] if x.get_shape() else () for x in inputs]

    if replay_size % batch_size != 0:
      raise ValueError('replay_size (%d) must be a multiple of batch size '
                       '(%d)' % (replay_size, batch_size))

    # Setup the flag that controls replay.
    self._replay_var = tf.get_variable(
        'replay',
        dtype=tf.bool,
        shape=[],
        initializer=tf.constant_initializer(False),
        trainable=False)
    self._set_replay_ph = tf.placeholder(dtype=tf.bool)
    self._set_replay = self._replay_var.assign(self._set_replay_ph)

    self._replay_queue = tf.FIFOQueue(replay_size, dtypes, shapes)

    # _fill_queue adds data to the queue and then returns whether it is full.
    with tf.control_dependencies([self._replay_queue.enqueue_many(inputs)]):
      self._fill_queue = tf.less(self._replay_queue.size(), replay_size)

    # Dequeue all the things!
    self._clear_queue = self._replay_queue.dequeue_many(
        self._replay_queue.size())

    def _pull_from_replay():
      data_tuple = _make_tuple(self._replay_queue.dequeue_many(batch_size))
      with tf.control_dependencies([self._replay_queue.enqueue_many(data_tuple)
                                   ]):
        return (tf.identity(data_tuple[0]),) + data_tuple[1:]

    def _pull_from_original():
      return _make_tuple(input_fn())

    self._output = prettytensor.wrap(
        tf.cond(self._replay_var, _pull_from_replay, _pull_from_original))
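
A construction sketch following the docstring's note about queue-backed inputs. The upstream queue, its dtype and shape, and the replay capacity are placeholders, and only the constructor shown above is relied upon.

# Assumes the imports used throughout these examples (tensorflow as tf).
batch_size = 32
# An upstream queue that some input pipeline is assumed to enqueue into.
q = tf.FIFOQueue(1000, dtypes=[tf.float32], shapes=[[28, 28, 1]])

replayable = ReplayableQueue(lambda: q.dequeue_many(batch_size),
                             replay_size=batch_size * 10,
                             batch_size=batch_size)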