Example #1
    def update(self, step, grads, weights, slots, opt_params):
        updates = []
        learning_rate = opt_params['learning_rate']
        beta1 = opt_params['beta1']
        decay_rate = opt_params['decay_rate']
        clipping_threshold = opt_params['clipping_threshold']
        weight_decay_rate = opt_params['weight_decay_rate']
        epsilon1 = opt_params['epsilon1']
        epsilon2 = opt_params['epsilon2']
        decay_rate = self._decay_rate_pow(step, exponent=decay_rate)
        update_scale = learning_rate
        if self._multiply_by_parameter_scale:
            update_scale *= np.maximum(np.sqrt(np.mean(weights * weights)),
                                       epsilon2)
        mixing_rate = 1.0 - decay_rate

        grads_sqr = grads * grads + epsilon1
        if self._factored and len(weights.shape) >= 2:
            v_row = slots.pop(0)
            v_col = slots.pop(0)
            new_v_row = decay_rate * v_row + mixing_rate * np.mean(grads_sqr,
                                                                   axis=-1)
            new_v_col = decay_rate * v_col + mixing_rate * np.mean(grads_sqr,
                                                                   axis=-2)
            updates.extend([new_v_row, new_v_col])
            row_col_mean = np.mean(new_v_row, axis=-1, keepdims=True)
            row_factor = (new_v_row / row_col_mean)**-0.5
            col_factor = (new_v_col)**-0.5
            y = (grads * np.expand_dims(row_factor, axis=-1) *
                 np.expand_dims(col_factor, axis=-2))
        else:
            v = slots.pop(0)
            new_v = decay_rate * v + mixing_rate * grads_sqr
            updates.append(new_v)
            y = grads * (new_v)**-0.5

        if self._do_clipping:
            clipping_denom = (np.maximum(
                1.0,
                np.sqrt(np.mean(y * y)) / clipping_threshold))
            y /= clipping_denom

        subtrahend = update_scale * y
        if self._do_momentum:
            m = slots.pop(0)
            new_m = beta1 * m + (1.0 - beta1) * subtrahend
            subtrahend = new_m
            updates.append(new_m)

        new_weights = (1 - weight_decay_rate) * weights - subtrahend
        # TODO(lukaszkaiser): why is the astype needed here? Check and correct.
        return new_weights.astype(weights.dtype), updates
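A minimal standalone NumPy sketch (not the Trax API; the running-average mixing and step logic above are omitted) of the factored second-moment trick used in the matrix branch: only per-row and per-column means of the squared gradients are stored, and the full preconditioner is reconstructed as their outer product divided by the row mean.

import numpy as np

grads = np.random.randn(4, 3)
epsilon1 = 1e-30
grads_sqr = grads * grads + epsilon1
v_row = np.mean(grads_sqr, axis=-1)    # shape (4,), one entry per row
v_col = np.mean(grads_sqr, axis=-2)    # shape (3,), one entry per column
# Rank-1 reconstruction of the full second-moment matrix.
v_approx = np.outer(v_row, v_col) / np.mean(v_row)
# Same result as grads * row_factor[:, None] * col_factor[None, :] above.
y = grads / np.sqrt(v_approx)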
Example #2
def Softmax5Branches(x_list, **unused_kwargs):
    """Softmax qs.

  The input xs is a list of weights and embedded queries of the form
  w_1 ... w_n q_1 ... q_n. The q_1 ... q_n will be kept, result appended.

  Args:
    x_list: the input weights and embeddings.

  Returns:
    the weighted average of q_1 ... q_n according to softmax(w).
  """
    n_branches = 5
    softmax_activations = x_list[:n_branches]
    max_sa = softmax_activations[0]
    for x in softmax_activations:
        max_sa = np.maximum(max_sa, x)
    softmax_activations = [x - max_sa for x in softmax_activations]
    softmax_activations = [np.exp(x) for x in softmax_activations]
    sum_sa = sum(softmax_activations)
    softmax_activations = [x / sum_sa for x in softmax_activations]
    res = sum([
        x_list[i + n_branches] * softmax_activations[i]
        for i in range(n_branches)
    ])
    return res
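A small usage sketch with made-up inputs (assuming `np` is NumPy here): five scalar weights followed by five query tensors, combined into a softmax-weighted average.

import numpy as np

branch_weights = [np.array(w) for w in (0.1, 0.5, -0.2, 1.0, 0.0)]
queries = [np.full((2, 3), float(i)) for i in range(5)]
averaged = Softmax5Branches(branch_weights + queries)
assert averaged.shape == (2, 3)   # softmax(w)-weighted mix of the five queries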
Example #3
    def forward_with_state(self, xs, weights, state, rng):
        self._validate_forward_inputs(xs)
        (step, layers_state) = state
        # Get N+1 rngs, N for running layers and one extra.
        rngs = _split_rngs(rng, self._n_layers + 1)
        rng0, rngs = rngs[0], rngs[1:]
        if not self.sublayers:  # No-op: leave args unchanged.
            return (xs, (step + 1, layers_state))

        # Prepare the stack and do some safety checks as in the parent class.
        stack = xs
        new_state = []
        n_layers = self._n_layers
        if n_layers != 1 and len(weights) != n_layers:
            raise ValueError(
                'number of weights ({}) not equal to number of layers '
                '({})'.format(len(weights), n_layers))
        if n_layers != 1 and len(layers_state) != n_layers:
            raise ValueError(
                'length of state ({}) not equal to number of layers '
                '({})'.format(len(layers_state), n_layers))

        # TODO(chowdhery): try different strategies, also try running not all
        # layers backwards by using math.stop_gradient where needed.

        # Calculate how many layers to run forward.
        if self._mode == 'train':
            # warmup goes from 1.0 at start to 0.0 at skipping_warmup_steps and after
            w_steps = float(self._skipping_warmup_steps)
            warmup = np.maximum(0.0,
                                (w_steps - step.astype(np.float32)) / w_steps)
            # low is the minimum number of layers to *not* skip, from n_layers to 0
            low = warmup * float(n_layers)
            # high should be so that (high - n_layers) / high = 1.0 - skip_fraction
            # because (high - n_layers) / high is the probability we're not skipping
            # (after warmup); so high - n_layers = high - high * skip_fraction
            high = float(n_layers) / self._skip_fraction
            # We want the same rng0 on all cores.
            if math.device_count() > 1:
                rng0 = math.psum(rng0, 'batch')
            n_forward_layers = random.uniform(rng0, (), np.float32, low, high)
        else:
            n_forward_layers = float(n_layers)
        # Run layers skipping after a certain number.
        cur_layer_idx = 0.0
        for layer, p, s, rng in zip(self.sublayers, weights, layers_state,
                                    rngs):
            inputs = _inputs_from_stack(layer, stack)
            outputs, s = math.cond(  # Skip (do identity) if > n_forward_layers.
                pred=(math.lt(cur_layer_idx, n_forward_layers)),
                true_operand=(inputs, p, s, rng),  # This tuple is t below.
                true_fun=(lambda t: layer.pure_fn(t[0], t[1], t[2], t[3])),  # pylint: disable=cell-var-from-loop
                false_operand=(inputs, p, s, rng),
                false_fun=(lambda t: (t[0], t[2])),  # return (inputs, state)
            )
            stack = _outputs_onto_stack(layer, outputs, stack)
            new_state.append(s)
            cur_layer_idx += 1.0
        return stack, (step + 1, new_state)
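A standalone sketch (made-up hyperparameters, plain NumPy) of the bounds computed in the 'train' branch above: `low` forces all layers to run early in training, and `high` is chosen so that after warmup the probability of drawing n_forward_layers below n_layers equals skip_fraction.

import numpy as np

n_layers, skip_fraction, warmup_steps = 6, 0.25, 1000.0

def forward_layer_bounds(step):
    warmup = np.maximum(0.0, (warmup_steps - step) / warmup_steps)
    low = warmup * n_layers            # minimum number of layers to run
    high = n_layers / skip_fraction    # upper end of the uniform draw
    return float(low), float(high)

print(forward_layer_bounds(0))      # (6.0, 24.0): all layers run at step 0
print(forward_layer_bounds(2000))   # (0.0, 24.0): skipping possible after warmup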
Example #4
def Relu():
    r"""Returns a layer that computes the Rectified Linear Unit (ReLU) function.

    .. math::
        f(x) = \left\{ \begin{array}{cl}
            0 & \text{if}\ x \leq 0, \\
            x & \text{otherwise}.
        \end{array} \right.
    """
    return Fn('Relu', lambda x: np.maximum(x, np.zeros_like(x)))
Example #5
def HardTanh():
  r"""Returns a layer that computes a linear approximation to `Tanh`.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          -1 & \text{if}\ x \leq 0, \\
          x  & \text{if}\ -1 < x < 1, \\
          1  & \text{otherwise}.
      \end{array} \right.
  """
  return Fn('HardTanh', lambda x: np.maximum(-1, np.minimum(1, x)))
Example #6
def HardSigmoid():
  r"""Returns a layer that computes a linear approximation to `Sigmoid`.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          0 & \text{if}\ x \leq 0, \\
          x & \text{if}\ 0 < x < 1, \\
          1 & \text{otherwise}.
      \end{array} \right.
  """
  return Fn('HardSigmoid', lambda x: np.maximum(0, np.minimum(1, (1 + x))))
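A quick usage sketch for the three activation layers above (assuming the `Fn` wrappers from trax.layers, which are stateless and can be called directly on an array, as the test in Example #10 below also does):

import numpy as np

x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(Relu()(x))         # [0.   0.   0.   0.5  2. ]
print(HardTanh()(x))     # [-1.  -0.5  0.   0.5  1. ]
print(HardSigmoid()(x))  # [0.   0.5  1.   1.   1. ]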
Example #7
  def forward(self, inputs):
    """Executes this layer as part of a forward pass through the model.

    Args:
      inputs: Tensor.

    Returns:
      Tensor of same shape and dtype as the input.
    """
    threshold = self.weights
    return np.maximum(inputs, threshold)
Example #8
def ParametricRelu(a=1.):
  r"""Returns a layer that computes a ReLU function with the given slope.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          0  & \text{if}\ x \leq 0, \\
          ax & \text{otherwise}.
      \end{array} \right.

  Args:
    a: Slope of line for positive inputs.
  """
  return Fn('ParametricRelu', lambda x: np.maximum(a * x, np.zeros_like(x)))
Example #9
 def learning_rate(step):
     """Step to learning rate function."""
     ret = 1.0
     for name in factors:
         if name == 'constant':
             ret *= constant
         elif name == 'linear_warmup':
             ret *= np.minimum(1.0, step / warmup_steps)
         elif name == 'rsqrt_decay':
             ret /= np.sqrt(np.maximum(step, warmup_steps))
         elif name == 'rsqrt_normalized_decay':
             ret *= np.sqrt(warmup_steps)
             ret /= np.sqrt(np.maximum(step, warmup_steps))
         elif name == 'decay_every':
             ret *= (decay_factor**(step // steps_per_decay))
         elif name == 'cosine_decay':
             progress = np.maximum(0.0, (step - warmup_steps) /
                                   float(steps_per_cycle))
             ret *= (0.5 * (1.0 + np.cos(np.pi * (progress % 1.0))))
         else:
             raise ValueError('Unknown factor %s.' % name)
     ret = np.asarray(ret, dtype=np.float32)
     return {'learning_rate': ret}
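A minimal standalone sketch (hypothetical hyperparameter values) of the common 'constant * linear_warmup * rsqrt_decay' combination of the factors handled above:

import numpy as np

constant, warmup_steps = 0.1, 400

def lr(step):
    ret = constant
    ret *= np.minimum(1.0, step / warmup_steps)       # linear_warmup
    ret /= np.sqrt(np.maximum(step, warmup_steps))    # rsqrt_decay
    return np.asarray(ret, dtype=np.float32)

print(lr(100))     # still warming up: 0.1 * 0.25 / sqrt(400)
print(lr(10000))   # past warmup: decays as 1 / sqrt(step)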
Example #10
  def test_forward(self):
    layer = base.Fn('SumAndMax',
                    lambda x0, x1: (x0 + x1, jnp.maximum(x0, x1)),
                    n_out=2)

    x0 = np.array([1, 2, 3, 4, 5])
    x1 = np.array([10, 20, 30, 40, 50])

    y0, y1 = layer((x0, x1))
    self.assertEqual(y0.tolist(), [11, 22, 33, 44, 55])
    self.assertEqual(y1.tolist(), [10, 20, 30, 40, 50])

    y2, y3 = layer.forward((x0, x1), base.EMPTY_WEIGHTS)
    self.assertEqual(y2.tolist(), [11, 22, 33, 44, 55])
    self.assertEqual(y3.tolist(), [10, 20, 30, 40, 50])

    (y4, y5), state = layer.forward_with_state(
        (x0, x1), base.EMPTY_WEIGHTS, base.EMPTY_STATE, None)
    self.assertEqual(y4.tolist(), [11, 22, 33, 44, 55])
    self.assertEqual(y5.tolist(), [10, 20, 30, 40, 50])
    self.assertEqual(state, base.EMPTY_STATE)
Example #11
def ParametricRelu(a=1.):
    return Fn('ParametricRelu', lambda x: np.maximum(a * x, np.zeros_like(x)))
Example #12
 def forward(self, inputs, weights):
     threshold = weights[0]
     return np.maximum(inputs, threshold)
Example #13
def HardTanh(x, **unused_kwargs):
    """Linear approximation to tanh."""
    return np.maximum(-1, np.minimum(1, x))
Example #14
def HardSigmoid(x, **unused_kwargs):
    """Linear approximation to sigmoid."""
    return np.maximum(0, np.minimum(1, (1 + x)))
Example #15
def ParametricRelu(x, a=1., **unused_kwargs):
    return np.maximum(a * x, np.zeros_like(x))
Example #16
def Relu(x, **unused_kwargs):
    return np.maximum(x, np.zeros_like(x))
Example #17
def Relu(x):
    return np.maximum(x, np.zeros_like(x))
Example #18
def HardTanh():
    """Computes a linear approximation to tanh."""
    return Fn('HardTanh', lambda x: np.maximum(-1, np.minimum(1, x)))
Example #19
def HardSigmoid():
    """Computes a linear approximation to sigmoid."""
    return Fn('HardSigmoid', lambda x: np.maximum(0, np.minimum(1, (1 + x))))
Example #20
def Relu():
    return Fn('Relu', lambda x: np.maximum(x, np.zeros_like(x)))
Example #21
def ParametricRelu(x, a=1.):
    return np.maximum(a * x, np.zeros_like(x))
Example #22
def HardTanh(x):
    """Computes a linear approximation to tanh."""
    return np.maximum(-1, np.minimum(1, x))
Example #23
def HardSigmoid(x):
    """Computes a linear approximation to sigmoid."""
    return np.maximum(0, np.minimum(1, (1 + x)))