def update(self, step, grads, weights, slots, opt_params):
  """Computes one optimizer step; returns (new_weights, updated slot values)."""
  updates = []
  learning_rate = opt_params['learning_rate']
  beta1 = opt_params['beta1']
  decay_rate = opt_params['decay_rate']
  clipping_threshold = opt_params['clipping_threshold']
  weight_decay_rate = opt_params['weight_decay_rate']
  epsilon1 = opt_params['epsilon1']
  epsilon2 = opt_params['epsilon2']

  decay_rate = self._decay_rate_pow(step, exponent=decay_rate)
  update_scale = learning_rate
  if self._multiply_by_parameter_scale:
    update_scale *= np.maximum(np.sqrt(np.mean(weights * weights)), epsilon2)
  mixing_rate = 1.0 - decay_rate

  grads_sqr = grads * grads + epsilon1
  if self._factored and len(weights.shape) >= 2:
    # Factored second-moment estimate: keep only row and column statistics.
    v_row = slots.pop(0)
    v_col = slots.pop(0)
    new_v_row = decay_rate * v_row + mixing_rate * np.mean(grads_sqr, axis=-1)
    new_v_col = decay_rate * v_col + mixing_rate * np.mean(grads_sqr, axis=-2)
    updates.extend([new_v_row, new_v_col])
    row_col_mean = np.mean(new_v_row, axis=-1, keepdims=True)
    row_factor = (new_v_row / row_col_mean)**-0.5
    col_factor = (new_v_col)**-0.5
    y = (grads * np.expand_dims(row_factor, axis=-1) *
         np.expand_dims(col_factor, axis=-2))
  else:
    # Unfactored case: keep the full second-moment estimate.
    v = slots.pop(0)
    new_v = decay_rate * v + mixing_rate * grads_sqr
    updates.append(new_v)
    y = grads * (new_v)**-0.5

  if self._do_clipping:
    clipping_denom = (
        np.maximum(1.0, np.sqrt(np.mean(y * y)) / clipping_threshold))
    y /= clipping_denom

  subtrahend = update_scale * y
  if self._do_momentum:
    m = slots.pop(0)
    new_m = beta1 * m + (1.0 - beta1) * subtrahend
    subtrahend = new_m
    updates.append(new_m)

  new_weights = (1 - weight_decay_rate) * weights - subtrahend
  # TODO(lukaszkaiser): why is the astype needed here? Check and correct.
  return new_weights.astype(weights.dtype), updates
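# For intuition, the factored branch above stores only row and column statistics
# and reconstructs the full second-moment estimate as an outer product, as in
# Adafactor-style optimizers. A minimal standalone NumPy sketch of that
# reconstruction (illustrative values only, not this optimizer's API):
import numpy as np

grads = np.array([[0.1, -0.2], [0.3, 0.4]])
grads_sqr = grads * grads + 1e-30

decay_rate, mixing_rate = 0.8, 0.2
v_row = np.zeros(2)  # running mean of squared grads over the last axis
v_col = np.zeros(2)  # running mean of squared grads over the second-to-last axis
new_v_row = decay_rate * v_row + mixing_rate * grads_sqr.mean(axis=-1)
new_v_col = decay_rate * v_col + mixing_rate * grads_sqr.mean(axis=-2)

# Factored estimate of the full second moment: outer(row, col) / mean(row).
v_hat = np.outer(new_v_row, new_v_col) / new_v_row.mean()
y = grads / np.sqrt(v_hat)  # equals the row_factor / col_factor product above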
def Softmax5Branches(x_list, **unused_kwargs):
  """Softmax-weighted average over 5 branches.

  The input x_list has the form w_1 ... w_n q_1 ... q_n, i.e. n branch weights
  followed by n embedded queries. The result is the average of q_1 ... q_n
  weighted by softmax(w_1 ... w_n).

  Args:
    x_list: the input weights and embeddings.

  Returns:
    the weighted average of q_1 ... q_n according to softmax(w).
  """
  n_branches = 5
  softmax_activations = x_list[:n_branches]
  # Subtract the elementwise max for numerical stability before exponentiating.
  max_sa = softmax_activations[0]
  for x in softmax_activations:
    max_sa = np.maximum(max_sa, x)
  softmax_activations = [x - max_sa for x in softmax_activations]
  softmax_activations = [np.exp(x) for x in softmax_activations]
  sum_sa = sum(softmax_activations)
  softmax_activations = [x / sum_sa for x in softmax_activations]
  res = sum([x_list[i + n_branches] * softmax_activations[i]
             for i in range(n_branches)])
  return res
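# A small self-contained check of Softmax5Branches, assuming the backend `np`
# accepts plain NumPy arrays here; the scalar branch weights and 3-dim queries
# below are illustrative values only.
import numpy as np

branch_weights = [np.array(w) for w in (0.1, 2.0, -1.0, 0.5, 0.0)]
queries = [np.full(3, float(i)) for i in range(5)]

out = Softmax5Branches(branch_weights + queries)

# Reference: softmax over the weights, then a weighted average of the queries.
probs = np.exp(branch_weights) / np.sum(np.exp(branch_weights))
expected = sum(p * q for p, q in zip(probs, queries))
assert np.allclose(out, expected)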
def forward_with_state(self, xs, weights, state, rng):
  self._validate_forward_inputs(xs)
  (step, layers_state) = state
  # Get N+1 rngs, N for running layers and one extra.
  rngs = _split_rngs(rng, self._n_layers + 1)
  rng0, rngs = rngs[0], rngs[1:]
  if not self.sublayers:  # No-op: leave args unchanged.
    return (xs, (step + 1, layers_state))

  # Prepare the stack and do some safety checks as in the parent class.
  stack = xs
  new_state = []
  n_layers = self._n_layers
  if n_layers != 1 and len(weights) != n_layers:
    raise ValueError('number of weights ({}) not equal to number of layers '
                     '({})'.format(len(weights), n_layers))
  if n_layers != 1 and len(layers_state) != n_layers:
    raise ValueError('length of state ({}) not equal to number of layers '
                     '({})'.format(len(layers_state), n_layers))

  # TODO(chowdhery): try different strategies, also try running not all
  # layers backwards by using math.stop_gradient where needed.

  # Calculate how many layers to run forward.
  if self._mode == 'train':
    # warmup goes from 1.0 at start to 0.0 at skipping_warmup_steps and after
    w_steps = float(self._skipping_warmup_steps)
    warmup = np.maximum(0.0, (w_steps - step.astype(np.float32)) / w_steps)
    # low is the minimum number of layers to *not* skip, from n_layers to 0
    low = warmup * float(n_layers)
    # high should be so that (high - n_layers) / high = 1.0 - skip_fraction
    # because (high - n_layers) / high is the probability we're not skipping
    # (after warmup); so high - n_layers = high - high * skip_fraction
    high = float(n_layers) / self._skip_fraction
    # We want the same rng0 on all cores.
    if math.device_count() > 1:
      rng0 = math.psum(rng0, 'batch')
    n_forward_layers = random.uniform(rng0, (), np.float32, low, high)
  else:
    n_forward_layers = float(n_layers)

  # Run layers, skipping those past n_forward_layers.
  cur_layer_idx = 0.0
  for layer, p, s, rng in zip(self.sublayers, weights, layers_state, rngs):
    inputs = _inputs_from_stack(layer, stack)
    outputs, s = math.cond(
        # Skip (do identity) when cur_layer_idx >= n_forward_layers.
        pred=(math.lt(cur_layer_idx, n_forward_layers)),
        true_operand=(inputs, p, s, rng),  # This tuple is t below.
        true_fun=(lambda t: layer.pure_fn(t[0], t[1], t[2], t[3])),  # pylint: disable=cell-var-from-loop
        false_operand=(inputs, p, s, rng),
        false_fun=(lambda t: (t[0], t[2])),  # return (inputs, state)
    )
    stack = _outputs_onto_stack(layer, outputs, stack)
    new_state.append(s)
    cur_layer_idx += 1.0
  return stack, (step + 1, new_state)
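# To make the warmup/skip arithmetic above concrete, a standalone sketch with
# illustrative values (n_layers=6, skip_fraction=0.25, 1000 warmup steps are
# assumptions, not the class defaults):
n_layers, skip_fraction = 6, 0.25
skipping_warmup_steps = 1000.0

def sampling_range(step):
  # Mirrors the low/high computation in forward_with_state above.
  warmup = max(0.0, (skipping_warmup_steps - step) / skipping_warmup_steps)
  low = warmup * n_layers          # layers guaranteed to run
  high = n_layers / skip_fraction  # so P(run all layers) = (high - n_layers) / high
  return low, high

print(sampling_range(0.0))     # (6.0, 24.0): at step 0 every layer always runs
print(sampling_range(1000.0))  # (0.0, 24.0): after warmup, P(no skipping) = 18/24 = 0.75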
def Relu():
  r"""Returns a layer that computes the Rectified Linear Unit (ReLU) function.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          0 & \text{if}\ x \leq 0, \\
          x & \text{otherwise}.
      \end{array} \right.
  """
  return Fn('Relu', lambda x: np.maximum(x, np.zeros_like(x)))
def HardTanh():
  r"""Returns a layer that computes a linear approximation to `Tanh`.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          -1 & \text{if}\ x \leq -1, \\
          x  & \text{if}\ -1 < x < 1, \\
          1  & \text{otherwise}.
      \end{array} \right.
  """
  return Fn('HardTanh', lambda x: np.maximum(-1, np.minimum(1, x)))
def HardSigmoid():
  r"""Returns a layer that computes a linear approximation to `Sigmoid`.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          0     & \text{if}\ x \leq -1, \\
          x + 1 & \text{if}\ -1 < x < 0, \\
          1     & \text{otherwise}.
      \end{array} \right.
  """
  return Fn('HardSigmoid', lambda x: np.maximum(0, np.minimum(1, (1 + x))))
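# The lambda above rises linearly from 0 to 1 over the interval [-1, 0] and
# saturates outside it; a quick standalone check of a few values (plain NumPy,
# illustrative only):
import numpy as np

hard_sigmoid = lambda x: np.maximum(0, np.minimum(1, 1 + x))
print(hard_sigmoid(np.array([-2.0, -1.0, -0.5, 0.0, 2.0])))  # prints [0. 0. 0.5 1. 1.]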
def forward(self, inputs):
  """Executes this layer as part of a forward pass through the model.

  Args:
    inputs: Tensor.

  Returns:
    Tensor of same shape and dtype as the input.
  """
  threshold = self.weights
  return np.maximum(inputs, threshold)
def ParametricRelu(a=1.):
  r"""Returns a layer that computes a ReLU function with the given slope.

  .. math::
      f(x) = \left\{ \begin{array}{cl}
          0  & \text{if}\ x \leq 0, \\
          ax & \text{otherwise}.
      \end{array} \right.

  Args:
    a: Slope of line for positive inputs.
  """
  return Fn('ParametricRelu', lambda x: np.maximum(a * x, np.zeros_like(x)))
def learning_rate(step):
  """Maps a training step to a learning rate by composing the given factors."""
  ret = 1.0
  for name in factors:
    if name == 'constant':
      ret *= constant
    elif name == 'linear_warmup':
      ret *= np.minimum(1.0, step / warmup_steps)
    elif name == 'rsqrt_decay':
      ret /= np.sqrt(np.maximum(step, warmup_steps))
    elif name == 'rsqrt_normalized_decay':
      ret *= np.sqrt(warmup_steps)
      ret /= np.sqrt(np.maximum(step, warmup_steps))
    elif name == 'decay_every':
      ret *= (decay_factor**(step // steps_per_decay))
    elif name == 'cosine_decay':
      progress = np.maximum(0.0,
                            (step - warmup_steps) / float(steps_per_cycle))
      ret *= (0.5 * (1.0 + np.cos(np.pi * (progress % 1.0))))
    else:
      raise ValueError('Unknown factor %s.' % name)
  ret = np.asarray(ret, dtype=np.float32)
  return {'learning_rate': ret}
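# As a quick check of how the multiplicative factors compose, a standalone
# sketch evaluating 'constant * linear_warmup * rsqrt_decay' at a few steps
# (the constants below are illustrative, not library defaults):
import numpy as np

constant, warmup_steps = 0.1, 100.0

def lr(step):
  ret = constant                                   # 'constant'
  ret *= np.minimum(1.0, step / warmup_steps)      # 'linear_warmup'
  ret /= np.sqrt(np.maximum(step, warmup_steps))   # 'rsqrt_decay'
  return ret

print(lr(10.0))   # warming up:     0.1 * 0.1 / 10 = 0.001
print(lr(100.0))  # end of warmup:  0.1 * 1.0 / 10 = 0.01
print(lr(400.0))  # 1/sqrt decay:   0.1 * 1.0 / 20 = 0.005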
def test_forward(self):
  layer = base.Fn('SumAndMax',
                  lambda x0, x1: (x0 + x1, jnp.maximum(x0, x1)),
                  n_out=2)

  x0 = np.array([1, 2, 3, 4, 5])
  x1 = np.array([10, 20, 30, 40, 50])

  y0, y1 = layer((x0, x1))
  self.assertEqual(y0.tolist(), [11, 22, 33, 44, 55])
  self.assertEqual(y1.tolist(), [10, 20, 30, 40, 50])

  y2, y3 = layer.forward((x0, x1), base.EMPTY_WEIGHTS)
  self.assertEqual(y2.tolist(), [11, 22, 33, 44, 55])
  self.assertEqual(y3.tolist(), [10, 20, 30, 40, 50])

  (y4, y5), state = layer.forward_with_state(
      (x0, x1), base.EMPTY_WEIGHTS, base.EMPTY_STATE, None)
  self.assertEqual(y4.tolist(), [11, 22, 33, 44, 55])
  self.assertEqual(y5.tolist(), [10, 20, 30, 40, 50])
  self.assertEqual(state, base.EMPTY_STATE)
def ParametricRelu(a=1.):
  return Fn('ParametricRelu', lambda x: np.maximum(a * x, np.zeros_like(x)))
def forward(self, inputs, weights):
  threshold = weights[0]
  return np.maximum(inputs, threshold)
def HardTanh(x, **unused_kwargs):
  """Linear approximation to tanh."""
  return np.maximum(-1, np.minimum(1, x))
def HardSigmoid(x, **unused_kwargs):
  """Linear approximation to sigmoid."""
  return np.maximum(0, np.minimum(1, (1 + x)))
def ParametricRelu(x, a=1., **unused_kwargs):
  return np.maximum(a * x, np.zeros_like(x))
def Relu(x, **unused_kwargs):
  return np.maximum(x, np.zeros_like(x))
def Relu(x):
  return np.maximum(x, np.zeros_like(x))
def HardTanh():
  """Computes a linear approximation to tanh."""
  return Fn('HardTanh', lambda x: np.maximum(-1, np.minimum(1, x)))
def HardSigmoid():
  """Computes a linear approximation to sigmoid."""
  return Fn('HardSigmoid', lambda x: np.maximum(0, np.minimum(1, (1 + x))))
def Relu():
  return Fn('Relu', lambda x: np.maximum(x, np.zeros_like(x)))
def ParametricRelu(x, a=1.):
  return np.maximum(a * x, np.zeros_like(x))
def HardTanh(x):
  """Computes a linear approximation to tanh."""
  return np.maximum(-1, np.minimum(1, x))
def HardSigmoid(x):
  """Computes a linear approximation to sigmoid."""
  return np.maximum(0, np.minimum(1, (1 + x)))