def create_fns(input, in_signs, Ds):

    cumulative_units = np.concatenate([[0], np.cumsum(Ds[:-1])])

    Ws = [sj.initializers.he((j, i)) for j, i in zip(Ds[1:], Ds[:-1])]
    bs = [sj.initializers.he((j,)) for j in Ds[1:]]

    A_w = [T.eye(Ds[0])]
    B_w = [T.zeros(Ds[0])]

    A_q = [T.eye(Ds[0])]
    B_q = [T.zeros(Ds[0])]

    maps = [input]
    signs = []
    masks = [T.ones(Ds[0])]

    in_masks = T.where(T.concatenate([T.ones(Ds[0]), in_signs]) > 0, 1., 0.1)

    for w, b in zip(Ws[:-1], bs[:-1]):
        pre_activation = T.matmul(w, maps[-1]) + b
        signs.append(T.sign(pre_activation))
        masks.append(T.where(pre_activation > 0, 1., 0.1))
        maps.append(pre_activation * masks[-1])

    maps.append(T.matmul(Ws[-1], maps[-1]) + bs[-1])

    # compute per region A and B
    for start, end, w, b, m in zip(cumulative_units[:-1], cumulative_units[1:],
                                   Ws, bs, masks):

        A_w.append(T.matmul(w * m, A_w[-1]))
        B_w.append(T.matmul(w * m, B_w[-1]) + b)

        A_q.append(T.matmul(w * in_masks[start:end], A_q[-1]))
        B_q.append(T.matmul(w * in_masks[start:end], B_q[-1]) + b)

    signs = T.concatenate(signs)

    ineq_b = T.concatenate(B_w[1:-1])
    ineq_A = T.vstack(A_w[1:-1])

    inequalities = T.hstack([ineq_b[:, None], ineq_A])
    inequalities = inequalities * signs[:, None] / T.linalg.norm(
        ineq_A, 2, 1, keepdims=True)

    inequalities_code = T.hstack(
        [T.concatenate(B_q[1:-1])[:, None], T.vstack(A_q[1:-1])])
    inequalities_code = inequalities_code * in_signs[:, None]

    f = sj.function(input,
                    outputs=[maps[-1], A_w[-1], B_w[-1], inequalities, signs])
    g = sj.function(in_signs, outputs=[A_q[-1], B_q[-1]])
    all_g = sj.function(in_signs, outputs=inequalities_code)
    h = sj.function(input, outputs=maps[-1])

    return f, g, h, all_g
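
# Minimal NumPy sketch (not from the source) of what the "per region A and B"
# loop above computes: on a fixed leaky-ReLU activation pattern the network is
# affine, and folding each layer's mask into the weights yields A, B with
# output == A @ x + B. Sizes and the 0.1 slope below are illustrative.
import numpy as np

Ds = [3, 5, 4, 2]
rng = np.random.default_rng(0)
Ws = [rng.standard_normal((j, i)) for j, i in zip(Ds[1:], Ds[:-1])]
bs = [rng.standard_normal(j) for j in Ds[1:]]
x = rng.standard_normal(Ds[0])

# forward pass: leaky-ReLU on hidden layers, linear output layer
h, masks = x, [np.ones(Ds[0])]
for w, b in zip(Ws[:-1], bs[:-1]):
    pre = w @ h + b
    masks.append(np.where(pre > 0, 1.0, 0.1))
    h = pre * masks[-1]
out = Ws[-1] @ h + bs[-1]

# accumulate the region's affine parameters with the same recursion as above
A, B = np.eye(Ds[0]), np.zeros(Ds[0])
for w, b, m in zip(Ws, bs, masks):
    A = (w * m) @ A
    B = (w * m) @ B + b
assert np.allclose(out, A @ x + B)
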
def elu(x, alpha=1.0):
    r"""Exponential linear unit activation function.

    Computes the element-wise function:

    .. math::

        \mathrm{elu}(x) = \begin{cases}
            x, & x > 0\\
            \alpha \left(\exp(x) - 1\right), & x \le 0
        \end{cases}
    """
    safe_x = T.where(x > 0, 0.0, x)
    return T.where(x > 0, x, alpha * T.expm1(safe_x))
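
# NumPy illustration (not the library code) of why `safe_x` is used above:
# `where` evaluates both branches, so exp of a large positive x would overflow
# (and, under autodiff, can produce NaN gradients) even though that branch is
# never selected. Clamping the unused branch to 0 avoids this.
import numpy as np

x = np.array([-2.0, 0.0, 800.0])
with np.errstate(over="ignore"):
    naive = np.where(x > 0, x, np.expm1(x))        # expm1(800.) overflows to inf
safe_x = np.where(x > 0, 0.0, x)
safe = np.where(x > 0, x, np.expm1(safe_x))        # no overflow in either branch
assert np.allclose(naive, safe)                    # selected values still agree
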
def hard_tanh(x):
    r"""Hard :math:`\mathrm{tanh}` activation function.

    Computes the element-wise function:

    .. math::

        \mathrm{hard\_tanh}(x) = \begin{cases}
            -1, & x < -1\\
            x, & -1 \le x \le 1\\
            1, & 1 < x
        \end{cases}
    """
    return T.where(x > 1, 1, T.where(x < -1, -1, x))
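
# Quick NumPy check (illustrative only) of the piecewise definition above.
import numpy as np

x = np.array([-2.0, -1.0, 0.0, 0.5, 1.0, 2.0])
expected = np.array([-1.0, -1.0, 0.0, 0.5, 1.0, 1.0])
assert np.allclose(np.where(x > 1, 1, np.where(x < -1, -1, x)), expected)
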
def forward(
    self,
    input,
    axis,
    deterministic,
    const=1e-4,
    beta1=0.9,
    beta2=0.9,
    W=T.ones,
    b=T.zeros,
    trainable_W=True,
    trainable_b=True,
):
    self.beta1 = beta1
    self.beta2 = beta2
    self.const = const
    self.axis = axis
    self.deterministic = deterministic

    parameter_shape = [
        input.shape[i] if i in axis else 1 for i in range(input.ndim)
    ]
    reduce_axes = [i for i in range(input.ndim) if i not in axis]

    self.create_variable("W", W, parameter_shape, trainable=trainable_W)
    self.create_variable("b", b, parameter_shape, trainable=trainable_b)

    input_mean = T.mean(input, reduce_axes, keepdims=True)
    input_inv_std = 1 / (T.std(input, reduce_axes, keepdims=True) + const)

    self.avg_mean = schedules.ExponentialMovingAverage(input_mean, beta1)[1]
    self.avg_inv_std = schedules.ExponentialMovingAverage(
        input_inv_std, beta2)[1]

    use_mean = T.where(deterministic, self.avg_mean, input_mean)
    use_inv_std = T.where(deterministic, self.avg_inv_std, input_inv_std)

    W = self.W if self.W is not None else 1.0
    b = self.b if self.b is not None else 0.0
    return W * (input - use_mean) * use_inv_std + b
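
# NumPy sketch (hypothetical shapes, not the layer's API) of the normalization
# above: statistics are taken over the axes *not* listed in `axis`, and at
# test time (`deterministic=True`) the running averages replace them.
import numpy as np

x = np.random.default_rng(0).standard_normal((8, 4))   # (batch, features)
axis, const = (1,), 1e-4                               # keep per-feature params
reduce_axes = tuple(i for i in range(x.ndim) if i not in axis)

mean = x.mean(reduce_axes, keepdims=True)
inv_std = 1.0 / (x.std(reduce_axes, keepdims=True) + const)

W = np.ones((1, x.shape[1]))                           # scale, shape (1, 4)
b = np.zeros((1, x.shape[1]))                          # shift, shape (1, 4)
out = W * (x - mean) * inv_std + b                     # training-mode branch
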
def leaky_relu(x, negative_slope=1e-2):
    r"""Leaky rectified linear unit activation function.

    Computes the element-wise function:

    .. math::

        \mathrm{leaky\_relu}(x) = \begin{cases}
            x, & x \ge 0\\
            \alpha x, & x < 0
        \end{cases}

    where :math:`\alpha` = :code:`negative_slope`.
    """
    return T.where(x >= 0, x, negative_slope * x)
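
# Quick NumPy check (illustrative only) with the default negative_slope.
import numpy as np

x = np.array([-3.0, -0.5, 0.0, 2.0])
assert np.allclose(np.where(x >= 0, x, 1e-2 * x), [-0.03, -0.005, 0.0, 2.0])
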
def __init__(self, input, p, axis, deterministic, seed=None):
    extra_dims = input.ndim - 1
    flip = T.random.bernoulli(
        shape=(input.shape[0],) + (1,) * extra_dims,
        p=p,
        seed=seed,
    )
    dirac = T.cast(deterministic, "float32")

    flipped_input = T.where(flip, T.flip(input, axis), input)
    return input * dirac + flipped_input * (1 - dirac)
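
# NumPy sketch (illustrative shapes) of the per-sample random flip above: one
# Bernoulli draw per sample decides whether that sample is flipped along
# `axis`; when `deterministic` is true the input passes through unchanged.
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 3, 8))           # (batch, channels, width)
p, axis, deterministic = 0.5, -1, False

flip = rng.random((x.shape[0], 1, 1)) < p    # broadcastable per-sample mask
flipped = np.where(flip, np.flip(x, axis), x)
out = x if deterministic else flipped
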
def celu(x, alpha=1.0):
    r"""Continuously-differentiable exponential linear unit activation.

    Computes the element-wise function:

    .. math::

        \mathrm{celu}(x) = \begin{cases}
            x, & x > 0\\
            \alpha \left(\exp(\frac{x}{\alpha}) - 1\right), & x \le 0
        \end{cases}

    For more information, see
    `Continuously Differentiable Exponential Linear Units
    <https://arxiv.org/pdf/1704.07483.pdf>`_.
    """
    return T.where(x > 0, x, alpha * T.expm1(x / alpha))
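
# Numeric NumPy check (illustrative) of the "continuously differentiable"
# property: the slope of celu approaches 1 from both sides of 0.
import numpy as np

def _celu(x, alpha=1.0):
    return np.where(x > 0, x, alpha * np.expm1(x / alpha))

eps = 1e-6
left_slope = (_celu(0.0) - _celu(-eps)) / eps    # ~1 (from the exp branch)
right_slope = (_celu(eps) - _celu(0.0)) / eps    # 1 (from the linear branch)
assert abs(left_slope - right_slope) < 1e-5
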
def create_updates(
    self,
    grads_or_loss,
    learning_rate,
    amsgrad=False,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7,
    params=None,
):
    if isinstance(grads_or_loss, list):
        assert params

    if params is None:
        params = self._get_variables(grads_or_loss)
    elif not isinstance(params, list):
        raise RuntimeError("given params should be a list")

    if len(params) == 0:
        raise RuntimeError(
            "no parameters are given for the gradients, this can be due to "
            "passing explicitly an empty list or to passing a loss connected "
            "to no trainable weights"
        )

    grads = self._get_grads(grads_or_loss, params)

    local_step = tensor.Variable(1, dtype="int32", trainable=False)
    updates = {local_step: local_step + 1}

    # bias-correction factors folded into the learning rate
    beta_1_t = tensor.power(beta_1, local_step)
    beta_2_t = tensor.power(beta_2, local_step)
    lr = learning_rate * (tensor.sqrt(1 - beta_2_t) / (1 - beta_1_t))

    for param, grad in zip(params, grads):
        m = ExponentialMovingAverage(grad, beta_1, debias=False)[0]
        v = ExponentialMovingAverage(grad**2, beta_2, debias=False)[0]

        if amsgrad:
            v_hat = tensor.Variable(
                tensor.zeros_like(param), name="v_hat", trainable=False
            )
            updates[v_hat] = tensor.maximum(v_hat, v)
            update = m / (tensor.sqrt(updates[v_hat]) + epsilon)
        else:
            update = m / (tensor.sqrt(v) + epsilon)

        update = tensor.where(local_step == 1, grad, update)
        updates[param] = param - lr * update

    self.add_updates(updates)
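
# Plain NumPy sketch of the update rule implemented above: biased first/second
# moment EMAs with the bias correction folded into the step size, and the raw
# gradient used on the very first step. Values below are illustrative only.
import numpy as np

def adam_step(param, grad, m, v, t, lr=1e-3, b1=0.9, b2=0.999, eps=1e-7):
    m = b1 * m + (1 - b1) * grad                  # first-moment EMA
    v = b2 * v + (1 - b2) * grad**2               # second-moment EMA
    step = lr * np.sqrt(1 - b2**t) / (1 - b1**t)  # folded bias correction
    update = grad if t == 1 else m / (np.sqrt(v) + eps)
    return param - step * update, m, v

param, m, v = np.ones(3), np.zeros(3), np.zeros(3)
for t in range(1, 101):
    grad = 2 * param                              # gradient of sum(param ** 2)
    param, m, v = adam_step(param, grad, m, v, t)
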
def ExponentialMovingAverage(value, alpha):

    with Scope("ExponentialMovingAverage"):

        first_step = T.Variable(
            True, trainable=False, name="first_step", dtype="bool"
        )
        var = T.Variable(
            T.zeros(value.shape), trainable=False, dtype="float32", name="EMA"
        )

        new_value = T.where(
            first_step, value, var * alpha + (1 - alpha) * value
        )
        current_graph().add({var: new_value, first_step: False})

    return new_value, var
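
# Plain-Python sketch of the recursion above: the first observed value seeds
# the average, afterwards it follows var * alpha + (1 - alpha) * value.
def _ema_step(var, value, alpha, first_step):
    return value if first_step else var * alpha + (1 - alpha) * value

var, first = 0.0, True
for value in [1.0, 2.0, 3.0]:
    var = _ema_step(var, value, alpha=0.9, first_step=first)
    first = False
# var goes 1.0 -> 1.1 -> 1.29
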
def create_fns(input, in_signs, Ds, x, m0, m1, m2, batch_in_signs, alpha=0.1,
               sigma=1, sigma_x=1, lr=0.0002):

    cumulative_units = np.concatenate([[0], np.cumsum(Ds[:-1])])
    BS = batch_in_signs.shape[0]

    Ws = [
        T.Variable(sj.initializers.glorot((j, i)) * sigma)
        for j, i in zip(Ds[1:], Ds[:-1])
    ]
    bs = [T.Variable(sj.initializers.he((j,)) * sigma) for j in Ds[1:-1]]\
        + [T.Variable(T.zeros((Ds[-1],)))]

    A_w = [T.eye(Ds[0])]
    B_w = [T.zeros(Ds[0])]

    A_q = [T.eye(Ds[0])]
    B_q = [T.zeros(Ds[0])]

    batch_A_q = [T.eye(Ds[0]) * T.ones((BS, 1, 1))]
    batch_B_q = [T.zeros((BS, Ds[0]))]

    maps = [input]
    signs = []
    masks = [T.ones(Ds[0])]

    in_masks = T.where(
        T.concatenate([T.ones(Ds[0]), in_signs]) > 0, 1., alpha)
    batch_in_masks = T.where(
        T.concatenate([T.ones((BS, Ds[0])), batch_in_signs], 1) > 0, 1., alpha)

    for w, b in zip(Ws[:-1], bs[:-1]):
        pre_activation = T.matmul(w, maps[-1]) + b
        signs.append(T.sign(pre_activation))
        masks.append(T.where(pre_activation > 0, 1., alpha))
        maps.append(pre_activation * masks[-1])

    maps.append(T.matmul(Ws[-1], maps[-1]) + bs[-1])

    # compute per region A and B
    for start, end, w, b, m in zip(cumulative_units[:-1], cumulative_units[1:],
                                   Ws, bs, masks):

        A_w.append(T.matmul(w * m, A_w[-1]))
        B_w.append(T.matmul(w * m, B_w[-1]) + b)

        A_q.append(T.matmul(w * in_masks[start:end], A_q[-1]))
        B_q.append(T.matmul(w * in_masks[start:end], B_q[-1]) + b)

        batch_A_q.append(
            T.matmul(w * batch_in_masks[:, None, start:end], batch_A_q[-1]))
        batch_B_q.append(
            (w * batch_in_masks[:, None, start:end]
             * batch_B_q[-1][:, None, :]).sum(2) + b)

    batch_B_q = batch_B_q[-1]
    batch_A_q = batch_A_q[-1]

    signs = T.concatenate(signs)

    inequalities = T.hstack(
        [T.concatenate(B_w[1:-1])[:, None], T.vstack(A_w[1:-1])]
    ) * signs[:, None]

    inequalities_code = T.hstack(
        [T.concatenate(B_q[1:-1])[:, None], T.vstack(A_q[1:-1])]
    ) * in_signs[:, None]

    #### loss
    log_sigma2 = T.Variable(sigma_x)
    sigma2 = T.exp(log_sigma2)

    Am1 = T.einsum('qds,nqs->nqd', batch_A_q, m1)
    Bm0 = T.einsum('qd,nq->nd', batch_B_q, m0)
    B2m0 = T.einsum('nq,qd->n', m0, batch_B_q**2)
    AAm2 = T.einsum('qds,qdu,nqup->nsp', batch_A_q, batch_A_q, m2)

    inner = -(x * (Am1.sum(1) + Bm0)).sum(1) + (Am1 * batch_B_q).sum((1, 2))

    loss_2 = (x**2).sum(1) + B2m0 + T.trace(AAm2, axis1=1, axis2=2).squeeze()
    loss_z = T.trace(m2.sum(1), axis1=1, axis2=2).squeeze()

    cst = 0.5 * (Ds[0] + Ds[-1]) * T.log(2 * np.pi)

    loss = cst + 0.5 * Ds[-1] * log_sigma2 + inner / sigma2\
        + 0.5 * loss_2 / sigma2 + 0.5 * loss_z
    mean_loss = loss.mean()

    adam = sj.optimizers.NesterovMomentum(mean_loss, Ws + bs, lr, 0.9)

    train_f = sj.function(batch_in_signs, x, m0, m1, m2, outputs=mean_loss,
                          updates=adam.updates)
    f = sj.function(input,
                    outputs=[maps[-1], A_w[-1], B_w[-1], inequalities, signs])
    g = sj.function(in_signs, outputs=[A_q[-1], B_q[-1]])
    all_g = sj.function(in_signs, outputs=inequalities_code)
    h = sj.function(input, outputs=maps[-1])

    return f, g, h, all_g, train_f, sigma2
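
# NumPy shape sanity check (hypothetical sizes, not from the source) for the
# einsum contractions used in the loss above, with q sign patterns (regions),
# n data samples, s the input dimension and d the output dimension.
import numpy as np

n, q, s, d = 4, 6, 3, 2
batch_A_q = np.zeros((q, d, s))
batch_B_q = np.zeros((q, d))
m0, m1, m2 = np.zeros((n, q)), np.zeros((n, q, s)), np.zeros((n, q, s, s))

assert np.einsum('qds,nqs->nqd', batch_A_q, m1).shape == (n, q, d)
assert np.einsum('qd,nq->nd', batch_B_q, m0).shape == (n, d)
assert np.einsum('nq,qd->n', m0, batch_B_q**2).shape == (n,)
assert np.einsum('qds,qdu,nqup->nsp', batch_A_q, batch_A_q, m2).shape == (n, s, s)
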
def relu_mask(x, leakiness):
    if isinstance(x, list):
        return [relu_mask(xx, leakiness) for xx in x]
    return T.where(x >= 0, 1., leakiness)
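
# Quick NumPy check (illustrative only): relu_mask returns the pointwise slope
# of the leaky-ReLU, so mask * x reproduces the activation itself.
import numpy as np

x, leakiness = np.array([-2.0, -0.5, 0.0, 0.5, 2.0]), 0.1
mask = np.where(x >= 0, 1.0, leakiness)
assert np.allclose(mask * x, np.where(x >= 0, x, leakiness * x))
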
def forward(self, input, p, deterministic, seed=None):
    self.p = p
    self.mask = T.random.bernoulli(shape=input.shape, p=p, seed=seed)
    return T.where(deterministic, input, self.mask * input)
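
# NumPy sketch of the masking above, where `p` is the keep probability and
# `deterministic=True` disables the mask. Note this variant does not rescale
# the kept units; a common alternative is "inverted dropout" (mask * x / p).
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 5))
p, deterministic = 0.8, False

mask = (rng.random(x.shape) < p).astype(x.dtype)
out = x if deterministic else mask * x
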
def ExponentialMovingAverage(
    value,
    alpha,
    init=None,
    decay_min=False,
    debias=True,
    name="ExponentialMovingAverage",
):
    r"""Exponential moving average of a given value.

    This method returns an EMA of a given variable (or any Tensor) whose
    internal state is automatically updated as new samples are observed;
    the internal updates are applied as part of a function.

    At each iteration the new value is given by

    .. math::

        v(0) = value(0) \;\text{or}\; init

        v(t) = v(t-1) \cdot \alpha + value(t) \cdot (1 - \alpha)

    Args
    ----

    value: Tensor-like
        the value to use for the EMA

    alpha: scalar
        the decay of the EMA

    init: Tensor-like (same shape as value), optional
        the initialization of the EMA; if not given, zeros are used and the
        debiased estimate (see ``debias``) remains unbiased

    decay_min: bool
        at early stages, clip the decay to avoid erratic behaviors

    debias: bool
        whether to return a bias-corrected estimate of the EMA

    Returns
    -------

    ema: Tensor-like
        the current (latest) value of the EMA incorporating the information
        of the latest observation of value

    fixed_ema: Tensor-like
        the value of the EMA from the previous pass. This is useful if one
        wants to keep the estimate of the EMA fixed for new observations:
        simply stop applying the updates (using a new function) and use this
        fixed variable during testing (while ema will keep using the latest
        observed value)

    Example
    -------

    .. doctest ::

        >>> import symjax
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> symjax.current_graph().reset()
        >>> # suppose we want to do an EMA of a vector user-input
        >>> input = symjax.tensor.Placeholder((2,), 'float32')
        >>> ema, var = symjax.nn.schedules.ExponentialMovingAverage(input, 0.9)
        >>> # in the background, symjax automatically records the needed updates
        >>> print(symjax.get_updates())
        {Variable(name=EMA, shape=(2,), dtype=float32, trainable=False, scope=/ExponentialMovingAverage/): Op(name=where, fn=where, shape=(2,), dtype=float32, scope=/ExponentialMovingAverage/), Variable(name=first_step, shape=(), dtype=bool, trainable=False, scope=/ExponentialMovingAverage/): False}
        >>> # example of use:
        >>> f = symjax.function(input, outputs=ema, updates=symjax.get_updates())
        >>> for i in range(25):
        ...     print(f(np.ones(2) + np.random.randn(2) * 0.3))
        [1.5292157 1.1200472]
        [1.5056562 1.1752692]
        [1.5111173 1.1284239]
        [1.4885082 1.1110408]
        [1.4365609 1.1122546]
        [1.3972261 1.1446574]
        [1.3803346 1.1338419]
        [1.355617 1.1304679]
        [1.3648777 1.1112664]
        [1.3377819 1.0745169]
        [1.227414 1.0866737]
        [1.2306056 1.0557414]
        [1.2756376 1.0065362]
        [1.2494465 1.000267 ]
        [1.2704852 1.0443211]
        [1.2480851 1.0512339]
        [1.196643 0.9866866]
        [1.1665413 0.9927084]
        [1.186796 1.029509]
        [1.1564965 1.017489 ]
        [1.1093903 0.97313946]
        [1.0472631 1.0343488]
        [1.0272473 1.0177717]
        [0.9869387 1.0393193]
        [0.93982786 1.029005 ]
    """

    with Scope(name):

        init = init if init is not None else T.zeros_like(value, detach=True)

        num_steps = T.Variable(
            0, trainable=False, name="num_steps", dtype="int32"
        )
        var = T.Variable(init, trainable=False, dtype="float32", name="EMA")

        if decay_min:
            decay = T.minimum(alpha, (1.0 + num_steps) / (10.0 + num_steps))
        else:
            decay = alpha

        ema = decay * var + (1 - decay) * value
        var_update = T.where(T.equal(num_steps, 0), init, ema)

        current_graph().add_updates({var: ema, num_steps: num_steps + 1})

        if debias:
            debiased_ema = ema_debias(ema, init, decay, num_steps + 1)
            debiased_var = T.Variable(
                init, trainable=False, dtype="float32", name="debiased_EMA"
            )
            current_graph().add_updates({debiased_var: debiased_ema})

    if debias:
        return debiased_ema, debiased_var
    else:
        return ema, var
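
# Sketch of the standard EMA bias correction (as used in Adam): with a zero
# init the running average is scaled by 1 / (1 - alpha**t). `ema_debias`
# above is assumed to implement something along these lines.
alpha, ema = 0.9, 0.0
for t in range(1, 4):
    ema = alpha * ema + (1 - alpha) * 1.0    # constant input of 1.0
    debiased = ema / (1 - alpha**t)
    assert abs(debiased - 1.0) < 1e-9        # biased ema is 0.1, 0.19, 0.271
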