def compute_retrospective_loss(self, observed_arr, encoded_arr, decoded_arr, re_encoded_arr):
    '''
    Compute retrospective loss.

    Returns:
        The tuple data.
        - `np.ndarray` of delta.
        - `np.ndarray` of losses of each batch.
        - float of loss of all batches.
    '''
    if self.__output_neuron_count == self.__hidden_neuron_count:
        target_arr = nd.broadcast_sub(
            encoded_arr,
            nd.expand_dims(observed_arr.mean(axis=2), axis=2)
        )
        summary_delta_arr = nd.sqrt(nd.power(decoded_arr - target_arr, 2))
    else:
        # For each batch, draw samples from the uniform distribution.
        if self.__output_neuron_count > self.__hidden_neuron_count:
            all_dim_arr = np.arange(self.__output_neuron_count)
            np.random.shuffle(all_dim_arr)
            choiced_dim_arr = all_dim_arr[:self.__hidden_neuron_count]
            target_arr = nd.broadcast_sub(
                encoded_arr,
                nd.expand_dims(observed_arr[:, :, choiced_dim_arr].mean(axis=2), axis=2)
            )
            summary_delta_arr = nd.sqrt(
                nd.power(decoded_arr[:, :, choiced_dim_arr] - target_arr, 2)
            )
        else:
            all_dim_arr = np.arange(self.__hidden_neuron_count)
            np.random.shuffle(all_dim_arr)
            choiced_dim_arr = all_dim_arr[:self.__output_neuron_count]
            target_arr = nd.broadcast_sub(
                encoded_arr[:, :, choiced_dim_arr],
                nd.expand_dims(observed_arr.mean(axis=2), axis=2)
            )
            summary_delta_arr = nd.sqrt(nd.power(decoded_arr - target_arr, 2))

    match_delta_arr = None
    for i in range(self.__batch_size):
        arr = nd.sqrt(nd.power(encoded_arr[i, -1] - re_encoded_arr[i, -1], 2))
        if match_delta_arr is None:
            match_delta_arr = nd.expand_dims(arr, axis=0)
        else:
            match_delta_arr = nd.concat(
                match_delta_arr,
                nd.expand_dims(arr, axis=0),
                dim=0
            )

    delta_arr = summary_delta_arr + nd.expand_dims(
        self.__retrospective_lambda * match_delta_arr,
        axis=1
    )
    v = nd.norm(delta_arr)
    if v > self.__grad_clip_threshold:
        delta_arr = delta_arr * self.__grad_clip_threshold / v

    loss = nd.mean(delta_arr, axis=0, exclude=True)
    return loss
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    assert len(X.shape) in (2, 4)
    if len(X.shape) == 2:
        mean = X.mean(axis=0)
        variance = ((X - mean) ** 2).mean(axis=0)
    else:
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # Reshape so that broadcasting works correctly.
    moving_mean = moving_mean.reshape(mean.shape)
    moving_variance = moving_variance.reshape(mean.shape)
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        # Update the global (moving) mean and variance.
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)
    # Scale and shift.
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    assert len(X.shape) in (2, 4)
    # Fully connected: batch_size x feature
    if len(X.shape) == 2:
        # Mean and variance of each feature over the batch.
        mean = X.mean(axis=0)
        variance = ((X - mean) ** 2).mean(axis=0)
    # 2D convolution: batch_size x channel x height x width
    else:
        # Compute mean and variance per channel; keep the 4D shape so that
        # broadcasting works correctly.
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # Reshape so that broadcasting works correctly.
    moving_mean = moving_mean.reshape(mean.shape)
    moving_variance = moving_variance.reshape(mean.shape)
    # Normalize.
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        #!!! Update the global (moving) mean and variance.
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        #!!! At test time, use the global mean and variance.
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)
    # Scale and shift.
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
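# A minimal usage sketch for the batch_norm helper above; it is not part of the
# original source, and the tensor shapes and variable names below are
# hypothetical. gamma/beta and the moving statistics are created per channel.
from mxnet import nd

conv_out = nd.random.normal(shape=(32, 16, 8, 8))   # batch x channel x H x W
gamma = nd.ones(shape=(16,))
beta = nd.zeros(shape=(16,))
moving_mean = nd.zeros(shape=(16,))
moving_variance = nd.ones(shape=(16,))

# Training step: normalizes with batch statistics and updates the moving ones in place.
out_train = batch_norm(conv_out, gamma, beta, True, moving_mean, moving_variance)
# Inference step: reuses the accumulated moving statistics instead.
out_test = batch_norm(conv_out, gamma, beta, False, moving_mean, moving_variance)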
def adadelta(params, sqrs, deltas, rho, batch_size):
    eps_stable = 1e-5
    for param, sqr, delta in zip(params, sqrs, deltas):
        g = param.grad / batch_size
        sqr[:] = rho * sqr + (1. - rho) * nd.square(g)
        g_next = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g
        delta[:] = rho * delta + (1. - rho) * g_next * g_next
        param[:] -= g_next
def adadelta(params, sqrs, deltas, batch_size, rho):
    eps_stable = 1e-5
    for param, sqr, delta in zip(params, sqrs, deltas):
        g = param.grad / batch_size
        sqr[:] = rho * sqr + (1. - rho) * nd.square(g)
        cur_delta = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g
        delta[:] = rho * delta + (1. - rho) * cur_delta * cur_delta
        param[:] -= cur_delta
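# A hedged, self-contained sketch (not from the original source) showing how the
# adadelta step above is typically driven from a training loop: per-parameter
# state arrays are created once, gradients are computed, then one update is
# applied. The toy problem and all names here are hypothetical.
from mxnet import nd, autograd

params = [nd.random.normal(shape=(4, 2)), nd.zeros(shape=(2,))]
for param in params:
    param.attach_grad()
sqrs = [nd.zeros(param.shape) for param in params]
deltas = [nd.zeros(param.shape) for param in params]

x = nd.ones((8, 4))
with autograd.record():
    loss = nd.sum(nd.dot(x, params[0]) + params[1])
loss.backward()
adadelta(params, sqrs, deltas, batch_size=x.shape[0], rho=0.9)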
def forward(self, X1, X2):
    X1 = self.bimp(X1)
    X2 = self.bimp(X2)
    X1_norm = nd.sqrt(nd.sum(X1 * X1, axis=-1) + 1e-12)
    X2_norm = nd.sqrt(nd.sum(X2 * X2, axis=-1) + 1e-12)
    distance_cos = 1 - nd.sum(X1 * X2, axis=-1) / (X1_norm * X2_norm + 1e-12)
    return distance_cos
def update(self, index, weight, grad, state):
    assert isinstance(weight, NDArray)
    assert isinstance(grad, NDArray)
    self._update_count(index)
    lr = self._get_lr(index)
    wd = self._get_wd(index)
    t = self._index_update_count[index]

    with bulk(self._bulk):
        # preprocess grad
        grad *= self.rescale_grad
        if self.clip_gradient is not None:
            grad = clip(grad, -self.clip_gradient, self.clip_gradient)

        mean, var = state
        mean *= self.beta1
        mean += (1. - self.beta1) * grad
        var *= self.beta2
        var += (1. - self.beta2) * square(grad)

        r1 = weight.norm()
        if not self.bias_correction:
            r1 = minimum(maximum(r1, self.lower_bound), self.upper_bound)
            sqrt_var = sqrt(var)
            sqrt_var += self.epsilon
            g = mean / sqrt_var
            g += wd * weight
        else:
            # apply bias correction
            mean_hat = mean / (1. - power(self.beta1, t))
            var_hat = var / (1. - power(self.beta2, t))
            if self._eps_after_sqrt:
                sqrt(var_hat, out=var_hat)
                var_hat += self.epsilon
            else:
                var_hat += self.epsilon
                sqrt(var_hat, out=var_hat)
            mean_hat /= var_hat
            mean_hat += wd * weight
            g = mean_hat

        r2 = g.norm()

        # calculate lamb_trust_ratio
        ratio = r1 / r2
        # becomes NaN if ratio == NaN or 0, otherwise 0
        nan_or_zero = 1 - ratio / ratio
        r = where(nan_or_zero, ones_like(ratio), ratio)
        lr *= r

        # update weight
        g *= lr
        weight[:] -= g
def verify_instance_norm_rewrite(shp, eps):
    # assert len(shp) == 4  # NCHW
    assert len(shp) >= 3
    vshp = (shp[1], )
    data_np = np.random.uniform(size=shp)
    gamma_np = np.random.uniform(size=vshp)
    beta_np = np.random.uniform(size=vshp)
    x = nd.array(data_np)
    gamma = nd.array(gamma_np)
    beta = nd.array(beta_np)

    # org op
    y = nd.InstanceNorm(x, gamma=gamma, beta=beta, eps=eps)

    # rewrite op
    axis = [i for i in range(len(shp)) if i != 1]
    for i in axis:
        gamma = nd.expand_dims(gamma, axis=i)
        beta = nd.expand_dims(beta, axis=i)
    n = np.product(shp[2:])
    mean = nd.sum(x, axis=axis, keepdims=True) / n
    dev = x - mean
    var = nd.sum(dev * dev, axis=axis, keepdims=True) / n
    std = nd.sqrt(var) + eps
    frac = dev / std
    z = frac * gamma + beta

    # compare
    assert z.shape == y.shape
    zn, zp = get_norm(z)
    yn, yp = get_norm(y)
    rn = np.linalg.norm(zp - yp)
    print(zn, yn, rn)
def goodness_of_function_optimizer_function(self):
    for param, sqr in zip(self.__params, self.__sqrs):
        g = param.grad / self.__batch_size
        # Note: unlike AdaGrad, this is an exponential moving average, not +=.
        sqr[:] = self.__gamma * sqr + (1. - self.__gamma) * nd.square(g)
        div = self.__learning_rate * g / nd.sqrt(sqr + self.__eps_stable)
        param[:] -= div
def TimeseriesFromPSD_nd(param_noise):
    """ GPU only """
    (asd_pos, asd_neg, low_f, high_f, high_f_, size, fs, fmin, fmax) = param_noise
    (*D_, N) = size
    D = reduce(lambda x, y: x * y, D_)

    # Gauss noise and its one-sided PSD without window
    gauss_noise = 1 * nd.random_normal(loc=0, scale=64, shape=(D, N), ctx=ctx)
    _, xf_noise, psd_gauss = oneSidedPeriodogram_nd(gauss_noise, fs=8192)
    psd_gauss = nd.array(psd_gauss, ctx=ctx, dtype='float64')

    psd_twosided = nd.concat(
        # low positive
        nd.zeros((D, low_f), ctx=ctx, dtype='float64'),
        # high positive
        psd_gauss[:, low_f:high_f] * asd_pos,
        nd.zeros((D, high_f_), ctx=ctx, dtype='float64'),
        nd.zeros((D, high_f_), ctx=ctx, dtype='float64'),
        # high negative
        psd_gauss[:, low_f:high_f][::-1] * asd_neg,
        # low negative
        nd.zeros((D, low_f), ctx=ctx, dtype='float64'),
        dim=1)

    amplitude = nd.sqrt(psd_twosided * 2 * fs * N)
    epsilon_imag = nd.random_uniform(low=0, high=1, shape=(D, N),
                                     ctx=ctx, dtype='float64') * 2 * np.pi
    re = nd.cos(epsilon_imag) * amplitude
    im = nd.sin(epsilon_imag) * amplitude

    temp = nd.zeros((D, N * 2), ctx=ctx)
    temp[:, ::2] = re
    temp[:, 1::2] = im

    timeseries = mx.contrib.ndarray.ifft(temp) / N
    return timeseries.reshape(size), psd_twosided
def merge(conv_w, gamma, beta, running_mean, running_var):
    # Fold batch-norm parameters into the convolution weight and bias.
    gamma_over_var = gamma / nd.sqrt(running_var + 1e-5)
    gamma_over_var_expanded = nd.reshape(gamma_over_var,
                                         (gamma_over_var.shape[0], 1, 1, 1))
    new_w = gamma_over_var_expanded * nd.cast(conv_w, 'float32')
    new_b = beta - running_mean * gamma_over_var
    return new_w, new_b
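# A hedged illustration (not from the original source) of folding a BatchNorm
# layer into the preceding convolution with the merge helper above; all shapes
# and names are hypothetical. The fused weight/bias can replace the original
# convolution parameters so inference no longer needs the BatchNorm layer.
from mxnet import nd

conv_w = nd.random.normal(shape=(64, 3, 3, 3))   # out_ch x in_ch x kH x kW
gamma = nd.ones(shape=(64,))
beta = nd.zeros(shape=(64,))
running_mean = nd.zeros(shape=(64,))
running_var = nd.ones(shape=(64,))

fused_w, fused_b = merge(conv_w, gamma, beta, running_mean, running_var)
print(fused_w.shape, fused_b.shape)   # (64, 3, 3, 3) (64,)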
def adagrad(params, sqrs, lr, batch_size):
    eps_stable = 1e-7
    for param, sqr in zip(params, sqrs):
        g = param.grad / batch_size
        sqr[:] += nd.square(g)
        div = lr * g / nd.sqrt(sqr + eps_stable)
        param[:] -= div
def verify_l2normalization_rewrite(shape, eps, mode):
    assert len(shape) == 4  # NCHW
    data_np = np.random.uniform(size=shape)
    x = nd.array(data_np)

    # org op
    y = nd.L2Normalization(x, eps=eps, mode=mode)

    # rewrite op
    z = nd.broadcast_mul(x, x)
    if mode == "channel":
        axis = [1]
    elif mode == "instance":
        axis = [1, 2, 3]
    elif mode == "spatial":
        axis = [2, 3]
    else:
        assert False, "not valid `mode` type: %s" % mode
    z = nd.sum(z, axis=axis)
    eps_tensor = nd.array([eps])
    z = nd.broadcast_add(z, eps_tensor)
    z = nd.sqrt(z)
    for i in axis:
        z = nd.expand_dims(z, axis=i)
        z = nd.repeat(z, repeats=shape[i], axis=i)
    z = nd.broadcast_div(x, z)

    # compare
    assert z.shape == y.shape
    zn, zp = get_norm(z)
    yn, yp = get_norm(y)
    rn = np.linalg.norm(zp - yp)
    print(zn, yn, rn)
def rmsprop(params, sqrs, lr, gamma, batch_size):
    eps_stable = 1e-8
    for param, sqr in zip(params, sqrs):
        g = param.grad / batch_size
        sqr[:] = gamma * sqr + (1. - gamma) * nd.square(g)
        div = lr * g / nd.sqrt(sqr + eps_stable)
        param[:] -= div
def adagrad(params, sqrs, lr, batch_size):
    eps_stable = 1e-7
    for param, sqr in zip(params, sqrs):
        g = param.grad / batch_size
        sqr[:] = sqr + nd.square(g)
        div = lr * g / (nd.sqrt(eps_stable + sqr))
        param[:] -= div
def update(self):
    self.state_step += 1
    for idx, data in self.trace:
        grad = data.grad
        clr = self.args.lr
        # clr = self.args.lr / (1 + (self.state_step - 1) * group['lr_decay'])

        # the update is non-linear so indices must be unique
        grad_indices = idx
        grad_values = grad

        grad_sum = (grad_values * grad_values).mean(1)
        ctx = self.state_sum.context
        if ctx != grad_indices.context:
            grad_indices = grad_indices.as_in_context(ctx)
        if ctx != grad_sum.context:
            grad_sum = grad_sum.as_in_context(ctx)
        self.state_sum[grad_indices] += grad_sum
        std = self.state_sum[grad_indices]  # _sparse_mask
        std_values = nd.expand_dims(nd.sqrt(std) + 1e-10, 1)
        if self.gpu >= 0:
            std_values = std_values.as_in_context(mx.gpu(self.args.gpu))
        tmp = -clr * grad_values / std_values
        if tmp.context != ctx:
            tmp = tmp.as_in_context(ctx)
        # TODO(zhengda) the overhead is here.
        self.emb[grad_indices] = mx.nd.take(self.emb, grad_indices) + tmp
    self.trace = []
def update(self, index, weight, grad, state):
    assert isinstance(weight, NDArray)
    assert isinstance(grad, NDArray)
    self._update_count(index)
    lr = self._get_lr(index)
    wd = self._get_wd(index)
    t = self._index_update_count[index]

    # preprocess grad
    grad = grad * self.rescale_grad + wd * weight
    if self.clip_gradient is not None:
        grad = clip(grad, -self.clip_gradient, self.clip_gradient)

    # warming momentum schedule
    momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay)))
    momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay)))
    self.m_schedule = self.m_schedule * momentum_t
    m_schedule_next = self.m_schedule * momentum_t_1

    # update m_t and v_t
    m_t, v_t = state
    m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
    v_t[:] = self.beta2 * v_t + (1. - self.beta2) * grad * grad

    grad_prime = grad / (1. - self.m_schedule)
    m_t_prime = m_t / (1. - m_schedule_next)
    v_t_prime = v_t / (1. - pow(self.beta2, t))
    m_t_bar = (1. - momentum_t) * grad_prime + momentum_t_1 * m_t_prime

    # update weight
    weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon)
def update(self, index, weight, grad, state): assert (isinstance(weight, NDArray)) assert (isinstance(grad, NDArray)) self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) is_sparse = grad.stype == 'row_sparse' history = state if is_sparse: kwargs = { 'epsilon': self.float_stable_eps, 'rescale_grad': self.rescale_grad } if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient sparse.adaalter_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) # raise NotImplementedError('AdaAlter has not been implemented for sparse nd') else: grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) div = grad / sqrt(history + self.float_stable_eps) weight[:] += (div + weight * wd) * -lr history[:] += square(grad)
def verify_l2normalization_rewrite_tile(shape, eps, mode):
    assert len(shape) == 4  # NCHW
    data_np = np.random.uniform(size=shape)
    x = nd.array(data_np)

    # org op
    y = nd.L2Normalization(x, eps, mode=mode)

    # rewrite op
    z = nd.broadcast_mul(x, x)
    if mode == "channel":
        axis = [1]
    elif mode == "instance":
        axis = [1, 2, 3]
    elif mode == "spatial":
        axis = [2, 3]
    else:
        assert False, "not valid `mode` type: %s" % mode
    reps = tuple(
        [shp if i in axis else 1 for i, shp in enumerate(list(shape))])
    z = nd.sum(z, axis=axis, keepdims=True)
    eps_tensor = nd.array([eps])
    z = nd.broadcast_add(z, eps_tensor)
    z = nd.sqrt(z)
    z = nd.tile(z, reps=reps)
    z = nd.broadcast_div(x, z)

    # compare
    assert z.shape == y.shape
    zn, zp = get_norm(z)
    yn, yp = get_norm(y)
    rn = np.linalg.norm(zp - yp)
    print(zn, yn, rn)
def grad_clipping(params, theta):
    norm = nd.array([0.0], ctx)
    for p in params:
        norm += nd.sum(p.grad ** 2)
    norm = nd.sqrt(norm).asscalar()
    if norm > theta:
        for p in params:
            p.grad[:] *= theta / norm
def get_global_norm(arrays):
    ctx = arrays[0].context
    total_norm = nd.add_n(*[
        nd.dot(x, x).as_in_context(ctx)
        for x in (arr.reshape((-1, )) for arr in arrays)
    ])
    total_norm = nd.sqrt(total_norm).asscalar()
    return total_norm
def hybrid_forward(self, F, pred, label, sample_weight=None):
    # label = _reshape_like(F, label, pred)
    # loss = F.square(pred - label)
    # loss = _apply_weighting(F, loss, self._weight / 2, sample_weight)
    loss = F.sqrt(F.square(pred - label))
    # return F.mean(loss, axis=self._batch_axis, exclude=True)
    return loss
def adam(params, vs, sqrs, lr, batch_size, t):
    # beta1, beta2 and eps_stable are assumed to be module-level constants.
    for param, v, sqr in zip(params, vs, sqrs):
        g = param.grad / batch_size
        v[:] = beta1 * v + (1 - beta1) * g
        sqr[:] = beta2 * sqr + (1 - beta2) * nd.square(g)
        # Bias-corrected copies; the raw moments stay in the state arrays.
        v_hat = v / (1 - beta1 ** t)
        sqr_hat = sqr / (1 - beta2 ** t)
        param[:] = param - lr * v_hat / (nd.sqrt(sqr_hat) + eps_stable)
def grad_clipping(params, theta, ctx):
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad * p.grad)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
def batched_l2_dist(a, b):
    a_squared = nd.power(nd.norm(a, axis=-1), 2)
    b_squared = nd.power(nd.norm(b, axis=-1), 2)

    squared_res = nd.add(
        nd.linalg_gemm(
            a,
            nd.transpose(b, axes=(0, 2, 1)),
            nd.broadcast_axes(nd.expand_dims(b_squared, axis=-2),
                              axis=1, size=a.shape[1]),
            alpha=-2
        ),
        nd.expand_dims(a_squared, axis=-1))
    res = nd.sqrt(nd.clip(squared_res, 1e-30, np.finfo(np.float32).max))
    return res
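# A small sanity check (not from the original source) comparing batched_l2_dist
# against an explicit pairwise distance; the tensor shapes are hypothetical.
from mxnet import nd

a = nd.random.normal(shape=(2, 5, 8))    # batch x n x dim
b = nd.random.normal(shape=(2, 7, 8))    # batch x m x dim
fast = batched_l2_dist(a, b)             # shape (2, 5, 7)
diff = nd.broadcast_sub(nd.expand_dims(a, axis=2), nd.expand_dims(b, axis=1))
naive = nd.sqrt(nd.sum(nd.square(diff), axis=-1))
print(nd.max(nd.abs(fast - naive)))      # should be close to zero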
def goodness_of_function_optimizer_function(self):
    for param, v, sqr in zip(self.__params, self.__vs, self.__sqrs):
        g = param.grad / self.__batch_size
        v[:] = self.__beta1 * v + (1 - self.__beta1) * g
        sqr[:] = self.__beta2 * sqr + (1 - self.__beta2) * nd.square(g)
        v_hat = v / (1 - self.__beta1 ** self.__t)
        sqr_hat = sqr / (1 - self.__beta2 ** self.__t)
        div = self.__learning_rate * v_hat / nd.sqrt(sqr_hat + self.__eps_stable)
        param[:] -= div
def grad_clipping(params, clipping_norm, ctx):
    """Gradient clipping."""
    if clipping_norm is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > clipping_norm:
            for p in params:
                p.grad[:] *= clipping_norm / norm
def gradient_clipping(parameters, threshold, ctx):
    if threshold is not None:
        norm = nd.array([0.0], ctx)
        for parameter in parameters:
            norm += nd.sum(parameter.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > threshold:
            for parameter in parameters:
                parameter.grad[:] *= (threshold / norm)
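# A hedged, self-contained sketch (not from the original source) of where
# gradient_clipping fits in a training step: gradients are computed first,
# rescaled in place if their global norm exceeds the threshold, and only then
# applied with a plain SGD step. The model, data and threshold are hypothetical.
import mxnet as mx
from mxnet import nd, autograd

ctx = mx.cpu()
w = nd.random.normal(shape=(10, 1), ctx=ctx)
b = nd.zeros((1, 1), ctx=ctx)
parameters = [w, b]
for parameter in parameters:
    parameter.attach_grad()

x = nd.random.normal(shape=(32, 10), ctx=ctx)
y = nd.random.normal(shape=(32, 1), ctx=ctx)
with autograd.record():
    loss = nd.sum((nd.dot(x, w) + b - y) ** 2)
loss.backward()

gradient_clipping(parameters, threshold=5.0, ctx=ctx)
for parameter in parameters:
    parameter[:] -= 0.01 * parameter.grad / x.shape[0]   # plain SGD step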
def adam(params, lr, vals, sqrs, iter, batch_size, beta1=0.9, beta2=0.999):
    eps_stable = 1e-8
    for param, val, sqr in zip(params, vals, sqrs):
        g = param.grad / batch_size
        val[:] = beta1 * val + (1 - beta1) * g
        sqr[:] = beta2 * sqr + (1 - beta2) * nd.square(g)
        # val_next = val / (1 - nd.power(beta1, iter))
        val_next = val / (1. - beta1 ** iter)
        # sqr_next = sqr / (1. - nd.power(beta2, iter))
        sqr_next = sqr / (1. - beta2 ** iter)
        g_next = lr * val_next / (nd.sqrt(sqr_next) + eps_stable)
        param[:] -= g_next
def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = nd.round(nd.sqrt(size_ratios))
    hs = nd.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
def grad_clipping(params, theta, ctx):
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            # print('grad_clipping: grad =', p.grad)
            norm += nd.sum(p.grad ** 2)
        # print('norm:', norm)
        norm = nd.sqrt(norm).asscalar()
        # print('grad_clipping: norm=%f, theta=%f' % (norm, theta))
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm