def _add(self, x, y):
    if self._tf1:
        result = K.update_add(x, y)
    else:
        result = state_ops.assign_add(x, y, use_locking=self._use_locking)
    self._updates.append(result)
    return result
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = accumulators
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
        # Update accumulator.
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))
        new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        # Optionally clip the updated value to a per-variable range.
        clptrkey = set_pattern_find(p.name, self.clips.keys())
        if self.clips_val and clptrkey:
            c = K.eval(self.clips[clptrkey])
            if self.verbose > 0:
                print("Clipping variable", p.name, " to ", c)
            new_p = K.clip(new_p, c[0], c[1])

        self.updates.append(K.update(p, new_p))
    return self.updates
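# The snippet above calls a `set_pattern_find` helper that is not shown here.
# A minimal sketch of what such a helper could look like, assuming it returns
# the first key of the clips dictionary whose pattern occurs in the variable
# name (the exact matching rule is an assumption, not the original code):
import re


def set_pattern_find(name, patterns):
    # Hypothetical helper: return the first matching clip key, else None.
    for key in patterns:
        if re.search(key, name):
            return key
    return None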
def get_updates(self, loss, params):
    assert len(params) == len(self.multipliers)
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    # Momentum.
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m, mult in zip(params, grads, moments, self.multipliers):
        v = self.momentum * m - (lr * mult) * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        g2 = K.square(g)
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = v - (1. - self.beta_2) * K.sign(v - g2) * g2
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    adam_lr = self.adam_lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        adam_lr = adam_lr * (1. / (1. + self.decay * K.cast(
            self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    adam_lr_t = adam_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                           (1. - K.pow(self.beta_1, t)))

    # Momentum.
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.ms = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
    self.vs = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
    self.weights = [self.iterations] + moments + vhats + [self.ms] + [self.vs]

    for i, (p, g, m, vhat) in enumerate(zip(params, grads, moments, vhats)):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        if i == 0 and self.e2efs_layer is not None:
            nnz = K.sum(K.cast(K.greater(p, 0.), K.floatx()))
            m_t = (self.beta_1 * self.ms) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * self.vs) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - adam_lr_t * m_t / (K.sqrt(vhat_t) + K.epsilon())
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - adam_lr_t * m_t / (K.sqrt(v_t) + K.epsilon())
            self.updates.append(K.update(self.ms, m_t))
            self.updates.append(K.update(self.vs, v_t))
            new_p = K.switch(K.less_equal(nnz, self.e2efs_layer.units),
                             new_p, p_t)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def call(self, inputs):
    inputs, weights = inputs
    weights = weights / tf.reduce_sum(weights)  # Normalize sample weights
    weights_expand = tf.expand_dims(weights, axis=1)
    mean, variance = tf.nn.weighted_moments(
        inputs, [0], weights_expand)  # Compute weighted mean and variance
    counter = K.update_add(
        self.counter, K.ones_like(self.counter)
    )  # Count how many times data has passed through the model
    init = K.sign(
        counter - K.ones_like(counter)
    )  # Indicator is 0 on the first (initialization) pass, 1 afterwards
    mean = K.update(
        self.mean,
        init * self.mean + (1.0 - init) * mean)  # Store the batch mean on the first pass
    variance = K.update(
        self.variance,
        init * self.variance + (1.0 - init) * variance)  # Store the batch variance on the first pass
    mean_expand = tf.expand_dims(mean, axis=0)
    variance_expand = tf.expand_dims(variance, axis=0)
    outputs = (inputs - mean_expand) / tf.sqrt(
        variance_expand + self.epsilon)  # Normalize the inputs
    return outputs
def north_bad(loss, pred, velocities):
    # Check the south loss.
    for (w, v) in zip(trainable_vars, velocities):
        # x0 + v (N) -> x0 - v (S)
        K.update_add(w, -2 * v)
    # The base algorithm would check the gradient at x0 or at x0 + v,
    # but we are checking it at x0 - v here.
    with tf.GradientTape() as tape:
        predS = self(x, training=True)
        lossS = K.mean(loss_fun(y, predS))
    south_good_f = functools.partial(south_good, loss, pred, lossS, tape,
                                     velocities)
    south_bad_f = functools.partial(south_bad, loss, pred, velocities)
    return tf.cond(lossS <= loss0, south_good_f, south_bad_f)
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    self.weights = accumulators
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for i, (p, g, a) in enumerate(zip(params, grads, accumulators)):
        # Update accumulator.
        rho = (0.5 if i == 0 and self.e2efs_layer is not None
               and not self.lr_momentum else self.rho)
        i_lr = (self.e2efs_lr if i == 0 and self.e2efs_layer is not None
                and not self.lr_momentum else lr)
        new_a = rho * a + (1. - rho) * K.square(g)
        self.updates.append(K.update(a, new_a))
        new_p = p - i_lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    sync_cond = K.equal(
        (self.iterations + 1) // self.sync_period * self.sync_period,
        (self.iterations + 1))
    if TF_KERAS:
        slow_params = [K.variable(K.get_value(p), name='sp_{}'.format(i))
                       for i, p in enumerate(params)]
        self.updates = self.optimizer.get_updates(loss, params)
        slow_updates = []
        for p, sp in zip(params, slow_params):
            sp_t = sp + self.slow_step * (p - sp)
            slow_updates.append(K.update(sp, K.switch(
                sync_cond,
                sp_t,
                sp,
            )))
            slow_updates.append(K.update_add(p, K.switch(
                sync_cond,
                sp_t - p,
                K.zeros_like(p),
            )))
    else:
        slow_params = {p.name: K.variable(K.get_value(p),
                                          name='sp_{}'.format(i))
                       for i, p in enumerate(params)}
        update_names = ['update', 'update_add', 'update_sub']
        original_updates = [getattr(K, name) for name in update_names]
        setattr(K, 'update', lambda x, new_x: ('update', x, new_x))
        setattr(K, 'update_add', lambda x, new_x: ('update_add', x, new_x))
        setattr(K, 'update_sub', lambda x, new_x: ('update_sub', x, new_x))
        self.updates = self.optimizer.get_updates(loss, params)
        for name, original_update in zip(update_names, original_updates):
            setattr(K, name, original_update)
        slow_updates = []
        for i, update in enumerate(self.updates):
            if isinstance(update, tuple):
                name, x, new_x, adjusted = update + (update[-1],)
                update_func = getattr(K, name)
                if name == 'update_add':
                    adjusted = x + new_x
                if name == 'update_sub':
                    adjusted = x - new_x
                if x.name not in slow_params:
                    self.updates[i] = update_func(x, new_x)
                else:
                    slow_param = slow_params[x.name]
                    slow_param_t = slow_param + \
                        self.slow_step * (adjusted - slow_param)
                    slow_updates.append(K.update(slow_param, K.switch(
                        sync_cond,
                        slow_param_t,
                        slow_param,
                    )))
                    self.updates[i] = K.update(x, K.switch(
                        sync_cond,
                        slow_param_t,
                        adjusted,
                    ))
        slow_params = list(slow_params.values())
    self.updates += slow_updates
    self.weights = self.optimizer.weights + slow_params
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Apply bounds on the actual learning rate.
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Apply weight decay.
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Compute the bounds.
        step_size_p = step_size * K.ones_like(denom)
        step_size_p_bound = step_size_p / denom
        bounded_lr_t = m_t * K.minimum(
            K.maximum(step_size_p_bound, lower_bound), upper_bound)

        p_t = p - bounded_lr_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers.
        if self.multipliers:
            multiplier = [mult for mult in self.multipliers if mult in p.name]
        else:
            multiplier = None
        if multiplier:
            new_lr_t = lr_t * self.multipliers[multiplier[0]]
            if self.debug_verbose:
                print('Setting {} to learning rate {}'.format(
                    multiplier[0], new_lr_t))
                print(K.get_value(new_lr_t))
        else:
            new_lr_t = lr_t
            if self.debug_verbose:
                print('No change in learning rate {}'.format(p.name))
                print(K.get_value(new_lr_t))

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def loop_body(i, loss, pred, velocities, gradient_steps):
    # Set x <- x0 + v.
    for (w, v) in zip(trainable_vars, velocities):
        K.update_add(w, v)
    predN = self(x, training=True)
    lossN = K.mean(loss_fun(y, predN))
    north_good_f = functools.partial(north_good, lossN, predN, velocities)
    north_bad_f = functools.partial(north_bad, loss, pred, velocities)
    (loss, pred, velocities, delta_gradients) = tf.cond(
        lossN <= loss0, north_good_f, north_bad_f)
    return (i + 1, loss, pred, velocities, gradient_steps + delta_gradients)
def get_updates(self, loss, params):
    self.updates = [
        K.update_add(self.iterations, 1),
        K.update_add(self.optimizer.iterations, K.cast(self.cond, 'int64')),
    ]

    # Gradient accumulation.
    self.accum_grads = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    grads = self.get_gradients(loss, params)
    for g, ag in zip(grads, self.accum_grads):
        self.updates.append(K.update(ag, K.switch(self.cond, g, ag + g)))

    # Inherit the updates of the original optimizer.
    self.updates.extend(self.optimizer.get_updates(loss, params)[1:])
    self.weights.extend(self.optimizer.weights)

    return self.updates
def __call__(self, gradients):
    """Accumulates :obj:`gradients` on the current replica."""
    if len(self._gradients) == 0:
        self._gradients.extend([
            tf.Variable(
                tf.zeros_like(gradient),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
            if gradient is not None else gradient
            for gradient in gradients
        ])
    for accum_gradient, gradient in zip(self._gradients, gradients):
        if accum_gradient is not None and gradient is not None:
            K.update_add(
                accum_gradient,
                _multiply_gradient(gradient, self._accum_grad_scale))
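# `_multiply_gradient` and `self._accum_grad_scale` are referenced above but
# not defined here. A minimal sketch of a scaling helper consistent with that
# usage, assuming it only has to handle dense tensors and tf.IndexedSlices
# (this is an illustrative assumption, not the original implementation):
import tensorflow as tf


def _multiply_gradient(gradient, scale):
    # Hypothetical helper: scale a dense or sparse (IndexedSlices) gradient.
    scale = tf.cast(scale, gradient.dtype)
    if isinstance(gradient, tf.IndexedSlices):
        return tf.IndexedSlices(
            gradient.values * scale,
            gradient.indices,
            dense_shape=gradient.dense_shape)
    return gradient * scale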
def get_updates(self, loss, params): """ Obtain the optimizer loss updates. Parameters ---------- loss: list List of tensors params: list List of tensors Returns ------- list List of tensors """ grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] lr = self.lr if self.initial_decay > 0: lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay)))) t = K.cast(self.iterations, K.floatx()) + 1 lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))) # Pass off to CPU if requested if self.cpu_mode: with K.tf.device("/cpu:0"): ms, vs, vhats = self._update_1(params) else: ms, vs, vhats = self._update_1(params) self.weights = [self.iterations] + ms + vs + vhats for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): m_t = (self.beta_1 * m) + (1. - self.beta_1) * g v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) if self.amsgrad: vhat_t = K.maximum(vhat, v_t) p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon) self.updates.append(K.update(vhat, vhat_t)) else: p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) self.updates.append(K.update(m, m_t)) self.updates.append(K.update(v, v_t)) new_p = p_t # Apply constraints. if getattr(p, 'constraint', None) is not None: new_p = p.constraint(new_p) self.updates.append(K.update(p, new_p)) return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Bias corrections according to the Adam paper.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # Add a learning-rate multiplier for variables outside excluded_vars.
        if p.name in self.excluded_vars:
            multiplied_lr_t = lr_t
        else:
            multiplied_lr_t = lr_t * self.lr_mult

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        # Schedule multiplier eta_t = 1 for plain AdamW. According to the
        # AdamW paper, eta_t can be fixed, decay, or also be used for warm
        # restarts (AdamWR to come).
        eta_t = 1.
        p_t = p - eta_t * (multiplied_lr_t * m_t /
                           (K.sqrt(v_t) + self.epsilon))

        if self.weight_decay != 0:
            # Normalized weight decay according to the AdamW paper.
            w_d = self.weight_decay * K.sqrt(
                self.batch_size / (self.samples_per_epoch * self.epochs))
            p_t = p_t - eta_t * (w_d * p)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.weights = [self.iterations]
    lr = self.learning_rate

    for i, (p, g) in enumerate(zip(params, grads)):
        g2 = K.square(g) + self.epsilon1
        shape, dtype = K.int_shape(p), K.dtype(p)
        factored_shape = self.factored_shape(shape)
        if factored_shape is None:
            # Define the second-moment accumulator.
            v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
            self.weights.append(v)
            # Define its update.
            v_t = self.beta2 * v + (1.0 - self.beta2) * g2
            self.updates.append(K.update(v, v_t))
        else:
            # Define the factored (row/column) accumulators.
            shape1, axis1, shape2, axis2 = factored_shape
            vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
            vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
            self.weights.extend([vr, vc])
            # Define their updates.
            g2r = K.mean(g2, axis=axis1, keepdims=True)
            g2c = K.mean(g2, axis=axis2, keepdims=True)
            vr_t = self.beta2 * vr + (1.0 - self.beta2) * g2r
            vc_t = self.beta2 * vc + (1.0 - self.beta2) * g2c
            self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
            # Reassemble the full second-moment matrix.
            v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)

        # Main update term.
        u = g / K.sqrt(v_t)

        # Update clipping.
        if self.clipping_threshold is not None:
            u_rms = self.rms(u)
            d = self.clipping_threshold
            u = u / K.maximum(1.0, u_rms / d)

        # Momentum smoothing of the update.
        if self.beta1 > 0.0:
            # Define the momentum accumulator.
            m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
            self.weights.append(m)
            # Define its update.
            m_t = self.beta1 * m + (1.0 - self.beta1) * u
            self.updates.append(K.update(m, m_t))
            u = m_t

        # Scale the update by the parameter scale if requested.
        if self.multiply_by_parameter_scale:
            u = u * K.maximum(self.rms(p), self.epsilon2)

        # Update the parameter.
        self.updates.append(K.update(p, p - lr * u))

    return self.updates
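# The Adafactor-style update above relies on `self.factored_shape`, which is
# not shown. One plausible sketch, assuming the factorization collapses the
# two largest axes and returns (shape1, axis1, shape2, axis2) with each chosen
# axis reduced to 1 (an assumption consistent with how vr/vc are used above):
import numpy as np


def factored_shape(shape):
    # Hypothetical helper: return factored row/column shapes, or None for
    # parameters with fewer than two dimensions.
    if len(shape) < 2:
        return None
    indices = np.argsort(shape)
    shape1, axis1 = list(shape), int(indices[-1])
    shape1[axis1] = 1
    shape2, axis2 = list(shape), int(indices[-2])
    shape2[axis2] = 1
    return shape1, axis1, shape2, axis2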
def get_updates(self, loss, params): """ Build the graph nodes that accumulate gradients. """ self.updates = [] grads = self.get_gradients(loss, params) for param, grad in zip(params, grads): shape = K.int_shape(param) var = K.zeros(shape) self._vars.append(var) self.updates.append(K.update_add(var, grad)) return self.updates
def __call__(self, y_true, y_pred):
    '''Update the precision computation.

    # Arguments
        y_true: Tensor, batch-wise labels
        y_pred: Tensor, batch-wise predictions

    # Returns
        Overall precision for the epoch at the completion of the batch.
    '''
    # Batch counts
    y_true, y_pred = _slice_by_class(y_true, y_pred, self.class_ind)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # Current running counts
    current_true_positives = self.true_positives * 1
    current_pred_positives = self.pred_positives * 1
    # Updates
    updates = [K.update_add(self.true_positives, true_positives),
               K.update_add(self.pred_positives, pred_positives)]
    self.add_update(updates, inputs=[y_true, y_pred])
    # Compute precision
    return (current_true_positives + true_positives) / \
        (current_pred_positives + pred_positives + K.epsilon())
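# A tiny plain-Python sketch of the running-count idea above, using made-up
# per-batch counts, to show that each call yields precision over everything
# seen so far in the epoch (values are hypothetical):
batches = [(3, 4), (5, 6), (2, 5)]  # (true_positives, predicted_positives)

running_tp, running_pp = 0.0, 0.0
for batch_tp, batch_pp in batches:
    running_tp += batch_tp
    running_pp += batch_pp
    running_precision = running_tp / (running_pp + 1e-7)

print(round(running_precision, 4))  # 10 / 15 -> 0.6667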
def updated_get_updates(self, loss, params):
    self.accumulate_gradient_accumulators = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    updates_accumulated_iterations = K.update_add(self.accumulated_iterations, 1)
    new_grads = orig_get_gradients(loss, params)
    if not accumulate_sum_or_mean:
        new_grads = [g / K.cast(self.update_params_frequency, K.dtype(g))
                     for g in new_grads]
    self.updated_grads = [
        K.update_add(p, g)
        for p, g in zip(self.accumulate_gradient_accumulators, new_grads)
    ]

    def update_function():
        with tensorflow.control_dependencies(orig_get_updates(loss, params)):
            reset_grads = [
                K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
                for p in self.accumulate_gradient_accumulators
            ]
            return tensorflow.group(
                *(reset_grads + [updates_accumulated_iterations]))

    def just_store_function():
        return tensorflow.group(*[updates_accumulated_iterations])

    update_switch = K.equal(
        updates_accumulated_iterations % self.update_params_frequency, 0)

    with tensorflow.control_dependencies(self.updated_grads):
        self.updates = [K.switch(update_switch, update_function,
                                 just_store_function)]
        return self.updates
def get_updates(self, loss, params):
    # Mostly the same code as the Adam class, with added multiplier variables.
    # Keras code from:
    # https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/optimizers.py#L456
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (
            1.0 / (1.0 + self.decay * K.cast(self.iterations,
                                             K.dtype(self.decay)))
        )

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (
        K.sqrt(1.0 - K.pow(self.beta_2, t)) / (1.0 - K.pow(self.beta_1, t))
    )

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        layername = p.name.split("/", 1)[0]
        mult = self.multipliers.get(layername, 1.0)

        m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - mult * lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - mult * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, "constraint", None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def update_function():
    with tensorflow.control_dependencies(orig_get_updates(loss, params)):
        reset_grads = [
            K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
            for p in self.accumulate_gradient_accumulators
        ]
        if ema_decay > 0:
            reset_grads += [K.update_add(self.total_iterations, 1)]
            reset_grads += [
                K.update(e_p, (e_p * ema_decay) + (1 - ema_decay) * p)
                for e_p, p in zip(self.params_ema, params)
            ]
        return tensorflow.group(
            *(reset_grads + [updates_accumulated_iterations]))
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.inital_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
        1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    f = K.variable(0)
    d = K.variable(1)
    self.weights = [self.iterations] + ms + vs + [f, d]

    cond = K.greater(t, K.variable(1))
    small_delta_t = K.switch(K.greater(loss, f), self.small_k + 1,
                             1. / (self.big_K + 1))
    big_delta_t = K.switch(K.greater(loss, f), self.big_K + 1,
                           1. / (self.small_k + 1))

    c_t = K.minimum(K.maximum(small_delta_t, loss / (f + self.epsilon)),
                    big_delta_t)
    f_t = c_t * f
    r_t = K.abs(f_t - f) / (K.minimum(f_t, f))
    d_t = self.beta_3 * d + (1 - self.beta_3) * r_t

    f_t = K.switch(cond, f_t, loss)
    d_t = K.switch(cond, d_t, K.variable(1.))

    self.updates.append(K.update(f, f_t))
    self.updates.append(K.update(d, d_t))

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (d_t * K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        # Optionally clip the updated value to a per-variable range.
        clptrkey = set_pattern_find(p.name, self.clips.keys())
        if self.clips_val and clptrkey:
            c = K.eval(self.clips[clptrkey])
            if self.verbose > 0:
                print("Clipping variable", p.name, " to ", c)
            new_p = K.clip(new_p, c[0], c[1])

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
        v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

        p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))
        if self.weight_decay > 0.:
            wd = self.weight_decay * p
            p_dash = p_dash + wd

        r1 = K.sqrt(K.sum(K.square(p)))
        r2 = K.sqrt(K.sum(K.square(p_dash)))
        r = tf.where(tf.greater(r1, 0.),
                     tf.where(tf.greater(r2, 0.), r1 / r2, 1.0),
                     1.0)
        # r = r1 / r2
        eta = r * lr
        p_t = p - eta * p_dash

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr

    t = K.cast(self.iterations, K.floatx()) + 1
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
        v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

        p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))
        if self._do_use_weight_decay(p.name):
            wd = self.weight_decay * p
            p_dash = p_dash + wd

        r1 = linalg_ops.norm(p, ord=2)
        r2 = linalg_ops.norm(p_dash, ord=2)
        r = array_ops.where(
            math_ops.greater(r1, 0),
            array_ops.where(math_ops.greater(r2, 0), (r1 / r2), 1.0),
            1.0)
        # r = r1 / r2
        eta = r * lr
        p_t = p - eta * p_dash

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def call(self, inputs, training=None):
    x = inputs
    assert not isinstance(x, list)

    # Compute the minibatch statistics.
    mean, var = self._moments(x)
    sigma = K.sqrt(var + self.epsilon)

    # Outside the training phase, set rmax, dmax large so that the
    # normalization effectively falls back to the moving averages.
    rmax = K.in_train_phase(self.rmax, K.constant(1e5), training)
    dmax = K.in_train_phase(self.dmax, K.constant(1e5), training)

    # Compute the corrections based on rmax, dmax.
    r = K.stop_gradient(
        self._clip(sigma / self.moving_sigma, 1. / rmax, rmax))
    d = K.stop_gradient(
        self._clip((mean - self.moving_mean) / self.moving_sigma,
                   -dmax, dmax))

    # Actually do the normalization and the rescaling.
    xnorm = ((x - mean) / sigma) * r + d
    y = self.gamma * xnorm + self.beta

    # Add the moving average updates.
    self.add_update([
        K.moving_average_update(self.moving_mean, mean, self.momentum),
        K.moving_average_update(self.moving_sigma, sigma, self.momentum)
    ], x)

    # Add the r, d schedule updates.
    rmax_prog = K.minimum(1., self.steps / self.rmax_dur)
    dmax_prog = K.minimum(1., self.steps / self.dmax_dur)
    self.add_update([
        K.update_add(self.steps, 1),
        K.update(self.rmax,
                 self.rmax_0 + rmax_prog * (self.rmax_inf - self.rmax_0)),
        K.update(self.dmax,
                 self.dmax_0 + dmax_prog * (self.dmax_inf - self.dmax_0))
    ])

    # Mark the output as depending on the learning phase.
    y._uses_learning_phase = rmax._uses_learning_phase
    return y
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.get_variable_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.get_variable_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    # Multipliers for weights [0, 2, 4, 6, ...] and biases [1, 3, 5, 7, ...]:
    # one multiplier is expected per parameter.
    if self.lr_multipliers is not None and len(params) != len(self.lr_multipliers):
        raise Exception("Check Multipliers !")
    count_multipliers = 0

    for p, g, m, v in zip(params, grads, ms, vs):
        # Pick the per-parameter learning rate.
        if self.lr_multipliers is None:
            new_lr = lr_t
        else:
            new_lr = lr_t * self.lr_multipliers[count_multipliers]
            count_multipliers += 1

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - new_lr * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for i, (p, g, m, v, vhat) in enumerate(zip(params, grads, ms, vs, vhats)):
        beta_1 = (0.5 if i == 0 and self.e2efs_layer is not None
                  and not self.lr_momentum else self.beta_1)
        beta_2 = (0. if i == -1 and self.e2efs_layer is not None
                  and not self.lr_momentum else self.beta_2)
        m_t = (beta_1 * m) + (1. - beta_1) * g
        v_t = (beta_2 * v) + (1. - beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0.0:
        lr = lr * (
            1.0 / (1.0 + self.decay * K.cast(self.iterations,
                                             K.dtype(self.decay)))
        )

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * K.sqrt(K.pow(1.0 + self.beta_2, t) - 1.0)

    ms = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    vs = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = self.beta_1 * m + (1.0 - self.beta_1) * g
        v_t = (1.0 + self.beta_2) * v + self.beta_2 * K.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates