def call(self, inputs, training=None):
    if not self.trainable:
        training = False
    else:
        # The learning phase flag is a bool tensor (0 = test, 1 = train).
        training = K.learning_phase()
    if training is not False:
        K.update_add(self.iterations, 1)
        # Compute the current mini-batch mean and variance.
        mini_mean, mini_variance = tf.nn.moments(inputs, axes=[0, 1, 2])
        # Normalize with the short-term statistics, then apply the affine transform.
        x = (inputs - self.steps_mean) / K.sqrt(self.steps_variance + self.epsilon)
        x = self.gamma * x + self.beta
        # Update the moving statistics.
        K.moving_average_update(self.moving_mean, mini_mean, self.momentum)
        K.moving_average_update(self.moving_variance, mini_variance, self.momentum)
        # Reset the short-term statistics every `steps_per_update` iterations,
        # otherwise keep accumulating them.
        cond = K.equal(self.iterations % self.steps_per_update, 0)
        K.switch(cond, lambda: self.steps_mean * 0,
                 K.update_add(self.steps_mean, mini_mean))
        K.switch(cond, lambda: self.steps_variance * 0,
                 K.update_add(self.steps_variance, mini_variance))
    else:
        # Inference: fold the moving statistics into a single affine transform.
        scale = self.gamma / K.sqrt(self.moving_variance + self.epsilon)
        x = inputs * scale + (self.beta - self.moving_mean * scale)
    return x
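
# A minimal numpy sketch (not part of the layer above) of the exponential
# moving average that K.moving_average_update applies to moving_mean and
# moving_variance, assuming the Keras convention x <- momentum*x + (1 - momentum)*value.
import numpy as np

def ema_update(x, value, momentum=0.9):
    """One exponential-moving-average step, as used for the moving statistics."""
    return momentum * x + (1.0 - momentum) * value

moving_mean = np.zeros(3)
for batch_mean in np.random.RandomState(0).normal(5.0, 0.1, size=(100, 3)):
    moving_mean = ema_update(moving_mean, batch_mean)
print(moving_mean)  # converges toward the true per-channel mean (~5.0)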
def __call__(self, y_true, y_pred):
    """Computes the running precision over an epoch.

    # Arguments
        y_true: Tensor, batch_wise labels
        y_pred: Tensor, batch_wise predictions

    # Returns
        The precision over all batches seen this epoch at the
        completion of the batch.
    """
    y_true = K.cast(y_true, 'int32')
    y_pred = K.cast(K.round(y_pred), 'int32')

    # False positives: predicted 1 where the label is 0.
    false_pos = K.cast(K.sum(K.cast(K.greater(y_pred, y_true), 'int32')), 'int32')
    self.add_update(K.update_add(self.false_positives, false_pos),
                    inputs=[y_true, y_pred])

    # True positives: correct predictions on the positive class.
    correct_preds = K.cast(K.equal(y_pred, y_true), 'int32')
    true_pos = K.cast(K.sum(correct_preds * y_true), 'int32')
    self.add_update(K.update_add(self.true_positives, true_pos),
                    inputs=[y_true, y_pred])

    # Combine: precision = TP / (TP + FP).
    precision = (K.cast(self.true_positives, 'float32') /
                 (K.cast(self.true_positives, 'float32') +
                  K.cast(self.false_positives, 'float32') +
                  K.cast(K.epsilon(), 'float32')))
    self.add_update(K.update(self.precision, precision),
                    inputs=[y_true, y_pred])
    return precision
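
# A hedged numpy cross-check (illustrative only) of the streaming precision
# above: accumulate TP/FP counts over batches, then precision = TP / (TP + FP).
import numpy as np

eps = 1e-7
tp_total, fp_total = 0, 0
rng = np.random.RandomState(0)
for _ in range(5):  # five synthetic batches
    y_true = rng.randint(0, 2, size=32)
    y_pred = rng.randint(0, 2, size=32)
    tp_total += np.sum((y_pred == 1) & (y_true == 1))
    fp_total += np.sum((y_pred == 1) & (y_true == 0))  # predicted 1, actually 0
print(tp_total / (tp_total + fp_total + eps))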
def __call__(self, y_true, y_pred):
    """Computes Cohen's kappa in a batch.

    # Arguments
        y_true: Tensor, batch_wise labels
        y_pred: Tensor, batch_wise predictions

    # Returns
        The kappa seen this epoch at the completion of the batch.
    """
    y_true = K.cast(y_true, 'int32')
    y_pred = K.cast(K.round(y_pred), 'int32')

    # Confusion-matrix counts for this batch.
    true_pos = K.cast(K.sum(y_pred * y_true), 'int32')
    true_neg = K.cast(K.sum((1 - y_pred) * (1 - y_true)), 'int32')
    false_pos = K.cast(K.sum(y_pred * (1 - y_true)), 'int32')
    false_neg = K.cast(K.sum((1 - y_pred) * y_true), 'int32')

    self.add_update(K.update_add(self.true_positives, true_pos),
                    inputs=[y_true, y_pred])
    self.add_update(K.update_add(self.true_negative, true_neg),
                    inputs=[y_true, y_pred])
    self.add_update(K.update_add(self.false_positives, false_pos),
                    inputs=[y_true, y_pred])
    self.add_update(K.update_add(self.false_negative, false_neg),
                    inputs=[y_true, y_pred])

    # Snapshot the running totals as float tensors.
    true_pos = K.cast(self.true_positives * 1, 'float32')
    true_neg = K.cast(self.true_negative * 1, 'float32')
    false_pos = K.cast(self.false_positives * 1, 'float32')
    false_neg = K.cast(self.false_negative * 1, 'float32')

    sm = true_pos + true_neg + false_pos + false_neg
    obs_agree = (true_pos + true_neg) / sm
    poss_pos = (true_pos + false_neg) * (true_pos + false_pos) / (sm**2)
    poss_neg = (true_neg + false_neg) * (true_neg + false_pos) / (sm**2)
    poss_agree = poss_pos + poss_neg
    return (obs_agree - poss_agree) / (1 - poss_agree + K.epsilon())
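
# Numeric sanity check (a sketch, not part of the metric) of the kappa formula
# used above: kappa = (observed agreement - chance agreement) / (1 - chance agreement).
tp, tn, fp, fn = 40., 30., 20., 10.
n = tp + tn + fp + fn
obs_agree = (tp + tn) / n
poss_pos = (tp + fn) * (tp + fp) / n**2
poss_neg = (tn + fn) * (tn + fp) / n**2
poss_agree = poss_pos + poss_neg
print((obs_agree - poss_agree) / (1 - poss_agree))  # 0.4 for these counts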
def get_updates(self, loss, params):
    with K.name_scope(self.__class__.__name__):
        # Slow (Lookahead) copies of the fast weights.
        self.slow_weights = [K.variable(p) for p in params]
        opt_updates = self.opt.get_updates(loss, params)
        # Map each updated variable to the value assigned to it.
        update_dict = {
            t.op.inputs[0].name: t.op.inputs[1]
            for t in opt_updates
        }
        p_names = [p.name for p in params]
        # Updates targeting internal optimizer state rather than the params.
        other_ops = [
            t for t in opt_updates if t.op.inputs[0].name not in p_names
        ]
        self.updates = [K.update_add(self.iterations, 1)]
        self.updates += other_ops
        with tf.control_dependencies([self.updates[0]]):
            condition = K.equal(self.iterations % self.k, 0)
            for fast_w, slow_w in zip(params, self.slow_weights):
                self.updates.append(
                    K.switch(
                        condition,
                        # Every k steps: take the fast step, interpolate the
                        # slow weights toward it, then reset the fast weights.
                        lambda: K.update(
                            fast_w,
                            K.update_add(
                                slow_w,
                                (K.update(fast_w, update_dict[fast_w.name]) -
                                 slow_w) * self.alpha,
                            ),
                        ),
                        # Otherwise: plain fast update.
                        lambda: K.update(fast_w, update_dict[fast_w.name]),
                    ))
        self.weights = self.opt.weights + self.slow_weights
    return self.updates
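
# A minimal numpy sketch of the Lookahead rule implemented above (illustrative,
# with a stand-in gradient): every k fast steps, the slow weights move toward
# the fast weights by alpha, and the fast weights are reset to the slow weights.
import numpy as np

k, alpha, lr = 5, 0.5, 0.1
fast = slow = np.array([1.0])
for step in range(1, 21):
    fast = fast - lr * 2 * fast          # stand-in fast update (gradient of x^2)
    if step % k == 0:
        slow = slow + alpha * (fast - slow)
        fast = slow.copy()
print(fast, slow)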
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.updates.append(K.update_add(self.t_cur, 1))

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    total_iterations = self.total_iterations
    # Cosine annealing
    if self.use_cosine_annealing and total_iterations != 0:
        self.eta_t = _compute_eta_t(self)
    self.lr_t = lr_t * self.eta_t  # for external tracking
    lr_t_premult = lr_t

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t_premult, p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Weight decays
        if p.name in self.weight_decays.keys() and total_iterations != 0:
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))

    self._init_notified = True
    return self.updates
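
# _compute_eta_t is defined elsewhere; a sketch of the cosine-annealing
# multiplier it presumably implements (the SGDR schedule of Loshchilov &
# Hutter), assuming eta decays from eta_max=1 to eta_min=0 over
# total_iterations steps:
import numpy as np

def eta_t(t_cur, total_iterations, eta_min=0.0, eta_max=1.0):
    return eta_min + 0.5 * (eta_max - eta_min) * (
        1.0 + np.cos(np.pi * t_cur / total_iterations))

print([round(eta_t(t, 100), 3) for t in (0, 25, 50, 75, 100)])  # 1.0 ... 0.0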
def adjust(self):
    dtype = self.embedding.dtype
    # Rows whose update counter is still zero get paved (smoothed) values.
    unchanged_mask = bk.cast(0 == self.update_cnt, dtype)
    paved_embedding = self._pave_embedding(self.embedding) * unchanged_mask
    unpaved_embedding = self.embedding * unchanged_mask
    bk.update_add(self.embedding,
                  (1 - self.pave_momentum) *
                  (paved_embedding - unpaved_embedding))

    inv_seg_scale = (self.cur_max - self.cur_min) / self.seg_num
    x = bk.expand_dims(bk.arange(0, self.seg_num, dtype=dtype),
                       axis=-1) * inv_seg_scale + (
                           self.cur_min + self.val_epsilon * inv_seg_scale)
    x = bk.concatenate(
        [x, x + (1. - 2 * self.val_epsilon) * inv_seg_scale], axis=0)
    y = bk.transpose(
        bk.reshape(
            self.embedding,
            (self.input_dim * self._target_dim, -1)))[int(self.mask_zero):]
    y = bk.concatenate([y, y], axis=0)
    seg_indices = self._calc_seg_indices(x, self.moving_min, self.moving_max)
    tmp_embedding, tmp_cnt = self._sum_seg_embeddings(seg_indices, y)
    # Avoid division by zero for empty segments.
    bk.update(self.embedding,
              tmp_embedding / (tmp_cnt + bk.cast(0 == tmp_cnt, dtype=dtype)))

    unmatched_mask = bk.cast(0 == self.embedding, dtype)
    bk.update_add(self.embedding,
                  self._pave_embedding(self.embedding) * unmatched_mask)
    bk.update(self.update_cnt, bk.zeros_like(self.update_cnt))
    bk.update(self.cur_min, self.moving_min)
    bk.update(self.cur_max, self.moving_max)
def call(self, inputs, training=None):
    if training is None:
        training = bk.learning_phase()
    training = bk.get_value(training)

    indices, outputs = [[] for _ in range(self.phase)], []
    for i in range(self.phase):
        trainable = self.embed_trainable_list[i]
        cur_outputs = []
        for j, feats_idx in enumerate(self.feat_idx_list[i]):
            inds = self._calc_indices(
                bk.transpose(bk.gather(bk.transpose(inputs), feats_idx)),
                self.min_val_list[i][j], self.max_val_list[i][j],
                self.seg_num_list[i][j], self.seg_num_mul_list[i][j])
            if trainable:
                indices[i].append(inds)
            if self.index_learnable:
                cur_outputs.append(make_gather_in_flow()(
                    self.embedding_list[i][j], inds))
            else:
                cur_outputs.append(
                    bk.gather(self.embedding_list[i][j],
                              bk.cast(inds, 'int32')))
        outputs.append(
            bk.concatenate(cur_outputs)
            if len(cur_outputs) > 1 else cur_outputs[0])

    if training:
        bk.update(self.call_cnt, self.call_cnt + 1)
        for i in range(self.phase):
            for j, inds in enumerate(indices[i]):
                update_cnt = self.update_cnt_list[i][j]
                inds = bk.cast(inds, 'int64')
                if self.unique_supported:
                    ind, _, cnts = tf.unique_with_counts(bk.flatten(inds))
                    tmp_cnt = tf.sparse.to_dense(
                        tf.sparse.reorder(
                            tf.SparseTensor(
                                bk.expand_dims(ind, -1),
                                bk.cast(cnts, update_cnt.dtype),
                                update_cnt.shape)))
                else:
                    tmp_cnt = bk.map_fn(
                        lambda ele: bk.sum(
                            bk.cast(ele == inds, inputs.dtype)),
                        bk.arange(0, update_cnt.shape[0], dtype=inds.dtype),
                        dtype=inputs.dtype)
                bk.update_add(update_cnt, tmp_cnt)
        if self.period is not None and self.call_cnt % self.period == 0:
            self.adjust()
    return outputs
def __call__(self, y_true, y_pred):
    y_true, y_pred = _sanitize(y_true, y_pred, self.threshold)
    true_pos = _tp(y_true, y_pred)
    false_neg = _fn(y_true, y_pred)
    self.add_update(K.update_add(self.tp, true_pos), inputs=[y_true, y_pred])
    self.add_update(K.update_add(self.fn, false_neg), inputs=[y_true, y_pred])
    # False negative rate: FN / (FN + TP).
    return self.fn / (self.fn + self.tp + self.eps)
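
# The value returned above is the false negative rate (miss rate), i.e.
# 1 - recall; a quick numpy check under that reading:
import numpy as np

y_true = np.array([1, 1, 1, 0, 0, 1])
y_pred = np.array([1, 0, 1, 0, 1, 0])
tp = np.sum((y_pred == 1) & (y_true == 1))
fn = np.sum((y_pred == 0) & (y_true == 1))
print(fn / (fn + tp))       # 0.5
print(1 - tp / (tp + fn))   # same thing: 1 - recall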
def __call__(self, y_true, y_pred):
    batch_size = K.shape(y_true)[0]
    batch_correct = K.sum(K.cast(K.equal(y_true, K.round(y_pred)), 'int32'))
    total = self.total + batch_size
    correct = self.correct + batch_correct
    self.add_update(K.update_add(self.total, batch_size),
                    inputs=[y_true, y_pred])
    self.add_update(K.update_add(self.correct, batch_correct),
                    inputs=[y_true, y_pred])
    return K.cast(correct, K.floatx()) / K.cast(total, K.floatx())
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.updates.append(K.update_add(self.t_cur, 1))

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [
        K.zeros(shape, name='moment_' + str(i))
        for (i, shape) in enumerate(shapes)
    ]
    self.weights = [self.iterations] + moments

    total_iterations = self.total_iterations
    # Cosine annealing
    if self.use_cosine_annealing and total_iterations != 0:
        self.eta_t = _compute_eta_t(self)
    self.lr_t = lr * self.eta_t  # for external tracking

    for p, g, m in zip(params, grads, moments):
        # Learning rate multipliers
        lr_t = self.learning_rate
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, p)

        v = self.momentum * m - self.eta_t * lr_t * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            p_t = p + self.momentum * v - self.eta_t * lr_t * g
        else:
            p_t = p + v

        # Weight decays
        if p.name in self.weight_decays.keys() and total_iterations != 0:
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))

    self._init_notified = True
    return self.updates
def get_updates(self, loss, params):
    self.updates = [
        K.update_add(self.iterations, 1),
        K.update_add(self.optimizer.iterations, K.cast(self.cond, 'int64')),
    ]

    # Gradient accumulation: reset the accumulator to the fresh gradient
    # when cond fires, otherwise keep summing.
    self.accum_grads = [K.zeros(K.int_shape(p), dtype=K.dtype(p))
                        for p in params]
    grads = self.get_gradients(loss, params)
    for g, ag in zip(grads, self.accum_grads):
        self.updates.append(K.update(ag, K.switch(self.cond, g, ag + g)))

    # Inherit the updates of the wrapped optimizer (skipping its own
    # iteration counter, which is handled above).
    self.updates.extend(self.optimizer.get_updates(loss, params)[1:])
    self.weights.extend(self.optimizer.weights)
    return self.updates
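
# A numpy sketch of gradient accumulation in general (illustrative; the exact
# reset-vs-add ordering above depends on how self.cond is wired into the
# wrapped optimizer): sum k micro-batch gradients, step once with their
# average, then clear the buffer.
import numpy as np

k, lr = 4, 0.1
w = np.array([1.0])
acc = np.zeros_like(w)
for step in range(1, 13):
    g = 2 * w                      # stand-in gradient of w^2
    acc += g
    if step % k == 0:              # one optimizer step per k micro-batches
        w -= lr * acc / k
        acc[:] = 0.0
print(w)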
def adjust(self):
    for i in range(self.phase):
        if self.embed_trainable_list[i]:
            for j, embedding in enumerate(self.embedding_list[i]):
                update_cnt = self.update_cnt_list[i][j]
                unchanged_mask = bk.expand_dims(
                    bk.cast(0 == update_cnt, embedding.dtype))
                paved_embedding = self._pave_embedding(
                    embedding, self.seg_num_list[i][j][-1]) * unchanged_mask
                unpaved_embedding = embedding * unchanged_mask
                bk.update_add(embedding,
                              (1 - self.pave_momentum) *
                              (paved_embedding - unpaved_embedding))
                bk.update(update_cnt, bk.zeros_like(update_cnt))
def call(self, inputs):
    S = K.dot(inputs, self.kernel)
    if self.use_bias:
        S = K.bias_add(S, self.bias)
    if self.activation is not None:
        output = self.activation(S)
    else:
        output = S
    # Local, in-graph weight update driven by the difference between the
    # activation and the scaled pre-activation.
    delta = output - self.gamma * S
    K.update_add(self.kernel, 2 * self.lr * K.dot(K.transpose(inputs), delta))
    if self.use_bias:
        K.update_add(self.bias, 2 * self.lr * K.mean(delta, axis=0))
    return output
def get_updates(self, loss, params):
    tower_gradvars = []
    gdev_list = self._gdev_list
    global_scope = tf.get_variable_scope()
    for idev, device in enumerate(gdev_list):
        with tf.device(device), \
                tf.variable_scope(global_scope, reuse=idev > 0), \
                tf.name_scope('tower_%i' % idev):
            grads = self.optimizer.compute_gradients(loss, params)
            gradvars = zip(grads, params)
            tower_gradvars.append(gradvars)
    tower_gradvars = all_avg_gradients(tower_gradvars, gdev_list,
                                       usenccl=False)

    self.updates = [K.update_add(self.iterations, 1)]
    for device_num, device in enumerate(gdev_list):
        with tf.device(device):
            gradvars = tower_gradvars[device_num]
            opt_update = self.optimizer.apply_gradients(
                grads, global_step=self.iterations)
            self.updates.append(opt_update)
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        # Gradient noise, scaled by the learning rate.
        noise = K.random_normal(shape=K.shape(p),
                                stddev=self.gradient_noise_coefficient)
        if self.nesterov:
            new_p = p + self.momentum * v - lr * g + lr * noise
        else:
            new_p = p + v + lr * noise

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
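
# A numpy sketch of the noisy update above (illustrative): plain SGD plus
# Gaussian noise scaled by the learning rate, in the spirit of Langevin-style
# gradient noise.
import numpy as np

lr, noise_coeff = 0.1, 0.01
rng = np.random.RandomState(0)
w = np.array([1.0])
for _ in range(100):
    g = 2 * w                                    # stand-in gradient of w^2
    noise = rng.normal(0.0, noise_coeff, w.shape)
    w = w - lr * g + lr * noise
print(w)  # near 0, jittered by the injected noise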
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms

    for p, g, m in zip(params, grads, ms):
        # Exponential moving average of the gradient (momentum only).
        m_t = (self.beta * m) + (1. - self.beta) * g
        p_t = p - lr * m_t
        self.updates.append(K.update(m, m_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr / (1. - K.pow(self.beta_1, t))

    shapes = [K.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us

    for p, g, m, u in zip(params, grads, ms, us):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        u_t = K.maximum(self.beta_2 * u, K.abs(g))
        p_t = p - lr_t * m_t / (u_t + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(u, u_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
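
# The u_t line above is Adamax's exponentially weighted infinity norm; a tiny
# illustration (plain Python) of how it tracks a running max of |g|:
beta_2 = 0.999
u = 0.0
for g in [0.1, -2.0, 0.3, 0.05]:
    u = max(beta_2 * u, abs(g))
    print(round(u, 4))  # jumps to |g| on spikes, then decays slowly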
def get_updates(self, loss, params):
    grads = self.optimizer.compute_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    opt_update = self.optimizer.apply_gradients(
        grads, global_step=self.iterations)
    self.updates.append(opt_update)
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    self.weights = accumulators
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
        # update accumulator
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))
        new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1
    lr_t = self.lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
        1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - self.get_param_learning_rate_t(p, t, lr_t) * m_t / (
            K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        if p.name in self.lr_mult:
            multiplied_lr = lr * self.lr_mult[p.name]
        else:
            multiplied_lr = lr

        v = self.momentum * m - multiplied_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - multiplied_lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    mems = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs + mems

    for p, g, m, v, mem in zip(params, grads, ms, vs, mems):
        r = 1. / (1. + mem)
        m_t = (1. - r) * m + r * g
        v_t = (1. - r) * v + r * K.square(g)
        denoise = K.square(m_t) / (v_t + self.epsilon)
        p_t = p - g * K.minimum(lr, denoise) / (K.sqrt(v_t) + self.epsilon)
        mem_t = 1. + mem * (1. - denoise)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        self.updates.append(K.update(mem, mem_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    lr = self.lr * (1. / (1. + self.decay * self.iterations))
    self.updates = [K.update_add(self.iterations, 1)]

    # momentum
    shapes = [K.get_variable_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - self.get_param_learning_rate(p, lr) * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1
    lr_t = self.lr / (1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us

    for p, g, m, u in zip(params, grads, ms, us):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        u_t = K.maximum(self.beta_2 * u, K.abs(g))
        p_t = p - self.get_param_learning_rate_t(p, t, lr_t) * m_t / (
            u_t + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(u, u_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Applies bounds on actual learning rate
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # apply weight decay
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Compute the bounds
        step_size_p = step_size * K.ones_like(denom)
        step_size_p_bound = step_size_p / denom
        bounded_lr_t = m_t * K.minimum(
            K.maximum(step_size_p_bound, lower_bound), upper_bound)

        p_t = p - bounded_lr_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
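
# A numpy sketch of the AdaBound clipping above (illustrative): the lower and
# upper bounds both converge to final_lr as t grows, so the update smoothly
# transitions from Adam-like to SGD-like step sizes.
import numpy as np

final_lr, gamma = 0.1, 1e-3
for t in [1.0, 10.0, 100.0, 1000.0, 10000.0]:
    lower = final_lr * (1.0 - 1.0 / (gamma * t + 1.0))
    upper = final_lr * (1.0 + 1.0 / (gamma * t))
    print(t, round(lower, 6), round(upper, 6))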
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators,
                            delta_accumulators):
        # update accumulator
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))

        # use the new accumulator and the *old* delta_accumulator
        update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
        new_p = p - lr * update

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))

        # update delta_accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
        self.updates.append(K.update(d_a, new_d_a))
    return self.updates
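
# The Adadelta step above scales each gradient by a ratio of RMS values; a
# short numpy rendering of that single update for one scalar parameter:
import numpy as np

rho, eps = 0.95, 1e-7
a = d_a = 0.0
g = 0.5
a = rho * a + (1 - rho) * g**2                       # gradient accumulator
update = g * np.sqrt(d_a + eps) / np.sqrt(a + eps)   # unit-correcting ratio
d_a = rho * d_a + (1 - rho) * update**2              # delta accumulator
print(update)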
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    wd = self.wd  # decoupled weight decay (3/4)

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            # decoupled weight decay (4/4)
            new_p = p + self.momentum * v - lr * g - lr * wd * p
        else:
            new_p = p + v - lr * wd * p  # decoupled weight decay

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    wd = self.wd  # decoupled weight decay (3/4)

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        # decoupled weight decay (4/4)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
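
# A sketch (plain Python, made-up numbers) contrasting decoupled weight decay
# as used above with L2 regularization folded into the gradient; the two
# differ once adaptive scaling is involved:
lr, wd, p, g, denom = 0.001, 0.01, 1.0, 0.2, 0.05   # denom ~ sqrt(v_t) + eps
decoupled = p - lr * g / denom - lr * wd * p         # AdamW-style
l2_in_grad = p - lr * (g + wd * p) / denom           # classic L2: decay rescaled too
print(decoupled, l2_in_grad)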
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
    ]
    self.weights = accumulators
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
        # update accumulator
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))
        new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        # Clip selected variables to their configured ranges.
        if p.name in self.clips.keys():
            if self.verbose > 0:
                print("Clipping variable", p.name, "to", self.clips[p.name])
            c = K.eval(self.clips[p.name])
            new_p = K.clip(new_p, c[0], c[1])

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.inital_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))**0.75

    self.updates.append(K.update_add(self.iterations, 1))

    # momentum
    shapes = [K.get_variable_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    t = K.cast(self.iterations, K.floatx()) + 1
    epoch = (t // 98) + 1  # CIFAR10, CIFAR100
    # epoch = (t // 118) + 1  # MNIST

    beta2 = 1 - (self.gamma / epoch)
    alpha_t = lr / ((epoch + 2) * (epoch**(1 / 2)))
    momentum = epoch / (epoch + 2)

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        v_t = (beta2 * v) + (1 - beta2) * K.square(g)
        v_hat = (K.sqrt(v_t)) + (self.delta / (epoch**(1 / 2)))
        m_t = momentum * m + alpha_t * g / v_hat
        p_t = p - m_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    adam_lr = self.adam_lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        adam_lr = adam_lr * (1. / (1. + self.decay * K.cast(
            self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    adam_lr_t = adam_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                           (1. - K.pow(self.beta_1, t)))

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.ms = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
    self.vs = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
    self.weights = [self.iterations] + moments + vhats + [self.ms] + [self.vs]

    for i, (p, g, m, vhat) in enumerate(zip(params, grads, moments, vhats)):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - lr * g
        else:
            new_p = p + v

        # For the first parameter (the E2EFS weights), switch between the
        # SGD step and an Adam step based on the number of nonzero entries.
        if i == 0 and self.e2efs_layer is not None:
            nnz = K.sum(K.cast(K.greater(p, 0.), K.floatx()))
            m_t = (self.beta_1 * self.ms) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * self.vs) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - adam_lr_t * m_t / (K.sqrt(vhat_t) + K.epsilon())
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - adam_lr_t * m_t / (K.sqrt(v_t) + K.epsilon())
            self.updates.append(K.update(self.ms, m_t))
            self.updates.append(K.update(self.vs, v_t))
            new_p = K.switch(K.less_equal(nnz, self.e2efs_layer.units),
                             new_p, p_t)

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [
        K.zeros(shape, name='moment_' + str(i))
        for (i, shape) in enumerate(shapes)
    ]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))
        new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def __call__(self, y_true, y_pred):
    """Computes the number of true positives in a batch.

    # Arguments
        y_true: Tensor, batch_wise labels
        y_pred: Tensor, batch_wise predictions

    # Returns
        The total number of true positives seen this epoch at the
        completion of the batch.
    """
    y_true = K.cast(y_true, 'int32')
    y_pred = K.cast(K.round(y_pred), 'int32')
    correct_preds = K.cast(K.equal(y_pred, y_true), 'int32')
    true_pos = K.cast(K.sum(correct_preds * y_true), 'int32')
    # Snapshot the running total before the in-place update below.
    current_true_pos = self.true_positives * 1
    self.add_update(K.update_add(self.true_positives, true_pos),
                    inputs=[y_true, y_pred])
    return current_true_pos + true_pos
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 * (K.pow(0.96, t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 * (K.pow(0.96, (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = (self.m_schedule * momentum_cache_t *
                       momentum_cache_t_1)
    self.updates.append((self.m_schedule, m_schedule_new))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # the following equations given in [1]
        g_prime = g / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * g
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
        v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
        m_t_bar = (1. - momentum_cache_t) * g_prime + (
            momentum_cache_t_1 * m_t_prime)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        p_t = p - get_learing_rate(p, self.lr) * m_t_bar / (
            K.sqrt(v_t_prime) + self.epsilon)
        new_p = p_t

        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
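
# A sketch (plain Python) of the Nadam warming momentum schedule used above
# (the "momentum_cache" terms), showing mu_t ramping toward beta_1:
beta_1, schedule_decay = 0.9, 0.004
mu = [beta_1 * (1.0 - 0.5 * 0.96 ** (t * schedule_decay))
      for t in (1, 10, 100, 1000)]
print([round(m, 4) for m in mu])  # climbs from ~0.45 toward beta_1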
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1
    loss_prev = K.variable(0)
    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]

    # Bound the relative change of the loss (the feedback term).
    ch_fact_lbound = K.switch(K.greater(loss, loss_prev),
                              1 + self.thl, 1 / (1 + self.thu))
    ch_fact_ubound = K.switch(K.greater(loss, loss_prev),
                              1 + self.thu, 1 / (1 + self.thl))
    loss_ch_fact = loss / loss_prev
    loss_ch_fact = K.switch(K.lesser(loss_ch_fact, ch_fact_lbound),
                            ch_fact_lbound, loss_ch_fact)
    loss_ch_fact = K.switch(K.greater(loss_ch_fact, ch_fact_ubound),
                            ch_fact_ubound, loss_ch_fact)
    loss_hat = K.switch(K.greater(t, 1), loss_prev * loss_ch_fact, loss)

    d_den = K.switch(K.greater(loss_hat, loss_prev), loss_prev, loss_hat)
    d_t = (self.beta_3 * self.d) + (1. - self.beta_3) * K.abs(
        (loss_hat - loss_prev) / d_den)
    d_t = K.switch(K.greater(t, 1), d_t, 1.)
    self.updates.append(K.update(self.d, d_t))

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        mhat_t = m_t / (1. - K.pow(self.beta_1, t))
        self.updates.append(K.update(m, m_t))

        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        vhat_t = v_t / (1. - K.pow(self.beta_2, t))
        self.updates.append(K.update(v, v_t))

        p_t = p - (self.lr / (1. + (self.iterations * self.decay))) * \
            mhat_t / ((K.sqrt(vhat_t) * d_t) + self.epsilon)
        self.updates.append(K.update(p, p_t))

    self.updates.append(K.update(loss_prev, loss_hat))
    return self.updates