def get_updates(self, params, constraints, loss):
    """Build the Adam update ops for ``params``.

    Args:
        params: variables to optimise.
        constraints: mapping-like object; when a parameter is contained in
            it, the associated callable is applied to the new value before
            it is assigned.
        loss: tensor handed to ``self.get_gradients``.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the updates of both moment estimates and of the
        parameter itself.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    step = self.iterations + 1
    # Base step size with both bias-correction factors folded in.
    corrected_lr = self.lr * K.sqrt(1. - K.pow(self.beta_2, step)) / (1. - K.pow(self.beta_1, step))

    param_shapes = [K.get_variable_shape(param) for param in params]
    first_moments = [K.zeros(dims) for dims in param_shapes]
    second_moments = [K.zeros(dims) for dims in param_shapes]
    self.weights = [self.iterations] + first_moments + second_moments

    for param, grad, m_prev, v_prev in zip(params, gradients,
                                           first_moments, second_moments):
        m_next = (self.beta_1 * m_prev) + (1. - self.beta_1) * grad
        v_next = (self.beta_2 * v_prev) + (1. - self.beta_2) * K.square(grad)
        # The helper decides which step size this particular parameter gets.
        param_lr = self.get_param_learning_rate_t(param, step, corrected_lr)
        updated = param - param_lr * m_next / (K.sqrt(v_next) + self.epsilon)

        self.updates.append(K.update(m_prev, m_next))
        self.updates.append(K.update(v_prev, v_next))

        # Apply the constraint registered for this parameter, if any.
        if param in constraints:
            updated = constraints[param](updated)
        self.updates.append(K.update(param, updated))
    return self.updates
def get_updates(self, params, constraints, loss):
    """Build Adamax-style update ops (first moment + infinity norm).

    Args:
        params: variables to optimise.
        constraints: mapping-like object; when a parameter is contained in
            it, the associated callable is applied to the new value before
            it is assigned.
        loss: tensor handed to ``self.get_gradients``.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the updates of the first moment, of the infinity
        norm and of the parameter itself.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    step = self.iterations + 1
    # Only the first moment is bias-corrected here.
    corrected_lr = self.lr / (1. - K.pow(self.beta_1, step))

    param_shapes = [K.get_variable_shape(param) for param in params]
    # First moments start at zero.
    first_moments = [K.zeros(dims) for dims in param_shapes]
    # Exponentially weighted infinity norms start at zero.
    inf_norms = [K.zeros(dims) for dims in param_shapes]
    self.weights = [self.iterations] + first_moments + inf_norms

    for param, grad, m_prev, u_prev in zip(params, gradients,
                                           first_moments, inf_norms):
        m_next = (self.beta_1 * m_prev) + (1. - self.beta_1) * grad
        u_next = K.maximum(self.beta_2 * u_prev, K.abs(grad))
        # The helper decides which step size this particular parameter gets.
        param_lr = self.get_param_learning_rate_t(param, step, corrected_lr)
        updated = param - param_lr * m_next / (u_next + self.epsilon)

        self.updates.append(K.update(m_prev, m_next))
        self.updates.append(K.update(u_prev, u_next))

        # Apply the constraint registered for this parameter, if any.
        if param in constraints:
            updated = constraints[param](updated)
        self.updates.append(K.update(param, updated))
    return self.updates
def get_updates(self, loss, params):
    """Build momentum-SGD update ops with per-parameter lr multipliers.

    A parameter whose ``name`` is a key of ``self.lr_mult`` has the
    (optionally time-decayed) learning rate scaled by the stored factor;
    every other parameter uses the learning rate unchanged.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the velocity update and the parameter update.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        # Rebind instead of ``lr *= ...``: ``lr`` aliases ``self.lr``, and an
        # augmented assignment may mutate or be rejected by the underlying
        # object depending on its type.
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # Momentum buffers, one per parameter.
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        if p.name in self.lr_mult:
            multiplied_lr = lr * self.lr_mult[p.name]
        else:
            multiplied_lr = lr

        v = self.momentum * m - multiplied_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - multiplied_lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    """Build momentum-SGD update ops with a per-parameter learning rate.

    The rate returned by ``self.get_param_learning_rate(p, lr)`` is used
    for both the velocity and the Nesterov look-ahead step. Previously the
    Nesterov branch used the global ``lr``, so the two terms of the same
    update could be scaled differently.

    Args:
        params: variables to optimise.
        constraints: mapping-like object; when a parameter is contained in
            it, the associated callable is applied to the new value before
            it is assigned.
        loss: tensor handed to ``self.get_gradients``.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the velocity update and the parameter update.
    """
    grads = self.get_gradients(loss, params)
    # The decayed rate is built before the counter increment is registered.
    lr = self.lr * (1. / (1. + self.decay * self.iterations))
    self.updates = [K.update_add(self.iterations, 1)]

    # Momentum buffers, one per parameter.
    shapes = [K.get_variable_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        param_lr = self.get_param_learning_rate(p, lr)
        v = self.momentum * m - param_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - param_lr * g
        else:
            new_p = p + v

        # Apply the constraint registered for this parameter, if any.
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    """Build Adadelta-style update ops with a per-parameter learning rate.

    Args:
        params: variables to optimise.
        constraints: mapping-like object; when a parameter is contained in
            it, the associated callable is applied to the new value before
            it is assigned.
        loss: tensor handed to ``self.get_gradients``.

    Returns:
        ``self.updates``: per parameter, the updates of the squared-gradient
        accumulator, of the parameter and of the squared-step accumulator,
        in that order.
    """
    gradients = self.get_gradients(loss, params)
    param_shapes = [K.get_variable_shape(param) for param in params]
    grad_sq_avgs = [K.zeros(dims) for dims in param_shapes]
    step_sq_avgs = [K.zeros(dims) for dims in param_shapes]
    self.weights = grad_sq_avgs + step_sq_avgs
    self.updates = []

    for param, grad, acc, delta_acc in zip(params, gradients,
                                           grad_sq_avgs, step_sq_avgs):
        # Refresh the running average of squared gradients first.
        acc_next = self.rho * acc + (1. - self.rho) * K.square(grad)
        self.updates.append(K.update(acc, acc_next))

        # The step uses the *new* gradient average but the *old* step average.
        step = grad * K.sqrt(delta_acc + self.epsilon) / K.sqrt(acc_next + self.epsilon)
        updated = param - get_learing_rate(param, self.lr) * step

        # Apply the constraint registered for this parameter, if any.
        if param in constraints:
            updated = constraints[param](updated)
        self.updates.append(K.update(param, updated))

        # Finally refresh the running average of squared steps.
        delta_acc_next = self.rho * delta_acc + (1 - self.rho) * K.square(step)
        self.updates.append(K.update(delta_acc, delta_acc_next))
    return self.updates
def call(self, inputs):
    """Run the parent layer with a spectrally normalised kernel.

    The largest singular value of the reshaped kernel is estimated with
    ``max_singular_val``, the estimate of ``self.u`` is registered as a
    layer update, and the parent ``call`` runs with ``self.kernel``
    temporarily replaced by ``self.kernel / sigma``.

    The original kernel is now restored in a ``finally`` clause, so an
    exception raised by the parent ``call`` no longer leaves the layer
    holding the normalised tensor in place of its kernel.

    Args:
        inputs: forwarded unchanged to the parent ``call``.

    Returns:
        Whatever the parent ``call`` returns.
    """
    kernel_shape = K.int_shape(self.kernel)
    if not self.renormalize:
        # Keep the leading axis; merge axes 1-3 into one before the last axis.
        w = K.reshape(self.kernel,
                      (kernel_shape[0],
                       kernel_shape[1] * kernel_shape[2] * kernel_shape[3],
                       kernel_shape[-1]))
        sigma, u_bar = max_singular_val(
            w, self.u,
            transpose=lambda x: ktf.transpose(x, [0, 2, 1]),
            fully_differentiable=self.fully_diff_spectral,
            ip=self.spectral_iterations)
        # One sigma per leading-axis entry, shaped to broadcast over the kernel.
        sigma = K.reshape(sigma, (self.number_of_classes, 1, 1, 1, 1))
    else:
        # Flatten everything but the last axis and use a single sigma.
        w = K.reshape(self.kernel, (-1, kernel_shape[-1]))
        sigma, u_bar = max_singular_val(
            w, self.u,
            fully_differentiable=self.fully_diff_spectral,
            ip=self.spectral_iterations)

    self.add_update(K.update(self.u, u_bar))

    kernel = self.kernel
    self.kernel = self.kernel / sigma
    try:
        outputs = super(SNConditionalConv2D, self).call(inputs)
    finally:
        # Always put the raw kernel back, even if the parent call fails.
        self.kernel = kernel
    return outputs
def get_updates(self, loss, params):
    """Build Adam / AMSGrad update ops with optional per-parameter clipping.

    After the usual constraint handling, a parameter whose ``name`` is a
    key of ``self.clips`` is clipped to the two bounds obtained by
    evaluating the stored entry.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed by the
        per-parameter state and parameter updates.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Step size with both bias-correction factors folded in.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    def zeros_like_params():
        return [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

    first_moments = zeros_like_params()
    second_moments = zeros_like_params()
    if self.amsgrad:
        max_second_moments = zeros_like_params()
    else:
        # Placeholders so the weight list keeps the same layout.
        max_second_moments = [K.zeros(1) for _ in params]
    self.weights = ([self.iterations] + first_moments + second_moments +
                    max_second_moments)

    for param, grad, m, v, vhat in zip(params, gradients, first_moments,
                                       second_moments, max_second_moments):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            new_p = param - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            new_p = param - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_p = param.constraint(new_p)

        if param.name in self.clips.keys():
            # The bounds are evaluated eagerly while the updates are built.
            bounds = K.eval(self.clips[param.name])
            if self.verbose > 0:
                print("Clipping variable", param.name, " to ", bounds)
            new_p = K.clip(new_p, bounds[0], bounds[1])

        self.updates.append(K.update(param, new_p))
    return self.updates
def get_updates(self, loss, params):
    """Build momentum-SGD update ops with lr and momentum multipliers.

    ``self.lr_multipliers`` and ``self.momentum_multipliers`` are each
    either ``None`` or a mapping keyed by parameter name. A parameter found
    in a mapping has the corresponding base value scaled by the stored
    factor; otherwise the base value is used unchanged.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the velocity update and the parameter update.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        # Rebind instead of ``lr *= ...`` so ``self.lr`` is never touched
        # through the alias.
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # Momentum buffers, one per parameter.
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        new_lr = lr
        if (self.lr_multipliers is not None
                and p.name in self.lr_multipliers):
            new_lr = lr * self.lr_multipliers[p.name]

        new_momentum = self.momentum
        if (self.momentum_multipliers is not None
                and p.name in self.momentum_multipliers):
            new_momentum = self.momentum * self.momentum_multipliers[p.name]

        v = new_momentum * m - new_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + new_momentum * v - new_lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def call(self, inputs, training=None):
    """Pass ``inputs[1]`` through while attaching a custom gradient.

    Outside training the last element of ``inputs`` is returned untouched.
    During training ``inputs[0]`` and ``inputs[1]`` go through a
    ``tf.custom_gradient`` function whose forward value is ``inputs[1]``.
    Its backward function updates ``self.embedding`` and returns a
    gradient for the first input built from column differences of the
    embedding, alongside the unchanged incoming gradient for the second.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm that ``bk.get_value`` is meant to run for explicit
    ``training`` values too, and that the final sum belongs to the
    multi-target branch only.

    Args:
        inputs: indexable with at least two elements.
        training: optional training flag; the backend learning phase is
            used when it is ``None``.
    """
    if training is None:
        training = bk.learning_phase()
    training = bk.get_value(training)

    if not training:
        return inputs[-1]

    dtype = self.embedding.dtype
    bk.update(self.call_cnt, self.call_cnt + 1)
    if self.period is not None and self.call_cnt % self.period == 0:
        self.adjust()

    @tf.custom_gradient
    def __delegate(_x, _y):
        x = bk.cast(_x, dtype)
        if 1 == self._target_dim:
            y = bk.cast(_y, dtype) * bk.ones_like(x)
        else:
            # Pair every input column with every target column, then flatten.
            paired = (bk.expand_dims(bk.cast(_y, dtype), 1) *
                      bk.ones_like(bk.expand_dims(x, -1)))
            y = bk.reshape(paired, (-1, self.input_dim * self._target_dim))

        def _grad(dy):
            seg_idx = self._calc_seg_indices(x, self.cur_min, self.cur_max)
            seg_emb = bk.gather(self.embedding, seg_idx)
            self._update_embedding(x, y, seg_idx, seg_emb)

            diffs = diff_by_col_num(self.embedding,
                                    col_num=self.seg_num,
                                    direction='both')
            # Average of the two directional differences.
            grad_x = bk.gather((diffs[0] + diffs[1]) / 2, seg_idx)
            if 1 == self._target_dim:
                grad_x *= dy
            else:
                grad_x = bk.reshape(
                    grad_x,
                    (-1, self.input_dim, self._target_dim)) * bk.expand_dims(dy, 1)
                grad_x = bk.sum(grad_x, axis=-1) / self.input_dim
            return grad_x * self.grad_ease, dy

        return _y, _grad

    return __delegate(inputs[0], inputs[1])
def get_updates(self, loss, params):
    """Build Adam update ops with decoupled, normalised weight decay.

    Changes from the previous version: the stray ``print(lr)`` debugging
    statement is gone, the bare string literals used as comments are real
    comments, and the weight-decay factor (identical for every parameter)
    is computed once instead of once per parameter.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the updates of both moment estimates and of the
        parameter itself.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Bias corrections according to the Adam paper.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    # Schedule multiplier: eta_t = 1 for simple AdamW. According to the
    # AdamW paper, eta_t can be fixed, decay, or also be used for warm
    # restarts (AdamWR to come).
    eta_t = 1.

    # Normalized weight decay according to the AdamW paper.
    w_d = None
    if self.weight_decay != 0:
        w_d = self.weight_decay * K.sqrt(
            self.batch_size / (self.samples_per_epoch * self.epochs))

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        p_t = p - eta_t * (lr_t * m_t / (K.sqrt(v_t) + self.epsilon))
        if w_d is not None:
            # Decay is applied to the weights directly, not via the gradient.
            p_t = p_t - eta_t * (w_d * p)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    """Build Adadelta update ops combined with a momentum-SGD step.

    Each parameter receives the Adadelta step plus a velocity term
    ``v = momentum * m - lr * g``. The velocity is now written back to its
    buffer ``m``; previously the buffers were allocated but never updated,
    so they stayed at zero and ``self.momentum`` had no effect.

    NOTE(review): the momentum buffers are still not part of
    ``self.weights`` (kept as before so the layout of the optimizer state
    does not change) — confirm whether they should be included.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed by the
        per-parameter state and parameter updates.
    """
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    accumulators = [
        K.zeros(shape, name='accumulator_' + str(i))
        for (i, shape) in enumerate(shapes)
    ]
    delta_accumulators = [
        K.zeros(shape, name='delta_accumulator_' + str(i))
        for (i, shape) in enumerate(shapes)
    ]
    self.weights = [self.iterations] + accumulators + delta_accumulators
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    moments = [
        K.zeros(shape, name='moment_' + str(i))
        for (i, shape) in enumerate(shapes)
    ]

    for p, g, a, d_a, m in zip(params, grads, accumulators,
                               delta_accumulators, moments):
        # Momentum-SGD velocity; stored so the next step can build on it.
        v = self.momentum * m - lr * g
        self.updates.append(K.update(m, v))

        # Update accumulator.
        new_a = self.rho * a + (1. - self.rho) * K.square(g)
        self.updates.append(K.update(a, new_a))

        # Use the new accumulator and the *old* delta_accumulator.
        update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
        new_p = p - lr * update + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

        # Update delta_accumulator.
        new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
        self.updates.append(K.update(d_a, new_d_a))
    return self.updates
def get_updates(self, loss, params):
    """Build update ops that alternate between two phases.

    ``self.state_counter`` drives the phases. While it is at most
    ``self.mean_calculation_step`` the parameters are left unchanged and
    the gradient is accumulated into a running mean. Afterwards the
    parameters are moved along ``g + g_mean - old_g``. Once the counter
    exceeds ``mean_calculation_step + update_step`` it is reset to zero.

    NOTE(review): ``prev_weights`` holds the very same variable objects as
    ``params`` (it is a shallow list copy), so ``old_grads`` is computed
    with respect to the live parameters — confirm whether independent
    snapshot variables were intended.

    NOTE(review): nesting was reconstructed from a flattened source; the
    counter reset is placed after the loop because it uses no per-parameter
    value — confirm.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.updates.append(K.update_add(self.state_counter, 1))

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    grad_means = [K.zeros(K.int_shape(param)) for param in params]
    prev_weights = list(params)
    self.weights = [self.iterations] + grad_means + prev_weights
    old_gradients = self.get_gradients(loss, prev_weights)

    stats_steps = self.mean_calculation_step
    cycle_steps = self.mean_calculation_step + self.update_step

    for param, grad, mean, prev, old_grad in zip(
            params, gradients, grad_means, prev_weights, old_gradients):
        # --- update part ---
        corrected = grad + mean - old_grad
        candidate = param + (-lr * corrected)

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            candidate = param.constraint(candidate)

        # Only move the parameter once the statistics phase is over.
        candidate = K.switch(self.state_counter > stats_steps,
                             candidate, param)
        self.updates.append(K.update(param, candidate))

        # --- statistics part ---
        contribution = K.switch(self.state_counter <= stats_steps,
                                grad * (1.0 / stats_steps),
                                K.zeros_like(grad))
        self.updates.append(K.update_add(mean, contribution))

        # Switch statistics -> update: remember the current parameter value.
        snapshot = K.switch(self.state_counter <= stats_steps, param, prev)
        self.updates.append(K.update(prev, snapshot))

        # Switch update -> statistics: clear the mean at the end of a cycle.
        reset_mean = K.switch(K.equal(self.state_counter, cycle_steps),
                              K.zeros_like(mean), mean)
        self.updates.append(K.update(mean, reset_mean))

    next_counter = K.switch(self.state_counter > cycle_steps,
                            K.constant(0, dtype='int64'),
                            self.state_counter)
    self.updates.append(K.update(self.state_counter, next_counter))
    return self.updates
def get_updates(self, loss, params):
    """Build Adam / AMSGrad update ops with a clipped per-element step.

    The per-element step ``lr_t / (sqrt(v) + epsilon)`` is clamped between
    a lower and an upper bound that both depend on the iteration count
    through ``self.gamma`` and are scaled by
    ``self.final_lr * lr / self.base_lr``.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed by the
        per-parameter state and parameter updates.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Step size with both bias-correction factors folded in.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    # Bounds follow the (possibly decayed) learning rate.
    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1.0 - 1.0 / (self.gamma * t + 1.0))
    upper_bound = final_lr * (1.0 + 1.0 / (self.gamma * t))

    def zeros_like_params():
        return [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

    first_moments = zeros_like_params()
    second_moments = zeros_like_params()
    if self.amsgrad:
        max_second_moments = zeros_like_params()
    else:
        # Placeholders so the weight list keeps the same layout.
        max_second_moments = [K.zeros(1) for _ in params]
    self.weights = ([self.iterations] + first_moments + second_moments +
                    max_second_moments)

    for param, grad, m, v, vhat in zip(params, gradients, first_moments,
                                       second_moments, max_second_moments):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad)

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            step = lr_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            step = lr_t / (K.sqrt(v_t) + self.epsilon)

        bounded_step = K.minimum(K.maximum(step, lower_bound), upper_bound)
        new_p = param - bounded_step * m_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_p = param.constraint(new_p)

        self.updates.append(K.update(param, new_p))
    return self.updates
def get_updates(self, params, loss, contraints=None):
    """Build update ops for a sign-adaptive, per-weight step-size rule.

    For every weight a factor ``zeta`` grows by ``self.eta_plus`` when the
    current and previous gradients agree in sign, shrinks by
    ``self.eta_minus`` when they disagree, and is kept within
    ``[self.zeta_min, self.zeta_max]``. Moving averages of ``zeta`` and of
    the squared gradient are maintained with ``self.alpha``.

    Fixes compared with the previous version:
      * the shrink branch referenced the undefined name ``expmA``, which
        raised ``NameError`` as soon as the updates were built;
      * the previous gradient is now stored after each step — before, it
        stayed at zero so the sign comparison could never fire;
      * the learning rate is rebound rather than modified in place;
      * an unused local was removed.

    Args:
        params: variables to optimise.
        loss: tensor handed to ``self.get_gradients``.
        contraints: accepted for interface compatibility; not used.

    Returns:
        ``self.updates``.
    """
    self.updates = [K.update_add(self.iterations, 1)]
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    old_grads = [K.zeros(shape) for shape in shapes]
    weights = [K.zeros(shape) for shape in shapes]

    # Learning rate.
    learning_rate = self.learning_rate
    if self.initial_decay > 0:
        learning_rate = learning_rate * (1. / (1. + self.decay * self.iterations))

    # Per-weight state: step-size factor and the two moving averages.
    zeta = [K.ones(shape) for shape in shapes]
    Z = [K.zeros(shape) for shape in shapes]
    theta = [K.zeros(shape) for shape in shapes]

    for p, g, w, expMA, prevZ, prevTheta, old_g in zip(
            params, grads, weights, zeta, Z, theta, old_grads):
        change = g * old_g
        pos_change = K.greater(change, 0.)
        neg_change = K.less(change, 0.)

        # Grow, shrink or keep the per-weight factor depending on whether
        # the gradient kept its sign.
        zeta_t = K.switch(
            pos_change,
            K.minimum(expMA * self.eta_plus, self.zeta_max),
            K.switch(neg_change,
                     K.maximum(expMA * self.eta_minus, self.zeta_min),
                     expMA))
        zeta_clip = K.clip(zeta_t, self.zeta_min, self.zeta_max)

        # Moving averages of the factor and of the squared gradient.
        Z_t = (self.alpha * prevZ) + ((1 - self.alpha) * zeta_t)
        theta_t = (self.alpha * prevTheta) + ((1 - self.alpha) * K.square(g))

        wChange = - (learning_rate * (zeta_clip / zeta_t) * g) / K.sqrt(theta_t + self.epsilon)
        new_weight = w + wChange
        # ``w`` accumulates the applied changes; move ``p`` by this step's delta.
        p_update = p - w + new_weight

        self.updates.append(K.update(p, p_update))
        self.updates.append(K.update(w, new_weight))
        self.updates.append(K.update(expMA, zeta_t))
        self.updates.append(K.update(prevZ, Z_t))
        self.updates.append(K.update(prevTheta, theta_t))
        # Remember this gradient for the next sign comparison.
        self.updates.append(K.update(old_g, g))
    return self.updates
def _reset_moving_statistics(layer):
    """Overwrite ``layer.moving_mean`` and ``layer.moving_variance`` with zeros."""
    K.eval(K.update(layer.moving_mean, K.zeros_like(layer.moving_mean)))
    K.eval(K.update(layer.moving_variance,
                    K.zeros_like(layer.moving_variance)))


def set_weights_for_training(model, fine_tune, layer_num=(81, 174)):
    """Set the ``trainable`` flag of a model's layers for a training state.

    Takes a model and a training state, i.e. ``fine_tune = True``, and sets
    the layers accordingly. Fine-tuning unlocks from layer 81
    (res4a_branch2a).

    * ``fine_tune`` false: layers before ``layer_num[1]`` are frozen, except
      layers exposing ``moving_mean`` and ``moving_variance``, which stay
      trainable and have both statistics reset to zeros.
    * ``fine_tune`` true: layers in ``[layer_num[0], layer_num[1])`` are made
      trainable; those exposing moving statistics have them reset to zeros.
      Layers before ``layer_num[0]`` are left untouched.

    In both cases every layer from ``layer_num[1]`` onwards is trainable.

    The default for ``layer_num`` is now an immutable tuple rather than a
    shared mutable list; any two-element indexable is still accepted.

    Input:
        model - ResNet_UNet model by default, can be any model
        fine_tune - bool to signify training state
        layer_num - layers to lock/unlock from. Default upper index is 174
                    (up_sampling2d_1); 173 is add_16.
    Output:
        None
    """
    backbone_start, head_start = layer_num[0], layer_num[1]

    if not fine_tune:
        print("[INFO] base model...")
        # ResNet layers
        for layer in model.layers[0:head_start]:
            # Opens up mean and variance for training
            if hasattr(layer, 'moving_mean') and hasattr(layer, 'moving_variance'):
                layer.trainable = True
                _reset_moving_statistics(layer)
            else:
                layer.trainable = False
    else:
        print("[INFO] fine tuning model...")
        # ResNet layers
        for layer in model.layers[backbone_start:head_start]:
            layer.trainable = True
            # Opens up mean and variance for training
            if hasattr(layer, 'moving_mean') and hasattr(layer, 'moving_variance'):
                _reset_moving_statistics(layer)

    # UNet layers
    for layer in model.layers[head_start:]:
        layer.trainable = True
def get_updates(self, loss, params):
    """Build momentum-SGD update ops with annealing, multipliers and decay.

    The per-parameter step now starts from the time-decayed learning rate
    ``lr``. Previously it restarted from ``self.lr``, so ``self.decay``
    only influenced the tracked ``self.lr_t`` and never the actual update.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the increments of ``self.iterations`` and
        ``self.t_cur`` followed, per parameter, by the velocity update and
        the parameter update.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.updates.append(K.update_add(self.t_cur, 1))

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # Momentum buffers, one per parameter.
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    total_iterations = self.total_iterations

    # Cosine annealing: when disabled, the existing ``self.eta_t`` is used.
    if self.use_cosine_annealing and total_iterations != 0:
        self.eta_t = _compute_eta_t(self)
    self.lr_t = lr * self.eta_t  # for external tracking

    for p, g, m in zip(params, grads, moments):
        # Learning rate multipliers, applied on top of the decayed rate.
        lr_t = lr
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, p)

        v = self.momentum * m - self.eta_t * lr_t * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            p_t = p + self.momentum * v - self.eta_t * lr_t * g
        else:
            p_t = p + v

        # Weight decays.
        if p.name in self.weight_decays.keys() and total_iterations != 0:
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    self._init_notified = True
    return self.updates
def get_updates(self, loss, params):
    """Build update ops for an Adam variant with a prior-precision term.

    Compared with plain Adam, the first moment receives an additional
    ``prior_prec * p / train_set_size`` term, the denominator uses
    ``prior_prec / train_set_size`` in place of epsilon, and epsilon is
    added to the parameter itself before the step is subtracted.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the updates of both moment estimates and of the
        parameter itself.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Step size with both bias-correction factors folded in.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    first_moments = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
        for (i, p) in enumerate(params)
    ]
    second_moments = [
        K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
        for (i, p) in enumerate(params)
    ]
    # Placeholder entries; they are stored in the weights but never updated.
    vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    self.weights = [self.iterations] + first_moments + second_moments + vhats

    for param, grad, m, v, _vhat in zip(params, gradients, first_moments,
                                        second_moments, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad + (
            (self.prior_prec * param) / self.train_set_size)
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad)

        new_p = (param + self.epsilon) - lr_t * m_t / (
            K.sqrt(v_t) + self.prior_prec / self.train_set_size)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_p = param.constraint(new_p)

        self.updates.append(K.update(param, new_p))
    return self.updates
def get_updates(self, loss, params):
    """Build update ops whose step mixes the gradient with the previous step.

    All gradients are flattened into one vector ``G`` and all stored
    previous steps into one vector ``F``. Two scalars ``lamda1`` and
    ``lamda2`` are derived from their inner products together with
    ``self.dP`` and ``self.xi``. The new step is a combination of ``g`` and
    the stored step ``m``; while ``F`` is all zeros it falls back to
    ``-dP * g``.

    ``dP`` is now rebound (``dP = dP * ...``) rather than modified in place,
    so applying the decay can never touch ``self.dP`` through the alias.

    Args:
        loss: tensor differentiated with ``K.gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the stored-step update and the parameter update.
    """
    grads = K.gradients(loss, params)
    flattenedgrads = [K.flatten(x) for x in grads]
    G = K.concatenate(flattenedgrads)
    self.updates = []

    dP = self.dP
    xi = self.xi
    if self.initial_decay > 0:
        dP = dP * (1. / (1. + self.decay * self.iterations))
        self.updates.append(K.update_add(self.iterations, 1))

    # Previous steps, one buffer per parameter.
    shapes = [K.get_variable_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    flattenedmoments = [K.flatten(x) for x in moments]
    F = K.concatenate(flattenedmoments)
    self.weights = [self.iterations] + moments

    # Inner products over the whole parameter set.
    IGG = K.sum(G * G)
    IFF = K.sum(F * F)
    IGF = K.sum(G * F)

    dQ = -xi * dP * K.sqrt(IGG)
    lamda2 = 0.5 * K.sqrt((IFF * IGG - IGF * IGF) / (IGG * dP * dP - dQ * dQ))
    lamda1 = (-2 * lamda2 * dQ + IGF) / IGG

    for p, g, m in zip(params, grads, moments):
        # No previous step yet (F is all zeros): take a plain gradient step.
        cond = K.greater(IFF, 0.0)
        v = K.switch(cond,
                     -((lamda1 / (2 * lamda2)) * g) + ((1 / (2 * lamda2)) * m),
                     -dP * g)
        self.updates.append(K.update(m, v))
        new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates_by_grads(self, grads, params):
    """Return plain gradient-descent updates for already computed gradients.

    Args:
        grads: gradients, paired positionally with ``params``.
        params: variables to update.

    Returns:
        A new list with one ``K.update(p, p - self.lr * g)`` op per
        ``(gradient, parameter)`` pair; empty when there are no pairs.
    """
    return [
        K.update(param, param - self.lr * grad)
        for grad, param in zip(grads, params)
    ]
def get_updates(self, params, constraints, loss):
    """Build Adagrad-style update ops with a per-parameter learning rate.

    Args:
        params: variables to optimise.
        constraints: mapping-like object; when a parameter is contained in
            it, the associated callable is applied to the new value before
            it is assigned.
        loss: tensor handed to ``self.get_gradients``.

    Returns:
        ``self.updates``: per parameter, the accumulator update followed by
        the parameter update.
    """
    gradients = self.get_gradients(loss, params)
    grad_sq_sums = [K.zeros(K.get_variable_shape(param)) for param in params]
    self.weights = grad_sq_sums
    self.updates = []

    for param, grad, acc in zip(params, gradients, grad_sq_sums):
        # Accumulate the squared gradient.
        acc_next = acc + K.square(grad)
        self.updates.append(K.update(acc, acc_next))

        param_lr = get_learing_rate(param, self.lr)
        updated = param - param_lr * grad / (K.sqrt(acc_next) + self.epsilon)

        # Apply the constraint registered for this parameter, if any.
        if param in constraints:
            updated = constraints[param](updated)
        self.updates.append(K.update(param, updated))
    return self.updates
def get_updates(self, loss, params):
    """Build Adam-style update ops scaled by a layer-wise trust ratio.

    For each parameter the bias-corrected Adam direction (plus optional
    weight decay) is scaled by ``||p|| / ||direction||`` times the learning
    rate.

    The ratio is now guarded: when either norm is zero it falls back to 1.
    Previously a zero-valued parameter produced a ratio of 0 and therefore
    never moved, and a zero-norm direction produced a division by zero.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the updates of both moment estimates and of the
        parameter itself.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        # Bias-corrected moment estimates.
        m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
        v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

        p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))

        if self.weight_decay > 0.:
            wd = self.weight_decay * p
            p_dash = p_dash + wd

        # Trust ratio: parameter norm over update norm, 1 when either is zero.
        r1 = K.sqrt(K.sum(K.square(p)))
        r2 = K.sqrt(K.sum(K.square(p_dash)))
        one = K.ones_like(r1)
        r = K.switch(K.greater(r1, 0.),
                     K.switch(K.greater(r2, 0.), r1 / r2, one),
                     one)

        eta = r * lr
        p_t = p - eta * p_dash

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    """Build Adam / AMSGrad update ops with an optional second-moment reset.

    When ``self.minThres`` is truthy, every entry of the new second moment
    that exceeds ``self.beta_3`` is replaced by zero before it is used and
    stored.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed by the
        per-parameter state and parameter updates.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    # Step size with both bias-correction factors folded in.
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    def zeros_like_params():
        return [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

    first_moments = zeros_like_params()
    second_moments = zeros_like_params()
    if self.amsgrad:
        max_second_moments = zeros_like_params()
    else:
        # Placeholders so the weight list keeps the same layout.
        max_second_moments = [K.zeros(1) for _ in params]
    self.weights = ([self.iterations] + first_moments + second_moments +
                    max_second_moments)

    for param, grad, m, v, vhat in zip(params, gradients, first_moments,
                                       second_moments, max_second_moments):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad)

        if self.minThres:
            # Zero out the entries that went above the threshold.
            above_threshold = v_t > self.beta_3
            v_t = array_ops.where(above_threshold,
                                  array_ops.zeros_like(v_t), v_t)

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            new_p = param - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            new_p = param - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_p = param.constraint(new_p)

        self.updates.append(K.update(param, new_p))
    return self.updates
def inject(self, model):
    """Inject the Lookahead algorithm for the given model.

    Replaces ``model.train_function`` (only when it is still ``None``) with
    a wrapper that runs the regular training step and, on every
    ``self.k``-th call, moves a set of slow weights towards the fast
    weights by ``self.alpha`` and copies the slow weights back.

    The following code is modified from keras's _make_train_function
    method.
    See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497

    NOTE(review): name-scope nesting was reconstructed from a flattened
    source; it only affects op names — confirm against the original.

    Raises:
        RuntimeError: if ``model`` has no ``train_function`` attribute.
    """
    if not hasattr(model, 'train_function'):
        raise RuntimeError('You must compile your model before using it.')

    model._check_trainable_weights_consistency()

    # An existing train function is left as it is.
    if model.train_function is not None:
        return

    inputs = (model._feed_inputs +
              model._feed_targets +
              model._feed_sample_weights)
    if model._uses_dynamic_learning_phase():
        inputs += [K.learning_phase()]
    fast_params = model._collected_trainable_weights

    with K.name_scope('training'):
        with K.name_scope(model.optimizer.__class__.__name__):
            training_updates = model.optimizer.get_updates(
                params=fast_params,
                loss=model.total_loss)
            slow_params = [K.variable(weight) for weight in fast_params]
        fast_updates = (model.updates +
                        training_updates +
                        model.metrics_updates)

        slow_updates, copy_updates = [], []
        for fast, slow in zip(fast_params, slow_params):
            # Move the slow weight a fraction alpha towards the fast one...
            slow_updates.append(K.update(slow, slow + self.alpha * (fast - slow)))
            # ...and restart the fast weight from the slow one.
            copy_updates.append(K.update(fast, slow))

        # Gets loss and metrics. Updates weights at each call.
        fast_train_function = K.function(
            inputs,
            [model.total_loss] + model.metrics_tensors,
            updates=fast_updates,
            name='fast_train_function',
            **model._function_kwargs)

        def F(inputs):
            self.count += 1
            result = fast_train_function(inputs)
            if self.count % self.k == 0:
                K.batch_get_value(slow_updates)
                K.batch_get_value(copy_updates)
            return result

        model.train_function = F
def get_updates(self, params, gparams):
    """Build Adam update ops from already computed gradients.

    The moment buffers are taken from ``self.ms`` and ``self.vs`` and are
    paired positionally with ``params`` and ``gparams``.

    Args:
        params: variables to optimise.
        gparams: gradients, one per parameter.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the updates of both moment estimates and of the
        parameter itself.
    """
    self.updates = [K.update_add(self.iterations, 1)]

    step = self.iterations + 1.
    # Step size with both bias-correction factors folded in.
    corrected_lr = self.lr * K.sqrt(1. - K.pow(self.beta_2, step)) / (
        1. - K.pow(self.beta_1, step))

    for param, grad, m_prev, v_prev in zip(params, gparams, self.ms, self.vs):
        m_next = (self.beta_1 * m_prev) + (1. - self.beta_1) * grad
        v_next = (self.beta_2 * v_prev) + (1. - self.beta_2) * K.square(grad)
        updated = param - corrected_lr * m_next / (K.sqrt(v_next) + self.epsilon)

        self.updates.extend([
            K.update(m_prev, m_next),
            K.update(v_prev, v_next),
            K.update(param, updated),
        ])
    return self.updates
def get_updates(self, loss, params):
    """Build momentum-SGD update ops, optionally with Nesterov look-ahead.

    NOTE: the gradients are used as returned by ``self.get_gradients``.
    Earlier commented-out code sketched a per-example sanitisation step
    (noise addition via ``self._sanitizer``) in two places here; it is not
    active.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment followed, per
        parameter, by the velocity update and the parameter update.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    # Momentum buffers, one per parameter.
    velocities = [K.zeros(K.int_shape(param)) for param in params]
    self.weights = [self.iterations] + velocities

    for param, grad, velocity in zip(params, gradients, velocities):
        new_velocity = self.momentum * velocity - lr * grad
        self.updates.append(K.update(velocity, new_velocity))

        if self.nesterov:
            # Look ahead along the freshly computed velocity.
            updated = param + self.momentum * new_velocity - lr * grad
        else:
            updated = param + new_velocity

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            updated = param.constraint(updated)

        self.updates.append(K.update(param, updated))
    return self.updates
def get_updates(self, loss, params):
    """Build Nesterov-accelerated Adam update ops.

    The momentum coefficient follows a warming schedule driven by
    ``self.schedule_decay``; the running product of the coefficients is
    kept in ``self.m_schedule``.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``: the iteration-counter increment, the
        ``(self.m_schedule, new_value)`` pair and then, per parameter, the
        updates of both moment estimates and of the parameter itself.
    """
    gradients = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = K.cast(self.iterations, K.floatx()) + 1

    # Warming momentum schedule: coefficient for this step and for the next.
    mu_t = self.beta_1 * (
        1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
    mu_t_1 = self.beta_1 * (
        1. - 0.5 * (K.pow(K.cast_to_floatx(0.96),
                          (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * mu_t
    m_schedule_next = self.m_schedule * mu_t * mu_t_1
    # Registered as a (variable, new value) pair rather than an update op.
    self.updates.append((self.m_schedule, m_schedule_new))

    param_shapes = [K.int_shape(param) for param in params]
    first_moments = [K.zeros(dims) for dims in param_shapes]
    second_moments = [K.zeros(dims) for dims in param_shapes]
    self.weights = [self.iterations] + first_moments + second_moments

    for param, grad, m, v in zip(params, gradients,
                                 first_moments, second_moments):
        g_prime = grad / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * grad
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(grad)
        v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
        # Blend the corrected gradient with the corrected look-ahead moment.
        m_t_bar = (1. - mu_t) * g_prime + (mu_t_1 * m_t_prime)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = param - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_p = param.constraint(new_p)

        self.updates.append(K.update(param, new_p))
    return self.updates
def call(self, inputs, training=None):
    """Normalise ``inputs`` with batch statistics plus clipped corrections.

    The minibatch mean and sigma are corrected by two gradient-stopped
    terms ``r`` and ``d`` derived from the moving statistics and clipped
    with ``rmax`` / ``dmax``. The moving statistics, the step counter and
    the clipping bounds are registered as layer updates.

    Args:
        inputs: a single tensor (a list is rejected by an assertion).
        training: optional training flag forwarded to ``K.in_train_phase``.

    Returns:
        ``gamma * normalised + beta``.
    """
    x = inputs
    assert not isinstance(x, list)

    # Minibatch statistics.
    batch_mean, batch_var = self._moments(x)
    batch_sigma = K.sqrt(batch_var + self.epsilon)

    # In the training phase the current bounds are used; otherwise they are
    # set very large so the clipping is effectively disabled.
    rmax = K.in_train_phase(self.rmax, K.constant(1e5), training)
    dmax = K.in_train_phase(self.dmax, K.constant(1e5), training)

    # Corrections relative to the moving statistics; no gradient flows
    # through them.
    scale_ratio = self._clip(batch_sigma / self.moving_sigma, 1. / rmax, rmax)
    r = K.stop_gradient(scale_ratio)
    mean_shift = self._clip(
        (batch_mean - self.moving_mean) / self.moving_sigma, -dmax, dmax)
    d = K.stop_gradient(mean_shift)

    # Normalise, correct, then rescale and shift.
    normalised = ((x - batch_mean) / batch_sigma) * r + d
    y = self.gamma * normalised + self.beta

    # Moving-average updates, registered against this input.
    moving_stat_updates = [
        K.moving_average_update(self.moving_mean, batch_mean, self.momentum),
        K.moving_average_update(self.moving_sigma, batch_sigma, self.momentum),
    ]
    self.add_update(moving_stat_updates, x)

    # Progress of the bounds from their *_0 towards their *_inf values.
    rmax_prog = K.minimum(1., self.steps / self.rmax_dur)
    dmax_prog = K.minimum(1., self.steps / self.dmax_dur)
    bound_updates = [
        K.update_add(self.steps, 1),
        K.update(self.rmax,
                 self.rmax_0 + rmax_prog * (self.rmax_inf - self.rmax_0)),
        K.update(self.dmax,
                 self.dmax_0 + dmax_prog * (self.dmax_inf - self.dmax_0)),
    ]
    self.add_update(bound_updates)

    # Fix the output's uses learning phase.
    y._uses_learning_phase = rmax._uses_learning_phase
    return y
def get_updates(self, loss, params):
    """Build trust-ratio Adam update ops with gradient accumulation.

    Gradients are clipped to a global norm of 1.0 and accumulated over
    ``self.accum_iters`` calls. Only on the last call of each group are the
    moments and the parameter actually changed; on the other calls the
    stored values are written back unchanged and the gradient sum grows.

    The trust ratio ``||p|| / ||u||`` is now guarded and falls back to 1
    when either norm is zero. Before, a zero norm produced NaN/inf, and
    because the non-update steps are masked arithmetically
    (``0 * NaN`` is NaN) the NaN was written into the parameter even on
    steps that were not supposed to change it.

    NOTE(review): the gradient-sum buffers are not part of
    ``self.weights`` — confirm whether that is intended.

    Args:
        loss: tensor handed to ``self.get_gradients``.
        params: variables to optimise.

    Returns:
        ``self.updates``.
    """
    grads = self.get_gradients(loss, params)
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    self.updates = [K.update_add(self.iterations, 1)]

    # Number of completed parameter updates drives the bias correction.
    completed_updates = K.cast(
        tf.floordiv(self.iterations, self.accum_iters), K.floatx())
    t = completed_updates + 1

    # 1.0 on the call that closes an accumulation group, 0.0 otherwise.
    update_switch = K.equal((self.iterations + 1) % self.accum_iters, 0)
    update_switch = K.cast(update_switch, K.floatx())

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v, tg in zip(params, grads, ms, vs, gs):
        sum_grad = tg + g
        avg_grad = sum_grad / self.accum_iters_float

        m_t = m * self.beta_1 + (1. - self.beta_1) * avg_grad
        v_t = v * self.beta_2 + (1. - self.beta_2) * K.square(avg_grad)

        # Bias-corrected moment estimates.
        m_hat = m_t / (1. - K.pow(self.beta_1, t))
        v_hat = v_t / (1. - K.pow(self.beta_2, t))

        u = m_hat / (K.sqrt(v_hat) + self.epsilon) + self.weight_decay * p

        # Trust ratio: parameter norm over update norm, 1 when either is zero.
        p_norm = K.sqrt(K.sum(K.square(p)))
        u_norm = K.sqrt(K.sum(K.square(u)))
        one = K.ones_like(p_norm)
        r = K.switch(K.greater(p_norm, 0.),
                     K.switch(K.greater(u_norm, 0.), p_norm / u_norm, one),
                     one)

        p_t = p - self.lr * r * u

        self.updates.append(
            K.update(m, (1 - update_switch) * m + update_switch * m_t))
        self.updates.append(
            K.update(v, (1 - update_switch) * v + update_switch * v_t))
        # The sum is cleared on the call that applies it.
        self.updates.append(K.update(tg, (1 - update_switch) * sum_grad))
        self.updates.append(
            K.update(p, (1 - update_switch) * p + update_switch * p_t))
    return self.updates
def get_updates(self, loss, params):
    """Build update ops for an Adam-style step with a partial exponent.

    The denominator `sqrt(v) + epsilon` (or its running maximum when
    `self.amsgrad` is set) is raised to the power `self.partial * 2`
    before dividing the first moment.

    Arguments:
        loss: scalar loss tensor to differentiate.
        params: list of variables to optimize.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    def _state_like(param):
        # Zero-filled slot with the shape and dtype of `param`.
        return K.zeros(K.int_shape(param), dtype=K.dtype(param))

    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    step = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, step)) /
                 (1. - K.pow(self.beta_1, step)))

    ms = [_state_like(p) for p in params]
    vs = [_state_like(p) for p in params]
    if self.amsgrad:
        vhats = [_state_like(p) for p in params]
    else:
        # Placeholders so the weights list has the same layout either way.
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    exponent = self.partial * 2

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsgrad:
            # Track the running maximum of the second moment.
            vhat_t = K.maximum(vhat, v_t)
            self.updates.append(K.update(vhat, vhat_t))
            second_moment = vhat_t
        else:
            second_moment = v_t
        denom = (K.sqrt(second_moment) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        # Partially adaptive step.
        new_p = p - (lr_t * (m_t / (denom**exponent)))

        constraint = getattr(p, 'constraint', None)
        if constraint is not None:
            new_p = constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    """Build update ops for an Adamax-style step with extras.

    On top of the first-moment / infinity-norm update, each parameter can
    have its learning rate scaled through `self.lr_multipliers` (matched
    by substring against `p.name`) and is decayed by `self.wd`.

    Arguments:
        loss: scalar loss tensor to differentiate.
        params: list of variables to optimize.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    step = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr / (1. - K.pow(self.beta_1, step))

    shapes = [K.int_shape(p) for p in params]
    # First moments, initialised to zero.
    ms = [K.zeros(shape) for shape in shapes]
    # Exponentially weighted infinity norms, initialised to zero.
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us

    for p, g, m, u in zip(params, grads, ms, us):
        # Per-parameter learning rate: the first multiplier key that occurs
        # in the parameter name wins.
        param_lr = lr_t
        if self.lr_multipliers is not None:
            matches = [key for key in self.lr_multipliers if key in p.name]
            if matches:
                param_lr = lr_t * self.lr_multipliers[matches[0]]

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        u_t = K.maximum(self.beta_2 * u, K.abs(g))
        p_t = p - param_lr * m_t / (u_t + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(u, u_t))

        # Weight decay, scaled by the same per-parameter learning rate.
        new_p = p_t - param_lr * self.wd * p

        constraint = getattr(p, 'constraint', None)
        if constraint is not None:
            new_p = constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, loss, params):
    """Build update ops for an Adam step applied every `accum_iters` calls.

    Gradients are summed into per-parameter accumulators `gs`. Moments and
    parameters only take their new values when
    `iterations % accum_iters == 0`; otherwise the old values are written
    back. The iteration counter is advanced through a control dependency
    rather than through an entry in `self.updates`.

    Arguments:
        loss: scalar loss tensor to differentiate.
        params: list of variables to optimize.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    # The increment is not added to self.updates; it is attached below as a
    # control dependency of the ops that read the counter.
    new_iter_op = tf.assign_add(self.iterations, 1)
    self.updates = []
    lr = self.lr
    # NOTE(review): the extent of this `with` block was inferred. Every
    # update below consumes accum_switch, so each one depends on the
    # increment either way -- confirm against the upstream source.
    with tf.control_dependencies([new_iter_op]):
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
        # 1.0 when an update is applied on this call, 0.0 while accumulating.
        accum_switch = K.cast(K.equal(self.iterations % self.accum_iters, 0), dtype=K.floatx())
        # Step count used for the bias-correction terms in lr_t.
        t = K.cast(self.iterations // self.accum_iters, K.floatx()) + 1
    accum_iters = K.cast(self.accum_iters, dtype=K.floatx())
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    # NOTE(review): the accumulators `gs` are not included in self.weights.
    self.weights = [self.iterations] + ms + vs
    for p, gp, m, v, ga in zip(params, grads, ms, vs, gs):
        # Accumulated gradient plus the current one, divided by accum_iters.
        g = (ga + gp) / accum_iters
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
        # Blend old and new values so state only changes when the switch is 1.
        self.updates.append(K.update(m, (1 - accum_switch) * m + accum_switch * m_t))
        self.updates.append(K.update(v, (1 - accum_switch) * v + accum_switch * v_t))
        # Keep summing while accumulating; reset to zero once applied.
        self.updates.append(K.update(ga, (1 - accum_switch) * (ga + gp)))
        new_p = p_t
        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, (1 - accum_switch) * p + accum_switch * new_p))
    return self.updates
def call(self, inputs, **kwargs):
    """Transform input capsules with `self.W` and combine them by routing.

    Arguments:
        inputs: tensor of shape [None, input_num_capsule, input_dim_vector].
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        Tensor reshaped to [-1, num_capsule, dim_vector].
    """
    # [None, input_num_capsule, input_dim_vector]
    #   -> [None, input_num_capsule, 1, 1, input_dim_vector]
    expanded = K.expand_dims(K.expand_dims(inputs, 2), 2)

    # Repeat along the num_capsule axis so it can be multiplied by W:
    # [None, input_num_capsule, num_capsule, 1, input_dim_vector]
    tiled = K.tile(expanded, [1, 1, self.num_capsule, 1, 1])

    # Disabled alternative: compute inputs * W by expanding the first dim
    # of W. More time-consuming and needs batch_size.
    #   # W becomes [batch_size, input_num_capsule, num_capsule,
    #   #            input_dim_vector, dim_vector]
    #   w_tiled = K.tile(K.expand_dims(self.W, 0),
    #                    [self.batch_size, 1, 1, 1, 1])
    #   # [batch_size, input_num_capsule, num_capsule, 1, dim_vector]
    #   inputs_hat = K.batch_dot(inputs_tiled, w_tiled, [4, 3])

    def _apply_weights(_, sample):
        # The scan accumulator is ignored; each sample is mapped on its own.
        return K.batch_dot(sample, self.W, [3, 2])

    # Compute `inputs * W` by scanning over dimension 0 of the tiled input.
    # This is faster but requires Tensorflow.
    # shape = [None, input_num_capsule, num_capsule, 1, dim_vector]
    scan_init = K.zeros([
        self.input_num_capsule, self.num_capsule, 1, self.dim_vector
    ])
    inputs_hat = tf.scan(_apply_weights, elems=tiled, initializer=scan_init)

    # Disabled alternative: routing algorithm V1, using tf.while_loop in a
    # dynamic way.
    #   def body(i, b, outputs):
    #       c = K.softmax(b)
    #       c_expand = K.expand_dims(
    #           K.expand_dims(K.expand_dims(c, 2), 2), 0)
    #       outputs = K.sum(c_expand * inputs_hat, 1, keepdims=True)
    #       outputs = squash(outputs)
    #       b = b + K.sum(inputs_hat * outputs, [0, -2, -1])
    #       return [i-1, b, outputs]
    #   cond = lambda i, b, inputs_hat: i > 0
    #   loop_vars = [K.constant(self.num_routing), self.bias,
    #                K.sum(inputs_hat, 1, keepdims=True)]
    #   _, self.bias, outputs = tf.while_loop(cond, body, loop_vars)

    def _combine():
        # Softmax the current bias into coupling coefficients, broadcast
        # them against inputs_hat, sum over input capsules and squash.
        coupling = K.softmax(self.bias)
        coupling = K.expand_dims(
            K.expand_dims(K.expand_dims(coupling, 2), 2), 0)
        return squash(K.sum(coupling * inputs_hat, 1, keepdims=True))

    # Routing algorithm V2: a plain Python loop. V2 and V1 both work
    # without much difference on performance.
    for _ in range(self.num_routing):
        outputs = _combine()
        agreement = K.sum(inputs_hat * outputs, [0, -2, -1])
        self.bias = K.update(self.bias, self.bias + agreement)

    # No-routing scenario: the prior bias will always be zero.
    if self.num_routing == 0:
        outputs = _combine()

    return K.reshape(outputs, [-1, self.num_capsule, self.dim_vector])
def call(self, x, mask=None):
    """Fold the statistics of `x` into the running mean/variance/count.

    The batch mean and variance are taken over axes (0, 1) and merged with
    the stored `_mean`, `_var` and `_count` using the parallel variance
    formula. The input itself passes through unchanged.

    Arguments:
        x: input tensor; its first two axes are treated as sample axes.
        mask: unused.

    Returns:
        `x + 0`.
    """
    sample_axes = (0, 1)
    n_batch = K.cast(K.prod(K.shape(x)[:2]), K.floatx())
    mean_batch = K.mean(x, axis=sample_axes)
    var_batch = K.var(x, axis=sample_axes)

    n_total = self._count + n_batch

    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    delta = mean_batch - self._mean
    sum_sq_running = self._var * self._count
    sum_sq_batch = var_batch * n_batch
    sum_sq_total = (sum_sq_running + sum_sq_batch +
                    K.square(delta) * self._count * n_batch / n_total)

    new_mean = self._mean + delta * n_batch / n_total
    new_var = sum_sq_total / n_total

    # Register the state updates on the graph.
    self.add_update([
        K.update(self._mean, new_mean),
        K.update(self._var, new_var),
        K.update(self._count, n_total),
    ])

    # Dummy addition to suppress a Keras warning.
    return x + 0
def conditional_update(cond, variable, new_value):
    '''Create an update of `variable` that only takes effect when cond is True.

    When `cond` is False the variable is assigned its own current value.
    The update op is appended to `self.updates`, and the selected value
    (`new_value` or `variable`) is returned.

    Note: K.update(x, x) is cheap, but K.update_add(x, K.zeros_like(x)) can
    be expensive.

    NOTE(review): `self` is not a parameter; presumably this helper is
    defined inside a method and closes over it -- confirm.
    '''
    selected = K.switch(cond, new_value, variable)
    assign_op = K.update(variable, selected)
    self.updates.append(assign_op)
    return selected
def get_updates(self, loss, params):
    """Build update ops for a gradient step followed by soft-thresholding.

    Each parameter has a float32 accumulator initialised from the
    parameter's current value. The accumulator takes a plain gradient
    step, and the parameter is set to the soft-thresholded accumulator.

    Arguments:
        loss: scalar loss tensor to differentiate.
        params: list of variables to optimize.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    accumulators = [
        K.variable(value=K.get_value(param), dtype='float32')
        for param in params
    ]
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    # Threshold: c * lr^(0.5 + mu) * (iterations + 1)^mu.
    step = K.cast(self.iterations, K.floatx()) + 1
    threshold = self.c * K.pow(lr, 0.5 + self.mu) * K.pow(step, self.mu)

    for param, grad, acc in zip(params, grads, accumulators):
        stepped = acc - lr * grad
        self.updates.append(K.update(acc, stepped))
        shrunk = K.softthreshold(stepped, threshold)
        self.updates.append(K.update(param, shrunk))
    return self.updates
def get_updates(self, params, constraints, loss):
    """Build update ops for a plain gradient step, printing the lr first.

    The momentum term is disabled: the value stored in each moment slot is
    just `-lr * g`, and the same value is added to the parameter.
    `constraints` is accepted but not applied, and the iteration counter
    is not incremented.

    Arguments:
        params: list of variables to optimize.
        constraints: unused.
        loss: scalar loss tensor to differentiate.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    print("lr", K.get_value(lr))

    # Moment slots, one per parameter, initialised to zero.
    moments = [K.zeros(K.get_variable_shape(param)) for param in params]
    self.weights = [self.iterations] + moments

    for param, grad, moment in zip(params, grads, moments):
        # Disabled variant: self.momentum * moment - lr * grad
        step = -1. * lr * grad
        self.updates.append(K.update(moment, step))
        self.updates.append(K.update(param, param + step))
    return self.updates
def merge_updates(updates):
    """Average repeated updates of the same variable.

    Updates are grouped by `variable_key(variable)`. A variable with one
    update keeps that value; a variable with several gets their mean.

    Arguments:
        updates: iterable of assignments accepted by `unpack_assignment`.

    Returns:
        A list with one `K.update` op per distinct variable.
    """
    merged_updates = {}
    for update in updates:
        variable, value = unpack_assignment(update)
        key = variable_key(variable)
        if key not in merged_updates:
            merged_updates[key] = [variable, []]
        merged_updates[key][1].append(value)

    ret = []
    for variable, values in merged_updates.values():
        n = len(values)
        if n == 1:
            # Use this variable's own collected value. The previous code
            # indexed `value`, the name left over from the loop above.
            ret.append(K.update(variable, values[0]))
        else:
            ret.append(K.update(variable, sum(values) / n))
    return ret
def get_updates(self, params, constraints, loss):
    """Build update ops for an Adam step with a warming momentum schedule.

    The per-parameter learning rate comes from
    `get_learing_rate(p, self.lr)`.

    Arguments:
        params: list of variables to optimize.
        constraints: mapping from parameter to a callable applied to its
            new value.
        loss: scalar loss tensor to differentiate.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    t = self.iterations + 1

    # Warming momentum schedule, as recommended in [2].
    mu_t = self.beta_1 * (
        1. - 0.5 * (K.pow(0.96, t * self.schedule_decay)))
    mu_next = self.beta_1 * (
        1. - 0.5 * (K.pow(0.96, (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * mu_t
    m_schedule_next = self.m_schedule * mu_t * mu_next
    self.updates.append((self.m_schedule, m_schedule_new))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # The following equations are given in [1].
        g_prime = g / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * g
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
        v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
        m_t_bar = (1. - mu_t) * g_prime + mu_next * m_t_prime

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        param_lr = get_learing_rate(p, self.lr)
        new_p = p - param_lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)

        # Apply the constraint registered for this parameter, if any.
        if p in constraints:
            new_p = constraints[p](new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates
def get_updates(self, params, constraints, loss):
    """Build update ops for an Adam step scaled by a loss-feedback term.

    The relative change between the current loss and the stored previous
    loss is clipped using `thl`/`thu`, smoothed with `beta_3` into
    `self.d`, and the resulting `d_t` multiplies the Adam denominator.
    `constraints` is accepted but not applied.

    Arguments:
        params: list of variables to optimize.
        constraints: unused.
        loss: scalar loss tensor to differentiate.

    Returns:
        The list of update ops (also stored in `self.updates`).
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = self.iterations + 1
    loss_prev = K.variable(0)

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]

    # Clip bounds for the loss ratio; which pair applies depends on whether
    # the loss went up or down.
    loss_went_up = K.greater(loss, loss_prev)
    ch_fact_lbound = K.switch(loss_went_up, 1+self.thl, 1/(1+self.thu))
    ch_fact_ubound = K.switch(loss_went_up, 1+self.thu, 1/(1+self.thl))

    loss_ch_fact = loss / loss_prev
    loss_ch_fact = K.switch(K.lesser(loss_ch_fact, ch_fact_lbound),
                            ch_fact_lbound, loss_ch_fact)
    loss_ch_fact = K.switch(K.greater(loss_ch_fact, ch_fact_ubound),
                            ch_fact_ubound, loss_ch_fact)

    # On the first step the raw loss is used and d_t is fixed to 1.
    after_first_step = K.greater(t, 1)
    loss_hat = K.switch(after_first_step, loss_prev * loss_ch_fact, loss)

    # Normalise the change by the smaller of the two loss values.
    d_den = K.switch(K.greater(loss_hat, loss_prev), loss_prev, loss_hat)
    relative_change = K.abs((loss_hat - loss_prev) / d_den)
    d_t = (self.beta_3 * self.d) + (1. - self.beta_3) * relative_change
    d_t = K.switch(after_first_step, d_t, 1.)
    self.updates.append(K.update(self.d, d_t))

    # Quantities shared by every parameter.
    m_correction = 1. - K.pow(self.beta_1, t)
    v_correction = 1. - K.pow(self.beta_2, t)
    decayed_lr = self.lr / (1. + (self.iterations * self.decay))

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        mhat_t = m_t / m_correction
        self.updates.append(K.update(m, m_t))

        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        vhat_t = v_t / v_correction
        self.updates.append(K.update(v, v_t))

        p_t = p - decayed_lr * mhat_t / ((K.sqrt(vhat_t) * d_t) + self.epsilon)
        self.updates.append(K.update(p, p_t))

    # Remember the clipped loss for the next step.
    self.updates.append(K.update(loss_prev, loss_hat))
    return self.updates
def WGAN_train(self, loss_function, D_lr, G_lr, clamp, lamda):
    """Build the Keras training functions for a WGAN.

    Arguments:
        loss_function: 'clip' for weight clipping or 'gradient_penalty'.
        D_lr: RMSprop learning rate for the discriminator.
        G_lr: RMSprop learning rate for the generator.
        clamp: clipping bound; weights are clipped to [-clamp, clamp]
            ('clip' mode only).
        lamda: gradient-penalty coefficient ('gradient_penalty' mode only).

    Returns:
        `(D_train, G_train, D_clamp)`; `D_clamp` is None in
        'gradient_penalty' mode.
    """
    assert loss_function in ('gradient_penalty', 'clip')

    x_real = Input(shape=self.image_size)
    fake_vectors = Input(shape=(self.random_vector_size,))
    x_fake = self.generator(fake_vectors)

    loss_real = K.mean(self.discriminator(x_real))
    loss_fake = K.mean(self.discriminator(x_fake))

    # Generator objective.
    g_loss = -loss_fake
    g_updates = RMSprop(lr=G_lr).get_updates(
        g_loss, self.generator.trainable_weights)
    G_train = K.function([fake_vectors], [g_loss], g_updates)

    d_weights = self.discriminator.trainable_weights

    if loss_function == 'clip':
        # Discriminator objective with weight clipping.
        d_loss = loss_fake - loss_real
        d_updates = RMSprop(lr=D_lr).get_updates(d_loss, d_weights)
        D_train = K.function([x_real, fake_vectors],
                             [loss_real, loss_fake], d_updates)

        clamp_lower = clamp * -1.
        clamp_upper = clamp
        clip_ops = [
            K.update(w, K.clip(w, clamp_lower, clamp_upper))
            for w in d_weights
        ]
        D_clamp = K.function([], [], clip_ops)
        return D_train, G_train, D_clamp

    # Discriminator objective with a gradient penalty, evaluated on a mix
    # of real and fake samples weighted by `e`.
    e = K.placeholder(shape=(None, 1, 1, 1))
    mixed_tensor = e * x_real + (1 - e) * x_fake
    x_mixed = Input(shape=self.image_size, tensor=mixed_tensor)
    mixed_grad = K.gradients(self.discriminator(x_mixed), [x_mixed])[0]
    # Per-sample norm: the batch axis is not reduced.
    mixed_grad_norm = K.sqrt(K.sum(K.square(mixed_grad), axis=[1, 2, 3]))
    gradient_penalty = K.mean(K.square(mixed_grad_norm - 1))

    d_loss = loss_fake - loss_real + lamda * gradient_penalty
    d_updates = RMSprop(lr=D_lr).get_updates(d_loss, d_weights)
    D_train = K.function([x_real, fake_vectors, e],
                         [loss_real, loss_fake], d_updates)
    return D_train, G_train, None
def unroll(updates, uupdates, depth):
    """Rewrite the values in `updates` by substituting `uupdates` `depth` times.

    Arguments:
        updates: assignments whose values are rewritten.
        uupdates: assignments giving the variable -> value substitution map.
        depth: number of times `clone_replace` is applied to each value.

    Returns:
        A list of `K.update` ops, one per assignment in `updates`.
    """
    substitutions = dict(unpack_assignments(uupdates))
    assignments = unpack_assignments(updates)
    for _ in range(depth):
        assignments = [
            (target, clone_replace(value, substitutions))
            for target, value in assignments
        ]
    return [K.update(target, value) for target, value in assignments]