def train_optimizer(self):
    with tf.variable_scope('train_step'):
        self.global_step_ = tf.Variable(0, name='global_step_', trainable=False)
        if self.optimizer_ == 'Adam':
            opt = AdamOptimizer(learning_rate=self.learning_rate_ph_)
        elif self.optimizer_ == 'Adagrad':
            opt = AdagradOptimizer(learning_rate=self.learning_rate_ph_)
        elif self.optimizer_ == 'Adadelta':
            opt = AdadeltaOptimizer(learning_rate=self.learning_rate_ph_)
        elif self.optimizer_ == 'RMSProp':
            opt = RMSPropOptimizer(learning_rate=self.learning_rate_ph_)
        elif self.optimizer_ == 'Momentum':
            opt = MomentumOptimizer(learning_rate=self.learning_rate_ph_, momentum=0.9)
        else:
            opt = GradientDescentOptimizer(learning_rate=self.learning_rate_ph_)

        """
        # An alternative way to clip the gradients:
        # get all trainable variables (tf_variables)
        tf_variables = tf.trainable_variables()
        # compute the gradients of the loss w.r.t. those variables in advance
        tf_grads = tf.gradients(self.loss_, tf_variables)
        # clip all gradients jointly by the ratio of their global norm
        tf_grads, _ = tf.clip_by_global_norm(tf_grads, self.clip_grad_)
        # apply the clipped gradients to the trainable variables
        self.train_optimizer_ = opt.apply_gradients(zip(tf_grads, tf_variables))
        """

        # Get the variables and compute the gradients in advance
        grads_and_vars = opt.compute_gradients(self.loss_)
        # Clip each gradient element-wise to [-clip_grad_, clip_grad_]
        grads_and_vars_clip = [
            [tf.clip_by_value(g, -self.clip_grad_, self.clip_grad_), v]
            for g, v in grads_and_vars
        ]
        # Apply the clipped gradient values
        self.train_optimizer_ = opt.apply_gradients(
            grads_and_vars_clip, global_step=self.global_step_)
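
# A minimal, hedged sketch of how this graph-mode train op might be driven from a
# tf.Session. It only assumes the attributes defined above (learning_rate_ph_,
# loss_, train_optimizer_, global_step_); the run_training wrapper, the
# feeds_per_step iterable of feed dicts, and the decay schedule are hypothetical.
import tensorflow as tf

def run_training(model, feeds_per_step, base_lr=1e-3, decay=0.95):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step, feed_dict in enumerate(feeds_per_step):
            # Feed a decayed learning rate through the placeholder each step
            lr = base_lr * (decay ** (step // 1000))
            feed_dict[model.learning_rate_ph_] = lr
            _, loss, global_step = sess.run(
                [model.train_optimizer_, model.loss_, model.global_step_],
                feed_dict=feed_dict)
            if global_step % 100 == 0:
                print('step {}: loss = {:.4f}, lr = {:.5f}'.format(
                    global_step, loss, lr))
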
def neural_transfer(content_image,
                    style_image,
                    output_dirpath,
                    epochs=1000,
                    epoch_length=100,
                    alpha=1,
                    beta=10):
    """
    Main function to execute the neural style transfer algorithm using
    TensorFlow eager execution.
    """
    tf.enable_eager_execution()
    optimizer = AdamOptimizer(learning_rate=0.003)

    # Layers for loss calculations
    content_layers = ['block4_conv2']
    style_layers = [
        'block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1',
        'block5_conv1'
    ]
    model = init_model(content_layers, style_layers)

    # Get target feature-map tensors from the content and style images
    content_featuremaps = model(np.expand_dims(content_image,
                                               axis=0))[:len(content_layers)]
    style_featuremaps = model(np.expand_dims(style_image,
                                             axis=0))[len(content_layers):]

    # Starting point of the combined image
    image_zero = np.expand_dims(np.random.random(np.shape(content_image)), axis=0)
    combined_image_tensor = tf.Variable(image_zero,
                                        name='combined_image_tensor',
                                        dtype=tf.float32)

    for epoch in range(epochs):
        print('\nEpoch: ', epoch)

        # Convert the tensor to an array, then save the image to the output
        # directory for viewing
        combined_image = np.squeeze(combined_image_tensor.numpy(), axis=0)
        output_filepath = os.path.join(output_dirpath, 'epoch_{}.png'.format(epoch))
        cv2.imwrite(output_filepath, combined_image * 255)

        content_losses_array_avg = np.zeros(len(content_layers), dtype=np.float32)
        style_losses_array_avg = np.zeros(len(style_layers), dtype=np.float32)

        for _ in tqdm(range(epoch_length)):
            # Operations here are recorded on the GradientTape for backpropagation
            with tf.GradientTape() as tape:
                combination_featuremaps = model(combined_image_tensor)
                total_loss, content_losses, style_losses = calc_total_loss(
                    content_featuremaps, style_featuremaps,
                    combination_featuremaps, alpha, beta)

            gradients = tape.gradient(total_loss, combined_image_tensor)
            optimizer.apply_gradients([(gradients, combined_image_tensor)])

            # Ensure the output image/tensor stays bounded between 0 and 1
            clipped = tf.clip_by_value(combined_image_tensor,
                                       clip_value_min=0,
                                       clip_value_max=1)
            combined_image_tensor.assign(clipped)

            # Record the average losses for the epoch
            content_losses_array_avg += content_losses / epoch_length
            style_losses_array_avg += style_losses / epoch_length

        # Display individual losses for analysis
        print('Content loss: ', content_losses_array_avg)
        print('Style loss: ', style_losses_array_avg)
        print('Total loss: ',
              np.sum(style_losses_array_avg) + np.sum(content_losses_array_avg))
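
# A hedged usage sketch for the eager-mode style-transfer loop above. The file
# paths and the run_style_transfer wrapper are hypothetical; images are assumed
# to be float arrays in [0, 1], matching the 0-1 clipping and the `* 255` on
# write inside neural_transfer.
import os
import cv2
import numpy as np

def run_style_transfer(content_path, style_path, output_dirpath='output'):
    content = cv2.imread(content_path).astype(np.float32) / 255.0
    style = cv2.imread(style_path).astype(np.float32) / 255.0
    # Resize the style image to the content image's spatial size so both pass
    # through the same feature-extraction model.
    style = cv2.resize(style, (content.shape[1], content.shape[0]))
    os.makedirs(output_dirpath, exist_ok=True)
    neural_transfer(content, style, output_dirpath,
                    epochs=100, epoch_length=50, alpha=1, beta=10)
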
class ExponentialMappingOptimizer(optimizer.Optimizer):

    def __init__(self,
                 lr=0.1,
                 use_locking=False,
                 name="ExponentialMappingOptimizer"):
        super(ExponentialMappingOptimizer, self).__init__(use_locking, name)
        self.lr = lr
        self.euclidean_optimizer = AdamOptimizer()

    def _apply_dense(self, grad, var):
        assert False
        spacial_grad = grad[..., :-1]
        t_grad = -1 * grad[..., -1:]

        ambient_grad = tf.concat([spacial_grad, t_grad], axis=-1)
        tangent_grad = project_onto_tangent_space(var, ambient_grad)
        exp_map = exponential_mapping(var, -self.lr * tangent_grad)

        return tf.assign(var, exp_map)

    def _apply_sparse(self, grad, var):
        if "hyperbolic" in var.name:
            indices = grad.indices
            values = grad.values
            p = tf.gather(var, indices, name="gather_apply_sparse")

            spacial_grad = values[..., :-1]
            t_grad = -1 * values[..., -1:]

            ambient_grad = K.concatenate([spacial_grad, t_grad], axis=-1)
            tangent_grad = project_onto_tangent_space(p, ambient_grad)
            exp_map = exponential_mapping(p, -self.lr * tangent_grad)

            return tf.scatter_update(ref=var,
                                     indices=indices,
                                     updates=exp_map,
                                     name="scatter_update")
        else:
            # Euclidean update using the Adam optimizer
            return self.euclidean_optimizer.apply_gradients([(grad, var)])


# class MyAdamOptimizer(optimizer.Optimizer):
#     """Optimizer that implements the Adam algorithm.
#
#     See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
#     ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
#     """
#
#     def __init__(self,
#                  learning_rate=1e-3,
#                  beta1=0.9,
#                  beta2=0.999,
#                  epsilon=1e-8,
#                  use_locking=False,
#                  name="Adam"):
#         r"""Construct a new Adam optimizer.
#
#         Initialization:
#         $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
#         $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
#         $$t := 0 \text{(Initialize timestep)}$$
#
#         The update rule for `variable` with gradient `g` uses an optimization
#         described at the end of section 2 of the paper:
#
#         $$t := t + 1$$
#         $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
#         $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
#         $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
#         $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
#
#         The default value of 1e-8 for epsilon might not be a good default in
#         general. For example, when training an Inception network on ImageNet a
#         current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
#         formulation just before Section 2.1 of the Kingma and Ba paper rather than
#         the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
#         hat" in the paper.
#
#         The sparse implementation of this algorithm (used when the gradient is an
#         IndexedSlices object, typically because of `tf.gather` or an embedding
#         lookup in the forward pass) does apply momentum to variable slices even if
#         they were not used in the forward pass (meaning they have a gradient equal
#         to zero). Momentum decay (beta1) is also applied to the entire momentum
#         accumulator. This means that the sparse behavior is equivalent to the dense
#         behavior (in contrast to some momentum implementations which ignore momentum
#         unless a variable slice was actually used).
#
#         Args:
#           learning_rate: A Tensor or a floating point value. The learning rate.
#           beta1: A float value or a constant float tensor. The exponential decay
#             rate for the 1st moment estimates.
#           beta2: A float value or a constant float tensor. The exponential decay
#             rate for the 2nd moment estimates.
#           epsilon: A small constant for numerical stability.
#             This epsilon is "epsilon hat" in the Kingma and Ba paper (in the
#             formula just before Section 2.1), not the epsilon in Algorithm 1 of
#             the paper.
#           use_locking: If True use locks for update operations.
#           name: Optional name for the operations created when applying gradients.
#             Defaults to "Adam".
#
#         @compatibility(eager)
#         When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
#         `epsilon` can each be a callable that takes no arguments and returns the
#         actual value to use. This can be useful for changing these values across
#         different invocations of optimizer functions.
#         @end_compatibility
#         """
#         super(MyAdamOptimizer, self).__init__(use_locking, name)
#         self._lr = learning_rate
#         self._beta1 = beta1
#         self._beta2 = beta2
#         self._epsilon = epsilon
#
#         # Tensor versions of the constructor arguments, created in _prepare().
#         self._lr_t = None
#         self._beta1_t = None
#         self._beta2_t = None
#         self._epsilon_t = None
#
#     def _get_beta_accumulators(self):
#         with ops.init_scope():
#             if context.executing_eagerly():
#                 graph = None
#             else:
#                 graph = ops.get_default_graph()
#             return (self._get_non_slot_variable("beta1_power", graph=graph),
#                     self._get_non_slot_variable("beta2_power", graph=graph))
#
#     def _create_slots(self, var_list):
#         # Create the beta1 and beta2 accumulators on the same device as the first
#         # variable. Sort the var_list to make sure this device is consistent across
#         # workers (these need to go on the same PS, otherwise some updates are
#         # silently ignored).
#         first_var = min(var_list, key=lambda x: x.name)
#         self._create_non_slot_variable(
#             initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
#         self._create_non_slot_variable(
#             initial_value=self._beta2, name="beta2_power", colocate_with=first_var)
#
#         # Create slots for the first and second moments.
#         for v in var_list:
#             self._zeros_slot(v, "m", self._name)
#             self._zeros_slot(v, "v", self._name)
#
#     def _prepare(self):
#         lr = self._call_if_callable(self._lr)
#         beta1 = self._call_if_callable(self._beta1)
#         beta2 = self._call_if_callable(self._beta2)
#         epsilon = self._call_if_callable(self._epsilon)
#
#         self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
#         self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
#         self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
#         self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
#
#     def _apply_dense(self, grad, var):
#         assert False
#         m = self.get_slot(var, "m")
#         v = self.get_slot(var, "v")
#         beta1_power, beta2_power = self._get_beta_accumulators()
#         return training_ops.apply_adam(
#             var,
#             m,
#             v,
#             math_ops.cast(beta1_power, var.dtype.base_dtype),
#             math_ops.cast(beta2_power, var.dtype.base_dtype),
#             math_ops.cast(self._lr_t, var.dtype.base_dtype),
#             math_ops.cast(self._beta1_t, var.dtype.base_dtype),
#             math_ops.cast(self._beta2_t, var.dtype.base_dtype),
#             math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
#             grad,
#             use_locking=self._use_locking).op
#
#     def _resource_apply_dense(self, grad, var):
#         assert False
#         m = self.get_slot(var, "m")
#         v = self.get_slot(var, "v")
#         beta1_power, beta2_power = self._get_beta_accumulators()
#         return training_ops.resource_apply_adam(
#             var.handle,
#             m.handle,
#             v.handle,
#             math_ops.cast(beta1_power, grad.dtype.base_dtype),
#             math_ops.cast(beta2_power, grad.dtype.base_dtype),
#             math_ops.cast(self._lr_t, grad.dtype.base_dtype),
#             math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
#             math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
#             math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
#             grad,
#             use_locking=self._use_locking)
#
#     def _apply_sparse_shared(self, grad, var, indices, scatter_add):
#         beta1_power, beta2_power = self._get_beta_accumulators()
#         beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
#         beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
#         lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
#         beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
#         beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
#         epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
#         lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
#         grad = tf.verify_tensor_all_finite(grad, "fail in grad")
#         # if "hyperbolic" in var.name:
#         #     grad = K.concatenate([grad[:, :-1], -grad[:, -1:]], axis=-1)
#
#         # m_t = beta1 * m + (1 - beta1) * g_t
#         m = self.get_slot(var, "m")
#         m_scaled_g_values = grad * (1 - beta1_t)
#         m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
#         with ops.control_dependencies([m_t]):
#             m_t = scatter_add(m, indices, m_scaled_g_values)
#
#         # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
#         v = self.get_slot(var, "v")
#         v_scaled_g_values = (grad * grad) * (1 - beta2_t)
#         v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
#         with ops.control_dependencies([v_t]):
#             v_t = scatter_add(v, indices, v_scaled_g_values)
#         v_sqrt = math_ops.sqrt(K.maximum(v_t, 0.))
#
#         if "hyperbolic" in var.name:
#             m_t = tf.verify_tensor_all_finite(m_t, "fail in m_t")
#             v_sqrt = tf.verify_tensor_all_finite(v_sqrt, "fail in v_sqrt")
#             gr = m_t / (v_sqrt + epsilon_t)
#             gr = tf.verify_tensor_all_finite(gr, "fail in gr")
#             gr = K.concatenate([gr[..., :-1], -gr[..., -1:]], axis=-1)
#             gr_tangent = project_onto_tangent_space(var, gr)
#             gr_tangent = tf.verify_tensor_all_finite(gr_tangent, "fail in tangent")
#             exp_map = exponential_mapping(var, -lr * gr_tangent)
#             exp_map = tf.verify_tensor_all_finite(exp_map, "fail in exp_map")
#             var_update = state_ops.assign(
#                 var,
#                 exp_map,
#                 use_locking=self._use_locking)
#         else:
#             var_update = state_ops.assign_sub(
#                 var,
#                 lr * m_t / (v_sqrt + epsilon_t),
#                 use_locking=self._use_locking)
#         return control_flow_ops.group(*[var_update, m_t, v_t])
#
#     def _apply_sparse(self, grad, var):
#         return self._apply_sparse_shared(
#             grad.values,
#             var,
#             grad.indices,
#             lambda x, i, v: state_ops.scatter_add(
#                 x, i, v, use_locking=self._use_locking))
#
#     def _resource_scatter_add(self, x, i, v):
#         with ops.control_dependencies(
#                 [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
#             return x.value()
#
#     def _resource_apply_sparse(self, grad, var, indices):
#         return self._apply_sparse_shared(grad, var, indices,
#                                          self._resource_scatter_add)
#
#     def _finish(self, update_ops, name_scope):
#         # Update the power accumulators.
#         with ops.control_dependencies(update_ops):
#             beta1_power, beta2_power = self._get_beta_accumulators()
#             with ops.colocate_with(beta1_power):
#                 update_beta1 = beta1_power.assign(
#                     beta1_power * self._beta1_t, use_locking=self._use_locking)
#                 update_beta2 = beta2_power.assign(
#                     beta2_power * self._beta2_t, use_locking=self._use_locking)
#             return control_flow_ops.group(
#                 *update_ops + [update_beta1, update_beta2], name=name_scope)
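
# The hyperbolic optimizers above call project_onto_tangent_space() and
# exponential_mapping() without defining them. Below is a minimal sketch of the
# standard hyperboloid (Lorentz) model versions, offered only as an assumption
# about what those helpers compute; the project's actual implementations may
# differ in conventions and numerical safeguards. Consistent with the sign flip
# on grad[..., -1:] above, the last coordinate is treated as the time-like one.
import tensorflow as tf

def minkowski_dot(x, y):
    # <x, y>_L = sum_i x_i * y_i  -  x_t * y_t, with the last coordinate time-like
    return (tf.reduce_sum(x[..., :-1] * y[..., :-1], axis=-1, keepdims=True)
            - x[..., -1:] * y[..., -1:])

def project_onto_tangent_space(p, u):
    # Project an ambient vector u onto the tangent space at p on the hyperboloid,
    # using <p, p>_L = -1:  proj_p(u) = u + <p, u>_L * p
    return u + minkowski_dot(p, u) * p

def exponential_mapping(p, v, eps=1e-7):
    # exp_p(v) = cosh(||v||_L) * p + sinh(||v||_L) * v / ||v||_L,
    # where ||v||_L = sqrt(<v, v>_L) for (space-like) tangent vectors.
    norm = tf.sqrt(tf.maximum(minkowski_dot(v, v), eps))
    return tf.cosh(norm) * p + tf.sinh(norm) * v / norm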