def _network(self, observ, length=None, state=None, reuse=True):
    """Build the network graph for a batch of observation sequences.

    The cell produced by `self._config.network` is unrolled with
    `tf.nn.dynamic_rnn`; its output is unpacked into the policy mean, the log
    standard deviation and the value estimate. Each of these is passed through
    `tf.check_numerics` before the policy distribution is constructed.

    Args:
      observ: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.
      reuse: Python boolean whether to reuse previous variables.

    Returns:
      `_NetworkOutput` tuple of (policy, mean, logstd, value, state), where the
      policy is a `MultivariateNormalDiag` built from mean and exp(logstd).
    """
    with tf.variable_scope('network', reuse=reuse):
        observ = tf.convert_to_tensor(observ)
        # Only place the unrolled network on the GPU when it is both requested
        # in the config and reported as available.
        gpu_enabled = self._config.use_gpu and utility.available_gpus()
        device_name = '/gpu:0' if gpu_enabled else '/cpu:0'
        with tf.device(device_name):
            observ = tf.check_numerics(observ, 'observ')
            action_size = self._batch_env.action.shape[1].value
            cell = self._config.network(action_size)
            outputs, state = tf.nn.dynamic_rnn(
                cell, observ, length, state, tf.float32, swap_memory=True)
            mean, logstd, value = outputs
        mean = tf.check_numerics(mean, 'mean')
        logstd = tf.check_numerics(logstd, 'logstd')
        value = tf.check_numerics(value, 'value')
        stddev = tf.exp(logstd)
        policy = tf.contrib.distributions.MultivariateNormalDiag(mean, stddev)
        return _NetworkOutput(policy, mean, logstd, value, state)
def simulate(self, action):
    """Step the environment once and store the outcome in variables.

    The step itself runs through `tf.py_func`; its observation, reward and
    done flag are written to the corresponding variables of this object, and
    the step counter is incremented.

    Args:
      action: Tensor holding the action to apply.

    Returns:
      Operation grouping all variable updates.
    """
    def _step(act):
        # Only the first three step results are used.
        return self._env.step(act)[:3]

    with tf.name_scope('environment/simulate'):
        if action.dtype in (tf.float16, tf.float32, tf.float64):
            action = tf.check_numerics(action, 'action')
        observ_dtype = self._parse_dtype(self._env.observation_space)
        output_dtypes = [observ_dtype, tf.float32, tf.bool]
        observ, reward, done = tf.py_func(
            _step, [action], output_dtypes, name='step')
        observ = tf.check_numerics(observ, 'observ')
        reward = tf.check_numerics(reward, 'reward')
        updates = [
            self._observ.assign(observ),
            self._action.assign(action),
            self._reward.assign(reward),
            self._done.assign(done),
            self._step.assign_add(1),
        ]
        return tf.group(*updates)
def testGradient(self):
    """Compare analytic and numeric `reduce_prod` gradients; check NaN at zero."""
    shape = [2, 3, 4, 2]
    # (reduction axes, shape handed to the gradient checker for the output).
    cases = (
        ([], [2, 3, 4, 2]),
        ([1, 2], [2, 2]),
        ([0, 1, 2, 3], [1]),
    )

    # NOTE(kearnes): divide by 20 so product is a reasonable size
    values = np.arange(1.0, 49.0).reshape(shape).astype(np.float32) / 20.
    with self.test_session():
        tensor = tf.convert_to_tensor(values)
        for axes, out_shape in cases:
            product = tf.reduce_prod(tensor, axes)
            analytic, numeric = gradient_checker.ComputeGradient(
                tensor, shape, product, out_shape,
                x_init_value=values, delta=1)
            self.assertAllClose(analytic, numeric, rtol=1e-3, atol=1e-3)

    # NOTE(kearnes): the current gradient calculation gives NaNs for 0 inputs
    values = np.arange(0.0, 48.0).reshape(shape).astype(np.float32) / 20.
    with self.test_session():
        tensor = tf.convert_to_tensor(values)
        product = tf.reduce_prod(tensor, [])
        analytic, _ = gradient_checker.ComputeGradient(
            tensor, shape, product, [2, 3, 4, 2],
            x_init_value=values, delta=1)
        with self.assertRaisesOpError("Tensor had NaN values"):
            nan_check = tf.check_numerics(
                analytic, message="_ProdGrad NaN test")
            nan_check.op.run()
def batch_norm(input_, dim, name, scale=True, train=True, epsilon=1e-8, decay=.1, axes=[0], bn_lag=DEFAULT_BN_LAG): """Batch normalization.""" # create variables with tf.variable_scope(name): var = variable_on_cpu( "var", [dim], tf.constant_initializer(1.), trainable=False) mean = variable_on_cpu( "mean", [dim], tf.constant_initializer(0.), trainable=False) step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False) if scale: gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.)) beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.)) # choose the appropriate moments if train: used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm") cur_mean, cur_var = used_mean, used_var if bn_lag > 0.: used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean)) used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var)) used_mean /= (1. - bn_lag**(step + 1)) used_var /= (1. - bn_lag**(step + 1)) else: used_mean, used_var = mean, var cur_mean, cur_var = used_mean, used_var # normalize res = (input_ - used_mean) / tf.sqrt(used_var + epsilon) # de-normalize if scale: res *= gamma res += beta # update variables if train: with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]): with ops.colocate_with(mean): new_mean = tf.assign_sub( mean, tf.check_numerics(decay * (mean - cur_mean), "NaN in moving mean.")) with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]): with ops.colocate_with(var): new_var = tf.assign_sub( var, tf.check_numerics(decay * (var - cur_var), "NaN in moving variance.")) with tf.name_scope(name, "IncrementTime", [step]): with ops.colocate_with(step): new_step = tf.assign_add(step, 1.) res += 0. * new_mean * new_var * new_step return res
def recurrent_gaussian(
        config, action_space, observations, length, state=None):
    """Recurrent Gaussian policy with a separate feed-forward value network.

    The policy stacks fully connected ReLU layers, a GRU cell as its last
    layer, and a tanh output layer producing the mean action. The standard
    deviation is the softplus of a learned variable that is independent of
    the observations. The value network is a stack of fully connected layers
    on the flattened observations.

    Args:
      config: Configuration object.
      action_space: Action space of the environment.
      observations: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.

    Raises:
      ValueError: Unexpected action space.

    Returns:
      Attribute dictionary containing the policy, value, and state.
    """
    if not isinstance(action_space, gym.spaces.Box):
        raise ValueError('Network expects continuous actions.')
    if len(action_space.shape) != 1:
        raise ValueError('Network only supports 1D action vectors.')
    action_size = action_space.shape[0]

    output_weights_init = tf.contrib.layers.variance_scaling_initializer(
        factor=config.init_output_factor)
    # Inverse softplus, so that softplus(variable) starts at config.init_std.
    raw_std_init = tf.constant_initializer(
        np.log(np.exp(config.init_std) - 1))
    cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])

    # Collapse every axis after (batch, time) into one feature axis.
    batch_size = tf.shape(observations)[0]
    num_steps = tf.shape(observations)[1]
    feature_size = functools.reduce(
        operator.mul, observations.shape.as_list()[2:], 1)
    flat_observations = tf.reshape(
        observations, [batch_size, num_steps, feature_size])

    with tf.variable_scope('policy'):
        hidden = flat_observations
        for units in config.policy_layers[:-1]:
            hidden = tf.contrib.layers.fully_connected(
                hidden, units, tf.nn.relu)
        hidden, state = tf.nn.dynamic_rnn(
            cell, hidden, length, state, tf.float32)
        mean = tf.contrib.layers.fully_connected(
            hidden, action_size, tf.tanh,
            weights_initializer=output_weights_init)
        raw_std = tf.get_variable(
            'before_softplus_std', mean.shape[2:], tf.float32, raw_std_init)
        std = tf.nn.softplus(raw_std)
        # Broadcast the shared std vector over the batch and time axes.
        expanded_std = std[None, None]
        multiples = [tf.shape(mean)[0], tf.shape(mean)[1]]
        multiples = multiples + [1] * (mean.shape.ndims - 2)
        std = tf.tile(expanded_std, multiples)

    with tf.variable_scope('value'):
        hidden = flat_observations
        for units in config.value_layers:
            hidden = tf.contrib.layers.fully_connected(
                hidden, units, tf.nn.relu)
        value = tf.contrib.layers.fully_connected(hidden, 1, None)[..., 0]

    mean = tf.check_numerics(mean, 'mean')
    std = tf.check_numerics(std, 'std')
    value = tf.check_numerics(value, 'value')
    policy = CustomKLDiagNormal(mean, std)
    return agents.tools.AttrDict(policy=policy, value=value, state=state)
def batch_norm_log_diff(input_, dim, name, train=True, epsilon=1e-8,
                        decay=.1, axes=[0], reuse=None,
                        bn_lag=DEFAULT_BN_LAG):
    """Batch normalization with corresponding log determinant Jacobian.

    Unlike a plain normalization helper, this function does not normalize
    `input_` itself; it returns the mean and the (epsilon-padded) variance to
    use, and in training mode updates the stored running statistics.

    Args:
      input_: Tensor whose moments are computed.
      dim: Length of the stored `var` and `mean` vectors.
      name: Variable scope under which the variables live.
      train: If true, use batch moments and update the running statistics;
        otherwise return the stored statistics.
      epsilon: Constant added to the returned variance.
      decay: Step size of the running-statistics update.
      axes: Axes passed to `tf.nn.moments` and `stable_var`.
        NOTE(review): mutable default argument; it is never mutated here.
      reuse: Whether to reuse existing variables; defaults to `not train`.
      bn_lag: When positive (training only), the variance is recomputed with
        `stable_var`, the moments are blended with the stored ones and
        divided by `1 - bn_lag**(step + 1)`.

    Returns:
      Tuple `(used_mean, used_var)`; `used_var` already includes `epsilon`.
    """
    if reuse is None:
        reuse = not train
    # create variables
    with tf.variable_scope(name) as scope:
        if reuse:
            scope.reuse_variables()
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            # Replace the variance from tf.nn.moments by the stable_var
            # estimate; the running update below then tracks that value.
            used_var = stable_var(input_=input_, mean=used_mean, axes=axes)
            cur_var = used_var
            # Blend with the stored statistics (no gradient into them) and
            # apply the step-dependent correction factor.
            used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(
                        decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        # Zero-weighted product of the assign results: makes the returned
        # variance depend on the update ops without changing its value.
        used_var += 0. * new_mean * new_var * new_step
    used_var += epsilon

    return used_mean, used_var
def antenna_jones(lm, stokes, alpha, ref_freq):
    """Build the per-antenna jones terms for a set of sources.

    `lm`, `stokes` and `alpha` are the source variables. The result combines
    the square root of the brightness matrix, the complex phase, the feed
    rotation and the beam term.

    Returns:
      Tuple of (antenna jones tensor, sign of the brightness).
    """
    phase_msg = ("Check that '1 - l**2 - m**2 >= 0' holds "
                 "for all your lm coordinates. This is required "
                 "for 'n = sqrt(1 - l**2 - m**2) - 1' "
                 "to be finite.")
    bsqrt_msg = ("Check that your stokes parameters "
                 "satisfy I**2 >= Q**2 + U**2 + V**2. "
                 "Montblanc performs a cholesky decomposition "
                 "of the brightness matrix and the above must "
                 "hold for this to produce valid values.")

    # Complex phase, with nan/inf checks on both components.
    cplx_phase = rime.phase(lm, D.uvw, D.frequency, CT=CT)
    phase_real = tf.check_numerics(tf.real(cplx_phase), phase_msg)
    phase_imag = tf.check_numerics(tf.imag(cplx_phase), phase_msg)

    # Square root of the brightness matrix and its sign, again with checks.
    bsqrt, sgn_brightness = rime.b_sqrt(
        stokes, alpha, D.frequency, ref_freq,
        CT=CT, polarisation_type=polarisation_type)
    bsqrt_real = tf.check_numerics(tf.real(bsqrt), bsqrt_msg)
    bsqrt_imag = tf.check_numerics(tf.imag(bsqrt), bsqrt_msg)

    # Direction dependent effects from the beam.
    ejones = rime.e_beam(lm, D.frequency,
                         D.pointing_errors, D.antenna_scaling,
                         beam_sin, beam_cos,
                         D.beam_extents, D.beam_freq_map, D.ebeam)

    # The check ops above are created but intentionally not enforced for now:
    # the control-dependency list that is actually used stays empty.
    disabled_checks = [phase_real, phase_imag, bsqrt_real, bsqrt_imag]
    deps = []

    # Combine brightness square root, complex phase, feed rotation and beam.
    with tf.control_dependencies(deps):
        jones = rime.create_antenna_jones(
            bsqrt, cplx_phase, feed_rotation, ejones, FT=FT)
        return jones, sgn_brightness
def recurrent_gaussian(
        config, action_size, observations, length, state=None):
    """Recurrent Gaussian policy with a separate feed-forward value network.

    The policy stacks fully connected ReLU layers, a GRU cell as its last
    layer, and a tanh output layer producing the mean action. The log
    standard deviation is a learned variable independent of the observations.
    The value network is a stack of fully connected layers on the flattened
    observations.

    Args:
      config: Configuration object.
      action_size: Length of the action vector.
      observations: Sequences of observations.
      length: Batch of sequence lengths.
      state: Batch of initial recurrent states.

    Returns:
      NetworkOutput tuple.
    """
    mean_weights_initializer = (
        tf.contrib.layers.variance_scaling_initializer(
            factor=config.init_mean_factor))
    logstd_initializer = tf.random_normal_initializer(
        config.init_logstd, 1e-10)
    cell = tf.contrib.rnn.GRUBlockCell(config.policy_layers[-1])

    # Collapse every axis after (batch, time) into one feature axis.
    batch_size = tf.shape(observations)[0]
    num_steps = tf.shape(observations)[1]
    feature_size = functools.reduce(
        operator.mul, observations.shape.as_list()[2:], 1)
    flat_observations = tf.reshape(
        observations, [batch_size, num_steps, feature_size])

    with tf.variable_scope('policy'):
        hidden = flat_observations
        for units in config.policy_layers[:-1]:
            hidden = tf.contrib.layers.fully_connected(
                hidden, units, tf.nn.relu)
        hidden, state = tf.nn.dynamic_rnn(
            cell, hidden, length, state, tf.float32)
        mean = tf.contrib.layers.fully_connected(
            hidden, action_size, tf.tanh,
            weights_initializer=mean_weights_initializer)
        logstd = tf.get_variable(
            'logstd', mean.shape[2:], tf.float32, logstd_initializer)
        # Broadcast the shared logstd vector over the batch and time axes.
        expanded_logstd = logstd[None, None]
        multiples = [tf.shape(mean)[0], tf.shape(mean)[1]]
        multiples = multiples + [1] * (mean.shape.ndims - 2)
        logstd = tf.tile(expanded_logstd, multiples)

    with tf.variable_scope('value'):
        hidden = flat_observations
        for units in config.value_layers:
            hidden = tf.contrib.layers.fully_connected(
                hidden, units, tf.nn.relu)
        value = tf.contrib.layers.fully_connected(hidden, 1, None)[..., 0]

    mean = tf.check_numerics(mean, 'mean')
    logstd = tf.check_numerics(logstd, 'logstd')
    value = tf.check_numerics(value, 'value')
    stddev = tf.exp(logstd)
    policy = tf.contrib.distributions.MultivariateNormalDiag(mean, stddev)
    return NetworkOutput(policy, mean, logstd, value, state)
def transform(self, value):
    """Normalize a single value or a batch of values.

    Depending on the flags stored on this object, the value is centered by
    the current mean, divided by the current standard deviation, and clipped.

    Args:
      value: Batch or single value tensor.

    Returns:
      Normalized batch or single value tensor.
    """
    def _current_scale():
        return self._std() + 1e-8

    def _unit_scale():
        return tf.ones_like(self._var_sum)

    with tf.name_scope(self._name + '/transform'):
        # A value with the same rank as the stored mean has no batch axis.
        is_single = value.shape.ndims == self._mean.shape.ndims
        if is_single:
            value = value[None, ...]
        if self._center:
            value -= self._mean[None, ...]
        if self._scale:
            # We cannot scale before seeing at least two samples.
            divisor = tf.cond(self._count > 1, _current_scale, _unit_scale)
            value /= divisor[None]
        if self._clip:
            value = tf.clip_by_value(value, -self._clip, self._clip)
        if is_single:
            # Drop the batch axis that was added above.
            value = value[0]
        return tf.check_numerics(value, 'value')
def _value_loss(self, observ, reward, length):
    """Compute the loss function for the value baseline.

    The loss is half the squared difference between the discounted empirical
    return and the value predicted by the network, masked by sequence length
    and averaged. A histogram and a scalar summary are produced alongside.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('value_loss'):
        predicted = self._network(observ, length).value
        empirical = utility.discounted_return(
            reward, length, self._config.discount)
        error = empirical - predicted
        per_step_loss = 0.5 * self._mask(error**2, length)
        summaries = [
            tf.summary.histogram('value_loss', per_step_loss),
            tf.summary.scalar(
                'avg_value_loss', tf.reduce_mean(per_step_loss)),
        ]
        summary = tf.summary.merge(summaries)
        mean_loss = tf.reduce_mean(per_step_loss)
        return tf.check_numerics(mean_loss, 'value_loss'), summary
def perform(self, observ):
    """Compute a batch of actions and a summary for a batch of observations.

    During training the action is sampled from the policy, otherwise the
    policy mean is used. The new recurrent state, the action, the mean and the
    log standard deviation are stored before the action is returned.

    Args:
      observ: Tensor of a batch of observations for all agents.

    Returns:
      Tuple of action batch tensor and summary tensor.
    """
    with tf.name_scope('perform/'):
        observ = self._observ_filter.transform(observ)
        # Each observation is fed as a sequence of length one.
        network = self._network(
            observ[:, None], tf.ones(observ.shape[0]), self._last_state)

        def _policy_mean():
            return network.mean

        action = tf.cond(
            self._is_training, network.policy.sample, _policy_mean)
        logprob = network.policy.log_prob(action)[:, 0]

        def _build_summary():
            return tf.summary.merge([
                tf.summary.histogram('mean', network.mean[:, 0]),
                tf.summary.histogram('std', tf.exp(network.logstd[:, 0])),
                tf.summary.histogram('action', action[:, 0]),
                tf.summary.histogram('logprob', logprob)])

        # `str` yields an empty summary string when logging is disabled.
        summary = tf.cond(self._should_log, _build_summary, str)

        # Remember current policy to append to memory in the experience
        # callback.
        remember = [
            utility.assign_nested_vars(self._last_state, network.state),
            self._last_action.assign(action[:, 0]),
            self._last_mean.assign(network.mean[:, 0]),
            self._last_logstd.assign(network.logstd[:, 0]),
        ]
        with tf.control_dependencies(remember):
            checked_action = tf.check_numerics(action[:, 0], 'action')
            return checked_action, tf.identity(summary)
def testPassThrough(self):
    """Finite input passes through `tf.check_numerics` with value and shape kept."""
    expected = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    with self.test_session(graph=tf.Graph()):
        finite = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
        checked = tf.check_numerics(finite, message="pass through test")
        result = checked.eval()
        self.assertAllEqual(expected, result)
        self.assertEqual([2, 3], checked.get_shape())
def _BuildLoss(self):
    """Accumulate the enabled loss terms into `self.loss`.

    Each term is switched on by its flag in `self.params` (`l2_loss`,
    `reconstr_loss`, `kl_loss`), gets its own scalar summary, and is added to
    `self.loss`; the total is summarized as `loss`.

    The reconstruction term is the binary cross-entropy between
    `self.diffs[1]` and `self.diff_output`. Both of its terms take the log of
    the prediction; the target-weighted term previously used the raw
    prediction instead of its log.
    """
    # Author notes (recorded before the cross-entropy fix described above):
    # 1. reconstr_loss does not seem to do better than l2 loss.
    # 2. Only works when using reduce_mean. reduce_sum doesn't work.
    # 3. It seems kl loss doesn't play an important role.
    self.loss = 0
    with tf.variable_scope('loss'):
        if self.params['l2_loss']:
            l2_loss = tf.reduce_mean(
                tf.square(self.diff_output - self.diffs[1]))
            tf.summary.scalar('l2_loss', l2_loss)
            self.loss += l2_loss
        if self.params['reconstr_loss']:
            # 1e-10 keeps the log arguments away from zero.
            reconstr_loss = (-tf.reduce_mean(
                self.diffs[1] * tf.log(1e-10 + self.diff_output) +
                (1 - self.diffs[1]) * tf.log(1e-10 + 1 - self.diff_output)))
            reconstr_loss = tf.check_numerics(reconstr_loss, 'reconstr_loss')
            tf.summary.scalar('reconstr_loss', reconstr_loss)
            self.loss += reconstr_loss
        if self.params['kl_loss']:
            kl_loss = (0.5 * tf.reduce_mean(
                tf.square(self.z_mean) + tf.square(self.z_stddev) -
                2 * self.z_stddev_log - 1))
            tf.summary.scalar('kl_loss', kl_loss)
            self.loss += kl_loss
        tf.summary.scalar('loss', self.loss)
def logdet_grad(op, grad):
    """Gradient function for the log-determinant op.

    Multiplies the incoming gradient, reshaped to broadcast over the last two
    axes, with the inverse of the adjoint of the op's first input.

    Args:
      op: The forward op; its first input is the matrix argument.
      grad: Gradient with respect to the op's output.

    Returns:
      Gradient with respect to the matrix input.
    """
    matrix = op.inputs[0]
    adjoint_inverse = tf.matrix_inverse(matrix, adjoint=True)
    adjoint_inverse = tf.check_numerics(adjoint_inverse, 'zero determinant')
    # Batch dimensions followed by two singleton axes for broadcasting.
    batch_dims = tf.shape(matrix)[:-2]
    broadcast_shape = tf.concat([batch_dims, [1, 1]], axis=0)
    return tf.reshape(grad, broadcast_shape) * adjoint_inverse
def add_check_numerics_ops():
    """Connect a `check_numerics` to every floating point tensor.

    A `check_numerics` op is added for each `half`, `float`, or `double`
    output in the default graph, skipping ops whose name matches a pattern in
    `NO_MONITOR` and outputs whose control flow context differs from the
    graph's current one. The checks are chained through control dependencies,
    so each new check runs after the previously added one.

    Based on `tf.add_check_numerics_ops`; modified to work around problem
    with variables in different "frames" (triggered by attempt to merge nodes
    from inside and outside the while loop of an RNN).

    Returns:
      A `group` op depending on all `check_numerics` ops added.
    """
    float_dtypes = [tf.float16, tf.float32, tf.float64]
    previous_check = []
    # This relies on get_operations() returning ops in the order they were
    # added: a tensor's producer always precedes its consumers, because an op
    # can only be added after its inputs.
    for op in tf.get_default_graph().get_operations():
        if op.name and any(
                re.search(pattern, op.name) for pattern in NO_MONITOR):
            continue
        for output in op.outputs:
            if output.dtype not in float_dtypes:
                continue
            same_frame = (
                output.op._get_control_flow_context() ==
                tf.get_default_graph()._get_control_flow_context())
            if not same_frame:
                continue
            message = op.name + ":" + str(output.value_index)
            with tf.control_dependencies(previous_check):
                previous_check = [
                    tf.check_numerics(output, message=message)]
    return tf.group(*previous_check)
def fit(self, X_train, y_train, X_min, X_max, ridge):  # pylint: disable=arguments-differ
    """Fit the base model and build the gradient-descent graph.

    After delegating to the parent `fit`, this builds the ops used to
    optimize a candidate point `xt_`: the kernel vector between `xt_` and the
    training data, the predicted mean `yhat_gd`, the predicted deviation
    `sig_val`, the loss `mu_multiplier * yhat - sigma_multiplier * sigma`,
    and an Adam training op. Handles are stored in `self.vars` / `self.ops`.

    Args:
      X_train: Training inputs, forwarded to the parent `fit`.
      y_train: Training targets, forwarded to the parent `fit`.
      X_min: Stored on `self.X_min`.
      X_max: Stored on `self.X_max`.
      ridge: Ridge term, forwarded to the parent `fit`.

    Returns:
      `self`.
    """
    super(GPRGD, self).fit(X_train, y_train, ridge)
    self.X_min = X_min
    self.X_max = X_max

    def checked(tensor, message):
        # Wrap the tensor in a numeric check only when checks are enabled.
        if self.check_numerics is True:
            return tf.check_numerics(tensor, message)
        return tensor

    with tf.Session(graph=self.graph,
                    config=tf.ConfigProto(
                        intra_op_parallelism_threads=self.num_threads_)) as sess:
        # The second positional argument of tf.Variable is `trainable`, so
        # the dtype has to be passed by keyword. float32 matches the
        # placeholder that is assigned to this variable below.
        xt_ = tf.Variable(self.X_train[0], dtype=tf.float32)
        xt_ph = tf.placeholder(tf.float32)
        xt_assign_op = xt_.assign(xt_ph)
        init = tf.global_variables_initializer()
        sess.run(init)

        # Euclidean distance from xt_ to every training row, as a column.
        K2_mat = tf.transpose(tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(
            tf.subtract(xt_, self.X_train), 2), 1)), 0))
        K2_mat = checked(K2_mat, "K2_mat: ")
        K2__ = tf.cast(self.magnitude * tf.exp(-K2_mat / self.length_scale),
                       tf.float32)
        K2__ = checked(K2__, "K2__: ")
        yhat_gd = tf.cast(tf.matmul(tf.transpose(K2__), self.xy_), tf.float32)
        yhat_gd = checked(yhat_gd, "yhat: ")
        sig_val = tf.cast((tf.sqrt(self.magnitude - tf.matmul(
            tf.transpose(K2__), tf.matmul(self.K_inv, K2__)))), tf.float32)
        sig_val = checked(sig_val, "sigma: ")

        LOG.debug("\nyhat_gd : %s", str(sess.run(yhat_gd)))
        LOG.debug("\nsig_val : %s", str(sess.run(sig_val)))

        loss = tf.squeeze(tf.subtract(self.mu_multiplier * yhat_gd,
                                      self.sigma_multiplier * sig_val))
        loss = checked(loss, "loss: ")
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                           epsilon=self.epsilon)
        train = optimizer.minimize(loss)

        self.vars['xt_'] = xt_
        self.vars['xt_ph'] = xt_ph
        self.ops['xt_assign_op'] = xt_assign_op
        self.ops['yhat_gd'] = yhat_gd
        self.ops['sig_val2'] = sig_val
        self.ops['loss_op'] = loss
        self.ops['train_op'] = train
    return self
def AddTraining(self, task_context, batch_size, learning_rate=0.1,
                decay_steps=4000, momentum=0.9, corpus_name='documents'):
    """Builds a trainer to minimize the cross entropy cost function.

    Adds the gold reader, the network and the cost function to
    `self.training`, then a momentum optimizer over either all parameters or
    only those named in `self._only_train`. Momentum slots are registered in
    `self.inits` / `self.variables`. The final `train_op` groups the
    optimizer step with the optional parameter checks and averaging updates.

    Args:
      task_context: file path from which to read the task context
      batch_size: batch size to request from reader op
      learning_rate: initial value of the learning rate
      decay_steps: decay learning rate by 0.96 every this many steps
      momentum: momentum parameter used when training with momentum
      corpus_name: name of the task input to read parses from

    Returns:
      Dictionary of named training nodes.
    """
    with tf.name_scope('training'):
        nodes = self.training
        nodes.update(self._AddGoldReader(task_context, batch_size,
                                         corpus_name))
        nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
                                        return_average=False))
        nodes.update(self._AddCostFunction(batch_size, nodes['gold_actions'],
                                           nodes['logits']))

        # Add the optimizer. `items()` works on both Python 2 and 3, and the
        # parameters are materialized as a list because they are iterated
        # several times below.
        if self._only_train:
            trainable_params = [v for k, v in self.params.items()
                                if k in self._only_train]
        else:
            trainable_params = list(self.params.values())
        lr = self._AddLearningRate(learning_rate, decay_steps)
        optimizer = tf.train.MomentumOptimizer(lr, momentum,
                                               use_locking=self._use_locking)
        train_op = optimizer.minimize(nodes['cost'],
                                      var_list=trainable_params)
        for param in trainable_params:
            slot = optimizer.get_slot(param, 'momentum')
            self.inits[slot.name] = state_ops.init_variable(
                slot, tf.zeros_initializer)
            self.variables[slot.name] = slot

        # Finite-ness checks are only built for float32/float64 parameters.
        numerical_checks = [
            tf.check_numerics(param, message='Parameter is not finite.')
            for param in trainable_params
            if param.dtype.base_dtype in [tf.float32, tf.float64]
        ]
        check_op = tf.group(*numerical_checks)
        avg_update_op = tf.group(*self._averaging.values())
        train_ops = [train_op]
        if self._check_parameters:
            train_ops.append(check_op)
        if self._use_averaging:
            train_ops.append(avg_update_op)
        nodes['train_op'] = tf.group(*train_ops, name='train_op')
    return nodes
def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
    """Run the parent convolution, then stochastically sparsify its output.

    Each activation is kept with probability `1 - exp(-N * p)`, where `p` is
    its share of the summed absolute activations over axes 1-3 and
    `N = 2 * w * h * c`. Kept activations are divided by their keep
    probability; dropped ones become zero. The output is checked for
    nan/inf before and after.
    """
    out = super()._conv(name, x, filter_size, in_filters, out_filters, strides)
    out = tf.check_numerics(out, "okay")

    magnitude = tf.abs(out)
    total = tf.reduce_sum(tf.abs(out), axis=(1, 2, 3), keep_dims=True)
    share = magnitude / total
    w, h, c = share.get_shape().as_list()[1:]
    N = w * h * c * 2

    keep_prob = 1 - tf.exp(-N * share)
    if self.fix_randomness:
        # Noise is drawn once with NumPy and baked into the graph.
        noise = tf.constant(
            np.random.uniform(size=(keep_prob.shape[0], w, h, c)),
            dtype=tf.float32)
    else:
        noise = tf.random_uniform(tf.shape(keep_prob))

    kept = noise < keep_prob
    out = tf.cast(kept, tf.float32) * out / (keep_prob + 1e-8)
    out = tf.check_numerics(out, "OH NO")
    return out
def discounted_return(reward, length, discount):
    """Discounted Monte-Carlo returns.

    Rewards beyond each sequence's length are masked to zero, then
    accumulated backwards in time with `tf.scan`. Gradients are stopped on
    the result.
    """
    def _accumulate(agg, cur):
        return cur + discount * agg

    timestep = tf.range(reward.shape[1].value)
    mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
    # Reverse time and move it to the leading axis so scan walks backwards.
    reversed_reward = tf.reverse(mask * reward, [1])
    time_major = tf.transpose(reversed_reward, [1, 0])
    initial = tf.zeros_like(reward[:, -1])
    # Positional arguments: parallel_iterations=1, back_prop=False.
    scanned = tf.scan(_accumulate, time_major, initial, 1, False)
    batch_major = tf.transpose(scanned, [1, 0])
    return_ = tf.reverse(batch_major, [1])
    return tf.check_numerics(tf.stop_gradient(return_), 'return')
def _policy_loss(
        self, mean, logstd, old_mean, old_logstd, action, advantage, length):
    """Compute the policy loss composed of multiple components.

    1. A surrogate loss: the probability ratio between the current and the
       behavioral policy, weighted by the advantage.
    2. A KL penalty between the behavioral and the current policy, scaled by
       `self._penalty`.
    3. A quadratic cutoff penalty that is active only where the KL exceeds
       `kl_target * kl_cutoff_factor`.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('policy_loss'):
        entropy = utility.diag_normal_entropy(mean, logstd)
        kl_per_step = utility.diag_normal_kl(
            old_mean, old_logstd, mean, logstd)
        kl = tf.reduce_mean(self._mask(kl_per_step, length), 1)

        new_logpdf = utility.diag_normal_logpdf(mean, logstd, action)
        old_logpdf = utility.diag_normal_logpdf(old_mean, old_logstd, action)
        policy_gradient = tf.exp(new_logpdf - old_logpdf)
        weighted = policy_gradient * tf.stop_gradient(advantage)
        surrogate_loss = -tf.reduce_mean(self._mask(weighted, length), 1)

        kl_penalty = self._penalty * kl
        cutoff_threshold = (
            self._config.kl_target * self._config.kl_cutoff_factor)
        cutoff_count = tf.reduce_sum(
            tf.cast(kl > cutoff_threshold, tf.int32))

        def _report_cutoff():
            return tf.Print(0, [cutoff_count], 'kl cutoff! ')

        # Print a message whenever at least one sequence exceeds the cutoff;
        # `int` provides the zero used when nothing is printed.
        report = tf.cond(cutoff_count > 0, _report_cutoff, int)
        with tf.control_dependencies([report]):
            kl_cutoff = (
                self._config.kl_cutoff_coef *
                tf.cast(kl > cutoff_threshold, tf.float32) *
                (kl - cutoff_threshold) ** 2)

        policy_loss = surrogate_loss + kl_penalty + kl_cutoff
        summaries = [
            tf.summary.histogram('entropy', entropy),
            tf.summary.histogram('kl', kl),
            tf.summary.histogram('surrogate_loss', surrogate_loss),
            tf.summary.histogram('kl_penalty', kl_penalty),
            tf.summary.histogram('kl_cutoff', kl_cutoff),
            tf.summary.histogram(
                'kl_penalty_combined', kl_penalty + kl_cutoff),
            tf.summary.histogram('policy_loss', policy_loss),
            tf.summary.scalar(
                'avg_surr_loss', tf.reduce_mean(surrogate_loss)),
            tf.summary.scalar(
                'avg_kl_penalty', tf.reduce_mean(kl_penalty)),
            tf.summary.scalar(
                'avg_policy_loss', tf.reduce_mean(policy_loss)),
        ]
        summary = tf.summary.merge(summaries)
        policy_loss = tf.reduce_mean(policy_loss, 0)
        return tf.check_numerics(policy_loss, 'policy_loss'), summary
def logdet(a, name=None):
    """Log of the absolute determinant of `a`, computed through NumPy.

    The value is the second element returned by `np.linalg.slogdet`, wrapped
    in a `py_func` whose gradient is `logdet_grad`, and checked for nan/inf.

    Args:
      a: Matrix tensor.
      name: Optional name for the op scope; defaults to 'LogDet'.

    Returns:
      A `tf.float64` tensor.
    """
    def _slogdet_value(matrix):
        return np.linalg.slogdet(matrix)[1]

    with tf.name_scope(name, 'LogDet', [a]) as scope_name:
        raw = py_func(
            _slogdet_value, [a], tf.float64,
            name=scope_name, grad=logdet_grad)  # set the gradient
        return tf.check_numerics(raw, 'zero determinant')
def body_fn(w, should_continue):
    """One rejection-sampling iteration.

    Entries still flagged in `should_continue` get a fresh proposal for `w`;
    an entry stays flagged only while its acceptance test fails.

    Returns:
      Tuple of the updated `w` and the updated `should_continue` mask.
    """
    z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
    proposal = (1 - (1 + b) * z) / (1 - (1 - b) * z)
    w = tf.where(should_continue, proposal, w)
    w = tf.check_numerics(w, 'w')
    acceptance_score = self.concentration * w + dim * tf.log1p(-x * w) - c
    log_uniform = tf.log(
        tf.random_uniform(
            sample_batch_shape, seed=seed(), dtype=self.dtype))
    rejected = acceptance_score < log_uniform
    should_continue = tf.logical_and(should_continue, rejected)
    return w, should_continue
def lambda_advantage(reward, value, length, discount):
    """Generalized Advantage Estimation.

    One-step residuals `reward + discount * next_value - value` are masked by
    sequence length and accumulated backwards in time with `tf.scan`.
    Gradients are stopped on the result.
    """
    def _accumulate(agg, cur):
        return cur + discount * agg

    timestep = tf.range(reward.shape[1].value)
    mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
    # Value of the following step; zero after the final step.
    next_value = tf.concat(
        [value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
    delta = reward + discount * next_value - value
    # Reverse time and move it to the leading axis so scan walks backwards.
    reversed_delta = tf.reverse(mask * delta, [1])
    time_major = tf.transpose(reversed_delta, [1, 0])
    initial = tf.zeros_like(delta[:, -1])
    # Positional arguments: parallel_iterations=1, back_prop=False.
    scanned = tf.scan(_accumulate, time_major, initial, 1, False)
    batch_major = tf.transpose(scanned, [1, 0])
    advantage = tf.reverse(batch_major, [1])
    return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def fixed_step_return(reward, value, length, discount, window):
    """N-step discounted return.

    Sums `window` discounted rewards starting at each step and adds the value
    estimate `window` steps ahead, discounted by `discount**window`. Steps
    beyond each sequence's length are masked to zero and gradients are
    stopped on the result.

    Args:
      reward: Batch of reward sequences.
      value: Batch of value estimate sequences.
      length: Batch of sequence lengths.
      discount: Discount factor.
      window: Python integer number of reward steps to accumulate.

    Returns:
      Tensor of masked n-step returns with the shape of `reward`.
    """
    timestep = tf.range(reward.shape[1].value)
    mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
    return_ = tf.zeros_like(reward)
    for _ in range(window):
        return_ += reward
        # Shift rewards one step towards the past, padding with zeros, and
        # apply one more factor of the discount.
        reward = discount * tf.concat(
            [reward[:, 1:], tf.zeros_like(reward[:, -1:])], 1)
    # Bootstrap from the value `window` steps ahead. The concat axis must be
    # passed as the second argument, not as an element of the values list.
    return_ += discount**window * tf.concat(
        [value[:, window:], tf.zeros_like(value[:, -window:])], 1)
    return tf.check_numerics(tf.stop_gradient(mask * return_), 'return')
def _style_parameters(name):
    """Look up a style normalization parameter and expand it for broadcasting.

    The parameter is read from `style_params` under '<scope name>/<name>',
    optionally checked for nan/inf, given a leading axis when its rank is
    below two, and finally expanded twice at axis 1.
    """
    key = '{}/{}'.format(sc.name, name)
    param = style_params[key]
    if check_numerics:
        param = tf.check_numerics(param, 'NaN/Inf in {}'.format(param.name))
    if param.get_shape().ndims < 2:
        param = tf.expand_dims(param, 0)
    param = tf.expand_dims(param, 1)
    param = tf.expand_dims(param, 1)
    return param
def lambda_return(reward, value, length, discount, lambda_):
    """TD-lambda returns.

    Each step contributes `mask * reward + discount * value * (1 - lambda_)`
    and is chained to the following step with the per-step factor
    `mask * discount * lambda_`, accumulated backwards in time with
    `tf.scan`. Gradients are stopped on the result.
    """
    def _accumulate(agg, cur):
        # cur[0] is the per-step term, cur[1] the per-step decay factor.
        return cur[0] + cur[1] * agg

    timestep = tf.range(reward.shape[1].value)
    mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
    step_term = mask * reward + discount * value * (1 - lambda_)
    step_decay = mask * discount * lambda_
    stacked = tf.stack([step_term, step_decay], 2)
    # Reverse time, then reorder the axes to (time, pair, batch) for scan.
    reversed_stacked = tf.reverse(stacked, [1])
    time_major = tf.transpose(reversed_stacked, [1, 2, 0])
    initial = tf.zeros_like(value[:, -1])
    # Positional arguments: parallel_iterations=1, back_prop=False.
    scanned = tf.scan(_accumulate, time_major, initial, 1, False)
    batch_major = tf.transpose(scanned, [1, 0])
    return_ = tf.reverse(batch_major, [1])
    return tf.check_numerics(tf.stop_gradient(return_), 'return')
def feed_forward_gaussian_fun(action_space, config, observations):
    """Feed-forward Gaussian policy and value networks.

    The policy is a stack of fully connected ReLU layers with a tanh output
    for the mean and a learned, observation-independent log standard
    deviation. The value network is a separate fully connected stack.

    Args:
      action_space: Action space of the environment; must be a gym `Box`.
      config: Configuration object providing layer sizes and initializers.
      observations: Sequences of observations.

    Raises:
      ValueError: If the action space is not a `Box`.

    Returns:
      NetworkOutput of (policy, value, action clipping function).
    """
    if not isinstance(action_space, gym.spaces.box.Box):
        raise ValueError("Expecting continuous action space.")

    mean_weights_initializer = (
        tf.contrib.layers.variance_scaling_initializer(
            factor=config.init_mean_factor))
    logstd_initializer = tf.random_normal_initializer(
        config.init_logstd, 1e-10)

    # Collapse every axis after the first two into one feature axis.
    batch_size = tf.shape(observations)[0]
    num_steps = tf.shape(observations)[1]
    feature_size = functools.reduce(
        operator.mul, observations.shape.as_list()[2:], 1)
    flat_observations = tf.reshape(
        observations, [batch_size, num_steps, feature_size])

    with tf.variable_scope("network_parameters"):
        with tf.variable_scope("policy"):
            hidden = flat_observations
            for units in config.policy_layers:
                hidden = tf.contrib.layers.fully_connected(
                    hidden, units, tf.nn.relu)
            mean = tf.contrib.layers.fully_connected(
                hidden, action_space.shape[0], tf.tanh,
                weights_initializer=mean_weights_initializer)
            logstd = tf.get_variable(
                "logstd", mean.shape[2:], tf.float32, logstd_initializer)
            # Broadcast the shared logstd vector over the leading two axes.
            expanded_logstd = logstd[None, None]
            multiples = [tf.shape(mean)[0], tf.shape(mean)[1]]
            multiples = multiples + [1] * (mean.shape.ndims - 2)
            logstd = tf.tile(expanded_logstd, multiples)
        with tf.variable_scope("value"):
            hidden = flat_observations
            for units in config.value_layers:
                hidden = tf.contrib.layers.fully_connected(
                    hidden, units, tf.nn.relu)
            value = tf.contrib.layers.fully_connected(
                hidden, 1, None)[..., 0]

    mean = tf.check_numerics(mean, "mean")
    logstd = tf.check_numerics(logstd, "logstd")
    value = tf.check_numerics(value, "value")
    policy = tf.contrib.distributions.MultivariateNormalDiag(
        mean, tf.exp(logstd))

    def clip_action(a):
        return tf.clip_by_value(a, -2., 2)

    return NetworkOutput(policy, value, clip_action)
def simulate(self, action):
    """Step the batch of environments.

    The step runs through `tf.py_func`. The resulting observation is written
    to `self._observ`, and the returned reward and done tensors depend on
    that assignment.

    Args:
      action: Tensor holding the batch of actions to apply.

    Returns:
      Tuple of reward tensor and done tensor.
    """
    def _step(act):
        # Only the first three step results are used.
        return self._batch_env.step(act)[:3]

    with tf.name_scope('environment/simulate'):
        if action.dtype in (tf.float16, tf.float32, tf.float64):
            action = tf.check_numerics(action, 'action')
        observ_dtype = self._parse_dtype(self._batch_env.observation_space)
        output_dtypes = [observ_dtype, tf.float32, tf.bool]
        observ, reward, done = tf.py_func(
            _step, [action], output_dtypes, name='step')
        observ = tf.check_numerics(observ, 'observ')
        reward = tf.check_numerics(reward, 'reward')
        store_observ = self._observ.assign(observ)
        with tf.control_dependencies([store_observ]):
            return tf.identity(reward), tf.identity(done)
def grad_potential(self, position, check_numerics=True):
    """Get gradient of potential function at current location.

    Args:
      position: Point at which the potential is differentiated.
      check_numerics: If true, wrap the gradient in `tf.check_numerics`.

    Returns:
      Gradient of `self.potential` with respect to `position`.
    """
    if tf.executing_eagerly():
        grad = tfe.gradients_function(self.potential)(position)[0]
    else:
        # TODO(lxuechen): Change this to tfe.gradients_function when it works
        grad = tf.gradients(self.potential(position), position)[0]

    if not check_numerics:
        return grad
    return tf.check_numerics(grad, message="gradient of potential")
def check_numerics(input_dataset, message):
    """Check a tensor or array-like for NaN and infinite entries.

    Inputs whose string form starts with 'Tensor' are wrapped in
    `tf.check_numerics`. Anything else is converted to a NumPy array and
    inspected immediately.

    Args:
      input_dataset: Tensor or array-like data to check.
      message: Label included in the error raised for bad data.

    Raises:
      Exception: If the non-tensor data contains NaN, inf or -inf.

    Returns:
      The wrapped tensor, or the original non-tensor input unchanged.
    """
    if str(input_dataset).startswith('Tensor'):
        return tf.check_numerics(input_dataset, message)

    values = np.array(input_dataset)
    # NaN is the only value that compares unequal to itself.
    nan_count = np.count_nonzero(values != values)
    inf_count = np.count_nonzero(values == float("inf"))
    n_inf_count = np.count_nonzero(values == float("-inf"))
    if nan_count > 0 or inf_count > 0 or n_inf_count > 0:
        counts = ('nan:' + str(nan_count) + '|inf:' + str(inf_count)
                  + '|-inf:' + str(n_inf_count))
        data_error = '【' + message + '】出现数据错误!【' + counts + '】'
        raise Exception(data_error)
    return input_dataset
def inference(inputs, num_classes, routing_ites=3, remake=False, training=False, name='capsnet_1d'):
    """Build the capsule network graph: conv stack, primary, conv and class capsules.

    Besides building the graph, this prints the static shape of several
    intermediate tensors and registers a ``'labels'`` image summary.

    :param inputs: Input tensor with a static shape of at least four
        dimensions; dimensions 0, 2 and 3 are read as batch size, image
        height and image width.
        NOTE(review): the pooling op uses ``data_format='NCHW'``, so the
        layout is presumably NCHW -- confirm against the caller.
    :param num_classes: Number of classes, forwarded to ``class_caps1d`` and
        ``_decode``.
    :param routing_ites: Routing iterations for the class capsule layer.
        NOTE(review): the convolutional capsule layer hard-codes
        ``routing_ites=3`` instead of using this argument -- confirm this is
        intentional.
    :param remake: When true, also build a reconstruction through
        ``_remake``; otherwise the reconstruction output is ``None``.
    :param training: Forwarded to dropout, the capsule layers and ``_decode``.
    :param name: Variable scope name for the whole network.
    :return: Tuple ``(class_caps_activations, remakes_flatten, label_logits)``.
    """
    with tf.variable_scope(name) as scope:
        inputs_shape = inputs.get_shape()
        batch_size = inputs_shape[0].value
        image_height = inputs_shape[2].value
        image_width = inputs_shape[3].value

        # ReLU Conv1
        # NOTE(review): the shape notes below appear stale; the code uses a
        # 3x3 kernel with 32 output channels and stride 1.
        # Images shape (b, 1, 24, 56) -> conv 5x5 filters, 32 output channels, strides 2 with padding, ReLU
        # nets -> (b, 256, 16, 48)
        print('inputs shape: %s' % inputs.get_shape())
        inputs = tf.check_numerics(inputs, message="nan or inf from: inputs")

        conv1 = conv2d(inputs, kernel=3, out_channels=32, stride=1, padding='SAME',
                       activation_fn=tf.nn.relu, normalizer_fn=tf.contrib.layers.batch_norm,
                       name='relu_conv1')
        print('conv1 shape: %s' % conv1.get_shape())

        # Pool only over the last two axes.
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 1, 2, 2], strides=[1, 1, 2, 2],
                               padding='VALID', data_format='NCHW', name='pool1')
        print('pool1 shape: %s' % pool1.get_shape())
        # pool1_dropout = tf.layers.dropout(pool1, 0.5, training=training, name='pool1_dropout')

        conv2 = conv2d(pool1, kernel=3, out_channels=64, stride=1, padding='VALID',
                       activation_fn=tf.nn.relu, normalizer_fn=tf.contrib.layers.batch_norm,
                       name='relu_conv2')
        print('conv2 shape: %s' % conv2.get_shape())
        conv2_dropout = tf.layers.dropout(conv2, 0.5, training=training, name='conv2_dropout')

        # Disabled second pooling stage and third conv/pool stage:
        # pool2 = tf.nn.max_pool(
        #     conv2,
        #     ksize=[1, 1, 2, 2], strides=[1, 1, 2, 2],
        #     padding='VALID', data_format='NCHW', name='pool2'
        # )
        # print('pool2 shape: %s' % pool2.get_shape())
        # pool2_dropout = tf.layers.dropout(pool2, 0.5, training=training, name='pool2_dropout')
        # conv3 = conv2d(
        #     pool2,
        #     kernel=3, out_channels=128, stride=1, padding='SAME',
        #     activation_fn=tf.nn.relu, normalizer_fn=tf.contrib.layers.batch_norm,
        #     name='relu_conv3'
        # )
        # print('conv3 shape: %s' % conv3.get_shape())
        # pool3 = tf.nn.max_pool(
        #     conv3,
        #     ksize=[1, 1, 2, 2], strides=[1, 1, 2, 2],
        #     padding='VALID', data_format='NCHW', name='pool3'
        # )
        # print('pool3 shape: %s' % pool3.get_shape())

        print("\nprimary layer:")
        primary_out_capsules = 64
        # Only the capsule activations are used; the second output is discarded.
        primary_caps_activations, _ = primary_caps1d(
            conv2_dropout, kernel_size=4, out_capsules=primary_out_capsules, stride=2,
            padding='VALID', activation_length=3, name='primary_caps')  # (b, 32, 4, 20, 8)
        # primary_caps_activations = tf.check_numerics(primary_caps_activations,
        #                                              message="nan or inf from: primary_caps_activations")

        print("\nconvolutional capsule layer:")
        conv_out_capsules = 24
        conv_kernel_size, conv_stride = 3, 1
        conv_caps_activations, conv_coupling_coeffs = conv_capsule1d(
            primary_caps_activations, kernel_size=conv_kernel_size, stride=conv_stride,
            routing_ites=3, in_capsules=primary_out_capsules, out_capsules=conv_out_capsules,
            activation_length=8, training=training, name="conv_caps")  # (b, 32, 6, 6, 8), (b*6*6, 32*9, 32)
        conv_caps_activations = tf.check_numerics(
            conv_caps_activations, message="nan or inf from: conv_caps_activations")

        print("\nclass capsule layer:")
        class_caps_activations, class_coupling_coeffs = class_caps1d(
            conv_caps_activations, num_classes=num_classes, activation_length=16,
            routing_ites=routing_ites, batch_size=batch_size, training=training,
            name='class_capsules')
        # class_coupling_coeffs = tf.Print(class_coupling_coeffs, [class_coupling_coeffs], summarize=50)
        class_caps_activations = tf.check_numerics(
            class_caps_activations, message="nan or inf from: class_caps_activations")
        print('class_coupling_coeffs shape: %s' % class_coupling_coeffs.get_shape())
        print('class_caps_activations shape: %s' % class_caps_activations.get_shape())

        # Optional reconstruction of the flattened image from the class capsules.
        if remake:
            remakes_flatten = _remake(class_caps_activations, image_height * image_width)
        else:
            remakes_flatten = None

        # Decode label logits from the capsule activations, the routing
        # coefficients and the earlier feature maps.
        label_logits = _decode(conv_caps_activations, primary_caps_activations,
                               class_coupling_coeffs, conv_coupling_coeffs,
                               conv_kernel_size, conv_stride, primary_out_capsules,
                               num_classes=num_classes, batch_size=batch_size,
                               conv1=conv1, conv2=conv2, pool1=pool1,
                               training=training)
        # label_logits = tf.Print(label_logits, [tf.constant("label_logits"), label_logits[0]], summarize=100)
        # label_logits = tf.check_numerics(label_logits, message="nan or inf from: label_logits")

        # Image summary of the arg-max label along axis 3 of the logits.
        labels2d = tf.argmax(label_logits, axis=3)
        labels2d_expanded = tf.expand_dims(labels2d, -1)
        tf.summary.image('labels', tf.cast(labels2d_expanded, tf.uint8))

    return class_caps_activations, remakes_flatten, label_logits
def net(observations, config):
    """Build the offensive policy/value trunks and the defensive trunk.

    Three convolutional trunks are built from the same reshaped observations:
    one for the offensive policy, one for the offensive value and one shared
    by the defensive policy and value.

    NOTE(review): the nesting of the ``*_crown`` scopes inside the
    ``*_trunk`` scopes was reconstructed from a whitespace-mangled source.
    Because the layers are unnamed, the nesting determines variable names --
    confirm against existing checkpoints before relying on it.

    Args:
        observations: Tensor with at least four dimensions; dimensions from
            index 2 onward must be statically known.
        config: Object providing ``init_output_factor``.

    Returns:
        Tuple ``(logits, off_action_mean, off_value, def_action_mean,
        def_value)``. ``logits`` has 3 units, ``off_action_mean`` 12 units and
        ``def_action_mean`` 10 units on the last axis; both value tensors are
        reshaped to ``[batch_size, episode_len]``.
    """
    # observation space = shape=(batch_size, episode_length, 10, 14, 2)
    # action space = shape=(batch, episode_length, 23)
    batch_size = tf.shape(observations)[0]
    episode_len = tf.shape(observations)[1]
    # Keep dimension 2 and merge all remaining dimensions into the last axis,
    # giving the 4-D input expected by conv2d.
    input_ = tf.reshape(observations, shape=[batch_size, episode_len, observations.shape.as_list()[
                        2], functools.reduce(operator.mul, observations.shape.as_list()[3:], 1)])
    init_xavier_weights = tf.variance_scaling_initializer(
        scale=1.0, mode='fan_avg', distribution='uniform')
    init_output_weights = tf.variance_scaling_initializer(
        scale=config.init_output_factor, mode='fan_in', distribution='normal')

    # separate value and policy
    with tf.variable_scope('o_trunk_policy'):
        conv1 = tf.layers.conv2d(
            inputs=input_,
            filters=128,
            kernel_size=[1, 3],
            padding='valid',
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights
        )
        conv2 = tf.layers.conv2d(
            inputs=conv1,
            filters=128,
            kernel_size=[1, 3],
            padding='valid',
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        # Flatten everything after the (batch, episode) axes for the dense layers.
        flatten = tf.reshape(conv2, shape=[batch_size, episode_len, functools.reduce(
            operator.mul, conv2.shape.as_list()[2:], 1)])
        trunk_fc = tf.layers.dense(
            inputs=flatten,
            units=256,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        with tf.variable_scope('o_crown'):
            # offensive
            off_fc = tf.layers.dense(
                inputs=trunk_fc,
                units=128,
                activation=tf.nn.relu,
                kernel_initializer=init_xavier_weights,
            )
            with tf.variable_scope('actions'):
                off_action_mean = tf.layers.dense(
                    inputs=off_fc,
                    units=12,
                    activation=tf.tanh,  # NOTE tanh is not good?
                    kernel_initializer=init_output_weights,
                )
            with tf.variable_scope('decision'):
                logits = tf.layers.dense(
                    inputs=off_fc,
                    units=3,
                    activation=None,
                    kernel_initializer=init_output_weights,
                )
    with tf.variable_scope('o_trunk_value'):
        conv1 = tf.layers.conv2d(
            inputs=input_,
            filters=128,
            kernel_size=[1, 3],
            padding='valid',
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights
        )
        conv2 = tf.layers.conv2d(
            inputs=conv1,
            filters=128,
            kernel_size=[1, 3],
            padding='valid',
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        flatten = tf.reshape(conv2, shape=[batch_size, episode_len, functools.reduce(
            operator.mul, conv2.shape.as_list()[2:], 1)])
        trunk_fc = tf.layers.dense(
            inputs=flatten,
            units=256,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        with tf.variable_scope('o_crown'):
            # offensive
            off_fc = tf.layers.dense(
                inputs=trunk_fc,
                units=128,
                activation=tf.nn.relu,
                kernel_initializer=init_xavier_weights,
            )
            off_value = tf.layers.dense(
                inputs=off_fc,
                units=1,
                activation=None,
                kernel_initializer=init_output_weights,
            )
            # Drop the trailing unit axis of the value head.
            off_value = tf.reshape(
                off_value, shape=[batch_size, episode_len])
    off_value = tf.check_numerics(off_value, 'off_value')

    # Previous variant with a single offensive trunk shared by policy and
    # value, kept for reference:
    # with tf.variable_scope('o_trunk'):
    #     conv1 = tf.layers.conv2d(
    #         inputs=input_,
    #         filters=64,
    #         kernel_size=[1, 3],
    #         padding='same',
    #         activation=tf.nn.relu,
    #         kernel_initializer=init_xavier_weights
    #     )
    #     conv2 = tf.layers.conv2d(
    #         inputs=conv1,
    #         filters=64,
    #         kernel_size=[1, 3],
    #         padding='same',
    #         activation=tf.nn.relu,
    #         kernel_initializer=init_xavier_weights,
    #     )
    #     flatten = tf.reshape(conv2, shape=[batch_size, episode_len, functools.reduce(
    #         operator.mul, conv2.shape.as_list()[2:], 1)])
    #     trunk_fc = tf.layers.dense(
    #         inputs=flatten,
    #         units=128,
    #         activation=tf.nn.relu,
    #         kernel_initializer=init_xavier_weights,
    #     )
    # with tf.variable_scope('o_crown'):
    #     # offensive
    #     off_fc = tf.layers.dense(
    #         inputs=trunk_fc,
    #         units=64,
    #         activation=tf.nn.relu,
    #         kernel_initializer=init_xavier_weights,
    #     )
    #     with tf.variable_scope('policy'):
    #         with tf.variable_scope('actions'):
    #             off_action_mean = tf.layers.dense(
    #                 inputs=off_fc,
    #                 units=12,
    #                 activation=tf.tanh,  # NOTE tanh is not good?
    #                 kernel_initializer=init_output_weights,
    #             )
    #         with tf.variable_scope('decision'):
    #             logits = tf.layers.dense(
    #                 inputs=off_fc,
    #                 units=3,
    #                 activation=None,
    #                 kernel_initializer=init_output_weights,
    #             )
    #     with tf.variable_scope('value'):
    #         off_value = tf.layers.dense(
    #             inputs=off_fc,
    #             units=1,
    #             activation=None,
    #             kernel_initializer=init_output_weights,
    #         )
    #         off_value = tf.reshape(
    #             off_value, shape=[batch_size, episode_len])
    #         off_value = tf.check_numerics(off_value, 'off_value')

    with tf.variable_scope('d_trunk'):
        conv1 = tf.layers.conv2d(
            inputs=input_,
            filters=64,
            kernel_size=[1, 3],
            padding='same',
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        conv2 = tf.layers.conv2d(
            inputs=conv1,
            filters=64,
            kernel_size=[1, 3],
            padding='same',
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        flatten = tf.reshape(conv2, shape=[batch_size, episode_len, functools.reduce(
            operator.mul, conv2.shape.as_list()[2:], 1)])
        trunk_fc = tf.layers.dense(
            inputs=flatten,
            units=128,
            activation=tf.nn.relu,
            kernel_initializer=init_xavier_weights,
        )
        with tf.variable_scope('d_crown'):
            # defensive
            def_fc = tf.layers.dense(
                inputs=trunk_fc,
                units=64,
                activation=tf.nn.relu,
                kernel_initializer=init_xavier_weights,
            )
            with tf.variable_scope('policy'):
                with tf.variable_scope('actions'):
                    def_action_mean = tf.layers.dense(
                        inputs=def_fc,
                        units=10,
                        activation=tf.tanh,  # NOTE tanh is not good?
                        kernel_initializer=init_output_weights,
                    )
            with tf.variable_scope('value'):
                def_value = tf.layers.dense(
                    inputs=def_fc,
                    units=1,
                    activation=None,
                    kernel_initializer=init_output_weights,
                )
                def_value = tf.reshape(
                    def_value, shape=[batch_size, episode_len])
                def_value = tf.check_numerics(def_value, 'def_value')

    return logits, off_action_mean, off_value, def_action_mean, def_value
def make_dag(latent, sequence_lengths, params):
    """Compute a padded edge matrix and a scalar penalty for every example.

    For each example the first ``sequence_lengths[i]`` rows of two learned
    projections of ``latent`` are multiplied together, row-softmaxed into an
    edge matrix and zero-padded back to ``(L, L)``. The penalty is
    ``trace(expm(edges)) - sequence_length``.

    :param latent: (N, L, D)
    :param sequence_lengths: (N,)
    :param params: Object providing ``attention_dim``.
    :return: Tuple ``(dag, penalty)`` with shapes (N, L, L) and (N,).
    """
    num_examples = tf.shape(latent)[0]
    max_length = tf.shape(latent)[1]

    with tf.variable_scope('dag'):
        def _project(scope_name):
            return slim.fully_connected(
                inputs=latent,
                activation_fn=tf.nn.leaky_relu,
                num_outputs=params.attention_dim,
                scope=scope_name)

        proj_a = _project('proj_a')
        proj_b = _project('proj_b')

        # Unstack along the batch axis so each loop step handles one example.
        a_rows = tf.TensorArray(
            size=num_examples, dtype=proj_a.dtype).unstack(proj_a)
        b_rows = tf.TensorArray(
            size=num_examples, dtype=proj_b.dtype).unstack(proj_b)
        lengths = tf.TensorArray(
            size=num_examples,
            dtype=sequence_lengths.dtype).unstack(sequence_lengths)

        def _step(i, edges_acc: tf.TensorArray, penalty_acc: tf.TensorArray):
            a_i = a_rows.read(i)
            b_i = b_rows.read(i)
            length_i = lengths.read(i)

            # Only the valid (unpadded) positions take part in the product.
            a_valid = a_i[:length_i, :]
            b_valid = b_i[:length_i, :]
            scores = tf.tensordot(a_valid, b_valid, axes=[(1, ), (1, )])

            edges = tf.nn.softmax(scores, axis=-1)
            edges_exp = tf.linalg.expm(input=edges)
            penalty_i = tf.trace(edges_exp) - tf.cast(length_i, tf.float32)

            # Zero-pad back to (L, L) so all examples can be stacked.
            pad = max_length - length_i
            padded = tf.pad(tensor=edges, paddings=[(0, pad), (0, pad)])

            edges_next = edges_acc.write(value=padded, index=i)
            penalty_next = penalty_acc.write(value=penalty_i, index=i)
            return i + 1, edges_next, penalty_next

        def _has_more(i, edges_acc: tf.TensorArray,
                      penalty_acc: tf.TensorArray):
            return i < num_examples

        edges_out = tf.TensorArray(size=num_examples, dtype=tf.float32)
        penalty_out = tf.TensorArray(size=num_examples, dtype=tf.float32)
        initial_state = (
            tf.constant(0, dtype=tf.int32), edges_out, penalty_out)
        _, edges_out, penalty_out = tf.while_loop(
            cond=_has_more, body=_step, loop_vars=initial_state)

        dag = edges_out.stack()  # (N, L, L)
        penalty = penalty_out.stack()  # (N,)
        penalty = tf.check_numerics(penalty, message='penalty numerics')
    return dag, penalty
def two_trunk_gaussian(config, action_space, observations, unused_length, state=None):
    """Assemble the offensive and defensive policies on top of ``net``.

    The offensive side yields a 3-way categorical decision plus a diagonal
    normal over the offensive action mean; the defensive side yields a
    diagonal normal over the defensive action mean. Each normal uses a
    learned, state-independent standard deviation (softplus of a variable
    initialized from ``config.init_std``) tiled over the two leading
    observation dimensions.

    NOTE maybe softmax will limit the exploration ability;
    tf.contrib.distributions.TransformedDistribution (lognormal) may fit the
    action space better than a gaussian.

    Args
    ----
    config : Configuration object; provides ``init_std`` and whatever ``net``
        reads.
    action_space : Action space of the environment (not read here).
    observations : Sequences of observations, forwarded to ``net``.
    unused_length : Batch of sequence lengths (not read here).
    state : Passed through unchanged (for rnn nets).

    Returns
    -------
    Attribute dictionary with
    - policy : [Categorical, CustomKLDiagNormal (offense),
      CustomKLDiagNormal (defense)]
    - value : [off_value, def_value]
    - state : the ``state`` argument.
    """
    logits, off_action_mean, off_value, def_action_mean, def_value = net(
        observations, config)

    with tf.variable_scope('two_trunk_gaussian'):
        # Inverse softplus, so the initial std equals config.init_std.
        std_initializer = tf.constant_initializer(
            np.log(np.exp(config.init_std) - 1))

        def _diag_normal(mean, variable_name, mean_label, std_label):
            # Learned std shared across batch and time, broadcast by tiling.
            raw_std = tf.get_variable(  # TODO
                variable_name, mean.shape[2:], tf.float32, std_initializer)
            std = tf.nn.softplus(raw_std)
            std = tf.tile(
                std[None, None],
                [tf.shape(observations)[0], tf.shape(observations)[1], 1])
            mean = tf.check_numerics(mean, mean_label)
            std = tf.check_numerics(std, std_label)
            return CustomKLDiagNormal(mean, std)

        off_actions = _diag_normal(
            off_action_mean, 'off_before_softplus_std',
            'off_action_mean', 'off_actions_std')
        off_decision = tfd.Categorical(logits)
        def_actions = _diag_normal(
            def_action_mean, 'def_before_softplus_std',
            'def_action_mean', 'def_actions_std')

    policy = [off_decision, off_actions, def_actions]
    value = [off_value, def_value]
    return agents.tools.AttrDict(state=state, policy=policy, value=value)
tf.assert_greater() tf.assert_greater_equal() tf.assert_integer() tf.assert_less() tf.assert_less_equal() tf.assert_non_negative() tf.assign() tf.assign_add() tf.assign_sub() tf.argmax() tf.argmin() tf.clip_by_average_norm() tf.cast() tf.case() tf.ceil() tf.check_numerics() tf.check_ops tf.cholesky() tf.cholesky_grad() tf.cholesky_solve() tf.clip_by_global_norm() tf.clip_by_average_norm() tf.clip_by_norm() tf.clip_by_value() tf.colocate_with() tf.complex() tf.cond tf.confusion_matrix() tf.conj() tf.cross() tf.cumprod()
def inference(inputs, num_classes, routing_ites=3, remake=False, training=False, name='capsnet_1d'):
    """Build the capsule network graph: one conv layer, primary and class capsules.

    Besides building the graph, this prints the static shape of several
    intermediate tensors and registers a ``'labels'`` image summary.

    :param inputs: Input tensor with a static shape of at least four
        dimensions; dimensions 0, 2 and 3 are read as batch size, image
        height and image width.
    :param num_classes: Number of classes, forwarded to ``class_caps1d`` and
        ``_decode``.
    :param routing_ites: Routing iterations for the class capsule layer.
    :param remake: When true, also build a reconstruction through
        ``_remake``; otherwise the reconstruction output is ``None``.
    :param training: Accepted but not used by this function.
    :param name: Variable scope name for the whole network.
    :return: Tuple ``(class_caps_activations, remakes_flatten, label_logits)``.
    """
    with tf.variable_scope(name) as scope:
        inputs_shape = inputs.get_shape()
        batch_size = inputs_shape[0].value
        image_height = inputs_shape[2].value
        image_width = inputs_shape[3].value

        # ReLU Conv1
        # NOTE(review): the shape notes below may be stale -- the code uses a
        # 5x5 kernel, 256 output channels, stride 1 and VALID padding.
        # Images shape (b, 1, 24, 56) -> conv 5x5 filters, 32 output channels, strides 2 with padding, ReLU
        # nets -> (b, 256, 16, 48)
        print('inputs shape: %s' % inputs.get_shape())
        inputs = tf.check_numerics(inputs, message="nan or inf from: inputs")

        print("\nconv1 layer:")
        conv1 = conv2d(inputs, kernel=5, out_channels=256, stride=1, padding='VALID',
                       activation_fn=tf.nn.relu, name='relu_conv1')
        # conv1 = tf.check_numerics(conv1, message="nan or inf from: conv1")
        print('conv1 shape: %s' % conv1.get_shape())

        # Disabled explicit second conv layer:
        # print("\nconv2 layer:")
        # conv2 = conv2d(
        #     conv1,
        #     kernel=5, out_channels=256, stride=1, padding='VALID',
        #     activation_fn=tf.nn.relu, name='relu_conv2'
        # )
        # # conv2 = tf.check_numerics(conv2, message="nan or inf from: conv2")
        # print('conv2 shape: %s' % conv2.get_shape())

        # PrimaryCaps
        # (b, 256, 16, 48) -> capsule 1x1 filter, 32 output capsule, strides 1 without padding
        # nets -> activations (?, 14, 14, 32))
        print("\nprimary layer:")
        primary_out_capsules = 24
        # The second output of primary_caps1d is bound to `conv2` and later
        # passed to _decode under that keyword.
        primary_caps_activations, conv2 = primary_caps1d(
            conv1, kernel_size=3, out_capsules=primary_out_capsules, stride=2,
            padding='VALID', activation_length=8, name='primary_caps')  # (b, 32, 4, 20, 8)

        # (b, 32, 4, 20, 8) ->
        # (b, 32*4*20, 2*64)
        print("\nclass capsule layer:")
        class_caps_activations, class_coupling_coeffs = class_caps1d(
            primary_caps_activations, num_classes=num_classes, activation_length=16,
            routing_ites=routing_ites, batch_size=batch_size, name='class_capsules')
        # class_coupling_coeffs = tf.Print(class_coupling_coeffs, [class_coupling_coeffs], summarize=50)
        # class_caps_activations = tf.check_numerics(class_caps_activations, message="nan or inf from: class_caps_activations")
        print('class_coupling_coeffs shape: %s' % class_coupling_coeffs.get_shape())
        print('class_caps_activations shape: %s' % class_caps_activations.get_shape())

        # Optional reconstruction of the flattened image from the class capsules.
        if remake:
            remakes_flatten = _remake(class_caps_activations, image_height * image_width)
        else:
            remakes_flatten = None

        print("\ndecode layers:")
        label_logits = _decode(primary_caps_activations, primary_out_capsules,
                               coupling_coeffs=class_coupling_coeffs,
                               num_classes=num_classes, batch_size=batch_size,
                               conv1=conv1, conv2=conv2)
        # label_logits = tf.Print(label_logits, [tf.constant("label_logits"), label_logits[0]], summarize=100)
        # label_logits = tf.check_numerics(label_logits, message="nan or inf from: label_logits")

        # Image summary of the arg-max label along axis 3 of the logits.
        labels2d = tf.argmax(label_logits, axis=3)
        labels2d_expanded = tf.expand_dims(labels2d, -1)
        tf.summary.image('labels', tf.cast(labels2d_expanded, tf.uint8))

    return class_caps_activations, remakes_flatten, label_logits
def safe_exp(x, name=None, check=False):
    """Return ``tf.exp(x)``, optionally guarded by a numerics check.

    The numerics check used to sit behind an earlier ``return`` and could
    never run. It is now reachable through ``check``, which defaults to off
    so existing callers keep getting the plain exponential.

    Args:
        x: Tensor to exponentiate.
        name: Label used in the numerics error message.
        check: When true, wrap the result in ``tf.check_numerics`` so NaN or
            Inf values raise when the op is evaluated.

    Returns:
        The exponential of ``x``.
    """
    result = tf.exp(x)
    if check:
        # check_numerics reports both NaN and Inf, so the message says so.
        result = tf.check_numerics(
            result, message='%s is NaN or Inf' % (name or 'safe_exp'))
    return result
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
          num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name,
          is_chief, train_dir, graph_hook_fn=None):
    """Training function for detection models.

    Builds the full training graph (input queue, clones, optimizer, summaries,
    checkpoint restore) and then hands control to ``slim.learning.train``.

    Args:
        create_tensor_dict_fn: a function to create a tensor input dictionary.
        create_model_fn: a function that creates a DetectionModel and
            generates losses.
        train_config: a train_pb2.TrainConfig protobuf.
        master: BNS name of the TensorFlow master to use.
        task: The task id of this training instance.
        num_clones: The number of clones to run per machine.
        worker_replicas: The number of work replicas to train with.
        clone_on_cpu: True if clones should be forced to run on CPU.
        ps_tasks: Number of parameter server tasks.
        worker_job_name: Name of the worker job.
        is_chief: Whether this replica is the chief replica.
        train_dir: Directory to write checkpoints and training summaries to.
        graph_hook_fn: Optional function that is called after the inference
            graph is built (before optimization). This is helpful to perform
            additional changes to the training graph such as adding FakeQuant
            ops. The function should modify the default graph.

    Raises:
        ValueError: If both num_clones > 1 and train_config.sync_replicas is
            true.
    """
    detection_model = create_model_fn()
    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options
    ]

    with tf.Graph().as_default():
        # Build a configuration specifying multi-GPU and multi-replicas.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=clone_on_cpu,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=ps_tasks,
            worker_job_name=worker_job_name)

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        if num_clones != 1 and train_config.sync_replicas:
            # Build ONE message string; passing two fragments as separate
            # arguments made the exception render as a tuple.
            raise ValueError(
                'In Synchronous SGD mode num_clones must '
                'be 1. Found num_clones: {}'.format(num_clones))
        batch_size = train_config.batch_size // num_clones
        if train_config.sync_replicas:
            batch_size //= train_config.replicas_to_aggregate

        with tf.device(deploy_config.inputs_device()):
            input_queue = create_input_queue(
                batch_size, create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # Gather initial summaries.
        # TODO(rathodv): See if summaries can be added/extracted from global tf
        # collections so that they don't have to be passed around.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set([])

        model_fn = functools.partial(_create_losses,
                                     create_model_fn=create_model_fn,
                                     train_config=train_config)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        first_clone_scope = clones[0].scope

        if graph_hook_fn:
            with tf.device(deploy_config.variables_device()):
                graph_hook_fn()

        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by model_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        with tf.device(deploy_config.optimizer_device()):
            training_optimizer, optimizer_summary_vars = (
                optimizer_builder.build(train_config.optimizer))
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var, family='LearningRate')

        sync_optimizer = None
        if train_config.sync_replicas:
            training_optimizer = tf.train.SyncReplicasOptimizer(
                training_optimizer,
                replicas_to_aggregate=train_config.replicas_to_aggregate,
                total_num_replicas=worker_replicas)
            sync_optimizer = training_optimizer

        with tf.device(deploy_config.optimizer_device()):
            regularization_losses = (
                None if train_config.add_regularization_loss else [])
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones,
                training_optimizer,
                regularization_losses=regularization_losses)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # Optionally multiply bias gradients by
            # train_config.bias_grad_multiplier.
            if train_config.bias_grad_multiplier:
                biases_regex_list = ['.*/biases']
                grads_and_vars = (
                    variables_helper.multiply_gradients_matching_regex(
                        grads_and_vars,
                        biases_regex_list,
                        multiplier=train_config.bias_grad_multiplier))

            # Optionally freeze some layers by setting their gradients to be
            # zero.
            if train_config.freeze_variables:
                grads_and_vars = (
                    variables_helper.freeze_gradients_matching_regex(
                        grads_and_vars, train_config.freeze_variables))

            # Optionally clip gradients
            if train_config.gradient_clipping_by_norm > 0:
                with tf.name_scope('clip_grads'):
                    grads_and_vars = slim.learning.clip_gradient_norms(
                        grads_and_vars,
                        train_config.gradient_clipping_by_norm)

            # Create gradient updates.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)

            # The returned loss tensor only evaluates after every update op
            # has run, so fetching it performs one training step.
            update_op = tf.group(*update_ops, name='update_barrier')
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for model_var in slim.get_model_variables():
            global_summaries.add(
                tf.summary.histogram('ModelVars/' + model_var.op.name,
                                     model_var))
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar('Losses/' + loss_tensor.op.name,
                                  loss_tensor))
        global_summaries.add(
            tf.summary.scalar('Losses/TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summaries |= global_summaries

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.6))

        # Save checkpoints regularly.
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            max_to_keep=2)

        # Create ops required to initialize the model from a given checkpoint.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            if not train_config.fine_tune_checkpoint_type:
                # train_config.from_detection_checkpoint field is deprecated.
                # For backward compatibility, fine_tune_checkpoint_type is set
                # based on from_detection_checkpoint.
                if train_config.from_detection_checkpoint:
                    train_config.fine_tune_checkpoint_type = 'detection'
                else:
                    train_config.fine_tune_checkpoint_type = 'classification'
            var_map = detection_model.restore_map(
                fine_tune_checkpoint_type=train_config.
                fine_tune_checkpoint_type,
                load_all_detection_checkpoint_vars=(
                    train_config.load_all_detection_checkpoint_vars))
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    var_map,
                    train_config.fine_tune_checkpoint,
                    include_global_step=False))
            init_saver = tf.train.Saver(available_var_map)

            def initializer_fn(sess):
                init_saver.restore(sess, train_config.fine_tune_checkpoint)

            init_fn = initializer_fn

        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            startup_delay_steps=train_config.startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=(train_config.num_steps
                             if train_config.num_steps else None),
            save_summaries_secs=120,
            # save_interval_secs=300,
            sync_optimizer=sync_optimizer,
            saver=saver)
def train(model, data, batch_size=128, learning_rate=FLAGS.learning_rate,
          log_dir='./log', checkpoint_dir='./checkpoint', num_epochs=-1):
    """Train ``model`` with a mean-squared-error objective and evaluate each epoch.

    Builds the graph (input batches, model, loss, Adam optimizer, moving
    averages, summaries), opens an interactive session and loops over epochs.
    After every epoch a checkpoint is saved, ``evaluate`` is run and train and
    test metrics are printed and written to the summary log.

    Args:
        model: Callable ``model(x, is_training=True)`` returning predictions.
        data: Object providing ``generate_batches(batch_size)`` and ``size``
            (``size[0]`` is used as the number of examples per epoch).
        batch_size: Number of examples per training step.
        learning_rate: Initial learning rate passed to the optimizer. The
            default is read from ``FLAGS`` when this function is defined.
        log_dir: Directory for the summary writer.
        checkpoint_dir: Directory in which ``model.ckpt`` checkpoints are
            saved.
        num_epochs: Number of epochs to run; the default ``-1`` never matches
            the epoch counter, so training runs until interrupted.
    """
    # tf Graph input
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            x, yt = data.generate_batches(batch_size)

        global_step = tf.get_variable('global_step', shape=[], dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
    if FLAGS.gpu:
        device_str = '/gpu:' + str(FLAGS.device)
    else:
        device_str = '/cpu:0'
    with tf.device(device_str):
        y = model(x, is_training=True)
        # Define loss and optimizer
        with tf.name_scope('objective'):
            # loss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt, logits=y))
            # accuracy = tf.reduce_mean(
            #     tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))
            diff = tf.subtract(y, yt)
            # Debug print of predictions, targets and their difference; it
            # runs whenever the loss is evaluated.
            printop = tf.Print(diff, [y, yt, diff])
            with tf.control_dependencies([printop]):
                loss = tf.reduce_mean(tf.multiply(diff, diff))
            #loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=yt, logits=y))
            # "Accuracy" here is the fraction of predictions whose sign
            # matches the target's sign.
            accuracy = tf.reduce_mean(
                tf.cast(tf.equal(tf.sign(y), tf.sign(yt)), tf.float32))
        opt = tf.contrib.layers.optimize_loss(
            loss, global_step, learning_rate, 'Adam',
            gradient_noise_scale=None, gradient_multipliers=None,
            clip_gradients=None,  #moving_average_decay=0.9,
            learning_rate_decay_fn=learning_rate_decay_fn,
            update_ops=None, variables=None, name=None)
        #grads = opt.compute_gradients(loss)
        #apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # loss_avg
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step,
                                            name='average')
    ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    loss_avg = ema.average(loss)
    tf.summary.scalar('loss/training', loss_avg)
    accuracy_avg = ema.average(accuracy)
    tf.summary.scalar('accuracy/training', accuracy_avg)

    # Fail the training step as soon as the loss becomes NaN/Inf.
    check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
    updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies([opt]):
        train_op = tf.group(*updates_collection)

    if FLAGS.summary:
        add_summaries(scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
                      activation_list=tf.get_collection(
                          tf.GraphKeys.ACTIVATIONS),
                      var_list=tf.trainable_variables())
        # grad_list=grads)

    summary_op = tf.summary.merge_all()

    # Configure options for session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=gpu_options,
    ))
    saver = tf.train.Saver(max_to_keep=5)

    # tf.initialize_all_variables is the deprecated spelling of this call.
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    epoch = 0

    print('num of trainable paramaters: %d' %
          count_params(tf.trainable_variables()))
    while epoch != num_epochs:
        epoch += 1
        curr_step = 0
        while curr_step < data.size[0]:
            _, loss_val = sess.run([train_op, loss])
            # Each step consumes one batch of `batch_size` examples (the size
            # the input pipeline was built with), so advance by that amount
            # rather than by FLAGS.batch_size.
            curr_step += batch_size

        step, acc_value, loss_value, summary = sess.run(
            [global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess, save_path=checkpoint_dir + '/model.ckpt',
                   global_step=global_step)

        test_acc, test_loss = evaluate(model, FLAGS.dataset,
                                       batch_size=batch_size,
                                       checkpoint_dir=checkpoint_dir)  # ,
        # log_dir=log_dir)
        print('Finished epoch %d' % epoch,
              ', Training Accuracy: %.3f' % acc_value,
              ', Training Loss: %.3f' % loss_value,
              ', Test Accuracy: %.3f' % test_acc,
              ', Test Loss: %.3f' % test_loss)

        # Append the test metrics to the training summary for this step.
        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test', simple_value=test_acc)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()

    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
def main(unused_argv):
    """Assemble the DeepLab training graph and run it with slim's trainer.

    Every setting is read from the module-level ``FLAGS`` object.

    Args:
      unused_argv: Ignored.

    Raises:
      AssertionError: If ``FLAGS.train_batch_size`` is not a multiple of the
        number of clones.
      ValueError: If ``FLAGS.optimizer`` is neither 'momentum' nor 'adam', or
        if quantization is requested together with more than one clone.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Deployment layout: clones (GPUs), replicas and parameter servers.
    deploy = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.num_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Each clone receives an equal slice of the training batch.
    assert FLAGS.train_batch_size % deploy.num_clones == 0, (
        'Training batch size not divisble by number of clones (GPUs).')
    per_clone_batch = FLAGS.train_batch_size // deploy.num_clones

    tf.gfile.MakeDirs(FLAGS.train_logdir)
    tf.logging.info('Training on %s set', FLAGS.train_split)

    with tf.Graph().as_default() as graph:
        with tf.device(deploy.inputs_device()):
            dataset = data_generator.Dataset(
                dataset_name=FLAGS.dataset,
                split_name=FLAGS.train_split,
                dataset_dir=FLAGS.dataset_dir,
                batch_size=per_clone_batch,
                crop_size=list(map(int, FLAGS.train_crop_size)),
                min_resize_value=FLAGS.min_resize_value,
                max_resize_value=FLAGS.max_resize_value,
                resize_factor=FLAGS.resize_factor,
                min_scale_factor=FLAGS.min_scale_factor,
                max_scale_factor=FLAGS.max_scale_factor,
                scale_factor_step_size=FLAGS.scale_factor_step_size,
                model_variant=FLAGS.model_variant,
                num_readers=4,
                is_training=True,
                should_shuffle=True,
                should_repeat=True,
                is_salient=FLAGS.is_salient)

        # The global step and the clones are created under the device that
        # stores the variables.
        with tf.device(deploy.variables_device()):
            global_step = tf.train.get_or_create_global_step()

            outputs_to_num_classes = {
                common.OUTPUT_TYPE: dataset.num_of_classes
            }
            clone_args = (dataset.get_one_shot_iterator(),
                          outputs_to_num_classes,
                          dataset.ignore_label)
            clones = model_deploy.create_clones(
                deploy, _build_deeplab, args=clone_args)

            # Update ops of the first clone only (e.g. the batch_norm
            # updates created by the model function).
            clone0_scope = deploy.clone_scope(0)
            pending_updates = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, clone0_scope)

        def first_clone_tensor(tensor_name):
            # Resolve '<first clone scope>/<tensor_name>:0'; the strip
            # handles an empty clone scope.
            qualified = '%s/%s:0' % (clone0_scope, tensor_name)
            return graph.get_tensor_by_name(qualified.strip('/'))

        # Start from whatever summaries already exist in the graph.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # One histogram per model variable.
        for variable in tf.model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Optional image summaries: input image, label and prediction.
        if FLAGS.save_summaries_images:
            image_tensor = first_clone_tensor(common.IMAGE)
            summaries.add(
                tf.summary.image('samples/%s' % common.IMAGE, image_tensor))

            label_tensor = first_clone_tensor(common.LABEL)
            # Stretch class ids over the uint8 range for better visibility.
            pixel_scaling = max(1, 255 // dataset.num_of_classes)
            label_image = tf.cast(label_tensor * pixel_scaling, tf.uint8)
            summaries.add(
                tf.summary.image('samples/%s' % common.LABEL, label_image))

            output_tensor = first_clone_tensor(common.OUTPUT_TYPE)
            class_ids = tf.expand_dims(tf.argmax(output_tensor, 3), -1)
            prediction_image = tf.cast(class_ids * pixel_scaling, tf.uint8)
            summaries.add(
                tf.summary.image('samples/%s' % common.OUTPUT_TYPE,
                                 prediction_image))

        # One scalar per loss registered by the first clone.
        for clone_loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                            clone0_scope):
            summaries.add(
                tf.summary.scalar('losses/%s' % clone_loss.op.name,
                                  clone_loss))

        # Learning-rate schedule and optimizer live on the optimizer device.
        with tf.device(deploy.optimizer_device()):
            learning_rate = train_utils.get_model_learning_rate(
                FLAGS.learning_policy,
                FLAGS.base_learning_rate,
                FLAGS.learning_rate_decay_step,
                FLAGS.learning_rate_decay_factor,
                FLAGS.training_number_of_steps,
                FLAGS.learning_power,
                FLAGS.slow_start_step,
                FLAGS.slow_start_learning_rate,
                decay_steps=FLAGS.decay_steps,
                end_learning_rate=FLAGS.end_learning_rate,
                boundaries=list(map(int, FLAGS.boundaries)),
                boundary_learning_rates=list(
                    map(float, FLAGS.boundary_learning_rates)))
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

            optimizer_kind = FLAGS.optimizer
            if optimizer_kind == 'momentum':
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate, FLAGS.momentum)
            elif optimizer_kind == 'adam':
                # Adam is driven by its own flag, not by the schedule above.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=FLAGS.adam_learning_rate,
                    epsilon=FLAGS.adam_epsilon)
            else:
                raise ValueError('Unknown optimizer')

        # A negative delay disables quantization-aware training.
        if FLAGS.quantize_delay_step >= 0:
            if FLAGS.num_clones > 1:
                raise ValueError(
                    "Quantization doesn't support multi-clone yet.")
            contrib_quantize.create_training_graph(
                quant_delay=FLAGS.quantize_delay_step)

        startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

        with tf.device(deploy.variables_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, optimizer)
            total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Scale the gradients of biases and last-layer variables.
            last_layers = model.get_extra_layer_scopes(
                FLAGS.use_rrm, FLAGS.last_layers_contain_logits_only)
            multipliers = train_utils.get_model_gradient_multipliers(
                last_layers, FLAGS.last_layer_gradient_multiplier)
            if multipliers:
                grads_and_vars = slim.learning.multiply_gradients(
                    grads_and_vars, multipliers)

            # The train tensor is the loss, gated on the gradient step and
            # on the first clone's update ops.
            apply_step = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            pending_updates.append(apply_step)
            barrier = tf.group(*pending_updates)
            with tf.control_dependencies([barrier]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Pick up the summaries the first clone created while being built.
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, clone0_scope))
        summary_op = tf.summary.merge(list(summaries))

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)

        # Profiling is enabled only when a profile directory is configured.
        profile_dir = FLAGS.profile_logdir
        profiling = profile_dir is not None
        if profiling:
            tf.gfile.MakeDirs(profile_dir)

        with contrib_tfprof.ProfileContext(enabled=profiling,
                                           profile_dir=profile_dir):
            if FLAGS.tf_initial_checkpoint:
                init_fn = train_utils.get_model_init_fn(
                    FLAGS.train_logdir,
                    FLAGS.tf_initial_checkpoint,
                    FLAGS.initialize_last_layer,
                    last_layers,
                    ignore_missing_vars=True)
            else:
                init_fn = None

            slim.learning.train(
                train_tensor,
                logdir=FLAGS.train_logdir,
                log_every_n_steps=FLAGS.log_steps,
                master=FLAGS.master,
                number_of_steps=FLAGS.training_number_of_steps,
                is_chief=(FLAGS.task == 0),
                session_config=session_config,
                startup_delay_steps=startup_delay_steps,
                init_fn=init_fn,
                summary_op=summary_op,
                save_summaries_secs=FLAGS.save_summaries_secs,
                save_interval_secs=FLAGS.save_interval_secs)
# (variable scope of the pretrained backbone, checkpoint file name inside
# FLAGS.checkpoints_dir)
_INCEPTION_V1_BACKBONE = ('InceptionV1', 'inception_v1.ckpt')
_INCEPTION_V4_BACKBONE = ('InceptionV4', 'inception_v4.ckpt')
_INCEPTION_RESNET_V2_BACKBONE = ('InceptionResnetV2',
                                 'inception_resnet_v2.ckpt')

# Exact FLAGS.model value -> backbone to warm-start from.
_BACKBONE_BY_MODEL = {
    'DCSL': _INCEPTION_RESNET_V2_BACKBONE,
    'DCSL_inception_v1': _INCEPTION_V1_BACKBONE,
    'CoAttention': _INCEPTION_V1_BACKBONE,
    'AttentionBaseModel': _INCEPTION_V1_BACKBONE,
    'CoAttentionBaseModel': _INCEPTION_V1_BACKBONE,
    'MultiHeadCoAttention': _INCEPTION_V1_BACKBONE,
    'MultiHeadAttentionBaseModel': _INCEPTION_V1_BACKBONE,
    'MultiHeadAttentionBaseModel_fixed': _INCEPTION_V1_BACKBONE,
    'MultiHeadAttentionBaseModel_res': _INCEPTION_V1_BACKBONE,
    'MultiHeadAttentionBaseModel_set_share_softmax': _INCEPTION_V1_BACKBONE,
    'CoAttentionBaseModel_v2': _INCEPTION_V1_BACKBONE,
    'MultiHeadCoAttention_inv4': _INCEPTION_V4_BACKBONE,
    'MultiLayerMultiHeadCoAttention': _INCEPTION_V1_BACKBONE,
    'DCSL_inception_v4': _INCEPTION_V4_BACKBONE,
}

# Any model whose name merely *contains* one of these also uses InceptionV1.
_INCEPTION_V1_NAME_FRAGMENTS = ('ParallelAttentionBaseModel',
                                'ContrastiveModel')


def _pretrained_backbone(model_name):
    """Return the (scope, checkpoint file) pair for `model_name`, or None."""
    if model_name in _BACKBONE_BY_MODEL:
        return _BACKBONE_BY_MODEL[model_name]
    if any(fragment in model_name
           for fragment in _INCEPTION_V1_NAME_FRAGMENTS):
        return _INCEPTION_V1_BACKBONE
    return None


def _build_nas_init_fn():
    """Return an init_fn that restores the 'NAS*' variables from FLAGS.weights.

    FLAGS.weights is used unconditionally here, both to filter the variable
    map and as the checkpoint that is restored.
    """
    scope_name = 'NAS'
    var_map = {}
    for variable in tf.global_variables():
        op_name = variable.op.name
        if op_name.startswith(scope_name):
            # Checkpoint keys drop the 'NAS/' prefix and carry an
            # '/ExponentialMovingAverage' suffix.
            ckpt_name = op_name.replace(scope_name + '/', '')
            var_map[ckpt_name + '/ExponentialMovingAverage'] = variable

    available_var_map = (
        variables_helper.get_variables_available_in_checkpoint(
            var_map, FLAGS.weights))
    init_saver = tf.train.Saver(available_var_map)

    def initializer_fn(sess):
        init_saver.restore(sess, FLAGS.weights)

    return initializer_fn


def _build_init_fn():
    """Return the checkpoint-initialisation function for FLAGS.model, or None.

    'DCSL_NAS' always restores from FLAGS.weights. Every other known model is
    warm-started from its pretrained backbone checkpoint, but only when
    FLAGS.weights is None. Unknown models get no init function.
    """
    if FLAGS.model == 'DCSL_NAS':
        return _build_nas_init_fn()

    backbone = _pretrained_backbone(FLAGS.model)
    if backbone is None or FLAGS.weights is not None:
        return None

    scope, checkpoint_file = backbone
    return slim.assign_from_checkpoint_fn(
        os.path.join(FLAGS.checkpoints_dir, checkpoint_file),
        slim.get_model_variables(scope))


def main(_):
    """Build the image-pair training graph and run slim's training loop.

    Settings come from the module-level ``FLAGS``; ``num_samples``,
    ``num_epoches`` and ``logdir`` are read as module-level names.

    Raises:
      ValueError: If ``FLAGS.optimizer`` is neither 'adam' nor 'momentum'.
    """
    with tf.Graph().as_default():
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set()

        num_batches_epoch = num_samples // (FLAGS.batch_size *
                                            FLAGS.num_clones)
        print(num_batches_epoch)

        #######################
        # Config model_deploy #
        #######################
        config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.ps_tasks)

        # Create global_step
        with tf.device(config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        with tf.device(config.inputs_device()):
            dataset = get_split('train', FLAGS.dataset_dir)
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=FLAGS.batch_size * 20,
                common_queue_min=FLAGS.batch_size * 10)
            [image_a, image_b, label] = provider.get(
                ['image_a', 'image_b', 'label'])

            image_a = process_image(image_a)
            image_b = process_image(image_b)
            # tf.train.batch needs fully defined static shapes.
            image_a.set_shape([FLAGS.target_height, FLAGS.target_width, 3])
            image_b.set_shape([FLAGS.target_height, FLAGS.target_width, 3])

            images_a, images_b, labels = tf.train.batch(
                [image_a, image_b, label],
                batch_size=FLAGS.batch_size,
                num_threads=8,
                capacity=FLAGS.batch_size * 10)
            inputs_queue = prefetch_queue([images_a, images_b, labels])

        ######################
        # Select the network #
        ######################
        def model_fn(inputs_queue):
            """Build one clone: dequeue a batch and register its loss."""
            images_a, images_b, labels = inputs_queue.dequeue()
            model = find_class_by_name(FLAGS.model, [models])()
            if 'ContrastiveModel' in FLAGS.model:
                vec_a, vec_b = model.create_model(
                    images_a, images_b, reuse=False, is_training=True)
                contrastive_loss = (
                    tf.contrib.losses.metric_learning.contrastive_loss(
                        labels, vec_a, vec_b))
                tf.losses.add_loss(contrastive_loss)
            else:
                logits = model.create_model(
                    images_a, images_b, reuse=False, is_training=True)
                label_onehot = tf.one_hot(labels, 2)
                # Called for its side effect: tf.losses registers the loss.
                tf.losses.softmax_cross_entropy(
                    onehot_labels=label_onehot, logits=logits)

        clones = model_deploy.create_clones(config, model_fn, [inputs_queue])
        first_clone_scope = clones[0].scope

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(config.optimizer_device()):
            # Step the learning rate down by 10x at 50%, 75% and 90% of the
            # run length computed from num_batches_epoch * num_epoches.
            learning_rate_step_boundaries = [
                int(num_batches_epoch * num_epoches * 0.50),
                int(num_batches_epoch * num_epoches * 0.75),
                int(num_batches_epoch * num_epoches * 0.90)
            ]
            learning_rate_sequence = [
                FLAGS.learning_rate,
                FLAGS.learning_rate * 0.1,
                FLAGS.learning_rate * 0.01,
                FLAGS.learning_rate * 0.001
            ]
            learning_rate = learning_schedules.manual_stepping(
                global_step, learning_rate_step_boundaries,
                learning_rate_sequence)

            if FLAGS.optimizer == 'adam':
                training_optimizer = tf.train.AdamOptimizer(learning_rate)
            elif FLAGS.optimizer == 'momentum':
                training_optimizer = tf.train.MomentumOptimizer(
                    learning_rate, momentum=0.9)
            else:
                # Fail here with a clear message instead of hitting an
                # unbound local further down.
                raise ValueError(
                    'Unknown optimizer: %s' % FLAGS.optimizer)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Ops required to initialize the model from a given checkpoint.
        init_fn = _build_init_fn()

        # compute and update gradients
        with tf.device(config.optimizer_device()):
            if FLAGS.moving_average_decay:
                update_ops.append(
                    variable_averages.apply(moving_average_variables))

            all_trainable = tf.trainable_variables()

            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones,
                training_optimizer,
                regularization_losses=None,
                var_list=all_trainable)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # The train tensor is the loss, gated on the gradient step and
            # every collected update op.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar(loss_tensor.op.name, loss_tensor))
        global_summaries.add(
            tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries created while building the first clone.
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summaries |= global_summaries

        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # GPU settings
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = False

        # Save checkpoints regularly.
        keep_checkpoint_every_n_hours = 2.0
        saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=logdir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            session_config=session_config,
            startup_delay_steps=10,
            summary_op=summary_op,
            init_fn=init_fn,
            number_of_steps=num_batches_epoch * FLAGS.num_epoches,
            save_summaries_secs=240,
            sync_optimizer=None,
            saver=saver)
def _mapper(self, grad, var):
    """Return `grad` wrapped in a `tf.check_numerics` op.

    Args:
      grad: Gradient tensor to check.
      var: Variable the gradient belongs to; only its op name is used, to
        label the check.

    Returns:
      The checked gradient tensor.
    """
    # Asserting that `var` itself is finite (tf.Assert over tf.is_finite)
    # was found to be very slow -- see #3649 -- so only the gradient is
    # checked.
    label = 'CheckGradient-' + var.op.name
    return tf.check_numerics(grad, label)
def main(unused_argv):
    """Build the DeepLab training graph and run slim's training loop.

    Every setting is read from the module-level ``FLAGS`` object.

    Args:
      unused_argv: Ignored.

    Raises:
      AssertionError: If ``FLAGS.train_batch_size`` is not a multiple of the
        number of clones.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Set up deployment (i.e., multi-GPUs and/or multi-replicas).
    config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.num_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Split the batch across GPUs.
    assert FLAGS.train_batch_size % config.num_clones == 0, (
        'Training batch size not divisble by number of clones (GPUs).')
    clone_batch_size = FLAGS.train_batch_size // config.num_clones

    # Get dataset-dependent information.
    dataset = segmentation_dataset.get_dataset(
        FLAGS.dataset, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)

    tf.gfile.MakeDirs(FLAGS.train_logdir)
    tf.logging.info('Training on %s set', FLAGS.train_split)

    with tf.Graph().as_default() as graph:
        with tf.device(config.inputs_device()):
            samples = input_generator.get(
                dataset,
                FLAGS.train_crop_size,
                clone_batch_size,
                min_resize_value=FLAGS.min_resize_value,
                max_resize_value=FLAGS.max_resize_value,
                resize_factor=FLAGS.resize_factor,
                min_scale_factor=FLAGS.min_scale_factor,
                max_scale_factor=FLAGS.max_scale_factor,
                scale_factor_step_size=FLAGS.scale_factor_step_size,
                dataset_split=FLAGS.train_split,
                is_training=True,
                model_variant=FLAGS.model_variant)
            inputs_queue = prefetch_queue.prefetch_queue(
                samples, capacity=128 * config.num_clones)

        # Create the global step on the device storing the variables.
        with tf.device(config.variables_device()):
            global_step = tf.train.get_or_create_global_step()

            # Define the model and create clones.
            model_fn = _build_deeplab
            model_args = (inputs_queue, {
                common.OUTPUT_TYPE: dataset.num_classes
            }, dataset.ignore_label)
            clones = model_deploy.create_clones(
                config, model_fn, args=model_args)

            # Gather update_ops from the first clone. These contain, for
            # example, the updates for the batch_norm variables created by
            # model_fn.
            first_clone_scope = config.clone_scope(0)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                           first_clone_scope)

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Add summaries for model variables. This is done exactly once: a
        # second pass over the same variables would only create duplicate
        # histogram ops under uniquified names.
        for model_var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(model_var.op.name, model_var))

        # Add summaries for images, labels, semantic predictions
        if FLAGS.save_summaries_images:
            summary_image = graph.get_tensor_by_name(
                ('%s/%s:0' % (first_clone_scope, common.IMAGE)).strip('/'))
            summaries.add(
                tf.summary.image('samples/%s' % common.IMAGE, summary_image))

            first_clone_label = graph.get_tensor_by_name(
                ('%s/%s:0' % (first_clone_scope, common.LABEL)).strip('/'))
            # Scale up summary image pixel values for better visualization.
            pixel_scaling = max(1, 255 // dataset.num_classes)
            summary_label = tf.cast(first_clone_label * pixel_scaling,
                                    tf.uint8)
            summaries.add(
                tf.summary.image('samples/%s' % common.LABEL, summary_label))

            first_clone_output = graph.get_tensor_by_name(
                ('%s/%s:0' % (first_clone_scope,
                              common.OUTPUT_TYPE)).strip('/'))
            predictions = tf.expand_dims(tf.argmax(first_clone_output, 3), -1)
            summary_predictions = tf.cast(predictions * pixel_scaling,
                                          tf.uint8)
            summaries.add(
                tf.summary.image('samples/%s' % common.OUTPUT_TYPE,
                                 summary_predictions))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Build the optimizer based on the device specification.
        with tf.device(config.optimizer_device()):
            learning_rate = train_utils.get_model_learning_rate(
                FLAGS.learning_policy,
                FLAGS.base_learning_rate,
                FLAGS.learning_rate_decay_step,
                FLAGS.learning_rate_decay_factor,
                FLAGS.training_number_of_steps,
                FLAGS.learning_power,
                FLAGS.slow_start_step,
                FLAGS.slow_start_learning_rate)
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   FLAGS.momentum)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

        with tf.device(config.variables_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, optimizer)
            total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Modify the gradients for biases and last layer variables.
            last_layers = model.get_extra_layer_scopes(
                FLAGS.last_layers_contain_logits_only)
            grad_mult = train_utils.get_model_gradient_multipliers(
                last_layers, FLAGS.last_layer_gradient_multiplier)
            if grad_mult:
                grads_and_vars = slim.learning.multiply_gradients(
                    grads_and_vars, grad_mult)

            # Create gradient update op. The train tensor is the loss, gated
            # on the gradient step and the first clone's update ops.
            grad_updates = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries))

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        # Start the training.
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_logdir,
            log_every_n_steps=FLAGS.log_steps,
            master=FLAGS.master,
            number_of_steps=FLAGS.training_number_of_steps,
            is_chief=(FLAGS.task == 0),
            session_config=session_config,
            startup_delay_steps=startup_delay_steps,
            init_fn=train_utils.get_model_init_fn(
                FLAGS.train_logdir,
                FLAGS.tf_initial_checkpoint,
                FLAGS.initialize_last_layer,
                last_layers,
                ignore_missing_vars=True),
            summary_op=summary_op,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs)
def _policy_loss(self, mean, logstd, old_mean, old_logstd, action,
                 advantage, length):
    """Build the policy loss from its three components.

    1. A surrogate policy-gradient term, importance weighted by the ratio of
       the current policy's action density to that of the behavioral policy.
    2. A KL penalty between the behavioral policy and the current policy,
       scaled by `self._penalty`.
    3. A quadratic cutoff term that is active only where the KL exceeds
       `kl_target * kl_cutoff_factor`.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    settings = self._config
    with tf.name_scope('policy_loss'):
        entropy = utility.diag_normal_entropy(mean, logstd)

        # Masked KL(old || new), averaged along axis 1.
        stepwise_kl = utility.diag_normal_kl(
            old_mean, old_logstd, mean, logstd)
        kl = tf.reduce_mean(self._mask(stepwise_kl, length), 1)

        # Importance ratio pi_new(a) / pi_old(a), computed in log space.
        new_logpdf = utility.diag_normal_logpdf(mean, logstd, action)
        old_logpdf = utility.diag_normal_logpdf(old_mean, old_logstd, action)
        policy_gradient = tf.exp(new_logpdf - old_logpdf)

        # No gradient flows into the advantage estimate.
        fixed_advantage = tf.stop_gradient(advantage)
        weighted_advantage = self._mask(
            policy_gradient * fixed_advantage, length)
        surrogate_loss = -tf.reduce_mean(weighted_advantage, 1)

        kl_penalty = self._penalty * kl

        cutoff_threshold = settings.kl_target * settings.kl_cutoff_factor
        cutoff_count = tf.reduce_sum(
            tf.cast(kl > cutoff_threshold, tf.int32))

        def report_cutoff():
            return tf.Print(0, [cutoff_count], 'kl cutoff! ')

        # `int` is the no-op branch: calling it yields the constant 0.
        maybe_report = tf.cond(cutoff_count > 0, report_cutoff, int)
        with tf.control_dependencies([maybe_report]):
            cutoff_scale = (settings.kl_cutoff_coef *
                            tf.cast(kl > cutoff_threshold, tf.float32))
            squared_excess = (kl - cutoff_threshold) ** 2
            kl_cutoff = cutoff_scale * squared_excess

        policy_loss = surrogate_loss + kl_penalty + kl_cutoff

        histograms = [
            tf.summary.histogram('entropy', entropy),
            tf.summary.histogram('kl', kl),
            tf.summary.histogram('surrogate_loss', surrogate_loss),
            tf.summary.histogram('kl_penalty', kl_penalty),
            tf.summary.histogram('kl_cutoff', kl_cutoff),
            tf.summary.histogram('kl_penalty_combined',
                                 kl_penalty + kl_cutoff),
            tf.summary.histogram('policy_loss', policy_loss),
        ]
        scalars = [
            tf.summary.scalar('avg_surr_loss',
                              tf.reduce_mean(surrogate_loss)),
            tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
            tf.summary.scalar('avg_policy_loss',
                              tf.reduce_mean(policy_loss)),
        ]
        summary = tf.summary.merge(histograms + scalars)

        mean_loss = tf.reduce_mean(policy_loss, 0)
        return tf.check_numerics(mean_loss, 'policy_loss'), summary
def create_accum_train_op(total_loss,
                          optimizer,
                          optimize_every,
                          global_step=_USE_GLOBAL_STEP,
                          variables_to_train=None,
                          transform_grads_fn=None,
                          summarize_gradients=False,
                          gate_gradients=_GATE_OP,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          check_numerics=True):
    """Create a train op that accumulates gradients across several batches.

    Each run adds `grad / optimize_every` to a per-variable accumulator. On
    runs where `accum_steps % optimize_every == 0` the accumulated gradients
    are also applied through `optimizer` and the accumulators are reset to
    zero.

    Args:
      total_loss: Loss tensor to differentiate.
      optimizer: Optimizer used to compute and apply the gradients.
      optimize_every: Accumulation period; also the divisor applied to every
        gradient before it is accumulated.
      global_step: Step variable passed to `apply_gradients`. When left at
        `_USE_GLOBAL_STEP`, the global step is fetched or created.
      variables_to_train: Variables to optimize; defaults to
        `tf.trainable_variables()`.
      transform_grads_fn: Optional function applied to the list of
        (gradient, variable) pairs.
      summarize_gradients: Whether to add gradient summaries.
      gate_gradients: Forwarded to `optimizer.compute_gradients`.
      aggregation_method: Forwarded to `optimizer.compute_gradients`.
      colocate_gradients_with_ops: Forwarded to
        `optimizer.compute_gradients`.
      check_numerics: Whether to wrap `total_loss` in `tf.check_numerics`.

    Returns:
      The tensor produced by the accumulate-or-optimize `tf.cond`; both
      branches yield the value of `total_loss`.

    Raises:
      RuntimeError: If the UPDATE_OPS collection is not empty.
      AssertionError: If `variables_to_train` is empty or contains a variable
        that is not in `tf.trainable_variables()`.
    """
    if global_step is _USE_GLOBAL_STEP:
        global_step = get_or_create_global_step()
    tf.summary.scalar("optimizer/global_step", global_step)

    # Counter of batches seen so far, plus the op that increments it.
    accum_steps = tf.get_variable(
        "accum_steps", initializer=tf.constant(0), trainable=False)
    tf.summary.scalar("optimizer/accum_steps", accum_steps)
    update_accum_steps = tf.assign_add(accum_steps, 1)

    # Extra update ops are rejected outright rather than silently skipped.
    if tf.get_collection(tf.GraphKeys.UPDATE_OPS):
        raise RuntimeError("Additional update operations (eg, BatchNorm statistics updates) "
                           "are not supported while accumulating gradients!")

    if variables_to_train is None:
        variables_to_train = tf.trainable_variables()
    else:
        # Every requested variable must actually be trainable.
        assert all(candidate in tf.trainable_variables()
                   for candidate in variables_to_train)
    assert variables_to_train

    with tf.name_scope('train_op'):
        # Make sure total_loss is valid.
        if check_numerics:
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan')

    # compute_gradients adds the gradient computation to the current graph;
    # the parameter update itself is added later by apply_gradients.
    grads = optimizer.compute_gradients(
        total_loss,
        variables_to_train,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops)
    if transform_grads_fn:
        grads = transform_grads_fn(grads)

    # One float32 accumulator per trainable variable.
    # NOTE: the variables are initialized in a throwaway session so their
    # shapes can be read at run time, since some of them (CudnnLSTM) don't
    # have a static shape yet.
    accum_grads = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for variable in variables_to_train:
            runtime_shape = sess.run(tf.shape(variable))
            accum_grads.append(
                tf.Variable(tf.zeros(shape=runtime_shape, dtype=tf.float32),
                            trainable=False))

    # Accumulation runs after the batch counter has been incremented.
    accum_ops = []
    with tf.control_dependencies([update_accum_steps, total_loss]):
        for index, (gradient, _) in enumerate(grads):
            if gradient is None:
                continue
            accum_ops.append(
                accum_grads[index].assign_add(gradient / optimize_every))
    accum_grads_and_vars = zip(accum_grads, variables_to_train)

    if summarize_gradients:
        with tf.name_scope('summarize_grads'):
            add_gradients_summaries(grads)

    def optimize_branch():
        # accumulate -> apply accumulated gradients -> zero accumulators
        with tf.control_dependencies(accum_ops):
            apply_grads_op = optimizer.apply_gradients(
                accum_grads_and_vars, global_step=global_step)
            with tf.control_dependencies([apply_grads_op]):
                # Resetting must always happen after the parameter update.
                reset_ops = [accumulator.assign(tf.zeros_like(accumulator))
                             for accumulator in accum_grads]
                with tf.control_dependencies(reset_ops):
                    return tf.identity(total_loss)

    def accum_branch():
        # accumulate only
        with tf.control_dependencies(accum_ops):
            return tf.identity(total_loss)

    # The overall training step.
    should_optimize = tf.equal(accum_steps % optimize_every, 0)
    accum_or_train_op = tf.cond(should_optimize, optimize_branch,
                                accum_branch)

    # Make this op the only entry of the TRAIN_OP collection.
    train_ops = tf.get_collection_ref(tf.GraphKeys.TRAIN_OP)
    if train_ops:
        tf.logging.warning('Emptying the tf.GraphKeys.TRAIN_OP variable')
        del train_ops[:]
    train_ops.append(accum_or_train_op)

    return accum_or_train_op
def inference(inputs, num_classes, training=False, name='unet'):
    """Build the U-Net style encoder/decoder graph and return per-pixel logits.

    Five ReLU convolutions (the last one with stride 2) are followed by five
    transposed convolutions. After every transposed convolution except the
    last, the result is concatenated on axis 1 with an earlier encoder
    feature map and fused by a 'SAME' 3x3 convolution.

    Args:
        inputs: Tensor fed to the first convolution. Presumably NCHW, since
            the skip connections concatenate on axis 1 and the transposed
            convolutions are built with data_format='NCHW' -- TODO confirm
            against `conv2d`.
        num_classes: Number of output channels of the last two layers.
        training: Not used by this function; kept so existing callers that
            pass it keep working.
        name: Variable scope under which every layer is created.

    Returns:
        The output of the last convolution, transposed with
        perm=[0, 2, 3, 1] and wrapped in `tf.check_numerics`.
    """

    def relu_conv(x, kernel, out_channels, name, stride=1, padding='VALID'):
        # Thin wrapper: every convolution in this network uses ReLU.
        return conv2d(
            x, kernel=kernel, out_channels=out_channels, stride=stride,
            padding=padding, activation_fn=tf.nn.relu, name=name)

    def relu_deconv(x, kernel, out_channels, name, stride=1):
        # Thin wrapper: every transposed convolution is NCHW with ReLU.
        return deconv(
            x, kernel=kernel, out_channels=out_channels, stride=stride,
            data_format='NCHW', activation_fn=tf.nn.relu, name=name)

    with tf.variable_scope(name):
        # ---- Encoder -----------------------------------------------------
        conv1 = relu_conv(inputs, 5, 128, 'relu_conv1')
        print('conv1 shape: %s' % conv1.get_shape())

        conv11 = relu_conv(conv1, 3, 128, 'relu_conv11')
        print('conv11 shape: %s' % conv11.get_shape())

        conv12 = relu_conv(conv11, 3, 128, 'relu_conv12')
        print('conv12 shape: %s' % conv12.get_shape())

        conv2 = relu_conv(conv12, 3, 128, 'relu_conv2')
        print('conv2 shape: %s' % conv2.get_shape())

        # The only strided layer of the encoder.
        conv3 = relu_conv(conv2, 3, 128, 'relu_conv3', stride=2)
        print('conv3 shape: %s' % conv3.get_shape())
        conv3 = tf.check_numerics(conv3, message="nan or inf from: conv3")

        # ---- Decoder with skip connections -------------------------------
        deconv1 = relu_deconv(conv3, 4, 128, 'deconv1', stride=2)
        print('deconv1 shape: %s' % deconv1.get_shape())
        concat1 = tf.concat([conv2, deconv1], axis=1, name='concat1')
        deconv1_conv = relu_conv(
            concat1, 3, 128, 'deconv1_conv', padding='SAME')

        deconv2 = relu_deconv(deconv1_conv, 3, 128, 'deconv2')
        print('deconv2 shape: %s' % deconv2.get_shape())
        concat2 = tf.concat([conv12, deconv2], axis=1, name='concat2')
        deconv2_conv = relu_conv(
            concat2, 3, 128, 'deconv2_conv', padding='SAME')

        deconv31 = relu_deconv(deconv2_conv, 3, 128, 'deconv31')
        print('deconv31 shape: %s' % deconv31.get_shape())
        concat31 = tf.concat([conv11, deconv31], axis=1, name='concat31')
        deconv31_conv = relu_conv(
            concat31, 3, 128, 'deconv31_conv', padding='SAME')

        deconv32 = relu_deconv(deconv31_conv, 3, 128, 'deconv32')
        print('deconv32 shape: %s' % deconv32.get_shape())
        # Previously named 'concat31' as well (copy-paste), which made
        # TensorFlow silently rename this op to 'concat31_1'.
        concat32 = tf.concat([conv1, deconv32], axis=1, name='concat32')
        deconv32_conv = relu_conv(
            concat32, 3, 128, 'deconv32_conv', padding='SAME')

        # ---- Classification head -----------------------------------------
        deconv3 = relu_deconv(deconv32_conv, 5, num_classes, 'deconv3')
        deconv3_conv = relu_conv(
            deconv3, 3, num_classes, 'deconv3_conv', padding='SAME')

        # Move axis 1 to the end before returning.
        label_logits = tf.transpose(deconv3_conv, perm=[0, 2, 3, 1])
        label_logits = tf.check_numerics(
            label_logits, message="nan or inf from: label_logits")
        print('label_logits shape: %s' % label_logits.get_shape())
        return label_logits
def __init__(self,
             train_batch_size=4096,
             test_chain_batch_size=4096,
             bijector="iaf",
             log_dir="/tmp/neutra",
             base_learning_rate=1e-3,
             q_base_scale=1.,
             learning_rate_schedule=None):
    """Build the full graph: target, bijector, test statistics and training.

    Args:
        train_batch_size: Number of samples drawn from the training
            distribution per step (cast to int).
        test_chain_batch_size: Default value of the `test_chain_batch_size`
            placeholder passed to `MakeNeuTra` as `batch_size`.
        bijector: One of "rnvp", "iaf" or "affine"; any other value falls
            back to an identity bijector.
        log_dir: Stored on `self.log_dir`; not otherwise used here.
        base_learning_rate: Learning rate before the schedule factor.
        q_base_scale: Scale of the diagonal normal base distribution.
        learning_rate_schedule: Sequence of `[step, factor]` pairs for
            `tf.train.piecewise_constant`; the factor before the first step
            is 1.0. Defaults to `[[6000, 1e-1]]`.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if learning_rate_schedule is None:
        learning_rate_schedule = [[6000, 1e-1]]

    target, target_spec = GetTargetSpec()
    self.target = target
    self.target_spec = target_spec
    # The training target is built under the "train" gin scope.
    with gin.config_scope("train"):
        train_target, train_target_spec = GetTargetSpec()
        self.train_target = train_target
        self.train_target_spec = train_target_spec

    # ---- Bijector ---------------------------------------------------------
    bijector_makers = {
        "rnvp": MakeRNVPBijectorFn,
        "iaf": MakeIAFBijectorFn,
        "affine": MakeAffineBijectorFn,
    }
    if bijector in bijector_makers:
        # One template, called twice below with train=True and train=False.
        bijector_fn = tf.make_template(
            "bijector", bijector_makers[bijector],
            num_dims=self.target_spec.num_dims)
    else:
        bijector_fn = lambda *args, **kwargs: tfb.Identity()

    self.train_bijector = bijector_fn(train=True)
    self.bijector = bijector_fn(train=False)
    if train_target_spec.bijector is not None:
        print("Using train target bijector")
        self.train_bijector = tfb.Chain(
            [train_target_spec.bijector, self.train_bijector])
    if target_spec.bijector is not None:
        print("Using target bijector")
        self.bijector = tfb.Chain([target_spec.bijector, self.bijector])

    q_base = tfd.Independent(
        tfd.Normal(
            loc=tf.zeros(self.target_spec.num_dims),
            scale=q_base_scale * tf.ones(self.target_spec.num_dims)), 1)
    self.q_x_train = tfd.TransformedDistribution(q_base, self.train_bijector)
    self.q_x = tfd.TransformedDistribution(q_base, self.bijector)

    # ---- Params -----------------------------------------------------------
    self.train_batch_size = int(train_batch_size)
    self.test_chain_batch_size = tf.placeholder_with_default(
        test_chain_batch_size, [], "test_chain_batch_size")
    self.test_batch_size = tf.placeholder_with_default(
        16384 * 8, [], "test_batch_size")
    self.test_num_steps = tf.placeholder_with_default(
        1000, [], "test_num_steps")
    self.test_num_leapfrog_steps = tf.placeholder_with_default(
        tf.to_int32(2), [], "test_num_leapfrog_steps")
    self.test_step_size = tf.placeholder_with_default(
        0.1, [], "test_step_size")

    # ---- Test -------------------------------------------------------------
    self.neutra_outputs = MakeNeuTra(
        target=self.target,
        q=self.q_x,
        batch_size=self.test_chain_batch_size,
        num_steps=self.test_num_steps,
        num_leapfrog_steps=self.test_num_leapfrog_steps,
        step_size=self.test_step_size,
    )
    # Flatten the chain to [-1, num_dims], invert the bijector, then restore
    # the chain's original shape.
    self.z_chain = tf.reshape(
        self.bijector.inverse(
            tf.reshape(self.neutra_outputs.x_chain,
                       [-1, self.target_spec.num_dims])),
        tf.shape(self.neutra_outputs.x_chain))
    self.target_samples = self.target.sample(self.test_batch_size)
    self.target_z = self.bijector.inverse(self.target_samples)
    self.q_samples = self.q_x.sample(self.test_batch_size)

    self.target_cov = utils.Covariance(self.target_samples)
    self.target_eigvals, self.target_eigvecs = tf.linalg.eigh(
        self.target_cov)
    self.cached_target_eigvals = tf.get_local_variable(
        "cached_target_eigvals",
        self.target_eigvals.shape,
        initializer=tf.zeros_initializer())
    self.cached_target_eigvecs = tf.get_local_variable(
        "cached_target_eigvecs",
        self.target_eigvecs.shape,
        initializer=tf.zeros_initializer())
    self.cached_target_stats_update_op = [
        self.cached_target_eigvals.assign(self.target_eigvals),
        self.cached_target_eigvecs.assign(self.target_eigvecs),
        tf.print("Assigning target stats")
    ]

    # Optional statistics; not in `functions` below, so no ops are created
    # for them unless they are added there.
    def variance(x):
        x -= tf.reduce_mean(x, 0, keepdims=True)
        x = tf.square(x)
        return x

    def rotated_variance(x):
        x2 = tf.reshape(x, [-1, self.target_spec.num_dims])
        x2 -= tf.reduce_mean(x2, 0, keepdims=True)
        x2 = tf.matmul(x2, self.cached_target_eigvecs)
        x2 = tf.square(x2)
        return tf.reshape(x2, tf.shape(x))

    # (name, transform) pairs whose expectations are tracked. To also track
    # the helpers above, add ("var", variance) or
    # ("rot_var", rotated_variance).
    functions = [
        ("mean", tf.identity),
        ("square", tf.square),
    ]

    self.cached_target_mean = {}
    self.cached_target_mean_update_op = [
        tf.print("Assigning target means.")
    ]
    self.neutra_stats = {}
    self.q_stats = {}
    for name, f in functions:
        target_mean = tf.reduce_mean(f(self.target_samples), 0)
        cached_target_mean = tf.get_local_variable(name + "_cached_mean",
                                                   target_mean.shape)
        # Use the stats supplied by the target spec when it has them,
        # otherwise the Monte Carlo estimate computed above.
        if self.target_spec.stats is not None:
            self.cached_target_mean_update_op.append(
                cached_target_mean.assign(self.target_spec.stats[name]))
        else:
            self.cached_target_mean_update_op.append(
                cached_target_mean.assign(target_mean))
        self.cached_target_mean[name] = cached_target_mean
        self.q_stats[name] = ComputeQStats(f(self.q_samples),
                                           cached_target_mean)
        self.neutra_stats[name] = ComputeChainStats(
            f(self.neutra_outputs.x_chain), cached_target_mean,
            self.test_num_leapfrog_steps)

    # ---- Training ---------------------------------------------------------
    self.train_q_samples = self.q_x_train.sample(self.train_batch_size)
    self.train_log_q_x = self.q_x_train.log_prob(self.train_q_samples)
    # Sample mean of log q(x) - log p(x) over samples drawn from q.
    self.kl_q_p = tf.reduce_mean(
        self.train_log_q_x - self.target.log_prob(self.train_q_samples))

    loss = self.kl_q_p
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    if reg_losses:
        tf.logging.info("Regularizing.")
        loss += tf.add_n(reg_losses)
    self.loss = tf.check_numerics(loss, "Loss has NaNs")

    self.global_step = tf.train.get_or_create_global_step()
    steps, factors = list(zip(*learning_rate_schedule))
    learning_rate = base_learning_rate * tf.train.piecewise_constant(
        self.global_step, steps, [1.0] + list(factors))

    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.train_op = opt.minimize(self.loss, global_step=self.global_step)

    tf.summary.scalar("kl_q_p", self.kl_q_p)
    tf.summary.scalar("loss", self.loss)

    self.init = [
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
        tf.print("Initializing variables")
    ]

    self.saver = tf.train.Saver()
    self.log_dir = log_dir
def main(args):
    """Train the capsule network on the dataset named by ``args[1]``.

    Builds the input pipeline and the training graph, restores the latest
    checkpoint found under ``cfg.logdir/caps/<dataset>/`` and then runs the
    training loop, annealing the margin ``m`` fed to the spread loss and
    saving checkpoints.

    Args:
        args: Two-element sequence; ``args[1]`` is the dataset name.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))
    start_epoch = 0

    # Set reproducible random seed.
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Global step.
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0),
            trainable=False)

        # Batches per epoch.
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        # ---- change here ----
        # Exponentially decayed learning rate, floored at 1e-5.
        # NOTE(review): this value is only written to a summary; the
        # optimizer below is built with its default learning rate.
        lrn_rate = tf.maximum(tf.train.exponential_decay(
            1e-1, global_step, num_batches_per_epoch, 0.9), 1e-5)
        # ---- end ----
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        # Get a batch from the data queue.
        batch_x, batch_labels = create_inputs()

        # Define the dataflow graph. `m_op` carries the margin fed at every
        # step of the loop below.
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/gpu:0'):
                # Inputs scaled by 1/255; passed to the spread loss, which
                # also returns the term summarised below as
                # 'reconstruction_loss'.
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(
                    batch_x, center=False, is_training=True, trainable=True)
                output, pose_out = net.build_arch(
                    batch_x, coord_add, is_train=True,
                    num_classes=num_classes)
                tf.logging.debug(pose_out.get_shape())
                loss, spread_loss, mse, _ = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)

            # Compute gradients and make NaN/Inf in them (or in the loss)
            # raise InvalidArgumentError, which the loop below catches.
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad = opt.compute_gradients(loss)
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

        # Apply gradients only after the numeric checks and the collected
        # UPDATE_OPS have run.
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        # Session settings.
        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={'allow_growth': 1,
                         'visible_device_list': '0'},
            allow_soft_placement=True)
        sess = tf.Session(config=session_config)
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Saver; skip the optimizer's slot variables.
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=5)

        # Display parameter counts.
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list())
            for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # ---- change here ----
        # Resume from the latest checkpoint. `latest_checkpoint` returns
        # None when the directory holds no checkpoint; handing that to
        # `saver.restore` fails with an unrelated-looking error, so fall
        # back to training from scratch instead.
        ckpt_dir = os.path.join(
            cfg.logdir + '/caps/{}/'.format(dataset_name))
        model_file = tf.train.latest_checkpoint(ckpt_dir)
        if model_file is None:
            logger.warning(
                'No checkpoint found in {}; training from scratch.'.format(
                    ckpt_dir))
        else:
            saver.restore(sess, model_file)
            # Hard-coded epoch to resume from; adjust per checkpoint.
            start_epoch = 151
        # ---- end ----

        # Summary op.
        summary_op = tf.summary.merge_all()

        # Start the coordinator and queue runners feeding `create_inputs`.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Summary writer.
        train_log_dir = cfg.logdir + '/caps/{}/train_log/'.format(
            dataset_name)
        if not os.path.exists(train_log_dir):
            os.makedirs(train_log_dir)
        summary_writer = tf.summary.FileWriter(
            train_log_dir, graph=sess.graph)

        # Main loop. `m` is annealed from m_min towards m_max.
        m_min = 0.2
        m_max = 0.9
        m = m_min
        first_step = start_epoch * num_batches_per_epoch
        end_step = cfg.epoch * num_batches_per_epoch + 1
        for step in range(first_step, end_step):
            tic = time.time()
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishs in ' % step +
                            '%f second' % (time.time() - tic) +
                            ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                # Flush pending summaries before leaving.
                summary_writer.close()
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # Raised by the check_numerics ops above; skip this batch.
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                # Write to summary.
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                # Epoch-wise linear annealing.
                # NOTE(review): with true division this is an exact multiple
                # of 5 only on every 5th epoch boundary; with Python 2
                # integer division it would hold for every step of such an
                # epoch -- confirm which is intended.
                epoch_th = step / num_batches_per_epoch
                if (epoch_th % 5) == 0:
                    print('epoch_th: %d ' % epoch_th)
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    # Save model periodically.
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        # Make sure the last summaries reach disk.
        summary_writer.close()
def main(unused_argv):
    """Build the classification graph, then train and validate it.

    Each epoch runs `tr_batches` training steps on augmented batches. On
    epochs that are a multiple of `FLAGS.verification_cycle`, the validation
    set is evaluated `FLAGS.num_tta` times with random augmentation, the
    logits are averaged (test-time augmentation) and a checkpoint is saved.

    Args:
        unused_argv: Ignored.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    labels = FLAGS.labels.split(',')
    num_classes = len(labels)

    tf.gfile.MakeDirs(FLAGS.train_logdir)
    tf.logging.info('Creating train logdir: %s', FLAGS.train_logdir)

    with tf.Graph().as_default() as graph:
        global_step = tf.train.get_or_create_global_step()

        X = tf.placeholder(
            tf.float32, [None, FLAGS.height, FLAGS.width, 3], name='X')
        ground_truth = tf.placeholder(tf.int64, [None], name='ground_truth')
        is_training = tf.placeholder(tf.bool, name='is_training')
        keep_prob = tf.placeholder(tf.float32, [], name='keep_prob')

        # Model with the squeeze-and-excitation attention module.
        logits, end_points = model.hcd_model(
            X,
            num_classes=num_classes,
            is_training=is_training,
            keep_prob=keep_prob,
            attention_module='se_block')

        # Print name and shape of each tensor.
        tf.logging.info("++++++++++++++++++++++++++++++++++")
        tf.logging.info("Layers")
        tf.logging.info("++++++++++++++++++++++++++++++++++")
        for v in end_points.values():
            tf.logging.info('name = %s, shape = %s' % (v.name, v.get_shape()))

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        prediction = tf.argmax(logits, axis=1, name='prediction')
        correct_prediction = tf.equal(prediction, ground_truth)
        confusion_matrix = tf.confusion_matrix(
            ground_truth, prediction, num_classes=num_classes)
        accuracy = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32), name='accuracy')
        summaries.add(tf.summary.scalar('accuracy', accuracy))

        # Define the loss; tf.losses registers it in the LOSSES collection.
        tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth, logits=logits)

        # Gather update_ops. These contain, for example, the updates for the
        # batch_norm variables created by the model.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        learning_rate = train_utils.get_model_learning_rate(
            FLAGS.learning_policy, FLAGS.base_learning_rate,
            FLAGS.learning_rate_decay_step, FLAGS.learning_rate_decay_factor,
            FLAGS.training_number_of_steps, FLAGS.learning_power,
            FLAGS.slow_start_step, FLAGS.slow_start_learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        total_loss, grads_and_vars = train_utils.optimize(optimizer)
        total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Histogram of every gradient. Variables without a gradient come
        # back as (None, var) and cannot be summarised, so skip them.
        grad_summ_op = tf.summary.merge([
            tf.summary.histogram("%s-grad" % var.name, grad)
            for grad, var in grads_and_vars
            if grad is not None
        ])

        # Create gradient update op and group it with the other update ops
        # so that fetching `train_op` runs all of them.
        grad_updates = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Add the summaries created since the initial gathering above.
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries))
        train_writer = tf.summary.FileWriter(FLAGS.summaries_dir, graph)
        validation_writer = tf.summary.FileWriter(
            FLAGS.summaries_dir + '/validation', graph)

        ###############
        # Prepare data
        ###############
        # Training dataset. The same filename placeholder feeds both
        # datasets; it is bound when each iterator is initialised.
        tfrecord_filenames = tf.placeholder(tf.string, shape=[])
        dataset = data.Dataset(tfrecord_filenames,
                               FLAGS.batch_size,
                               FLAGS.how_many_training_epochs,
                               FLAGS.height,
                               FLAGS.width)
        iterator = dataset.dataset.make_initializable_iterator()
        next_batch = iterator.get_next()

        # Validation dataset.
        val_dataset = val_data.Dataset(tfrecord_filenames,
                                       FLAGS.val_batch_size,
                                       FLAGS.height,
                                       FLAGS.width)
        val_iterator = val_dataset.dataset.make_initializable_iterator()
        val_next_batch = val_iterator.get_next()

        sess_config = tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True))
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())

            # Create a saver object which will save all the variables.
            saver = tf.train.Saver()
            if FLAGS.saved_checkpoint_dir:
                # NOTE(review): the flag tested is `saved_checkpoint_dir`
                # but the checkpoint is looked up in `train_logdir` --
                # confirm this is intended.
                if tf.gfile.IsDirectory(FLAGS.train_logdir):
                    checkpoint_path = tf.train.latest_checkpoint(
                        FLAGS.train_logdir)
                else:
                    checkpoint_path = FLAGS.train_logdir
                saver.restore(sess, checkpoint_path)

            if FLAGS.pre_trained_checkpoint:
                train_utils.restore_fn(FLAGS)

            start_epoch = 0
            # Number of training/validation steps per epoch, rounded up so
            # the final partial batch is included.
            tr_batches = int(PCAM_TRAIN_DATA_SIZE / FLAGS.batch_size)
            if PCAM_TRAIN_DATA_SIZE % FLAGS.batch_size > 0:
                tr_batches += 1
            val_batches = int(PCAM_VALIDATE_DATA_SIZE / FLAGS.val_batch_size)
            if PCAM_VALIDATE_DATA_SIZE % FLAGS.val_batch_size > 0:
                val_batches += 1

            # The filenames argument to the TFRecordDataset initializer can
            # be a string, a list of strings, or a tf.Tensor of strings.
            train_record_filenames = os.path.join(
                FLAGS.dataset_dir, 'train.record')
            validate_record_filenames = os.path.join(
                FLAGS.dataset_dir, 'validate.record')

            ############################
            # Training loop.
            ############################
            for num_epoch in range(start_epoch,
                                   FLAGS.how_many_training_epochs):
                print("------------------------------------")
                print(" Epoch {} ".format(num_epoch))
                print("------------------------------------")

                sess.run(
                    iterator.initializer,
                    feed_dict={tfrecord_filenames: train_record_filenames})
                for step in range(tr_batches):
                    train_batch_xs, train_batch_ys = sess.run(next_batch)
                    augmented_batch_xs = aug_utils.aug(train_batch_xs)

                    # Run the graph with this batch of training data and
                    # learning rate policy.
                    lr, train_summary, train_accuracy, train_loss, \
                        grad_vals, _ = sess.run(
                            [learning_rate, summary_op, accuracy,
                             total_loss, grad_summ_op, train_op],
                            feed_dict={
                                X: augmented_batch_xs,
                                ground_truth: train_batch_ys,
                                is_training: True,
                                keep_prob: 0.8
                            })
                    train_writer.add_summary(train_summary, num_epoch)
                    train_writer.add_summary(grad_vals, num_epoch)
                    tf.logging.info(
                        'Epoch #%d, Step #%d, rate %.10f, accuracy %.1f%%, '
                        'loss %f' % (num_epoch, step, lr,
                                     train_accuracy * 100, train_loss))

                # Validate (and checkpoint) only every verification_cycle
                # epochs.
                if num_epoch % FLAGS.verification_cycle != 0:
                    continue

                ###################################################
                # Validate the model on the validation set
                ###################################################
                tf.logging.info('--------------------------')
                tf.logging.info(' Start validation ')
                tf.logging.info('--------------------------')

                # Test Time Augmentation (TTA): one full pass over the
                # validation set per round, logits averaged afterwards.
                predictions = []
                for _ in range(FLAGS.num_tta):
                    # Reinitialize iterator with the validation dataset.
                    sess.run(val_iterator.initializer,
                             feed_dict={
                                 tfrecord_filenames:
                                     validate_record_filenames
                             })

                    total_val_accuracy = 0
                    validation_count = 0
                    total_conf_matrix = None
                    batch_pred = []
                    batch_y = []
                    for step in range(val_batches):
                        validation_batch_xs, validation_batch_ys = sess.run(
                            val_next_batch)
                        # Random augmentation for TTA.
                        augmented_val_batch_xs = aug_utils.aug(
                            validation_batch_xs)

                        val_summary, val_accuracy, val_logit, conf_matrix = \
                            sess.run(
                                [summary_op, accuracy, logits,
                                 confusion_matrix],
                                feed_dict={
                                    X: augmented_val_batch_xs,
                                    ground_truth: validation_batch_ys,
                                    is_training: False,
                                    keep_prob: 1.0
                                })
                        validation_writer.add_summary(val_summary, num_epoch)

                        total_val_accuracy += val_accuracy
                        validation_count += 1
                        if total_conf_matrix is None:
                            total_conf_matrix = conf_matrix
                        else:
                            total_conf_matrix += conf_matrix

                        batch_pred.extend(val_logit)
                        batch_y.extend(validation_batch_ys)

                    total_val_accuracy /= validation_count
                    tf.logging.info(
                        'Confusion Matrix:\n %s' % (total_conf_matrix))
                    tf.logging.info(
                        'Validation accuracy = %.1f%% (N=%d)' %
                        (total_val_accuracy * 100, PCAM_VALIDATE_DATA_SIZE))

                    predictions.append(batch_pred)

                # Average logits over the TTA rounds. `batch_y` holds the
                # labels of the last round; this assumes every round yields
                # the validation examples in the same order -- TODO confirm.
                pred = np.mean(predictions, axis=0)
                tta_accuracy = np.mean(
                    np.equal(batch_y, np.argmax(pred, axis=-1)))
                tf.logging.info('Test Time Accuracy: %.5f' % tta_accuracy)

                # Save the model checkpoint. (The former guard
                # `num_epoch <= how_many_training_epochs - 1` was always
                # true inside this range loop.)
                checkpoint_path = os.path.join(FLAGS.train_logdir,
                                               FLAGS.ckpt_name_to_save)
                tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                                num_epoch)
                saver.save(sess, checkpoint_path, global_step=num_epoch)

        # Flush and release the event files.
        train_writer.close()
        validation_writer.close()
def build_loss(self, preds, targets, genome=None, target_subset=None):
    """Convert per-location real-valued predictions to a loss.

    Args:
        preds: Predictions tensor; gathered on axis 2 to the selected
            genome's targets when `target_subset` is None.
        targets: Targets tensor; sliced on axis 1 to drop the batch buffer
            and gathered on axis 2 like `preds` (or by `target_subset`).
        genome: Tensor whose element [0, 0] is the genome index of the
            batch. If it is None, or cannot be indexed that way, genome 0
            is used.
        target_subset: Optional indices of targets to keep. When given,
            only `targets` is gathered and genome slicing is skipped.

    Returns:
        Tuple `(loss_op, target_losses, targets, preds)`: the scalar loss
        including regularization, the per-target loss, and the processed
        targets and predictions.

    Raises:
        ValueError: If `self.hp.loss` is not 'gaussian', 'poisson' or
            'cross_entropy'.
    """
    ##################################################
    # slice positions

    # Drop `batch_buffer` positions at each end, in pooled units.
    tstart = self.hp.batch_buffer // self.hp.target_pool
    tend = (self.hp.seq_length -
            self.hp.batch_buffer) // self.hp.target_pool
    targets = tf.identity(targets[:, tstart:tend, :], name='targets_op')

    ##################################################
    # slice targets

    if target_subset is not None:
        # manually specify targets
        targets = tf.gather(targets, target_subset, axis=2)
    else:
        # Take the genome index from the first example of the batch. The
        # default `genome=None` is not subscriptable (TypeError), which the
        # former `except ValueError` did not cover; treat it, like a tensor
        # that cannot be indexed this way, as genome 0.
        if genome is None:
            genome_i = tf.constant(0)
        else:
            try:
                genome_i = genome[0, 0]
            except ValueError:
                genome_i = tf.constant(0)

        # Each genome owns a contiguous range of target indexes; build the
        # per-genome [start, end) boundaries.
        genome_starts = []
        genome_ends = []
        gti = 0
        for gi in range(self.hp.num_genomes):
            genome_starts.append(gti)
            gti += self.hp.num_targets[gi]
            genome_ends.append(gti)
        genome_starts = tf.constant(genome_starts)
        genome_ends = tf.constant(genome_ends)

        targets_start = tf.gather(genome_starts, genome_i)
        targets_end = tf.gather(genome_ends, genome_i)

        # slice to genome targets
        target_indexes_genome = tf.range(targets_start, targets_end)
        preds = tf.gather(preds, target_indexes_genome, axis=2)
        targets = tf.gather(targets, target_indexes_genome, axis=2)

    # clip
    if self.hp.target_clip is not None:
        targets = tf.clip_by_value(targets, 0, self.hp.target_clip)

    # sqrt
    if self.hp.target_sqrt:
        targets = tf.sqrt(targets)

    ##################################################
    # loss

    if self.hp.loss == 'gaussian':
        loss_op = tf.squared_difference(preds, targets)
    elif self.hp.loss == 'poisson':
        # log_poisson_loss takes the log of the predictions.
        loss_op = tf.nn.log_poisson_loss(
            targets, tf.log(preds), compute_full_loss=True)
    elif self.hp.loss == 'cross_entropy':
        # Labels are shifted down by one before being used as class ids.
        loss_op = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=(targets - 1), logits=preds)
    else:
        raise ValueError('Cannot identify loss function %s' % self.hp.loss)

    # reduce losses by batch and position
    loss_op = tf.reduce_mean(loss_op, axis=[0, 1], name='target_loss')
    loss_op = tf.check_numerics(loss_op, 'Invalid loss', name='loss_check')
    target_losses = loss_op

    if target_subset is None:
        tf.summary.histogram('target_loss', loss_op)

    # fully reduce
    loss_op = tf.reduce_mean(loss_op, name='loss')

    # add regularization terms
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    reg_sum = tf.reduce_sum(reg_losses)
    tf.summary.scalar('regularizers', reg_sum)
    loss_op += reg_sum

    # track
    tf.summary.scalar('loss', loss_op)

    return loss_op, target_losses, targets, preds
def train_segmentation_model(create_model_fn,
                             create_input_fn,
                             train_config,
                             master,
                             task,
                             is_chief,
                             startup_delay_steps,
                             train_dir,
                             num_clones,
                             num_worker_replicas,
                             num_ps_tasks,
                             clone_on_cpu,
                             replica_id,
                             num_replicas,
                             max_checkpoints_to_keep,
                             save_interval_secs,
                             image_summaries,
                             log_memory=False,
                             gradient_checkpoints=None,
                             sync_bn_accross_gpu=False):
    """Build the multi-clone training graph and run `slim.learning.train`.

    Args:
        create_model_fn: Callable returning a pair whose second element is
            the segmentation model; also forwarded to
            `create_training_model_losses`.
        create_input_fn: Forwarded to `create_training_input`.
        train_config: Training configuration (batch size, optimizer,
            fine-tune checkpoint, number of steps, ...).
        master: Forwarded to `slim.learning.train`.
        task: Task index; used as the deployment `replica_id` and to scale
            `startup_delay_steps`.
        is_chief: Forwarded to `slim.learning.train`.
        startup_delay_steps: Per-task delay; multiplied by `task`.
        train_dir: Log directory for checkpoints and summaries.
        num_clones: Number of model clones; the batch is split across them.
        num_worker_replicas: Deployment `num_replicas`.
        num_ps_tasks: Number of parameter-server tasks.
        clone_on_cpu: Whether clones are placed on CPU.
        replica_id: Not used by this function.
        num_replicas: Not used by this function.
        max_checkpoints_to_keep: `max_to_keep` of the saver.
        save_interval_secs: Checkpoint interval in seconds.
        image_summaries: If true, add image summaries of the prediction and
            label tensors looked up by name in the first clone.
        log_memory: If true, trace every step and log peak memory of
            '/gpu:0'.
        gradient_checkpoints: Forwarded to `create_training_model_losses`.
        sync_bn_accross_gpu: If true, collect UPDATE_OPS from every clone
            instead of only the first.

    Raises:
        ValueError: If a fine-tune checkpoint is configured without
            `fine_tune_checkpoint_type`.
    """
    _, segmentation_model = create_model_fn()
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=num_clones,
        clone_on_cpu=clone_on_cpu,
        replica_id=task,
        num_replicas=num_worker_replicas,
        num_ps_tasks=num_ps_tasks)
    startup_delay_steps = task * startup_delay_steps

    per_clone_batch_size = train_config.batch_size // num_clones

    preprocess_fn = None
    if train_config.preprocessor_step:
        preprocess_fn = functools.partial(
            preprocessor_builder.build,
            preprocessor_config_list=train_config.preprocessor_step)

    with tf.Graph().as_default():
        # CPU of common ps server
        with tf.device(deploy_config.variables_device()):
            global_step = tf.train.get_or_create_global_step()

        with tf.device(deploy_config.inputs_device()):  # CPU of each worker
            input_queue = create_training_input(
                create_input_fn,
                preprocess_fn,
                per_clone_batch_size,
                batch_queue_capacity=train_config.batch_queue_capacity,
                batch_queue_threads=train_config.num_batch_queue_threads,
                prefetch_queue_capacity=train_config.prefetch_queue_capacity)

        # Build the clones on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            # Note: it is assumed that any loss created by `model_fn`
            # is collected at the tf.GraphKeys.LOSSES collection.
            model_fn = functools.partial(
                create_training_model_losses,
                create_model_fn=create_model_fn,
                train_config=train_config,
                train_dir=train_dir,
                gradient_checkpoints=gradient_checkpoints)
            clones = model_deploy.create_clones(
                deploy_config, model_fn, [input_queue])
            first_clone_scope = deploy_config.clone_scope(0)

            if sync_bn_accross_gpu:
                # Attempt to sync BN updates across all GPUs in a tower.
                # Caution since this is very slow. Might not be needed.
                update_ops = []
                for clone_idx in range(num_clones):
                    nth_clone_scope = deploy_config.clone_scope(clone_idx)
                    update_ops.extend(
                        tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                          nth_clone_scope))
            else:
                # Gather updates from first GPU only
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                               first_clone_scope)

        # Init variable to collect summaries
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                      first_clone_scope):
            summaries.add(tf.summary.scalar('Losses/%s' % loss.op.name, loss))

        with tf.device(deploy_config.optimizer_device()):  # CPU of each worker
            (training_optimizer,
             optimizer_summary_vars) = optimizer_builder.build(
                 train_config.optimizer)
            for var in optimizer_summary_vars:
                summaries.add(
                    tf.summary.scalar(var.op.name, var,
                                      family='LearningRate'))

        # Add summaries for model variables.
        for model_var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(model_var.op.name, model_var))

        # Fine tune from classification or segmentation checkpoints.
        # `init_fn` must exist on every path: it is handed to
        # `slim.learning.train` below, and None means "no custom
        # initialisation". Previously it was left unbound when no fine-tune
        # checkpoint was configured.
        init_fn = None
        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if train_config.fine_tune_checkpoint:
            if not train_config.fine_tune_checkpoint_type:
                raise ValueError('Must specify `fine_tune_checkpoint_type`.')

            tf.logging.info('Initializing %s model from checkpoint %s',
                            train_config.fine_tune_checkpoint_type,
                            train_config.fine_tune_checkpoint)

            variables_to_restore = segmentation_model.restore_map(
                fine_tune_checkpoint_type=(
                    train_config.fine_tune_checkpoint_type))
            init_fn = slim.assign_from_checkpoint_fn(
                train_config.fine_tune_checkpoint,
                variables_to_restore,
                ignore_missing_vars=True)

            if train_config.freeze_fine_tune_backbone:
                # The format string previously had no argument and logged a
                # literal '%s'.
                tf.logging.info(
                    'Freezing %s scope from checkpoint.',
                    segmentation_model.shared_feature_extractor_scope)
                # Train only variables outside the shared feature extractor.
                non_frozen_vars = []
                for var in trainable_vars:
                    if not var.op.name.startswith(
                            segmentation_model.shared_feature_extractor_scope):
                        non_frozen_vars.append(var)
                        tf.logging.info('Training variable: %s', var.op.name)
                trainable_vars = non_frozen_vars
        else:
            tf.logging.info('Not initializing the model from a checkpoint. '
                            'Initializing from scratch!')

        # TODO(@oandrien): we might want to add gradient multiplier here
        # for the last layer if we have trouble with training
        # CPU of common ps server
        with tf.device(deploy_config.optimizer_device()):
            # None lets optimize_clones collect the regularization losses
            # itself; an empty list disables them.
            reg_losses = (None if train_config.add_regularization_loss
                          else [])
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones,
                training_optimizer,
                regularization_losses=reg_losses,
                var_list=trainable_vars)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')
            summaries.add(tf.summary.scalar('Losses/TotalLoss', total_loss))

            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops, name='update_barrier')
            # Fetching `train_op` runs every update and yields the loss.
            with tf.control_dependencies([update_op]):
                train_op = tf.identity(total_loss, name='train_op')

        # TODO: this ideally should not be hardcoded like this.
        # should have a way to access the prediction and GT tensor
        if image_summaries:
            graph = tf.get_default_graph()
            pixel_scaling = max(1, 255 // 19)
            summ_first_clone_scope = (first_clone_scope + '/'
                                      if first_clone_scope else '')
            main_labels = graph.get_tensor_by_name(
                '%sSegmentationLoss/Reshape:0' % summ_first_clone_scope)
            main_preds = graph.get_tensor_by_name(
                '%sSegmentationLoss/Reshape_1:0' % summ_first_clone_scope)
            main_preds = tf.cast(main_preds * pixel_scaling, tf.uint8)
            summaries.add(
                tf.summary.image('VerifyTrainImages/Predictions', main_preds))
            main_labels = tf.cast(main_labels * pixel_scaling, tf.uint8)
            summaries.add(
                tf.summary.image('VerifyTrainImages/Groundtruths',
                                 main_labels))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones()
        # or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries))

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True)

        # Save checkpoints regularly.
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)

        # HACK to see memory usage.
        # TODO: Clean up, pretty messy.
        def train_step_mem(sess, train_op, global_step, train_step_kwargs):
            """Run one training step, optionally logging peak GPU memory."""
            start_time = time.time()
            # The run metadata is only read when `log_memory` is set, so
            # skip full tracing otherwise.
            if log_memory:
                run_metadata = tf.RunMetadata()
                options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
            else:
                run_metadata = None
                options = None
            total_loss, np_global_step = sess.run(
                [train_op, global_step],
                options=options,
                run_metadata=run_metadata)
            time_elapsed = time.time() - start_time

            if 'should_log' in train_step_kwargs:
                if sess.run(train_step_kwargs['should_log']):
                    tf.logging.info(
                        'global step %d: loss = %.4f (%.3f sec/step)',
                        np_global_step, total_loss, time_elapsed)

            if log_memory:
                mem_use = mem_util.peak_memory(run_metadata)['/gpu:0'] / 1e6
                tf.logging.info('Memory used: %.2f MB', (mem_use))

            if 'should_stop' in train_step_kwargs:
                should_stop = sess.run(train_step_kwargs['should_stop'])
            else:
                should_stop = False

            return total_loss, should_stop

        # Main training loop
        slim.learning.train(
            train_op,
            train_step_fn=train_step_mem,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            number_of_steps=train_config.num_steps,
            startup_delay_steps=startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            save_summaries_secs=120,
            save_interval_secs=save_interval_secs,
            saver=saver)
def _build_layer_update_ops(
        self, layers: Sequence
) -> Tuple[tf.Operation, tf.Operation, tf.Operation]:
    """Create the grouped per-layer update operations.

    Returns, in this order, `weight_update_op`, `scale_update_op` and
    `basis_update_op`. The scale and basis groups only receive ops when
    `self.layer_type == "emvg"`; otherwise they group nothing.

    Raises:
        ValueError: if `self.ird_tag` is neither "regression" nor "ird".
    """
    weight_count = len(self.learn.qws)
    qws = [
        self.learn.qws["w{}".format(idx)].tensor
        for idx in range(weight_count)
    ]

    # NOTE: _log_py_xw is a scalar, so it is divided by the batch size before
    # being differentiated with respect to the weights.
    batch_count = self.n_batch_size
    if not batch_count:
        batch_count = tf.cast(tf.shape(self.inputs)[0], tf.float32)
    w_grads = tf.gradients(self._log_py_xw / batch_count, qws)

    n_layers = len(layers)
    activations = []
    for act in get_layer_input_activations(n_layers):
        activations.append(append_homog(act))
    layer_outputs = list(get_layer_outputs(n_layers))
    assert n_layers == len(activations) == len(layer_outputs)

    tag = self.ird_tag
    if tag == "ird":
        assert self.config.true_fisher, "Only true fisher supported"
        # Sample model and y from the variational distribution.
        # (_log_py_xw holds the true fisher here, even though in the
        # regression case it is the empirical fisher.)
        raw_grads = tf.gradients(self._log_py_xw, layer_outputs)
        s_grads = [tf.check_numerics(g, "ird s_grads") for g in raw_grads]
    elif tag == "regression":
        if self.config.true_fisher:
            # True fisher: sample model and y from the variational
            # distribution.
            objective = tf.reduce_sum(self.learn.sampled_log_prob)
        else:
            # Empirical fisher: sample model from the variational
            # distribution, setting y = target.
            objective = self._log_py_xw
        s_grads = tf.gradients(objective, layer_outputs)
    else:
        raise ValueError("Invalid ird tag: {}".format(tag))

    with_scale_and_basis = self.layer_type == "emvg"
    weight_updates, scale_updates, basis_updates = [], [], []
    per_layer = zip(layers, qws, w_grads, activations, s_grads)
    for layer, weight, weight_grad, act, out_grad in per_layer:
        # The regular KFAC update is added for every layer type.
        weight_updates.extend(layer.update(weight, weight_grad, act, out_grad))
        if not with_scale_and_basis:
            continue
        scale_updates.extend(
            layer.update_scale(weight, weight_grad, act, out_grad))
        basis_updates.extend(
            layer.update_basis(weight, weight_grad, act, out_grad))

    weight_op = tf.group(*weight_updates, name="weight_updates")
    scale_op = tf.group(*scale_updates, name="scale_updates")
    basis_op = tf.group(*basis_updates, name="basis_updates")
    return (weight_op, scale_op, basis_op)
def build(self, x, y, b, m):
    """Build the class-conditional likelihood graph, the loss and the train op.

    Sets `logpu`, `logpo`, `logpuo`, `logits`, `prob`, `pred`, `acc`,
    `log_likel`, `sam`, `global_step`, `train_op`, `summ_op` and `metric`
    on `self`.

    Args:
        x: Input passed to `self.classify` and `self.sample`.
        y: Labels; cast to int64 and used as sparse class indices.
        b: First mask argument of `self.classify` / `self.sample`
            (presumably the observed-feature mask -- confirm against
            `classify`).
        m: Second mask argument of `self.classify` / `self.sample`
            (presumably the query mask -- confirm against `classify`).

    Raises:
        ValueError: if `self.hps.version` is not 'v1'/'v2' or `self.hps.loss`
            is not one of 'xent', 'sum', 'logsumexp'.
    """
    y = tf.cast(y, tf.int64)
    # log p(x_u | x_o, y)
    self.logpu = self.classify(x, b, m)
    # log p(x_o | y)
    self.logpo = self.classify(x, b * (1 - b), b)
    # log p(x_u, x_o | y)
    self.logpuo = self.classify(x, m * (1 - m), m)
    # logits: log p(x_u, x_o | y)
    if self.hps.version == 'v1':
        self.logits = self.logpu + self.logpo
    elif self.hps.version == 'v2':
        self.logits = self.logpuo
    else:
        raise ValueError(
            "Unknown hps.version {!r}; expected 'v1' or 'v2'.".format(
                self.hps.version))
    # p(y | x_u, x_o)
    self.prob = tf.nn.softmax(self.logits)
    self.pred = tf.argmax(self.logits, axis=1)
    self.acc = tf.cast(tf.equal(self.pred, y), tf.float32)
    # log p(x_u | x_o): marginalise the class axis in numerator and
    # denominator.
    self.log_likel = (
        tf.reduce_logsumexp(self.logpu + self.logpo, axis=1) -
        tf.reduce_logsumexp(self.logpo, axis=1))
    # sample p(x_u | x_o)
    self.sam = self.sample(x, b, m)

    # loss
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.logits, labels=y)
    xent = tf.reduce_mean(xent)
    tf.summary.scalar('xent', xent)
    nll = tf.reduce_mean(-self.log_likel)
    tf.summary.scalar('nll', nll)
    if self.hps.loss == 'xent':
        loss = xent
    elif self.hps.loss == 'sum':
        loss = xent + self.hps.lambda_nll * nll
    elif self.hps.loss == 'logsumexp':
        loss = -tf.reduce_logsumexp(tf.stack([-xent, -nll]), axis=0)
    else:
        raise ValueError(
            "Unknown hps.loss {!r}; expected 'xent', 'sum' or "
            "'logsumexp'.".format(self.hps.loss))
    tf.summary.scalar('loss', loss)

    # train
    self.global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.inverse_time_decay(self.hps.lr,
                                                self.global_step,
                                                self.hps.decay_steps,
                                                self.hps.decay_rate,
                                                staircase=True)
    tf.summary.scalar('lr', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08,
                                       use_locking=False,
                                       name="Adam")
    grads_and_vars = optimizer.compute_gradients(loss,
                                                 tf.trainable_variables())
    grads, vars_ = zip(*grads_and_vars)
    if self.hps.clip_gradient > 0:
        grads, gradient_norm = tf.clip_by_global_norm(
            grads, clip_norm=self.hps.clip_gradient)
        gradient_norm = tf.check_numerics(gradient_norm,
                                          "Gradient norm is NaN or Inf.")
        # The norm tensor only exists when clipping is enabled, so its
        # summary is registered inside this branch.
        tf.summary.scalar('gradient_norm', gradient_norm)
    capped_grads_and_vars = zip(grads, vars_)
    self.train_op = optimizer.apply_gradients(capped_grads_and_vars,
                                              global_step=self.global_step)

    # summary
    self.summ_op = tf.summary.merge_all()

    # metric
    self.metric = self.acc
def apply_loss_function(self, global_step):
    """Build the cosine-margin + quantization loss and return the train op.

    For `self.loss_type` equal to 'cos_margin_multi_label' (fixed margin
    `self.margin_param`) or 'cos_softmargin_multi_label' (margin derived from
    the pairwise cosine of the word vectors) this sets `self.cos_loss` and
    the debug tensors `self.test1..3`; the fixed-margin variant also sets
    `self.test00`, `self.test0` and `self.check0..2`.  For any other
    `loss_type` no cosine loss is built and `self.cos_loss` must already
    exist on the instance.

    It then sets `self.precq_loss_img`, `self.cq_loss_img`, `self.q_lambda`,
    `self.cq_loss`, `self.loss` and `self.lr`.

    Args:
        global_step: Step variable used for the learning-rate decay and
            incremented by the returned op.

    Returns:
        The op returned by `MomentumOptimizer.apply_gradients`.

    NOTE(review): tf.mul, tf.sub and tf.select are the pre-1.0 TensorFlow
    names (later tf.multiply, tf.subtract, tf.where); they are kept because
    this function is written against that API.
    """
    # Parse the word-vector file once; both the cosine loss and the
    # quantization loss use the same constant.
    word_dict = tf.constant(np.loadtxt(self.wordvec_dict), dtype=tf.float32)

    ### loss function
    uses_fixed_margin = self.loss_type == 'cos_margin_multi_label'
    uses_soft_margin = self.loss_type == 'cos_softmargin_multi_label'
    if uses_fixed_margin or uses_soft_margin:
        assert self.output_dim == 300

        # N: batchsize, L: label_dim, D: 300
        # img_label: N * L
        # word_dic: L * D
        # v_label: N * L * D
        v_label = tf.mul(tf.expand_dims(self.img_label, 2),
                         tf.expand_dims(word_dict, 0))
        # img_last: N * D
        # ip_1: N * L
        ip_1 = tf.reduce_sum(
            tf.mul(tf.expand_dims(self.img_last_layer, 1), v_label), 2)
        # mod_1: N * L
        # NOTE(review): unlike mod_2 below, mod_1 is not guarded against
        # near-zero norms before the division.
        v_label_mod = tf.mul(
            tf.expand_dims(tf.ones([self.batch_size, self.n_class]), 2),
            tf.expand_dims(word_dict, 0))
        mod_1 = tf.sqrt(
            tf.mul(
                tf.expand_dims(
                    tf.reduce_sum(tf.square(self.img_last_layer), 1), 1),
                tf.reduce_sum(tf.square(v_label_mod), 2)))
        # cos_1: N * L
        cos_1 = tf.div(ip_1, mod_1)

        ip_2 = tf.matmul(self.img_last_layer, word_dict, transpose_b=True)

        def reduce_shaper(t):
            # Row-wise sum reshaped to a column vector.
            return tf.reshape(tf.reduce_sum(t, 1), [tf.shape(t)[0], 1])

        mod_2_2 = tf.sqrt(
            tf.matmul(reduce_shaper(tf.square(self.img_last_layer)),
                      reduce_shaper(tf.square(word_dict)),
                      transpose_b=True))
        # Replace near-zero norms by 1 to avoid dividing by zero.
        mod_2 = tf.select(tf.less(mod_2_2, tf.constant(0.0000001)),
                          tf.ones([self.batch_size, self.n_class]), mod_2_2)
        # cos_2: N * L
        cos_2 = tf.div(ip_2, mod_2)

        if uses_fixed_margin:
            margin = tf.constant(self.margin_param, dtype=tf.float32)
        else:
            # word_dic: L * D
            # ip_3: L * L
            # compute soft margin
            ip_3 = tf.matmul(word_dict, word_dict, transpose_b=True)
            # use word_dic to avoid 0 in /
            mod_3 = tf.sqrt(
                tf.matmul(reduce_shaper(tf.square(word_dict)),
                          reduce_shaper(tf.square(word_dict)),
                          transpose_b=True))
            margin_param = tf.sub(tf.constant(1.0, dtype=tf.float32),
                                  tf.div(ip_3, mod_3))
            margin = tf.expand_dims(margin_param, 0)

        # cos - cos: N * L * L
        cos_cos_1 = tf.sub(
            margin,
            tf.sub(tf.expand_dims(cos_1, 2), tf.expand_dims(cos_2, 1)))
        # we need to let the wrong place be 0
        cos_cos = tf.mul(cos_cos_1, tf.expand_dims(self.img_label, 2))

        cos_loss = tf.reduce_sum(
            tf.maximum(tf.constant(0, dtype=tf.float32), cos_cos))
        self.cos_loss = tf.div(
            cos_loss,
            tf.mul(tf.constant(self.n_class, dtype=tf.float32),
                   tf.reduce_sum(self.img_label)))

        self.test1 = cos_cos
        self.test2 = cos_1
        self.test3 = cos_2

        if uses_fixed_margin:
            # Extra debug tensors exposed only by the fixed-margin variant.
            self.test00 = tf.reduce_sum(
                tf.cast(
                    tf.not_equal(
                        cos_cos,
                        tf.zeros([self.batch_size, self.n_class,
                                  self.n_class])), tf.int32))
            self.test0 = tf.mul(tf.constant(self.n_class, dtype=tf.float32),
                                tf.reduce_sum(self.img_label))
            self.check0 = tf.check_numerics(cos_cos, "cos_cos")
            self.check1 = tf.check_numerics(cos_1, "cos_1")
            self.check2 = tf.check_numerics(cos_2, "cos_2")

    # Quantization error of the last layer against its codebook
    # reconstruction b_img * C, in raw space and projected on the word
    # vectors.
    self.precq_loss_img = tf.reduce_mean(
        tf.reduce_sum(
            tf.square(
                tf.sub(self.img_last_layer, tf.matmul(self.b_img, self.C))),
            1))
    self.cq_loss_img = tf.reduce_mean(
        tf.reduce_sum(
            tf.square(
                tf.matmul(
                    tf.sub(self.img_last_layer,
                           tf.matmul(self.b_img, self.C)),
                    tf.transpose(word_dict))), 1))
    self.q_lambda = tf.Variable(self.cq_lambda, name='cq_lambda')
    self.cq_loss = tf.mul(self.q_lambda, self.cq_loss_img)
    self.loss = tf.add(self.cos_loss, self.cq_loss)

    ### Last layer has a 10 times learning rate
    self.lr = tf.train.exponential_decay(self.learning_rate,
                                         global_step,
                                         self.decay_step,
                                         self.learning_rate_decay_factor,
                                         staircase=True)
    opt = tf.train.MomentumOptimizer(learning_rate=self.lr, momentum=0.9)
    grads_and_vars = opt.compute_gradients(
        self.loss, self.train_layers + self.train_last_layer)
    fcgrad, _ = grads_and_vars[-2]
    fbgrad, _ = grads_and_vars[-1]

    # Last layer: first variable gets a 10x gradient, second a 20x gradient.
    scaled_grads = [(fcgrad * 10, self.train_last_layer[0]),
                    (fbgrad * 20, self.train_last_layer[1])]

    if self.finetune_all:
        # Backbone variables: even positions keep their gradient, odd
        # positions get it doubled (the original hand-written list did this
        # for positions 0..13).
        backbone_grads = []
        for idx, layer_var in enumerate(self.train_layers):
            grad = grads_and_vars[idx][0]
            if idx % 2:
                grad = grad * 2
            backbone_grads.append((grad, layer_var))
        scaled_grads = backbone_grads + scaled_grads

    return opt.apply_gradients(scaled_grads, global_step=global_step)
def train(model, train_generator, validation_generator, count1, count2, batch_size=128, learning_rate=FLAGS.learning_rate, log_dir='./log', checkpoint_dir='./checkpoint', num_epochs=-1):
    """Build the training graph for `model` and run the epoch loop.

    Builds a sparse softmax cross-entropy loss and top-1 accuracy, an Adam
    training op via `tf.contrib.layers.optimize_loss`, exponential moving
    averages of loss/accuracy, then runs training in an InteractiveSession,
    saving a checkpoint and calling `evaluate` after every epoch.

    Args:
        model: Callable invoked as `model(x, is_training=True)` to produce
            logits.
        train_generator, validation_generator, count1, count2: Not
            referenced anywhere in the body.
        batch_size: Used for `num_batches` and passed to `evaluate`.
        learning_rate: Passed to `optimize_loss`. The default is read from
            FLAGS once, when the function is defined.
        log_dir: Directory for the summary FileWriter.
        checkpoint_dir: Directory checkpoints are restored from / saved to.
        num_epochs: The loop stops when `epoch == num_epochs`.

    NOTE(review): as written this function cannot run -- see the notes on
    `y_one_hot`, `data` and `learning_rate_decay_fn` below.
    """
    # tf Graph input
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            # x, yt = data.generate_batches(batch_size,num_threads=FLAGS.num_threads)
            # NOTE(review): `y_one_hot` is read inside the comprehension
            # before it is assigned, and the chained assignment unpacks the
            # label array into `x, y_one_hot`. This looks like an unfinished
            # replacement of the commented-out line above; the intended input
            # source (train_generator?) needs to be confirmed.
            x, y_one_hot = yt = np.array([[ np.where(r==1)[0][0] for r in y_one_hot ]])

        global_step = tf.get_variable('global_step', shape=[], dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
    if FLAGS.gpu:
        device_str='/gpu:' + str(FLAGS.device)
    else:
        device_str='/cpu:0'
    with tf.device(device_str):
        y = model(x, is_training=True)
        # Define loss and optimizer
        with tf.name_scope('objective'):
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt, logits=y))
            accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))
        # NOTE(review): `learning_rate_decay_fn` is not defined in this
        # function; presumably a module-level name -- verify.
        opt = tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, 'Adam',
                                              gradient_noise_scale=None, gradient_multipliers=None,
                                              clip_gradients=None, #moving_average_decay=0.9,
                                              learning_rate_decay_fn=learning_rate_decay_fn if FLAGS.using_learning_rate_decay_fn else None,
                                              update_ops=None, variables=None, name=None)
        #grads = opt.compute_gradients(loss)
        #apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # loss_avg: moving averages of loss, accuracy and all trainable
    # variables; the averaging op is registered as an update op so that it
    # runs with every training step.
    ema = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step, name='average')
    ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    loss_avg = ema.average(loss)
    tf.summary.scalar('loss/training', loss_avg)
    accuracy_avg = ema.average(accuracy)
    tf.summary.scalar('accuracy/training', accuracy_avg)

    # Fail the step if the loss becomes NaN/Inf.
    check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
    updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # The grouped update ops are created under a control dependency on the
    # optimizer step.
    with tf.control_dependencies([opt]):
        train_op = tf.group(*updates_collection)

    if FLAGS.summary:
        add_summaries(
            scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
            activation_list=tf.get_collection(tf.GraphKeys.ACTIVATIONS),
            var_list=tf.trainable_variables())
        # grad_list=grads)

    summary_op = tf.summary.merge_all()

    # Configure options for session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(
        config=tf.ConfigProto(
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=gpu_options,
        )
    )
    if FLAGS.resume:
        logging.info('resuming from '+checkpoint_dir)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir+'/')
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
            return
        #print sess.run('global_step:0')
        #print global_step.eval()
    else:
        saver = tf.train.Saver(max_to_keep=5)
        sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # NOTE(review): `data` is not a parameter or local of this function; it
    # only appears in the commented-out input line above. Presumably a
    # leftover from an earlier signature -- confirm where it should come from.
    num_batches = data.size[0] / batch_size
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    epoch = global_step.eval()/num_batches if FLAGS.resume else 0
    display_interval=FLAGS.display_interval or num_batches/10
    test_interval=FLAGS.test_interval or num_batches/2
    logging.info('num of trainable paramaters: %d' %count_params(tf.trainable_variables()))
    # With the default num_epochs=-1 and epoch starting at 0, this condition
    # never becomes false, so training runs until interrupted.
    while epoch != num_epochs:

        curr_step = 0
        # Initializing the variables

        #with tf.Session() as session:
        #    print(session.run(ww))

        logging.info('Started epoch %d' % epoch)
        bar = Bar('Training', max=num_batches,
                  suffix='%(percent)d%% eta: %(eta)ds')
        while curr_step < data.size[0]:
            _, loss_val,step= sess.run(
                [train_op,loss,global_step])
            # if step%display_interval==0:
            #     step, acc_value, loss_value, summary = sess.run(
            #         [global_step, accuracy_avg, loss_avg, summary_op])
            #     logging.info("step %d loss %.3f accuracy %.3f" %(step,loss_value,acc_value))
            #     summary_out = tf.Summary()
            #     summary_out.ParseFromString(summary)
            #     summary_writer.add_summary(summary_out, step)
            #     summary_writer.flush()
            # if step%test_interval==0:
            #     saver.save(sess, save_path=checkpoint_dir +
            #                '/model.ckpt', global_step=global_step)
            #     test_top1,test_top5,test_loss = evaluate(model, FLAGS.dataset,
            #                                    batch_size=batch_size,
            #                                    checkpoint_dir=checkpoint_dir)
            #     logging.info('Test loss %.3f Test top1 %.3f Test top5 %.3f' % (test_loss,test_top1,test_top5))
            #     summary_out = tf.Summary()
            #     summary_out.ParseFromString(summary)
            #     summary_out.value.add(tag='accuracy/test_top1', simple_value=test_top1)
            #     summary_out.value.add(tag='accuracy/test_top5', simple_value=test_top5)
            #     summary_out.value.add(tag='loss/test', simple_value=test_loss)
            #     summary_writer.add_summary(summary_out, step)
            #     summary_writer.flush()
            # NOTE(review): the step counter advances by FLAGS.batch_size,
            # not by the `batch_size` argument used for num_batches.
            curr_step += FLAGS.batch_size
            bar.next()

        bar.finish()
        # End of epoch: fetch the averaged metrics, checkpoint, evaluate and
        # write one combined summary record.
        step, acc_value, loss_value, summary = sess.run([global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess, save_path=checkpoint_dir + '/model.ckpt', global_step=global_step)
        test_top1,test_top5,test_loss = evaluate(model, FLAGS.dataset,
                                                 batch_size=batch_size,
                                                 checkpoint_dir=checkpoint_dir)
        logging.info('Test loss %.3f Test top1 %.3f Test top5 %.3f' % (test_loss,test_top1,test_top5))
        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test_top1', simple_value=test_top1)
        summary_out.value.add(tag='accuracy/test_top5', simple_value=test_top5)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()
        logging.info("Finished epoch %d " %epoch)
        epoch += 1

    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
          num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name,
          is_chief, train_dir, rank):
    """Training function for detection models.

    Args:
        create_tensor_dict_fn: a function to create a tensor input dictionary.
        create_model_fn: a function that creates a DetectionModel and
            generates losses.
        train_config: a train_pb2.TrainConfig protobuf.
        master: BNS name of the TensorFlow master to use.
        task: The task id of this training instance.
        num_clones: The number of clones to run per machine.
        worker_replicas: The number of work replicas to train with.
        clone_on_cpu: True if clones should be forced to run on CPU.
        ps_tasks: Number of parameter server tasks.
        worker_job_name: Name of the worker job.
        is_chief: Whether this replica is the chief replica.
        train_dir: Directory to write checkpoints and training summaries to.
        rank: Accepted for interface compatibility; not used in this
            function.
    """
    detection_model = create_model_fn()
    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options
    ]

    with tf.Graph().as_default():
        # Build a configuration specifying multi-GPU and multi-replicas.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=clone_on_cpu,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=ps_tasks,
            worker_job_name=worker_job_name)

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        with tf.device(deploy_config.inputs_device()):
            input_queue = _create_input_queue(
                train_config.batch_size // num_clones,
                create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set([])

        # Rotated-bounding-box models use their own loss builder.
        if detection_model.is_rbbox:
            model_fn = functools.partial(_create_losses_rbbox,
                                         create_model_fn=create_model_fn)
        else:
            model_fn = functools.partial(_create_losses,
                                         create_model_fn=create_model_fn)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        first_clone_scope = clones[0].scope

        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by model_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        with tf.device(deploy_config.optimizer_device()):
            training_optimizer = optimizer_builder.build(
                train_config.optimizer, global_summaries)

        sync_optimizer = None
        if train_config.sync_replicas:
            # SyncReplicasOptimizer lives in tf.train; there is no top-level
            # tf.SyncReplicasOptimizer. The replica count comes from the
            # `worker_replicas` argument, the same value given to the
            # deployment config above.
            training_optimizer = tf.train.SyncReplicasOptimizer(
                training_optimizer,
                replicas_to_aggregate=train_config.replicas_to_aggregate,
                total_num_replicas=worker_replicas)
            sync_optimizer = training_optimizer

        # Create ops required to initialize the model from a given checkpoint.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            var_map = detection_model.restore_map(
                from_detection_checkpoint=train_config.
                from_detection_checkpoint)
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    var_map, train_config.fine_tune_checkpoint))
            init_saver = tf.train.Saver(available_var_map)

            def initializer_fn(sess):
                init_saver.restore(sess, train_config.fine_tune_checkpoint)

            init_fn = initializer_fn

        with tf.device(deploy_config.optimizer_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, training_optimizer, regularization_losses=None)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # Optionally multiply bias gradients by
            # train_config.bias_grad_multiplier.
            if train_config.bias_grad_multiplier:
                biases_regex_list = ['.*/biases']
                grads_and_vars = (
                    variables_helper.multiply_gradients_matching_regex(
                        grads_and_vars,
                        biases_regex_list,
                        multiplier=train_config.bias_grad_multiplier))

            # Optionally freeze some layers by setting their gradients to be
            # zero.
            if train_config.freeze_variables:
                grads_and_vars = (
                    variables_helper.freeze_gradients_matching_regex(
                        grads_and_vars, train_config.freeze_variables))

            # Optionally clip gradients
            if train_config.gradient_clipping_by_norm > 0:
                with tf.name_scope('clip_grads'):
                    grads_and_vars = slim.learning.clip_gradient_norms(
                        grads_and_vars,
                        train_config.gradient_clipping_by_norm)

            # Create gradient updates.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)

            # Evaluating train_tensor runs every update op first and then
            # yields the total loss.
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for model_var in slim.get_model_variables():
            global_summaries.add(
                tf.summary.histogram(model_var.op.name, model_var))
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar(loss_tensor.op.name, loss_tensor))
        global_summaries.add(
            tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summaries |= global_summaries

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        # Save checkpoints regularly.
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            max_to_keep=None,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            startup_delay_steps=train_config.startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=(train_config.num_steps
                             if train_config.num_steps else None),
            save_summaries_secs=240,
            save_interval_secs=3600,
            sync_optimizer=sync_optimizer,
            saver=saver)
def _policy_loss(self, old_policy, policy, action, advantage, length):
    """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already changed more than twice the target
       amount, we activate a strong penalty discouraging further divergence.
    4. If `self._config.entropy_regularization` is set, the per-sequence mean
       entropy of the current policy, scaled by that coefficient, is
       subtracted from the loss.

    Args:
        old_policy: Action distribution of the behavioral policy.
        policy: Action distribution of the current policy (must provide
            `log_prob` and `entropy`).
        action: Sequences of actions.
        advantage: Sequences of advantages.
        length: Batch of sequence lengths.

    Returns:
        Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('policy_loss'):
        kl = tf.contrib.distributions.kl_divergence(old_policy, policy)
        # Infinite values in the KL, even for padding frames that we mask
        # out, cause NaN gradients since TensorFlow computes gradients with
        # respect to the whole input tensor.
        kl = tf.check_numerics(kl, 'kl')
        kl = tf.reduce_mean(self._mask(kl, length), 1)
        policy_gradient = tf.exp(
            policy.log_prob(action) - old_policy.log_prob(action))
        surrogate_loss = -tf.reduce_mean(
            self._mask(policy_gradient * tf.stop_gradient(advantage), length),
            1)
        surrogate_loss = tf.check_numerics(surrogate_loss, 'surrogate_loss')
        kl_penalty = self._penalty * kl
        cutoff_threshold = (
            self._config.kl_target * self._config.kl_cutoff_factor)
        cutoff_count = tf.reduce_sum(
            tf.cast(kl > cutoff_threshold, tf.int32))
        # Print a message whenever at least one sequence exceeds the cutoff;
        # `int` serves as the no-op branch because int() returns 0.
        with tf.control_dependencies([
                tf.cond(cutoff_count > 0,
                        lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '),
                        int)
        ]):
            kl_cutoff = (self._config.kl_cutoff_coef *
                         tf.cast(kl > cutoff_threshold, tf.float32) *
                         (kl - cutoff_threshold)**2)
        policy_loss = surrogate_loss + kl_penalty + kl_cutoff
        entropy = policy.entropy()
        if self._config.entropy_regularization:
            # The other loss terms are reduced to one value per sequence, so
            # the per-frame entropy is masked and averaged over time the same
            # way; subtracting it unreduced would mis-broadcast.
            mean_entropy = tf.reduce_mean(self._mask(entropy, length), 1)
            policy_loss -= self._config.entropy_regularization * mean_entropy
        summary = tf.summary.merge([
            tf.summary.histogram('entropy', entropy),
            tf.summary.histogram('kl', kl),
            tf.summary.histogram('surrogate_loss', surrogate_loss),
            tf.summary.histogram('kl_penalty', kl_penalty),
            tf.summary.histogram('kl_cutoff', kl_cutoff),
            tf.summary.histogram('kl_penalty_combined',
                                 kl_penalty + kl_cutoff),
            tf.summary.histogram('policy_loss', policy_loss),
            tf.summary.scalar('avg_surr_loss',
                              tf.reduce_mean(surrogate_loss)),
            tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
            tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))
        ])
        policy_loss = tf.reduce_mean(policy_loss, 0)
        return tf.check_numerics(policy_loss, 'policy_loss'), summary
def NumericalChecks():
    """Group a finiteness check for every float32/float64 trainable parameter.

    Parameters of any other dtype in `trainable_params` are skipped.
    """
    checked_dtypes = [tf.float32, tf.float64]
    finite_checks = []
    for param in trainable_params.values():
        if param.dtype.base_dtype not in checked_dtypes:
            continue
        finite_checks.append(
            tf.check_numerics(param, message='Parameter is not finite.'))
    return tf.group(*finite_checks)
def _build_model(self):
    """Build the Bayesian feed-forward model, its losses and training ops.

    Sets on `self`: `n_layers`, `learn`, `h_pred`, `_main`, `_bnn` (and
    `_aux` outside the "ird" case), `_log_py_xw`, `kl`, `loss_prec`,
    `mean_log_py_xw`, `lower_bound`, `rmse`, `ll`, `init_ops`,
    `scale_update_op`, `basis_update_op` and `train_op`.

    Raises:
        ValueError: if `self.layer_type` is not "emvg"/"mvg" or
            `self.config.model_name` is not "ffn50"/"ffn100".
    """
    outsample_cls = NormalOutSample if self.ird_tag == "regression" else None

    if self.layer_type == "emvg":
        layer_cls = EMVGLayer
    elif self.layer_type == "mvg":
        layer_cls = MVGLayer
    else:
        # Without this branch `layer_cls` stays unbound and fails later with
        # an unrelated NameError.
        raise ValueError("Invalid layer type: {}".format(self.layer_type))

    if self.config.model_name == "ffn50":
        default_hidden_size = 50
    elif self.config.model_name == "ffn100":
        default_hidden_size = 100
    else:
        raise ValueError(self.config.model_name)

    hidden_sizes = self.config.get("hidden_sizes", None) or \
        [default_hidden_size]
    layer_sizes = [int(self.inputs.shape[-1])] + list(hidden_sizes)
    end_with_sum = self.config.get("end_with_sum", False)
    if not end_with_sum:
        # A final width-1 layer is appended unless the network ends with a
        # sum.
        layer_sizes.append(1)
    self.n_layers = len(layer_sizes) - 1
    layer_types = [layer_cls] * self.n_layers
    # One independent dict per layer; `[{}] * n` would alias a single dict.
    layer_params = [{} for _ in range(self.n_layers)]
    self.logger.info("layer_sizes: {}".format(layer_sizes))

    layers, init_ops = ffn.ffn(layer_type=self.layer_type,
                               input_size=int(self.inputs.shape[-1]),
                               num_data=self.n_data,
                               kl_factor=self.config.kl,
                               ita=self.config.eta,
                               alpha=self.alpha,
                               beta=self.beta,
                               damp=self.config.damping,
                               omega=self.omega,
                               layer_sizes=layer_sizes)

    self.learn = BayesianLearning(layer_sizes=layer_sizes,
                                  layer_types=layer_types,
                                  layer_params=layer_params,
                                  out_params={},
                                  activation_fn=tf.nn.relu,
                                  outsample_cls=outsample_cls,
                                  x=self.inputs,
                                  y=self.targets,
                                  n_particles=self.n_particles,
                                  std_y_train=self.config.std_train,
                                  ird_tag=self.ird_tag,
                                  end_with_sum=end_with_sum)
    self.h_pred = tf.squeeze(self.learn.h_pred, 2)

    if self.ird_tag == "ird":
        # NOTE(review): `self._aux` is not assigned in this branch (the
        # second return value is discarded) -- confirm that is intended.
        main_raw, _, self._bnn = \
            self.problem.gather_standard_rewards(self.h_pred)
        self._main = tf.reshape(main_raw, [self.n_particles, -1])
    else:
        self._main = self._aux = self._bnn = None

    self._log_py_xw = self._build_log_py_xw()
    self.kl = tf.check_numerics(self.learn.build_kl(), "kl")
    self.loss_prec = self._build_loss_prec()

    n_outputs = tf.cast(
        tf.shape(self.inputs)[0] * self.n_particles, tf.float32)
    self.mean_log_py_xw = self._log_py_xw / n_outputs
    self.lower_bound = self._build_lower_bound()
    self.rmse = self.learn.rmse
    self.ll = self.learn.log_likelihood

    self.init_ops = tf.group(init_ops) if init_ops != [] else None

    # _build_layer_update_ops returns (weight, scale, basis) in that order.
    weight_update_op, self.scale_update_op, self.basis_update_op = \
        self._build_layer_update_ops(layers)
    prec_op = self._build_prec_op()
    self.train_op = tf.group([weight_update_op, prec_op], name="train_op")
def create_train_step(loss,
                      optimizer,
                      global_step=_USE_GLOBAL_STEP,
                      total_loss_fn=None,
                      update_ops=None,
                      variables_to_train=None,
                      transform_grads_fn=None,
                      summarize_gradients=False,
                      gate_gradients=tf.train.Optimizer.GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      check_numerics=True):
    """Creates a train_step that evaluates the gradients and returns the loss.

    Args:
      loss: A (possibly nested tuple of) `Tensor` or function representing
        the loss. Must be callable in eager mode.
      optimizer: A tf.Optimizer to use for computing the gradients.
      global_step: A `Tensor` representing the global step variable. If left
        as `_USE_GLOBAL_STEP`, eager mode uses
        `tf.train.get_or_create_global_step()`; in graph mode the value is
        passed through to `tf.contrib.training.create_train_op`.
      total_loss_fn: Function to call on loss value to access the final item
        to minimize. Defaults to the identity.
      update_ops: An optional list of updates to execute. If `update_ops` is
        `None`, then the update ops are set to the contents of the
        `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`,
        but it doesn't contain all of the update ops in
        `tf.GraphKeys.UPDATE_OPS`, a warning will be displayed. Only used in
        graph mode.
      variables_to_train: an optional list of variables to train, or a
        callable returning them. If None, graph mode defers to
        `create_train_op` and eager mode uses the variables watched by the
        gradient tape.
      transform_grads_fn: A function which takes a single argument, a list of
        gradient to variable pairs (tuples), performs any requested gradient
        updates, such as gradient clipping or multipliers, and returns the
        updated list.
      summarize_gradients: Whether or not add summaries for each gradient.
      gate_gradients: How to gate the computation of gradients. See
        tf.Optimizer. Only used in graph mode.
      aggregation_method: Specifies the method used to combine gradient
        terms. Valid values are defined in the class `AggregationMethod`.
        Only used in graph mode.
      colocate_gradients_with_ops: Whether or not to try colocating the
        gradients with the ops that generated them. Only used in graph mode.
      check_numerics: Whether or not we apply check_numerics.

    Returns:
      In graph mode: A (possibly nested tuple of) `Tensor` that when
        evaluated, calculates the current loss, computes the gradients,
        applies the optimizer, and returns the current loss.
      In eager mode: the loss value(s) returned by `loss()`, after the
        gradients have been computed and applied.

    Raises:
      ValueError: if `total_loss_fn` is not callable, or if `loss` is not
        callable in eager mode.
    """
    if total_loss_fn is None:
        total_loss_fn = lambda x: x
    if not callable(total_loss_fn):
        raise ValueError('`total_loss_fn` should be a function.')

    if not tf.executing_eagerly():
        if callable(loss):
            loss = loss()
        if callable(variables_to_train):
            variables_to_train = variables_to_train()
        # Calculate loss first, then calculate train op, then return the
        # original loss conditioned on executing the train op.
        with tf.control_dependencies(nest.flatten(loss)):
            train_op = tf.contrib.training.create_train_op(
                total_loss_fn(loss),
                optimizer,
                global_step=global_step,
                update_ops=update_ops,
                variables_to_train=variables_to_train,
                transform_grads_fn=transform_grads_fn,
                summarize_gradients=summarize_gradients,
                gate_gradients=gate_gradients,
                aggregation_method=aggregation_method,
                colocate_gradients_with_ops=colocate_gradients_with_ops,
                check_numerics=check_numerics)
        with tf.control_dependencies([train_op]):
            return nest.map_structure(
                lambda t: tf.identity(t, 'loss'), loss)

    # Eager mode from here on.
    if global_step is _USE_GLOBAL_STEP:
        global_step = tf.train.get_or_create_global_step()

    if not callable(loss):
        raise ValueError('`loss` should be a function in eager mode.')
    if not isinstance(loss, Future):
        tf.logging.warning(
            'loss should be an instance of eager_utils.Future')

    with tf.GradientTape() as tape:
        loss_value = loss()
        total_loss_value = total_loss_fn(loss_value)
    if variables_to_train is None:
        variables_to_train = tape.watched_variables()
    elif callable(variables_to_train):
        variables_to_train = variables_to_train()
    variables_to_train = nest.flatten(variables_to_train)
    grads = tape.gradient(total_loss_value, variables_to_train)
    # Materialise the pairs: on Python 3 `zip` is a single-pass iterator, so
    # summarising the gradients would otherwise leave nothing for
    # `apply_gradients` to consume.
    grads_and_vars = list(zip(grads, variables_to_train))

    if transform_grads_fn:
        # The transform may itself return a one-shot iterable.
        grads_and_vars = list(transform_grads_fn(grads_and_vars))

    if summarize_gradients:
        with tf.name_scope('summarize_grads'):
            add_gradients_summaries(grads_and_vars)

    if check_numerics:
        with tf.name_scope('train_op'):
            tf.check_numerics(total_loss_value, 'Loss is inf or nan')

    optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    return loss_value