def _train(self, experience, weights=None):
  # TODO(b/126593927): Support batch dimensions >1.
  if experience.step_type.shape[0] != 1:
    raise NotImplementedError(
        'ReinforceAgent does not yet support batch '
        'dimensions greater than 1.')
  # Remove the (size-1) batch dimension so the rest of the computation works
  # on a single episode.
  experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
  # Monte Carlo returns accumulated backwards over the episode.
  returns = common.compute_returns(experience.reward, experience.discount)
  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='rewards', data=experience.reward, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='discounts',
        data=experience.discount,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  # TODO(b/126592060): replace with tensor normalizer.
  if self._normalize_returns:
    ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
    returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='normalized_returns',
          data=returns,
          step=self.train_step_counter)

  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  variables_to_train = self._actor_network.variables
  with tf.GradientTape() as tape:
    loss_info = self._loss(time_step,
                           experience.action,
                           tf.stop_gradient(returns),
                           weights=weights)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
  grads = tape.gradient(loss_info.loss, variables_to_train)

  # Materialize as a list so it can be iterated more than once (summaries and
  # apply_gradients both consume it).
  grads_and_vars = list(zip(grads, variables_to_train))
  if self._gradient_clipping:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(
      grads_and_vars, global_step=self.train_step_counter)

  return tf.nest.map_structure(tf.identity, loss_info)
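The _loss implementation is not shown here. As a rough sketch only, the standard REINFORCE surrogate loss averages the negative action log-probability weighted by the stop-gradient returns; the function name below and the assumption that the actor network yields a distribution with a log_prob method are illustrative, not taken from the source.

import tensorflow as tf

def reinforce_loss_sketch(actions_distribution, actions, returns, weights=None):
  # -log pi(a_t | s_t) * R_t, with gradients blocked through the returns.
  log_prob = actions_distribution.log_prob(actions)
  per_step_loss = -log_prob * tf.stop_gradient(returns)
  if weights is not None:
    per_step_loss *= weights
  return tf.reduce_mean(per_step_loss)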
def _train(self, experience, weights=None, train_step_counter=None):
  # TODO(sfishman): Support batch dimensions >1.
  if experience.step_type.shape[0] != 1:
    raise NotImplementedError(
        'ReinforceAgent does not yet support batch '
        'dimensions greater than 1.')
  experience = nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
  returns = common.compute_returns(experience.reward, experience.discount)
  if self._debug_summaries:
    tf.contrib.summary.histogram('rewards', experience.reward)
    tf.contrib.summary.histogram('discounts', experience.discount)
    tf.contrib.summary.histogram('returns', returns)

  # TODO(kbanoop): replace with tensor normalizer.
  if self._normalize_returns:
    ret_mean, ret_var = tf.nn.moments(returns, axes=[0])
    returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
    if self._debug_summaries:
      tf.contrib.summary.histogram('normalized_returns', returns)

  # TODO(kbanoop): remove after changing network interface to accept
  # observations and step_types, instead of time_steps.
  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  # TODO(kbanoop): Filter boundary steps.
  loss_info = self._loss(time_step,
                         experience.action,
                         tf.stop_gradient(returns),
                         weights=weights)

  clip_gradients = (tf.contrib.training.clip_gradient_norms_fn(
      self._gradient_clipping) if self._gradient_clipping else None)

  # TODO(sguada): create_train_step should not return a Future.
  loss_info = eager_utils.create_train_step(
      loss_info,
      self._optimizer,
      total_loss_fn=lambda loss_info: loss_info.loss,
      global_step=train_step_counter,
      transform_grads_fn=clip_gradients,
      summarize_gradients=self._summarize_grads_and_vars,
      variables_to_train=lambda: self._actor_network.trainable_weights,
  )
  if isinstance(loss_info, eager_utils.Future):
    loss_info = loss_info()

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      for var in self._actor_network.trainable_weights:
        tf.contrib.summary.histogram(var.name.replace(':', '_'), var)

  return loss_info
def testComputeReturns(self):
  rewards = tf.constant(np.ones(9), dtype=tf.float32)
  discounts = tf.constant([1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0],
                          dtype=tf.float32)
  returns = common.compute_returns(rewards, discounts)
  expected_returns = [5, 4, 3, 2, 1, 3.439, 2.71, 1.9, 1]
  self.evaluate(tf.compat.v1.global_variables_initializer())
  returns = self.evaluate(returns)
  self.assertAllClose(returns, expected_returns)
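The expected values follow the backward recursion R_t = r_t + d_t * R_{t+1}, with the return after the final step taken as 0. Worked by hand for the test data above:

# Tail of the sequence (rewards all 1.0, discount 0.9 until the final step):
#   R_8 = 1 + 0.0 * 0      = 1.0
#   R_7 = 1 + 0.9 * 1.0    = 1.9
#   R_6 = 1 + 0.9 * 1.9    = 2.71
#   R_5 = 1 + 0.9 * 2.71   = 3.439
# The zero discount at index 4 blocks accumulation from later steps, so
# indices 0-4 just count the remaining undiscounted rewards: 5, 4, 3, 2, 1.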
def testComputeReturnsRandomized(self):
  rewards = tf.constant(np.random.random([20]), dtype=tf.float32)
  discounts = tf.constant(np.random.random([20]), dtype=tf.float32)
  returns = common.compute_returns(rewards, discounts)

  def _compute_returns_fn(rewards, discounts):
    """Python implementation of computing discounted returns."""
    returns = np.zeros(len(rewards))
    next_state_return = 0.0
    for t in range(len(returns) - 1, -1, -1):
      returns[t] = rewards[t] + discounts[t] * next_state_return
      next_state_return = returns[t]
    return returns.astype(np.float32)

  expected_returns = tf.compat.v1.py_func(_compute_returns_fn,
                                          [rewards, discounts], tf.float32)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  returns = self.evaluate(returns)
  expected_returns = self.evaluate(expected_returns)
  self.assertAllClose(returns, expected_returns)
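As a quick sanity check of the same recursion, a sketch assuming eager execution and the same tf and common imports used by these tests:

rewards = tf.constant([1.0, 1.0, 1.0])
discounts = tf.constant([0.9, 0.9, 0.0])
# Backwards: 1.0, then 1 + 0.9 * 1.0 = 1.9, then 1 + 0.9 * 1.9 = 2.71.
print(common.compute_returns(rewards, discounts).numpy())  # ~[2.71, 1.9, 1.0]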