def _get_step(self) -> EnvStep:
  if self._start_on_next_step:
    self._start_new_episode()

  if StepType.is_last(self._step_type):
    # This is the last (terminating) observation of the environment.
    self._start_on_next_step = True
    self._num_total_steps += 1
    self._num_episodes += 1
    # The policy is not run on the terminal step, so we just carry over the
    # reward, action, and policy_info from the previous step.
    return EnvStep(self._step_type, self._cur_step_num, self._observation,
                   self._action, self._reward, self._discount,
                   self._policy_info, {}, {})

  self._action, self._policy_info = self._policy_fn(self._observation)
  self._next_observation, self._reward, done, _ = self._env.step(self._action)
  self._next_discount = float(not done)
  self._cur_step_num += 1

  if done or (self._episode_step_limit and
              self._cur_step_num >= self._episode_step_limit):
    self._next_step_type = StepType.LAST
  else:
    self._next_step_type = StepType.MID

  step = EnvStep(self._step_type, self._cur_step_num - 1, self._observation,
                 self._action, self._reward, self._discount,
                 self._policy_info, {}, {})

  self._num_steps += 1
  self._num_total_steps += 1
  if StepType.is_first(self._step_type):
    self._num_total_episodes += 1

  self._observation = self._next_observation
  self._step_type = self._next_step_type
  self._discount = self._next_discount

  return step
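
# --- Standalone sketch (not part of the collector above) ---
# A minimal illustration, under assumed names, of the Gym-style bookkeeping in
# _get_step: each emitted step carries the discount from the previous step, the
# new discount is float(not done), and the terminal observation is emitted as
# its own step that reuses the last action and reward (the policy is never
# queried on it). The toy environment and policy are assumptions made for this
# example only.
def sketch_collect_episode(env, policy_fn, episode_step_limit=None):
  """Returns a list of (step_type, observation, action, reward, discount)."""
  steps = []
  obs = env.reset()
  step_type, discount, step_num = 'FIRST', 1.0, 0
  while True:
    action = policy_fn(obs)
    next_obs, reward, done, _ = env.step(action)
    steps.append((step_type, obs, action, reward, discount))
    discount = float(not done)  # Applies to the *next* emitted step.
    step_num += 1
    if done or (episode_step_limit and step_num >= episode_step_limit):
      # Terminal step: carry over the last action and reward.
      steps.append(('LAST', next_obs, action, reward, discount))
      return steps
    obs, step_type = next_obs, 'MID'


class _CountingEnv:
  """Toy environment that returns reward 1.0 and terminates after 3 steps."""

  def reset(self):
    self._t = 0
    return self._t

  def step(self, action):
    self._t += 1
    return self._t, 1.0, self._t >= 3, {}


episode = sketch_collect_episode(_CountingEnv(), policy_fn=lambda obs: 0)
# [('FIRST', 0, 0, 1.0, 1.0), ('MID', 1, 0, 1.0, 1.0),
#  ('MID', 2, 0, 1.0, 1.0), ('LAST', 3, 0, 1.0, 0.0)]
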
def _create_spec(self):
  observation_spec = self._env.observation_spec()
  action_spec = self._env.action_spec()
  tf_agents_time_step_spec = time_step.time_step_spec(observation_spec)
  step_num_spec = specs.tensor_spec.from_spec(
      specs.BoundedArraySpec([],
                             dtype=np.int64,
                             minimum=0,
                             maximum=self._episode_step_limit,
                             name='step_num'))
  return EnvStep(tf_agents_time_step_spec.step_type, step_num_spec,
                 observation_spec, action_spec,
                 tf_agents_time_step_spec.reward,
                 tf_agents_time_step_spec.discount, self._policy.info_spec,
                 {}, {})
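
# --- Standalone sketch (not part of the class above) ---
# Shows, in isolation, the step_num spec built in _create_spec: a scalar
# bounded int64 ArraySpec converted to its TensorSpec counterpart. The episode
# step limit of 100 is an assumption made for this example only.
import numpy as np
from tf_agents import specs

step_num_spec = specs.tensor_spec.from_spec(
    specs.BoundedArraySpec(
        [], dtype=np.int64, minimum=0, maximum=100, name='step_num'))
print(step_num_spec)  # A scalar BoundedTensorSpec, dtype tf.int64, bounds [0, 100].
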
def _add_perturbations(self, env_step: EnvStep, last_rows_read: tf.Tensor):
  """Add history perturbations to rewards."""
  randoms = tf.gather(self._random_numbers, last_rows_read)
  num_perturbations = self._num_perturbations or 1
  perturbations = tf.cast(
      randoms[..., None] *
      tf.pow(2., 1 + tf.range(num_perturbations, dtype=tf.float32)), tf.int64)
  perturbations = tf.cast(tf.math.mod(perturbations, 2),
                          env_step.reward.dtype) - 0.5
  new_reward = (env_step.reward[..., None] +
                self._perturbation_scale * perturbations)
  if self._num_perturbations is None:
    new_reward = tf.squeeze(new_reward, -1)
    new_discount = env_step.discount
  else:
    new_discount = env_step.discount[..., None]
  return env_step.write(reward=new_reward, discount=new_discount)
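
# --- Standalone sketch (not part of the class above) ---
# A minimal, self-contained illustration of the trick used in _add_perturbations:
# multiplying a random number r in [0, 1) by 2, 4, 8, ... and taking floor mod 2
# reads off successive binary digits of r, giving deterministic-per-row coin
# flips that are mapped to -0.5 / +0.5 reward offsets. The names and values
# below (num_perturbations, perturbation_scale, the toy tensors) are assumptions
# made for this example only.
import tensorflow as tf


def sketch_perturb_rewards(rewards, randoms, num_perturbations=4,
                           perturbation_scale=1.0):
  """Returns perturbed rewards of shape [batch, num_perturbations]."""
  digits = tf.cast(
      randoms[..., None] *
      tf.pow(2., 1 + tf.range(num_perturbations, dtype=tf.float32)), tf.int64)
  signs = tf.cast(tf.math.mod(digits, 2), rewards.dtype) - 0.5  # -0.5 or +0.5.
  return rewards[..., None] + perturbation_scale * signs


perturbed = sketch_perturb_rewards(
    rewards=tf.constant([1.0, -2.0]), randoms=tf.constant([0.3125, 0.875]))
print(perturbed.shape)  # (2, 4): each reward expanded into 4 perturbed copies.
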
def _get_step(self) -> EnvStep:
  if self._start_on_next_step:
    self._start_new_episode()

  if StepType.is_last(self._step_type):
    # This is the last (terminating) observation of the environment.
    self._start_on_next_step = True
    self._num_total_steps += 1
    self._num_episodes += 1
    # The policy is not run on the terminal step, so we just carry over the
    # reward, action, and policy_info from the previous step.
    return EnvStep(self._step_type,
                   tf.cast(self._cur_step_num, dtype=tf.int64),
                   self._time_step.observation, self._action,
                   self._time_step.reward, self._time_step.discount,
                   self._policy_info, {}, {})

  self._action, self._policy_state, self._policy_info = self._policy.action(
      self._time_step, self._policy_state)
  # Update type of log-probs to tf.float32... a bit of a bug in TF-Agents.
  if hasattr(self._policy_info, 'log_probability'):
    self._policy_info = policy_step.set_log_probability(
        self._policy_info,
        tf.cast(self._policy_info.log_probability, tf.float32))

  # Sample action from policy.
  env_action = self._action
  if self._env.batch_size is not None:
    env_action = nest_utils.batch_nested_tensors(env_action)

  # Sample next step from environment.
  self._next_time_step = self._env.step(env_action)
  if self._env.batch_size is not None:
    self._next_time_step = nest_utils.unbatch_nested_tensors(
        self._next_time_step)
  self._next_step_type = self._next_time_step.step_type
  self._cur_step_num += 1

  if (self._episode_step_limit and
      self._cur_step_num >= self._episode_step_limit):
    self._next_step_type = tf.convert_to_tensor(  # Overwrite step type.
        value=StepType.LAST, dtype=self._first_step_type.dtype)
    self._next_step_type = tf.reshape(self._next_step_type,
                                      tf.shape(self._first_step_type))

  step = EnvStep(
      self._step_type,
      tf.cast(self._cur_step_num - 1, tf.int64),
      self._time_step.observation,
      self._action,
      # Immediate reward given by next time step.
      self._next_time_step.reward,
      self._time_step.discount,
      self._policy_info, {}, {})

  self._num_steps += 1
  self._num_total_steps += 1
  if StepType.is_first(self._step_type):
    self._num_total_episodes += 1

  self._time_step = self._next_time_step
  self._step_type = self._next_step_type

  return step
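
# --- Standalone sketch (not part of the collector above) ---
# Illustrates the batching round-trip used above when the wrapped environment
# is batched: nest_utils.batch_nested_tensors adds an outer batch dimension of
# size 1 to every tensor in a nest, and unbatch_nested_tensors removes it. The
# toy action nest below is an assumption made for this example only.
import tensorflow as tf
from tf_agents.utils import nest_utils

action = {'move': tf.constant([0.1, 0.2]), 'stop': tf.constant(0)}
batched = nest_utils.batch_nested_tensors(action)
print(batched['move'].shape, batched['stop'].shape)      # (1, 2) (1,)
unbatched = nest_utils.unbatch_nested_tensors(batched)
print(unbatched['move'].shape, unbatched['stop'].shape)  # (2,) ()
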
def train_step(self, experience: dataset_lib.EnvStep,
               target_policy: tf_policy.TFPolicy):
  """Performs a single training step based on experience batch.

  Args:
    experience: A batch of experience. Members should have shape
      [batch_size, time_length, ...].
    target_policy: The policy whose value we want to estimate.

  Returns:
    A tuple of the mean loss and the train op.
  """
  first_env_step = tf.nest.map_structure(lambda t: t[:, 0, ...], experience)

  is_last = tf.cast(experience.is_last(), tf.float32)
  batch_size = tf.shape(is_last)[0]
  time_length = tf.shape(is_last)[1]
  batch_range = tf.range(batch_size, dtype=tf.int64)
  last_indices = tf.where(
      tf.equal(tf.reduce_max(is_last, axis=-1), 0.),
      tf.cast(time_length - 1, tf.int64) *
      tf.ones([batch_size], dtype=tf.int64),
      tf.argmax(is_last, axis=-1))
  last_env_step = tf.nest.map_structure(
      lambda t: tf.gather_nd(t, tf.stack([batch_range, last_indices], -1)),
      experience)

  rewards = self._reward_fn(experience)[:, :-1]
  if self._num_qvalues is not None and tf.rank(rewards) == 2:
    rewards = rewards[:, :, None]

  # Mask out rewards after episode end.
  mask = (tf.range(time_length - 1, dtype=tf.int64)[None, :] <
          last_indices[:, None])
  if self._num_qvalues is not None:
    mask = mask[:, :, None]
  rewards *= tf.cast(mask, tf.float32)

  # Sum up trajectory rewards.
  discounts = tf.pow(self._gamma,
                     tf.range(time_length - 1, dtype=tf.float32))
  if self._num_qvalues is None:
    discounts = discounts[None, :]
  else:
    discounts = discounts[None, :, None]
  sum_discounted_rewards = tf.reduce_sum(rewards * discounts, 1)

  # Discount to be applied on last env step.
  last_discounts = tf.pow(self._gamma, tf.cast(time_length - 1, tf.float32))

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(self._value_network.variables)
    loss = self.train_loss(first_env_step, sum_discounted_rewards,
                           last_env_step, target_policy, last_discounts)

  grads = tape.gradient(loss, self._value_network.variables)
  grad_op = self._optimizer.apply_gradients(
      zip(grads, self._value_network.variables))
  update_op = self._update_targets()
  return tf.reduce_mean(loss), tf.group(grad_op, update_op)
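
# --- Standalone sketch (not part of the class above) ---
# A minimal, self-contained illustration of the reward aggregation performed in
# train_step: rewards at or after each trajectory's last step are masked out,
# then each remaining reward is weighted by gamma**t and summed over time. The
# toy tensors and gamma value below are assumptions made for this example only.
import tensorflow as tf

gamma = 0.9
# Two trajectories of length 4; trajectory 0 ends at t=2, trajectory 1 never ends.
rewards = tf.constant([[1., 1., 1., 0.],
                       [2., 2., 2., 2.]])
is_last = tf.constant([[0., 0., 1., 0.],
                       [0., 0., 0., 0.]])

time_length = tf.shape(is_last)[1]
last_indices = tf.where(
    tf.equal(tf.reduce_max(is_last, axis=-1), 0.),
    tf.cast(time_length - 1, tf.int64) * tf.ones([2], dtype=tf.int64),
    tf.argmax(is_last, axis=-1))            # -> [2, 3]

# Keep only rewards strictly before the last step of each trajectory.
mask = (tf.range(time_length - 1, dtype=tf.int64)[None, :] <
        last_indices[:, None])
masked = rewards[:, :-1] * tf.cast(mask, tf.float32)

discounts = tf.pow(gamma, tf.range(time_length - 1, dtype=tf.float32))[None, :]
sum_discounted_rewards = tf.reduce_sum(masked * discounts, 1)
print(sum_discounted_rewards.numpy())      # [1 + 0.9, 2 + 1.8 + 1.62]
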
def spec(self):
  # TF wraps EnvStep in a TupleWrapper. We need to put it back as an EnvStep.
  return EnvStep(*self._spec)