Example #1
  def _get_step(self) -> EnvStep:
    if self._start_on_next_step:
      self._start_new_episode()

    if StepType.is_last(self._step_type):
      # This is the last (terminating) observation of the environment.
      self._start_on_next_step = True
      self._num_total_steps += 1
      self._num_episodes += 1
      # The policy is not run on the terminal step, so we just carry over the
      # reward, action, and policy_info from the previous step.
      return EnvStep(self._step_type, self._cur_step_num,
                     self._observation, self._action,
                     self._reward, self._discount,
                     self._policy_info, {}, {})

    self._action, self._policy_info = self._policy_fn(self._observation)
    self._next_observation, self._reward, done, _ = self._env.step(self._action)
    self._next_discount = float(not done)
    self._cur_step_num += 1

    if done or (self._episode_step_limit and
                self._cur_step_num >= self._episode_step_limit):
      self._next_step_type = StepType.LAST
    else:
      self._next_step_type = StepType.MID

    # Emit the step for the current observation/action pair. The reward is the
    # immediate reward returned by env.step above, while the discount still
    # belongs to the current observation (the new one is rolled forward below).
    step = EnvStep(self._step_type, self._cur_step_num - 1,
                   self._observation, self._action,
                   self._reward, self._discount,
                   self._policy_info, {}, {})

    self._num_steps += 1
    self._num_total_steps += 1
    if StepType.is_first(self._step_type):
      self._num_total_episodes += 1

    self._observation = self._next_observation
    self._step_type = self._next_step_type
    self._discount = self._next_discount

    return step
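
The method above runs the policy on the current observation, steps the environment, emits the transition, and rolls the new observation, step type, and discount forward for the next call. Below is a minimal standalone sketch of the same collection pattern against a plain Gym environment; it assumes the classic 4-tuple step API, and collect_episode and policy_fn are illustrative names, not part of the library code above.

import gym

def collect_episode(env, policy_fn, episode_step_limit=200):
  """Rolls out one episode, mirroring the done/step-limit handling above."""
  observation = env.reset()
  transitions = []
  for _ in range(episode_step_limit):
    action = policy_fn(observation)
    next_observation, reward, done, _ = env.step(action)
    discount = float(not done)  # Zero discount on termination, as in _get_step.
    transitions.append((observation, action, reward, discount))
    observation = next_observation
    if done:
      break
  return transitions

env = gym.make('CartPole-v0')
episode = collect_episode(env, lambda obs: env.action_space.sample())
print(len(episode), 'transitions collected')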
Example #2
    def _create_spec(self):
        observation_spec = self._env.observation_spec()
        action_spec = self._env.action_spec()

        tf_agents_time_step_spec = time_step.time_step_spec(observation_spec)
        step_num_spec = specs.tensor_spec.from_spec(
            specs.BoundedArraySpec([],
                                   dtype=np.int64,
                                   minimum=0,
                                   maximum=self._episode_step_limit,
                                   name='step_num'))
        return EnvStep(tf_agents_time_step_spec.step_type, step_num_spec,
                       observation_spec, action_spec,
                       tf_agents_time_step_spec.reward,
                       tf_agents_time_step_spec.discount,
                       self._policy.info_spec, {}, {})
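
For reference, a small sketch of the TF-Agents spec helpers used in _create_spec, assuming tf_agents is installed; the observation spec and the step limit of 100 are made up for illustration.

import numpy as np
from tf_agents import specs
from tf_agents.trajectories import time_step

observation_spec = specs.ArraySpec(shape=(4,), dtype=np.float32, name='observation')
ts_spec = time_step.time_step_spec(observation_spec)  # step_type/reward/discount specs
step_num_spec = specs.tensor_spec.from_spec(
    specs.BoundedArraySpec([], dtype=np.int64, minimum=0, maximum=100,
                           name='step_num'))
print(ts_spec.step_type, ts_spec.reward, ts_spec.discount, step_num_spec)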
Example #3
    def _add_perturbations(self, env_step: EnvStep, last_rows_read: tf.Tensor):
        """Add history perturbations to rewards."""
        randoms = tf.gather(self._random_numbers, last_rows_read)
        num_perturbations = self._num_perturbations or 1
        # Extract successive binary digits of each random number; after the
        # mod-2 and the -0.5 shift this yields a +/-0.5 offset per copy.
        perturbations = tf.cast(
            randoms[..., None] *
            tf.pow(2., 1 + tf.range(num_perturbations, dtype=tf.float32)),
            tf.int64)
        perturbations = tf.cast(tf.math.mod(perturbations, 2),
                                env_step.reward.dtype) - 0.5

        new_reward = (env_step.reward[..., None] +
                      self._perturbation_scale * perturbations)
        if self._num_perturbations is None:
            new_reward = tf.squeeze(new_reward, -1)
            new_discount = env_step.discount
        else:
            new_discount = env_step.discount[..., None]
        return env_step.write(reward=new_reward, discount=new_discount)
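
The perturbation code reads successive binary digits of a per-row random number in [0, 1): multiplying by powers of two, truncating to an integer, and taking mod 2 yields a 0/1 bit per copy, which the -0.5 shift turns into a +/-0.5 offset scaled by perturbation_scale. A toy check of that bit extraction (illustrative only, separate from the class above):

import tensorflow as tf

randoms = tf.constant([0.3125])  # 0.0101 in binary
num_perturbations = 4
bits = tf.cast(
    randoms[..., None] *
    tf.pow(2., 1 + tf.range(num_perturbations, dtype=tf.float32)),
    tf.int64)
perturbations = tf.cast(tf.math.mod(bits, 2), tf.float32) - 0.5
print(perturbations.numpy())  # [[-0.5  0.5 -0.5  0.5]]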
Example #4
    def _get_step(self) -> EnvStep:
        if self._start_on_next_step:
            self._start_new_episode()

        if StepType.is_last(self._step_type):
            # This is the last (terminating) observation of the environment.
            self._start_on_next_step = True
            self._num_total_steps += 1
            self._num_episodes += 1
            # The policy is not run on the terminal step, so we just carry over the
            # reward, action, and policy_info from the previous step.
            return EnvStep(self._step_type,
                           tf.cast(self._cur_step_num, dtype=tf.int64),
                           self._time_step.observation, self._action,
                           self._time_step.reward, self._time_step.discount,
                           self._policy_info, {}, {})

        self._action, self._policy_state, self._policy_info = self._policy.action(
            self._time_step, self._policy_state)

        # Update type of log-probs to tf.float32... a bit of a bug in TF-Agents.
        if hasattr(self._policy_info, 'log_probability'):
            self._policy_info = policy_step.set_log_probability(
                self._policy_info,
                tf.cast(self._policy_info.log_probability, tf.float32))

        # Sample action from policy.
        env_action = self._action
        if self._env.batch_size is not None:
            env_action = nest_utils.batch_nested_tensors(env_action)

        # Sample next step from environment.
        self._next_time_step = self._env.step(env_action)
        if self._env.batch_size is not None:
            self._next_time_step = nest_utils.unbatch_nested_tensors(
                self._next_time_step)
        self._next_step_type = self._next_time_step.step_type
        self._cur_step_num += 1
        if (self._episode_step_limit
                and self._cur_step_num >= self._episode_step_limit):
            self._next_step_type = tf.convert_to_tensor(  # Overwrite step type.
                value=StepType.LAST,
                dtype=self._first_step_type.dtype)
            self._next_step_type = tf.reshape(self._next_step_type,
                                              tf.shape(self._first_step_type))

        step = EnvStep(
            self._step_type,
            tf.cast(self._cur_step_num - 1, tf.int64),
            self._time_step.observation,
            self._action,
            # Immediate reward given by next time step.
            self._next_time_step.reward,
            self._time_step.discount,
            self._policy_info,
            {},
            {})

        self._num_steps += 1
        self._num_total_steps += 1
        if StepType.is_first(self._step_type):
            self._num_total_episodes += 1

        self._time_step = self._next_time_step
        self._step_type = self._next_step_type

        return step
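
This TF-Agents variant differs from Example #1 mainly in that the action comes from a stateful TFPolicy and that a batched environment requires wrapping and unwrapping tensors around env.step. A small sketch of those nest_utils helpers, with made-up tensors and assuming tf_agents is installed:

import tensorflow as tf
from tf_agents.utils import nest_utils

unbatched = {'observation': tf.constant([1.0, 2.0]), 'reward': tf.constant(0.5)}
batched = nest_utils.batch_nested_tensors(unbatched)    # adds a leading batch dim of 1
roundtrip = nest_utils.unbatch_nested_tensors(batched)  # removes it again
print(batched['observation'].shape, roundtrip['observation'].shape)  # (1, 2) (2,)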
Example #5
    def train_step(self, experience: dataset_lib.EnvStep,
                   target_policy: tf_policy.TFPolicy):
        """Performs a single training step based on experience batch.

    Args:
      experience: A batch of experience. Members should have shape [batch_size,
        time_length, ...].
      target_policy: The policy whose value we want to estimate.

    Returns:
      A train op.
    """
        first_env_step = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                               experience)

        is_last = tf.cast(experience.is_last(), tf.float32)
        batch_size = tf.shape(is_last)[0]
        time_length = tf.shape(is_last)[1]
        batch_range = tf.range(batch_size, dtype=tf.int64)
        # Index of the last valid step per trajectory: the first LAST step if
        # one exists, otherwise the final time index.
        last_indices = tf.where(
            tf.equal(tf.reduce_max(is_last, axis=-1), 0.),
            tf.cast(time_length - 1, tf.int64) *
            tf.ones([batch_size], dtype=tf.int64), tf.argmax(is_last, axis=-1))
        last_env_step = tf.nest.map_structure(
            lambda t: tf.gather_nd(t, tf.stack([batch_range, last_indices], -1)
                                   ), experience)

        rewards = self._reward_fn(experience)[:, :-1]
        if self._num_qvalues is not None and tf.rank(rewards) == 2:
            rewards = rewards[:, :, None]

        # Mask out rewards after episode end.
        mask = (tf.range(time_length - 1, dtype=tf.int64)[None, :] <
                last_indices[:, None])
        if self._num_qvalues is not None:
            mask = mask[:, :, None]
        rewards *= tf.cast(mask, tf.float32)

        # Sum up trajectory rewards.
        discounts = tf.pow(self._gamma,
                           tf.range(time_length - 1, dtype=tf.float32))
        if self._num_qvalues is None:
            discounts = discounts[None, :]
        else:
            discounts = discounts[None, :, None]
        sum_discounted_rewards = tf.reduce_sum(rewards * discounts, 1)

        # Discount to be applied on last env step.
        last_discounts = tf.pow(self._gamma,
                                tf.cast(time_length - 1, tf.float32))

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self._value_network.variables)
            loss = self.train_loss(first_env_step, sum_discounted_rewards,
                                   last_env_step, target_policy,
                                   last_discounts)

        grads = tape.gradient(loss, self._value_network.variables)
        grad_op = self._optimizer.apply_gradients(
            zip(grads, self._value_network.variables))
        update_op = self._update_targets()
        return tf.reduce_mean(loss), tf.group(grad_op, update_op)
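
Two pieces of indexing in train_step are easy to misread: last_indices selects the first terminal step of each trajectory, falling back to the final time index when no terminal step is present, and the mask then zeroes rewards past that point before the discounted sum. A toy check with made-up data, separate from the class above:

import tensorflow as tf

is_last = tf.constant([[0., 0., 1., 0.],
                       [0., 0., 0., 0.]])
batch_size = tf.shape(is_last)[0]
time_length = tf.shape(is_last)[1]
last_indices = tf.where(
    tf.equal(tf.reduce_max(is_last, axis=-1), 0.),
    tf.cast(time_length - 1, tf.int64) * tf.ones([batch_size], dtype=tf.int64),
    tf.argmax(is_last, axis=-1))
mask = (tf.range(time_length - 1, dtype=tf.int64)[None, :] < last_indices[:, None])
print(last_indices.numpy())  # [2 3]
print(mask.numpy())          # [[ True  True False]  [ True  True  True]]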
Example #6
    def spec(self):
        # TF wraps EnvStep in a TupleWrapper. We need to put it back as an EnvStep.
        return EnvStep(*self._spec)