Example #1
  def _action(self, time_step, policy_state, seed: Optional[types.Seed] = None):
    del seed  # Unused. Seed passed to the class.
    outer_dims = self._outer_dims
    if outer_dims is None:
      if self.time_step_spec.observation:
        outer_dims = nest_utils.get_outer_array_shape(
            time_step.observation, self.time_step_spec.observation)
      else:
        outer_dims = ()

    observation_and_action_constraint_splitter = (
        self.observation_and_action_constraint_splitter)

    if observation_and_action_constraint_splitter is not None:
      _, mask = observation_and_action_constraint_splitter(
          time_step.observation)

      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(zero_logits, mask)
      random_action = tf.cast(
          masked_categorical.sample() + self.action_spec.minimum,
          self.action_spec.dtype)

      # If the action spec says each action should be shaped (1,), add another
      # dimension so the final shape is (B, 1) rather than (B,).
      if len(self.action_spec.shape) == 1:
        random_action = tf.expand_dims(random_action, axis=-1)
    else:
      random_action = array_spec.sample_spec_nest(
          self._action_spec, self._rng, outer_dims=outer_dims)

    info = array_spec.sample_spec_nest(
        self._info_spec, self._rng, outer_dims=outer_dims)

    return policy_step.PolicyStep(random_action, policy_state, info)
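A note on the masked branch above: pairing all-zero logits with the action mask yields a uniform distribution over only the valid actions. A minimal sketch of that idea, assuming a batch of two observations and four discrete actions (the mask values are invented for illustration):

import tensorflow as tf
from tf_agents.distributions import masked

# Hypothetical mask: 1 marks a valid action, 0 an invalid one.
mask = tf.constant([[1, 0, 1, 1],
                    [0, 1, 0, 0]])
# Zero logits plus the mask give uniform sampling over the valid actions only.
zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
masked_categorical = masked.MaskedCategorical(zero_logits, mask)
print(masked_categorical.sample())  # e.g. [3 1]; masked-out actions never appear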
Example #2
  def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._handle_auto_reset and self._done:
      return self.reset()

    # Some environments (e.g. FrozenLake) use the action as a key into the
    # transition-probability table, so it has to be hashable. For discrete
    # actions we get a 0-d numpy array (e.g. array(2)), which is not hashable;
    # in that case we simply pull out the scalar value, which is.
    try:
      action = action.item() if self._action_is_discrete else action
    except AttributeError:
      action = action[0]  # Remove ListWrapper for single-agent compatibility

    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
      observation = self._to_obs_space_dtype(observation)

    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
      return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
      return ts_lib.transition(observation, reward, self._discount,
                               outer_dims=outer_dims)
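The hashability comment above can be checked directly: a 0-d NumPy array cannot be used as a dictionary key, while the Python scalar returned by .item() can. A small sketch (the transition table is invented for illustration):

import numpy as np

transitions = {0: 'left', 1: 'down', 2: 'right', 3: 'up'}  # made-up lookup table
action = np.array(2)  # a 0-d array, as emitted for a discrete action

try:
    transitions[action]  # fails: ndarray objects are unhashable
except TypeError as err:
    print('unhashable:', err)

print(transitions[action.item()])  # 'right' -- .item() returns a plain int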
Example #3
    def _step(self, action):
        # Automatically reset the environments on step if they need to be reset.
        if self._auto_reset and self._done:
            return self.reset_agent()

        action = action.item() if self._action_is_discrete else action

        observation, reward, self._done, self._info = self._gym_env.step(
            action)

        if self._match_obs_space_dtype:
            observation = self._to_obs_space_dtype(observation)

        reward = np.asarray(reward, dtype=self.reward_spec().dtype)
        outer_dims = nest_utils.get_outer_array_shape(reward,
                                                      self.reward_spec())

        if self._done:
            return ts_lib.termination(observation,
                                      reward,
                                      outer_dims=outer_dims)
        else:
            return ts_lib.transition(observation,
                                     reward,
                                     self._discount,
                                     outer_dims=outer_dims)
Example #4
  def testTerminationMultiRewards(self):
    observation = np.array([[-1], [-1]])
    reward = [np.array([[2.], [2.]]),
              np.array([[3., 3.], [4., 4.]])]
    time_step = ts.termination(observation, reward)

    self.assertItemsEqual([ts.StepType.LAST] * 2, time_step.step_type)
    self.assertItemsEqual(observation, time_step.observation)
    self.assertAllEqual(reward[0], time_step.reward[0])
    self.assertAllEqual(reward[1], time_step.reward[1])
    self.assertItemsEqual([0., 0.], time_step.discount)

    reward = np.array([[2., 2., 2.], [3., 3., 3.]])
    reward_spec = [array_spec.ArraySpec((3,), np.float32, 'multi_r')]
    outer_dims = nest_utils.get_outer_array_shape(reward, reward_spec)
    time_step_batch = ts.termination(observation, reward, outer_dims)

    # Check that passing outer_dims works
    self.assertItemsEqual([ts.StepType.LAST] * 2, time_step_batch.step_type)
    self.assertItemsEqual(observation, time_step_batch.observation)
    self.assertAllEqual(reward[0], time_step_batch.reward[0])
    self.assertAllEqual(reward[1], time_step_batch.reward[1])
    self.assertItemsEqual([0., 0.], time_step_batch.discount)

    # Check that it gets a different result with no outer_dims
    time_step_no_batch = ts.termination(observation, reward, outer_dims=[])
    self.assertEqual(ts.StepType.LAST, time_step_no_batch.step_type)
    self.assertItemsEqual(observation, time_step_no_batch.observation)
    self.assertAllEqual(reward[0], time_step_no_batch.reward[0])
    self.assertAllEqual(reward[1], time_step_no_batch.reward[1])
    self.assertEqual(0., time_step_no_batch.discount)
Example #5
    def _add_batch(self, items):
        """
        Add the experiences in the batch to the replay buffer. Only batches of size 1 are supported at the moment

        Params:
            items: this contains the experiences to be added
        """
        logger.info("Adding a batch of 1 experiences to Replay buffer")

        outer_shape = nest_utils.get_outer_array_shape(items, self._data_spec)
        if outer_shape[0] != 1:
            raise NotImplementedError('PyPrioritizedReplayBuffer only supports a batch '
                                      'size of 1, but received `items` with batch '
                                      'size {}.'.format(outer_shape[0]))

        item = nest_utils.unbatch_nested_array(items)

        # Use the maximum priority currently in the buffer, or 1.0 if the
        # buffer is still empty.
        max_priority = self._prioritized_buffer_priorities.max() if self._np_state.size > 0 else 1.0

        with self._lock:
            if self._np_state.size == self._prioritized_buffer_capacity:
                # If we are at capacity, we are deleting element cur_id.
                self._on_delete(self._storage.get(self._np_state.cur_id))

            self._storage.set(self._np_state.cur_id, self._encode(item))
            # add the max priority of the experience to the priority array
            self._prioritized_buffer_priorities[self._np_state.cur_id] = max_priority

            self._np_state.size = np.minimum(self._np_state.size + 1, self._prioritized_buffer_capacity)
            self._np_state.cur_id = (self._np_state.cur_id + 1) % self._prioritized_buffer_capacity
            self._np_state.item_count += 1
Example #6
def double_batch_pred2(the_model, all_inputs, specs, is_training=False):
    outer_dims = nest_utils.get_outer_array_shape(all_inputs, specs)
    all_inputs, _ = nest_utils.flatten_multi_batched_nested_tensors(
        all_inputs, specs)

    vals = the_model(all_inputs, is_training=is_training)
    vals = tf.reshape(vals, (*outer_dims, -1))
    return vals
Example #7
  def call(self,
           observation,
           step_type=None,
           network_state=(),
           training=False):
    shape = nest_utils.get_outer_array_shape(
        observation, self._input_tensor_spec)
    return tf.constant(
        self._constant_output_val, tf.float32, shape=shape), network_state
Example #8
  def _build_from_time_step(self, time_step):
    outer_shape = nest_utils.get_outer_array_shape(
        time_step, self._time_step_spec)
    if len(outer_shape) == 1:
      self.initialize(outer_shape[0])
    elif not outer_shape:
      self.initialize(None)
    else:
      raise ValueError(
          'Cannot handle more than one outer dimension. Saw {} outer '
          'dimensions: {}'.format(len(outer_shape), outer_shape))
Example #9
  def _add_batch(self, items):
    tf.nest.assert_same_structure(items, self._data_spec)
    outer_shape = nest_utils.get_outer_array_shape(items, self._data_spec)
    with tf.device(self._device), tf.name_scope(self._scope):
      self.next_episode()
      if outer_shape[0] != 1:
        for item in items:
          self.bigtable_add_row(item)
      else:
        self.bigtable_add_row(items)
      self.bigtable_write_rows()
Example #10
  def testGetOuterArrayShape(self):
    spec = (
        array_spec.ArraySpec([5, 8], np.float32),
        (array_spec.ArraySpec([1], np.int32),
         array_spec.ArraySpec([2, 2, 2], np.float32))
    )

    batch_size = 3
    unstacked_arrays = [self.zeros_from_spec(spec) for _ in range(batch_size)]

    outer_dims = nest_utils.get_outer_array_shape(unstacked_arrays[0], spec)
    self.assertEqual((), outer_dims)

    stacked_array = nest_utils.stack_nested_arrays(unstacked_arrays)
    outer_dims = nest_utils.get_outer_array_shape(stacked_array, spec)
    self.assertEqual((batch_size,), outer_dims)

    time_dim = [nest_utils.batch_nested_array(arr) for arr in unstacked_arrays]
    batch_time = nest_utils.stack_nested_arrays(time_dim)
    outer_dims = nest_utils.get_outer_array_shape(batch_time, spec)
    self.assertEqual((batch_size, 1), outer_dims)
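For reference, the contract exercised by this test: get_outer_array_shape returns the leading (batch and time) dimensions of an array nest beyond the shape declared in its spec. A minimal standalone sketch with a single spec (the array values are arbitrary):

import numpy as np
from tf_agents.specs import array_spec
from tf_agents.utils import nest_utils

spec = array_spec.ArraySpec([2], np.float32)

single = np.zeros([2], np.float32)  # no outer dims
print(nest_utils.get_outer_array_shape(single, spec))  # ()

batched = np.zeros([5, 2], np.float32)  # one outer (batch) dim of size 5
print(nest_utils.get_outer_array_shape(batched, spec))  # (5,)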
Example #11
  def _action(self, time_step, policy_state):
    outer_dims = self._outer_dims
    if outer_dims is None:
      if self.time_step_spec.observation:
        outer_dims = nest_utils.get_outer_array_shape(
            time_step.observation, self.time_step_spec.observation)
      else:
        outer_dims = ()

    random_action = array_spec.sample_spec_nest(
        self._action_spec, self._rng, outer_dims=outer_dims)
    return policy_step.PolicyStep(random_action, policy_state)
Example #12
  def _add_batch(self, items):
    outer_shape = nest_utils.get_outer_array_shape(items, self._data_spec)
    if outer_shape[0] != 1:
      raise NotImplementedError('PyUniformReplayBuffer only supports a batch '
                                'size of 1, but received `items` with batch '
                                'size {}.'.format(outer_shape[0]))

    item = nest_utils.unbatch_nested_array(items)
    with self._lock:
      if self._np_state.size == self._capacity:
        # If we are at capacity, we are deleting element cur_id.
        self._on_delete(self._storage.get(self._np_state.cur_id))
      self._storage.set(self._np_state.cur_id, self._encode(item))
      self._np_state.size = np.minimum(self._np_state.size + 1, self._capacity)
      self._np_state.cur_id = (self._np_state.cur_id + 1) % self._capacity
      self._np_state.item_count += 1
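The cursor arithmetic above implements a ring buffer: once size reaches capacity, each new write overwrites the oldest slot. A toy sketch of just that bookkeeping, detached from TF-Agents (all names below are illustrative):

capacity = 3
storage = [None] * capacity
size, cur_id = 0, 0

for value in ['a', 'b', 'c', 'd']:
    storage[cur_id] = value  # overwrites the oldest item once the buffer is full
    size = min(size + 1, capacity)
    cur_id = (cur_id + 1) % capacity

print(storage)  # ['d', 'b', 'c'] -- 'a' was evicted on the fourth write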
Example #13
  def step_adversary(self, action):
    action = action.item() if self._action_is_discrete else action

    observation, reward, self._done, self._info = self._gym_env.step_adversary(
        action)

    if self._match_obs_space_dtype:
      observation = self._adversary_to_obs_space_dtype(observation)

    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
      return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
      return ts_lib.transition(observation, reward, self._discount,
                               outer_dims=outer_dims)
Example #14
    def _apply_actor_network(self,
                             time_step,
                             step_type,
                             policy_state,
                             mask=None):
        observation = time_step

        if self._observation_normalizer:
            observation = self._observation_normalizer.normalize(observation)
        if tf.is_tensor(observation):
            if not nest_utils.is_batched_nested_tensors(
                    observation, self.time_step_spec.observation):
                observation = nest_utils.batch_nested_tensors(observation)
        else:
            if not nest_utils.get_outer_array_shape(
                    observation, self.time_step_spec.observation):
                observation = nest_utils.batch_nested_array(observation)

        alpha = np.array([self.alpha])[None]
        return self._actor_network((observation, alpha),
                                   step_type,
                                   policy_state,
                                   training=self._training)
Example #15
    def _action(self, time_step, policy_state):

        outer_dims = self._outer_dims

        if outer_dims is None:
            if self.time_step_spec.observation:
                outer_dims = nest_utils.get_outer_array_shape(
                    time_step.observation, self.time_step_spec.observation)
            else:
                outer_dims = ()

        random_action = np.array([
            simple_human_policy(
                human_agent_wrapper(
                    self.__convert_tf_obs_to_numpy(
                        time_step.observation))).value
        ])

        info = array_spec.sample_spec_nest(self._info_spec,
                                           self._rng,
                                           outer_dims=outer_dims)

        return policy_step.PolicyStep(random_action, policy_state, info)
Example #16
  def run_agent(self, env, agent_list, reset_func, step_func, agent_idx=None):
    """Runs an agent in an environment given a step and reset function.

    Args:
      env: A TF-agents TF environment.
      agent_list: A list of TrainAgentPackages, each of which contains an agent
        that can be run in the environment. The agent to run will be randomly
        selected from the list (to handle population based training).
      reset_func: Callable function used to reset the environment.
      step_func: Callable function used to step the environment.
      agent_idx: The integer population index of the agent to run.

    Returns:
      The average reward achieved, the maximum reward, and the index of the
        agent selected.
    """
    if agent_idx is None:
      agent_idx = np.random.choice(len(agent_list))
    agent = agent_list[agent_idx]

    if self.collect:
      policy = agent.collect_policy
      observers = agent.observers
    else:
      policy = agent.eval_policy
      observers = agent.eval_metrics

    time_step = reset_func()
    policy_state = policy.get_initial_state(env.batch_size)

    num_steps = tf.constant(0.0)
    num_episodes = tf.zeros_like(time_step.reward)

    avg_reward = tf.zeros_like(time_step.reward)
    max_reward = tf.zeros_like(time_step.reward)

    while num_steps < agent.max_steps:
      action_step = policy.action(time_step, policy_state)
      next_time_step = step_func(action_step.action)

      # Replace with terminal timestep to manually end episode (enables
      # artificially decreasing number of steps for one of the agents).
      if agent.name == 'agent' and num_steps >= agent.max_steps - 1:
        outer_dims = nest_utils.get_outer_array_shape(
            next_time_step.reward, env.reward_spec())
        next_time_step = ts_lib.termination(
            next_time_step.observation, next_time_step.reward,
            outer_dims=outer_dims)

      traj = trajectory.from_transition(time_step, action_step, next_time_step)

      num_steps += 1
      num_episodes += tf.cast(traj.is_last(), tf.float32)

      avg_reward += next_time_step.reward
      max_reward = tf.math.maximum(max_reward, next_time_step.reward)

      for observer in observers:
        observer(traj)

      time_step = next_time_step
      policy_state = action_step.state

    avg_reward = avg_reward / num_episodes

    return avg_reward, max_reward, agent_idx