def reset_random(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_array(self._envs[0].reset_random())
     else:
         time_steps = self._execute(lambda env: env.reset_random(),
                                    self._envs)
         return nest_utils.stack_nested_arrays(time_steps)
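For reference: `batch_nested_array` prepends a batch dimension of size 1 to every array in a nest, while `stack_nested_arrays` stacks a list of nests along a new leading axis. A minimal sketch, assuming tf_agents is installed; the dict below is a hypothetical stand-in for a TimeStep nest:

import numpy as np
from tf_agents.utils import nest_utils

single = {'observation': np.zeros((4,), np.float32), 'reward': np.float32(0.0)}

# One environment: add a leading batch dimension of size 1.
batched = nest_utils.batch_nested_array(single)
print(batched['observation'].shape)  # (1, 4)

# Several environments: stack per-env results along a new leading axis.
stacked = nest_utils.stack_nested_arrays([single, single, single])
print(stacked['observation'].shape)  # (3, 4)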
Example #2
 def _action(self, time_step, policy_state):
     time_step = nest_utils.batch_nested_array(time_step)
     # Avoid passing numpy arrays to avoid retracing of the tf.function.
     time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)
     policy_step = self._policy_action_fn(time_step, policy_state)
     return policy_step._replace(
         action=nest_utils.unbatch_nested_array(policy_step.action.numpy()))
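Roughly the batch, convert-to-tensor, act, unbatch round trip performed above, with a hypothetical `fake_policy_action_fn` standing in for `self._policy_action_fn`:

import numpy as np
import tensorflow as tf
from tf_agents.trajectories import policy_step
from tf_agents.utils import nest_utils

def fake_policy_action_fn(time_step, policy_state):
  # Stand-in policy: returns one integer action per batch entry.
  batch_size = tf.shape(time_step['observation'])[0]
  return policy_step.PolicyStep(tf.zeros([batch_size], tf.int32), policy_state, ())

time_step = {'observation': np.zeros((4,), np.float32)}
time_step = nest_utils.batch_nested_array(time_step)                # add batch dim
time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)  # keep a stable trace
step = fake_policy_action_fn(time_step, policy_state=())
action = nest_utils.unbatch_nested_array(step.action.numpy())       # back to unbatched numpy
print(action.shape)  # ()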
Example #3
 def _action(self, time_step, policy_state, seed: Optional[types.Seed] = None):
   if seed is not None and self._use_tf_function:
     logging.warning(
         'Using `seed` may force a retrace for each call to `action`.')
   if self._batch_time_steps:
     time_step = nest_utils.batch_nested_array(time_step)
   # Avoid passing numpy arrays to avoid retracing of the tf.function.
   time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)
   if seed is not None:
     policy_step = self._policy_action_fn(time_step, policy_state, seed=seed)
   else:
     policy_step = self._policy_action_fn(time_step, policy_state)
   if not self._batch_time_steps:
     return policy_step
   return policy_step._replace(
       action=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.action),
        # We intentionally do not convert the `state`, so it is returned exactly
        # as the underlying policy generated it (i.e. as a Tensor), even though
        # that is not necessarily compatible with a py-policy. The reason is
        # that the `state` is fed straight back to the policy: if it were
        # converted here, it would have to be converted back to its original
        # form before calling the policy's `action` method at the next step. To
        # store the `state`, e.g. in a replay buffer, we suggest placing it in
        # the `info` field.
       info=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.info))
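A small illustration of the `_replace` call above (assuming the same tf_agents version as the snippet): `action` and `info` are unbatched and converted to numpy arrays, while `state` is deliberately left as the Tensor the policy produced:

import tensorflow as tf
from tf_agents.trajectories import policy_step
from tf_agents.utils import nest_utils

step = policy_step.PolicyStep(action=tf.constant([[0.5]]),  # batched action
                              state=tf.zeros([1, 8]),       # recurrent state
                              info=())
step = step._replace(
    action=nest_utils.unbatch_nested_tensors_to_arrays(step.action),
    info=nest_utils.unbatch_nested_tensors_to_arrays(step.info))
print(type(step.action))  # numpy.ndarray, shape (1,)
print(type(step.state))   # still a Tensor, untouched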
Example #4
 def render(self, mode="rgb_array"):
     if self._num_envs == 1:
         img = self._envs[0].render(mode)
         return nest_utils.batch_nested_array(img)
     else:
         imgs = self._execute(lambda env: env.render(mode), self._envs)
         return nest_utils.stack_nested_arrays(imgs)
Example #5
    def _fill_replay_buffer(self):
        # Generate N frames: the value of pixels is the frame index.
        # The observations will be generated by stacking K frames out of those N,
        # generating some redundancies between the observations.
        single_frames = []
        frame_count = 100
        for k in range(frame_count):
            single_frames.append(np.full(self._single_shape, k,
                                         dtype=np.int32))

        # Add stack of frames to the replay buffer.
        time_steps = []
        for k in range(len(single_frames) - self._stack_count + 1):
            observation = np.concatenate(single_frames[k:k +
                                                       self._stack_count],
                                         axis=-1)
            time_steps.append(ts.transition(observation, reward=0.0))

        self._transition_count = len(time_steps) - 1
        dummy_action = policy_step.PolicyStep(np.int32(0))
        for k in range(self._transition_count):
            self._replay_buffer.add_batch(
                nest_utils.batch_nested_array(
                    trajectory.from_transition(time_steps[k], dummy_action,
                                               time_steps[k + 1])))
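The `batch_nested_array` call before `add_batch` is what gives each trajectory the leading batch dimension the replay buffer expects. A minimal sketch with hypothetical shapes:

import numpy as np
from tf_agents.trajectories import policy_step, trajectory
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import nest_utils

obs0 = np.zeros((15, 15, 4), np.int32)
obs1 = np.ones((15, 15, 4), np.int32)
step0 = ts.transition(obs0, reward=0.0)
step1 = ts.transition(obs1, reward=0.0)
item = trajectory.from_transition(step0, policy_step.PolicyStep(np.int32(0)), step1)

batched = nest_utils.batch_nested_array(item)
print(batched.observation.shape)  # (1, 15, 15, 4) -- outer dim of 1 for add_batch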
 def get_passable(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_array(
             tf.cast(self._envs[0].passable, tf.float32))
     else:
         return tf.stack(
             [tf.cast(env.passable, tf.float32) for env in self._envs])
Example #7
    def _step(self, actions):
        """Forward a batch of actions to the wrapped environments.

    Args:
      actions: Batched action, possibly nested, to apply to the environment.

    Raises:
      ValueError: Invalid actions.

    Returns:
      Batch of observations, rewards, and done flags.
    """

        if self._num_envs == 1:
            """Nitty: modified to remove cannot squeeze error"""
            # actions = nest_utils.unbatch_nested_array(actions)
            time_steps = self._envs[0].step(actions)
            return nest_utils.batch_nested_array(time_steps)
        else:
            unstacked_actions = unstack_actions(actions)
            if len(unstacked_actions) != self.batch_size:
                raise ValueError(
                    "Primary dimension of action items does not match "
                    "batch size: %d vs. %d" %
                    (len(unstacked_actions), self.batch_size))
            time_steps = self._execute(
                lambda env_action: env_action[0].step(env_action[1]),
                zip(self._envs, unstacked_actions))
            return nest_utils.stack_nested_arrays(time_steps)
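In the multi-env branch, `unstack_actions` splits the batched action nest into one unbatched action per environment before each `step` call. A sketch; the action structure is hypothetical:

import numpy as np
from tf_agents.environments import batched_py_environment

batched_action = {'move': np.array([0, 1, 2], np.int32)}
per_env = batched_py_environment.unstack_actions(batched_action)
print(len(per_env))        # 3 -- one action nest per environment
print(per_env[0]['move'])  # 0 (batch dimension removed)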
Example #8
  def _action(self, time_step, policy_state):
    if not self._built:
      self._build_from_time_step(time_step)

    batch_size = None
    if time_step.step_type.shape:
      batch_size = time_step.step_type.shape[0]
    if self._batch_size != batch_size:
      raise ValueError(
          'The batch size of time_step is different from the batch size '
          'provided previously. Expected {}, but saw {}.'.format(
              self._batch_size, batch_size))

    if not self._batched:
      # Since policy_state is given in a batched form from the policy and we
      # simply have to send it back we do not need to worry about it. Only
      # update time_step.
      time_step = nest_utils.batch_nested_array(time_step)

    tf.nest.assert_same_structure(self._time_step, time_step)
    feed_dict = {self._time_step: time_step}
    if policy_state is not None:
      # Flatten policy_state to handle specs that are not hashable due to lists.
      for state_ph, state in zip(
          tf.nest.flatten(self._policy_state), tf.nest.flatten(policy_state)):
        feed_dict[state_ph] = state

    action_step = self.session.run(self._action_step, feed_dict)
    action, state, info = action_step

    if not self._batched:
      action, info = nest_utils.unbatch_nested_array([action, info])

    return policy_step.PolicyStep(action, state, info)
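`unbatch_nested_array` works on any nest, which is why `[action, info]` can be unbatched in a single call above. A quick sketch:

import numpy as np
from tf_agents.utils import nest_utils

action, info = nest_utils.unbatch_nested_array(
    [np.array([[1, 2]], np.int32), np.array([0.5], np.float32)])
print(action)  # [1 2]
print(info)    # 0.5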
Example #9
    def testSavedModel(self):
        path = os.path.join(self.get_temp_dir(), 'saved_policy')
        saver = policy_saver.PolicySaver(self.tf_policy)
        saver.save(path)

        eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
            path, self.time_step_spec, self.action_spec)
        rng = np.random.RandomState()
        sample_time_step = array_spec.sample_spec_nest(self.time_step_spec,
                                                       rng)
        batched_sample_time_step = nest_utils.batch_nested_array(
            sample_time_step)

        original_action = self.tf_policy.action(batched_sample_time_step)
        unbatched_original_action = nest_utils.unbatch_nested_tensors(
            original_action)
        original_action_np = tf.nest.map_structure(lambda t: t.numpy(),
                                                   unbatched_original_action)
        saved_policy_action = eager_py_policy.action(sample_time_step)

        tf.nest.assert_same_structure(saved_policy_action.action,
                                      self.action_spec)

        np.testing.assert_array_almost_equal(original_action_np.action,
                                             saved_policy_action.action)
 def get_num_blocks(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_array(
             tf.cast(self._envs[0].n_clutter_placed, tf.float32))
     else:
         return tf.stack([
             tf.cast(env.n_clutter_placed, tf.float32) for env in self._envs
         ])
Example #11
 def _get_initial_state(self, batch_size: int) -> types.NestedArray:
     if self._num_policies == 1:
         return nest_utils.batch_nested_array(
             self._policies[0].get_initial_state())
     else:
         infos = self._execute(_execute_get_initial_state, self._policies)
         infos = nest_utils.unbatch_nested_array(infos)
         return nest_utils.stack_nested_arrays(infos)
 def get_distance_to_goal(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_array(
             tf.cast(self._envs[0].distance_to_goal, tf.float32))
     else:
         return tf.stack([
             tf.cast(env.distance_to_goal, tf.float32) for env in self._envs
         ])
 def get_deliberate_placement(self):
   if self._num_envs == 1:
     return nest_utils.batch_nested_array(
         tf.cast(self._envs[0].deliberate_agent_placement, tf.float32))
   else:
     return tf.stack(
         [tf.cast(env.deliberate_agent_placement,
                  tf.float32) for env in self._envs])
 def get_shortest_path_length(self):
     if self._num_envs == 1:
         return nest_utils.batch_nested_array(
             tf.cast(self._envs[0].shortest_path_length, tf.float32))
     else:
         return tf.stack([
             tf.cast(env.shortest_path_length, tf.float32)
             for env in self._envs
         ])
Example #15
 def call(self, trajectory: traj.Trajectory):
     if not self._batch_size:
         if trajectory.step_type.ndim == 0:
             self._batch_size = 1
         else:
             assert trajectory.step_type.ndim == 1
             self._batch_size = trajectory.step_type.shape[0]
         self.reset()
     if trajectory.step_type.ndim == 0:
         trajectory = nest_utils.batch_nested_array(trajectory)
     self._batched_call(trajectory)
Example #16
  def _reset(self):
    """Reset all environments and combine the resulting observation.

    Returns:
      Time step with batch dimension.
    """
    if self._num_envs == 1:
      return nest_utils.batch_nested_array(self._envs[0].reset())
    else:
      time_steps = self._execute(lambda env: env.reset(), self._envs)
      return nest_utils.stack_nested_arrays(time_steps)
Example #17
    def testInferenceFromCheckpoint(self):
        path = os.path.join(self.get_temp_dir(), 'saved_policy')
        saver = policy_saver.PolicySaver(self.tf_policy)
        saver.save(path)

        rng = np.random.RandomState()
        sample_time_step = array_spec.sample_spec_nest(self.time_step_spec,
                                                       rng)
        batched_sample_time_step = nest_utils.batch_nested_array(
            sample_time_step)

        self.evaluate(
            tf.nest.map_structure(lambda v: v.assign(v * 0 + -1),
                                  self.tf_policy.variables()))
        checkpoint_path = os.path.join(self.get_temp_dir(), 'checkpoint')
        saver.save_checkpoint(checkpoint_path)

        eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
            path, self.time_step_spec, self.action_spec)

        # Use evaluate to force a copy.
        saved_model_variables = self.evaluate(eager_py_policy.variables())

        checkpoint = tf.train.Checkpoint(policy=eager_py_policy._policy)
        manager = tf.train.CheckpointManager(checkpoint,
                                             directory=checkpoint_path,
                                             max_to_keep=None)

        eager_py_policy.update_from_checkpoint(manager.latest_checkpoint)

        assert_np_not_equal = lambda a, b: self.assertFalse(
            np.equal(a, b).all())
        tf.nest.map_structure(assert_np_not_equal, saved_model_variables,
                              self.evaluate(eager_py_policy.variables()))

        assert_np_all_equal = lambda a, b: self.assertTrue(
            np.equal(a, b).all())
        tf.nest.map_structure(assert_np_all_equal,
                              self.evaluate(self.tf_policy.variables()),
                              self.evaluate(eager_py_policy.variables()))

        # We can't assert that the action changed after the checkpoint update,
        # because depending on variable initialization it may happen to be the
        # same. Instead, check that the restored policy and the current policy
        # always produce the same action.
        checkpoint_action = eager_py_policy.action(sample_time_step)

        current_policy_action = self.tf_policy.action(batched_sample_time_step)
        current_policy_action = self.evaluate(
            nest_utils.unbatch_nested_tensors(current_policy_action))
        tf.nest.map_structure(assert_np_all_equal, current_policy_action,
                              checkpoint_action)
Example #18
    def _action(self,
                time_step: ts.TimeStep,
                policy_state: types.NestedArray,
                seed: Optional[types.Seed] = None) -> ps.PolicyStep:
        """Forward a batch of time_step and policy_states to the wrapped policies.

    Args:
      time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
      policy_state: An Array, or a nested dict, list or tuple of Arrays
        representing the previous policy_state.
      seed: Seed value used to initialize a pseudorandom number generator.

    Returns:
      A batch of `PolicyStep` named tuples, each one containing:
        `action`: A nest of action Arrays matching the `action_spec()`.
        `state`: A nest of policy states to be fed into the next call to action.
        `info`: Optional side information such as action log probabilities.

    Raises:
      NotImplementedError: if `seed` is not None.
    """
        if seed is not None:
            raise NotImplementedError(
                "seed is not supported; but saw seed: {}".format(seed))
        if self._num_policies == 1:
            time_step = nest_utils.unbatch_nested_array(time_step)
            policy_state = nest_utils.unbatch_nested_array(policy_state)
            policy_steps = self._policies[0].action(time_step, policy_state)
            return nest_utils.batch_nested_array(policy_steps)
        else:
            unstacked_time_steps = nest_utils.unstack_nested_arrays(time_step)
            if len(unstacked_time_steps) != len(self._policies):
                raise ValueError(
                    "Primary dimension of time_step items does not match "
                    "batch size: %d vs. %d" %
                    (len(unstacked_time_steps), len(self._policies)))
            unstacked_policy_states = [()] * len(unstacked_time_steps)
            if policy_state:
                unstacked_policy_states = nest_utils.unstack_nested_arrays(
                    policy_state)
                if len(unstacked_policy_states) != len(self._policies):
                    raise ValueError(
                        "Primary dimension of policy_state items does not match "
                        "batch size: %d vs. %d" %
                        (len(unstacked_policy_states), len(self._policies)))
            policy_steps = self._execute(
                _execute_policy,
                zip(self._policies, unstacked_time_steps,
                    unstacked_policy_states))
            return nest_utils.stack_nested_arrays(policy_steps)
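The unstacking step above uses `nest_utils.unstack_nested_arrays`, which splits every array in the nest along the leading (batch) dimension. A small sketch with a hypothetical observation:

import numpy as np
from tf_agents.utils import nest_utils

batched = {'observation': np.arange(6, dtype=np.float32).reshape(3, 2)}
per_policy = nest_utils.unstack_nested_arrays(batched)
print(len(per_policy))               # 3 -- one nest per policy
print(per_policy[0]['observation'])  # [0. 1.]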
 def step_adversary(self, actions):
   if self._num_envs == 1:
     actions = nest_utils.unbatch_nested_array(actions)
     time_steps = self._envs[0].step_adversary(actions)
     return nest_utils.batch_nested_array(time_steps)
   else:
     unstacked_actions = batched_py_environment.unstack_actions(actions)
     if len(unstacked_actions) != self.batch_size:
       raise ValueError(
           'Primary dimension of action items does not match '
           'batch size: %d vs. %d' % (len(unstacked_actions), self.batch_size))
     time_steps = self._execute(
         lambda env_action: env_action[0].step_adversary(env_action[1]),
         zip(self._envs, unstacked_actions))
     return nest_utils.stack_nested_arrays(time_steps)
Example #20
 def _action(self, time_step, policy_state):
   time_step = nest_utils.batch_nested_array(time_step)
   # Avoid passing numpy arrays to avoid retracing of the tf.function.
   time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)
   policy_step = self._policy_action_fn(time_step, policy_state)
   return policy_step._replace(
       action=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.action),
        # We intentionally do not convert the `state`, so it is returned exactly
        # as the underlying policy generated it (i.e. as a Tensor), even though
        # that is not necessarily compatible with a py-policy. The reason is
        # that the `state` is fed straight back to the policy: if it were
        # converted here, it would have to be converted back to its original
        # form before calling the policy's `action` method at the next step. To
        # store the `state`, e.g. in a replay buffer, we suggest placing it in
        # the `info` field.
       info=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.info))
Example #21
File: dqn.py  Project: mshinji/mahjong
    def _reset(self):
        print(self.count, '試合目')
        self.count += 1

        self.game = Game()
        self.player = DQN(id=0, game=self.game)
        self.game.start_game(player=self.player)
        self.game.start_kyoku()

        self.reward = 0
        self.game_end = False

        while self.game.next():
            pass

        time_step = ts.restart(self.board())
        return nest_utils.batch_nested_array(time_step)
    def setUp(self):
        super(TFDriverTest, self).setUp()
        f0 = np.array(0., dtype=np.float32)
        f1 = np.array(1., dtype=np.float32)

        # Order of args for trajectory methods:
        # (observation, action, policy_info, reward, discount)
        trajectories = [
            trajectory.first(0, 1, 2, f1, f1),
            trajectory.last(1, 2, 4, f1, f0),
            trajectory.boundary(3, 1, 2, f0, f1),
            trajectory.first(0, 1, 2, f1, f1),
            trajectory.last(1, 2, 4, f1, f0),
            trajectory.boundary(3, 1, 2, f0, f1),
            trajectory.first(0, 1, 2, f1, f1),
        ]
        self._trajectories = nest_utils.batch_nested_array(trajectories)
    def testSavedModel(self):
        if not common.has_eager_been_enabled():
            self.skipTest('Only supported in eager.')

        observation_spec = array_spec.ArraySpec([2], np.float32)
        action_spec = array_spec.BoundedArraySpec([1], np.float32, 2, 3)
        time_step_spec = ts.time_step_spec(observation_spec)

        observation_tensor_spec = tensor_spec.from_spec(observation_spec)
        action_tensor_spec = tensor_spec.from_spec(action_spec)
        time_step_tensor_spec = tensor_spec.from_spec(time_step_spec)

        actor_net = actor_network.ActorNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(10, ),
        )

        tf_policy = actor_policy.ActorPolicy(time_step_tensor_spec,
                                             action_tensor_spec,
                                             actor_network=actor_net)

        path = os.path.join(self.get_temp_dir(), 'saved_policy')
        saver = policy_saver.PolicySaver(tf_policy)
        saver.save(path)

        eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
            path, time_step_spec, action_spec)

        rng = np.random.RandomState()
        sample_time_step = array_spec.sample_spec_nest(time_step_spec, rng)
        batched_sample_time_step = nest_utils.batch_nested_array(
            sample_time_step)

        original_action = tf_policy.action(batched_sample_time_step)
        unbatched_original_action = nest_utils.unbatch_nested_tensors(
            original_action)
        original_action_np = tf.nest.map_structure(lambda t: t.numpy(),
                                                   unbatched_original_action)
        saved_policy_action = eager_py_policy.action(sample_time_step)

        tf.nest.assert_same_structure(saved_policy_action.action, action_spec)

        np.testing.assert_array_almost_equal(original_action_np.action,
                                             saved_policy_action.action)
    def _generate_replay_buffer(self, rb_cls):
        stack_count = 4
        shape = (15, 15, stack_count)
        single_shape = (15, 15, 1)
        observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = policy_step.PolicyStep(
            array_spec.BoundedArraySpec(shape=(),
                                        dtype=np.int32,
                                        minimum=0,
                                        maximum=1,
                                        name='action'))
        self._trajectory_spec = trajectory.from_transition(
            time_step_spec, action_spec, time_step_spec)

        self._capacity = 32
        self._replay_buffer = rb_cls(data_spec=self._trajectory_spec,
                                     capacity=self._capacity)

        # Generate N frames: the value of pixels is the frame index.
        # The observations will be generated by stacking K frames out of those N,
        # generating some redundancies between the observations.
        single_frames = []
        frame_count = 100
        for k in range(frame_count):
            single_frames.append(np.full(single_shape, k, dtype=np.int32))

        # Add stack of frames to the replay buffer.
        time_steps = []
        for k in range(len(single_frames) - stack_count + 1):
            observation = np.concatenate(single_frames[k:k + stack_count],
                                         axis=-1)
            time_steps.append(ts.transition(observation, reward=0.0))

        self._transition_count = len(time_steps) - 1
        dummy_action = policy_step.PolicyStep(np.int32(0))
        for k in range(self._transition_count):
            self._replay_buffer.add_batch(
                nest_utils.batch_nested_array(
                    trajectory.from_transition(time_steps[k], dummy_action,
                                               time_steps[k + 1])))
Example #25
  def testGetOuterArrayShape(self):
    spec = (
        array_spec.ArraySpec([5, 8], np.float32),
        (array_spec.ArraySpec([1], np.int32),
         array_spec.ArraySpec([2, 2, 2], np.float32))
    )

    batch_size = 3
    unstacked_arrays = [self.zeros_from_spec(spec) for _ in range(batch_size)]

    outer_dims = nest_utils.get_outer_array_shape(unstacked_arrays[0], spec)
    self.assertEqual((), outer_dims)

    stacked_array = nest_utils.stack_nested_arrays(unstacked_arrays)
    outer_dims = nest_utils.get_outer_array_shape(stacked_array, spec)
    self.assertEqual((batch_size,), outer_dims)

    time_dim = [nest_utils.batch_nested_array(arr) for arr in unstacked_arrays]
    batch_time = nest_utils.stack_nested_arrays(time_dim)
    outer_dims = nest_utils.get_outer_array_shape(batch_time, spec)
    self.assertEqual((batch_size, 1), outer_dims)
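Roughly speaking, `get_outer_array_shape` compares array shapes in the nest against the spec and returns the leading ("outer") dimensions not described by the spec; for example:

import numpy as np
from tf_agents.specs import array_spec
from tf_agents.utils import nest_utils

spec = array_spec.ArraySpec([2], np.float32)
print(nest_utils.get_outer_array_shape(np.zeros((2,), np.float32), spec))       # ()
print(nest_utils.get_outer_array_shape(np.zeros((5, 2), np.float32), spec))     # (5,)
print(nest_utils.get_outer_array_shape(np.zeros((5, 1, 2), np.float32), spec))  # (5, 1)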
Example #26
    def _fill_replay_buffer(self, n_transition=50):
        # Generate N observations.
        single_obs_list = []
        obs_count = 100
        for k in range(obs_count):
            single_obs_list.append(
                np.full(self._single_shape, k, dtype=np.int32))

        # Add stack of observations to the replay buffer.
        time_steps = []
        for k in range(len(single_obs_list) - self._stack_count + 1):
            stacked_observation = np.concatenate(
                single_obs_list[k:k + self._stack_count], axis=-1)
            time_steps.append(ts.transition(stacked_observation, reward=0.0))

        self._experience_count = n_transition
        dummy_action = policy_step.PolicyStep(np.int32(0))
        for k in range(self._experience_count):
            self._replay_buffer.add_batch(
                nest_utils.batch_nested_array(
                    trajectory.from_transition(time_steps[k], dummy_action,
                                               time_steps[k + 1])))
Example #27
  def _action(self, time_step, policy_state):
    if not self._batched:
      # Since policy_state is given in a batched form from the policy and we
      # simply have to send it back we do not need to worry about it. Only
      # update time_step.
      time_step = nest_utils.batch_nested_array(time_step)

    nest.assert_same_structure(self._time_step, time_step)
    feed_dict = {self._time_step: time_step}
    if policy_state is not None:
      # Flatten policy_state to handle specs that are not hashable due to lists.
      for state_ph, state in zip(
          nest.flatten(self._policy_state), nest.flatten(policy_state)):
        feed_dict[state_ph] = state

    action_step = self.session.run(self._action_step, feed_dict)
    action, state, info = action_step

    if not self._batched:
      action, info = nest_utils.unbatch_nested_array([action, info])

    return policy_step.PolicyStep(action, state, info)
Example #28
File: agents.py  Project: krishpop/sqrl
    def _apply_actor_network(self,
                             time_step,
                             step_type,
                             policy_state,
                             mask=None):
        observation = time_step

        if self._observation_normalizer:
            observation = self._observation_normalizer.normalize(observation)
        if tf.is_tensor(observation):
            if not nest_utils.is_batched_nested_tensors(
                    observation, self.time_step_spec.observation):
                observation = nest_utils.batch_nested_tensors(observation)
        else:
            if not nest_utils.get_outer_array_shape(
                    observation, self.time_step_spec.observation):
                observation = nest_utils.batch_nested_array(observation)

        alpha = np.array([self.alpha])[None]
        return self._actor_network((observation, alpha),
                                   step_type,
                                   policy_state,
                                   training=self._training)
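The tensor-vs-array batching branch above can be summarized as follows. This is a sketch under stated assumptions: `obs_spec` and `maybe_batch` are hypothetical names, and the observation is a single array rather than a full nest:

import numpy as np
import tensorflow as tf
from tf_agents.specs import array_spec, tensor_spec
from tf_agents.utils import nest_utils

obs_spec = array_spec.ArraySpec([3], np.float32)

def maybe_batch(observation):
  if tf.is_tensor(observation):
    # Tensors: check against the tensor spec and add a batch dim if missing.
    if not nest_utils.is_batched_nested_tensors(
        observation, tensor_spec.from_spec(obs_spec)):
      observation = nest_utils.batch_nested_tensors(observation)
  elif not nest_utils.get_outer_array_shape(observation, obs_spec):
    # Numpy arrays: an empty outer shape means there is no batch dim yet.
    observation = nest_utils.batch_nested_array(observation)
  return observation

print(maybe_batch(np.zeros((3,), np.float32)).shape)  # (1, 3)
print(maybe_batch(tf.zeros((3,), tf.float32)).shape)  # (1, 3)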
Example #29
File: dqn.py  Project: mshinji/mahjong
    def _step(self, action):
        action = nest_utils.unbatch_nested_array(action)
        score = self.score()
        dahai = self.dahai(action)
        # print(action, dahai, self.player.tehai)

        self.reward = 0
        self.game.dahai(dahai, self.player)
        while self.game.next():
            pass

        if self.game.state in [Const.RYUKYOKU_STATE, Const.AGARI_STATE]:
            self.reward = self.score() - score
            self.game_end = True
            time_step = ts.termination(self.board(), reward=0)
        elif self.game.state == Const.SYUKYOKU_STATE:
            self.reward = [90, 45, 0, -180][self.rank()] * 1000
            self.game_end = True
            time_step = ts.termination(self.board(), reward=0)
        else:
            time_step = ts.transition(self.board(), reward=0, discount=1)

        return nest_utils.batch_nested_array(time_step)
Example #30
    def call(self, trajectory: traj.Trajectory):
        if trajectory.step_type.ndim == 0:
            trajectory = nest_utils.batch_nested_array(trajectory)

        completed_episodes = np.sum(trajectory.is_last().astype(np.int64))
        self._np_state.number_episodes += completed_episodes
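Once the trajectory has a batch dimension, `is_last()` yields one boolean per batch entry, so the sum counts episodes that finish at this step. A minimal sketch using `trajectory.last` to build a single-step trajectory:

import numpy as np
from tf_agents.trajectories import trajectory
from tf_agents.utils import nest_utils

f1, f0 = np.float32(1.0), np.float32(0.0)
single = trajectory.last(observation=np.zeros((2,), np.float32),
                         action=np.int32(0), policy_info=(),
                         reward=f1, discount=f0)
batched = nest_utils.batch_nested_array(single)
print(np.sum(batched.is_last().astype(np.int64)))  # 1 completed episode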