def reset_random(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(self._envs[0].reset_random())
  else:
    time_steps = self._execute(lambda env: env.reset_random(), self._envs)
    return nest_utils.stack_nested_arrays(time_steps)
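# The single-env vs. multi-env branches above rely on two helpers from
# TF-Agents' `tf_agents.utils.nest_utils` (assumed to be the `nest_utils`
# imported by these snippets): `batch_nested_array` adds a leading batch
# dimension of size 1 to every array in a nest, and `stack_nested_arrays`
# stacks a list of structurally identical nests into a single nest with a
# leading batch dimension. A minimal standalone sketch of that behavior:
import numpy as np
from tf_agents.utils import nest_utils

single_result = {'obs': np.zeros((4,), np.float32),
                 'mask': np.ones((2,), np.int32)}
batched = nest_utils.batch_nested_array(single_result)
assert batched['obs'].shape == (1, 4) and batched['mask'].shape == (1, 2)

per_env_results = [single_result] * 3
stacked = nest_utils.stack_nested_arrays(per_env_results)
assert stacked['obs'].shape == (3, 4) and stacked['mask'].shape == (3, 2)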
def _action(self, time_step, policy_state):
  time_step = nest_utils.batch_nested_array(time_step)
  # Avoid passing numpy arrays to avoid retracing of the tf.function.
  time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)
  policy_step = self._policy_action_fn(time_step, policy_state)
  return policy_step._replace(
      action=nest_utils.unbatch_nested_array(policy_step.action.numpy()))
def _action(self,
            time_step,
            policy_state,
            seed: Optional[types.Seed] = None):
  if seed is not None and self._use_tf_function:
    logging.warning(
        'Using `seed` may force a retrace for each call to `action`.')
  if self._batch_time_steps:
    time_step = nest_utils.batch_nested_array(time_step)
  # Avoid passing numpy arrays to avoid retracing of the tf.function.
  time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)
  if seed is not None:
    policy_step = self._policy_action_fn(time_step, policy_state, seed=seed)
  else:
    policy_step = self._policy_action_fn(time_step, policy_state)
  if not self._batch_time_steps:
    return policy_step
  return policy_step._replace(
      action=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.action),
      # We intentionally do not convert the `state`: it is returned exactly as
      # the underlying policy generated it (i.e. as a Tensor), which is not
      # necessarily compatible with a py-policy. We do this because the
      # `state` is fed back to the policy; if it were converted here, it would
      # have to be converted back to its original form before the next call to
      # `action`. If one wants to store the `state`, e.g. in a replay buffer,
      # we suggest placing it into the `info` field.
      info=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.info))
def render(self, mode="rgb_array"):
  if self._num_envs == 1:
    img = self._envs[0].render(mode)
    return nest_utils.batch_nested_array(img)
  else:
    imgs = self._execute(lambda env: env.render(mode), self._envs)
    return nest_utils.stack_nested_arrays(imgs)
def _fill_replay_buffer(self):
  # Generate N frames: the value of pixels is the frame index.
  # The observations will be generated by stacking K frames out of those N,
  # generating some redundancies between the observations.
  single_frames = []
  frame_count = 100
  for k in range(frame_count):
    single_frames.append(np.full(self._single_shape, k, dtype=np.int32))

  # Add stacks of frames to the replay buffer.
  time_steps = []
  for k in range(len(single_frames) - self._stack_count + 1):
    observation = np.concatenate(
        single_frames[k:k + self._stack_count], axis=-1)
    time_steps.append(ts.transition(observation, reward=0.0))

  self._transition_count = len(time_steps) - 1
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._transition_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(time_steps[k], dummy_action,
                                       time_steps[k + 1])))
def get_passable(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(
        tf.cast(self._envs[0].passable, tf.float32))
  else:
    return tf.stack(
        [tf.cast(env.passable, tf.float32) for env in self._envs])
def _step(self, actions):
  """Forward a batch of actions to the wrapped environments.

  Args:
    actions: Batched action, possibly nested, to apply to the environment.

  Raises:
    ValueError: Invalid actions.

  Returns:
    Batch of observations, rewards, and done flags.
  """
  if self._num_envs == 1:
    # Nitty: modified to remove the "cannot squeeze" error.
    # actions = nest_utils.unbatch_nested_array(actions)
    time_steps = self._envs[0].step(actions)
    return nest_utils.batch_nested_array(time_steps)
  else:
    unstacked_actions = unstack_actions(actions)
    if len(unstacked_actions) != self.batch_size:
      raise ValueError(
          "Primary dimension of action items does not match "
          "batch size: %d vs. %d" % (len(unstacked_actions), self.batch_size))
    time_steps = self._execute(
        lambda env_action: env_action[0].step(env_action[1]),
        zip(self._envs, unstacked_actions))
    return nest_utils.stack_nested_arrays(time_steps)
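# `unstack_actions` (defined elsewhere in the batched environment module) is
# expected to split the batched action nest into one action nest per
# environment, i.e. the inverse of `stack_nested_arrays`. The sketch below
# illustrates the intended shapes using `nest_utils.unstack_nested_arrays`;
# treating it as equivalent to `unstack_actions` is an assumption made for
# illustration only:
import numpy as np
from tf_agents.utils import nest_utils

batched_actions = {'discrete': np.array([0, 1, 2], np.int32),
                   'continuous': np.zeros((3, 2), np.float32)}
per_env_actions = nest_utils.unstack_nested_arrays(batched_actions)
assert len(per_env_actions) == 3
assert per_env_actions[0]['discrete'].shape == ()
assert per_env_actions[0]['continuous'].shape == (2,)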
def _action(self, time_step, policy_state):
  if not self._built:
    self._build_from_time_step(time_step)

  batch_size = None
  if time_step.step_type.shape:
    batch_size = time_step.step_type.shape[0]
  if self._batch_size != batch_size:
    raise ValueError(
        'The batch size of time_step is different from the batch size '
        'provided previously. Expected {}, but saw {}.'.format(
            self._batch_size, batch_size))

  if not self._batched:
    # Since policy_state is given in a batched form from the policy and we
    # simply have to send it back, we do not need to worry about it. Only
    # update time_step.
    time_step = nest_utils.batch_nested_array(time_step)

  tf.nest.assert_same_structure(self._time_step, time_step)
  feed_dict = {self._time_step: time_step}
  if policy_state is not None:
    # Flatten policy_state to handle specs that are not hashable due to lists.
    for state_ph, state in zip(
        tf.nest.flatten(self._policy_state), tf.nest.flatten(policy_state)):
      feed_dict[state_ph] = state

  action_step = self.session.run(self._action_step, feed_dict)
  action, state, info = action_step
  if not self._batched:
    action, info = nest_utils.unbatch_nested_array([action, info])
  return policy_step.PolicyStep(action, state, info)
def testSavedModel(self):
  path = os.path.join(self.get_temp_dir(), 'saved_policy')
  saver = policy_saver.PolicySaver(self.tf_policy)
  saver.save(path)

  eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
      path, self.time_step_spec, self.action_spec)

  rng = np.random.RandomState()
  sample_time_step = array_spec.sample_spec_nest(self.time_step_spec, rng)
  batched_sample_time_step = nest_utils.batch_nested_array(sample_time_step)

  original_action = self.tf_policy.action(batched_sample_time_step)
  unbatched_original_action = nest_utils.unbatch_nested_tensors(
      original_action)
  original_action_np = tf.nest.map_structure(lambda t: t.numpy(),
                                             unbatched_original_action)
  saved_policy_action = eager_py_policy.action(sample_time_step)

  tf.nest.assert_same_structure(saved_policy_action.action, self.action_spec)
  np.testing.assert_array_almost_equal(original_action_np.action,
                                       saved_policy_action.action)
def get_num_blocks(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(
        tf.cast(self._envs[0].n_clutter_placed, tf.float32))
  else:
    return tf.stack(
        [tf.cast(env.n_clutter_placed, tf.float32) for env in self._envs])
def _get_initial_state(self, batch_size: int) -> types.NestedArray:
  if self._num_policies == 1:
    return nest_utils.batch_nested_array(
        self._policies[0].get_initial_state())
  else:
    infos = self._execute(_execute_get_initial_state, self._policies)
    infos = nest_utils.unbatch_nested_array(infos)
    return nest_utils.stack_nested_arrays(infos)
def get_distance_to_goal(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(
        tf.cast(self._envs[0].distance_to_goal, tf.float32))
  else:
    return tf.stack(
        [tf.cast(env.distance_to_goal, tf.float32) for env in self._envs])
def get_deliberate_placement(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(
        tf.cast(self._envs[0].deliberate_agent_placement, tf.float32))
  else:
    return tf.stack(
        [tf.cast(env.deliberate_agent_placement, tf.float32)
         for env in self._envs])
def get_shortest_path_length(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(
        tf.cast(self._envs[0].shortest_path_length, tf.float32))
  else:
    return tf.stack(
        [tf.cast(env.shortest_path_length, tf.float32) for env in self._envs])
def call(self, trajectory: traj.Trajectory):
  if not self._batch_size:
    if trajectory.step_type.ndim == 0:
      self._batch_size = 1
    else:
      assert trajectory.step_type.ndim == 1
      self._batch_size = trajectory.step_type.shape[0]
    self.reset()

  if trajectory.step_type.ndim == 0:
    trajectory = nest_utils.batch_nested_array(trajectory)

  self._batched_call(trajectory)
def _reset(self):
  """Reset all environments and combine the resulting observations.

  Returns:
    Time step with batch dimension.
  """
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(self._envs[0].reset())
  else:
    time_steps = self._execute(lambda env: env.reset(), self._envs)
    return nest_utils.stack_nested_arrays(time_steps)
def testInferenceFromCheckpoint(self):
  path = os.path.join(self.get_temp_dir(), 'saved_policy')
  saver = policy_saver.PolicySaver(self.tf_policy)
  saver.save(path)

  rng = np.random.RandomState()
  sample_time_step = array_spec.sample_spec_nest(self.time_step_spec, rng)
  batched_sample_time_step = nest_utils.batch_nested_array(sample_time_step)

  self.evaluate(
      tf.nest.map_structure(lambda v: v.assign(v * 0 + -1),
                            self.tf_policy.variables()))
  checkpoint_path = os.path.join(self.get_temp_dir(), 'checkpoint')
  saver.save_checkpoint(checkpoint_path)

  eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
      path, self.time_step_spec, self.action_spec)

  # Use evaluate to force a copy.
  saved_model_variables = self.evaluate(eager_py_policy.variables())

  checkpoint = tf.train.Checkpoint(policy=eager_py_policy._policy)
  manager = tf.train.CheckpointManager(
      checkpoint, directory=checkpoint_path, max_to_keep=None)
  eager_py_policy.update_from_checkpoint(manager.latest_checkpoint)

  assert_np_not_equal = lambda a, b: self.assertFalse(np.equal(a, b).all())
  tf.nest.map_structure(assert_np_not_equal, saved_model_variables,
                        self.evaluate(eager_py_policy.variables()))

  assert_np_all_equal = lambda a, b: self.assertTrue(np.equal(a, b).all())
  tf.nest.map_structure(assert_np_all_equal,
                        self.evaluate(self.tf_policy.variables()),
                        self.evaluate(eager_py_policy.variables()))

  # We can't check that the action differs from the saved-model action,
  # because depending on variable initialization it may be the same. Instead,
  # check that the checkpointed policy and the current policy always agree.
  checkpoint_action = eager_py_policy.action(sample_time_step)

  current_policy_action = self.tf_policy.action(batched_sample_time_step)
  current_policy_action = self.evaluate(
      nest_utils.unbatch_nested_tensors(current_policy_action))
  tf.nest.map_structure(assert_np_all_equal, current_policy_action,
                        checkpoint_action)
def _action(self,
            time_step: ts.TimeStep,
            policy_state: types.NestedArray,
            seed: Optional[types.Seed] = None) -> ps.PolicyStep:
  """Forward a batch of time_steps and policy_states to the wrapped policies.

  Args:
    time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
    policy_state: An Array, or a nested dict, list or tuple of Arrays
      representing the previous policy state.
    seed: Seed value used to initialize a pseudorandom number generator.

  Returns:
    A batch of `PolicyStep` named tuples, each one containing:
      `action`: A nest of action Arrays matching the `action_spec()`.
      `state`: A nest of policy states to be fed into the next call to action.
      `info`: Optional side information such as action log probabilities.

  Raises:
    NotImplementedError: If `seed` is not None.
  """
  if seed is not None:
    raise NotImplementedError(
        "seed is not supported; but saw seed: {}".format(seed))
  if self._num_policies == 1:
    time_step = nest_utils.unbatch_nested_array(time_step)
    policy_state = nest_utils.unbatch_nested_array(policy_state)
    policy_steps = self._policies[0].action(time_step, policy_state)
    return nest_utils.batch_nested_array(policy_steps)
  else:
    unstacked_time_steps = nest_utils.unstack_nested_arrays(time_step)
    if len(unstacked_time_steps) != len(self._policies):
      raise ValueError(
          "Primary dimension of time_step items does not match "
          "batch size: %d vs. %d" % (len(unstacked_time_steps),
                                     len(self._policies)))
    unstacked_policy_states = [()] * len(unstacked_time_steps)
    if policy_state:
      unstacked_policy_states = nest_utils.unstack_nested_arrays(policy_state)
      if len(unstacked_policy_states) != len(self._policies):
        raise ValueError(
            "Primary dimension of policy_state items does not match "
            "batch size: %d vs. %d" % (len(unstacked_policy_states),
                                       len(self._policies)))
    policy_steps = self._execute(
        _execute_policy,
        zip(self._policies, unstacked_time_steps, unstacked_policy_states))
    return nest_utils.stack_nested_arrays(policy_steps)
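# `stack_nested_arrays` also handles namedtuple nests such as `PolicyStep`, so
# the per-policy results gathered above can be merged back into one batched
# `PolicyStep`. A small sketch with illustrative scalar actions and empty
# state/info fields (not taken from any particular policy):
import numpy as np
from tf_agents.trajectories import policy_step
from tf_agents.utils import nest_utils

steps = [policy_step.PolicyStep(action=np.array(i, dtype=np.int32))
         for i in range(3)]
batched_step = nest_utils.stack_nested_arrays(steps)
assert batched_step.action.shape == (3,)
assert batched_step.state == () and batched_step.info == ()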
def step_adversary(self, actions):
  if self._num_envs == 1:
    actions = nest_utils.unbatch_nested_array(actions)
    time_steps = self._envs[0].step_adversary(actions)
    return nest_utils.batch_nested_array(time_steps)
  else:
    unstacked_actions = batched_py_environment.unstack_actions(actions)
    if len(unstacked_actions) != self.batch_size:
      raise ValueError(
          'Primary dimension of action items does not match '
          'batch size: %d vs. %d' % (len(unstacked_actions), self.batch_size))
    time_steps = self._execute(
        lambda env_action: env_action[0].step_adversary(env_action[1]),
        zip(self._envs, unstacked_actions))
    return nest_utils.stack_nested_arrays(time_steps)
def _action(self, time_step, policy_state):
  time_step = nest_utils.batch_nested_array(time_step)
  # Avoid passing numpy arrays to avoid retracing of the tf.function.
  time_step = tf.nest.map_structure(tf.convert_to_tensor, time_step)
  policy_step = self._policy_action_fn(time_step, policy_state)
  return policy_step._replace(
      action=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.action),
      # We intentionally do not convert the `state`: it is returned exactly as
      # the underlying policy generated it (i.e. as a Tensor), which is not
      # necessarily compatible with a py-policy. We do this because the
      # `state` is fed back to the policy; if it were converted here, it would
      # have to be converted back to its original form before the next call to
      # `action`. If one wants to store the `state`, e.g. in a replay buffer,
      # we suggest placing it into the `info` field.
      info=nest_utils.unbatch_nested_tensors_to_arrays(policy_step.info))
def _reset(self):
  print('Game', self.count)
  self.count += 1
  self.game = Game()
  self.player = DQN(id=0, game=self.game)
  self.game.start_game(player=self.player)
  self.game.start_kyoku()
  self.reward = 0
  self.game_end = False
  while self.game.next():
    pass
  time_step = ts.restart(self.board())
  return nest_utils.batch_nested_array(time_step)
def setUp(self):
  super(TFDriverTest, self).setUp()
  f0 = np.array(0., dtype=np.float32)
  f1 = np.array(1., dtype=np.float32)
  # Order of args for trajectory methods:
  # (observation, action, policy_info, reward, discount)
  trajectories = [
      trajectory.first(0, 1, 2, f1, f1),
      trajectory.last(1, 2, 4, f1, f0),
      trajectory.boundary(3, 1, 2, f0, f1),
      trajectory.first(0, 1, 2, f1, f1),
      trajectory.last(1, 2, 4, f1, f0),
      trajectory.boundary(3, 1, 2, f0, f1),
      trajectory.first(0, 1, 2, f1, f1),
  ]
  self._trajectories = nest_utils.batch_nested_array(trajectories)
def testSavedModel(self):
  if not common.has_eager_been_enabled():
    self.skipTest('Only supported in eager.')

  observation_spec = array_spec.ArraySpec([2], np.float32)
  action_spec = array_spec.BoundedArraySpec([1], np.float32, 2, 3)
  time_step_spec = ts.time_step_spec(observation_spec)

  observation_tensor_spec = tensor_spec.from_spec(observation_spec)
  action_tensor_spec = tensor_spec.from_spec(action_spec)
  time_step_tensor_spec = tensor_spec.from_spec(time_step_spec)

  actor_net = actor_network.ActorNetwork(
      observation_tensor_spec,
      action_tensor_spec,
      fc_layer_params=(10,),
  )
  tf_policy = actor_policy.ActorPolicy(
      time_step_tensor_spec, action_tensor_spec, actor_network=actor_net)

  path = os.path.join(self.get_temp_dir(), 'saved_policy')
  saver = policy_saver.PolicySaver(tf_policy)
  saver.save(path)

  eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
      path, time_step_spec, action_spec)

  rng = np.random.RandomState()
  sample_time_step = array_spec.sample_spec_nest(time_step_spec, rng)
  batched_sample_time_step = nest_utils.batch_nested_array(sample_time_step)

  original_action = tf_policy.action(batched_sample_time_step)
  unbatched_original_action = nest_utils.unbatch_nested_tensors(
      original_action)
  original_action_np = tf.nest.map_structure(lambda t: t.numpy(),
                                             unbatched_original_action)
  saved_policy_action = eager_py_policy.action(sample_time_step)

  tf.nest.assert_same_structure(saved_policy_action.action, action_spec)
  np.testing.assert_array_almost_equal(original_action_np.action,
                                       saved_policy_action.action)
def _generate_replay_buffer(self, rb_cls):
  stack_count = 4
  shape = (15, 15, stack_count)
  single_shape = (15, 15, 1)
  observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = policy_step.PolicyStep(
      array_spec.BoundedArraySpec(
          shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'))
  self._trajectory_spec = trajectory.from_transition(
      time_step_spec, action_spec, time_step_spec)

  self._capacity = 32
  self._replay_buffer = rb_cls(
      data_spec=self._trajectory_spec, capacity=self._capacity)

  # Generate N frames: the value of pixels is the frame index.
  # The observations will be generated by stacking K frames out of those N,
  # generating some redundancies between the observations.
  single_frames = []
  frame_count = 100
  for k in range(frame_count):
    single_frames.append(np.full(single_shape, k, dtype=np.int32))

  # Add stacks of frames to the replay buffer.
  time_steps = []
  for k in range(len(single_frames) - stack_count + 1):
    observation = np.concatenate(single_frames[k:k + stack_count], axis=-1)
    time_steps.append(ts.transition(observation, reward=0.0))

  self._transition_count = len(time_steps) - 1
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._transition_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(time_steps[k], dummy_action,
                                       time_steps[k + 1])))
def testGetOuterArrayShape(self):
  spec = (
      array_spec.ArraySpec([5, 8], np.float32),
      (array_spec.ArraySpec([1], np.int32),
       array_spec.ArraySpec([2, 2, 2], np.float32))
  )

  batch_size = 3
  unstacked_arrays = [self.zeros_from_spec(spec) for _ in range(batch_size)]

  outer_dims = nest_utils.get_outer_array_shape(unstacked_arrays[0], spec)
  self.assertEqual((), outer_dims)

  stacked_array = nest_utils.stack_nested_arrays(unstacked_arrays)
  outer_dims = nest_utils.get_outer_array_shape(stacked_array, spec)
  self.assertEqual((batch_size,), outer_dims)

  time_dim = [nest_utils.batch_nested_array(arr) for arr in unstacked_arrays]
  batch_time = nest_utils.stack_nested_arrays(time_dim)
  outer_dims = nest_utils.get_outer_array_shape(batch_time, spec)
  self.assertEqual((batch_size, 1), outer_dims)
def _fill_replay_buffer(self, n_transition=50):
  # Generate N observations.
  single_obs_list = []
  obs_count = 100
  for k in range(obs_count):
    single_obs_list.append(np.full(self._single_shape, k, dtype=np.int32))

  # Add stacks of observations to the replay buffer.
  time_steps = []
  for k in range(len(single_obs_list) - self._stack_count + 1):
    stacked_observation = np.concatenate(
        single_obs_list[k:k + self._stack_count], axis=-1)
    time_steps.append(ts.transition(stacked_observation, reward=0.0))

  self._experience_count = n_transition
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._experience_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(time_steps[k], dummy_action,
                                       time_steps[k + 1])))
def _action(self, time_step, policy_state):
  if not self._batched:
    # Since policy_state is given in a batched form from the policy and we
    # simply have to send it back, we do not need to worry about it. Only
    # update time_step.
    time_step = nest_utils.batch_nested_array(time_step)

  nest.assert_same_structure(self._time_step, time_step)
  feed_dict = {self._time_step: time_step}
  if policy_state is not None:
    # Flatten policy_state to handle specs that are not hashable due to lists.
    for state_ph, state in zip(
        nest.flatten(self._policy_state), nest.flatten(policy_state)):
      feed_dict[state_ph] = state

  action_step = self.session.run(self._action_step, feed_dict)
  action, state, info = action_step
  if not self._batched:
    action, info = nest_utils.unbatch_nested_array([action, info])
  return policy_step.PolicyStep(action, state, info)
def _apply_actor_network(self, time_step, step_type, policy_state, mask=None):
  observation = time_step
  if self._observation_normalizer:
    observation = self._observation_normalizer.normalize(observation)
  if tf.is_tensor(observation):
    if not nest_utils.is_batched_nested_tensors(
        observation, self.time_step_spec.observation):
      observation = nest_utils.batch_nested_tensors(observation)
  else:
    if not nest_utils.get_outer_array_shape(
        observation, self.time_step_spec.observation):
      observation = nest_utils.batch_nested_array(observation)
  alpha = np.array([self.alpha])[None]
  return self._actor_network(
      (observation, alpha), step_type, policy_state, training=self._training)
def _step(self, action):
  action = nest_utils.unbatch_nested_array(action)
  score = self.score()
  dahai = self.dahai(action)
  # print(action, dahai, self.player.tehai)
  self.reward = 0
  self.game.dahai(dahai, self.player)
  while self.game.next():
    pass
  if self.game.state in [Const.RYUKYOKU_STATE, Const.AGARI_STATE]:
    self.reward = self.score() - score
    self.game_end = True
    time_step = ts.termination(self.board(), reward=0)
  elif self.game.state == Const.SYUKYOKU_STATE:
    self.reward = [90, 45, 0, -180][self.rank()] * 1000
    self.game_end = True
    time_step = ts.termination(self.board(), reward=0)
  else:
    time_step = ts.transition(self.board(), reward=0, discount=1)
  return nest_utils.batch_nested_array(time_step)
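# The environment above assembles its TimeSteps with the standard TF-Agents
# helpers: `ts.restart` for the first step of an episode, `ts.transition` for
# intermediate steps, and `ts.termination` for the final step. A small sketch
# of the resulting step types and discounts (dummy observation, for
# illustration only):
import numpy as np
from tf_agents.trajectories import time_step as ts

obs = np.zeros((3,), np.float32)
first = ts.restart(obs)
mid = ts.transition(obs, reward=0.0, discount=1.0)
last = ts.termination(obs, reward=0.0)
assert first.step_type == ts.StepType.FIRST and first.discount == 1.0
assert mid.step_type == ts.StepType.MID and mid.discount == 1.0
assert last.step_type == ts.StepType.LAST and last.discount == 0.0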
def call(self, trajectory: traj.Trajectory):
  if trajectory.step_type.ndim == 0:
    trajectory = nest_utils.batch_nested_array(trajectory)
  completed_episodes = np.sum(trajectory.is_last().astype(np.int64))
  self._np_state.number_episodes += completed_episodes
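# Observers like the two `call` methods above follow the TF-Agents observer
# protocol: a driver invokes them with every collected `Trajectory`, batched
# or unbatched. A rough, self-contained sketch of wiring such a metric into a
# collection loop with `PyDriver`, using a random environment and policy as
# stand-ins (the specs and step count here are arbitrary):
import numpy as np
from tf_agents.drivers import py_driver
from tf_agents.environments import random_py_environment
from tf_agents.metrics import py_metrics
from tf_agents.policies import random_py_policy
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

obs_spec = array_spec.BoundedArraySpec((2,), np.float32, minimum=-1., maximum=1.)
act_spec = array_spec.BoundedArraySpec((), np.int32, minimum=0, maximum=1)
env = random_py_environment.RandomPyEnvironment(obs_spec, act_spec)
policy = random_py_policy.RandomPyPolicy(
    time_step_spec=ts.time_step_spec(obs_spec), action_spec=act_spec)
metric = py_metrics.NumberOfEpisodes()

driver = py_driver.PyDriver(env, policy, observers=[metric], max_steps=100)
driver.run(env.reset())  # The metric's `call` runs once per collected step.
print(metric.result())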