def test_make_dataset_nested_specs(self):
    """Dataset built from a dict-valued observation spec matches that spec."""
    nested_observations = {
        'obs_1': specs.Array((3, 64, 64), 'uint8'),
        'obs_2': specs.Array((10,), 'int32'),
    }
    action_spec = specs.BoundedArray((), 'float32', minimum=-1., maximum=1.)
    reward_spec = specs.Array((), 'float32')
    discount_spec = specs.BoundedArray((), 'float32', minimum=0., maximum=1.)
    env_spec = specs.EnvironmentSpec(
        observations=nested_observations,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec)

    dataset = reverb_dataset.make_dataset(
        client=self.tf_client, environment_spec=env_spec)

    # The environment spec is flattened to a tuple before being compared with
    # the data part of the dataset's element spec.
    specs_match = _check_specs(tuple(env_spec), dataset.element_spec.data)
    self.assertTrue(specs_match)
def test_make_dataset_nested_specs(self):
    """Dataset element spec matches an `adders.Step` built from the env spec."""
    nested_observations = {
        'obs_1': specs.Array((3, 64, 64), 'uint8'),
        'obs_2': specs.Array((10,), 'int32'),
    }
    action_spec = specs.BoundedArray((), 'float32', minimum=-1., maximum=1.)
    reward_spec = specs.Array((), 'float32')
    discount_spec = specs.BoundedArray((), 'float32', minimum=0., maximum=1.)
    env_spec = specs.EnvironmentSpec(
        observations=nested_observations,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec)

    dataset = reverb_dataset.make_dataset(
        client=self.tf_client, environment_spec=env_spec)

    # The expected per-step structure: the four environment fields plus a
    # scalar boolean start-of-episode marker and empty extras.
    expected_step = adders.Step(
        observation=env_spec.observations,
        action=env_spec.actions,
        reward=env_spec.rewards,
        discount=env_spec.discounts,
        start_of_episode=specs.Array(shape=(), dtype=bool),
        extras=())
    specs_match = _check_specs(expected_step, dataset.element_spec.data)
    self.assertTrue(specs_match)
def __init__(self,
             *,
             num_actions: int = 1,
             num_observations: int = 1,
             action_dtype=np.int32,
             obs_dtype=np.int32,
             obs_shape: Sequence[int] = (),
             discount_spec: Optional[types.NestedSpec] = None,
             reward_spec: Optional[types.NestedSpec] = None,
             **kwargs):
    """Initialize the environment.

    Args:
        num_actions: number of discrete actions.
        num_observations: observations are bounded to
            `[0, num_observations - 1]`.
        action_dtype: dtype of the action spec.
        obs_dtype: dtype of the observation spec; also called to build the
            observation bounds.
        obs_shape: shape of the observation spec.
        discount_spec: optional discount spec; a float32 scalar bounded to
            `[0, 1]` is used when None.
        reward_spec: optional reward spec; a float32 scalar is used when None.
        **kwargs: forwarded to the base class initializer.
    """
    rewards = (specs.Array((), np.float32)
               if reward_spec is None else reward_spec)
    discounts = (specs.BoundedArray((), np.float32, 0.0, 1.0)
                 if discount_spec is None else discount_spec)

    action_spec = specs.DiscreteArray(num_actions, dtype=action_dtype)
    lowest_obs = obs_dtype(0)
    highest_obs = obs_dtype(num_observations - 1)
    observation_spec = specs.BoundedArray(
        shape=obs_shape,
        dtype=obs_dtype,
        minimum=lowest_obs,
        maximum=highest_obs)

    env_spec = specs.EnvironmentSpec(
        observations=observation_spec,
        actions=action_spec,
        rewards=rewards,
        discounts=discounts)
    super().__init__(spec=env_spec, **kwargs)
def _make_fake_env() -> dm_env.Environment:
    """Returns a fake environment with 10-step episodes and float32 specs."""
    observation_spec = specs.Array(shape=(10, 5), dtype=np.float32)
    action_spec = specs.BoundedArray(
        shape=(1,), dtype=np.float32, minimum=-10., maximum=10.)
    reward_spec = specs.Array(shape=(), dtype=np.float32)
    discount_spec = specs.BoundedArray(
        shape=(), dtype=np.float32, minimum=0., maximum=1.)

    fake_spec = specs.EnvironmentSpec(
        observations=observation_spec,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec,
    )
    return fakes.Environment(fake_spec, episode_length=10)
def _convert_to_spec(space: gym.Space,
                     name: Optional[str] = None) -> types.NestedSpec:
    """Converts an OpenAI Gym space to a dm_env spec or nested structure of specs.

    Box, MultiBinary and MultiDiscrete Gym spaces become BoundedArray specs and
    Discrete spaces become DiscreteArray specs. Tuple and Dict spaces are
    converted recursively into tuples and dictionaries of specs; entries of a
    Dict space are named after their key rather than after `name`.

    Args:
        space: The Gym space to convert.
        name: Optional name to apply to all return spec(s).

    Returns:
        A dm_env spec or nested structure of specs, corresponding to the input
        space.

    Raises:
        ValueError: If `space` is none of the supported Gym space types.
    """
    # Leaf spaces: each maps onto exactly one spec.
    if isinstance(space, spaces.Discrete):
        return specs.DiscreteArray(
            num_values=space.n, dtype=space.dtype, name=name)

    if isinstance(space, spaces.Box):
        return specs.BoundedArray(
            shape=space.shape,
            dtype=space.dtype,
            minimum=space.low,
            maximum=space.high,
            name=name)

    if isinstance(space, spaces.MultiBinary):
        return specs.BoundedArray(
            shape=space.shape,
            dtype=space.dtype,
            minimum=0.0,
            maximum=1.0,
            name=name)

    if isinstance(space, spaces.MultiDiscrete):
        return specs.BoundedArray(
            shape=space.shape,
            dtype=space.dtype,
            minimum=np.zeros(space.shape),
            maximum=space.nvec - 1,
            name=name)

    # Container spaces: recurse into the members.
    if isinstance(space, spaces.Tuple):
        return tuple(_convert_to_spec(member, name) for member in space.spaces)

    if isinstance(space, spaces.Dict):
        converted = {}
        for key, member in space.spaces.items():
            converted[key] = _convert_to_spec(member, key)
        return converted

    raise ValueError('Unexpected gym space: {}'.format(space))
def __init__(self, environment_spec: specs.EnvironmentSpec,
             action_spec: specs.BoundedArray, z_dim: int) -> None:
    """Builds a distributional-MPO agent whose observations carry a latent z.

    The observation spec of `environment_spec` is extended by `z_dim` extra
    entries bounded to `[0, 1]` (the same bounds as the latent spec) before it
    is handed to the agent. The original, un-extended observation spec is kept
    on `self._obs_space` and is used to create the discriminator's variables.

    Args:
        environment_spec: Spec of the environment. Its observation spec must be
            a rank-1 bounded array (its `minimum`, `maximum`, `dtype` and
            `name` are read).
        action_spec: Action spec passed to `make_feed_forward_networks`.
        z_dim: Number of entries of the latent variable z.

    Raises:
        AssertionError: If the observation spec is not rank 1.
    """
    self._z_dim = z_dim
    z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)

    self._obs_space = environment_spec.observations
    # The message previously interpolated an undefined name (`obs_shape`), so a
    # failing check raised NameError instead of this AssertionError.
    assert (
        len(self._obs_space.shape) == 1
    ), f"Only vector observations are supported for now. Observations shape passed: {self._obs_space.shape}"

    # Modify the environment_spec to also include the latent variable
    # observation (z). The z entries are bounded to [0, 1], matching z_spec;
    # the upper bound used to be 0, which collapsed the allowed range to {0}.
    updated_observations = specs.BoundedArray(
        (self._obs_space.shape[0] + z_dim,),
        dtype=environment_spec.observations.dtype,
        name=environment_spec.observations.name,
        minimum=np.append(environment_spec.observations.minimum,
                          [0] * z_dim),
        maximum=np.append(environment_spec.observations.maximum,
                          [1] * z_dim),
    )
    environment_spec = specs.EnvironmentSpec(
        observations=updated_observations,
        actions=environment_spec.actions,
        rewards=environment_spec.rewards,
        discounts=environment_spec.discounts,
    )

    self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
    self._agent = dmpo.DistributionalMPO(
        environment_spec=environment_spec,
        policy_network=self._agent_networks['policy'],
        critic_network=self._agent_networks['critic'],
        observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
        extra_modules_to_save={
            'discriminator': self._agent_networks['discriminator'],
        },
        return_action_entropy=True,
    )

    # Uniform categorical distribution over the z_dim latent values.
    self._z_distribution = tfd.Categorical([1] * z_dim)
    self._current_z = self._z_distribution.sample()

    # Create discriminator optimizer.
    self._discriminator_optimizer = snt.optimizers.Adam(1e-4)
    self._discriminator_logger = loggers.make_default_logger('discriminator')

    # Create variables for the discriminator. It is built against the original
    # observation spec, i.e. without the appended z entries.
    tf2_utils.create_variables(self._agent_networks['discriminator'],
                               [self._obs_space])
def test_make_dataset_with_variable_length_instances(self):
    """Dataset with variable length instances should have shapes with None."""
    # A zero-sized leading observation dimension marks the variable-length axis.
    observation_spec = specs.Array((0, 64, 64), 'uint8')
    action_spec = specs.BoundedArray((), 'float32', minimum=-1., maximum=1.)
    reward_spec = specs.Array((), 'float32')
    discount_spec = specs.BoundedArray((), 'float32', minimum=0., maximum=1.)
    env_spec = specs.EnvironmentSpec(
        observations=observation_spec,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec)

    dataset = reverb_dataset.make_dataset(
        server_address=self.server_address,
        environment_spec=env_spec,
        convert_zero_size_to_none=True)

    observation_shape = dataset.element_spec.data[0].shape.as_list()
    self.assertSequenceEqual(observation_shape, [None, 64, 64])
def observation_spec(self) -> specs.BoundedArray:
    """Returns the observation spec."""
    # Shape and dtype are taken from a freshly produced observation; the
    # observation is requested once for each, as before.
    board_shape = self._observation().shape
    board_dtype = self._observation().dtype
    return specs.BoundedArray(
        shape=board_shape,
        dtype=board_dtype,
        name="board",
        minimum=0,
        maximum=MAX_APPLES_PER_USER)
def __init__(self,
             *,
             num_observations: Mapping[str, int],
             num_actions: int = 1,
             action_dtype=np.int32,
             obs_dtype=np.int32,
             obs_shape: Sequence[int] = (),
             discount_spec: Optional[types.NestedSpec] = None,
             reward_spec: Optional[types.NestedSpec] = None,
             **kwargs):
    """Initialize the environment.

    Args:
        num_observations: maps each observation key to its number of distinct
            values; the spec for a key is bounded to `[0, count - 1]`.
        num_actions: forwarded to the base class initializer.
        action_dtype: forwarded to the base class initializer.
        obs_dtype: dtype of every observation spec; also called to build the
            bounds.
        obs_shape: shape shared by every observation spec.
        discount_spec: forwarded to the base class initializer.
        reward_spec: forwarded to the base class initializer.
        **kwargs: forwarded to the base class initializer.
    """
    # One bounded spec per observation key, all sharing shape and dtype.
    per_key_specs = {
        key: specs.BoundedArray(
            shape=obs_shape,
            dtype=obs_dtype,
            minimum=obs_dtype(0),
            maximum=obs_dtype(count - 1))
        for key, count in num_observations.items()
    }
    super().__init__(
        num_actions=num_actions,
        action_dtype=action_dtype,
        observation_spec=per_key_specs,
        discount_spec=discount_spec,
        reward_spec=reward_spec,
        **kwargs)
def __init__(
        self,
        DIAYN_agent: DIAYNAgent.DIAYNAgent,
        environment_spec: specs.EnvironmentSpec,
        action_spec: specs.BoundedArray,
        z_dim: int,
        replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
        replay_server_port: Optional[int] = None,
) -> None:
    """Builds the hierarchical controller agent on top of a DIAYN agent.

    Args:
        DIAYN_agent: Agent stored on `self._DIAYN_agent`.
        environment_spec: Spec of the environment; passed unchanged to the
            distributional-MPO agent. Its observation spec must be rank 1.
        action_spec: Action spec passed to `make_feed_forward_networks`.
        z_dim: Number of entries of the latent variable z.
        replay_table_name: Replay table name passed to the agent.
        replay_server_port: Optional replay server port passed to the agent.

    Raises:
        AssertionError: If the observation spec is not rank 1.
    """
    self._z_dim = z_dim
    z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)
    self._environment_spec = environment_spec

    # Only the observation spec is kept separately; the environment spec
    # itself is not modified here.
    self._obs_space = environment_spec.observations
    # The message previously interpolated an undefined name (`obs_shape`), so a
    # failing check raised NameError instead of this AssertionError.
    assert (len(self._obs_space.shape) == 1), f"Only vector observations are supported for now. Observations shape passed: {self._obs_space.shape}"

    self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
    self._agent = dmpo.DistributionalMPO(
        environment_spec=environment_spec,
        policy_network=self._agent_networks['policy'],
        critic_network=self._agent_networks['critic'],
        observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
        extra_modules_to_save={
            'hierarchical_controller':
                self._agent_networks['hierarchical_controller'],
        },
        checkpoint_name='hierarchical_dmpo',
        replay_table_name=replay_table_name,
        replay_server_port=replay_server_port,
        return_action_entropy=True,
    )
    self._DIAYN_agent = DIAYN_agent

    # Create variables for the hierarchical controller.
    tf2_utils.create_variables(
        self._agent_networks['hierarchical_controller'], [self._obs_space])
def discount_spec(self) -> Dict[str, specs.BoundedArray]:
    """Returns a separate float32 scalar discount spec in [0, 1] per agent."""
    return {
        agent_id: specs.BoundedArray((), np.float32, minimum=0, maximum=1.0)
        for agent_id in self.agents
    }
def extra_spec(self) -> Dict[str, specs.BoundedArray]:
    """Returns the spec of the environment state under the key "s_t"."""
    env_state = self._environment.get_state()
    # TODO (dries): What should the real bounds be of the state spec?
    state_spec = specs.BoundedArray(
        env_state.shape,
        np.float32,
        minimum=float("-inf"),
        maximum=float("inf"))
    return {"s_t": state_spec}
def observation_spec(self):
    """Returns an OVAR bundling observation, vector, action and reward specs."""
    # pov obs are uint8, but required type is float.
    pov_spec = self._observation_spec['pov']
    float_pov_spec = specs.BoundedArray(
        shape=pov_spec.shape,
        dtype=np.float32,
        minimum=pov_spec.minimum,
        maximum=pov_spec.maximum,
        name=pov_spec.name)
    return OVAR(
        observation=float_pov_spec,
        obs_vector=self._observation_spec['vector'],
        action=self.action_spec(),
        reward=self.reward_spec())
def __init__(self,
             *,
             action_dim: int = 1,
             observation_dim: int = 1,
             bounded: bool = False,
             dtype=np.float32,
             reward_dtype=np.float32,
             **kwargs):
    """Initialize the environment.

    Args:
        action_dim: number of action dimensions; 0 gives a scalar action.
        observation_dim: number of observation dimensions; 0 gives a scalar
            observation.
        bounded: whether or not the actions are bounded in [-1, 1].
        dtype: dtype of the action and observation spaces.
        reward_dtype: dtype of the reward and discounts.
        **kwargs: additional kwargs passed to the Environment base class.
    """
    # A dimension of zero means "scalar", i.e. an empty shape tuple.
    action_shape = (action_dim,) if action_dim != 0 else ()
    observation_shape = (observation_dim,) if observation_dim != 0 else ()

    observation_spec = specs.Array(observation_shape, dtype)
    reward_spec = specs.Array((), reward_dtype)
    discount_spec = specs.BoundedArray((), reward_dtype, 0.0, 1.0)
    action_spec = (specs.BoundedArray(action_shape, dtype, -1.0, 1.0)
                   if bounded else specs.Array(action_shape, dtype))

    env_spec = specs.EnvironmentSpec(
        observations=observation_spec,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec)
    super().__init__(spec=env_spec, **kwargs)
def extra_spec(self) -> Dict[str, specs.BoundedArray]:
    """Returns the state spec under "s_t", or an empty dict when disabled."""
    if not self.return_state_info:
        return {}

    state_shape = self.environment._get_state().shape
    # Unbounded on both sides, one bound per entry along the first axis.
    entry_count = state_shape[0]
    state_spec = specs.BoundedArray(
        shape=state_shape,
        dtype="float32",
        name="observation",
        minimum=[float("-inf")] * entry_count,
        maximum=[float("inf")] * entry_count,
    )
    return {"s_t": state_spec}
def __init__(self,
             *,
             num_actions: int = 1,
             num_observations: int = 1,
             action_dtype=np.int32,
             obs_dtype=np.int32,
             reward_dtype=np.float32,
             obs_shape: Sequence[int] = (),
             **kwargs):
    """Initialize the environment.

    Args:
        num_actions: number of discrete actions.
        num_observations: observations are bounded to
            `[0, num_observations - 1]`.
        action_dtype: dtype of the action spec.
        obs_dtype: dtype of the observation spec; also called to build the
            observation bounds.
        reward_dtype: dtype of the reward and discount specs.
        obs_shape: shape of the observation spec.
        **kwargs: forwarded to the base class initializer.
    """
    action_spec = specs.DiscreteArray(num_actions, dtype=action_dtype)
    lowest_obs = obs_dtype(0)
    highest_obs = obs_dtype(num_observations - 1)
    observation_spec = specs.BoundedArray(
        shape=obs_shape,
        dtype=obs_dtype,
        minimum=lowest_obs,
        maximum=highest_obs)
    reward_spec = specs.Array((), reward_dtype)
    discount_spec = specs.BoundedArray((), reward_dtype, 0.0, 1.0)

    env_spec = specs.EnvironmentSpec(
        observations=observation_spec,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec)
    super().__init__(spec=env_spec, **kwargs)
def setUp(self):
    """Prepares dimensions, dummy parameters and an environment spec."""
    super().setUp()
    self.state_dims = 8
    self.action_dims = 4

    # Every parameter group is a length-3 vector of ones.
    self.params = {
        'world': jnp.ones((3,)),
        'policy': jnp.ones((3,)),
        'value': jnp.ones((3,)),
    }

    observation_spec = specs.Array(shape=(self.state_dims,), dtype=float)
    action_spec = specs.Array(shape=(self.action_dims,), dtype=float)
    reward_spec = specs.Array(shape=(1,), dtype=float, name='reward')
    discount_spec = specs.BoundedArray(
        shape=(), dtype=float, minimum=0., maximum=1., name='discount')
    self.env_spec = specs.EnvironmentSpec(
        observations=observation_spec,
        actions=action_spec,
        rewards=reward_spec,
        discounts=discount_spec)
def define_residual_spec(rl_features,
                         env,
                         base_agent,
                         action_norm,
                         action_norm_scale=1.0,
                         include_base_action=True,
                         include_base_feats=True,
                         base_network=None):
    # TODO(minttu): pass in GymWrapper(env) without any other wrapper classes.
    """Defines environment observation and action spaces as seen by the RL agent.

    Besides building the spec, this function prints the residual bounds, the
    environment and normalized action specs and the final spec. When `env` is
    a `gym.Env`, it also sets `minimum`, `maximum` and `name` attributes on
    `env.action_space` in place.

    Args:
        rl_features: A list of state features visible to the agent. If set,
            they replace any visual features.
        env: The environment which defines the action space, rewards and
            discounts.
        base_agent: base agent to use in residual training.
        action_norm: bc_agent.ActionSpace object defining action normalization.
        action_norm_scale: Scalar by which to scale residual action
            normalization.
        include_base_action: If True, add base agent action to spec.
        include_base_feats: If True, add features given by base agent to spec.
        base_network: Network type used by the base agent, if applicable. Must
            be 'resnet18_narrow32' or 'hand_vil' when `include_base_feats` is
            True.

    Returns:
        residual_spec: An acme.specs.EnvironmentSpec instance defining the
            residual spec.

    Raises:
        KeyError: If `include_base_feats` is True and `base_network` is not
            one of the known network types.
    """
    feats_spec = collections.OrderedDict()
    visible_state_dim = 0
    # This check allows train_bc to use this function to set residual spec
    # without using env wrappers.
    if isinstance(env, gym.Env):
        for k, v in env.observation_space.spaces.items():
            if k in rl_features:
                # Features with an empty shape count as one dimension.
                visible_state_dim += v.shape[0] if v.shape else 1
    else:
        if FLAGS.domain == 'mime':
            obs_space = mime_env_utils.make_dict_space(env.scene, *rl_features).spaces
        else:
            obs_space = env.observation_spec()
        for k, v in obs_space.items():
            if k in rl_features:
                visible_state_dim += v.shape[0] if v.shape else 1
    if include_base_feats:
        # Feature vector size for each supported base network type.
        base_feat_size = {
            'resnet18_narrow32': 256,
            'hand_vil': 200,
        }[base_network]
        feats_spec['feats'] = specs.Array([base_feat_size], np.float32, 'feats')
    if visible_state_dim > 0:
        feats_spec['visible_state'] = (specs.Array([visible_state_dim], np.float32, 'visible_state'))
    if include_base_action:
        feats_spec['base_action'] = specs.Array([base_agent.action_target_dim], np.float32, 'base_action')
    if FLAGS.rl_observation_network is not None:
        # TODO(minttu): Get image size from env observation spec.
        if FLAGS.input_type == 'depth':
            feats_spec['depth'] = specs.Array(
                [FLAGS.image_size, FLAGS.image_size, 3], np.uint8, 'depth')
        elif FLAGS.input_type == 'rgb':
            image_size = FLAGS.image_size
            rgb_shape = ([3, image_size, image_size, 3] if FLAGS.late_fusion else [image_size, image_size, 9])
            feats_spec['rgb'] = specs.Array(rgb_shape, np.uint8, 'rgb')
    if isinstance(env, gym.Env):
        # Gym action spaces expose low/high; alias them to the attribute names
        # read below. Note that this mutates env.action_space.
        env_action_spec = env.action_space
        env_action_spec.minimum = env_action_spec.low
        env_action_spec.maximum = env_action_spec.high
        env_action_spec.name = 'action'
        # Concatenating fields here since it is non-trivial to use dictionary
        # observations with DemoReader's generator.
        # NOTE(review): this sums the shape entries of all feature specs; it
        # looks like it assumes every spec is rank 1 -- confirm for image
        # inputs.
        concat_shape = np.sum([a.shape for a in feats_spec.values()])
        feats_spec = collections.OrderedDict()
        feats_spec['residual_obs'] = specs.Array((concat_shape, ), np.float32, 'residual_obs')
    else:
        env_action_spec = env.action_spec()
    env_min = env_action_spec.minimum
    env_max = env_action_spec.maximum
    # Allow (at the extreme) to fully reverse a base action (from one action
    # space limit to the opposite limit).
    min_residual = env_min - env_max if include_base_action else env_min
    max_residual = env_max - env_min if include_base_action else env_max
    print('min residual', min_residual, 'max residual', max_residual)
    residual_action_space = bc_agent.ActionSpace(action_norm, env=env, scale=action_norm_scale)
    if action_norm in ['centered', 'zeromean_unitvar']:
        # Reuse stats; normalization scheme may still be different.
        residual_action_space.mean = base_agent.action_space.mean
        residual_action_space.std = base_agent.action_space.std
    norm_min = residual_action_space.normalize_flat(min_residual)
    norm_max = residual_action_space.normalize_flat(max_residual)
    norm_action_spec = specs.BoundedArray(shape=env_action_spec.shape, dtype=env_action_spec.dtype, minimum=norm_min, maximum=norm_max, name=env_action_spec.name)
    print(env_action_spec)
    print(norm_action_spec)
    # Gym environments have no reward/discount spec methods, so the specs are
    # built here from reward_range and the fixed [0, 1] discount interval.
    if isinstance(env, gym.Env):
        reward_spec = specs.BoundedArray(shape=(), dtype=float, minimum=env.reward_range[0], maximum=env.reward_range[1], name='reward')
    else:
        reward_spec = env.reward_spec()
    if isinstance(env, gym.Env):
        discount_spec = specs.BoundedArray(shape=(), dtype=float, minimum=0., maximum=1., name='discount')
    else:
        discount_spec = env.discount_spec()
    # residual_spec = specs.make_environment_spec(env)
    # Use same normalization for base agent and residual agent.
    residual_spec = specs.EnvironmentSpec(observations=feats_spec, actions=norm_action_spec, rewards=reward_spec, discounts=discount_spec)
    print('Residual spec', residual_spec)
    return residual_spec
def agent_info_spec() -> specs.BoundedArray:
    """Create the spec for the agent_info part of the observation."""
    info_shape = (4,)
    return specs.BoundedArray(
        info_shape,
        dtype=np.float32,
        minimum=0.0,
        maximum=10)
def init(self, params):
    """Configures TensorFlow, seeds the RNGs and builds the DDPG agent.

    Args:
        params: Configuration object. The fields read here are `seed`, `name`,
            `states.rank`, `num_phases`, `exp_path`, the network layer
            settings and the DDPG hyper-parameters passed to the agent.
    """
    if not _TF_USE_GPU:
        tf.config.set_visible_devices([], 'GPU')
    tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
    tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

    if params.seed:
        # Offset the seed by the agent name so agents get distinct seeds.
        name_offset = sum(ord(ch) for ch in params.name)
        agent_seed = params.seed + name_offset
        random.seed(agent_seed)
        np.random.seed(agent_seed)
        tf.random.set_seed(agent_seed)

    # Internalize params.
    self._params = params
    self._name = params.name

    # Whether learning stopped.
    self._stop = False

    # Define specs. Everything needs to be single precision by default.
    state_dim = params.states.rank
    num_phases = params.num_phases
    env_spec = specs.EnvironmentSpec(
        observations=specs.Array(
            shape=(state_dim,), dtype=np.float32, name='obs'),
        actions=specs.BoundedArray(
            shape=(num_phases,),
            dtype=np.float32,
            minimum=0.,
            maximum=1.,
            name='action'),
        rewards=specs.Array(shape=(), dtype=np.float32, name='reward'),
        discounts=specs.BoundedArray(
            shape=(), dtype=np.float32, minimum=0., maximum=1.,
            name='discount'))

    # Logger.
    log_dir = f'{params.exp_path}/logs/{self._name}'
    self._logger = make_default_logger(directory=log_dir, label=self._name)
    learner_logger = make_default_logger(
        directory=log_dir, label=f'{self._name}-learning')

    agent_networks = _make_networks(
        actions_dim=params.num_phases,
        state_dim=params.states.rank,
        policy_layers=params.policy_layers,
        critic_layers=params.critic_layers)

    self.agent = acme_agent.DDPG(
        environment_spec=env_spec,
        policy_network=agent_networks['policy'],
        critic_network=agent_networks['critic'],
        observation_network=agent_networks['observation'],
        discount=params.discount_factor,
        batch_size=params.batch_size,
        prefetch_size=params.prefetch_size,
        target_update_period=params.target_update_period,
        min_replay_size=params.min_replay_size,
        max_replay_size=params.max_replay_size,
        samples_per_insert=params.samples_per_insert,
        n_step=params.n_step,
        sigma_init=params.sigma_init,
        sigma_final=params.sigma_final,
        sigma_schedule_timesteps=params.sigma_schedule_timesteps,
        clipping=params.clipping,
        logger=learner_logger,
        checkpoint=False,
    )

    # Observations counter.
    self._obs_counter = 0
def discount_spec(self) -> specs.BoundedArray:
    """Returns a float32 scalar discount spec bounded to [0, 1]."""
    scalar_shape = ()
    return specs.BoundedArray(
        scalar_shape, np.float32, minimum=0, maximum=1.0)
def reward_spec(self) -> specs.BoundedArray:
    """Returns a float32 scalar reward spec bounded by the game's utilities."""
    game = self._environment.game
    lowest_utility = game.min_utility()
    highest_utility = game.max_utility()
    return specs.BoundedArray(
        (), np.float32, minimum=lowest_utility, maximum=highest_utility)
def discount_spec(self) -> Dict[str, specs.BoundedArray]:
    """Returns a separate float32 scalar discount spec in [0, 1] per agent."""
    per_agent_specs = {}
    for agent_id in self._possible_agents:
        per_agent_specs[agent_id] = specs.BoundedArray(
            (), np.float32, minimum=0, maximum=1.0)
    return per_agent_specs
from typing import Optional

from acme import environment_loop
from acme import specs
from acme import types
from acme.testing import fakes
import numpy as np

from absl.testing import absltest
from absl.testing import parameterized

EPISODE_LENGTH = 10

# Discount specs: float32 arrays bounded to [0, 1], as a vector, a matrix and
# a dict combining both.
F32_2_MIN_0_MAX_1 = specs.BoundedArray(
    dtype=np.float32, shape=(2,), minimum=0.0, maximum=1.0)
F32_2x1_MIN_0_MAX_1 = specs.BoundedArray(
    dtype=np.float32, shape=(2, 1), minimum=0.0, maximum=1.0)
TREE_MIN_0_MAX_1 = {'a': F32_2_MIN_0_MAX_1, 'b': F32_2x1_MIN_0_MAX_1}

# Reward specs: unbounded float32 arrays, as a scalar, a matrix and a dict
# combining both.
F32 = specs.Array(dtype=np.float32, shape=())
F32_1x3 = specs.Array(dtype=np.float32, shape=(1, 3))
TREE = {'a': F32, 'b': F32_1x3}

# Each entry is (test case name, discount spec, reward spec).
# NOTE(review): the None entries presumably select the defaults of whatever
# consumes these cases -- confirm against the test that uses them.
TEST_CASES = (
    ('scalar_discount_scalar_reward', None, None),
    ('vector_discount_scalar_reward', F32_2_MIN_0_MAX_1, F32),
    ('matrix_discount_matrix_reward', F32_2x1_MIN_0_MAX_1, F32_1x3),
    ('tree_discount_tree_reward', TREE_MIN_0_MAX_1, TREE),
)
def __init__(self, num_players: int):
    """Builds the per-agent action/observation specs and the state spec.

    All agents share one action spec and one observation spec. The observation
    bounds are 27 fixed features followed by the bounds of the last action.
    The state bounds are 5 global entries, then 9 entries per player, then one
    copy of the observation bounds per player.

    Args:
        num_players: Number of players; agents are named "player_0" to
            "player_<num_players - 1>".
    """
    self._reset_next_step = True
    # Divisor applied to the distance/angle-like bounds below.
    self.scaling = 200.0

    # Choose action
    act_min = [0.0] * 7  # 6 + No action
    act_max = [1.0] * 7  # 6 + No action

    # Action continuous component
    # All directions are in x, y format
    act_min.extend([
        -100 / self.scaling,
        -1,
        -1,  # dash (power, direction)
        0,
        -1,
        -1,  # kick (power, direction)
        0,
        0,  # change_view (width, quality)
        -1,
        -1,
        0,  # tackle (direction, foul)
        -1,
        -1,  # turn (direction)
        -1,
        -1,
    ])  # turn_neck(direction)
    act_max.extend(
        [100 / self.scaling, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert len(act_min) == len(act_max)
    action_spec = specs.BoundedArray(
        shape=(len(act_min), ),
        dtype="float32",
        name="action",
        minimum=act_min,
        maximum=act_max,
    )
    self.action_size = action_spec.shape[0]

    # obs_dict = {"time_left": 0, "side": 1, "sense_self": 2,
    #             "coords": (3, 5), "body_dir": (5, 7),
    #             "head_dir": (7, 9), "width": (9, 12),
    #             "quality": 13, "stamina": 14, "effort": 15,
    #             "speed_amount": 16, "speed_direction": (17, 19),
    #             "neck_direction": (19, 21),
    #             "see_ball": 21, "ball_dist": 22,
    #             "ball_dir": (23, 25), "ball_dist_change": 25,
    #             "ball_dir_change": 26, "ball_speed": 27,
    #             "last_action": (28, 28 + self.action_size),
    #             }
    # TODO: Check if all bounds are correct
    obs_min = [
        0.0,  # time_left
        0.0,  # side
        0.0,  # sense_self
        -100 / self.scaling,
        -50 / self.scaling,  # coords
        -1,
        -1,  # body_dir
        -1,
        -1,  # head_dir
        0,
        0,
        0,  # width
        0,  # quality
        0,  # stamina
        0,  # effort
        0,  # speed_amount
        -1,
        -1,  # speed_direction
        -1,
        -1,  # neck_direction
        0,  # see_ball
        0,  # ball_dist
        -1,
        -1,  # ball_dir
        -100 / self.scaling,  # ball_dist_change
        -180 / self.scaling,  # ball_dir_change
        0,  # ball_speed
    ]
    obs_max = [
        1.0,  # time_left
        1.0,  # side
        1.0,  # sense_self
        100 / self.scaling,
        50 / self.scaling,  # coords
        1,
        1,  # body_dir
        1,
        1,  # head_dir
        1,
        1,
        1,  # width
        1,  # quality
        1,  # stamina
        1,  # effort
        100 / self.scaling,  # speed_amount
        1,
        1,  # speed_direction
        1,
        1,  # neck_direction
        1,  # see_ball
        100 / self.scaling,  # ball_dist
        1,
        1,  # ball_dir
        100 / self.scaling,  # ball_dist_change
        180 / self.scaling,  # ball_dir_change
        100 / self.scaling,  # ball_speed
    ]

    # Last action: its bounds are the action spec's own bounds.
    obs_min.extend(action_spec.minimum)
    obs_max.extend(action_spec.maximum)

    # [see_player, is_on_team, player_distance,
    # player_direction] for num_agents-1
    self.num_agents = num_players
    # TODO: Add this in again.
    # for i in range(21):
    #     # [see_player, is_on_team, player_distance,
    #     # player_direction (x, y format)]
    #     obs_min.extend([0, 0, -200 / self.scaling, -1, -1])
    #     obs_max.extend([1, 1, +200 / self.scaling, 1, 1])
    assert len(obs_min) == len(obs_max)
    self.obs_size = len(obs_min)

    self.agents = ["player_" + str(r) for r in range(num_players)]
    self._observation_specs = {}
    self._action_specs = {}
    obs_spec = specs.BoundedArray(
        shape=(self.obs_size, ),
        dtype="float32",
        name="observation",
        minimum=obs_min,
        maximum=obs_max,
    )

    # Time_left, ball coords, ball delta_coords
    state_min = [0, -100 / self.scaling, -100 / self.scaling, -10, -10]
    state_max = [1, 100 / self.scaling, 100 / self.scaling, 10, 10]

    # First player is the critic player
    # Players sides, coords, delta_coords, body_angle (x, y format),
    # head_angle (x, y format)
    for i in range(num_players):
        state_min.extend([
            0.0,
            -100 / self.scaling,
            -100 / self.scaling,
            -10,
            -10,
            -1,
            -1,
            -1,
            -1,
        ])
        state_max.extend([
            1.0, +100 / self.scaling, +100 / self.scaling, +10, +10, 1, 1, 1,
            1
        ])

    # Add all observations to state info
    for i in range(num_players):
        state_min.extend(obs_min)
        state_max.extend(obs_max)
    assert len(state_min) == len(state_max)
    self._state_spec = specs.BoundedArray(
        shape=(len(state_min), ),
        dtype="float32",
        name="state",
        minimum=state_min,
        maximum=state_max,
    )

    # Every agent starts with a discount of 1.0.
    self._discount = dict(
        zip(self.agents, [np.float32(1.0)] * len(self.agents)))

    # TODO: Delete this
    # self.previous_act = {"player_0": None}
    for agent in self.agents:
        # TODO: Why is the action spec in two places?
        self._observation_specs[agent] = OLT(
            observation=obs_spec,
            legal_actions=action_spec,
            terminal=specs.Array((1, ), np.float32),
        )
        self._action_specs[agent] = action_spec
def init(self, params):
    """Configures TensorFlow, seeds the RNGs and builds the R2D2 agent.

    Args:
        params: Configuration object. The fields read here are `seed`, `name`,
            `states.rank`, `actions.depth`, `exp_path`, the network settings
            and the R2D2 hyper-parameters passed to the agent.
    """
    if not _TF_USE_GPU:
        tf.config.set_visible_devices([], 'GPU')
    tf.config.threading.set_inter_op_parallelism_threads(_TF_NUM_THREADS)
    tf.config.threading.set_intra_op_parallelism_threads(_TF_NUM_THREADS)

    if params.seed:
        # Offset the seed by the agent name so agents get distinct seeds.
        name_offset = sum(ord(ch) for ch in params.name)
        agent_seed = params.seed + name_offset
        random.seed(agent_seed)
        np.random.seed(agent_seed)
        tf.random.set_seed(agent_seed)

    # Internalize params.
    self._params = params
    self._name = params.name

    # Whether learning stopped.
    self._stop = False

    # Define specs. Everything needs to be single precision by default.
    state_dim = params.states.rank
    env_spec = specs.EnvironmentSpec(
        observations=specs.Array(
            shape=(state_dim,), dtype=np.float32, name='obs'),
        actions=specs.DiscreteArray(
            dtype=np.int32, num_values=params.actions.depth, name="action"),
        rewards=specs.Array(shape=(), dtype=np.float32, name='reward'),
        discounts=specs.BoundedArray(
            shape=(), dtype=np.float32, minimum=0., maximum=1.,
            name='discount'))

    # Logger.
    log_dir = f'{params.exp_path}/logs/{self._name}'
    self._logger = make_default_logger(directory=log_dir, label=self._name)
    learner_logger = make_default_logger(
        directory=log_dir, label=f'{self._name}-learning')

    q_network = Network(
        num_actions=env_spec.actions.num_values,
        rnn_hidden_size=params.rnn_hidden_size,
        head_layers=params.head_layers)

    self.agent = acme_agent.R2D2(
        environment_spec=env_spec,
        network=q_network,
        batch_size=params.batch_size,
        samples_per_insert=params.samples_per_insert,
        burn_in_length=params.burn_in_length,
        trace_length=params.trace_length,
        replay_period=params.replay_period,
        min_replay_size=params.min_replay_size,
        max_replay_size=params.max_replay_size,
        discount=params.discount_factor,
        prefetch_size=params.prefetch_size,
        target_update_period=params.target_update_period,
        importance_sampling_exponent=params.importance_sampling_exponent,
        priority_exponent=params.priority_exponent,
        epsilon_init=params.epsilon_init,
        epsilon_final=params.epsilon_final,
        epsilon_schedule_timesteps=params.epsilon_schedule_timesteps,
        learning_rate=params.learning_rate,
        store_lstm_state=params.store_lstm_state,
        max_priority_weight=params.max_priority_weight,
        logger=learner_logger,
        checkpoint=False,
    )

    # Observations counter.
    self._obs_counter = 0