def select_action(
    self, agent: str, observation: types.NestedArray
) -> types.NestedArray:
    """Select an action for a single agent in the system.

    Args:
        agent (str): agent id.
        observation (types.NestedArray): observation tensor received from the
            environment.

    Returns:
        types.NestedArray: action and policy.
    """
    # TODO Mask actions here using observation.legal_actions

    # Initialize the RNN state if necessary.
    if self._states[agent] is None:
        # Index the network either on agent type or on agent id.
        agent_key = agent.split("_")[0] if self._shared_weights else agent
        self._states[agent] = self._policy_networks[agent_key].initial_state(1)

    # Step the recurrent policy forward given the current observation and state.
    action, policy, new_state = self._policy(
        agent, observation.observation, self._states[agent]
    )

    # Bookkeeping of recurrent states for the observe method.
    self._update_state(agent, new_state)

    # Return numpy arrays with the batch dimension squeezed out.
    action = tf2_utils.to_numpy_squeeze(action)
    policy = tf2_utils.to_numpy_squeeze(policy)

    return action, policy
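# The masking TODO above could be handled before sampling. A minimal sketch,
# assuming the policy emits per-action logits and `legal_actions` is a 0/1
# vector of the same shape (the helper name and shapes are assumptions, not
# part of the original code):
def sample_legal_action(logits: tf.Tensor,
                        legal_actions: tf.Tensor) -> tf.Tensor:
    """Sample an action after masking out illegal entries.

    Illegal entries receive a large negative logit so the categorical
    sampler effectively never selects them.
    """
    legal = tf.cast(legal_actions, tf.bool)
    masked_logits = tf.where(legal, logits, tf.ones_like(logits) * -1e9)
    return tfd.Categorical(logits=masked_logits).sample()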
def select_action(
    self, agent: str, observation: types.NestedArray
) -> types.NestedArray:
    """Select an action for a single agent in the system.

    Args:
        agent (str): agent id.
        observation (types.NestedArray): observation tensor received from the
            environment.

    Returns:
        types.NestedArray: agent action.
    """
    # Step the recurrent policy/value network forward
    # given the current observation and state.
    self._prev_log_probs[agent], action = self._policy(agent, observation)

    # Return a numpy array with the batch dimension squeezed out.
    action = tf2_utils.to_numpy_squeeze(action)

    # TODO(Kale-ab): Remove. This is for debugging.
    if np.isnan(action).any():
        print(
            f"Value error - Log Probs: {self._prev_log_probs[agent]} "
            f"Action: {action}"
        )

    return action
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_obs = tf2_utils.add_batch_dim(observation)

    # Initialize the RNN state if necessary.
    if self._state is None:
        self._state = self._network.initial_state(1)

    # Forward.
    policy_output, new_state = self._policy(batched_obs, self._state)

    # If the policy network parameterises a distribution, sample from it.
    def maybe_sample(output):
        if isinstance(output, tfd.Distribution):
            output = output.sample()
        return output

    policy_output = tree.map_structure(maybe_sample, policy_output)

    self._prev_state = self._state
    self._state = new_state

    # Convert to numpy and squeeze out the batch dimension.
    action = tf2_utils.to_numpy_squeeze(policy_output)

    return action
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Pass the observation through the policy network.
    action = self._policy(observation)

    # Return a numpy array with the batch dimension squeezed out.
    return tf2_utils.to_numpy_squeeze(action)
def observe(self, action: types.NestedArray, next_timestep: dm_env.TimeStep):
    if not self._adder:
        return

    numpy_state = tf2_utils.to_numpy_squeeze(self._prev_state)
    self._adder.add(action, next_timestep, extras=(numpy_state,))
def observe(
    self,
    action: types.NestedArray,
    next_timestep: dm_env.TimeStep,
):
    if not self._adder:
        return

    extras = {'logits': self._prev_logits, 'core_state': self._prev_state}
    extras = tf2_utils.to_numpy_squeeze(extras)
    self._adder.add(action, next_timestep, extras)
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_obs = tf2_utils.add_batch_dim(observation)

    # Forward the policy network.
    action = self._policy(batched_obs)

    # Convert to numpy and squeeze out the batch dimension.
    action = tf2_utils.to_numpy_squeeze(action)

    return action
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_observation = tf2_utils.add_batch_dim(observation)

    # Compute the policy, conditioned on the observation.
    policy = self._policy_network(batched_observation)

    if self._deterministic_policy:
        action = policy.mean()
    else:
        action = policy.sample()

    self._log_prob = policy.log_prob(action)
    return tf2_utils.to_numpy_squeeze(action)
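# A minimal illustration of the deterministic/stochastic switch above,
# assuming the policy head returns a tfd.Distribution. The diagonal Gaussian
# here is a stand-in for demonstration, not the network's actual output.
def _example_distribution_policy_outputs():
    policy = tfd.MultivariateNormalDiag(loc=tf.zeros(4), scale_diag=tf.ones(4))
    deterministic_action = policy.mean()   # greedy action for evaluation
    stochastic_action = policy.sample()    # exploratory action for training
    log_prob = policy.log_prob(stochastic_action)  # stored for the learner
    return deterministic_action, stochastic_action, log_prob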
def select_action(
    self, agent: str, observation: types.NestedArray
) -> Tuple[types.NestedArray, types.NestedArray]:
    """Select an action for a single agent in the system.

    Args:
        agent (str): agent id.
        observation (types.NestedArray): observation tensor received from the
            environment.

    Returns:
        Tuple[types.NestedArray, types.NestedArray]: agent action and policy.
    """
    # Step the policy/value network forward given the current observation.
    action, policy = self._policy(agent, observation.observation)

    # Return numpy arrays with the batch dimension squeezed out.
    action = tf2_utils.to_numpy_squeeze(action)
    policy = tf2_utils.to_numpy_squeeze(policy)

    return action, policy
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_observation = tf2_utils.add_batch_dim(observation)

    # Compute the policy, conditioned on the observation.
    action, policy, log_prob = self._policy_network.getAll(batched_observation)

    self._prev_logP = log_prob
    self._prev_means = policy

    # Return a numpy array with the batch dimension squeezed out.
    return tf2_utils.to_numpy_squeeze(action)
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Initialize the RNN state if necessary.
    if self._state is None:
        self._state = self._network.initial_state(1)

    # Step the recurrent policy forward given the current observation and state.
    policy_output, new_state = self._policy(observation, self._state)

    # Bookkeeping of recurrent states for the observe method.
    self._prev_state = self._state
    self._state = new_state

    # Return a numpy array with the batch dimension squeezed out.
    return tf2_utils.to_numpy_squeeze(policy_output)
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_obs = tf2_utils.add_batch_dim(observation)

    # Initialize the RNN state if necessary.
    if self._state is None:
        self._state = self._network.initial_state(1)

    # Forward.
    (logits, _), new_state = self._policy(batched_obs, self._state)

    # Bookkeeping of logits and recurrent states for the observe method.
    self._prev_logits = logits
    self._prev_state = self._state
    self._state = new_state

    # Sample an action from the categorical distribution over the logits.
    action = tfd.Categorical(logits).sample()
    action = tf2_utils.to_numpy_squeeze(action)

    return action
def select_actions(
    self, observations: Dict[str, OLT]
) -> Dict[str, types.NestedArray]:
    """Select the actions for all agents in the system.

    Args:
        observations (Dict[str, OLT]): transition object containing
            observations, legal actions and terminals.

    Returns:
        Dict[str, types.NestedArray]: actions for all agents in the system.
    """
    actions = {}
    for agent, observation in observations.items():
        # Pass the observation through the policy network.
        if not self._evaluator:
            epsilon = self._trainer.get_epsilon()
        else:
            # Note (dries): For some reason 0 epsilon breaks on StarCraft.
            epsilon = 1e-10

        epsilon = tf.convert_to_tensor(epsilon)

        if self._fingerprint:
            trainer_step = self._trainer.get_trainer_steps()
            fingerprint = tf.concat([epsilon, trainer_step], axis=0)
            fingerprint = tf.expand_dims(fingerprint, axis=0)
            fingerprint = tf.cast(fingerprint, "float32")
        else:
            fingerprint = None

        action = self._policy(
            agent,
            observation.observation,
            observation.legal_actions,
            epsilon,
            fingerprint,
        )

        # Store a numpy array with the batch dimension squeezed out.
        actions[agent] = tf2_utils.to_numpy_squeeze(action)

    return actions
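# A sketch of the kind of epsilon-greedy step `self._policy` performs above,
# assuming the per-agent network returns Q-values; `q_network` is a stand-in
# for the real per-agent network, not part of the original code.
# trfl.epsilon_greedy returns a categorical distribution that puts
# (1 - epsilon) on the greedy legal action and spreads epsilon uniformly over
# the remaining legal actions.
def _example_epsilon_greedy_step(q_network, observation, legal_actions,
                                 epsilon):
    q_values = q_network(observation)
    return trfl.epsilon_greedy(
        q_values, epsilon, legal_actions_mask=legal_actions).sample()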
def select_actions(
    self, observations: Dict[str, OLT]
) -> Dict[str, types.NestedArray]:
    """Select the actions for all agents in the system.

    Args:
        observations (Dict[str, OLT]): transition object containing
            observations, legal actions and terminals.

    Returns:
        Dict[str, types.NestedArray]: actions for all agents in the system.
    """
    actions = {}
    message_inputs = self._communication_module.process_messages(self._messages)

    for agent, observation in observations.items():
        # Pass the observation through the policy network.
        if self._trainer is not None:
            epsilon = self._trainer.get_epsilon()
        else:
            epsilon = 0.0

        epsilon = tf.convert_to_tensor(epsilon)

        (policy_output, new_message), new_state = self._policy(
            agent,
            observation.observation,
            self._states[agent],
            message_inputs[agent],
            observation.legal_actions,
            epsilon,
        )

        # Bookkeeping of recurrent states and messages for the observe method.
        self._states[agent] = new_state
        self._messages[agent] = new_message

        # Store a numpy array with the batch dimension squeezed out.
        actions[agent] = tf2_utils.to_numpy_squeeze(policy_output)

    return actions
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_obs = tf2_utils.add_batch_dim(observation)

    # Initialize the RNN state if necessary.
    if self._state is None:
        self._state = self._network.initial_state(1)

    # Forward.
    policy_output, new_state = self._policy(batched_obs, self._state)

    self._prev_state = self._state
    self._state = new_state

    # Convert to numpy and squeeze out the batch dimension.
    action = tf2_utils.to_numpy_squeeze(policy_output)

    return action
def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    # Add a dummy batch dimension and as a side effect convert numpy to TF.
    batched_obs = tf2_utils.add_batch_dim(observation)

    # Forward the policy network.
    policy_output = self._policy_network(batched_obs)

    # If the policy network parameterises a distribution, sample from it.
    def maybe_sample(output):
        if isinstance(output, tfd.Distribution):
            output = output.sample()
        return output

    policy_output = tree.map_structure(maybe_sample, policy_output)

    # Convert to numpy and squeeze out the batch dimension.
    action = tf2_utils.to_numpy_squeeze(policy_output)

    return action
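# The maybe_sample pattern above works on arbitrarily nested outputs via
# tree.map_structure. A small illustration with a hypothetical mixed
# structure (the dict layout is an assumption for demonstration only):
def _example_maybe_sample_nested():
    def maybe_sample(output):
        if isinstance(output, tfd.Distribution):
            output = output.sample()
        return output

    policy_output = {
        'action': tfd.Categorical(logits=tf.zeros(5)),  # gets sampled
        'value': tf.constant(0.3),                      # passes through
    }
    return tree.map_structure(maybe_sample, policy_output)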
def select_action2(
    self, observation: types.NestedArray, mask: types.NestedArray
) -> types.NestedArray:
    # Initialize the RNN state if necessary.
    if self._state is None:
        self._state = self._network.initial_state(1)

    # Step the recurrent policy forward given the current observation, state
    # and action mask.
    policy_output, new_state = self._policy(observation, self._state, mask)

    # counter = 0
    # while mask[policy_output] == 0 and counter < 1:
    #     policy_output, new_state = self._policy(observation, self._state, mask)
    #     counter += 1
    # if counter == 1:
    #     print("Valid actions are hard to find here! ->" + str(set(mask)))

    # Bookkeeping of recurrent states for the observe method.
    self._prev_state = self._state
    self._state = new_state

    # Return a numpy array with the batch dimension squeezed out.
    return tf2_utils.to_numpy_squeeze(policy_output)
def select_action(
    self, agent: str, observation: types.NestedArray
) -> types.NestedArray:
    """Select an action for a single agent in the system.

    Args:
        agent (str): agent id.
        observation (types.NestedArray): observation tensor received from the
            environment.

    Returns:
        types.NestedArray: agent action.
    """
    if not self._evaluator:
        epsilon = self._trainer.get_epsilon()
    else:
        # Note: 0 epsilon breaks on StarCraft, so use a tiny value instead.
        epsilon = 1e-10

    epsilon = tf.convert_to_tensor(epsilon)

    if self._fingerprint:
        trainer_step = self._trainer.get_trainer_steps()
        fingerprint = tf.concat([epsilon, trainer_step], axis=0)
        fingerprint = tf.expand_dims(fingerprint, axis=0)
        fingerprint = tf.cast(fingerprint, "float32")
    else:
        fingerprint = None

    action = self._policy(
        agent,
        observation.observation,
        observation.legal_actions,
        epsilon,
        fingerprint,
    )

    # Return a numpy array with the batch dimension squeezed out.
    action = tf2_utils.to_numpy_squeeze(action)

    return action
def observe(
    self,
    actions: Dict[str, types.NestedArray],
    next_timestep: dm_env.TimeStep,
    next_extras: Optional[Dict[str, types.NestedArray]] = None,
) -> None:
    """Record an observed timestep from the environment.

    Args:
        actions (Dict[str, types.NestedArray]): system agents' actions.
        next_timestep (dm_env.TimeStep): data emitted by an environment
            during interaction.
        next_extras (Dict[str, types.NestedArray], optional): possible extra
            information to record during the transition. Defaults to None
            (a mutable default dict would be shared across calls).
    """
    if not self._adder:
        return

    # Unpack the (action, policy) pair; only the policy is written to the
    # adder.
    _, policy = actions

    if not self._store_recurrent_state:
        if next_extras:
            # TODO (dries): Sort out this mypy issue.
            self._adder.add(policy, next_timestep, next_extras)  # type: ignore
        else:
            self._adder.add(policy, next_timestep)  # type: ignore
        return

    numpy_states = {
        agent: tf2_utils.to_numpy_squeeze(_state)
        for agent, _state in self._states.items()
    }
    if next_extras:
        next_extras.update({"core_states": numpy_states})
        self._adder.add(policy, next_timestep, next_extras)  # type: ignore
    else:
        self._adder.add(policy, next_timestep, numpy_states)  # type: ignore
def observe(
    self,
    actions: Dict[str, types.NestedArray],
    next_timestep: dm_env.TimeStep,
    next_extras: Optional[Dict[str, types.NestedArray]] = None,
) -> None:
    """Record an observed timestep from the environment.

    Args:
        actions (Dict[str, types.NestedArray]): system agents' actions.
        next_timestep (dm_env.TimeStep): data emitted by an environment
            during interaction.
        next_extras (Dict[str, types.NestedArray], optional): possible extra
            information to record during the transition. Defaults to None
            (a mutable default dict would be mutated and shared across calls).
    """
    if not self._adder:
        return

    # Copy before updating so a caller-supplied dict is not mutated.
    next_extras = dict(next_extras or {})
    next_extras.update({"log_probs": self._prev_log_probs})
    next_extras = tf2_utils.to_numpy_squeeze(next_extras)
    self._adder.add(actions, next_timestep, next_extras)
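# Why the default changed from `{}` to None in the two observe methods above:
# Python evaluates default arguments once, so a mutable default is shared and
# mutated across calls. A minimal demonstration of the pitfall:
def _example_mutable_default(xs: list = []):
    xs.append(len(xs))
    return xs
# _example_mutable_default() -> [0]
# _example_mutable_default() -> [0, 1]  # the same list object is reused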
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.RNNCore,
    target_network: snt.RNNCore,
    burn_in_length: int,
    trace_length: int,
    replay_period: int,
    demonstration_generator: Iterator,
    demonstration_ratio: float,
    model_directory: str,
    counter: counting.Counter = None,
    logger: loggers.Logger = None,
    discount: float = 0.99,
    batch_size: int = 32,
    target_update_period: int = 100,
    importance_sampling_exponent: float = 0.2,
    epsilon: float = 0.01,
    learning_rate: float = 1e-3,
    log_to_bigtable: bool = False,
    log_name: str = 'agent',
    checkpoint: bool = True,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    samples_per_insert: float = 32.0,
):
    extra_spec = {
        'core_state': network.initial_state(1),
    }
    # Remove batch dimensions.
    extra_spec = tf2_utils.squeeze_batch_dim(extra_spec)

    # Replay table.
    replay_table = reverb.Table(
        name=adders.DEFAULT_PRIORITY_TABLE,
        sampler=reverb.selectors.Prioritized(0.8),
        remover=reverb.selectors.Fifo(),
        max_size=max_replay_size,
        rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
        signature=adders.SequenceAdder.signature(environment_spec, extra_spec))

    # Demonstration table.
    demonstration_table = reverb.Table(
        name='demonstration_table',
        sampler=reverb.selectors.Prioritized(0.8),
        remover=reverb.selectors.Fifo(),
        max_size=max_replay_size,
        rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
        signature=adders.SequenceAdder.signature(environment_spec, extra_spec))

    # Launch the server.
    self._server = reverb.Server([replay_table, demonstration_table],
                                 port=None)
    address = f'localhost:{self._server.port}'

    sequence_length = burn_in_length + trace_length + 1

    # Components to add things into the replay and demonstration tables.
    sequence_kwargs = dict(
        period=replay_period,
        sequence_length=sequence_length,
    )
    adder = adders.SequenceAdder(client=reverb.Client(address),
                                 **sequence_kwargs)

    priority_function = {demonstration_table.name: lambda x: 1.}
    demo_adder = adders.SequenceAdder(client=reverb.Client(address),
                                      priority_fns=priority_function,
                                      **sequence_kwargs)

    # Play the demonstrations and write them to the demonstration table,
    # exhausting the generator.
    # TODO: MAX REPLAY SIZE
    _prev_action = 1  # This has to come from the spec.
    _add_first = True
    # Include this to make the datasets equivalent.
    numpy_state = tf2_utils.to_numpy_squeeze(network.initial_state(1))
    for ts, action in demonstration_generator:
        if _add_first:
            demo_adder.add_first(ts)
            _add_first = False
        else:
            demo_adder.add(_prev_action, ts, extras=(numpy_state,))
        _prev_action = action
        # Reset on a new episode.
        if ts.last():
            _prev_action = None
            _add_first = True

    # Replay dataset.
    max_in_flight_samples_per_worker = 2 * batch_size if batch_size else 100
    dataset = reverb.ReplayDataset.from_table_signature(
        server_address=address,
        table=adders.DEFAULT_PRIORITY_TABLE,
        max_in_flight_samples_per_worker=max_in_flight_samples_per_worker,
        # Memory/performance improvement attempt; see
        # https://github.com/deepmind/acme/issues/33.
        num_workers_per_iterator=2,
        sequence_length=sequence_length,
        emit_timesteps=sequence_length is None)

    # Demonstration dataset.
    d_dataset = reverb.ReplayDataset.from_table_signature(
        server_address=address,
        table=demonstration_table.name,
        max_in_flight_samples_per_worker=max_in_flight_samples_per_worker,
        num_workers_per_iterator=2,
        sequence_length=sequence_length,
        emit_timesteps=sequence_length is None)

    # Mix the replay and demonstration datasets.
    dataset = tf.data.experimental.sample_from_datasets(
        [dataset, d_dataset],
        [1 - demonstration_ratio, demonstration_ratio])
    # Batch and prefetch.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    tf2_utils.create_variables(network, [environment_spec.observations])
    tf2_utils.create_variables(target_network, [environment_spec.observations])

    learner = learning.R2D2Learner(
        environment_spec=environment_spec,
        network=network,
        target_network=target_network,
        burn_in_length=burn_in_length,
        dataset=dataset,
        reverb_client=reverb.TFClient(address),
        counter=counter,
        logger=logger,
        sequence_length=sequence_length,
        discount=discount,
        target_update_period=target_update_period,
        importance_sampling_exponent=importance_sampling_exponent,
        max_replay_size=max_replay_size,
        learning_rate=learning_rate,
        store_lstm_state=False,
    )

    self._checkpointer = tf2_savers.Checkpointer(
        directory=model_directory,
        subdirectory='r2d2_learner_v1',
        time_delta_minutes=15,
        objects_to_save=learner.state,
        enable_checkpointing=checkpoint,
    )

    self._snapshotter = tf2_savers.Snapshotter(objects_to_save=None,
                                               time_delta_minutes=15000.,
                                               directory=model_directory)

    policy_network = snt.DeepRNN([
        network,
        lambda qs: trfl.epsilon_greedy(qs, epsilon=epsilon).sample(),
    ])

    actor = actors.RecurrentActor(policy_network, adder)
    observations_per_step = (float(replay_period * batch_size) /
                             samples_per_insert)
    super().__init__(
        actor=actor,
        learner=learner,
        min_observations=replay_period * max(batch_size, min_replay_size),
        observations_per_step=observations_per_step)
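# The replay/demonstration mixing in the constructor above uses
# tf.data.experimental.sample_from_datasets. A standalone illustration of the
# weighting, with toy datasets standing in for the two Reverb tables:
def _example_demonstration_mixing(demonstration_ratio: float = 0.25):
    replay = tf.data.Dataset.from_tensor_slices([0] * 1000)
    demos = tf.data.Dataset.from_tensor_slices([1] * 1000)
    mixed = tf.data.experimental.sample_from_datasets(
        [replay, demos], [1 - demonstration_ratio, demonstration_ratio])
    # Roughly demonstration_ratio of the sampled elements are demonstrations.
    return sum(mixed.take(1000).as_numpy_iterator()) / 1000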
def observe(self, action: types.NestedArray, next_timestep: dm_env.TimeStep):
    if not self._adder:
        return

    extras = {'logP': self._prev_logP, 'policy': self._prev_means}
    extras = tf2_utils.to_numpy_squeeze(extras)
    self._adder.add(action, next_timestep, extras)