def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create the environment and the networks to act with.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec, self._num_critic_heads)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a stochastic behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticSamplingHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  policy_variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, policy_variables, update_period=1000)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      max_in_flight_items=self._max_in_flight_items,
      discount=self._additional_discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      policy_network=behavior_network,
      adder=adder,
      variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def make_adder(self, replay_client: reverb.Client) -> adders.Adder:
  """Creates an adder which handles observations."""
  return adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=self._config.n_step,
      discount=self._config.discount)
def make_adder(self, replay_client: reverb.Client) -> Optional[adders.Adder]:
  """Create an adder which records data generated by the actor/environment."""
  return adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=self._config.n_step,
      discount=self._config.discount)
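# A minimal sketch of the adder protocol the factories above feed into (the
# names `env` and `select_action` are hypothetical stand-ins, not part of the
# snippets above): the NStepTransitionAdder buffers the most recent n_step
# steps and writes compacted transitions to Reverb as the episode unfolds.
import dm_env

from acme import adders as adders_lib

def run_episode(env: dm_env.Environment, adder: adders_lib.Adder,
                select_action) -> None:
  """Plays one episode, recording it through the adder."""
  timestep = env.reset()
  adder.add_first(timestep)        # Record the initial observation.
  while not timestep.last():
    action = select_action(timestep.observation)
    timestep = env.step(action)
    adder.add(action, timestep)    # Record the action and the resulting step.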
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create the environment and behavior networks.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec)

  # Create behavior network by adding some random dithering.
  behavior_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
      networks.ClippedGaussian(self._sigma),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay, n_step=self._n_step, discount=self._discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      behavior_network, adder=adder, variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def make_adder(
    self, replay_client: reverb.Client,
    environment_spec: Optional[specs.EnvironmentSpec],
    policy: Optional[actor_core_lib.FeedForwardPolicy]
) -> Optional[adders.Adder]:
  del environment_spec, policy
  return adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=1,
      discount=self._config.discount)
def make_adder(
    self, replay_client: reverb.Client,
    environment_spec: Optional[specs.EnvironmentSpec],
    policy: Optional[actor_core_lib.FeedForwardPolicy]
) -> Optional[adders.Adder]:
  """Create an adder which records data generated by the actor/environment."""
  del environment_spec, policy
  return adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=self._config.n_step,
      discount=self._config.discount)
def make_adder(
    self,
    replay_client: reverb.Client,
    environment_spec: Optional[specs.EnvironmentSpec],
    policy: Optional[dqn_actor.EpsilonPolicy],
) -> Optional[adders.Adder]:
  """Creates an adder which handles observations."""
  del environment_spec, policy
  return adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=self._config.n_step,
      discount=self._config.discount)
def make_adder(self, replay_client: reverb.Client) -> Optional[adders.Adder]:
  direct_rl_adder = self._rl_agent.make_adder(replay_client)
  if self._config.share_iterator:
    return direct_rl_adder
  ail_adder = adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=1,
      discount=self._config.discount)

  # Some direct RL algorithms (such as PPO) might be passing extra data
  # which we won't be able to process here properly, so we need to ignore it.
  return adders.ForkingAdder(
      [adders.IgnoreExtrasAdder(ail_adder), direct_rl_adder])
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
    epsilon: float,
) -> acme.EnvironmentLoop:
  """The actor process."""
  environment = self._environment_factory(False)
  network = self._network_factory(self._env_spec.actions)

  # Just inline the policy network here.
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  tf2_utils.create_variables(policy_network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.trainable_variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      discount=self._discount,
  )

  # Create the agent.
  actor = actors.FeedForwardActor(policy_network, adder, variable_client)

  # Create the loop to connect environment and agent.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor', save_data=False, steps_key='actor_steps')
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""

  # Build environment, model, network.
  environment = self._environment_factory()
  network = self._network_factory(self._env_spec.actions)
  model = self._model_factory(self._env_spec)

  # Create variable client for communicating with the learner.
  tf2_utils.create_variables(network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'network': network.trainable_variables},
      update_period=self._variable_update_period)

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      discount=self._discount,
  )

  # Create the agent.
  actor = acting.MCTSActor(
      environment_spec=self._env_spec,
      model=model,
      network=network,
      discount=self._discount,
      adder=adder,
      variable_client=variable_client,
      num_simulations=self._num_simulations,
  )

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: hk.Transformed,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon: float = 0.,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    seed: int = 1,
):
  """Initialize the agent."""

  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(
          environment_spec=environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  dataset = datasets.make_reverb_dataset(
      server_address=address,
      environment_spec=environment_spec,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      transition_adder=True)

  def policy(params: hk.Params, key: jnp.ndarray,
             observation: jnp.ndarray) -> jnp.ndarray:
    action_values = network.apply(params, observation)
    return rlax.epsilon_greedy(epsilon).sample(key, action_values)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      obs_spec=environment_spec.observations,
      rng=hk.PRNGSequence(seed),
      optimizer=optax.adam(learning_rate),
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      target_update_period=target_update_period,
      iterator=dataset.as_numpy_iterator(),
      replay_client=reverb.Client(address),
  )

  variable_client = variable_utils.VariableClient(learner, '')

  actor = actors.FeedForwardActor(
      policy=policy,
      rng=hk.PRNGSequence(seed),
      variable_client=variable_client,
      adder=adder)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    policy_network: snt.Module,
    critic_network: snt.Module,
    observation_network: types.TensorTransformation = tf.identity,
    discount: float = 0.99,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_policy_update_period: int = 100,
    target_critic_update_period: int = 100,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    samples_per_insert: float = 32.0,
    policy_loss_module: snt.Module = None,
    policy_optimizer: snt.Optimizer = None,
    critic_optimizer: snt.Optimizer = None,
    n_step: int = 5,
    num_samples: int = 20,
    clipping: bool = True,
    logger: loggers.Logger = None,
    counter: counting.Counter = None,
    checkpoint: bool = True,
    replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    policy_network: the online (optimized) policy.
    critic_network: the online critic.
    observation_network: optional network to transform the observations before
      they are fed into any network.
    discount: discount to use for TD updates.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_policy_update_period: number of updates to perform before updating
      the target policy network.
    target_critic_update_period: number of updates to perform before updating
      the target critic network.
    min_replay_size: minimum replay size before updating.
    max_replay_size: maximum replay size.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    policy_loss_module: configured MPO loss function for the policy
      optimization; defaults to sensible values on the control suite. See
      `acme/tf/losses/mpo.py` for more details.
    policy_optimizer: optimizer to be used on the policy.
    critic_optimizer: optimizer to be used on the critic.
    n_step: number of steps to squash into a single transition.
    num_samples: number of actions to sample when doing a Monte Carlo
      integration with respect to the policy.
    clipping: whether to clip gradients by global norm.
    logger: logging object used to write to logs.
    counter: counter object used to keep track of steps.
    checkpoint: boolean indicating whether to checkpoint the learner.
    replay_table_name: string indicating what name to give the replay table.
  """
  # Create a replay server to add data to. (Use the configurable table name
  # here so the dataset below reads from the same table.)
  replay_table = reverb.Table(
      name=replay_table_name,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset object to learn from.
  dataset = datasets.make_reverb_dataset(
      table=replay_table_name,
      client=reverb.TFClient(address),
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      environment_spec=environment_spec,
      transition_adder=True)

  # Make sure observation network is a Sonnet Module.
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create target networks before creating online/target network variables.
  target_policy_network = copy.deepcopy(policy_network)
  target_critic_network = copy.deepcopy(critic_network)
  target_observation_network = copy.deepcopy(observation_network)

  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

  # Create the behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      policy_network,
      networks.StochasticSamplingHead(),
  ])

  # Create variables.
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_policy_network, [emb_spec])
  tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_observation_network, [obs_spec])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network=behavior_network, adder=adder)

  # Create optimizers.
  policy_optimizer = policy_optimizer or snt.optimizers.Adam(1e-4)
  critic_optimizer = critic_optimizer or snt.optimizers.Adam(1e-4)

  # The learner updates the parameters (and initializes them).
  learner = learning.MPOLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      observation_network=observation_network,
      target_policy_network=target_policy_network,
      target_critic_network=target_critic_network,
      target_observation_network=target_observation_network,
      policy_loss_module=policy_loss_module,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      clipping=clipping,
      discount=discount,
      num_samples=num_samples,
      target_policy_update_period=target_policy_update_period,
      target_critic_update_period=target_critic_update_period,
      dataset=dataset,
      logger=logger,
      counter=counter,
      checkpoint=checkpoint)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    demonstration_dataset: tf.data.Dataset,
    demonstration_ratio: float,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    n_step: int = 5,
    epsilon: tf.Tensor = None,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized).
    demonstration_dataset: tf.data.Dataset producing (timestep, action) tuples
      containing full episodes.
    demonstration_ratio: ratio of transitions coming from demonstrations.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are raised
      before normalizing.
    n_step: number of steps to squash into a single transition.
    epsilon: probability of taking a random action; ignored if a policy
      network is given.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      transition_adder=True)

  # Combine with demonstration dataset.
  transition = functools.partial(
      _n_step_transition_from_episode, n_step=n_step, discount=discount)
  dataset_demos = demonstration_dataset.map(transition)
  dataset = tf.data.experimental.sample_from_datasets(
      [dataset, dataset_demos],
      [1 - demonstration_ratio, demonstration_ratio])

  # Batch and prefetch.
  dataset = dataset.batch(batch_size, drop_remainder=True)
  dataset = dataset.prefetch(prefetch_size)

  # Use constant 0.05 epsilon greedy policy by default.
  if epsilon is None:
    epsilon = tf.Variable(0.05, trainable=False)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = dqn.DQNLearner(
      network=network,
      target_network=target_network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(self,
             environment_spec: specs.EnvironmentSpec,
             policy_network: snt.Module,
             critic_network: snt.Module,
             observation_network: types.TensorTransformation = tf.identity,
             discount: float = 0.99,
             batch_size: int = 256,
             prefetch_size: int = 4,
             target_update_period: int = 100,
             min_replay_size: int = 1000,
             max_replay_size: int = 1000000,
             samples_per_insert: float = 32.0,
             n_step: int = 5,
             sigma: float = 0.3,
             clipping: bool = True,
             logger: loggers.Logger = None,
             counter: counting.Counter = None,
             checkpoint: bool = True,
             replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    policy_network: the online (optimized) policy.
    critic_network: the online critic.
    observation_network: optional network to transform the observations before
      they are fed into any network.
    discount: discount to use for TD updates.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    min_replay_size: minimum replay size before updating.
    max_replay_size: maximum replay size.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    n_step: number of steps to squash into a single transition.
    sigma: standard deviation of zero-mean, Gaussian exploration noise.
    clipping: whether to clip gradients by global norm.
    logger: logger object to be used by learner.
    counter: counter object used to keep track of steps.
    checkpoint: boolean indicating whether to checkpoint the learner.
    replay_table_name: string indicating what name to give the replay table.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=replay_table_name,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      priority_fns={replay_table_name: lambda x: 1.},
      client=reverb.Client(address),
      n_step=n_step,
      discount=discount)

  # The dataset provides an interface to sample from replay.
  dataset = datasets.make_reverb_dataset(
      table=replay_table_name,
      client=reverb.TFClient(address),
      environment_spec=environment_spec,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      transition_adder=True)

  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])  # pytype: disable=wrong-arg-types

  # Make sure observation network is a Sonnet Module.
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create target networks.
  target_policy_network = copy.deepcopy(policy_network)
  target_critic_network = copy.deepcopy(critic_network)
  target_observation_network = copy.deepcopy(observation_network)

  # Create the behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      policy_network,
      networks.ClippedGaussian(sigma),
      networks.ClipToSpec(act_spec),
  ])

  # Create variables.
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_policy_network, [emb_spec])
  tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_observation_network, [obs_spec])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(behavior_network, adder=adder)

  # Create optimizers.
  policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
  critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

  # The learner updates the parameters (and initializes them).
  learner = learning.DDPGLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      observation_network=observation_network,
      target_policy_network=target_policy_network,
      target_critic_network=target_critic_network,
      target_observation_network=target_observation_network,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      clipping=clipping,
      discount=discount,
      target_update_period=target_update_period,
      dataset=dataset,
      counter=counter,
      logger=logger,
      checkpoint=checkpoint,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
"""Creates a single-process replay infrastructure from an environment spec.""" # Create a replay server to add data to. This uses no limiter behavior in # order to allow the Agent interface to handle it. replay_table = reverb.Table( name=replay_table_name, sampler=reverb.selectors.Prioritized(priority_exponent), remover=reverb.selectors.Fifo(), max_size=max_replay_size, rate_limiter=reverb.rate_limiters.MinSize(min_replay_size), signature=adders.NStepTransitionAdder.signature( environment_spec=environment_spec)) server = reverb.Server([replay_table], port=None) # The adder is used to insert observations into replay. address = f'localhost:{server.port}' client = reverb.Client(address) adder = adders.NStepTransitionAdder(client=client, n_step=n_step, discount=discount) # The dataset provides an interface to sample from replay. data_iterator = datasets.make_reverb_dataset( table=replay_table_name, server_address=address, batch_size=batch_size, prefetch_size=prefetch_size, environment_spec=environment_spec, transition_adder=True, ).as_numpy_iterator() return ReverbReplay(server, adder, data_iterator, client)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    batch_size: int = 32,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 100000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon: Optional[float] = 0.05,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    logger: loggers.Logger = None,
    max_gradient_norm: Optional[float] = None,
    expert_data: List[Dict] = None,
) -> None:
  """Initialize the agent."""

  # Create a replay server to add data to. This uses no limiter behavior
  # in order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # Add expert data to the replay memory.
  if expert_data is not None:
    for d in expert_data:
      adder.add_first(d['first'])
      for (action, next_ts) in d['mid']:
        adder.add(np.int32(action), next_ts)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      server_address=address,
      batch_size=batch_size,
      prefetch_size=prefetch_size)

  # Create the epsilon-greedy policy network.
  epsilon = tf.Variable(epsilon)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      target_network=target_network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client,
      max_gradient_norm=max_gradient_norm,
      logger=logger,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def make_adder(self, replay_client: reverb.Client) -> adders.Adder:
  return adders_reverb.NStepTransitionAdder(
      priority_fns={self._config.replay_table_name: None},
      client=replay_client,
      n_step=1,
      discount=self._config.discount)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon: tf.Tensor = None,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    cql_alpha: float = 1.,
    logger: loggers.Logger = None,
    counter: counting.Counter = None,
    checkpoint_subpath: str = '~/acme/',
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized).
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are raised
      before normalizing.
    priority_exponent: exponent used in prioritized sampling.
    n_step: number of steps to squash into a single transition.
    epsilon: probability of taking a random action; ignored if a policy
      network is given.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
    cql_alpha: weight on the conservative (CQL) regularization term.
    logger: logger object to be used by learner.
    counter: counter object used to keep track of steps.
    checkpoint_subpath: directory for the checkpoint.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      transition_adder=True)

  # Use constant 0.05 epsilon greedy policy by default.
  if epsilon is None:
    epsilon = tf.Variable(0.05, trainable=False)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = CQLLearner(
      network=network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      cql_alpha=cql_alpha,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client,
      logger=logger,
      counter=counter,
      checkpoint_subpath=checkpoint_subpath)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 20,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon_init: float = 1.0,
    epsilon_final: float = 0.01,
    epsilon_schedule_timesteps: int = 20000,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    max_gradient_norm: Optional[float] = None,
    logger: loggers.Logger = None,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized).
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are raised
      before normalizing (beta). See https://arxiv.org/pdf/1710.02298.pdf.
    priority_exponent: exponent used in prioritized sampling (omega). See
      https://arxiv.org/pdf/1710.02298.pdf.
    n_step: number of steps to squash into a single transition.
    epsilon_init: initial epsilon value (probability of taking a random
      action).
    epsilon_final: final epsilon value (probability of taking a random
      action).
    epsilon_schedule_timesteps: timesteps to decay epsilon from 'epsilon_init'
      to 'epsilon_final'.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
    max_gradient_norm: used for gradient clipping.
    logger: logger object to be used by learner.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  self._adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = make_reverb_dataset(
      server_address=address,
      batch_size=batch_size,
      prefetch_size=prefetch_size)

  # Create the epsilon-greedy policy network with a decaying schedule.
  policy_network = snt.Sequential([
      network,
      EpsilonGreedyExploration(
          epsilon_init=epsilon_init,
          epsilon_final=epsilon_final,
          epsilon_schedule_timesteps=epsilon_schedule_timesteps),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors_tf2.FeedForwardActor(policy_network, self._adder)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      target_network=target_network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client,
      max_gradient_norm=max_gradient_norm,
      logger=logger,
      checkpoint=False)

  self._saver = tf2_savers.Saver(learner.state)

  # Deterministic (max-Q) actor.
  max_q_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=0.0).sample(),
  ])
  self._deterministic_actor = actors_tf2.FeedForwardActor(max_q_network)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(self,
             environment_spec: specs.EnvironmentSpec,
             policy_network: snt.Module,
             critic_network: snt.Module,
             encoder_network: types.TensorTransformation = tf.identity,
             entropy_coeff: float = 0.01,
             target_update_period: int = 0,
             discount: float = 0.99,
             batch_size: int = 256,
             policy_learn_rate: float = 3e-4,
             critic_learn_rate: float = 5e-4,
             prefetch_size: int = 4,
             min_replay_size: int = 1000,
             max_replay_size: int = 250000,
             samples_per_insert: float = 64.0,
             n_step: int = 5,
             sigma: float = 0.5,
             clipping: bool = True,
             logger: loggers.Logger = None,
             counter: counting.Counter = None,
             checkpoint: bool = True,
             replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    policy_network: the online (optimized) policy.
    critic_network: the online critic.
    encoder_network: optional network to transform the observations before
      they are fed into any network.
    entropy_coeff: weight on the policy-entropy term in the learner loss.
    target_update_period: number of learner steps to perform before updating
      the target networks; if 0, no separate target networks are kept.
    discount: discount to use for TD updates.
    batch_size: batch size for updates.
    policy_learn_rate: learning rate for the policy optimizer.
    critic_learn_rate: learning rate for the critic optimizer.
    prefetch_size: size to prefetch from replay.
    min_replay_size: minimum replay size before updating.
    max_replay_size: maximum replay size.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    n_step: number of steps to squash into a single transition.
    sigma: standard deviation of zero-mean, Gaussian exploration noise.
    clipping: whether to clip gradients by global norm.
    logger: logger object to be used by learner.
    counter: counter object used to keep track of steps.
    checkpoint: boolean indicating whether to checkpoint the learner.
    replay_table_name: string indicating what name to give the replay table.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  dim_actions = np.prod(environment_spec.actions.shape, dtype=int)
  extra_spec = {
      'logP': tf.ones(shape=(1,), dtype=tf.float32),
      'policy': tf.ones(shape=(1, dim_actions), dtype=tf.float32),
  }
  # Remove batch dimensions.
  extra_spec = tf2_utils.squeeze_batch_dim(extra_spec)
  replay_table = reverb.Table(
      name=replay_table_name,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(
          environment_spec, extras_spec=extra_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      priority_fns={replay_table_name: lambda x: 1.},
      client=reverb.Client(address),
      n_step=n_step,
      discount=discount)

  # The dataset provides an interface to sample from replay.
  dataset = datasets.make_reverb_dataset(
      table=replay_table_name,
      server_address=address,
      batch_size=batch_size,
      prefetch_size=prefetch_size)

  # Wrap the encoder in an observation-normalization module.
  observation_network = model.MDPNormalization(environment_spec,
                                               encoder_network)

  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations

  # Create the behavior policy.
  sampling_head = model.SquashedGaussianSamplingHead(act_spec, sigma)
  self._behavior_network = model.PolicyValueBehaviorNet(
      snt.Sequential([observation_network, policy_network]), sampling_head)

  # Create variables.
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])

  # Create the actor which defines how we take actions.
  actor = model.SACFeedForwardActor(self._behavior_network, adder)

  if target_update_period > 0:
    target_policy_network = copy.deepcopy(policy_network)
    target_critic_network = copy.deepcopy(critic_network)
    target_observation_network = copy.deepcopy(observation_network)

    tf2_utils.create_variables(target_policy_network, [emb_spec])
    tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
    tf2_utils.create_variables(target_observation_network, [obs_spec])
  else:
    target_policy_network = policy_network
    target_critic_network = critic_network
    target_observation_network = observation_network

  # Create optimizers.
  policy_optimizer = snt.optimizers.Adam(learning_rate=policy_learn_rate)
  critic_optimizer = snt.optimizers.Adam(learning_rate=critic_learn_rate)

  # The learner updates the parameters (and initializes them).
  learner = learning.SACLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      sampling_head=sampling_head,
      observation_network=observation_network,
      target_policy_network=target_policy_network,
      target_critic_network=target_critic_network,
      target_observation_network=target_observation_network,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      target_update_period=target_update_period,
      learning_rate=policy_learn_rate,
      clipping=clipping,
      entropy_coeff=entropy_coeff,
      discount=discount,
      dataset=dataset,
      counter=counter,
      logger=logger,
      checkpoint=checkpoint,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    params=None,
    logger: loggers.Logger = None,
    checkpoint: bool = True,
    paths: Save_paths = None,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized).
    params: optional dict of hyperparameters; the recognized keys (with the
      defaults used below) are batch_size, prefetch_size,
      target_update_period, samples_per_insert, min_replay_size,
      max_replay_size, importance_sampling_exponent, priority_exponent,
      n_step, epsilon, learning_rate and discount.
    logger: logger object to be used by learner.
    checkpoint: boolean indicating whether to checkpoint the learner.
    paths: directory and experiment name used for checkpointing.
  """
  # Default hyperparameters.
  if params is None:
    params = {
        'batch_size': 256,
        'prefetch_size': 4,
        'target_update_period': 100,
        'samples_per_insert': 32.0,
        'min_replay_size': 1000,
        'max_replay_size': 1000000,
        'importance_sampling_exponent': 0.2,
        'priority_exponent': 0.6,
        'n_step': 5,
        'epsilon': 0.05,
        'learning_rate': 1e-3,
        'discount': 0.99,
    }

  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(params['priority_exponent']),
      remover=reverb.selectors.Fifo(),
      max_size=params['max_replay_size'],
      rate_limiter=reverb.rate_limiters.MinSize(1))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address),
      n_step=params['n_step'],
      discount=params['discount'])

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      batch_size=params['batch_size'],
      prefetch_size=params['prefetch_size'],
      transition_adder=True)

  # Use a constant epsilon-greedy policy (0.05 by default).
  epsilon = tf.Variable(params['epsilon'], trainable=False)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  # tf2_utils.create_variables(network, [environment_spec.observations])
  # tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      target_network=target_network,
      discount=params['discount'],
      importance_sampling_exponent=params['importance_sampling_exponent'],
      learning_rate=params['learning_rate'],
      target_update_period=params['target_update_period'],
      dataset=dataset,
      replay_client=replay_client,
      logger=logger,
      checkpoint=checkpoint)

  if checkpoint:
    self._checkpointer = tf2_savers.Checkpointer(
        add_uid=False,
        objects_to_save=learner.state,
        directory=paths.data_dir,
        subdirectory=paths.experiment_name,
        time_delta_minutes=60.)
  else:
    self._checkpointer = None

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(params['batch_size'], params['min_replay_size']),
      observations_per_step=float(params['batch_size']) /
      params['samples_per_insert'])
def __init__(
    self,
    network: snt.Module,
    model: models.Model,
    optimizer: snt.Optimizer,
    n_step: int,
    discount: float,
    replay_capacity: int,
    num_simulations: int,
    environment_spec: specs.EnvironmentSpec,
    batch_size: int,
):
  # Create a replay server for storing transitions.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=replay_capacity,
      rate_limiter=reverb.rate_limiters.MinSize(1))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  action_spec: specs.DiscreteArray = environment_spec.actions
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      extra_spec={
          'pi': specs.Array(shape=(action_spec.num_values,), dtype=np.float32)
      },
      transition_adder=True)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  tf2_utils.create_variables(network, [environment_spec.observations])

  # Now create the agent components: actor & learner.
  actor = acting.MCTSActor(
      environment_spec=environment_spec,
      model=model,
      network=network,
      discount=discount,
      adder=adder,
      num_simulations=num_simulations,
  )
  learner = learning.AZLearner(
      network=network,
      optimizer=optimizer,
      dataset=dataset,
      discount=discount,
  )

  # The parent class combines these together into one 'agent'.
  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=10,
      observations_per_step=1,
  )
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=replay_table_name,
      sampler=sampler,
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(min_replay_size),
      signature=adders.NStepTransitionAdder.signature(
          environment_spec, extra_spec),
  )
  server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{server.port}'
  client = reverb.Client(address)
  adder = adders.NStepTransitionAdder(client, n_step, discount, priority_fns)

  # The dataset provides an interface to sample from replay.
  data_iterator = datasets.make_reverb_dataset(
      table=replay_table_name,
      server_address=address,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      environment_spec=environment_spec,
      transition_adder=True,
  ).as_numpy_iterator()
  return ReverbReplay(server, adder, data_iterator, client=client)


def make_reverb_online_queue(
    environment_spec: specs.EnvironmentSpec,
def make_adder(replay_client: reverb.Client) -> adders.Adder:
  return adders_reverb.NStepTransitionAdder(
      priority_fns={'default': None},
      client=replay_client,
      n_step=1,
      discount=1)
    rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
    signature=adders.NStepTransitionAdder.signature(environment_spec))
replay_table_name = adders.DEFAULT_PRIORITY_TABLE

# Get the server and address so we can give it to the modules (such as our
# actor) that will interact with the replay buffer.
replay_server = reverb.Server([replay_buffer], port=None)
replay_server_address = 'localhost:%d' % replay_server.port

# Create a 5-step transition adder where in between those steps a discount of
# 0.99 is used (which should be the same discount used for learning).
adder = adders.NStepTransitionAdder(
    priority_fns={replay_table_name: lambda x: 1.},
    client=reverb.Client(replay_server_address),
    n_step=5,
    discount=0.99)

# This connects to the created reverb server; since we used a transition adder
# above, the resulting dataset yields batches of transitions.
dataset = datasets.make_reverb_dataset(
    table=replay_table_name,
    server_address=replay_server_address,
    batch_size=256,
    prefetch_size=4)

# Make sure observation network is a Sonnet Module.
observation_network = tf2_utils.batch_concat
observation_network = tf2_utils.to_sonnet_module(observation_network)
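# To make the 5-step adder above concrete: each stored transition compacts a
# window of steps into a single (o_t, a_t, R, g, o_{t+n}) tuple, where R is
# the discounted n-step return and g the accumulated discount. A small
# illustrative helper (not part of Acme):
import numpy as np

def n_step_return(rewards: np.ndarray, discount: float) -> float:
  """Discounted n-step return: r_1 + g*r_2 + ... + g**(n-1)*r_n."""
  return float(np.sum(rewards * discount ** np.arange(len(rewards))))

# With the settings above (n_step=5, discount=0.99), five rewards of 1.0
# compact to n_step_return(np.ones(5), 0.99) ≈ 4.901, stored alongside an
# accumulated discount of 0.99**5 for bootstrapping from o_{t+5} (assuming
# unit environment discounts along the way).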