def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  environment = self._environment_factory(True)
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._obs_spec])

  policy_network = snt.DeepRNN([
      network,
      lambda qs: tf.cast(tf.argmax(qs, axis=-1), tf.int32),
  ])

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = actors.RecurrentActor(
      policy_network=policy_network, variable_client=variable_client)

  # Create the run loop and return it.
  logger = loggers.make_default_logger(
      'evaluator', save_data=True, steps_key='evaluator_steps')
  counter = counting.Counter(counter, 'evaluator')
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def __init__(self,
             base_agent,
             residual_spec,
             policy_network,
             action_norm,
             action_norm_scale=1.0,
             env=None,
             visible_state_features=()):
  self.base_agent = base_agent
  obs_spec = residual_spec.observations
  tf2_utils.create_variables(policy_network, [obs_spec])
  self.network = policy_network
  self.visible_state_features = visible_state_features
  self.env = env
  self.action_space = ActionSpace(
      action_norm, env=env, scale=action_norm_scale)
  # Reuse stats; normalization scheme may still be different.
  self.action_space.mean = self.base_agent.action_space.mean
  self.action_space.std = self.base_agent.action_space.std
  # Options that may want to be shared between base and residual agent.
  self.binary_grip_action = self.base_agent.binary_grip_action
  # self.grip_action_from_state = self.base_agent.grip_action_from_state
  # self.zero_action_keeps_state = self.base_agent.zero_action_keeps_state
  # self.early_closing = self.base_agent.early_closing
  # For convenience, might want to revisit later.
  self.action_pred_dim = self.base_agent.action_pred_dim
  self.action_target_dim = self.base_agent.action_target_dim
def __init__(self,
             DIAYN_agent: DIAYNAgent.DIAYNAgent,
             environment_spec: specs.EnvironmentSpec,
             action_spec: specs.BoundedArray,
             z_dim: int,
             replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
             replay_server_port: Optional[int] = None,
             ) -> None:
  self._z_dim = z_dim
  z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)
  self._environment_spec = environment_spec

  # Modify the environment_spec to also include the latent variable
  # observation (z).
  self._obs_space = environment_spec.observations
  assert len(self._obs_space.shape) == 1, (
      "Only vector observations are supported for now. "
      f"Observations shape passed: {self._obs_space.shape}")

  self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
  self._agent = dmpo.DistributionalMPO(
      environment_spec=environment_spec,
      policy_network=self._agent_networks['policy'],
      critic_network=self._agent_networks['critic'],
      observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
      extra_modules_to_save={
          'hierarchical_controller':
              self._agent_networks['hierarchical_controller'],
      },
      checkpoint_name='hierarchical_dmpo',
      replay_table_name=replay_table_name,
      replay_server_port=replay_server_port,
      return_action_entropy=True,
  )
  self._DIAYN_agent = DIAYN_agent

  # Create variables for the hierarchical controller.
  tf2_utils.create_variables(
      self._agent_networks['hierarchical_controller'], [self._obs_space])
def _make_network(spec) -> snt.Module:
  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP([50, 50, spec.actions.num_values]),
  ])
  tf2_utils.create_variables(network, [spec.observations])
  return network
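For context, a minimal sketch of how a helper like `_make_network` might be exercised, assuming Acme's fake environments and spec utilities are available (`acme.testing.fakes`, `specs.make_environment_spec`, and the fake-environment parameters below are illustrative assumptions, not part of the snippet above):

# Usage sketch only; the environment and its parameters are illustrative.
import numpy as np

from acme import specs
from acme.testing import fakes

environment = fakes.DiscreteEnvironment(
    num_actions=3, num_observations=10, obs_shape=(10, 5), obs_dtype=np.float32)
environment_spec = specs.make_environment_spec(environment)
network = _make_network(environment_spec)  # Variables are created as a side effect.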
def create_actor_variables_with_fingerprints(
    self,
) -> Dict[str, Dict[str, snt.Module]]:

  actor_networks: Dict[str, Dict[str, snt.Module]] = {
      "values": {},
      "target_values": {},
  }

  # get actor specs
  actor_obs_specs = self._architecture._get_actor_specs()

  # create policy variables for each agent
  for agent_key in self._architecture._actor_agent_keys:
    obs_spec = actor_obs_specs[agent_key]

    # Create variables for value and policy networks.
    tf2_utils.create_variables(
        self._architecture._value_networks[agent_key],
        [obs_spec, self._fingerprint_spec],
    )

    # create target value network variables
    tf2_utils.create_variables(
        self._architecture._target_value_networks[agent_key],
        [obs_spec, self._fingerprint_spec],
    )

  actor_networks["values"] = self._architecture._value_networks
  actor_networks["target_values"] = self._architecture._target_value_networks

  return actor_networks
def test_snapshot_distribution(self):
  """Test that the snapshotter correctly saves and restores snapshots."""
  # Create a test network.
  net1 = snt.Sequential([
      networks.LayerNormMLP([10, 10]),
      networks.MultivariateNormalDiagHead(1)
  ])
  spec = specs.Array([10], dtype=np.float32)
  tf2_utils.create_variables(net1, [spec])

  # Save the test network.
  directory = self.get_tempdir()
  objects_to_save = {'net': net1}
  snapshotter = tf2_savers.Snapshotter(objects_to_save, directory=directory)
  snapshotter.save()

  # Reload the test network.
  net2 = tf.saved_model.load(os.path.join(snapshotter.directory, 'net'))
  inputs = tf2_utils.add_batch_dim(tf2_utils.zeros_like(spec))

  with tf.GradientTape() as tape:
    dist1 = net1(inputs)
    loss1 = tf.math.reduce_sum(dist1.mean() + dist1.variance())
    grads1 = tape.gradient(loss1, net1.trainable_variables)

  with tf.GradientTape() as tape:
    dist2 = net2(inputs)
    loss2 = tf.math.reduce_sum(dist2.mean() + dist2.variance())
    grads2 = tape.gradient(loss2, net2.trainable_variables)

  assert all(tree.map_structure(np.allclose, list(grads1), list(grads2)))
def test_rnn_snapshot(self):
  """Test that the snapshotter correctly saves and restores RNN snapshots."""
  # Create a test network.
  net = snt.LSTM(10)
  spec = specs.Array([10], dtype=np.float32)
  tf2_utils.create_variables(net, [spec])

  # Test that if you add some postprocessing without rerunning
  # create_variables, it still works.
  wrapped_net = snt.DeepRNN([net, lambda x: x])

  for net1 in [net, wrapped_net]:
    # Save the test network.
    directory = self.get_tempdir()
    objects_to_save = {'net': net1}
    snapshotter = tf2_savers.Snapshotter(objects_to_save, directory=directory)
    snapshotter.save()

    # Reload the test network.
    net2 = tf.saved_model.load(os.path.join(snapshotter.directory, 'net'))
    inputs = tf2_utils.add_batch_dim(tf2_utils.zeros_like(spec))

    with tf.GradientTape() as tape:
      outputs1, next_state1 = net1(inputs, net1.initial_state(1))
      loss1 = tf.math.reduce_sum(outputs1)
      grads1 = tape.gradient(loss1, net1.trainable_variables)

    with tf.GradientTape() as tape:
      outputs2, next_state2 = net2(inputs, net2.initial_state(1))
      loss2 = tf.math.reduce_sum(outputs2)
      grads2 = tape.gradient(loss2, net2.trainable_variables)

    assert np.allclose(outputs1, outputs2)
    assert np.allclose(tree.flatten(next_state1), tree.flatten(next_state2))
    assert all(tree.map_structure(np.allclose, list(grads1), list(grads2)))
def test_update(self):
  # Create two instances of the same model.
  actor_model = snt.nets.MLP([50, 30])
  learner_model = snt.nets.MLP([50, 30])

  # Create variables first.
  input_spec = tf.TensorSpec(shape=(28,), dtype=tf.float32)
  tf2_utils.create_variables(actor_model, [input_spec])
  tf2_utils.create_variables(learner_model, [input_spec])

  # Register them as client and source variables, respectively.
  actor_variables = actor_model.variables
  np_learner_variables = [
      tf2_utils.to_numpy(v) for v in learner_model.variables
  ]
  variable_source = fakes.VariableSource(np_learner_variables)
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, {'policy': actor_variables})

  # Now, given some random batch of test input:
  x = tf.random.normal(shape=(8, 28))

  # Before copying variables, the models have different outputs.
  actor_output = actor_model(x).numpy()
  learner_output = learner_model(x).numpy()
  self.assertFalse(np.allclose(actor_output, learner_output))

  # Update the variable client.
  variable_client.update_and_wait()

  # After copying variables (by updating the client), the models are the same.
  actor_output = actor_model(x).numpy()
  learner_output = learner_model(x).numpy()
  self.assertTrue(np.allclose(actor_output, learner_output))
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  # Build environment, model, network.
  environment = self._environment_factory()
  network = self._network_factory(self._env_spec.actions)
  model = self._model_factory(self._env_spec)

  # Create variable client for communicating with the learner.
  tf2_utils.create_variables(network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': network.trainable_variables},
      update_period=self._variable_update_period)

  # Create the agent.
  actor = acting.MCTSActor(
      environment_spec=self._env_spec,
      model=model,
      network=network,
      discount=self._discount,
      variable_client=variable_client,
      num_simulations=self._num_simulations,
  )

  # Create the run loop and return it.
  logger = loggers.make_default_logger('evaluator')
  return acme.EnvironmentLoop(
      environment, actor, counter=counter, logger=logger)
def evaluator(self, variable_source: acme.VariableSource,
              counter: counting.Counter):
  """The evaluation process."""
  environment = self._environment_factory(True)
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._environment_spec.observations])

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = acting.IMPALAActor(
      network=network, variable_client=variable_client)

  # Create the run loop and return it.
  logger = loggers.make_default_logger(
      'evaluator', steps_key='evaluator_steps')
  counter = counting.Counter(counter, 'evaluator')
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def learner(self, queue: reverb.Client, counter: counting.Counter):
  """The learning part of the agent."""
  # Create the networks.
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._environment_spec.observations])

  # The dataset object to learn from.
  dataset = datasets.make_reverb_dataset(
      server_address=queue.server_address,
      batch_size=self._batch_size,
      prefetch_size=self._prefetch_size)

  logger = loggers.make_default_logger('learner', steps_key='learner_steps')
  counter = counting.Counter(counter, 'learner')

  # Return the learning agent.
  learner = learning.IMPALALearner(
      environment_spec=self._environment_spec,
      network=network,
      dataset=dataset,
      discount=self._discount,
      learning_rate=self._learning_rate,
      entropy_cost=self._entropy_cost,
      baseline_cost=self._baseline_cost,
      max_abs_reward=self._max_abs_reward,
      max_gradient_norm=self._max_gradient_norm,
      counter=counter,
      logger=logger,
  )

  return tf2_savers.CheckpointingRunner(
      learner, time_delta_minutes=5, subdirectory='impala_learner')
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and target networks to act with.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec, self._num_critic_heads)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a stochastic behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticSamplingHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  policy_variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, policy_variables, update_period=1000)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      max_in_flight_items=self._max_in_flight_items,
      discount=self._additional_discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      policy_network=behavior_network,
      adder=adder,
      variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def create_critic_variables(self) -> Dict[str, Dict[str, snt.Module]]:
  critic_networks: Dict[str, Dict[str, snt.Module]] = {
      "critics": {},
      "target_critics": {},
  }

  # get critic specs
  embed_specs, act_specs = self._get_critic_specs()

  # create critics
  for agent_key in self._critic_agent_keys:
    # get specs
    emb_spec = embed_specs[agent_key]
    act_spec = act_specs[agent_key]

    # Create variables.
    tf2_utils.create_variables(
        self._critic_networks[agent_key], [emb_spec, act_spec])

    # create target network variables
    tf2_utils.create_variables(
        self._target_critic_networks[agent_key], [emb_spec, act_spec])

  critic_networks["critics"] = self._critic_networks
  critic_networks["target_critics"] = self._target_critic_networks
  return critic_networks
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    replay_capacity: int,
    batch_size: int,
    hidden_sizes: Tuple[int, ...],
    learning_rate: float = 1e-3,
    terminal_tol: float = 1e-3,
):
  self._obs_spec = environment_spec.observations
  self._action_spec = environment_spec.actions

  # Hyperparameters.
  self._batch_size = batch_size
  self._terminal_tol = terminal_tol

  # Modelling.
  self._replay = replay.Replay(replay_capacity)
  self._transition_model = MLPTransitionModel(environment_spec, hidden_sizes)
  self._optimizer = snt.optimizers.Adam(learning_rate)
  self._forward = tf.function(self._transition_model)
  tf2_utils.create_variables(
      self._transition_model, [self._obs_spec, self._action_spec])
  self._variables = self._transition_model.trainable_variables

  # Model state.
  self._needs_reset = True
def main(_):
  # Create an environment and grab the spec.
  environment = atari.environment(FLAGS.game)
  environment_spec = specs.make_environment_spec(environment)

  # Create dataset.
  dataset = atari.dataset(
      path=FLAGS.dataset_path,
      game=FLAGS.game,
      run=FLAGS.run,
      num_shards=FLAGS.num_shards)
  # Discard extra inputs.
  dataset = dataset.map(lambda x: x._replace(data=x.data[:5]))

  # Batch and prefetch.
  dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  # Build network.
  g_network = make_network(environment_spec.actions)
  q_network = make_network(environment_spec.actions)
  network = networks.DiscreteFilteredQNetwork(
      g_network=g_network,
      q_network=q_network,
      threshold=FLAGS.bcq_threshold)
  tf2_utils.create_variables(network, [environment_spec.observations])

  evaluator_network = snt.Sequential([
      q_network,
      lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
  ])

  # Counters.
  counter = counting.Counter()
  learner_counter = counting.Counter(counter, prefix='learner')

  # Create the actor which defines how we take actions.
  evaluation_actor = actors.FeedForwardActor(evaluator_network)
  eval_loop = acme.EnvironmentLoop(
      environment=environment,
      actor=evaluation_actor,
      counter=counter,
      logger=loggers.TerminalLogger('evaluation', time_delta=1.))

  # The learner updates the parameters (and initializes them).
  learner = bcq.DiscreteBCQLearner(
      network=network,
      dataset=dataset,
      learning_rate=FLAGS.learning_rate,
      discount=FLAGS.discount,
      importance_sampling_exponent=FLAGS.importance_sampling_exponent,
      target_update_period=FLAGS.target_update_period,
      counter=counter)

  # Run the environment loop.
  while True:
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    learner_counter.increment(learner_steps=FLAGS.evaluate_every)
    eval_loop.run(FLAGS.evaluation_episodes)
def test_feedforward(self, recurrent: bool):
  model = snt.Linear(42)
  if recurrent:
    model = snt.DeepRNN([model])
  input_spec = specs.Array(shape=(10,), dtype=np.float32)
  tf2_utils.create_variables(model, [input_spec])
  variables: Sequence[tf.Variable] = model.variables
  shapes = [v.shape.as_list() for v in variables]
  self.assertSequenceEqual(shapes, [[42], [10, 42]])
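The `recurrent` argument indicates this test is parameterized over both the plain and the DeepRNN-wrapped module; a sketch of how the harness would typically be wired up with absl's parameterized tests (the class name is hypothetical and the decorator is an assumption, not shown in the snippet above):

# Hypothetical harness; the test body is the one shown above.
from absl.testing import parameterized

class CreateVariablesTest(parameterized.TestCase):

  @parameterized.parameters(True, False)
  def test_feedforward(self, recurrent: bool):
    ...  # Body as in the snippet above.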
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and evaluator networks.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create the evaluation policy, which acts with the mean of the
  # stochastic policy.
  evaluator_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticMeanHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  policy_variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source,
      policy_variables,
      update_period=self._variable_update_period)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  evaluator = actors.FeedForwardActor(
      policy_network=evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')
  observers = self._make_observers() if self._make_observers else ()

  # Create the run loop and return it.
  return acme.EnvironmentLoop(
      environment, evaluator, counter, logger, observers=observers)
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and behavior networks.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec)

  # Create behavior network by adding some random dithering.
  behavior_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
      networks.ClippedGaussian(self._sigma),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay, n_step=self._n_step, discount=self._discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      behavior_network, adder=adder, variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def init(self, environment_spec: specs.EnvironmentSpec):
  """Initialize the networks given an environment spec."""
  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations

  # Create variables for the policy and critic nets.
  _ = utils.create_variables(self.policy_network, [obs_spec])
  _ = utils.create_variables(self.critic_network, [obs_spec, act_spec])
  if self.prior_network is not None:
    _ = utils.create_variables(self.prior_network, [obs_spec])
def setUp(self):
  super().setUp()

  # Create two instances of the same model.
  self._actor_model = snt.nets.MLP(_MLP_LAYERS)
  self._learner_model = snt.nets.MLP(_MLP_LAYERS)

  # Create variables first.
  input_spec = tf.TensorSpec(shape=(_INPUT_SIZE,), dtype=tf.float32)
  tf2_utils.create_variables(self._actor_model, [input_spec])
  tf2_utils.create_variables(self._learner_model, [input_spec])
def main(_):
  wb_run = init_or_resume()

  if FLAGS.seed:
    tf.random.set_seed(FLAGS.seed)

  # Create an environment and grab the spec.
  environment, env_spec = _build_environment(FLAGS.environment_name)

  # Load demonstration dataset.
  raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)
  dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                               FLAGS.n_step_returns, FLAGS.discount)

  # Create the policy network (a Q-network whose greedy action is the policy).
  policy_network = networks.get_default_critic(env_spec)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(policy_network, [env_spec.observations])

  # If the agent is non-autoregressive use epsilon=0 which will be a greedy
  # policy.
  evaluator_network = snt.Sequential([
      policy_network,
      lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
  ])

  # Create the actor which defines how we take actions.
  evaluation_actor = actors.FeedForwardActor(evaluator_network)

  counter = counting.Counter()
  disp, disp_loop = _build_custom_loggers(wb_run)

  eval_loop = EnvironmentLoop(
      environment=environment,
      actor=evaluation_actor,
      counter=counter,
      logger=disp_loop)

  # The learner updates the parameters (and initializes them).
  learner = BCLearner(
      network=policy_network,
      learning_rate=FLAGS.learning_rate,
      dataset=dataset,
      counter=counter)

  # Run the environment loop.
  for _ in tqdm(range(FLAGS.epochs)):
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    eval_loop.run(FLAGS.evaluation_episodes)
    learner.save(tag=FLAGS.logs_tag)
def init(self, environment_spec: specs.EnvironmentSpec):
  """Initialize the networks given an environment spec."""
  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations

  # Create variables for the observation net and, as a side-effect, get a
  # spec describing the embedding space.
  emb_spec = utils.create_variables(self.observation_network, [obs_spec])

  # Create variables for the policy and critic nets.
  _ = utils.create_variables(self.policy_network, [emb_spec])
  _ = utils.create_variables(self.critic_network, [emb_spec, act_spec])
def __init__(self,
             environment_spec: specs.EnvironmentSpec,
             action_spec: specs.BoundedArray,
             z_dim: int) -> None:
  self._z_dim = z_dim
  z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)

  # Modify the environment_spec to also include the latent variable
  # observation (z).
  self._obs_space = environment_spec.observations
  assert len(self._obs_space.shape) == 1, (
      "Only vector observations are supported for now. "
      f"Observations shape passed: {self._obs_space.shape}")

  updated_observations = specs.BoundedArray(
      (self._obs_space.shape[0] + z_dim,),
      dtype=environment_spec.observations.dtype,
      name=environment_spec.observations.name,
      minimum=np.append(environment_spec.observations.minimum, [0] * z_dim),
      maximum=np.append(environment_spec.observations.maximum, [1] * z_dim),
  )
  environment_spec = specs.EnvironmentSpec(
      observations=updated_observations,
      actions=environment_spec.actions,
      rewards=environment_spec.rewards,
      discounts=environment_spec.discounts,
  )

  self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
  self._agent = dmpo.DistributionalMPO(
      environment_spec=environment_spec,
      policy_network=self._agent_networks['policy'],
      critic_network=self._agent_networks['critic'],
      observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
      extra_modules_to_save={
          'discriminator': self._agent_networks['discriminator'],
      },
      return_action_entropy=True,
  )

  self._z_distribution = tfd.Categorical([1] * z_dim)
  self._current_z = self._z_distribution.sample()

  # Create discriminator optimizer.
  self._discriminator_optimizer = snt.optimizers.Adam(1e-4)
  self._discriminator_logger = loggers.make_default_logger('discriminator')

  # Create variables for the discriminator.
  tf2_utils.create_variables(
      self._agent_networks['discriminator'], [self._obs_space])
def make_default_networks(
    environment_spec: specs.EnvironmentSpec,
    *,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    policy_init_scale: float = 0.7,
    critic_init_scale: float = 1e-3,
    critic_num_components: int = 5,
) -> Mapping[str, snt.Module]:
  """Creates networks used by the agent."""
  # Unpack the environment spec to get appropriate shapes, dtypes, etc.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  num_dimensions = np.prod(act_spec.shape, dtype=int)

  # Create the observation network and make sure it's a Sonnet module.
  observation_network = tf2_utils.batch_concat
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create the policy network.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          init_scale=policy_init_scale,
          use_tfd_independent=True)
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          action_network=networks.ClipToSpec(act_spec)),
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.GaussianMixtureHead(
          num_dimensions=1,
          num_components=critic_num_components,
          init_scale=critic_init_scale)
  ])

  # Create network variables.
  # Get embedding spec by creating observation network variables.
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
  }
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.RNNCore,
    queue: adder.Adder,
    counter: Optional[counting.Counter] = None,
    logger: Optional[loggers.Logger] = None,
    discount: float = 0.99,
    n_step_horizon: int = 16,
    learning_rate: float = 1e-3,
    entropy_cost: float = 0.01,
    baseline_cost: float = 0.5,
    max_abs_reward: Optional[float] = None,
    max_gradient_norm: Optional[float] = None,
    verbose_level: Optional[int] = 0,
):
  num_actions = environment_spec.actions.num_values
  self._logger = logger or loggers.TerminalLogger('agent')

  extra_spec = {
      'core_state': network.initial_state(1),
      'logits': tf.ones(shape=(1, num_actions), dtype=tf.float32)
  }
  # Remove batch dimensions.
  extra_spec = tf2_utils.squeeze_batch_dim(extra_spec)
  tf2_utils.create_variables(network, [environment_spec.observations])

  actor = acting.A2CActor(
      environment_spec=environment_spec,
      verbose_level=verbose_level,
      network=network,
      queue=queue)
  learner = learning.A2CLearner(
      environment_spec=environment_spec,
      network=network,
      dataset=queue,
      counter=counter,
      logger=logger,
      discount=discount,
      learning_rate=learning_rate,
      entropy_cost=entropy_cost,
      baseline_cost=baseline_cost,
      max_gradient_norm=max_gradient_norm,
      max_abs_reward=max_abs_reward,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=0,
      observations_per_step=n_step_horizon)
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
    epsilon: float,
) -> acme.EnvironmentLoop:
  """The actor process."""
  environment = self._environment_factory(False)
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._obs_spec])

  policy_network = snt.DeepRNN([
      network,
      lambda qs: tf.cast(trfl.epsilon_greedy(qs, epsilon).sample(), tf.int32),
  ])

  # Component to add things into replay.
  sequence_length = self._burn_in_length + self._trace_length + 1
  adder = adders.SequenceAdder(
      client=replay,
      period=self._replay_period,
      sequence_length=sequence_length,
      delta_encoded=True,
  )

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = actors.RecurrentActor(
      policy_network=policy_network,
      variable_client=variable_client,
      adder=adder)

  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor', save_data=False, steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def make_ope_networks(
    task_id: str,
    environment_spec: EnvironmentSpec,
    **network_params: Dict[str, Any],
) -> snt.Module:
  if task_id.startswith("dm_control"):
    value_func = make_value_func_dm_control(**network_params)
  elif task_id.startswith("bsuite"):
    value_func = make_value_func_bsuite(environment_spec, **network_params)
  else:
    raise ValueError(f"task id {task_id} not known")

  tf2_utils.create_variables(
      value_func, [environment_spec.observations, environment_spec.actions])

  return value_func
def test_scalar_output(self):
  model = tf2_utils.to_sonnet_module(tf.reduce_sum)
  input_spec = specs.Array(shape=(10,), dtype=np.float32)
  expected_spec = tf.TensorSpec(shape=(), dtype=tf.float32)
  output_spec = tf2_utils.create_variables(model, [input_spec])
  self.assertEqual(model.variables, ())
  self.assertEqual(output_spec, expected_spec)
def test_none_output(self):
  model = tf2_utils.to_sonnet_module(lambda x: None)
  input_spec = specs.Array(shape=(10,), dtype=np.float32)
  expected_spec = None
  output_spec = tf2_utils.create_variables(model, [input_spec])
  self.assertEqual(model.variables, ())
  self.assertEqual(output_spec, expected_spec)
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and evaluator networks.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec)

  # Create evaluator network.
  evaluator_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the evaluator; note it will not add experience to replay.
  evaluator = actors.FeedForwardActor(
      evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, evaluator, counter, logger)