Example #1
  def evaluator(
      self,
      variable_source: acme.VariableSource,
      counter: counting.Counter,
  ):
    """The evaluation process."""
    environment = self._environment_factory(True)
    network = self._network_factory(self._environment_spec.actions)

    tf2_utils.create_variables(network, [self._obs_spec])
    policy_network = snt.DeepRNN([
        network,
        lambda qs: tf.cast(tf.argmax(qs, axis=-1), tf.int32),
    ])

    variable_client = tf2_variable_utils.VariableClient(
        client=variable_source,
        variables={'policy': policy_network.variables},
        update_period=self._variable_update_period)

    # Make sure not to use a random policy after checkpoint restoration by
    # assigning variables before running the environment loop.
    variable_client.update_and_wait()

    # Create the agent.
    actor = actors.RecurrentActor(
        policy_network=policy_network, variable_client=variable_client)

    # Create the run loop and return it.
    logger = loggers.make_default_logger(
        'evaluator', save_data=True, steps_key='evaluator_steps')
    counter = counting.Counter(counter, 'evaluator')

    return acme.EnvironmentLoop(environment, actor, counter, logger)
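The recurring pattern in the evaluator and actor examples in this listing is the same three steps: build a network, call tf2_utils.create_variables with the environment's observation spec so the Sonnet module actually instantiates its parameters, then attach a VariableClient that keeps those parameters in sync with a learner. The minimal sketch below distils that pattern outside of any agent class; it assumes the usual Acme import aliases (tf2_utils for acme.tf.utils, tf2_variable_utils for acme.tf.variable_utils) and substitutes the fakes.VariableSource test helper, seen in Example #8, for a real learner.

import numpy as np
import sonnet as snt

from acme import specs
from acme.testing import fakes
from acme.tf import utils as tf2_utils
from acme.tf import variable_utils as tf2_variable_utils

# Build a small policy network; its variables do not exist until created from a spec.
obs_spec = specs.Array(shape=(10,), dtype=np.float32)
policy_network = snt.nets.MLP([50, 5])
tf2_utils.create_variables(policy_network, [obs_spec])

# A fake variable source stands in for the learner process.
source = fakes.VariableSource(
    [tf2_utils.to_numpy(v) for v in policy_network.variables])
variable_client = tf2_variable_utils.VariableClient(
    client=source,
    variables={'policy': policy_network.variables},
    update_period=1000)

# Pull variables once before acting, as every environment-loop example here does.
variable_client.update_and_wait()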
Example #2
  def __init__(self, base_agent, residual_spec, policy_network, action_norm,
               action_norm_scale=1.0, env=None, visible_state_features=()):
    self.base_agent = base_agent
    obs_spec = residual_spec.observations

    tf2_utils.create_variables(policy_network, [obs_spec])
    self.network = policy_network
    self.visible_state_features = visible_state_features

    self.env = env
    self.action_space = ActionSpace(
        action_norm, env=env, scale=action_norm_scale)
    # Reuse stats; normalization scheme may still be different.
    self.action_space.mean = self.base_agent.action_space.mean
    self.action_space.std = self.base_agent.action_space.std

    # Options that may want to be shared between base and residual agent.
    self.binary_grip_action = self.base_agent.binary_grip_action
    # self.grip_action_from_state = self.base_agent.grip_action_from_state
    # self.zero_action_keeps_state = self.base_agent.zero_action_keeps_state
    # self.early_closing = self.base_agent.early_closing

    # For convenience, might want to revisit later.
    self.action_pred_dim = self.base_agent.action_pred_dim
    self.action_target_dim = self.base_agent.action_target_dim
Example #3
    def __init__(self, DIAYN_agent: DIAYNAgent.DIAYNAgent,
                 environment_spec: specs.EnvironmentSpec,
                 action_spec: specs.BoundedArray, z_dim: int,
                 replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
                 replay_server_port: Optional[int] = None,
                 ) -> None:
        self._z_dim = z_dim
        z_spec = specs.BoundedArray((z_dim,), np.float64, minimum=0, maximum=1)
        self._environment_spec = environment_spec
        # Modify the environment_spec to also include the latent variable
        # observation  (z)
        self._obs_space = environment_spec.observations
        assert len(self._obs_space.shape) == 1, (
            "Only vector observations are supported for now. "
            f"Observations shape passed: {self._obs_space.shape}")
        self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
        self._agent = dmpo.DistributionalMPO(
            environment_spec=environment_spec,
            policy_network=self._agent_networks['policy'],
            critic_network=self._agent_networks['critic'],
            observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
            extra_modules_to_save={
                'hierarchical_controller': self._agent_networks['hierarchical_controller'],
            },
            checkpoint_name='hierarchical_dmpo',
            replay_table_name=replay_table_name,
            replay_server_port=replay_server_port,
            return_action_entropy=True,
        )

        self._DIAYN_agent = DIAYN_agent

        # Create variables for the discriminator.
        tf2_utils.create_variables(
            self._agent_networks['hierarchical_controller'],
            [self._obs_space])
Example #4
 def _make_network(spec) -> snt.Module:
     network = snt.Sequential([
         snt.Flatten(),
         snt.nets.MLP([50, 50, spec.actions.num_values]),
     ])
     tf2_utils.create_variables(network, [spec.observations])
     return network
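A quick usage sketch for the helper above: build a dummy environment spec by hand (the shapes here are hypothetical), create the network, and run it on a zero observation with a batch dimension added, mirroring the add_batch_dim/zeros_like idiom from the snapshotter tests in Examples #6 and #7 below. The acme.specs and acme.tf.utils import paths are assumptions.

import numpy as np
from acme import specs
from acme.tf import utils as tf2_utils

env_spec = specs.EnvironmentSpec(
    observations=specs.Array(shape=(4,), dtype=np.float32),
    actions=specs.DiscreteArray(num_values=3),
    rewards=specs.Array(shape=(), dtype=np.float32),
    discounts=specs.BoundedArray(shape=(), dtype=np.float32, minimum=0., maximum=1.))

network = _make_network(env_spec)

# Feed a zero observation with a leading batch dimension of 1.
inputs = tf2_utils.add_batch_dim(tf2_utils.zeros_like(env_spec.observations))
q_values = network(inputs)  # shape [1, num_actions]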
Example #5
    def create_actor_variables_with_fingerprints(
        self,
    ) -> Dict[str, Dict[str, snt.Module]]:

        actor_networks: Dict[str, Dict[str, snt.Module]] = {
            "values": {},
            "target_values": {},
        }

        # get actor specs
        actor_obs_specs = self._architecture._get_actor_specs()

        # create policy variables for each agent
        for agent_key in self._architecture._actor_agent_keys:

            obs_spec = actor_obs_specs[agent_key]

            # Create variables for value and policy networks.
            tf2_utils.create_variables(
                self._architecture._value_networks[agent_key],
                [obs_spec, self._fingerprint_spec],
            )

            # create target value network variables
            tf2_utils.create_variables(
                self._architecture._target_value_networks[agent_key],
                [obs_spec, self._fingerprint_spec],
            )

        actor_networks["values"] = self._architecture._value_networks
        actor_networks[
            "target_values"] = self._architecture._target_value_networks

        return actor_networks
Example #6
  def test_snapshot_distribution(self):
    """Test that snapshotter correctly calls saves/restores snapshots."""
    # Create a test network.
    net1 = snt.Sequential([
        networks.LayerNormMLP([10, 10]),
        networks.MultivariateNormalDiagHead(1)
    ])
    spec = specs.Array([10], dtype=np.float32)
    tf2_utils.create_variables(net1, [spec])

    # Save the test network.
    directory = self.get_tempdir()
    objects_to_save = {'net': net1}
    snapshotter = tf2_savers.Snapshotter(objects_to_save, directory=directory)
    snapshotter.save()

    # Reload the test network.
    net2 = tf.saved_model.load(os.path.join(snapshotter.directory, 'net'))
    inputs = tf2_utils.add_batch_dim(tf2_utils.zeros_like(spec))

    with tf.GradientTape() as tape:
      dist1 = net1(inputs)
      loss1 = tf.math.reduce_sum(dist1.mean() + dist1.variance())
      grads1 = tape.gradient(loss1, net1.trainable_variables)

    with tf.GradientTape() as tape:
      dist2 = net2(inputs)
      loss2 = tf.math.reduce_sum(dist2.mean() + dist2.variance())
      grads2 = tape.gradient(loss2, net2.trainable_variables)

    assert all(tree.map_structure(np.allclose, list(grads1), list(grads2)))
Example #7
  def test_rnn_snapshot(self):
    """Test that snapshotter correctly calls saves/restores snapshots on RNNs."""
    # Create a test network.
    net = snt.LSTM(10)
    spec = specs.Array([10], dtype=np.float32)
    tf2_utils.create_variables(net, [spec])

    # Test that if you add some postprocessing without rerunning
    # create_variables, it still works.
    wrapped_net = snt.DeepRNN([net, lambda x: x])

    for net1 in [net, wrapped_net]:
      # Save the test network.
      directory = self.get_tempdir()
      objects_to_save = {'net': net1}
      snapshotter = tf2_savers.Snapshotter(objects_to_save, directory=directory)
      snapshotter.save()

      # Reload the test network.
      net2 = tf.saved_model.load(os.path.join(snapshotter.directory, 'net'))
      inputs = tf2_utils.add_batch_dim(tf2_utils.zeros_like(spec))

      with tf.GradientTape() as tape:
        outputs1, next_state1 = net1(inputs, net1.initial_state(1))
        loss1 = tf.math.reduce_sum(outputs1)
        grads1 = tape.gradient(loss1, net1.trainable_variables)

      with tf.GradientTape() as tape:
        outputs2, next_state2 = net2(inputs, net2.initial_state(1))
        loss2 = tf.math.reduce_sum(outputs2)
        grads2 = tape.gradient(loss2, net2.trainable_variables)

      assert np.allclose(outputs1, outputs2)
      assert np.allclose(tree.flatten(next_state1), tree.flatten(next_state2))
      assert all(tree.map_structure(np.allclose, list(grads1), list(grads2)))
Example #8
  def test_update(self):
    # Create two instances of the same model.
    actor_model = snt.nets.MLP([50, 30])
    learner_model = snt.nets.MLP([50, 30])

    # Create variables first.
    input_spec = tf.TensorSpec(shape=(28,), dtype=tf.float32)
    tf2_utils.create_variables(actor_model, [input_spec])
    tf2_utils.create_variables(learner_model, [input_spec])

    # Register them as client and source variables, respectively.
    actor_variables = actor_model.variables
    np_learner_variables = [
        tf2_utils.to_numpy(v) for v in learner_model.variables
    ]
    variable_source = fakes.VariableSource(np_learner_variables)
    variable_client = tf2_variable_utils.VariableClient(
        variable_source, {'policy': actor_variables})

    # Now, given some random batch of test input:
    x = tf.random.normal(shape=(8, 28))

    # Before copying variables, the models have different outputs.
    actor_output = actor_model(x).numpy()
    learner_output = learner_model(x).numpy()
    self.assertFalse(np.allclose(actor_output, learner_output))

    # Update the variable client.
    variable_client.update_and_wait()

    # After copying variables (by updating the client), the models are the same.
    actor_output = actor_model(x).numpy()
    learner_output = learner_model(x).numpy()
    self.assertTrue(np.allclose(actor_output, learner_output))
Example #9
    def evaluator(
        self,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The evaluation process."""

        # Build environment, model, network.
        environment = self._environment_factory()
        network = self._network_factory(self._env_spec.actions)
        model = self._model_factory(self._env_spec)

        # Create variable client for communicating with the learner.
        tf2_utils.create_variables(network, [self._env_spec.observations])
        variable_client = tf2_variable_utils.VariableClient(
            client=variable_source,
            variables={'policy': network.trainable_variables},
            update_period=self._variable_update_period)

        # Create the agent.
        actor = acting.MCTSActor(
            environment_spec=self._env_spec,
            model=model,
            network=network,
            discount=self._discount,
            variable_client=variable_client,
            num_simulations=self._num_simulations,
        )

        # Create the run loop and return it.
        logger = loggers.make_default_logger('evaluator')
        return acme.EnvironmentLoop(environment,
                                    actor,
                                    counter=counter,
                                    logger=logger)
Example #10
  def evaluator(self, variable_source: acme.VariableSource,
                counter: counting.Counter):
    """The evaluation process."""
    environment = self._environment_factory(True)
    network = self._network_factory(self._environment_spec.actions)
    tf2_utils.create_variables(network, [self._environment_spec.observations])

    variable_client = tf2_variable_utils.VariableClient(
        client=variable_source,
        variables={'policy': network.variables},
        update_period=self._variable_update_period)

    # Make sure not to use a random policy after checkpoint restoration by
    # assigning variables before running the environment loop.
    variable_client.update_and_wait()

    # Create the agent.
    actor = acting.IMPALAActor(
        network=network, variable_client=variable_client)

    # Create the run loop and return it.
    logger = loggers.make_default_logger(
        'evaluator', steps_key='evaluator_steps')
    counter = counting.Counter(counter, 'evaluator')
    return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #11
  def learner(self, queue: reverb.Client, counter: counting.Counter):
    """The Learning part of the agent."""
    # Use architect and create the environment.
    # Create the networks.
    network = self._network_factory(self._environment_spec.actions)
    tf2_utils.create_variables(network, [self._environment_spec.observations])

    # The dataset object to learn from.
    dataset = datasets.make_reverb_dataset(
        server_address=queue.server_address,
        batch_size=self._batch_size,
        prefetch_size=self._prefetch_size)

    logger = loggers.make_default_logger('learner', steps_key='learner_steps')
    counter = counting.Counter(counter, 'learner')

    # Return the learning agent.
    learner = learning.IMPALALearner(
        environment_spec=self._environment_spec,
        network=network,
        dataset=dataset,
        discount=self._discount,
        learning_rate=self._learning_rate,
        entropy_cost=self._entropy_cost,
        baseline_cost=self._baseline_cost,
        max_abs_reward=self._max_abs_reward,
        max_gradient_norm=self._max_gradient_norm,
        counter=counter,
        logger=logger,
    )

    return tf2_savers.CheckpointingRunner(learner,
                                          time_delta_minutes=5,
                                          subdirectory='impala_learner')
Example #12
    def actor(
        self,
        replay: reverb.Client,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ) -> acme.EnvironmentLoop:
        """The actor process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and target networks to act with.
        environment = self._environment_factory(False)
        agent_networks = self._network_factory(action_spec,
                                               self._num_critic_heads)

        # Make sure observation network is defined.
        observation_network = agent_networks.get('observation', tf.identity)

        # Create a stochastic behavior policy.
        behavior_network = snt.Sequential([
            observation_network,
            agent_networks['policy'],
            networks.StochasticSamplingHead(),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(behavior_network, [observation_spec])
        policy_variables = {'policy': behavior_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(variable_source,
                                                            policy_variables,
                                                            update_period=1000)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Component to add things into replay.
        adder = adders.NStepTransitionAdder(
            client=replay,
            n_step=self._n_step,
            max_in_flight_items=self._max_in_flight_items,
            discount=self._additional_discount)

        # Create the agent.
        actor = actors.FeedForwardActor(policy_network=behavior_network,
                                        adder=adder,
                                        variable_client=variable_client)

        # Create logger and counter; actors will not spam bigtable.
        counter = counting.Counter(counter, 'actor')
        logger = loggers.make_default_logger('actor',
                                             save_data=False,
                                             time_delta=self._log_every,
                                             steps_key='actor_steps')

        # Create the run loop and return it.
        return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #13
    def create_critic_variables(self) -> Dict[str, Dict[str, snt.Module]]:

        critic_networks: Dict[str, Dict[str, snt.Module]] = {
            "critics": {},
            "target_critics": {},
        }

        # get critic specs
        embed_specs, act_specs = self._get_critic_specs()

        # create critics
        for agent_key in self._critic_agent_keys:

            # get specs
            emb_spec = embed_specs[agent_key]
            act_spec = act_specs[agent_key]

            # Create variables.
            tf2_utils.create_variables(self._critic_networks[agent_key],
                                       [emb_spec, act_spec])

            # create target network variables
            tf2_utils.create_variables(self._target_critic_networks[agent_key],
                                       [emb_spec, act_spec])

        critic_networks["critics"] = self._critic_networks
        critic_networks["target_critics"] = self._target_critic_networks
        return critic_networks
Example #14
  def __init__(
      self,
      environment_spec: specs.EnvironmentSpec,
      replay_capacity: int,
      batch_size: int,
      hidden_sizes: Tuple[int, ...],
      learning_rate: float = 1e-3,
      terminal_tol: float = 1e-3,
  ):
    self._obs_spec = environment_spec.observations
    self._action_spec = environment_spec.actions
    # Hyperparameters.
    self._batch_size = batch_size
    self._terminal_tol = terminal_tol

    # Modelling
    self._replay = replay.Replay(replay_capacity)
    self._transition_model = MLPTransitionModel(environment_spec, hidden_sizes)
    self._optimizer = snt.optimizers.Adam(learning_rate)
    self._forward = tf.function(self._transition_model)
    tf2_utils.create_variables(
        self._transition_model, [self._obs_spec, self._action_spec])
    self._variables = self._transition_model.trainable_variables

    # Model state.
    self._needs_reset = True
Example #15
File: run_bcq.py  Project: deepmind/acme
def main(_):
    # Create an environment and grab the spec.
    environment = atari.environment(FLAGS.game)
    environment_spec = specs.make_environment_spec(environment)

    # Create dataset.
    dataset = atari.dataset(path=FLAGS.dataset_path,
                            game=FLAGS.game,
                            run=FLAGS.run,
                            num_shards=FLAGS.num_shards)
    # Discard extra inputs
    dataset = dataset.map(lambda x: x._replace(data=x.data[:5]))

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Build network.
    g_network = make_network(environment_spec.actions)
    q_network = make_network(environment_spec.actions)
    network = networks.DiscreteFilteredQNetwork(g_network=g_network,
                                                q_network=q_network,
                                                threshold=FLAGS.bcq_threshold)
    tf2_utils.create_variables(network, [environment_spec.observations])

    evaluator_network = snt.Sequential([
        q_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Counters.
    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluation_network,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = bcq.DiscreteBCQLearner(
        network=network,
        dataset=dataset,
        learning_rate=FLAGS.learning_rate,
        discount=FLAGS.discount,
        importance_sampling_exponent=FLAGS.importance_sampling_exponent,
        target_update_period=FLAGS.target_update_period,
        counter=counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
Example #16
 def test_feedforward(self, recurrent: bool):
   model = snt.Linear(42)
   if recurrent:
     model = snt.DeepRNN([model])
   input_spec = specs.Array(shape=(10,), dtype=np.float32)
   tf2_utils.create_variables(model, [input_spec])
   variables: Sequence[tf.Variable] = model.variables
   shapes = [v.shape.as_list() for v in variables]
   self.assertSequenceEqual(shapes, [[42], [10, 42]])
Example #17
  def evaluator(
      self,
      variable_source: acme.VariableSource,
      counter: counting.Counter,
  ):
    """The evaluation process."""

    action_spec = self._environment_spec.actions
    observation_spec = self._environment_spec.observations

    # Create environment and target networks to act with.
    environment = self._environment_factory(True)
    agent_networks = self._network_factory(action_spec)

    # Make sure observation network is defined.
    observation_network = agent_networks.get('observation', tf.identity)

    # Create a stochastic behavior policy.
    evaluator_network = snt.Sequential([
        observation_network,
        agent_networks['policy'],
        networks.StochasticMeanHead(),
    ])

    # Ensure network variables are created.
    tf2_utils.create_variables(evaluator_network, [observation_spec])
    policy_variables = {'policy': evaluator_network.variables}

    # Create the variable client responsible for keeping the actor up-to-date.
    variable_client = tf2_variable_utils.VariableClient(
        variable_source,
        policy_variables,
        update_period=self._variable_update_period)

    # Make sure not to evaluate a random actor by assigning variables before
    # running the environment loop.
    variable_client.update_and_wait()

    # Create the agent.
    evaluator = actors.FeedForwardActor(
        policy_network=evaluator_network, variable_client=variable_client)

    # Create logger and counter.
    counter = counting.Counter(counter, 'evaluator')
    logger = loggers.make_default_logger(
        'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')
    observers = self._make_observers() if self._make_observers else ()

    # Create the run loop and return it.
    return acme.EnvironmentLoop(
        environment,
        evaluator,
        counter,
        logger,
        observers=observers)
Example #18
    def actor(
        self,
        replay: reverb.Client,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The actor process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and behavior networks
        environment = self._environment_factory(False)
        agent_networks = self._network_factory(action_spec)

        # Create behavior network by adding some random dithering.
        behavior_network = snt.Sequential([
            agent_networks.get('observation', tf.identity),
            agent_networks.get('policy'),
            networks.ClippedGaussian(self._sigma),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(behavior_network, [observation_spec])
        variables = {'policy': behavior_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(
            variable_source,
            variables,
            update_period=self._variable_update_period)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Component to add things into replay.
        adder = adders.NStepTransitionAdder(client=replay,
                                            n_step=self._n_step,
                                            discount=self._discount)

        # Create the agent.
        actor = actors.FeedForwardActor(behavior_network,
                                        adder=adder,
                                        variable_client=variable_client)

        # Create logger and counter; actors will not spam bigtable.
        counter = counting.Counter(counter, 'actor')
        logger = loggers.make_default_logger('actor',
                                             save_data=False,
                                             time_delta=self._log_every,
                                             steps_key='actor_steps')

        # Create the loop to connect environment and agent.
        return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #19
    def init(self, environment_spec: specs.EnvironmentSpec):
        """Initialize the networks given an environment spec."""
        # Get observation and action specs.
        act_spec = environment_spec.actions
        obs_spec = environment_spec.observations

        # Create variables for the policy and critic nets.
        _ = utils.create_variables(self.policy_network, [obs_spec])
        _ = utils.create_variables(self.critic_network, [obs_spec, act_spec])
        if self.prior_network is not None:
            _ = utils.create_variables(self.prior_network, [obs_spec])
Example #20
    def setUp(self):
        super().setUp()

        # Create two instances of the same model.
        self._actor_model = snt.nets.MLP(_MLP_LAYERS)
        self._learner_model = snt.nets.MLP(_MLP_LAYERS)

        # Create variables first.
        input_spec = tf.TensorSpec(shape=(_INPUT_SIZE, ), dtype=tf.float32)
        tf2_utils.create_variables(self._actor_model, [input_spec])
        tf2_utils.create_variables(self._learner_model, [input_spec])
Example #21
def main(_):
    wb_run = init_or_resume()

    if FLAGS.seed:
        tf.random.set_seed(FLAGS.seed)

    # Create an environment and grab the spec.
    environment, env_spec = _build_environment(FLAGS.environment_name)

    # Load demonstration dataset.
    raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)

    dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                                 FLAGS.n_step_returns, FLAGS.discount)

    # Create the policy and critic networks.
    policy_network = networks.get_default_critic(env_spec)

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [env_spec.observations])

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Create the actor which defines how we take actions.
    evaluation_actor = actors.FeedForwardActor(evaluator_network)

    counter = counting.Counter()

    disp, disp_loop = _build_custom_loggers(wb_run)

    eval_loop = EnvironmentLoop(environment=environment,
                                actor=evaluation_actor,
                                counter=counter,
                                logger=disp_loop)

    # The learner updates the parameters (and initializes them).
    learner = BCLearner(network=policy_network,
                        learning_rate=FLAGS.learning_rate,
                        dataset=dataset,
                        counter=counter)

    # Run the environment loop.
    for _ in tqdm(range(FLAGS.epochs)):
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        eval_loop.run(FLAGS.evaluation_episodes)

    learner.save(tag=FLAGS.logs_tag)
Example #22
  def init(self, environment_spec: specs.EnvironmentSpec):
    """Initialize the networks given an environment spec."""
    # Get observation and action specs.
    act_spec = environment_spec.actions
    obs_spec = environment_spec.observations

    # Create variables for the observation net and, as a side-effect, get a
    # spec describing the embedding space.
    emb_spec = utils.create_variables(self.observation_network, [obs_spec])

    # Create variables for the policy and critic nets.
    _ = utils.create_variables(self.policy_network, [emb_spec])
    _ = utils.create_variables(self.critic_network, [emb_spec, act_spec])
Example #23
    def __init__(self, environment_spec: specs.EnvironmentSpec,
                 action_spec: specs.BoundedArray, z_dim: int) -> None:
        self._z_dim = z_dim
        z_spec = specs.BoundedArray((z_dim, ),
                                    np.float64,
                                    minimum=0,
                                    maximum=1)
        # Modify the environment_spec to also include the latent variable
        # observation  (z)
        self._obs_space = environment_spec.observations
        assert len(self._obs_space.shape) == 1, (
            "Only vector observations are supported for now. "
            f"Observations shape passed: {self._obs_space.shape}")
        updated_observations = specs.BoundedArray(
            (self._obs_space.shape[0] + z_dim, ),
            dtype=environment_spec.observations.dtype,
            name=environment_spec.observations.name,
            minimum=np.append(environment_spec.observations.minimum,
                              [0] * z_dim),
            maximum=np.append(environment_spec.observations.maximum,
                              [1] * z_dim),
        )
        environment_spec = specs.EnvironmentSpec(
            observations=updated_observations,
            actions=environment_spec.actions,
            rewards=environment_spec.rewards,
            discounts=environment_spec.discounts,
        )
        self._agent_networks = make_feed_forward_networks(action_spec, z_spec)
        self._agent = dmpo.DistributionalMPO(
            environment_spec=environment_spec,
            policy_network=self._agent_networks['policy'],
            critic_network=self._agent_networks['critic'],
            observation_network=self._agent_networks['observation'],  # pytype: disable=wrong-arg-types
            extra_modules_to_save={
                'discriminator': self._agent_networks['discriminator'],
            },
            return_action_entropy=True,
        )

        self._z_distribution = tfd.Categorical([1] * z_dim)
        self._current_z = self._z_distribution.sample()

        # Create discriminator optimizer.
        self._discriminator_optimizer = snt.optimizers.Adam(1e-4)
        self._discriminator_logger = loggers.make_default_logger(
            'discriminator')

        # Create variables for the discriminator.
        tf2_utils.create_variables(self._agent_networks['discriminator'],
                                   [self._obs_space])
Example #24
File: networks.py  Project: deepmind/acme
def make_default_networks(
    environment_spec: specs.EnvironmentSpec,
    *,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    policy_init_scale: float = 0.7,
    critic_init_scale: float = 1e-3,
    critic_num_components: int = 5,
) -> Mapping[str, snt.Module]:
    """Creates networks used by the agent."""

    # Unpack the environment spec to get appropriate shapes, dtypes, etc.
    act_spec = environment_spec.actions
    obs_spec = environment_spec.observations
    num_dimensions = np.prod(act_spec.shape, dtype=int)

    # Create the observation network and make sure it's a Sonnet module.
    observation_network = tf2_utils.batch_concat
    observation_network = tf2_utils.to_sonnet_module(observation_network)

    # Create the policy network.
    policy_network = snt.Sequential([
        networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
        networks.MultivariateNormalDiagHead(num_dimensions,
                                            init_scale=policy_init_scale,
                                            use_tfd_independent=True)
    ])

    # The multiplexer concatenates the (maybe transformed) observations/actions.
    critic_network = snt.Sequential([
        networks.CriticMultiplexer(
            action_network=networks.ClipToSpec(act_spec)),
        networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
        networks.GaussianMixtureHead(num_dimensions=1,
                                     num_components=critic_num_components,
                                     init_scale=critic_init_scale)
    ])

    # Create network variables.
    # Get embedding spec by creating observation network variables.
    emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])
    tf2_utils.create_variables(policy_network, [emb_spec])
    tf2_utils.create_variables(critic_network, [emb_spec, act_spec])

    return {
        'policy': policy_network,
        'critic': critic_network,
        'observation': observation_network,
    }
Example #25
File: agent.py  Project: srsohn/mtsgi
    def __init__(
        self,
        environment_spec: specs.EnvironmentSpec,
        network: snt.RNNCore,
        queue: adder.Adder,
        counter: counting.Counter = None,
        logger: loggers.Logger = None,
        discount: float = 0.99,
        n_step_horizon: int = 16,
        learning_rate: float = 1e-3,
        entropy_cost: float = 0.01,
        baseline_cost: float = 0.5,
        max_abs_reward: Optional[float] = None,
        max_gradient_norm: Optional[float] = None,
        verbose_level: Optional[int] = 0,
    ):
        num_actions = environment_spec.actions.num_values
        self._logger = logger or loggers.TerminalLogger('agent')

        extra_spec = {
            'core_state': network.initial_state(1),
            'logits': tf.ones(shape=(1, num_actions), dtype=tf.float32)
        }
        # Remove batch dimensions.
        extra_spec = tf2_utils.squeeze_batch_dim(extra_spec)
        tf2_utils.create_variables(network, [environment_spec.observations])

        actor = acting.A2CActor(environment_spec=environment_spec,
                                verbose_level=verbose_level,
                                network=network,
                                queue=queue)
        learner = learning.A2CLearner(
            environment_spec=environment_spec,
            network=network,
            dataset=queue,
            counter=counter,
            logger=logger,
            discount=discount,
            learning_rate=learning_rate,
            entropy_cost=entropy_cost,
            baseline_cost=baseline_cost,
            max_gradient_norm=max_gradient_norm,
            max_abs_reward=max_abs_reward,
        )

        super().__init__(actor=actor,
                         learner=learner,
                         min_observations=0,
                         observations_per_step=n_step_horizon)
Example #26
  def actor(
      self,
      replay: reverb.Client,
      variable_source: acme.VariableSource,
      counter: counting.Counter,
      epsilon: float,
  ) -> acme.EnvironmentLoop:
    """The actor process."""
    environment = self._environment_factory(False)
    network = self._network_factory(self._environment_spec.actions)

    tf2_utils.create_variables(network, [self._obs_spec])

    policy_network = snt.DeepRNN([
        network,
        lambda qs: tf.cast(trfl.epsilon_greedy(qs, epsilon).sample(), tf.int32),
    ])

    # Component to add things into replay.
    sequence_length = self._burn_in_length + self._trace_length + 1
    adder = adders.SequenceAdder(
        client=replay,
        period=self._replay_period,
        sequence_length=sequence_length,
        delta_encoded=True,
    )

    variable_client = tf2_variable_utils.VariableClient(
        client=variable_source,
        variables={'policy': policy_network.variables},
        update_period=self._variable_update_period)

    # Make sure not to use a random policy after checkpoint restoration by
    # assigning variables before running the environment loop.
    variable_client.update_and_wait()

    # Create the agent.
    actor = actors.RecurrentActor(
        policy_network=policy_network,
        variable_client=variable_client,
        adder=adder)

    counter = counting.Counter(counter, 'actor')
    logger = loggers.make_default_logger(
        'actor', save_data=False, steps_key='actor_steps')

    # Create the loop to connect environment and agent.
    return acme.EnvironmentLoop(environment, actor, counter, logger)
Example #27
def make_ope_networks(
    task_id: str,
    environment_spec: EnvironmentSpec,
    **network_params: Dict[str, Any],
) -> snt.Module:
    if task_id.startswith("dm_control"):
        value_func = make_value_func_dm_control(**network_params)
    elif task_id.startswith("bsuite"):
        value_func = make_value_func_bsuite(environment_spec, **network_params)
    else:
        raise ValueError(f"task id {task_id} not known")

    tf2_utils.create_variables(
        value_func, [environment_spec.observations, environment_spec.actions])

    return value_func
Example #28
 def test_scalar_output(self):
   model = tf2_utils.to_sonnet_module(tf.reduce_sum)
   input_spec = specs.Array(shape=(10,), dtype=np.float32)
   expected_spec = tf.TensorSpec(shape=(), dtype=tf.float32)
   output_spec = tf2_utils.create_variables(model, [input_spec])
   self.assertEqual(model.variables, ())
   self.assertEqual(output_spec, expected_spec)
Example #29
 def test_none_output(self):
   model = tf2_utils.to_sonnet_module(lambda x: None)
   input_spec = specs.Array(shape=(10,), dtype=np.float32)
   expected_spec = None
   output_spec = tf2_utils.create_variables(model, [input_spec])
   self.assertEqual(model.variables, ())
   self.assertEqual(output_spec, expected_spec)
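Examples #22, #24, #28 and #29 all exercise the return value of create_variables: the tests above show it is a tf.TensorSpec describing the module's output (or None when the module returns nothing), which is what lets an observation network's embedding spec be chained into the policy and critic networks. A distilled sketch of that chaining, assuming the same import aliases as above and hypothetical layer sizes:

import numpy as np
import sonnet as snt
from acme import specs
from acme.tf import utils as tf2_utils

obs_spec = specs.Array(shape=(10,), dtype=np.float32)
observation_network = snt.nets.MLP([32])   # hypothetical embedding network
policy_network = snt.nets.MLP([32, 4])     # hypothetical policy head

# create_variables returns a spec for the embedding space as a side effect...
emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])
# ...which can be passed straight on as the input spec of downstream networks.
tf2_utils.create_variables(policy_network, [emb_spec])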
Example #30
    def evaluator(
        self,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The evaluation process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create environment and evaluator networks
        environment = self._environment_factory(True)
        agent_networks = self._network_factory(action_spec)

        # Create evaluator network.
        evaluator_network = snt.Sequential([
            agent_networks.get('observation', tf.identity),
            agent_networks.get('policy'),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(evaluator_network, [observation_spec])
        variables = {'policy': evaluator_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(
            variable_source,
            variables,
            update_period=self._variable_update_period)

        # Make sure not to evaluate a random actor by assigning variables before
        # running the environment loop.
        variable_client.update_and_wait()

        # Create the evaluator; note it will not add experience to replay.
        evaluator = actors.FeedForwardActor(evaluator_network,
                                            variable_client=variable_client)

        # Create logger and counter.
        counter = counting.Counter(counter, 'evaluator')
        logger = loggers.make_default_logger('evaluator',
                                             time_delta=self._log_every,
                                             steps_key='evaluator_steps')

        # Create the run loop and return it.
        return acme.EnvironmentLoop(environment, evaluator, counter, logger)