Example #1
    def actor(
        self,
        replay: reverb.Client,
        variable_source: acme.VariableSource,
        counter: counting.Counter,
    ):
        """The actor process."""

        action_spec = self._environment_spec.actions
        observation_spec = self._environment_spec.observations

        # Create the environment and the agent networks.
        environment = self._environment_factory(False)
        agent_networks = self._network_factory(action_spec)

        # Create behavior network by adding some random dithering.
        behavior_network = snt.Sequential([
            agent_networks.get('observation', tf.identity),
            agent_networks.get('policy'),
            networks.ClippedGaussian(self._sigma),
        ])

        # Ensure network variables are created.
        tf2_utils.create_variables(behavior_network, [observation_spec])
        variables = {'policy': behavior_network.variables}

        # Create the variable client responsible for keeping the actor up-to-date.
        variable_client = tf2_variable_utils.VariableClient(
            variable_source,
            variables,
            update_period=self._variable_update_period)

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

        # Component to add things into replay.
        adder = adders.NStepTransitionAdder(client=replay,
                                            n_step=self._n_step,
                                            discount=self._discount)

        # Create the agent.
        actor = actors.FeedForwardActor(behavior_network,
                                        adder=adder,
                                        variable_client=variable_client)

        # Create logger and counter; actors will not spam bigtable.
        counter = counting.Counter(counter, 'actor')
        logger = loggers.make_default_logger('actor',
                                             save_data=False,
                                             time_delta=self._log_every,
                                             steps_key='actor_steps')

        # Create the loop to connect environment and agent.
        return acme.EnvironmentLoop(environment, actor, counter, logger)
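For context, ClippedGaussian is the dithering step in the behavior network above: it perturbs the greedy action with zero-mean Gaussian noise. A minimal standalone sketch, assuming acme and Sonnet are installed and actions live in [-1, 1]:

import tensorflow as tf
from acme.tf import networks

# Exploration head: adds clipped, zero-mean Gaussian noise to actions.
dither = networks.ClippedGaussian(0.3)  # stddev sigma = 0.3

greedy_actions = tf.constant([[0.9, -0.2]])  # batch of deterministic outputs
noisy_actions = dither(greedy_actions)       # perturbed actions for exploration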
Example #2
  def make_policy(
      self,
      environment_spec: specs.EnvironmentSpec,
      sigma: float = 0.0,
  ) -> snt.Module:
    """Create a single network which evaluates the policy."""
    # Stack the observation and policy networks.
    stack = [
        self.observation_network,
        self.policy_network,
    ]

    # If a stochastic/non-greedy policy is requested, add Gaussian noise on
    # top to enable a simple form of exploration.
    # TODO(mwhoffman): Refactor this to remove it from the class.
    if sigma > 0.0:
      stack += [
          network_utils.ClippedGaussian(sigma),
          network_utils.ClipToSpec(environment_spec.actions),
      ]

    # Return a network which sequentially evaluates everything in the stack.
    return snt.Sequential(stack)
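A hedged usage sketch of make_policy, assuming `agent` is an instance of the class above and `environment` is a dm_env environment:

from acme import specs

environment_spec = specs.make_environment_spec(environment)

# Greedy policy for evaluation (sigma defaults to 0.0, so no noise is stacked).
eval_policy = agent.make_policy(environment_spec)

# Exploratory policy for data collection: ClippedGaussian noise plus
# ClipToSpec are appended on top of the greedy stack.
behavior_policy = agent.make_policy(environment_spec, sigma=0.3)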
Example #3
    def __init__(self,
                 environment_spec: specs.EnvironmentSpec,
                 policy_network: snt.Module,
                 critic_network: snt.Module,
                 observation_network: types.TensorTransformation = tf.identity,
                 discount: float = 0.99,
                 batch_size: int = 256,
                 prefetch_size: int = 4,
                 target_update_period: int = 100,
                 min_replay_size: int = 1000,
                 max_replay_size: int = 1000000,
                 samples_per_insert: float = 32.0,
                 n_step: int = 5,
                 sigma: float = 0.3,
                 clipping: bool = True,
                 logger: loggers.Logger = None,
                 counter: counting.Counter = None,
                 checkpoint: bool = True,
                 replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
        """Initialize the agent.

        Args:
          environment_spec: description of the actions, observations, etc.
          policy_network: the online (optimized) policy.
          critic_network: the online critic.
          observation_network: optional network to transform the observations
            before they are fed into any network.
          discount: discount to use for TD updates.
          batch_size: batch size for updates.
          prefetch_size: size to prefetch from replay.
          target_update_period: number of learner steps to perform before
            updating the target networks.
          min_replay_size: minimum replay size before updating.
          max_replay_size: maximum replay size.
          samples_per_insert: number of samples to take from replay for every
            insert that is made.
          n_step: number of steps to squash into a single transition.
          sigma: standard deviation of the zero-mean Gaussian exploration noise.
          clipping: whether to clip gradients by global norm.
          logger: logger object to be used by the learner.
          counter: counter object used to keep track of steps.
          checkpoint: boolean indicating whether to checkpoint the learner.
          replay_table_name: string indicating what name to give the replay
            table.
        """
        # Create a replay server to add data to. This uses no limiter behavior in
        # order to allow the Agent interface to handle it.
        replay_table = reverb.Table(
            name=replay_table_name,
            sampler=reverb.selectors.Uniform(),
            remover=reverb.selectors.Fifo(),
            max_size=max_replay_size,
            rate_limiter=reverb.rate_limiters.MinSize(1),
            signature=adders.NStepTransitionAdder.signature(environment_spec))
        self._server = reverb.Server([replay_table], port=None)

        # The adder is used to insert observations into replay.
        address = f'localhost:{self._server.port}'
        adder = adders.NStepTransitionAdder(
            priority_fns={replay_table_name: lambda x: 1.},
            client=reverb.Client(address),
            n_step=n_step,
            discount=discount)

        # The dataset provides an interface to sample from replay.
        dataset = datasets.make_reverb_dataset(
            table=replay_table_name,
            client=reverb.TFClient(address),
            environment_spec=environment_spec,
            batch_size=batch_size,
            prefetch_size=prefetch_size,
            transition_adder=True)

        # Make sure the observation network is a Sonnet module before its
        # variables are created.
        observation_network = tf2_utils.to_sonnet_module(observation_network)

        # Get observation and action specs.
        act_spec = environment_spec.actions
        obs_spec = environment_spec.observations
        emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

        # Create target networks.
        target_policy_network = copy.deepcopy(policy_network)
        target_critic_network = copy.deepcopy(critic_network)
        target_observation_network = copy.deepcopy(observation_network)

        # Create the behavior policy.
        behavior_network = snt.Sequential([
            observation_network,
            policy_network,
            networks.ClippedGaussian(sigma),
            networks.ClipToSpec(act_spec),
        ])

        # Create variables.
        tf2_utils.create_variables(policy_network, [emb_spec])
        tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_policy_network, [emb_spec])
        tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
        tf2_utils.create_variables(target_observation_network, [obs_spec])

        # Create the actor which defines how we take actions.
        actor = actors.FeedForwardActor(behavior_network, adder=adder)

        # Create optimizers.
        policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
        critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

        # The learner updates the parameters (and initializes them).
        learner = learning.DDPGLearner(
            policy_network=policy_network,
            critic_network=critic_network,
            observation_network=observation_network,
            target_policy_network=target_policy_network,
            target_critic_network=target_critic_network,
            target_observation_network=target_observation_network,
            policy_optimizer=policy_optimizer,
            critic_optimizer=critic_optimizer,
            clipping=clipping,
            discount=discount,
            target_update_period=target_update_period,
            dataset=dataset,
            counter=counter,
            logger=logger,
            checkpoint=checkpoint,
        )

        super().__init__(
            actor=actor,
            learner=learner,
            min_observations=max(batch_size, min_replay_size),
            observations_per_step=float(batch_size) / samples_per_insert)
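A minimal sketch of constructing and running the agent above, assuming the class is named DDPG and that `environment`, `policy_network`, and `critic_network` are already built:

import acme
from acme import specs

agent = DDPG(
    environment_spec=specs.make_environment_spec(environment),
    policy_network=policy_network,
    critic_network=critic_network,
    sigma=0.3,  # stddev of the ClippedGaussian exploration noise
)

# The agent acts with the noisy behavior policy and learns from replay.
loop = acme.EnvironmentLoop(environment, agent)
loop.run(num_episodes=100)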
Example #4
    def __init__(self, visualRadius, action_size, action_spec, exploration_sigma):
        super().__init__(name="commons-actor")
        self.policy_network = PolicyNetwork(
            visualRadius, action_size, action_spec)
        # Sonnet modules cannot be composed with `+`; chain the policy with
        # exploration noise and spec clipping via snt.Sequential instead.
        self.behavior_network = snt.Sequential([
            self.policy_network,
            networks.ClippedGaussian(exploration_sigma),
            networks.ClipToSpec(action_spec),
        ])
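A brief hedged check of the composed module; PolicyNetwork and the constructor arguments are assumptions carried over from the snippet:

# Hypothetical instantiation; `observation` stands in for a tensor shaped
# to PolicyNetwork's expected input.
actor = ActorNetwork(visualRadius=5, action_size=2,
                     action_spec=action_spec, exploration_sigma=0.2)
noisy_action = actor.behavior_network(observation)  # noisy, spec-clipped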
Example #5
# Create the target networks
target_policy_network = copy.deepcopy(policy_network)
target_critic_network = copy.deepcopy(critic_network)
target_observation_network = copy.deepcopy(observation_network)

# Get observation and action specs.
act_spec = environment_spec.actions
obs_spec = environment_spec.observations
emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

# Create the behavior policy.
behavior_network = snt.Sequential([
    observation_network,
    policy_network,
    networks.ClippedGaussian(0.3),  # sigma = 0.3
    networks.ClipToSpec(act_spec),
])

# We must create variables in the networks before passing them to the learner.
tf2_utils.create_variables(policy_network, [emb_spec])
tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_policy_network, [emb_spec])
tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_observation_network, [obs_spec])

actor = actors.FeedForwardActor(behavior_network, adder=adder)

learner = d4pg.D4PGLearner(policy_network=policy_network,
                           critic_network=critic_network,
Example #6
def make_networks(
    environment_spec: mava_specs.MAEnvironmentSpec,
    policy_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (
        256,
        256,
        256,
    ),
    critic_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (512, 512, 256),
    shared_weights: bool = True,
    sigma: float = 0.3,
) -> Mapping[str, types.TensorTransformation]:
    """Creates networks used by the agents."""
    specs = environment_spec.get_agent_specs()

    # Create agent_type specs
    if shared_weights:
        type_specs = {key.split("_")[0]: specs[key] for key in specs.keys()}
        specs = type_specs

    if isinstance(policy_networks_layer_sizes, Sequence):
        policy_networks_layer_sizes = {
            key: policy_networks_layer_sizes for key in specs.keys()
        }
    if isinstance(critic_networks_layer_sizes, Sequence):
        critic_networks_layer_sizes = {
            key: critic_networks_layer_sizes for key in specs.keys()
        }

    observation_networks = {}
    policy_networks = {}
    critic_networks = {}
    for key in specs.keys():
        # Get total number of action dimensions from action spec.
        num_dimensions = np.prod(specs[key].actions.shape, dtype=int)

        # Create the shared observation network; here simply a stateless operation.
        observation_network = tf2_utils.to_sonnet_module(tf.identity)

        # Create the policy network.
        policy_network = snt.Sequential(
            [
                networks.LayerNormMLP(
                    policy_networks_layer_sizes[key], activate_final=True
                ),
                networks.NearZeroInitializedLinear(num_dimensions),
                networks.TanhToSpec(specs[key].actions),
                networks.ClippedGaussian(sigma),
                networks.ClipToSpec(specs[key].actions),
            ]
        )

        # Create the critic network.
        critic_network = snt.Sequential(
            [
                # The multiplexer concatenates the observations/actions.
                networks.CriticMultiplexer(),
                networks.LayerNormMLP(
                    critic_networks_layer_sizes[key], activate_final=False
                ),
                snt.Linear(1),
            ]
        )
        observation_networks[key] = observation_network
        policy_networks[key] = policy_network
        critic_networks[key] = critic_network

    return {
        "policies": policy_networks,
        "critics": critic_networks,
        "observations": observation_networks,
    }
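A hedged usage sketch; the Mava environment wrapper and `environment` itself are assumptions:

# Assumed: MAEnvironmentSpec can be built directly from the wrapped environment.
environment_spec = mava_specs.MAEnvironmentSpec(environment)
system_networks = make_networks(environment_spec, shared_weights=True, sigma=0.3)

# With shared_weights=True the dictionaries are keyed by agent type
# (e.g. "agent") rather than by agent id ("agent_0", "agent_1", ...).
policy_networks = system_networks["policies"]
critic_networks = system_networks["critics"]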
Example #7
def make_default_networks(
    environment_spec: mava_specs.MAEnvironmentSpec,
    policy_networks_layer_sizes: Union[Dict[str, Sequence],
                                       Sequence] = (256, 256, 256),
    critic_networks_layer_sizes: Union[Dict[str, Sequence],
                                       Sequence] = (512, 512, 256),
    shared_weights: bool = True,
    sigma: float = 0.3,
    archecture_type: ArchitectureType = ArchitectureType.feedforward,
) -> Mapping[str, types.TensorTransformation]:
    """Default networks for maddpg.

    Args:
        environment_spec (mava_specs.MAEnvironmentSpec): description of the action and
            observation spaces etc. for each agent in the system.
        policy_networks_layer_sizes (Union[Dict[str, Sequence], Sequence], optional):
            size of policy networks. Defaults to (256, 256, 256).
        critic_networks_layer_sizes (Union[Dict[str, Sequence], Sequence], optional):
            size of critic networks. Defaults to (512, 512, 256).
        shared_weights (bool, optional): whether agents should share weights or not.
            Defaults to True.
        sigma (float, optional): hyperparameter used to add Gaussian noise for
            simple exploration. Defaults to 0.3.
        archecture_type (ArchitectureType, optional): architecture used for
            agent networks. Can be feedforward or recurrent. Defaults to
            ArchitectureType.feedforward.

    Returns:
        Mapping[str, types.TensorTransformation]: returned agent networks.
    """

    # Select the policy network constructor and layer sizes.
    if archecture_type == ArchitectureType.feedforward:
        policy_network_func = snt.Sequential
    elif archecture_type == ArchitectureType.recurrent:
        # Note: the recurrent variant overrides the policy layer sizes.
        policy_networks_layer_sizes = (128, 128)
        policy_network_func = snt.DeepRNN

    specs = environment_spec.get_agent_specs()

    # Create agent_type specs
    if shared_weights:
        type_specs = {key.split("_")[0]: specs[key] for key in specs.keys()}
        specs = type_specs

    if isinstance(policy_networks_layer_sizes, Sequence):
        policy_networks_layer_sizes = {
            key: policy_networks_layer_sizes
            for key in specs.keys()
        }
    if isinstance(critic_networks_layer_sizes, Sequence):
        critic_networks_layer_sizes = {
            key: critic_networks_layer_sizes
            for key in specs.keys()
        }

    observation_networks = {}
    policy_networks = {}
    critic_networks = {}
    for key in specs.keys():
        # TODO (dries): Make specs[key].actions
        #  return a list of specs for hybrid action spaces.
        agent_act_spec = specs[key].actions
        if isinstance(agent_act_spec, DiscreteArray):
            num_actions = agent_act_spec.num_values
            minimum = [-1.0] * num_actions
            maximum = [1.0] * num_actions
            agent_act_spec = BoundedArray(
                shape=(num_actions, ),
                minimum=minimum,
                maximum=maximum,
                dtype="float32",
                name="actions",
            )

        # Get total number of action dimensions from action spec.
        num_dimensions = np.prod(agent_act_spec.shape, dtype=int)

        # An optional network to process observations
        observation_network = tf2_utils.to_sonnet_module(tf.identity)
        # Create the policy network.
        if archecture_type == ArchitectureType.feedforward:
            policy_network = [
                networks.LayerNormMLP(policy_networks_layer_sizes[key],
                                      activate_final=True),
            ]
        elif archecture_type == ArchitectureType.recurrent:
            policy_network = [
                networks.LayerNormMLP(policy_networks_layer_sizes[key][:-1],
                                      activate_final=True),
                snt.LSTM(policy_networks_layer_sizes[key][-1]),
            ]

        policy_network += [
            networks.NearZeroInitializedLinear(num_dimensions),
            networks.TanhToSpec(agent_act_spec),
        ]

        # Add Gaussian noise for simple exploration.
        if sigma and sigma > 0.0:
            policy_network += [
                networks.ClippedGaussian(sigma),
                networks.ClipToSpec(agent_act_spec),
            ]

        policy_network = policy_network_func(policy_network)

        # Create the critic network.
        critic_network = snt.Sequential([
            # The multiplexer concatenates the observations/actions.
            networks.CriticMultiplexer(),
            networks.LayerNormMLP(list(critic_networks_layer_sizes[key]) + [1],
                                  activate_final=False),
        ])
        observation_networks[key] = observation_network
        policy_networks[key] = policy_network
        critic_networks[key] = critic_network

    return {
        "policies": policy_networks,
        "critics": critic_networks,
        "observations": observation_networks,
    }
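Finally, a hedged sketch of selecting the recurrent variant; recall from the code above that this branch overrides the policy layer sizes to (128, 128) and wraps the policy stack in snt.DeepRNN:

recurrent_networks = make_default_networks(
    environment_spec,
    archecture_type=ArchitectureType.recurrent,
    sigma=0.3,  # > 0 appends ClippedGaussian + ClipToSpec for exploration
)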