def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50,),
    critic_layer_sizes: Sequence[int] = (50,),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
):
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(num_dimensions,
                                          tanh_mean=True,
                                          init_scale=0.3,
                                          fixed_scale=True,
                                          use_tfd_independent=False)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes,
                                           activate_final=True),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential(
      [critic_network,
       networks.DiscreteValuedHead(vmin, vmax, num_atoms)])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }

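# A minimal usage sketch for the factory above, assuming Acme's TF utilities
# are importable. The 4-D action / 11-D observation specs are illustrative
# placeholders, not values from the original code.
import numpy as np
from acme import specs
from acme.tf import utils as tf2_utils

example_act_spec = specs.BoundedArray(
    shape=(4,), dtype=np.float32, minimum=-1.0, maximum=1.0)
example_obs_spec = specs.Array(shape=(11,), dtype=np.float32)

nets = make_networks(example_act_spec)
# Trace each network once so Sonnet creates its variables; the embedding spec
# produced by the observation network feeds the policy and critic.
emb_spec = tf2_utils.create_variables(
    tf2_utils.to_sonnet_module(nets['observation']), [example_obs_spec])
tf2_utils.create_variables(nets['policy'], [emb_spec])
tf2_utils.create_variables(nets['critic'], [emb_spec, example_act_spec])
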
def make_dmpo_networks(
    action_spec,
    policy_layer_sizes=(300, 200),
    critic_layer_sizes=(400, 300),
    vmin=-150.,
    vmax=150.,
    num_atoms=51,
):
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(num_dimensions)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential(
      [critic_network,
       networks.DiscreteValuedHead(vmin, vmax, num_atoms)])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf_utils.batch_concat,
  }

def make_mpo_networks(
    action_spec,
    policy_layer_sizes=(256, 256, 256),
    critic_layer_sizes=(512, 512, 256),
    policy_init_std=1e-9,
    obs_network=None):
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)
  critic_layer_sizes = list(critic_layer_sizes) + [1]

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(
          num_dimensions, init_scale=policy_init_std, min_scale=1e-10)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))

  if obs_network is None:
    obs_network = tf_utils.batch_concat

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': obs_network,
  }

def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
) -> Dict[str, types.TensorTransformation]:
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(num_dimensions,
                                          init_scale=0.7,
                                          use_tfd_independent=True)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      multiplexer,
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(1),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }

def make_default_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
) -> Mapping[str, types.TensorTransformation]:
  """Creates networks used by the agent."""
  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      tf2_utils.batch_concat,
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=True,
          min_scale=0.3,
          init_scale=0.7,
          fixed_scale=False,
          use_tfd_independent=False)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      multiplexer,
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(1),
  ])

  return {
      "policy": policy_network,
      "critic": critic_network,
  }

def make_networks(
    action_spec: specs.BoundedArray,
    num_critic_heads: int,
    policy_layer_sizes: Sequence[int] = (50,),
    critic_layer_sizes: Sequence[int] = (50,),
    num_layers_shared: int = 1,
    distributional_critic: bool = True,
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
):
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(num_dimensions,
                                          tanh_mean=False,
                                          init_scale=0.69)
  ])

  if not distributional_critic:
    critic_layer_sizes = list(critic_layer_sizes) + [1]

  if not num_layers_shared:
    # No layers are shared.
    critic_network_base = None
  else:
    critic_network_base = networks.LayerNormMLP(
        critic_layer_sizes[:num_layers_shared], activate_final=True)
  critic_network_heads = [
      snt.nets.MLP(critic_layer_sizes,
                   activation=tf.nn.elu,
                   activate_final=False)
      for _ in range(num_critic_heads)
  ]
  if distributional_critic:
    critic_network_heads = [
        snt.Sequential(
            [c, networks.DiscreteValuedHead(vmin, vmax, num_atoms)])
        for c in critic_network_heads
    ]
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          critic_network=critic_network_base,
          action_network=networks.ClipToSpec(action_spec)),
      networks.Multihead(network_heads=critic_network_heads),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': tf2_utils.batch_concat,
  }

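# Illustrative call for the multi-head variant above (the 2-D action spec is
# an assumption for the example): with num_critic_heads=2 and
# distributional_critic=True, networks.Multihead applies both heads to the
# shared torso, so the critic produces one discrete value distribution per
# objective.
import numpy as np
from acme import specs

multihead_act_spec = specs.BoundedArray(
    shape=(2,), dtype=np.float32, minimum=-1.0, maximum=1.0)
multihead_nets = make_networks(multihead_act_spec, num_critic_heads=2)
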
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50, 1024, 1024),
    critic_layer_sizes: Sequence[int] = (50, 1024, 1024),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, snt.Module]:
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes,
                            w_init=snt.initializers.Orthogonal(),
                            activation=tf.nn.relu,
                            activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=False,
          init_scale=1.0,
          fixed_scale=False,
          use_tfd_independent=True,
          w_init=snt.initializers.Orthogonal())
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      observation_network=snt.Sequential([
          snt.Linear(critic_layer_sizes[0],
                     w_init=snt.initializers.Orthogonal()),
          snt.LayerNorm(axis=slice(1, None),
                        create_scale=True,
                        create_offset=True),
          tf.nn.tanh
      ]),
      critic_network=snt.nets.MLP(critic_layer_sizes[1:],
                                  w_init=snt.initializers.Orthogonal(),
                                  activation=tf.nn.relu,
                                  activate_final=True),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      critic_network,
      networks.DiscreteValuedHead(vmin, vmax, num_atoms,
                                  w_init=snt.initializers.Orthogonal())
  ])
  observation_network = networks.DrQTorso()

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
  }

def make_network_with_prior(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (200, 100),
    critic_layer_sizes: Sequence[int] = (400, 300),
    prior_layer_sizes: Sequence[int] = (200, 100),
    policy_keys: Optional[Sequence[str]] = None,
    prior_keys: Optional[Sequence[str]] = None,
) -> Mapping[str, types.TensorTransformation]:
  """Creates networks used by the agent."""
  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)
  flatten_concat_policy = functools.partial(
      svg0_utils.batch_concat_selection, concat_keys=policy_keys)
  flatten_concat_prior = functools.partial(
      svg0_utils.batch_concat_selection, concat_keys=prior_keys)

  policy_network = snt.Sequential([
      flatten_concat_policy,
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=True,
          min_scale=0.1,
          init_scale=0.7,
          fixed_scale=False,
          use_tfd_independent=False)
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      observation_network=flatten_concat_policy,
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      multiplexer,
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(1),
  ])
  prior_network = snt.Sequential([
      flatten_concat_prior,
      networks.LayerNormMLP(prior_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          tanh_mean=True,
          min_scale=0.1,
          init_scale=0.7,
          fixed_scale=False,
          use_tfd_independent=False)
  ])

  return {
      "policy": policy_network,
      "critic": critic_network,
      "prior": prior_network,
  }

def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and target networks to act with.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec, self._num_critic_heads)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a deterministic behavior policy.
  evaluator_modules = [
      observation_network,
      agent_networks['policy'],
      networks.StochasticMeanHead(),
  ]
  if isinstance(action_spec, specs.BoundedArray):
    evaluator_modules += [networks.ClipToSpec(action_spec)]
  evaluator_network = snt.Sequential(evaluator_modules)

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  policy_variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, policy_variables, update_period=1000)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  evaluator = actors.FeedForwardActor(
      policy_network=evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, evaluator, counter, logger)

def make_default_networks(
    environment_spec: specs.EnvironmentSpec,
    *,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    policy_init_scale: float = 0.7,
    critic_init_scale: float = 1e-3,
    critic_num_components: int = 5,
) -> Mapping[str, snt.Module]:
  """Creates networks used by the agent."""
  # Unpack the environment spec to get appropriate shapes, dtypes, etc.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  num_dimensions = np.prod(act_spec.shape, dtype=int)

  # Create the observation network and make sure it's a Sonnet module.
  observation_network = tf2_utils.batch_concat
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create the policy network.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(
          num_dimensions,
          init_scale=policy_init_scale,
          use_tfd_independent=True)
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = snt.Sequential([
      networks.CriticMultiplexer(
          action_network=networks.ClipToSpec(act_spec)),
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.GaussianMixtureHead(
          num_dimensions=1,
          num_components=critic_num_components,
          init_scale=critic_init_scale)
  ])

  # Create network variables.
  # Get embedding spec by creating observation network variables.
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
  }

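# Usage sketch: because make_default_networks already traces the networks
# with create_variables, the returned modules can be called directly. The
# EnvironmentSpec below is a hypothetical stand-in, not from the original.
import numpy as np
import tensorflow as tf
from acme import specs

example_env_spec = specs.EnvironmentSpec(
    observations=specs.Array(shape=(11,), dtype=np.float32),
    actions=specs.BoundedArray(
        shape=(4,), dtype=np.float32, minimum=-1.0, maximum=1.0),
    rewards=specs.Array(shape=(), dtype=np.float32),
    discounts=specs.BoundedArray(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0))

default_nets = make_default_networks(example_env_spec)
obs = tf.zeros([1, 11])  # A dummy batched observation.
embedding = default_nets['observation'](obs)
action_distribution = default_nets['policy'](embedding)  # A tfd distribution.
action = action_distribution.sample()
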
def load_policy_net(
    task_name: str,
    noise_level: float,
    dataset_path: str,
    environment_spec: specs.EnvironmentSpec,
    near_policy_dataset: bool = False,
):
  dataset_path = Path(dataset_path)
  if task_name.startswith("bsuite"):
    # BSuite tasks.
    bsuite_id = task_name[len("bsuite_"):] + "/0"
    path = bsuite_policy_path(
        bsuite_id, noise_level, near_policy_dataset, dataset_path)
    logging.info("Policy path: %s", path)
    policy_net = tf.saved_model.load(path)
    policy_noise_level = 0.1  # params["policy_noise_level"]
    observation_network = tf2_utils.to_sonnet_module(functools.partial(
        tf.reshape, shape=(-1,) + environment_spec.observations.shape))
    policy_net = snt.Sequential([
        observation_network,
        policy_net,
        # Add epsilon-greedy action noise to the target policy.
        lambda q: trfl.epsilon_greedy(q, epsilon=policy_noise_level).sample(),
    ])
  elif task_name.startswith("dm_control"):
    # DM Control tasks.
    if near_policy_dataset:
      raise ValueError(
          "Near-policy dataset is not available for dm_control tasks.")
    dm_control_task = task_name[len("dm_control_"):]
    path = dm_control_policy_path(dm_control_task, noise_level, dataset_path)
    logging.info("Policy path: %s", path)
    policy_net = tf.saved_model.load(path)
    policy_noise_level = 0.2  # params["policy_noise_level"]
    observation_network = tf2_utils.to_sonnet_module(tf2_utils.batch_concat)
    policy_net = snt.Sequential([
        observation_network,
        policy_net,
        # Add Gaussian action noise to the target policy and clip to spec.
        acme_utils.GaussianNoise(policy_noise_level),
        networks.ClipToSpec(environment_spec.actions),
    ])
  else:
    raise ValueError(f"task name {task_name} is unsupported.")
  return policy_net

def make_dmpo_networks(
    action_spec,
    policy_layer_sizes=(256, 256, 256),
    critic_layer_sizes=(512, 512, 256),
    vmin=-150.,
    vmax=150.,
    num_atoms=51,
    policy_init_std=1e-9,
    obs_network=None,
    binary_grip_action=False):
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  if policy_layer_sizes:
    policy_network = snt.Sequential([
        networks.LayerNormMLP([int(l) for l in policy_layer_sizes]),
        networks.MultivariateNormalDiagHead(
            num_dimensions, init_scale=policy_init_std, min_scale=1e-10)
    ])
  else:
    # Useful when initializing from a trained BC network.
    policy_network = snt.Sequential([
        ArmPolicyNormalDiagHead(
            binary_grip_action=binary_grip_action,
            num_dimensions=num_dimensions,
            init_scale=policy_init_std,
            min_scale=1e-10)
    ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  critic_network = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential(
      [critic_network,
       networks.DiscreteValuedHead(vmin, vmax, num_atoms)])

  if obs_network is None:
    obs_network = tf_utils.batch_concat

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': obs_network,
  }

def make_feed_forward_networks(
    action_spec: specs.BoundedArray,
    z_spec: specs.BoundedArray,
    policy_layer_sizes: Tuple[int, ...] = (256, 256),
    critic_layer_sizes: Tuple[int, ...] = (256, 256),
    discriminator_layer_sizes: Tuple[int, ...] = (256, 256),
    hierarchical_controller_layer_sizes: Tuple[int, ...] = (256, 256),
    vmin: float = -150.,  # Minimum value for the critic distribution.
    vmax: float = 150.,  # Maximum value for the critic distribution.
    num_atoms: int = 51,  # Number of atoms for the discrete value distribution.
) -> Dict[str, types.TensorTransformation]:
  num_dimensions = np.prod(action_spec.shape, dtype=int)
  z_dim = np.prod(z_spec.shape, dtype=int)

  observation_network = tf2_utils.batch_concat

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(num_dimensions)
  ])
  critic_multiplexer = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      critic_multiplexer,
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  # The discriminator in DIAYN uses the same architecture as the critic.
  discriminator_network = networks.LayerNormMLP(
      discriminator_layer_sizes + (z_dim,))
  hierarchical_controller_network = networks.LayerNormMLP(
      hierarchical_controller_layer_sizes + (z_dim,))

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
      'discriminator': discriminator_network,
      'hierarchical_controller': hierarchical_controller_network,
  }

def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, types.TensorTransformation]:
  """Creates networks used by the agent."""
  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  # Create the shared observation network; here simply a state-less operation.
  observation_network = tf2_utils.batch_concat

  # Create the policy network.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes),
      networks.MultivariateNormalDiagHead(num_dimensions)
  ])

  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      critic_network=networks.LayerNormMLP(critic_layer_sizes),
      action_network=networks.ClipToSpec(action_spec))

  # Create the critic network.
  critic_network = snt.Sequential([
      multiplexer,
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
  }

def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (50, 50),
    critic_layer_sizes: Sequence[int] = (50, 50),
):
  """Creates networks used by the agent."""
  num_dimensions = np.prod(action_spec.shape, dtype=int)

  observation_network = tf2_utils.batch_concat

  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.MultivariateNormalDiagHead(num_dimensions,
                                          tanh_mean=True,
                                          init_scale=0.3,
                                          fixed_scale=True,
                                          use_tfd_independent=False)
  ])
  evaluator_network = snt.Sequential([
      observation_network,
      policy_network,
      networks.StochasticMeanHead(),
  ])
  # The multiplexer concatenates the (maybe transformed) observations/actions.
  multiplexer = networks.CriticMultiplexer(
      action_network=networks.ClipToSpec(action_spec))
  critic_network = snt.Sequential([
      multiplexer,
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(1),
  ])

  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
      'evaluator': evaluator_network,
  }

def make_policy(
    self,
    environment_spec: specs.EnvironmentSpec,
    sigma: float = 0.0,
) -> snt.Module:
  """Create a single network which evaluates the policy."""
  # Stack the observation and policy networks.
  stack = [
      self.observation_network,
      self.policy_network,
  ]

  # If a stochastic/non-greedy policy is requested, add Gaussian noise on
  # top to enable a simple form of exploration.
  # TODO(mwhoffman): Refactor this to remove it from the class.
  if sigma > 0.0:
    stack += [
        network_utils.ClippedGaussian(sigma),
        network_utils.ClipToSpec(environment_spec.actions),
    ]

  # Return a network which sequentially evaluates everything in the stack.
  return snt.Sequential(stack)

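# Sketch of how make_policy might be used; `agent` is a hypothetical instance
# of the class owning this method and `environment_spec` is assumed to exist.
# sigma=0 yields the greedy policy for evaluation, while sigma>0 wraps it in
# clipped Gaussian noise for exploration during acting.
eval_policy = agent.make_policy(environment_spec)  # Greedy.
behavior_policy = agent.make_policy(environment_spec, sigma=0.3)  # Noisy.
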
def __init__(self,
             environment_spec: specs.EnvironmentSpec,
             policy_network: snt.Module,
             critic_network: snt.Module,
             observation_network: types.TensorTransformation = tf.identity,
             discount: float = 0.99,
             batch_size: int = 256,
             prefetch_size: int = 4,
             target_update_period: int = 100,
             min_replay_size: int = 1000,
             max_replay_size: int = 1000000,
             samples_per_insert: float = 32.0,
             n_step: int = 5,
             sigma: float = 0.3,
             clipping: bool = True,
             logger: loggers.Logger = None,
             counter: counting.Counter = None,
             checkpoint: bool = True,
             replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    policy_network: the online (optimized) policy.
    critic_network: the online critic.
    observation_network: optional network to transform the observations
      before they are fed into any network.
    discount: discount to use for TD updates.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    min_replay_size: minimum replay size before updating.
    max_replay_size: maximum replay size.
    samples_per_insert: number of samples to take from replay for every
      insert that is made.
    n_step: number of steps to squash into a single transition.
    sigma: standard deviation of zero-mean, Gaussian exploration noise.
    clipping: whether to clip gradients by global norm.
    logger: logger object to be used by learner.
    counter: counter object used to keep track of steps.
    checkpoint: boolean indicating whether to checkpoint the learner.
    replay_table_name: string indicating what name to give the replay table.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=replay_table_name,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      priority_fns={replay_table_name: lambda x: 1.},
      client=reverb.Client(address),
      n_step=n_step,
      discount=discount)

  # The dataset provides an interface to sample from replay.
  dataset = datasets.make_reverb_dataset(
      table=replay_table_name,
      client=reverb.TFClient(address),
      environment_spec=environment_spec,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      transition_adder=True)

  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])  # pytype: disable=wrong-arg-types

  # Make sure observation network is a Sonnet Module.
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create target networks.
  target_policy_network = copy.deepcopy(policy_network)
  target_critic_network = copy.deepcopy(critic_network)
  target_observation_network = copy.deepcopy(observation_network)

  # Create the behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      policy_network,
      networks.ClippedGaussian(sigma),
      networks.ClipToSpec(act_spec),
  ])

  # Create variables.
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_policy_network, [emb_spec])
  tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_observation_network, [obs_spec])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(behavior_network, adder=adder)

  # Create optimizers.
  policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
  critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

  # The learner updates the parameters (and initializes them).
  learner = learning.DDPGLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      observation_network=observation_network,
      target_policy_network=target_policy_network,
      target_critic_network=target_critic_network,
      target_observation_network=target_observation_network,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      clipping=clipping,
      discount=discount,
      target_update_period=target_update_period,
      dataset=dataset,
      counter=counter,
      logger=logger,
      checkpoint=checkpoint,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)

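# Hypothetical construction sketch for the agent above. The class name
# DDPGAgent is a placeholder (the source does not show it), and
# `environment`, `environment_spec`, and the networks are assumed to be
# built elsewhere, e.g. with one of the make_networks factories above.
agent = DDPGAgent(
    environment_spec=environment_spec,
    policy_network=policy_network,
    critic_network=critic_network,
    sigma=0.3)
loop = acme.EnvironmentLoop(environment, agent)
loop.run(num_episodes=100)
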
def make_default_networks(
    environment_spec: mava_specs.MAEnvironmentSpec,
    policy_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (
        256, 256, 256),
    critic_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (
        512, 512, 256),
    shared_weights: bool = True,
    sigma: float = 0.3,
    architecture_type: ArchitectureType = ArchitectureType.feedforward,
) -> Mapping[str, types.TensorTransformation]:
  """Default networks for maddpg.

  Args:
    environment_spec (mava_specs.MAEnvironmentSpec): description of the
      action and observation spaces etc. for each agent in the system.
    policy_networks_layer_sizes (Union[Dict[str, Sequence], Sequence],
      optional): size of policy networks. Defaults to (256, 256, 256).
    critic_networks_layer_sizes (Union[Dict[str, Sequence], Sequence],
      optional): size of critic networks. Defaults to (512, 512, 256).
    shared_weights (bool, optional): whether agents should share weights or
      not. Defaults to True.
    sigma (float, optional): hyperparameter used to add Gaussian noise for
      simple exploration. Defaults to 0.3.
    architecture_type (ArchitectureType, optional): architecture used for
      agent networks. Can be feedforward or recurrent. Defaults to
      ArchitectureType.feedforward.

  Returns:
    Mapping[str, types.TensorTransformation]: returned agent networks.
  """
  # Set policy function and layer sizes.
  if architecture_type == ArchitectureType.feedforward:
    policy_network_func = snt.Sequential
  elif architecture_type == ArchitectureType.recurrent:
    policy_networks_layer_sizes = (128, 128)
    policy_network_func = snt.DeepRNN

  specs = environment_spec.get_agent_specs()

  # Create agent_type specs.
  if shared_weights:
    type_specs = {key.split("_")[0]: specs[key] for key in specs.keys()}
    specs = type_specs

  if isinstance(policy_networks_layer_sizes, Sequence):
    policy_networks_layer_sizes = {
        key: policy_networks_layer_sizes for key in specs.keys()
    }
  if isinstance(critic_networks_layer_sizes, Sequence):
    critic_networks_layer_sizes = {
        key: critic_networks_layer_sizes for key in specs.keys()
    }

  observation_networks = {}
  policy_networks = {}
  critic_networks = {}
  for key in specs.keys():
    # TODO (dries): Make specs[key].actions return a list of specs for
    # hybrid action spaces.

    # Get total number of action dimensions from action spec.
    agent_act_spec = specs[key].actions
    if type(specs[key].actions) == DiscreteArray:
      num_actions = agent_act_spec.num_values
      minimum = [-1.0] * num_actions
      maximum = [1.0] * num_actions
      agent_act_spec = BoundedArray(
          shape=(num_actions,),
          minimum=minimum,
          maximum=maximum,
          dtype="float32",
          name="actions",
      )

    # Get total number of action dimensions from action spec.
    num_dimensions = np.prod(agent_act_spec.shape, dtype=int)

    # An optional network to process observations.
    observation_network = tf2_utils.to_sonnet_module(tf.identity)

    # Create the policy network.
    if architecture_type == ArchitectureType.feedforward:
      policy_network = [
          networks.LayerNormMLP(
              policy_networks_layer_sizes[key], activate_final=True),
      ]
    elif architecture_type == ArchitectureType.recurrent:
      policy_network = [
          networks.LayerNormMLP(
              policy_networks_layer_sizes[key][:-1], activate_final=True),
          snt.LSTM(policy_networks_layer_sizes[key][-1]),
      ]
    policy_network += [
        networks.NearZeroInitializedLinear(num_dimensions),
        networks.TanhToSpec(agent_act_spec),
    ]

    # Add Gaussian noise for simple exploration.
    if sigma and sigma > 0.0:
      policy_network += [
          networks.ClippedGaussian(sigma),
          networks.ClipToSpec(agent_act_spec),
      ]
    policy_network = policy_network_func(policy_network)

    # Create the critic network.
    critic_network = snt.Sequential([
        # The multiplexer concatenates the observations/actions.
        networks.CriticMultiplexer(),
        networks.LayerNormMLP(
            list(critic_networks_layer_sizes[key]) + [1],
            activate_final=False),
    ])

    observation_networks[key] = observation_network
    policy_networks[key] = policy_network
    critic_networks[key] = critic_network

  return {
      "policies": policy_networks,
      "critics": critic_networks,
      "observations": observation_networks,
  }

def __init__(self, visualRadius, action_size, action_spec, exploration_sigma):
  super(ActorNetwork, self).__init__(name="commons-actor")
  self.policy_network = PolicyNetwork(visualRadius, action_size, action_spec)
  # Wrap the policy with clipped Gaussian exploration noise. Sonnet modules
  # cannot be composed with `+`, so stack everything in a single Sequential.
  self.behavior_network = snt.Sequential([
      self.policy_network,
      networks.ClippedGaussian(exploration_sigma),
      networks.ClipToSpec(action_spec),
  ])

def make_networks(
    environment_spec: mava_specs.MAEnvironmentSpec,
    policy_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (
        256, 256, 256),
    critic_networks_layer_sizes: Union[Dict[str, Sequence], Sequence] = (
        512, 512, 256),
    shared_weights: bool = True,
    sigma: float = 0.3,
) -> Mapping[str, types.TensorTransformation]:
  """Creates networks used by the agents."""
  specs = environment_spec.get_agent_specs()

  # Create agent_type specs.
  if shared_weights:
    type_specs = {key.split("_")[0]: specs[key] for key in specs.keys()}
    specs = type_specs

  if isinstance(policy_networks_layer_sizes, Sequence):
    policy_networks_layer_sizes = {
        key: policy_networks_layer_sizes for key in specs.keys()
    }
  if isinstance(critic_networks_layer_sizes, Sequence):
    critic_networks_layer_sizes = {
        key: critic_networks_layer_sizes for key in specs.keys()
    }

  observation_networks = {}
  policy_networks = {}
  critic_networks = {}
  for key in specs.keys():
    # Get total number of action dimensions from action spec.
    num_dimensions = np.prod(specs[key].actions.shape, dtype=int)

    # Create the shared observation network; here simply a state-less
    # operation.
    observation_network = tf2_utils.to_sonnet_module(tf.identity)

    # Create the policy network.
    policy_network = snt.Sequential([
        networks.LayerNormMLP(
            policy_networks_layer_sizes[key], activate_final=True),
        networks.NearZeroInitializedLinear(num_dimensions),
        networks.TanhToSpec(specs[key].actions),
        networks.ClippedGaussian(sigma),
        networks.ClipToSpec(specs[key].actions),
    ])

    # Create the critic network.
    critic_network = snt.Sequential([
        # The multiplexer concatenates the observations/actions.
        networks.CriticMultiplexer(),
        networks.LayerNormMLP(
            critic_networks_layer_sizes[key], activate_final=False),
        snt.Linear(1),
    ])

    observation_networks[key] = observation_network
    policy_networks[key] = policy_network
    critic_networks[key] = critic_network

  return {
      "policies": policy_networks,
      "critics": critic_networks,
      "observations": observation_networks,
  }

# Create the target networks.
target_policy_network = copy.deepcopy(policy_network)
target_critic_network = copy.deepcopy(critic_network)
target_observation_network = copy.deepcopy(observation_network)

# Get observation and action specs.
act_spec = environment_spec.actions
obs_spec = environment_spec.observations
emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

# Create the behavior policy.
behavior_network = snt.Sequential([
    observation_network,
    policy_network,
    networks.ClippedGaussian(0.3),  # sigma = 0.3
    networks.ClipToSpec(act_spec),
])

# We must create the variables in the networks before passing them to the
# learner.
tf2_utils.create_variables(policy_network, [emb_spec])
tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_policy_network, [emb_spec])
tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_observation_network, [obs_spec])

actor = actors.FeedForwardActor(behavior_network, adder=adder)

learner = d4pg.D4PGLearner(
    policy_network=policy_network,
    critic_network=critic_network,
    observation_network=observation_network,