Example #1
    def __init__(self,
                 action_spec,
                 actor_network: Network,
                 critic_network: Network,
                 critic_loss=None,
                 target_entropy=None,
                 initial_log_alpha=0.0,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 gradient_clipping=None,
                 train_step_counter=None,
                 debug_summaries=False,
                 name="SacAlgorithm"):
        """Create a SacAlgorithm

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network (Network): The network will be called with
                call(observation, step_type).
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            critic_loss (None|OneStepTDLoss): an object for calculating critic loss.
                If None, a default OneStepTDLoss will be used.
            initial_log_alpha (float): initial value for variable log_alpha
            target_entropy (float|None): The target average policy entropy, for updating alpha.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
            gradient_clipping (float): Norm length to clip gradients.
            train_step_counter (tf.Variable): An optional counter to increment
                every time a new iteration is started. If None, it will use
                tf.summary.experimental.get_step(). If this is still None, a
                counter will be created.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        critic_network1 = critic_network
        critic_network2 = critic_network.copy(name='CriticNetwork2')
        log_alpha = tfa_common.create_variable(name='log_alpha',
                                               initial_value=initial_log_alpha,
                                               dtype=tf.float32,
                                               trainable=True)
        super().__init__(
            action_spec,
            train_state_spec=SacState(
                share=SacShareState(actor=actor_network.state_spec),
                actor=SacActorState(critic1=critic_network.state_spec,
                                    critic2=critic_network.state_spec),
                critic=SacCriticState(
                    critic1=critic_network.state_spec,
                    critic2=critic_network.state_spec,
                    target_critic1=critic_network.state_spec,
                    target_critic2=critic_network.state_spec)),
            action_distribution_spec=actor_network.output_spec,
            predict_state_spec=actor_network.state_spec,
            optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
            get_trainable_variables_func=[
                lambda: actor_network.trainable_variables,
                lambda: (critic_network1.trainable_variables +
                         critic_network2.trainable_variables),
                lambda: [log_alpha]
            ],
            gradient_clipping=gradient_clipping,
            train_step_counter=train_step_counter,
            debug_summaries=debug_summaries,
            name=name)

        self._log_alpha = log_alpha
        self._actor_network = actor_network
        self._critic_network1 = critic_network1
        self._critic_network2 = critic_network2
        self._target_critic_network1 = self._critic_network1.copy(
            name='TargetCriticNetwork1')
        self._target_critic_network2 = self._critic_network2.copy(
            name='TargetCriticNetwork2')
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer

        if critic_loss is None:
            critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
        self._critic_loss = critic_loss

        flat_action_spec = tf.nest.flatten(self._action_spec)
        self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
        if target_entropy is None:
            target_entropy = np.sum(
                list(
                    map(dist_utils.calc_default_target_entropy,
                        flat_action_spec)))
        self._target_entropy = target_entropy

        self._dqda_clipping = dqda_clipping

        self._update_target = common.get_target_updater(
            models=[self._critic_network1, self._critic_network2],
            target_models=[
                self._target_critic_network1, self._target_critic_network2
            ],
            tau=target_update_tau,
            period=target_update_period)

        tfa_common.soft_variables_update(
            self._critic_network1.variables,
            self._target_critic_network1.variables,
            tau=1.0)

        tfa_common.soft_variables_update(
            self._critic_network2.variables,
            self._target_critic_network2.variables,
            tau=1.0)
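The constructor above creates two target critic networks and refreshes them with soft updates. A minimal sketch of that update rule (illustrative only, not taken from the library; soft_update and the variable lists are placeholder names) shows what target_update_tau and target_update_period control:

import tensorflow as tf

def soft_update(variables, target_variables, tau):
    """Move each target variable a fraction `tau` toward its online counterpart.

    tau=1.0 copies the online weights exactly (as done once at construction);
    a small tau (e.g. 0.05) yields a slowly moving target for TD bootstrapping.
    """
    for v, v_t in zip(variables, target_variables):
        v_t.assign(tau * v + (1.0 - tau) * v_t)

# Applied every `target_update_period` training iterations, e.g.:
# if int(train_step_counter) % target_update_period == 0:
#     soft_update(critic_network1.variables,
#                 target_critic_network1.variables, tau=target_update_tau)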
Example #2
    def __init__(self,
                 action_spec,
                 actor_network: Network,
                 critic_network: Network,
                 ou_stddev=0.2,
                 ou_damping=0.15,
                 critic_loss=None,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 gradient_clipping=None,
                 train_step_counter=None,
                 debug_summaries=False,
                 name="DdpgAlgorithm"):
        """
        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network (Network):  The network will be called with
                call(observation, step_type).
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            ou_stddev (float): Standard deviation for the Ornstein-Uhlenbeck
                (OU) noise added in the default collect policy.
            ou_damping (float): Damping factor for the OU noise added in the
                default collect policy.
            critic_loss (None|OneStepTDLoss): an object for calculating critic
                loss. If None, a default OneStepTDLoss will be used.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            gradient_clipping (float): Norm length to clip gradients.
            train_step_counter (tf.Variable): An optional counter to increment
                every time a new iteration is started. If None, it will use
                tf.summary.experimental.get_step(). If this is still None, a
                counter will be created.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        train_state_spec = DdpgState(
            actor=DdpgActorState(actor=actor_network.state_spec,
                                 critic=critic_network.state_spec),
            critic=DdpgCriticState(critic=critic_network.state_spec,
                                   target_actor=actor_network.state_spec,
                                   target_critic=critic_network.state_spec))

        super().__init__(action_spec,
                         train_state_spec=train_state_spec,
                         action_distribution_spec=action_spec,
                         optimizer=[actor_optimizer, critic_optimizer],
                         get_trainable_variables_func=[
                             lambda: actor_network.trainable_variables,
                             lambda: critic_network.trainable_variables
                         ],
                         gradient_clipping=gradient_clipping,
                         train_step_counter=train_step_counter,
                         debug_summaries=debug_summaries,
                         name=name)

        self._actor_network = actor_network
        self._critic_network = critic_network
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        self._target_actor_network = actor_network.copy(
            name='target_actor_network')
        self._target_critic_network = critic_network.copy(
            name='target_critic_network')

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping

        if critic_loss is None:
            critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
        self._critic_loss = critic_loss

        self._ou_process = self._create_ou_process(ou_stddev, ou_damping)

        self._update_target = common.get_target_updater(
            models=[self._actor_network, self._critic_network],
            target_models=[
                self._target_actor_network, self._target_critic_network
            ],
            tau=target_update_tau,
            period=target_update_period)

        self._dqda_clipping = dqda_clipping

        tfa_common.soft_variables_update(self._critic_network.variables,
                                         self._target_critic_network.variables,
                                         tau=1.0)
        tfa_common.soft_variables_update(self._actor_network.variables,
                                         self._target_actor_network.variables,
                                         tau=1.0)
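For intuition about ou_stddev and ou_damping: the collect policy perturbs actions with a discretized Ornstein-Uhlenbeck process, which decays the previous noise by the damping factor and adds fresh Gaussian noise each step. A self-contained NumPy sketch (illustrative, not the library's implementation):

import numpy as np

def ou_step(noise, stddev=0.2, damping=0.15):
    """One step of a discretized OU process: keep (1 - damping) of the previous
    noise and add zero-mean Gaussian noise with the given stddev."""
    return (1.0 - damping) * noise + np.random.normal(scale=stddev, size=noise.shape)

noise = np.zeros(3)         # one noise component per action dimension
for _ in range(5):
    noise = ou_step(noise)  # temporally correlated exploration noise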
Example #3
    def __init__(self,
                 observation_spec,
                 action_spec: BoundedTensorSpec,
                 actor_network_ctor=ActorNetwork,
                 critic_network_ctor=CriticNetwork,
                 use_parallel_network=False,
                 reward_weights=None,
                 env=None,
                 config: TrainerConfig = None,
                 ou_stddev=0.2,
                 ou_damping=0.15,
                 critic_loss_ctor=None,
                 num_critic_replicas=1,
                 target_update_tau=0.05,
                 target_update_period=1,
                 rollout_random_action=0.,
                 dqda_clipping=None,
                 action_l2=0,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 debug_summaries=False,
                 name="DdpgAlgorithm"):
        """
        Args:
            observation_spec (nested TensorSpec): representing the observations.
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network_ctor (Callable): Function to construct the actor network.
                ``actor_network_ctor`` needs to accept ``input_tensor_spec`` and
                ``action_spec`` as its arguments and return an actor network.
                The constructed network will be called with ``forward(observation, state)``.
            critic_network_ctor (Callable): Function to construct the critic
                network. ``critic_network_ctor`` needs to accept ``input_tensor_spec``
                which is a tuple of ``(observation_spec, action_spec)``. The
                constructed network will be called with
                ``forward((observation, action), state)``.
            use_parallel_network (bool): whether to use parallel network for
                calculating critics.
            reward_weights (list[float]): this is only used when the reward is
                multidimensional. In that case, the weighted sum of the q values
                is used for training the actor.
            num_critic_replicas (int): number of critics to be used. Default is 1.
            env (Environment): The environment to interact with. env is a batched
                environment, which means that it runs multiple simulations
                simultaneously. ``env`` only needs to be provided to the root
                algorithm.
            config (TrainerConfig): config for training. config only needs to be
                provided to the algorithm which performs ``train_iter()`` by
                itself.
            ou_stddev (float): Standard deviation for the Ornstein-Uhlenbeck
                (OU) noise added in the default collect policy.
            ou_damping (float): Damping factor for the OU noise added in the
                default collect policy.
            critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
                constructor. If ``None``, a default ``OneStepTDLoss`` will be used.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            rollout_random_action (float): the probability of taking a uniform
                random action during a ``rollout_step()``. 0 means always directly
                taking actions with OU noise added and 1 means always sampling
                uniformly random actions. A bigger value results in more
                exploration during rollout.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between ``[-dqda_clipping, dqda_clipping]``.
                Does not perform clipping if ``dqda_clipping == 0``.
            action_l2 (float): weight of squared action l2-norm on actor loss.
            actor_optimizer (torch.optim.optimizer): The optimizer for actor.
            critic_optimizer (torch.optim.optimizer): The optimizer for critic.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """

        critic_network = critic_network_ctor(
            input_tensor_spec=(observation_spec, action_spec))
        actor_network = actor_network_ctor(input_tensor_spec=observation_spec,
                                           action_spec=action_spec)
        if use_parallel_network:
            critic_networks = critic_network.make_parallel(num_critic_replicas)
        else:
            critic_networks = alf.networks.NaiveParallelNetwork(
                critic_network, num_critic_replicas)
        self._action_l2 = action_l2

        train_state_spec = DdpgState(
            actor=DdpgActorState(actor=actor_network.state_spec,
                                 critics=critic_networks.state_spec),
            critics=DdpgCriticState(critics=critic_networks.state_spec,
                                    target_actor=actor_network.state_spec,
                                    target_critics=critic_networks.state_spec))

        super().__init__(observation_spec,
                         action_spec,
                         train_state_spec=train_state_spec,
                         env=env,
                         config=config,
                         debug_summaries=debug_summaries,
                         name=name)

        if actor_optimizer is not None:
            self.add_optimizer(actor_optimizer, [actor_network])
        if critic_optimizer is not None:
            self.add_optimizer(critic_optimizer, [critic_networks])

        self._actor_network = actor_network
        self._num_critic_replicas = num_critic_replicas
        self._critic_networks = critic_networks

        self._reward_weights = None
        if reward_weights:
            self._reward_weights = torch.tensor(reward_weights,
                                                dtype=torch.float32)

        self._target_actor_network = actor_network.copy(
            name='target_actor_networks')
        self._target_critic_networks = critic_networks.copy(
            name='target_critic_networks')

        self._rollout_random_action = float(rollout_random_action)

        if critic_loss_ctor is None:
            critic_loss_ctor = OneStepTDLoss
        critic_loss_ctor = functools.partial(critic_loss_ctor,
                                             debug_summaries=debug_summaries)
        self._critic_losses = [None] * num_critic_replicas
        for i in range(num_critic_replicas):
            self._critic_losses[i] = critic_loss_ctor(name=("critic_loss" +
                                                            str(i)))

        self._ou_process = common.create_ou_process(action_spec, ou_stddev,
                                                    ou_damping)

        self._update_target = common.get_target_updater(
            models=[self._actor_network, self._critic_networks],
            target_models=[
                self._target_actor_network, self._target_critic_networks
            ],
            tau=target_update_tau,
            period=target_update_period)

        self._dqda_clipping = dqda_clipping
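The roles of dqda_clipping and action_l2 in the actor loss can be seen in a small standalone PyTorch sketch (the tiny linear actor and critic are placeholders, not ALF code): the gradient of Q with respect to the action is computed, optionally clipped element-wise, and then passed back through the actor via a surrogate loss together with an optional squared-action penalty.

import torch
import torch.nn as nn

obs_dim, act_dim, dqda_clipping, action_l2 = 4, 2, 1.0, 0.01
actor = nn.Linear(obs_dim, act_dim)         # placeholder actor network
critic = nn.Linear(obs_dim + act_dim, 1)    # placeholder critic network

obs = torch.randn(8, obs_dim)
action = actor(obs)
q = critic(torch.cat([obs, action], dim=-1))

# Gradient of Q with respect to the action, optionally clipped element-wise.
dqda = torch.autograd.grad(q.sum(), action, retain_graph=True)[0]
if dqda_clipping:
    dqda = dqda.clamp(-dqda_clipping, dqda_clipping)

# Surrogate loss whose gradient w.r.t. the actor parameters equals
# -dqda * d(action)/d(theta), i.e. the deterministic policy gradient,
# plus an L2 penalty on the action magnitude.
actor_loss = (-(dqda.detach() * action).sum(dim=-1).mean()
              + action_l2 * (action ** 2).sum(dim=-1).mean())
actor_loss.backward()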
Example #4
    def __init__(self,
                 observation_spec,
                 action_spec: BoundedTensorSpec,
                 actor_network_cls=ActorDistributionNetwork,
                 critic_network_cls=CriticNetwork,
                 q_network_cls=QNetwork,
                 reward_weights=None,
                 use_entropy_reward=True,
                 use_parallel_network=False,
                 num_critic_replicas=2,
                 env=None,
                 config: TrainerConfig = None,
                 critic_loss_ctor=None,
                 target_entropy=None,
                 prior_actor_ctor=None,
                 target_kld_per_dim=3.,
                 initial_log_alpha=0.0,
                 max_log_alpha=None,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 debug_summaries=False,
                 name="SacAlgorithm"):
        """
        Args:
            observation_spec (nested TensorSpec): representing the observations.
            action_spec (nested BoundedTensorSpec): representing the actions; can
                be a mixture of discrete and continuous actions. The number of
                continuous actions can be arbitrary while only one discrete
                action is allowed currently. If it's a mixture, then it must be
                a tuple/list ``(discrete_action_spec, continuous_action_spec)``.
            actor_network_cls (Callable): is used to construct the actor network.
                The constructed actor network will be called
                to sample continuous actions. All of its output specs must be
                continuous. Note that we don't need a discrete actor network
                because a discrete action can simply be sampled from the Q values.
            critic_network_cls (Callable): is used to construct the critic network
                for estimating ``Q(s,a)`` given that the action is continuous.
            q_network_cls (Callable): is used to construct QNetwork for estimating ``Q(s,a)``
                given that the action is discrete. Its output spec must be consistent with
                the discrete action in ``action_spec``.
            reward_weights (None|list[float]): this is only used when the reward is
                multidimensional. In that case, the weighted sum of the q values
                is used for training the actor if reward_weights is not None.
                Otherwise, the sum of the q values is used.
            use_entropy_reward (bool): whether to include entropy as reward
            use_parallel_network (bool): whether to use parallel network for
                calculating critics.
            num_critic_replicas (int): number of critics to be used. Default is 2.
            env (Environment): The environment to interact with. ``env`` is a
                batched environment, which means that it runs multiple simulations
                simultaneously. ``env`` only needs to be provided to the root
                algorithm.
            config (TrainerConfig): config for training. It only needs to be
                provided to the algorithm which performs ``train_iter()`` by
                itself.
            critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
                constructor. If ``None``, a default ``OneStepTDLoss`` will be used.
            initial_log_alpha (float): initial value for variable ``log_alpha``.
            max_log_alpha (float|None): if not None, ``log_alpha`` will be
                capped at this value.
            target_entropy (float|Callable|None): If a floating value, it's the
                target average policy entropy, for updating ``alpha``. If a
                callable function, then it will be called on the action spec to
                calculate a target entropy. If ``None``, a default entropy will
                be calculated. For the mixed action type, discrete action and
                continuous action will have separate alphas and target entropies,
                so this argument can be a 2-element list/tuple, where the first
                is for discrete action and the second for continuous action.
            prior_actor_ctor (Callable): If provided, it will be called using
                ``prior_actor_ctor(observation_spec, action_spec, debug_summaries=debug_summaries)``
                to construct a prior actor. The output of the prior actor is
                the distribution of the next action. Two prior actors are implemented:
                ``alf.algorithms.prior_actor.SameActionPriorActor`` and
                ``alf.algorithms.prior_actor.UniformPriorActor``.
            target_kld_per_dim (float): ``alpha`` is dynamically adjusted so that
                the KLD is about ``target_kld_per_dim * dim``.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between
                ``[-dqda_clipping, dqda_clipping]``. Will not perform clipping if
                ``dqda_clipping == 0``.
            actor_optimizer (torch.optim.optimizer): The optimizer for actor.
            critic_optimizer (torch.optim.optimizer): The optimizer for critic.
            alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        self._num_critic_replicas = num_critic_replicas
        self._use_parallel_network = use_parallel_network

        critic_networks, actor_network, self._act_type, reward_dim = self._make_networks(
            observation_spec, action_spec, actor_network_cls,
            critic_network_cls, q_network_cls)

        self._use_entropy_reward = use_entropy_reward

        if reward_dim > 1:
            assert not use_entropy_reward, (
                "use_entropy_reward=True is not supported for multidimensional reward"
            )
            assert self._act_type == ActionType.Continuous, (
                "Only continuous action is supported for multidimensional reward"
            )

        self._reward_weights = None
        if reward_weights:
            assert reward_dim > 1, (
                "reward_weights cannot be used for one dimensional reward")
            assert len(reward_weights) == reward_dim, (
                "Mismatch between len(reward_weights)=%s and reward_dim=%s" %
                (len(reward_weights), reward_dim))
            self._reward_weights = torch.tensor(reward_weights,
                                                dtype=torch.float32)

        def _init_log_alpha():
            return nn.Parameter(torch.tensor(float(initial_log_alpha)))

        if self._act_type == ActionType.Mixed:
            # separate alphas for discrete and continuous actions
            log_alpha = type(action_spec)(
                (_init_log_alpha(), _init_log_alpha()))
        else:
            log_alpha = _init_log_alpha()

        action_state_spec = SacActionState(
            actor_network=(() if self._act_type == ActionType.Discrete else
                           actor_network.state_spec),
            critic=(() if self._act_type == ActionType.Continuous else
                    critic_networks.state_spec))
        super().__init__(
            observation_spec,
            action_spec,
            train_state_spec=SacState(
                action=action_state_spec,
                actor=(() if self._act_type != ActionType.Continuous else
                       critic_networks.state_spec),
                critic=SacCriticState(
                    critics=critic_networks.state_spec,
                    target_critics=critic_networks.state_spec)),
            predict_state_spec=SacState(action=action_state_spec),
            env=env,
            config=config,
            debug_summaries=debug_summaries,
            name=name)

        if actor_optimizer is not None:
            self.add_optimizer(actor_optimizer, [actor_network])
        if critic_optimizer is not None:
            self.add_optimizer(critic_optimizer, [critic_networks])
        if alpha_optimizer is not None:
            self.add_optimizer(alpha_optimizer, nest.flatten(log_alpha))

        self._log_alpha = log_alpha
        if self._act_type == ActionType.Mixed:
            self._log_alpha_paralist = nn.ParameterList(
                nest.flatten(log_alpha))

        if max_log_alpha is not None:
            self._max_log_alpha = torch.tensor(float(max_log_alpha))
        else:
            self._max_log_alpha = None

        self._actor_network = actor_network
        self._critic_networks = critic_networks
        self._target_critic_networks = self._critic_networks.copy(
            name='target_critic_networks')

        if critic_loss_ctor is None:
            critic_loss_ctor = OneStepTDLoss
        critic_loss_ctor = functools.partial(critic_loss_ctor,
                                             debug_summaries=debug_summaries)
        # Have different names to separate their summary curves
        self._critic_losses = []
        for i in range(num_critic_replicas):
            self._critic_losses.append(
                critic_loss_ctor(name="critic_loss%d" % (i + 1)))

        self._prior_actor = None
        if prior_actor_ctor is not None:
            assert self._act_type == ActionType.Continuous, (
                "Only continuous action is supported when using prior_actor")
            self._prior_actor = prior_actor_ctor(
                observation_spec=observation_spec,
                action_spec=action_spec,
                debug_summaries=debug_summaries)
            total_action_dims = sum(
                [spec.numel for spec in alf.nest.flatten(action_spec)])
            self._target_entropy = -target_kld_per_dim * total_action_dims
        else:
            if self._act_type == ActionType.Mixed:
                if not isinstance(target_entropy, (tuple, list)):
                    target_entropy = nest.map_structure(
                        lambda _: target_entropy, self._action_spec)
                # separate target entropies for discrete and continuous actions
                self._target_entropy = nest.map_structure(
                    lambda spec, t: _set_target_entropy(self.name, t, [spec]),
                    self._action_spec, target_entropy)
            else:
                self._target_entropy = _set_target_entropy(
                    self.name, target_entropy, nest.flatten(self._action_spec))

        self._dqda_clipping = dqda_clipping

        self._update_target = common.get_target_updater(
            models=[self._critic_networks],
            target_models=[self._target_critic_networks],
            tau=target_update_tau,
            period=target_update_period)
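The relationship between log_alpha and target_entropy follows the standard SAC temperature update: alpha is pushed up when the policy entropy falls below the target and down when it exceeds it. A minimal PyTorch sketch of that update (the common formulation from the SAC literature, not necessarily this class's exact code; log_prob stands for the log-probabilities of sampled actions):

import torch

log_alpha = torch.nn.Parameter(torch.tensor(0.0))   # initial_log_alpha = 0.0
alpha_opt = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -2.0                                # e.g. the -dim(action) heuristic

log_prob = torch.randn(8)                            # stand-in for log pi(a|s)
# If the entropy estimate (-log_prob) is below target_entropy, the gradient
# pushes log_alpha up (stronger entropy bonus); if above, it pushes it down.
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_opt.zero_grad()
alpha_loss.backward()
alpha_opt.step()

alpha = log_alpha.exp().detach()                     # weight of the entropy term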
Example #5
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_network: DistributionNetwork,
                 critic_network: Network,
                 gamma=0.99,
                 ou_stddev=0.2,
                 ou_damping=0.15,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 target_update_tau=0.05,
                 target_update_period=10,
                 dqda_clipping=None,
                 gradient_clipping=None,
                 debug_summaries=False,
                 name="SarsaAlgorithm"):
        """Create an SarsaAlgorithm.

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            observation_spec (nested TensorSpec): spec for observation.
            actor_network (Network|DistributionNetwork): The network will be
                called with call(observation, step_type). If it is a
                DistributionNetwork, an action will be sampled.
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            gamma (float): discount rate for reward
            ou_stddev (float): Only used for DDPG. Standard deviation for the
                Ornstein-Uhlenbeck (OU) noise added in the default collect policy.
            ou_damping (float): Only used for DDPG. Damping factor for the OU
                noise added in the default collect policy.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            gradient_clipping (float): Norm length to clip gradients.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        if isinstance(actor_network, DistributionNetwork):
            self._action_distribution_spec = actor_network.output_spec
        elif isinstance(actor_network, Network):
            self._action_distribution_spec = action_spec
        else:
            raise ValueError("Expect DistributionNetwork or Network for"
                             " `actor_network`, got %s" % type(actor_network))

        super().__init__(observation_spec,
                         action_spec,
                         predict_state_spec=SarsaState(
                             prev_observation=observation_spec,
                             prev_step_type=tf.TensorSpec((), tf.int32),
                             actor=actor_network.state_spec),
                         train_state_spec=SarsaState(
                             prev_observation=observation_spec,
                             prev_step_type=tf.TensorSpec((), tf.int32),
                             actor=actor_network.state_spec,
                             target_actor=actor_network.state_spec,
                             critic=critic_network.state_spec,
                             target_critic=critic_network.state_spec,
                         ),
                         optimizer=[actor_optimizer, critic_optimizer],
                         trainable_module_sets=[[actor_network],
                                                [critic_network]],
                         gradient_clipping=gradient_clipping,
                         debug_summaries=debug_summaries,
                         name=name)
        self._actor_network = actor_network
        self._critic_network = critic_network
        self._target_actor_network = actor_network.copy(
            name='target_actor_network')
        self._target_critic_network = critic_network.copy(
            name='target_critic_network')
        self._update_target = common.get_target_updater(
            models=[self._actor_network, self._critic_network],
            target_models=[
                self._target_actor_network, self._target_critic_network
            ],
            tau=target_update_tau,
            period=target_update_period)
        self._dqda_clipping = dqda_clipping
        self._gamma = gamma
        self._ou_process = create_ou_process(action_spec, ou_stddev,
                                             ou_damping)
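For reference, gamma together with the target actor and target critic forms the usual one-step TD target for training the critic. A small TF sketch of that target (illustrative only, not the library's loss code):

import tensorflow as tf

def one_step_td_target(reward, discount, next_q_target, gamma=0.99):
    """y = r + gamma * d * Q_target(s', a'), where a' comes from the target
    actor, Q_target from the target critic, and d is 0 at episode boundaries."""
    return reward + gamma * discount * next_q_target

y = one_step_td_target(tf.constant([1.0, 0.0]),
                       tf.constant([1.0, 0.0]),
                       tf.constant([2.5, 3.0]))
# The critic loss would then compare Q(s, a) against tf.stop_gradient(y).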
Example #6
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_network_ctor,
                 critic_network_ctor,
                 use_parallel_network=False,
                 num_critic_replicas=2,
                 env=None,
                 config=None,
                 critic_loss_cls=OneStepTDLoss,
                 target_entropy=None,
                 use_entropy_reward=False,
                 initial_alpha=1.0,
                 ou_stddev=0.2,
                 ou_damping=0.15,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 target_update_tau=0.05,
                 target_update_period=10,
                 use_smoothed_actor=False,
                 dqda_clipping=0.,
                 on_policy=False,
                 debug_summaries=False,
                 name="SarsaAlgorithm"):
        """
        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            observation_spec (nested TensorSpec): spec for observation.
            actor_network_ctor (Callable): Function to construct the actor network.
                ``actor_network_ctor`` needs to accept ``input_tensor_spec`` and
                ``action_spec`` as its arguments and return an actor network.
                The constructed network will be called with ``forward(observation, state)``.
            critic_network_ctor (Callable): Function to construct the critic
                network. ``critic_network_ctor`` needs to accept ``input_tensor_spec``
                which is a tuple of ``(observation_spec, action_spec)``. The
                constructed network will be called with
                ``forward((observation, action), state)``.
            use_parallel_network (bool): whether to use parallel network for
                calculating critics. This can be useful when
                ``mini_batch_size * mini_batch_length`` (when ``temporally_independent_train_step``
                is True)  or ``mini_batch_size``  (when ``temporally_independent_train_step``
                is False) is not very large. You have to test to see which way
                is faster for your particular situation.
            num_critic_replicas (int): number of critics to be used. Default is 2.
            env (Environment): The environment to interact with. ``env`` is a
                batched environment, which means that it runs multiple
                simulations simultaneously. Running multiple environments in
                parallel is crucial to on-policy algorithms as it increases the
                diversity of data and decreases temporal correlation. ``env`` only
                needs to be provided to the root ``Algorithm``.
            config (TrainerConfig): config for training. ``config`` only needs to
                be provided to the algorithm which performs ``train_iter()`` by
                itself.
            initial_alpha (float|None): If provided, will add ``-alpha*entropy``
                to the loss to encourage diverse actions.
            target_entropy (float|Callable|None): If a floating value, it's the
                target average policy entropy, for updating ``alpha``. If a
                callable function, then it will be called on the action spec to
                calculate a target entropy. If ``None``, a default entropy will
                be calculated.
            use_entropy_reward (bool): If ``True``, will use alpha*entropy as
                additional reward.
            ou_stddev (float): Only used for DDPG. Standard deviation for the
                Ornstein-Uhlenbeck (OU) noise added in the default collect policy.
            ou_damping (float): Only used for DDPG. Damping factor for the OU
                noise added in the default collect policy.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            use_smoothed_actor (bool): use a smoothed version of actor for
                predict and rollout. This option can be used if ``on_policy`` is
                ``False``.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient ``dqda`` element-wise between
                ``[-dqda_clipping, dqda_clipping]``. Does not perform clipping
                if ``dqda_clipping == 0``.
            actor_optimizer (torch.optim.Optimizer): The optimizer for actor.
            critic_optimizer (torch.optim.Optimizer): The optimizer for critic
                networks.
            alpha_optimizer (torch.optim.Optimizer): The optimizer for alpha.
                Only used if ``initial_alpha`` is not ``None``.
            on_policy (bool): whether it is used as an on-policy algorithm.
            debug_summaries (bool): ``True`` if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        critic_network = critic_network_ctor(
            input_tensor_spec=(observation_spec, action_spec))
        actor_network = actor_network_ctor(input_tensor_spec=observation_spec,
                                           action_spec=action_spec)
        flat_action_spec = alf.nest.flatten(action_spec)
        is_continuous = min(
            map(lambda spec: spec.is_continuous, flat_action_spec))
        assert is_continuous, (
            "SarsaAlgorithm only supports continuous action."
            " action_spec: %s" % action_spec)

        if use_parallel_network:
            critic_networks = critic_network.make_parallel(num_critic_replicas)
        else:
            critic_networks = alf.networks.NaiveParallelNetwork(
                critic_network, num_critic_replicas)

        self._on_policy = on_policy

        if not actor_network.is_distribution_output:
            noise_process = alf.networks.OUProcess(state_spec=action_spec,
                                                   damping=ou_damping,
                                                   stddev=ou_stddev)
            noise_state = noise_process.state_spec
        else:
            noise_process = None
            noise_state = ()

        super().__init__(observation_spec,
                         action_spec,
                         env=env,
                         config=config,
                         predict_state_spec=SarsaState(
                             noise=noise_state,
                             prev_observation=observation_spec,
                             prev_step_type=alf.TensorSpec((), torch.int32),
                             actor=actor_network.state_spec),
                         train_state_spec=SarsaState(
                             noise=noise_state,
                             prev_observation=observation_spec,
                             prev_step_type=alf.TensorSpec((), torch.int32),
                             actor=actor_network.state_spec,
                             critics=critic_networks.state_spec,
                             target_critics=critic_networks.state_spec,
                         ),
                         debug_summaries=debug_summaries,
                         name=name)
        self._actor_network = actor_network
        self._num_critic_replicas = num_critic_replicas
        self._critic_networks = critic_networks
        self._target_critic_networks = critic_networks.copy(
            name='target_critic_networks')
        self.add_optimizer(actor_optimizer, [actor_network])
        self.add_optimizer(critic_optimizer, [critic_networks])

        self._log_alpha = None
        self._use_entropy_reward = False
        if initial_alpha is not None:
            if actor_network.is_distribution_output:
                self._target_entropy = _set_target_entropy(
                    self.name, target_entropy, flat_action_spec)
                log_alpha = torch.tensor(np.log(initial_alpha),
                                         dtype=torch.float32)
                if alpha_optimizer is None:
                    self._log_alpha = log_alpha
                else:
                    self._log_alpha = nn.Parameter(log_alpha)
                    self.add_optimizer(alpha_optimizer, [self._log_alpha])
                self._use_entropy_reward = use_entropy_reward
            else:
                logging.info(
                    "initial_alpha and alpha_optimizer are ignored. "
                    "The `actor_network` needs to output a Distribution in "
                    "order to use entropy as regularization or reward")

        models = copy.copy(critic_networks)
        target_models = copy.copy(self._target_critic_networks)

        self._rollout_actor_network = self._actor_network
        if use_smoothed_actor:
            assert not on_policy, ("use_smoothed_actor can only be used in "
                                   "off-policy training")
            self._rollout_actor_network = actor_network.copy(
                name='rollout_actor_network')
            models.append(self._actor_network)
            target_models.append(self._rollout_actor_network)

        self._update_target = common.get_target_updater(
            models=models,
            target_models=target_models,
            tau=target_update_tau,
            period=target_update_period)

        self._dqda_clipping = dqda_clipping

        self._noise_process = noise_process
        self._critic_losses = []
        for i in range(num_critic_replicas):
            self._critic_losses.append(
                critic_loss_cls(debug_summaries=debug_summaries and i == 0))

        self._is_rnn = len(alf.nest.flatten(critic_network.state_spec)) > 0
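When use_entropy_reward=True (and the actor outputs a distribution), an entropy bonus weighted by alpha is added to the environment reward. A minimal sketch of that augmentation (illustrative; reward and log_prob are placeholder tensors, not names from the source):

import torch

log_alpha = torch.tensor(0.0)                  # log of the entropy weight alpha
reward = torch.tensor([1.0, 0.5, 0.0])         # environment rewards for a batch of steps
log_prob = torch.tensor([-1.2, -0.3, -2.0])    # log pi(a|s) of the actions actually taken

# -log pi(a|s) is a per-sample estimate of the policy entropy.
augmented_reward = reward + log_alpha.exp() * (-log_prob)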
Example #7
    def __init__(
            self,
            observation_spec,
            action_spec: BoundedTensorSpec,
            critic_network: MdqCriticNetwork,
            env=None,
            config: TrainerConfig = None,
            critic_loss_ctor=None,
            target_entropy=dist_utils.calc_default_target_entropy_quantized,
            initial_log_alpha=0.0,
            target_update_tau=0.05,
            target_update_period=1,
            distill_noise=0.01,
            critic_optimizer=None,
            alpha_optimizer=None,
            debug_summaries=False,
            name="MdqAlgorithm"):
        """
        Args:
            observation_spec (nested TensorSpec): representing the observations.
            action_spec (nested BoundedTensorSpec): representing the actions.
            critic_network (MdqCriticNetwork): an instance of MdqCriticNetwork
            env (Environment): The environment to interact with. ``env`` is a
                batched environment, which means that it runs multiple simulations
                simultaneously. ``env`` only needs to be provided to the root
                algorithm.
            config (TrainerConfig): config for training. It only needs to be
                provided to the algorithm which performs ``train_iter()`` by
                itself.
            critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
                constructor. If ``None``, a default ``OneStepTDLoss`` will be used.
            initial_log_alpha (float): initial value for variable ``log_alpha``.
            target_entropy (float|Callable): If a floating value, it's the
                target average policy entropy, for updating ``alpha``. If a
                callable function, then it will be called on the action spec to
                calculate a target entropy. Note that in MDQ algorithm, as the
                continuous action is represented by a discrete distribution for
                each action dimension, ``calc_default_target_entropy_quantized``
                is used to compute the target entropy by default.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            distill_noise (float): the std of random Gaussian noise added to the
                action used for distillation.
            critic_optimizer (torch.optim.optimizer): The optimizer for critic.
            alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """

        critic_networks = critic_network
        target_critic_networks = critic_networks.copy(
            name='target_critic_networks')

        train_state_spec = MdqState(
            critic=MdqCriticState(critic=critic_networks.state_spec,
                                  target_critic=critic_networks.state_spec))

        super().__init__(observation_spec,
                         action_spec,
                         train_state_spec=train_state_spec,
                         env=env,
                         config=config,
                         debug_summaries=debug_summaries,
                         name=name)

        self._critic_networks = critic_networks
        self._target_critic_networks = target_critic_networks

        self.add_optimizer(critic_optimizer, [critic_networks])

        if critic_loss_ctor is None:
            critic_loss_ctor = OneStepTDLoss
        critic_loss_ctor = functools.partial(critic_loss_ctor,
                                             debug_summaries=debug_summaries)

        flat_action_spec = nest.flatten(self._action_spec)
        self._flat_action_spec = flat_action_spec
        self._action_dim = flat_action_spec[0].shape[0]
        self._log_pi_uniform_prior = self._critic_networks.get_uniform_prior_logpi(
        )

        self._num_critic_replicas = self._critic_networks._num_critic_replicas

        self._critic_losses = []

        for i in range(self._num_critic_replicas):
            self._critic_losses.append(
                critic_loss_ctor(name="critic_loss%d" % (i + 1)))

        self._is_continuous = flat_action_spec[0].is_continuous
        self._target_entropy = _set_target_entropy(self.name, target_entropy,
                                                   flat_action_spec)

        log_alpha = nn.Parameter(torch.Tensor([float(initial_log_alpha)]))
        self._log_alpha = log_alpha

        self._update_target = common.get_target_updater(
            models=[self._critic_networks],
            target_models=[self._target_critic_networks],
            tau=target_update_tau,
            period=target_update_period)

        if alpha_optimizer is not None:
            self.add_optimizer(alpha_optimizer, [log_alpha])
        self._distill_noise = distill_noise
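distill_noise perturbs the actions used for distillation with small Gaussian noise. A sketch of that perturbation, clipped back into the action bounds (illustrative; action_low and action_high are assumed bounds, not names from the source):

import torch

distill_noise = 0.01
action = torch.tensor([[0.2, -0.7]])
action_low, action_high = -1.0, 1.0            # assumed action bounds

noisy_action = action + distill_noise * torch.randn_like(action)
noisy_action = noisy_action.clamp(action_low, action_high)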
Example #8
    def __init__(self,
                 output_dim,
                 noise_dim=32,
                 input_tensor_spec=None,
                 hidden_layers=(256, ),
                 net: Network = None,
                 net_moving_average_rate=None,
                 entropy_regularization=0.,
                 mi_weight=None,
                 mi_estimator_cls=MIEstimator,
                 par_vi="gfsf",
                 optimizer=None,
                 name="Generator"):
        r"""Create a Generator.

        Args:
            output_dim (int): dimension of output
            noise_dim (int): dimension of noise
            input_tensor_spec (nested TensorSpec): spec of inputs. If there is
                no inputs, this should be None.
            hidden_layers (tuple): size of hidden layers.
            net (Network): network for generating outputs from [noise, inputs]
                or noise (if inputs is None). If None, a default one with
                hidden_layers will be created
            net_moving_average_rate (float): If provided, use a moving average
                version of net to do prediction. This has been shown to be
                effective for GAN training (arXiv:1907.02544, arXiv:1812.04948).
            entropy_regularization (float): weight of entropy regularization
            mi_weight (float|None): weight of the mutual information loss. If
                None, mutual information maximization is disabled.
            mi_estimator_cls (type): the class of mutual information estimator
                for maximizing the mutual information between [noise, inputs]
                and [outputs, inputs].
            par_vi (string): ParVI methods, options are
                [``svgd``, ``svgd2``, ``svgd3``, ``gfsf``].
                * svgd: empirical expectation of SVGD is evaluated by a single
                    resampled particle. The main benefit of this choice is that
                    it supports the conditional case, while all other options
                    do not.
                * svgd2: empirical expectation of SVGD is evaluated by splitting
                    half of the sampled batch. It is a trade-off between
                    computational efficiency and convergence speed.
                * svgd3: empirical expectation of SVGD is evaluated by
                    resampled particles of the same batch size. It has better
                    convergence but involves resampling, so it is computationally
                    less efficient than svgd2.
                * gfsf: Wasserstein gradient flow with smoothed functions. It
                    involves a kernel matrix inversion, so it is computationally
                    the most expensive, but in some cases convergence seems
                    faster than the svgd approaches.
            optimizer (torch.optim.Optimizer): (optional) optimizer for training
            name (str): name of this generator
        """
        super().__init__(train_state_spec=(), optimizer=optimizer, name=name)
        self._noise_dim = noise_dim
        self._entropy_regularization = entropy_regularization
        self._par_vi = par_vi
        if entropy_regularization == 0:
            self._grad_func = self._ml_grad
        else:
            if par_vi == 'gfsf':
                self._grad_func = self._gfsf_grad
            elif par_vi == 'svgd':
                self._grad_func = self._svgd_grad
            elif par_vi == 'svgd2':
                self._grad_func = self._svgd_grad2
            elif par_vi == 'svgd3':
                self._grad_func = self._svgd_grad3
            else:
                raise ValueError("Unsupported par_vi method: %s" % par_vi)

            self._kernel_width_averager = AdaptiveAverager(
                tensor_spec=TensorSpec(shape=()))

        noise_spec = TensorSpec(shape=(noise_dim, ))

        if net is None:
            net_input_spec = noise_spec
            if input_tensor_spec is not None:
                net_input_spec = [net_input_spec, input_tensor_spec]
            net = EncodingNetwork(input_tensor_spec=net_input_spec,
                                  fc_layer_params=hidden_layers,
                                  last_layer_size=output_dim,
                                  last_activation=math_ops.identity,
                                  name="Generator")

        self._mi_estimator = None
        self._input_tensor_spec = input_tensor_spec
        if mi_weight is not None:
            x_spec = noise_spec
            y_spec = TensorSpec((output_dim, ))
            if input_tensor_spec is not None:
                x_spec = [x_spec, input_tensor_spec]
            self._mi_estimator = mi_estimator_cls(x_spec,
                                                  y_spec,
                                                  sampler='shift')
            self._mi_weight = mi_weight
        self._net = net
        self._predict_net = None
        self._net_moving_average_rate = net_moving_average_rate
        if net_moving_average_rate:
            self._predict_net = net.copy(name="Genrator_average")
            self._predict_net_updater = common.get_target_updater(
                self._net, self._predict_net, tau=net_moving_average_rate)
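net_moving_average_rate keeps an exponentially averaged copy of the generator network for prediction. A small sketch of such a moving-average update (illustrative; the nn.Linear modules stand in for the generator network and its averaged copy):

import copy
import torch
import torch.nn as nn

net = nn.Linear(32, 8)               # stand-in for the generator network
predict_net = copy.deepcopy(net)     # averaged copy used for prediction
rate = 0.001                         # net_moving_average_rate

@torch.no_grad()
def update_average(net, predict_net, rate):
    """predict_net <- rate * net + (1 - rate) * predict_net, parameter-wise."""
    for p, p_avg in zip(net.parameters(), predict_net.parameters()):
        p_avg.mul_(1.0 - rate).add_(rate * p)

update_average(net, predict_net, rate)   # call after each generator update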
Example #9
    def __init__(self,
                 observation_spec,
                 action_spec,
                 model_ctor,
                 mcts_algorithm_ctor,
                 num_unroll_steps,
                 td_steps,
                 recurrent_gradient_scaling_factor=0.5,
                 reward_normalizer=None,
                 reward_clip_value=-1.,
                 train_reward_function=True,
                 train_game_over_function=True,
                 reanalyze_ratio=0.,
                 reanalyze_td_steps=5,
                 reanalyze_batch_size=None,
                 data_transformer_ctor=None,
                 target_update_tau=1.,
                 target_update_period=1000,
                 debug_summaries=False,
                 name="MuZero"):
        """
        Args:
            observation_spec (TensorSpec): representing the observations.
            action_spec (BoundedTensorSpec): representing the actions.
            model_ctor (Callable): will be called as
                ``model_ctor(observation_spec=?, action_spec=?, debug_summaries=?)``
                to construct the model. The model should follow the interface
                ``alf.algorithms.mcts_models.MCTSModel``.
            mcts_algorithm_ctor (Callable): will be called as
                ``mcts_algorithm_ctor(observation_spec=?, action_spec=?, debug_summaries=?)``
                to construct an ``MCTSAlgorithm`` instance.
            num_unroll_steps (int): steps for unrolling the model during training.
            td_steps (int): bootstrap so many steps into the future for calculating
                the discounted return. -1 means to bootstrap to the end of the game.
                Can only be used for environments whose rewards are zero except
                for the last step, as the current implementation only uses the
                reward at the last step to calculate the return.
            recurrent_gradient_scaling_factor (float): the gradient going through
                ``model.recurrent_inference`` is scaled by this factor, as
                suggested in Appendix G of the MuZero paper.
            reward_normalizer (Normalizer|None): if provided, will be used to
                normalize reward.
            train_reward_function (bool): whether to train the reward function.
                If False, reward should only be given at the last step of an
                episode.
            train_game_over_function (bool): whether to train the game-over
                function.
            reanalyze_ratio (float): float number in [0., 1.], the portion of
                data retrieved from the replay buffer to reanalyze. Reanalyzing
                means using a recent model to recalculate the value and policy
                targets.
            reanalyze_td_steps (int): the n for the n-step return for reanalyzing.
            reanalyze_batch_size (int|None): the memory usage may be too much for
                reanalyzing all the data for one training iteration. If so, provide
                a number for this so that the data will be reanalyzed in several
                batches.
            data_transformer_ctor (Callable|list[Callable]): should be same as
                ``TrainerConfig.data_transformer_ctor``.
            target_update_tau (float): Factor for soft update of the target
                networks used for reanalyzing.
            target_update_period (int): Period for soft update of the target
                networks used for reanalyzing.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        model = model_ctor(observation_spec,
                           action_spec,
                           debug_summaries=debug_summaries)
        mcts = mcts_algorithm_ctor(observation_spec=observation_spec,
                                   action_spec=action_spec,
                                   debug_summaries=debug_summaries)
        mcts.set_model(model)
        self._device = alf.get_default_device()
        super().__init__(observation_spec=observation_spec,
                         action_spec=action_spec,
                         train_state_spec=mcts.predict_state_spec,
                         predict_state_spec=mcts.predict_state_spec,
                         rollout_state_spec=mcts.predict_state_spec,
                         debug_summaries=debug_summaries,
                         name=name)

        self._mcts = mcts
        self._model = model
        self._num_unroll_steps = num_unroll_steps
        self._td_steps = td_steps
        self._discount = mcts.discount
        self._recurrent_gradient_scaling_factor = recurrent_gradient_scaling_factor
        self._reward_normalizer = reward_normalizer
        self._reward_clip_value = reward_clip_value
        self._train_reward_function = train_reward_function
        self._train_game_over_function = train_game_over_function
        self._reanalyze_ratio = reanalyze_ratio
        self._reanalyze_td_steps = reanalyze_td_steps
        self._reanalyze_batch_size = reanalyze_batch_size
        self._data_transformer = None
        self._data_transformer_ctor = data_transformer_ctor

        self._update_target = None
        if reanalyze_ratio > 0:
            self._target_model = model_ctor(observation_spec,
                                            action_spec,
                                            debug_summaries=debug_summaries)
            self._update_target = common.get_target_updater(
                models=[self._model],
                target_models=[self._target_model],
                tau=target_update_tau,
                period=target_update_period)
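The effect of td_steps can be illustrated with a small NumPy sketch of an n-step bootstrapped return used to form value targets (illustrative only; it ignores reanalysis and reward normalization and is not this class's exact target computation):

import numpy as np

def n_step_return(rewards, values, t, td_steps, discount):
    """Discounted return from step t, bootstrapping with values[t + td_steps].
    td_steps == -1 means bootstrapping to the end of the episode (no value term)."""
    if td_steps == -1:
        td_steps = len(rewards) - t
    horizon = min(td_steps, len(rewards) - t)
    ret = sum(discount ** i * rewards[t + i] for i in range(horizon))
    if t + td_steps < len(rewards):
        ret += discount ** td_steps * values[t + td_steps]
    return ret

rewards = np.array([0.0, 0.0, 1.0, 0.0])
values = np.array([0.3, 0.5, 0.8, 0.2])
target = n_step_return(rewards, values, t=0, td_steps=2, discount=0.99)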