def __init__(self,
             train_state_spec,
             action_spec,
             feature_spec,
             hidden_size=256,
             num_replicas=1,
             dynamics_network: DynamicsNetwork = None,
             name="DynamicsLearningAlgorithm"):
    """Create a DynamicsLearningAlgorithm.

    Args:
        hidden_size (int|tuple): size of hidden layer(s)
        dynamics_network (Network): network for predicting the change of the
            next feature based on the previous feature and action. It should
            accept input with spec of the format
            [feature_spec, encoded_action_spec] and output a tensor of the
            shape feature_spec. For the discrete action case, encoded_action
            is a one-hot representation of the action. For continuous action,
            encoded_action is the original action.
    """
    super().__init__(train_state_spec=train_state_spec, name=name)

    flat_action_spec = nest.flatten(action_spec)
    assert len(flat_action_spec) == 1, "doesn't support nested action_spec"

    flat_feature_spec = nest.flatten(feature_spec)
    assert len(flat_feature_spec) == 1, "doesn't support nested feature_spec"

    action_spec = flat_action_spec[0]

    if action_spec.is_discrete:
        self._num_actions = action_spec.maximum - action_spec.minimum + 1
    else:
        self._num_actions = action_spec.shape[-1]

    self._action_spec = action_spec
    self._feature_spec = feature_spec
    self._num_replicas = num_replicas

    if isinstance(hidden_size, int):
        hidden_size = (hidden_size, )

    if dynamics_network is None:
        encoded_action_spec = TensorSpec((self._num_actions, ),
                                         dtype=torch.float32)
        dynamics_network = DynamicsNetwork(
            name="dynamics_net",
            input_tensor_spec=(feature_spec, encoded_action_spec),
            preprocessing_combiner=NestConcat(),
            fc_layer_params=hidden_size,
            output_tensor_spec=flat_feature_spec[0])

    if num_replicas > 1:
        self._dynamics_network = dynamics_network.make_parallel(num_replicas)
    else:
        self._dynamics_network = dynamics_network
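# Illustrative sketch (standalone, values and shapes assumed): for a discrete
# action_spec with 4 actions, the "encoded_action" fed to DynamicsNetwork is a
# float one-hot vector; a continuous action would be passed through unchanged.
import torch
import torch.nn.functional as F

num_actions = 4
action = torch.tensor([2])                                # batch of one discrete action
encoded_action = F.one_hot(action, num_actions).to(torch.float32)
print(encoded_action)                                     # tensor([[0., 0., 1., 0.]])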
def _actor_train_step(self, exp: Experience, state, action, critics, log_pi,
                      action_distribution):
    neg_entropy = sum(nest.flatten(log_pi))

    if self._act_type == ActionType.Discrete:
        # The pure discrete case doesn't need to learn an actor network.
        return (), LossInfo(extra=SacActorInfo(neg_entropy=neg_entropy))

    if self._act_type == ActionType.Continuous:
        critics, critics_state = self._compute_critics(
            self._critic_networks, exp.observation, action, state)
        if critics.ndim == 3:
            # Multidimensional reward: [B, num_critic_replicas, reward_dim]
            if self._reward_weights is None:
                critics = critics.sum(dim=2)
            else:
                critics = torch.tensordot(critics, self._reward_weights,
                                          dims=1)
        target_q_value = critics.min(dim=1)[0]
        continuous_log_pi = log_pi
        cont_alpha = torch.exp(self._log_alpha).detach()
    else:
        # For the Mixed type, use the critics computed during action
        # prediction.
        critics_state = ()
        discrete_act_dist = action_distribution[0]
        discrete_entropy = discrete_act_dist.entropy()
        # critics has already been min-ed over replicas
        weighted_q_value = torch.sum(discrete_act_dist.probs * critics,
                                     dim=-1)
        discrete_alpha = torch.exp(self._log_alpha[0]).detach()
        target_q_value = weighted_q_value + discrete_alpha * discrete_entropy
        action, continuous_log_pi = action[1], log_pi[1]
        cont_alpha = torch.exp(self._log_alpha[1]).detach()

    dqda = nest_utils.grad(action, target_q_value.sum())

    def actor_loss_fn(dqda, action):
        if self._dqda_clipping:
            dqda = torch.clamp(dqda, -self._dqda_clipping,
                               self._dqda_clipping)
        loss = 0.5 * losses.element_wise_squared_loss(
            (dqda + action).detach(), action)
        return loss.sum(list(range(1, loss.ndim)))

    actor_loss = nest.map_structure(actor_loss_fn, dqda, action)
    actor_loss = math_ops.add_n(nest.flatten(actor_loss))
    actor_info = LossInfo(
        loss=actor_loss + cont_alpha * continuous_log_pi,
        extra=SacActorInfo(actor_loss=actor_loss, neg_entropy=neg_entropy))
    return critics_state, actor_info
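# Minimal standalone check (values assumed for illustration): the surrogate loss
# 0.5 * ((dqda + a).detach() - a)^2 used in actor_loss_fn has gradient -dqda with
# respect to the action, so minimizing it moves the action in the direction that
# increases the Q value (a DPG-style update) without back-propagating through the
# critic again.
import torch

a = torch.tensor([0.3, -0.7], requires_grad=True)
dqda = torch.tensor([1.5, -2.0])                   # pretend dQ/da computed elsewhere
loss = (0.5 * ((dqda + a).detach() - a)**2).sum()
loss.backward()
print(a.grad)                                      # tensor([-1.5000,  2.0000]) == -dqda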
def __init__(self,
             feature_spec,
             action_spec,
             train_state_spec,
             planning_horizon=25,
             upper_bound=None,
             lower_bound=None,
             name="PlanningAlgorithm"):
    """Create a PlanningAlgorithm.

    Args:
        planning_horizon (int): planning horizon in terms of time steps
        upper_bound (int): upper bound for elements in solution;
            action_spec.maximum will be used if not specified
        lower_bound (int): lower bound for elements in solution;
            action_spec.minimum will be used if not specified
    """
    super().__init__(
        feature_spec,
        action_spec,
        train_state_spec=train_state_spec,
        name=name)

    flat_action_spec = nest.flatten(action_spec)
    assert len(flat_action_spec) == 1, "doesn't support nested action_spec"

    flat_feature_spec = nest.flatten(feature_spec)
    assert len(flat_feature_spec) == 1, "doesn't support nested feature_spec"

    action_spec = flat_action_spec[0]

    assert action_spec.is_continuous, "only support continuous control"

    self._num_actions = action_spec.shape[-1]
    self._action_spec = action_spec
    self._feature_spec = feature_spec
    self._planning_horizon = planning_horizon
    self._upper_bound = action_spec.maximum if upper_bound is None \
        else upper_bound
    self._lower_bound = action_spec.minimum if lower_bound is None \
        else lower_bound

    self._reward_func = None
    self._dynamics_func = None
    self._step_eval_func = None  # per-step evaluation function
def __init__(self,
             feature_spec,
             action_spec,
             population_size,
             planning_horizon,
             upper_bound=None,
             lower_bound=None,
             hidden_size=256,
             name="RandomShootingAlgorithm"):
    """Create a RandomShootingAlgorithm.

    Args:
        population_size (int): the size of the population for random shooting
        planning_horizon (int): planning horizon in terms of time steps
        upper_bound (int): upper bound for elements in solution;
            action_spec.maximum will be used if not specified
        lower_bound (int): lower bound for elements in solution;
            action_spec.minimum will be used if not specified
        hidden_size (int|tuple): size of hidden layer(s)
    """
    super().__init__(
        feature_spec=feature_spec,
        action_spec=action_spec,
        train_state_spec=(),
        planning_horizon=planning_horizon,
        upper_bound=upper_bound,
        lower_bound=lower_bound,
        name=name)

    flat_action_spec = nest.flatten(action_spec)
    assert len(flat_action_spec) == 1, ("RandomShootingAlgorithm doesn't "
                                        "support nested action_spec")

    flat_feature_spec = nest.flatten(feature_spec)
    assert len(flat_feature_spec) == 1, ("RandomShootingAlgorithm doesn't "
                                         "support nested feature_spec")

    self._population_size = population_size
    solution_size = self._planning_horizon * self._num_actions
    self._plan_optimizer = RandomOptimizer(
        solution_size,
        self._population_size,
        upper_bound=action_spec.maximum,
        lower_bound=action_spec.minimum)
def train_step(self, exp: Experience, state: SacState):
    # We detach exp.observation here so that in the case that exp.observation
    # is calculated by some other trainable module, the training of that
    # module will not be affected by the gradient back-propagated from the
    # actor. However, the gradient from the critic will still affect the
    # training of that module.
    (action_distribution, action, critics,
     action_state) = self._predict_action(
         common.detach(exp.observation), state=state.action)

    log_pi = nest.map_structure(lambda dist, a: dist.log_prob(a),
                                action_distribution, action)

    if self._act_type == ActionType.Mixed:
        # For the mixed type, sum up the log_pi's of the discrete and
        # continuous actions separately.
        log_pi = type(self._action_spec)((sum(nest.flatten(log_pi[0])),
                                          sum(nest.flatten(log_pi[1]))))
    else:
        log_pi = sum(nest.flatten(log_pi))

    if self._prior_actor is not None:
        prior_step = self._prior_actor.train_step(exp, ())
        log_prior = dist_utils.compute_log_probability(
            prior_step.output, action)
        log_pi = log_pi - log_prior

    actor_state, actor_loss = self._actor_train_step(
        exp, state.actor, action, critics, log_pi, action_distribution)
    critic_state, critic_info = self._critic_train_step(
        exp, state.critic, action, log_pi, action_distribution)
    alpha_loss = self._alpha_train_step(log_pi)

    state = SacState(
        action=action_state, actor=actor_state, critic=critic_state)
    info = SacInfo(
        action_distribution=action_distribution,
        actor=actor_loss,
        critic=critic_info,
        alpha=alpha_loss)
    return AlgStep(action, state, info)
def __init__(self,
             feature_spec,
             action_spec,
             population_size,
             planning_horizon=25,
             upper_bound=None,
             lower_bound=None,
             name="RandomShootingAlgorithm"):
    """Create a RandomShootingAlgorithm.

    Args:
        population_size (int): the size of the population for random shooting
        planning_horizon (int): planning horizon in terms of time steps
        upper_bound (int): upper bound for elements in solution;
            action_spec.maximum will be used if not specified
        lower_bound (int): lower bound for elements in solution;
            action_spec.minimum will be used if not specified
    """
    super().__init__(
        feature_spec=feature_spec,
        action_spec=action_spec,
        planning_horizon=planning_horizon,
        upper_bound=upper_bound,
        lower_bound=lower_bound,
        name=name)

    flat_action_spec = nest.flatten(action_spec)
    assert len(flat_action_spec) == 1, ("RandomShootingAlgorithm doesn't "
                                        "support nested action_spec")

    self._population_size = population_size
    solution_size = self._planning_horizon * self._num_actions
    self._solution_size = solution_size

    # expand the per-step action bounds to bounds for the full solution
    solution_upper_bound = self._upper_bound.unsqueeze(0).expand(
        planning_horizon, *self._upper_bound.shape).reshape(-1)
    solution_lower_bound = self._lower_bound.unsqueeze(0).expand(
        planning_horizon, *self._lower_bound.shape).reshape(-1)

    self._plan_optimizer = RandomOptimizer(
        solution_size,
        self._population_size,
        upper_bound=solution_upper_bound,
        lower_bound=solution_lower_bound,
        cost_func=self._calc_cost_for_action_sequence)
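# Illustrative sketch (shapes assumed): the per-step action bounds are tiled across
# the planning horizon so that the flat solution vector of length
# planning_horizon * num_actions gets a matching bound for every entry.
import torch

planning_horizon = 3
upper_bound = torch.tensor([1.0, 2.0])               # bounds of a 2-dim action
solution_upper_bound = upper_bound.unsqueeze(0).expand(
    planning_horizon, *upper_bound.shape).reshape(-1)
print(solution_upper_bound)                           # tensor([1., 2., 1., 2., 1., 2.])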
def __init__(self,
             feature_spec,
             action_spec,
             planning_horizon=25,
             upper_bound=None,
             lower_bound=None,
             name="PlanningAlgorithm"):
    """Create a PlanningAlgorithm.

    Args:
        planning_horizon (int): planning horizon in terms of time steps
        upper_bound (int): upper bound for elements in solution;
            action_spec.maximum will be used if not specified
        lower_bound (int): lower bound for elements in solution;
            action_spec.minimum will be used if not specified
    """
    super().__init__(
        feature_spec,
        action_spec,
        train_state_spec=PlannerState(
            prev_plan=TensorSpec((planning_horizon,
                                  action_spec.shape[-1]))),
        name=name)

    flat_action_spec = nest.flatten(action_spec)
    assert len(flat_action_spec) == 1, "doesn't support nested action_spec"

    action_spec = flat_action_spec[0]

    assert action_spec.is_continuous, "only support continuous control"

    self._num_actions = action_spec.shape[-1]
    self._action_spec = action_spec
    self._feature_spec = feature_spec
    self._planning_horizon = planning_horizon
    self._upper_bound = torch.Tensor(action_spec.maximum) \
        if upper_bound is None else upper_bound
    self._lower_bound = torch.Tensor(action_spec.minimum) \
        if lower_bound is None else lower_bound

    self._action_seq_cost_func = None
def _critic_train_step(self, exp: Experience, state: SacCriticState, action,
                       log_pi, action_distribution):
    critics, critics_state = self._compute_critics(
        self._critic_networks, exp.observation, exp.action, state.critics)

    target_critics, target_critics_state = self._compute_critics(
        self._target_critic_networks, exp.observation, action,
        state.target_critics)
    target_critics = target_critics.min(dim=1)[0]

    if self._act_type == ActionType.Discrete:
        critics = self._select_q_value(exp.action, critics)
        target_critics = self._select_q_value(
            action, target_critics.unsqueeze(dim=1))
    elif self._act_type == ActionType.Mixed:
        critics = self._select_q_value(exp.action[0], critics)
        discrete_act_dist = action_distribution[0]
        target_critics = torch.sum(
            discrete_act_dist.probs * target_critics, dim=-1)

    target_critic = target_critics.reshape(exp.reward.shape)

    if self._use_entropy_reward:
        entropy_reward = nest.map_structure(
            lambda la, lp: -torch.exp(la) * lp, self._log_alpha, log_pi)
        entropy_reward = sum(nest.flatten(entropy_reward))
        target_critic = target_critic + entropy_reward

    target_critic = target_critic.detach()

    state = SacCriticState(
        critics=critics_state, target_critics=target_critics_state)
    info = SacCriticInfo(critics=critics, target_critic=target_critic)

    return state, info
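# Numeric sketch (values assumed): with use_entropy_reward=True the bootstrapped
# target receives an extra -alpha * log_pi term, i.e. the soft value
# Q(s', a') - alpha * log pi(a'|s') of SAC's soft Bellman backup.
import torch

log_alpha = torch.tensor(0.0)                        # alpha = 1.0
log_pi = torch.tensor(-1.2)                          # log-prob of the sampled action
target_q = torch.tensor(5.0)
entropy_reward = -torch.exp(log_alpha) * log_pi      # = 1.2
soft_target = (target_q + entropy_reward).detach()
print(soft_target)                                   # tensor(6.2000)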
def _actor_train_step(self, exp: Experience, state: DdpgActorState):
    action, actor_state = self._actor_network(
        exp.observation, state=state.actor)

    q_values, critic_states = self._critic_networks(
        (exp.observation, action), state=state.critics)
    if q_values.ndim == 3:
        # Multidimensional reward: [B, num_critic_replicas, reward_dim]
        if self._reward_weights is None:
            q_values = q_values.sum(dim=2)
        else:
            q_values = torch.tensordot(q_values, self._reward_weights,
                                       dims=1)

    if self._num_critic_replicas > 1:
        q_value = q_values.min(dim=1)[0]
    else:
        q_value = q_values.squeeze(dim=1)

    dqda = nest_utils.grad(action, q_value.sum())

    def actor_loss_fn(dqda, action):
        if self._dqda_clipping:
            dqda = torch.clamp(dqda, -self._dqda_clipping,
                               self._dqda_clipping)
        loss = 0.5 * losses.element_wise_squared_loss(
            (dqda + action).detach(), action)
        if self._action_l2 > 0:
            assert action.requires_grad
            loss += self._action_l2 * (action**2)
        loss = loss.sum(list(range(1, loss.ndim)))
        return loss

    actor_loss = nest.map_structure(actor_loss_fn, dqda, action)
    state = DdpgActorState(actor=actor_state, critics=critic_states)
    info = LossInfo(loss=sum(nest.flatten(actor_loss)), extra=actor_loss)

    return AlgStep(output=action, state=state, info=info)
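# Sketch of the multidimensional-reward reduction (shapes assumed): q_values has
# shape [B, num_critic_replicas, reward_dim]; tensordot with the reward weights
# contracts the last axis, giving [B, num_critic_replicas] before the min (or
# squeeze) over replicas.
import torch

q_values = torch.randn(4, 2, 3)                      # B=4, 2 replicas, 3 reward dims
reward_weights = torch.tensor([1.0, 0.5, 0.1])
weighted = torch.tensordot(q_values, reward_weights, dims=1)
print(weighted.shape)                                # torch.Size([4, 2])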
def _alpha_train_step(self, log_pi):
    alpha_loss = nest.map_structure(
        lambda la, lp, t: la * (-lp - t).detach(), self._log_alpha, log_pi,
        self._target_entropy)
    return sum(nest.flatten(alpha_loss))
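# Minimal check (values assumed): the gradient of the alpha loss w.r.t. log_alpha is
# (-log_pi - target_entropy), so gradient descent increases log_alpha (more
# exploration) when the entropy estimate -log_pi falls below the target, and
# decreases it otherwise.
import torch

log_alpha = torch.tensor(0.0, requires_grad=True)
log_pi = torch.tensor(-1.0)                          # entropy estimate -log_pi = 1.0
target_entropy = 2.0
loss = log_alpha * (-log_pi - target_entropy).detach()
loss.backward()
print(log_alpha.grad)                                # tensor(-1.) -> descent raises alpha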
def _check_spec_equal(spec1, spec2):
    assert nest.flatten(spec1) == nest.flatten(spec2), (
        "Unmatched action specs: {} vs. {}".format(spec1, spec2))
def _make_networks(self, observation_spec, action_spec,
                   continuous_actor_network_cls, critic_network_cls,
                   q_network_cls):
    def _make_parallel(net):
        if self._use_parallel_network:
            nets = net.make_parallel(self._num_critic_replicas)
        else:
            nets = alf.networks.NaiveParallelNetwork(
                net, self._num_critic_replicas)
        return nets

    def _check_spec_equal(spec1, spec2):
        assert nest.flatten(spec1) == nest.flatten(spec2), (
            "Unmatched action specs: {} vs. {}".format(spec1, spec2))

    discrete_action_spec = [
        spec for spec in nest.flatten(action_spec) if spec.is_discrete
    ]
    continuous_action_spec = [
        spec for spec in nest.flatten(action_spec) if spec.is_continuous
    ]

    if discrete_action_spec and continuous_action_spec:
        # When there are both continuous and discrete actions, we require
        # that action_spec is a tuple/list ``(discrete, continuous)``.
        assert (isinstance(action_spec, (tuple, list))
                and len(action_spec) == 2), (
                    "In the mixed case, the action spec must be a tuple/list"
                    " (discrete_action_spec, continuous_action_spec)!")
        _check_spec_equal(action_spec[0], discrete_action_spec)
        _check_spec_equal(action_spec[1], continuous_action_spec)
        discrete_action_spec = action_spec[0]
        continuous_action_spec = action_spec[1]
    elif discrete_action_spec:
        discrete_action_spec = action_spec
    elif continuous_action_spec:
        continuous_action_spec = action_spec

    actor_network = None
    reward_dim = 1
    if continuous_action_spec:
        assert continuous_actor_network_cls is not None, (
            "If there are continuous actions, then an "
            "ActorDistributionNetwork must be provided for sampling "
            "continuous actions!")
        actor_network = continuous_actor_network_cls(
            input_tensor_spec=observation_spec,
            action_spec=continuous_action_spec)
        if not discrete_action_spec:
            act_type = ActionType.Continuous
            assert critic_network_cls is not None, (
                "If only continuous actions exist, then a CriticNetwork must"
                " be provided!")
            critic_network = critic_network_cls(
                input_tensor_spec=(observation_spec, continuous_action_spec))
            reward_dim = critic_network.output_spec.numel
            critic_networks = _make_parallel(critic_network)

    if discrete_action_spec:
        assert reward_dim == 1, (
            "Discrete action is not supported for multidimensional reward")
        act_type = ActionType.Discrete
        assert len(alf.nest.flatten(discrete_action_spec)) == 1, (
            "Only support at most one discrete action currently! "
            "Discrete action spec: {}".format(discrete_action_spec))
        assert q_network_cls is not None, (
            "If there exists a discrete action, then QNetwork must "
            "be provided!")
        if continuous_action_spec:
            act_type = ActionType.Mixed
            q_network = q_network_cls(
                input_tensor_spec=(observation_spec, continuous_action_spec),
                action_spec=discrete_action_spec)
        else:
            q_network = q_network_cls(
                input_tensor_spec=observation_spec, action_spec=action_spec)
        critic_networks = _make_parallel(q_network)

    return critic_networks, actor_network, act_type, reward_dim
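# Hypothetical mixed action spec for illustration only (the exact BoundedTensorSpec
# constructor arguments are assumed here, not taken from this code): one discrete
# action with 5 choices paired with a 3-dim continuous action in [-1, 1], in the
# required (discrete_action_spec, continuous_action_spec) order.
from alf.tensor_specs import BoundedTensorSpec

action_spec = (BoundedTensorSpec((), dtype='int64', minimum=0, maximum=4),
               BoundedTensorSpec((3, ), dtype='float32', minimum=-1.0,
                                 maximum=1.0))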
def __init__(self,
             observation_spec,
             action_spec: BoundedTensorSpec,
             actor_network_cls=ActorDistributionNetwork,
             critic_network_cls=CriticNetwork,
             q_network_cls=QNetwork,
             reward_weights=None,
             use_entropy_reward=True,
             use_parallel_network=False,
             num_critic_replicas=2,
             env=None,
             config: TrainerConfig = None,
             critic_loss_ctor=None,
             target_entropy=None,
             prior_actor_ctor=None,
             target_kld_per_dim=3.,
             initial_log_alpha=0.0,
             max_log_alpha=None,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             debug_summaries=False,
             name="SacAlgorithm"):
    """
    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions; can
            be a mixture of discrete and continuous actions. The number of
            continuous actions can be arbitrary while only one discrete
            action is allowed currently. If it's a mixture, then it must be a
            tuple/list ``(discrete_action_spec, continuous_action_spec)``.
        actor_network_cls (Callable): is used to construct the actor network.
            The constructed actor network will be called to sample continuous
            actions. All of its output specs must be continuous. Note that we
            don't need a discrete actor network because a discrete action can
            simply be sampled from the Q values.
        critic_network_cls (Callable): is used to construct the critic
            network for estimating ``Q(s,a)`` given that the action is
            continuous.
        q_network_cls (Callable): is used to construct the QNetwork for
            estimating ``Q(s,a)`` given that the action is discrete. Its
            output spec must be consistent with the discrete action in
            ``action_spec``.
        reward_weights (None|list[float]): this is only used when the reward
            is multidimensional. In that case, the weighted sum of the q
            values is used for training the actor if reward_weights is not
            None. Otherwise, the sum of the q values is used.
        use_entropy_reward (bool): whether to include entropy as reward
        use_parallel_network (bool): whether to use a parallel network for
            calculating critics.
        num_critic_replicas (int): number of critics to be used. Default is
            2.
        env (Environment): The environment to interact with. ``env`` is a
            batched environment, which means that it runs multiple
            simulations simultaneously. ``env`` only needs to be provided to
            the root algorithm.
        config (TrainerConfig): config for training. It only needs to be
            provided to the algorithm which performs ``train_iter()`` by
            itself.
        critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
            constructor. If ``None``, a default ``OneStepTDLoss`` will be
            used.
        initial_log_alpha (float): initial value for variable ``log_alpha``.
        max_log_alpha (float|None): if not None, ``log_alpha`` will be capped
            at this value.
        target_entropy (float|Callable|None): If a floating value, it's the
            target average policy entropy, for updating ``alpha``. If a
            callable function, then it will be called on the action spec to
            calculate a target entropy. If ``None``, a default entropy will
            be calculated. For the mixed action type, discrete action and
            continuous action will have separate alphas and target entropies,
            so this argument can be a 2-element list/tuple, where the first
            is for discrete action and the second for continuous action.
        prior_actor_ctor (Callable): If provided, it will be called using
            ``prior_actor_ctor(observation_spec, action_spec,
            debug_summaries=debug_summaries)`` to construct a prior actor.
            The output of the prior actor is the distribution of the next
            action. Two prior actors are implemented:
            ``alf.algorithms.prior_actor.SameActionPriorActor`` and
            ``alf.algorithms.prior_actor.UniformPriorActor``.
        target_kld_per_dim (float): ``alpha`` is dynamically adjusted so that
            the KLD is about ``target_kld_per_dim * dim``.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            ``[-dqda_clipping, dqda_clipping]``. Will not perform clipping if
            ``dqda_clipping == 0``.
        actor_optimizer (torch.optim.optimizer): The optimizer for actor.
        critic_optimizer (torch.optim.optimizer): The optimizer for critic.
        alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    self._num_critic_replicas = num_critic_replicas
    self._use_parallel_network = use_parallel_network

    critic_networks, actor_network, self._act_type, reward_dim = \
        self._make_networks(observation_spec, action_spec,
                            actor_network_cls, critic_network_cls,
                            q_network_cls)

    self._use_entropy_reward = use_entropy_reward

    if reward_dim > 1:
        assert not use_entropy_reward, (
            "use_entropy_reward=True is not supported for multidimensional "
            "reward")
        assert self._act_type == ActionType.Continuous, (
            "Only continuous action is supported for multidimensional "
            "reward")

    self._reward_weights = None
    if reward_weights:
        assert reward_dim > 1, (
            "reward_weights cannot be used for one dimensional reward")
        assert len(reward_weights) == reward_dim, (
            "Mismatch between len(reward_weights)=%s and reward_dim=%s" %
            (len(reward_weights), reward_dim))
        self._reward_weights = torch.tensor(
            reward_weights, dtype=torch.float32)

    def _init_log_alpha():
        return nn.Parameter(torch.tensor(float(initial_log_alpha)))

    if self._act_type == ActionType.Mixed:
        # separate alphas for discrete and continuous actions
        log_alpha = type(action_spec)((_init_log_alpha(),
                                       _init_log_alpha()))
    else:
        log_alpha = _init_log_alpha()

    action_state_spec = SacActionState(
        actor_network=(() if self._act_type == ActionType.Discrete else
                       actor_network.state_spec),
        critic=(() if self._act_type == ActionType.Continuous else
                critic_networks.state_spec))
    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=SacState(
            action=action_state_spec,
            actor=(() if self._act_type != ActionType.Continuous else
                   critic_networks.state_spec),
            critic=SacCriticState(
                critics=critic_networks.state_spec,
                target_critics=critic_networks.state_spec)),
        predict_state_spec=SacState(action=action_state_spec),
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)

    if actor_optimizer is not None:
        self.add_optimizer(actor_optimizer, [actor_network])
    if critic_optimizer is not None:
        self.add_optimizer(critic_optimizer, [critic_networks])
    if alpha_optimizer is not None:
        self.add_optimizer(alpha_optimizer, nest.flatten(log_alpha))

    self._log_alpha = log_alpha
    if self._act_type == ActionType.Mixed:
        self._log_alpha_paralist = nn.ParameterList(nest.flatten(log_alpha))

    if max_log_alpha is not None:
        self._max_log_alpha = torch.tensor(float(max_log_alpha))
    else:
        self._max_log_alpha = None

    self._actor_network = actor_network
    self._critic_networks = critic_networks
    self._target_critic_networks = self._critic_networks.copy(
        name='target_critic_networks')

    if critic_loss_ctor is None:
        critic_loss_ctor = OneStepTDLoss
    critic_loss_ctor = functools.partial(
        critic_loss_ctor, debug_summaries=debug_summaries)
    # Have different names to separate their summary curves
    self._critic_losses = []
    for i in range(num_critic_replicas):
        self._critic_losses.append(
            critic_loss_ctor(name="critic_loss%d" % (i + 1)))

    self._prior_actor = None
    if prior_actor_ctor is not None:
        assert self._act_type == ActionType.Continuous, (
            "Only continuous action is supported when using prior_actor")
        self._prior_actor = prior_actor_ctor(
            observation_spec=observation_spec,
            action_spec=action_spec,
            debug_summaries=debug_summaries)
        total_action_dims = sum(
            [spec.numel for spec in alf.nest.flatten(action_spec)])
        self._target_entropy = -target_kld_per_dim * total_action_dims
    else:
        if self._act_type == ActionType.Mixed:
            if not isinstance(target_entropy, (tuple, list)):
                target_entropy = nest.map_structure(
                    lambda _: target_entropy, self._action_spec)
            # separate target entropies for discrete and continuous actions
            self._target_entropy = nest.map_structure(
                lambda spec, t: _set_target_entropy(self.name, t, [spec]),
                self._action_spec, target_entropy)
        else:
            self._target_entropy = _set_target_entropy(
                self.name, target_entropy, nest.flatten(self._action_spec))

    self._dqda_clipping = dqda_clipping

    self._update_target = common.get_target_updater(
        models=[self._critic_networks],
        target_models=[self._target_critic_networks],
        tau=target_update_tau,
        period=target_update_period)
def __init__(self,
             observation_spec,
             feature_spec,
             action_spec,
             dynamics_module: DynamicsLearningAlgorithm,
             reward_module: RewardEstimationAlgorithm,
             planner_module: PlanAlgorithm,
             env=None,
             config: TrainerConfig = None,
             dynamics_optimizer=None,
             reward_optimizer=None,
             planner_optimizer=None,
             debug_summaries=False,
             name="MbrlAlgorithm"):
    """Create an MbrlAlgorithm.

    The MbrlAlgorithm takes as input the following set of modules for making
    decisions on actions based on the current observation:
    1) learnable/fixed dynamics module
    2) learnable/fixed reward module
    3) learnable/fixed planner module

    Args:
        action_spec (nested BoundedTensorSpec): representing the actions.
        dynamics_module (DynamicsLearningAlgorithm): module for learning to
            predict the next feature based on the previous feature and
            action. It should accept input with spec
            [feature_spec, encoded_action_spec] and output a tensor of shape
            feature_spec. For a discrete action, encoded_action is a one-hot
            representation of the action. For a continuous action,
            encoded_action is the same as the original action.
        reward_module (RewardEstimationAlgorithm): module for calculating the
            reward, i.e., evaluating the reward for an (s, a) pair
        planner_module (PlanAlgorithm): module for generating planned actions
            based on the specified reward function and dynamics function
        env (Environment): The environment to interact with. env is a batched
            environment, which means that it runs multiple simulations
            simultaneously. env only needs to be provided to the root
            algorithm.
        config (TrainerConfig): config for training. config only needs to be
            provided to the algorithm which performs `train_iter()` by
            itself.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    train_state_spec = MbrlState(
        dynamics=dynamics_module.train_state_spec,
        reward=reward_module.train_state_spec,
        planner=planner_module.train_state_spec)
    super().__init__(
        feature_spec,
        action_spec,
        train_state_spec=train_state_spec,
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)

    flat_action_spec = nest.flatten(action_spec)
    action_spec = flat_action_spec[0]

    assert action_spec.is_continuous, "only support continuous control"
    num_actions = action_spec.shape[-1]

    flat_feature_spec = nest.flatten(feature_spec)
    assert len(flat_feature_spec) == 1, \
        "Mbrl doesn't support nested feature_spec"

    self._action_spec = action_spec
    self._num_actions = num_actions

    if dynamics_optimizer is not None:
        self.add_optimizer(dynamics_optimizer, [dynamics_module])
    if planner_optimizer is not None:
        self.add_optimizer(planner_optimizer, [planner_module])
    if reward_optimizer is not None:
        self.add_optimizer(reward_optimizer, [reward_module])

    self._dynamics_module = dynamics_module
    self._reward_module = reward_module
    self._planner_module = planner_module
    self._planner_module.set_reward_func(self._calc_step_reward)
    self._planner_module.set_dynamics_func(self._predict_next_step)
def __init__(self,
             observation_spec,
             action_spec: BoundedTensorSpec,
             critic_network: MdqCriticNetwork,
             env=None,
             config: TrainerConfig = None,
             critic_loss_ctor=None,
             target_entropy=dist_utils.calc_default_target_entropy_quantized,
             initial_log_alpha=0.0,
             target_update_tau=0.05,
             target_update_period=1,
             distill_noise=0.01,
             critic_optimizer=None,
             alpha_optimizer=None,
             debug_summaries=False,
             name="MdqAlgorithm"):
    """
    Args:
        observation_spec (nested TensorSpec): representing the observations.
        action_spec (nested BoundedTensorSpec): representing the actions.
        critic_network (MdqCriticNetwork): an instance of MdqCriticNetwork
        env (Environment): The environment to interact with. ``env`` is a
            batched environment, which means that it runs multiple
            simulations simultaneously. ``env`` only needs to be provided to
            the root algorithm.
        config (TrainerConfig): config for training. It only needs to be
            provided to the algorithm which performs ``train_iter()`` by
            itself.
        critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
            constructor. If ``None``, a default ``OneStepTDLoss`` will be
            used.
        initial_log_alpha (float): initial value for variable ``log_alpha``.
        target_entropy (float|Callable): If a floating value, it's the target
            average policy entropy, for updating ``alpha``. If a callable
            function, then it will be called on the action spec to calculate
            a target entropy. Note that in the MDQ algorithm, as the
            continuous action is represented by a discrete distribution for
            each action dimension,
            ``calc_default_target_entropy_quantized`` is used to compute the
            target entropy by default.
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        distill_noise (float): the std of the random Gaussian noise added to
            the action used for distillation.
        critic_optimizer (torch.optim.optimizer): The optimizer for critic.
        alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    critic_networks = critic_network
    target_critic_networks = critic_networks.copy(
        name='target_critic_networks')

    train_state_spec = MdqState(
        critic=MdqCriticState(
            critic=critic_networks.state_spec,
            target_critic=critic_networks.state_spec))

    super().__init__(
        observation_spec,
        action_spec,
        train_state_spec=train_state_spec,
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)

    self._critic_networks = critic_networks
    self._target_critic_networks = target_critic_networks
    self.add_optimizer(critic_optimizer, [critic_networks])

    if critic_loss_ctor is None:
        critic_loss_ctor = OneStepTDLoss
    critic_loss_ctor = functools.partial(
        critic_loss_ctor, debug_summaries=debug_summaries)

    flat_action_spec = nest.flatten(self._action_spec)
    self._flat_action_spec = flat_action_spec
    self._action_dim = flat_action_spec[0].shape[0]
    self._log_pi_uniform_prior = \
        self._critic_networks.get_uniform_prior_logpi()

    self._num_critic_replicas = self._critic_networks._num_critic_replicas

    self._critic_losses = []
    for i in range(self._num_critic_replicas):
        self._critic_losses.append(
            critic_loss_ctor(name="critic_loss%d" % (i + 1)))

    self._is_continuous = flat_action_spec[0].is_continuous
    self._target_entropy = _set_target_entropy(self.name, target_entropy,
                                               flat_action_spec)

    log_alpha = nn.Parameter(torch.Tensor([float(initial_log_alpha)]))
    self._log_alpha = log_alpha

    self._update_target = common.get_target_updater(
        models=[self._critic_networks],
        target_models=[self._target_critic_networks],
        tau=target_update_tau,
        period=target_update_period)

    if alpha_optimizer is not None:
        self.add_optimizer(alpha_optimizer, [log_alpha])

    self._distill_noise = distill_noise
def __init__(self,
             observation_spec,
             action_spec,
             planner_module: PlanAlgorithm,
             env=None,
             config: TrainerConfig = None,
             planner_optimizer=None,
             debug_summaries=False,
             name="LatentMbrlAlgorithm"):
    """Create a LatentMbrlAlgorithm.

    The LatentMbrlAlgorithm takes as input a planner module for making
    decisions on actions based on the latent representation of the current
    observation as well as a latent dynamics model. The latent representation
    and the latent dynamics are provided by a latent predictive
    representation module, which is an instance of
    ``PredictiveRepresentationLearner``. It is set through the
    ``set_latent_predictive_representation_module()`` function. The latent
    predictive representation module should have a function
    ``predict_multi_step`` for performing multi-step imagined rollouts.

    Currently it is assumed that the training of the latent representation
    module happens outside of the ``LatentMbrlAlgorithm``, although the
    ``LatentMbrlAlgorithm`` can also contribute to its training by using the
    latent representation in loss calculation.

    Args:
        action_spec (BoundedTensorSpec): representing the actions.
        planner_module (PlanAlgorithm): module for generating planned actions
            based on the specified reward function and dynamics function
        env (Environment): The environment to interact with. env is a batched
            environment, which means that it runs multiple simulations
            simultaneously. env only needs to be provided to the root
            algorithm.
        config (TrainerConfig): config for training. config only needs to be
            provided to the algorithm which performs `train_iter()` by
            itself.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    super().__init__(
        observation_spec,
        feature_spec=observation_spec,
        action_spec=action_spec,
        dynamics_module=None,
        reward_module=None,
        planner_module=planner_module,
        planner_optimizer=planner_optimizer,
        env=env,
        config=config,
        debug_summaries=debug_summaries,
        name=name)

    flat_action_spec = nest.flatten(action_spec)
    action_spec = flat_action_spec[0]

    assert action_spec.is_continuous, "only support continuous control"
    num_actions = action_spec.shape[-1]

    self._action_spec = action_spec
    self._num_actions = num_actions

    self._latent_pred_rep_module = None  # set it later