def create_subdistributions(ordered_action_spaces: OrderedDict):
    """
    Build one (not yet parametrized) distribution per action sub-space.

    Every space in `ordered_action_spaces` is passed to SB3's
    `make_proba_distribution`, and the results are collected under the same
    keys and in the same order as the input.

    Important subtlety: the returned Distribution objects do not yet wrap
    parametrized torch distributions, so they cannot produce samples or log
    probabilities. Their parameters are only set once `proba_distribution`
    is called on them.

    :param ordered_action_spaces: OrderedDict mapping a space name to its
        action space.
    :return: OrderedDict mapping each space name to the distribution
        inferred for that space.
    """
    assert isinstance(ordered_action_spaces, OrderedDict), \
        "ordered_action_spaces must be an OrderedDict"

    # Feeding (key, value) pairs to OrderedDict keeps the input ordering.
    return OrderedDict(
        (name, make_proba_distribution(subspace))
        for name, subspace in ordered_action_spaces.items()
    )
def __init__(
    self,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    lr_schedule: Callable[[float], float],
    net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
    device: Union[th.device, str] = "auto",
    activation_fn: Type[nn.Module] = nn.Tanh,
    ortho_init: bool = True,
    use_sde: bool = False,
    log_std_init: float = 0.0,
    full_std: bool = True,
    sde_net_arch: Optional[List[int]] = None,
    use_expln: bool = False,
    squash_output: bool = False,
    features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
    features_extractor_kwargs: Optional[Dict[str, Any]] = None,
    normalize_images: bool = True,
    optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
    optimizer_kwargs: Optional[Dict[str, Any]] = None,
):
    """
    Configure the policy, create its features extractor and action
    distribution, then build the networks via `self._build`.

    :param observation_space: Forwarded to the parent constructor.
    :param action_space: Forwarded to the parent constructor and used to
        infer the action distribution.
    :param lr_schedule: Learning-rate schedule handed to `self._build`.
    :param net_arch: Network architecture specification. When None it
        defaults to `[dict(pi=[64, 64], vf=[64, 64])]` if
        `features_extractor_class` is `FlattenExtractor`, else to `[]`.
    :param device: Forwarded to the parent constructor.
    :param activation_fn: Stored as `self.activation_fn`.
    :param ortho_init: Stored as `self.ortho_init`.
    :param use_sde: When true, gSDE keyword arguments are assembled and
        passed to `make_proba_distribution`.
    :param log_std_init: Stored as `self.log_std_init`.
    :param full_std: gSDE option; only read when `use_sde` is true.
    :param sde_net_arch: Stored as `self.sde_net_arch`; when `use_sde` is
        true, `learn_features` is set to whether this is not None.
    :param use_expln: gSDE option; only read when `use_sde` is true.
    :param squash_output: Forwarded to the parent constructor and, when
        `use_sde` is true, to the gSDE keyword arguments.
    :param features_extractor_class: Class instantiated to create
        `self.features_extractor`.
    :param features_extractor_kwargs: Forwarded to the parent constructor.
    :param normalize_images: Stored as `self.normalize_images`.
    :param optimizer_class: Forwarded to the parent constructor.
    :param optimizer_kwargs: Forwarded to the parent constructor. When
        None, a fresh dict is used, pre-filled with `eps=1e-5` if the
        optimizer class is Adam.
    """
    if optimizer_kwargs is None:
        # Small values to avoid NaN in Adam optimizer
        optimizer_kwargs = {"eps": 1e-5} if optimizer_class == th.optim.Adam else {}

    super(ActorCriticPolicy, self).__init__(
        observation_space,
        action_space,
        device,
        features_extractor_class,
        features_extractor_kwargs,
        optimizer_class=optimizer_class,
        optimizer_kwargs=optimizer_kwargs,
        squash_output=squash_output,
    )

    # Default network architecture, from stable-baselines
    if net_arch is None:
        flat_input = features_extractor_class == FlattenExtractor
        net_arch = [{"pi": [64, 64], "vf": [64, 64]}] if flat_input else []

    self.net_arch = net_arch
    self.activation_fn = activation_fn
    self.ortho_init = ortho_init

    # The extractor is built from the attributes set by the parent constructor.
    extractor = features_extractor_class(
        self.observation_space, **self.features_extractor_kwargs
    )
    self.features_extractor = extractor
    self.features_dim = extractor.features_dim

    self.normalize_images = normalize_images
    self.log_std_init = log_std_init

    # Keyword arguments for gSDE distribution (None when gSDE is disabled)
    gsde_kwargs = (
        {
            "full_std": full_std,
            "squash_output": squash_output,
            "use_expln": use_expln,
            "learn_features": sde_net_arch is not None,
        }
        if use_sde
        else None
    )

    self.sde_features_extractor = None
    self.sde_net_arch = sde_net_arch
    self.use_sde = use_sde
    self.dist_kwargs = gsde_kwargs

    # Action distribution
    self.action_dist = make_proba_distribution(
        action_space, use_sde=use_sde, dist_kwargs=gsde_kwargs
    )

    self._build(lr_schedule)
def __init__(
    self,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    lr_schedule: Callable[[float], float],
    net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
    device: Union[th.device, str] = "auto",
    activation_fn: Type[nn.Module] = nn.Tanh,
    ortho_init: bool = True,
    use_sde: bool = False,
    log_std_init: float = 0.0,
    full_std: bool = True,
    sde_net_arch: Optional[List[int]] = None,
    use_expln: bool = False,
    squash_output: bool = False,
    features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
    features_extractor_kwargs: Optional[Dict[str, Any]] = None,
    normalize_images: bool = True,
    optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
    optimizer_kwargs: Optional[Dict[str, Any]] = None,
    # Extra arguments specific to this policy
    num_partners: int = 1,
    # net arch for each partner-specific module
    partner_net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
    baseline: bool = False,
    nomain: bool = False,
):
    """
    Configure the modular policy, create its features extractor and action
    distribution, then build the networks via `self._build`.

    :param observation_space: Forwarded to the parent constructor.
    :param action_space: Forwarded to the parent constructor and used to
        infer the action distribution.
    :param lr_schedule: Learning-rate schedule; stored as
        `self.lr_schedule` and handed to `self._build`.
    :param net_arch: Network architecture specification. When None it
        defaults to `[dict(pi=[64, 64], vf=[64, 64])]` if
        `features_extractor_class` is `FlattenExtractor`, else to `[]`.
    :param device: Forwarded to the parent constructor.
    :param activation_fn: Stored as `self.activation_fn`.
    :param ortho_init: Stored as `self.ortho_init`.
    :param use_sde: When true, gSDE keyword arguments are assembled and
        passed to `make_proba_distribution`.
    :param log_std_init: Stored as `self.log_std_init`.
    :param full_std: gSDE option; only read when `use_sde` is true.
    :param sde_net_arch: Stored as `self.sde_net_arch`; when `use_sde` is
        true, `learn_features` is set to whether this is not None.
    :param use_expln: gSDE option; only read when `use_sde` is true.
    :param squash_output: Forwarded to the parent constructor and, when
        `use_sde` is true, to the gSDE keyword arguments.
    :param features_extractor_class: Class instantiated to create
        `self.features_extractor`.
    :param features_extractor_kwargs: Forwarded to the parent constructor.
    :param normalize_images: Stored as `self.normalize_images`.
    :param optimizer_class: Forwarded to the parent constructor.
    :param optimizer_kwargs: Forwarded to the parent constructor. When
        None, a fresh dict is used, pre-filled with `eps=1e-5` if the
        optimizer class is Adam.
    :param num_partners: Stored as `self.num_partners`.
    :param partner_net_arch: Architecture specification for the
        partner-specific modules; defaults the same way as `net_arch`.
    :param baseline: Stored as `self.baseline`.
    :param nomain: Stored as `self.nomain`.
    """

    def _fallback_arch():
        # Default network architecture, from stable-baselines
        if features_extractor_class == FlattenExtractor:
            return [{"pi": [64, 64], "vf": [64, 64]}]
        return []

    if optimizer_kwargs is None:
        # Small values to avoid NaN in Adam optimizer
        optimizer_kwargs = {"eps": 1e-5} if optimizer_class == th.optim.Adam else {}

    super(ModularPolicy, self).__init__(
        observation_space,
        action_space,
        device,
        features_extractor_class,
        features_extractor_kwargs,
        optimizer_class=optimizer_class,
        optimizer_kwargs=optimizer_kwargs,
        squash_output=squash_output,
    )

    # NOTE Hanabi Stuff
    self.num_partners = num_partners
    print("CUDA: ", th.cuda.is_available())

    # NOTE Defining Partner architecture -> Need to incorporate Rulebased ?
    self.partner_net_arch = (
        _fallback_arch() if partner_net_arch is None else partner_net_arch
    )
    self.baseline = baseline
    self.nomain = nomain

    self.net_arch = _fallback_arch() if net_arch is None else net_arch
    self.activation_fn = activation_fn
    self.ortho_init = ortho_init

    # The extractor is built from the attributes set by the parent constructor.
    extractor = features_extractor_class(
        self.observation_space, **self.features_extractor_kwargs
    )
    self.features_extractor = extractor
    self.features_dim = extractor.features_dim

    self.normalize_images = normalize_images
    self.log_std_init = log_std_init

    # Keyword arguments for gSDE distribution (None when gSDE is disabled)
    gsde_kwargs = (
        {
            "full_std": full_std,
            "squash_output": squash_output,
            "use_expln": use_expln,
            "learn_features": sde_net_arch is not None,
        }
        if use_sde
        else None
    )

    self.sde_features_extractor = None
    self.sde_net_arch = sde_net_arch
    self.use_sde = use_sde
    self.dist_kwargs = gsde_kwargs

    # Action distribution
    self.action_dist = make_proba_distribution(
        action_space, use_sde=use_sde, dist_kwargs=gsde_kwargs
    )

    self.lr_schedule = lr_schedule
    self._build(self.lr_schedule)