def make_model_and_action_dist(policy, obs_space, action_space, config):
    # Get the output distribution class for predicting rewards and next-obs.
    policy.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist(
        obs_space, config, dist_type="deterministic", framework="torch")

    # Build one dynamics model if we are a Worker.
    # If we are the main MAML learner, build n (num_workers) dynamics Models
    # for being able to create checkpoints for the current state of training.
    device = (torch.device("cuda")
              if torch.cuda.is_available() else torch.device("cpu"))
    policy.dynamics_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["dynamics_model"],
        framework="torch",
        name="dynamics_ensemble",
    ).to(device)

    action_dist, num_outputs = ModelCatalog.get_action_dist(
        action_space, config, framework="torch")
    # Create the pi-model and register it with the Policy.
    policy.pi = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        name="policy_model",
    )

    return policy.pi, action_dist
def build_model_and_distribution(policy, obs_space, action_space, config):
    if isinstance(action_space, Discrete):
        num_outputs = action_space.n
        dist = TorchCategorical
    else:
        num_outputs = np.prod(action_space.shape) * 2
        dist = TorchDiagGaussian

    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        framework="torch",
        model_interface=FullyConnectedNetwork,
        name="ac",
        model_config=config["model"],
    )
    policy.model_variables = policy.model.variables()

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        framework="torch",
        model_interface=FullyConnectedNetwork,
        name="ac_target",
        model_config=config["model"],
    )
    policy.target_model_variables = policy.target_model.variables()

    return policy.model, dist
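# Editor's aside (not from the source): the `* 2` above sizes the output layer
# for TorchDiagGaussian, which expects one mean and one log-std output per
# (flattened) action dimension. A minimal sketch of that sizing rule, assuming
# a gym Box action space:
import numpy as np
from gym.spaces import Box

def num_outputs_for_diag_gaussian(action_space: Box) -> int:
    # One mean + one log-std per flattened action dimension.
    return int(np.prod(action_space.shape)) * 2

# E.g., a 3-dim continuous action space needs 6 model outputs.
assert num_outputs_for_diag_gaussian(Box(-1.0, 1.0, shape=(3, ))) == 6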
def build_q_model_and_distribution(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # Try to infer the last layer size, otherwise fall back to 256.
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])
    policy.q_func_vars = policy.q_model.variables()

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_TARGET_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model, TorchMultiObjCategorical
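# Editor's aside (not from the source): `dueling=config["dueling"]` refers to
# the standard dueling-DQN head, which reassembles Q-values from a state-value
# stream V and an advantage stream A. A minimal sketch of that aggregation,
# assuming the common mean-centered variant:
import torch

def dueling_q_values(value: torch.Tensor,
                     advantages: torch.Tensor) -> torch.Tensor:
    # value: [B, 1], advantages: [B, num_actions]
    # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
    return value + advantages - advantages.mean(dim=1, keepdim=True)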
def build_ddpg_models(
    policy: Policy,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    if policy.config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = (TorchNoopModel
                         if config["framework"] == "torch" else NoopModel)
        num_outputs = int(np.prod(observation_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=(DDPGTorchModel
                         if config["framework"] == "torch" else DDPGTFModel),
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=(DDPGTorchModel
                         if config["framework"] == "torch" else DDPGTFModel),
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    return policy.model
def make_model_and_action_dist(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]:
    """Constructs the necessary ModelV2 and action dist class for the Policy.

    Args:
        policy (Policy): The Policy that will use the models.
        obs_space (gym.spaces.Space): The observation space.
        action_space (gym.spaces.Space): The action space.
        config (TrainerConfigDict): The trainer's config dict.

    Returns:
        Tuple[ModelV2, Type[TorchDistributionWrapper]]: The pi-model and its
            action distribution class. Note: A dynamics-ensemble model is
            also created in this function and assigned to
            `policy.dynamics_model`.
    """
    # Get the output distribution class for predicting rewards and next-obs.
    policy.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist(
        obs_space, config, dist_type="deterministic", framework="torch")

    # Build one dynamics model if we are a Worker.
    # If we are the main MAML learner, build n (num_workers) dynamics Models
    # for being able to create checkpoints for the current state of training.
    device = (torch.device("cuda")
              if torch.cuda.is_available() else torch.device("cpu"))
    policy.dynamics_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["dynamics_model"],
        framework="torch",
        name="dynamics_ensemble",
    ).to(device)

    action_dist, num_outputs = ModelCatalog.get_action_dist(
        action_space, config, framework="torch")
    # Create the pi-model and register it with the Policy.
    policy.pi = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        name="policy_model",
    )

    return policy.pi, action_dist
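# Editor's usage sketch (not from the source): factories with this signature
# are consumed by RLlib's policy-template builders via the
# `make_model_and_action_dist` argument (see the `__init__` templates further
# down, which call it as `make_model_and_action_dist(self, obs_space,
# action_space, config)`). Roughly, assuming a `my_loss_fn` defined elsewhere:
from ray.rllib.policy.torch_policy_template import build_torch_policy

MyModelBasedTorchPolicy = build_torch_policy(
    name="MyModelBasedTorchPolicy",
    loss_fn=my_loss_fn,  # assumption: a loss function defined elsewhere
    make_model_and_action_dist=make_model_and_action_dist,
)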
def test_conv2d_default_stacks(self):
    """Tests whether conv2d defaults are available for image obs spaces."""
    action_space = gym.spaces.Discrete(2)

    shapes = [
        (480, 640, 3),
        (240, 320, 3),
        (96, 96, 3),
        (84, 84, 3),
        (42, 42, 3),
        (10, 10, 3),
    ]
    for shape in shapes:
        print(f"shape={shape}")
        obs_space = gym.spaces.Box(-1.0, 1.0, shape=shape)
        for fw in framework_iterator():
            model = ModelCatalog.get_model_v2(
                obs_space, action_space, 2, MODEL_DEFAULTS.copy(),
                framework=fw)
            self.assertTrue(isinstance(model, (VisionNetwork, TorchVision)))
            if fw == "torch":
                output, _ = model(
                    {"obs": torch.from_numpy(obs_space.sample()[None])})
            else:
                output, _ = model({"obs": obs_space.sample()[None]})
            # B x [action logits]
            self.assertTrue(output.shape == (1, 2))
            print("ok")
def build_q_model(self, obs_space, action_space, num_outputs,
                  q_model_config, name):
    """Builds one of the (twin) Q-nets used by this SAC.

    Override this method in a sub-class of SACTFModel to implement your
    own Q-nets. Alternatively, simply set `custom_model` within the
    top level SAC `q_model_config` config key to make this default
    implementation of `build_q_model` use your custom Q-nets.

    Returns:
        TFModelV2: The TFModelV2 Q-net sub-model.
    """
    self.concat_obs_and_actions = False
    if self.discrete:
        input_space = obs_space
    else:
        orig_space = getattr(obs_space, "original_space", obs_space)
        if isinstance(orig_space, Box) and len(orig_space.shape) == 1:
            input_space = Box(
                float("-inf"),
                float("inf"),
                shape=(orig_space.shape[0] + action_space.shape[0], ),
            )
            self.concat_obs_and_actions = True
        else:
            input_space = gym.spaces.Tuple([orig_space, action_space])

    model = ModelCatalog.get_model_v2(
        input_space,
        action_space,
        num_outputs,
        q_model_config,
        framework="tf",
        name=name,
    )
    return model
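# Editor's aside (not from the source): when `concat_obs_and_actions` is True,
# the Q-net's forward pass feeds the concatenated [obs, actions] tensor as the
# model's flat input. A minimal sketch of that call for the torch case (the TF
# variant is analogous), assuming standard ModelV2 call semantics:
import torch

def q_net_forward(q_model, obs: torch.Tensor, actions: torch.Tensor):
    # obs: [B, obs_dim], actions: [B, act_dim]
    # -> net input: [B, obs_dim + act_dim]
    net_input = torch.cat([obs, actions], dim=-1)
    q_values, _ = q_model({"obs": net_input}, [], None)
    return q_values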
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    # TODO: (sven) Support Dicts as well.
    assert isinstance(obs_space.original_space, Tuple), \
        "`obs_space.original_space` must be Tuple!"

    super().__init__(obs_space, action_space, num_outputs, model_config,
                     name)

    # Build the CNN(s) given obs_space's image components.
    self.cnns = {}
    concat_size = 0
    for i, component in enumerate(obs_space.original_space):
        # Image space.
        if len(component.shape) == 3:
            config = {
                "conv_filters": model_config.get(
                    "conv_filters", get_filter_config(component.shape)),
                "conv_activation": model_config.get("conv_activation"),
            }
            cnn = ModelCatalog.get_model_v2(
                component,
                action_space,
                num_outputs=None,
                model_config=config,
                framework="tf",
                name="cnn_{}".format(i))
            concat_size += cnn.num_outputs
            self.cnns[i] = cnn
        # Discrete inputs -> One-hot encode.
        elif isinstance(component, Discrete):
            concat_size += component.n
        # TODO: (sven) MultiDiscrete (see e.g. our auto-LSTM wrappers).
        # Everything else (1D Box).
        else:
            assert len(component.shape) == 1, \
                "Only input Box 1D or 3D spaces allowed!"
            concat_size += component.shape[-1]

    self.logits_and_value_model = None
    self._value_out = None
    if num_outputs:
        # Action-distribution head.
        concat_layer = tf.keras.layers.Input((concat_size, ))
        logits_layer = tf.keras.layers.Dense(
            num_outputs,
            activation=tf.keras.activations.linear,
            name="logits")(concat_layer)

        # Create the value branch model.
        value_layer = tf.keras.layers.Dense(
            1,
            name="value_out",
            activation=None,
            kernel_initializer=normc_initializer(0.01))(concat_layer)
        self.logits_and_value_model = tf.keras.models.Model(
            concat_layer, [logits_layer, value_layer])
    else:
        self.num_outputs = concat_size
def __init__(self, observation_space, action_space, config):
    assert tf.executing_eagerly()
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    if get_default_config:
        config = dict(get_default_config(), **config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    if action_sampler_fn:
        if not make_model:
            raise ValueError(
                "make_model is required if action_sampler_fn is given")
        self.dist_class = None
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework="tf",
        )

    self.model({
        SampleBatch.CUR_OBS: tf.convert_to_tensor(
            np.array([observation_space.sample()])),
        SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
            [_flatten_action(action_space.sample())]),
        SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
    }, [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ], tf.convert_to_tensor([1]))

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_with_dummy_batch()
    self._loss_initialized = True

    if optimizer_fn:
        self._optimizer = optimizer_fn(self, config)
    else:
        self._optimizer = tf.train.AdamOptimizer(config["lr"])

    if after_init:
        after_init(self, observation_space, action_space, config)
def __init__(self, obs_space, action_space, config):
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    if before_init:
        before_init(self, obs_space, action_space, config)

    if make_model_and_action_dist:
        self.model, self.dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], torch=True)
        self.model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            framework="torch")

    TorchPolicy.__init__(self, obs_space, action_space, config, self.model,
                         loss_fn, self.dist_class)

    if after_init:
        after_init(self, obs_space, action_space, config)
def __init__(self, obs_space, action_space, config):
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    if before_init:
        before_init(self, obs_space, action_space, config)

    if make_model_and_action_dist:
        self.model, self.dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
        # Make sure we passed in a correct Model factory.
        assert isinstance(self.model, TorchModelV2), \
            "ERROR: TorchPolicy::make_model_and_action_dist must " \
            "return a TorchModelV2 object!"
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework="torch")
        self.model = ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            framework="torch")

    TorchPolicy.__init__(self, obs_space, action_space, config, self.model,
                         loss_fn, self.dist_class)

    if after_init:
        after_init(self, obs_space, action_space, config)
def __init__(self, obs_space, action_space, config):
    model = ModelCatalog.get_model_v2(
        obs_space, action_space, action_space.n, config["model"], "torch")
    _, env_creator = Trainer._get_env_id_and_creator(config["env"], config)

    if config["ranked_rewards"]["enable"]:
        # If ranked rewards (r2) is enabled, the env is wrapped to include
        # a rewards buffer used to normalize rewards.
        env_cls = get_r2_env_wrapper(env_creator, config["ranked_rewards"])

        # The wrapped env is used only in the MCTS, not in the
        # rollout workers.
        def _env_creator():
            return env_cls(config["env_config"])
    else:
        def _env_creator():
            return env_creator(config["env_config"])

    def mcts_creator():
        return MCTS(model, config["mcts_config"])

    super().__init__(
        obs_space,
        action_space,
        config,
        model,
        alpha_zero_loss,
        TorchCategorical,
        mcts_creator,
        _env_creator,
    )
def _init_model_and_dist_class(self):
    if is_overridden(self.make_model) and is_overridden(
            self.make_model_and_action_dist):
        raise ValueError(
            "Only one of make_model or make_model_and_action_dist "
            "can be overridden.")

    if is_overridden(self.make_model):
        model = self.make_model()
        dist_class, _ = ModelCatalog.get_action_dist(
            self.action_space,
            self.config["model"],
            framework=self.framework)
    elif is_overridden(self.make_model_and_action_dist):
        model, dist_class = self.make_model_and_action_dist()
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            self.action_space,
            self.config["model"],
            framework=self.framework)
        model = ModelCatalog.get_model_v2(
            obs_space=self.observation_space,
            action_space=self.action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework=self.framework,
        )

    return model, dist_class
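# Editor's usage sketch (not from the source): a subclass overrides at most
# one of the two hooks probed above. Assuming the TorchPolicyV2-style base
# class from which this snippet is taken, and a hypothetical MyTorchModel:
from ray.rllib.models.torch.torch_action_dist import TorchCategorical
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2

class MyPolicy(TorchPolicyV2):
    def make_model_and_action_dist(self):
        model = MyTorchModel(  # hypothetical custom TorchModelV2
            self.observation_space,
            self.action_space,
            num_outputs=self.action_space.n,
            model_config=self.config["model"],
            name="my_model",
        )
        return model, TorchCategorical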
def make_appo_model(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    """Builds model and target model for APPO.

    Args:
        policy (Policy): The Policy, which will use the model for
            optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The trainer's config dict.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target model will not be returned, just assigned to
            `policy.target_model`.
    """
    # Get the num_outputs for the following model construction calls.
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    # Construct the (main) model.
    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="torch" if config["framework"] == "torch" else "tf",
    )
    policy.model_variables = policy.model.variables()

    # Construct the target model.
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="torch" if config["framework"] == "torch" else "tf",
    )
    policy.target_model_variables = policy.target_model.variables()

    # Return only the model (not the target model).
    return policy.model
def __init__(self, obs_space, action_space, config):
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    if validate_spaces:
        validate_spaces(self, obs_space, action_space, self.config)

    if before_init:
        before_init(self, obs_space, action_space, self.config)

    # Model is customized (use default action dist class).
    if make_model:
        assert make_model_and_action_dist is None, \
            "Either `make_model` or `make_model_and_action_dist`" \
            " must be None!"
        self.model = make_model(self, obs_space, action_space, config)
        dist_class, _ = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework="torch")
    # Model and action dist class are customized.
    elif make_model_and_action_dist:
        self.model, dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
    # Use default model and default action dist.
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework="torch")
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="torch")

    # Make sure we passed in a correct Model factory.
    assert isinstance(self.model, TorchModelV2), \
        "ERROR: Generated Model must be a TorchModelV2 object!"

    policy_class.__init__(
        self,
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        model=self.model,
        loss=loss_fn,
        action_distribution_class=dist_class,
        action_sampler_fn=action_sampler_fn,
        action_distribution_fn=action_distribution_fn,
        max_seq_len=config["model"]["max_seq_len"],
        get_batch_divisibility_req=get_batch_divisibility_req,
    )

    if callable(training_view_requirements_fn):
        self.training_view_requirements.update(
            training_view_requirements_fn(self))

    if after_init:
        after_init(self, obs_space, action_space, config)
def make_model(self) -> ModelV2:
    # Copying the DDPG model-building code here to be explicit.
    model_config = self.config["model"]
    model_config.update(
        dict(
            actor_hidden_activation=self.config["actor_hidden_activation"],
            actor_hiddens=self.config["actor_hiddens"],
            critic_hidden_activation=self.config[
                "critic_hidden_activation"],
            critic_hiddens=self.config["critic_hiddens"],
            twin_q=self.config["twin_q"],
        ))
    num_outputs = int(np.prod(self.observation_space.shape))

    # TODO: why do we even have to go through this get_model_v2 function?
    self.model = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=num_outputs,
        model_config=model_config,
        framework=self.config["framework"],
        # Use this model for the interface (get_q, get_q_twin, etc.).
        model_interface=CRRModel,
        default_model=TorchNoopModel,
        name="model",
    )

    # TODO: this is a bad python pattern to assign attributes that do not
    #  exist in the constructor.
    self.target_model = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=num_outputs,
        model_config=model_config,
        framework=self.config["framework"],
        # Use this model for the interface (get_q, get_q_twin, etc.).
        model_interface=CRRModel,
        default_model=TorchNoopModel,
        name="target_model",
    )

    return self.model
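# Editor's aside (not from the source): with `model` and `target_model` built
# as structural twins above, the target is typically refreshed from the main
# net by a hard copy or a Polyak (soft) update. A minimal torch sketch,
# assuming both models expose `parameters()` like any nn.Module:
import torch

@torch.no_grad()
def sync_target(model, target_model, tau: float = 1.0):
    # tau=1.0 -> hard copy; 0 < tau < 1 -> Polyak averaging.
    for p, tp in zip(model.parameters(), target_model.parameters()):
        tp.mul_(1.0 - tau).add_(p, alpha=tau)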
def build_dreamer_model(policy, obs_space, action_space, config):
    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        1,
        config["dreamer_model"],
        name="DreamerModel",
        framework="torch")
    policy.model_variables = policy.model.variables()
    return policy.model, None
def make_model(self):
    model = ModelCatalog.get_model_v2(
        self.observation_space,
        self.action_space,
        1,
        self.config["dreamer_model"],
        name="DreamerModel",
        framework="torch",
    )
    self.model_variables = model.variables()
    return model
def __init__(self, obs_space, action_space, config):
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    if before_init:
        before_init(self, obs_space, action_space, config)

    # Model is customized (use default action dist class).
    if make_model:
        assert make_model_and_action_dist is None
        self.model = make_model(self, obs_space, action_space, config)
        dist_class, _ = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework="torch")
    # Model and action dist class are customized.
    elif make_model_and_action_dist:
        self.model, dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
    # Use default model and default action dist.
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework="torch")
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="torch",
            **self.config["model"].get("custom_model_config", {}))

    # Make sure we passed in a correct Model factory.
    assert isinstance(self.model, TorchModelV2), \
        "ERROR: Generated Model must be a TorchModelV2 object!"

    TorchPolicy.__init__(
        self,
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        model=self.model,
        loss=loss_fn,
        action_distribution_class=dist_class,
        action_sampler_fn=action_sampler_fn,
        action_distribution_fn=action_distribution_fn,
        max_seq_len=config["model"]["max_seq_len"],
        get_batch_divisibility_req=get_batch_divisibility_req,
    )

    if after_init:
        after_init(self, obs_space, action_space, config)
def make_appo_model(policy) -> ModelV2:
    """Builds model and target model for APPO.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target model will not be returned, just assigned to
            `policy.target_model`.
    """
    # Get the num_outputs for the following model construction calls.
    _, logit_dim = ModelCatalog.get_action_dist(
        policy.action_space, policy.config["model"])

    # Construct the (main) model.
    policy.model = ModelCatalog.get_model_v2(
        policy.observation_space,
        policy.action_space,
        logit_dim,
        policy.config["model"],
        name=POLICY_SCOPE,
        framework=policy.framework,
    )
    policy.model_variables = policy.model.variables()

    # Construct the target model.
    policy.target_model = ModelCatalog.get_model_v2(
        policy.observation_space,
        policy.action_space,
        logit_dim,
        policy.config["model"],
        name=TARGET_POLICY_SCOPE,
        framework=policy.framework,
    )
    policy.target_model_variables = policy.target_model.variables()

    # Return only the model (not the target model).
    return policy.model
def make_model_and_dist(policy, obs_space, action_space, config):
    # Get the output distribution class for predicting rewards and next-obs.
    policy.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist(
        obs_space, config, dist_type="deterministic", framework="torch")
    if config["predict_reward"]:
        # TODO: (sven) implement reward prediction.
        _ = ModelCatalog.get_action_dist(
            gym.spaces.Box(float("-inf"), float("inf"), ()),
            config,
            dist_type="")

    # Build one dynamics model if we are a Worker.
    # If we are the main MAML learner, build n (num_workers) dynamics Models
    # for being able to create checkpoints for the current state of training.
    policy.dynamics_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["dynamics_model"],
        framework="torch",
        name="dynamics_model",
        model_interface=DYNATorchModel,
    )

    action_dist, num_outputs = ModelCatalog.get_action_dist(
        action_space, config, dist_type="deterministic", framework="torch")
    # Create the pi-model and register it with the Policy.
    policy.pi = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        name="policy_model",
    )

    return policy.pi, action_dist
def make_q_models(policy):
    if not isinstance(policy.action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            f"Action space {policy.action_space} is not supported for DQN.")

    model = ModelCatalog.get_model_v2(
        obs_space=policy.observation_space,
        action_space=policy.action_space,
        num_outputs=policy.action_space.n,
        model_config=policy.config["model"],
        framework=policy.config["framework"],
        name=Q_SCOPE,
    )

    target_model = ModelCatalog.get_model_v2(
        obs_space=policy.observation_space,
        action_space=policy.action_space,
        num_outputs=policy.action_space.n,
        model_config=policy.config["model"],
        framework=policy.config["framework"],
        name=Q_TARGET_SCOPE,
    )

    return model, target_model
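# Editor's usage sketch (not from the source): with `num_outputs` set to
# `action_space.n`, the model's forward output carries one value per discrete
# action, so greedy action selection is an argmax over it. A minimal torch
# sketch, assuming the outputs are interpreted as per-action Q-values:
import torch

def greedy_actions(q_model, obs_batch: torch.Tensor) -> torch.Tensor:
    q_values, _ = q_model({"obs": obs_batch}, [], None)  # [B, n_actions]
    return q_values.argmax(dim=1)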
def make_model(self) -> ModelV2:
    """Build underlying model for this Policy.

    Returns:
        The Model for the Policy to use.
    """
    # Default ModelV2 model.
    _, logit_dim = ModelCatalog.get_action_dist(self.action_space,
                                                self.config["model"])
    return ModelCatalog.get_model_v2(
        self.observation_space,
        self.action_space,
        logit_dim,
        self.config["model"],
        framework=self.framework,
    )
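# Editor's usage sketch (not from the source): the model built here pairs
# with the dist class returned by the same `get_action_dist` call. RLlib's
# torch distribution wrappers are constructed from (dist_inputs, model), so
# sampling looks roughly like this, assuming a torch model:
import torch

def sample_actions(model, dist_class, obs_batch: torch.Tensor):
    # Forward pass yields the distribution inputs (e.g. logits), which the
    # dist class then wraps.
    dist_inputs, _ = model({"obs": obs_batch}, [], None)
    action_dist = dist_class(dist_inputs, model)
    return action_dist.sample()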
def __init__(self, obs_space, act_space, config):
    super(LearnableSignalerPolicy, self).__init__(obs_space, act_space,
                                                  config)
    self.framework = "torch"
    self.exploration = self._create_exploration()
    self.n_signals = act_space.spaces[0].n if isinstance(
        act_space, Tuple) else act_space.n
    self.device = (torch.device("cuda")
                   if torch.cuda.is_available() else torch.device("cpu"))
    self.model = ModelCatalog.get_model_v2(
        MultiDiscrete([2]),
        Discrete(self.n_signals),
        self.n_signals,
        config['sig_model'],
        framework="torch",
        name="SignalerNet",
        default_model=FullyConnectedNetwork)
def build_policy_model(self, obs_space, num_outputs, policy_model_config,
                       name):
    """Builds the policy model used by this SAC.

    Override this method in a sub-class of SACTorchModel to implement your
    own policy net. Alternatively, simply set `custom_model` within the
    top level SAC `policy_model` config key to make this default
    implementation of `build_policy_model` use your custom policy network.

    Returns:
        TorchModelV2: The TorchModelV2 policy sub-model.
    """
    model = ModelCatalog.get_model_v2(
        obs_space,
        self.action_space,
        num_outputs,
        policy_model_config,
        framework="torch",
        name=name)
    return model
def build_model_and_distribution(policy, obs_space, action_space, config):
    if isinstance(action_space, Discrete):
        num_outputs = action_space.n
        dist = TorchCategorical
    else:
        num_outputs = np.prod(action_space.shape) * 2
        dist = TorchDiagGaussian

    model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        framework="torch",
        model_interface=FCN_MultiV_MultiObj,
        name="ac",
        model_config=config["model"],
        num_decompose=config["decompose_num"])

    return model, dist
def __init__(self, obs_space, action_space, config):
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    if before_init:
        before_init(self, obs_space, action_space, config)

    if make_model_and_action_dist:
        self.model, dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
        # Make sure we passed in a correct Model factory.
        assert isinstance(self.model, TorchModelV2), \
            "ERROR: TorchPolicy::make_model_and_action_dist must " \
            "return a TorchModelV2 object!"
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework="torch")
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="torch",
            **self.config["model"].get("custom_options", {}))

    TorchPolicy.__init__(
        self,
        obs_space,
        action_space,
        config,
        model=self.model,
        loss=loss_fn,
        action_distribution_class=dist_class,
        action_sampler_fn=action_sampler_fn,
        action_distribution_fn=action_distribution_fn,
        max_seq_len=config["model"]["max_seq_len"],
        get_batch_divisibility_req=get_batch_divisibility_req,
    )

    if after_init:
        after_init(self, obs_space, action_space, config)
def __init__(self, obs_space, action_space, config):
    # Update policy attributes used in the loss calculation.
    self.minibatch_size = config['sgd_minibatch_size']
    self.ppo_epochs = config['ppo_epochs']
    self.dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework='torch')
    self.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework='torch')
    super().__init__(
        obs_space,
        action_space,
        config,
        model=self.model,
        loss=ppo_surrogate_loss,
        action_distribution_class=self.dist_class)
    self.optimizer = torch.optim.Adam(
        self.model.parameters(), lr=self.config["lr"])
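# Editor's aside (not from the source): the `minibatch_size` and `ppo_epochs`
# attributes stored above suggest a standard multi-epoch minibatch SGD loop
# over each sampled train batch. A minimal sketch of such a loop, assuming a
# hypothetical `compute_loss` helper that returns a scalar torch loss for a
# SampleBatch slice:
def sgd_epochs(policy, train_batch):
    n = train_batch.count
    for _ in range(policy.ppo_epochs):
        for start in range(0, n, policy.minibatch_size):
            minibatch = train_batch.slice(start,
                                          start + policy.minibatch_size)
            loss = compute_loss(policy.model, minibatch)  # hypothetical
            policy.optimizer.zero_grad()
            loss.backward()
            policy.optimizer.step()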
def __init__(self, obs_space, action_space, config):
    # Update policy attributes used in the loss calculation.
    print('DROPPOPolicy init...')
    # self.framework = config['framework'] = 'torch'
    # self.kl_coeff = config['kl_coeff']
    # self.kl_target = config['kl_target']
    # self.entropy_coeff = config['entropy_coeff']
    # self.cur_lr = config['lr']
    # # Setup ._value() for GAE computation.
    # self.setup_value(config)
    self.minibatch_size = config['sgd_minibatch_size']
    self.ppo_epochs = config['ppo_epochs']
    self.dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework='torch')
    self.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework='torch')
    super().__init__(
        obs_space,
        action_space,
        config,
        model=self.model,
        loss=ppo_surrogate_loss,
        action_distribution_class=self.dist_class)
    # Merge Model's view requirements into Policy's:
    # self.view_requirements.update(self.model.view_requirements)
    # Init mixins:
    # setup_mixins(self, obs_space, action_space, config)
    # Perform test runs through postprocessing- and loss functions:
    # self._initialize_loss_from_dummy_batch(
    #     auto_remove_unneeded_view_reqs=True,
    #     stats_fn=kl_and_loss_stats,
    # )
    self.view_requirements.update({'rewards': ViewRequirement()})
    self.optimizer = torch.optim.Adam(
        self.model.parameters(), lr=self.config["lr"])
def __init__(self,
             action_space: Space,
             *,
             framework: str,
             model: ModelV2,
             feature_dim: int = 288,
             feature_net_config: Optional[ModelConfigDict] = None,
             inverse_net_hiddens: Tuple[int] = (256, ),
             inverse_net_activation: str = "relu",
             forward_net_hiddens: Tuple[int] = (256, ),
             forward_net_activation: str = "relu",
             beta: float = 0.2,
             eta: float = 1.0,
             lr: float = 1e-3,
             sub_exploration: Optional[FromConfigSpec] = None,
             **kwargs):
    """Initializes a Curiosity object.

    Uses as defaults the hyperparameters described in [1].

    Args:
        feature_dim (int): The dimensionality of the feature (phi)
            vectors.
        feature_net_config (Optional[ModelConfigDict]): Optional model
            configuration for the feature network, producing feature
            vectors (phi) from observations. This can be used to configure
            fcnet- or conv_net setups to properly process any observation
            space.
        inverse_net_hiddens (Tuple[int]): Tuple of the layer sizes of the
            inverse (action predicting) NN head (on top of the feature
            outputs for phi and phi').
        inverse_net_activation (str): Activation specifier for the inverse
            net.
        forward_net_hiddens (Tuple[int]): Tuple of the layer sizes of the
            forward (phi' predicting) NN head.
        forward_net_activation (str): Activation specifier for the forward
            net.
        beta (float): Weight for the forward loss (over the inverse loss,
            which gets weight=1.0-beta) in the common loss term.
        eta (float): Weight for intrinsic rewards before being added to
            extrinsic ones.
        lr (float): The learning rate for the curiosity-specific
            optimizer, optimizing feature-, inverse-, and forward nets.
        sub_exploration (Optional[FromConfigSpec]): The config dict for
            the underlying Exploration to use (e.g. epsilon-greedy for
            DQN). If None, uses the FromSpecDict provided in the Policy's
            default config.
    """
    if not isinstance(action_space, (Discrete, MultiDiscrete)):
        raise ValueError(
            "Only (Multi)Discrete action spaces supported for Curiosity "
            "so far!")

    super().__init__(
        action_space, model=model, framework=framework, **kwargs)

    if self.policy_config["num_workers"] != 0:
        raise ValueError(
            "Curiosity exploration currently does not support parallelism."
            " `num_workers` must be 0!")

    self.feature_dim = feature_dim
    if feature_net_config is None:
        feature_net_config = self.policy_config["model"].copy()
    self.feature_net_config = feature_net_config
    self.inverse_net_hiddens = inverse_net_hiddens
    self.inverse_net_activation = inverse_net_activation
    self.forward_net_hiddens = forward_net_hiddens
    self.forward_net_activation = forward_net_activation

    self.action_dim = self.action_space.n if isinstance(
        self.action_space, Discrete) else np.sum(self.action_space.nvec)

    self.beta = beta
    self.eta = eta
    self.lr = lr
    # TODO: (sven) if sub_exploration is None, use Trainer's default
    #  Exploration config.
    if sub_exploration is None:
        raise NotImplementedError
    self.sub_exploration = sub_exploration

    # Creates modules/layers inside the actual ModelV2.
    self._curiosity_feature_net = ModelCatalog.get_model_v2(
        self.model.obs_space,
        self.action_space,
        self.feature_dim,
        model_config=self.feature_net_config,
        framework=self.framework,
        name="feature_net",
    )

    self._curiosity_inverse_fcnet = self._create_fc_net(
        [2 * self.feature_dim] + list(self.inverse_net_hiddens) +
        [self.action_dim],
        self.inverse_net_activation,
        name="inverse_net")

    self._curiosity_forward_fcnet = self._create_fc_net(
        [self.feature_dim + self.action_dim] + list(
            self.forward_net_hiddens) + [self.feature_dim],
        self.forward_net_activation,
        name="forward_net")

    # This is only used to select the correct action.
    self.exploration_submodule = from_config(
        cls=Exploration,
        config=self.sub_exploration,
        action_space=self.action_space,
        framework=self.framework,
        policy_config=self.policy_config,
        model=self.model,
        num_workers=self.num_workers,
        worker_index=self.worker_index,
    )
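# Editor's aside (grounded only in the docstring above, not the source code):
# `beta` and `eta` combine the two ICM heads in the usual way — the forward
# loss gets weight beta, the inverse loss weight (1 - beta), and the intrinsic
# reward is the eta-scaled forward prediction error. A minimal sketch of that
# arithmetic, assuming precomputed per-sample loss terms:
import torch

def curiosity_terms(forward_l2: torch.Tensor, inverse_nll: torch.Tensor,
                    beta: float, eta: float):
    # Joint loss: forward loss weighted by beta, inverse loss by (1 - beta).
    loss = beta * forward_l2.mean() + (1.0 - beta) * inverse_nll.mean()
    # Intrinsic reward: eta-scaled forward prediction error, per sample.
    intrinsic_reward = eta * forward_l2
    return loss, intrinsic_reward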