def build_q_models(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        num_outputs = 256
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name=Q_SCOPE,
        model_interface=SimpleQModel,
        q_hiddens=config["hiddens"])

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name=Q_TARGET_SCOPE,
        model_interface=SimpleQModel,
        q_hiddens=config["hiddens"])

    return policy.q_model
def build_sac_model(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))

    num_outputs = action_space.n

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name="sac_model",
        twin_q=config["twin_q"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name="target_sac_model",
        twin_q=config["twin_q"])

    return policy.model
def build_q_models(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
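# For orientation: a minimal, self-contained sketch (not taken from any
# snippet in this listing) of the core `get_model_v2` call the builders
# above and below share. The spaces and the empty `model_config` are
# illustrative assumptions; an empty dict falls back to RLlib's model
# defaults (a fully connected network for flat observations).
import numpy as np
from gym.spaces import Box, Discrete
from ray.rllib.models import ModelCatalog

obs_space = Box(0.0, 1.0, shape=(4, ), dtype=np.float32)
action_space = Discrete(2)

model = ModelCatalog.get_model_v2(
    obs_space=obs_space,
    action_space=action_space,
    num_outputs=action_space.n,
    model_config={},
    framework="torch")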
def build_q_models(policy, obs_space, action_space, config):
    policy.log_stats = config["log_stats"]
    if policy.log_stats:
        policy.stats_dict = {}
        policy.stats_fn = config["stats_fn"]

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.device = (torch.device("cuda")
                     if torch.cuda.is_available() else torch.device("cpu"))

    default_model = RNNModel if config["recurrent_dqn"] \
        else FullyConnectedNetwork

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model,
        name=Q_SCOPE).to(policy.device)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model,
        name=Q_TARGET_SCOPE).to(policy.device)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def build_cac_model(policy: TFPolicy, obs_space: spaces.Space,
                    action_space: spaces.Space, config) -> ModelV2:
    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=(action_space.n
                     if isinstance(action_space, spaces.Discrete) else
                     np.product(action_space.shape)),
        model_config=config["model"],
        framework="tf",
        default_model=CentralizedActorCriticModel,
        name="cac",
    )
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=np.product(action_space.shape),
        model_config=config["model"],
        framework="tf",
        default_model=CentralizedActorCriticModel,
        name="target_cac",
    )
    return policy.model
def test_default_models(self):
    ray.init(object_store_memory=1000 * 1024 * 1024)

    for fw in framework_iterator(frameworks=("jax", "tf", "tf2", "torch")):
        obs_space = Box(0, 1, shape=(3, ), dtype=np.float32)
        p1 = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=Discrete(5),
            num_outputs=5,
            model_config={},
            framework=fw,
        )
        self.assertTrue("FullyConnectedNetwork" in type(p1).__name__)
        # Do a test forward pass.
        obs = np.array([obs_space.sample()])
        if fw == "torch":
            obs = torch.from_numpy(obs)
        out, state_outs = p1({"obs": obs})
        self.assertTrue(out.shape == (1, 5))
        self.assertTrue(state_outs == [])

        # No Conv2Ds for JAX yet.
        if fw != "jax":
            p2 = ModelCatalog.get_model_v2(
                obs_space=Box(0, 1, shape=(84, 84, 3), dtype=np.float32),
                action_space=Discrete(5),
                num_outputs=5,
                model_config={},
                framework=fw,
            )
            self.assertTrue("VisionNetwork" in type(p2).__name__)
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.observation_space = observation_space
    self.action_space = action_space
    self.config = config
    self.action_shape = action_space.n

    # GPU settings.
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")

    # This attribute will be incremented every time learn_on_batch is called.
    self.iteration = 0
    # The current time step.
    self.current_step = 0

    # Agent parameters.
    self.lr = self.config["lr"]
    self.gamma = self.config["gamma"]
    self.target_update_frequency = self.config["target_update_frequency"]

    # Strategy.
    self.strategy = EpsilonGreedyStrategy(self.config["eps_start"],
                                          self.config["eps_end"],
                                          self.config["eps_decay"])

    # Replay memory.
    self.memory = ReplayMemory(self.config["replay_memory_size"])

    # Policy network.
    self.policy_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Target network.
    self.target_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Set the weights & biases in the target_net to be the same as those
    # in the policy_net.
    self.target_net.load_state_dict(self.policy_net.state_dict())
    # Put target_net in eval mode. This network will only be used for
    # inference.
    self.target_net.eval()

    # Optimizer.
    self.optimizer = optim.RMSprop(self.policy_net.parameters())

    # The calculated loss.
    self.loss = 0
def build_q_model(policy: Policy, obs_space: gym.Space,
                  action_space: gym.Space,
                  config: TrainerConfigDict) -> ModelV2:
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    return policy.q_model
def build_ddpg_model(policy, obs_space, action_space, config):
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = NoopModel
        num_outputs = int(np.product(obs_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        parameter_noise=config["parameter_noise"],
        twin_q=config["twin_q"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        parameter_noise=config["parameter_noise"],
        twin_q=config["twin_q"])

    return policy.model
def build_sac_model(policy, obs_space, action_space, config):
    # 2 cases:
    # 1) with separate state-preprocessor (before obs+action concat).
    # 2) no separate state-preprocessor: concat obs+actions right away.
    if config["use_state_preprocessor"]:
        num_outputs = 256  # Flatten last Conv2D to this many nodes.
    else:
        num_outputs = 0
        # No state preprocessor: fcnet_hiddens should be empty.
        if config["model"]["fcnet_hiddens"]:
            logger.warning(
                "When not using a state-preprocessor with SAC, "
                "`fcnet_hiddens` will be set to an empty list! Any hidden "
                "layer sizes are defined via `policy_model.fcnet_hiddens` "
                "and `Q_model.fcnet_hiddens`.")
            config["model"]["fcnet_hiddens"] = []

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's "Q_model" and
    # "policy_model" settings.
    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=SACTorchModel
        if config["framework"] == "torch" else SACTFModel,
        name="sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=SACTorchModel
        if config["framework"] == "torch" else SACTFModel,
        name="target_sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    return policy.model
def build_ddpg_models(policy: Policy, observation_space: gym.spaces.Space,
                      action_space: gym.spaces.Space,
                      config: TrainerConfigDict) -> ModelV2:
    if policy.config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = TorchNoopModel if config["framework"] == "torch" \
            else NoopModel
        num_outputs = int(np.product(observation_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=(DDPGTorchModel if config["framework"] == "torch"
                         else DDPGTFModel),
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=(DDPGTorchModel if config["framework"] == "torch"
                         else DDPGTFModel),
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    return policy.model
def build_q_model(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    return policy.q_model
def __init__(self, observation_space, action_space, config):
    super().__init__(observation_space, action_space, config)
    self.observation_space = observation_space
    self.action_space = action_space
    self.config = config
    self.action_shape = action_space.n

    # GPU settings.
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    self.dtype_f = torch.FloatTensor
    self.dtype_l = torch.LongTensor
    self.dtype_b = torch.BoolTensor

    self.lr = self.config["lr"]
    # Extra options need to be added in dqn.py.
    self.epsilon = self.config["epsilon"]
    self.epsilon_decay = self.config["epsilon_decay"]
    self.epsilon_min = self.config["epsilon_min"]
    self.gamma = torch.tensor(self.config["gamma"]).to(self.device,
                                                       non_blocking=True)
    self.batch_size = self.config["batch_size"]
    self.memory = deque(maxlen=self.config["buffer_size"])

    self.dqn_model = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=2,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    self.MSE_loss_fn = MSELoss(reduction="mean")
    self.optimizer = torch.optim.Adam(self.dqn_model.parameters(), lr=self.lr)
def build_appo_model(policy, obs_space, action_space, config):
    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        policy.logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="tf")

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        policy.logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="tf")

    return policy.model
def __init__(self, obs_space, action_space, config):
    self.observation_space = obs_space
    self.action_space = action_space
    self.action_space_struct = get_base_struct_from_space(action_space)
    self.action_noise_std = config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
    self.observation_filter = get_filter(config["observation_filter"],
                                         self.preprocessor.shape)
    self.single_threaded = config.get("single_threaded", False)
    self.sess = make_session(single_threaded=self.single_threaded)
    self.inputs = tf1.placeholder(tf.float32,
                                  [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, config["model"], dist_type="deterministic")
    self.model = ModelCatalog.get_model_v2(
        obs_space=self.preprocessor.observation_space,
        action_space=action_space,
        num_outputs=dist_dim,
        model_config=config["model"])
    dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
    dist = dist_class(dist_inputs, self.model)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        dist_inputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf1.global_variables_initializer())
def _build_q_models(policy: Policy, obs_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> ModelV2:
    """Build q_model and target_q_model for Simple Q learning.

    Note that this function works for both Tensorflow and PyTorch.

    Args:
        policy (Policy): The Policy, which will use the model for
            optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The trainer's config dict.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target q model will not be returned, just assigned to
            `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)
    if torch.cuda.is_available():
        policy.q_model = policy.q_model.to("cuda")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)
    if torch.cuda.is_available():
        policy.target_q_model = policy.target_q_model.to("cuda")

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def __init__(self, obs_space, action_space, config):
    super().__init__(obs_space, action_space, config)
    self.action_noise_std = self.config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(
        self.observation_space)
    self.observation_filter = get_filter(self.config["observation_filter"],
                                         self.preprocessor.shape)
    self.single_threaded = self.config.get("single_threaded", False)

    if self.config["framework"] == "tf":
        self.sess = make_session(single_threaded=self.single_threaded)

        # Set graph-level seed.
        if config.get("seed") is not None:
            with self.sess.as_default():
                tf1.set_random_seed(config["seed"])

        self.inputs = tf1.placeholder(tf.float32,
                                      [None] + list(self.preprocessor.shape))
    else:
        if not tf1.executing_eagerly():
            tf1.enable_eager_execution()
        self.sess = self.inputs = None
        if config.get("seed") is not None:
            # Tf2.x.
            if config.get("framework") == "tf2":
                tf.random.set_seed(config["seed"])
            # Tf-eager.
            elif tf1 and config.get("framework") == "tfe":
                tf1.set_random_seed(config["seed"])

    # Policy network.
    self.dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, self.config["model"], dist_type="deterministic")

    self.model = ModelCatalog.get_model_v2(
        obs_space=self.preprocessor.observation_space,
        action_space=self.action_space,
        num_outputs=dist_dim,
        model_config=self.config["model"],
    )

    self.sampler = None
    if self.sess:
        dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
        dist = self.dist_class(dist_inputs, self.model)
        self.sampler = dist.sample()
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            dist_inputs, self.sess)
        self.sess.run(tf1.global_variables_initializer())
    else:
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            [], None, self.model.variables())

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
def test_custom_model(self):
    ray.init(object_store_memory=1000 * 1024 * 1024)
    ModelCatalog.register_custom_model("foo", CustomModel)
    p1 = ModelCatalog.get_model_v2(
        obs_space=Box(0, 1, shape=(3, ), dtype=np.float32),
        action_space=Discrete(5),
        num_outputs=5,
        model_config={"custom_model": "foo"},
    )
    self.assertEqual(str(type(p1)), str(CustomModel))
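# The `CustomModel` registered above is defined elsewhere in the test suite.
# As a hedged sketch of what `register_custom_model` expects, here is a
# minimal PyTorch ModelV2 subclass; the class and attribute names are
# illustrative assumptions, not the test's actual implementation.
import numpy as np
import torch.nn as nn
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2


class MinimalTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        in_size = int(np.product(obs_space.shape))
        self.logits = nn.Linear(in_size, num_outputs)
        self.value_branch = nn.Linear(in_size, 1)
        self._last_flat_obs = None

    def forward(self, input_dict, state, seq_lens):
        # Flatten the observation and keep it around for the value branch.
        self._last_flat_obs = input_dict["obs"].float().flatten(1)
        return self.logits(self._last_flat_obs), state

    def value_function(self):
        return self.value_branch(self._last_flat_obs).squeeze(1)


# Registration mirrors the test above:
# ModelCatalog.register_custom_model("minimal", MinimalTorchModel)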
def test_default_models(self):
    ray.init(object_store_memory=1000 * 1024 * 1024)

    p1 = ModelCatalog.get_model_v2(
        obs_space=Box(0, 1, shape=(3, ), dtype=np.float32),
        action_space=Discrete(5),
        num_outputs=5,
        model_config={})
    self.assertEqual(type(p1), FullyConnectedNetwork)

    p2 = ModelCatalog.get_model_v2(
        obs_space=Box(0, 1, shape=(84, 84, 3), dtype=np.float32),
        action_space=Discrete(5),
        num_outputs=5,
        model_config={})
    self.assertEqual(type(p2), VisionNetwork)
def build_appo_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="torch" if config["use_pytorch"] else "tf")
    policy.model_variables = policy.model.variables()

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="torch" if config["use_pytorch"] else "tf")
    policy.target_model_variables = policy.target_model.variables()

    return policy.model
def build_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="tf",
    )

    return policy.model
def make_mu_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework="torch")
    base_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework="torch")
    mu_model = MuZeroModel(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name="MuZeroModel",
        base_model=base_model)
    return mu_model
def __init__(self, obs_space, action_space, config):
    """Build the target network for delayed updates.

    The target network is updated by the master learner every
    `trainer.update_target_frequency` steps. All worker batches are
    importance-sampled w.r.t. the target network to ensure a more stable
    `pi_old` in PPO.
    """
    assert config[DELAY_UPDATE]

    _, logit_dim = ModelCatalog.get_action_dist(action_space,
                                                config["model"])
    self.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="tf")
    self.model_vars = self.model.variables()
    self.target_model_vars = self.target_model.variables()
    self.get_session().run(tf.initialize_variables(self.target_model_vars))

    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    assign_ops = []
    assert len(self.model_vars) == len(self.target_model_vars)
    for var, var_target in zip(self.model_vars, self.target_model_vars):
        assign_ops.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*assign_ops)

    @make_tf_callable(self.get_session(), True)
    def compute_clone_network_logits(ob):
        # We do not support recurrent networks for now, so only the current
        # observation (no prev_action/prev_reward) is fed.
        feed_dict = {
            SampleBatch.CUR_OBS: tf.convert_to_tensor(ob),
            "is_training": tf.convert_to_tensor(False)
        }
        model_out, _ = self.target_model(feed_dict)
        return model_out

    self._compute_clone_network_logits = compute_clone_network_logits
def make_model_and_action_dist(policy, observation_space, action_space,
                               config):
    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space,
        config["model"],  # model_options
        dist_type="deterministic",
        framework="torch")
    model = ModelCatalog.get_model_v2(
        policy.preprocessor.observation_space,
        action_space,
        num_outputs=dist_dim,
        model_config=config["model"],
        framework="torch")
    # Make all model params not require any gradients.
    for p in model.parameters():
        p.requires_grad = False
    return model, dist_class
def build_avg_model_and_distribution(
        policy: Policy, obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space, config: TrainerConfigDict
) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]:
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            f"Action space {action_space} is not supported for NFSP.")

    policy.avg_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=AVG_POL_SCOPE)

    policy.avg_func_vars = policy.avg_model.variables()

    return policy.avg_model, TorchCategorical
def __init__(self, obs_space, action_space, config):
    assert config[DELAY_UPDATE]

    # Build the target network of this policy.
    _, logit_dim = ModelCatalog.get_action_dist(action_space,
                                                config["model"])
    self.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name="target_func",
        framework="tf")
    self.model_vars = self.model.variables()
    self.target_model_vars = self.target_model.variables()
    self.get_session().run(
        tf.variables_initializer(self.target_model_vars))

    # Here is the delayed update mechanism.
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    assign_ops = []
    assert len(self.model_vars) == len(self.target_model_vars)
    for var, var_target in zip(self.model_vars, self.target_model_vars):
        assign_ops.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*assign_ops)

    @make_tf_callable(self.get_session(), True)
    def compute_clone_network_logits(ob):
        feed_dict = {
            SampleBatch.CUR_OBS: tf.convert_to_tensor(ob),
            "is_training": tf.convert_to_tensor(False)
        }
        model_out, _ = self.target_model(feed_dict)
        return model_out

    self._compute_clone_network_logits = compute_clone_network_logits
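# The two mixins above implement the same soft ("tau") target update via TF
# assign ops. For comparison, a hedged sketch of the equivalent update in
# PyTorch; `soft_update` and its argument names are illustrative, not an
# RLlib API.
import torch


@torch.no_grad()
def soft_update(model, target_model, tau):
    # Polyak-average the online parameters into the target parameters:
    # target <- tau * online + (1 - tau) * target.
    for param, target_param in zip(model.parameters(),
                                   target_model.parameters()):
        target_param.mul_(1.0 - tau).add_(tau * param)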
def make_nomad_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework="torch")
    base_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework="torch")
    nomad_model = NomadModel(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name="NomadModel",
        base_model=base_model,
        order=config["mcts_param"]["order"])
    return nomad_model
def build_commnet(policy: TFPolicy, obs_space: gym.spaces.Space,
                  action_space: gym.spaces.Space, config) -> ModelV2:
    assert isinstance(action_space, gym.spaces.Tuple)

    unit_action_space = action_space.spaces[0]
    unit_action_dim = (unit_action_space.n
                       if isinstance(unit_action_space, gym.spaces.Discrete)
                       else np.product(unit_action_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=unit_action_dim * len(action_space.spaces),
        model_config=config["model"],
        framework="tf",
        default_model=CommNet,
        name="commnet",
    )
    return policy.model
def build_q_model(policy: Policy, obs_space: gym.spaces.Space,
                  action_space: gym.spaces.Space,
                  config: TrainerConfigDict) -> ModelV2:
    """Build q_model and target_q_model for DQN.

    Args:
        policy (Policy): The Policy, which will use the model for
            optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The trainer's config dict.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target q model will not be returned, just assigned to
            `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + list(config["model"]["fcnet_hiddens"]))[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    return q_model
def build_sac_model(policy: Policy, obs_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> ModelV2:
    """Constructs the necessary ModelV2 for the Policy and returns it.

    Args:
        policy (Policy): The TFPolicy that will use the models.
        obs_space (gym.spaces.Space): The observation space.
        action_space (gym.spaces.Space): The action space.
        config (TrainerConfigDict): The SAC trainer's config dict.

    Returns:
        ModelV2: The ModelV2 to be used by the Policy. Note: An additional
            target model will be created in this function and assigned to
            `policy.target_model`.
    """
    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's "Q_model" and
    # "policy_model" settings.
    policy_model_config = MODEL_DEFAULTS.copy()
    policy_model_config.update(config["policy_model"])
    q_model_config = MODEL_DEFAULTS.copy()
    q_model_config.update(config["Q_model"])

    default_model_cls = SACTorchModel if config["framework"] == "torch" \
        else SACTFModel

    model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=None,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model_cls,
        name="sac_model",
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    assert isinstance(model, default_model_cls)

    # Create an exact copy of the model and store it in `policy.target_model`.
    # This will be used for tau-synched Q-target models that run behind the
    # actual Q-networks and are used for target q-value calculations in the
    # loss terms.
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=None,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model_cls,
        name="target_sac_model",
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    assert isinstance(policy.target_model, default_model_cls)

    return model