def __init__(self, obs_space, action_space, config):
    """Build the policy-gradient graph and initialize the TF policy graph.

    Args:
        obs_space: Observation space; its ``shape`` sizes the observation
            placeholder and the object is forwarded to the model factory.
        action_space: Action space used to choose the action distribution
            and to create the action placeholders.
        config: Overrides merged on top of the PG ``DEFAULT_CONFIG``.
    """
    merged_config = dict(ray.rllib.agents.pg.pg.DEFAULT_CONFIG, **config)
    self.config = merged_config
    model_config = self.config["model"]

    # Input placeholders.
    obs_ph = tf.placeholder(
        tf.float32, shape=[None] + list(obs_space.shape))
    dist_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, model_config)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(
        tf.float32, [None], name="prev_reward")

    # Model network; its outputs parameterize the action distribution.
    model_inputs = {
        "obs": obs_ph,
        "prev_actions": prev_actions_ph,
        "prev_rewards": prev_rewards_ph,
        "is_training": self._get_is_training_placeholder(),
    }
    self.model = ModelCatalog.get_model(
        model_inputs, obs_space, action_space, self.logit_dim, model_config)
    policy_dist = dist_class(self.model.outputs)  # logit for each action

    # Policy loss.
    actions_ph = ModelCatalog.get_action_placeholder(action_space)
    advantages_ph = tf.placeholder(tf.float32, [None], name="adv")
    pg_loss = PGLoss(policy_dist, actions_ph, advantages_ph).loss

    # Sample-batch key -> placeholder pairs. The keys are read from
    # postprocessed sample batches and fed into the paired placeholders
    # during loss computation.
    feed_spec = [
        ("obs", obs_ph),
        ("actions", actions_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
        ("advantages", advantages_ph),  # added during postprocessing
    ]

    # Hand everything over to the base class, then initialize variables.
    session = tf.get_default_session()
    sampled_action = policy_dist.sample()
    sampled_action_prob = policy_dist.sampled_action_prob()
    TFPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        session,
        obs_input=obs_ph,
        action_sampler=sampled_action,
        action_prob=sampled_action_prob,
        loss=pg_loss,
        loss_inputs=feed_spec,
        model=self.model,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=merged_config["model"]["max_seq_len"])
    session.run(tf.global_variables_initializer())
def _setup_graph(self, ob_space, ac_space):
    """Create the observation placeholder, model and action sampler.

    Args:
        ob_space: Iterable of observation dimensions; a leading ``None``
            dimension is prepended for the placeholder.
        ac_space: Action space passed to
            ``ModelCatalog.get_action_dist``.
    """
    obs_shape = [None] + list(ob_space)
    self.x = tf.placeholder(tf.float32, obs_shape)

    action_dist_cls, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
    self._model = ModelCatalog.get_model(
        self.registry, self.x, self.logit_dim, self.config["model"])

    # The model outputs parameterize the current action distribution.
    self.logits = self._model.outputs
    self.curr_dist = action_dist_cls(self.logits)
    self.sample = self.curr_dist.sample()

    # Trainable variables registered under the current variable scope.
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
def __init__(self, obs_space, action_space, config):
    """Create the torch model and A3C loss, then init the torch policy graph.

    Args:
        obs_space: Observation space forwarded to the torch model factory.
        action_space: Action space used to look up the logit dimension.
        config: Overrides merged on top of the A3C ``DEFAULT_CONFIG``.
    """
    merged_config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = merged_config
    model_config = self.config["model"]

    # Only the logit dimension is needed; the dist class is discarded.
    _unused_dist_cls, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.model = ModelCatalog.get_torch_model(
        obs_space, self.logit_dim, model_config)

    a3c_loss = A3CLoss(
        self.model,
        self.config["vf_loss_coeff"],
        self.config["entropy_coeff"])

    batch_keys = ["obs", "actions", "advantages", "value_targets"]
    TorchPolicyGraph.__init__(
        self,
        obs_space,
        action_space,
        self.model,
        a3c_loss,
        loss_inputs=batch_keys)
def _setup_loss(self, action_space):
    """Create the action/advantage placeholders and the policy loss.

    The loss is the negated mean of ``logp(action) * advantage``.

    Args:
        action_space: Action space used to build the action placeholder.
    """
    self.ac = ModelCatalog.get_action_placeholder(action_space)
    self.adv = tf.placeholder(tf.float32, [None], name="adv")

    action_logp = self.dist.logp(self.ac)

    # policy loss
    weighted_logp = action_logp * self.adv
    self.loss = -tf.reduce_mean(weighted_logp)
def _setup_graph(self, ob_space, ac_space):
    """Create the policy model, value head, sampler and global step.

    Args:
        ob_space: Iterable of observation dimensions; a leading ``None``
            dimension is prepended for the placeholder.
        ac_space: Action space passed to
            ``ModelCatalog.get_action_dist``.
    """
    obs_shape = [None] + list(ob_space)
    self.x = tf.placeholder(tf.float32, obs_shape)

    action_dist_cls, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
    self._model = ModelCatalog.get_model(
        self.registry, self.x, self.logit_dim, self.config["model"])
    self.logits = self._model.outputs
    self.curr_dist = action_dist_cls(self.logits)

    # Single-unit linear head on the model's last layer, flattened to 1-D.
    value_out = linear(
        self._model.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_out, [-1])

    self.sample = self.curr_dist.sample()

    # Trainable variables registered under the current variable scope.
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Non-trainable int32 step counter, initialized to zero.
    zero_init = tf.constant_initializer(0, dtype=tf.int32)
    self.global_step = tf.get_variable(
        "global_step", [], tf.int32,
        initializer=zero_init,
        trainable=False)
def __init__(self, registry, env_creator, config):
    """Create the wrapped env, the PG policy and a synchronous sampler.

    Args:
        registry: Registry object forwarded to ``ModelCatalog`` and
            ``PGPolicy``.
        env_creator: Callable invoked with ``config["env_config"]`` to
            build the raw environment.
        config: Dict read for ``"env_config"``, ``"model"``,
            ``"batch_size"`` and ``"horizon"``.
    """
    raw_env = env_creator(config["env_config"])
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, raw_env, config["model"])
    self.config = config

    self.policy = PGPolicy(
        registry,
        self.env.observation_space,
        self.env.action_space,
        config)

    obs_filter = NoFilter()
    self.sampler = SyncSampler(
        self.env,
        self.policy,
        obs_filter,
        config["batch_size"],
        horizon=config["horizon"])
def _initialize_loss(self):
    """Build the loss by tracing postprocessing and loss init on dummy data.

    A one-row dummy batch is pushed through ``postprocess_trajectory`` to
    discover which keys the postprocessed batch carries. A placeholder is
    created for each new key, the loss is initialized on the resulting
    ``UsageTrackingDict``, and only the keys the loss actually accessed
    are registered as loss inputs.
    """

    def fake_array(tensor):
        # Zero array shaped like `tensor`, unknown dimensions set to 1.
        shape = tensor.shape.as_list()
        shape = [s if s is not None else 1 for s in shape]
        return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

    # `np.bool_` is used rather than the `np.bool` alias, which newer
    # NumPy releases removed.
    dummy_batch = {
        SampleBatch.CUR_OBS: fake_array(self._obs_input),
        SampleBatch.NEXT_OBS: fake_array(self._obs_input),
        SampleBatch.DONES: np.array([False], dtype=np.bool_),
        SampleBatch.ACTIONS: fake_array(
            ModelCatalog.get_action_placeholder(self.action_space)),
        SampleBatch.REWARDS: np.array([0], dtype=np.float32),
    }
    if self._obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input),
            SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input),
        })
    state_init = self.get_initial_state()
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
        dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
    if state_init:
        dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
    for k, v in self.extra_compute_action_fetches().items():
        dummy_batch[k] = fake_array(v)

    # postprocessing might depend on variable init, so run it first here
    self._sess.run(tf.global_variables_initializer())

    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    # model forward pass for the loss (needed after postprocess to
    # overwrite any tensor state from that call)
    self.model(self._input_dict, self._state_in, self._seq_lens)

    if self._obs_include_prev_action_reward:
        train_batch = UsageTrackingDict({
            SampleBatch.PREV_ACTIONS: self._prev_action_input,
            SampleBatch.PREV_REWARDS: self._prev_reward_input,
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.PREV_ACTIONS, self._prev_action_input),
            (SampleBatch.PREV_REWARDS, self._prev_reward_input),
            (SampleBatch.CUR_OBS, self._obs_input),
        ]
    else:
        train_batch = UsageTrackingDict({
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.CUR_OBS, self._obs_input),
        ]

    # Add a placeholder for every postprocessed column not yet covered.
    for k, v in postprocessed_batch.items():
        if k in train_batch:
            continue
        elif v.dtype == np.object_:
            continue  # can't handle arbitrary objects in TF
        elif k == "seq_lens" or k.startswith("state_in_"):
            continue
        shape = (None, ) + v.shape[1:]
        # float64 columns are fed through float32 placeholders.
        dtype = np.float32 if v.dtype == np.float64 else v.dtype
        placeholder = tf.placeholder(dtype, shape=shape, name=k)
        train_batch[k] = placeholder

    for i, si in enumerate(self._state_in):
        train_batch["state_in_{}".format(i)] = si
    train_batch["seq_lens"] = self._seq_lens

    if log_once("loss_init"):
        logger.debug(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(train_batch)))

    self._loss_input_dict = train_batch
    loss = self._do_loss_init(train_batch)
    # Only keys read during loss init become loss inputs; state inputs and
    # seq_lens are excluded here.
    for k in sorted(train_batch.accessed_keys):
        if k != "seq_lens" and not k.startswith("state_in_"):
            loss_inputs.append((k, train_batch[k]))

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(
            self._grad_stats_fn(self, train_batch, self._grads))
    # Run the initializer again after loss init.
    self._sess.run(tf.global_variables_initializer())
def __init__(self,
             action_space: Space,
             *,
             framework: str,
             model: ModelV2,
             feature_dim: int = 288,
             feature_net_config: Optional[ModelConfigDict] = None,
             inverse_net_hiddens: Tuple[int] = (256, ),
             inverse_net_activation: str = "relu",
             forward_net_hiddens: Tuple[int] = (256, ),
             forward_net_activation: str = "relu",
             beta: float = 0.2,
             eta: float = 1.0,
             lr: float = 1e-3,
             sub_exploration: Optional[FromConfigSpec] = None,
             **kwargs):
    """Initializes a Curiosity object.

    Uses as defaults the hyperparameters described in [1].

    Args:
        action_space (Space): The action space; must be Discrete or
            MultiDiscrete.
        framework (str): The DL framework; must be "torch".
        model (ModelV2): The policy's model, forwarded to the parent
            constructor and to the sub-exploration.
        feature_dim (int): The dimensionality of the feature (phi)
            vectors.
        feature_net_config (Optional[ModelConfigDict]): Optional model
            configuration for the feature network, producing feature
            vectors (phi) from observations. If None, a copy of the
            policy config's "model" entry is used.
        inverse_net_hiddens (Tuple[int]): Layer sizes of the inverse
            (action predicting) NN head, which sits on top of the
            feature outputs for phi and phi'.
        inverse_net_activation (str): Activation specifier for the
            inverse net.
        forward_net_hiddens (Tuple[int]): Layer sizes of the forward
            (phi' predicting) NN head.
        forward_net_activation (str): Activation specifier for the
            forward net.
        beta (float): Weight for the forward loss (over the inverse loss,
            which gets weight=1.0-beta) in the common loss term.
        eta (float): Weight for intrinsic rewards before being added to
            extrinsic ones.
        lr (float): The learning rate for the curiosity-specific
            optimizer, optimizing feature-, inverse-, and forward nets.
        sub_exploration (Optional[FromConfigSpec]): The config dict for
            the underlying Exploration to use (e.g. epsilon-greedy for
            DQN). Passing None is not supported yet.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If `framework` is not "torch", if `action_space` is
            not (Multi)Discrete, or if the policy config's `num_workers`
            is not 0.
        NotImplementedError: If `sub_exploration` is None.
    """
    # Guard clauses: only torch and (Multi)Discrete are supported.
    if framework != "torch":
        raise ValueError("Only torch is currently supported for Curiosity")
    if not isinstance(action_space, (Discrete, MultiDiscrete)):
        raise ValueError(
            "Only (Multi)Discrete action spaces supported for Curiosity "
            "so far!")

    super().__init__(
        action_space, model=model, framework=framework, **kwargs)

    if self.policy_config["num_workers"] != 0:
        raise ValueError(
            "Curiosity exploration currently does not support parallelism."
            " `num_workers` must be 0!")

    self.feature_dim = feature_dim
    # Fall back to a copy of the policy's own model config.
    if feature_net_config is None:
        feature_net_config = self.policy_config["model"].copy()
    self.feature_net_config = feature_net_config
    self.inverse_net_hiddens = inverse_net_hiddens
    self.inverse_net_activation = inverse_net_activation
    self.forward_net_hiddens = forward_net_hiddens
    self.forward_net_activation = forward_net_activation

    # Discrete: number of actions; MultiDiscrete: sum over `nvec`.
    if isinstance(self.action_space, Discrete):
        self.action_dim = self.action_space.n
    else:
        self.action_dim = np.sum(self.action_space.nvec)

    self.beta = beta
    self.eta = eta
    self.lr = lr

    # TODO: (sven) if sub_exploration is None, use Trainer's default
    #  Exploration config.
    if sub_exploration is None:
        raise NotImplementedError
    self.sub_exploration = sub_exploration

    # Creates modules/layers inside the actual ModelV2.
    self._curiosity_feature_net = ModelCatalog.get_model_v2(
        self.model.obs_space,
        self.action_space,
        self.feature_dim,
        model_config=self.feature_net_config,
        framework=self.framework,
        name="feature_net",
    )

    # Inverse net: [phi, phi'] -> action_dim outputs.
    inverse_layer_sizes = (
        [2 * self.feature_dim] + list(self.inverse_net_hiddens) +
        [self.action_dim])
    self._curiosity_inverse_fcnet = self._create_fc_net(
        inverse_layer_sizes, self.inverse_net_activation)

    # Forward net: [phi, action] -> feature_dim outputs.
    forward_layer_sizes = (
        [self.feature_dim + self.action_dim] +
        list(self.forward_net_hiddens) + [self.feature_dim])
    self._curiosity_forward_fcnet = self._create_fc_net(
        forward_layer_sizes, self.forward_net_activation)

    # This is only used to select the correct action
    self.exploration_submodule = from_config(
        cls=Exploration,
        config=self.sub_exploration,
        action_space=self.action_space,
        framework=self.framework,
        policy_config=self.policy_config,
        model=self.model,
        num_workers=self.num_workers,
        worker_index=self.worker_index,
    )
register_env("dm-" + env_name, env_creator) # Placeholder to enable use of a custom pre-processor class ImagePreproc(Preprocessor): def _init_shape(self, obs_space, options): shape = (84, 84, 3) # Adjust third dim if stacking frames return shape # return gym.spaces.Box( # low=0.0, high=1.0, shape=self.shape) def transform(self, observation): return observation ModelCatalog.register_custom_preprocessor("sq_im_84", ImagePreproc) config = { # Model and preprocessor options. "model": { "custom_model": model_name, "custom_options": { # Custom notes for the experiment "notes": { "args": vars(args) }, }, # NOTE:Wrappers are applied by RLlib if custom_preproc is NOT specified "custom_preprocessor": "sq_im_84", "dim": 84, "free_log_std": False, # if args.discrete_actions else True,
def __init__(self, observation_space, action_space, config):
    """Build the A3C graph (policy, value head, loss) and init the policy.

    Args:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: ``gym.spaces.Box`` or ``gym.spaces.Discrete``.
        config: Overrides merged on top of the A3C ``DEFAULT_CONFIG``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither Box nor
            Discrete.
    """
    merged_config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = merged_config
    self.sess = tf.get_default_session()

    # Policy network and value head.
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, self.config["model"])
    policy_dist = dist_class(self.model.outputs)
    value_out = linear(
        self.model.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_out, [-1])
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Action placeholder depends on the kind of action space.
    if isinstance(action_space, gym.spaces.Box):
        num_action_dims = action_space.shape[0]
        actions_ph = tf.placeholder(
            tf.float32, [None, num_action_dims], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions_ph = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))

    advantages_ph = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(
        policy_dist,
        actions_ph,
        advantages_ph,
        self.v_target,
        self.vf,
        self.config["vf_loss_coeff"],
        self.config["entropy_coeff"])

    # Sample-batch key -> placeholder pairs fed during loss computation.
    feed_spec = [
        ("obs", self.observations),
        ("actions", actions_ph),
        ("advantages", advantages_ph),
        ("value_targets", self.v_target),
    ]

    # Initialize both parents: LR schedule first, then the policy graph.
    LearningRateSchedule.__init__(
        self, self.config["lr"], self.config["lr_schedule"])
    sampled_action = policy_dist.sample()
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=sampled_action,
        loss=self.loss.total_loss,
        loss_inputs=feed_spec,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    # Tensors exposed as training statistics.
    training_stats = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "policy_loss": self.loss.pi_loss,
        "policy_entropy": self.loss.entropy,
        "grad_gnorm": tf.global_norm(self._grads),
        "var_gnorm": tf.global_norm(self.var_list),
        "vf_loss": self.loss.vf_loss,
        "vf_explained_var": explained_variance(self.v_target, self.vf),
    }
    self.stats_fetches = {"stats": training_stats}

    self.sess.run(tf.global_variables_initializer())
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             update_ops_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_action_sampler=None,
             existing_inputs=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor the policy
            graph, and dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        update_ops_fn (func): optional function that returns a list
            overriding the update ops to run when applying gradients
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        before_loss_init (func): optional function to run prior to
            loss init that takes the same arguments as __init__;
            skipped when None
        make_action_sampler (func): optional function that returns a
            tuple of action and action prob tensors. The function takes
            (policy, input_dict, obs_space, action_space, config) as its
            arguments
        existing_inputs (OrderedDict): when copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input
    """
    self.config = config
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._update_ops_fn = update_ops_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Setup standard placeholders
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        # Reuse the placeholders of the policy being copied.
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space)
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")

    self.input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }

    # Create the model network and action outputs
    if make_action_sampler:
        assert not existing_inputs, \
            "Cloning not supported with custom action sampler"
        # A custom sampler replaces the catalog model and distribution.
        self.model = None
        self.dist_class = None
        self.action_dist = None
        action_sampler, action_prob = make_action_sampler(
            self, self.input_dict, obs_space, action_space, config)
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        if existing_inputs:
            existing_state_in = [
                v for k, v in existing_inputs.items()
                if k.startswith("state_in_")
            ]
            # "seq_lens" is only read when state inputs are present.
            if existing_state_in:
                existing_seq_lens = existing_inputs["seq_lens"]
            else:
                existing_seq_lens = None
        else:
            existing_state_in = []
            existing_seq_lens = None
        self.model = ModelCatalog.get_model(
            self.input_dict,
            obs_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)
        self.action_dist = self.dist_class(self.model.outputs)
        action_sampler = self.action_dist.sample()
        action_prob = self.action_dist.sampled_action_prob()

    # Phase 1 init
    sess = tf.get_default_session() or tf.Session()
    if get_batch_divisibility_req:
        batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        batch_divisibility_req = 1
    TFPolicy.__init__(
        self,
        obs_space,
        action_space,
        sess,
        obs_input=obs,
        action_sampler=action_sampler,
        action_prob=action_prob,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        # `self.model` is None with a custom action sampler; the `and`
        # short-circuits so None is passed through in that case.
        state_inputs=self.model and self.model.state_in,
        state_outputs=self.model and self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model and self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req)

    # Phase 2 init
    self._needs_eager_conversion = set()
    self._eager_tensors = {}
    # The hook is optional (defaults to None), so only call it when given.
    if before_loss_init is not None:
        before_loss_init(self, obs_space, action_space, config)
    if not existing_inputs:
        self._initialize_loss()
def _initialize_loss_with_dummy_batch(self):
    """Run action computation, postprocessing and the loss on dummy data.

    Builds a one-row batch of tensors, runs a forward pass through
    ``compute_actions``, postprocesses the result and finally calls
    ``loss_fn`` (and ``stats_fn`` if set) on it.

    NOTE(review): ``obs_include_prev_action_reward``,
    ``get_batch_divisibility_req``, ``loss_fn`` and ``stats_fn`` are not
    defined in this function; presumably they come from an enclosing
    scope. Confirm against the surrounding file.
    """
    # Dummy forward pass to initialize any policy attributes, etc.
    action_dtype, action_shape = ModelCatalog.get_action_shape(
        self.action_space)
    # `np.bool_` is used rather than the `np.bool` alias, which newer
    # NumPy releases removed.
    dummy_batch = {
        SampleBatch.CUR_OBS: tf.convert_to_tensor(
            np.array([self.observation_space.sample()])),
        SampleBatch.NEXT_OBS: tf.convert_to_tensor(
            np.array([self.observation_space.sample()])),
        SampleBatch.DONES: tf.convert_to_tensor(
            np.array([False], dtype=np.bool_)),
        SampleBatch.ACTIONS: tf.convert_to_tensor(
            np.zeros(
                (1, ) + action_shape[1:],
                dtype=action_dtype.as_numpy_dtype())),
        SampleBatch.REWARDS: tf.convert_to_tensor(
            np.array([0], dtype=np.float32)),
    }
    if obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: dummy_batch[SampleBatch.ACTIONS],
            SampleBatch.PREV_REWARDS: dummy_batch[SampleBatch.REWARDS],
        })
    state_init = self.get_initial_state()
    state_batches = []
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = tf.convert_to_tensor(
            np.expand_dims(h, 0))
        dummy_batch["state_out_{}".format(i)] = tf.convert_to_tensor(
            np.expand_dims(h, 0))
        state_batches.append(tf.convert_to_tensor(np.expand_dims(h, 0)))
    if state_init:
        dummy_batch["seq_lens"] = tf.convert_to_tensor(
            np.array([1], dtype=np.int32))

    # for IMPALA which expects a certain sample batch size
    def tile_to(tensor, n):
        # Repeat along the first axis only; other axes keep their size.
        return tf.tile(tensor,
                       [n] + [1 for _ in tensor.shape.as_list()[1:]])

    if get_batch_divisibility_req:
        # The requirement is the same for every column; compute it once.
        required_rows = get_batch_divisibility_req(self)
        dummy_batch = {
            k: tile_to(v, required_rows)
            for k, v in dummy_batch.items()
        }

    # Execute a forward pass to get self.action_dist etc initialized,
    # and also obtain the extra action fetches
    _, _, fetches = self.compute_actions(
        dummy_batch[SampleBatch.CUR_OBS], state_batches,
        dummy_batch.get(SampleBatch.PREV_ACTIONS),
        dummy_batch.get(SampleBatch.PREV_REWARDS))
    dummy_batch.update(fetches)

    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    # model forward pass for the loss (needed after postprocess to
    # overwrite any tensor state from that call)
    self.model.from_batch(dummy_batch)

    postprocessed_batch = {
        k: tf.convert_to_tensor(v)
        for k, v in postprocessed_batch.items()
    }

    loss_fn(self, self.model, self.dist_class, postprocessed_batch)
    if stats_fn:
        stats_fn(self, postprocessed_batch)
def __init__(self, observation_space, action_space, config):
    """Build the IMPALA graph with a V-trace loss and init the policy.

    Args:
        observation_space: Observation space; its ``shape`` sizes the
            observation placeholder.
        action_space: ``gym.spaces.Box`` or ``gym.spaces.Discrete``.
        config: Overrides merged on top of the IMPALA ``DEFAULT_CONFIG``;
            ``batch_mode`` must be ``"truncate_episodes"``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither Box nor
            Discrete.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()

    # Policy network and value head.
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(
        self.observations, logit_dim, self.config["model"])
    policy_dist = dist_class(self.model.outputs)
    value_out = linear(
        self.model.last_layer, 1, "value", normc_initializer(1.0))
    value_preds = tf.reshape(value_out, [-1])
    scope_name = tf.get_variable_scope().name
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)

    # Loss placeholders; `ac_size` also sizes the behaviour logits.
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions_ph = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        ac_size = action_space.n
        actions_ph = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for IMPALA.".format(
                action_space))
    dones_ph = tf.placeholder(tf.bool, [None], name="dones")
    rewards_ph = tf.placeholder(tf.float32, [None], name="rewards")
    behaviour_logits_ph = tf.placeholder(
        tf.float32, [None, ac_size], name="behaviour_logits")

    def to_batches(tensor):
        """Reshape a flat [B * T, ...] tensor into [T, B, ...]."""
        if self.config["model"]["use_lstm"]:
            num_seqs = tf.shape(self.model.seq_lens)[0]
            seq_len = tf.shape(tensor)[0] // num_seqs
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            seq_len = (self.config["sample_batch_size"] //
                       self.config["num_envs_per_worker"])
            num_seqs = tf.shape(tensor)[0] // seq_len
        batch_major_shape = tf.concat(
            [[num_seqs, seq_len], tf.shape(tensor)[1:]], axis=0)
        batch_major = tf.reshape(tensor, batch_major_shape)

        # swap B and T axes
        rank = int(tf.shape(tensor).shape[0])
        trailing_axes = list(range(2, 1 + rank))
        return tf.transpose(batch_major, [1, 0] + trailing_axes)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=to_batches(actions_ph)[:-1],
        actions_logp=to_batches(policy_dist.logp(actions_ph))[:-1],
        actions_entropy=to_batches(policy_dist.entropy())[:-1],
        dones=to_batches(dones_ph)[:-1],
        behaviour_logits=to_batches(behaviour_logits_ph)[:-1],
        target_logits=to_batches(self.model.outputs)[:-1],
        discount=config["gamma"],
        rewards=to_batches(rewards_ph)[:-1],
        values=to_batches(value_preds)[:-1],
        # The last time step's value serves as the bootstrap value.
        bootstrap_value=to_batches(value_preds)[-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # Sample-batch key -> placeholder pairs fed during loss computation.
    feed_spec = [
        ("actions", actions_ph),
        ("dones", dones_ph),
        ("behaviour_logits", behaviour_logits_ph),
        ("rewards", rewards_ph),
        ("obs", self.observations),
    ]

    # Initialize both parents: LR schedule first, then the policy graph.
    LearningRateSchedule.__init__(
        self, self.config["lr"], self.config["lr_schedule"])
    sampled_action = policy_dist.sample()
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=sampled_action,
        loss=self.loss.total_loss,
        loss_inputs=feed_spec,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())

    # Tensors exposed as training statistics.
    training_stats = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "policy_loss": self.loss.pi_loss,
        "entropy": self.loss.entropy,
        "grad_gnorm": tf.global_norm(self._grads),
        "var_gnorm": tf.global_norm(self.var_list),
        "vf_loss": self.loss.vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(self.loss.vtrace_returns.vs, [-1]),
            tf.reshape(to_batches(value_preds)[:-1], [-1])),
    }
    self.stats_fetches = {"stats": training_stats}
def register_carla_model():
    """Register ``CarlaModel`` with the ModelCatalog under the name "carla"."""
    model_name = "carla"
    ModelCatalog.register_custom_model(model_name, CarlaModel)
def __init__(self, obs_space, action_space, config):
    """Set up the QMix models, mixers, loss and optimizer.

    Args:
        obs_space: Observation space; ``original_space.spaces`` is read
            to get one entry per agent.
        action_space: Action space; ``spaces[0]`` is used as the
            per-agent action space.
        config: Overrides merged on top of the QMix ``DEFAULT_CONFIG``.

    Raises:
        ValueError: If a Dict agent obs space lacks an ``obs`` subspace,
            if the action mask shape does not equal ``(n_actions, )``, or
            if ``config["mixer"]`` is not None, "qmix" or "vdn".
    """
    _validate(obs_space, action_space)
    config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config)
    self.framework = "torch"
    super().__init__(obs_space, action_space, config)

    agent_spaces = obs_space.original_space.spaces
    agent_action_space = action_space.spaces[0]
    self.n_agents = len(agent_spaces)
    self.n_actions = agent_action_space.n
    self.h_size = config["model"]["lstm_cell_size"]
    self.has_env_global_state = False
    self.has_action_mask = False

    # Use CUDA when torch reports it as available, else the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)

    agent_obs_space = obs_space.original_space.spaces[0]
    if not isinstance(agent_obs_space, Dict):
        self.obs_size = _get_size(agent_obs_space)
        self.env_global_state_shape = (self.obs_size, self.n_agents)
    else:
        space_keys = set(agent_obs_space.spaces.keys())
        if "obs" not in space_keys:
            raise ValueError(
                "Dict obs space must have subspace labeled `obs`")
        self.obs_size = _get_size(agent_obs_space.spaces["obs"])

        if "action_mask" in space_keys:
            mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape)
            if mask_shape != (self.n_actions, ):
                raise ValueError(
                    "Action mask shape must be {}, got {}".format(
                        (self.n_actions, ), mask_shape))
            self.has_action_mask = True

        if ENV_STATE in space_keys:
            self.env_global_state_shape = _get_size(
                agent_obs_space.spaces[ENV_STATE])
            self.has_env_global_state = True
        else:
            self.env_global_state_shape = (self.obs_size, self.n_agents)

        # The real agent obs space is nested inside the dict
        config["model"]["full_obs_space"] = agent_obs_space
        agent_obs_space = agent_obs_space.spaces["obs"]

    # Online and target networks are built the same way, differing only
    # in name.
    self.model = ModelCatalog.get_model_v2(
        agent_obs_space,
        agent_action_space,
        self.n_actions,
        config["model"],
        framework="torch",
        name="model",
        default_model=RNNModel).to(self.device)

    self.target_model = ModelCatalog.get_model_v2(
        agent_obs_space,
        agent_action_space,
        self.n_actions,
        config["model"],
        framework="torch",
        name="target_model",
        default_model=RNNModel).to(self.device)

    self.exploration = self._create_exploration()

    # Setup the mixer network.
    mixer_type = config["mixer"]
    if mixer_type is None:
        self.mixer = None
        self.target_mixer = None
    elif mixer_type == "qmix":
        embed_dim = config["mixing_embed_dim"]
        self.mixer = QMixer(
            self.n_agents, self.env_global_state_shape,
            embed_dim).to(self.device)
        self.target_mixer = QMixer(
            self.n_agents, self.env_global_state_shape,
            embed_dim).to(self.device)
    elif mixer_type == "vdn":
        self.mixer = VDNMixer().to(self.device)
        self.target_mixer = VDNMixer().to(self.device)
    else:
        raise ValueError("Unknown mixer type {}".format(mixer_type))

    self.cur_epsilon = 1.0
    self.update_target()  # initial sync

    # Setup optimizer: model parameters plus mixer parameters, if any.
    self.params = list(self.model.parameters())
    if self.mixer:
        self.params += list(self.mixer.parameters())
    self.loss = QMixLoss(
        self.model,
        self.target_model,
        self.mixer,
        self.target_mixer,
        self.n_agents,
        self.n_actions,
        self.config["double_q"],
        self.config["gamma"])

    from torch.optim import RMSprop
    self.optimiser = RMSprop(
        params=self.params,
        lr=config["lr"],
        alpha=config["optim_alpha"],
        eps=config["optim_eps"])
def setUp(self):
    """Start Ray and register the custom Keras and torch test models."""
    ray.init()
    custom_models = (
        ("keras_model", MyKerasModel),
        ("torch_model", MyTorchModel),
    )
    for model_name, model_cls in custom_models:
        ModelCatalog.register_custom_model(model_name, model_cls)
def __init__(self, obs_space, action_space, config):
    """Build the model and action dist, then init the parent policy class.

    NOTE(review): the hooks used here (``get_default_config``,
    ``validate_spaces``, ``before_init``, ``make_model``,
    ``make_model_and_action_dist``, ``before_loss_init``, ``after_init``,
    ``_after_loss_init``, ``loss_fn``, ``stats_fn``, ...) as well as
    ``framework`` and ``parent_cls`` are not defined in this function;
    presumably they come from an enclosing scope. Confirm against the
    surrounding file.

    Args:
        obs_space: Observation space of the policy.
        action_space: Action space of the policy.
        config: Policy config; merged over ``get_default_config()`` when
            that hook is set.
    """
    # Set up the config from possible default-config fn and given
    # config arg.
    if get_default_config:
        config = dict(get_default_config(), **config)
    self.config = config

    # Set the DL framework for this Policy.
    self.framework = self.config["framework"] = framework

    # Validate observation- and action-spaces.
    if validate_spaces:
        validate_spaces(self, obs_space, action_space, self.config)

    # Do some pre-initialization steps.
    if before_init:
        before_init(self, obs_space, action_space, self.config)

    # Model is customized (use default action dist class).
    if make_model:
        assert make_model_and_action_dist is None, \
            "Either `make_model` or `make_model_and_action_dist`" \
            " must be None!"
        self.model = make_model(self, obs_space, action_space, config)
        dist_class, _ = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework=framework)
    # Model and action dist class are customized.
    elif make_model_and_action_dist:
        self.model, dist_class = make_model_and_action_dist(
            self, obs_space, action_space, config)
    # Use default model and default action dist.
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"], framework=framework)
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework=framework)

    # Make sure, we passed in a correct Model factory. The message names
    # the class actually checked, so it is also correct for non-torch
    # frameworks.
    model_cls = TorchModelV2 if framework == "torch" else JAXModelV2
    assert isinstance(self.model, model_cls), \
        "ERROR: Generated Model must be a {} object!".format(
            model_cls.__name__)

    # Call the framework-specific Policy constructor.
    self.parent_cls = parent_cls
    self.parent_cls.__init__(
        self,
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        model=self.model,
        # No loss is passed when the policy is only used for evaluation.
        loss=None if self.config["in_evaluation"] else loss_fn,
        action_distribution_class=dist_class,
        action_sampler_fn=action_sampler_fn,
        action_distribution_fn=action_distribution_fn,
        max_seq_len=config["model"]["max_seq_len"],
        get_batch_divisibility_req=get_batch_divisibility_req,
    )

    # Merge Model's view requirements into Policy's.
    self.view_requirements.update(self.model.view_requirements)

    # `after_init` is used as a fallback for `before_loss_init`.
    _before_loss_init = before_loss_init or after_init
    if _before_loss_init:
        _before_loss_init(self, self.observation_space,
                          self.action_space, config)

    # Perform test runs through postprocessing- and loss functions.
    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=None if self.config["in_evaluation"] else stats_fn,
    )

    if _after_loss_init:
        _after_loss_init(self, obs_space, action_space, config)

    # Got to reset global_timestep again after this fake run-through.
    self.global_timestep = 0
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Build the Keras policy/value networks for the Stratego model.

    Separate "pi" (policy) and "vf" (value) stacks are built from conv,
    flatten and dense layers, optionally followed by an LSTM each. The
    policy stack can instead be loaded from a saved Keras model.

    Args:
        obs_space: Observation space; its ``original_space`` is indexed by
            the observation key chosen from ``observation_mode``.
        action_space: Action space, forwarded to ``TFModelV2.__init__``.
        num_outputs: Size of the unmasked policy logits (and of the value
            head when ``custom_options['q_fn']`` is truthy).
        model_config: Model options, merged over
            ``DEFAULT_STRATEGO_MODEL_CONFIG``. ``custom_options`` must
            contain ``observation_mode``, ``mask_invalid_actions`` and
            ``q_fn``; ``policy_keras_model_file_path`` is optional.
        name: Model name, forwarded to ``TFModelV2.__init__``.

    Raises:
        AssertionError: If ``observation_mode`` is not one of the three
            known modes, or ``BOTH_OBSERVATIONS`` is combined with
            ``vf_share_layers``.
        NotImplementedError: If a policy model file is given together
            with ``use_lstm``.
    """
    model_config = with_base_config(
        base_config=DEFAULT_STRATEGO_MODEL_CONFIG,
        extra_config=model_config)
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)

    print(model_config)

    # Choose which entry of the dict observation feeds each network.
    observation_mode = model_config['custom_options']['observation_mode']
    if observation_mode == PARTIALLY_OBSERVABLE:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'partial_observation'
    elif observation_mode == FULLY_OBSERVABLE:
        self.pi_obs_key = 'full_observation'
        self.vf_obs_key = 'full_observation'
    elif observation_mode == BOTH_OBSERVATIONS:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'full_observation'
        # Different inputs for pi and vf cannot share layers.
        assert not model_config['vf_share_layers']
    else:
        assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"

    if model_config["custom_preprocessor"]:
        print(obs_space)
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            observation_space=self.obs_space.original_space,
            options=model_config)
    else:
        self.preprocessor = None
        # `Logger.warn` is a deprecated alias; use `warning`.
        logger.warning("No custom preprocessor for StrategoModel was specified.\n"
                       "Some tree search policies may not initialize their placeholders correctly without this.")

    self.use_lstm = model_config['use_lstm']
    self.lstm_cell_size = model_config['lstm_cell_size']
    self.vf_share_layers = model_config.get("vf_share_layers")
    self.mask_invalid_actions = model_config['custom_options']['mask_invalid_actions']

    conv_activation = get_activation_fn(model_config.get("conv_activation"))
    cnn_filters = model_config.get("conv_filters")
    fc_activation = get_activation_fn(model_config.get("fcnet_activation"))
    hiddens = model_config.get("fcnet_hiddens")

    if self.use_lstm:
        # Recurrent state inputs: (h, c) for the policy LSTM followed by
        # (h, c) for the value LSTM.
        state_in = [
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="pi_lstm_h"),
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="pi_lstm_c"),
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="vf_lstm_h"),
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="vf_lstm_c"),
        ]
        seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in")

        # Observation inputs carry an extra leading (time) dimension.
        self.pi_obs_inputs = tf.keras.layers.Input(
            shape=(None, *obs_space.original_space[self.pi_obs_key].shape),
            name="pi_observation")
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=(None, *obs_space.original_space[self.vf_obs_key].shape),
            name="vf_observation")
    else:
        state_in, seq_lens_in = None, None

        self.pi_obs_inputs = tf.keras.layers.Input(
            shape=obs_space.original_space[self.pi_obs_key].shape,
            name="pi_observation")
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=obs_space.original_space[self.vf_obs_key].shape,
            name="vf_observation")

    if cnn_filters is None:
        # assuming board size will always remain the same for both pi and vf networks
        if self.use_lstm:
            single_obs_input_shape = self.pi_obs_inputs.shape.as_list()[2:]
        else:
            single_obs_input_shape = self.pi_obs_inputs.shape.as_list()[1:]
        cnn_filters = _get_filter_config(single_obs_input_shape)

    def maybe_td(layer):
        """Wrap `layer` in TimeDistributed when the model is recurrent."""
        if self.use_lstm:
            return tf.keras.layers.TimeDistributed(layer=layer)
        else:
            return layer

    def build_primary_layers(prefix: str, obs_in: tf.Tensor, state_in: tf.Tensor):
        """Build conv -> flatten -> dense (-> LSTM) layers named by `prefix`.

        Returns the last layer's output and the LSTM state outputs
        (None when no LSTM is used).
        """
        # encapsulated in a function to either be called once for shared policy/vf or twice for separate policy/vf
        _last_layer = obs_in

        for i, (out_size, kernel, stride) in enumerate(cnn_filters):
            _last_layer = maybe_td(tf.keras.layers.Conv2D(
                filters=out_size,
                kernel_size=kernel,
                strides=stride,
                activation=conv_activation,
                padding="same",
                name="{}_conv_{}".format(prefix, i)))(_last_layer)

        _last_layer = maybe_td(tf.keras.layers.Flatten())(_last_layer)

        for i, size in enumerate(hiddens):
            _last_layer = maybe_td(tf.keras.layers.Dense(
                size,
                name="{}_fc_{}".format(prefix, i),
                activation=fc_activation,
                kernel_initializer=normc_initializer(1.0)))(_last_layer)

        if self.use_lstm:
            _last_layer, *state_out = tf.keras.layers.LSTM(
                units=self.lstm_cell_size,
                return_sequences=True,
                return_state=True,
                name="{}_lstm".format(prefix))(
                    inputs=_last_layer,
                    mask=tf.sequence_mask(seq_lens_in),
                    initial_state=state_in)
        else:
            state_out = None

        return _last_layer, state_out

    if self.use_lstm:
        pi_state_in = state_in[:2]
        vf_state_in = state_in[2:]
    else:
        pi_state_in, vf_state_in = None, None

    # Optional path to a saved Keras model to use as the policy network.
    policy_file_path = model_config['custom_options'].get(
        'policy_keras_model_file_path')

    if policy_file_path is not None:
        if self.use_lstm:
            raise NotImplementedError

        pi_state_out = None
        self._pi_model = load_model(filepath=policy_file_path, compile=False)

        # rename layers
        for layer in self._pi_model.layers:
            layer._name = "pi_" + layer.name
        self._pi_model.layers[-1]._name = 'pi_unmasked_logits'

        self.unmasked_logits_out = self._pi_model(self.pi_obs_inputs)
    else:
        self._pi_model = None
        pi_last_layer, pi_state_out = build_primary_layers(
            prefix="pi", obs_in=self.pi_obs_inputs, state_in=pi_state_in)

        self.unmasked_logits_out = maybe_td(tf.keras.layers.Dense(
            num_outputs,
            name="pi_unmasked_logits",
            activation=None,
            kernel_initializer=normc_initializer(0.01)))(pi_last_layer)

    vf_last_layer, vf_state_out = build_primary_layers(
        prefix="vf", obs_in=self.vf_obs_inputs, state_in=vf_state_in)

    if self.use_lstm:
        state_out = [*pi_state_out, *vf_state_out]
    else:
        state_out = None

    # With a Q-function head the value output has one entry per action.
    self._use_q_fn = model_config['custom_options']['q_fn']
    if self._use_q_fn:
        value_out_size = num_outputs
    else:
        value_out_size = 1

    value_out = maybe_td(tf.keras.layers.Dense(
        value_out_size,
        name="vf_out",
        activation=None,
        kernel_initializer=normc_initializer(0.01)))(vf_last_layer)

    model_inputs = [self.pi_obs_inputs, self.vf_obs_inputs]
    model_outputs = [self.unmasked_logits_out, value_out]
    if self.use_lstm:
        model_inputs += [seq_lens_in, *state_in]
        model_outputs += state_out

    self.base_model = tf.keras.Model(inputs=model_inputs, outputs=model_outputs)

    # `summary()` prints the table itself and returns None, so it must not
    # be wrapped in `print` (that would emit a stray "None" line).
    self.base_model.summary()

    self.register_variables(self.base_model.variables)
filters_10x10 = [ [32, [2, 2], 2], [256, [2, 2], 2] ] if len(shape) == 3 and shape[:2] == [84, 84]: return filters_84x84 elif len(shape) == 3 and shape[:2] == [42, 42]: return filters_42x42 elif len(shape) == 3 and shape[:2] == [3, 4]: return filters_3x4 elif len(shape) == 3 and shape[:2] == [4, 4]: return filters_4x4 elif len(shape) == 3 and shape[:2] == [6, 6]: return filters_6x6 elif len(shape) == 3 and shape[:2] == [10, 10]: return filters_10x10 elif len(shape) == 1: # Don't use a cnn in this case return [] else: raise ValueError( "No default configuration for obs shape {}".format(shape) + ", you must specify `conv_filters` manually as a model option" ", or add it as a default to the _get_filter_config function.") ModelCatalog.register_custom_model(STRATEGO_MODEL, StrategoModel)
def __init__(self, observation_space, action_space, config):
    """Build the A3C model, loss and stats, then initialize TFPolicyGraph.

    Args:
        observation_space: Observation space; its shape sizes the
            observation placeholder.
        action_space: A ``gym.spaces.Box`` or ``gym.spaces.Discrete``
            action space.
        config: Overrides applied on top of the A3C ``DEFAULT_CONFIG``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither Box nor
            Discrete.
    """
    self.config = config = dict(
        ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    model_options = self.config["model"]

    # Setup the policy
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_cls, num_logits = ModelCatalog.get_action_dist(
        action_space, model_options)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(
        tf.float32, [None], name="prev_reward")
    model_inputs = {
        "obs": self.observations,
        "prev_actions": prev_actions_ph,
        "prev_rewards": prev_rewards_ph,
        "is_training": self._get_is_training_placeholder(),
    }
    self.model = ModelCatalog.get_model(
        model_inputs, observation_space, num_logits, model_options)
    pi_dist = dist_cls(self.model.outputs)
    self.vf = self.model.value_function()
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    # Setup the policy loss: the action placeholder depends on the kind
    # of action space.
    if isinstance(action_space, gym.spaces.Discrete):
        actions_ph = tf.placeholder(tf.int64, [None], name="ac")
    elif isinstance(action_space, gym.spaces.Box):
        actions_ph = tf.placeholder(
            tf.float32, [None, action_space.shape[0]], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages_ph = tf.placeholder(tf.float32, [None], name="advantages")
    self.v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(
        pi_dist, actions_ph, advantages_ph, self.v_target, self.vf,
        self.config["vf_loss_coeff"], self.config["entropy_coeff"])

    # Sample-batch keys mapped to the placeholders they feed.
    feed_spec = [
        ("obs", self.observations),
        ("actions", actions_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
        ("advantages", advantages_ph),
        ("value_targets", self.v_target),
    ]

    # Initialize the learning-rate schedule first, then TFPolicyGraph.
    LearningRateSchedule.__init__(
        self, self.config["lr"], self.config["lr_schedule"])
    sampler = pi_dist.sample()
    combined_loss = self.model.loss() + self.loss.total_loss
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=sampler,
        loss=combined_loss,
        loss_inputs=feed_spec,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_options["max_seq_len"])

    # Tensors fetched alongside each training step.
    learner_stats = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "policy_loss": self.loss.pi_loss,
        "policy_entropy": self.loss.entropy,
        "grad_gnorm": tf.global_norm(self._grads),
        "var_gnorm": tf.global_norm(self.var_list),
        "vf_loss": self.loss.vf_loss,
        "vf_explained_var": explained_variance(self.v_target, self.vf),
    }
    self.stats_fetches = {"stats": learner_stats}

    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config):
    """Build the A3C model with a linear value head, its loss and summaries.

    Args:
        observation_space: Observation space; its shape sizes the
            observation placeholder.
        action_space: A ``gym.spaces.Box`` or ``gym.spaces.Discrete``
            action space.
        config: Overrides applied on top of the A3C ``DEFAULT_CONFIG``.

    Raises:
        UnsupportedSpaceException: If ``action_space`` is neither Box nor
            Discrete.
    """
    self.config = config = dict(
        ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    model_options = self.config["model"]

    # Setup the policy
    obs_shape = [None] + list(observation_space.shape)
    self.observations = tf.placeholder(tf.float32, obs_shape)
    dist_cls, num_logits = ModelCatalog.get_action_dist(
        action_space, model_options)
    self.model = ModelCatalog.get_model(
        self.observations, num_logits, model_options)
    pi_dist = dist_cls(self.model.outputs)

    # Scalar value head on top of the model's last layer, flattened to 1-D.
    value_head = linear(
        self.model.last_layer, 1, "value", normc_initializer(1.0))
    self.vf = tf.reshape(value_head, [-1])
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    training_flag = tf.placeholder_with_default(True, ())

    # Setup the policy loss: the action placeholder depends on the kind
    # of action space.
    if isinstance(action_space, gym.spaces.Discrete):
        actions_ph = tf.placeholder(tf.int64, [None], name="ac")
    elif isinstance(action_space, gym.spaces.Box):
        actions_ph = tf.placeholder(
            tf.float32, [None, action_space.shape[0]], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(
                action_space))
    advantages_ph = tf.placeholder(tf.float32, [None], name="advantages")
    v_target_ph = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(
        pi_dist, actions_ph, advantages_ph, v_target_ph, self.vf,
        self.config["vf_loss_coeff"], self.config["entropy_coeff"])

    # Sample-batch keys mapped to placeholders, followed by one entry per
    # recurrent state input of the model.
    feed_spec = [
        ("obs", self.observations),
        ("actions", actions_ph),
        ("advantages", advantages_ph),
        ("value_targets", v_target_ph),
    ]
    feed_spec.extend(
        ("state_in_{}".format(idx), state_ph)
        for idx, state_ph in enumerate(self.model.state_in))

    self.state_in = self.model.state_in
    self.state_out = self.model.state_out

    # Initialize TFPolicyGraph
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=pi_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=feed_spec,
        is_training=training_flag,
        state_inputs=self.state_in,
        state_outputs=self.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=model_options["max_seq_len"])

    if self.config.get("summarize"):
        # Loss summaries are divided by the batch size.
        batch_size = tf.to_float(tf.shape(self.observations)[0])
        tf.summary.scalar("model/policy_graph", self.loss.pi_loss / batch_size)
        tf.summary.scalar("model/value_loss", self.loss.vf_loss / batch_size)
        tf.summary.scalar("model/entropy", self.loss.entropy / batch_size)
        tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
        tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
        self.summary_op = tf.summary.merge_all()

    self.sess.run(tf.global_variables_initializer())
def __init__(self,
             action_space,
             value_targets,
             advantages,
             actions,
             logits,
             vf_preds,
             curr_action_dist,
             value_fn,
             cur_kl_coeff,
             valid_mask,
             entropy_coeff=0,
             clip_param=0.1,
             vf_clip_param=0.1,
             vf_loss_coeff=1.0,
             use_gae=True):
    """Construct the clipped-surrogate PPO loss.

    Sets ``self.mean_kl``, ``self.mean_entropy``, ``self.mean_policy_loss``,
    ``self.mean_vf_loss`` and ``self.loss``; every mean is taken only over
    the elements selected by ``valid_mask``.

    Arguments:
        action_space: Action space used to look up the action
            distribution class for the previous policy's ``logits``.
        value_targets (Placeholder): Target values for the value
            function; used only if ``use_gae`` is true.
        advantages (Placeholder): Advantages from the previous model
            evaluation.
        actions (Placeholder): Actions taken in the previous model
            evaluation.
        logits (Placeholder): Distribution inputs produced by the
            previous model evaluation.
        vf_preds (Placeholder): Value function outputs from the previous
            model evaluation.
        curr_action_dist (ActionDistribution): Action distribution of the
            current model.
        value_fn (Tensor): Current value function output.
        cur_kl_coeff (Variable): Current coefficient of the KL penalty.
        valid_mask (Tensor): Bool mask of valid input elements (#2992).
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip range of the probability ratio.
        vf_clip_param (float): Clip range of the value function change.
        vf_loss_coeff (float): Coefficient of the value function loss.
        use_gae (bool): If true, add the clipped value function loss.
    """

    def masked_mean(tensor):
        # Average over the valid (unmasked) elements only.
        return tf.reduce_mean(tf.boolean_mask(tensor, valid_mask))

    # Distribution of the policy that generated the samples.
    old_dist_cls = ModelCatalog.get_action_dist(action_space, {})[0]
    old_dist = old_dist_cls(logits)

    # Probability ratio between the current and the old policy.
    ratio = tf.exp(
        curr_action_dist.logp(actions) - old_dist.logp(actions))

    kl = old_dist.kl(curr_action_dist)
    self.mean_kl = masked_mean(kl)

    entropy = curr_action_dist.entropy()
    self.mean_entropy = masked_mean(entropy)

    # Clipped surrogate objective: elementwise minimum of the unclipped
    # and the clipped term.
    unclipped_term = advantages * ratio
    clipped_term = advantages * tf.clip_by_value(
        ratio, 1 - clip_param, 1 + clip_param)
    surrogate = tf.minimum(unclipped_term, clipped_term)
    self.mean_policy_loss = masked_mean(-surrogate)

    if not use_gae:
        self.mean_vf_loss = tf.constant(0.0)
        per_sample = (-surrogate + cur_kl_coeff * kl -
                      entropy_coeff * entropy)
    else:
        # Value loss: the larger of the unclipped and the clipped squared
        # error against the targets.
        raw_sq_err = tf.square(value_fn - value_targets)
        clipped_value = vf_preds + tf.clip_by_value(
            value_fn - vf_preds, -vf_clip_param, vf_clip_param)
        clipped_sq_err = tf.square(clipped_value - value_targets)
        vf_err = tf.maximum(raw_sq_err, clipped_sq_err)
        self.mean_vf_loss = masked_mean(vf_err)
        per_sample = (-surrogate + cur_kl_coeff * kl +
                      vf_loss_coeff * vf_err - entropy_coeff * entropy)

    self.loss = masked_mean(per_sample)
def __init__(self, observation_space, action_space, config):
    """Create model, exploration, optimizer and loss for the eager policy.

    The builder hooks used below (``get_default_config``,
    ``validate_spaces``, ``before_init``, ``make_model``,
    ``before_loss_init``, ``after_init``, ``optimizer_fn``, ``loss_fn``,
    ``stats_fn``, ``action_sampler_fn``, ``action_distribution_fn``,
    ``get_batch_divisibility_req``) are not defined in this method; they
    are resolved from the enclosing scope.

    Args:
        observation_space: Observation space of the policy.
        action_space: Action space of the policy.
        config: Policy config. If ``get_default_config`` is set, its
            result is the base and ``config`` overrides it.

    Raises:
        ValueError: If ``action_sampler_fn`` or ``action_distribution_fn``
            is given without ``make_model``.
    """
    assert tf.executing_eagerly()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    self._loss = loss_fn
    # May be a callable taking the policy, a plain value, or unset (-> 1).
    self.batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)

    if get_default_config:
        config = dict(get_default_config(), **config)

    # Read only after the defaults were merged in, so that a partial user
    # config (without a "model" entry) does not raise a KeyError here.
    self._max_seq_len = config["model"]["max_seq_len"]

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config
    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )
    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    if optimizer_fn:
        optimizers = optimizer_fn(self, config)
    else:
        optimizers = tf.keras.optimizers.Adam(config["lr"])
    optimizers = force_list(optimizers)
    if getattr(self, "exploration", None):
        optimizers = self.exploration.get_exploration_optimizer(
            optimizers)
    # TODO: (sven) Allow tf policy to have more than 1 optimizer.
    #  Just like torch Policy does.
    self._optimizer = optimizers[0] if optimizers else None

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep = 0
parser.add_argument( "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." ) parser.add_argument( "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." ) if __name__ == "__main__": args = parser.parse_args() ray.init(num_cpus=3) num_frames = 16 ModelCatalog.register_custom_model( "frame_stack_model", FrameStackingCartPoleModel if args.framework != "torch" else TorchFrameStackingCartPoleModel, ) config = { "env": StatelessCartPole, "model": { "vf_share_layers": True, "custom_model": "frame_stack_model", "custom_model_config": { "num_frames": num_frames, }, # To compare against a simple LSTM: # "use_lstm": True, # "lstm_use_prev_action": True, # "lstm_use_prev_reward": True,
# Test API wrapper for dueling Q-head. obs_space = Box(-1.0, 1.0, (3, )) action_space = Discrete(3) # Run in eager mode for value checking and debugging. tf1.enable_eager_execution() # __sphinx_doc_model_construct_1_begin__ my_dueling_model = ModelCatalog.get_model_v2( obs_space=obs_space, action_space=action_space, num_outputs=action_space.n, model_config=MODEL_DEFAULTS, framework=args.framework, # Providing the `model_interface` arg will make the factory # wrap the chosen default model with our new model API class # (DuelingQModel). This way, both `forward` and `get_q_values` # are available in the returned class. model_interface=DuelingQModel if args.framework != "torch" else TorchDuelingQModel, name="dueling_q_model", ) # __sphinx_doc_model_construct_1_end__ batch_size = 10 input_ = np.array([obs_space.sample() for _ in range(batch_size)]) # Note that for PyTorch, you will have to provide torch tensors here. if args.framework == "torch": input_ = torch.from_numpy(input_) input_dict = {
def __init__(self, obs_space, action_space, config):
    """Build agent/target models, the mixer, the loss and the optimizer.

    Args:
        obs_space: Observation space whose ``original_space.spaces`` holds
            one space per agent. An agent space may be a ``Dict`` with
            exactly the keys ``"obs"`` and ``"action_mask"``.
        action_space: Action space whose ``spaces[0].n`` gives the number
            of actions per agent.
        config: Overrides applied on top of the QMIX ``DEFAULT_CONFIG``.

    Raises:
        ValueError: If a Dict agent space has unexpected keys, the action
            mask shape does not match the number of actions, or
            ``config["mixer"]`` is not one of ``None``, ``"qmix"``,
            ``"vdn"``.
    """
    _validate(obs_space, action_space)
    config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config)
    self.config = config
    self.observation_space = obs_space
    self.action_space = action_space
    self.n_agents = len(obs_space.original_space.spaces)
    self.n_actions = action_space.spaces[0].n
    self.h_size = config["model"]["lstm_cell_size"]

    # The first agent's space is taken as the space of every agent.
    agent_obs_space = obs_space.original_space.spaces[0]
    if isinstance(agent_obs_space, Dict):
        space_keys = set(agent_obs_space.spaces.keys())
        if space_keys != {"obs", "action_mask"}:
            raise ValueError(
                "Dict obs space for agent must have keyset "
                "['obs', 'action_mask'], got {}".format(space_keys))
        mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape)
        if mask_shape != (self.n_actions, ):
            raise ValueError("Action mask shape must be {}, got {}".format(
                (self.n_actions, ), mask_shape))
        self.has_action_mask = True
        self.obs_size = _get_size(agent_obs_space.spaces["obs"])
        # The real agent obs space is nested inside the dict
        agent_obs_space = agent_obs_space.spaces["obs"]
    else:
        self.has_action_mask = False
        self.obs_size = _get_size(agent_obs_space)

    self.model = ModelCatalog.get_torch_model(
        agent_obs_space,
        self.n_actions,
        config["model"],
        default_model_cls=RNNModel)
    self.target_model = ModelCatalog.get_torch_model(
        agent_obs_space,
        self.n_actions,
        config["model"],
        default_model_cls=RNNModel)

    # Setup the mixer network.
    # The global state is just the stacked agent observations for now.
    self.state_shape = [self.obs_size, self.n_agents]
    if config["mixer"] is None:
        self.mixer = None
        self.target_mixer = None
    elif config["mixer"] == "qmix":
        self.mixer = QMixer(self.n_agents, self.state_shape,
                            config["mixing_embed_dim"])
        self.target_mixer = QMixer(self.n_agents, self.state_shape,
                                   config["mixing_embed_dim"])
    elif config["mixer"] == "vdn":
        self.mixer = VDNMixer()
        self.target_mixer = VDNMixer()
    else:
        raise ValueError("Unknown mixer type {}".format(config["mixer"]))

    self.cur_epsilon = 1.0
    self.update_target()  # initial sync

    # Setup optimizer
    self.params = list(self.model.parameters())
    if self.mixer is not None:
        # The mixer is part of the loss and must be trained jointly with
        # the agent network; without this its weights would never be
        # updated by the optimizer.
        self.params += list(self.mixer.parameters())
    self.loss = QMixLoss(self.model, self.target_model, self.mixer,
                         self.target_mixer, self.n_agents, self.n_actions,
                         self.config["double_q"], self.config["gamma"])
    self.optimiser = RMSprop(
        params=self.params,
        lr=config["lr"],
        alpha=config["optim_alpha"],
        eps=config["optim_eps"])
def build_q_model_and_distribution(
        policy: Policy, obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict) -> Tuple[ModelV2, TorchDistributionWrapper]:
    """Create the DQN Q-model and its target copy and attach them to `policy`.

    Sets ``policy.q_model``, ``policy.q_func_vars``,
    ``policy.target_q_model`` and ``policy.target_q_func_vars``. When
    ``config["hiddens"]`` is non-empty, ``config["model"]["no_final_linear"]``
    is set to True in place.

    Args:
        policy (Policy): The policy, which will use the model for
            optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space; must
            be Discrete.
        config (TrainerConfigDict): Config holding the model options and
            the Q-head settings read below.

    Returns:
        (q_model, TorchCategorical)
            Note: The target q model is not returned, only assigned to
            `policy.target_q_model`.

    Raises:
        UnsupportedSpaceException: If `action_space` is not Discrete.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # The base model feeds the Q-head: its output size is the last
        # fcnet layer size, or 256 if no fcnet layers are configured.
        fc_sizes = list(config["model"]["fcnet_hiddens"])
        num_outputs = fc_sizes[-1] if fc_sizes else 256
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    explorer = getattr(policy, "exploration", None)
    add_layer_norm = (
        isinstance(explorer, ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    def _make_q_model(scope):
        # Both networks are built with identical settings; only the name
        # differs.
        return ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=num_outputs,
            model_config=config["model"],
            framework="torch",
            model_interface=DQNTorchModel,
            name=scope,
            q_hiddens=config["hiddens"],
            dueling=config["dueling"],
            num_atoms=config["num_atoms"],
            use_noisy=config["noisy"],
            v_min=config["v_min"],
            v_max=config["v_max"],
            sigma0=config["sigma0"],
            add_layer_norm=add_layer_norm)

    policy.q_model = _make_q_model(Q_SCOPE)
    policy.q_func_vars = policy.q_model.variables()

    policy.target_q_model = _make_q_model(Q_TARGET_SCOPE)
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model, TorchCategorical
def __init__(self, observation_space, action_space, config):
    """Create the eager policy's model, run it once, then set up the loss.

    The builder hooks used below (``get_default_config``, ``before_init``,
    ``make_model``, ``action_sampler_fn``, ``before_loss_init``,
    ``optimizer_fn``, ``after_init``) are not defined in this method; they
    are resolved from the enclosing scope.

    Args:
        observation_space: Observation space of the policy.
        action_space: Action space of the policy.
        config: Policy config. If ``get_default_config`` is set, its
            result is the base and ``config`` overrides it.

    Raises:
        ValueError: If ``action_sampler_fn`` is given without
            ``make_model``.
    """
    assert tf.executing_eagerly()
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    if get_default_config:
        config = dict(get_default_config(), **config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    # A custom sampler needs a custom model; otherwise take the default
    # action distribution for the action space.
    if action_sampler_fn and not make_model:
        raise ValueError(
            "make_model is required if action_sampler_fn is given")
    if action_sampler_fn:
        self.dist_class = None
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework="tf",
        )

    # One forward pass on a dummy batch of size 1 (sampled observation,
    # sampled previous action, zero previous reward).
    dummy_obs = tf.convert_to_tensor(
        np.array([observation_space.sample()]))
    dummy_prev_action = tf.convert_to_tensor(
        [_flatten_action(action_space.sample())])
    dummy_prev_reward = tf.convert_to_tensor([0.])
    dummy_input = {
        SampleBatch.CUR_OBS: dummy_obs,
        SampleBatch.PREV_ACTIONS: dummy_prev_action,
        SampleBatch.PREV_REWARDS: dummy_prev_reward,
    }
    dummy_state = [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ]
    self.model(dummy_input, dummy_state, tf.convert_to_tensor([1]))

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_with_dummy_batch()
    self._loss_initialized = True

    self._optimizer = (optimizer_fn(self, config) if optimizer_fn else
                       tf.train.AdamOptimizer(config["lr"]))

    if after_init:
        after_init(self, observation_space, action_space, config)
def __init__(self, observation_space, action_space, config):
    """Create model, exploration, optimizer and loss for the eager policy.

    The builder hooks used below (``get_default_config``,
    ``validate_spaces``, ``before_init``, ``make_model``,
    ``before_loss_init``, ``after_init``, ``optimizer_fn``, ``loss_fn``,
    ``stats_fn``, ``action_sampler_fn``, ``action_distribution_fn``,
    ``get_batch_divisibility_req``) are not defined in this method; they
    are resolved from the enclosing scope.

    Args:
        observation_space: Observation space of the policy.
        action_space: Action space of the policy.
        config: Policy config. If ``get_default_config`` is set, its
            result is the base and ``config`` overrides it.

    Raises:
        ValueError: If ``action_sampler_fn`` or ``action_distribution_fn``
            is given without ``make_model``.
    """
    assert tf.executing_eagerly()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    self._loss = loss_fn
    # May be a callable taking the policy, a plain value, or unset (-> 1).
    self.batch_divisibility_req = get_batch_divisibility_req(self) if \
        callable(get_batch_divisibility_req) else \
        (get_batch_divisibility_req or 1)

    if get_default_config:
        config = dict(get_default_config(), **config)

    # Read only after the defaults were merged in, so that a partial user
    # config (without a "model" entry) does not raise a KeyError here.
    self._max_seq_len = config["model"]["max_seq_len"]

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config
    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )
    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_inference_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    # Initial recurrent state, each entry with a leading batch dim of 1.
    self._state_in = [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ]

    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(
        self.model.inference_view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    if optimizer_fn:
        optimizers = optimizer_fn(self, config)
    else:
        optimizers = tf.keras.optimizers.Adam(config["lr"])
    optimizers = force_list(optimizers)
    if getattr(self, "exploration", None):
        optimizers = self.exploration.get_exploration_optimizer(
            optimizers)
    # TODO: (sven) Allow tf policy to have more than 1 optimizer.
    #  Just like torch Policy does.
    self._optimizer = optimizers[0] if optimizers else None

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep = 0
def _init_helper(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
    """Build placeholders, model, V-trace loss and stats for this policy graph.

    Args:
        observation_space: Observation space; its shape sizes the
            observation placeholder.
        action_space: Action space. Discrete and MultiDiscrete are
            recognized, but the `assert is_multidiscrete` below rejects
            everything except MultiDiscrete.
        config: Overrides applied on top of `DEFAULT_CONFIG`. Must use the
            "truncate_episodes" batch mode and a falsy "imitation" entry.
        existing_inputs: Not referenced anywhere in this method.

    Raises:
        AssertionError: If the batch mode is not "truncate_episodes", if
            `config["imitation"]` is truthy, or if the action space is not
            MultiDiscrete.
        UnsupportedSpaceException: If the action space is neither Discrete
            nor MultiDiscrete.
    """
    config = dict(DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    imitation = config["imitation"]
    # NOTE: because of this assert, every `if imitation:` branch below is
    # unreachable (unless assertions are disabled).
    assert not imitation

    if imitation:
        T = config["sample_batch_size"]
        B = config["train_batch_size"] // T
        batch_shape = (T, B)
    else:
        batch_shape = (None, )

    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        actions_shape = batch_shape
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        actions_shape = batch_shape + (len(action_space.nvec), )
        output_hidden_shape = action_space.nvec.astype(np.int32)
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for IMPALA.".format(
                action_space))

    # Only MultiDiscrete action spaces get past this point.
    assert is_multidiscrete

    if imitation:
        make_action_ph = lambda: ssbm_actions.make_ph(
            ssbm_actions.flat_repeated_config, batch_shape)
        actions = make_action_ph()
        prev_actions = make_action_ph()
    else:
        # actions are stacked "multidiscrete"
        actions = tf.placeholder(tf.int64, actions_shape, name="actions")
        prev_actions = tf.placeholder(
            tf.int64, actions_shape, name="prev_actions")

    # Create input placeholders
    dones = tf.placeholder(tf.bool, batch_shape, name="dones")
    rewards = tf.placeholder(tf.float32, batch_shape, name="rewards")
    if imitation:
        observations = ssbm_spaces.slippi_conv_list[0].make_ph(batch_shape)
    else:
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
    behavior_logp = tf.placeholder(tf.float32, batch_shape)

    # Always None here; passed to the model as `state_in` / `seq_lens`.
    existing_state_in = None
    existing_seq_lens = None

    # Setup the policy
    autoregressive = config.get("autoregressive")
    if autoregressive:
        logit_dim = 128  # not really logits
    else:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    prev_rewards = tf.placeholder(
        tf.float32, batch_shape, name="prev_reward")
    self.model = HumanActionModel(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        imitation=imitation,
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # HumanActionModel doesn't flatten outputs
    flat_outputs = snt.MergeDims(0, 2)(self.model.outputs)

    if autoregressive:
        # The autoregressive distribution returns log-probs and entropy
        # together, and sampled actions together with their log-prob.
        action_dist = ssbm_actions.AutoRegressive(
            nest.map_structure(lambda conv: conv.build_dist(),
                               ssbm_actions.flat_repeated_config),
            residual=config.get("residual"))
        actions_logp, actions_entropy = action_dist.logp(
            flat_outputs, tf.unstack(actions, axis=-1))
        action_sampler, self.sampled_logp = action_dist.sample(
            flat_outputs)
        # Stack the flattened sampled components into one int64 tensor.
        action_sampler = tf.stack(
            [tf.cast(t, tf.int64) for t in nest.flatten(action_sampler)],
            axis=-1)
        sampled_prob = tf.exp(self.sampled_logp)
    else:
        # Split the flat model outputs into one input per sub-action.
        dist_inputs = tf.split(flat_outputs, output_hidden_shape, axis=-1)
        action_dist = dist_class(dist_inputs)
        int64_actions = [tf.cast(x, tf.int64) for x in actions]
        actions_logp = action_dist.logp(int64_actions)
        actions_entropy = action_dist.entropy()
        action_sampler = action_dist.sample()
        sampled_prob = action_dist.sampled_action_prob()
        self.sampled_logp = tf.log(sampled_prob)

    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            # Batch size is the number of sequences; the trajectory
            # length follows from the flat leading dimension.
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    # actual loss computation
    values_tm = make_time_major(self.model.value_function())
    # All but the last time step are baselines; the last one is used as
    # the bootstrap value below.
    baseline_values = values_tm[:-1]
    actions_logp_tm = make_time_major(actions_logp, True)
    behavior_logp_tm = make_time_major(behavior_logp, True)
    log_rhos_tm = actions_logp_tm - behavior_logp_tm

    discounts = tf.fill(tf.shape(baseline_values), config["gamma"])
    if not config.get("soft_horizon"):
        # Zero the discount wherever `dones` is set.
        discounts *= tf.to_float(~make_time_major(dones, True))

    vtrace_returns = vtrace.from_importance_weights(
        log_rhos=log_rhos_tm,
        discounts=discounts,
        rewards=make_time_major(rewards, True),
        values=baseline_values,
        bootstrap_value=values_tm[-1])

    vf_loss = tf.reduce_mean(
        tf.squared_difference(vtrace_returns.vs, baseline_values))
    pi_loss = -tf.reduce_mean(
        actions_logp_tm * vtrace_returns.pg_advantages)
    entropy_mean = tf.reduce_mean(actions_entropy)

    # total = policy loss + vf_coeff * value loss - entropy_coeff * entropy
    total_loss = pi_loss
    total_loss += self.config["vf_loss_coeff"] * vf_loss
    total_loss -= self.config["entropy_coeff"] * entropy_mean
    self.total_loss = total_loss

    # Reported as a stat only; not part of `total_loss`.
    kl_mean = -tf.reduce_mean(log_rhos_tm)

    # Initialize TFPolicyGraph
    loss_in = [
        (SampleBatch.ACTIONS, actions),
        (SampleBatch.DONES, dones),
        ("behavior_logp", behavior_logp),
        (SampleBatch.REWARDS, rewards),
        (SampleBatch.CUR_OBS, observations),
        (SampleBatch.PREV_ACTIONS, prev_actions),
        (SampleBatch.PREV_REWARDS, prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_sampler,
        action_prob=sampled_prob,
        loss=self.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    # Tensors fetched alongside each training step.
    self.stats_fetches = {
        LEARNER_STATS_KEY: {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "pi_loss": pi_loss,
            "entropy": entropy_mean,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(vtrace_returns.vs, [-1]),
                tf.reshape(baseline_values, [-1])),
            "kl_mean": kl_mean,
        },
    }
def __init__(self,
             action_space,
             value_targets,
             advantages,
             actions,
             logits,
             vf_preds,
             curr_action_dist,
             value_fn,
             cur_kl_coeff,
             valid_mask,
             entropy_coeff=0,
             clip_param=0.1,
             vf_clip_param=0.1,
             vf_loss_coeff=1.0,
             use_gae=True):
    """Build the clipped-surrogate PPO loss and its summary tensors.

    Arguments:
        action_space: Environment action space specification; used to look
            up the distribution class that is wrapped around `logits`.
        value_targets (Placeholder): Placeholder for target values; used
            for GAE.
        advantages (Placeholder): Placeholder for calculated advantages
            from previous model evaluation.
        actions (Placeholder): Placeholder for actions taken
            from previous model evaluation.
        logits (Placeholder): Placeholder for logits output from
            previous model evaluation.
        vf_preds (Placeholder): Placeholder for value function output
            from previous model evaluation.
        curr_action_dist (ActionDistribution): ActionDistribution
            of the current model.
        value_fn (Tensor): Current value function output Tensor.
        cur_kl_coeff (Variable): Variable holding the current PPO KL
            coefficient.
        valid_mask (Tensor): A bool mask of valid input elements (#2992).
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip parameter for the probability ratio.
        vf_clip_param (float): Clip parameter for the value function.
        vf_loss_coeff (float): Coefficient of the value function loss.
        use_gae (bool): If true, use the Generalized Advantage Estimator
            and add the value-function term to the loss.

    Sets `self.mean_kl`, `self.mean_entropy`, `self.mean_policy_loss`,
    `self.mean_vf_loss` and `self.loss`.
    """

    def masked_mean(t):
        # Average only over the entries selected by `valid_mask`.
        return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

    # Distribution that produced the samples, rebuilt from the stored logits.
    dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
    prev_dist = dist_cls(logits)

    # Probability ratio between the current and the sampling distribution,
    # formed in log space.
    curr_logp = curr_action_dist.logp(actions)
    prev_logp = prev_dist.logp(actions)
    logp_ratio = tf.exp(curr_logp - prev_logp)

    action_kl = prev_dist.kl(curr_action_dist)
    self.mean_kl = masked_mean(action_kl)

    curr_entropy = curr_action_dist.entropy()
    self.mean_entropy = masked_mean(curr_entropy)

    # Clipped surrogate objective: element-wise minimum of the unclipped
    # and the clipped advantage-weighted ratio.
    unclipped_objective = advantages * logp_ratio
    clipped_objective = advantages * tf.clip_by_value(
        logp_ratio, 1 - clip_param, 1 + clip_param)
    surrogate_loss = tf.minimum(unclipped_objective, clipped_objective)
    self.mean_policy_loss = masked_mean(-surrogate_loss)

    if use_gae:
        # Clipped value loss: the new prediction may move at most
        # `vf_clip_param` away from the old one; keep the larger error.
        vf_loss1 = tf.square(value_fn - value_targets)
        vf_clipped = vf_preds + tf.clip_by_value(
            value_fn - vf_preds, -vf_clip_param, vf_clip_param)
        vf_loss2 = tf.square(vf_clipped - value_targets)
        vf_loss = tf.maximum(vf_loss1, vf_loss2)
        self.mean_vf_loss = masked_mean(vf_loss)
    else:
        vf_loss = None
        self.mean_vf_loss = tf.constant(0.0)

    # Per-sample objective: policy term + KL penalty (+ value term)
    # - entropy bonus, reduced over the valid entries only.
    per_sample_loss = -surrogate_loss + cur_kl_coeff * action_kl
    if vf_loss is not None:
        per_sample_loss = per_sample_loss + vf_loss_coeff * vf_loss
    per_sample_loss = per_sample_loss - entropy_coeff * curr_entropy
    self.loss = masked_mean(per_sample_loss)
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the V-trace (IMPALA) policy graph: inputs, model, loss, stats.

    Arguments:
        observation_space: Observation space; its `shape` sizes the
            observation placeholder.
        action_space: Action space; only `gym.spaces.Discrete` is accepted
            when the placeholders are created here.
        config (dict): Overrides merged on top of the IMPALA
            `DEFAULT_CONFIG`.
        existing_inputs (list): Optional tensors to build the graph on
            instead of new placeholders, ordered as actions, dones,
            behaviour_logits, rewards, observations, prev_actions,
            prev_rewards, followed by any state inputs, with the sequence
            lengths last.

    Raises:
        AssertionError: if `batch_mode` is not "truncate_episodes".
        UnsupportedSpaceException: if placeholders have to be created for
            a non-Discrete action space.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()

    # Create input placeholders
    if existing_inputs:
        actions, dones, behaviour_logits, rewards, observations, \
            prev_actions, prev_rewards = existing_inputs[:7]
        existing_state_in = existing_inputs[7:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        if isinstance(action_space, gym.spaces.Discrete):
            ac_size = action_space.n
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for IMPALA.".format(
                    action_space))
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, ac_size], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

    # Setup the policy
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    if not existing_inputs:
        # Only create fresh previous-action/reward placeholders when none
        # were supplied; otherwise the tensors unpacked from
        # `existing_inputs` above must stay wired into the model and loss.
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    action_dist = dist_class(self.model.outputs)
    values = self.model.value_function()
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def to_batches(tensor):
        """Reshape a flat [B * T, ...] tensor to time-major [T, B, ...]."""
        if self.model.state_init:
            # One row per sequence: B comes from seq_lens, T is derived.
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
        # swap B and T axes
        return tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards, dtype=tf.bool)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    self.loss = VTraceLoss(
        actions=to_batches(actions)[:-1],
        actions_logp=to_batches(action_dist.logp(actions))[:-1],
        actions_entropy=to_batches(action_dist.entropy())[:-1],
        dones=to_batches(dones)[:-1],
        behaviour_logits=to_batches(behaviour_logits)[:-1],
        target_logits=to_batches(self.model.outputs)[:-1],
        discount=config["gamma"],
        rewards=to_batches(rewards)[:-1],
        values=to_batches(values)[:-1],
        bootstrap_value=to_batches(values)[-1],
        valid_mask=to_batches(mask)[:-1],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        entropy_coeff=self.config["entropy_coeff"],
        clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"])

    # KL divergence between worker and learner logits for debugging
    model_dist = Categorical(self.model.outputs)
    behaviour_dist = Categorical(behaviour_logits)
    self.KLs = model_dist.kl(behaviour_dist)
    self.mean_KL = tf.reduce_mean(self.KLs)
    self.max_KL = tf.reduce_max(self.KLs)
    self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0)

    # Initialize TFPolicyGraph
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", observations),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
    ]
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    # Tensors fetched alongside each learning step for reporting.
    self.stats_fetches = {
        "stats": {
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.vtrace_returns.vs, [-1]),
                tf.reshape(to_batches(values)[:-1], [-1])),
            "mean_KL": self.mean_KL,
            "max_KL": self.max_KL,
            "median_KL": self.median_KL,
        },
    }
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the PPO policy graph: inputs, model, value function and loss.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph; merged on top
            of the PPO `DEFAULT_CONFIG`.
        existing_inputs (list): Optional tensors to build the graph on
            instead of new placeholders, ordered as obs, value_targets,
            advantages, actions, logits, vf_preds, prev_actions,
            prev_rewards, followed by any state inputs, with the sequence
            lengths last.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph

    # Sample-batch keys paired with the tensors they are fed into.
    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph
        },
        observation_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            # Value head: one linear unit on top of the policy model's
            # last layer.
            self.value_function = tf.reshape(
                linear(self.model.last_layer, 1, "value",
                       normc_initializer(1.0)), [-1])
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            vf_config["use_lstm"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    {
                        "obs": obs_ph,
                        "prev_actions": prev_actions_ph,
                        "prev_rewards": prev_rewards_ph
                    }, observation_space, 1, vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        # The mask is consumed as a boolean mask by the loss, and the
        # recurrent branch above already yields a bool tensor, so the
        # all-valid mask must be bool as well (not float32 like adv_ph).
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        loss=self.loss_obj.loss,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)

    # Tensors fetched alongside each learning step for reporting.
    self.stats_fetches = {
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy
    }
def __init__(self,
             obs_space,
             action_space,
             config,
             loss_fn,
             stats_fn=None,
             grad_stats_fn=None,
             before_loss_init=None,
             make_model=None,
             action_sampler_fn=None,
             action_distribution_fn=None,
             existing_inputs=None,
             existing_model=None,
             get_batch_divisibility_req=None,
             obs_include_prev_action_reward=True):
    """Initialize a dynamic TF policy.

    Arguments:
        obs_space (gym.Space): Observation space of the policy.
        action_space (gym.Space): Action space of the policy.
        config (dict): Policy-specific configuration data.
        loss_fn (func): function that returns a loss tensor the policy
            graph, and dict of experience tensor placeholders
        stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and batch input tensors
        grad_stats_fn (func): optional function that returns a dict of
            TF fetches given the policy and loss gradient tensors
        before_loss_init (Optional[callable]): Optional function to run
            prior to loss init; called with
            (policy, obs_space, action_space, config).
        make_model (func): optional function that returns a ModelV2 object
            given (policy, obs_space, action_space, config).
            All policy variables should be created in this function. If not
            specified, a default model will be created.
        action_sampler_fn (Optional[callable]): An optional callable
            returning a tuple of action and action-logp tensors. It is
            called with the policy and model plus the keyword arguments
            obs_batch, state_batches, seq_lens, prev_action_batch,
            prev_reward_batch, explore and is_training. If None, a default
            action distribution will be used.
        action_distribution_fn (Optional[callable]): A callable returning
            distribution inputs (parameters), a dist-class to generate an
            action distribution object from, and internal-state outputs
            (or an empty list if not applicable). It receives the same
            arguments as `action_sampler_fn`. It should only perform a
            simple forward pass through some model; exploration is applied
            here afterwards. If None, inputs are passed through
            `self.model()` to get the distribution inputs.
        existing_inputs (OrderedDict): When copying a policy, this
            specifies an existing dict of placeholders to use instead of
            defining new ones
        existing_model (ModelV2): when copying a policy, this specifies
            an existing model to clone and share weights with
        get_batch_divisibility_req (func): optional function that returns
            the divisibility requirement for sample batches
        obs_include_prev_action_reward (bool): whether to include the
            previous action and reward in the model input

    Raises:
        ValueError: if `action_sampler_fn` or `action_distribution_fn` is
            given without `make_model`.
    """
    self.observation_space = obs_space
    self.action_space = action_space
    self.config = config
    self.framework = "tf"
    self._loss_fn = loss_fn
    self._stats_fn = stats_fn
    self._grad_stats_fn = grad_stats_fn
    self._obs_include_prev_action_reward = obs_include_prev_action_reward

    # Standard input tensors: reuse the supplied ones when copying a
    # policy, otherwise create fresh placeholders. The previous
    # action/reward inputs stay None unless they are part of the model
    # input.
    prev_actions = None
    prev_rewards = None
    if existing_inputs is not None:
        obs = existing_inputs[SampleBatch.CUR_OBS]
        if self._obs_include_prev_action_reward:
            prev_actions = existing_inputs[SampleBatch.PREV_ACTIONS]
            prev_rewards = existing_inputs[SampleBatch.PREV_REWARDS]
        action_input = existing_inputs[SampleBatch.ACTIONS]
        explore = existing_inputs["is_exploring"]
        timestep = existing_inputs["timestep"]
    else:
        obs = tf.placeholder(
            tf.float32,
            shape=[None] + list(obs_space.shape),
            name="observation")
        action_input = ModelCatalog.get_action_placeholder(action_space)
        if self._obs_include_prev_action_reward:
            prev_actions = ModelCatalog.get_action_placeholder(
                action_space, "prev_action")
            prev_rewards = tf.placeholder(
                tf.float32, [None], name="prev_reward")
        explore = tf.placeholder_with_default(
            True, (), name="is_exploring")
        timestep = tf.placeholder(tf.int32, (), name="timestep")

    self._input_dict = {
        SampleBatch.CUR_OBS: obs,
        SampleBatch.PREV_ACTIONS: prev_actions,
        SampleBatch.PREV_REWARDS: prev_rewards,
        "is_training": self._get_is_training_placeholder(),
    }
    # Placeholder for RNN time-chunk valid lengths.
    self._seq_lens = tf.placeholder(
        dtype=tf.int32, shape=[None], name="seq_lens")

    # A default action distribution is only looked up when action
    # generation is not customized; customized generation needs a
    # caller-provided model factory instead.
    dist_class = dist_inputs = None
    custom_action_logic = action_sampler_fn or action_distribution_fn
    if not custom_action_logic:
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
    elif not make_model:
        raise ValueError(
            "`make_model` is required if `action_sampler_fn` OR "
            "`action_distribution_fn` is given")

    # Setup self.model.
    if existing_model:
        self.model = existing_model
    elif make_model:
        self.model = make_model(self, obs_space, action_space, config)
    else:
        self.model = ModelCatalog.get_model_v2(
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=logit_dim,
            model_config=self.config["model"],
            framework="tf",
            **self.config["model"].get("custom_model_config", {}))

    # Create the Exploration object to use for this Policy.
    self.exploration = self._create_exploration()

    # State inputs: taken from the `state_in_*` entries when copying,
    # otherwise one placeholder per initial-state array of the model.
    if existing_inputs:
        self._state_in = [
            tensor for key, tensor in existing_inputs.items()
            if key.startswith("state_in_")
        ]
        if self._state_in:
            self._seq_lens = existing_inputs["seq_lens"]
    else:
        self._state_in = [
            tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
            for s in self.model.get_initial_state()
        ]

    if action_sampler_fn:
        # Fully customized action generation (e.g., custom policy).
        # NOTE(review): `self._state_out` is not assigned on this path but
        # is read below; presumably the sampler fn or the class provides
        # it - confirm.
        sampled_action, sampled_action_logp = action_sampler_fn(
            self,
            self.model,
            obs_batch=self._input_dict[SampleBatch.CUR_OBS],
            state_batches=self._state_in,
            seq_lens=self._seq_lens,
            prev_action_batch=self._input_dict[SampleBatch.PREV_ACTIONS],
            prev_reward_batch=self._input_dict[SampleBatch.PREV_REWARDS],
            explore=explore,
            is_training=self._input_dict["is_training"])
    else:
        if action_distribution_fn:
            # Distribution generation is customized, e.g., DQN, DDPG.
            dist_inputs, dist_class, self._state_out = \
                action_distribution_fn(
                    self,
                    self.model,
                    obs_batch=self._input_dict[SampleBatch.CUR_OBS],
                    state_batches=self._state_in,
                    seq_lens=self._seq_lens,
                    prev_action_batch=self._input_dict[
                        SampleBatch.PREV_ACTIONS],
                    prev_reward_batch=self._input_dict[
                        SampleBatch.PREV_REWARDS],
                    explore=explore,
                    is_training=self._input_dict["is_training"])
        else:
            # Default distribution generation behavior:
            # Pass through model. E.g., PG, PPO.
            dist_inputs, self._state_out = self.model(
                self._input_dict, self._state_in, self._seq_lens)

        action_dist = dist_class(dist_inputs, self.model)

        # Using exploration to get final action (e.g. via sampling).
        sampled_action, sampled_action_logp = \
            self.exploration.get_exploration_action(
                action_distribution=action_dist,
                timestep=timestep,
                explore=explore)

    # Phase 1 init.
    sess = tf.get_default_session() or tf.Session()
    batch_divisibility_req = (get_batch_divisibility_req(self)
                              if get_batch_divisibility_req else 1)

    super().__init__(
        observation_space=obs_space,
        action_space=action_space,
        config=config,
        sess=sess,
        obs_input=obs,
        action_input=action_input,  # for logp calculations
        sampled_action=sampled_action,
        sampled_action_logp=sampled_action_logp,
        dist_inputs=dist_inputs,
        dist_class=dist_class,
        loss=None,  # dynamically initialized on run
        loss_inputs=[],
        model=self.model,
        state_inputs=self._state_in,
        state_outputs=self._state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self._seq_lens,
        max_seq_len=config["model"]["max_seq_len"],
        batch_divisibility_req=batch_divisibility_req,
        explore=explore,
        timestep=timestep)

    # Phase 2 init.
    if before_loss_init is not None:
        before_loss_init(self, obs_space, action_space, config)
    # The loss is only built here when no existing inputs were supplied.
    if not existing_inputs:
        self._initialize_loss()
def _setup_graph(self, ob_space, ac_space):
    """Create the torch model and its Adam optimizer.

    Stores the logit dimension reported for `ac_space` in
    `self.logit_dim`, the model in `self._model` and the optimizer in
    `self.optimizer`.
    """
    # Only the logit dimension is needed; the distribution class is unused.
    _dist_cls, logit_dim = ModelCatalog.get_action_dist(ac_space)
    self.logit_dim = logit_dim

    model_options = self.config["model"]
    self._model = ModelCatalog.get_torch_model(
        self.registry, ob_space, self.logit_dim, model_options)

    learning_rate = self.config["lr"]
    self.optimizer = torch.optim.Adam(
        self._model.parameters(), lr=learning_rate)
activation_fn=activation, scope="fc{}".format(i), ) i += 1 output = slim.fully_connected( last_layer, num_outputs, weights_initializer=normc_initializer(0.01), activation_fn=None, scope="fc_out", ) return output, last_layer ModelCatalog.register_custom_model("PommermanModel1", PommermanModel) class BaseLineAgent(BaseAgent): def act(self, obs, action_space): pass class NoDoAgent(BaseAgent): def act(self, obs, action_space): return 0 class SuicidalAgent(BaseAgent): def act(self, obs, action_space): return 5
def _setup_graph(self, ob_space, ac_space):
    """Create the torch model and its Adam optimizer.

    Stores the logit dimension reported for `ac_space` in
    `self.logit_dim`, the model in `self._model` and the optimizer in
    `self.optimizer`.
    """
    # Only the logit dimension is needed; the distribution class is unused.
    _dist_cls, logit_dim = ModelCatalog.get_action_dist(ac_space)
    self.logit_dim = logit_dim

    model_options = self.config["model"]
    self._model = ModelCatalog.get_torch_model(
        ob_space, self.logit_dim, model_options)

    learning_rate = self.config["lr"]
    self.optimizer = torch.optim.Adam(
        self._model.parameters(), lr=learning_rate)
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the PPO policy graph: inputs, model, value function and loss.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph; merged on top
            of the PPO `DEFAULT_CONFIG`.
        existing_inputs (list): Optional tensors to build the graph on
            instead of new placeholders, ordered as obs, value_targets,
            advantages, actions, logits, vf_preds, prev_actions,
            prev_rewards, followed by any state inputs, with the sequence
            lengths last.
    """
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if not existing_inputs:
        # Fresh placeholders; no recurrent state is handed in.
        obs_ph = tf.placeholder(
            tf.float32,
            name="obs",
            shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(
            tf.float32, name="advantages", shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(
            tf.float32, name="logits", shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(
            tf.float32, name="vf_preds", shape=(None, ))
        value_targets_ph = tf.placeholder(
            tf.float32, name="value_targets", shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    else:
        # Reuse the supplied tensors: eight fixed inputs, then the state
        # inputs, with the sequence lengths in the last slot.
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]

    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    # Sample-batch keys paired with the tensors they are fed into.
    self.loss_in = [
        ("obs", obs_ph),
        ("value_targets", value_targets_ph),
        ("advantages", adv_ph),
        ("actions", act_ph),
        ("logits", logits_ph),
        ("vf_preds", vf_preds_ph),
        ("prev_actions", prev_actions_ph),
        ("prev_rewards", prev_rewards_ph),
    ]

    def model_input_dict():
        # Input dict shared by the policy model and the separate value
        # model; built anew for each model.
        return {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        }

    self.model = ModelCatalog.get_model(
        model_input_dict(),
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    self.kl_coeff = tf.get_variable(
        initializer=tf.constant_initializer(self.kl_coeff_val),
        name="kl_coeff",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()

    if not self.config["use_gae"]:
        # Without GAE there is no learned baseline: zeros, one per row.
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
    elif self.config["vf_share_layers"]:
        self.value_function = self.model.value_function()
    else:
        vf_config = self.config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        if vf_config["use_lstm"]:
            vf_config["use_lstm"] = False
            logger.warning(
                "It is not recommended to use a LSTM model with "
                "vf_share_layers=False (consider setting it to True). "
                "If you want to not share layers, you can implement "
                "a custom LSTM model that overrides the "
                "value_function() method.")
        with tf.variable_scope("value_function"):
            value_model = ModelCatalog.get_model(
                model_input_dict(), observation_space, action_space, 1,
                vf_config)
            self.value_function = value_model.outputs
            self.value_function = tf.reshape(self.value_function, [-1])

    # Mask of valid entries: derived from the sequence lengths for
    # recurrent models, all-true otherwise.
    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(
        action_space,
        value_targets_ph,
        adv_ph,
        act_ph,
        logits_ph,
        vf_preds_ph,
        curr_action_dist,
        self.value_function,
        self.kl_coeff,
        mask,
        entropy_coeff=self.config["entropy_coeff"],
        clip_param=self.config["clip_param"],
        vf_clip_param=self.config["vf_clip_param"],
        vf_loss_coeff=self.config["vf_loss_coeff"],
        use_gae=self.config["use_gae"])

    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=curr_action_dist.sampled_action_prob(),
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)

    # Tensors fetched alongside each learning step for reporting.
    loss_obj = self.loss_obj
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": loss_obj.loss,
        "policy_loss": loss_obj.mean_policy_loss,
        "vf_loss": loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": loss_obj.mean_kl,
        "entropy": loss_obj.mean_entropy
    }
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name):
    """Build per-component encoders for a Dict/Tuple observation space.

    Each component of the flattened original space gets its own
    treatment: 3D components go through a CNN, Discrete/MultiDiscrete
    components are marked for one-hot encoding and everything else is
    flattened. The summed feature size feeds an optional post-concat FC
    stack and, if `num_outputs` is set, a logits head and a value head.

    Arguments:
        obs_space: Observation space; its `original_space` attribute is
            used when present and must be a Dict or Tuple.
        action_space: Action space, forwarded to the sub-models.
        num_outputs: Size of the logits head. If falsy, no heads are built
            and `self.num_outputs` is set to the concatenated size.
        model_config (dict): Model options; reads `conv_filters`,
            `conv_activation`, `post_fcnet_hiddens`,
            `post_fcnet_activation` and `_disable_preprocessor_api`.
        name: Model name forwarded to `TorchModelV2.__init__`.

    Raises:
        AssertionError: if the original space is not a Dict or Tuple.
    """
    self.original_space = obs_space.original_space if \
        hasattr(obs_space, "original_space") else obs_space
    assert isinstance(self.original_space, (Dict, Tuple)), \
        "`obs_space.original_space` must be [Dict|Tuple]!"

    self.processed_obs_space = self.original_space if \
        model_config.get("_disable_preprocessor_api") else obs_space

    nn.Module.__init__(self)
    TorchModelV2.__init__(self, self.original_space, action_space,
                          num_outputs, model_config, name)

    self.flattened_input_space = flatten_space(self.original_space)

    # Build the CNN(s) given obs_space's image components.
    self.cnns = {}
    self.one_hot = {}
    self.flatten = {}
    concat_size = 0
    for i, component in enumerate(self.flattened_input_space):
        # Image space.
        if len(component.shape) == 3:
            config = {
                # Default filters must match this image component's own
                # shape, not the shape of the enclosing composite space.
                "conv_filters": model_config["conv_filters"]
                if "conv_filters" in model_config else
                get_filter_config(component.shape),
                "conv_activation": model_config.get("conv_activation"),
                "post_fcnet_hiddens": [],
            }
            # TODO (sven): add IMPALA-style option (residual CNN) next to
            # the default CNN built here.
            cnn = ModelCatalog.get_model_v2(
                component,
                action_space,
                num_outputs=None,
                model_config=config,
                framework="torch",
                name="cnn_{}".format(i))
            concat_size += cnn.num_outputs
            self.cnns[i] = cnn
            # Register the sub-model so torch tracks its parameters (the
            # plain dict above is not visible to nn.Module).
            self.add_module("cnn_{}".format(i), cnn)
        # Discrete|MultiDiscrete inputs -> One-hot encode.
        elif isinstance(component, Discrete):
            self.one_hot[i] = True
            concat_size += component.n
        elif isinstance(component, MultiDiscrete):
            self.one_hot[i] = True
            concat_size += sum(component.nvec)
        # Everything else (1D Box).
        else:
            # np.prod replaces the deprecated alias np.product.
            self.flatten[i] = int(np.prod(component.shape))
            concat_size += self.flatten[i]

    # Optional post-concat FC-stack.
    post_fc_stack_config = {
        "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []),
        "fcnet_activation": model_config.get("post_fcnet_activation",
                                             "relu")
    }
    self.post_fc_stack = ModelCatalog.get_model_v2(
        Box(float("-inf"),
            float("inf"),
            shape=(concat_size, ),
            dtype=np.float32),
        self.action_space,
        None,
        post_fc_stack_config,
        framework="torch",
        name="post_fc_stack")

    # Actions and value heads.
    self.logits_layer = None
    self.value_layer = None
    self._value_out = None

    if num_outputs:
        # Action-distribution head.
        self.logits_layer = SlimFC(
            in_size=self.post_fc_stack.num_outputs,
            out_size=num_outputs,
            activation_fn=None,
        )
        # Create the value branch model.
        self.value_layer = SlimFC(
            in_size=self.post_fc_stack.num_outputs,
            out_size=1,
            activation_fn=None,
            initializer=torch_normc_initializer(0.01))
    else:
        self.num_outputs = concat_size
def _initialize_loss(self):
    """Build the loss by tracing it once with a one-row dummy batch.

    A dummy sample batch is assembled from the policy's input tensors,
    run through `postprocess_trajectory`, and every resulting column gets
    a placeholder. The loss is then created on those placeholders; only
    the ones it actually read are registered as loss inputs.

    Raises:
        ValueError: if `use_eager` is configured but the policy has no
            model.
    """

    def fake_array(tensor):
        # Zeros with the tensor's shape and dtype, batch dimension set to 1.
        shape = tensor.shape.as_list()
        shape[0] = 1
        return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

    dummy_batch = {
        SampleBatch.CUR_OBS: fake_array(self._obs_input),
        SampleBatch.NEXT_OBS: fake_array(self._obs_input),
        # np.bool_ instead of the removed alias np.bool (same dtype).
        SampleBatch.DONES: np.array([False], dtype=np.bool_),
        SampleBatch.ACTIONS: fake_array(
            ModelCatalog.get_action_placeholder(self.action_space)),
        SampleBatch.REWARDS: np.array([0], dtype=np.float32),
    }
    if self._obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input),
            SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input),
        })
    state_init = self.get_initial_state()
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
        dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
    if state_init:
        dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
    for k, v in self.extra_compute_action_fetches().items():
        dummy_batch[k] = fake_array(v)

    # postprocessing might depend on variable init, so run it first here
    self._sess.run(tf.global_variables_initializer())
    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    # Seed the tensor dict / loss inputs with the tensors that already
    # exist as policy inputs.
    if self._obs_include_prev_action_reward:
        batch_tensors = UsageTrackingDict({
            SampleBatch.PREV_ACTIONS: self._prev_action_input,
            SampleBatch.PREV_REWARDS: self._prev_reward_input,
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.PREV_ACTIONS, self._prev_action_input),
            (SampleBatch.PREV_REWARDS, self._prev_reward_input),
            (SampleBatch.CUR_OBS, self._obs_input),
        ]
    else:
        batch_tensors = UsageTrackingDict({
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.CUR_OBS, self._obs_input),
        ]

    # One placeholder per remaining postprocessed column.
    for k, v in postprocessed_batch.items():
        if k in batch_tensors:
            continue
        # `object` instead of the removed alias np.object (same dtype).
        elif v.dtype == object:
            continue  # can't handle arbitrary objects in TF
        shape = (None, ) + v.shape[1:]
        # float64 columns are fed through float32 placeholders.
        dtype = np.float32 if v.dtype == np.float64 else v.dtype
        placeholder = tf.placeholder(dtype, shape=shape, name=k)
        batch_tensors[k] = placeholder

    if log_once("loss_init"):
        logger.info(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(batch_tensors)))

    loss = self._do_loss_init(batch_tensors)
    # Register only the tensors the loss accessed, in sorted key order.
    for k in sorted(batch_tensors.accessed_keys):
        loss_inputs.append((k, batch_tensors[k]))

    # XXX experimental support for automatically eagerifying the loss.
    # The main limitation right now is that TF doesn't support mixing eager
    # and non-eager tensors, so losses that read non-eager tensors through
    # `policy` need to use `policy.convert_to_eager(tensor)`.
    if self.config["use_eager"]:
        if not self.model:
            raise ValueError("eager not implemented in this case")
        graph_tensors = list(self._needs_eager_conversion)

        def gen_loss(model_outputs, *args):
            # fill in the batch tensor dict with eager tensors
            eager_inputs = dict(
                zip([k for (k, v) in loss_inputs],
                    args[:len(loss_inputs)]))
            # fill in the eager versions of all accessed graph tensors
            self._eager_tensors = dict(
                zip(graph_tensors, args[len(loss_inputs):]))
            # patch the action dist to use eager mode tensors
            self.action_dist.inputs = model_outputs
            return self._loss_fn(self, eager_inputs)

        # TODO(ekl) also handle the stats funcs
        loss = tf.py_function(
            gen_loss,
            # cast works around TypeError: Cannot convert provided value
            # to EagerTensor. Provided value: 0.0 Requested dtype: int64
            [self.model.outputs] + [
                tf.cast(v, tf.float32) for (k, v) in loss_inputs
            ] + [tf.cast(t, tf.float32) for t in graph_tensors],
            tf.float32)

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
    # Run the initializer once more after the loss has been set up.
    self._sess.run(tf.global_variables_initializer())
def build_q_model_and_distribution(policy, obs_space, action_space, config):
    """Create the Q-model and the target Q-model and attach them to `policy`.

    Both models are built through `ModelCatalog.get_model_v2` with identical
    settings; only their scope name differs (`Q_SCOPE` vs `Q_TARGET_SCOPE`).
    The models are stored as `policy.q_model` / `policy.target_q_model` and
    their variables as `policy.q_func_vars` / `policy.target_q_func_vars`.

    When `config["hiddens"]` is non-empty, `config["model"]` is modified in
    place: `no_final_linear` is set to True.

    Args:
        policy: Policy object that receives the models as attributes.
        obs_space: Observation space passed through to the model catalog.
        action_space: Action space; must be a `Discrete` instance.
        config: Trainer config dict.

    Returns:
        Tuple `(policy.q_model, TorchCategorical)`.

    Raises:
        UnsupportedSpaceException: If `action_space` is not `Discrete`.
    """
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        candidate_sizes = [256] + config["model"]["fcnet_hiddens"]
        num_outputs = candidate_sizes[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    exploration = getattr(policy, "exploration", None)
    add_layer_norm = isinstance(exploration, ParameterNoise) or (
        config["exploration_config"]["type"] == "ParameterNoise")

    # Everything except the scope name is shared by both networks.
    shared_kwargs = {
        "obs_space": obs_space,
        "action_space": action_space,
        "num_outputs": num_outputs,
        "model_config": config["model"],
        "framework": "torch",
        "model_interface": DQNTorchModel,
        "q_hiddens": config["hiddens"],
        "dueling": config["dueling"],
        "num_atoms": config["num_atoms"],
        "use_noisy": config["noisy"],
        "v_min": config["v_min"],
        "v_max": config["v_max"],
        "sigma0": config["sigma0"],
        "add_layer_norm": add_layer_norm,
    }

    q_model = ModelCatalog.get_model_v2(name=Q_SCOPE, **shared_kwargs)
    policy.q_model = q_model
    policy.q_func_vars = q_model.variables()

    target_q_model = ModelCatalog.get_model_v2(
        name=Q_TARGET_SCOPE, **shared_kwargs)
    policy.target_q_model = target_q_model
    policy.target_q_func_vars = target_q_model.variables()

    return policy.q_model, TorchCategorical
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """Build the APPO policy graph: placeholders, model, loss and stats.

    Args:
        observation_space: Observation space; its `shape` sizes the
            observation placeholder.
        action_space: Action space. Discrete and MultiDiscrete are handled
            explicitly; other spaces are only accepted when
            `config["vtrace"]` is off.
        config: Config dict, merged over the IMPALA `DEFAULT_CONFIG`.
        existing_inputs: Optional list of already-created input tensors to
            build on instead of fresh placeholders. It is unpacked in the
            same order as the loss inputs declared below (actions, dones,
            behaviour_logits, rewards, obs, prev_actions, prev_rewards,
            plus advantages and value_targets when V-trace is off),
            followed by the state inputs, with the sequence lengths last.

    Raises:
        AssertionError: If `config["batch_mode"]` is not
            "truncate_episodes".
        UnsupportedSpaceException: If V-trace is enabled and the action
            space is neither Discrete nor MultiDiscrete.
    """
    config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config)
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
    self.config = config
    self.sess = tf.get_default_session()
    self.grads = None

    # `output_hidden_shape` is the split spec used to cut the flat logits
    # into one chunk per action component.
    if isinstance(action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [action_space.n]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = action_space.nvec.astype(np.int32)
    elif self.config["vtrace"]:
        # Fixed: the message was previously passed unformatted because of
        # a `, format(...)` typo instead of `.format(...)`.
        raise UnsupportedSpaceException(
            "Action space {} is not supported for APPO + VTrace.".format(
                action_space))
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    # Policy network model
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Create input placeholders
    if existing_inputs:
        if self.config["vtrace"]:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards = existing_inputs[:7]
            existing_state_in = existing_inputs[7:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            actions, dones, behaviour_logits, rewards, observations, \
                prev_actions, prev_rewards, adv_ph, value_targets = \
                existing_inputs[:9]
            existing_state_in = existing_inputs[9:-1]
            existing_seq_lens = existing_inputs[-1]
    else:
        actions = ModelCatalog.get_action_placeholder(action_space)
        dones = tf.placeholder(tf.bool, [None], name="dones")
        rewards = tf.placeholder(tf.float32, [None], name="rewards")
        behaviour_logits = tf.placeholder(
            tf.float32, [None, logit_dim], name="behaviour_logits")
        observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        existing_state_in = None
        existing_seq_lens = None

        if not self.config["vtrace"]:
            adv_ph = tf.placeholder(
                tf.float32, name="advantages", shape=(None, ))
            value_targets = tf.placeholder(
                tf.float32, name="value_targets", shape=(None, ))
    self.observations = observations

    # Unpack behaviour logits
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)

    # Setup the policy
    # Only create fresh prev-action/reward placeholders when no existing
    # inputs were supplied; otherwise the tensors unpacked above would be
    # discarded and the model would read from placeholders that are not
    # the supplied inputs.
    if not existing_inputs:
        prev_actions = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards = tf.placeholder(
            tf.float32, [None], name="prev_reward")
    self.model = ModelCatalog.get_model(
        {
            "obs": observations,
            "prev_actions": prev_actions,
            "prev_rewards": prev_rewards,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)
    unpacked_outputs = tf.split(
        self.model.outputs, output_hidden_shape, axis=1)

    dist_inputs = unpacked_outputs if is_multidiscrete else \
        self.model.outputs
    prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \
        behaviour_logits

    action_dist = dist_class(dist_inputs)
    prev_action_dist = dist_class(prev_dist_inputs)

    values = self.model.value_function()
    self.value_function = values
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    def make_time_major(tensor, drop_last=False):
        """Swaps batch and trajectory axis.

        Args:
            tensor: A tensor or list of tensors to reshape.
            drop_last: A bool indicating whether to drop the last
                trajectory item.

        Returns:
            res: A tensor with swapped axes or a list of tensors with
                swapped axes.
        """
        if isinstance(tensor, list):
            return [make_time_major(t, drop_last) for t in tensor]

        if self.model.state_init:
            B = tf.shape(self.model.seq_lens)[0]
            T = tf.shape(tensor)[0] // B
        else:
            # Important: chop the tensor into batches at known episode cut
            # boundaries. TODO(ekl) this is kind of a hack
            T = self.config["sample_batch_size"]
            B = tf.shape(tensor)[0] // T
        rs = tf.reshape(tensor,
                        tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

        # swap B and T axes
        res = tf.transpose(
            rs,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

        if drop_last:
            return res[:-1]
        return res

    if self.model.state_in:
        max_seq_len = tf.reduce_max(self.model.seq_lens) - 1
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(rewards)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    if self.config["vtrace"]:
        logger.info("Using V-Trace surrogate loss (vtrace=True)")

        # Prepare actions for loss
        loss_actions = actions if is_multidiscrete else tf.expand_dims(
            actions, axis=1)

        self.loss = VTraceSurrogateLoss(
            actions=make_time_major(loss_actions, drop_last=True),
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions), drop_last=True),
            actions_logp=make_time_major(
                action_dist.logp(actions), drop_last=True),
            action_kl=prev_action_dist.kl(action_dist),
            actions_entropy=make_time_major(
                action_dist.entropy(), drop_last=True),
            dones=make_time_major(dones, drop_last=True),
            behaviour_logits=make_time_major(
                unpacked_behaviour_logits, drop_last=True),
            target_logits=make_time_major(
                unpacked_outputs, drop_last=True),
            discount=config["gamma"],
            rewards=make_time_major(rewards, drop_last=True),
            values=make_time_major(values, drop_last=True),
            bootstrap_value=make_time_major(values)[-1],
            valid_mask=make_time_major(mask, drop_last=True),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config[
                "vtrace_clip_pg_rho_threshold"],
            clip_param=self.config["clip_param"])
    else:
        logger.info("Using PPO surrogate loss (vtrace=False)")
        self.loss = PPOSurrogateLoss(
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions)),
            actions_logp=make_time_major(action_dist.logp(actions)),
            action_kl=prev_action_dist.kl(action_dist),
            actions_entropy=make_time_major(action_dist.entropy()),
            values=make_time_major(values),
            valid_mask=make_time_major(mask),
            advantages=make_time_major(adv_ph),
            value_targets=make_time_major(value_targets),
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.config["entropy_coeff"],
            clip_param=self.config["clip_param"])

    # KL divergence between worker and learner logits for debugging
    model_dist = MultiCategorical(unpacked_outputs)
    behaviour_dist = MultiCategorical(unpacked_behaviour_logits)

    kls = model_dist.kl(behaviour_dist)
    if len(kls) > 1:
        # One set of stats per action component, suffixed by its index.
        self.KL_stats = {}

        for i, kl in enumerate(kls):
            self.KL_stats.update({
                "mean_KL_{}".format(i): tf.reduce_mean(kl),
                "max_KL_{}".format(i): tf.reduce_max(kl),
                "median_KL_{}".format(i): tf.contrib.distributions.
                percentile(kl, 50.0),
            })
    else:
        self.KL_stats = {
            "mean_KL": tf.reduce_mean(kls[0]),
            "max_KL": tf.reduce_max(kls[0]),
            "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0),
        }

    # Initialize TFPolicyGraph
    # NOTE: the order here must match the unpacking of `existing_inputs`
    # at the top of this constructor.
    loss_in = [
        ("actions", actions),
        ("dones", dones),
        ("behaviour_logits", behaviour_logits),
        ("rewards", rewards),
        ("obs", observations),
        ("prev_actions", prev_actions),
        ("prev_rewards", prev_rewards),
    ]
    if not self.config["vtrace"]:
        loss_in.append(("advantages", adv_ph))
        loss_in.append(("value_targets", value_targets))
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=observations,
        action_sampler=action_dist.sample(),
        action_prob=action_dist.sampled_action_prob(),
        loss=self.loss.total_loss,
        model=self.model,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions,
        prev_reward_input=prev_rewards,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"],
        batch_divisibility_req=self.config["sample_batch_size"])

    self.sess.run(tf.global_variables_initializer())

    # With V-trace the last time step is dropped, mirroring the `values`
    # passed to the V-trace loss above.
    values_batched = make_time_major(
        values, drop_last=self.config["vtrace"])
    self.stats_fetches = {
        "stats": dict({
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "policy_loss": self.loss.pi_loss,
            "entropy": self.loss.entropy,
            "grad_gnorm": tf.global_norm(self._grads),
            "var_gnorm": tf.global_norm(self.var_list),
            "vf_loss": self.loss.vf_loss,
            "vf_explained_var": explained_variance(
                tf.reshape(self.loss.value_targets, [-1]),
                tf.reshape(values_batched, [-1])),
        }, **self.KL_stats),
    }