def __init__(self,
             observation_space: gym.spaces.Space,
             action_space: gym.spaces.Space,
             config: TrainerConfigDict,
             sess: "tf1.Session",
             obs_input: TensorType,
             sampled_action: TensorType,
             loss: TensorType,
             loss_inputs: List[Tuple[str, TensorType]],
             model: ModelV2 = None,
             sampled_action_logp: Optional[TensorType] = None,
             action_input: Optional[TensorType] = None,
             log_likelihood: Optional[TensorType] = None,
             dist_inputs: Optional[TensorType] = None,
             dist_class: Optional[type] = None,
             state_inputs: Optional[List[TensorType]] = None,
             state_outputs: Optional[List[TensorType]] = None,
             prev_action_input: Optional[TensorType] = None,
             prev_reward_input: Optional[TensorType] = None,
             seq_lens: Optional[TensorType] = None,
             max_seq_len: int = 20,
             batch_divisibility_req: int = 1,
             update_ops: List[TensorType] = None,
             explore: Optional[TensorType] = None,
             timestep: Optional[TensorType] = None):
    """Sets up a graph-mode TF Policy from pre-built tensors.

    The constructor (1) picks the devices to build on, (2) stores the
    given placeholders/tensors on the instance, (3) initializes the loss
    if one was passed in and (4) derives a log-likelihood op when none
    was provided but `dist_inputs` and `dist_class` are available.

    Args:
        observation_space (gym.spaces.Space): Observation space of the
            env.
        action_space (gym.spaces.Space): Action space of the env.
        config (TrainerConfigDict): The Policy config dict.
        sess (tf1.Session): The TensorFlow session to use.
        obs_input (TensorType): Observation input placeholder of shape
            [BATCH_SIZE, obs...].
        sampled_action (TensorType): Action-sampling tensor of shape
            [BATCH_SIZE, action...].
        loss (TensorType): Scalar policy loss output tensor. If None,
            no loss is initialized here.
        loss_inputs (List[Tuple[str, TensorType]]): One
            (name, placeholder) tuple per loss input. Each name must
            match a SampleBatch column key returned by
            postprocess_trajectory(); each placeholder has shape
            [BATCH_SIZE, data...]. These columns are read from
            postprocessed sample batches and fed into the placeholders
            during loss computation.
        model (ModelV2): Used to integrate custom losses and stats from
            user-defined RLlib models. Must be None, a ModelV2 or a
            tf.keras.Model.
        sampled_action_logp (Optional[TensorType]): Log probability of
            the sampled action.
        action_input (Optional[TensorType]): Action input placeholder
            for logp/log-likelihood calculations.
        log_likelihood (Optional[TensorType]): Tensor computing the
            log-likelihood (given action_input and obs_input).
        dist_inputs (Optional[TensorType]): Tensor computing the
            distribution inputs/parameters.
        dist_class (Optional[type]): Optional ActionDistribution class
            used to build a dist object from the distribution inputs.
        state_inputs (Optional[List[TensorType]]): RNN state input
            tensors.
        state_outputs (Optional[List[TensorType]]): RNN state output
            tensors.
        prev_action_input (Optional[TensorType]): Placeholder for
            previous actions.
        prev_reward_input (Optional[TensorType]): Placeholder for
            previous rewards.
        seq_lens (Optional[TensorType]): Placeholder for RNN sequence
            lengths, of shape [NUM_SEQUENCES]. Note that
            NUM_SEQUENCES << BATCH_SIZE. See policy/rnn_sequencing.py
            for more information.
        max_seq_len (int): Max sequence length for LSTM training.
        batch_divisibility_req (int): Pad all agent experience batches
            to multiples of this value. Only has an effect when not
            using an LSTM model.
        update_ops (List[TensorType]): Overrides the batchnorm update
            ops to run when applying gradients. Otherwise all update
            ops found in the current variable scope are run.
        explore (Optional[Union[TensorType, bool]]): Placeholder for
            the `explore` parameter passed to
            Exploration.get_exploration_action. Pass False explicitly
            to skip creating an Exploration component.
        timestep (Optional[TensorType]): Placeholder for the global
            sampling timestep.

    Raises:
        ValueError: If fewer GPU IDs than `num_gpus` are found, or if
            state inputs are given without a `seq_lens` tensor.
    """
    self.framework = "tf"
    super().__init__(observation_space, action_space, config)

    # ------------------------------------------------------------------
    # Device selection.
    # ------------------------------------------------------------------
    worker_idx = self.config.get("worker_index", 0)
    fake_gpus = config["_fake_gpus"]
    # Local mode (without fake GPUs) never requests GPUs; otherwise the
    # local worker (index 0) and remote workers read different keys.
    if not fake_gpus and ray.worker._mode() == ray.worker.LOCAL_MODE:
        num_gpus = 0
    elif worker_idx == 0:
        num_gpus = config["num_gpus"]
    else:
        num_gpus = config["num_gpus_per_worker"]
    gpu_ids = get_gpu_devices()

    # Label used in both log messages below.
    worker_label = worker_idx if worker_idx > 0 else "local"

    # CPU placement applies in any of these cases:
    # - fake GPU mode,
    # - num_gpus == 0 (set by the user or forced by local mode),
    # - no GPUs available.
    place_on_cpu = fake_gpus or num_gpus == 0 or not gpu_ids
    if place_on_cpu:
        device_descr = f"{num_gpus} fake-GPUs" if fake_gpus else "CPU"
        logger.info("TFPolicy (worker={}) running on {}.".format(
            worker_label, device_descr))
        # One "/cpu:0" entry per (rounded-up) requested GPU; a rounded
        # count of 0 falls back to a single entry.
        num_cpu_towers = int(math.ceil(num_gpus)) or 1
        self.devices = ["/cpu:0"] * num_cpu_towers
    # Otherwise: num_gpus > 0, not fake-GPU mode and real GPUs available.
    else:
        logger.info("TFPolicy (worker={}) running on {} GPU(s).".format(
            worker_label, num_gpus))

        # We are a remote worker (WORKER_MODE=1):
        # GPUs should be assigned to us by ray.
        if ray.worker._mode() == ray.worker.WORKER_MODE:
            gpu_ids = ray.get_gpu_ids()

        if len(gpu_ids) < num_gpus:
            raise ValueError(
                "TFPolicy was not able to find enough GPU IDs! Found "
                f"{gpu_ids}, but num_gpus={num_gpus}.")

        # Only the first `num_gpus` enumerated devices are used.
        self.devices = [
            f"/gpu:{pos}" for pos, _ in enumerate(gpu_ids)
            if pos < num_gpus
        ]

    # ------------------------------------------------------------------
    # View requirements, model and exploration.
    # ------------------------------------------------------------------
    # Disable env-info placeholder.
    if SampleBatch.INFOS in self.view_requirements:
        infos_view_req = self.view_requirements[SampleBatch.INFOS]
        infos_view_req.used_for_training = False
        infos_view_req.used_for_compute_actions = False

    assert model is None or isinstance(model, (ModelV2, tf.keras.Model)), \
        "Model classes for TFPolicy other than `ModelV2|tf.keras.Model` " \
        "not allowed! You passed in {}.".format(model)
    self.model = model
    # Auto-update model's inference view requirements, if recurrent.
    if self.model is not None:
        self._update_model_view_requirements_from_init_state()

    # An explicit `explore=False` (identity check, not falsiness) means:
    # do not create an exploration component at all.
    if explore is False:
        self.exploration = None
    else:
        self.exploration = self._create_exploration()

    # ------------------------------------------------------------------
    # Input/output tensors.
    # ------------------------------------------------------------------
    self._sess = sess
    self._obs_input = obs_input
    self._prev_action_input = prev_action_input
    self._prev_reward_input = prev_reward_input
    self._sampled_action = sampled_action
    self._is_training = self._get_is_training_placeholder()

    if explore is not None:
        self._is_exploring = explore
    else:
        self._is_exploring = tf1.placeholder_with_default(
            True, (), name="is_exploring")

    self._sampled_action_logp = sampled_action_logp
    if self._sampled_action_logp is None:
        self._sampled_action_prob = None
    else:
        self._sampled_action_prob = tf.math.exp(self._sampled_action_logp)

    # For logp calculations.
    self._action_input = action_input
    self._dist_inputs = dist_inputs
    self.dist_class = dist_class

    # ------------------------------------------------------------------
    # Recurrent state handling.
    # ------------------------------------------------------------------
    self._state_inputs = state_inputs or []
    self._state_outputs = state_outputs or []
    self._seq_lens = seq_lens
    self._max_seq_len = max_seq_len
    if self._state_inputs and self._seq_lens is None:
        raise ValueError(
            "seq_lens tensor must be given if state inputs are defined")

    # ------------------------------------------------------------------
    # Training bookkeeping.
    # ------------------------------------------------------------------
    self._batch_divisibility_req = batch_divisibility_req
    self._update_ops = update_ops
    self._apply_op = None
    self._stats_fetches = {}

    if timestep is not None:
        self._timestep = timestep
    else:
        self._timestep = tf1.placeholder_with_default(
            tf.zeros((), dtype=tf.int64), (), name="timestep")

    self._optimizer = None
    self._grads_and_vars = None
    self._grads = None
    # Policy tf-variables (weights), whose values to get/set via
    # get_weights/set_weights.
    self._variables = None
    # Local optimizer's tf-variables (e.g. state vars for Adam).
    # Will be stored alongside `self._variables` when checkpointing.
    self._optimizer_variables = None

    # The loss tf-op.
    self._loss = None
    # A batch dict passed into loss function as input.
    self._loss_input_dict = {}

    if loss is not None:
        self._initialize_loss(loss, loss_inputs)

    # ------------------------------------------------------------------
    # Log-likelihood op.
    # ------------------------------------------------------------------
    # Use the given op if there is one; otherwise derive it from the
    # action distribution when both its inputs and its class are known.
    self._log_likelihood = log_likelihood
    can_derive_logp = (self._log_likelihood is None
                       and self._dist_inputs is not None
                       and self.dist_class is not None)
    if can_derive_logp:
        action_dist = self.dist_class(self._dist_inputs, self.model)
        self._log_likelihood = action_dist.logp(self._action_input)
def __init__(self, observation_space, action_space, config):
    """Builds an eager-mode TF policy.

    Steps, in order: run the base `Policy` constructor, log the device
    in use, merge the default config (if any) under the given one, run
    the optional validation/`before_init` hooks, create the action
    distribution class and the model, set up exploration and the
    optimizer(s), initialize the loss from a dummy batch and finally
    run the optional `after_init` hook.

    NOTE(review): `loss_fn`, `stats_fn`, `make_model`, `optimizer_fn`,
    `get_default_config`, `get_batch_divisibility_req`, the `*_init`
    hooks, `validate_spaces`, `action_sampler_fn` and
    `action_distribution_fn` are not defined in this method; presumably
    they come from an enclosing scope outside this view.

    Args:
        observation_space: Observation space, forwarded to
            `Policy.__init__`, the hooks and the model factory.
        action_space: Action space, forwarded likewise.
        config: Policy config dict. Must contain
            `config["model"]["max_seq_len"]`; if `get_default_config`
            is set, its result is used as the base that `config`
            overrides.

    Raises:
        ValueError: If `action_sampler_fn` or `action_distribution_fn`
            is given without `make_model`.
    """
    assert tf.executing_eagerly()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)

    # Log device and worker index.
    from ray.rllib.evaluation.rollout_worker import get_global_worker
    global_worker = get_global_worker()
    worker_idx = global_worker.worker_index if global_worker else 0
    if get_gpu_devices():
        device_msg = "TF-eager Policy (worker={}) running on GPU."
    else:
        device_msg = "TF-eager Policy (worker={}) running on CPU."
    logger.info(device_msg.format(
        worker_idx if worker_idx > 0 else "local"))

    self._is_training = False
    self._loss_initialized = False
    self._loss = loss_fn

    # The requirement may be a callable taking the policy, a plain
    # value, or something falsy (in which case 1 is used).
    if callable(get_batch_divisibility_req):
        self.batch_divisibility_req = get_batch_divisibility_req(self)
    else:
        self.batch_divisibility_req = get_batch_divisibility_req or 1
    self._max_seq_len = config["model"]["max_seq_len"]

    # Keys in the passed-in config override those of the default config.
    if get_default_config:
        config = dict(get_default_config(), **config)

    # Optional hooks, run in this order with identical arguments.
    for pre_init_hook in (validate_spaces, before_init):
        if pre_init_hook:
            pre_init_hook(self, observation_space, action_space, config)

    self.config = config
    self.dist_class = None

    # Without custom sampler/distribution functions, the catalog provides
    # the distribution class (and the logit dim used for the default
    # model below). With them, a custom model factory is mandatory.
    if not (action_sampler_fn or action_distribution_fn):
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
    elif not make_model:
        raise ValueError(
            "`make_model` is required if `action_sampler_fn` OR "
            "`action_distribution_fn` is given")

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )

    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    # Custom optimizer(s) if a factory is given, else a single Adam
    # using the configured learning rate; always normalized to a list.
    local_optimizers = force_list(
        optimizer_fn(self, config) if optimizer_fn
        else tf.keras.optimizers.Adam(config["lr"]))
    if getattr(self, "exploration", None):
        local_optimizers = self.exploration.get_exploration_optimizer(
            local_optimizers)

    # The list of local (tf) optimizers (one per loss term).
    self._optimizers: List[LocalOptimizer] = local_optimizers
    # Backward compatibility: A user's policy may only support a single
    # loss term and optimizer (no lists).
    self._optimizer: LocalOptimizer = \
        local_optimizers[0] if local_optimizers else None

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep = 0