def _try_parse(self, line: str) -> Optional[SampleBatchType]:
    line = line.strip()
    if not line:
        return None
    try:
        batch = self._from_json(line)
    except Exception:
        logger.exception("Ignoring corrupt json record in {}: {}".format(
            self.cur_file, line))
        return None

    # Clip actions (from any values into env's bounds), if necessary.
    cfg = self.ioctx.config
    if cfg.get("clip_actions") and self.ioctx.worker is not None:
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = clip_action(
                batch[SampleBatch.ACTIONS],
                self.default_policy.action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = clip_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    # Re-normalize actions (from env's bounds to zero-centered), if
    # necessary.
    if cfg.get("actions_in_input_normalized") is False and \
            self.ioctx.worker is not None:

        # If we have a complex action space and actions were flattened
        # and we have to normalize -> Error.
        error_msg = \
            "Normalization of offline actions that are flattened is not " \
            "supported! Make sure that you record actions into offline " \
            "file with the `_disable_action_flattening=True` flag OR " \
            "as already normalized (between -1.0 and 1.0) values. " \
            "Also, when reading already normalized action values from " \
            "offline files, make sure to set " \
            "`actions_in_input_normalized=True` so that RLlib will not " \
            "perform normalization on top."

        if isinstance(batch, SampleBatch):
            pol = self.default_policy
            if isinstance(pol.action_space_struct, (tuple, dict)) and \
                    not pol.config.get("_disable_action_flattening"):
                raise ValueError(error_msg)
            batch[SampleBatch.ACTIONS] = normalize_action(
                batch[SampleBatch.ACTIONS], pol.action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                pol = self.policy_map[pid]
                if isinstance(pol.action_space_struct, (tuple, dict)) and \
                        not pol.config.get("_disable_action_flattening"):
                    raise ValueError(error_msg)
                b[SampleBatch.ACTIONS] = normalize_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    return batch
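
# --- Illustrative sketch (not part of the source above) ---------------------
# Shows, on a plain Box(-2.0, 2.0) action space, what the two post-processing
# steps above amount to: clip_action() forces recorded values into the env's
# bounds, while normalize_action() maps env-bound values into the
# zero-centered [-1.0, 1.0] range. The bounds and the closed-form
# normalization below are assumptions for a simple Box space, not the
# library's exact implementation.
import numpy as np

LOW, HIGH = -2.0, 2.0

def _clip_sketch(a: np.ndarray) -> np.ndarray:
    # Analogous to clip_action() for a 1-D Box space.
    return np.clip(a, LOW, HIGH)

def _normalize_sketch(a: np.ndarray) -> np.ndarray:
    # Analogous to normalize_action() for a 1-D Box space.
    return (a - LOW) / (HIGH - LOW) * 2.0 - 1.0

# _clip_sketch(np.array([3.5, -0.4]))       -> [ 2.  -0.4]
# _normalize_sketch(np.array([2.0, -2.0]))  -> [ 1.  -1. ]
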
def compute_log_likelihoods(
    self,
    actions,
    obs_batch,
    state_batches=None,
    prev_action_batch=None,
    prev_reward_batch=None,
    actions_normalized=True,
):
    if is_overridden(self.action_sampler_fn) and not is_overridden(
        self.action_distribution_fn
    ):
        raise ValueError(
            "Cannot compute log-prob/likelihood w/o an "
            "`action_distribution_fn` and a provided "
            "`action_sampler_fn`!"
        )

    seq_lens = tf.ones(len(obs_batch), dtype=tf.int32)
    input_batch = SampleBatch(
        {SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_batch)},
        _is_training=False,
    )
    if prev_action_batch is not None:
        input_batch[SampleBatch.PREV_ACTIONS] = tf.convert_to_tensor(
            prev_action_batch
        )
    if prev_reward_batch is not None:
        input_batch[SampleBatch.PREV_REWARDS] = tf.convert_to_tensor(
            prev_reward_batch
        )

    # Exploration hook before each forward pass.
    self.exploration.before_compute_actions(explore=False)

    # Action dist class and inputs are generated via custom function.
    if is_overridden(self.action_distribution_fn):
        dist_inputs, self.dist_class, _ = self.action_distribution_fn(
            self, self.model, input_batch, explore=False, is_training=False
        )
    # Default log-likelihood calculation.
    else:
        dist_inputs, _ = self.model(input_batch, state_batches, seq_lens)

    action_dist = self.dist_class(dist_inputs, self.model)

    # Normalize actions if necessary.
    if not actions_normalized and self.config["normalize_actions"]:
        actions = normalize_action(actions, self.action_space_struct)

    log_likelihoods = action_dist.logp(actions)

    return log_likelihoods
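
# --- Usage sketch (not part of the source above) ----------------------------
# A minimal, hedged example of scoring recorded actions under a policy object
# that exposes compute_log_likelihoods() with the signature above. The
# `policy` argument, the batch shapes in the commented call, and the
# assumption that the recorded actions are already normalized are all
# illustrative.
import numpy as np

def score_offline_actions(policy, obs_batch, actions):
    # actions_normalized=True tells the policy not to re-normalize the
    # action values before evaluating their log-probabilities.
    return policy.compute_log_likelihoods(
        actions=actions,
        obs_batch=obs_batch,
        actions_normalized=True,
    )

# Example call (shapes are assumptions):
# logps = score_offline_actions(
#     policy,
#     obs_batch=np.zeros((4, 8), dtype=np.float32),
#     actions=np.zeros((4, 2), dtype=np.float32),
# )
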
def compute_log_likelihoods(
        self,
        actions: Union[List[TensorType], TensorType],
        obs_batch: Union[List[TensorType], TensorType],
        state_batches: Optional[List[TensorType]] = None,
        prev_action_batch: Optional[Union[List[TensorType],
                                          TensorType]] = None,
        prev_reward_batch: Optional[Union[List[TensorType],
                                          TensorType]] = None,
        actions_normalized: bool = True,
) -> TensorType:
    if self._log_likelihood is None:
        raise ValueError("Cannot compute log-prob/likelihood w/o a "
                         "self._log_likelihood op!")

    # Exploration hook before each forward pass.
    self.exploration.before_compute_actions(
        explore=False, tf_sess=self.get_session())

    builder = TFRunBuilder(self.get_session(), "compute_log_likelihoods")

    # Normalize actions if necessary.
    if actions_normalized is False and self.config["normalize_actions"]:
        actions = normalize_action(actions, self.action_space_struct)
    # Feed actions (for which we want logp values) into graph.
    builder.add_feed_dict({self._action_input: actions})
    # Feed observations.
    builder.add_feed_dict({self._obs_input: obs_batch})
    # Internal states.
    state_batches = state_batches or []
    if len(self._state_inputs) != len(state_batches):
        raise ValueError(
            "Must pass in RNN state batches for placeholders {}, got {}".
            format(self._state_inputs, state_batches))
    builder.add_feed_dict(
        {k: v for k, v in zip(self._state_inputs, state_batches)})
    if state_batches:
        builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
    # Prev-a and r.
    if self._prev_action_input is not None and \
            prev_action_batch is not None:
        builder.add_feed_dict({self._prev_action_input: prev_action_batch})
    if self._prev_reward_input is not None and \
            prev_reward_batch is not None:
        builder.add_feed_dict({self._prev_reward_input: prev_reward_batch})
    # Fetch the log_likelihoods output and return.
    fetches = builder.add_fetches([self._log_likelihood])
    return builder.get(fetches)[0]
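
# --- Illustrative sketch (not part of the source above) ---------------------
# The graph-mode method above stages its inputs as placeholder feeds and then
# fetches a precomputed log-likelihood op. This standalone TF1-style sketch
# shows the same placeholder / feed_dict / fetch pattern in isolation; the
# placeholder shapes and the stand-in log-likelihood op are assumptions, not
# the policy's real graph.
import numpy as np
import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

obs_input = tf1.placeholder(tf.float32, [None, 4])
action_input = tf1.placeholder(tf.float32, [None, 2])
# Stand-in op playing the role of self._log_likelihood.
log_likelihood_op = -tf.reduce_sum(tf.square(action_input), axis=-1)

with tf1.Session() as sess:
    logps = sess.run(
        log_likelihood_op,
        feed_dict={
            obs_input: np.zeros((3, 4), dtype=np.float32),
            action_input: np.zeros((3, 2), dtype=np.float32),
        },
    )
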
def _try_parse(self, line: str) -> Optional[SampleBatchType]:
    line = line.strip()
    if not line:
        return None
    try:
        batch = _from_json(line)
    except Exception:
        logger.exception("Ignoring corrupt json record in {}: {}".format(
            self.cur_file, line))
        return None

    # Clip actions (from any values into env's bounds), if necessary.
    cfg = self.ioctx.config
    if cfg.get("clip_actions"):
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = clip_action(
                batch[SampleBatch.ACTIONS], self.ioctx.worker.
                policy_map["default_policy"].action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = clip_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    # Re-normalize actions (from env's bounds to 0.0 centered), if
    # necessary.
    if "actions_in_input_normalized" in cfg and \
            cfg["actions_in_input_normalized"] is False:
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = normalize_action(
                batch[SampleBatch.ACTIONS], self.ioctx.worker.
                policy_map["default_policy"].action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = normalize_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    return batch

def compute_log_likelihoods(
    self,
    actions: Union[List[TensorStructType], TensorStructType],
    obs_batch: Union[List[TensorStructType], TensorStructType],
    state_batches: Optional[List[TensorType]] = None,
    prev_action_batch: Optional[
        Union[List[TensorStructType], TensorStructType]] = None,
    prev_reward_batch: Optional[
        Union[List[TensorStructType], TensorStructType]] = None,
    actions_normalized: bool = True,
) -> TensorType:
    if is_overridden(self.action_sampler_fn) and not is_overridden(
            self.action_distribution_fn):
        raise ValueError("Cannot compute log-prob/likelihood w/o an "
                         "`action_distribution_fn` and a provided "
                         "`action_sampler_fn`!")

    with torch.no_grad():
        input_dict = self._lazy_tensor_dict({
            SampleBatch.CUR_OBS: obs_batch,
            SampleBatch.ACTIONS: actions
        })
        if prev_action_batch is not None:
            input_dict[SampleBatch.PREV_ACTIONS] = prev_action_batch
        if prev_reward_batch is not None:
            input_dict[SampleBatch.PREV_REWARDS] = prev_reward_batch
        seq_lens = torch.ones(len(obs_batch), dtype=torch.int32)
        state_batches = [
            convert_to_torch_tensor(s, self.device)
            for s in (state_batches or [])
        ]

        # Exploration hook before each forward pass.
        self.exploration.before_compute_actions(explore=False)

        # Action dist class and inputs are generated via custom function.
        if is_overridden(self.action_distribution_fn):
            dist_inputs, dist_class, state_out = self.action_distribution_fn(
                self.model,
                input_dict=input_dict,
                state_batches=state_batches,
                seq_lens=seq_lens,
                explore=False,
                is_training=False,
            )
        # Default action-dist inputs calculation.
        else:
            dist_class = self.dist_class
            dist_inputs, _ = self.model(input_dict, state_batches, seq_lens)

        action_dist = dist_class(dist_inputs, self.model)

        # Normalize actions if necessary.
        actions = input_dict[SampleBatch.ACTIONS]
        if not actions_normalized and self.config["normalize_actions"]:
            actions = normalize_action(actions, self.action_space_struct)

        log_likelihoods = action_dist.logp(actions)

        return log_likelihoods
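
# --- Illustrative sketch (not part of the source above) ---------------------
# Shows the core of the logp computation performed under torch.no_grad(),
# assuming a diagonal Gaussian action distribution whose mean/std would
# normally come from the model's forward pass. Shapes and values here are
# illustrative only, not taken from the source.
import torch
from torch.distributions import Normal

with torch.no_grad():
    mean = torch.zeros(4, 2)                  # model output (assumed)
    std = torch.ones(4, 2)                    # model output (assumed)
    actions = torch.rand(4, 2) * 2.0 - 1.0    # normalized actions in [-1, 1]
    # Sum per-dimension log-probs into one log-likelihood per sample,
    # analogous to action_dist.logp(actions).
    log_likelihoods = Normal(mean, std).log_prob(actions).sum(-1)
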
def compute_log_likelihoods(
    self,
    actions: Union[List[TensorStructType], TensorStructType],
    obs_batch: Union[List[TensorStructType], TensorStructType],
    state_batches: Optional[List[TensorType]] = None,
    prev_action_batch: Optional[
        Union[List[TensorStructType], TensorStructType]
    ] = None,
    prev_reward_batch: Optional[
        Union[List[TensorStructType], TensorStructType]
    ] = None,
    actions_normalized: bool = True,
) -> TensorType:
    if self.action_sampler_fn and self.action_distribution_fn is None:
        raise ValueError(
            "Cannot compute log-prob/likelihood w/o an "
            "`action_distribution_fn` and a provided "
            "`action_sampler_fn`!"
        )

    with torch.no_grad():
        input_dict = self._lazy_tensor_dict(
            {SampleBatch.CUR_OBS: obs_batch, SampleBatch.ACTIONS: actions}
        )
        if prev_action_batch is not None:
            input_dict[SampleBatch.PREV_ACTIONS] = prev_action_batch
        if prev_reward_batch is not None:
            input_dict[SampleBatch.PREV_REWARDS] = prev_reward_batch
        seq_lens = torch.ones(len(obs_batch), dtype=torch.int32)
        state_batches = [
            convert_to_torch_tensor(s, self.device)
            for s in (state_batches or [])
        ]

        # Exploration hook before each forward pass.
        self.exploration.before_compute_actions(explore=False)

        # Action dist class and inputs are generated via custom function.
        if self.action_distribution_fn:
            # Try new action_distribution_fn signature, supporting
            # state_batches and seq_lens.
            try:
                dist_inputs, dist_class, state_out = self.action_distribution_fn(
                    self,
                    self.model,
                    input_dict=input_dict,
                    state_batches=state_batches,
                    seq_lens=seq_lens,
                    explore=False,
                    is_training=False,
                )
            # Trying the old way (to stay backward compatible).
            # TODO: Remove in future.
            except TypeError as e:
                if (
                    "positional argument" in e.args[0]
                    or "unexpected keyword argument" in e.args[0]
                ):
                    dist_inputs, dist_class, _ = self.action_distribution_fn(
                        policy=self,
                        model=self.model,
                        obs_batch=input_dict[SampleBatch.CUR_OBS],
                        explore=False,
                        is_training=False,
                    )
                else:
                    raise e
        # Default action-dist inputs calculation.
        else:
            dist_class = self.dist_class
            dist_inputs, _ = self.model(input_dict, state_batches, seq_lens)

        action_dist = dist_class(dist_inputs, self.model)

        # Normalize actions if necessary.
        actions = input_dict[SampleBatch.ACTIONS]
        if not actions_normalized and self.config["normalize_actions"]:
            actions = normalize_action(actions, self.action_space_struct)

        log_likelihoods = action_dist.logp(actions)

        return log_likelihoods
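
# --- Illustrative sketch (not part of the source above) ---------------------
# Isolates the backward-compatibility pattern used above: call a user-supplied
# function with the new keyword-based signature first, and only fall back to
# the legacy call shape when the TypeError indicates a signature mismatch
# (rather than an error raised inside the function body). All names here are
# illustrative.
def call_with_signature_fallback(fn, model, input_dict, state_batches,
                                 seq_lens):
    try:
        return fn(
            model,
            input_dict=input_dict,
            state_batches=state_batches,
            seq_lens=seq_lens,
        )
    except TypeError as e:
        if ("positional argument" in e.args[0]
                or "unexpected keyword argument" in e.args[0]):
            # Legacy signature: model plus a raw observation batch only.
            return fn(model, obs_batch=input_dict["obs"])
        raise
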
def compute_single_action(
        self,
        obs: TensorType,
        state: Optional[List[TensorType]] = None,
        prev_action: Optional[TensorType] = None,
        prev_reward: Optional[TensorType] = None,
        info: dict = None,
        episode: Optional["MultiAgentEpisode"] = None,
        clip_actions: bool = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        normalize_actions: bool = None,
        **kwargs) -> \
        Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
    """Unbatched version of compute_actions.

    Args:
        obs (TensorType): Single observation.
        state (Optional[List[TensorType]]): List of RNN state inputs, if
            any.
        prev_action (Optional[TensorType]): Previous action value, if any.
        prev_reward (Optional[TensorType]): Previous reward, if any.
        info (dict): Info object, if any.
        episode (Optional[MultiAgentEpisode]): This provides access to all
            of the internal episode state, which may be useful for
            model-based or multi-agent algorithms.
        normalize_actions (bool): Should actions be normalized (unsquashed
            from the [-1.0, 1.0] range) according to the Policy's action
            space?
        clip_actions (bool): Should actions be clipped according to the
            Policy's action space?
        explore (Optional[bool]): Whether to pick an exploitation or
            exploration action
            (default: None -> use self.config["explore"]).
        timestep (Optional[int]): The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility.

    Returns:
        Tuple:
            - actions (TensorType): Single action.
            - state_outs (List[TensorType]): List of RNN state outputs,
              if any.
            - info (dict): Dictionary of extra features, if any.
    """
    normalize_actions = \
        normalize_actions if normalize_actions is not None \
        else self.config["normalize_actions"]
    clip_actions = clip_actions if clip_actions is not None else \
        self.config["clip_actions"]

    prev_action_batch = None
    prev_reward_batch = None
    info_batch = None
    episodes = None
    state_batch = None
    if prev_action is not None:
        prev_action_batch = [prev_action]
    if prev_reward is not None:
        prev_reward_batch = [prev_reward]
    if info is not None:
        info_batch = [info]
    if episode is not None:
        episodes = [episode]
    if state is not None:
        state_batch = [
            s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
            np.expand_dims(s, 0) for s in state
        ]

    out = self.compute_actions(
        [obs],
        state_batch,
        prev_action_batch=prev_action_batch,
        prev_reward_batch=prev_reward_batch,
        info_batch=info_batch,
        episodes=episodes,
        explore=explore,
        timestep=timestep)

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        info = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, info = out
        single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

    # Un-squash the (normalized) policy output back into the env's action
    # bounds; `normalize_action` would map in the wrong direction here.
    if normalize_actions:
        single_action = unsquash_action(single_action,
                                        self.action_space_struct)
    elif clip_actions:
        single_action = clip_action(single_action,
                                    self.action_space_struct)

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}
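
# --- Usage sketch (not part of the source above) ----------------------------
# A minimal, hedged illustration of the batch-of-one pattern the method above
# relies on: wrap the single inputs into length-1 batches, call the batched
# compute_actions() API, then strip the batch dimension from every output.
# The `policy` argument is assumed to return the (actions, state_outs, info)
# tuple form; post-processing (unsquash/clip) is omitted here.
import numpy as np

def single_action_sketch(policy, obs, explore=None):
    actions, state_outs, info = policy.compute_actions(
        [obs], explore=explore)
    return (
        actions[0],
        [s[0] for s in state_outs],
        {k: v[0] for k, v in info.items()},
    )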