def __init__(
    self,
    policy: Type[BasePolicy],
    env: Union[GymEnv, str, None],
    policy_base: Type[BasePolicy],
    learning_rate: Union[float, Callable],
    policy_kwargs: Optional[Dict[str, Any]] = None,
    verbose: int = 0,
    device: Union[th.device, str] = 'auto',
    support_multi_env: bool = False,
    create_eval_env: bool = False,
    monitor_wrapper: bool = True,
    seed: Optional[int] = None,
    use_sde: bool = False,
    sde_sample_freq: int = -1,
    tensorboard_log: Optional[str] = None,
):
    """
    Store the algorithm configuration, build the environment(s) and the logger.

    :param policy: Policy class, or a policy name that is resolved with
        ``get_policy_from_name(policy_base, policy)`` when ``policy_base``
        is not None.
    :param env: Environment instance, an id string handed to ``gym.make``,
        or None (environment setup is skipped in that case).
    :param policy_base: Base class used to resolve ``policy`` given as a string.
    :param learning_rate: Stored as ``self.learning_rate``.
    :param policy_kwargs: Stored as ``self.policy_kwargs``; None becomes ``{}``.
    :param verbose: The device is printed when > 0, the env-creation notice
        when >= 1.
    :param device: Forwarded to ``get_device``.
    :param support_multi_env: When False, an env reporting ``num_envs > 1``
        is rejected.
    :param create_eval_env: When ``env`` is a string, also create
        ``self.eval_env`` from the same id.
    :param monitor_wrapper: When ``env`` is a string, wrap the created
        env(s) in ``Monitor``.
    :param seed: Stored as ``self.seed``.
    :param use_sde: Stored as ``self.use_sde``.
    :param sde_sample_freq: Stored as ``self.sde_sample_freq``.
    :param tensorboard_log: When not None, a ``TensorBoardOutputFormat``
        built from it is added to the logger outputs.
    :raises ValueError: If ``support_multi_env`` is False and the wrapped
        env has more than one sub-environment.
    """
    if isinstance(policy, str) and policy_base is not None:
        self.policy_class = get_policy_from_name(policy_base, policy)
    else:
        self.policy_class = policy

    self.device = get_device(device)
    if verbose > 0:
        print(f"Using {self.device} device")

    self.env = None  # type: Optional[GymEnv]
    # get VecNormalize object if needed
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.verbose = verbose
    self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    self.observation_space = None  # type: Optional[gym.spaces.Space]
    self.action_space = None  # type: Optional[gym.spaces.Space]
    self.n_envs = None
    self.num_timesteps = 0
    self.eval_env = None
    self.seed = seed
    self.action_noise = None  # type: Optional[ActionNoise]
    self.start_time = None
    self.policy = None
    self.learning_rate = learning_rate
    self.lr_schedule = None  # type: Optional[Callable]
    self._last_obs = None  # type: Optional[np.ndarray]
    # When using VecNormalize:
    self._last_original_obs = None  # type: Optional[np.ndarray]
    self._episode_num = 0
    # Used for SDE only
    self.use_sde = use_sde
    self.sde_sample_freq = sde_sample_freq
    # Track the training progress (from 1 to 0)
    # this is used to update the learning rate
    self._current_progress = 1
    # Buffers for logging
    self.ep_info_buffer = None  # type: Optional[deque]
    self.ep_success_buffer = None  # type: Optional[deque]
    # For logging
    self._n_updates = 0  # type: int
    self.tensorboard_log = tensorboard_log

    # Create and wrap the env if needed
    if env is not None:
        if isinstance(env, str):
            env_id = env
            if create_eval_env:
                eval_env = gym.make(env_id)
                if monitor_wrapper:
                    eval_env = Monitor(eval_env, filename=None)
                self.eval_env = DummyVecEnv([lambda: eval_env])

            if self.verbose >= 1:
                print("Creating environment from the given name, wrapped in a DummyVecEnv.")

            # Keep the raw env under its own name: the factory closure must not
            # capture `env`, which is rebound to the vectorized env right below.
            single_env = gym.make(env_id)
            if monitor_wrapper:
                single_env = Monitor(single_env, filename=None)
            env = DummyVecEnv([lambda: single_env])

        env = self._wrap_env(env)

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env

        if not support_multi_env and self.n_envs > 1:
            raise ValueError(
                "Error: the model does not support multiple envs; it requires "
                "a single vectorized environment."
            )

    # -------------------- logging/tensorboard -------------------- #
    # Always log to stdout; additionally to TensorBoard when a location is given.
    output_formats = [HumanOutputFormat(sys.stdout)]
    if self.tensorboard_log is not None:
        output_formats.append(TensorBoardOutputFormat(self.tensorboard_log))
    self.logger = Logger(folder=None, output_formats=output_formats)
def __init__(
    self,
    policy: Type[BasePolicy],
    env: Union[GymEnv, str, None],
    policy_base: Type[BasePolicy],
    learning_rate: Union[float, Callable],
    policy_kwargs: Optional[Dict[str, Any]] = None,
    tensorboard_log: Optional[str] = None,
    verbose: int = 0,
    device: Union[th.device, str] = "auto",
    support_multi_env: bool = False,
    create_eval_env: bool = False,
    monitor_wrapper: bool = True,
    seed: Optional[int] = None,
    use_sde: bool = False,
    sde_sample_freq: int = -1,
):
    """
    Store the algorithm configuration and, when ``env`` is given, set it up.

    :param policy: Policy class, or a policy name that is resolved with
        ``get_policy_from_name(policy_base, policy)`` when ``policy_base``
        is not None.
    :param env: Environment instance, an id string, or None (environment
        setup and the related checks are skipped in that case).
    :param policy_base: Base class used to resolve ``policy`` given as a string.
    :param learning_rate: Stored as ``self.learning_rate``.
    :param policy_kwargs: Stored as ``self.policy_kwargs``; None becomes ``{}``.
    :param tensorboard_log: Stored as ``self.tensorboard_log``.
    :param verbose: The device is printed when > 0; also forwarded to
        ``maybe_make_env``.
    :param device: Forwarded to ``get_device``.
    :param support_multi_env: When False, an env reporting ``num_envs > 1``
        is rejected.
    :param create_eval_env: When ``env`` is a string, also create
        ``self.eval_env`` from the same id.
    :param monitor_wrapper: Forwarded to ``maybe_make_env``.
    :param seed: Stored as ``self.seed``.
    :param use_sde: Stored as ``self.use_sde``; requires a ``gym.spaces.Box``
        action space.
    :param sde_sample_freq: Stored as ``self.sde_sample_freq``.
    :raises ValueError: If ``support_multi_env`` is False and the wrapped env
        has more than one sub-environment, or if ``use_sde`` is True and the
        action space is not a ``gym.spaces.Box``.
    """
    if isinstance(policy, str) and policy_base is not None:
        self.policy_class = get_policy_from_name(policy_base, policy)
    else:
        self.policy_class = policy

    self.device = get_device(device)
    if verbose > 0:
        print(f"Using {self.device} device")

    self.env = None  # type: Optional[GymEnv]
    # get VecNormalize object if needed
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.verbose = verbose
    self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    self.observation_space = None  # type: Optional[gym.spaces.Space]
    self.action_space = None  # type: Optional[gym.spaces.Space]
    self.n_envs = None
    self.num_timesteps = 0
    # Used for updating schedules
    self._total_timesteps = 0
    self.eval_env = None
    self.seed = seed
    self.action_noise = None  # type: Optional[ActionNoise]
    self.start_time = None
    self.policy = None
    self.learning_rate = learning_rate
    self.tensorboard_log = tensorboard_log
    self.lr_schedule = None  # type: Optional[Callable]
    self._last_obs = None  # type: Optional[np.ndarray]
    # When using VecNormalize:
    self._last_original_obs = None  # type: Optional[np.ndarray]
    self._episode_num = 0
    # Used for gSDE only
    self.use_sde = use_sde
    self.sde_sample_freq = sde_sample_freq
    # Track the training progress remaining (from 1 to 0)
    # this is used to update the learning rate
    self._current_progress_remaining = 1
    # Buffers for logging
    self.ep_info_buffer = None  # type: Optional[deque]
    self.ep_success_buffer = None  # type: Optional[deque]
    # For logging
    self._n_updates = 0  # type: int

    # Create and wrap the env if needed
    if env is not None:
        if isinstance(env, str):
            if create_eval_env:
                self.eval_env = maybe_make_env(env, monitor_wrapper, self.verbose)

        env = maybe_make_env(env, monitor_wrapper, self.verbose)
        env = self._wrap_env(env)

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env

        if not support_multi_env and self.n_envs > 1:
            raise ValueError(
                "Error: the model does not support multiple envs; it requires "
                "a single vectorized environment."
            )

        # gSDE constrains the *action* space, so that is what must be checked.
        # The check lives under `env is not None` because both spaces are
        # still None when no environment was provided.
        if self.use_sde and not isinstance(self.action_space, gym.spaces.Box):
            raise ValueError(
                "generalized State-Dependent Exploration (gSDE) can only be used with continuous actions."
            )
def __init__(
    self,
    policy: Type[BasePolicy],
    env: Union[GymEnv, str, None],
    policy_base: Type[BasePolicy],
    learning_rate: Union[float, Schedule],
    policy_kwargs: Optional[Dict[str, Any]] = None,
    tensorboard_log: Optional[str] = None,
    verbose: int = 0,
    device: Union[th.device, str] = "auto",
    support_multi_env: bool = False,
    create_eval_env: bool = False,
    monitor_wrapper: bool = True,
    seed: Optional[int] = None,
    use_sde: bool = False,
    sde_sample_freq: int = -1,
    supported_action_spaces: Optional[Tuple[gym.spaces.Space, ...]] = None,
):
    """
    Store the algorithm configuration and, when ``env`` is given, set it up.

    :param policy: Policy class, or a policy name that is resolved with
        ``get_policy_from_name(policy_base, policy)`` when ``policy_base``
        is not None.
    :param env: Environment instance, an id string, or None (environment
        setup and all related checks are skipped in that case).
    :param policy_base: Base class used to resolve ``policy`` given as a string.
    :param learning_rate: Stored as ``self.learning_rate``.
    :param policy_kwargs: Stored as ``self.policy_kwargs``; None becomes ``{}``.
    :param tensorboard_log: Stored as ``self.tensorboard_log``.
    :param verbose: The device is printed when > 0; also forwarded to
        ``maybe_make_env`` and ``self._wrap_env``.
    :param device: Forwarded to ``get_device``.
    :param support_multi_env: When False, an env reporting ``num_envs > 1``
        is rejected.
    :param create_eval_env: When ``env`` is a string, also create
        ``self.eval_env`` from the same id.
    :param monitor_wrapper: Forwarded to ``self._wrap_env``.
    :param seed: Stored as ``self.seed``.
    :param use_sde: Stored as ``self.use_sde``; requires a ``gym.spaces.Box``
        action space.
    :param sde_sample_freq: Stored as ``self.sde_sample_freq``.
    :param supported_action_spaces: When not None, the env's action space
        must satisfy ``isinstance(action_space, supported_action_spaces)``.
    :raises AssertionError: If the action space is not an instance of
        ``supported_action_spaces``.
    :raises ValueError: If ``support_multi_env`` is False and the wrapped env
        has more than one sub-environment; if ``policy`` is ``"MlpPolicy"``
        or ``"CnnPolicy"`` while the observation space is a
        ``gym.spaces.Dict``; or if ``use_sde`` is True and the action space
        is not a ``gym.spaces.Box``.
    """
    if isinstance(policy, str) and policy_base is not None:
        self.policy_class = get_policy_from_name(policy_base, policy)
    else:
        self.policy_class = policy

    self.device = get_device(device)
    if verbose > 0:
        print(f"Using {self.device} device")

    self.env = None  # type: Optional[GymEnv]
    # get VecNormalize object if needed
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.verbose = verbose
    self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    self.observation_space = None  # type: Optional[gym.spaces.Space]
    self.action_space = None  # type: Optional[gym.spaces.Space]
    self.n_envs = None
    self.num_timesteps = 0
    # Used for updating schedules
    self._total_timesteps = 0
    # Used for computing fps, it is updated at each call of learn()
    self._num_timesteps_at_start = 0
    self.eval_env = None
    self.seed = seed
    self.action_noise = None  # type: Optional[ActionNoise]
    self.start_time = None
    self.policy = None
    self.learning_rate = learning_rate
    self.tensorboard_log = tensorboard_log
    self.lr_schedule = None  # type: Optional[Schedule]
    self._last_obs = None  # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]
    self._last_episode_starts = None  # type: Optional[np.ndarray]
    # When using VecNormalize:
    self._last_original_obs = None  # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]
    self._episode_num = 0
    # Used for gSDE only
    self.use_sde = use_sde
    self.sde_sample_freq = sde_sample_freq
    # Track the training progress remaining (from 1 to 0)
    # this is used to update the learning rate
    self._current_progress_remaining = 1
    # Buffers for logging
    self.ep_info_buffer = None  # type: Optional[deque]
    self.ep_success_buffer = None  # type: Optional[deque]
    # For logging (and TD3 delayed updates)
    self._n_updates = 0  # type: int
    # The logger object
    self._logger = None  # type: Logger
    # Whether the user passed a custom logger or not
    self._custom_logger = False

    # Create and wrap the env if needed
    if env is not None:
        if isinstance(env, str):
            if create_eval_env:
                self.eval_env = maybe_make_env(env, self.verbose)

        env = maybe_make_env(env, self.verbose)
        env = self._wrap_env(env, self.verbose, monitor_wrapper)

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env

        if supported_action_spaces is not None and not isinstance(
            self.action_space, supported_action_spaces
        ):
            # Raised explicitly rather than via `assert` so the validation is
            # not stripped under `python -O`; AssertionError is kept so that
            # existing callers observe the same exception type and message.
            raise AssertionError(
                f"The algorithm only supports {supported_action_spaces} as action spaces "
                f"but {self.action_space} was provided"
            )

        if not support_multi_env and self.n_envs > 1:
            raise ValueError(
                "Error: the model does not support multiple envs; it requires "
                "a single vectorized environment."
            )

        # Catch common mistake: using MlpPolicy/CnnPolicy instead of MultiInputPolicy
        if policy in ["MlpPolicy", "CnnPolicy"] and isinstance(
            self.observation_space, gym.spaces.Dict
        ):
            raise ValueError(
                f"You must use `MultiInputPolicy` when working with dict observation space, not {policy}"
            )

        if self.use_sde and not isinstance(self.action_space, gym.spaces.Box):
            raise ValueError(
                "generalized State-Dependent Exploration (gSDE) can only be used with continuous actions."
            )