def __init__(
    self,
    observation_space,
    action_space,
    learning_rate=0.001,
    update_period=100,
    embedding_dim=10,
    net_fn=None,
    net_kwargs=None,
    device="cuda:best",
    rate_power=0.5,
    batch_size=10,
    memory_size=10000,
    with_action=False,
    **kwargs
):
    """
    Store the estimator's hyper-parameters, network factory and memory.

    Parameters
    ----------
    observation_space : spaces.Box
        Observation space; anything other than a Box fails the assertion.
    action_space
        Action space. Its ``n`` attribute is read only when
        ``with_action`` is True.
    learning_rate : float
        Stored as ``self.learning_rate``.
    update_period : int
        Stored as ``self.update_period``.
    embedding_dim : int
        Embedding size. The network output size is ``embedding_dim``, or
        ``embedding_dim * action_space.n`` when ``with_action`` is True.
    net_fn : str, callable or None
        Network factory. A string is resolved with ``load``; when falsy,
        ``get_network`` is used, pre-bound to the observation shape and
        the output size.
    net_kwargs : dict or None
        Extra arguments kept for the network factory. If it contains an
        ``"out_size"`` entry, that entry is replaced by the computed
        output size. The dictionary passed by the caller is not modified.
    device : str
        Passed to ``choose_device``.
    rate_power : float
        Stored as ``self.rate_power``.
    batch_size : int
        Stored as ``self.batch_size``.
    memory_size : int
        Capacity of the ``ReplayMemory`` created here.
    with_action : bool
        Whether the output size is multiplied by the number of actions.
    **kwargs
        Accepted and ignored.
    """
    assert isinstance(observation_space, spaces.Box)
    UncertaintyEstimator.__init__(self, observation_space, action_space)

    self.learning_rate = learning_rate
    self.loss_fn = F.mse_loss
    self.update_period = update_period
    self.embedding_dim = embedding_dim

    out_size = embedding_dim * action_space.n if with_action else embedding_dim

    if isinstance(net_fn, str):
        self.net_fn = load(net_fn)
    else:
        self.net_fn = net_fn or partial(
            get_network,
            shape=observation_space.shape,
            embedding_dim=out_size,
        )

    # Copy before the possible "out_size" overwrite below: the same kwargs
    # dict may be reused by the caller to build other estimators.
    self.net_kwargs = dict(net_kwargs) if net_kwargs else {}
    if "out_size" in self.net_kwargs:
        self.net_kwargs["out_size"] = out_size

    self.device = choose_device(device)
    self.rate_power = rate_power
    self.batch_size = batch_size
    self.memory = ReplayMemory(capacity=memory_size)
    self.with_action = with_action
    self.reset()
def read_env_config(config_path):
    """
    Read .yaml config file for an environment instance.

    The file contains the environment constructor and its params.

    Example:

    ``` env.yaml
        constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom'
        params:
            reward_free: false
            array_observation: true
            nrooms: 5
    ```

    Parameters
    ----------
    config_path : str
        yaml file name containing the env config

    Returns
    -------
    Tuple (constructor, kwargs) for the env. ``kwargs`` is an empty dict
    when the file has no ``params`` entry or the entry is empty.

    Raises
    ------
    KeyError
        If the file has no ``constructor`` entry.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(config_path, encoding="utf-8") as file:
        env_config = yaml.safe_load(file)

    constructor = load(env_config["constructor"])
    # An empty `params:` entry is parsed as None; normalise both the missing
    # and the empty case to a dict so the result can always be **-unpacked.
    params = env_config.get("params") or {}
    return constructor, params
def read_agent_config(config_path):
    """
    Read .yaml config file for an Agent instance.

    The file contains the agent class and its parameters. It may also hold
    a ``base_config`` entry pointing to another agent yaml file; in that
    case the base file is loaded and, for every key in ``_AGENT_KEYS``, the
    section found in ``config_path`` updates (or is added to) the base one.

    TODO: recursive update of base_config.

    Example:

    ``` myagent.yaml
        agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent'
        gamma: 1.0
        lp_metric: 2
        min_dist: 0.0
        max_repr: 800
        bonus_scale_factor: 1.0
    ```

    Parameters
    ----------
    config_path : str
        yaml file name containing the agent config

    Returns
    -------
    agent_class
        Object obtained by calling ``load`` on the ``agent_class`` entry.
    base_config : dict
        The (merged) configuration, with the ``agent_class`` entry removed.
        NOTE(review): presumably keyed by the entries of ``_AGENT_KEYS``
        ('init_kwargs', 'eval_kwargs', 'fit_kwargs') -- confirm against
        ``process_agent_yaml``.

    Raises
    ------
    KeyError
        If the resulting configuration has no ``agent_class`` entry.
    """
    agent_config = process_agent_yaml(config_path)
    base_config_yaml = agent_config.pop("base_config", None)

    # TODO: recursive update
    if base_config_yaml is None:
        base_config = agent_config
    else:
        base_config = process_agent_yaml(base_config_yaml)
        for key in _AGENT_KEYS:
            if key not in agent_config:
                # Nothing to override for this section: keep the base value
                # instead of failing on the missing key.
                continue
            overrides = agent_config[key]
            if key in base_config:
                base_config[key].update(overrides)
            else:
                base_config[key] = overrides

    agent_class = load(base_config.pop("agent_class"))
    return agent_class, base_config
def __init__(self,
             env,
             uncertainty_estimator_fn,
             uncertainty_estimator_kwargs=None,
             bonus_scale_factor=1.0,
             bonus_max=np.inf):
    """
    Wrap ``env`` and build the uncertainty estimator attached to it.

    Parameters
    ----------
    env
        Environment to wrap. Its ``observation_space`` and ``action_space``
        are handed to the estimator factory.
    uncertainty_estimator_fn : str or callable
        Estimator factory. A string is resolved with ``load``. The factory
        is called as ``fn(observation_space, action_space, **kwargs)``.
    uncertainty_estimator_kwargs : dict or None
        Keyword arguments for the factory; treated as empty when falsy.
    bonus_scale_factor : float
        Stored as ``self.bonus_scale_factor``.
    bonus_max : float
        Stored as ``self.bonus_max``.
    """
    Wrapper.__init__(self, env)

    self.bonus_scale_factor = bonus_scale_factor
    self.bonus_max = bonus_max

    # Resolve the factory: dotted-path strings go through `load`.
    estimator_factory = uncertainty_estimator_fn
    if isinstance(estimator_factory, str):
        estimator_factory = load(estimator_factory)

    estimator_options = uncertainty_estimator_kwargs or {}
    self.uncertainty_estimator = estimator_factory(
        env.observation_space,
        env.action_space,
        **estimator_options
    )

    self.previous_obs = None
class DQNAgent(AgentWithSimplePolicy):
    """DQN Agent based on PyTorch.

    Notes
    -----
    Uses Q(lambda) for computing targets by default. To recover
    the standard DQN, set :code:`lambda_ = 0.0` and :code:`chunk_size = 1`.

    Parameters
    ----------
    env: :class:`~rlberry.types.Env`
        Environment, can be a tuple (constructor, kwargs)
    gamma: float, default = 0.99
        Discount factor.
    batch_size: int, default=32
        Batch size.
    chunk_size: int, default=8
        Length of sub-trajectories sampled from the replay buffer.
    lambda_: float, default=0.5
        Q(lambda) parameter.
    target_update_parameter : int or float
        If int: interval (in total number of online updates) between
        updates of the target network.
        If float: soft update coefficient
    device: str
        Torch device, see :func:`~rlberry.utils.torch.choose_device`
    learning_rate : float, default = 1e-3
        Optimizer learning rate.
    loss_function: {"l1", "l2", "smooth_l1"}, default: "l2"
        Loss function used to compute Bellman error.
    epsilon_init: float, default = 1.0
        Initial epsilon value for epsilon-greedy exploration.
    epsilon_final: float, default = 0.1
        Final epsilon value for epsilon-greedy exploration.
    epsilon_decay_interval : int
        After :code:`epsilon_decay_interval` timesteps, epsilon approaches
        :code:`epsilon_final`.
    optimizer_type : {"ADAM", "RMS_PROP"}
        Optimization algorithm.
    q_net_constructor : Callable, str or None
        Function/constructor that returns a torch module for the Q-network:
        :code:`qnet = q_net_constructor(env, **kwargs)`.
        A string is resolved with :code:`load`; if None, the default
        Q-network constructor is used.

        Module (Q-network) requirements:

        * Input shape = (batch_dim, chunk_size, obs_dims)

        * Output shape = (batch_dim, chunk_size, number_of_actions)

    q_net_kwargs : optional, dict
        Parameters for q_net_constructor.
    use_double_dqn : bool, default = False
        If True, use Double DQN.
    use_prioritized_replay : bool, default = False
        If True, use Prioritized Experience Replay.
    train_interval: int
        Update the model every :code:`train_interval` steps.
        If -1, train only at the end of the episodes.
    gradient_steps: int
        How many gradient steps to do at each update.
        If -1, take the number of timesteps since last update.
    max_replay_size : int
        Maximum number of transitions in the replay buffer.
    learning_starts : int
        How many steps of the model to collect transitions for before
        learning starts
    eval_interval : int, default = None
        Interval (in number of transitions) between agent evaluations in
        fit(). If None, never evaluate.
    **kwargs
        Forwarded to :code:`AgentWithSimplePolicy.__init__`.
    """

    name = "DQN"

    def __init__(
        self,
        env: types.Env,
        gamma: float = 0.99,
        batch_size: int = 32,
        chunk_size: int = 8,
        lambda_: float = 0.5,
        target_update_parameter: Union[int, float] = 0.005,
        device: str = "cuda:best",
        learning_rate: float = 1e-3,
        epsilon_init: float = 1.0,
        epsilon_final: float = 0.1,
        epsilon_decay_interval: int = 20_000,
        loss_function: str = "l2",
        optimizer_type: str = "ADAM",
        q_net_constructor: Optional[Callable[..., torch.nn.Module]] = None,
        q_net_kwargs: Optional[dict] = None,
        use_double_dqn: bool = False,
        use_prioritized_replay: bool = False,
        train_interval: int = 10,
        gradient_steps: int = -1,
        max_replay_size: int = 200_000,
        learning_starts: int = 5_000,
        eval_interval: Optional[int] = None,
        **kwargs,
    ):
        # For all parameters, define self.param = param.
        # The declared arguments are read by name, so the frame's locals
        # mapping is never mutated: on recent Python versions it is a
        # write-through proxy that refuses removal of local variables.
        arg_names, _, varkw, values = inspect.getargvalues(inspect.currentframe())
        for arg in arg_names:
            if arg != "self":
                setattr(self, arg, values[arg])
        if varkw is not None:
            setattr(self, varkw, values[varkw])

        AgentWithSimplePolicy.__init__(self, env, **kwargs)
        # From here on, use the environment held by the base class.
        env = self.env
        assert isinstance(env.observation_space, spaces.Box)
        assert isinstance(env.action_space, spaces.Discrete)

        # DQN parameters

        # Online and target Q networks, torch device
        self._device = choose_device(device)
        if isinstance(q_net_constructor, str):
            q_net_ctor = load(q_net_constructor)
        elif q_net_constructor is None:
            q_net_ctor = default_q_net_fn
        else:
            # A user-supplied callable is used as is.
            q_net_ctor = q_net_constructor
        q_net_kwargs = q_net_kwargs or dict()
        self._qnet_online = q_net_ctor(env, **q_net_kwargs).to(self._device)
        self._qnet_target = q_net_ctor(env, **q_net_kwargs).to(self._device)

        # Optimizer and loss
        optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate}
        self._optimizer = optimizer_factory(
            self._qnet_online.parameters(), **optimizer_kwargs
        )
        self._loss_function = loss_function_factory(loss_function, reduction="none")

        # Training params
        self._train_interval = train_interval
        self._gradient_steps = gradient_steps
        self._learning_starts = learning_starts
        self._eval_interval = eval_interval

        # Setup replay buffer
        if hasattr(self.env, "_max_episode_steps"):
            max_episode_steps = self.env._max_episode_steps
        else:
            max_episode_steps = np.inf
        self._max_episode_steps = max_episode_steps

        self._replay_buffer = replay.ReplayBuffer(
            max_replay_size=max_replay_size,
            rng=self.rng,
            max_episode_steps=self._max_episode_steps,
            enable_prioritized=use_prioritized_replay,
        )
        self._replay_buffer.setup_entry("observations", np.float32)
        self._replay_buffer.setup_entry("next_observations", np.float32)
        self._replay_buffer.setup_entry("actions", np.int32)
        self._replay_buffer.setup_entry("rewards", np.float32)
        self._replay_buffer.setup_entry("dones", bool)

        # Counters
        self._total_timesteps = 0
        self._total_episodes = 0
        self._total_updates = 0
        self._timesteps_since_last_update = 0

        # epsilon scheduling
        self._epsilon_schedule = polynomial_schedule(
            self.epsilon_init,
            self.epsilon_final,
            power=1.0,
            transition_steps=self.epsilon_decay_interval,
            transition_begin=0,
        )
def __init__(self,
             env,
             n_episodes=1000,
             horizon=256,
             gamma=0.99,
             loss_function="l2",
             batch_size=100,
             device="cuda:best",
             target_update=1,
             learning_rate=0.001,
             epsilon_init=1.0,
             epsilon_final=0.1,
             epsilon_decay=5000,
             optimizer_type='ADAM',
             qvalue_net_fn=None,
             qvalue_net_kwargs=None,
             double=True,
             memory_capacity=10000,
             use_bonus=False,
             uncertainty_estimator_kwargs=None,
             prioritized_replay=True,
             update_frequency=1,
             **kwargs):
    """
    Build the replay memory, exploration policy, networks and optimizer.

    Parameters
    ----------
    env
        Environment; must expose a Discrete action space (asserted).
    n_episodes : int
        Stored as ``self.n_episodes``.
    horizon : int
        Stored as ``self.horizon``.
    gamma : float
        Stored as ``self.gamma`` and passed to the replay memory.
    loss_function : str
        Passed to ``loss_function_factory``; the resulting object replaces
        the string in ``self.loss_function``.
    batch_size : int
        Stored as ``self.batch_size``.
    device : str
        Passed to ``choose_device``.
    target_update
        Stored as ``self.target_update``.
    learning_rate : float
        ``lr`` entry of the optimizer arguments.
    epsilon_init, epsilon_final, epsilon_decay
        Passed to the "EpsilonGreedy" exploration method as
        ``temperature``, ``final_temperature`` and ``tau`` respectively.
    optimizer_type : str
        ``optimizer_type`` entry of the optimizer arguments.
    qvalue_net_fn : str, callable or None
        Q-value network factory, called as ``fn(self.env, **kwargs)``.
        A string is resolved with ``load``; when falsy,
        ``default_qvalue_net_fn`` is used.
    qvalue_net_kwargs : dict or None
        Keyword arguments for the network factory.
    double : bool
        Stored as ``self.double``.
    memory_capacity : int
        ``capacity`` of the replay memory.
    use_bonus : bool
        If True, ``env`` is wrapped in ``UncertaintyEstimatorWrapper``
        before the base class is initialized.
    uncertainty_estimator_kwargs : dict or None
        Keyword arguments for ``UncertaintyEstimatorWrapper``; only used
        when ``use_bonus`` is True.
    prioritized_replay : bool
        Selects ``PrioritizedReplayMemory`` (True) or
        ``TransitionReplayMemory`` (False).
    update_frequency : int
        Stored as ``self.update_frequency``.
    **kwargs
        Forwarded to ``IncrementalAgent.__init__``.
    """
    # Wrap arguments and initialize base class
    memory_kwargs = {
        'capacity': memory_capacity,
        'n_steps': 1,
        'gamma': gamma
    }
    exploration_kwargs = {
        'method': "EpsilonGreedy",
        'temperature': epsilon_init,
        'final_temperature': epsilon_final,
        'tau': epsilon_decay,
    }
    self.use_bonus = use_bonus
    if self.use_bonus:
        # Guard against the None default so a missing configuration is
        # reported by the wrapper's own signature, not by `**None`.
        env = UncertaintyEstimatorWrapper(
            env, **(uncertainty_estimator_kwargs or {}))
    IncrementalAgent.__init__(self, env, **kwargs)

    self.horizon = horizon
    self.exploration_kwargs = exploration_kwargs
    self.memory_kwargs = memory_kwargs
    self.n_episodes = n_episodes
    self.batch_size = batch_size
    self.target_update = target_update
    self.double = double

    assert isinstance(env.action_space, spaces.Discrete), \
        "Only compatible with Discrete action spaces."

    # Replay memory and exploration policy
    self.prioritized_replay = prioritized_replay
    memory_class = PrioritizedReplayMemory if prioritized_replay else TransitionReplayMemory
    self.memory = memory_class(**self.memory_kwargs)
    self.exploration_policy = \
        exploration_factory(self.env.action_space, **self.exploration_kwargs)

    # Training state
    self.training = True
    self.steps = 0
    self.episode = 0
    self.writer = None

    self.optimizer_kwargs = {
        'optimizer_type': optimizer_type,
        'lr': learning_rate
    }
    self.device = choose_device(device)
    self.gamma = gamma

    # Online and target Q-value networks; the target starts as a copy of
    # the online weights and is kept in eval mode.
    qvalue_net_kwargs = qvalue_net_kwargs or {}
    if isinstance(qvalue_net_fn, str):
        qvalue_net_fn = load(qvalue_net_fn)
    else:
        qvalue_net_fn = qvalue_net_fn or default_qvalue_net_fn
    self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)
    self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)
    self.target_net.load_state_dict(self.value_net.state_dict())
    self.target_net.eval()
    logger.info("Number of trainable parameters: {}".format(
        trainable_parameters(self.value_net)))
    self.value_net.to(self.device)
    self.target_net.to(self.device)

    # Loss and optimizer
    self.loss_function = loss_function_factory(loss_function)
    self.optimizer = optimizer_factory(self.value_net.parameters(),
                                       **self.optimizer_kwargs)
    self.update_frequency = update_frequency