def train(
    self,
    actor: Actor,
    replay: Replay = None,
    lr: float = 3e-4,
    epochs: int = 200,
    steps_per_epoch: int = 4000,
    max_ep_len: int = 1000,
    gamma: float = 0.99,
    callbacks: Iterable[Callable] = (),
):
    """Vanilla Policy Gradients algorithm with no added bells or whistles.

    Parameters
    ----------
    actor: (base.Actor) Actor (policy) network to optimize.
    replay: (base.Replay, optional) Experience replay object for sampling
        previous experiences. If not provided, defaults to 'NoReplay' sized
        to hold 'steps_per_epoch' transitions. Users can provide a replay
        object, which is pre-populated with experiences (for specific use
        cases).
    lr: (float, optional) Learning rate for the actor optimizer.
        Default: 3e-4.
    steps_per_epoch: (int, optional) Number of steps of interaction
        for the agent and the environment in each epoch. Default: 4000.
    epochs: (int, optional) Number of training epochs. Default: 200.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    max_ep_len: (int, optional) Maximum length of episode. Defaults to 1000,
        but *this should be provided for each unique environment!* This has
        an effect on how end-of-episode rewards are computed.
    callbacks: (Iterable[Callable], optional) Callback functions to execute
        at the end of each training epoch.
    """
    device = utils.get_device(actor)
    self.optimizer = Adam(actor.parameters(), lr=lr)
    self.replay = NoReplay(steps_per_epoch) if replay is None else replay

    for epoch in range(1, epochs + 1):
        state = self.env.reset()
        ep_reward, ep_length = 0, 0
        num_episodes = 0

        for t in range(1, steps_per_epoch + 1):
            action, _ = actor(state.to(device))
            next_state, reward, done, _ = self.env.step(action)
            # Store the pre-step state, so it stays paired with the action
            # that was computed from it.
            self.replay.append([state, action, reward, done])
            state = next_state
            ep_reward += reward
            ep_length += 1

            if done or (ep_length == max_ep_len):
                num_episodes += 1
                self.ep_rewards.append(ep_reward)
                state = self.env.reset()
                ep_reward, ep_length = 0, 0

        self.update(actor, gamma=gamma)
        avg_reward = sum(self.ep_rewards[-num_episodes:]) / num_episodes
        print(f"\rEpoch {epoch} | Avg Reward {avg_reward}", end="")

        for callback in callbacks:
            callback(self)
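# Illustrative sketch (not part of this module): the role of `gamma` above.
# Vanilla policy gradients typically weight each log-probability by the
# discounted reward-to-go of its episode. The helper below is a hypothetical
# example with made-up argument names (`rewards` and `dones` are flat float
# tensors, with `dones` holding 1.0 at terminal steps); it is not necessarily
# the exact rule this trainer's `update` uses.
import torch


def discounted_rewards_to_go(rewards: torch.Tensor, dones: torch.Tensor, gamma: float = 0.99) -> torch.Tensor:
    """Compute discounted rewards-to-go, resetting at episode boundaries."""
    returns = torch.zeros_like(rewards)
    running = 0.0
    # Iterate backwards so each step accumulates the discounted future reward.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * (1.0 - dones[t]) * running
        returns[t] = running
    return returns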
def update(
    self,
    iteration: int,
    actor: Actor,
    critics: Iterable[Critic],
    batch_size: int = 128,
    gamma: float = 0.99,
    target_noise: float = 0.2,
    noise_clip: float = 0.5,
    policy_delay: int = 2,
    polyak: float = 0.995,
):
    """Samples from the experience replay and performs a single TD3 update.

    Parameters
    ----------
    iteration: (int) Number of update iterations that have been performed
        during this update step. Used to enforce the policy update delay.
    actor: (base.Actor) Actor (policy) network to optimize.
    critics: (Iterable[base.Critic]) Critic networks to optimize. In
        standard TD3 there are *two* critics, but this method only requires
        that *two or more* critics are provided.
    batch_size: (int, optional) Minibatch size for SGD. Default: 128.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    target_noise: (float, optional) Stddev for smoothing noise added to
        target policy. Default: 0.2.
    noise_clip: (float, optional) Max absolute value of target policy
        smoothing noise. Default: 0.5.
    policy_delay: (int, optional) Policy will only be updated once every
        policy_delay times for each update of the Q-networks. Default: 2.
    polyak: (float, optional) Interpolation factor in polyak averaging for
        target networks. Range: (0, 1). Default: 0.995.
    """
    device = utils.get_device(actor)
    batch = self.replay.sample(batch_size, device=device)

    self.critic_optimizer.zero_grad()
    self.critic_loss(
        batch,
        critics,
        gamma=gamma,
        target_noise=target_noise,
        noise_clip=noise_clip,
    ).backward()
    self.critic_optimizer.step()

    # Delayed policy updates: only touch the actor (and the target networks)
    # once every 'policy_delay' critic updates.
    if iteration % policy_delay != 0:
        return

    self.actor_optimizer.zero_grad()
    actor_loss(batch, actor, critics).backward()
    self.actor_optimizer.step()

    # Polyak averaging of the target networks.
    for p, pt in zip(actor.parameters(), self.target_actor.parameters()):
        pt.data = pt.data * polyak + (1 - polyak) * p.data

    for critic, target in zip(critics, self.target_critics):
        for p, pt in zip(critic.parameters(), target.parameters()):
            pt.data = pt.data * polyak + (1 - polyak) * p.data
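# Illustrative sketch (not part of this module): how `target_noise` and
# `noise_clip` above are typically used for TD3's target policy smoothing,
# combined with the clipped double-Q backup over all target critics. The
# argument names, a target actor that returns a plain action tensor, and a
# critic called as critic(state, action) are assumptions; the module's own
# `critic_loss` may differ in detail.
import torch


def td3_q_target(rewards, dones, next_states, target_actor, target_critics,
                 gamma=0.99, target_noise=0.2, noise_clip=0.5):
    with torch.no_grad():
        # Smooth the target action with clipped Gaussian noise.
        next_actions = target_actor(next_states)
        noise = torch.clamp(
            target_noise * torch.randn_like(next_actions), -noise_clip, noise_clip
        )
        # (A full implementation would also clip the result to the action bounds.)
        next_actions = next_actions + noise
        # Clipped double-Q: take the minimum over all target critics.
        next_q = torch.min(
            torch.stack([c(next_states, next_actions) for c in target_critics]), dim=0
        ).values
        return rewards + gamma * (1.0 - dones) * next_q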
def update(
    self,
    actor: Actor,
    critic: Critic,
    batch_size: int = 128,
    gamma: float = 0.99,
    polyak: float = 0.995,
):
    """Samples from the experience replay and performs a single DDPG update.

    Parameters
    ----------
    actor: (base.Actor) Actor (policy) network to optimize.
    critic: (base.Critic) Critic network to optimize.
    batch_size: (int, optional) Minibatch size for SGD. Default: 128.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    polyak: (float, optional) Interpolation factor in polyak averaging for
        target networks. Range: (0, 1). Default: 0.995.
    """
    device = utils.get_device(actor)
    batch = self.replay.sample(batch_size, device=device)

    self.critic_optimizer.zero_grad()
    self.critic_loss(batch, critic, gamma=gamma).backward()
    self.critic_optimizer.step()

    self.actor_optimizer.zero_grad()
    actor_loss(batch, actor, critic).backward()
    self.actor_optimizer.step()

    # Polyak averaging of the target networks.
    for p, pt in zip(actor.parameters(), self.target_actor.parameters()):
        pt.data = pt.data * polyak + (1 - polyak) * p.data

    for p, pt in zip(critic.parameters(), self.target_critic.parameters()):
        pt.data = pt.data * polyak + (1 - polyak) * p.data
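# Illustrative sketch (not part of this module): the polyak averaging rule
# applied to the target networks above, extracted into a standalone helper
# to make the role of `polyak` explicit. `net` and `target` are any two
# torch modules with identically-shaped parameters.
import torch


@torch.no_grad()
def polyak_update(net: torch.nn.Module, target: torch.nn.Module, polyak: float = 0.995) -> None:
    # target <- polyak * target + (1 - polyak) * online, parameter by parameter.
    for p, pt in zip(net.parameters(), target.parameters()):
        pt.data.mul_(polyak).add_((1.0 - polyak) * p.data)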
def train(
    self,
    actor: Actor,
    critic: Critic,
    replay: Replay = None,
    actor_lr: float = 3e-4,
    critic_lr: float = 1e-3,
    train_actor_iters: int = 80,
    train_critic_iters: int = 80,
    epochs: int = 200,
    steps_per_epoch: int = 4000,
    clip_ratio: float = 0.2,
    gamma: float = 0.99,
    lam: float = 0.97,
    target_kl: float = 0.01,
    max_ep_len: int = 1000,
    callbacks: Iterable[Callable] = (),
):
    """Proximal Policy Optimization (via objective clipping) with early
    stopping based on approximate KL divergence of the policy network.

    Parameters
    ----------
    actor: (base.Actor) Actor (policy) network to optimize.
    critic: (base.Critic) Critic network to optimize.
    replay: (base.Replay, optional) Experience replay object for sampling
        previous experiences. If not provided, defaults to 'NoReplay' sized
        to hold 'steps_per_epoch' transitions. Users can provide a replay
        object, which is pre-populated with experiences (for specific use
        cases).
    steps_per_epoch: (int, optional) Number of steps of interaction
        for the agent and the environment in each epoch. Default: 4000.
    epochs: (int, optional) Number of training epochs. Default: 200.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    actor_lr: (float, optional) Learning rate for the actor optimizer.
        Default: 3e-4.
    critic_lr: (float, optional) Learning rate for the critic optimizer.
        Default: 1e-3.
    train_actor_iters: (int, optional) Max number of actor training steps
        per epoch. Default: 80.
    train_critic_iters: (int, optional) Max number of critic training steps
        per epoch. Default: 80.
    clip_ratio: (float, optional) Hyperparameter for clipping in the policy
        objective. Scales how much the policy is allowed to change per
        training update. Default: 0.2.
    lam: (float, optional) Hyperparameter for GAE-Lambda calculation.
        Range: (0, 1). Default: 0.97.
    target_kl: (float, optional) Max KL divergence between new and old
        policies after an update. Used for early stopping. Typically in
        range (0.01, 0.05). Default: 0.01.
    max_ep_len: (int, optional) Maximum length of episode. Defaults to 1000,
        but *this should be provided for each unique environment!* This has
        an effect on how end-of-episode rewards are computed.
    callbacks: (Iterable[Callable], optional) Callback functions to execute
        at the end of each training epoch.
    """
    device = utils.get_device(actor)
    self.actor_optimizer = Adam(actor.parameters(), lr=actor_lr)
    self.critic_optimizer = Adam(critic.parameters(), lr=critic_lr)
    self.replay = NoReplay(steps_per_epoch) if replay is None else replay

    for epoch in range(1, epochs + 1):
        state = self.env.reset()
        ep_reward, ep_length = 0, 0
        num_episodes = 0

        for t in range(1, steps_per_epoch + 1):
            action, logprob = actor(state.to(device))
            next_state, reward, done, _ = self.env.step(action)
            # Ignore the 'done' signal if it comes from hitting the time limit.
            done = False if ep_length + 1 == max_ep_len else done
            self.replay.append([state, action, logprob, reward, done, next_state])
            state = next_state
            ep_reward += reward
            ep_length += 1

            if done or (ep_length == max_ep_len):
                num_episodes += 1
                self.ep_rewards.append(ep_reward)
                state = self.env.reset()
                ep_reward, ep_length = 0, 0

        self.update(
            actor,
            critic,
            train_actor_iters=train_actor_iters,
            train_critic_iters=train_critic_iters,
            clip_ratio=clip_ratio,
            gamma=gamma,
            lam=lam,
            target_kl=target_kl,
        )
        avg_reward = sum(self.ep_rewards[-num_episodes:]) / num_episodes
        print(f"\rEpoch {epoch} | Avg Reward {avg_reward}", end="")

        for callback in callbacks:
            callback(self)
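# Illustrative sketch (not part of this module): the clipped surrogate
# objective and the approximate KL that `clip_ratio` and `target_kl` above
# refer to. The argument names are hypothetical flat tensors; the module's
# actual actor loss may be implemented differently.
import torch


def ppo_clip_loss(logprobs, old_logprobs, advantages, clip_ratio=0.2):
    ratio = torch.exp(logprobs - old_logprobs)
    clipped = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * advantages
    loss = -torch.min(ratio * advantages, clipped).mean()
    # Approximate KL(old || new); when it exceeds `target_kl`, actor updates
    # for the current epoch are typically stopped early.
    approx_kl = (old_logprobs - logprobs).mean()
    return loss, approx_kl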
def train(
    self,
    actor: Actor,
    critics: Iterable[Critic],
    replay: Replay = None,
    steps_per_epoch: int = 4000,
    epochs: int = 100,
    gamma: float = 0.99,
    polyak: float = 0.995,
    actor_lr: float = 5e-4,
    critic_lr: float = 1e-3,
    alpha: float = 0.2,
    batch_size: int = 128,
    start_steps: int = 4000,
    update_after: int = 1000,
    update_every: int = 1,
    max_ep_len: int = 1000,
    callbacks: Iterable[Callable] = (),
):
    """Soft Actor-Critic (SAC) training algorithm. Supports both continuous
    and discrete action spaces.

    Parameters
    ----------
    actor: (base.Actor) Actor (policy) network to optimize.
    critics: (Iterable[base.Critic]) Critic networks to optimize. In
        standard SAC there are *two* critics, but this method only requires
        that *two or more* critics are provided.
    replay: (base.Replay, optional) Experience replay object for sampling
        previous experiences. If not provided, defaults to 'ExperienceReplay'
        with a buffer size of 1,000,000. Users can provide a replay object,
        which is pre-populated with experiences (for specific use cases).
    steps_per_epoch: (int, optional) Number of steps of interaction
        for the agent and the environment in each epoch. Default: 4000.
    epochs: (int, optional) Number of training epochs. Default: 100.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    polyak: (float, optional) Interpolation factor in polyak averaging for
        target networks. Range: (0, 1). Default: 0.995.
    actor_lr: (float, optional) Learning rate for the actor optimizer.
        Default: 5e-4.
    critic_lr: (float, optional) Learning rate for the critic optimizer.
        Default: 1e-3.
    alpha: (float, optional) Entropy regularization coefficient. (Equivalent
        to inverse of reward scale in the original SAC paper.) Default: 0.2.
    batch_size: (int, optional) Minibatch size for SGD. Default: 128.
    start_steps: (int, optional) Number of steps for random action selection
        before running the real policy (helps exploration). Default: 4000.
    update_after: (int, optional) Number of env interactions to collect
        before starting to do gradient descent updates. Ensures the replay
        buffer is full enough for useful updates. Default: 1000.
    update_every: (int, optional) Number of env interactions that should
        elapse between gradient descent updates. Note: Regardless of how
        long you wait between updates, the ratio of env steps to gradient
        steps is locked to 1. Default: 1.
    max_ep_len: (int, optional) Maximum length of episode. Defaults to 1000,
        but *this should be provided for each unique environment!* This has
        an effect on how end-of-episode rewards are computed.
    callbacks: (Iterable[Callable], optional) Callback functions to execute
        at the end of each training epoch.
""" device = utils.get_device(actor) self.replay = replay if replay is None: self.replay = ExperienceReplay(int(1e6)) critic_params = chain(*[c.parameters() for c in critics]) self.critic_optimizer = torch.optim.Adam(critic_params, lr=critic_lr) self.actor_optimizer = torch.optim.Adam(actor.parameters(), lr=actor_lr) self.target_critics = deepcopy(critics) state = self.env.reset() ep_reward, ep_length = 0, 0 total_steps = steps_per_epoch * epochs for step in range(1, total_steps + 1): if step < start_steps: action = self.env.action_space.sample() else: action, _ = actor(state.to(device)) next_state, reward, done, _ = self.env.step(action) done = False if ep_length == max_ep_len else done self.replay.append([state, action, reward, done, next_state]) state = next_state ep_reward += reward ep_length += 1 if step > update_after and step % update_every == 0: for j in range(update_every): self.update( actor, critics, batch_size=batch_size, gamma=gamma, alpha=alpha, polyak=polyak, ) if step % steps_per_epoch == 0: for callback in callbacks: callback(self) if done or (ep_length == max_ep_len): self.ep_rewards.append(ep_reward) epoch = (step + 1) // steps_per_epoch print(f"\rEpoch {epoch} | Step {step} | Reward {ep_reward}", end="") state, ep_reward, ep_length = self.env.reset(), 0, 0
def train(
    self,
    actor: Actor,
    critic: Critic,
    replay: Replay = None,
    actor_lr: float = 3e-4,
    critic_lr: float = 1e-3,
    train_critic_iters: int = 10,
    epochs: int = 200,
    steps_per_epoch: int = 4000,
    max_ep_len: int = 1000,
    gamma: float = 0.99,
    lam: float = 0.97,
    callbacks: Iterable[Callable] = (),
):
    """Advantage Actor-Critic (A2C) algorithm for training RL agents in both
    continuous and discrete action spaces.

    **NOTE:** Synchronous A2C was chosen over the asynchronous version (A3C)
    due to its simplicity. It's also questionable whether A3C performs better
    than A2C in the first place (in terms of the resulting trained policy,
    not training speed in Python). Other people/organizations have also
    pointed this out, including OpenAI.

    Parameters
    ----------
    actor: (base.Actor) Actor (policy) network to optimize.
    critic: (base.Critic) Critic network to optimize.
    replay: (base.Replay, optional) Experience replay object for sampling
        previous experiences. If not provided, defaults to 'NoReplay' sized
        to hold 'steps_per_epoch' transitions. Users can provide a replay
        object, which is pre-populated with experiences (for specific use
        cases).
    steps_per_epoch: (int, optional) Number of steps of interaction
        for the agent and the environment in each epoch. Default: 4000.
    epochs: (int, optional) Number of training epochs. Default: 200.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    actor_lr: (float, optional) Learning rate for the actor optimizer.
        Default: 3e-4.
    critic_lr: (float, optional) Learning rate for the critic optimizer.
        Default: 1e-3.
    train_critic_iters: (int, optional) Max number of critic training steps
        per epoch. Default: 10.
    lam: (float, optional) Hyperparameter for GAE-Lambda calculation.
        Range: (0, 1). Default: 0.97.
    max_ep_len: (int, optional) Maximum length of episode. Defaults to 1000,
        but *this should be provided for each unique environment!* This has
        an effect on how end-of-episode rewards are computed.
    callbacks: (Iterable[Callable], optional) Callback functions to execute
        at the end of each training epoch.
    """
    device = utils.get_device(actor)
    self.actor_optimizer = Adam(actor.parameters(), lr=actor_lr)
    self.critic_optimizer = Adam(critic.parameters(), lr=critic_lr)
    self.replay = NoReplay(steps_per_epoch) if replay is None else replay

    for epoch in range(1, epochs + 1):
        state = self.env.reset()
        ep_reward, ep_length = 0, 0
        num_episodes = 0

        for t in range(1, steps_per_epoch + 1):
            action, logprob = actor(state.to(device))
            next_state, reward, done, _ = self.env.step(action)
            self.replay.append([state, action, logprob, reward, done, next_state])
            state = next_state
            ep_reward += reward
            ep_length += 1

            if done or (ep_length == max_ep_len):
                num_episodes += 1
                self.ep_rewards.append(ep_reward)
                state = self.env.reset()
                ep_reward, ep_length = 0, 0

        self.update(
            actor,
            critic,
            train_critic_iters=train_critic_iters,
            gamma=gamma,
            lam=lam,
        )
        avg_reward = sum(self.ep_rewards[-num_episodes:]) / num_episodes
        print(f"\rEpoch {epoch} | Avg Reward {avg_reward}", end="")

        for callback in callbacks:
            callback(self)
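# Illustrative sketch (not part of this module): the GAE-Lambda advantage
# estimate that the `lam` parameter above refers to. Inputs are hypothetical
# flat tensors over one epoch of experience: `values` holds the critic's
# estimate for each state, `dones` is a float mask (1.0 at terminal steps),
# and `last_value` bootstraps the final state.
import torch


def gae_advantages(rewards, values, dones, last_value, gamma=0.99, lam=0.97):
    advantages = torch.zeros_like(rewards)
    next_advantage, next_value = 0.0, last_value
    # Work backwards, accumulating discounted TD residuals.
    for t in reversed(range(len(rewards))):
        mask = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * mask - values[t]
        next_advantage = delta + gamma * lam * mask * next_advantage
        advantages[t] = next_advantage
        next_value = values[t]
    return advantages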
def train(
    self,
    actor: Actor,
    critics: Iterable[Critic],
    replay: Replay = None,
    steps_per_epoch: int = 4000,
    epochs: int = 100,
    gamma: float = 0.99,
    polyak: float = 0.995,
    actor_lr: float = 5e-4,
    critic_lr: float = 1e-3,
    batch_size: int = 128,
    start_steps: int = 5000,
    update_after: int = 1000,
    update_every: int = 50,
    act_noise: float = 0.1,
    target_noise: float = 0.2,
    noise_clip: float = 0.5,
    policy_delay: int = 2,
    max_ep_len: int = 1000,
    callbacks: Iterable[Callable] = (),
):
    """Twin-Delayed Deep Deterministic (TD3) Policy Gradients training
    algorithm. Supports only *deterministic* policies in *continuous* action
    spaces.

    Parameters
    ----------
    actor: (base.Actor) Actor (policy) network to optimize.
    critics: (Iterable[base.Critic]) Critic networks to optimize. In
        standard TD3 there are *two* critics, but this method only requires
        that *two or more* critics are provided.
    replay: (base.Replay, optional) Experience replay object for sampling
        previous experiences. If not provided, defaults to 'ExperienceReplay'
        with a buffer size of 1,000,000. Users can provide a replay object,
        which is pre-populated with experiences (for specific use cases).
    steps_per_epoch: (int, optional) Number of steps of interaction
        for the agent and the environment in each epoch. Default: 4000.
    epochs: (int, optional) Number of training epochs. Default: 100.
    gamma: (float, optional) Discount factor. Range: (0, 1). Default: 0.99.
    polyak: (float, optional) Interpolation factor in polyak averaging for
        target networks. Range: (0, 1). Default: 0.995.
    actor_lr: (float, optional) Learning rate for the actor optimizer.
        Default: 5e-4.
    critic_lr: (float, optional) Learning rate for the critic optimizer.
        Default: 1e-3.
    batch_size: (int, optional) Minibatch size for SGD. Default: 128.
    start_steps: (int, optional) Number of steps for random action selection
        before running the real policy (helps exploration). Default: 5000.
    update_after: (int, optional) Number of env interactions to collect
        before starting to do gradient descent updates. Ensures the replay
        buffer is full enough for useful updates. Default: 1000.
    update_every: (int, optional) Number of env interactions that should
        elapse between gradient descent updates. Note: Regardless of how
        long you wait between updates, the ratio of env steps to gradient
        steps is locked to 1. Default: 50.
    act_noise: (float, optional) Stddev for Gaussian exploration noise added
        to policy at training time. Default: 0.1.
    target_noise: (float, optional) Stddev for smoothing noise added to
        target policy. Default: 0.2.
    noise_clip: (float, optional) Max absolute value of target policy
        smoothing noise. Default: 0.5.
    policy_delay: (int, optional) Policy will only be updated once every
        policy_delay times for each update of the Q-networks. Default: 2.
    max_ep_len: (int, optional) Maximum length of episode. Defaults to 1000,
        but *this should be provided for each unique environment!* This has
        an effect on how end-of-episode rewards are computed.
    callbacks: (Iterable[Callable], optional) Callback functions to execute
        at the end of each training epoch.
""" device = utils.get_device(actor) self.replay = replay if replay is None: self.replay = ExperienceReplay(int(1e6)) critic_params = chain(*[c.parameters() for c in critics]) self.critic_optimizer = Adam(critic_params, lr=critic_lr) self.actor_optimizer = Adam(actor.parameters(), lr=actor_lr) self.target_actor = deepcopy(actor) self.target_critics = deepcopy(critics) total_steps = steps_per_epoch * epochs state, ep_reward, ep_length = self.env.reset(), 0, 0 for step in range(1, total_steps + 1): if step < start_steps: action = self.env.action_space.sample() else: action, _ = actor(state.to(device)) action += act_noise * torch.randn_like(action) next_state, reward, done, _ = self.env.step(action) done = False if ep_length == max_ep_len else done self.replay.append([state, action, reward, done, next_state]) state = next_state ep_reward += reward ep_length += 1 if step >= update_after and step % update_every == 0: for iter in range(update_every): self.update( iter, actor, critics, batch_size=batch_size, gamma=gamma, target_noise=target_noise, noise_clip=noise_clip, policy_delay=policy_delay, polyak=polyak, ) if step % steps_per_epoch == 0: for callback in callbacks: callback(self) if done or (ep_length == max_ep_len): self.ep_rewards.append(ep_reward) epoch = (step + 1) // steps_per_epoch print(f"\rEpoch {epoch} | Step {step} | Reward {ep_reward}", end="") state, ep_reward, ep_length = self.env.reset(), 0, 0