Example #1
    def train(
            self,
            actor: Actor,
            replay: Replay = None,
            lr: float = 3e-4,
            epochs: int = 200,
            steps_per_epoch: int = 4000,
            max_ep_len: int = 1000,
            gamma: float = 0.99,
            callbacks: Iterable[Callable] = (),
    ):
        """Vanilla Policy Gradients algorithm with no added bells or whistles.

        Parameters
        ----------
        actor: (base.Actor) Actor (policy) network to optimize.
        replay: (base.Replay, optional) Experience replay object for sampling
            previous experiences.  If not provided, defaults to 'NoReplay' with
            a buffer size of 'steps_per_epoch'.  Users can provide a replay
            object that is pre-populated with experiences (for specific use cases).
        lr: (float, optional) Learning rate for the actor optimizer.  Default: 3e-4.
        steps_per_epoch: (int, optional) Number of steps of interaction
            between the agent and the environment in each epoch.  Default: 4000.
        epochs: (int, optional) Number of training epochs.  Default: 200.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        max_ep_len: (int, optional) Maximum length of episode.  Defaults to 1000,
            but *this should be provided for each unique environment!*  This
            has an effect on how end-of-episode rewards are computed.
        callbacks: (Iterable[Callable], optional) callback functions to execute
            at the end of each training epoch.
        """
        device = utils.get_device(actor)
        self.optimizer = Adam(actor.parameters(), lr=lr)
        self.replay = NoReplay(steps_per_epoch) if replay is None else replay

        for epoch in range(1, epochs + 1):
            state = self.env.reset()
            ep_reward, ep_length = 0, 0
            num_episodes = 0

            for t in range(1, steps_per_epoch + 1):
                action, _ = actor(state.to(device))

                next_state, reward, done, _ = self.env.step(action)
                self.replay.append([state, action, reward, done])
                state = next_state
                ep_reward += reward
                ep_length += 1

                if done or (ep_length == max_ep_len):
                    num_episodes += 1
                    self.ep_rewards.append(ep_reward)
                    state = self.env.reset()
                    ep_reward, ep_length = 0, 0

            self.update(actor, gamma=gamma)
            avg_reward = sum(self.ep_rewards[-num_episodes:]) / num_episodes
            print(f"\rEpoch {epoch} | Avg Reward {avg_reward}", end="")

            for callback in callbacks:
                callback(self)
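
The `self.update(actor, gamma=gamma)` call is not included in this snippet.  As a rough guide, a vanilla policy-gradient update reduces to computing reward-to-go returns and the REINFORCE loss.  The sketch below is illustrative only: `discounted_returns` and `vpg_loss` are hypothetical helpers, and it assumes log-probabilities are recomputed from the stored (state, action) pairs.

import torch


def discounted_returns(rewards, dones, gamma=0.99):
    """Reward-to-go returns, resetting the running sum at episode boundaries."""
    returns, running = [], 0.0
    for reward, done in zip(reversed(rewards), reversed(dones)):
        running = float(reward) + gamma * running * (1.0 - float(done))
        returns.append(running)
    returns.reverse()
    return torch.as_tensor(returns, dtype=torch.float32)


def vpg_loss(logprobs, returns):
    """REINFORCE objective: maximize E[log pi(a|s) * R] by minimizing its negative."""
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return -(logprobs * returns).mean()
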
Example #2
    def update(
        self,
        iteration: int,
        actor: Actor,
        critics: Iterable[Critic],
        batch_size: int = 128,
        gamma: float = 0.99,
        target_noise: float = 0.2,
        noise_clip: float = 0.5,
        policy_delay: int = 2,
        polyak: float = 0.995,
    ):
        """Samples from the experience replay and performs a single TD3 update.

        Parameters
        ----------
        iteration: (int) Number of update iterations that have been performed
            during this update step.  Used for monitoring policy update delays.
        actor: (base.Actor) Actor (policy) network to optimize.
        critics: (Iterable[base.Critic]) Critic networks to optimize.  In standard
            TD3 there are *two* critics, but this method only requires that *two or
            more* critics are provided.
        batch_size: (int, optional) Minibatch size for SGD.  Default: 128.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        target_noise: (float, optional) Stddev for smoothing noise added to
            target policy.  Default: 0.2.
        noise_clip: (float, optional) Max absolute value of target policy
            smoothing noise.  Default: 0.5.
        policy_delay: (int, optional) Policy will only be updated once every
            policy_delay times for each update of the Q-networks.  Default: 2.
        polyak: (float, optional) Interpolation factor in polyak averaging for
            target networks.  Range: (0, 1).  Default: 0.995
        """
        device = utils.get_device(actor)
        batch = self.replay.sample(batch_size, device=device)

        self.critic_optimizer.zero_grad()
        self.critic_loss(
            batch,
            critics,
            gamma=gamma,
            target_noise=target_noise,
            noise_clip=noise_clip,
        ).backward()
        self.critic_optimizer.step()

        if iteration % policy_delay != 0:
            return

        self.actor_optimizer.zero_grad()
        actor_loss(batch, actor, critics).backward()
        self.actor_optimizer.step()

        for p, pt in zip(actor.parameters(), self.target_actor.parameters()):
            pt.data = pt.data * polyak + (1 - polyak) * p.data
        for critic, target in zip(critics, self.target_critics):
            for p, pt in zip(critic.parameters(), target.parameters()):
                pt.data = pt.data * polyak + (1 - polyak) * p.data
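
Neither `self.critic_loss` nor `actor_loss` appears in this snippet.  The sketch below shows what those objectives typically look like in TD3 (clipped double-Q targets with target-policy smoothing, deterministic policy gradient through one critic).  The function names, the batch layout, the critic call signature, and the `(action, logprob)` actor interface are assumptions carried over from the surrounding examples.

import torch


def td3_critic_loss(batch, critics, target_actor, target_critics,
                    gamma=0.99, target_noise=0.2, noise_clip=0.5):
    """Clipped double-Q Bellman error with target-policy smoothing (sketch)."""
    states, actions, rewards, dones, next_states = batch  # assumed layout
    with torch.no_grad():
        next_actions, _ = target_actor(next_states)
        noise = torch.randn_like(next_actions) * target_noise
        next_actions = next_actions + noise.clamp(-noise_clip, noise_clip)
        # NOTE: actions would normally also be clamped to the action-space bounds.
        target_q = torch.min(
            torch.stack([tc(next_states, next_actions) for tc in target_critics]), dim=0
        ).values
        backup = rewards + gamma * (1 - dones) * target_q
    return sum(((c(states, actions) - backup) ** 2).mean() for c in critics)


def td3_actor_loss(batch, actor, critics):
    """Deterministic policy gradient through the first critic (sketch)."""
    states = batch[0]
    actions, _ = actor(states)
    first_critic = next(iter(critics))
    return -first_critic(states, actions).mean()
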
Example #3
    def update(
        self,
        actor: Actor,
        critic: Critic,
        batch_size: int = 128,
        gamma: float = 0.99,
        polyak: float = 0.995,
    ):
        """Samples from the experience replay and performs a single DDPG update.

        Parameters
        ----------
        actor: (base.Actor) Actor (policy) network to optimize.
        critic: (base.Critic) Critic network to optimize.
        batch_size: (int, optional) Minibatch size for SGD.  Default: 128.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        polyak: (float, optional) Interpolation factor in polyak averaging for
            target networks.  Range: (0, 1).  Default: 0.995
        """
        device = utils.get_device(actor)
        batch = self.replay.sample(batch_size, device=device)

        self.critic_optimizer.zero_grad()
        self.critic_loss(
            batch,
            critic,
            gamma=gamma,
        ).backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        actor_loss(batch, actor, critic).backward()
        self.actor_optimizer.step()

        for p, pt in zip(actor.parameters(), self.target_actor.parameters()):
            pt.data = pt.data * polyak + (1 - polyak) * p.data
        for p, pt in zip(critic.parameters(), self.target_critic.parameters()):
            pt.data = pt.data * polyak + (1 - polyak) * p.data
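
As with the TD3 example, the loss functions are referenced but not shown.  A minimal sketch of the standard DDPG objectives, under the same assumed batch layout and actor interface:

import torch


def ddpg_critic_loss(batch, critic, target_actor, target_critic, gamma=0.99):
    """Mean-squared Bellman error against the target networks (sketch)."""
    states, actions, rewards, dones, next_states = batch  # assumed layout
    with torch.no_grad():
        next_actions, _ = target_actor(next_states)
        backup = rewards + gamma * (1 - dones) * target_critic(next_states, next_actions)
    return ((critic(states, actions) - backup) ** 2).mean()


def ddpg_actor_loss(batch, actor, critic):
    """Deterministic policy gradient: maximize Q(s, pi(s))."""
    states = batch[0]
    actions, _ = actor(states)
    return -critic(states, actions).mean()
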
Example #4
    def train(
            self,
            actor: Actor,
            critic: Critic,
            replay: Replay = None,
            actor_lr: float = 3e-4,
            critic_lr: float = 1e-3,
            train_actor_iters: int = 80,
            train_critic_iters: int = 80,
            epochs: int = 200,
            steps_per_epoch: int = 4000,
            clip_ratio: float = 0.2,
            gamma: float = 0.99,
            lam: float = 0.97,
            target_kl: float = 0.01,
            max_ep_len: int = 1000,
            callbacks: Iterable[Callable] = (),
    ):
        """Proximal Policy Optimization (via objective clipping) with early
        stopping based on approximate KL divergence of the policy network.

        Parameters
        ----------
        actor: (base.Actor) Actor (policy) network to optimize.
        critic: (base.Critic) Critic network to optimize.
        replay: (base.Replay, optional) Experience replay object for sampling
            previous experiences.  If not provided, defaults to 'NoReplay' with
            a buffer size of 'steps_per_epoch'.  Users can provide a replay
            object that is pre-populated with experiences (for specific use cases).
        steps_per_epoch: (int, optional) Number of steps of interaction
            between the agent and the environment in each epoch.  Default: 4000.
        epochs: (int, optional) Number of training epochs.  Default: 200.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        actor_lr: (float, optional) Learning rate for the actor optimizer.  Default: 3e-4.
        critic_lr: (float, optional) Learning rate for the critic optimizer.  Default: 1e-3.
        train_actor_iters: (int, optional) Max number of actor training steps
            per epoch.  Default: 80.
        train_critic_iters: (int, optional) Max number of critic training steps
            per epoch.  Default: 80.
        clip_ratio: (float, optional) Hyperparameter for clipping in the policy
            objective.  Scales how much the policy is allowed to change per
            training update.  Default: 0.2.
        lam: (float, optional) Hyperparameter for GAE-Lambda calculation.
            Range: (0, 1).  Default: 0.97.
        target_kl: (float, optional) Max KL divergence between new and old
            policies after an update. Used for early stopping. Typically in
            range (0.01, 0.05).  Default: 0.01.
        max_ep_len: (int, optional) Maximum length of episode.  Defaults to 1000,
            but *this should be provided for each unique environment!*  This
            has an effect on how end-of-episode rewards are computed.
        callbacks: (Iterable[Callable], optional) callback functions to execute
            at the end of each training epoch.
        """
        device = utils.get_device(actor)
        self.actor_optimizer = Adam(actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(critic.parameters(), lr=critic_lr)
        self.replay = NoReplay(steps_per_epoch) if replay is None else replay

        for epoch in range(1, epochs + 1):
            state = self.env.reset()
            ep_reward, ep_length = 0, 0
            num_episodes = 0

            for t in range(1, steps_per_epoch + 1):
                action, logprob = actor(state.to(device))

                next_state, reward, done, _ = self.env.step(action)
                done = False if ep_length == max_ep_len else done
                self.replay.append(
                    [state, action, logprob, reward, done, next_state])
                state = next_state
                ep_reward += reward
                ep_length += 1

                if done or (ep_length == max_ep_len):
                    num_episodes += 1
                    self.ep_rewards.append(ep_reward)
                    state = self.env.reset()
                    ep_reward, ep_length = 0, 0

            self.update(
                actor,
                critic,
                train_actor_iters=train_actor_iters,
                train_critic_iters=train_critic_iters,
                clip_ratio=clip_ratio,
                gamma=gamma,
                lam=lam,
                target_kl=target_kl,
            )

            avg_reward = sum(self.ep_rewards[-num_episodes:]) / num_episodes
            print(f"\rEpoch {epoch} | Avg Reward {avg_reward}", end="")

            for callback in callbacks:
                callback(self)
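
The clipping and the KL-based early stopping described in the docstring happen inside `self.update`, which is not shown here.  A hedged sketch of the clipped surrogate objective and the approximate KL it would monitor (names and tensor shapes are illustrative):

import torch


def ppo_clip_loss(logprobs, old_logprobs, advantages, clip_ratio=0.2):
    """Clipped surrogate objective; also returns the approximate KL divergence."""
    ratio = torch.exp(logprobs - old_logprobs)
    clipped = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * advantages
    loss = -torch.min(ratio * advantages, clipped).mean()
    approx_kl = (old_logprobs - logprobs).mean().item()
    return loss, approx_kl

A common convention, used for instance in the Spinning Up reference implementation, is to stop further actor updates within an epoch once the approximate KL exceeds roughly 1.5 * `target_kl`.
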
Example #5
    def train(
            self,
            actor: Actor,
            critics: Iterable[Critic],
            replay: Replay = None,
            steps_per_epoch: int = 4000,
            epochs: int = 100,
            gamma: float = 0.99,
            polyak: float = 0.995,
            actor_lr: float = 5e-4,
            critic_lr: float = 1e-3,
            alpha: float = 0.2,
            batch_size: int = 128,
            start_steps: int = 4000,
            update_after: int = 1000,
            update_every: int = 1,
            max_ep_len: int = 1000,
            callbacks: Iterable[Callable] = (),
    ):
        """Soft actor-critic (SAC) training algorithm.  Supports both continuous
        and discrete action spaces.

        Parameters
        ----------
        actor: (base.Actor) Actor (policy) network to optimize.
        critics: (Iterable[base.Critic]) Critic networks to optimize. In standard
            SAC there are *two* critics, but this method only requires that *two or
            more* critics are provided.
        replay: (base.Replay, optional) Experience replay object for sampling
            previous experiences.  If not provided, defaults to 'ExperienceReplay'
            with a buffer size of 1,000,000.  Users can provide a replay object,
            which is pre-populated with experiences (for specific use cases).
        steps_per_epoch: (int, optional) Number of steps of interaction
            between the agent and the environment in each epoch.  Default: 4000.
        epochs: (int, optional) Number of training epochs.  Default:  100.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        polyak: (float, optional) Interpolation factor in polyak averaging for
            target networks.  Range: (0, 1).  Default: 0.995
        actor_lr: (float, optional) Learning rate for the actor optimizer.  Default: 5e-4.
        critic_lr: (float, optional) Learning rate for the critic optimizer.  Default: 1e-3.
        alpha: (float, optional) Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)  Default: 0.2.
        batch_size: (int, optional) Minibatch size for SGD.  Default: 128.
        start_steps: (int, optional) Number of steps for random action selection
            before running the real policy (helps exploration).  Default: 4000.
        update_after: (int, optional) Number of env interactions to collect before
            starting to do gradient descent updates.  Ensures the replay buffer
            is full enough for useful updates.  Default: 1000.
        update_every: (int, optional) Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.  Default: 1.
        max_ep_len: (int, optional) Maximum length of episode.  Defaults to 1000,
            but *this should be provided for each unique environment!*  This
            has an effect on how end-of-episode rewards are computed.
        callbacks: (Iterable[Callable], optional) callback functions to execute
            at the end of each training epoch.
        """
        device = utils.get_device(actor)
        self.replay = replay
        if replay is None:
            self.replay = ExperienceReplay(int(1e6))

        critic_params = chain(*[c.parameters() for c in critics])
        self.critic_optimizer = torch.optim.Adam(critic_params, lr=critic_lr)
        self.actor_optimizer = torch.optim.Adam(actor.parameters(),
                                                lr=actor_lr)
        self.target_critics = deepcopy(critics)

        state = self.env.reset()
        ep_reward, ep_length = 0, 0
        total_steps = steps_per_epoch * epochs

        for step in range(1, total_steps + 1):
            if step < start_steps:
                action = self.env.action_space.sample()
            else:
                action, _ = actor(state.to(device))

            next_state, reward, done, _ = self.env.step(action)
            done = False if ep_length == max_ep_len else done
            self.replay.append([state, action, reward, done, next_state])
            state = next_state
            ep_reward += reward
            ep_length += 1

            if step > update_after and step % update_every == 0:
                for j in range(update_every):
                    self.update(
                        actor,
                        critics,
                        batch_size=batch_size,
                        gamma=gamma,
                        alpha=alpha,
                        polyak=polyak,
                    )

            if step % steps_per_epoch == 0:
                for callback in callbacks:
                    callback(self)

            if done or (ep_length == max_ep_len):
                self.ep_rewards.append(ep_reward)
                epoch = (step + 1) // steps_per_epoch
                print(f"\rEpoch {epoch} | Step {step} | Reward {ep_reward}",
                      end="")
                state, ep_reward, ep_length = self.env.reset(), 0, 0
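
The entropy-regularized update itself is delegated to `self.update`, which is not part of this snippet.  Below is a sketch of the continuous-action SAC objectives under the assumed batch layout; `sac_losses` is a hypothetical helper, not the library's API.

import torch


def sac_losses(batch, actor, critics, target_critics, gamma=0.99, alpha=0.2):
    """Entropy-regularized critic and actor objectives for continuous SAC (sketch)."""
    states, actions, rewards, dones, next_states = batch  # assumed layout
    with torch.no_grad():
        next_actions, next_logprobs = actor(next_states)
        target_q = torch.min(
            torch.stack([tc(next_states, next_actions) for tc in target_critics]), dim=0
        ).values
        backup = rewards + gamma * (1 - dones) * (target_q - alpha * next_logprobs)
    critic_loss = sum(((c(states, actions) - backup) ** 2).mean() for c in critics)

    new_actions, logprobs = actor(states)
    q = torch.min(torch.stack([c(states, new_actions) for c in critics]), dim=0).values
    actor_loss = (alpha * logprobs - q).mean()
    return critic_loss, actor_loss
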
Example #6
    def train(
        self,
        actor: Actor,
        critic: Critic,
        replay: Replay = None,
        actor_lr: float = 3e-4,
        critic_lr: float = 1e-3,
        train_critic_iters: int = 10,
        epochs: int = 200,
        steps_per_epoch: int = 4000,
        max_ep_len: int = 1000,
        gamma: float = 0.99,
        lam: float = 0.97,
        callbacks: Iterable[Callable] = (),
    ):
        """Advantage Actor-Critic (A2C) algorithm for training RL agents in both
        continuous and discrete action spaces.

        **NOTE:** Synchronous A2C was chosen over the asynchronous version (A3C)
        for its simplicity.  It is also questionable whether A3C outperforms A2C
        in the first place (in terms of the resulting trained policy, not raw
        training speed in Python); others, including OpenAI, have made the same
        observation.

        Parameters
        ----------
        actor: (base.Actor) Actor (policy) network to optimize.
        critic: (base.Critic) Critic network to optimize.
        replay: (base.Replay, optional) Experience replay object for sampling
            previous experiences.  If not provided, defaults to 'NoReplay' with
            a buffer size of 'steps_per_epoch'.  Users can provide a replay
            object that is pre-populated with experiences (for specific use cases).
        steps_per_epoch: (int, optional) Number of steps of interaction
            between the agent and the environment in each epoch.  Default: 4000.
        epochs: (int, optional) Number of training epochs.  Default: 200.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        actor_lr: (float, optional) Learning rate for the actor optimizer.  Default: 3e-4.
        critic_lr: (float, optional) Learning rate for the critic optimizer.  Default: 1e-3.
        train_critic_iters: (int, optional) Max number of critic training steps
            per epoch.  Default: 10.
        lam: (float, optional) Hyperparameter for GAE-Lambda calculation.
            Range: (0, 1).  Default: 0.97.
        max_ep_len: (int, optional) Maximum length of episode.  Defaults to 1000,
            but *this should be provided for each unique environment!*  This
            has an effect on how end-of-episode rewards are computed.
        callbacks: (Iterable[Callable], optional) callback functions to execute
            at the end of each training epoch.
        """
        device = utils.get_device(actor)
        self.actor_optimizer = Adam(actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(critic.parameters(), lr=critic_lr)
        self.replay = NoReplay(steps_per_epoch) if replay is None else replay

        for epoch in range(1, epochs + 1):
            state = self.env.reset()
            ep_reward, ep_length = 0, 0
            num_episodes = 0

            for t in range(1, steps_per_epoch + 1):
                action, logprob = actor(state.to(device))

                next_state, reward, done, _ = self.env.step(action)
                self.replay.append([state, action, logprob, reward, done, next_state])
                state = next_state
                ep_reward += reward
                ep_length += 1

                if done or (ep_length == max_ep_len):
                    num_episodes += 1
                    self.ep_rewards.append(ep_reward)
                    state = self.env.reset()
                    ep_reward, ep_length = 0, 0

            self.update(
                actor,
                critic,
                train_critic_iters=train_critic_iters,
                gamma=gamma,
                lam=lam,
            )

            avg_reward = sum(self.ep_rewards[-num_episodes:]) / num_episodes
            print(f"\rEpoch {epoch} | Avg Reward {avg_reward}", end="")

            for callback in callbacks:
                callback(self)
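
The `lam` parameter feeds the GAE-Lambda advantage estimate computed inside `self.update`, which is not shown.  A self-contained sketch of that calculation, assuming a single rollout and a bootstrap value of zero after the final step:

import torch


def gae_advantages(rewards, values, dones, gamma=0.99, lam=0.97):
    """Generalized Advantage Estimation over one rollout (sketch)."""
    # rewards, values, dones: 1-D sequences of floats of equal length.
    advantages, running, next_value = [], 0.0, 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        running = delta + gamma * lam * nonterminal * running
        advantages.append(running)
        next_value = values[t]
    advantages.reverse()
    return torch.as_tensor(advantages, dtype=torch.float32)
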
Example #7
    def train(
            self,
            actor: Actor,
            critics: Iterable[Critic],
            replay: Replay = None,
            steps_per_epoch: int = 4000,
            epochs: int = 100,
            gamma: float = 0.99,
            polyak: float = 0.995,
            actor_lr: float = 5e-4,
            critic_lr: float = 1e-3,
            batch_size: int = 128,
            start_steps: int = 5000,
            update_after: int = 1000,
            update_every: int = 50,
            act_noise: float = 0.1,
            target_noise: float = 0.2,
            noise_clip: float = 0.5,
            policy_delay: int = 2,
            max_ep_len: int = 1000,
            callbacks: Iterable[Callable] = (),
    ):
        """Twin-Delayed Deep Deterministic (TD3) Policy Gradients training
        algorithm.  Supports only *deterministic* policies in *continuous*
        action spaces.

        Parameters
        ----------
        actor: (base.Actor) Actor (policy) network to optimize.
        critics: (Iterable[base.Critic]) Critic networks to optimize.  In standard
            TD3 there are *two* critics, but this method only requires that *two or
            more* critics are provided.
        replay: (base.Replay, optional) Experience replay object for sampling
            previous experiences.  If not provided, defaults to 'ExperienceReplay'
            with a buffer size of 1,000,000.  Users can provide a replay object,
            which is pre-populated with experiences (for specific use cases).
        steps_per_epoch: (int, optional) Number of steps of interaction
            between the agent and the environment in each epoch.  Default: 4000.
        epochs: (int, optional) Number of training epochs.  Default:  100.
        gamma: (float, optional) Discount factor.  Range: (0, 1).  Default: 0.99
        polyak: (float, optional) Interpolation factor in polyak averaging for
            target networks.  Range: (0, 1).  Default: 0.995
        actor_lr: (float, optional) Learning rate for the actor optimizer.  Default: 5e-4.
        critic_lr: (float, optional) Learning rate for the critic optimizer.  Default: 1e-3.
        batch_size: (int, optional) Minibatch size for SGD.  Default: 128.
        start_steps: (int, optional) Number of steps for random action selection
            before running the real policy (helps exploration).  Default: 5000.
        update_after: (int, optional) Number of env interactions to collect before
            starting to do gradient descent updates.  Ensures the replay buffer
            is full enough for useful updates.  Default: 1000.
        update_every: (int, optional) Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.  Default: 50.
        act_noise: (float, optional) Stddev for Gaussian exploration noise added
            to policy at training time.  Default: 0.1.
        target_noise: (float, optional) Stddev for smoothing noise added to
            target policy.  Default: 0.2.
        noise_clip: (float, optional) Max absolute value of target policy
            smoothing noise.  Default: 0.5.
        policy_delay: (int, optional) Policy will only be updated once every
            policy_delay times for each update of the Q-networks.  Default: 2.
        max_ep_len: (int, optional) Maximum length of episode.  Defaults to 1000,
            but *this should be provided for each unique environment!*  This
            has an effect on how end-of-episode rewards are computed.
        callbacks: (Iterable[Callable], optional) callback functions to execute
            at the end of each training epoch.
        """
        device = utils.get_device(actor)
        self.replay = replay
        if replay is None:
            self.replay = ExperienceReplay(int(1e6))

        critic_params = chain(*[c.parameters() for c in critics])
        self.critic_optimizer = Adam(critic_params, lr=critic_lr)
        self.actor_optimizer = Adam(actor.parameters(), lr=actor_lr)
        self.target_actor = deepcopy(actor)
        self.target_critics = deepcopy(critics)

        total_steps = steps_per_epoch * epochs
        state, ep_reward, ep_length = self.env.reset(), 0, 0

        for step in range(1, total_steps + 1):
            if step < start_steps:
                action = self.env.action_space.sample()
            else:
                action, _ = actor(state.to(device))
                action += act_noise * torch.randn_like(action)

            next_state, reward, done, _ = self.env.step(action)
            done = False if ep_length == max_ep_len else done
            self.replay.append([state, action, reward, done, next_state])
            state = next_state
            ep_reward += reward
            ep_length += 1

            if step >= update_after and step % update_every == 0:
                for iteration in range(update_every):
                    self.update(
                        iteration,
                        actor,
                        critics,
                        batch_size=batch_size,
                        gamma=gamma,
                        target_noise=target_noise,
                        noise_clip=noise_clip,
                        policy_delay=policy_delay,
                        polyak=polyak,
                    )

            if step % steps_per_epoch == 0:
                for callback in callbacks:
                    callback(self)

            if done or (ep_length == max_ep_len):
                self.ep_rewards.append(ep_reward)
                epoch = (step + 1) // steps_per_epoch
                print(f"\rEpoch {epoch} | Step {step} | Reward {ep_reward}",
                      end="")
                state, ep_reward, ep_length = self.env.reset(), 0, 0
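
The polyak-averaging loops at the end of the `update` methods above all repeat the same pattern; a small helper along these lines could replace them (a sketch, not part of the shown code):

import torch


@torch.no_grad()
def polyak_update(source, target, polyak=0.995):
    """Soft update: theta_target <- polyak * theta_target + (1 - polyak) * theta_source."""
    for p, pt in zip(source.parameters(), target.parameters()):
        pt.data.mul_(polyak).add_((1 - polyak) * p.data)

Usage would mirror the existing loops, e.g. `polyak_update(actor, self.target_actor, polyak)` followed by one call per critic/target-critic pair.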