Example #1
def cartpole_episode_batch():
    env_fn = make_env_fn("CartPole-v1")
    env = env_fn()
    policy_fn = lambda: MockPolicy(91, env.observation_space, env.action_space)

    runner = Runner(env_fn, policy_fn, seed=91)
    episodes, _ = runner.run(15)
    return episodes, policy_fn
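
The episodes object returned by Runner.run() can be flattened into a single batch with to_sample_batch(), as the training code in Examples #6 and #7 does. The short consumer below is only an illustrative sketch, assuming the fixture above can be called as a plain function; the SampleBatch.OBS key is taken from Example #7, and the 4-feature observation shape is a property of CartPole-v1, not of this project.

# Illustrative sketch only: aggregating the fixture's episodes into one batch.
episodes, _ = cartpole_episode_batch()
batch = episodes.to_sample_batch()     # flatten per-env episodes (see Example #7)
obs = batch[SampleBatch.OBS]           # CartPole-v1 observations have 4 features
assert obs.shape[-1] == 4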
Example #2
    def pretrain_setup(self, total_timesteps):
        if self.lr_schedule == "linear":
            lr = LinearDecay(self.lr,
                             total_timesteps // self.timesteps_per_iteration)
        else:
            lr = self.lr

        self.optimizer = tf.keras.optimizers.Adam(lr, epsilon=self.epsilon)
        self.runner = Runner(**self.runner_config)
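
LinearDecay itself is not shown in these examples, but judging from the keyword arguments used elsewhere (initial_learning_rate, decay_steps, end_learning_rate) it behaves like Keras' built-in PolynomialDecay with power=1.0. A minimal sketch of an equivalent schedule using only the standard TensorFlow API, with illustrative numbers:

import tensorflow as tf

# Assumed equivalent of LinearDecay: a linear ramp from the initial learning rate
# down to an end value over the number of optimizer updates (power=1.0 is linear).
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1e-4,   # illustrative, matches A2C's default lr
    decay_steps=10_000,           # illustrative: total_timesteps // timesteps_per_iteration
    end_learning_rate=0.0,        # assumed decay target
    power=1.0,
)
optimizer = tf.keras.optimizers.Adam(lr_schedule, epsilon=1e-7)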
Example #3
File: ddpg.py Project: rystrauss/interact
    def pretrain_setup(self, total_timesteps: int):
        self.runner = Runner(**self.runner_config)

        self.actor_optimizer = tf.keras.optimizers.Adam(self.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_lr)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        self.beta_schedule = LinearDecay(
            initial_learning_rate=self.prioritized_replay_beta,
            decay_steps=self.prioritized_replay_beta_steps or total_timesteps,
            end_learning_rate=self.final_prioritized_replay_beta,
        )

        self.noise_schedule = LinearDecay(
            initial_learning_rate=self.initial_noise_scale,
            decay_steps=self.noise_scale_steps or total_timesteps,
            end_learning_rate=self.final_noise_scale,
        )
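
The two LinearDecay schedules above anneal the prioritized-replay beta and the exploration noise scale toward their final values over a fixed number of steps. The pure-Python helper below sketches that interpolation; it is not the project's LinearDecay implementation, and the endpoint values are only illustrative.

def linear_anneal(step, initial, final, decay_steps):
    # Linearly interpolate from `initial` to `final`, clamping once decay_steps is reached.
    frac = min(step / decay_steps, 1.0)
    return initial + frac * (final - initial)

# e.g. beta annealed from 0.4 toward 1.0 over 100k steps (illustrative endpoints)
beta_at_50k = linear_anneal(50_000, 0.4, 1.0, 100_000)   # -> 0.7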
Example #4
    def pretrain_setup(self, total_timesteps: int):
        if self.lr_schedule == "linear":
            lr = LinearDecay(self.lr, total_timesteps // self.timesteps_per_iteration)
        else:
            lr = self.lr

        if self.cliprange_schedule == "linear":
            self.cliprange = LinearDecay(self.cliprange, total_timesteps)

        self.policy_optimizer = tf.optimizers.Adam(learning_rate=lr)
        if self.policy_epochs != self.value_epochs:
            self.value_optimizer = tf.optimizers.Adam(learning_rate=lr)
        self.aux_optimizer = tf.optimizers.Adam(learning_rate=lr)

        self.runner = Runner(**self.runner_config)
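
The learning-rate schedule here is expressed in optimizer updates rather than environment timesteps, hence the division by timesteps_per_iteration. A quick back-of-the-envelope conversion, using the timesteps_per_iteration formula from Example #7; the specific numbers are assumptions, not the project's defaults.

# Illustrative arithmetic only.
total_timesteps = 1_000_000
nsteps, num_envs_per_worker, num_workers = 128, 1, 8
timesteps_per_iteration = nsteps * num_envs_per_worker * num_workers   # 1024 steps per update
decay_steps = total_timesteps // timesteps_per_iteration               # 976 optimizer updates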
Example #5
File: dqn.py Project: rystrauss/interact
    def pretrain_setup(self, total_timesteps: int):
        self.epsilon = LinearDecay(
            initial_learning_rate=self.initial_epsilon,
            decay_steps=self.epsilon_timesteps,
            end_learning_rate=self.final_epsilon,
        )

        self.beta_schedule = LinearDecay(
            initial_learning_rate=self.prioritized_replay_beta,
            decay_steps=self.prioritized_replay_beta_steps or total_timesteps,
            end_learning_rate=self.final_prioritized_replay_beta,
        )

        self.runner = Runner(**self.runner_config)
        self.optimizer = tf.optimizers.Adam(learning_rate=self.lr)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
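
The annealed self.epsilon presumably drives epsilon-greedy exploration during rollouts; that part of dqn.py is not shown here. The following is a minimal, generic sketch of epsilon-greedy action selection, independent of this project's code.

import numpy as np

def epsilon_greedy(q_values, epsilon, rng=None):
    # With probability epsilon take a uniformly random action, otherwise the greedy one.
    rng = rng or np.random.default_rng()
    if rng.random() < epsilon:
        return int(rng.integers(len(q_values)))
    return int(np.argmax(q_values))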
Example #6
File: ddpg.py Project: rystrauss/interact
class DDPGAgent(Agent):
    """The deep deterministic policy gradients algorithm.

    This implementation also makes available the features that constitute the
    Twin Delayed DDPG (TD3) algorithm, although they are disabled by default.

    Args:
        env_fn: A function that, when called, returns an instance of the agent's
            environment.
        network: Base network type to be used by the policy and Q-functions.
        actor_lr: Learning rate to use for updating the actor.
        critic_lr: Learning rate to use for updating the critics.
        tau: Parameter for the polyak averaging used to update the target networks.
        gamma: The discount factor.
        buffer_size: The maximum size of the replay buffer.
        train_freq: The frequency with which training updates are performed.
        target_update_interval: The frequency with which the target network is updated.
        learning_starts: The number of timesteps after which learning starts.
        random_steps: Actions will be sampled completely at random for this many
            timesteps at the beginning of training.
        batch_size: The size of batches sampled from the replay buffer over which
            updates are performed.
        num_workers: The number of parallel workers to use for experience collection.
        num_envs_per_worker: The number of synchronous environments to be executed in
            each worker.
        prioritized_replay: If True, a prioritized experience replay will be used.
        prioritized_replay_alpha: Alpha parameter for prioritized replay.
        prioritized_replay_beta: Initial beta parameter for prioritized replay.
        final_prioritized_replay_beta: The final value of the prioritized replay beta
            parameter.
        prioritized_replay_beta_steps: Number of steps over which the prioritized
            replay beta parameter will be annealed. If None, this will be set to the
            total number of training steps.
        prioritized_replay_epsilon: Epsilon to add to td-errors when updating
            priorities.
        initial_noise_scale: The initial scale of the Gaussian noise that is added to
            actions for exploration.
        final_noise_scale: The final scale of the Gaussian noise that is added to
            actions for exploration.
        noise_scale_steps: The number of timesteps over which the amount of exploration
            noise is annealed from `initial_noise_scale` to `final_noise_scale`. If
            None, the total duration of training is used.
        use_huber: If True, the Huber loss is used in favor of MSE for critic updates.
        use_twin_critic: If True, twin critic networks are used.
        policy_delay: The policy is updated once for every `policy_delay` critic
            updates.
        smooth_target_policy: If true, target policy smoothing is used in the critic
            updates.
        target_noise: The amount of target noise that is used for smoothing.
        target_noise_clip: The value at which target noise is clipped.
    """
    def __init__(
        self,
        env_fn: Callable[[], gym.Env],
        network: str = "mlp",
        actor_lr: float = 1e-3,
        critic_lr: float = 1e-3,
        tau: float = 0.002,
        gamma: float = 0.95,
        buffer_size: int = 50000,
        train_freq: int = 1,
        target_update_interval: int = 1,
        learning_starts: int = 1500,
        random_steps: int = 1500,
        batch_size: int = 256,
        num_workers: int = 1,
        num_envs_per_worker: int = 1,
        prioritized_replay: bool = False,
        prioritized_replay_alpha: float = 0.6,
        prioritized_replay_beta: float = 0.4,
        final_prioritized_replay_beta: float = 4.0,
        prioritized_replay_beta_steps: Optional[int] = None,
        prioritized_replay_epsilon: float = 1e-6,
        initial_noise_scale: float = 0.1,
        final_noise_scale: float = 0.1,
        noise_scale_steps: Optional[int] = None,
        use_huber: bool = False,
        use_twin_critic: bool = False,
        policy_delay: int = 1,
        smooth_target_policy: bool = False,
        target_noise: float = 0.2,
        target_noise_clip: float = 0.5,
    ):
        super().__init__(env_fn)

        env = self.make_env()
        assert isinstance(
            env.action_space, gym.spaces.Box
        ), "DDPG can only be used with continuous action spaces."

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.learning_starts = learning_starts
        self.random_steps = random_steps
        self.target_update_interval = target_update_interval
        self.tau = tau
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.num_envs_per_worker = num_envs_per_worker
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_beta_steps = prioritized_replay_beta_steps
        self.final_prioritized_replay_beta = final_prioritized_replay_beta
        self.prioritized_replay_epsilon = prioritized_replay_epsilon
        self.initial_noise_scale = initial_noise_scale
        self.final_noise_scale = final_noise_scale
        self.noise_scale_steps = noise_scale_steps
        self.use_huber = use_huber
        self.use_twin_critic = use_twin_critic
        self.policy_delay = policy_delay
        self.smooth_target_policy = smooth_target_policy
        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip

        self.actor_critic = DeterministicActorCriticPolicy(
            env.observation_space, env.action_space, network, use_twin_critic)
        self.target_actor_critic = DeterministicActorCriticPolicy(
            env.observation_space, env.action_space, network, use_twin_critic)
        self.target_actor_critic.set_weights(self.actor_critic.get_weights())

        def policy_fn():
            if num_workers == 1:
                return self.actor_critic

            return DeterministicActorCriticPolicy(env.observation_space,
                                                  env.action_space, network,
                                                  use_twin_critic)

        self.runner = None
        self.runner_config = dict(
            env_fn=env_fn,
            policy_fn=policy_fn,
            num_envs_per_worker=num_envs_per_worker,
            num_workers=num_workers,
        )

        self.replay_buffer = None
        self.beta_schedule = None
        self.noise_schedule = None
        self.actor_optimizer = None
        self.critic_optimizer = None

        self._has_updated = False

    @property
    def timesteps_per_iteration(self) -> int:
        return self.num_workers * self.num_envs_per_worker

    def pretrain_setup(self, total_timesteps: int):
        self.runner = Runner(**self.runner_config)

        self.actor_optimizer = tf.keras.optimizers.Adam(self.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_lr)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        self.beta_schedule = LinearDecay(
            initial_learning_rate=self.prioritized_replay_beta,
            decay_steps=self.prioritized_replay_beta_steps or total_timesteps,
            end_learning_rate=self.final_prioritized_replay_beta,
        )

        self.noise_schedule = LinearDecay(
            initial_learning_rate=self.initial_noise_scale,
            decay_steps=self.noise_scale_steps or total_timesteps,
            end_learning_rate=self.final_noise_scale,
        )

    @tf.function
    def act(self, obs: TensorType, deterministic: bool = True) -> TensorType:
        assert deterministic, "Non-deterministic actions not supported for DDPG."
        return self.actor_critic(obs)

    @tf.function
    def _update(self, obs, actions, rewards, dones, next_obs, weights,
                update_policy):
        target_actions = self.target_actor_critic(next_obs)
        if self.smooth_target_policy:
            epsilon = tf.random.normal(target_actions.shape,
                                       stddev=self.target_noise)
            epsilon = tf.clip_by_value(epsilon, -self.target_noise_clip,
                                       self.target_noise_clip)
            target_actions += epsilon
            target_actions = tf.clip_by_value(
                target_actions,
                self.actor_critic.action_space_low,
                self.actor_critic.action_space_high,
            )

        if self.use_twin_critic:
            target_pi_q_values = tf.minimum(
                *self.target_actor_critic.q_function(
                    [next_obs, target_actions]))
        else:
            target_pi_q_values = self.target_actor_critic.q_function(
                [next_obs, target_actions])
        backup = rewards + self.gamma * (1 - dones) * target_pi_q_values

        loss_fn = tf.losses.huber if self.use_huber else tf.losses.mse

        with tf.GradientTape() as tape:
            if self.use_twin_critic:
                q1_values, q2_values = self.actor_critic.q_function(
                    [obs, actions])
                q1_loss = loss_fn(backup[:, tf.newaxis], q1_values[:,
                                                                   tf.newaxis])
                q2_loss = loss_fn(backup[:, tf.newaxis], q2_values[:,
                                                                   tf.newaxis])
                critic_loss = q1_loss + q2_loss
            else:
                q_values = self.actor_critic.q_function([obs, actions])
                critic_loss = loss_fn(backup[:, tf.newaxis],
                                      q_values[:, tf.newaxis])

            if not self.use_huber:
                critic_loss *= 0.5

            critic_loss = tf.reduce_mean(critic_loss * weights)

        grads = tape.gradient(critic_loss,
                              self.actor_critic.q_function.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(grads, self.actor_critic.q_function.trainable_variables))

        if self.use_twin_critic:
            td_error = 0.5 * ((q1_values - backup) + (q2_values - backup))
        else:
            td_error = q_values - backup

        if not update_policy:
            return {
                "critic_loss": critic_loss,
                "mean_q": tf.reduce_mean(q_values),
                "min_q": tf.reduce_min(q_values),
                "max_q": tf.reduce_max(q_values),
            }, td_error

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(self.actor_critic.policy.trainable_weights)

            if self.use_twin_critic:
                pi_q_values = tf.minimum(*self.actor_critic.q_function(
                    [obs, self.actor_critic(obs)]))
            else:
                pi_q_values = self.actor_critic.q_function(
                    [obs, self.actor_critic(obs)])
            actor_loss = -tf.reduce_mean(pi_q_values)

        grads = tape.gradient(actor_loss,
                              self.actor_critic.policy.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(grads, self.actor_critic.policy.trainable_variables))

        return {
            "critic_loss": critic_loss,
            "mean_q": tf.reduce_mean(q_values),
            "min_q": tf.reduce_min(q_values),
            "max_q": tf.reduce_max(q_values),
            "actor_loss": actor_loss,
        }, td_error

    @tf.function
    def _update_target(self):
        polyak_update(self.actor_critic.variables,
                      self.target_actor_critic.variables, self.tau)

    def train(self, update: int) -> Tuple[Dict[str, float], List[Dict]]:
        cur_noise_scale = self.noise_schedule(update)

        episodes, ep_infos = self.runner.run(
            1,
            # In the beginning, randomly select actions from a uniform distribution
            # for better exploration.
            uniform_sample=(update * self.timesteps_per_iteration <=
                            self.random_steps),
            noise_scale=cur_noise_scale,
        )

        self.replay_buffer.add(episodes.to_sample_batch())

        metrics = dict()
        if (update * self.timesteps_per_iteration > self.learning_starts
                and update % self.train_freq == 0):
            for _ in range(self.train_freq):
                if self.prioritized_replay:
                    sample = self.replay_buffer.sample(
                        self.batch_size, self.beta_schedule(update))
                    weights = sample[SampleBatch.PRIO_WEIGHTS]
                else:
                    sample = self.replay_buffer.sample(self.batch_size)
                    weights = 1.0

                batch_metrics, td_errors = self._update(
                    sample[SampleBatch.OBS],
                    sample[SampleBatch.ACTIONS],
                    sample[SampleBatch.REWARDS],
                    sample[SampleBatch.DONES],
                    sample[SampleBatch.NEXT_OBS],
                    weights,
                    update % self.policy_delay == 0 or not self._has_updated,
                )

                self._has_updated = True

                if self.prioritized_replay:
                    self.replay_buffer.update_priorities(
                        sample["batch_indices"], td_errors)

            metrics.update(batch_metrics)

            if self.num_workers != 1:
                self.runner.update_policies(self.actor_critic.get_weights())

        if update % self.target_update_interval == 0:
            self._update_target()

        metrics["noise_scale"] = cur_noise_scale

        return metrics, ep_infos
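
polyak_update is imported from elsewhere in the project and is not shown above. The sketch below is the conventional soft target update that the tau parameter implies, assuming TensorFlow variables; it is not necessarily the project's exact implementation.

def polyak_update(variables, target_variables, tau):
    # Soft update: target <- tau * online + (1 - tau) * target, variable by variable.
    for var, target_var in zip(variables, target_variables):
        target_var.assign(tau * var + (1.0 - tau) * target_var)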
Example #7
class A2CAgent(Agent):
    """The advantage actor-critic algorithm.

    Advantage Actor-Critic (A2C) is a relatively simple actor-critic method that uses
    the advantage function in the policy update.

    Args:
        env_fn: A function that, when called, returns an instance of the agent's
            environment.
        policy_network: The type of model to use for the policy network.
        value_network: Either 'copy' or 'shared', indicating whether or not weights
            should be shared between the policy and value networks.
        num_envs_per_worker: The number of synchronous environments to be executed in
            each worker.
        num_workers: The number of parallel workers to use for experience collection.
        use_critic: Whether to use the critic (value estimates). Setting this to False
            uses 0 as the baseline, reducing the agent to a vanilla policy-gradient
            method.
        use_gae: Whether or not to use GAE.
        lam: The lambda parameter used in GAE.
        gamma: The discount factor.
        nsteps: The number of steps taken in each environment per update.
        ent_coef: The coefficient of the entropy term in the loss function.
        vf_coef: The coefficient of the value term in the loss function.
        lr: The initial learning rate.
        lr_schedule: The schedule for the learning rate, either 'constant' or 'linear'.
        max_grad_norm: The maximum value for the gradient clipping.
        epsilon: The epsilon value used by the Adam optimizer.
    """
    def __init__(
        self,
        env_fn: Callable[[], gym.Env],
        policy_network: str = "mlp",
        value_network: str = "copy",
        num_envs_per_worker: int = 1,
        num_workers: int = 8,
        use_critic: bool = True,
        use_gae: bool = False,
        lam: float = 1.0,
        gamma: float = 0.99,
        nsteps: int = 5,
        ent_coef: float = 0.01,
        vf_coef: float = 0.25,
        lr: float = 0.0001,
        lr_schedule: str = "constant",
        max_grad_norm: float = 0.5,
        epsilon: float = 1e-7,
    ):
        super().__init__(env_fn)

        assert lr_schedule in {
            "linear",
            "constant",
        }, 'lr_schedule must be "linear" or "constant"'

        env = self.make_env()

        def policy_fn():
            return ActorCriticPolicy(env.observation_space, env.action_space,
                                     policy_network, value_network)

        self.policy = policy_fn()

        self.runner = None
        self.runner_config = dict(
            env_fn=env_fn,
            policy_fn=policy_fn,
            num_envs_per_worker=num_envs_per_worker,
            num_workers=num_workers,
        )

        self.num_envs_per_worker = num_envs_per_worker
        self.num_workers = num_workers
        self.use_critic = use_critic
        self.use_gae = use_gae
        self.lam = lam
        self.gamma = gamma
        self.nsteps = nsteps
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.lr = lr
        self.lr_schedule = lr_schedule
        self.max_grad_norm = max_grad_norm
        self.epsilon = epsilon

        self.optimizer = None

    @property
    def timesteps_per_iteration(self):
        return self.nsteps * self.num_envs_per_worker * self.num_workers

    @tf.function
    def _update(self, obs, actions, advantages, returns):
        with tf.GradientTape() as tape:
            # Compute the policy and value predictions for the given observations
            pi, value_preds = self.policy(obs)
            # Retrieve policy entropy and the negative log probabilities of the actions
            neglogpacs = -pi.log_prob(actions)
            entropy = tf.reduce_mean(pi.entropy())
            # Define the individual loss functions
            policy_loss = tf.reduce_mean(advantages * neglogpacs)
            value_loss = tf.reduce_mean((returns - value_preds)**2)
            # The final loss to be minimized is a combination of the policy and value
            # losses, in addition to an entropy bonus which can be used to encourage
            # exploration
            loss = policy_loss - entropy * self.ent_coef + value_loss * self.vf_coef

        # Perform a gradient update to minimize the loss
        grads = tape.gradient(loss, self.policy.trainable_weights)
        # Perform gradient clipping
        grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        # Apply the gradient update
        self.optimizer.apply_gradients(
            zip(grads, self.policy.trainable_weights))

        # This is a measure of how well the value function explains the variance in
        # the rewards
        value_explained_variance = explained_variance(returns, value_preds)

        return {
            "policy_loss": policy_loss,
            "value_loss": value_loss,
            "policy_entropy": entropy,
            "value_explained_variance": value_explained_variance,
        }

    @tf.function
    def act(self, obs: TensorType, deterministic: bool = True) -> TensorType:
        pi, _ = self.policy(obs)

        if deterministic:
            actions = pi.mode()
        else:
            actions = pi.sample()

        return actions

    def pretrain_setup(self, total_timesteps):
        if self.lr_schedule == "linear":
            lr = LinearDecay(self.lr,
                             total_timesteps // self.timesteps_per_iteration)
        else:
            lr = self.lr

        self.optimizer = tf.keras.optimizers.Adam(lr, epsilon=self.epsilon)
        self.runner = Runner(**self.runner_config)

    def train(self, update: int) -> Tuple[Dict[str, float], List[Dict]]:
        # Update the weights of the actor policies to be consistent with the most
        # recent update.
        self.runner.update_policies(self.policy.get_weights())

        # Rollout the current policy in the environment to get back a batch of
        # experience.
        episodes, ep_infos = self.runner.run(self.nsteps)

        # Compute advantages for the collected experience.
        episodes.for_each(
            AdvantagePostprocessor(self.policy.value, self.gamma, self.lam,
                                   self.use_gae, self.use_critic))

        # Aggregate the collected experience so that a gradient update can be performed.
        batch = episodes.to_sample_batch().shuffle()

        # Update the policy and value function based on the new experience.
        metrics = self._update(
            batch[SampleBatch.OBS],
            batch[SampleBatch.ACTIONS],
            batch[SampleBatch.ADVANTAGES],
            batch[SampleBatch.RETURNS],
        )

        return metrics, ep_infos
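
AdvantagePostprocessor is likewise not shown; given the use_gae, lam, and gamma arguments, it presumably computes (generalized) advantage estimates along each episode. The following is a self-contained sketch of GAE for a single trajectory segment, for reference only, not the project's implementation.

import numpy as np

def gae(rewards, values, dones, last_value, gamma=0.99, lam=1.0):
    # Generalized Advantage Estimation; returns (advantages, returns) for one segment.
    advantages = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        running = delta + gamma * lam * nonterminal * running
        advantages[t] = running
        next_value = values[t]
    return advantages, advantages + values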
Example #8
def time_limit_cartpole_runner():
    env_fn = make_env_fn("CartPole-v1", episode_time_limit=8)
    env = env_fn()
    policy_fn = lambda: MockPolicy(91, env.observation_space, env.action_space)

    return Runner(env_fn, policy_fn, seed=91)
Example #9
def cartpole_runner():
    env_fn = make_env_fn("CartPole-v1")
    env = env_fn()
    policy_fn = lambda: MockPolicy(91, env.observation_space, env.action_space)

    return Runner(env_fn, policy_fn, seed=91)
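
A hedged usage sketch for the two runner fixtures above, mirroring the call pattern from Example #1 and assuming the fixtures can be called as plain functions; the step count and the aggregation call are illustrative.

# Illustrative only.
runner = time_limit_cartpole_runner()
episodes, ep_infos = runner.run(32)     # Runner.run returns (episodes, ep_infos)
batch = episodes.to_sample_batch()      # aggregate, as in Examples #6 and #7
# With episode_time_limit=8, each collected episode is truncated after at most 8 steps.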