Example #1
def trpo(
        env_fn,
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        delta=0.01,
        vf_lr=1e-3,
        train_v_iters=80,
        damping_coeff=0.1,
        cg_iters=10,
        backtrack_iters=10,
        backtrack_coeff=0.8,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10,
        algo="trpo",
):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The agent's main model, which for state ``x`` and
            action ``a`` returns the following outputs:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a``
                                            | in states ``x``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | The mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as
                                            | specified by the inputs to
                                            | ``info``) over the batch of
                                            | states given in ``x``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x``. (Critical: make sure
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update.
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be
            smallish. Adjusts Hessian-vector product calculation:

            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient.
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform.
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down.

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the
            backtracking line search. Since the line search usually doesn't
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are
            almost the same.

    """

    setup_pytorch_for_mpi()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs["action_space"] = env.action_space

    # Main model
    actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    if isinstance(env.action_space, Box):
        info_shapes = {
            "old_mu": [env.action_space.shape[-1]],
            "old_log_std": [env.action_space.shape[-1]],
        }
    else:
        info_shapes = {"old_logits": [env.action_space.n]}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    var_counts = tuple(
        core.count_vars(module)
        for module in [actor_critic.policy, actor_critic.value_function])
    logger.log("\nNumber of parameters: \t pi: %d, \t v: %d\n" % var_counts)

    # Optimizer for value function
    train_vf = torch.optim.Adam(actor_critic.value_function.parameters(),
                                lr=vf_lr)

    # Sync params across processes
    sync_all_params(actor_critic.parameters())

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = torch.zeros_like(b)
        r = b.clone()  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.clone()  # clone so the in-place 'r -= alpha * z' below does not modify b
        r_dot_old = torch.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (torch.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = torch.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        inputs = [torch.Tensor(x) for x in buf.get()]
        obs, act, adv, ret, logp_old = inputs[:-len(buf.sorted_info_keys)]
        policy_args = dict(
            zip(buf.sorted_info_keys, inputs[-len(buf.sorted_info_keys):]))

        # Main outputs from computation graph
        _, logp, _, _, d_kl, v = actor_critic(obs, act, **policy_args)

        # Prepare hessian func, gradient eval
        ratio = (logp - logp_old).exp()  # pi(a|s) / pi_old(a|s)
        pi_l_old = -(ratio * adv).mean()
        v_l_old = F.mse_loss(v, ret)

        g = core.flat_grad(pi_l_old,
                           actor_critic.policy.parameters(),
                           retain_graph=True)
        g = torch.from_numpy(mpi_avg(g.numpy()))
        pi_l_old = mpi_avg(pi_l_old.item())

        def Hx(x):
            hvp = core.hessian_vector_product(d_kl, actor_critic.policy, x)
            if damping_coeff > 0:
                hvp += damping_coeff * x
            return torch.from_numpy(mpi_avg(hvp.numpy()))

        # Core calculations for TRPO or NPG
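        # x approximately solves H x = g (the natural gradient direction), and
        # alpha = sqrt(2*delta / x^T H x) is the largest scale for x that keeps the
        # quadratic approximation of the KL divergence within the trust region delta.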
        x = cg(Hx, g)
        alpha = torch.sqrt(2 * delta / (torch.dot(x, Hx(x)) + EPS))
        old_params = parameters_to_vector(actor_critic.policy.parameters())

        def set_and_eval(step):
            vector_to_parameters(old_params - alpha * x * step,
                                 actor_critic.policy.parameters())
            _, logp, _, _, d_kl = actor_critic.policy(obs, act, **policy_args)
            ratio = (logp - logp_old).exp()
            pi_loss = -(ratio * adv).mean()
            return mpi_avg(d_kl.item()), mpi_avg(pi_loss.item())

        if algo == "npg":
            kl, pi_l_new = set_and_eval(step=1.0)

        elif algo == "trpo":
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        "Accepting new params at step %d of line search." % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log("Line search failed! Keeping old params.")
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.0)

        # Value function updates
        for _ in range(train_v_iters):
            v = actor_critic.value_function(obs)
            v_loss = F.mse_loss(v, ret)

            # Value function gradient step
            train_vf.zero_grad()
            v_loss.backward()
            average_gradients(train_vf.param_groups)
            train_vf.step()

        v = actor_critic.value_function(obs)
        v_l_new = F.mse_loss(v, ret)

        # Log changes from update
        logger.store(
            LossPi=pi_l_old,
            LossV=v_l_old,
            KL=kl,
            DeltaLossPi=(pi_l_new - pi_l_old),
            DeltaLossV=(v_l_new - v_l_old),
        )

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        actor_critic.eval()
        for t in range(local_steps_per_epoch):
            a, _, logp_t, info_t, _, v_t = actor_critic(
                torch.Tensor(o.reshape(1, -1)))

            # save and log
            buf.store(
                o,
                a.detach().numpy(),
                r,
                v_t.item(),
                logp_t.detach().numpy(),
                core.values_as_sorted_list(info_t),
            )
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a.detach().numpy()[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print("Warning: trajectory cut off by epoch at %d steps." %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = (r if d else actor_critic.value_function(
                    torch.Tensor(o.reshape(1, -1))).item())
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({"env": env}, actor_critic, None)

        # Perform TRPO or NPG update!
        actor_critic.train()
        update()

        # Log info about epoch
        logger.log_tabular("Epoch", epoch)
        logger.log_tabular("EpRet", with_min_and_max=True)
        logger.log_tabular("EpLen", average_only=True)
        logger.log_tabular("VVals", with_min_and_max=True)
        logger.log_tabular("TotalEnvInteracts", (epoch + 1) * steps_per_epoch)
        logger.log_tabular("LossPi", average_only=True)
        logger.log_tabular("LossV", average_only=True)
        logger.log_tabular("DeltaLossPi", average_only=True)
        logger.log_tabular("DeltaLossV", average_only=True)
        logger.log_tabular("KL", average_only=True)
        if algo == "trpo":
            logger.log_tabular("BacktrackIters", average_only=True)
        logger.log_tabular("Time", time.time() - start_time)
        logger.dump_tabular()
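A minimal usage sketch for the function above (not part of the original source): it assumes `gym` is installed, that `core.ActorCritic` accepts a `hidden_sizes` keyword, and that `logger_kwargs` follow the Spinning Up `EpochLogger` convention; adjust these names to your own `core` module.

import gym

if __name__ == "__main__":
    trpo(
        lambda: gym.make("HalfCheetah-v2"),      # any Gym env with a Box or Discrete action space
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),   # hypothetical kwarg; depends on your ActorCritic
        gamma=0.99,
        delta=0.01,
        steps_per_epoch=4000,
        epochs=50,
        logger_kwargs=dict(output_dir="/tmp/trpo_halfcheetah", exp_name="trpo"),
        algo="trpo",
    )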
Example #2
def ddpg(
        env_fn,
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.
            
        actor_critic: The agent's main model, which takes some states ``x``
            and actions ``a`` and returns a tuple of:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for
                                           | states ``x`` and actions in
                                           | ``a``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and
                                           | ``pi`` for states in ``x``:
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            class you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs["action_space"] = env.action_space

    # Main outputs from computation graph
    main = actor_critic(in_features=obs_dim, **ac_kwargs)

    # Target networks
    target = actor_critic(in_features=obs_dim, **ac_kwargs)
    target.eval()

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(module) for module in [main.policy, main.q, main])
    print("\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n" %
          var_counts)

    # Separate train ops for pi, q
    pi_optimizer = torch.optim.Adam(main.policy.parameters(), lr=pi_lr)
    q_optimizer = torch.optim.Adam(main.q.parameters(), lr=q_lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    def get_action(o, noise_scale):
        pi = main.policy(torch.Tensor(o.reshape(1, -1)))
        a = pi.detach().numpy()[0] + noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        main.eval()
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            main.train()
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                (obs1, obs2, acts, rews, done) = (
                    torch.Tensor(batch["obs1"]),
                    torch.Tensor(batch["obs2"]),
                    torch.Tensor(batch["acts"]),
                    torch.Tensor(batch["rews"]),
                    torch.Tensor(batch["done"]),
                )
                _, q, q_pi = main(obs1, acts)
                _, _, q_pi_targ = target(obs2, acts)

                # Bellman backup for Q function
                backup = (rews + gamma * (1 - done) * q_pi_targ).detach()

                # DDPG losses
                pi_loss = -q_pi.mean()
                q_loss = F.mse_loss(q, backup)

                # Q-learning update
                q_optimizer.zero_grad()
                q_loss.backward()
                q_optimizer.step()
                logger.store(LossQ=q_loss.item(), QVals=q.data.numpy())

                # Policy update
                pi_optimizer.zero_grad()
                pi_loss.backward()
                pi_optimizer.step()
                logger.store(LossPi=pi_loss.item())

                # Polyak averaging for target parameters
                for p_main, p_target in zip(main.parameters(),
                                            target.parameters()):
                    p_target.data.copy_(polyak * p_target.data +
                                        (1 - polyak) * p_main.data)

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({"env": env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular("Epoch", epoch)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TotalEnvInteracts", t)
            logger.log_tabular("QVals", with_min_and_max=True)
            logger.log_tabular("LossPi", average_only=True)
            logger.log_tabular("LossQ", average_only=True)
            logger.log_tabular("Time", time.time() - start_time)
            logger.dump_tabular()
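`ReplayBuffer` is used above but not shown. A minimal sketch consistent with how it is called here (a `store(o, a, r, o2, d)` method, a `sample_batch(batch_size)` method returning a dict with keys `obs1`, `obs2`, `acts`, `rews`, `done`, and a `size` counter used by the DQN example below), following the Spinning Up layout:

import numpy as np

class ReplayBuffer:
    """A simple FIFO experience replay buffer (sketch)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size       # overwrite the oldest entry when full
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])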
Example #3
def dqn(
    env_fn,
    dqnetwork=core.DQNetwork,
    ac_kwargs=dict(),
    seed=0,
    steps_per_epoch=5000,
    epochs=100,
    replay_size=int(1e6),
    gamma=0.99,
    min_replay_history=20000,
    epsilon_decay_period=250000,
    epsilon_train=0.01,
    epsilon_eval=0.001,
    lr=1e-3,
    max_ep_len=1000,
    update_period=4,
    target_update_period=8000,
    batch_size=100,
    logger_kwargs=dict(),
    save_freq=1,
):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = 1  # env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs["action_space"] = env.action_space

    # Main computation graph
    main = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Target network
    target = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [main.q, main])
    print(("\nNumber of parameters: \t q: %d, \t total: %d\n") % var_counts)

    # Value train op
    value_params = main.q.parameters()
    value_optimizer = torch.optim.Adam(value_params, lr=lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    def get_action(o, epsilon):
        """Select an action from the set of available actions.
        Chooses an action randomly with probability epsilon; otherwise
        acts greedily according to the current Q-value estimates.
        """
        if np.random.random() <= epsilon:
            return env.action_space.sample()
        else:
            q_values = main(torch.Tensor(o.reshape(1, -1)))
            # return the action with highest Q-value for this observation
            return torch.argmax(q_values, dim=1).item()

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # epsilon_eval used when evaluating the agent
                o, r, d, _ = test_env.step(get_action(o, epsilon_eval))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        main.eval()

        # the epsilon value used for exploration during training
        epsilon = core.linearly_decaying_epsilon(
            epsilon_decay_period, t, min_replay_history, epsilon_train
        )
        a = get_action(o, epsilon)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # train at the rate of update_period if enough training steps have been run
        if replay_buffer.size > min_replay_history and t % update_period == 0:
            main.train()
            batch = replay_buffer.sample_batch(batch_size)
            (obs1, obs2, acts, rews, done) = (
                torch.Tensor(batch["obs1"]),
                torch.Tensor(batch["obs2"]),
                torch.Tensor(batch["acts"]),
                torch.Tensor(batch["rews"]),
                torch.Tensor(batch["done"]),
            )
            q_pi = main(obs1).gather(1, acts.long()).squeeze()
            q_pi_targ, _ = target(obs2).max(1)
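            # Standard DQN target: greedy max over the target network's Q-values at the
            # next state (no Double-DQN decoupling of action selection and evaluation).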

            # Bellman backup for Q function
            backup = (rews + gamma * (1 - done) * q_pi_targ).detach()

            # DQN loss
            value_loss = F.smooth_l1_loss(q_pi, backup)

            # Q-learning update
            value_optimizer.zero_grad()
            value_loss.backward()
            value_optimizer.step()
            logger.store(LossQ=value_loss.item(), QVals=q_pi.data.numpy())

        # syncs weights from online to target network
        if t % target_update_period == 0:
            target.load_state_dict(main.state_dict())

        # End of epoch wrap-up
        if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({"env": env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular("Epoch", epoch)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TotalEnvInteracts", t)
            logger.log_tabular("QVals", with_min_and_max=True)
            logger.log_tabular("LossQ", average_only=True)
            logger.log_tabular("Time", time.time() - start_time)
            logger.dump_tabular()
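`core.linearly_decaying_epsilon` is referenced above but not shown. Its call signature matches the linear schedule used in Google's Dopamine framework, so a sketch consistent with that usage (an assumption about `core`, not its actual implementation) is:

import numpy as np

def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    """Keep epsilon at 1.0 for warmup_steps, then decay it linearly to `epsilon`
    over decay_period steps, and hold it constant afterwards."""
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = np.clip(bonus, 0.0, 1.0 - epsilon)
    return epsilon + bonus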
Example #4
def ppo(
        env_fn,
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        use_gpu=False,
        gpu_parallel=False,
):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The agent's main model, composed of a policy model and a
            value function model, where the policy takes a state ``x`` and an
            action ``a``, and the value function takes the state ``x``. The
            model returns a tuple of:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a``
                                           | in states ``x``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x``. (Critical: make sure
                                           | to flatten this via .squeeze()!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            class you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Main model
    actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs)
    # Device selection: use the GPU if requested and available, otherwise fall back to the CPU
    if torch.cuda.is_available():
        device = torch.device("cuda" if use_gpu else "cpu")
        if gpu_parallel:
            actor_critic = torch.nn.DataParallel(actor_critic)
    else:
        use_gpu = False
        gpu_parallel = False
        device = torch.device("cpu")
    actor_critic = actor_critic.to(device)
    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(
        core.count_vars(module)
        for module in [actor_critic.policy, actor_critic.value_function])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(actor_critic.value_function.parameters(),
                               lr=vf_lr)

    # Sync params across processes
    sync_all_params(actor_critic.parameters())

    def update():
        temp_get = buf.get()
        obs, act, adv, ret, logp_old = [
            torch.Tensor(x).to(device) for x in temp_get
        ]

        # Training policy
        _, logp, _ = actor_critic.policy(obs, act)
        ratio = (logp - logp_old).exp()
        min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv,
                              (1 - clip_ratio) * adv)
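        # Note: torch.min(ratio * adv, min_adv) equals the usual clipped objective
        # min(ratio * adv, clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv).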

        pi_l_old = -(torch.min(ratio * adv, min_adv)).mean()
        ent = (-logp).mean()  # a sample estimate for entropy

        for i in range(train_pi_iters):
            # Output from policy function graph
            _, logp, _ = actor_critic.policy(obs, act)
            # PPO policy objective
            ratio = (logp - logp_old).exp()
            min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv,
                                  (1 - clip_ratio) * adv)

            pi_loss = -(torch.min(ratio * adv, min_adv)).mean()

            # Policy gradient step
            train_pi.zero_grad()
            pi_loss.backward()
            average_gradients(train_pi.param_groups)
            train_pi.step()

            _, logp, _ = actor_critic.policy(obs, act)
            kl = (logp_old - logp).mean()
            kl = mpi_avg(kl.item())
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        # Training value function
        v = actor_critic.value_function(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            # Output from value function graph
            v = actor_critic.value_function(obs)
            # PPO value function objective
            v_loss = F.mse_loss(v, ret)

            # Value function gradient step
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Log changes from update
        _, logp, _, v = actor_critic(obs, act)
        ratio = (logp - logp_old).exp()
        min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv,
                              (1 - clip_ratio) * adv)
        pi_l_new = -(torch.min(ratio * adv, min_adv)).mean()
        v_l_new = F.mse_loss(v, ret)
        kl = (logp_old - logp).mean()  # a sample estimate for KL-divergence
        clipped = (ratio > (1 + clip_ratio)) | (ratio < (1 - clip_ratio))
        cf = (clipped.float()).mean()
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        actor_critic.eval()
        for t in range(local_steps_per_epoch):
            a, _, logp_t, v_t = actor_critic(
                torch.Tensor(o.reshape(1, -1)).to(device))

            # save and log
            buf.store(o,
                      a.cpu().detach().numpy(), r, v_t.item(),
                      logp_t.cpu().detach().numpy())
            logger.store(VVals=v_t.item())

            o, r, d, _ = env.step(a.cpu().detach().numpy()[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else actor_critic.value_function(
                    torch.Tensor(o.reshape(1, -1)).to(device)).item()
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, actor_critic, None)

        # Perform PPO update!
        actor_critic.train()
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
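`PPOBuffer` is not shown in this example. Its central piece is the GAE-Lambda bookkeeping done when a path is finished; below is a minimal sketch of that computation, following the Spinning Up convention for `finish_path` (the function names here are illustrative, not part of the original source):

import numpy as np

def discount_cumsum(x, discount):
    """out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ..."""
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae_advantages_and_returns(rews, vals, gamma, lam, last_val=0.0):
    """GAE-Lambda advantages and rewards-to-go for one trajectory segment.
    `last_val` bootstraps the value of the final state if the path was cut off."""
    rews = np.append(rews, last_val)
    vals = np.append(vals, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # TD residuals
    adv = discount_cumsum(deltas, gamma * lam)          # GAE-Lambda advantage estimates
    ret = discount_cumsum(rews, gamma)[:-1]             # rewards-to-go, targets for the value function
    return adv, ret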
Example #5
def sac(env_fn,
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The agent's model, which takes the state ``x`` and
            action ``a`` and returns a tuple of:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x`` and actions in
                                           | ``a``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x`` and actions in
                                           | ``a``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            class you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Main computation graph
    main = actor_critic(in_features=obs_dim, **ac_kwargs)

    # Target value network
    target = actor_critic(in_features=obs_dim, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(
        obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(module)
        for module in [main.policy, main.q1, main.q2, main.vf_mlp, main])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = torch.optim.Adam(main.policy.parameters(), lr=lr)

    # Value train op
    value_params = list(main.vf_mlp.parameters()) + list(
        main.q1.parameters()) + list(main.q2.parameters())
    value_optimizer = torch.optim.Adam(value_params, lr=lr)

    # Initializing targets to match main variables
    target.vf_mlp.load_state_dict(main.vf_mlp.state_dict())

    def get_action(o, deterministic=False):
        pi, mu, _ = main.policy(torch.Tensor(o.reshape(1, -1)))
        return mu.detach().numpy()[0] if deterministic else pi.detach().numpy()[0]

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                (obs1, obs2, acts, rews, done) = (torch.Tensor(batch['obs1']),
                                                  torch.Tensor(batch['obs2']),
                                                  torch.Tensor(batch['acts']),
                                                  torch.Tensor(batch['rews']),
                                                  torch.Tensor(batch['done']))
                _, _, logp_pi, q1, q2, q1_pi, q2_pi, v = main(obs1, acts)
                v_targ = target.vf_mlp(obs2)

                # Min Double-Q:
                min_q_pi = torch.min(q1_pi, q2_pi)

                # Targets for Q and V regression
                q_backup = (rews + gamma * (1 - done) * v_targ).detach()
                v_backup = (min_q_pi - alpha * logp_pi).detach()
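                # q_backup is the soft Bellman target computed with the target value network;
                # v_backup regresses V(s) toward min(Q1, Q2)(s, a~pi) - alpha * log pi(a|s),
                # estimated with the single freshly sampled action from the policy.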

                # Soft actor-critic losses
                pi_loss = (alpha * logp_pi - min_q_pi).mean()
                q1_loss = 0.5 * F.mse_loss(q1, q_backup)
                q2_loss = 0.5 * F.mse_loss(q2, q_backup)
                v_loss = 0.5 * F.mse_loss(v, v_backup)
                value_loss = q1_loss + q2_loss + v_loss

                # Policy train op
                pi_optimizer.zero_grad()
                pi_loss.backward()
                pi_optimizer.step()

                # Value train op
                value_optimizer.zero_grad()
                value_loss.backward()
                value_optimizer.step()

                # Polyak averaging for target parameters
                for p_main, p_target in zip(main.vf_mlp.parameters(),
                                            target.vf_mlp.parameters()):
                    p_target.data.copy_(polyak * p_target.data +
                                        (1 - polyak) * p_main.data)

                logger.store(
                    LossPi=pi_loss.item(),
                    LossQ1=q1_loss.item(),
                    LossQ2=q2_loss.item(),
                    LossV=v_loss.item(),
                    Q1Vals=q1.detach().numpy(),
                    Q2Vals=q2.detach().numpy(),
                    VVals=v.detach().numpy(),
                    LogPi=logp_pi.detach().numpy())

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Example #6
def td3(env_fn,
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The agent's main model, which for state ``x`` and
            action ``a`` returns the following outputs:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x`` and actions in
                                           | ``a``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x`` and actions in
                                           | ``a``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x``:
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target
            policy.

        noise_clip (float): Limit for absolute value of target policy
            smoothing noise.

        policy_delay (int): Policy will only be updated once every
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Main outputs from computation graph
    main = actor_critic(in_features=obs_dim, **ac_kwargs)

    # Target policy network
    target = actor_critic(in_features=obs_dim, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(module)
        for module in [main.policy, main.q1, main.q2, main])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Separate train ops for pi, q
    pi_optimizer = torch.optim.Adam(main.policy.parameters(), lr=pi_lr)

    q_params = list(main.q1.parameters()) + list(main.q2.parameters())
    q_optimizer = torch.optim.Adam(q_params, lr=q_lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    def get_action(o, noise_scale):
        pi = main.policy(torch.Tensor(o.reshape(1, -1)))
        a = pi.detach().numpy()[0] + noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                (obs1, obs2, acts, rews, done) = (torch.tensor(batch['obs1']),
                                                  torch.tensor(batch['obs2']),
                                                  torch.tensor(batch['acts']),
                                                  torch.tensor(batch['rews']),
                                                  torch.tensor(batch['done']))
                q1 = main.q1(torch.cat((obs1, acts), dim=1))
                q2 = main.q2(torch.cat((obs1, acts), dim=1))
                pi_targ = target.policy(obs2)

                # Target policy smoothing, by adding clipped noise to target actions
                epsilon = torch.normal(torch.zeros_like(pi_targ),
                                       target_noise * torch.ones_like(pi_targ))

                epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
                a2 = torch.clamp(pi_targ + epsilon, -act_limit, act_limit)

                # Target Q-values, using action from target policy
                q1_targ = target.q1(torch.cat((obs2, a2), dim=1))
                q2_targ = target.q2(torch.cat((obs2, a2), dim=1))

                # Bellman backup for Q functions, using Clipped Double-Q targets
                min_q_targ = torch.min(q1_targ, q2_targ)
                backup = (rews + gamma * (1 - done) * min_q_targ).detach()

                # TD3 Q losses
                q1_loss = F.mse_loss(q1, backup)
                q2_loss = F.mse_loss(q2, backup)
                q_loss = q1_loss + q2_loss

                q_optimizer.zero_grad()
                q_loss.backward()
                q_optimizer.step()

                logger.store(LossQ=q_loss.item(),
                             Q1Vals=q1.detach().numpy(),
                             Q2Vals=q2.detach().numpy())

                if j % policy_delay == 0:
                    q1_pi = main.q1(torch.cat((obs1, main.policy(obs1)),
                                              dim=1))

                    # TD3 policy loss
                    pi_loss = -q1_pi.mean()

                    # Delayed policy update
                    pi_optimizer.zero_grad()
                    pi_loss.backward()
                    pi_optimizer.step()

                    # Polyak averaging for target variables
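                    # theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main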
                    for p_main, p_target in zip(main.parameters(),
                                                target.parameters()):
                        p_target.data.copy_(polyak * p_target.data +
                                            (1 - polyak) * p_main.data)

                    logger.store(LossPi=pi_loss.item())

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
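The backup computed inside the update loop above combines the two defining TD3 ingredients: clipped Gaussian noise on the target action (target policy smoothing) and the minimum over two target critics (Clipped Double-Q). A minimal standalone sketch of that target on toy tensors follows; the critic outputs are invented stand-ins for target.q1 / target.q2, not the networks above.

import torch

torch.manual_seed(0)
gamma, act_limit = 0.99, 1.0
target_noise, noise_clip = 0.2, 0.5

rews = torch.tensor([1.0, 0.0])           # (bsz,)
done = torch.tensor([0.0, 1.0])           # (bsz,)
pi_targ = torch.tensor([[0.9], [-0.3]])   # target policy actions, (bsz, act_dim)

# Target policy smoothing: clipped Gaussian noise added to the target action
epsilon = torch.clamp(target_noise * torch.randn_like(pi_targ), -noise_clip, noise_clip)
a2 = torch.clamp(pi_targ + epsilon, -act_limit, act_limit)

# Stand-ins for the two target critics evaluated at (s', a2)
q1_targ = torch.tensor([5.0, 2.0])
q2_targ = torch.tensor([4.0, 3.0])

# Clipped Double-Q backup: min over the two critics, masked by done
backup = rews + gamma * (1 - done) * torch.min(q1_targ, q2_targ)
print(a2, backup)  # backup == tensor([4.9600, 0.0000])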
Example #7
0
def iqn(
    env_fn,
    dqnetwork=core.DQNetwork,
    ac_kwargs=dict(),
    seed=0,
    steps_per_epoch=5000,
    epochs=100,
    replay_size=int(1e6),
    quantile_embedding_dim=64,  # n in equation 4 in IQN paper
    num_tau_samples=16,  # N in equation 3 in IQN paper
    num_tau_prime_samples=8,  # N' in equation 3 in IQN paper
    num_quantile_samples=32,  # K in equation 3 in IQN paper
    kappa=1.0,  # kappa for Huber Loss in IQN
    gamma=0.99,
    min_replay_history=20000,
    epsilon_decay_period=250000,
    epsilon_train=0.01,
    epsilon_eval=0.001,
    lr=1e-3,
    max_ep_len=1000,
    update_period=4,
    target_update_period=8000,
    batch_size=100,
    logger_kwargs=dict(),
    save_freq=1,
):
    """
    quantile_embedding_dim :  # n in equation 4 in IQN paper
    num_tau_samples : N in equation 3 in IQN paper
    num_tau_prime_samples : N' in equation 3 in IQN paper
    num_quantile_samples : K in equation 3 in IQN paper
    kappa : kappa for Huber Loss in IQN
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = 1  # env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs["action_space"] = env.action_space
    ac_kwargs["quantile_embedding_dim"] = quantile_embedding_dim

    # Main computation graph
    main = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Target network
    target = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [main.z, main])
    print(("\nNumber of parameters: \t z: %d, \t total: %d\n") % var_counts)

    # Value train op
    params = main.parameters()
    optimizer = torch.optim.Adam(params, lr=lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    def get_action(o, epsilon):
        """Select an action from the set of available actions.
        Chooses an action randomly with probability epsilon otherwise
        act greedily according to the current Q-value estimates.
        """
        if np.random.random() <= epsilon:
            return env.action_space.sample()
        else:
            return main.policy(torch.Tensor(o.reshape(1, -1)), num_tau_samples).item()

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # epsilon_eval used when evaluating the agent
                o, r, d, _ = test_env.step(get_action(o, epsilon_eval))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def update():
        """ref: https://github.com/google/dopamine/blob/master/dopamine/agents/implicit_quantile/implicit_quantile_agent.py
        """
        main.train()
        batch = replay_buffer.sample_batch(batch_size)
        (obs1, obs2, acts1, rews, done) = (
            torch.Tensor(batch["obs1"]),
            torch.Tensor(batch["obs2"]),
            torch.LongTensor(batch["acts"]),  # (bsz, 1)
            torch.Tensor(batch["rews"]),  # (bsz)
            torch.Tensor(batch["done"]),  # (bsz)
        )

        action_dim = env.action_space.n
        bsz = obs1.size(0)
        with torch.no_grad():
            z2, _ = target(obs2, num_tau_prime_samples)

            assert z2.size() == (bsz, action_dim, num_tau_prime_samples)

            # acts2 = main(obs2, num_quantile_samples)[0].mean(dim=-1).argmax(dim=-1)  # double dqn
            acts2 = z2.mean(dim=-1).argmax(dim=-1)  # (bsz)

            rews = rews.unsqueeze(1)
            done = done.unsqueeze(1)
            backups = rews + (1 - done) * gamma * z2[range(bsz), acts2]

            assert backups.size() == (bsz, num_tau_prime_samples)

        z1, replay_tau = main(obs1, num_tau_samples)
        acts1 = acts1.squeeze(1)  # (bsz)
        z1 = z1[range(bsz), acts1]  # (bsz, num_tau_samples)

        bellman_errors = backups.unsqueeze(-1) - z1.unsqueeze(1)

        assert bellman_errors.size() == (bsz, num_tau_prime_samples, num_tau_samples)

        huber_loss1 = (abs(bellman_errors) <= kappa).float() * 0.5 * bellman_errors ** 2
        huber_loss2 = (
            (abs(bellman_errors) > kappa).float()
            * kappa
            * (abs(bellman_errors) - kappa / 2)
        )
        huber_loss = huber_loss1 + huber_loss2

        replay_tau = replay_tau.view(bsz, num_tau_samples).unsqueeze(
            1
        )  # (bsz, 1, num_tau_samples)
        replay_tau = replay_tau.repeat(1, num_tau_prime_samples, 1)

        assert replay_tau.size() == (bsz, num_tau_prime_samples, num_tau_samples)

        tau_huber_loss = abs(replay_tau - ((bellman_errors < 0).float()).detach())
        tau_huber_loss = tau_huber_loss * huber_loss / kappa

        loss = tau_huber_loss.sum(dim=2).mean(dim=1)  # (bsz)

        loss = loss.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item(), None

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        main.eval()

        # the epsilon value used for exploration during training
        epsilon = core.linearly_decaying_epsilon(
            epsilon_decay_period, t, min_replay_history, epsilon_train
        )
        a = get_action(o, epsilon)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # train at the rate of update_period if enough training steps have been run
        if replay_buffer.size > min_replay_history and t % update_period == 0:
            loss, QDist = update()
            logger.store(LossQ=loss)  # , QVals=QDist.mean(-1))

        # syncs weights from online to target network
        if t % target_update_period == 0:
            target.load_state_dict(main.state_dict())

        # End of epoch wrap-up
        if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({"env": env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular("Epoch", epoch)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TotalEnvInteracts", t)
            logger.log_tabular("LossQ", average_only=True)
            # logger.log_tabular("QVals", with_min_and_max=True)
            logger.log_tabular("Time", time.time() - start_time)
            logger.dump_tabular()
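The core.DQNetwork used above is not shown in this listing, but the quantile embedding it needs (n in equation 4 of the IQN paper, passed in as quantile_embedding_dim) is straightforward to sketch: each sampled tau is expanded into cos(pi * i * tau) features and pushed through a linear layer with a ReLU. The snippet below is an illustrative standalone sketch of that embedding, not the actual network in core, whose details may differ.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

def quantile_embedding(tau, linear, embedding_dim):
    """tau: (num_samples, 1) uniform draws in [0, 1]."""
    # cos(pi * i * tau) for i = 0 .. n-1, as in equation 4 of the IQN paper
    i = torch.arange(embedding_dim, dtype=torch.float32).view(1, -1)
    phi = torch.cos(math.pi * i * tau)   # (num_samples, n)
    return F.relu(linear(phi))           # (num_samples, hidden)

# toy usage with assumed sizes
n, hidden, num_tau_samples = 64, 32, 16
linear = nn.Linear(n, hidden)
tau = torch.rand(num_tau_samples, 1)
print(quantile_embedding(tau, linear, n).shape)  # torch.Size([16, 32])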
Example #8
0
def rainbow(
        env_fn,
        dueling_dqn=False,
        double_dqn=False,
        noisy=False,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        Vmin=-100.0,  # hyperparameters for non-Atari envs
        Vmax=100.0,  # hyperparameters for non-Atari envs
        num_atoms=50,  # hyperparameters for non-Atari envs
        gamma=0.99,
        min_replay_history=20000,
        prioritized_replay_alpha=0.6,
        beta_start=0.4,
        beta_frames=10000,
        epsilon_decay_period=250000,
        epsilon_train=0.01,
        epsilon_eval=0.001,
        lr=1e-3,
        clip_grad_norm=5.0,
        max_ep_len=1000,
        update_period=4,
        target_update_period=8000,
        batch_size=100,
        logger_kwargs=dict(),
        save_freq=1,
):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = 1  # env.action_space.shape

    # Share information with policy architecture
    ac_kwargs['use_noisy_layer'] = noisy
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['num_atoms'] = num_atoms
    ac_kwargs['Vmin'] = Vmin
    ac_kwargs['Vmax'] = Vmax

    dqnetwork = core.CategoricalDuelingDQNetwork if dueling_dqn else core.CategoricalDQNetwork
    # Main computation graph
    main = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Target network
    target = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # C51 support atoms and atom spacing
    supports = torch.linspace(Vmin, Vmax, num_atoms)
    delta_z = (Vmax - Vmin) / (num_atoms - 1)

    # Experience buffer
    replay_buffer = PrioritizedReplayBuffer(replay_size,
                                            prioritized_replay_alpha)

    # Count variables
    if dueling_dqn:
        var_counts = tuple(
            core.count_vars(module)
            for module in [main.enc, main.v, main.a, main])
        print((
            '\nNumber of parameters: \t encoder: %d, \t value head: %d \t advantage head: %d \t total: %d\n'
        ) % var_counts)
    else:
        var_counts = tuple(
            core.count_vars(module) for module in [main.q, main])
        print(
            ('\nNumber of parameters: \t q: %d, \t total: %d\n') % var_counts)

    # Value train op
    value_params = main.parameters()
    value_optimizer = torch.optim.Adam(value_params, lr=lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    def get_action(o, epsilon=None):
        """Select an action from the set of available actions.
        Chooses an action randomly with probability epsilon otherwise
        act greedily according to the current Q-value estimates.
        """
        if epsilon is not None and np.random.random() <= epsilon:
            return env.action_space.sample()
        else:
            return main.policy(torch.Tensor(o.reshape(1, -1))).item()

    def test_agent(n=10):
        # Reassigning epsilon_eval here would shadow the enclosing argument and
        # raise UnboundLocalError, so bind the evaluation epsilon to a new name.
        eval_epsilon = None if noisy else epsilon_eval
        main.eval()
        for _ in range(n):
            o, r, done, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (done or (ep_len == max_ep_len)):
                # eval_epsilon used when evaluating the agent (None => act greedily / rely on NoisyNet)
                o, r, done, _ = test_env.step(get_action(o, eval_epsilon))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def update(t):
        main.train()
        beta = core.beta_by_frame(t, beta_start, beta_frames)
        batch = replay_buffer.sample_batch(batch_size, beta)
        (obs1, obs2, acts, rews, done, weights,
         idxes) = (torch.Tensor(batch['obs1']), torch.Tensor(batch['obs2']),
                   torch.LongTensor(batch['acts']),
                   torch.Tensor(batch['rews']), torch.Tensor(batch['done']),
                   torch.Tensor(batch['weights']), batch['idxes'])

        # compute target distribution
        bsz = obs1.size(0)
        with torch.no_grad():
            if noisy:
                target.reset_noise()
            pns = target(obs2)  # (bsz, act_dim, num_atoms)

            if double_dqn:
                next_act_idx = (main(obs2) *
                                supports.expand_as(pns)).sum(-1).argmax(-1)
            else:
                dist = supports.expand_as(pns) * pns
                next_act_idx = dist.sum(-1).argmax(-1)  # (bsz)

            pns_a = pns[range(bsz), next_act_idx]  # (bsz, num_atoms)

            rews = rews.unsqueeze(1)  # (bsz, 1)
            done = done.unsqueeze(1)  # (bsz, 1)

            # (bsz, num_atoms) for all in this block
            Tz = rews + (1 - done) * gamma * supports.unsqueeze(0)
            Tz = Tz.clamp(min=Vmin, max=Vmax)
            b = (Tz - Vmin) / delta_z
            l, u = b.floor().long(), b.ceil().long()

            # Fix disappearing probability mass when l = b = u (b is int)
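            # e.g. if b lands exactly on atom 3 then l = u = 3 and both scatter
            # weights (u - b) and (b - l) are zero; nudging one index keeps the
            # projected mass from vanishing.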
            l[(u > 0) * (l == u)] -= 1
            u[(l < (num_atoms - 1)) * (l == u)] += 1

            offset = torch.linspace(0, (bsz - 1) * num_atoms, bsz)
            offset = offset.unsqueeze(1).expand(bsz, num_atoms).long()

            m = torch.zeros([bsz, num_atoms])
            m.view(-1).index_add_(0, (l + offset).view(-1),
                                  (pns_a * (u.float() - b)).view(-1))
            m.view(-1).index_add_(0, (u + offset).view(-1),
                                  (pns_a * (b - l.float())).view(-1))

        log_dist1 = main(obs1, log=True)  # (bsz, action_dim, num_atoms)
        acts = acts.squeeze(1)  # (bsz)
        log_dist1 = log_dist1[range(bsz), acts]  # (bsz, num_atoms)

        loss = -(m * log_dist1).sum(-1) * weights  # (bsz)
        priorities = loss.detach().numpy() + 1e-5
        loss = loss.mean()

        value_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(main.parameters(), clip_grad_norm)
        value_optimizer.step()

        # replay buffer update
        replay_buffer.update_priorities(idxes, priorities)

        return loss.item(), pns_a.numpy()

    start_time = time.time()
    o, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        main.train()  # enable NoisyNet exploration

        # the epsilon value used for exploration during training
        # (None when NoisyNet layers provide the exploration instead)
        epsilon = None if noisy else core.linearly_decaying_epsilon(
            epsilon_decay_period, t, min_replay_history, epsilon_train)

        with torch.no_grad():
            a = get_action(o, epsilon)

        # Step the env
        o2, r, done, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        done = False if ep_len == max_ep_len else done

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, done)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if done or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if noisy and t % update_period == 0:
            main.reset_noise()

        # train at the rate of update_period if enough training steps have been run
        if len(replay_buffer) > min_replay_history and t % update_period == 0:
            value_loss, QDist = update(t)
            logger.store(LossQ=value_loss, QVals=QDist.mean(-1))

        # syncs weights from online to target network
        if t % target_update_period == 0:
            target.load_state_dict(main.state_dict())

        # End of epoch wrap-up
        if len(replay_buffer) > min_replay_history and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
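core.linearly_decaying_epsilon is called here (and in the other DQN-style listings) but not defined in these snippets. A plausible sketch, modeled on Dopamine's helper of the same name, is given below purely for orientation; the actual implementation in core may differ.

def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    """Anneal epsilon linearly from 1.0 down to `epsilon` over `decay_period`
    steps, after an initial `warmup_steps` period of fully random action."""
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = max(0.0, min(bonus, 1.0 - epsilon))
    return epsilon + bonus

print(linearly_decaying_epsilon(250000, 0, 20000, 0.01))       # 1.0   (still warming up)
print(linearly_decaying_epsilon(250000, 145000, 20000, 0.01))  # 0.505 (halfway through the decay)
print(linearly_decaying_epsilon(250000, 500000, 20000, 0.01))  # 0.01  (fully decayed)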
Example #9
0
def c51(
        env_fn,
        dqnetwork=core.CategoricalDQNetwork,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        Vmin=-100.0,  # hyperparameters for non-Atari envs
        Vmax=100.0,  # hyperparameters for non-Atari envs
        num_atoms=50,  # hyperparameters for non-Atari envs
        gamma=0.99,
        min_replay_history=20000,
        epsilon_decay_period=250000,
        epsilon_train=0.01,
        epsilon_eval=0.001,
        lr=1e-3,
        grad_clip=5.0,
        max_ep_len=1000,
        update_period=4,
        target_update_period=8000,
        batch_size=100,
        logger_kwargs=dict(),
        save_freq=1,
):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = 1  # env.action_space.shape

    # Share information with policy architecture
    ac_kwargs["action_space"] = env.action_space
    ac_kwargs["num_atoms"] = num_atoms
    ac_kwargs["Vmin"] = Vmin
    ac_kwargs["Vmax"] = Vmax

    # Main computation graph
    main = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Target network
    target = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # C51 support atoms and atom spacing
    supports = torch.linspace(Vmin, Vmax, num_atoms)
    delta_z = (Vmax - Vmin) / (num_atoms - 1)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [main.q, main])
    print(("\nNumber of parameters: \t q: %d, \t total: %d\n") % var_counts)

    # Value train op
    value_params = main.q.parameters()
    value_optimizer = torch.optim.Adam(value_params, lr=lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    def get_action(o, epsilon):
        """Select an action from the set of available actions.
        Chooses an action randomly with probability epsilon otherwise
        act greedily according to the current Q-value estimates.
        """
        if np.random.random() <= epsilon:
            return env.action_space.sample()
        else:
            return main.policy(torch.Tensor(o.reshape(1, -1))).item()

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # epsilon_eval used when evaluating the agent
                o, r, d, _ = test_env.step(get_action(o, epsilon_eval))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def update():
        main.train()
        batch = replay_buffer.sample_batch(batch_size)
        (obs1, obs2, acts, rews, done) = (
            torch.Tensor(batch["obs1"]),
            torch.Tensor(batch["obs2"]),
            torch.LongTensor(batch["acts"]),  # (bsz, 1)
            torch.Tensor(batch["rews"]),  # (bsz)
            torch.Tensor(batch["done"]),  # (bsz)
        )

        # compute target distribution
        bsz = obs1.size(0)
        with torch.no_grad():
            out2 = target(obs2)  # (bsz, act_dim, num_atoms)
            dist2 = supports.expand_as(out2) * out2
            act2_idx = dist2.sum(-1).argmax(-1)  # (bsz)

            dist2_a = out2[range(bsz), act2_idx]  # (bsz, num_atoms)

            rews = rews.unsqueeze(1)  # (bsz, 1)
            done = done.unsqueeze(1)  # (bsz, 1)

            # (bsz, num_atoms) for all in this block
            Tz = rews + (1 - done) * gamma * supports.unsqueeze(0)
            Tz = Tz.clamp(min=Vmin, max=Vmax)
            b = (Tz - Vmin) / delta_z
            l, u = b.floor().long(), b.ceil().long()

            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (num_atoms - 1)) * (l == u)] += 1

            offset = torch.linspace(0, (bsz - 1) * num_atoms, bsz)
            offset = offset.unsqueeze(1).expand(bsz, num_atoms).long()

            m = torch.zeros([bsz, num_atoms])
            m.view(-1).index_add_(0, (l + offset).view(-1),
                                  (dist2_a * (u.float() - b)).view(-1))
            m.view(-1).index_add_(0, (u + offset).view(-1),
                                  (dist2_a * (b - l.float())).view(-1))

        log_dist1 = main(obs1, log=True)  # (bsz, action_dim, num_atoms)
        acts = acts.squeeze(1)  # (bsz)
        log_dist1 = log_dist1[range(bsz), acts]  # (bsz, num_atoms)

        loss = -(m * log_dist1).sum(-1)  # (bsz)
        loss = loss.mean()

        value_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(main.parameters(), grad_clip)
        value_optimizer.step()

        return loss.item(), (dist2_a * supports.expand_as(dist2_a)).numpy()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        main.eval()

        # the epsilon value used for exploration during training
        epsilon = core.linearly_decaying_epsilon(epsilon_decay_period, t,
                                                 min_replay_history,
                                                 epsilon_train)
        a = get_action(o, epsilon)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # train at the rate of update_period if enough training steps have been run
        if replay_buffer.size > min_replay_history and t % update_period == 0:
            loss, QDist = update()
            logger.store(LossQ=loss, QVals=QDist.sum(-1))

        # syncs weights from online to target network
        if t % target_update_period == 0:
            target.load_state_dict(main.state_dict())

        # End of epoch wrap-up
        if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({"env": env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular("Epoch", epoch)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TotalEnvInteracts", t)
            logger.log_tabular("LossQ", average_only=True)
            logger.log_tabular("QVals", with_min_and_max=True)
            logger.log_tabular("Time", time.time() - start_time)
            logger.dump_tabular()
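The block inside update() that computes Tz, b, l, u and the two index_add_ calls is the categorical (C51) projection of the Bellman-updated distribution back onto the fixed support. The toy example below, with three atoms and a single transition, walks through the same index arithmetic end to end; the numbers are invented and only meant to show that the projected mass still sums to one.

import torch

Vmin, Vmax, num_atoms, gamma = -1.0, 1.0, 3, 0.99
supports = torch.linspace(Vmin, Vmax, num_atoms)   # tensor([-1., 0., 1.])
delta_z = (Vmax - Vmin) / (num_atoms - 1)          # 1.0

# one transition: reward 0.5, not done, next-state distribution over the atoms
rews = torch.tensor([[0.5]])
done = torch.tensor([[0.0]])
dist2_a = torch.tensor([[0.2, 0.5, 0.3]])          # (bsz, num_atoms)

Tz = (rews + (1 - done) * gamma * supports.unsqueeze(0)).clamp(Vmin, Vmax)
b = (Tz - Vmin) / delta_z                          # fractional atom indices
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) * (l == u)] -= 1
u[(l < (num_atoms - 1)) * (l == u)] += 1

bsz = 1
offset = torch.linspace(0, (bsz - 1) * num_atoms, bsz)
offset = offset.unsqueeze(1).expand(bsz, num_atoms).long()
m = torch.zeros(bsz, num_atoms)
m.view(-1).index_add_(0, (l + offset).view(-1), (dist2_a * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), (dist2_a * (b - l.float())).view(-1))
print(m, m.sum())  # projected distribution; total mass is still 1.0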
Example #10
0
def vpg(env_fn,
        actor_critic=core.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The agent's main model which is composed of
            the policy and value function model, where the policy takes
            some state, ``x``, and action, ``a``, and value function takes
            the state ``x`` and returns a tuple of:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a``
                                           | in states ``x``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x``. (Critical: make sure
                                           | to flatten this via .item()!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            class you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Main model
    actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(
        core.count_vars(module)
        for module in [actor_critic.policy, actor_critic.value_function])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(actor_critic.value_function.parameters(),
                               lr=vf_lr)

    # Sync params across processes
    sync_all_params(actor_critic.parameters())

    def update():
        obs, act, adv, ret, logp_old = [torch.Tensor(x) for x in buf.get()]

        # Evaluate log-probabilities of the taken actions under the current policy
        _, logp, _ = actor_critic.policy(obs, act)
        ent = (-logp).mean()  # a sample estimate for entropy

        # VPG policy objective
        pi_loss = -(logp * adv).mean()

        # Policy gradient step
        train_pi.zero_grad()
        pi_loss.backward()
        average_gradients(train_pi.param_groups)
        train_pi.step()

        # Value function learning
        v = actor_critic.value_function(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            # Output from value function graph
            v = actor_critic.value_function(obs)
            # VPG value objective
            v_loss = F.mse_loss(v, ret)

            # Value function gradient step
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Log changes from update
        _, logp, _, v = actor_critic(obs, act)
        pi_l_new = -(logp * adv).mean()
        v_l_new = F.mse_loss(v, ret)
        kl = (logp_old - logp).mean()  # a sample estimate for KL-divergence
        logger.store(LossPi=pi_loss,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_loss),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        actor_critic.eval()
        for t in range(local_steps_per_epoch):
            a, _, logp_t, v_t = actor_critic(torch.Tensor(o.reshape(1, -1)))

            # save and log
            buf.store(o,
                      a.detach().numpy(), r, v_t.item(),
                      logp_t.detach().numpy())
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a.detach().numpy()[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else actor_critic.value_function(
                    torch.Tensor(o.reshape(1, -1))).item()
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, actor_critic, None)

        # Perform VPG update!
        actor_critic.train()
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
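The policy objective inside update() is the plain score-function (REINFORCE-style) estimator: minimizing -(logp * adv).mean() is gradient ascent on E[log pi(a|s) * A]. The self-contained toy below reproduces just that step with a categorical policy and random advantages; the 3-action linear policy is an invented stand-in, not the ActorCritic used above.

import torch
import torch.nn as nn
from torch.distributions import Categorical

torch.manual_seed(0)

policy = nn.Linear(4, 3)            # tiny policy: 4-dim obs -> 3 discrete actions
optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

obs = torch.randn(8, 4)             # batch of states
acts = torch.randint(0, 3, (8,))    # actions that were taken
adv = torch.randn(8)                # advantage estimates (e.g. from GAE-Lambda)

logp = Categorical(logits=policy(obs)).log_prob(acts)
pi_loss = -(logp * adv).mean()      # minimizing this == policy gradient ascent

optimizer.zero_grad()
pi_loss.backward()
optimizer.step()
print(pi_loss.item())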
Example #11
0
def qr_dqn(
        env_fn,
        dqnetwork=core.DQNetwork,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        Vmin=-10.0,  # hyperparameters for non-Atari envs
        Vmax=10.0,  # hyperparameters for non-Atari envs
        num_quantiles=50,  # hyperparameters for non-Atari envs
        gamma=0.99,
        min_replay_history=20000,
        epsilon_decay_period=250000,
        epsilon_train=0.01,
        epsilon_eval=0.001,
        lr=1e-3,
        max_ep_len=1000,
        update_period=4,
        target_update_period=8000,
        batch_size=100,
        logger_kwargs=dict(),
        save_freq=1,
):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = 1  # env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs["action_space"] = env.action_space

    # Main computation graph
    main = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Target network
    target = dqnetwork(in_features=obs_dim, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [main.q, main])
    print(("\nNumber of parameters: \t q: %d, \t total: %d\n") % var_counts)

    # Value train op
    value_params = main.q.parameters()
    value_optimizer = torch.optim.Adam(value_params, lr=lr)

    # Initializing targets to match main variables
    target.load_state_dict(main.state_dict())

    # Quantile regression: Huber threshold kappa and quantile midpoints tau
    k = 1.0

    def huber(x):
        return torch.where(x.abs() < k, x**2 / 2, k * (x.abs() - k / 2))

    tau = torch.Tensor(
        (2 * np.arange(num_quantiles) + 1) / (2.0 * num_quantiles)).view(
            1, -1)
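    # tau_i = (2i + 1) / (2K): midpoints of the K quantile fractions, shape (1, K)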

    def get_action(o, epsilon):
        """Select an action from the set of available actions.
        Chooses an action randomly with probability epsilon otherwise
        act greedily according to the current Q-value estimates.
        """
        if np.random.random() <= epsilon:
            return env.action_space.sample()
        else:
            return main.policy(torch.Tensor(o.reshape(1, -1))).item()

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # epsilon_eval used when evaluating the agent
                o, r, d, _ = test_env.step(get_action(o, epsilon_eval))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def update():
        main.train()
        batch = replay_buffer.sample_batch(batch_size)
        (obs1, obs2, acts, rews, done) = (
            torch.Tensor(batch["obs1"]),
            torch.Tensor(batch["obs2"]),
            torch.LongTensor(batch["acts"]),  # (bsz, 1)
            torch.Tensor(batch["rews"]),  # (bsz)
            torch.Tensor(batch["done"]),  # (bsz)
        )
        bsz = obs1.size(0)

        q_dist1 = main(obs1)  # (bsz, action_dim, num_quantiles)
        acts = acts.squeeze(1)  # (bsz)
        q_dist1 = q_dist1[range(bsz), acts]  # (bsz, num_quantiles)

        q_dist2 = target(obs2).detach()
        act_idx2 = q_dist2.mean(-1).argmax(-1)  # (bsz)
        # act_idx2 = main(obs2).mean(-1).argmax(-1)  # double dqn
        q_dist2 = q_dist2[range(bsz), act_idx2]  # (bsz, num_quantiles)

        rews = rews.unsqueeze(1)  # (bsz, 1)
        done = done.unsqueeze(1)  # (bsz, 1)
        T_theta = rews + (1 - done) * gamma * q_dist2

        diff = T_theta.t().unsqueeze(-1) - q_dist1
        loss = huber(diff) * (tau - (diff.detach() < 0).float()).abs()
        loss = loss.mean()

        value_optimizer.zero_grad()
        loss.backward()
        value_optimizer.step()

        return loss.item(), q_dist2.detach().numpy()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        main.eval()

        # the epsilon value used for exploration during training
        epsilon = core.linearly_decaying_epsilon(epsilon_decay_period, t,
                                                 min_replay_history,
                                                 epsilon_train)
        a = get_action(o, epsilon)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # train at the rate of update_period if enough training steps have been run
        if replay_buffer.size > min_replay_history and t % update_period == 0:
            loss, QDist = update()
            logger.store(LossQ=loss, QVals=QDist.mean(-1))

        # syncs weights from online to target network
        if t % target_update_period == 0:
            target.load_state_dict(main.state_dict())

        # End of epoch wrap-up
        if replay_buffer.size > min_replay_history and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({"env": env}, main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular("Epoch", epoch)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TotalEnvInteracts", t)
            logger.log_tabular("LossQ", average_only=True)
            logger.log_tabular("QVals", with_min_and_max=True)
            logger.log_tabular("Time", time.time() - start_time)
            logger.dump_tabular()
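The loss inside update() is the quantile regression Huber loss from the QR-DQN paper: huber(diff) weighted by |tau - 1{diff < 0}|, with diff holding every pairwise difference between target and predicted quantiles. The toy example below, with one transition and two quantiles, shows the broadcasting and the asymmetric weighting in isolation; the target values are invented.

import torch

k, num_quantiles = 1.0, 2
tau = ((2 * torch.arange(num_quantiles, dtype=torch.float32) + 1)
       / (2.0 * num_quantiles)).view(1, -1)               # tensor([[0.25, 0.75]])

def huber(x):
    return torch.where(x.abs() < k, x ** 2 / 2, k * (x.abs() - k / 2))

q_dist1 = torch.tensor([[0.0, 1.0]], requires_grad=True)  # predicted quantiles, (bsz, K)
T_theta = torch.tensor([[0.5, 1.5]])                      # Bellman target quantiles, (bsz, K)

# pairwise differences: (K, bsz, K) -- every target quantile vs. every predicted quantile
diff = T_theta.t().unsqueeze(-1) - q_dist1
loss = (huber(diff) * (tau - (diff.detach() < 0).float()).abs()).mean()
loss.backward()
print(loss.item(), q_dist1.grad)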