Example No. 1
            def f_loss_kl_impl(need_loss, need_kl):
                retval = dict()
                if need_loss:
                    new_dists = policy.compute_dists(all_obs)
                    old_dists = all_dists
                elif need_kl:
                    # if only kl is needed, compute distribution from sub-sampled data
                    new_dists = policy.compute_dists(subsamp_obs)
                    old_dists = subsamp_dists

                def compute_surr_loss(old_dists, new_dists, all_acts,
                                      all_advs):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                    :param all_advs: A chainer variable, which should be a vector of size N
                    :return: A chainer variable, which should be a scalar
                    """
                    surr_loss = Variable(np.array(0.))
                    "*** YOUR CODE HERE ***"

                    # SOLUTION

                    surr_loss = -F.mean(
                        new_dists.likelihood_ratio(old_dists, all_acts) *
                        all_advs)

                    # END OF SOLUTION

                    return surr_loss

                def compute_kl(old_dists, new_dists):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :return: A chainer variable, which should be a scalar
                    """
                    kl = Variable(np.array(0.))
                    "*** YOUR CODE HERE ***"

                    # SOLUTION

                    kl = F.mean(old_dists.kl_div(new_dists))

                    # END OF SOLUTION

                    return kl

                test_once(compute_surr_loss)
                test_once(compute_kl)

                if need_loss:
                    retval["surr_loss"] = compute_surr_loss(
                        old_dists, new_dists, all_acts, all_advs)
                if need_kl:
                    retval["kl"] = compute_kl(old_dists, new_dists)
                return retval
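
A quick way to sanity-check the surrogate loss above outside the lab code is to recompute it with plain NumPy on a toy categorical policy. The sketch below is only an illustration: the local softmax and likelihood_ratio helpers are stand-ins for the lab's Distribution API, not the actual classes.

import numpy as np

def softmax(logits):
    z = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def likelihood_ratio(new_logits, old_logits, acts):
    # ratio p_new(a|s) / p_old(a|s) for the actions that were actually taken
    idx = np.arange(len(acts))
    return softmax(new_logits)[idx, acts] / softmax(old_logits)[idx, acts]

rng = np.random.RandomState(0)
old_logits = rng.randn(5, 3)
new_logits = old_logits + 0.01 * rng.randn(5, 3)
acts = rng.randint(3, size=5)
advs = rng.randn(5)

# same form as compute_surr_loss: negative mean of ratio-weighted advantages
surr_loss = -np.mean(likelihood_ratio(new_logits, old_logits, acts) * advs)
print(surr_loss)
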
Example No. 2
            def f_loss_kl_impl(need_loss, need_kl):
                retval = dict()
                if need_loss:
                    new_dists = policy.compute_dists(all_obs)
                    old_dists = all_dists
                elif need_kl:
                    # if only kl is needed, compute distribution from sub-sampled data
                    new_dists = policy.compute_dists(subsamp_obs)
                    old_dists = subsamp_dists

                def compute_surr_loss(old_dists, new_dists, all_acts,
                                      all_advs):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                    :param all_advs: A chainer variable, which should be a vector of size N
                    :return: A chainer variable, which should be a scalar
                    """
                    "*** YOUR CODE HERE ***"
                    surr_loss = Variable(np.array(
                        0., dtype=np.float32))  # must be float32
                    # Implements the surrogate objective L_old from Eq. 8
                    likelihood_ratio = new_dists.likelihood_ratio(
                        old_dists, all_acts)  # the hint suggests using likelihood_ratio
                    surr_loss -= F.mean(likelihood_ratio * all_advs)

                    return surr_loss

                def compute_kl(old_dists, new_dists):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :return: A chainer variable, which should be a scalar
                    """
                    "*** YOUR CODE HERE ***"
                    kl = Variable(np.array(
                        0., dtype=np.float32))  # must be float32
                    # the hint is to use dist.kl_div (see utils.py); note the argument
                    # order: it is old_dists.kl_div(new_dists), which is easy to get backwards
                    kl += F.mean(old_dists.kl_div(new_dists))
                    return kl

                test_once(compute_surr_loss)
                test_once(compute_kl)

                if need_loss:
                    retval["surr_loss"] = compute_surr_loss(
                        old_dists, new_dists, all_acts, all_advs)
                if need_kl:
                    retval["kl"] = compute_kl(old_dists, new_dists)
                return retval
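
As the comment above hints, the argument order of kl_div matters: KL divergence is not symmetric, so KL(old || new) and KL(new || old) generally differ. A minimal NumPy illustration with two hand-picked categorical distributions (not the lab's Distribution class):

import numpy as np

def kl_categorical(p, q):
    # KL(p || q) = sum_i p_i * log(p_i / q_i)
    return np.sum(p * np.log(p / q))

old = np.array([0.7, 0.2, 0.1])
new = np.array([0.5, 0.3, 0.2])

print(kl_categorical(old, new))  # KL(old || new), the direction used above
print(kl_categorical(new, old))  # KL(new || old) -- a different value
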
Example No. 3
            def f_loss_kl_impl(need_loss, need_kl):
                retval = dict()
                if need_loss:
                    new_dists = policy.compute_dists(all_obs)
                    old_dists = all_dists
                elif need_kl:
                    # if only kl is needed, compute distribution from sub-sampled data
                    new_dists = policy.compute_dists(subsamp_obs)
                    old_dists = subsamp_dists

                def compute_surr_loss(old_dists, new_dists, all_acts,
                                      all_advs):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                    :param all_advs: A chainer variable, which should be a vector of size N
                    :return: A chainer variable, which should be a scalar
                    """
                    # Same formula as in part 4, written here as a one-liner;
                    # refer to the part 4 implementation for a more explicit version
                    return -F.mean(
                        new_dists.likelihood_ratio(old_dists, all_acts) *
                        all_advs)

                def compute_kl(old_dists, new_dists):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :return: A chainer variable, which should be a scalar
                    """
                    # Uses the helper suggested by the lab authors in
                    # part 5.2 (right before part 5.3)
                    return F.mean(old_dists.kl_div(new_dists))

                test_once(compute_surr_loss)
                test_once(compute_kl)

                if need_loss:
                    retval["surr_loss"] = compute_surr_loss(
                        old_dists, new_dists, all_acts, all_advs)
                if need_kl:
                    retval["kl"] = compute_kl(old_dists, new_dists)
                return retval
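
For reference, a likelihood_ratio helper of the kind used above is typically implemented as exp(log p_new - log p_old) rather than as a direct ratio of densities, which is more stable numerically. A hypothetical NumPy sketch for a diagonal Gaussian policy (the lab's Distribution subclasses may implement this differently):

import numpy as np

def gaussian_logli(means, log_stds, acts):
    # log density of a diagonal Gaussian, summed over action dimensions
    zs = (acts - means) / np.exp(log_stds)
    return -0.5 * np.sum(zs ** 2 + 2 * log_stds + np.log(2 * np.pi), axis=-1)

def likelihood_ratio(new, old, acts):
    # new and old are (means, log_stds) tuples
    return np.exp(gaussian_logli(*new, acts) - gaussian_logli(*old, acts))

rng = np.random.RandomState(0)
acts = rng.randn(5, 2)
old = (rng.randn(5, 2), np.zeros((5, 2)))
new = (old[0] + 0.1, old[1] - 0.05)
print(likelihood_ratio(new, old, acts))  # one ratio per sample
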
Example No. 4
            def f_loss_kl_impl(need_loss, need_kl):
                retval = dict()
                if need_loss:
                    new_dists = policy.compute_dists(all_obs)
                    old_dists = all_dists
                elif need_kl:
                    # if only kl is needed, compute distribution from sub-sampled data
                    new_dists = policy.compute_dists(subsamp_obs)
                    old_dists = subsamp_dists

                def compute_surr_loss(old_dists, new_dists, all_acts, all_advs):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                    :param all_advs: A chainer variable, which should be a vector of size N
                    :return: A chainer variable, which should be a scalar
                    """
                    "*** YOUR CODE HERE ***"
                    return -F.mean(new_dists.likelihood_ratio(old_dists, all_acts) * all_advs)

                def compute_kl(old_dists, new_dists):
                    """
                    :param old_dists: An instance of subclass of Distribution
                    :param new_dists: An instance of subclass of Distribution
                    :return: A chainer variable, which should be a scalar
                    """
                    "*** YOUR CODE HERE ***"
                    return F.mean(old_dists.kl_div(new_dists))

                test_once(compute_surr_loss)
                test_once(compute_kl)

                if need_loss:
                    retval["surr_loss"] = compute_surr_loss(
                        old_dists, new_dists, all_acts, all_advs)
                if need_kl:
                    retval["kl"] = compute_kl(old_dists, new_dists)
                return retval
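
It may help to see why this likelihood-ratio surrogate is a valid replacement for the plain log-probability surrogate: at the point where the new policy equals the old one, both objectives have the same gradient. The finite-difference check below demonstrates this for a toy linear-softmax policy; everything in it is synthetic and only illustrative.

import numpy as np

def softmax(logits):
    z = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def logp(theta, obs, acts):
    # toy linear-softmax policy: logits = obs @ theta
    probs = softmax(obs.dot(theta))
    return np.log(probs[np.arange(len(acts)), acts])

def lr_surr(theta_new, theta_old, obs, acts, advs):
    ratio = np.exp(logp(theta_new, obs, acts) - logp(theta_old, obs, acts))
    return -np.mean(ratio * advs)

def pg_surr(theta, obs, acts, advs):
    return -np.mean(logp(theta, obs, acts) * advs)

def num_grad(f, theta, eps=1e-6):
    # central finite differences, one parameter at a time
    g = np.zeros_like(theta)
    for idx in np.ndindex(*theta.shape):
        d = np.zeros_like(theta)
        d[idx] = eps
        g[idx] = (f(theta + d) - f(theta - d)) / (2 * eps)
    return g

rng = np.random.RandomState(1)
obs, theta = rng.randn(8, 4), rng.randn(4, 3)
acts, advs = rng.randint(3, size=8), rng.randn(8)

g_lr = num_grad(lambda th: lr_surr(th, theta, obs, acts, advs), theta)
g_pg = num_grad(lambda th: pg_surr(th, obs, acts, advs), theta)
print(np.allclose(g_lr, g_pg, atol=1e-5))  # True: gradients agree when theta_new == theta_old
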
Example No. 5
def pg(env,
       env_maker,
       policy,
       baseline,
       n_envs=mp.cpu_count(),
       last_iter=-1,
       n_iters=100,
       batch_size=1000,
       optimizer=chainer.optimizers.Adam(),
       discount=0.99,
       gae_lambda=0.97,
       snapshot_saver=None):
    """
    This method implements the policy gradient algorithm.
    :param env: An environment instance, which should have the same class as what env_maker.make() returns.
    :param env_maker: An object such that calling env_maker.make() will generate a new environment.
    :param policy: A stochastic policy which we will be optimizing.
    :param baseline: A baseline used for variance reduction and estimating future returns for unfinished trajectories.
    :param n_envs: Number of environments running simultaneously.
    :param last_iter: The index of the last iteration. This is normally -1 when starting afresh, but may be different when
           loaded from a snapshot.
    :param n_iters: The total number of iterations to run.
    :param batch_size: The number of samples used per iteration.
    :param optimizer: A Chainer optimizer instance. By default we use the Adam algorithm with learning rate 1e-3.
    :param discount: Discount factor.
    :param gae_lambda: Lambda parameter used for generalized advantage estimation.
    :param snapshot_saver: An object for saving snapshots.
    """

    if getattr(optimizer, 'target', None) is not policy:
        optimizer.setup(policy)

    logger.info("Starting env pool")

    with EnvPool(env_maker, n_envs=n_envs) as env_pool:
        for iter in range(last_iter + 1, n_iters):
            logger.info("Starting iteration {}".format(iter))
            logger.logkv('Iteration', iter)

            logger.info("Start collecting samples")
            trajs = parallel_collect_samples(env_pool, policy, batch_size)

            logger.info("Computing input variables for policy optimization")
            all_obs, all_acts, all_advs, _ = compute_pg_vars(
                trajs, policy, baseline, discount, gae_lambda)

            # Begin policy update

            # Now, you need to implement the computation of the policy gradient
            # The policy gradient is given by -1/T \sum_t \nabla_\theta(log(p_\theta(a_t|s_t))) * A_t
            # Note the negative sign in front, since optimizers are most often minimizing a loss rather than maximizing an objective
            # This is the same as \nabla_\theta(-1/T \sum_t log(p_\theta(a_t|s_t)) * A_t) = \nabla_\theta(L), where L is the surrogate loss term

            logger.info("Computing policy gradient")

            # Methods that may be useful:
            # - `dists.logli(actions)' returns the log probability of the actions under the distribution `dists'.
            #   This method returns a chainer variable.

            dists = policy.compute_dists(all_obs)

            def compute_surr_loss(dists, all_acts, all_advs):
                """
                :param dists: An instance of subclass of Distribution
                :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                :param all_advs: A chainer variable, which should be a vector of size N
                :return: A chainer variable, which should be a scalar
                """
                surr_loss = Variable(np.array(0.))
                "*** YOUR CODE HERE ***"

                # SOLUTION

                surr_loss = -F.mean(dists.logli(all_acts) * all_advs)

                # END OF SOLUTION

                return surr_loss

            test_once(compute_surr_loss)

            surr_loss = compute_surr_loss(dists, all_acts, all_advs)

            # reset gradients stored in the policy parameters
            policy.cleargrads()
            surr_loss.backward()

            # apply the computed gradient
            optimizer.update()

            # Update baseline
            logger.info("Updating baseline")
            baseline.update(trajs)

            # log statistics
            logger.info("Computing logging information")
            logger.logkv('SurrLoss', surr_loss.data)
            log_action_distribution_statistics(dists)
            log_reward_statistics(env)
            log_baseline_statistics(trajs)
            logger.dumpkvs()

            if snapshot_saver is not None:
                logger.info("Saving snapshot")
                snapshot_saver.save_state(
                    iter,
                    dict(alg=pg,
                         alg_state=dict(env_maker=env_maker,
                                        policy=policy,
                                        baseline=baseline,
                                        n_envs=n_envs,
                                        last_iter=iter,
                                        n_iters=n_iters,
                                        batch_size=batch_size,
                                        optimizer=optimizer,
                                        discount=discount,
                                        gae_lambda=gae_lambda)))
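
The cleargrads / backward / update pattern used above is ordinary Chainer usage and can be exercised on a toy regression problem independent of the policy class. A minimal sketch, assuming chainer and numpy are installed; the linear model here is just a stand-in for the policy:

import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L

# Toy model standing in for the policy; the point is the
# cleargrads() -> backward() -> update() pattern shown above.
model = L.Linear(3, 1)
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

x = np.random.randn(16, 3).astype(np.float32)
y = np.random.randn(16, 1).astype(np.float32)

for _ in range(100):
    loss = F.mean_squared_error(model(x), y)
    model.cleargrads()   # reset gradients stored in the parameters
    loss.backward()      # accumulate new gradients
    optimizer.update()   # apply one optimization step
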
Example No. 6
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation

    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'"
        )

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()
            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a 
                    matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    R_t = 0.
                    pg_theta = np.zeros_like(theta)
                    "*** YOUR CODE HERE ***"
                    R_t = r_t + discount * R_tplus1
                    pg_theta = get_grad_logp_action(theta, s_t,
                                                    a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns 
            collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                "*** YOUR CODE HERE ***"
                baselines[t] = 0. if len(all_returns[t]) == 0 else np.mean(
                    all_returns[t])
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:

            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix 
                of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of 
                entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                "*** YOUR CODE HERE ***"
                # this is an intuitive but very inefficient implementation (note it also
                # pairs every action with every observation, not just corresponding samples):
                #                ws = []
                #                for action in all_actions:
                #                    for ob in all_observations:
                #                        g = get_grad_logp_action(theta, ob, action).reshape(d,1)
                #                        ws.append(g.dot(g.T))
                #                F = np.mean(np.array(ws), axis=0)

                # this is an efficient implementation
                for i in range(len(all_actions)):
                    grads = get_grad_logp_action(theta, all_observations[i],
                                                 all_actions[i]).flatten()
                    F += np.outer(grads, grads.T)
                F /= len(all_actions)
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                natural_grad = np.zeros_like(grad)
                "*** YOUR CODE HERE ***"
                F_inv = np.linalg.inv(F + reg * np.eye(F.shape[0]))
                natural_grad = F_inv.dot(grad.flatten()).reshape(grad.shape)
                return natural_grad

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                step_size = 0.
                "*** YOUR CODE HERE ***"
                # this works with the inefficient implementation from compute_fisher_matrix
                #                w = natural_grad.dot(F).dot(natural_grad.T)

                natural_grad = natural_grad.flatten()
                w = natural_grad.T.dot(F).dot(natural_grad)
                step_size = np.sqrt(2 * natural_step_size / w)
                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)

            print(
                "Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                % (itr, np.mean(episode_rewards), ent, perp,
                   np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" %
                  (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
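
The return computation in compute_update relies on the recurrence R_t = r_t + discount * R_{t+1}, which reproduces the direct definition R_t = sum_{k>=t} discount^(k-t) * r_k. A small NumPy check of that equivalence on toy rewards (not lab data):

import numpy as np

rewards = np.array([1.0, 0.0, 2.0, 3.0])
discount = 0.9

# Backward recurrence, as in compute_update above
R = 0.0
returns = np.zeros_like(rewards)
for t in reversed(range(len(rewards))):
    R = rewards[t] + discount * R
    returns[t] = R

# Direct definition of the discounted return
direct = np.array([sum(discount ** (k - t) * rewards[k]
                       for k in range(t, len(rewards)))
                   for t in range(len(rewards))])

print(np.allclose(returns, direct))  # True
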
Example No. 7
def a2c(env, env_maker, policy, vf, joint_model=None, k=20, n_envs=16, discount=0.99,
        optimizer=chainer.optimizers.RMSprop(lr=1e-3), max_grad_norm=1.0, vf_loss_coeff=0.5,
        ent_coeff=0.01, last_epoch=-1, epoch_length=10000, n_epochs=8000, snapshot_saver=None):
    """
    This method implements the (Synchronous) Advantage Actor-Critic algorithm. Rather than having asynchronous workers,
    which can be more efficient due to less coordination but also less stable and harder to extend / debug, we use a
    pool of environment workers performing simulation, while computing actions and performing gradient updates
    centrally. This also makes it easier to utilize GPUs for neural network computation.
    :param env: An environment instance, which should have the same class as what env_maker.make() returns.
    :param env_maker: An object such that calling env_maker.make() will generate a new environment.
    :param policy: A stochastic policy which we will be optimizing.
    :param vf: A value function which estimates future returns given a state. It can potentially share weights with the
           policy by calling policy.create_vf().
    :param joint_model: The joint model of policy and value function. This is usually automatically computed.
    :param k: Number of simulation steps per environment for each gradient update.
    :param n_envs: Number of environments running simultaneously.
    :param discount: Discount factor.
    :param optimizer: A chainer optimizer instance. By default we use the RMSProp algorithm.
    :param max_grad_norm: If provided, apply gradient clipping with the specified maximum L2 norm.
    :param vf_loss_coeff: Coefficient for the value function loss.
    :param ent_coeff: Coefficient for the entropy loss (the negative bonus).
    :param last_epoch: The index of the last epoch. This is normally -1 when starting afresh, but may be different when
           loaded from a snapshot. Since A2C is an online algorithm, an epoch is just an artificial boundary so that we
           record logs after each epoch.
    :param epoch_length: Number of total environment steps per epoch.
    :param n_epochs: Total number of epochs to run the algorithm.
    :param snapshot_saver: An object for saving snapshots.
    """

    # ensures that shared parameters are only counted once
    if joint_model is None:
        joint_model = UniqueChainList(policy, vf)

    if getattr(optimizer, 'target', None) is not joint_model:
        optimizer.setup(joint_model)

    try:
        # remove existing hook if necessary (this should only be needed when restarting experiments)
        optimizer.remove_hook('gradient_clipping')
    except KeyError:
        pass
    if max_grad_norm is not None:
        # Clip L2 norm of gradient, to improve stability
        optimizer.add_hook(chainer.optimizer.GradientClipping(
            threshold=max_grad_norm), 'gradient_clipping')

    epoch = last_epoch + 1
    global_t = epoch * epoch_length

    loggings = defaultdict(list)

    logger.info("Starting env pool")
    with EnvPool(env_maker, n_envs=n_envs) as env_pool:

        gen = samples_generator(env_pool, policy, vf, k)

        logger.info("Starting epoch {}".format(epoch))

        if logger.get_level() <= logger.INFO:
            progbar = tqdm(total=epoch_length)
        else:
            progbar = None

        while global_t < epoch_length * n_epochs:

            # Run k steps in the environment
            # Note:
            # - all_actions, all_values, all_dists, and next_values are chainer variables
            # - all_rewards, all_dones are lists of numpy arrays
            # The first dimension of each variable is time, and the second dimension is the index of the environment
            all_actions, all_rewards, all_dones, all_dists, all_values, next_values = next(
                gen)

            global_t += n_envs * k

            # Compute returns and advantages

            # Size: (k, n_envs)
            all_values = F.stack(all_values)
            all_rewards = np.asarray(all_rewards, dtype=np.float32)
            all_dones = np.asarray(all_dones, dtype=np.float32)

            all_values_data = all_values.data
            next_values_data = next_values.data

            test_once(compute_returns_advantages)

            all_returns, all_advs = compute_returns_advantages(
                all_rewards,
                all_dones,
                all_values_data,
                next_values_data,
                discount
            )

            all_returns = chainer.Variable(all_returns.astype(np.float32))
            all_advs = chainer.Variable(all_advs.astype(np.float32))

            # Concatenate data
            # Size: (k*n_envs,) + action_shape
            all_flat_actions = F.concat(all_actions, axis=0)
            # Size: key -> (k*n_envs,) + dist_shape
            all_flat_dists = {k: F.concat(
                [d[k] for d in all_dists], axis=0) for k in all_dists[0].keys()}
            all_flat_dists = policy.distribution.from_dict(all_flat_dists)

            # Prepare variables needed for gradient computation
            logli = all_flat_dists.logli(all_flat_actions)
            ent = all_flat_dists.entropy()
            # Flatten advantages
            all_advs = F.concat(all_advs, axis=0)

            # Form the loss - you should only need to use the variables provided as input arguments below
            def compute_total_loss(logli, all_advs, ent_coeff, ent, vf_loss_coeff, all_returns, all_values):
                """
                :param logli: A chainer variable, which should be a vector of size N
                :param all_advs: A chainer variable, which should be a vector of size N
                :param ent_coeff: A scalar
                :param ent: A chainer variable, which should be a vector of size N
                :param vf_loss_coeff: A scalar
                :param all_returns: A chainer variable, which should be a vector of size N
                :param all_values: A chainer variable, which should be a vector of size N
                :return: A tuple of (policy_loss, vf_loss, total_loss)
                policy_loss should be the weighted sum of the surrogate loss and the average entropy loss
                vf_loss should be the (unweighted) squared loss of value function prediction.
                total_loss should be the weighted sum of policy_loss and vf_loss
                """
                policy_loss = -F.mean(logli * all_advs) - ent_coeff * F.mean(ent)
                vf_loss = F.mean_squared_error(all_returns, all_values)
                total_loss = policy_loss + vf_loss_coeff * vf_loss
                return policy_loss, vf_loss, total_loss

            test_once(compute_total_loss)

            policy_loss, vf_loss, total_loss = compute_total_loss(
                logli=logli, all_advs=all_advs, ent_coeff=ent_coeff,
                ent=ent, vf_loss_coeff=vf_loss_coeff,
                all_returns=all_returns, all_values=all_values
            )

            joint_model.cleargrads()
            total_loss.backward()
            optimizer.update()

            vf_loss_data = vf_loss.data
            all_returns_data = all_returns.data
            all_flat_dists_data = {
                k: v.data
                for k, v in all_flat_dists.as_dict().items()
            }

            loggings["vf_loss"].append(vf_loss_data)
            loggings["vf_preds"].append(all_values_data)
            loggings["vf_targets"].append(all_returns_data)
            loggings["dists"].append(all_flat_dists_data)

            if progbar is not None:
                progbar.update(k * n_envs)

            # An entire epoch has passed
            if global_t // epoch_length > epoch:
                logger.logkv('Epoch', epoch)
                log_reward_statistics(env)
                all_dists = {
                    k: Variable(np.concatenate([d[k] for d in loggings["dists"]], axis=0))
                    for k in loggings["dists"][0].keys()
                }
                log_action_distribution_statistics(
                    policy.distribution.from_dict(all_dists))
                logger.logkv('|VfPred|', np.mean(np.abs(loggings["vf_preds"])))
                logger.logkv('|VfTarget|', np.mean(
                    np.abs(loggings["vf_targets"])))
                logger.logkv('VfLoss', np.mean(loggings["vf_loss"]))
                logger.dumpkvs()

                if snapshot_saver is not None:
                    logger.info("Saving snapshot")

                    snapshot_saver.save_state(
                        epoch,
                        dict(
                            alg=a2c,
                            alg_state=dict(
                                env_maker=env_maker,
                                policy=policy,
                                vf=vf,
                                joint_model=joint_model,
                                k=k,
                                n_envs=n_envs,
                                discount=discount,
                                last_epoch=epoch,
                                n_epochs=n_epochs,
                                epoch_length=epoch_length,
                                optimizer=optimizer,
                                vf_loss_coeff=vf_loss_coeff,
                                ent_coeff=ent_coeff,
                                max_grad_norm=max_grad_norm,
                            )
                        )
                    )

                # Reset stored logging information
                loggings = defaultdict(list)

                if progbar is not None:
                    progbar.close()

                epoch = global_t // epoch_length

                logger.info("Starting epoch {}".format(epoch))

                if progbar is not None:
                    progbar = tqdm(total=epoch_length)

        if progbar is not None:
            progbar.close()
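
compute_returns_advantages itself is defined elsewhere in the lab code and is not shown here. For orientation, the sketch below is a plausible NumPy version of the standard k-step bootstrapped computation it is typically tested against, using the (k, n_envs) shape convention mentioned in the comments above; treat the exact handling of dones and bootstrapping as an assumption.

import numpy as np

def k_step_returns_advantages(rewards, dones, values, next_values, discount):
    # rewards, dones, values: arrays of shape (k, n_envs)
    # next_values: array of shape (n_envs,), value estimates after the last step
    k, n_envs = rewards.shape
    returns = np.zeros((k, n_envs), dtype=np.float32)
    running = next_values.copy()
    for t in reversed(range(k)):
        # zero out the bootstrap whenever an episode ended at step t
        running = rewards[t] + discount * (1.0 - dones[t]) * running
        returns[t] = running
    advantages = returns - values
    return returns, advantages

rng = np.random.RandomState(0)
k, n_envs = 4, 2
rets, advs = k_step_returns_advantages(
    rng.rand(k, n_envs).astype(np.float32),
    np.zeros((k, n_envs), dtype=np.float32),
    rng.rand(k, n_envs).astype(np.float32),
    rng.rand(n_envs).astype(np.float32),
    discount=0.99)
print(rets.shape, advs.shape)  # (4, 2) (4, 2)
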
Example No. 8
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation

    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'"
        )

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()
            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a 
                    matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    # Use the formula from the lab instructions, part 3.4
                    R_t = discount * R_tplus1 + r_t
                    # Compute the single gradient contribution by formula from step 3.3
                    pg_theta = get_grad_logp_action(theta, s_t,
                                                    a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns 
            collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                # Check whether any returns were collected at this time step;
                # if not, the baseline keeps its default value of zero
                if len(all_returns[t]) > 0:
                    baselines[t] = np.mean(all_returns[t])
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:

            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix 
                of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of 
                entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))

                # Accumulate one estimate per sampled (observation, action) pair
                for i in range(len(all_actions)):
                    # Gradient of the log-probability for this sample, flattened to a vector
                    grads = get_grad_logp_action(theta, all_observations[i],
                                                 all_actions[i]).flatten()
                    # Accumulate the outer product of the gradient with itself
                    F += np.outer(grads, grads.T)

                # Compute the mean (expected value)
                F /= len(all_actions)
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                # First ensure that Fisher inf. matrix is positive definite
                F_inv = np.linalg.inv(F + reg * np.eye(*F.shape))

                # Compute natural gradient with flattened version
                natural_grad = F_inv.dot(grad.flatten())

                # Reshape back to the g shape
                natural_grad = natural_grad.reshape(grad.shape)

                return natural_grad

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """

                # Flatten the natural gradient again
                natural_grad = natural_grad.flatten()

                # Computed in accordance with the step-size formula
                # from the footnote in the lab instructions
                denominator = natural_grad.T.dot(F).dot(natural_grad)
                numerator = 2 * natural_step_size

                step_size = np.sqrt(numerator / denominator)

                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)

            print(
                "Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                % (itr, np.mean(episode_rewards), ent, perp,
                   np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" %
                  (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
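
The step-size formula in compute_step_size comes from the quadratic trust-region approximation: with step = sqrt(2 * eps / (g^T F g)) * g, where g is the natural gradient, the approximate KL 0.5 * delta^T F delta exactly saturates the budget eps. A small NumPy check using a synthetic positive-definite matrix standing in for the Fisher matrix:

import numpy as np

rng = np.random.RandomState(0)
d = 6
A = rng.randn(d, d)
F_mat = A.dot(A.T) + 1e-3 * np.eye(d)   # positive-definite stand-in for the Fisher matrix
grad = rng.randn(d)

natural_grad = np.linalg.solve(F_mat, grad)
eps = 0.01
w = natural_grad.dot(F_mat).dot(natural_grad)
step_size = np.sqrt(2 * eps / w)

delta = step_size * natural_grad
approx_kl = 0.5 * delta.dot(F_mat).dot(delta)
print(np.isclose(approx_kl, eps))  # True: the step saturates the KL budget
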
Example No. 9
                    """
                    R_t = 0.
                    pg_theta = np.zeros_like(theta)
                    "*** YOUR CODE HERE ***"
                    
                    "Rt satis
es the recurrence relation: R_t = discount * R_tplus1 + r_t"
                    R_t = discount * R_tplus1 + r_t
                    
                    "single contribution to the overall policy gradient,... Formula shown in 3.4 Accummulating Policy Gradient"
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t)
  
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action
                )
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
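
Subtracting the baseline b_t in the snippet above does not bias the policy gradient, because the expected score E_a[grad log p(a|s)] is zero for any fixed state. The NumPy check below verifies this for a linear-softmax parameterization with the same |A| x (|S|+1) shape as theta in the docstring; the exact parameterization is an assumption made only for illustration.

import numpy as np

def softmax(logits):
    z = logits - logits.max()
    e = np.exp(z)
    return e / e.sum()

rng = np.random.RandomState(0)
obs = np.append(rng.randn(4), 1.0)   # observation with bias term, size |S|+1
theta = rng.randn(3, 5)              # |A| x (|S|+1)
probs = softmax(theta.dot(obs))

# grad of log p(a|s) w.r.t. theta for a linear-softmax policy: outer(one_hot(a) - probs, obs)
expected_grad = np.zeros_like(theta)
for a in range(3):
    one_hot = np.eye(3)[a]
    grad_logp = np.outer(one_hot - probs, obs)
    expected_grad += probs[a] * grad_logp

print(np.allclose(expected_grad, 0.0))  # True: E_a[grad log p(a|s)] = 0
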
Example No. 10
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation

    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'"
        )

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()
            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a
                    matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    R_t = 0.
                    pg_theta = np.zeros_like(theta)
                    "*** YOUR CODE HERE ***"
                    R_t = discount * R_tplus1 + r_t
                    pg_theta = get_grad_logp_action(theta, s_t,
                                                    a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns
            collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                "*** YOUR CODE HERE ***"
                if len(all_returns[t]):
                    baselines[t] = np.mean(all_returns[t])
                else:
                    baselines[t] = 0
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:

            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix
                of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of
                entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                "*** YOUR CODE HERE ***"
                # We're approximating the Fisher matrix from sampled grad-log-probs,
                # so we take the mean of the per-sample estimates outer(grad_logp, grad_logp)
                for i in range(len(all_observations)):
                    ob = all_observations[i]
                    action = all_actions[i]
                    grad_logp = get_grad_logp_action(theta, ob, action)
                    F += np.outer(grad_logp, grad_logp)
                F /= len(all_observations)
                # Watch the test output carefully: when this works it will print
                # "Test for __main__.compute_fisher_matrix passed!"; a
                # "Not equal to tolerance rtol=1e-05, atol=0" error points at the next part
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                natural_grad = np.zeros_like(grad)
                "*** YOUR CODE HERE ***"
                I = np.eye(F.shape[0])
                # reg is a scalar, so this is element-wise scaling of the identity (no dot product)
                F_reg = F + reg * I
                F_inv = np.linalg.inv(F_reg)
                # F assumes theta is flattened, so flatten grad to match
                natural_grad = F_inv.dot(grad.flatten())
                # Reshape the result back to the shape of grad
                return natural_grad.reshape(grad.shape)

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                step_size = 0.
                "*** YOUR CODE HERE ***"
                epsilon = natural_step_size
                g = natural_grad.reshape(-1, 1)
                alpha_squared = (2 * epsilon) / np.dot(np.dot(g.T, F), g)
                step_size = np.sqrt(alpha_squared)
                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)

            print(
                "Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                % (itr, np.mean(episode_rewards), ent, perp,
                   np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" %
                  (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
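
The per-sample outer-product average in compute_fisher_matrix can also be written as a single matrix product, which is handy for checking the loop. A short NumPy comparison using synthetic gradients standing in for get_grad_logp_action outputs:

import numpy as np

rng = np.random.RandomState(0)
n, d = 50, 12
grads = rng.randn(n, d)   # stand-ins for flattened grad log-probs, one per (s, a) sample

# Loop version, as in compute_fisher_matrix above
F_loop = np.zeros((d, d))
for g in grads:
    F_loop += np.outer(g, g)
F_loop /= n

# Vectorized version
F_vec = grads.T.dot(grads) / n

print(np.allclose(F_loop, F_vec))  # True
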
Example No. 11
def pg(env, env_maker, policy, baseline, n_envs=mp.cpu_count(), last_iter=-1, n_iters=100, batch_size=1000,
       optimizer=chainer.optimizers.Adam(), discount=0.99, gae_lambda=0.97, snapshot_saver=None):
    """
    This method implements the policy gradient algorithm.
    :param env: An environment instance, which should have the same class as what env_maker.make() returns.
    :param env_maker: An object such that calling env_maker.make() will generate a new environment.
    :param policy: A stochastic policy which we will be optimizing.
    :param baseline: A baseline used for variance reduction and estimating future returns for unfinished trajectories.
    :param n_envs: Number of environments running simultaneously.
    :param last_iter: The index of the last iteration. This is normally -1 when starting afresh, but may be different when
           loaded from a snapshot.
    :param n_iters: The total number of iterations to run.
    :param batch_size: The number of samples used per iteration.
    :param optimizer: A Chainer optimizer instance. By default we use the Adam algorithm with learning rate 1e-3.
    :param discount: Discount factor.
    :param gae_lambda: Lambda parameter used for generalized advantage estimation.
    :param snapshot_saver: An object for saving snapshots.
    """

    if getattr(optimizer, 'target', None) is not policy:
        optimizer.setup(policy)

    logger.info("Starting env pool")
    with EnvPool(env_maker, n_envs=n_envs) as env_pool:
        for iter in range(last_iter + 1, n_iters):
            logger.info("Starting iteration {}".format(iter))
            logger.logkv('Iteration', iter)

            logger.info("Start collecting samples")
            trajs = parallel_collect_samples(env_pool, policy, batch_size)

            logger.info("Computing input variables for policy optimization")
            all_obs, all_acts, all_advs, _ = compute_pg_vars(
                trajs, policy, baseline, discount, gae_lambda
            )

            # Begin policy update

            # Now, you need to implement the computation of the policy gradient
            # The policy gradient is given by -1/T \sum_t \nabla_\theta(log(p_\theta(a_t|s_t))) * A_t
            # Note the negative sign in front, since optimizers are most often minimizing a loss rather than maximizing an objective
            # This is the same as \nabla_\theta(-1/T \sum_t log(p_\theta(a_t|s_t)) * A_t) = \nabla_\theta(L), where L is the surrogate loss term

            logger.info("Computing policy gradient")

            # Methods that may be useful:
            # - `dists.logli(actions)' returns the log probability of the actions under the distribution `dists'.
            #   This method returns a chainer variable.

            dists = policy.compute_dists(all_obs)

            def compute_surr_loss(dists, all_acts, all_advs):
                """
                :param dists: An instance of subclass of Distribution
                :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                :param all_advs: A chainer variable, which should be a vector of size N
                :return: A chainer variable, which should be a scalar
                """
                "*** YOUR CODE HERE ***"
                return -F.mean(dists.logli(all_acts) * all_advs)

            test_once(compute_surr_loss)

            surr_loss = compute_surr_loss(dists, all_acts, all_advs)

            # reset gradients stored in the policy parameters
            policy.cleargrads()
            surr_loss.backward()

            # apply the computed gradient
            optimizer.update()

            # Update baseline
            logger.info("Updating baseline")
            baseline.update(trajs)

            # log statistics
            logger.info("Computing logging information")
            logger.logkv('SurrLoss', surr_loss.data)
            log_action_distribution_statistics(dists)
            log_reward_statistics(env)
            log_baseline_statistics(trajs)
            logger.dumpkvs()

            if snapshot_saver is not None:
                logger.info("Saving snapshot")
                snapshot_saver.save_state(
                    iter,
                    dict(
                        alg=pg,
                        alg_state=dict(
                            env_maker=env_maker,
                            policy=policy,
                            baseline=baseline,
                            n_envs=n_envs,
                            last_iter=iter,
                            n_iters=n_iters,
                            batch_size=batch_size,
                            optimizer=optimizer,
                            discount=discount,
                            gae_lambda=gae_lambda
                        )
                    )
                )
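A minimal invocation sketch for the `pg` routine above. The environment maker, policy, and baseline classes named here (`EnvMaker`, `CategoricalMLPPolicy`, `LinearFeatureBaseline`) are assumed to be provided by the surrounding course codebase and are placeholders, as are the hyperparameter values:

    # Hypothetical setup -- class names and hyperparameter values are illustrative only.
    env_maker = EnvMaker('CartPole-v0')
    env = env_maker.make()
    policy = CategoricalMLPPolicy(observation_space=env.observation_space, action_space=env.action_space)
    baseline = LinearFeatureBaseline(observation_space=env.observation_space)
    pg(env, env_maker, policy, baseline, n_iters=100, batch_size=2000, discount=0.99, gae_lambda=0.97)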
Exemplo n.º 12
0
def main(env_id, batch_size, discount, learning_rate, n_itrs, render, use_baseline, natural, natural_step_size):
    # Check gradient implementation

    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'")

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()
            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):
                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t, b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a 
                    matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    "*** YOUR CODE HERE ***"
                    R_t = discount * R_tplus1 + r_t
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action
                )
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns 
            collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                "*** YOUR CODE HERE ***"
                if len(all_returns[t]) > 0:
                    baselines[t] = np.mean(all_returns[t])
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:
            theta += learning_rate * grad
        else:
            def compute_fisher_matrix(theta, get_grad_logp_action, all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix 
                of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of 
                entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                "*** YOUR CODE HERE ***"
                for i in range(len(all_actions)):
                    grads = get_grad_logp_action(theta, all_observations[i], all_actions[i]).flatten()
                    F += np.outer(grads, grads)
                F /= len(all_actions)
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                "*** YOUR CODE HERE ***"
                F_inv = np.linalg.inv(F + reg * np.eye(*F.shape))
                natural_grad = F_inv.dot(grad.flatten()).reshape(grad.shape)
                return natural_grad

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                "*** YOUR CODE HERE ***"
                natural_grad = natural_grad.flatten()
                step_size = np.sqrt(2*natural_step_size/natural_grad.T.dot(F).dot(natural_grad))

                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(theta=theta, get_grad_logp_action=get_grad_logp_action,
                                      all_observations=all_observations, all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)

            print("Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f" % (
                itr, np.mean(episode_rewards), ent, perp, np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" % (
                itr, np.mean(episode_rewards), np.linalg.norm(theta)))
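A minimal way to drive `main` above might look like the following; all argument values are illustrative placeholders, not values prescribed by the exercise:

    # Hypothetical invocation -- hyperparameter values are placeholders.
    main(env_id='CartPole-v0', batch_size=2000, discount=0.99, learning_rate=0.1,
         n_itrs=100, render=False, use_baseline=True, natural=True, natural_step_size=0.01)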
Exemplo n.º 13
0
def a2c(env, env_maker, policy, vf, joint_model=None, k=20, n_envs=16, discount=0.99,
        optimizer=chainer.optimizers.RMSprop(lr=1e-3), max_grad_norm=1.0, vf_loss_coeff=0.5,
        ent_coeff=0.01, last_epoch=-1, epoch_length=10000, n_epochs=8000, snapshot_saver=None):
    """
    This method implements the (Synchronous) Advantage Actor-Critic algorithm. Rather than having asynchronous
    workers, which can be more efficient due to less coordination but are also less stable and harder to extend /
    debug, we use a pool of environment workers performing simulation, while computing actions and performing
    gradient updates centrally. This also makes it easier to utilize GPUs for neural network computation.
    :param env: An environment instance, which should have the same class as what env_maker.make() returns.
    :param env_maker: An object such that calling env_maker.make() will generate a new environment.
    :param policy: A stochastic policy which we will be optimizing.
    :param vf: A value function which estimates future returns given a state. It can potentially share weights with the
           policy by calling policy.create_vf().
    :param joint_model: The joint model of policy and value function. This is usually automatically computed.
    :param k: Number of simulation steps per environment for each gradient update.
    :param n_envs: Number of environments running simultaneously.
    :param discount: Discount factor.
    :param optimizer: A chainer optimizer instance. By default we use the RMSProp algorithm.
    :param max_grad_norm: If provided, apply gradient clipping with the specified maximum L2 norm.
    :param vf_loss_coeff: Coefficient for the value function loss.
    :param ent_coeff: Coefficient for the entropy bonus (subtracted from the policy loss).
    :param last_epoch: The index of the last epoch. This is normally -1 when starting afresh, but may be different when
           loaded from a snapshot. Since A2C is an online algorithm, an epoch is just an artificial boundary so that
           we record logs after each epoch.
    :param epoch_length: Number of total environment steps per epoch.
    :param n_epochs: Total number of epochs to run the algorithm.
    :param snapshot_saver: An object for saving snapshots.
    """

    # ensures that shared parameters are only counted once
    if joint_model is None:
        joint_model = UniqueChainList(policy, vf)

    if getattr(optimizer, 'target', None) is not joint_model:
        optimizer.setup(joint_model)

    try:
        # remove existing hook if necessary (this should only be needed when restarting experiments)
        optimizer.remove_hook('gradient_clipping')
    except KeyError:
        pass
    if max_grad_norm is not None:
        # Clip L2 norm of gradient, to improve stability
        optimizer.add_hook(chainer.optimizer.GradientClipping(
            threshold=max_grad_norm), 'gradient_clipping')

    epoch = last_epoch + 1
    global_t = epoch * epoch_length

    loggings = defaultdict(list)

    logger.info("Starting env pool")
    with EnvPool(env_maker, n_envs=n_envs) as env_pool:

        gen = samples_generator(env_pool, policy, vf, k)

        logger.info("Starting epoch {}".format(epoch))

        if logger.get_level() <= logger.INFO:
            progbar = tqdm(total=epoch_length)
        else:
            progbar = None

        while global_t < epoch_length * n_epochs:

            # Run k steps in the environment
            # Note:
            # - all_actions, all_values, all_dists, and next_values are chainer variables
            # - all_rewards, all_dones are lists of numpy arrays
            # The first dimension of each variable is time, and the second dimension is the index of the environment
            all_actions, all_rewards, all_dones, all_dists, all_values, next_values = next(
                gen)

            global_t += n_envs * k

            # Compute returns and advantages

            # Size: (k, n_envs)
            all_values = F.stack(all_values)
            all_rewards = np.asarray(all_rewards, dtype=np.float32)
            all_dones = np.asarray(all_dones, dtype=np.float32)

            all_values_data = all_values.data
            next_values_data = next_values.data

            test_once(compute_returns_advantages)

            all_returns, all_advs = compute_returns_advantages(
                all_rewards,
                all_dones,
                all_values_data,
                next_values_data,
                discount
            )

            all_returns = chainer.Variable(all_returns.astype(np.float32))
            all_advs = chainer.Variable(all_advs.astype(np.float32))

            # Concatenate data
            # Size: (k*n_envs,) + action_shape
            all_flat_actions = F.concat(all_actions, axis=0)
            # Size: key -> (k*n_envs,) + dist_shape
            all_flat_dists = {k: F.concat(
                [d[k] for d in all_dists], axis=0) for k in all_dists[0].keys()}
            all_flat_dists = policy.distribution.from_dict(all_flat_dists)

            # Prepare variables needed for gradient computation
            logli = all_flat_dists.logli(all_flat_actions)
            ent = all_flat_dists.entropy()
            # Flatten advantages
            all_advs = F.concat(all_advs, axis=0)

            # Form the loss - you should only need to use the variables provided as input arguments below
            def compute_total_loss(logli, all_advs, ent_coeff, ent, vf_loss_coeff, all_returns, all_values):
                """
                :param logli: A chainer variable, which should be a vector of size N
                :param all_advs: A chainer variable, which should be a vector of size N
                :param ent_coeff: A scalar
                :param ent: A chainer variable, which should be a vector of size N
                :param vf_loss_coeff: A scalar
                :param all_returns: A chainer variable, which should be a vector of size N
                :param all_values: A chainer variable, which should be a vector of size N
                :return: A tuple of (policy_loss, vf_loss, total_loss)
                policy_loss should be the weighted sum of the surrogate loss and the average entropy loss
                vf_loss should be the (unweighted) squared loss of value function prediction.
                total_loss should be the weighted sum of policy_loss and vf_loss
                """
                "*** YOUR CODE HERE ***"
                policy_loss = -F.mean(logli * all_advs) - ent_coeff * F.mean(ent)
                vf_loss = F.mean_squared_error(all_returns, all_values)
                total_loss = policy_loss + vf_loss_coeff * vf_loss

                return policy_loss, vf_loss, total_loss

            test_once(compute_total_loss)

            policy_loss, vf_loss, total_loss = compute_total_loss(
                logli=logli, all_advs=all_advs, ent_coeff=ent_coeff,
                ent=ent, vf_loss_coeff=vf_loss_coeff,
                all_returns=all_returns, all_values=all_values
            )

            joint_model.cleargrads()
            total_loss.backward()
            optimizer.update()

            vf_loss_data = vf_loss.data
            all_returns_data = all_returns.data
            all_flat_dists_data = {
                k: v.data
                for k, v in all_flat_dists.as_dict().items()
            }

            loggings["vf_loss"].append(vf_loss_data)
            loggings["vf_preds"].append(all_values_data)
            loggings["vf_targets"].append(all_returns_data)
            loggings["dists"].append(all_flat_dists_data)

            if progbar is not None:
                progbar.update(k * n_envs)

            # An entire epoch has passed
            if global_t // epoch_length > epoch:
                logger.logkv('Epoch', epoch)
                log_reward_statistics(env)
                all_dists = {
                    k: Variable(np.concatenate([d[k] for d in loggings["dists"]], axis=0))
                    for k in loggings["dists"][0].keys()
                }
                log_action_distribution_statistics(
                    policy.distribution.from_dict(all_dists))
                logger.logkv('|VfPred|', np.mean(np.abs(loggings["vf_preds"])))
                logger.logkv('|VfTarget|', np.mean(
                    np.abs(loggings["vf_targets"])))
                logger.logkv('VfLoss', np.mean(loggings["vf_loss"]))
                logger.dumpkvs()

                if snapshot_saver is not None:
                    logger.info("Saving snapshot")

                    snapshot_saver.save_state(
                        epoch,
                        dict(
                            alg=a2c,
                            alg_state=dict(
                                env_maker=env_maker,
                                policy=policy,
                                vf=vf,
                                joint_model=joint_model,
                                k=k,
                                n_envs=n_envs,
                                discount=discount,
                                last_epoch=epoch,
                                n_epochs=n_epochs,
                                epoch_length=epoch_length,
                                optimizer=optimizer,
                                vf_loss_coeff=vf_loss_coeff,
                                ent_coeff=ent_coeff,
                                max_grad_norm=max_grad_norm,
                            )
                        )
                    )

                # Reset stored logging information
                loggings = defaultdict(list)

                if progbar is not None:
                    progbar.close()

                epoch = global_t // epoch_length

                logger.info("Starting epoch {}".format(epoch))

                if progbar is not None:
                    progbar = tqdm(total=epoch_length)

        if progbar is not None:
            progbar.close()
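For reference, a self-contained sketch of the loss composition used in `compute_total_loss` above, applied to dummy data; the batch size and coefficient values here are assumptions for illustration only:

    import numpy as np
    import chainer.functions as F
    from chainer import Variable

    N = 8  # dummy batch size standing in for k * n_envs
    rng = np.random.RandomState(0)
    logli = Variable(rng.randn(N).astype(np.float32))     # log pi(a|s)
    advs = Variable(rng.randn(N).astype(np.float32))      # advantages
    ent = Variable(rng.rand(N).astype(np.float32))        # per-sample entropies
    returns = Variable(rng.randn(N).astype(np.float32))   # empirical returns
    values = Variable(rng.randn(N).astype(np.float32))    # value predictions

    ent_coeff, vf_loss_coeff = 0.01, 0.5
    policy_loss = -F.mean(logli * advs) - ent_coeff * F.mean(ent)
    vf_loss = F.mean_squared_error(returns, values)
    total_loss = policy_loss + vf_loss_coeff * vf_loss    # scalar chainer variable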