Example No. 1
    def train(self, seg, optim_batchsize, optim_epochs):
        cur_lrmult = 1.0
        add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob, unnorm_ac, atarg, tdlamret = seg["ob"], seg["unnorm_ac"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # standardized advantage function estimate
        atarg = (atarg - atarg.mean()) / atarg.std()

        d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not self.pi.recurrent)

        if hasattr(self.pi, "ob_rms"):
            self.pi.update_obs_rms(ob)  # update running mean/std for policy
        # set old parameter values to new parameter values
        self.assign_old_eq_new()
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            # list of tuples, each of which gives the losses for a minibatch
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                      batch["vtarg"], cur_lrmult,
                                      *self.fix_ob2feed(batch["ob"]))
                new_losses, g = lg[:-1], lg[-1]
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(new_losses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult,
                                            *self.fix_ob2feed(batch["ob"]))
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        return meanlosses
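
Both train() methods on this page (Example No. 1 and Example No. 5) and the learn() loop in Example No. 4 call add_vtarg_and_adv(seg, gamma, lam) to fill seg["adv"] and seg["tdlamret"] before building the Dataset. The helper itself is not shown here; below is a minimal single-value-head sketch in the style of OpenAI Baselines' GAE(lambda), assuming the segment also carries "rew", "vpred", "new", and "nextvpred" (Example No. 5's two-head variant fills the _ext/_int fields analogously):

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda) advantages and TD(lambda) value targets for one segment.
    new = np.append(seg["new"], 0)  # episode-start flags, padded by one step
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
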
Example No. 2
def sample_trajectory(load_model_path, max_sample_traj, traj_gen, task_name,
                      sample_stochastic):

    assert load_model_path is not None
    U.load_state(load_model_path)
    sample_trajs = []
    for iters_so_far in range(max_sample_traj):
        logger.log2("********** Iteration %i ************" % iters_so_far)
        traj = next(traj_gen)
        ob, new, ep_ret, ac, rew, ep_len = traj['ob'], traj['new'], traj[
            'ep_ret'], traj['ac'], traj['rew'], traj['ep_len']
        logger.record_tabular("ep_ret", ep_ret)
        logger.record_tabular("ep_len", ep_len)
        logger.record_tabular("immediate reward", np.mean(rew))
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        traj_data = {"ob": ob, "ac": ac, "rew": rew, "ep_ret": ep_ret}
        sample_trajs.append(traj_data)

    sample_ep_rets = [traj["ep_ret"] for traj in sample_trajs]
    logger.log2("Average total return: %f" %
                (sum(sample_ep_rets) / len(sample_ep_rets)))
Example No. 3
timesteps_so_far = 0
iters_so_far = 0
lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
distbuffer = deque(maxlen=100)
tstart = time.time()
writer = U.FileWriter(tensorboard_dir)
loss_stats = stats(["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"])
ep_stats = stats(["Reward", "Episode_Length", "Episode_This_Iter", "Distance"])

while timesteps_so_far < args.max_timesteps:
    # Save model
    if iters_so_far % args.save_per_iter == 0 and iters_so_far > 0 and ckpt_dir is not None:
        U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

    logger.log2("********** Iteration %i ************" % iters_so_far)

    seg = next(seg_gen)
    losses = policy.train(seg, args.optim_batchsize, args.optim_epochs)

    lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_dists"])  # local values
    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
    lens, rews, dists = map(flatten_lists, zip(*listoflrpairs))

    lenbuffer.extend(lens)
    rewbuffer.extend(rews)
    distbuffer.extend(dists)
    logger.record_tabular("eplenmean", np.mean(lenbuffer))
    logger.record_tabular("eprewmean", np.mean(rewbuffer))
    logger.record_tabular("epthisiter", len(lens))
    logger.record_tabular("epdistmean", np.mean(distbuffer))
Example No. 4
def learn(
        env,
        policy_fn,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=50,
        max_sample_traj=10,
        ckpt_dir=None,
        log_dir=None,
        task_name="origin",
        sample_stochastic=True,
        load_model_path=None,
        task="train"):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -tf.reduce_mean(
        tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    writer = U.FileWriter(log_dir)
    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_actorbatch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    loss_stats = stats(loss_names)
    ep_stats = stats(["Reward", "Episode_Length", "Episode_This_Iter"])
    if task == "sample_trajectory":
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log2("********** Iteration %i ************" % iters_so_far)

        seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # standardized advantage function estimate
        atarg = (atarg - atarg.mean()) / atarg.std()
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            # list of tuples, each of which gives the losses for a minibatch
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                lg = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                 batch["vtarg"], cur_lrmult)
                newlosses = lg[:-1]
                g = lg[-1]
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
            loss_stats.add_all_summary(writer, meanlosses, iters_so_far)
            ep_stats.add_all_summary(
                writer, [np.mean(rewbuffer),
                         np.mean(lenbuffer),
                         len(lens)], iters_so_far)

    return pi
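
For orientation, here is a hypothetical call to learn() in the spirit of Baselines' PPO1. The MlpPolicy import path, the environment id, and the hyperparameter values below are illustrative assumptions, not something defined by the example above:

import gym
from baselines.ppo1.mlp_policy import MlpPolicy  # assumed policy class

def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=64, num_hid_layers=2)

env = gym.make("Hopper-v2")  # illustrative environment
pi = learn(env, policy_fn,
           timesteps_per_actorbatch=2048,
           clip_param=0.2, entcoeff=0.0,
           optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
           gamma=0.99, lam=0.95,
           max_timesteps=int(1e6),
           schedule='linear',
           ckpt_dir="./checkpoints", log_dir="./logs",
           task_name="hopper_ppo")
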
Example No. 5
    def train(self, seg, optim_batchsize, optim_epochs):
        # normalize the intrinsic reward
        rffs_int = np.array(
            [self.rff_int.update(rew) for rew in seg["rew_int"]])
        self.rff_rms_int.update(rffs_int.ravel())
        seg["rew_int"] = seg["rew_int"] / np.sqrt(self.rff_rms_int.var)

        cur_lrmult = 1.0
        add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob, unnorm_ac = seg["ob"], seg["unnorm_ac"]
        atarg_ext, tdlamret_ext = seg["adv_ext"], seg["tdlamret_ext"]
        atarg_int, tdlamret_int = seg["adv_int"], seg["tdlamret_int"]
        # predicted value functions before update
        vpredbefore_ext, vpredbefore_int = seg["vpred_ext"], seg["vpred_int"]
        # standardized advantage function estimates
        atarg_ext = (atarg_ext - atarg_ext.mean()) / atarg_ext.std()
        atarg_int = (atarg_int - atarg_int.mean()) / atarg_int.std()
        atarg = self.int_coeff * atarg_int + self.ext_coeff * atarg_ext

        d = Dataset(dict(ob=ob,
                         ac=unnorm_ac,
                         atarg=atarg,
                         vtarg_ext=tdlamret_ext,
                         vtarg_int=tdlamret_int),
                    shuffle=not self.pi.recurrent)

        if hasattr(self.pi, "ob_rms"):
            self.pi.update_obs_rms(ob)  # update running mean/std for policy
        if hasattr(self.int_rew, "ob_rms"):
            # update running mean/std for int_rew
            self.int_rew.update_obs_rms(ob)
        # set old parameter values to new parameter values
        self.assign_old_eq_new()
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            # list of tuples, each of which gives the losses for a minibatch
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                      batch["vtarg_ext"], batch["vtarg_int"],
                                      cur_lrmult, *zip(*batch["ob"].tolist()))
                new_losses, g = lg[:-1], lg[-1]
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(new_losses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                            batch["vtarg_ext"],
                                            batch["vtarg_int"], cur_lrmult,
                                            *zip(*batch["ob"].tolist()))
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular(
            "ev_tdlam_ext_before",
            explained_variance(vpredbefore_ext, tdlamret_ext))
        return meanlosses
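
The rff_int / rff_rms_int pair used to normalize the intrinsic reward at the top of this method is not defined on this page. In RND-style code it is typically a discounted reward filter feeding a running mean/std tracker, roughly:

class RewardForwardFilter(object):
    # keeps a discounted running sum of rewards, one entry per environment
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

# self.rff_rms_int would then be a running mean/std estimator (for example
# baselines.common.running_mean_std.RunningMeanStd), whose variance is used
# to rescale seg["rew_int"] before the advantages are computed.
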