Example #1
def setup_critic_optimizer(self):
    normalized_critic_target_tf = tf.clip_by_value(
        normalize(self.critic_target, self.ret_rms), self.return_range[0],
        self.return_range[1])
    self.critic_loss = tf.reduce_mean(
        tf.square(self.normalized_critic_tf -
                  normalized_critic_target_tf))  # Q predicted for the actions actually taken
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [
            var for var in self.critic.trainable_vars
            if 'kernel' in var.name and 'output' not in var.name
        ]
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),  # L2 weight regularization on the critic kernels
            weights_list=critic_reg_vars)
        self.critic_loss += critic_reg
    critic_shapes = [
        var.get_shape().as_list() for var in self.critic.trainable_vars
    ]  # shape of each trainable variable
    critic_nb_params = sum([
        reduce(lambda x, y: x * y, shape) for shape in critic_shapes
    ])  # total number of parameters
    self.critic_grads = U.flatgrad(self.critic_loss,
                                   self.critic.trainable_vars,
                                   clip_norm=self.clip_norm)  # flattened, optionally clipped gradient
    self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                    beta1=0.9,
                                    beta2=0.999,
                                    epsilon=1e-08)
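The gradient tensor and the MpiAdam instance built above are only wired together later, in the training step. A minimal sketch of that step, assuming surrounding DDPG attributes such as self.sess, self.critic_lr and a sampled batch dict (names taken from Example #7, not part of this snippet):

def train_critic_step(self, batch, target_Q):
    # Run the flat gradient and the loss for the sampled batch.
    critic_grads, critic_loss = self.sess.run(
        [self.critic_grads, self.critic_loss],
        feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
    # MpiAdam averages the flat gradient across MPI workers, then applies a local Adam step.
    self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
    return critic_loss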
Example #2
class DDPGMPI(DDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # overwrite train ops
        self.q_grads = flatgrad(self.q_loss, self.q.trainable_vars)
        self.policy_grads = flatgrad(self.policy_loss,
                                     self.policy.trainable_vars)
        world_size = MPI.COMM_WORLD.Get_size()
        self.q_optimizer = MpiAdam(var_list=self.q.trainable_vars,
                                   scale_grad_by_procs=False)
        self.policy_optimizer = MpiAdam(var_list=self.policy.trainable_vars,
                                        scale_grad_by_procs=False)
        self.update_global_step = tf.assign_add(
            tf.train.get_or_create_global_step(), world_size)
        self.train_ops = [
            self.policy_grads, self.policy_loss, self.q_grads, self.q_loss,
            self.update_global_step
        ]

    def initialize(self, sess):
        super().initialize(sess)
        self.policy_optimizer.sync()
        self.q_optimizer.sync()
        self.sess.run(self.target_init_ops)

    def get_actions(self, obs):
        actions = self.sess.run(self.actions,
                                {self.obs_ph: np.atleast_2d(obs)})
        if self.expl_noise != 0:
            actions = actions + np.random.normal(
                0, self.expl_noise, (self.n_sample_actions, *actions.shape))
            actions = np.clip(actions, self.min_action, self.max_action)
        return actions

    def get_action_value(self, obs, actions):
        return self.sess.run(self.q_value, {
            self.obs_ph: obs,
            self.actions_ph: actions
        })

    def train(self, batch):
        policy_grads, _, q_grads, _, _ = self.sess.run(self.train_ops,
                                                       feed_dict={
                                                           self.obs_ph:
                                                           batch.obs,
                                                           self.actions_ph:
                                                           batch.actions,
                                                           self.rewards_ph:
                                                           batch.rewards,
                                                           self.dones_ph:
                                                           batch.dones,
                                                           self.next_obs_ph:
                                                           batch.next_obs
                                                       })
        self.policy_optimizer.update(policy_grads, stepsize=self.policy_lr)
        self.q_optimizer.update(q_grads, stepsize=self.q_lr)
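Both optimizers above consume the output of flatgrad, which concatenates the per-variable gradients into one flat vector so that a single buffer can be allreduced. A rough sketch of what it computes, modeled on baselines' tf_util.flatgrad (treat it as an illustration, not the exact library code):

import numpy as np
import tensorflow as tf

def flatgrad_sketch(loss, var_list, clip_norm=None):
    # One gradient tensor per variable (None if the variable is unused by the loss).
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        grads = [tf.clip_by_norm(g, clip_norm) if g is not None else None for g in grads]
    # Flatten each gradient and concatenate everything into a single 1-D tensor.
    return tf.concat(axis=0, values=[
        tf.reshape(g if g is not None else tf.zeros_like(v),
                   [int(np.prod(v.get_shape().as_list()))])
        for (v, g) in zip(var_list, grads)
    ])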
Example #3
def setup_actor_optimizer(self):
    self.actor_loss = -tf.reduce_mean(
        self.critic_with_actor_tf)  # maximize the critic's Q for the actor's own actions
    actor_shapes = [
        var.get_shape().as_list() for var in self.actor.trainable_vars
    ]  # shape of each trainable variable
    actor_nb_params = sum([
        reduce(lambda x, y: x * y, shape) for shape in actor_shapes
    ])  # total number of parameters
    self.actor_grads = U.flatgrad(self.actor_loss,
                                  self.actor.trainable_vars,
                                  clip_norm=self.clip_norm)  # flattened, optionally clipped gradient
    self.actor_optimizer = MpiAdam(
        var_list=self.actor.trainable_vars,  # MPI-synchronized Adam over the actor variables
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08)
Example #4
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # overwrite train ops
        self.q_grads = flatgrad(self.q_loss, self.q.trainable_vars)
        self.policy_grads = flatgrad(self.policy_loss,
                                     self.policy.trainable_vars)
        world_size = MPI.COMM_WORLD.Get_size()
        self.q_optimizer = MpiAdam(var_list=self.q.trainable_vars,
                                   scale_grad_by_procs=False)
        self.policy_optimizer = MpiAdam(var_list=self.policy.trainable_vars,
                                        scale_grad_by_procs=False)
        self.update_global_step = tf.assign_add(
            tf.train.get_or_create_global_step(), world_size)
        self.train_ops = [
            self.policy_grads, self.policy_loss, self.q_grads, self.q_loss,
            self.update_global_step
        ]
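MpiAdam itself is what keeps the workers in lockstep: each process computes its own flat gradient, update() allreduces (and, by default, averages) it across MPI ranks before applying an Adam step locally, and sync() broadcasts rank 0's parameters at startup. A simplified sketch of the update rule under those assumptions (not the library implementation, which also writes the step back into the flattened variables):

import numpy as np
from mpi4py import MPI

class MpiAdamSketch:
    """Illustrative sketch of the MpiAdam update; not the library code."""
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 scale_grad_by_procs=True, comm=MPI.COMM_WORLD):
        self.m = np.zeros(size, 'float64')  # first-moment estimate
        self.v = np.zeros(size, 'float64')  # second-moment estimate
        self.t = 0
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon
        self.scale_grad_by_procs = scale_grad_by_procs
        self.comm = comm

    def update(self, localg, stepsize):
        localg = localg.astype('float64')
        globalg = np.zeros_like(localg)
        self.comm.Allreduce(localg, globalg, op=MPI.SUM)  # sum gradients over workers
        if self.scale_grad_by_procs:
            globalg /= self.comm.Get_size()               # average instead of sum
        self.t += 1
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
        return step  # the real optimizer adds this step to the flattened parameters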
Example #5
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    U.load_state("save/Humanoid-v1")

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        #if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        U.save_state("save/Humanoid-v1")
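add_vtarg_and_adv(seg, gamma, lam) fills seg["adv"] with GAE(lambda) advantages and seg["tdlamret"] with the matching value targets, which is what the Dataset above is built from. A sketch close to the usual baselines implementation; the segment field names "new", "rew", "vpred" and "nextvpred" are assumptions about what traj_segment_generator emits:

import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    """GAE(lambda) advantage estimation, sketched after the baselines version."""
    new = np.append(seg["new"], 0)                     # 1 marks the first step of a new episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # value of the state after the last step
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]        # lambda-return value targets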
Example #6
def learn(env,
          policy_func,
          reward_giver,
          reward_guidance,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          algo,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=1e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          loss_percent=0.0,
          callback=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = build_policy(env, 'mlp', value_network='copy')

    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope('oldpi'):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables('pi')
    # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    guidance_adam = MpiAdam(reward_guidance.get_trainable_variables())

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables('oldpi'), get_variables('pi'))
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    guidance_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     reward_guidance,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     algo=algo,
                                     loss_percent=loss_percent)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # global flag_render
        # if iters_so_far > 0 and iters_so_far % 10 ==0:
        #     flag_render = True
        # else:
        #     flag_render = False

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            print('rewards', seg['rew'])
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(
            batch_size=len(ob))
        batch_size = 128
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Discriminator"):
            for (ob_batch, ac_batch) in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)
                # update running mean/std for reward_giver
                if hasattr(reward_giver, "obs_rms"):
                    reward_giver.obs_rms.update(
                        np.concatenate((ob_batch, ob_expert), 0))
                *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert)
                d_adam.update(allmean(g), d_stepsize)
                d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ------------------ Update Guidance ------------
        logger.log("Optimizing Guidance...")

        logger.log(fmt_row(13, reward_guidance.loss_name))
        batch_size = 128
        guidance_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Guidance"):
            for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)

                idx_condition = process_expert(ob_expert, ac_expert)
                pick_idx = (idx_condition >= loss_percent)
                # pick_idx = idx_condition

                ob_expert_p = ob_expert[pick_idx]
                ac_expert_p = ac_expert[pick_idx]

                ac_batch_p = []
                for each_ob in ob_expert_p:
                    tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True)
                    ac_batch_p.append(tmp_ac)

                # update running mean/std for reward_giver
                if hasattr(reward_guidance, "obs_rms"):
                    reward_guidance.obs_rms.update(ob_expert_p)
                # reward_guidance.train(expert_s=ob_batch_p, agent_a=ac_batch_p, expert_a=ac_expert_p)
                *newlosses, g = reward_guidance.lossandgrad(
                    ob_expert_p, ac_batch_p, ac_expert_p)
                guidance_adam.update(allmean(g), d_stepsize)
                guidance_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(guidance_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens) * g_step
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
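The cg call inside the policy update solves F x = g approximately with conjugate gradients, using only the Fisher-vector products supplied by fisher_vector_product. A minimal sketch of that routine (the library version also supports verbose logging):

import numpy as np

def cg_sketch(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Plain conjugate gradient for A x = b, given only the map x -> A x."""
    p = b.copy()           # search direction
    r = b.copy()           # residual b - A x (x starts at zero)
    x = np.zeros_like(b)
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        newrdotr = r.dot(r)
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x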
Example #7
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=False,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')  # current observation
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')  # next observation after the transition
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(
                    shape=observation_shape)  # running mean/std used to normalize observations
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(
            normalize(self.obs0, self.obs_rms),  # normalize with the running observation statistics
            self.observation_range[0],
            self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(
            normalize(self.obs1, self.obs_rms),  # clip to the observation range
            self.observation_range[0],
            self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)  # action output by the actor network
        self.normalized_critic_tf = critic(normalized_obs0,
                                           self.actions)  # Q for the actions actually taken
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(
            normalized_obs0, self.actor_tf, reuse=True)  # Q for the actor's own action
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)  # target critic evaluated at the target actor's action
        self.target_Q = self.rewards + (
            1. - self.terminals1
        ) * gamma * Q_obs1  # target Q; terminals1 is gym's done flag (1. if the episode ended, else 0.)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:  # Pop-Art return rescaling (see setup_popart)
            self.setup_popart()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)  # perturb the parameters, not the actions
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(
            normalized_obs0)  # action output by the perturbed actor
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor,
            self.param_noise_stddev)  # ops that apply the perturbation to the copied actor's parameters
        # Configure separate copy for stddev adoption.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(
                tf.square(self.actor_tf -
                          adaptive_actor_tf)))  # RMS distance between unperturbed and perturbed actions

    def setup_actor_optimizer(self):
        self.actor_loss = -tf.reduce_mean(
            self.critic_with_actor_tf)  # maximize the critic's Q for the actor's own actions
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]  # shape of each trainable variable
        actor_nb_params = sum([
            reduce(lambda x, y: x * y, shape) for shape in actor_shapes
        ])  # total number of parameters
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)  # flattened, optionally clipped gradient
        self.actor_optimizer = MpiAdam(
            var_list=self.actor.trainable_vars,  # MPI-synchronized Adam over the actor variables
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)

    def setup_critic_optimizer(self):
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf -
                      normalized_critic_target_tf))  # Q predicted for the actions actually taken
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),  # L2 weight regularization on the critic kernels
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]  # shape of each trainable variable
        critic_nb_params = sum([
            reduce(lambda x, y: x * y, shape) for shape in critic_shapes
        ])  # total number of parameters
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)  # flattened, optionally clipped gradient
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):  # Pop-Art rescaling of the critic outputs
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev: 5,
                          })
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })
        else:
            target_Q = self.sess.run(self.target_Q,
                                     feed_dict={
                                         self.obs1:
                                         batch['obs1'],
                                         self.rewards:
                                         batch['rewards'],
                                         self.terminals1:
                                         batch['terminals1'].astype('float32'),
                                     })

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init_updates)  # initialize target networks to match the online networks

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev: 5,
                          })
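setup_target_network_updates relies on get_target_updates, which pairs each online variable with its target counterpart and returns a hard-copy op for initialization plus a Polyak (soft) update op weighted by tau. A sketch under the assumption that the two variable lists line up one to one (the real helper also asserts that):

import tensorflow as tf

def get_target_updates_sketch(vars, target_vars, tau):
    """Sketch of the init/soft target-update ops used by DDPG."""
    init_updates = []
    soft_updates = []
    for var, target_var in zip(vars, target_vars):
        init_updates.append(tf.assign(target_var, var))  # hard copy at initialization
        soft_updates.append(tf.assign(target_var,
                                      (1. - tau) * target_var + tau * var))  # Polyak averaging
    return tf.group(*init_updates), tf.group(*soft_updates)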