Example #1
# assumed imports for this snippet (baselines-style helpers); an active
# default TF session is also required, e.g. via U.single_threaded_session()
import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.common.mpi_adam import MpiAdam


def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # no update op here: also applying the tf.train.AdamOptimizer step would
    # corrupt the comparison, since MpiAdam applies its own update below
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
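Under the hood, MpiAdam allreduce-averages the flat gradient across MPI workers and then applies a plain Adam step to the flattened parameter vector. A minimal single-process sketch of that update rule (FlatAdam is an illustrative name, not the library's class):

import numpy as np

class FlatAdam:
    """Adam on one flat parameter vector, as MpiAdam applies after averaging g."""
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.m = np.zeros(size, 'float32')  # first-moment estimate
        self.v = np.zeros(size, 'float32')  # second-moment estimate
        self.t = 0
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon

    def step(self, theta, g, stepsize):
        self.t += 1
        # bias-corrected stepsize, as in the Adam paper
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * g
        self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
        return theta - a * self.m / (np.sqrt(self.v) + self.epsilon)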
Example #2
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean() #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    entval = calcent(Mval).mean() #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
    print('ok on', probtype, pdparam)
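The first assertion relies on the identity H(p) = -E_p[log p(X)], estimated by Monte Carlo. A self-contained numpy check of the same test for a diagonal Gaussian (the distribution parameters are arbitrary, for illustration only):

import numpy as np

N = 100000
mean, logstd = np.array([0.5, -1.0]), np.array([0.2, -0.3])
std = np.exp(logstd)

x = mean + std * np.random.randn(N, 2)  # X ~ p
# per-sample log density of the diagonal Gaussian
logp = -0.5 * np.sum(((x - mean) / std) ** 2 + 2 * logstd + np.log(2 * np.pi), axis=1)
ent_mc = -logp.mean()                                  # Monte-Carlo entropy estimate
ent_exact = np.sum(logstd + 0.5 * np.log(2 * np.pi * np.e))  # closed form
stderr = logp.std() / np.sqrt(N)
assert abs(ent_mc - ent_exact) < 3 * stderr            # within 3 sigmas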
Example #3
    def _init(self, ob_space, ac_space, hid_size, feat_size, gaussian_fixed_var=True):

        num_hid_layers = len(hid_size)
        mean_emb = ob_space.dim_mean_embs
        nr_rec_obs = mean_emb[0]  # each agent receives n_agents - 1 observations...
        dim_rec_obs = mean_emb[1]  # ... each of size dim_rec_obs ...
        dim_flat_obs = ob_space.dim_flat_o  # ... plus a local observation

        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        # each row of ob is one agent's flattened observation; the first dimension is None because the same
        # placeholder serves both training and inference, i.e. shape [None, (n_agents - 1) * dim_rec_obs + dim_flat_obs]
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None,) + ob_space.shape)

        flat_obs_input_layer = tf.slice(ob, [0, 0], [-1, nr_rec_obs * dim_rec_obs])  # grab only the part that goes into mean embedding
        flat_feature_input_layer = tf.slice(ob, [0, nr_rec_obs * dim_rec_obs], [-1, dim_flat_obs])  # grab only the local observation

        with tf.variable_scope('vf'):
            with tf.variable_scope('me'):
                me_v = me.MeanEmbedding(flat_obs_input_layer, feat_size, nr_rec_obs, dim_rec_obs)
            last_out = tf.concat([me_v.me_out, flat_feature_input_layer], axis=1)
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                           kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tfc.layers.layer_norm(last_out)
                last_out = tf.nn.relu(last_out)

            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            with tf.variable_scope('me'):
                me_pi = me.MeanEmbedding(flat_obs_input_layer, feat_size, nr_rec_obs, dim_rec_obs)
            last_out = tf.concat([me_pi.me_out, flat_feature_input_layer], axis=1)
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                           kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tfc.layers.layer_norm(last_out)
                last_out = tf.nn.relu(last_out)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
        self._me_v = U.function([ob], [me_v.me_out])
        self._me_pi = U.function([ob], [me_pi.me_out])
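me.MeanEmbedding is project-specific (it takes feat_size plus the neighbor count and per-neighbor dimension), but the underlying idea is standard: embed each received observation with shared weights, then average so the result is invariant to neighbor ordering. A minimal one-layer TF sketch, assuming feat_size is a single layer width:

import tensorflow as tf

def mean_embedding(flat_rec_obs, feat_size, nr_rec_obs, dim_rec_obs):
    """Embed each received observation with a shared dense layer,
    then average over the nr_rec_obs neighbors."""
    # (batch, nr_rec_obs * dim_rec_obs) -> (batch, nr_rec_obs, dim_rec_obs)
    per_agent = tf.reshape(flat_rec_obs, [-1, nr_rec_obs, dim_rec_obs])
    # dense applies to the last axis, so weights are shared across neighbors
    embedded = tf.layers.dense(per_agent, feat_size, activation=tf.nn.relu,
                               name="phi")
    return tf.reduce_mean(embedded, axis=1)  # permutation-invariant pooling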
Example #4
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(0.0),
            name="runningsum", trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq", trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count", trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
            updates=[tf.assign_add(self._sum, newsum),
                     tf.assign_add(self._sumsq, newsumsq),
                     tf.assign_add(self._count, newcount)])
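The mean/std formulas are the usual moment identities, with epsilon seeding sumsq and count so both are well-defined before any data arrives. A plain-numpy mirror of the accumulators (single process; in the full class an update() method typically allreduces the (sum, sumsq, count) increments across workers before calling incfiltparams):

import numpy as np

epsilon = 1e-2
total_sum, total_sumsq, count = 0.0, epsilon, epsilon  # mirrors _sum/_sumsq/_count

for batch in (np.random.randn(64), np.random.randn(64) + 3.0):
    total_sum += batch.sum()               # the increments incfiltparams would add
    total_sumsq += np.square(batch).sum()
    count += batch.size

mean = total_sum / count
std = np.sqrt(max(total_sumsq / count - mean ** 2, 1e-2))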
Example #5
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              feat_size,
              gaussian_fixed_var=True):

        num_hid_layers = len(hid_size)
        neighbor_info = ob_space.dim_rec_o
        nr_rec_obs = neighbor_info[0]
        dim_rec_obs = neighbor_info[1]
        rest = ob_space.dim_flat_o - ob_space.dim_local_o
        dim_flat_obs = ob_space.dim_flat_o

        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=(None, ) + ob_space.shape)

        flat_obs_input_layer_0 = tf.slice(
            ob, [0, 0], [-1, nr_rec_obs * dim_rec_obs])  # received-observation block
        flat_obs_input_layer_1 = tf.slice(
            ob, [0, nr_rec_obs * dim_rec_obs], [-1, rest])  # remaining flat features
        flat_feature_input_layer = tf.slice(
            ob, [0, nr_rec_obs * dim_rec_obs + rest],
            [-1, ob_space.dim_local_o])  # local observation

        with tf.variable_scope('vf'):
            with tf.variable_scope('input_0'):
                input_0_v = tf.layers.dense(
                    flat_obs_input_layer_0,
                    feat_size[0][0],
                    name="fc0",
                    kernel_initializer=U.normc_initializer(1.0))
            with tf.variable_scope('input_1'):
                input_1_v = tf.layers.dense(
                    flat_obs_input_layer_1,
                    feat_size[1][0],
                    name="fc0",
                    kernel_initializer=U.normc_initializer(1.0))
            last_out = tf.concat(
                [input_0_v, input_1_v, flat_feature_input_layer], axis=1)
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(
                    last_out,
                    hid_size[i],
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tfc.layers.layer_norm(last_out)
                last_out = tf.nn.relu(last_out)

            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            with tf.variable_scope('input_0'):
                input_0_pi = tf.layers.dense(
                    flat_obs_input_layer_0,
                    feat_size[0][0],
                    name="fc0",
                    kernel_initializer=U.normc_initializer(1.0))
            with tf.variable_scope('input_1'):
                input_1_pi = tf.layers.dense(
                    flat_obs_input_layer_1,
                    feat_size[1][0],
                    name="fc0",
                    kernel_initializer=U.normc_initializer(1.0))
            last_out = tf.concat(
                [input_0_pi, input_1_pi, flat_feature_input_layer], axis=1)
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(
                    last_out,
                    hid_size[i],
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tfc.layers.layer_norm(last_out)
                last_out = tf.nn.relu(last_out)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
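The three tf.slice calls partition each observation row as [received block | rest | local block]; the offsets must line up exactly with how the environment flattens its observation. A numpy equivalent for one row, with purely illustrative sizes:

import numpy as np

nr_rec_obs, dim_rec_obs, rest, dim_local = 3, 4, 5, 2
ob_row = np.arange(nr_rec_obs * dim_rec_obs + rest + dim_local, dtype=np.float32)

rec_block = ob_row[:nr_rec_obs * dim_rec_obs]                                  # input 0
rest_block = ob_row[nr_rec_obs * dim_rec_obs:nr_rec_obs * dim_rec_obs + rest]  # input 1
local_block = ob_row[nr_rec_obs * dim_rec_obs + rest:]                         # local features
assert local_block.size == dim_local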
Example #6
def learn(env, policy_fn, *,
        timesteps_per_batch, # what to train on
        max_kl, cg_iters,
        gamma, lam, # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
        callback=None
        ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    var_list.extend([v for v in all_var_list if v.name.split("/")[1].startswith("me")])
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    act_params = {
        'name': "pi",
        'ob_space': ob_space,
        'ac_space': ac_space,
    }

    pi = ActWrapper(pi, act_params)

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************"%iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob = np.concatenate([s['ob'] for s in seg], axis=0)
        ac = np.concatenate([s['ac'] for s in seg], axis=0)
        atarg = np.concatenate([s['adv'] for s in seg], axis=0)
        tdlamret = np.concatenate([s['tdlamret'] for s in seg], axis=0)
        vpredbefore = np.concatenate([s["vpred"] for s in seg], axis=0) # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

        # if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        # if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        args = ob, ac, atarg
        fvpargs = [arr[::5] for arr in args]
        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new() # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0)
            assert np.isfinite(stepdir).all()
            shs = .5*stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((ob, tdlamret),
                                                         include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        # lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        lrlocal = (seg[0]["ep_lens"], seg[0]["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
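The cg call in the loop solves F x = g for the step direction using only Fisher-vector products, never forming the Fisher matrix itself. A standard conjugate-gradient sketch of what such a solver does (baselines' cg additionally handles verbose logging):

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b given only the matrix-vector product f_Ax
    (here, the damped Fisher-vector product)."""
    x = np.zeros_like(b)
    r = b.copy()  # residual b - A x (x starts at 0)
    p = r.copy()  # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x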
Example #7
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              feat_size,
              gaussian_fixed_var=True):

        num_hid_layers = len(hid_size)
        n_mean_embs = len(ob_space.dim_mean_embs)
        mean_emb_0 = ob_space.dim_mean_embs[0]
        mean_emb_1 = ob_space.dim_mean_embs[1]
        nr_obs_0 = mean_emb_0[0]
        dim_obs_0 = mean_emb_0[1]

        nr_obs_1 = mean_emb_1[0]
        dim_obs_1 = mean_emb_1[1]

        dim_flat_obs = ob_space.dim_flat_o

        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=(None, ) + ob_space.shape)

        mean_emb_0_input_layer = tf.slice(ob, [0, 0],
                                          [-1, nr_obs_0 * dim_obs_0])
        mean_emb_1_input_layer = tf.slice(ob, [0, nr_obs_0 * dim_obs_0],
                                          [-1, nr_obs_1 * dim_obs_1])
        flat_feature_input_layer = tf.slice(
            ob, [0, nr_obs_0 * dim_obs_0 + nr_obs_1 * dim_obs_1],
            [-1, dim_flat_obs])

        with tf.variable_scope('vf'):
            with tf.variable_scope('me_rec'):
                me_v_rec = me.MeanEmbedding(mean_emb_0_input_layer,
                                            feat_size[0], nr_obs_0, dim_obs_0)
            with tf.variable_scope('me_local'):
                me_v_local = me.MeanEmbedding(mean_emb_1_input_layer,
                                              feat_size[1], nr_obs_1,
                                              dim_obs_1)
            last_out = tf.concat(
                [me_v_rec.me_out, me_v_local.me_out, flat_feature_input_layer],
                axis=1)
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(
                    last_out,
                    hid_size[i],
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tfc.layers.layer_norm(last_out)
                last_out = tf.nn.relu(last_out)

            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            with tf.variable_scope('me_rec'):
                me_pi_rec = me.MeanEmbedding(mean_emb_0_input_layer,
                                             feat_size[0], nr_obs_0, dim_obs_0)
            with tf.variable_scope('me_local'):
                me_pi_local = me.MeanEmbedding(mean_emb_1_input_layer,
                                               feat_size[1], nr_obs_1,
                                               dim_obs_1)
            last_out = tf.concat(
                [me_pi_rec.me_out, me_pi_local.me_out, flat_feature_input_layer],
                axis=1)
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(
                    last_out,
                    hid_size[i],
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tfc.layers.layer_norm(last_out)
                last_out = tf.nn.relu(last_out)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
        # expose the embedding outputs; the original passed the `me` module itself, which is not a tensor
        self._me = U.function([ob], [me_pi_rec.me_out, me_pi_local.me_out])
Example #8
    def __init__(
            self,
            env,
            policy_fn,
            *,
            timesteps_per_batch,  # what to train on
            max_kl,
            cg_iters,
            gamma,
            lam,  # advantage estimation
            entcoeff=0.0,
            cg_damping=1e-2,
            vf_stepsize=3e-4,
            vf_iters=3,
            max_timesteps=0,
            max_episodes=0,
            max_iters=0,  # time constraint
            callback=None,
            max_path_length=None):

        self.gamma = gamma
        self.gae_lambda = lam
        self.max_kl = max_kl
        self.cg_iters = cg_iters
        self.cg_damping = cg_damping
        self.vf_stepsize = vf_stepsize
        self.vf_iters = vf_iters
        self.time_steps_per_batch = timesteps_per_batch
        if max_path_length is None:
            self.max_path_length = timesteps_per_batch
        else:
            self.max_path_length = max_path_length

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)
        # Setup losses and stuff
        # ----------------------------------------
        ob_space = env.observation_space
        ac_space = env.action_space
        pi = policy_fn("pi", ob_space, ac_space)
        oldpi = policy_fn("oldpi", ob_space, ac_space)
        atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        ret = tf.placeholder(dtype=tf.float32,
                             shape=[None])  # Empirical return
        # n_size = tf.placeholder(dtype=tf.float32, shape=[None])  # neighborhood size

        ob = U.get_placeholder_cached(name="ob")
        ac = pi.pdtype.sample_placeholder([None])

        kloldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        entbonus = entcoeff * meanent

        vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
        # pred_n_error = tf.reduce_mean(tf.square(pi.predict_n - n_size))

        ratio = tf.exp(pi.pd.logp(ac) -
                       oldpi.pd.logp(ac))  # advantage * pnew / pold
        surrgain = tf.reduce_mean(ratio * atarg)

        optimgain = surrgain + entbonus  # - pred_n_error
        losses = [optimgain, meankl, entbonus, surrgain, meanent,
                  vferr]  # , pred_n_error]
        self.loss_names = [
            "optimgain", "meankl", "entloss", "surrgain", "entropy", "vf_loss"
        ]  # , "pred_n_error"]

        dist = meankl

        all_var_list = pi.get_trainable_variables()
        var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("pol")
        ]
        var_list.extend(
            [v for v in all_var_list if v.name.split("/")[1].startswith("me")])
        vf_var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("vf")
        ]
        # vf_var_list.extend([v for v in all_var_list if v.name.split("/")[1].startswith("me")])
        self.vfadam = MpiAdam(vf_var_list)

        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)
        klgrads = tf.gradients(dist, var_list)
        flat_tangent = tf.placeholder(dtype=tf.float32,
                                      shape=[None],
                                      name="flat_tan")
        shapes = [var.get_shape().as_list() for var in var_list]
        start = 0
        tangents = []
        for shape in shapes:
            sz = U.intprod(shape)
            tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
            start += sz
        gvp = tf.add_n([
            tf.reduce_sum(g * tangent)
            for (g, tangent) in zipsame(klgrads, tangents)
        ])  # pylint: disable=E1111
        fvp = U.flatgrad(gvp, var_list)

        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (
                    oldv,
                    newv) in zipsame(oldpi.get_variables(), pi.get_variables())
            ])
        self.compute_losses = U.function([ob, ac, atarg, ret], losses)
        self.compute_lossandgrad = U.function(
            [ob, ac, atarg, ret], losses + [U.flatgrad(optimgain, var_list)])
        self.compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
        self.compute_vflossandgrad = U.function([ob, ret],
                                                U.flatgrad(vferr, vf_var_list))

        act_params = {
            'name': "pi",
            'ob_space': ob_space,
            'ac_space': ac_space,
        }

        self.pi = ActWrapper(pi, act_params)

        U.initialize()
        th_init = self.get_flat()
        MPI.COMM_WORLD.Bcast(th_init, root=0)
        self.set_from_flat(th_init)
        self.vfadam.sync()
        print("Init param sum", th_init.sum(), flush=True)

        # self.seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)
        if self.time_steps_per_batch > self.max_path_length:
            self.nr_traj_seg_gens = int(self.time_steps_per_batch /
                                        self.max_path_length)
            self.seg_gen = [
                copy_func(traj_segment_generator,
                          "traj_seg_gen_{}".format(i))(pi,
                                                       env,
                                                       timesteps_per_batch,
                                                       stochastic=True)
                for i in range(self.nr_traj_seg_gens)
            ]
        else:
            self.nr_traj_seg_gens = 1
            self.seg_gen = [
                traj_segment_generator(pi,
                                       env,
                                       self.time_steps_per_batch,
                                       stochastic=True)
            ]
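The klgrads/gvp/fvp block in this constructor is the standard double-backprop trick for Hessian-vector products: differentiate KL once, dot the gradient with a tangent vector, and differentiate the resulting scalar again. A condensed sketch of the same construction (assumes every variable in var_list influences kl, so no gradient is None):

import numpy as np
import tensorflow as tf

def fisher_vector_product_op(kl, var_list, flat_tangent):
    """Build F v via double backprop: F v = grad((grad KL) . v)."""
    grads = tf.gradients(kl, var_list)  # first derivatives of the scalar KL
    tangents, start = [], 0
    for var in var_list:
        sz = int(np.prod(var.get_shape().as_list()))
        tangents.append(tf.reshape(flat_tangent[start:start + sz], var.get_shape()))
        start += sz
    # scalar gradient-vector product, then differentiate it a second time
    gvp = tf.add_n([tf.reduce_sum(g * t) for g, t in zip(grads, tangents)])
    return tf.concat([tf.reshape(g, [-1]) for g in tf.gradients(gvp, var_list)], axis=0)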
Example #9
    def _init(self, ob_space, ac_space, hid_size, gaussian_fixed_var=True):

        num_hid_layers = len(hid_size)

        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # with tf.variable_scope("retfilter"):
        #     self.ret_rms = RunningMeanStd(shape=1)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            # last_out = obz
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(
                    last_out,
                    hid_size[i],
                    name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tc.layers.layer_norm(last_out,
                                                    center=True,
                                                    scale=True)
                last_out = tf.nn.relu(last_out)
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            # last_out = obz
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(
                    last_out,
                    hid_size[i],
                    name='fc%i' % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                if self.layer_norm:
                    last_out = tc.layers.layer_norm(last_out,
                                                    center=True,
                                                    scale=True)
                last_out = tf.nn.relu(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
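With gaussian_fixed_var=True, pdparam concatenates a state-dependent mean with a state-independent logstd, and pd is a diagonal Gaussian. What pd.sample() and pd.mode() reduce to for one parameter row, in numpy form (diag_gaussian_act is an illustrative helper, not part of the library):

import numpy as np

def diag_gaussian_act(pdparam_row, stochastic=True):
    """Sample (or take the mode of) the diagonal Gaussian one pdparam row defines."""
    mean, logstd = np.split(pdparam_row, 2)
    if not stochastic:
        return mean                                              # pd.mode()
    return mean + np.exp(logstd) * np.random.randn(*mean.shape)  # pd.sample()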