Example #1
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean() #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    entval = calcent(Mval).mean() #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
    print('ok on', probtype, pdparam)
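For reference, the two identities this test Monte-Carlo-checks can be verified in closed form for a 1-D Gaussian. The sketch below is illustrative only and uses scipy, which the test itself does not depend on:

import numpy as np
from scipy.stats import norm

# H(p) = -E_p[log p(x)]  and  KL(p||q) = -H(p) - E_p[log q(x)]
rng = np.random.RandomState(0)
p, q = norm(0.0, 1.0), norm(0.3, 1.2)
x = p.rvs(size=100000, random_state=rng)
print(-p.logpdf(x).mean(), p.entropy())             # Monte-Carlo entropy vs exact entropy
kl_mc = -p.entropy() - q.logpdf(x).mean()           # the estimator the test uses
kl_exact = np.log(1.2) + (1.0 + 0.3 ** 2) / (2 * 1.2 ** 2) - 0.5
print(kl_mc, kl_exact)                              # both ~ 0.061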
Example #2
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)],
                             updates=[update_op])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
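As a point of comparison, the update that MpiAdam applies to the flat gradient vector is plain Adam. A minimal numpy sketch follows; it is not the baselines implementation, and the MPI gradient averaging is omitted:

import numpy as np

class FlatAdamSketch:
    """Adam on a single flat parameter/gradient vector; illustrative only."""
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.m = np.zeros(size, 'float32')   # first-moment estimate
        self.v = np.zeros(size, 'float32')   # second-moment estimate
        self.t = 0
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon

    def step(self, g, stepsize):
        self.t += 1
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * g
        self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
        return -a * self.m / (np.sqrt(self.v) + self.epsilon)  # delta to add to the flat params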
Example #3
def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(2, 2) == 10
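The givens mapping simply pre-fills the feed dict with a default that explicit arguments override; the two assertions above correspond roughly to this plain-TF sketch:

import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as sess:
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    print(sess.run(z, feed_dict={x: 2, y: 0}))   # what lin(2) feeds    -> 6
    print(sess.run(z, feed_dict={x: 2, y: 2}))   # what lin(2, 2) feeds -> 10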
Example #4
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2

        lin = function([x, x2], z, givens={x2: 0})
        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
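The point of the variable_scope here is that the two placeholders end up with distinct graph names even though both were created with name="x"; a quick illustrative check:

import tensorflow as tf

with tf.Graph().as_default():
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    print(x.name, x2.name)  # x:0 other/x:0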
Example #5
    def __init__(self, epsilon=1e-2, shape=(), use_mpi=True):
        self.use_mpi = use_mpi

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ])
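The update method that drives incfiltparams is not shown in this excerpt. A sketch of what it typically looks like in baselines-style code (numpy imported as np, and the MPI Allreduce used when use_mpi=True omitted):

    def update(self, x):
        # Accumulate this batch's sum, sum of squares, and count into the TF variables.
        x = x.astype('float64')
        n = int(np.prod(self.shape))
        addvec = np.concatenate([x.sum(axis=0).ravel(),
                                 np.square(x).sum(axis=0).ravel(),
                                 np.array([len(x)], dtype='float64')])
        self.incfiltparams(addvec[:n].reshape(self.shape),
                           addvec[n:2 * n].reshape(self.shape),
                           addvec[2 * n])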
Example #6
    def _init(self,
              ob_space,
              ac_space,
              num_units,
              gaussian_fixed_var=True,
              async_update=False):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(
            ac_space)  # pd: probability distribution
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape,
                                         use_mpi=(not async_update))

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell_vf',
                initializer=U.normc_initializer(1.0))

            init_c, init_h = lstm.zero_state(1, dtype=tf.float32)

            self.input_c_vf = U.get_placeholder(dtype=tf.float32,
                                                name="c_vf",
                                                shape=[None] +
                                                list(init_c.get_shape()[1:]))
            self.input_h_vf = U.get_placeholder(dtype=tf.float32,
                                                name="h_vf",
                                                shape=[None] +
                                                list(init_h.get_shape()[1:]))

            inpt_vf = tf.expand_dims(obz, 0)
            out_vf, (new_c, new_h) = tf.nn.dynamic_rnn(
                lstm,
                inpt_vf,
                initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                    self.input_c_vf, self.input_h_vf),
                dtype=tf.float32)
            out_vf = tf.squeeze(out_vf, axis=[0])

            self.vpred = tf.layers.dense(
                out_vf,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]
            self.out_hs_vf = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)

        with tf.variable_scope('pol'):

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell_pol',
                initializer=U.normc_initializer(1.0))

            init_c, init_h = lstm.zero_state(1, dtype=tf.float32)

            self.input_c_pol = U.get_placeholder(dtype=tf.float32,
                                                 name="c_pol",
                                                 shape=[None] +
                                                 list(init_c.get_shape()[1:]))
            self.input_h_pol = U.get_placeholder(dtype=tf.float32,
                                                 name="h_pol",
                                                 shape=[None] +
                                                 list(init_h.get_shape()[1:]))

            inpt_pol = tf.expand_dims(obz, 0)
            out_pol, (new_c, new_h) = tf.nn.dynamic_rnn(
                lstm,
                inpt_pol,
                initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                    self.input_c_pol, self.input_h_pol),
                dtype=tf.float32)
            out_pol = tf.squeeze(out_pol, axis=[0])
            self.out_hs_pol = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    out_pol,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    out_pol,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([
            stochastic, ob, self.input_c_vf, self.input_h_vf, self.input_c_pol,
            self.input_h_pol
        ], [ac, self.vpred, self.out_hs_vf, self.out_hs_pol])
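A hypothetical way to drive this recurrent policy during a rollout (the reset_states/step helpers below are not part of the source): start from zero LSTM states and thread the returned state tuples back into the next call.

import numpy as np

def reset_states(num_units):
    zeros = np.zeros((1, num_units), dtype=np.float32)
    return zeros, zeros, zeros, zeros                        # c_vf, h_vf, c_pol, h_pol

def step(pi, ob, states, stochastic=True):
    c_vf, h_vf, c_pol, h_pol = states
    ac, vpred, hs_vf, hs_pol = pi._act(stochastic, ob[None], c_vf, h_vf, c_pol, h_pol)
    return ac[0], vpred[0], (hs_vf[0], hs_vf[1], hs_pol[0], hs_pol[1])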
Example #7
    def _init(self,
              ob_space,
              ac_space,
              num_units,
              gaussian_fixed_var=True,
              async_update=False):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(
            ac_space)  # pd: probability distribution
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        full_path_is_done = tf.get_variable("full_path_is_done",
                                            dtype=tf.bool,
                                            initializer=True,
                                            trainable=False)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape,
                                         use_mpi=(not async_update))

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell',
                initializer=U.normc_initializer(1.0),
                state_is_tuple=False)

            init_lstm_state = lstm.zero_state(1, dtype=tf.float32)
            v_lstm_state = tf.get_variable("v_lstm_state",
                                           dtype=tf.float32,
                                           initializer=init_lstm_state,
                                           trainable=False)
            ba_state = tf.get_variable("ba_state",
                                       dtype=tf.float32,
                                       initializer=init_lstm_state,
                                       trainable=False)
            assign_ba_state = tf.cond(
                full_path_is_done,
                lambda: tf.assign(ba_state, v_lstm_state),  # TRUE
                lambda: tf.assign(ba_state, ba_state))  # FALSE
            lstm_state = tf.cond(tf.equal(tf.shape(ob)[0], 1),
                                 lambda: v_lstm_state, lambda: ba_state)
            assign_fpid = tf.assign(full_path_is_done,
                                    tf.math.greater(tf.shape(ob)[0], 1))

            with tf.control_dependencies([assign_ba_state]):
                last_out = tf.expand_dims(last_out, 0)
                last_out, lstm_new_state = tf.nn.dynamic_rnn(
                    lstm,
                    last_out,
                    initial_state=lstm_state,
                    dtype=tf.float32)
                assign_new_state = tf.assign(v_lstm_state, lstm_new_state)
                last_out = tf.squeeze(last_out, axis=[0])

            with tf.control_dependencies([assign_new_state, assign_fpid]):
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell',
                initializer=U.normc_initializer(1.0),
                state_is_tuple=False)

            init_lstm_state = lstm.zero_state(1, dtype=tf.float32)
            v_lstm_state = tf.get_variable("v_lstm_state",
                                           dtype=tf.float32,
                                           initializer=init_lstm_state,
                                           trainable=False)
            ba_state = tf.get_variable("ba_state",
                                       dtype=tf.float32,
                                       initializer=init_lstm_state,
                                       trainable=False)
            assign_ba_state = tf.cond(
                full_path_is_done,
                lambda: tf.assign(ba_state, v_lstm_state),  # TRUE
                lambda: tf.assign(ba_state, ba_state))  # FALSE
            lstm_state = tf.cond(tf.equal(tf.shape(ob)[0], 1),
                                 lambda: v_lstm_state, lambda: ba_state)
            assign_fpid = tf.assign(full_path_is_done,
                                    tf.math.greater(tf.shape(ob)[0], 1))

            with tf.control_dependencies([assign_ba_state]):
                last_out = tf.expand_dims(last_out, 0)
                last_out, lstm_new_state = tf.nn.dynamic_rnn(
                    lstm, last_out, initial_state=lstm_state, dtype=tf.float32)
                assign_new_state = tf.assign(v_lstm_state, lstm_new_state)
                last_out = tf.squeeze(last_out, axis=[0])

            with tf.control_dependencies([assign_new_state, assign_fpid]):
                if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                    mean = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0] // 2,
                        name='final',
                        kernel_initializer=U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                else:
                    pdparam = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0],
                        name='final',
                        kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
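Unlike Example #6, this variant keeps the LSTM state in non-trainable variables and updates it with tf.assign under tf.control_dependencies, so _act only needs the observation. A minimal self-contained sketch of that persist-then-read pattern (illustrative, not taken from the source):

import tensorflow as tf

state = tf.get_variable("persisted_state", shape=[], dtype=tf.float32,
                        initializer=tf.zeros_initializer(), trainable=False)
x = tf.placeholder(tf.float32, shape=[])
write = tf.assign(state, state + x)          # update the persisted value
with tf.control_dependencies([write]):
    y = tf.identity(state) * 2.0             # runs only after the assign has happened

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(y, feed_dict={x: 3.0}))   # 6.0
    print(sess.run(y, feed_dict={x: 1.0}))   # 8.0 -- the state carried over between calls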
Example #8
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    reward_rule=reward_for_final_timestep):

    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdamAsync(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    t1 = time.time()
    ##
    adam.sync()
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='adam.sync', rank=rank, duration=t, start_time=t1,
           end_time=t2))

    if rank == 0:  # ZERO is the parameter server
        while True:
            t1 = time.time()
            ## BEGIN - TIMING ##
            rank_worker_source = adam.master_update()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='adam.master_update',
                   rank=rank,
                   duration=t,
                   rank_worker_source=rank_worker_source,
                   start_time=t1,
                   end_time=t2))
    else:
        # Prepare for rollouts
        # ----------------------------------------

        seg_gen = traj_segment_generator(pi,
                                         env,
                                         timesteps_per_actorbatch,
                                         stochastic=True,
                                         reward_affect_func=reward_rule)

        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

        cond = sum([
            max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0
        ])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_iters={max_iters}, max_timesteps={max_timesteps}, max_episodes={max_episodes}, max_seconds={max_seconds}"

        while True:
            if callback: callback(locals(), globals())
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break
            elif max_episodes and episodes_so_far >= max_episodes:
                break
            elif max_iters and iters_so_far >= max_iters:
                break
            elif max_seconds and time.time() - tstart >= max_seconds:
                break

            if schedule == 'constant':
                cur_lrmult = 1.0
            elif schedule == 'linear':
                cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps,
                                 0)
            else:
                raise NotImplementedError

            #logger.log("********** Iteration %i ************"%iters_so_far)

            t1 = time.time()
            ## BEGIN - TIMING ##
            seg = seg_gen.__next__()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='batch_computation',
                   rank=rank,
                   duration=t,
                   start_time=t1,
                   end_time=t2))
            dh_logger.info(jm(type='seg', rank=rank, **seg))

            add_vtarg_and_adv(seg, gamma, lam)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            # optim_batchsize = optim_batchsize or ob.shape[0]
            optim_batchsize = ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            dh_logger.info(f"Rank={rank}: Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)

                    t1 = time.time()
                    ## BEGIN - TIMING ##
                    adam.worker_update(g, optim_stepsize * cur_lrmult)
                    ## END - TIMING ##
                    t2 = time.time()
                    t = t2 - t1
                    dh_logger.info(
                        jm(type='adam.worker_update',
                           rank=rank,
                           duration=t,
                           start_time=t1,
                           end_time=t2))

                    losses.append(newlosses)

            dh_logger.info(f"Rank={rank}: Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0, use_mpi=False)

            lens = seg["ep_lens"]
            rews = seg["ep_rets"]

            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1

        return pi
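For clarity, the PPO clipped surrogate assembled in the graph above corresponds to this numpy restatement (illustrative only):

import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_param):
    # L^CLIP = -E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)], r_t = pi_new / pi_old
    ratio = np.exp(logp_new - logp_old)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.minimum(surr1, surr2).mean()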
Example #9
def learn(env, policy_fn, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        reward_rule=reward_for_final_timestep
        ):

    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    input_c_vf = U.get_placeholder_cached(name="c_vf")
    input_h_vf = U.get_placeholder_cached(name="h_vf")
    input_c_pol = U.get_placeholder_cached(name="c_pol")
    input_h_pol = U.get_placeholder_cached(name="h_pol")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # clipped surrogate
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, input_c_vf, input_h_vf, input_c_pol, input_h_pol],
        losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, input_c_vf, input_h_vf, input_c_pol, input_h_pol], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator_ph(pi, env, timesteps_per_actorbatch,
        stochastic=True, reward_affect_func=reward_rule)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert max_timesteps > 0, f"The number of timesteps should be > 0 but is {max_timesteps}"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        dh_logger.info(jm(type='seg', rank=rank, **seg))
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        c_vf = np.squeeze(np.array([c for c, _ in seg["hs_vf"]]))
        h_vf = np.squeeze(np.array([h for _, h in seg["hs_vf"]]))
        c_pol = np.squeeze(np.array([c for c, _ in seg["hs_pol"]]))
        h_pol = np.squeeze(np.array([h for _, h in seg["hs_pol"]]))
        vpredbefore = seg["vpred"] # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret, c_vf=c_vf, h_vf=h_vf, c_pol=c_pol, h_pol=h_pol), shuffle=not pi.recurrent)
        # optim_batchsize = optim_batchsize or ob.shape[0]
        optim_batchsize = ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            gradients = []
            for batch in d.iterate_once(optim_batchsize):
                for i in range(len(batch["ob"])):
                    *newlosses, g = lossandgrad(
                        batch["ob"][i:i+1],
                        batch["ac"][i:i+1],
                        batch["atarg"][i:i+1],
                        batch["vtarg"][i:i+1],
                        cur_lrmult,
                        batch["c_vf"][i:i+1],
                        batch["h_vf"][i:i+1],
                        batch["c_pol"][i:i+1],
                        batch["h_pol"][i:i+1])
                    losses.append(newlosses)
                    gradients.append(g)
            g = np.array(gradients).sum(0)
            adam.update(g, optim_stepsize * cur_lrmult)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            for i in range(len(batch["ob"])):
                newlosses = compute_losses(
                    batch["ob"][i:i+1],
                    batch["ac"][i:i+1],
                    batch["atarg"][i:i+1],
                    batch["vtarg"][i:i+1],
                    cur_lrmult,
                    batch["c_vf"][i:i+1],
                    batch["h_vf"][i:i+1],
                    batch["c_pol"][i:i+1],
                    batch["h_pol"][i:i+1])
                losses.append(newlosses)
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

    return pi
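add_vtarg_and_adv is not shown in this example; in baselines-style PPO it computes GAE(lambda) advantages and lambda-returns roughly as follows (a sketch assuming seg carries "new", "rew", "vpred", and "nextvpred"):

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # adv_t   = delta_t + gamma * lam * (1 - done_t) * adv_{t+1}
    new = np.append(seg["new"], 0)
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]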
Example #10
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
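Baselines-style policies usually wrap self._act in an act method that adds and then strips the batch dimension; a sketch of that wrapper, which is not part of this excerpt:

    def act(self, stochastic, ob):
        # Add a batch dimension for the placeholder, then unpack the single-row results.
        ac1, vpred1 = self._act(stochastic, ob[None])
        return ac1[0], vpred1[0]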