def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    # Reset to the same initial state and repeat the optimization with MpiAdam.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # Note: no `updates=[update_op]` here -- including it would also apply the
    # tf.train.AdamOptimizer step on every call, so MpiAdam would no longer be
    # compared against the reference optimizer on equal footing.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
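# A minimal sketch of how the test above could be driven, assuming
# U.single_threaded_session (as in baselines' tf_util) supplies the default
# session the test expects. Launching under MPI, e.g.
# `mpirun -np 2 python mpi_adam_test.py`, exercises the gradient averaging.
if __name__ == '__main__':
    with U.single_threaded_session():
        test_MpiAdam()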
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p, q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
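# Usage sketch for validate_probtype, following the companion test in
# baselines' distributions module. DiagGaussianPdType and CategoricalPdType
# are assumed importable from the same module that provides make_pdtype; the
# flat parameter vectors are arbitrary test values.
def test_probtypes():
    np.random.seed(0)

    pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
    diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2)
    validate_probtype(diag_gauss, pdparam_diag_gauss)

    pdparam_categorical = np.array([-.2, .3, .5])
    categorical = CategoricalPdType(pdparam_categorical.size)
    validate_probtype(categorical, pdparam_categorical)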
def _init(self, ob_space, ac_space, hid_size, feat_size, gaussian_fixed_var=True):
    num_hid_layers = len(hid_size)
    mean_emb = ob_space.dim_mean_embs
    nr_rec_obs = mean_emb[0]    # each agent receives n_agents - 1 observations...
    dim_rec_obs = mean_emb[1]   # ... each of size dim_rec_obs ...
    dim_flat_obs = ob_space.dim_flat_o  # ... plus a local observation
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)

    # A row in ob contains an agent's flattened observation. The first dimension
    # needs to be None because the same placeholder serves both training and
    # inference, i.e. [None, (n_agents - 1) * dim_rec_obs + dim_flat_obs].
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None,) + ob_space.shape)
    # Grab only the part that goes into the mean embedding.
    flat_obs_input_layer = tf.slice(ob, [0, 0], [-1, nr_rec_obs * dim_rec_obs])
    # Grab only the local observation.
    flat_feature_input_layer = tf.slice(ob, [0, nr_rec_obs * dim_rec_obs], [-1, dim_flat_obs])

    with tf.variable_scope('vf'):
        with tf.variable_scope('me'):
            me_v = me.MeanEmbedding(flat_obs_input_layer, feat_size, nr_rec_obs, dim_rec_obs)
        last_out = tf.concat([me_v.me_out, flat_feature_input_layer], axis=1)
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                       kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tfc.layers.layer_norm(last_out)
            last_out = tf.nn.relu(last_out)
        self.vpred = tf.layers.dense(last_out, 1, name='final',
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        with tf.variable_scope('me'):
            me_pi = me.MeanEmbedding(flat_obs_input_layer, feat_size, nr_rec_obs, dim_rec_obs)
        last_out = tf.concat([me_pi.me_out, flat_feature_input_layer], axis=1)
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                       kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tfc.layers.layer_norm(last_out)
            last_out = tf.nn.relu(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
                                   kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final',
                                      kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
    self._me_v = U.function([ob], [me_v.me_out])
    self._me_pi = U.function([ob], [me_pi.me_out])
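# me.MeanEmbedding is defined outside this listing; from its call signature it
# is assumed to embed each of the nr_rec_obs received observations (each of
# size dim_rec_obs) with shared weights and average the results, which makes
# the policy invariant to the number and order of neighbors. A hypothetical
# minimal version of that idea, under those assumptions:
def mean_embedding_sketch(x, feat_size, nr_obs, dim_obs):
    # x: [batch, nr_obs * dim_obs] -> one row per received observation
    h = tf.reshape(x, [-1, dim_obs])            # [batch * nr_obs, dim_obs]
    for j, width in enumerate(feat_size):
        # the same dense layers process every received observation
        h = tf.nn.relu(tf.layers.dense(h, width, name="me_fc%i" % j))
    h = tf.reshape(h, [-1, nr_obs, feat_size[-1]])
    return tf.reduce_mean(h, axis=1)            # average over the nr_obs inputs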
def __init__(self, epsilon=1e-2, shape=()):
    self._sum = tf.get_variable(
        dtype=tf.float64, shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum", trainable=False)
    self._sumsq = tf.get_variable(
        dtype=tf.float64, shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq", trainable=False)
    self._count = tf.get_variable(
        dtype=tf.float64, shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count", trainable=False)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    self.std = tf.sqrt(tf.maximum(
        tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))

    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [],
        updates=[tf.assign_add(self._sum, newsum),
                 tf.assign_add(self._sumsq, newsumsq),
                 tf.assign_add(self._count, newcount)])
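# incfiltparams is typically driven by an update method that pools the
# sufficient statistics (sum, sum of squares, count) across MPI workers before
# applying them, as in baselines' mpi_running_mean_std. A sketch of the method
# that would sit alongside the __init__ above, under that assumption:
def update(self, x):
    x = x.astype('float64')
    n = int(np.prod(self.shape))
    totalvec = np.zeros(n * 2 + 1, 'float64')
    addvec = np.concatenate([x.sum(axis=0).ravel(),
                             np.square(x).sum(axis=0).ravel(),
                             np.array([len(x)], dtype='float64')])
    # sum the local statistics over all workers, then apply the increment
    MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
    self.incfiltparams(totalvec[0:n].reshape(self.shape),
                       totalvec[n:2 * n].reshape(self.shape),
                       totalvec[2 * n])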
def _init(self, ob_space, ac_space, hid_size, feat_size, gaussian_fixed_var=True):
    num_hid_layers = len(hid_size)
    neighbor_info = ob_space.dim_rec_o
    nr_rec_obs = neighbor_info[0]
    dim_rec_obs = neighbor_info[1]
    rest = ob_space.dim_flat_o - ob_space.dim_local_o
    dim_flat_obs = ob_space.dim_flat_o
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None,) + ob_space.shape)
    flat_obs_input_layer_0 = tf.slice(ob, [0, 0], [-1, nr_rec_obs * dim_rec_obs])
    flat_obs_input_layer_1 = tf.slice(ob, [0, nr_rec_obs * dim_rec_obs], [-1, rest])
    flat_feature_input_layer = tf.slice(
        ob, [0, nr_rec_obs * dim_rec_obs + rest], [-1, ob_space.dim_local_o])

    with tf.variable_scope('vf'):
        with tf.variable_scope('input_0'):
            input_0_v = tf.layers.dense(
                flat_obs_input_layer_0, feat_size[0][0], name="fc0",
                kernel_initializer=U.normc_initializer(1.0))
        with tf.variable_scope('input_1'):
            input_1_v = tf.layers.dense(
                flat_obs_input_layer_1, feat_size[1][0], name="fc0",
                kernel_initializer=U.normc_initializer(1.0))
        last_out = tf.concat([input_0_v, input_1_v, flat_feature_input_layer], axis=1)
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(
                last_out, hid_size[i], name="fc%i" % (i + 1),
                kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tfc.layers.layer_norm(last_out)
            last_out = tf.nn.relu(last_out)
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        with tf.variable_scope('input_0'):
            input_0_pi = tf.layers.dense(
                flat_obs_input_layer_0, feat_size[0][0], name="fc0",
                kernel_initializer=U.normc_initializer(1.0))
        with tf.variable_scope('input_1'):
            input_1_pi = tf.layers.dense(
                flat_obs_input_layer_1, feat_size[1][0], name="fc0",
                kernel_initializer=U.normc_initializer(1.0))
        last_out = tf.concat([input_0_pi, input_1_pi, flat_feature_input_layer], axis=1)
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(
                last_out, hid_size[i], name="fc%i" % (i + 1),
                kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tfc.layers.layer_norm(last_out)
            last_out = tf.nn.relu(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name='final',
                kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
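# Shape sketch for this variant's hyperparameters (the values below are
# hypothetical): hid_size is a flat list of hidden-layer widths, while
# feat_size is nested and indexed feat_size[i][0] above, i.e. one
# single-entry width list per input block.
hid_size = [64, 64]        # two shared hidden layers
feat_size = [[64], [64]]   # one feature width for input_0, one for input_1
assert len(feat_size) == 2 and all(len(f) >= 1 for f in feat_size)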
def learn(env, policy_fn, *,
          timesteps_per_batch,  # what to train on
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
          callback=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    var_list.extend([v for v in all_var_list if v.name.split("/")[1].startswith("me")])
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [], updates=[tf.assign(oldv, newv) for (oldv, newv)
                         in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    act_params = {
        'name': "pi",
        'ob_space': ob_space,
        'ac_space': ac_space,
    }
    pi = ActWrapper(pi, act_params)

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob = np.concatenate([s['ob'] for s in seg], axis=0)
        ac = np.concatenate([s['ac'] for s in seg], axis=0)
        atarg = np.concatenate([s['adv'] for s in seg], axis=0)
        tdlamret = np.concatenate([s['tdlamret'] for s in seg], axis=0)
        vpredbefore = np.concatenate([s["vpred"] for s in seg], axis=0)  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        # if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        # if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

        args = ob, ac, atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((ob, tdlamret),
                                                         include_final_partial_batch=False,
                                                         batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        # lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        lrlocal = (seg[0]["ep_lens"], seg[0]["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
def _init(self, ob_space, ac_space, hid_size, feat_size, gaussian_fixed_var=True):
    num_hid_layers = len(hid_size)
    n_mean_embs = len(ob_space.dim_mean_embs)
    mean_emb_0 = ob_space.dim_mean_embs[0]
    mean_emb_1 = ob_space.dim_mean_embs[1]
    nr_obs_0 = mean_emb_0[0]
    dim_obs_0 = mean_emb_0[1]
    nr_obs_1 = mean_emb_1[0]
    dim_obs_1 = mean_emb_1[1]
    dim_flat_obs = ob_space.dim_flat_o
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None,) + ob_space.shape)
    mean_emb_0_input_layer = tf.slice(ob, [0, 0], [-1, nr_obs_0 * dim_obs_0])
    mean_emb_1_input_layer = tf.slice(ob, [0, nr_obs_0 * dim_obs_0], [-1, nr_obs_1 * dim_obs_1])
    flat_feature_input_layer = tf.slice(
        ob, [0, nr_obs_0 * dim_obs_0 + nr_obs_1 * dim_obs_1], [-1, dim_flat_obs])

    with tf.variable_scope('vf'):
        with tf.variable_scope('me_rec'):
            me_v_rec = me.MeanEmbedding(mean_emb_0_input_layer, feat_size[0], nr_obs_0, dim_obs_0)
        with tf.variable_scope('me_local'):
            me_v_local = me.MeanEmbedding(mean_emb_1_input_layer, feat_size[1], nr_obs_1, dim_obs_1)
        last_out = tf.concat(
            [me_v_rec.me_out, me_v_local.me_out, flat_feature_input_layer], axis=1)
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(
                last_out, hid_size[i], name="fc%i" % (i + 1),
                kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tfc.layers.layer_norm(last_out)
            last_out = tf.nn.relu(last_out)
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        with tf.variable_scope('me_rec'):
            me_pi_rec = me.MeanEmbedding(mean_emb_0_input_layer, feat_size[0], nr_obs_0, dim_obs_0)
        with tf.variable_scope('me_local'):
            me_pi_local = me.MeanEmbedding(mean_emb_1_input_layer, feat_size[1], nr_obs_1, dim_obs_1)
        last_out = tf.concat(
            [me_pi_rec.me_out, me_pi_local.me_out, flat_feature_input_layer], axis=1)
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(
                last_out, hid_size[i], name="fc%i" % (i + 1),
                kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tfc.layers.layer_norm(last_out)
            last_out = tf.nn.relu(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name='final',
                kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
    # Expose the mean-embedding outputs (the original passed the `me` module
    # itself, which is not a tensor and cannot be fetched by a U.function).
    self._me = U.function([ob], [me_pi_rec.me_out, me_pi_local.me_out])
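# Layout sketch of the observation row this variant slices apart (the sizes
# below are hypothetical): two mean-embedding blocks followed by the flat
# local features, matching the three tf.slice calls above.
nr_obs_0, dim_obs_0 = 3, 4   # first mean-embedding block
nr_obs_1, dim_obs_1 = 2, 5   # second mean-embedding block
dim_flat_obs = 6             # local features
ob_row = np.random.randn(nr_obs_0 * dim_obs_0 + nr_obs_1 * dim_obs_1 + dim_flat_obs)
block_0 = ob_row[:nr_obs_0 * dim_obs_0].reshape(nr_obs_0, dim_obs_0)
block_1 = ob_row[nr_obs_0 * dim_obs_0:
                 nr_obs_0 * dim_obs_0 + nr_obs_1 * dim_obs_1].reshape(nr_obs_1, dim_obs_1)
local = ob_row[-dim_flat_obs:]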
def __init__(self, env, policy_fn, *,
             timesteps_per_batch,  # what to train on
             max_kl, cg_iters,
             gamma, lam,  # advantage estimation
             entcoeff=0.0,
             cg_damping=1e-2,
             vf_stepsize=3e-4,
             vf_iters=3,
             max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
             callback=None,
             max_path_length=None):
    self.gamma = gamma
    self.gae_lambda = lam
    self.max_kl = max_kl
    self.cg_iters = cg_iters
    self.cg_damping = cg_damping
    self.vf_stepsize = vf_stepsize
    self.vf_iters = vf_iters
    self.time_steps_per_batch = timesteps_per_batch
    if max_path_length is None:
        self.max_path_length = timesteps_per_batch
    else:
        self.max_path_length = max_path_length

    self.nworkers = MPI.COMM_WORLD.Get_size()
    self.rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # n_size = tf.placeholder(dtype=tf.float32, shape=[None])  # neighborhood size

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
    # pred_n_error = tf.reduce_mean(tf.square(pi.predict_n - n_size))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus  # - pred_n_error
    losses = [optimgain, meankl, entbonus, surrgain, meanent, vferr]  # , pred_n_error]
    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain",
                       "entropy", "vf_loss"]  # , "pred_n_error"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    var_list.extend([v for v in all_var_list if v.name.split("/")[1].startswith("me")])
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    # vf_var_list.extend([v for v in all_var_list if v.name.split("/")[1].startswith("me")])
    self.vfadam = MpiAdam(vf_var_list)

    self.get_flat = U.GetFlat(var_list)
    self.set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    self.assign_old_eq_new = U.function(
        [], [], updates=[tf.assign(oldv, newv) for (oldv, newv)
                         in zipsame(oldpi.get_variables(), pi.get_variables())])
    self.compute_losses = U.function([ob, ac, atarg, ret], losses)
    self.compute_lossandgrad = U.function(
        [ob, ac, atarg, ret], losses + [U.flatgrad(optimgain, var_list)])
    self.compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    self.compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    act_params = {
        'name': "pi",
        'ob_space': ob_space,
        'ac_space': ac_space,
    }
    self.pi = ActWrapper(pi, act_params)

    U.initialize()
    th_init = self.get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    self.set_from_flat(th_init)
    self.vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # self.seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)
    if self.time_steps_per_batch > self.max_path_length:
        self.nr_traj_seg_gens = int(self.time_steps_per_batch / self.max_path_length)
        self.seg_gen = [
            copy_func(traj_segment_generator, "traj_seg_gen_{}".format(i))(
                pi, env, timesteps_per_batch, stochastic=True)
            for i in range(self.nr_traj_seg_gens)
        ]
    else:
        self.nr_traj_seg_gens = 1
        self.seg_gen = [
            traj_segment_generator(pi, env, self.time_steps_per_batch, stochastic=True)
        ]
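# copy_func is defined outside this listing; from its use above it is assumed
# to clone a function object under a new name, so that each rollout generator
# is a distinct callable. A standard recipe with that behavior:
import types

def copy_func(f, name=None):
    # rebuild the function from its code object, keeping globals, defaults,
    # and closure, but under the requested name
    g = types.FunctionType(f.__code__, f.__globals__, name or f.__name__,
                           f.__defaults__, f.__closure__)
    g.__dict__.update(f.__dict__)
    return g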
def _init(self, ob_space, ac_space, hid_size, gaussian_fixed_var=True):
    num_hid_layers = len(hid_size)
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    # with tf.variable_scope("retfilter"):
    #     self.ret_rms = RunningMeanStd(shape=1)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        # last_out = obz
        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(
                last_out, hid_size[i], name="fc%i" % (i + 1),
                kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tc.layers.layer_norm(last_out, center=True, scale=True)
            last_out = tf.nn.relu(last_out)
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        # last_out = obz
        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(
                last_out, hid_size[i], name='fc%i' % (i + 1),
                kernel_initializer=U.normc_initializer(1.0))
            if self.layer_norm:
                last_out = tc.layers.layer_norm(last_out, center=True, scale=True)
            last_out = tf.nn.relu(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name='final',
                kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
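# Worked check of the pdparam sizing above, assuming baselines' distributions
# semantics for make_pdtype: a Box action space of dimension k yields a
# diagonal-Gaussian pdtype with param_shape [2 * k], so the 'final' layer
# emits the k means and logstd holds the k state-independent log standard
# deviations.
k = 3
box = gym.spaces.Box(low=-1.0, high=1.0, shape=(k,))
assert make_pdtype(box).param_shape() == [2 * k]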