def __init__(self, env, hidden_size, hidden_layers, entcoeff=0.001,
             lr_rate=1e-4, embedding_shape=None, scope="adversary"):
    """Assemble the adversary graph: logits, losses, accuracies and reward op.

    :param env: environment; only ``env.observation_space.shape`` is read here
    :param hidden_size: stored on ``self.hidden_size``
    :param hidden_layers: stored on ``self.hidden_layers``
    :param entcoeff: weight applied (negated) to the mean Bernoulli entropy
    :param lr_rate: accepted but not used inside this constructor
    :param embedding_shape: stored on ``self.conditional_shape``
    :param scope: stored on ``self.scope``
    """
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.conditional_shape = embedding_shape
    self.hidden_size = hidden_size
    self.hidden_layers = hidden_layers
    self.build_ph()

    # The second build_graph call is made with reuse=True.
    gen_logits = self.build_graph(
        self.generator_obs_ph, self.embedding_ph, reuse=False)
    exp_logits = self.build_graph(
        self.expert_obs_ph, self.embedding_ph, reuse=True)

    # Accuracy: generator samples count as correct below 0.5, expert
    # samples above 0.5.
    gen_prob = tf.nn.sigmoid(gen_logits)
    gen_acc = tf.reduce_mean(tf.to_float(gen_prob < 0.5))
    exp_prob = tf.nn.sigmoid(exp_logits)
    exp_acc = tf.reduce_mean(tf.to_float(exp_prob > 0.5))

    # Sigmoid cross-entropy with x = logits, z = targets:
    #   z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    # Generator samples are labelled 0, expert samples 1.
    gen_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=gen_logits, labels=tf.zeros_like(gen_logits)))
    exp_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=exp_logits, labels=tf.ones_like(exp_logits)))

    # Entropy term over generator and expert logits together.
    all_logits = tf.concat([gen_logits, exp_logits], 0)
    mean_entropy = tf.reduce_mean(logit_bernoulli_entropy(all_logits))
    ent_loss = -entcoeff * mean_entropy

    # Reported terms, kept in one place so names and tensors stay aligned.
    named_terms = [
        ("generator_loss", gen_loss),
        ("expert_loss", exp_loss),
        ("entropy", mean_entropy),
        ("entropy_loss", ent_loss),
        ("generator_acc", gen_acc),
        ("expert_acc", exp_acc),
    ]
    self.losses = [term for _, term in named_terms]
    self.loss_name = [name for name, _ in named_terms]
    self.total_loss = gen_loss + exp_loss + ent_loss

    # Reward handed to the policy; 1e-8 keeps the log argument positive.
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(gen_logits) + 1e-8)

    trainable = self.get_trainable_variables()
    self.lossandgrad = U.function(
        [self.generator_obs_ph, self.expert_obs_ph, self.embedding_ph],
        self.losses + [U.flatgrad(self.total_loss, trainable)])
def test_mpi_adam():
    """
    tests the MpiAdam object's functionality

    Runs 10 steps of TensorFlow's AdamOptimizer on a small loss, re-initializes
    the variables, runs 10 steps of MpiAdam on the same loss, and asserts that
    the two loss trajectories agree (atol=1e-4).
    """
    np.random.seed(0)
    tf.compat.v1.set_random_seed(0)

    a_var = tf.Variable(np.random.randn(3).astype('float32'))
    b_var = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(input_tensor=tf.square(a_var)) + tf.reduce_sum(
        input_tensor=tf.sin(b_var))

    learning_rate = 1e-2
    update_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)
    do_update = tf_utils.function([], loss, updates=[update_op])

    # Reference run: TensorFlow's own Adam.
    tf.compat.v1.get_default_session().run(
        tf.compat.v1.global_variables_initializer())
    losses_ref = []
    for step in range(10):
        loss_val = do_update()
        print(step, loss_val)
        losses_ref.append(loss_val)

    # Re-initialize before the MpiAdam run.
    tf.compat.v1.set_random_seed(0)
    tf.compat.v1.get_default_session().run(
        tf.compat.v1.global_variables_initializer())

    var_list = [a_var, b_var]
    # No `updates=` here: only MpiAdam may move the variables in this loop,
    # otherwise TF Adam and MpiAdam would both be applied on every step.
    lossandgrad = tf_utils.function(
        [], [loss, tf_utils.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losses_test = []
    for step in range(10):
        # Distinct name so the `loss` tensor is not rebound to a float.
        loss_val, grad = lossandgrad()
        adam.update(grad, learning_rate)
        print(step, loss_val)
        losses_test.append(loss_val)

    np.testing.assert_allclose(
        np.array(losses_ref), np.array(losses_test), atol=1e-4)
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as tf.train.AdamOptimizer.

    Runs 10 reference steps with TensorFlow's Adam, re-initializes the
    variables, runs 10 steps with MpiAdam, and asserts the recorded losses
    agree (atol=1e-4).
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference run: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losslist_ref = []
    for i in range(10):
        l = do_update()
        print(i, l)
        losslist_ref.append(l)

    # Re-initialize before the MpiAdam run.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # No `updates=` here: only MpiAdam may move the variables in this loop,
    # otherwise TF Adam and MpiAdam would both be applied on every step.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
        losslist_test.append(l)

    np.testing.assert_allclose(
        np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
def setup_critic_optimizer(self):
    """Build the critic loss, its flattened (clipped) gradient and an MpiAdam optimizer.

    Sets ``self.critic_loss``, ``self.critic_grads`` and
    ``self.critic_optimizer``. When ``self.critic_l2_reg`` is positive, an L2
    penalty over the selected critic kernels is added to the loss.
    """
    logger.info('setting up critic optimizer')

    # Target is normalized with the return statistics, then clipped to
    # return_range.
    normalized_critic_target_tf = tf.clip_by_value(
        normalize(self.critic_target, self.ret_rms),
        self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(
        tf.square(self.normalized_critic_tf - normalized_critic_target_tf))

    if self.critic_l2_reg > 0.:
        # Only variables whose name contains 'kernel' and not 'output'.
        critic_reg_vars = [
            var for var in self.critic.trainable_vars
            if 'kernel' in var.name and 'output' not in var.name
        ]
        for var in critic_reg_vars:
            logger.info(' regularizing: {}'.format(var.name))
        logger.info(' applying l2 regularization with {}'.format(
            self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars)
        self.critic_loss += critic_reg

    critic_shapes = [
        var.get_shape().as_list() for var in self.critic.trainable_vars
    ]
    # Initial value 1 so a rank-0 variable (empty shape) counts as one
    # parameter instead of making reduce() raise TypeError.
    critic_nb_params = sum(
        reduce(lambda x, y: x * y, shape, 1) for shape in critic_shapes)
    logger.info(' critic shapes: {}'.format(critic_shapes))
    logger.info(' critic params: {}'.format(critic_nb_params))

    self.critic_grads = U.flatgrad(self.critic_loss,
                                   self.critic.trainable_vars,
                                   clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
def setup_actor_optimizer(self):
    """Build the actor loss, its flattened (clipped) gradient and an MpiAdam optimizer.

    Sets ``self.actor_loss`` (negated mean of ``self.critic_with_actor_tf``),
    ``self.actor_grads`` and ``self.actor_optimizer``.
    """
    logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

    actor_shapes = [
        var.get_shape().as_list() for var in self.actor.trainable_vars
    ]
    # Initial value 1 so a rank-0 variable (empty shape) counts as one
    # parameter instead of making reduce() raise TypeError.
    actor_nb_params = sum(
        reduce(lambda x, y: x * y, shape, 1) for shape in actor_shapes)
    logger.info(' actor shapes: {}'.format(actor_shapes))
    logger.info(' actor params: {}'.format(actor_nb_params))

    self.actor_grads = U.flatgrad(self.actor_loss,
                                  self.actor.trainable_vars,
                                  clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                   beta1=0.9, beta2=0.999, epsilon=1e-08)
def setup_actor_optimizer(self):
    """Build the actor loss, its flattened (clipped) gradient and an Adam optimizer.

    Sets ``self.actor_loss`` (negated mean of ``self.critic_with_actor_tf``),
    ``self.actor_grads`` and ``self.actor_optimizer``.
    """
    logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)

    actor_shapes = [
        var.get_shape().as_list() for var in self.actor.trainable_vars
    ]
    # Initial value 1 so a rank-0 variable (empty shape) counts as one
    # parameter instead of making reduce() raise TypeError.
    actor_nb_params = sum(
        reduce(lambda x, y: x * y, shape, 1) for shape in actor_shapes)
    logger.info(' actor shapes: {}'.format(actor_shapes))
    logger.info(' actor params: {}'.format(actor_nb_params))

    self.actor_grads = U.flatgrad(self.actor_loss,
                                  self.actor.trainable_vars,
                                  clip_norm=self.clip_norm)
    self.actor_optimizer = Adam(var_list=self.actor.trainable_vars,
                                beta1=0.9, beta2=0.999, epsilon=1e-08)
def learn(env, policy_func, dataset, pretrained, optim_batch_size=128,
          max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None):
    """Pretrain a policy by behavior cloning on expert (observation, action) batches.

    Minimizes the mean squared error between the expert action and ``pi.ac``
    with MpiAdam, validating roughly ten times over the run.

    :param env: environment; its observation and action spaces are read
    :param policy_func: called as ``policy_func("pi", ob_space, ac_space)``
    :param dataset: object exposing ``get_next_batch(batch_size, split)``
    :param pretrained: when truthy, skip summaries/checkpoints and save the
        final weights to a temporary path instead
    :param optim_batch_size: training batch size
    :param max_iters: number of training iterations (cast to int)
    :param adam_epsilon: epsilon passed to MpiAdam
    :param optim_stepsize: step size passed to ``adam.update``
    :param ckpt_dir: checkpoint directory (used when not ``pretrained``)
    :param log_dir: summary directory (used when not ``pretrained``)
    :param task_name: checkpoint file name inside ``ckpt_dir``
    :return: path of the saved weights when ``pretrained`` is truthy,
        otherwise ``None``
    """
    # At least 1, so the modulo below cannot divide by zero when
    # max_iters < 10.
    val_per_iter = max(1, int(max_iters / 10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy

    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        # `loss_val` keeps the `loss` tensor above from being rebound.
        loss_val, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [loss_val], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            # Validation pass: gradient is computed but not applied.
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            loss_val, g = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % loss_val)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)
    if pretrained:
        # Only the path name is kept; the TemporaryDirectory object itself
        # is not retained.
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, var_list=pi.get_variables())
        return savedir_fname
def setup_critic_optimizer(self):
    """Build the critic loss, its flattened (clipped) gradient and an Adam optimizer.

    Sets ``self.critic_loss``, ``self.critic_grads`` and
    ``self.critic_optimizer``. When ``self.critic_l2_reg`` is positive, an L2
    penalty over the selected critic kernels is added to the loss.
    """
    logger.info('setting up critic optimizer')

    # Target is normalized with the return statistics, then clipped to
    # return_range.
    normalized_critic_target_tf = tf.clip_by_value(
        normalize(self.critic_target, self.ret_rms),
        self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(
        tf.square(self.normalized_critic_tf - normalized_critic_target_tf))

    if self.critic_l2_reg > 0.:
        # Only variables whose name contains 'kernel' and not 'output'.
        critic_reg_vars = [
            var for var in self.critic.trainable_vars
            if 'kernel' in var.name and 'output' not in var.name
        ]
        for var in critic_reg_vars:
            logger.info(' regularizing: {}'.format(var.name))
        logger.info(' applying l2 regularization with {}'.format(
            self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars
        )
        self.critic_loss += critic_reg

    critic_shapes = [
        var.get_shape().as_list() for var in self.critic.trainable_vars
    ]
    # Initial value 1 so a rank-0 variable (empty shape) counts as one
    # parameter instead of making reduce() raise TypeError.
    critic_nb_params = sum(
        reduce(lambda x, y: x * y, shape, 1) for shape in critic_shapes)
    logger.info(' critic shapes: {}'.format(critic_shapes))
    logger.info(' critic params: {}'.format(critic_nb_params))

    self.critic_grads = U.flatgrad(self.critic_loss,
                                   self.critic.trainable_vars,
                                   clip_norm=self.clip_norm)
    self.critic_optimizer = Adam(var_list=self.critic.trainable_vars,
                                 beta1=0.9, beta2=0.999, epsilon=1e-08)
def test_MpiAdam():
    """Compare ten MpiAdam steps against ten tf.train.AdamOptimizer steps.

    Both optimizers start from the same initial variables; the recorded loss
    sequences must match within atol=1e-4.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    var_a = tf.Variable(np.random.randn(3).astype("float32"))
    var_b = tf.Variable(np.random.randn(2, 5).astype("float32"))
    objective = tf.reduce_sum(tf.square(var_a)) + tf.reduce_sum(tf.sin(var_b))

    lr = 1e-2
    tf_adam_step = tf.train.AdamOptimizer(lr).minimize(objective)
    run_tf_adam = U.function([], objective, updates=[tf_adam_step])

    # Reference trajectory from TensorFlow's Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    reference = []
    for it in range(10):
        value = run_tf_adam()
        print(it, value)
        reference.append(value)

    # Re-initialize, then drive the same variables with MpiAdam only.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    params = [var_a, var_b]
    eval_loss_and_grad = U.function(
        [], [objective, U.flatgrad(objective, params)])
    mpi_adam = MpiAdam(params)

    candidate = []
    for it in range(10):
        value, flat_grad = eval_loss_and_grad()
        mpi_adam.update(flat_grad, lr)
        print(it, value)
        candidate.append(value)

    np.testing.assert_allclose(
        np.array(reference), np.array(candidate), atol=1e-4)
def learn(
        env, policy_func, discriminator, expert_dataset, embedding_z,
        pretrained, pretrained_weight, *,
        g_step, d_step,
        timesteps_per_batch,  # what to train on
        max_kl, cg_iters,
        gamma, lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4, d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100, ckpt_dir=None, log_dir=None,
        load_model_path=None, task_name=None):
    """Alternate trust-region policy updates with discriminator updates.

    Each outer iteration runs ``g_step`` policy updates (conjugate gradient
    plus backtracking line search under a KL limit, followed by value-function
    Adam steps) and then splits the last rollout into ``d_step`` minibatches
    to update ``discriminator`` against batches from ``expert_dataset``.

    :param env: environment; its observation and action spaces are read
    :param policy_func: policy constructor, called for "pi" and "oldpi"
    :param discriminator: exposes ``get_trainable_variables``, ``loss_name``,
        ``lossandgrad`` and optionally ``obs_rms``
    :param expert_dataset: exposes ``get_next_batch(batch_size)``
    :param embedding_z: forwarded to ``traj_segment_generator`` as ``embedding``
    :param pretrained: not read inside this function
    :param pretrained_weight: optional path; when given, "pi" is built with
        ``reuse=True`` and its variables are loaded from this path
    :param g_step: policy updates per iteration
    :param d_step: discriminator minibatches per iteration
    :param timesteps_per_batch: rollout length passed to the segment generator
    :param max_kl: KL limit for the line search
    :param cg_iters: conjugate-gradient iterations
    :param gamma: passed to ``add_vtarg_and_adv``
    :param lam: passed to ``add_vtarg_and_adv``
    :param entcoeff: entropy bonus coefficient
    :param cg_damping: damping added to the Fisher-vector product
    :param vf_stepsize: value-function Adam step size
    :param d_stepsize: discriminator Adam step size
    :param vf_iters: value-function epochs per policy update
    :param max_timesteps: stop condition (exactly one of the three must be > 0)
    :param max_episodes: stop condition
    :param max_iters: stop condition
    :param callback: optional, called with ``(locals(), globals())`` each iteration
    :param save_per_iter: checkpoint period in iterations
    :param ckpt_dir: checkpoint directory; ``None`` disables saving
    :param log_dir: summary directory
    :param load_model_path: optional full-model checkpoint to restore
    :param task_name: checkpoint file name inside ``ckpt_dir``
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # Variables are split by the second path component of their name.
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None],
                                  name="flat_tan")
    # Un-flatten the tangent vector into per-variable tensors.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent)
         for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function(
        [ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Average an ndarray across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    # Broadcast rank 0's policy parameters to every worker.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator,
                                     embedding=embedding_z,
                                     timesteps_per_batch=timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            # Reads `fvpargs` from the enclosing scope at call time.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()
                     ) / atarg.std()  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # Fisher-vector products use every 5th sample.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
                # Nothing changed, so the pre-update losses are what gets
                # reported; keeps `meanlosses` bound for the logging below.
                meanlosses = lossbefore
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g,
                                 cg_iters=cg_iters, verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    # Periodic check that all workers hold the same parameters.
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                            (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"],
                   seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
def learn(env, model_path, data_path, policy_fn, *,
          rolloutSize,
          num_options=4,
          horizon=80,
          clip_param=0.025, ent_coeff=0.01,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7,
          optim_batchsize=100,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=20,  # time constraint
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False,
          ):
    """
    Core learning function

    Trains an option-based policy with a clipped PPO surrogate per option,
    plus separate gradient steps for the termination loss, the interest loss
    and the policy-over-options loss. After every iteration the accumulated
    rollouts are pickled to ``data_path + 'rollout_data.pkl'``.

    :param env: environment; its observation and action spaces are read
    :param model_path: directory restored from when ``retrain`` is truthy
    :param data_path: prefix for the rollout pickle file
    :param policy_fn: policy constructor, called for "pi" and "oldpi"
    :param rolloutSize: forwarded to ``sample_trajectory``
    :param num_options: number of options
    :param horizon: forwarded to ``sample_trajectory``
    :param clip_param: PPO clipping parameter (scaled by the lr multiplier)
    :param ent_coeff: entropy penalty coefficient
    :param optim_epochs: upper bound on per-option optimization epochs
    :param mainlr: step size for the main PPO loss
    :param intlr: step size for the interest loss
    :param piolr: step size for the policy-over-options loss
    :param termlr: step size for the termination loss
    :param optim_batchsize: minibatch size
    :param gamma: passed to ``add_vtarg_and_adv``
    :param lam: passed to ``add_vtarg_and_adv``
    :param max_iters: number of outer iterations
    :param adam_epsilon: epsilon passed to MpiAdam
    :param schedule: 'constant' or 'linear' learning-rate multiplier
    :param retrain: when truthy, restore state from ``model_path`` first
    :return: the trained policy ``pi``
    :raises NotImplementedError: for any other ``schedule`` value
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space,
                   num_options=num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space,
                      num_options=num_options)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])
    op_adv = tf.placeholder(dtype=tf.float32, shape=[None])  # fed from rollouts["op_adv"]
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # fed from rollouts["last_betas"]

    ac = pi.pdtype.sample_placeholder([None])

    # Setup losses and stuff
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)

    # Interest loss: pi.intfc comes from the network, pi_w is fed.
    pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims(
        tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    # Policy-over-options loss: same expression, but intfc is fed and
    # pi.op_pi comes from the network.
    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims(
        tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option],
                             losses + [U.flatgrad(total_loss, var_list)])
    # Separate gradient functions, since each uses a different step size.
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)])
    opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options],
                        [U.flatgrad(op_loss, var_list)])
    intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options],
                         [U.flatgrad(int_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    datas = [0 for _ in range(num_options)]

    if retrain:
        print("Retraining to New Task !! ")
        time.sleep(2)
        U.load_state(model_path+'/')

    p = []
    max_timesteps = int(horizon * rolloutSize * max_iters)
    while True:
        if max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        render = False

        rollouts = sample_trajectory(pi, env, horizon=horizon,
                                     rolloutSize=rolloutSize, render=render)
        # Save rollouts: the whole history so far is rewritten every iteration.
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        # Context manager so the file handle is closed after each dump.
        with open(data_file_name, "wb") as data_file:
            pickle.dump(p, data_file)

        add_vtarg_and_adv(rollouts, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(rollouts['opt_dur'][i]) if len(rollouts['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        ob, ac, opts, atarg, tdlamret = (rollouts["ob"], rollouts["ac"],
                                         rollouts["opts"], rollouts["adv"],
                                         rollouts["tdlamret"])
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Optimizing the policy
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                continue

            datas[opt] = d = Dataset(
                dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices],
                     vtarg=tdlamret[indices]),
                shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            # Built-in int(): the np.int alias was removed in NumPy 1.24.
            optim_epochs_corrected = np.clip(
                int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data

            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"], batch["vtarg"],
                                                    cur_lrmult, [opt])
                    adam.update(grads, mainlr * cur_lrmult)
                    losses.append(newlosses)

        # NOTE(review): the three updates below consume the whole rollout and
        # do not depend on `opt`, so they are placed after the per-option
        # loop; the collapsed source does not show their nesting -- confirm.

        # Optimize termination functions
        termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0]
        adam.update(termg, termlr)

        # Optimize interest functions
        intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"],
                           rollouts["op_adv"], rollouts["op_probs"],
                           rollouts["activated_options"])[0]
        adam.update(intgrads, intlr)

        # Optimize policy over options
        opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"],
                         rollouts["op_adv"], rollouts["intfc"],
                         rollouts["activated_options"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
def learn(
        env, policy_func, discriminator, expert_dataset,
        timesteps_per_batch, *,
        g_step, d_step,  # timesteps per actor per update
        clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
        gamma, lam,  # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        d_stepsize=3e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Alternate clipped-PPO policy updates with discriminator updates.

    Each outer iteration runs ``g_step`` PPO updates on fresh rollouts, then
    splits the last rollout into ``d_step`` minibatches to update
    ``discriminator`` against batches from ``expert_dataset``.

    :param env: environment; its observation and action spaces are read
    :param policy_func: policy constructor, called for "pi" and "oldpi"
    :param discriminator: exposes ``get_trainable_variables``, ``loss_name``,
        ``lossandgrad`` and optionally ``obs_rms``
    :param expert_dataset: exposes ``get_next_batch(batch_size)``
    :param timesteps_per_batch: rollout length passed to the generators
    :param g_step: policy updates per iteration
    :param d_step: discriminator minibatches per iteration
    :param clip_param: PPO clipping parameter (scaled by the lr multiplier)
    :param entcoeff: entropy penalty coefficient
    :param optim_epochs: optimization epochs per policy update
    :param optim_stepsize: Adam step size for the policy
    :param optim_batchsize: minibatch size; falsy means the whole rollout
    :param gamma: passed to ``add_vtarg_and_adv``
    :param lam: passed to ``add_vtarg_and_adv``
    :param max_timesteps: stop condition (exactly one of the four must be > 0)
    :param max_episodes: stop condition
    :param max_iters: stop condition
    :param max_seconds: stop condition
    :param callback: optional, called with ``(locals(), globals())`` each iteration
    :param adam_epsilon: epsilon passed to the policy MpiAdam
    :param d_stepsize: Adam step size for the discriminator
    :param schedule: 'constant' or 'linear'; 'linear' divides by
        ``max_timesteps``
    :param save_per_iter: checkpoint period in iterations
    :param ckpt_dir: checkpoint directory; ``None`` disables saving
    :param task: 'sample_trajectory' samples trajectories and exits instead
        of training
    :param sample_stochastic: stochastic flag for the episode generator
    :param load_model_path: forwarded to ``sample_trajectory``
    :param task_name: checkpoint file name / forwarded to ``sample_trajectory``
    :param max_sample_traj: forwarded to ``sample_trajectory``
    :raises NotImplementedError: for any other ``schedule`` value
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    # Broadcast rank 0's policy parameters to every worker.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    adam.sync()

    def allmean(x):
        # Average an ndarray across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator,
                                     timesteps_per_batch, stochastic=True)
    traj_gen = traj_episode_generator(pi, env, timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)
        for _ in range(g_step):
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()
                     ) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new()  # set old parameter values to new parameter values
            logger.log("Optimizing...")
            logger.log(fmt_row(13, loss_names))
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)
                    adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # NOTE(review): evaluation is placed after the g_step loop; the
        # collapsed source does not show its nesting -- confirm.
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ----------------- logger --------------------
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"],
                   seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        true_rewbuffer.extend(true_rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
def setup_actor_optimizer(self):
    """Build the actor losses, flat gradients, optimizers and summaries.

    Two training paths are created on ``self``:

    * Policy path: ``actor_loss`` is the negated mean of
      ``critic_with_actor_tf``; its gradient is taken w.r.t.
      ``actor.active_vars`` and paired with ``actor_optimizer``.
    * Demonstration path (for ``train_demo()``): ``demo_loss`` is the mean
      squared difference between ``obs_delta_kine`` and ``demo_aprx``, plus
      an optional L2 term when ``demo_l2_reg > 0``; its gradient is taken
      w.r.t. all ``actor.trainable_vars`` and paired with ``demo_optimizer``.

    Also sets ``actor_params`` (cumulative flat-parameter offsets of the
    trainable actor variables), ``demo_max_loss``, ``demo_reg`` and
    ``mimic_rwd`` (the negated demo loss), and registers scalar summaries.
    """
    logger.info('setting up actor optimizer')
    self.actor_Q = tf.reduce_mean(self.critic_with_actor_tf)
    self.actor_loss = -self.actor_Q
    tf.summary.scalar('actor/Q', self.actor_Q)

    # setting up actor vars/grads/optimizer
    self.actor_vars = self.actor.active_vars
    self.actor_grads = tf_util.flatgrad(self.actor_loss,
                                        self.actor_vars,
                                        clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=self.actor_vars,
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=1e-08)

    # actor_params[i] is the offset at which trainable variable i starts in
    # a flattened vector of all trainable actor variables.
    actor_shapes = [
        var.get_shape().as_list() for var in self.actor.trainable_vars
    ]
    self.actor_params = actor_params = [0] * (
        len(self.actor.trainable_vars) + 1)
    for i, shape in enumerate(actor_shapes):
        actor_params[i + 1] = actor_params[i] + np.prod(shape)

    # NOTE(review): this assumes the inactive variables are the first
    # n_inact entries of actor.trainable_vars -- confirm against the actor.
    n_inact = len(actor_shapes) - len(self.actor_vars)
    # Re-base the offsets so they index into actor_grads (active variables
    # only). The list is converted to an ndarray explicitly: "list - scalar"
    # only worked when the scalar happened to be a NumPy scalar and raised
    # TypeError for n_inact == 0, where actor_params[0] is the plain int 0.
    active_params = np.asarray(actor_params[n_inact:]) - actor_params[n_inact]

    logger.info(' actor shapes: {}'.format(actor_shapes))
    logger.info(' actor params: {}'.format(actor_params))
    logger.info(' actor total: {}'.format(actor_params[-1]))
    logger.info(' actor active: {}'.format(active_params))

    # Mean gradient of the first active variable.
    grad = self.actor_grads[active_params[0]:active_params[1]]
    tf.summary.scalar(
        'grads/actor_layer%d_%d' %
        (n_inact // 2, active_params[1] - active_params[0]),
        tf.reduce_mean(grad))
    # Mean gradient of the second-to-last active variable.
    grad = self.actor_grads[active_params[-3]:active_params[-2]]
    tf.summary.scalar(
        'grads/actor_layer%d_%d' %
        (-1, active_params[-2] - active_params[-3]),
        tf.reduce_mean(grad))

    # for train_demo()
    demo_sq_err = tf.square(self.obs_delta_kine - self.demo_aprx)
    self.demo_loss = tf.reduce_mean(demo_sq_err)
    self.demo_max_loss = tf.reduce_max(
        tf.square(self.obs_delta_kine - self.demo_aprx))
    if self.demo_l2_reg > 0.:
        demo_reg_vars = self.actor.demo_reg_vars
        for var in demo_reg_vars:
            logger.info(' regularizing: {}'.format(var.name))
        logger.info(
            ' applying l2 regularization for demo_aprx with {}'.format(
                self.demo_l2_reg))
        self.demo_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.demo_l2_reg),
            weights_list=demo_reg_vars)
        self.demo_loss += self.demo_reg
    else:
        self.demo_reg = None
    self.demo_grads = tf_util.flatgrad(self.demo_loss,
                                       self.actor.trainable_vars,
                                       clip_norm=self.clip_norm)
    self.demo_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                  beta1=0.9,
                                  beta2=0.999,
                                  epsilon=1e-08)

    # mimic rwd
    self.mimic_rwd = -self.demo_loss
    tf.summary.scalar('actor/mimic_rwd', self.mimic_rwd)
def setup_critic_optimizer(self):
    """Assemble the critic loss, its flat gradient, optimizer and summaries.

    The critic loss is the sum of:

    * the mean squared difference between ``normalized_critic_tf`` and the
      clipped, normalized target built from ``critic_target_Q``;
    * a one-step prediction loss (``critic_loss_step``): squared error of
      the predicted reward plus squared error of the predicted observation
      delta, where the first ``nb_demo_kine`` components are weighted x100;
    * an optional L2 regularization term when ``critic_l2_reg > 0``.

    NOTE(review): block extents were reconstructed from a flattened source;
    both "Q_normed" summaries are assumed to sit under
    ``if self.normalize_returns`` -- confirm.
    """
    logger.info('setting up critic optimizer')

    range_lo = self.return_range[0]
    range_hi = self.return_range[1]
    target = tf.clip_by_value(
        ret_normalize(self.critic_target_Q, self.ret_rms), range_lo, range_hi)
    self.normalized_critic_target_tf = target
    self.critic_loss = tf.reduce_mean(
        tf.square(self.normalized_critic_tf - target))
    tf.summary.scalar('critic_loss/Q_diff', self.critic_loss)
    if self.normalize_returns:
        tf.summary.scalar('critic_loss/Q_normed_critic',
                          tf.reduce_mean(self.normalized_critic_tf))
        tf.summary.scalar('critic_loss/Q_normed_target',
                          tf.reduce_mean(target))

    # --- one-step model loss: reward prediction ---
    reward_err = tf.reduce_mean(tf.square(self.pred_rwd - self.rewards))
    step_loss = 0 + reward_err
    self.critic_loss_step = step_loss
    tf.summary.scalar('critic_loss/step_rwd', step_loss)

    # --- one-step model loss: observation-delta prediction ---
    kine_weight = 100
    per_dim_err = tf.reduce_mean(
        tf.square(self.pred_obs_delta - self.obs_delta_kstates), axis=0)
    kine_err = tf.reduce_mean(per_dim_err[:self.nb_demo_kine]) * kine_weight
    rest_err = tf.reduce_mean(per_dim_err[self.nb_demo_kine:])
    step_loss = step_loss + (kine_err + rest_err)
    self.critic_loss_step = step_loss
    tf.summary.scalar('critic_loss/step_kstates_kine_x%d' % kine_weight,
                      kine_err)
    tf.summary.scalar('critic_loss/step_kstates_rest', rest_err)
    tf.summary.scalar('critic_loss/step_total', step_loss)
    self.critic_loss = self.critic_loss + step_loss

    if self.critic_l2_reg > 0.:
        reg_vars = self.critic.reg_vars
        for reg_var in reg_vars:
            logger.debug(' regularizing: {}'.format(reg_var.name))
        logger.info(' applying l2 regularization with {}'.format(
            self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=reg_vars)
        self.critic_loss = self.critic_loss + critic_reg
        tf.summary.scalar('critic_loss/reg', critic_reg)

    # offsets[i] is where trainable variable i starts in the flat gradient.
    trainable = self.critic.trainable_vars
    critic_shapes = [v.get_shape().as_list() for v in trainable]
    offsets = [0]
    for shape in critic_shapes:
        offsets.append(offsets[-1] + np.prod(shape))
    logger.info(' critic shapes: {}'.format(critic_shapes))
    logger.info(' critic params: {}'.format(offsets))
    logger.info(' critic total: {}'.format(offsets[-1]))

    self.critic_grads = tf_util.flatgrad(self.critic_loss,
                                         self.critic.trainable_vars,
                                         clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                    beta1=0.9,
                                    beta2=0.999,
                                    epsilon=1e-08)

    # todo: make the following general
    # (offset index lo, offset index hi, summary tag format, layer label)
    grad_summaries = (
        (0, 1, 'grads/critic_layer%d_%d', 0),
        (-3, -2, 'grads/critic_layer%d_rwd_%d', -1),
        (-7, -6, 'grads/critic_layer%d_q_%d', -1),
    )
    for lo, hi, tag_fmt, layer in grad_summaries:
        begin, end = offsets[lo], offsets[hi]
        tf.summary.scalar(tag_fmt % (layer, end - begin),
                          tf.reduce_mean(self.critic_grads[begin:end]))
def learn(env, model_path, data_path, policy_fn,
          model_learning_params, svm_grid_params, svm_params_interest, svm_params_guard,
          *,
          modes, rolloutSize, num_options=2,
          horizon,  # timesteps per actor per update
          clip_param, ent_coeff=0.02,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=160,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=0,  # time constraint
          adam_epsilon=1.2e-4,
          schedule='linear',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False
          ):
    """Core learning function: PPO over options with a learned hybrid model.

    Each iteration collects rollouts with ``sample_trajectory``, appends them
    to ``<data_path>/rollout_data.pkl``, updates the hybrid model, computes
    advantages, and then runs clipped-surrogate PPO updates separately for the
    samples assigned to each option.

    Args:
        env: environment exposing ``observation_space`` / ``action_space``.
        model_path: directory holding ``hybrid_model.pkl`` and the saved
            policy state; only read when ``retrain`` is true.
        data_path: directory the rollout pickle is written to.
        policy_fn: called as ``policy_fn(name, ob_space, ac_space, model,
            num_options)`` to build the new and old policies.
        model_learning_params, svm_grid_params, svm_params_interest,
        svm_params_guard: forwarded to ``partialHybridModel``.
        modes, rolloutSize, num_options, horizon: forwarded to the model and
            the rollout sampler.
        clip_param, ent_coeff: PPO clipping range (scaled by the learning-rate
            multiplier) and entropy coefficient.
        optim_epochs, optim_stepsize, optim_batchsize: optimization hypers.
        gamma, lam: advantage-estimation parameters; ``gamma`` is also used
            to decay ``pi.eps`` once per iteration.
        max_iters: stop after this many iterations; 0 means run forever.
        adam_epsilon: epsilon for ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'`` step-size annealing.
        retrain: load a pickled model and saved policy state before training.

    Returns:
        ``(pi, model)``: the trained policy and the hybrid model.

    Raises:
        NotImplementedError: for an unknown ``schedule``.
    """
    ob_space = env.observation_space
    ac_space = env.action_space

    if retrain:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point model_path at trusted data.
        with open(model_path + '/hybrid_model.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
    else:
        model = partialHybridModel(env, model_learning_params, svm_grid_params, svm_params_interest,
                                   svm_params_guard, horizon, modes, num_options, rolloutSize)

    pi = policy_fn("pi", ob_space, ac_space, model, num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, model, num_options)  # Network for old policy
    atarg = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Empirical return
    lrmult = tf1.placeholder(name='lrmult', dtype=tf1.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # Define placeholders for computing the advantage
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    ac = pi.pdtype.sample_placeholder([None])

    # Defining losses for optimization
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf1.reduce_mean(kloldnew)
    meanent = tf1.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent
    ratio = tf1.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf1.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    # PPO's pessimistic surrogate (L^CLIP), negative to convert from a
    # maximization to minimization problem
    pol_surr = - tf1.reduce_mean(tf1.minimum(surr1, surr2))
    vf_loss = tf1.reduce_mean(tf1.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    assign_old_eq_new = U.function([], [], updates=[tf1.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    # Kept as a module-level global, as in the original; presumably read
    # elsewhere in the module -- confirm before making it local.
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards
    p = []  # for saving the rollouts

    if retrain:
        print("Retraining to New Task !!")
        time.sleep(2)
        U.load_state(model_path+'/')
        print(pi.eps)

    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # With max_iters == 0 there is no horizon to anneal over; the
            # unguarded division raised ZeroDivisionError with the default
            # arguments, so fall back to no annealing in that case.
            if max_timesteps > 0:
                cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            else:
                cur_lrmult = 1.0
        else:
            raise NotImplementedError

        logger.log("************* Iteration %i *************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        render = False
        rollouts = sample_trajectory(pi, model, env, horizon=horizon, rolloutSize=rolloutSize, render=render)

        # Save rollouts (the whole history is re-written every iteration).
        p.append({'rollouts': rollouts})
        data_file_name = data_path + '/rollout_data.pkl'
        with open(data_file_name, "wb") as data_file:
            pickle.dump(p, data_file)

        # Model update
        print("Updating model !!\n")
        model.updateModel(rollouts, pi)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
        for edge in list(model.transitionGraph.edges):
            src, dst = edge[0], edge[1]
            print(src, " -> ", dst, " : ", model.transitionGraph[src][dst]['weight'])

        add_vtarg_and_adv(rollouts, pi, gamma, lam, num_options)

        # From here on ob/ac/atarg refer to rollout arrays, not placeholders;
        # the TF functions above already captured the placeholders.
        ob, ac, opts, atarg, tdlamret = (rollouts["seg_obs"], rollouts["seg_acs"], rollouts["des_opts"],
                                         rollouts["adv"], rollouts["tdlamret"])
        old_opts = rollouts["seg_opts"]
        similarity = sum(1 for old_opt, new_opt in zip(old_opts, opts) if old_opt == new_opt)
        print("Percentage similarity of options: ", similarity/len(old_opts) * 100)

        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()
        pi.eps = pi.eps * gamma  # reduce exploration

        # Optimizing the policy
        print("\nOptimizing policy !! \n")
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            if not indices.size:
                continue

            d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]),
                        shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            # np.int was removed in NumPy 1.24; the builtin int is equivalent.
            optim_epochs_corrected = np.clip(int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                                                    cur_lrmult, [opt])
                    if np.isnan(newlosses).any():
                        # Skip minibatches whose losses are NaN.
                        continue
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

            # NOTE(review): nesting reconstructed from a flattened source;
            # reporting here (per optimized option, after its last epoch)
            # guarantees `losses` is defined -- confirm intended placement.
            if len(losses) > 0:
                meanlosses, _, _ = mpi_moments(losses, axis=0)
                print("Mean loss ", meanlosses)
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    # Saving of the policy and model is disabled in the original (it was
    # wrapped in a string literal); kept here for reference:
    # if model_path and not retrain:
    #     U.save_state(model_path + '/')
    #     model_file_name = model_path + '/hybrid_model.pkl'
    #     pickle.dump(model, open(model_file_name, "wb"), pickle.HIGHEST_PROTOCOL)
    #     print("Policy and Model saved in - ", model_path)

    return pi, model
def learn(encoder, action_decorder, state_decorder, embedding_shape, *, dataset, logdir, batch_size, time_steps, epsilon=0.001, lr_rate=1e-3):
    """Train the LSTM encoder and the state decoder with a VAE-style loss.

    Builds ``vae_loss = recon_loss + kl_loss`` where ``recon_loss`` is the
    negated mean of the summed action- and state-decoder log-probabilities and
    ``kl_loss`` is the KL divergence of the encoder distribution from a prior
    whose flat parameters are all zeros. Gradients are taken w.r.t. the
    encoder and state-decoder variables only (the action decoder's variables
    are left out of ``var_list``).

    The training loop has no exit condition (``while True``); every 10th
    iteration (except iteration 0) checkpoints are written to ``logdir`` and
    ``./vae_saver``.

    NOTE(review): loop nesting was reconstructed from a flattened source --
    confirm. Inline comments marked as the author's are translated from the
    original Chinese notes.
    """
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # switched to an MLP
    obs = U.get_placeholder_cached(name="obs")  ## for encoder
    ob = U.get_placeholder_cached(name="ob")
    embedding = U.get_placeholder_cached(name="embedding")
    # obss = U.get_placeholder_cached(name="obss")  ## for action decoder; could the state decoder use this too? should it be changed to obs?
    # ## for action decoder; the state decoder should be able to use this as well
    # embeddingss = U.get_placeholder_cached(name="embeddingss")
    ac = ac_decoder.pdtype.sample_placeholder([None])
    obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal distribution, the prior over states??? Should this
    # be a standard normal over the demonstrations instead???? Worth considering.
    from common.distributions import make_pdtype
    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([
        tf.zeros(shape=[embedding_shape], name="mean"),
        tf.zeros(shape=[embedding_shape], name="logstd")
    ], axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    recon_loss = -tf.reduce_mean(
        tf.reduce_sum(ac_decoder.pd.logp(ac) + state_decoder.pd.logp(obs_out), axis=0))  ## author: this part still needs to be revised
    kl_loss = lstm_encoder.pd.kl(p_z)  ## author: p(z) is a standard normal; this does not look quite right either
    vae_loss = recon_loss + kl_loss  ### author: vae_loss should be computed over a batch

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]

    ## var_list
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)

    # compute_recon_loss = U.function([ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([obs, ob, embedding, ac, obs_out], losses)
    compute_grad = U.function([obs, ob, embedding, ac, obs_out], U.flatgrad(vae_loss, var_list))  ### author: not fully thought through, may be wrong
    adam = MpiAdam(var_list, epsilon=epsilon)

    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())

    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list=en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100)  ## keep the policy parameters, though this seems to be unused

    while True:
        logger.log("********** Iteration %i ************" % iters_so_far)

        # The rolling buffers are re-created on every iteration, so the means
        # logged below only cover the current iteration's batches.
        recon_loss_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)

        for observations in dataset.get_next_batch(batch_size=time_steps):
            observations = observations.transpose((1, 0))
            embedding_now = lstm_encoder.get_laten_vector(observations)
            # Repeat the single latent vector once per time step.
            embeddings = np.array([embedding_now for _ in range(time_steps)])
            embeddings_reshape = embeddings.reshape((time_steps, -1))
            actions = ac_decoder.act(stochastic=True, ob=observations, embedding=embeddings_reshape)
            state_outputs = state_decoder.get_outputs(
                observations.reshape(time_steps, -1, 1), embeddings)  ## author: mixture of Gaussians was missing; added somewhat haphazardly, now done
            # NOTE(review): 7 positional arguments are passed here and to
            # compute_grad, but both functions were built with 5 inputs
            # ([obs, ob, embedding, ac, obs_out]) -- confirm this call works.
            # From this point recon_loss/kl_loss/vae_loss name the evaluated
            # values rather than the tensors defined above.
            recon_loss, kl_loss, vae_loss = compute_losses(
                observations, observations.reshape(batch_size, time_steps,
                                                   -1), embeddings_reshape,
                observations.reshape(time_steps, -1, 1), embeddings, actions,
                state_outputs)

            g = compute_grad(observations,
                             observations.reshape(batch_size, time_steps, -1),
                             embeddings_reshape,
                             observations.reshape(time_steps, -1, 1),
                             embeddings, actions, state_outputs)
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)

        ep_stats.add_all_summary(writer, [
            np.mean(recon_loss_buffer),
            np.mean(kl_loss_buffer),
            np.mean(vae_loss_buffer)
        ], iters_so_far)
        # Tabular log shows the values of the last processed batch.
        logger.record_tabular("recon_loss", recon_loss)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()
        if (iters_so_far % 10 == 0 and iters_so_far != 0):
            save(saver=saver, sess=tf.get_default_session(), logdir=logdir, step=iters_so_far)
            save(saver=saver_encoder, sess=tf.get_default_session(), logdir="./vae_saver", step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(), logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
def _create_network(self):
    """Build the LSTM encoder/decoder goal generator, its loss and optimizer.

    Creates the input placeholders, a ``Normalizer`` for goal statistics, an
    LSTM encoder over the source input, and an LSTM decoder that is decoded
    twice with shared weights: once with a ``TrainingHelper`` fed by the
    (shifted) labels, and once with ``ContinousInferHelper`` for inference.
    The loss is a Gaussian negative log-likelihood of the normalized labels
    with a learned ``logstd`` variable. Finally the flat gradient and an
    ``MpiAdam`` optimizer are created and the variables initialized.

    NOTE(review): the ``tf.variable_scope`` nesting below was reconstructed
    from a flattened source and determines variable names -- confirm against
    existing checkpoints before relying on it.
    """
    self.sess = U.get_session()

    # Placeholders: single-step source/destination inputs, the label
    # sequence, and the per-example sequence lengths.
    self.inp_src = tf.placeholder(shape=[None, 1, self.inp_dim], dtype=tf.float32, name='input_src')
    self.inp_dest = tf.placeholder(shape=[None, 1, self.out_dim], dtype=tf.float32, name='input_dest')
    self.labels = tf.placeholder(shape=[None, self.seq_len, self.out_dim], dtype=tf.float32, name='label')
    self.src_seq_len = tf.placeholder(tf.int32, (None, ), name='source_sequence_length')
    self.tar_seq_len = tf.placeholder(tf.int32, (None, ), name='target_sequence_length')

    # running averages
    # with tf.variable_scope('goal_stats_src'):
    #     self.goal_stats_src = Normalizer(self.inp_dim, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('goal_stats_dest'):
        self.goal_stats_dest = Normalizer(self.out_dim, self.norm_eps, self.norm_clip, sess=self.sess, PLN=True)

    # normalize inp_src, and goals labels
    # (all three use the destination statistics; the source normalizer above
    # is commented out)
    inp_src = self.goal_stats_dest.normalize(self.inp_src)
    inp_dest = self.goal_stats_dest.normalize(self.inp_dest)
    goal_labels = self.goal_stats_dest.normalize(self.labels)

    with tf.variable_scope('goal_gen'):
        encoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell, inp_src, sequence_length=self.src_seq_len, dtype=tf.float32)

        decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
        project_layer = tf.layers.Dense(self.out_dim)

        with tf.variable_scope("decode"):
            # Decoder input for training: the destination input followed by
            # the labels with the last step dropped.
            train_inp = tf.concat([inp_dest, goal_labels[:, :-1, :]], axis=-2)
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                train_inp, sequence_length=self.tar_seq_len)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, train_helper, encoder_state, output_layer=project_layer)
            train_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                train_decoder, maximum_iterations=self.seq_len)
            self.train_outputs = train_outputs.rnn_output

        # Same scope with reuse=True: inference shares the decoder weights.
        with tf.variable_scope("decode", reuse=True):
            infer_helper = ContinousInferHelper(inp_dest[:, 0, :], self.tar_seq_len)
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell, infer_helper, encoder_state, output_layer=project_layer)
            infer_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                infer_decoder, maximum_iterations=self.seq_len)
            self.infer_outputs = self.goal_stats_dest.denormalize(
                infer_outputs.rnn_output)

    # Gaussian negative log-likelihood of the normalized labels under the
    # training outputs, with a learned log standard deviation.
    log_sigma = tf.get_variable(name="logstd", shape=[1, self.out_dim], initializer=U.normc_initializer(0.1))

    goals = train_outputs.rnn_output
    loss = 0.5 * tf.reduce_sum(tf.square((goal_labels - goals)/tf.exp(log_sigma)), axis=-1) \
        + 0.5 * np.log(2*np.pi) * tf.to_float(tf.shape(self.labels)[-1]) \
        + tf.reduce_sum(log_sigma, axis=-1)
    self.loss = tf.reduce_mean(loss)
    self.tr_outputs = self.goal_stats_dest.denormalize(
        self.train_outputs
    )  # just for inspect the correctness of training

    var_list = self._vars('')
    self.grads = U.flatgrad(self.loss, var_list)
    self.adam = MpiAdam(var_list, epsilon=self.adamepsilon)

    tf.variables_initializer(self._global_vars('')).run()
    self.adam.sync()
def learn(env, encoder, action_decorder, state_decorder, embedding_shape,*, dataset, optimizer, logdir, batch_size, time_steps, adam_epsilon = 0.001, lr_rate = 1e-4, vae_beta = 8):
    """Train the LSTM encoder and state decoder with a beta-weighted VAE loss.

    Builds ``vae_loss = mean(recon_loss + vae_beta * kl_loss)`` where
    ``recon_loss`` is the negated sum of the state decoder's log-probability
    of the next observations and ``kl_loss`` is the first element of the KL
    divergence between the encoder distribution and a prior whose flat
    parameters are all zeros. Runs 50 iterations over ``dataset``, halving
    ``lr_rate`` after each of the first five, and checkpoints every 10th
    iteration (except iteration 0) to ``logdir`` and ``./vae_saver``.

    The ``optimizer`` and ``batch_size`` arguments are not used in this body.

    NOTE(review): ``ob_shape`` is not defined in this function and must come
    from the enclosing module -- confirm. Loop nesting was reconstructed from
    a flattened source. Inline comments marked as the author's are translated
    from the original Chinese notes.
    """
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # author: there is a problem here

    ac_de_ob = U.get_placeholder_cached(name="ac_de_ob")
    en_ob = U.get_placeholder_cached(name="en_ob")  ## for encoder
    state_de_ob = U.get_placeholder_cached(name="state_de_ob")  ## for action decoder; could the state decoder use this too? should it be changed to obs?
    ac_de_embedding = U.get_placeholder_cached(name="ac_de_embedding")  ## for action decoder; the state decoder should be able to use this as well
    state_de_embedding = U.get_placeholder_cached(name="state_de_embedding")
    # ac = ac_decoder.pdtype.sample_placeholder([None])
    ob_next = tf.placeholder(name="ob_next", shape=[None, ob_shape], dtype=tf.float32)
    # ob_next_ac = tf.placeholder(name="ob_next_ac", shape=[ob_shape], dtype=tf.float32)
    # obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal distribution
    from common.distributions import make_pdtype
    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([tf.zeros(shape=[embedding_shape], name="mean"), tf.zeros(shape=[embedding_shape], name="logstd")], axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    # author: add one more term to recon_loss, for the action
    recon_loss = -tf.reduce_sum(state_decoder.pd.logp(ob_next))
    # kl_loss = lstm_encoder.pd.kl(p_z)[0]  ## p(z) is a standard normal; this does not look quite right either
    # kl_loss = tf.maximum(lstm_encoder.pd.kl(p_z)[0], tf.constant(5.00))  ## p(z) is a standard normal; this does not look quite right either
    kl_loss = lstm_encoder.pd.kl(p_z)[0]
    vae_loss = tf.reduce_mean(recon_loss + vae_beta * kl_loss)  ### author: vae_loss should be computed over a batch

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]
    # author: train the action with mean squared error -- step with the
    # resulting action to obtain x(t+1), then use an MSE loss; cross-entropy
    # could also be tried

    ## var_list
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)

    # compute_recon_loss = U.function([ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding, ob_next], losses)
    compute_grad = U.function([en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding, ob_next], U.flatgrad(vae_loss, var_list))  ### author: not fully thought through, may be wrong
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())

    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=var_list, max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list = en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100)  ## keep the policy parameters, though this seems to be unused

    while iters_so_far < 50:  ## author: run multiple rounds
        logger.log("********** Iteration %i ************" % iters_so_far)  ## author: should batch_size be adjusted every round?

        # The rolling buffers are re-created on every iteration, so the means
        # logged below only cover the current iteration's batches.
        recon_loss_buffer = deque(maxlen=100)
        # recon_loss2_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)
        # i = 0
        for obs_and_next in dataset.get_next_batch(batch_size=time_steps):
            # print(i)
            # i += 1
            # Inputs are all rows but the last; targets start at the
            # decoder's receptive_field offset. From here on ob_next names
            # this array, not the placeholder captured above.
            observations = obs_and_next[0].transpose((1, 0))[:-1]
            ob_next = obs_and_next[0].transpose(1, 0)[state_decoder.receptive_field:, :]
            embedding_now = lstm_encoder.get_laten_vector(obs_and_next[0].transpose((1, 0)))
            # Repeat the single latent vector once per input step.
            embeddings = np.array([embedding_now for _ in range(time_steps - 1)])
            embeddings_reshape = embeddings.reshape((time_steps-1, -1))
            # NOTE(review): actions / ob_next_ac are not fed to the loss or
            # gradient below; the calls may still have side effects -- confirm
            # before removing.
            actions = ac_decoder.act(stochastic=True, ob=observations, embedding=embeddings_reshape)
            ob_next_ac = get_ob_next_ac(env, observations[-1], actions[0])  ## author: this still needs further changes
            #########################################3
            # state_outputs = state_decoder.get_outputs(observations.reshape(1, time_steps, -1), embedding_now.reshape((1, 1, -1)))  ## mixture of Gaussians was missing; added somewhat haphazardly, now done
            # recon_loss = state_decoder.recon_loss(observations.reshape(1, time_steps, -1), embedding_now.reshape((1, 1, -1)))
            # From this point recon_loss/kl_loss/vae_loss name the evaluated
            # values rather than the tensors defined above.
            recon_loss, kl_loss, vae_loss = compute_losses(obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1), observations.reshape(time_steps-1,-1),
                                                           observations.reshape(1, time_steps-1, -1), embeddings_reshape, embedding_now.reshape((1,1, -1)), ob_next)

            g = compute_grad(obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1), observations.reshape(time_steps-1,-1),
                             observations.reshape(1, time_steps-1, -1), embeddings_reshape, embedding_now.reshape((1,1, -1)), ob_next)
            # logger.record_tabular("recon_loss", recon_loss)
            # logger.record_tabular("recon_loss2", recon_loss2)
            # logger.record_tabular("kl_loss", kl_loss)
            # logger.record_tabular("vae_loss", vae_loss)
            # logger.dump_tabular()
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            # recon_loss2_buffer.append(recon_loss2)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)

        ep_stats.add_all_summary(writer, [np.mean(recon_loss_buffer), np.mean(kl_loss_buffer),
                                          np.mean(vae_loss_buffer)], iters_so_far)
        # Tabular log shows the values of the last processed batch.
        logger.record_tabular("recon_loss", recon_loss)
        # logger.record_tabular("recon_loss2", recon_loss2)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()
        if(iters_so_far % 10 == 0 and iters_so_far != 0):
            save(saver=saver, sess=tf.get_default_session(), logdir=logdir, step=iters_so_far)
            save(saver=saver_encoder, sess=tf.get_default_session(),logdir="./vae_saver", step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(), logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
        # Halve the step size after each of the first five iterations.
        if iters_so_far < 6:
            lr_rate /= 2
def learn(
        env,
        model_path,
        data_path,
        policy_fn,
        *,
        horizon=150,  # timesteps per actor per update
        rolloutSize=50,
        clip_param=0.2,
        entcoeff=0.02,  # clipping parameter epsilon, entropy coeff
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=32,  # optimization hypers
        gamma=0.99,
        lam=0.95,  # advantage estimation
        max_iters=0,  # time constraint
        adam_epsilon=1e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        retrain=False):
    """Train a policy with clipped-surrogate PPO on sampled rollouts.

    Each iteration collects rollouts with ``sample_trajectory``, re-writes the
    accumulated rollout history to ``data_path + 'rollout_data.pkl'``,
    computes standardized advantages and runs ``optim_epochs`` passes of
    minibatch Adam updates on the PPO loss (surrogate + entropy penalty +
    value-function loss).

    Args:
        env: environment exposing ``observation_space`` / ``action_space``.
        model_path: path passed to ``U.load_state`` when ``retrain`` is set.
        data_path: prefix for the rollout pickle; it is concatenated with the
            file name as-is, so it should end with a path separator.
        policy_fn: called as ``policy_fn(name, ob_space, ac_space)``.
        horizon, rolloutSize: forwarded to ``sample_trajectory``.
        clip_param, entcoeff: PPO clipping range and entropy coefficient.
        optim_epochs, optim_stepsize, optim_batchsize: optimization hypers; a
            falsy ``optim_batchsize`` means "use the whole batch".
        gamma, lam: advantage-estimation parameters.
        max_iters: stop after this many iterations; 0 means run forever.
        adam_epsilon: epsilon for ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'`` step-size annealing.
        retrain: if truthy, load saved policy state before training.

    Returns:
        The trained policy ``pi``.

    Raises:
        NotImplementedError: for an unknown ``schedule``.
    """
    # Setup losses and policy
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain:
        print("Retraining the policy from saved path")
        time.sleep(2)
        U.load_state(model_path)

    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # With max_iters == 0 there is no horizon to anneal over; the
            # unguarded division raised ZeroDivisionError, so fall back to
            # no annealing in that case.
            if max_timesteps > 0:
                cur_lrmult = max(
                    1.0 - float(timesteps_so_far) / max_timesteps, 0)
            else:
                cur_lrmult = 1.0
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        print("Collecting samples for policy optimization !! ")
        # Rendering is switched on once training has passed iteration 70.
        render = iters_so_far > 70
        rollouts = sample_trajectory(pi,
                                     env,
                                     horizon=horizon,
                                     rolloutSize=rolloutSize,
                                     stochastic=True,
                                     render=render)

        # Save rollouts (the whole history is re-written every iteration).
        p.append({'rollouts': rollouts})
        data_file_name = data_path + 'rollout_data.pkl'
        with open(data_file_name, "wb") as data_file:
            pickle.dump(p, data_file)

        add_vtarg_and_adv(rollouts, gamma, lam)

        # From here on ob/ac/atarg refer to rollout arrays, not placeholders;
        # the TF functions above already captured the placeholders.
        ob, ac, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts[
            "adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            for batch in d.iterate_once(optim_batchsize):
                # Only the gradient (last output) is needed; the per-batch
                # loss values were collected but never used.
                *_, g = lossandgrad(batch["ob"], batch["ac"],
                                    batch["atarg"], batch["vtarg"],
                                    cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
def _window_observations(pre_obs, batch_ob, timesteps, ob_dim):
    """Split a batch of observations into overlapping windows of ``timesteps`` rows.

    ``pre_obs`` (``timesteps - 1`` rows) is placed in front of ``batch_ob`` so
    that a full window exists for every row of the batch.

    Args:
        pre_obs: the ``timesteps - 1`` observations that precede the batch.
        batch_ob: observations of the current minibatch, one row per sample.
        timesteps: window length.
        ob_dim: size of one flattened observation.

    Returns:
        A ``(windows, carry)`` tuple. ``windows`` is a list holding one
        ``(timesteps, ob_dim)`` array per row of ``batch_ob``; ``carry`` is the
        trailing ``timesteps - 1`` rows, to be used as ``pre_obs`` for the
        next minibatch.
    """
    n_rows = len(batch_ob)
    # np.append flattens both inputs, so the rows are restored with reshape.
    stacked = np.append(pre_obs, batch_ob).reshape(n_rows + timesteps - 1,
                                                   ob_dim)
    windows = [stacked[start:start + timesteps] for start in range(n_rows)]
    # Slice from the front: ``stacked[-(timesteps - 1):]`` would return the
    # whole array when timesteps == 1, because -0 == 0.
    carry = stacked[n_rows:]
    return windows, carry


def learn(
        env,
        policy_func,
        *,
        timesteps=4,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Optimize a policy with the clipped surrogate loss on windowed observations.

    Builds the "pi" and "oldpi" networks through ``policy_func``, defines the
    clipped surrogate, entropy penalty and value-function losses, and then
    loops: fetch a segment from ``traj_segment_generator``, compute advantages
    with ``add_vtarg_and_adv``, and run minibatch updates. Policy variables and
    value-function variables (second name-scope component starting with "vf")
    are updated by two separate ``MpiAdam`` optimizers.

    Args:
        env: environment; its ``observation_space`` and ``action_space`` are
            read and it is handed to the rollout generators.
        policy_func: called as ``policy_func(name, timesteps, ob_space, ac_space)``.
        timesteps: number of consecutive observations fed to the network for
            each sample (window length).
        timesteps_per_batch: forwarded to the rollout generators.
        clip_param: clipping range of the probability ratio; multiplied by the
            learning-rate multiplier.
        entcoeff: coefficient of the entropy penalty.
        optim_epochs: number of optimization epochs per iteration.
        optim_stepsize: Adam step size, multiplied by the learning-rate multiplier.
        optim_batchsize: minibatch size; a falsy value selects the whole segment.
        gamma, lam: forwarded to ``add_vtarg_and_adv``.
        max_timesteps, max_episodes, max_iters, max_seconds: stopping criteria;
            exactly one of them must be positive.
        callback: if truthy, called as ``callback(locals(), globals())`` at the
            start of every iteration.
        adam_epsilon: epsilon passed to both ``MpiAdam`` instances.
        schedule: ``'constant'`` or ``'linear'``; any other value raises
            ``NotImplementedError``.
        save_per_iter: a checkpoint is written every this many iterations when
            ``ckpt_dir`` is not None.
        ckpt_dir: directory for checkpoints, or None to disable saving.
        task: when equal to ``'sample_trajectory'`` the function calls
            ``sample_trajectory`` and exits the process instead of training.
        sample_stochastic, load_model_path, task_name, max_sample_traj:
            forwarded to ``sample_trajectory`` / ``traj_episode_generator``;
            ``task_name`` is also the checkpoint file name.

    Raises:
        AssertionError: unless exactly one time constraint is positive.
        NotImplementedError: for an unknown ``schedule``.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", timesteps, ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", timesteps, ob_space,
                        ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # Not fed anywhere in this function; kept so the graph and the locals()
    # handed to `callback` stay as they were.
    pi_vpred = tf.placeholder(dtype=tf.float32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    # The value loss is deliberately left out of total_loss: it is minimized
    # separately through vf_grad / vf_adam below.
    total_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if not v.name.split("/")[1].startswith("vf")
    ]
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, pol_var_list)])
    vf_grad = U.function([ob, ac, atarg, ret, lrmult],
                         U.flatgrad(vf_loss, vf_var_list))
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    pol_adam.sync()
    vf_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     timesteps,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    EpRewMean_MAX = 2.5e3  # best rolling mean reward reported so far

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    ob_dim = list(ob_space.shape)[0]

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # NOTE: divides by max_timesteps, so the linear schedule needs
            # max_timesteps to be the active (non-zero) time constraint.
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        # From here on ob/ac/atarg name the rollout arrays rather than the
        # placeholders; the compiled U.function objects are unaffected.
        ob, ac, atarg, vpred, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "vpred"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        # shuffle=False keeps samples in rollout order, which the sliding
        # window over consecutive observations relies on.
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vpred=vpred,
                         vtarg=tdlamret),
                    shuffle=False)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            # Original author's note (translated): re-process ob by inserting
            # timesteps-1 env.reset observations in front of batch["ob"], then
            # split it with a sliding window.
            pre_obs = [seg["ob_reset"]] * (timesteps - 1)
            for batch in d.iterate_once(optim_batchsize):
                ob_fin, pre_obs = _window_observations(pre_obs, batch['ob'],
                                                       timesteps, ob_dim)
                # Original author's note (translated): g seems to be all
                # zeros here.
                *newlosses, g = lossandgrad(ob_fin, batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                pol_adam.update(g, optim_stepsize * cur_lrmult)
                vf_g = vf_grad(ob_fin, batch["ac"], batch["atarg"],
                               batch["vtarg"], cur_lrmult)
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

            # Second pass over the same data with identical updates; its
            # losses are not recorded.
            # NOTE(review): nesting inside the epoch loop is inferred from the
            # statement order of the original - confirm.
            pre_obs = [seg["ob_reset"]] * (timesteps - 1)
            for batch in d.iterate_once(optim_batchsize):
                ob_fin, pre_obs = _window_observations(pre_obs, batch['ob'],
                                                       timesteps, ob_dim)
                *newlosses, g = lossandgrad(ob_fin, batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                pol_adam.update(g, optim_stepsize * cur_lrmult)
                vf_g = vf_grad(ob_fin, batch["ac"], batch["atarg"],
                               batch["vtarg"], cur_lrmult)
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)

        logger.log("Evaluating losses...")
        losses = []
        loss_pre_obs = [seg["ob_reset"]] * (timesteps - 1)
        for batch in d.iterate_once(optim_batchsize):
            ob_fin, loss_pre_obs = _window_observations(
                loss_pre_obs, batch['ob'], timesteps, ob_dim)
            newlosses = compute_losses(ob_fin, batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # Report when the rolling mean reward reaches a new maximum.
        # NOTE(review): the prints are assumed to belong to this branch - confirm.
        if (np.mean(rewbuffer) > EpRewMean_MAX):
            EpRewMean_MAX = np.mean(rewbuffer)
            print(iters_so_far)
            print(np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()