def train(self, seg, optim_batchsize, optim_epochs):
    cur_lrmult = 1.0
    add_vtarg_and_adv(seg, self.gamma, self.lam)
    ob, unnorm_ac, atarg, tdlamret = seg["ob"], seg["unnorm_ac"], seg["adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
    d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg, vtarg=tdlamret),
                shuffle=not self.pi.recurrent)

    if hasattr(self.pi, "ob_rms"):
        self.pi.update_obs_rms(ob)  # update running mean/std for policy
    self.assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log2("Optimizing...")
    logger.log2(fmt_row(13, self.loss_names))

    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            lg = self.lossandgrad(batch["ac"], batch["atarg"], batch["vtarg"],
                                  cur_lrmult, *self.fix_ob2feed(batch["ob"]))
            new_losses, g = lg[:-1], lg[-1]
            self.adam.update(g, self.optim_stepsize * cur_lrmult)
            losses.append(new_losses)
        logger.log2(fmt_row(13, np.mean(losses, axis=0)))

    logger.log2("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = self.compute_losses(batch["ac"], batch["atarg"], batch["vtarg"],
                                        cur_lrmult, *self.fix_ob2feed(batch["ob"]))
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)
    logger.log2(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.record_tabular("loss_" + name, lossval)
    logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
    return meanlosses
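# For reference, `add_vtarg_and_adv` is the GAE(lambda) step that fills in the "adv" and
# "tdlamret" keys consumed above. This repo's own version is not shown in this section;
# the sketch below follows the standard OpenAI baselines implementation and assumes the
# usual segment keys ("rew", "vpred", "nextvpred", "new"):
def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)  # append 0 so new[t + 1] is defined for the final step
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap with the value of the next state
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]  # TD(lambda) return used as the value target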
def sample_trajectory(load_model_path, max_sample_traj, traj_gen, task_name,
                      sample_stochastic):
    assert load_model_path is not None
    U.load_state(load_model_path)

    sample_trajs = []
    for iters_so_far in range(max_sample_traj):
        logger.log2("********** Iteration %i ************" % iters_so_far)
        traj = traj_gen.next()
        ob, new, ep_ret, ac, rew, ep_len = traj['ob'], traj['new'], traj['ep_ret'], \
            traj['ac'], traj['rew'], traj['ep_len']
        logger.record_tabular("ep_ret", ep_ret)
        logger.record_tabular("ep_len", ep_len)
        logger.record_tabular("immediate reward", np.mean(rew))
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        traj_data = {"ob": ob, "ac": ac, "rew": rew, "ep_ret": ep_ret}
        sample_trajs.append(traj_data)

    sample_ep_rets = [traj["ep_ret"] for traj in sample_trajs]
    logger.log2("Average total return: %f" %
                (sum(sample_ep_rets) / len(sample_ep_rets)))
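# `traj_episode_generator` is defined elsewhere in the repo. As a rough, hypothetical
# sketch (not the actual implementation), an episode-wise generator compatible with the
# dict keys consumed above could look like this, assuming the baselines-style
# pi.act(stochastic, ob) -> (action, vpred) interface and a gym-style env:
def traj_episode_generator(pi, env, horizon, stochastic):
    while True:
        ob = env.reset()
        obs, acs, rews, news = [], [], [], []
        new, ep_ret, ep_len = True, 0.0, 0
        while True:
            ac, _ = pi.act(stochastic, ob)
            obs.append(ob)
            news.append(new)
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            rews.append(rew)
            ep_ret += rew
            ep_len += 1
            new = done
            if done or ep_len >= horizon:
                break
        yield {"ob": np.array(obs), "new": np.array(news), "ac": np.array(acs),
               "rew": np.array(rews), "ep_ret": ep_ret, "ep_len": ep_len}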
timesteps_so_far = 0
iters_so_far = 0
lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
distbuffer = deque(maxlen=100)
tstart = time.time()
writer = U.FileWriter(tensorboard_dir)
loss_stats = stats(["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"])
ep_stats = stats(["Reward", "Episode_Length", "Episode_This_Iter", "Distance"])

while timesteps_so_far < args.max_timesteps:
    # Save model
    if iters_so_far % args.save_per_iter == 0 and iters_so_far > 0 and ckpt_dir is not None:
        U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

    logger.log2("********** Iteration %i ************" % iters_so_far)
    seg = seg_gen.next()
    losses = policy.train(seg, args.optim_batchsize, args.optim_epochs)

    lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_dists"])  # local values
    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
    lens, rews, dists = map(flatten_lists, zip(*listoflrpairs))
    lenbuffer.extend(lens)
    rewbuffer.extend(rews)
    distbuffer.extend(dists)
    logger.record_tabular("eplenmean", np.mean(lenbuffer))
    logger.record_tabular("eprewmean", np.mean(rewbuffer))
    logger.record_tabular("epthisiter", len(lens))
    logger.record_tabular("epdistmean", np.mean(distbuffer))
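# `flatten_lists` merges the per-worker episode statistics gathered with
# MPI.COMM_WORLD.allgather above. It is defined elsewhere in the repo; the baselines
# version is the one-liner below, which is what is assumed here:
def flatten_lists(listoflists):
    return [el for list_ in listoflists for el in list_]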
def learn(
        env,
        policy_fn,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=50,
        max_sample_traj=10,
        ckpt_dir=None,
        log_dir=None,
        task_name="origin",
        sample_stochastic=True,
        load_model_path=None,
        task="train"):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    writer = U.FileWriter(log_dir)
    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
    traj_gen = traj_episode_generator(pi, env, timesteps_per_actorbatch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    loss_stats = stats(loss_names)
    ep_stats = stats(["Reward", "Episode_Length", "Episode_This_Iter"])
    if task == "sample_trajectory":
        sample_trajectory(load_model_path, max_sample_traj, traj_gen, task_name,
                          sample_stochastic)
        sys.exit()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

        logger.log2("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.next()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, loss_names))

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                lg = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                 batch["vtarg"], cur_lrmult)
                newlosses = lg[:-1]
                g = lg[-1]
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        loss_stats.add_all_summary(writer, meanlosses, iters_so_far)
        ep_stats.add_all_summary(writer,
                                 [np.mean(rewbuffer), np.mean(lenbuffer), len(lens)],
                                 iters_so_far)

    return pi
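# A hypothetical driver for `learn` (everything here — make_env, MlpPolicy, and the
# hyperparameter values — is illustrative, not taken from this repo). It shows the
# intended call pattern: exactly one stopping criterion is set, and the 'linear'
# schedule anneals both the Adam step size and the clip range via lrmult:
def policy_fn(name, ob_space, ac_space):
    # MlpPolicy is assumed to follow the baselines-style constructor signature
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=64, num_hid_layers=2)

env = make_env()  # any gym-style environment
pi = learn(env, policy_fn,
           timesteps_per_actorbatch=2048,
           clip_param=0.2, entcoeff=0.0,
           optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
           gamma=0.99, lam=0.95,
           max_timesteps=int(1e6),
           schedule='linear',
           ckpt_dir="./checkpoints", log_dir="./logs",
           task_name="origin", task="train")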
def train(self, seg, optim_batchsize, optim_epochs):
    # Normalize the intrinsic reward by the running std of its discounted return
    rffs_int = np.array([self.rff_int.update(rew) for rew in seg["rew_int"]])
    self.rff_rms_int.update(rffs_int.ravel())
    seg["rew_int"] = seg["rew_int"] / np.sqrt(self.rff_rms_int.var)

    cur_lrmult = 1.0
    add_vtarg_and_adv(seg, self.gamma, self.lam)
    ob, unnorm_ac, atarg_ext, tdlamret_ext, atarg_int, tdlamret_int = (
        seg["ob"], seg["unnorm_ac"], seg["adv_ext"], seg["tdlamret_ext"],
        seg["adv_int"], seg["tdlamret_int"])
    vpredbefore_ext, vpredbefore_int = seg["vpred_ext"], seg["vpred_int"]  # predicted value functions before update
    atarg_ext = (atarg_ext - atarg_ext.mean()) / atarg_ext.std()  # standardized advantage function estimate
    atarg_int = (atarg_int - atarg_int.mean()) / atarg_int.std()
    atarg = self.int_coeff * atarg_int + self.ext_coeff * atarg_ext
    d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg,
                     vtarg_ext=tdlamret_ext, vtarg_int=tdlamret_int),
                shuffle=not self.pi.recurrent)

    if hasattr(self.pi, "ob_rms"):
        self.pi.update_obs_rms(ob)  # update running mean/std for policy
    if hasattr(self.int_rew, "ob_rms"):
        self.int_rew.update_obs_rms(ob)  # update running mean/std for int_rew
    self.assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log2("Optimizing...")
    logger.log2(fmt_row(13, self.loss_names))

    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = []  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            lg = self.lossandgrad(batch["ac"], batch["atarg"], batch["vtarg_ext"],
                                  batch["vtarg_int"], cur_lrmult,
                                  *zip(*batch["ob"].tolist()))
            new_losses, g = lg[:-1], lg[-1]
            self.adam.update(g, self.optim_stepsize * cur_lrmult)
            losses.append(new_losses)
        logger.log2(fmt_row(13, np.mean(losses, axis=0)))

    logger.log2("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                        batch["vtarg_ext"], batch["vtarg_int"],
                                        cur_lrmult, *zip(*batch["ob"].tolist()))
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments(losses, axis=0)
    logger.log2(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, self.loss_names):
        logger.record_tabular("loss_" + name, lossval)
    logger.record_tabular("ev_tdlam_ext_before",
                          explained_variance(vpredbefore_ext, tdlamret_ext))
    return meanlosses
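# `self.rff_int` / `self.rff_rms_int` implement the intrinsic-reward normalization
# popularized by Random Network Distillation: intrinsic rewards are divided by the
# running std of their discounted return rather than of the raw reward. A minimal
# sketch of the forward filter, assuming a RunningMeanStd-style companion with an
# update(x) method (both names are assumptions, not this repo's definitions):
class RewardForwardFilter(object):
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted sum of rewards

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems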