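# Assumed module-level imports and helpers for the Learner variants below.
# The package paths follow the OpenAI baselines layout; the MLSH codebase uses
# its own forks of MpiAdam (whose update() takes an extra "do update" flag) and
# of the running-mean observation filter (ob_rms.noupdate()), so adjust these
# to whatever modules this repo actually ships.
import time
import os.path as osp

import numpy as np
import tensorflow as tf
from mpi4py import MPI

import baselines.common.tf_util as U
from baselines import logger
from baselines.common import Dataset, zipsame
from baselines.common.mpi_adam import MpiAdam

WRITE_SCALAR = False  # assumed flag: enable TensorBoard scalar logging


def flatten_lists(listoflists):
    # Assumed helper (as in baselines' PPO code): flatten a list of lists.
    return [el for list_ in listoflists for el in list_]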
# Alternate Learner.__init__ (single master policy trained with the shared
# policy_loss and optional TensorBoard logging of the total loss). Note that
# this variant calls policy_loss without the `other_pi` argument used by the
# Learner class further below.
def __init__(self, env, policy, old_policy, sub_policies, old_sub_policies,
             comm, clip_param=0.2, entcoeff=0, optim_epochs=10,
             optim_stepsize=3e-4, optim_batchsize=64):
    self.policy = policy
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    self.num_subpolicies = len(sub_policies)
    self.sub_policies = sub_policies
    ob_space = env.observation_space
    ac_space = env.action_space

    if WRITE_SCALAR:
        self.scalar_writer = tf.summary.FileWriter(
            osp.join("savedir/", 'checkpoints', 'scalar%d' % time.time()))

    # Inputs for training the master-policy parameters theta.
    ob = U.get_placeholder_cached(name="ob")
    ac = policy.pdtype.sample_placeholder([None])
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret, clip_param)
    self.master_policy_var_list = policy.get_trainable_variables()
    self.master_loss = U.function([ob, ac, atarg, ret],
                                  U.flatgrad(total_loss, self.master_policy_var_list))
    self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)

    summ = tf.summary.scalar("total_loss", total_loss)
    self.calc_summary = U.function([ob, ac, atarg, ret], [summ])

    # Copy the current master-policy parameters into the "old" policy.
    self.assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(old_policy.get_variables(),
                                             policy.get_variables())])

    self.assign_subs = []
    self.change_subs = []
    self.adams = []
    self.losses = []
    self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
    for i in range(self.num_subpolicies):
        varlist = sub_policies[i].get_trainable_variables()
        self.adams.append(MpiAdam(varlist))
        # Clipped PPO loss and flattened gradient for sub-policy i.
        loss = self.policy_loss(sub_policies[i], old_sub_policies[i], ob,
                                self.sp_ac, atarg, ret, clip_param)
        self.losses.append(U.function([ob, self.sp_ac, atarg, ret],
                                      U.flatgrad(loss, varlist)))
        self.assign_subs.append(U.function(
            [], [],
            updates=[tf.assign(oldv, newv)
                     for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(),
                                                 sub_policies[i].get_variables())]))
        self.zerograd = U.function([], self.nograd(varlist))

    U.initialize()
    self.master_adam.sync()
    for i in range(self.num_subpolicies):
        self.adams[i].sync()
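# Illustrative sketch (not used by the Learner classes): the clipped PPO
# surrogate that policy_loss builds symbolically below, written out in NumPy
# for concreteness. It relies on the numpy import in the header above, and all
# numbers in the example are made up.
def clipped_surrogate_example(logp_new, logp_old, adv, clip_param=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s), computed from log-probabilities, with
    # the old log-probability clipped to [-20, 20] exactly as in policy_loss.
    ratio = np.exp(logp_new - np.clip(logp_old, -20, 20))
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    # PPO maximizes the clipped surrogate, so the loss is its negated mean.
    return -np.mean(np.minimum(surr1, surr2))

# e.g. clipped_surrogate_example(np.array([-0.1]), np.array([-1.0]), np.array([1.0]))
# returns -1.2: the ratio exp(0.9) ~= 2.46 is clipped to 1.2 before weighting the
# positive advantage, which is exactly the effect of surr2/minimum in policy_loss.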
class Learner:
    def __init__(self, env, master_policy, old_master_policy, sub_policies,
                 old_sub_policies, comm, clip_param=0.2, entcoeff=0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64):
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.num_subpolicies = len(sub_policies)
        self.sub_policies = sub_policies
        self.master_policy = master_policy
        ob_space = env.observation_space
        ac_space = env.action_space

        self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
        atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function
        ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

        # Inputs for training the master-policy parameters theta.
        ob = U.get_placeholder_cached(name="ob")
        ob_master = U.get_placeholder_cached(name="adv_ob")
        ac_master = master_policy.pdtype.sample_placeholder([None])

        loss_master = self.policy_loss_master(master_policy, old_master_policy,
                                              ob_master, ac_master, atarg, ret,
                                              clip_param)
        self.master_policy_var_list = master_policy.get_trainable_variables()
        self.master_loss = U.function([ob_master, ac_master, atarg, ret],
                                      U.flatgrad(loss_master, self.master_policy_var_list))
        self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)

        # Copy the current master-policy parameters into the "old" policy.
        self.assign_old_eq_new = U.function(
            [], [],
            updates=[tf.assign(oldv, newv)
                     for (oldv, newv) in zipsame(old_master_policy.get_variables(),
                                                 master_policy.get_variables())])

        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []
        for i in range(self.num_subpolicies):
            varlist = sub_policies[i].get_trainable_variables()
            self.adams.append(MpiAdam(varlist))
            # Clipped PPO loss for sub-policy i, paired with the other
            # sub-policy for the separation term (assumes exactly two sub-policies).
            loss = self.policy_loss(sub_policies[i], sub_policies[(i - 1) % 2],
                                    old_sub_policies[i], ob, self.sp_ac, atarg,
                                    ret, clip_param)
            self.losses.append(
                U.function([ob, self.sp_ac, atarg, ret],
                           U.flatgrad(loss, varlist)))
            self.assign_subs.append(
                U.function(
                    [], [],
                    updates=[tf.assign(oldv, newv)
                             for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(),
                                                         sub_policies[i].get_variables())]))
            self.zerograd = U.function([], self.nograd(varlist))

        U.initialize()
        self.master_adam.sync()
        for i in range(self.num_subpolicies):
            self.adams[i].sync()

    def nograd(self, var_list):
        # Flat vector of zeros with the same total size as the gradient of var_list.
        return tf.concat(axis=0, values=[
            tf.reshape(tf.zeros_like(v), [U.numel(v)]) for v in var_list
        ])

    def policy_loss_master(self, pi, oldpi, ob, ac, atarg, ret, clip_param):
        ratio = tf.exp(
            pi.pd.logp(ac) - tf.clip_by_value(oldpi.pd.logp(ac), -20, 20))
        surr1 = ratio * atarg  # advantage * pnew / pold
        surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        # PPO-style clipped value-function loss.
        vfloss1 = tf.square(pi.vpred - ret)
        vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred,
                                                      -clip_param, clip_param)
        vfloss2 = tf.square(vpredclipped - ret)
        vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2))
        total_loss = pol_surr + vf_loss
        return total_loss

    def policy_loss(self, pi, other_pi, oldpi, ob, ac, atarg, ret, clip_param):
        # Encourages the sub-policies to stay distinct by penalizing a small KL
        # divergence to the other sub-policy (currently not added to the total).
        policy_separation_loss = 0.1 * U.mean(
            tf.reciprocal(pi.pd.kl(other_pi.pd)))
        ratio = tf.exp(
            pi.pd.logp(ac) - tf.clip_by_value(oldpi.pd.logp(ac), -20, 20))
        surr1 = ratio * atarg  # advantage * pnew / pold
        surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        # Clipped value-function loss, same form as in policy_loss_master.
        vfloss1 = tf.square(pi.vpred - ret)
        vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred,
                                                      -clip_param, clip_param)
        vfloss2 = tf.square(vpredclipped - ret)
        vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2))
        total_loss = pol_surr + vf_loss  # + policy_separation_loss
        return total_loss

    def syncMasterPolicies(self):
        self.master_adam.sync()

    def syncSubpolicies(self):
        for i in range(self.num_subpolicies):
            self.adams[i].sync()

    def updateMasterPolicy(self, seg):
        ob, ac, atarg, tdlamret = (seg["macro_ob"], seg["macro_ac"],
                                   seg["macro_adv"], seg["macro_tdlamret"])
        # ob = np.ones_like(ob)

        # Normalize advantages with statistics pooled across all MPI workers.
        mean = atarg.mean()
        std = atarg.std()
        meanlist = MPI.COMM_WORLD.allgather(mean)
        global_mean = np.mean(meanlist)
        real_var = std**2 + (mean - global_mean)**2
        variance_list = MPI.COMM_WORLD.allgather(real_var)
        global_std = np.sqrt(np.mean(variance_list))
        atarg = (atarg - global_mean) / max(global_std, 0.000001)

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
        optim_batchsize = min(self.optim_batchsize, ob.shape[0])
        self.master_policy.ob_rms.update(ob)  # update running mean/std for policy
        self.assign_old_eq_new()  # set old parameter values to new parameter values
        for _ in range(self.optim_epochs):
            for batch in d.iterate_once(optim_batchsize):
                g = self.master_loss(batch["ob"], batch["ac"], batch["atarg"],
                                     batch["vtarg"])
                # Note: the master step size is hardcoded to 0.01 rather than
                # using self.optim_stepsize.
                self.master_adam.update(g, 0.01, 1)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        logger.record_tabular("EpRewMean", np.mean(rews))
        return np.mean(rews), np.mean(seg["ep_rets"])

    def updateSubPolicies(self, test_segs, num_batches, optimize=[True, True]):
        # The default for `optimize` assumes exactly two sub-policies.
        for i in range(self.num_subpolicies):
            is_optimizing = True
            test_seg = test_segs[i]
            ob, ac, atarg, tdlamret = (test_seg["ob"], test_seg["ac"],
                                       test_seg["adv"], test_seg["tdlamret"])
            if np.shape(ob)[0] < 1:
                is_optimizing = False
            else:
                atarg = (atarg - atarg.mean()) / max(atarg.std(), 0.000001)
                test_d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                                 shuffle=True)
                test_size = int(ob.shape[0])
            self.assign_subs[i]()  # set old parameter values to new parameter values
            # Here we do a bunch of optimization epochs over the data.
            if is_optimizing and optimize[i]:
                self.sub_policies[i].ob_rms.update(ob)
                for k in range(self.optim_epochs):
                    m = 0
                    for test_batch in test_d.iterate_once(self.optim_batchsize):
                        test_g = self.losses[i](test_batch["ob"], test_batch["ac"],
                                                test_batch["atarg"],
                                                test_batch["vtarg"])
                        self.adams[i].update(test_g, self.optim_stepsize, 1)
                        m += 1
            else:
                # Apply zero gradients for the same number of update calls so the
                # collective operations inside MpiAdam stay matched across workers.
                self.sub_policies[i].ob_rms.noupdate()
                blank = self.zerograd()
                for _ in range(self.optim_epochs):
                    for _ in range(num_batches):
                        self.adams[i].update(blank, self.optim_stepsize, 0)
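# Hypothetical training-loop sketch (the rollout helper, the segment keys'
# construction, and num_rollout_batches are assumptions; only the Learner
# methods themselves come from the class above):
#
#   learner = Learner(env, master_policy, old_master_policy,
#                     sub_policies, old_sub_policies, comm=MPI.COMM_WORLD)
#   for iteration in range(num_iterations):
#       # Collect a segment with master-level ("macro_*") and per-sub-policy keys.
#       seg, test_segs = rollout(env, master_policy, sub_policies)
#       mean_global_rew, mean_local_rew = learner.updateMasterPolicy(seg)
#       learner.updateSubPolicies(test_segs, num_rollout_batches,
#                                 optimize=[True] * learner.num_subpolicies)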
# Variant of the Learner constructor for training a single sub-policy with a
# per-call entropy coefficient; the master-policy machinery is commented out.
def __init__(self, env, sub_policy, old_sub_policy, comm, clip_param=0.2,
             entcoeff=0, optim_epochs=10, optim_stepsize=3e-4,
             optim_batchsize=64, args=None):
    # self.policy = policy
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    # self.num_subpolicies = len(sub_policies)
    self.sub_policy = sub_policy
    self.args = args
    ob_space = env.observation_space
    ac_space = env.action_space

    # Inputs for training the sub-policy parameters theta.
    ob = U.get_placeholder_cached(name="ob")
    # ac = policy.pdtype.sample_placeholder([None])
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # This placeholder shadows the constructor argument: the entropy coefficient
    # is fed in at call time rather than baked into the graph.
    entcoeff = tf.placeholder(dtype=tf.float32, name="entcoef")

    # total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret, clip_param, entcoeff)
    # self.master_policy_var_list = policy.get_trainable_variables()
    # self.master_loss = U.function([ob, ac, atarg, ret, entcoeff], U.flatgrad(total_loss, self.master_policy_var_list))
    # self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)
    # self.assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
    #     for (oldv, newv) in zipsame(old_policy.get_variables(), policy.get_variables())])

    self.assign_subs = []
    self.change_subs = []
    self.adams = []
    self.losses = []
    self.sp_ac = sub_policy.pdtype.sample_placeholder([None])
    # for i in range(self.num_subpolicies):
    varlist = sub_policy.get_trainable_variables()
    self.adams.append(MpiAdam(varlist))
    # Clipped PPO loss and flattened gradient for the single sub-policy; this
    # variant's policy_loss also takes the entcoeff placeholder.
    loss = self.policy_loss(sub_policy, old_sub_policy, ob, self.sp_ac, atarg,
                            ret, clip_param, entcoeff)
    self.losses.append(
        U.function([ob, self.sp_ac, atarg, ret, entcoeff],
                   U.flatgrad(loss, varlist)))
    self.assign_subs.append(
        U.function(
            [], [],
            updates=[tf.assign(oldv, newv)
                     for (oldv, newv) in zipsame(old_sub_policy.get_variables(),
                                                 sub_policy.get_variables())]))
    self.zerograd = U.function([], self.nograd(varlist))

    U.initialize()
    # self.master_adam.sync()
    # for i in range(self.num_subpolicies):
    self.adams[0].sync()
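# Hypothetical call pattern for this single-sub-policy variant (the batch and
# coefficient names below are assumptions): because entcoeff is a placeholder,
# the entropy coefficient is supplied with every gradient evaluation, e.g.
#
#   g = self.losses[0](batch["ob"], batch["ac"], batch["atarg"],
#                      batch["vtarg"], current_entcoeff)
#   self.adams[0].update(g, self.optim_stepsize, 1)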