def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 1])

    # features = Features(name="features", ob=ob)
    policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32,
                    num_hid_layers=2, num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32,
                        num_hid_layers=2, num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env, policy, old_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10,
                      optim_stepsize=3e-4, optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy, sub_policies, env,
                                              macro_duration, num_rollouts,
                                              stochastic=True, args=args)

    x = 0
    while x < 10000:  # for x in range(10000):
        x = callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode.
        policy.reset()
        learner.syncMasterPolicies()

        env.env.randomizeCorrect()
        shared_goal = comm.bcast(env.env.realgoal, root=0)
        env.env.realgoal = shared_goal
        print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))

        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time + train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # learner.updateSubPolicies(test_seg,
            # log
            print(("%d: global: %s, local: %s" % (mini_ep, gmean, lmean)))
            if args.s:
                totalmeans.append(gmean)
                with open('./Fourrooms_google_20_mlsh/outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)
        x = x + 1

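# ---------------------------------------------------------------------------
# Illustrative launcher for the variant above -- NOT part of the original
# code. A minimal sketch, assuming the function lives in a module named
# `master`, that the script runs under MPI via mpi4py, and that a
# pass-through callback is acceptable; the flag names simply mirror the
# `args` attributes read by start().
# ---------------------------------------------------------------------------
import argparse

from mpi4py import MPI

import master  # assumed module containing the start() above


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', default='Fourrooms-v0')  # placeholder env id
    parser.add_argument('--num_subs', type=int, default=2)
    parser.add_argument('--macro_duration', type=int, default=10)
    parser.add_argument('--num_rollouts', type=int, default=2048)
    parser.add_argument('--warmup_time', type=int, default=20)
    parser.add_argument('--train_time', type=int, default=30)
    parser.add_argument('-s', action='store_true')  # dump learning curves
    args = parser.parse_args()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    workerseed = 1337 + 10000 * rank  # distinct, reproducible seed per worker

    # start() calls `x = callback(x)` each outer iteration; returning the
    # iteration unchanged keeps the loop counting normally (a checkpoint
    # loader could instead return a later iteration to resume from).
    def callback(it):
        return it

    master.start(callback, args, workerseed, rank, comm)


if __name__ == '__main__':
    main()
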
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space
    print("ob_space: %s" % ob_space)
    print("ac_space: %s" % ac_space)

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    num_batches = 15

    # observation in: flatten 1D/2D/3D observation spaces into a single vector.
    if len(ob_space.shape) == 1:
        ob = U.get_placeholder(name="ob", dtype=tf.float32,
                               shape=[None, ob_space.shape[0]])
    elif len(ob_space.shape) == 2:
        ob = U.get_placeholder(name="ob", dtype=tf.float32,
                               shape=[None, ob_space.shape[0] * ob_space.shape[1]])
    elif len(ob_space.shape) == 3:
        ob = U.get_placeholder(name="ob", dtype=tf.float32,
                               shape=[None, ob_space.shape[0] * ob_space.shape[1] * ob_space.shape[2]])
    else:
        raise Exception("unsupported observation space shape (%d)" % len(ob_space.shape))
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    gs_policy = GuessStepsPolicy(name="guess_steps", ob=ob, hid_size=32, num_hid_layers=5)
    policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32,
                    num_hid_layers=2, num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32,
                        num_hid_layers=2, num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env, policy, old_policy, sub_policies, old_sub_policies,
                      gs_policy, comm, clip_param=0.2, entcoeff=0,
                      optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy, sub_policies, gs_policy, env,
                                              macro_duration, num_rollouts,
                                              stochastic=True, args=args)

    hasRandomizeCorrect = hasattr(env, "env") and hasattr(env.env, "randomizeCorrect")

    for x in range(100000):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode.
        policy.reset()
        learner.syncGuessStepsPolicies()
        learner.syncMasterPolicies()

        if hasRandomizeCorrect:
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
            print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))

        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time + train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            if gmean > 0:
                learner.updateGuessStepsPolicyLoss(rolls)
            # print("steps:")
            # print(rolls["steps"])
            # print("gs_vpreds:")
            # print(rolls["gs_vpreds"])
            print("gs mean:")
            print(U.eval(tf.reduce_mean(tf.square(rolls["gs_vpreds"] - rolls["steps"]))))
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # log
            print(("%d: global: %s, local: %s" % (mini_ep, gmean, lmean)))
            if args.s:
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)

def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    sub_hidden_sizes = args.sub_hidden_sizes
    sub_policy_costs = args.sub_policy_costs

    save_folder = os.path.join("savedir/", args.savename)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    # policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    # old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    # sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    # old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    # single sub-policy variant: no master policy is built here
    sub_policy = SubPolicy(name="sub_policy_%i" % 0, ob=ob, ac_space=ac_space,
                           hid_size=sub_hidden_sizes[0], num_hid_layers=2)
    old_sub_policy = SubPolicy(name="old_sub_policy_%i" % 0, ob=ob, ac_space=ac_space,
                               hid_size=sub_hidden_sizes[0], num_hid_layers=2)

    learner = Learner(env, sub_policy, old_sub_policy, comm, clip_param=0.2,
                      entcoeff=0, optim_epochs=10, optim_stepsize=3e-5,
                      optim_batchsize=64, args=args)
    rollout = rollouts.traj_segment_generator(sub_policy, env, macro_duration,
                                              num_rollouts, stochastic=True,
                                              args=args,
                                              sub_policy_costs=sub_policy_costs)
    rollout_eval = rollouts.traj_segment_generator(sub_policy, env, macro_duration,
                                                   num_rollouts, stochastic=False,
                                                   args=args,
                                                   sub_policy_costs=sub_policy_costs)

    for x in range(1):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode.
        # policy.reset()
        # learner.syncMasterPolicies()

        try:
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
        except:
            # not every task exposes randomizeCorrect/realgoal
            pass
        # print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))

        # mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time + train_time / 10)
        mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()

            # save images, rewards, macro actions
            if 'rgb_arrays' in rolls:
                current_save_folder = os.path.join(save_folder, 'episode' + str(mini_ep))
                os.makedirs(current_save_folder, exist_ok=True)
                statistic_file = os.path.join(current_save_folder, 'statistic_file.txt')
                rgb_arrays_file = os.path.join(current_save_folder, 'rgb_arrays.pickle')
                with open(statistic_file, 'w') as f:
                    ep_ret = sum(rolls['rews_without_cost'])
                    f.write('%d: %f' % (mini_ep, ep_ret) + '\n')
                    needed_keys = ['macro_ac', 'rews_without_cost']
                    for key in needed_keys:
                        f.write(key + '\n')
                        for v in rolls[key]:
                            f.write(str(v) + ' ')
                        f.write('\n\n')
                rgb_arrays = np.array(rolls['rgb_arrays'])
                rgb_arrays.dump(rgb_arrays_file)

            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))

            # print(("Episode %d return: %s" % (mini_ep, rolls['ep_rets_without_cost'][0])))
            if args.s:
                # no master policy in this variant, so log the undiscounted
                # episode return instead of the master mean (gmean)
                totalmeans.append(rolls['ep_rets_without_cost'][0])
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)

            if mini_ep % 50 == 0:
                if args.num_subs != 1:
                    print("macro acts:", rolls['macro_ac'])

            # eval score
            if mini_ep % 50 == 0:
                returns = []
                for i in range(50):
                    rolls = rollout_eval.__next__()
                    returns.append(rolls['ep_rets_without_cost'][0])
                print("Episode %d return: %s" % (mini_ep, statistics.mean(returns)))

            # save session
            if mini_ep % 500 == 0:
                fname = os.path.join("savedir/", args.savename, 'checkpoints', '%.5i' % mini_ep)
                U.save_state(fname)

def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)
    env.seed(workerseed)
    env.set_experiment_id(args.id_number)
    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time
    num_batches = int(num_rollouts / 64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob", dtype=tf.float32,
                               shape=[None, master_ob.shape[0]])

    master_policy = Policy(name="master", ob=adv_ob, ac_space=0, hid_size=16,
                           num_hid_layers=2, num_subpolicies=2)
    old_master_policy = Policy(name="old_master", ob=adv_ob, ac_space=0, hid_size=16,
                               num_hid_layers=2, num_subpolicies=2)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env, master_policy, old_master_policy, sub_policies,
                      old_sub_policies, comm, clip_param=0.2, entcoeff=0,
                      optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)

    adv_generator = adv_gen(1.0, ob_space, perturb_func=grid_reflect_x,
                            delay=num_rollouts * args.warmup_time,
                            augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=grid_reflect_x)
    override = None

    rollout = rollouts.traj_segment_generator(adv_generator, master_policy,
                                              sub_policies, env, num_rollouts,
                                              stochastic=True, args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval, master_policy,
                                                   sub_policies, env_eval, 1,
                                                   stochastic=False, args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    file = open(fname, 'w')
    writer = csv.writer(file)

    if args.load is not None:
        fname = osp.join("./savedir/", args.load, args.load)
        U.load_state(fname)

    # saver = tf.train.Saver()
    # callback(0)
    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0

    totalmeans = []
    while mini_ep < args.warmup_time + train_time:
        mini_ep += 1
        if (mini_ep == args.warmup_time or args.warmup_time == 0):
            print("===================")
            print("START TRAINING WITH")
            print("===================")
            args.pretrain = -1
            sub_train = [False, True]
        # if(mini_ep == 200):
        #     adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls, 0.99, 0.98,
                                             num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,
        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval)
        if len(ret_buffer_eval) == 0:
            ret_eval_mean = -100
        fields = [mini_ep, ret_mean, ret_eval_mean,
                  rolls['latent_counts'][0], rolls['latent_counts'][1],
                  rolls['real_counts'][0], rolls['real_counts'][1]]
        writer.writerow(fields)
        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(mini_ep, ret_mean, ret_eval_mean))
        print("--------------------------------------------------")

        if args.save is not None:
            fname = osp.join("savedir/", args.save, args.save)
            U.save_state(fname)

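# ---------------------------------------------------------------------------
# Illustrative sketch, NOT original code: the standard GAE(gamma=0.99,
# lambda=0.98) recursion that a helper like rollouts.add_advantage_macro()
# presumably applies at the macro-decision level before the master-policy
# update. The field names ("new", "macro_vpred", "nextvpred", "macro_rew")
# and the trailing bootstrap value are assumptions for illustration; the
# real helper in this codebase may differ in detail.
# ---------------------------------------------------------------------------
import numpy as np


def add_advantage_macro_sketch(seg, gamma=0.99, lam=0.98):
    new = np.append(seg["new"], 0)                           # 1 marks an episode boundary
    vpred = np.append(seg["macro_vpred"], seg["nextvpred"])  # bootstrap with the next value
    rew = seg["macro_rew"]
    T = len(rew)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        adv[t] = lastgaelam
    seg["macro_adv"] = adv
    seg["macro_tdlamret"] = adv + vpred[:-1]                 # lambda-return targets for the critic
    return seg
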
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)
    env.seed(workerseed)
    # np.random.seed(workerseed)
    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time
    num_batches = int(num_rollouts / 64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob", dtype=tf.float32,
                               shape=[None, master_ob.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    master_policy = Policy(name="master", ob=adv_ob, ac_space=0, hid_size=8,
                           num_hid_layers=2, num_subpolicies=2)
    old_master_policy = Policy(name="old_master", ob=adv_ob, ac_space=0, hid_size=8,
                               num_hid_layers=2, num_subpolicies=2)

    # features = Features(name="features", ob=ob)
    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]

    # attack_grad = U.function([ob], tf.nn.l2_normalize(tf.gradients(sub_policies[0].vpred, ob)[0]))
    learner = Learner(env, master_policy, old_master_policy, sub_policies,
                      old_sub_policies, comm, clip_param=0.2, entcoeff=0,
                      optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)

    # adv_generator = adv_gen(ob_space, attack_grad, delay=args.warmup_time*num_rollouts)
    # adv_generator_eval = adv_gen(ob_space, attack_grad, delay=args.warmup_time*num_rollouts, dummy=True)
    adv_generator = adv_gen(1.0, ob_space, perturb_func=stoch_bias_grid,
                            delay=num_rollouts * args.warmup_time,
                            augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=stoch_perturb)
    override = None

    rollout = rollouts.traj_segment_generator(adv_generator, master_policy,
                                              sub_policies, env, num_rollouts,
                                              stochastic=True, args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval, master_policy,
                                                   sub_policies, env_eval, 1024,
                                                   stochastic=False, args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    if not os.path.exists(os.path.dirname(fname)):
        try:
            os.makedirs(os.path.dirname(fname))
        except OSError as exc:  # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    file = open(fname, 'w')
    writer = csv.writer(file)

    if args.load is not None:
        fname = os.path.join("./savedir/", args.load, args.load)
        U.load_state(fname)

    # saver = tf.train.Saver()
    # callback(0)
    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0

    totalmeans = []
    while mini_ep < args.warmup_time + train_time:
        mini_ep += 1
        if (mini_ep == args.warmup_time or args.warmup_time == 0):
            print("start training with")
            args.pretrain = -1
            sub_train = [False, True]
        # if(mini_ep == 200):
        #     adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls, 0.99, 0.98,
                                             num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,
        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval)
        if len(ret_buffer_eval) == 0:
            ret_eval_mean = -100
        fields = [mini_ep, ret_mean, ret_eval_mean,
                  rolls['latent_counts'][0], rolls['latent_counts'][1],
                  rolls['real_counts'][0], rolls['real_counts'][1]]
        writer.writerow(fields)
        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(
            mini_ep, ret_mean, ret_eval_mean))

        if args.save is not None:
            fname = os.path.join("savedir/", args.save, args.save)
            U.save_state(fname)

def start(callback, args):
    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    num_master_groups = args.num_master_grps
    # number of batches for the sub-policy optimization
    num_sub_batches = args.num_sub_batches
    # number of sub groups in each group
    num_sub_in_grp = args.num_sub_in_grp
    num_env = num_master_groups * num_sub_batches

    recurrent = args.subpol_network == 'lstm'
    if recurrent:
        nlstm = args.nlstm

    def make_env_vec(seed):
        # common random numbers in sub groups
        def make_env():
            env = gym.make(args.task)
            env.seed(seed)
            MONITORDIR = osp.join('savedir', args.savename, 'monitor')
            if not osp.exists(MONITORDIR):
                os.makedirs(MONITORDIR)
            monitor_path = osp.join(MONITORDIR, '%s-%d' % (args.task, seed))
            env = bench.Monitor(env, monitor_path, allow_early_resets=True)
            # env = gym.wrappers.Monitor(env, MONITORDIR, force=True,
            #                            video_callable=lambda episode_id: True)
            if 'Atari' in str(env.__dict__['env']):
                env = wrap_deepmind(env, frame_stack=True)
            return env

        # TODO: replace DummyVecEnv with multiprocessing based class
        return DummyVecEnv([make_env for _ in range(num_sub_in_grp)])

    envs = [make_env_vec(np.random.randint(0, 2**31 - 1))
            for _ in range(num_master_groups)]
    ob_space = envs[0].observation_space
    ac_space = envs[0].action_space

    # observation in.
    master_obs = [U.get_placeholder(name="master_ob_%i" % x, dtype=tf.float32,
                                    shape=[None] + list(ob_space.shape))
                  for x in range(num_master_groups)]
    policies = [Policy(name="policy_%i" % x, ob=master_obs[x], ac_space=ac_space,
                       num_subpolicies=num_subs, network=args.master_network)
                for x in range(num_master_groups)]
    old_policies = [Policy(name="old_policy_%i" % x, ob=master_obs[x], ac_space=ac_space,
                           num_subpolicies=num_subs, network=args.master_network)
                    for x in range(num_master_groups)]

    if not recurrent:
        sub_obs = [U.get_placeholder(name="sub_ob_%i" % x, dtype=tf.float32,
                                     shape=[None] + list(ob_space.shape))
                   for x in range(num_subs)]
        sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=sub_obs[x],
                                  ac_space=ac_space, network=args.subpol_network)
                        for x in range(num_subs)]
        old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=sub_obs[x],
                                      ac_space=ac_space, network=args.subpol_network)
                            for x in range(num_subs)]
    elif recurrent:
        envsperbatch = max(1, num_env // num_sub_batches)
        num_batches = num_env // envsperbatch
        nbatch = envsperbatch * num_rollouts
        sub_obs = [U.get_placeholder(name="sub_ob_%i" % x, dtype=tf.float32,
                                     shape=[nbatch] + list(ob_space.shape))
                   for x in range(num_subs)]
        sub_states = [U.get_placeholder(name="states_%i" % x, dtype=tf.float32,
                                        shape=[envsperbatch, 2 * nlstm])
                      for x in range(num_subs)]
        sub_masks = [U.get_placeholder(name="masks_%i" % x, dtype=tf.float32,
                                       shape=[nbatch])
                     for x in range(num_subs)]
        actor_sub_obs = [U.get_placeholder(name="actor_sub_ob_%i" % x, dtype=tf.float32,
                                           shape=[1] + list(ob_space.shape))
                         for x in range(num_subs)]
        actor_sub_states = [U.get_placeholder(name="actor_states_%i" % x, dtype=tf.float32,
                                              shape=[1, 2 * nlstm])
                            for x in range(num_subs)]
        actor_sub_masks = [U.get_placeholder(name="actor_masks_%i" % x, dtype=tf.float32,
                                             shape=[1])
                           for x in range(num_subs)]

        sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=sub_obs[x],
                                  ac_space=ac_space, network=args.subpol_network,
                                  nsteps=num_rollouts, nbatch=nbatch, nlstm=nlstm,
                                  states=sub_states[x], masks=sub_masks[x])
                        for x in range(num_subs)]
        old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=sub_obs[x],
                                      ac_space=ac_space, network=args.subpol_network,
                                      nsteps=num_rollouts, nbatch=nbatch, nlstm=nlstm,
                                      states=sub_states[x], masks=sub_masks[x])
                            for x in range(num_subs)]
        actor_sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=actor_sub_obs[x],
                                        ac_space=ac_space, network=args.subpol_network,
                                        nsteps=1, nbatch=1, nlstm=nlstm,
                                        states=actor_sub_states[x],
                                        masks=actor_sub_masks[x], reuse=True)
                              for x in range(num_subs)]

    learner = Learner(envs, policies, sub_policies, old_policies, old_sub_policies,
                      clip_param=0.2, vfcoeff=args.vfcoeff, entcoeff=args.entcoeff,
                      divcoeff=args.divcoeff, optim_epochs=10,
                      master_lr=args.master_lr, sub_lr=args.sub_lr,
                      optim_batchsize=32,
                      envsperbatch=envsperbatch if recurrent else 0,
                      num_rollouts=num_rollouts,
                      nlstm=nlstm if recurrent else 0,
                      recurrent=recurrent)
    rollout = rollouts.traj_segment_generator(
        policies, actor_sub_policies if recurrent else sub_policies, envs,
        macro_duration, num_rollouts, num_sub_in_grp, stochastic=True, args=args)

    start_iter = 0
    if args.continue_iter is not None:
        start_iter = int(args.continue_iter) + 1

    for x in range(start_iter, 10000):
        callback(x)
        if x == 0:
            [sub_policy.reset() for sub_policy in sub_policies]
            print("synced subpols")

        # Run the inner meta-episode.
        [policy.reset() for policy in policies]
        learner.reset_master_optimizer()

        for i in range(num_master_groups):
            seed = np.random.randint(0, 2**31 - 1)
            for j in range(num_sub_in_grp):
                # NOTE: implement env sampling in the seed function;
                # it is seeded only the first time seed is called.
                # every subsequent call would call a sampler to randomize the env.
                # this 'overloading' is to ensure compatibility with wrappers.
                envs[i].envs[j].seed(seed)

        # TODO: is warm-up staggering necessary?
        mini_ep = 0
        totalmeans = []
        while mini_ep < warmup_time + train_time:
            print('*' * 10 + ' Iteration %d, Mini-ep %d ' % (x, mini_ep) + '*' * 10)
            if mini_ep == 0:
                print('WARM-UP')
            elif mini_ep == warmup_time:
                print('JOINT TRAINING')
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            learner.updateMasterPolicy(rolls)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs,
                                                 recurrent=recurrent)
            learner.updateSubPolicies(test_seg, num_sub_batches, num_rollouts, num_env,
                                      optimize=(mini_ep >= warmup_time),
                                      recurrent=recurrent)
            mini_ep += 1

def start(callback, args, workerseed, rank, comm, logdir):
    if args.task in ['OverCooked']:
        import overcooked
        env = overcooked.OverCooked(args=args, )
    else:
        env = gym.make(args.task)
    if rank == 0:
        summary_writer = tf.summary.FileWriter(logdir)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32,
                    num_hid_layers=2, num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32,
                        num_hid_layers=2, num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=32, num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env, policy, old_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10,
                      optim_stepsize=3e-5, optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy, sub_policies, env,
                                              macro_duration, num_rollouts,
                                              stochastic=True, args=args)

    start_time = time.time()
    num_interation = 2000
    episode_rewards = {}
    for x in range(num_interation):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode.
        policy.reset()
        learner.syncMasterPolicies()

        # change goal in this function, do not change goal in reset,
        # make sure the logic of done-reset is correct
        env.randomizeCorrect()
        # shared_goal = comm.bcast(env.single_goal, root=0)
        # env.single_goal = shared_goal
        if args.reward_level == 1:
            print("It is iteration %d so i'm changing the goal to %s" % (x, env.single_goal))
        elif args.reward_level == 2:
            print("It is iteration %d so i'm changing the goal to %s" % (x, env.realgoal))

        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time + train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            try:
                episode_rewards[env.single_goal] += [gmean]
            except Exception as e:
                episode_rewards[env.single_goal] = [gmean]
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))

            if rank in [0]:
                print_string = ""
                summary = tf.Summary()
                try:
                    print_string += "[{}] goal {}, remaining {:.2f} hours".format(
                        x, env.single_goal,
                        (time.time() - start_time) / (x) * (num_interation - x) / 60.0 / 60.0,
                    )
                except Exception as e:
                    pass
                print_string += ", ep_rew for {}-th goal: {:.2f}".format(
                    env.single_goal,
                    episode_rewards[env.single_goal][-1],
                )
                summary.value.add(
                    tag='ep rew for goal {}'.format(env.single_goal, ),
                    simple_value=episode_rewards[env.single_goal][-1],
                )
                summary.value.add(
                    tag='ep rew (all) for goal {}'.format(env.single_goal, ),
                    simple_value=np.mean(episode_rewards[env.single_goal]),
                )
                print(print_string)
                summary_writer.add_summary(summary, x)
                summary_writer.flush()

def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    sub_hidden_sizes = args.sub_hidden_sizes
    sub_policy_costs = args.sub_policy_costs
    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32,
                    num_hid_layers=2, num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32,
                        num_hid_layers=2, num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=sub_hidden_sizes[x], num_hid_layers=2)
        for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                  hid_size=sub_hidden_sizes[x], num_hid_layers=2)
        for x in range(num_subs)
    ]

    learner = Learner(env, policy, old_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10,
                      optim_stepsize=3e-5, optim_batchsize=64, args=args)
    rollout = rollouts.traj_segment_generator(
        policy, sub_policies, env, macro_duration, num_rollouts,
        stochastic=True, args=args, sub_policy_costs=sub_policy_costs)
    fixed_policy_rollouts = []
    for i in range(num_subs):
        fixed_policy_rollouts.append(
            rollouts.traj_segment_generator(policy, sub_policies, env, macro_duration,
                                            num_rollouts, stochastic=True, args=args,
                                            sub_policy_costs=sub_policy_costs,
                                            fixed_policy=i))

    for x in range(1):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode.
        policy.reset()
        learner.syncMasterPolicies()

        try:
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
        except:
            # not every task exposes randomizeCorrect/realgoal
            pass
        # print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))

        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time + train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # learner.updateSubPolicies(test_seg,
            # log
            # print(("%d: global: %s, local: %s" % (mini_ep, gmean, lmean)))
            print(("Episode %d return: %s" % (mini_ep, lmean)))
            if args.s:
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)

            # evaluate sub-policies separately
            if mini_ep % 50 == 0:
                if args.num_subs != 1:
                    print("macro acts:", rolls['macro_ac'])
                for i, fix_policy_rollout in enumerate(fixed_policy_rollouts):
                    collected_rolls = []
                    for _ in range(10):
                        collected_rolls.extend(
                            fix_policy_rollout.__next__()['ep_rets_without_cost'])
                    print("sub %d: %.3f" % (i, statistics.mean(collected_rolls)), end=', ')
                print()

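# ---------------------------------------------------------------------------
# Illustrative sketch, NOT original code: several variants above dump the
# running list of master-policy means to 'outfile<x>.pickle' when args.s is
# set. This loads and plots one such file, assuming matplotlib is available;
# the filename is an example, substitute whatever a run actually wrote.
# ---------------------------------------------------------------------------
import pickle

import matplotlib.pyplot as plt

with open('outfile0.pickle', 'rb') as fp:
    totalmeans = pickle.load(fp)  # list of per-mini-episode global means (gmean)

plt.plot(totalmeans)
plt.xlabel('mini-episode')
plt.ylabel('mean master-policy return')
plt.title('MLSH master-policy learning curve')
plt.show()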