Example No. 1
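The snippets on this page are reproduced without their import headers. A minimal header they appear to assume, under an MLSH-style repository layout (the module paths below are assumptions, not taken from the original source; example-specific extras such as csv, time, collections.deque, errno, MPI, and the baselines vec-env/Monitor wrappers are omitted):

# Hypothetical import header for the snippets below; module paths are assumed.
import os
import pickle
import statistics

import gym
import numpy as np
import tensorflow as tf

import baselines.common.tf_util as U      # the "U" helper used throughout
import rollouts                           # repo-local rollout/advantage utilities
from policy_network import Policy         # repo-local master policy
from subpolicy_network import SubPolicy   # repo-local sub-policy
from learner import Learner               # repo-local PPO-style learner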
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    sub_hidden_sizes = args.sub_hidden_sizes
    sub_policy_costs = args.sub_policy_costs


    save_folder = os.path.join("savedir/", args.savename)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    # policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    # old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)

    # sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    # old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    sub_policy = SubPolicy(name="sub_policy_%i" % 0, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[0], num_hid_layers=2)
    old_sub_policy = SubPolicy(name="old_sub_policy_%i" % 0, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[0], num_hid_layers=2)

    learner = Learner(env, sub_policy, old_sub_policy, comm, clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=64, args=args)
    rollout = rollouts.traj_segment_generator(sub_policy, env, macro_duration, num_rollouts,
                                              stochastic=True, args=args, sub_policy_costs=sub_policy_costs)
    rollout_eval = rollouts.traj_segment_generator(sub_policy, env, macro_duration, num_rollouts,
                                              stochastic=False, args=args, sub_policy_costs=sub_policy_costs)



    for x in range(1):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")
        # Run the inner meta-episode.

        # policy.reset()
        # learner.syncMasterPolicies()

        try:
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
        except Exception:
            # not every task exposes randomizeCorrect/realgoal on env.env
            pass

        # print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))
        # mini_ep = 0 if x > 0 else -1 * (rank % 10)*int(warmup_time+train_time / 10)
        mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time+train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            # save images, rewards, macro actions
            if 'rgb_arrays' in rolls:
                current_save_folder = os.path.join(save_folder, 'episode' + str(mini_ep))
                os.makedirs(current_save_folder, exist_ok=True)
                statistic_file = os.path.join(current_save_folder, 'statistic_file.txt')
                rgb_arrays_file = os.path.join(current_save_folder, 'rgb_arrays.pickle')
                with open(statistic_file, 'w') as f:
                    ep_ret = sum(rolls['rews_without_cost'])
                    f.write('%d: %f' % (mini_ep, ep_ret) + '\n')
                    needed_keys = ['macro_ac', 'rews_without_cost']
                    for key in needed_keys:
                        f.write(key + '\n')
                        for v in rolls[key]:
                            f.write(str(v) + ' ')
                        f.write('\n\n')
                rgb_arrays = np.array(rolls['rgb_arrays'])
                rgb_arrays.dump(rgb_arrays_file)

            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98, num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # print(("Episode %d return: %s" % (mini_ep, rolls['ep_rets_without_cost'][0])))
            if args.s:
                # this variant never calls updateMasterPolicy, so gmean does not
                # exist here; log the episode return without sub-policy cost instead
                totalmeans.append(sum(rolls['rews_without_cost']))
                with open('outfile'+str(x)+'.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)


            if mini_ep % 50 == 0:
                if args.num_subs != 1:
                    print("macro acts:", rolls['macro_ac'])
                # eval score
                returns = []
                for i in range(50):
                    rolls = rollout_eval.__next__()
                    returns.append(rolls['ep_rets_without_cost'][0])
                print("Episode %d return: %s" % (mini_ep, statistics.mean(returns)))
            # save session
            if mini_ep % 500 == 0:
                fname = os.path.join("savedir/", args.savename, 'checkpoints', '%.5i'%mini_ep)
                U.save_state(fname)
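rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98) is called in every example but not shown. A minimal sketch of the generalized advantage estimation it presumably performs with gamma=0.99 and lambda=0.98, written against assumed segment keys (rew, vpred, nextvpred, new) rather than the repository's actual field names:

import numpy as np

def add_advantage(seg, gamma=0.99, lam=0.98):
    # GAE sketch: seg['rew'], seg['vpred'], seg['new'] are length-T arrays and
    # seg['nextvpred'] is the bootstrap value after the last step (assumed keys).
    new = np.append(seg["new"], 0)                      # 1 marks an episode start
    vpred = np.append(seg["vpred"], seg["nextvpred"])   # append bootstrap value
    T = len(seg["rew"])
    adv = np.empty(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]                # lambda-return critic targets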
Example No. 2
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time

    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    #ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 1])

    # features = Features(name="features", ob=ob)
    policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)

    sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2) for x in range(num_subs)]
    old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2) for x in range(num_subs)]

    learner = Learner(env, policy, old_policy, sub_policies, old_sub_policies, comm, clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy, sub_policies, env, macro_duration, num_rollouts, stochastic=True, args=args)

    x = 0
    while x < 10000:
        # for x in range(10000):
        x = callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")
        # Run the inner meta-episode.
        policy.reset()
        learner.syncMasterPolicies()

        env.env.randomizeCorrect()
        shared_goal = comm.bcast(env.env.realgoal, root=0)
        env.env.realgoal = shared_goal

        print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))
        mini_ep = 0 if x > 0 else -1 * (rank % 10)*int(warmup_time+train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time+train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98, num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # learner.updateSubPolicies(test_seg,
            # log
            print(("%d: global: %s, local: %s" % (mini_ep, gmean, lmean)))
            if args.s:
                totalmeans.append(gmean)
                with open('./Fourrooms_google_20_mlsh/outfile'+str(x)+'.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)
        x = x+1
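A hypothetical driver for these start(callback, args, workerseed, rank, comm) entry points, assuming an mpi4py setup in the style of baselines; the per-rank seed offset, the identity callback, and the session helper call are illustrative assumptions:

from mpi4py import MPI
import baselines.common.tf_util as U

def main(args):
    rank = MPI.COMM_WORLD.Get_rank()
    workerseed = args.seed + 10000 * rank     # distinct seed per MPI worker
    with U.make_session(num_cpu=1):           # single-threaded TF session
        start(callback=lambda it: it,         # placeholder iteration callback
              args=args,
              workerseed=workerseed,
              rank=rank,
              comm=MPI.COMM_WORLD)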
Example No. 3
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space
    print("ob_space: %s" % ob_space)
    print("ac_space: %s" % ac_space)

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time

    num_batches = 15

    # observation in.
    if (len(ob_space.shape) == 1):
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None, ob_space.shape[0]])
    elif (len(ob_space.shape) == 2):
        ob = U.get_placeholder(
            name="ob",
            dtype=tf.float32,
            shape=[None, ob_space.shape[0] * ob_space.shape[1]])
    elif (len(ob_space.shape) == 3):
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[
                                   None, ob_space.shape[0] *
                                   ob_space.shape[1] * ob_space.shape[2]
                               ])
    else:
        raise Exception("unsupported observer space shape (%d)" %
                        len(ob_space.shape))
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)

    gs_policy = GuessStepsPolicy(name="guess_steps",
                                 ob=ob,
                                 hid_size=32,
                                 num_hid_layers=5)
    policy = Policy(name="policy",
                    ob=ob,
                    ac_space=ac_space,
                    hid_size=32,
                    num_hid_layers=2,
                    num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy",
                        ob=ob,
                        ac_space=ac_space,
                        hid_size=32,
                        num_hid_layers=2,
                        num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env,
                      policy,
                      old_policy,
                      sub_policies,
                      old_sub_policies,
                      gs_policy,
                      comm,
                      clip_param=0.2,
                      entcoeff=0,
                      optim_epochs=10,
                      optim_stepsize=3e-5,
                      optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy,
                                              sub_policies,
                                              gs_policy,
                                              env,
                                              macro_duration,
                                              num_rollouts,
                                              stochastic=True,
                                              args=args)

    hasRandomizeCorrect = hasattr(env, "env") and hasattr(
        env.env, "randomizeCorrect")
    for x in range(100000):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")
        # Run the inner meta-episode.

        policy.reset()
        learner.syncGuessStepsPolicies()
        learner.syncMasterPolicies()

        if hasRandomizeCorrect:
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
            print("It is iteration %d so i'm changing the goal to %s" %
                  (x, env.env.realgoal))
        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time +
                                                         train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()

            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)

            gmean, lmean = learner.updateMasterPolicy(rolls)

            if gmean > 0:
                learner.updateGuessStepsPolicyLoss(rolls)
                #print("steps:")
                #print(rolls["steps"])
                #print("gs_vpreds:")
                #print(rolls["gs_vpreds"])
                print("gs mean:")
                print(
                    U.eval(
                        tf.reduce_mean(
                            tf.square(rolls["gs_vpreds"] - rolls["steps"]))))

            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls,
                                                 macro_duration,
                                                 0.99,
                                                 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches,
                                      (mini_ep >= warmup_time))

            # log
            print(("%d: global: %s, local: %s" % (mini_ep, gmean, lmean)))
            if args.s:
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)
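The placeholder branches above flatten 2-D and 3-D observation spaces into a single feature dimension; the matching flatten on the feed side would look roughly like this (a minimal sketch; the helper name is not from the original code):

import numpy as np

def flatten_ob(ob):
    # collapse a (H,), (H, W), or (H, W, C) observation into a [1, prod(shape)]
    # batch so it matches the flattened "ob" placeholder built above
    return np.asarray(ob, dtype=np.float32).reshape(1, -1)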
Example No. 4
    def __init__(self, envs, policies, sub_policies, old_policies, old_sub_policies, 
            clip_param=0.2, vfcoeff=1., entcoeff=0, divcoeff=0., optim_epochs=10, 
            master_lr=1e-3, sub_lr=3e-4, optim_batchsize=64, envsperbatch=None, 
            num_rollouts=None, nlstm=256, recurrent=False):
        self.policies = policies
        self.sub_policies = sub_policies
        self.old_policies = old_policies
        self.old_sub_policies = old_sub_policies
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_batchsize = optim_batchsize
        self.num_master_groups = num_master_groups = len(policies)
        self.num_subpolicies = num_subpolicies = len(sub_policies)
        self.ob_space = envs[0].observation_space
        self.ac_space = envs[0].action_space
        self.nbatch = nbatch = num_rollouts * envsperbatch
        self.envsperbatch = envsperbatch

        self.master_obs = [U.get_placeholder(name="master_ob_%i"%x, dtype=tf.float32,
            shape=[None] + list(self.ob_space.shape)) for x in range(num_master_groups)]
        self.master_acs = [policies[0].pdtype.sample_placeholder([None]) 
                for _ in range(num_master_groups)]
        self.master_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_master_groups)]
        self.master_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_master_groups)]
        retvals = zip(*[self.policy_loss(policies[i], 
            old_policies[i], self.master_obs[i], self.master_acs[i], self.master_atargs[i], 
            self.master_ret[i], clip_param, mask=tf.constant(1.), vfcoeff=vfcoeff, 
            entcoeff=entcoeff) for i in range(num_master_groups)])
        self.master_losses, self.master_kl, self.master_pol_surr, self.master_vf_loss, \
                self.master_entropy, self.master_values, _ = retvals 

        master_trainers = [tf.train.AdamOptimizer(learning_rate=master_lr, 
            name='master_adam_%i'%_) for _ in range(num_master_groups)]
        master_params = [policies[i].get_trainable_variables() 
                for i in range(num_master_groups)] 
        master_grads = [tf.gradients(self.master_losses[i], master_params[i])
                for i in range(num_master_groups)]
        master_grads = [list(zip(g, p)) for g, p in zip(master_grads, master_params)]
        # TODO: gradient clipping
        self.assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_policies[i].get_variables(), 
                policies[i].get_variables())]) for i in range(num_master_groups)]
        self.master_train_steps = [master_trainers[i].apply_gradients(master_grads[i])
                for i in range(num_master_groups)]
       

        if not recurrent:
            self.sub_obs = [U.get_placeholder(name="sub_ob_%i"%x, dtype=tf.float32,
                shape=[None] + list(self.ob_space.shape)) for x in range(num_subpolicies)]
        self.sub_acs = [sub_policies[0].pdtype.sample_placeholder([None]) 
                for _ in range(num_subpolicies)]
        self.sub_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        self.sub_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        self.logpacs = [tf.placeholder(dtype=tf.float32, shape=[num_subpolicies, None])
                for _ in range(num_subpolicies)]
        self.loss_masks = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        if recurrent:
            self.sub_obs = [U.get_placeholder(name="sub_ob_%i"%x, dtype=tf.float32,
                shape=[nbatch] + list(self.ob_space.shape)) for x in range(num_subpolicies)]
            self.sub_masks = [U.get_placeholder(name="masks_%i"%_, dtype=tf.float32, 
                shape=[nbatch]) for _ in range(num_subpolicies)]
            self.sub_states = [U.get_placeholder(name="states_%i"%_, dtype=tf.float32, 
                shape=[envsperbatch, 2*nlstm]) for _ in range(num_subpolicies)]
        sub_retvals = zip(*[self.policy_loss(sub_policies[i], 
            old_sub_policies[i], self.sub_obs[i], self.sub_acs[i], self.sub_atargs[i], 
            self.sub_ret[i], clip_param, mask=self.loss_masks[i], vfcoeff=vfcoeff, 
            entcoeff=entcoeff, divcoeff=divcoeff, logpacs=None)#self.logpacs[i]) 
            for i in range(num_subpolicies)])
        self.sub_losses, self.sub_kl, self.sub_pol_surr, self.sub_vf_loss, \
                self.sub_entropy, self.sub_values, self.div_loss = sub_retvals 

        sub_trainers = [tf.train.AdamOptimizer(learning_rate=sub_lr)
                for _ in range(num_subpolicies)]
        sub_params = [sub_policies[i].get_trainable_variables() 
                for i in range(num_subpolicies)] 
        sub_grads = [tf.gradients(self.sub_losses[i], sub_params[i])
                for i in range(num_subpolicies)]
        sub_grads = [list(zip(g, p)) for g, p in zip(sub_grads, sub_params)]
        # TODO: gradient clipping
        self.subs_assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(), 
                sub_policies[i].get_variables())]) for i in range(num_subpolicies)]
        self.sub_train_steps = [sub_trainers[i].apply_gradients(sub_grads[i])
                for i in range(num_subpolicies)]

        U.initialize()
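The constructor above delegates to self.policy_loss, which is not reproduced here. A minimal sketch of the clipped-surrogate PPO objective it presumably resembles, ignoring the mask/divcoeff/logpacs extras and the additional return values, and assuming baselines-style pd/vpred attributes on the policy objects:

import tensorflow as tf

def ppo_loss(pi, oldpi, ac, atarg, ret, clip_param=0.2, vfcoeff=1.0, entcoeff=0.0):
    # probability ratio between the current and old policies
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    surr1 = ratio * atarg
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))   # clipped surrogate
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))    # value-function loss
    entropy = tf.reduce_mean(pi.pd.entropy())
    kl = tf.reduce_mean(oldpi.pd.kl(pi.pd))                # for monitoring only
    total = pol_surr + vfcoeff * vf_loss - entcoeff * entropy
    return total, kl, pol_surr, vf_loss, entropy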
Example No. 5
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)

    env.seed(workerseed)
    env.set_experiment_id(args.id_number)

    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100,-100],dtype=np.float32),np.array([100,100],dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time

    num_batches = int(num_rollouts/64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob",dtype=tf.float32, shape=[None,master_ob.shape[0]])

    master_policy = Policy(name="master", ob=adv_ob, ac_space=0, hid_size=16, num_hid_layers=2, num_subpolicies=2)
    old_master_policy = Policy(name="old_master", ob=adv_ob, ac_space=0, hid_size=16, num_hid_layers=2, num_subpolicies=2)

    sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2) for x in range(num_subs)]
    old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2) for x in range(num_subs)]

    learner = Learner(env,master_policy,old_master_policy,sub_policies, old_sub_policies, comm, clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)

    adv_generator = adv_gen(1.0, ob_space, perturb_func= grid_reflect_x, delay=num_rollouts*args.warmup_time,augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func= grid_reflect_x)

    override=None

    rollout = rollouts.traj_segment_generator(adv_generator, master_policy, sub_policies, env, num_rollouts, stochastic=True, args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval, master_policy, sub_policies, env_eval, 1, stochastic=False, args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/'+args.filename +'.csv'
    file  = open(fname,'w')
    writer = csv.writer(file)
    if args.load is not None:
        fname = osp.join("./savedir/",args.load, args.load)
        U.load_state(fname)
    #saver = tf.train.Saver()

    #callback(0)


    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep=0
    totalmeans = []
    while mini_ep < args.warmup_time + train_time:

        mini_ep += 1
        if(mini_ep==args.warmup_time or args.warmup_time==0):
            print("===================")
            print("START TRAINING WITH")
            print("===================")
            args.pretrain = -1
            sub_train = [False,True]
        #if(mini_ep == 200):
         #   adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls, 0.99, 0.98, num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,
        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval) if len(ret_buffer_eval) > 0 else -100
        fields = [mini_ep, ret_mean,ret_eval_mean,rolls['latent_counts'][0],rolls['latent_counts'][1],rolls['real_counts'][0],rolls['real_counts'][1]]
        writer.writerow(fields)

        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(mini_ep,ret_mean, ret_eval_mean))
        print("--------------------------------------------------")
    if args.save is not None:
        fname = osp.join("savedir/", args.save, args.save)
        U.save_state(fname)
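The CSV logging in Examples No. 5 and 6 writes through a file handle that stays open for the whole run. A minimal alternative sketch that appends and flushes one row at a time (log_row is a hypothetical helper, not part of the original code):

import csv

def log_row(path, fields):
    # open in append mode, write a single row, and close so that partial runs
    # still leave a readable log file behind
    with open(path, 'a', newline='') as f:
        csv.writer(f).writerow(fields)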
Example No. 6
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)

    env.seed(workerseed)
    #np.random.seed(workerseed)
    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time

    num_batches = int(num_rollouts / 64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob",
                           dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob",
                               dtype=tf.float32,
                               shape=[None, master_ob.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])
    master_policy = Policy(name="master",
                           ob=adv_ob,
                           ac_space=0,
                           hid_size=8,
                           num_hid_layers=2,
                           num_subpolicies=2)
    old_master_policy = Policy(name="old_master",
                               ob=adv_ob,
                               ac_space=0,
                               hid_size=8,
                               num_hid_layers=2,
                               num_subpolicies=2)
    # features = Features(name="features", ob=ob)
    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]
    #attack_grad = U.function([ob],tf.nn.l2_normalize(tf.gradients(sub_policies[0].vpred, ob)[0]))
    learner = Learner(env,
                      master_policy,
                      old_master_policy,
                      sub_policies,
                      old_sub_policies,
                      comm,
                      clip_param=0.2,
                      entcoeff=0,
                      optim_epochs=10,
                      optim_stepsize=3e-4,
                      optim_batchsize=64)
    #adv_generator = adv_gen(ob_space,attack_grad,delay=args.warmup_time*num_rollouts)
    #adv_generator_eval = adv_gen(ob_space,attack_grad,delay=args.warmup_time*num_rollouts,dummy=True)
    adv_generator = adv_gen(1.0,
                            ob_space,
                            perturb_func=stoch_bias_grid,
                            delay=num_rollouts * args.warmup_time,
                            augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=stoch_perturb)
    override = None
    rollout = rollouts.traj_segment_generator(adv_generator,
                                              master_policy,
                                              sub_policies,
                                              env,
                                              num_rollouts,
                                              stochastic=True,
                                              args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval,
                                                   master_policy,
                                                   sub_policies,
                                                   env_eval,
                                                   1024,
                                                   stochastic=False,
                                                   args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    if not os.path.exists(os.path.dirname(fname)):
        try:
            os.makedirs(os.path.dirname(fname))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    file = open(fname, 'w')
    writer = csv.writer(file)
    if args.load is not None:
        fname = os.path.join("./savedir/", args.load, args.load)
        U.load_state(fname)
    #saver = tf.train.Saver()

    #callback(0)

    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0
    totalmeans = []
    while mini_ep < args.warmup_time + train_time:

        mini_ep += 1
        if (mini_ep == args.warmup_time or args.warmup_time == 0):
            print("start training with")
            args.pretrain = -1
            sub_train = [False, True]
        #if(mini_ep == 200):
        #   adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls,
                                             0.99,
                                             0.98,
                                             num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,
        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval) if len(ret_buffer_eval) > 0 else -100
        fields = [
            mini_ep, ret_mean, ret_eval_mean, rolls['latent_counts'][0],
            rolls['latent_counts'][1], rolls['real_counts'][0],
            rolls['real_counts'][1]
        ]
        writer.writerow(fields)

        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(
            mini_ep, ret_mean, ret_eval_mean))
    if args.save is not None:
        fname = os.path.join("savedir/", args.save, args.save)
        U.save_state(fname)
Example No. 7
def start(callback, args):
    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time

    num_master_groups = args.num_master_grps
    # number of batches for the sub-policy optimization
    num_sub_batches = args.num_sub_batches
    # number of sub groups in each group
    num_sub_in_grp = args.num_sub_in_grp
    num_env = num_master_groups * num_sub_batches

    recurrent = args.subpol_network == 'lstm'
    if recurrent:
        nlstm = args.nlstm

    def make_env_vec(seed):
        # common random numbers in sub groups
        def make_env():
            env = gym.make(args.task)
            env.seed(seed)
            MONITORDIR = osp.join('savedir', args.savename, 'monitor')
            if not osp.exists(MONITORDIR):
                os.makedirs(MONITORDIR)
            monitor_path = osp.join(MONITORDIR, '%s-%d' % (args.task, seed))
            env = bench.Monitor(env, monitor_path, allow_early_resets=True)
            #env = gym.wrappers.Monitor(env, MONITORDIR, force=True,
            #        video_callable=lambda episode_id: True)
            if 'Atari' in str(env.__dict__['env']):
                env = wrap_deepmind(env, frame_stack=True)
            return env

        # TODO: replace DummyVecEnv with multiprocessing based class
        return DummyVecEnv([make_env for _ in range(num_sub_in_grp)])

    envs = [
        make_env_vec(np.random.randint(0, 2**31 - 1))
        for _ in range(num_master_groups)
    ]
    ob_space = envs[0].observation_space
    ac_space = envs[0].action_space

    # observation in.
    master_obs = [
        U.get_placeholder(name="master_ob_%i" % x,
                          dtype=tf.float32,
                          shape=[None] + list(ob_space.shape))
        for x in range(num_master_groups)
    ]

    policies = [
        Policy(name="policy_%i" % x,
               ob=master_obs[x],
               ac_space=ac_space,
               num_subpolicies=num_subs,
               network=args.master_network) for x in range(num_master_groups)
    ]
    old_policies = [
        Policy(name="old_policy_%i" % x,
               ob=master_obs[x],
               ac_space=ac_space,
               num_subpolicies=num_subs,
               network=args.master_network) for x in range(num_master_groups)
    ]

    if not recurrent:
        sub_obs = [
            U.get_placeholder(name="sub_ob_%i" % x,
                              dtype=tf.float32,
                              shape=[None] + list(ob_space.shape))
            for x in range(num_subs)
        ]
        sub_policies = [
            SubPolicy(name="sub_policy_%i" % x,
                      ob=sub_obs[x],
                      ac_space=ac_space,
                      network=args.subpol_network) for x in range(num_subs)
        ]
        old_sub_policies = [
            SubPolicy(name="old_sub_policy_%i" % x,
                      ob=sub_obs[x],
                      ac_space=ac_space,
                      network=args.subpol_network) for x in range(num_subs)
        ]
    elif recurrent:
        envsperbatch = max(1, num_env // num_sub_batches)
        num_batches = num_env // envsperbatch
        nbatch = envsperbatch * num_rollouts

        sub_obs = [
            U.get_placeholder(name="sub_ob_%i" % x,
                              dtype=tf.float32,
                              shape=[nbatch] + list(ob_space.shape))
            for x in range(num_subs)
        ]
        sub_states = [
            U.get_placeholder(name="states_%i" % x,
                              dtype=tf.float32,
                              shape=[envsperbatch, 2 * nlstm])
            for x in range(num_subs)
        ]
        sub_masks = [
            U.get_placeholder(name="masks_%i" % x,
                              dtype=tf.float32,
                              shape=[nbatch]) for x in range(num_subs)
        ]

        actor_sub_obs = [
            U.get_placeholder(name="actor_sub_ob_%i" % x,
                              dtype=tf.float32,
                              shape=[1] + list(ob_space.shape))
            for x in range(num_subs)
        ]
        actor_sub_states = [
            U.get_placeholder(name="actor_states_%i" % x,
                              dtype=tf.float32,
                              shape=[1, 2 * nlstm]) for x in range(num_subs)
        ]
        actor_sub_masks = [
            U.get_placeholder(name="actor_masks_%i" % x,
                              dtype=tf.float32,
                              shape=[1]) for x in range(num_subs)
        ]

        sub_policies = [
            SubPolicy(name="sub_policy_%i" % x,
                      ob=sub_obs[x],
                      ac_space=ac_space,
                      network=args.subpol_network,
                      nsteps=num_rollouts,
                      nbatch=nbatch,
                      nlstm=nlstm,
                      states=sub_states[x],
                      masks=sub_masks[x]) for x in range(num_subs)
        ]
        old_sub_policies = [
            SubPolicy(name="old_sub_policy_%i" % x,
                      ob=sub_obs[x],
                      ac_space=ac_space,
                      network=args.subpol_network,
                      nsteps=num_rollouts,
                      nbatch=nbatch,
                      nlstm=nlstm,
                      states=sub_states[x],
                      masks=sub_masks[x]) for x in range(num_subs)
        ]
        actor_sub_policies = [
            SubPolicy(name="sub_policy_%i" % x,
                      ob=actor_sub_obs[x],
                      ac_space=ac_space,
                      network=args.subpol_network,
                      nsteps=1,
                      nbatch=1,
                      nlstm=nlstm,
                      states=actor_sub_states[x],
                      masks=actor_sub_masks[x],
                      reuse=True) for x in range(num_subs)
        ]

    learner = Learner(envs,
                      policies,
                      sub_policies,
                      old_policies,
                      old_sub_policies,
                      clip_param=0.2,
                      vfcoeff=args.vfcoeff,
                      entcoeff=args.entcoeff,
                      divcoeff=args.divcoeff,
                      optim_epochs=10,
                      master_lr=args.master_lr,
                      sub_lr=args.sub_lr,
                      optim_batchsize=32,
                      envsperbatch=envsperbatch if recurrent else 0,
                      num_rollouts=num_rollouts,
                      nlstm=nlstm if recurrent else 0,
                      recurrent=recurrent)
    rollout = rollouts.traj_segment_generator(
        policies,
        actor_sub_policies if recurrent else sub_policies,
        envs,
        macro_duration,
        num_rollouts,
        num_sub_in_grp,
        stochastic=True,
        args=args)

    start_iter = 0
    if args.continue_iter is not None:
        start_iter = int(args.continue_iter) + 1
    for x in range(start_iter, 10000):
        callback(x)
        if x == 0:
            [sub_policy.reset() for sub_policy in sub_policies]
            print("synced subpols")

        # Run the inner meta-episode.
        [policy.reset() for policy in policies]
        learner.reset_master_optimizer()

        for i in range(num_master_groups):
            seed = np.random.randint(0, 2**31 - 1)
            for j in range(num_sub_in_grp):
                # NOTE: implement env sampling in the seed function; the env is
                # seeded only the first time seed() is called, and every
                # subsequent call invokes a sampler to randomize the env.
                # This 'overloading' keeps compatibility with wrappers.
                envs[i].envs[j].seed(seed)

        # TODO: is warm-up staggering necessary?
        mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            print('*' * 10 + ' Iteration %d, Mini-ep %d ' % (x, mini_ep) +
                  '*' * 10)
            if mini_ep == 0:
                print('WARM-UP')
            elif mini_ep == warmup_time:
                print('JOINT TRAINING')
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            learner.updateMasterPolicy(rolls)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls,
                                                 macro_duration,
                                                 0.99,
                                                 0.98,
                                                 num_subpolicies=num_subs,
                                                 recurrent=recurrent)
            learner.updateSubPolicies(test_seg,
                                      num_sub_batches,
                                      num_rollouts,
                                      num_env,
                                      optimize=(mini_ep >= warmup_time),
                                      recurrent=recurrent)
            mini_ep += 1
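A small worked example of the recurrent batching arithmetic in Example No. 7, with purely illustrative values (num_master_grps=2, num_sub_batches=4, num_rollouts=512, and nlstm=256 are assumptions, not defaults from the original code):

num_master_groups = 2
num_sub_batches = 4
num_rollouts = 512
nlstm = 256

num_env = num_master_groups * num_sub_batches       # 2 * 4 = 8
envsperbatch = max(1, num_env // num_sub_batches)    # 8 // 4 = 2
num_batches = num_env // envsperbatch                # 8 // 2 = 4
nbatch = envsperbatch * num_rollouts                 # 2 * 512 = 1024
# sub_obs    : [nbatch, *ob_shape]      flattened (env, time) minibatch
# sub_states : [envsperbatch, 2*nlstm]  LSTM cell+hidden state, one row per env
# sub_masks  : [nbatch]                 episode-start flags that reset the LSTM state
print(num_env, envsperbatch, num_batches, nbatch)    # 8 2 4 1024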
Example No. 8
def start(callback, args, workerseed, rank, comm, logdir):
    if args.task in ['OverCooked']:
        import overcooked
        env = overcooked.OverCooked(args=args, )
    else:
        env = gym.make(args.task)

    if rank == 0:
        summary_writer = tf.summary.FileWriter(logdir)

    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time

    num_batches = 15
    # observation in.
    ob = U.get_placeholder(name="ob",
                           dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    policy = Policy(name="policy",
                    ob=ob,
                    ac_space=ac_space,
                    hid_size=32,
                    num_hid_layers=2,
                    num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy",
                        ob=ob,
                        ac_space=ac_space,
                        hid_size=32,
                        num_hid_layers=2,
                        num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env,
                      policy,
                      old_policy,
                      sub_policies,
                      old_sub_policies,
                      comm,
                      clip_param=0.2,
                      entcoeff=0,
                      optim_epochs=10,
                      optim_stepsize=3e-5,
                      optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy,
                                              sub_policies,
                                              env,
                                              macro_duration,
                                              num_rollouts,
                                              stochastic=True,
                                              args=args)

    start_time = time.time()
    num_iteration = 2000
    episode_rewards = {}

    for x in range(num_iteration):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")
        # Run the inner meta-episode.

        policy.reset()
        learner.syncMasterPolicies()

        # change the goal here rather than in reset(), so the done/reset logic stays correct
        env.randomizeCorrect()
        # shared_goal = comm.bcast(env.single_goal, root=0)
        # env.single_goal = shared_goal
        if args.reward_level == 1:
            print("It is iteration %d so i'm changing the goal to %s" %
                  (x, env.single_goal))
        elif args.reward_level == 2:
            print("It is iteration %d so i'm changing the goal to %s" %
                  (x, env.realgoal))
        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time +
                                                         train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:

            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)

            episode_rewards.setdefault(env.single_goal, []).append(gmean)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls,
                                                 macro_duration,
                                                 0.99,
                                                 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches,
                                      (mini_ep >= warmup_time))

        if rank in [0]:
            print_string = ""
            summary = tf.Summary()

            try:
                print_string += "[{}] goal {}, remaining {:.2f} hours".format(
                    x,
                    env.single_goal,
                    (time.time() - start_time) / (x) * (num_iteration - x) /
                    60.0 / 60.0,
                )
            except Exception as e:
                pass

            print_string += ", ep_rew for {}-th goal: {:.2f}".format(
                env.single_goal,
                episode_rewards[env.single_goal][-1],
            )

            summary.value.add(
                tag='ep rew for goal {}'.format(env.single_goal, ),
                simple_value=episode_rewards[env.single_goal][-1],
            )

            summary.value.add(
                tag='ep rew (all) for goal {}'.format(env.single_goal, ),
                simple_value=np.mean(episode_rewards[env.single_goal]),
            )

            print(print_string)
            summary_writer.add_summary(summary, x)
            summary_writer.flush()
Example No. 9
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    sub_hidden_sizes = args.sub_hidden_sizes
    sub_policy_costs = args.sub_policy_costs

    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob",
                           dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    policy = Policy(name="policy",
                    ob=ob,
                    ac_space=ac_space,
                    hid_size=32,
                    num_hid_layers=2,
                    num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy",
                        ob=ob,
                        ac_space=ac_space,
                        hid_size=32,
                        num_hid_layers=2,
                        num_subpolicies=num_subs)

    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=sub_hidden_sizes[x],
                  num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=sub_hidden_sizes[x],
                  num_hid_layers=2) for x in range(num_subs)
    ]

    learner = Learner(env,
                      policy,
                      old_policy,
                      sub_policies,
                      old_sub_policies,
                      comm,
                      clip_param=0.2,
                      entcoeff=0,
                      optim_epochs=10,
                      optim_stepsize=3e-5,
                      optim_batchsize=64,
                      args=args)
    rollout = rollouts.traj_segment_generator(
        policy,
        sub_policies,
        env,
        macro_duration,
        num_rollouts,
        stochastic=True,
        args=args,
        sub_policy_costs=sub_policy_costs)
    fixed_policy_rollouts = []
    for i in range(num_subs):
        fixed_policy_rollouts.append(
            rollouts.traj_segment_generator(policy,
                                            sub_policies,
                                            env,
                                            macro_duration,
                                            num_rollouts,
                                            stochastic=True,
                                            args=args,
                                            sub_policy_costs=sub_policy_costs,
                                            fixed_policy=i))

    for x in range(1):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")
        # Run the inner meta-episode.

        policy.reset()
        learner.syncMasterPolicies()

        try:
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
        except Exception:
            # not every task exposes randomizeCorrect/realgoal on env.env
            pass

        # print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))
        mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time +
                                                         train_time / 10)
        # mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls,
                                                 macro_duration,
                                                 0.99,
                                                 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches,
                                      (mini_ep >= warmup_time))
            # learner.updateSubPolicies(test_seg,
            # log
            # print(("%d: global: %s, local: %s" % (mini_ep, gmean, lmean)))
            print(("Episode %d return: %s" % (mini_ep, lmean)))
            if args.s:
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)

            # evaluate sub-policies separately
            if mini_ep % 50 == 0:
                if args.num_subs != 1:
                    print("macro acts:", rolls['macro_ac'])
                for i, fix_policy_rollout in enumerate(fixed_policy_rollouts):
                    collected_rolls = []
                    for _ in range(10):
                        collected_rolls.extend(fix_policy_rollout.__next__()
                                               ['ep_rets_without_cost'])
                    print("sub %d: %.3f" %
                          (i, statistics.mean(collected_rolls)),
                          end=', ')
                print()