def test_const_adv(env, protag_policy, path_length=100, n_traj=5, render=False):
    ## Evaluate the protagonist against a constant (zero) adversary and
    ## return the average episode reward over n_traj rollouts.
    const_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                             is_protagonist=False,
                                             constant_val=0.0)
    paths = []
    sum_rewards = 0.0
    for _ in range(n_traj):
        path = rollout(env, protag_policy, path_length,
                       adv_agent=const_adv_policy,
                       animated=render, test=True)
        sum_rewards += path['rewards'].sum()
        paths.append(path)
    avg_rewards = sum_rewards / n_traj
    return avg_rewards
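## Note (illustrative, not from the original script): the sibling helpers used
## below -- test_rand_adv, test_step_adv, test_rand_step_adv and
## test_learnt_adv -- follow the same contract as test_const_adv, returning the
## average episode reward over n_traj rollouts against their respective
## adversaries. A hypothetical call looks like:
##     avg_rew = test_const_adv(env, pro_policy, path_length=1000, n_traj=10)
##     logger.log('Reward vs. zero adversary: {}'.format(avg_rew))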
def train(num_experiments, thread_id, queue):

    ############ DEFAULT PARAMETERS ############
    env_name = None                      #Name of adversarial environment
    path_length = 1000                   #Maximum episode length
    layer_size = tuple([100, 100, 100])  #Hidden layer sizes
    ifRender = False                     #Should we render?
    afterRender = 100                    #Render a test rollout every afterRender iterations
    n_exps = 1                           #Number of training instances to run
    n_itr = 25                           #Number of iterations of the alternating optimization
    n_pro_itr = 1                        #Number of iterations for the protagonist
    n_adv_itr = 1                        #Number of iterations for the adversary
    batch_size = 4000                    #Number of training samples per iteration
    ifSave = True                        #Should we save?
    save_every = 100                     #Save a checkpoint every save_every iterations
    n_process = 1                        #Number of parallel threads for sampling the environment
    adv_fraction = 0.25                  #Fraction of maximum adversarial force to be applied
    step_size = 0.01                     #KL step size for TRPO
    gae_lambda = 0.97                    #GAE lambda for the learner
    save_dir = './results'               #Folder to save results in

    ############ ENV SPECIFIC PARAMETERS ############
    env_name = 'HopperAdv-v1'
    layer_size = tuple([64, 64])
    step_size = 0.01
    gae_lambda = 1.0
    batch_size = 25000
    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4
    adv_fraction = 3.0
    save_dir = './../results/StaticHopper'

    args = [env_name, path_length, layer_size, ifRender, afterRender,
            n_exps, n_itr, n_pro_itr, n_adv_itr, batch_size, save_every,
            n_process, adv_fraction, step_size, gae_lambda, save_dir]

    ############ ADVERSARIAL POLICY LOAD ############
    filepath = './../initial_results/Hopper/env-HopperAdv-v1_Exp1_Itr500_BS25000_Adv0.25_stp0.01_lam1.0_369983.p'
    with open(filepath, 'rb') as f:
        res_D = pickle.load(f)
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############
    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument to GymEnv defines the relative magnitude of
        ## the adversary; for testing we set it to 1.0.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition (static, pre-trained adversary) ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Optimizer for the protagonist ##
        pro_algo = TRPO(env=env,
                        pro_policy=pro_policy,
                        adv_policy=adv_policy,
                        pro_baseline=pro_baseline,
                        adv_baseline=adv_baseline,
                        batch_size=batch_size,
                        max_path_length=path_length,
                        n_itr=n_pro_itr,
                        discount=0.995,
                        gae_lambda=gae_lambda,
                        step_size=step_size,
                        is_protagonist=True)

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length))

        ## Beginning alternating optimization ##
        for ni in range(n_itr):
            logger.log('\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                thread_id, ne, ni))

            ## Train protagonist ##
            pro_algo.train()
            pro_rews += pro_algo.rews
            all_rews += pro_algo.rews
            logger.log('Protag Reward: {}'.format(np.array(pro_algo.rews).mean()))

            ## Test the learnt policies ##
            const_testing_rews.append(
                test_const_adv(env, pro_policy, path_length=path_length))
            rand_testing_rews.append(
                test_rand_adv(env, pro_policy, path_length=path_length))
            step_testing_rews.append(
                test_step_adv(env, pro_policy, path_length=path_length))
            rand_step_testing_rews.append(
                test_rand_step_adv(env, pro_policy, path_length=path_length))
            adv_testing_rews.append(
                test_learnt_adv(env, pro_policy, adv_policy,
                                path_length=path_length))

            if ni % afterRender == 0 and ifRender:
                test_const_adv(env, pro_policy, path_length=path_length,
                               n_traj=1, render=True)

            if ni != 0 and ni % save_every == 0 and ifSave:
                ## SAVING CHECKPOINT INFO ##
                with open(save_name + '_' + str(ni) + '.p', 'wb') as f:
                    pickle.dump({
                        'args': args,
                        'pro_policy': pro_policy,
                        'adv_policy': adv_policy,
                        'zero_test': [const_testing_rews],
                        'rand_test': [rand_testing_rews],
                        'step_test': [step_testing_rews],
                        'rand_step_test': [rand_step_testing_rews],
                        'iter_save': ni,
                        'exp_save': ne,
                        'adv_test': [adv_testing_rews]
                    }, f)

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances ##
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([const_test_rew_summary,
               rand_test_rew_summary,
               step_test_rew_summary,
               rand_step_test_rew_summary,
               adv_test_rew_summary])

    ############ SAVING MODEL ############
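## Minimal driver sketch (an assumption, not part of the original script):
## train() reports its five test-reward summaries through the `queue` argument,
## so a plausible entry point spawns one process per training thread and
## collects the results. The thread and experiment counts below are
## illustrative only.
if __name__ == '__main__':
    from multiprocessing import Process, Queue

    n_threads = 2       ## hypothetical number of parallel training threads
    n_experiments = 1   ## passed to train() as num_experiments
    queue = Queue()
    workers = [Process(target=train, args=(n_experiments, tid, queue))
               for tid in range(n_threads)]
    for w in workers:
        w.start()
    ## Drain the queue before joining so large result lists do not block the workers.
    all_summaries = [queue.get() for _ in workers]
    for w in workers:
        w.join()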