def rollout_row(train_config_num, env_ind, env, q):
    mean_rollouts = np.zeros(len(phi_configs))
    std_rollouts = np.zeros(len(phi_configs))

    # iterate over test configurations
    for test_config_num, test_config in enumerate(phi_configs):
        print("train config num : {}".format(train_config_num))
        print("test config num : {}".format(test_config_num))
        rollouts = []

        # iterate over agents
        for agent_num in range(num_agents):
            real_config_num = train_config_num - 1
            if train_config_num == 0:
                real_config_num = "nominal"
            file_str = '../policies_curriculum/{}/policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], dynamic_environments[env_ind],
                real_config_num, agent_num)

            # read in the agent's policy
            policy = loadModel(file_str)

            if train_config_num == 0:
                # set configuration for nominal policy
                policy.set_config(test_config)
                curriculum = None
            else:
                # note that the policy config is set through the curriculum;
                # by having only one element, we ensure this is the config during rollouts
                assert isinstance(policy, CurriculumPolicy)
                curriculum = [test_config]

            cum_rewards = []
            for i in range(num_rollouts):
                rollout_dict = rollout(env=env, agent=policy,
                                       max_path_length=env.horizon,
                                       curriculum=curriculum)
                cum_rewards.append(np.sum(rollout_dict["rewards"]))
            rollouts.append(cum_rewards)

        mean_rollouts[test_config_num] = np.mean(rollouts)
        std_rollouts[test_config_num] = np.std(rollouts)
        q.put((train_config_num, test_config_num,
               mean_rollouts[test_config_num], std_rollouts[test_config_num]))

    # also write to file in case something goes wrong with multiprocessing
    saveModel([mean_rollouts, std_rollouts],
              'rollouts_{}_config_{}'.format(dynamic_environments[env_ind],
                                             train_config_num))
    print("GOT HERE {}".format(train_config_num))
    return
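# A minimal sketch of how rollout_row could be driven, assuming one worker process per
# training configuration and a shared multiprocessing.Queue for collecting the
# (train, test, mean, std) tuples it puts on q. The helper name evaluate_all and the
# count of training configurations (one nominal policy plus one per curriculum config)
# are assumptions, not part of the original script.
from multiprocessing import Process, Queue

def evaluate_all(env_ind):
    env = GymEnv(dynamic_environments[env_ind])
    q = Queue()
    num_train_configs = len(curriculum_configs) + 1  # nominal + curricula (assumed)

    # launch one worker per training configuration
    procs = [Process(target=rollout_row, args=(tc, env_ind, env, q))
             for tc in range(num_train_configs)]
    for p in procs:
        p.start()

    # drain the queue: one tuple per (train config, test config) pair
    means = np.zeros((num_train_configs, len(phi_configs)))
    stds = np.zeros((num_train_configs, len(phi_configs)))
    for _ in range(num_train_configs * len(phi_configs)):
        train_c, test_c, m, s = q.get()
        means[train_c, test_c] = m
        stds[train_c, test_c] = s

    for p in procs:
        p.join()
    return means, stds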
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=NUM_ITERS,
    discount=0.995,
    step_size=0.01,
    gae_lambda=0.97,
    sampler_args={'n_workers': 2},
    plot_learning_curve=GENERATE_PLOTS,
    trial=trial,
)
avg_rewards, std_rewards = algo.train()

print('trial {}'.format(trial))
saveModel(
    algo.policy,
    'policy_{}_{}_{}_{}_{}_{}_{}_{}'.format(ENV_NAME, TRAIN_ADVERSARIAL, NUM_ITERS,
                                            PROBABILITY, EPS, MAX_NORM, USE_DYNAMICS,
                                            trial))

# save rewards per model over the iterations
if GENERATE_PLOTS:
    saveModel([range(NUM_ITERS), avg_rewards, std_rewards],
              'rewards_{}_{}_{}_{}_{}_{}_{}_{}'.format(
                  ENV_NAME, TRAIN_ADVERSARIAL, NUM_ITERS, PROBABILITY,
                  EPS, MAX_NORM, USE_DYNAMICS, trial))
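# saveModel / loadModel are project-specific helpers used throughout these scripts.
# Below is a minimal sketch of one possible pickle-based implementation; the '.save'
# extension and pickling the entire policy object are assumptions, not the original code.
import pickle

def saveModel(obj, name):
    # serialize an arbitrary object (policy, reward arrays, rollout stats, ...)
    with open('{}.save'.format(name), 'wb') as f:
        pickle.dump(obj, f)

def loadModel(name):
    # inverse of saveModel: restore the pickled object from disk
    with open('{}.save'.format(name), 'rb') as f:
        return pickle.load(f)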
def train(env_ind, config_num, num_agents):
    # get the original state space size first
    org_env = GymEnv(original_environments[env_ind])
    org_env_size = org_env.observation_space.shape[0]
    org_env.terminate()

    # the environment
    env = GymEnv(dynamic_environments[env_ind])

    # the configuration settings
    curriculum_config = curriculum_configs[config_num]
    if env_ind == 0:
        # batch size for Inverted Pendulum
        curriculum_config.set_batch_size(5000)
    else:
        # batch size for all other environments
        curriculum_config.set_batch_size(25000)

    # the nominal config
    config = curriculum_config.curriculum_list[0]

    for agent_num in range(num_agents):
        # define policy by reading from config class
        policy = CurriculumPolicy(
            env_spec=env.spec,
            hidden_sizes=config.hidden_sizes,
            adaptive_std=config.adaptive_std,
            adversarial=config.adversarial,
            eps=config.eps,
            probability=config.probability,
            use_dynamics=config.use_dynamics,
            random=config.random,
            observable_noise=config.observable_noise,
            zero_gradient_cutoff=org_env_size,
            use_max_norm=config.use_max_norm,
            curriculum_list=list(curriculum_config.curriculum_list),
            update_freq=curriculum_config.update_freq,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=config.batch_size,
            max_path_length=env.horizon,
            n_itr=config.num_iter,
            discount=config.discount,
            step_size=config.step_size,
            gae_lambda=config.gae_lambda,
            num_workers=config.num_workers,
            plot_learning_curve=config.plot_learning_curve,
            trial=agent_num,
        )
        avg_rewards, std_rewards = algo.train()
        print("training completed!")

        saveModel(
            algo.policy,
            'policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], config_num, agent_num))

        # save rewards per model over the iterations
        if config.plot_learning_curve:
            saveModel([range(config.num_iter), avg_rewards, std_rewards],
                      'rewards_{}_config_{}_agent_{}'.format(
                          dynamic_environments[env_ind], config_num, agent_num))
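# A minimal sketch of a command-line entry point for train(), assuming the script
# exposes the environment index, curriculum configuration index, and number of agents
# as flags. The flag names mirror the `args` attributes used in the surrounding
# scripts (env_ind, config_num); the defaults are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_ind', type=int, default=0,
                        help='index into dynamic_environments / original_environments')
    parser.add_argument('--config_num', type=int, default=0,
                        help='index into curriculum_configs')
    parser.add_argument('--num_agents', type=int, default=5,
                        help='number of independently trained agents (assumed default)')
    args = parser.parse_args()
    train(args.env_ind, args.config_num, args.num_agents)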
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=config.batch_size,
    max_path_length=env.horizon,
    n_itr=config.num_iter,
    discount=config.discount,
    step_size=config.step_size,
    gae_lambda=config.gae_lambda,
    num_workers=config.num_workers,
    plot_learning_curve=config.plot_learning_curve,
    trial=agent_num,
)
avg_rewards, std_rewards = algo.train()
print("training completed!")

saveModel(
    algo.policy,
    'policy_{}_config_{}_agent_{}'.format(
        dynamic_environments[args.env_ind], args.config_num, agent_num))

# save rewards per model over the iterations
if config.plot_learning_curve:
    saveModel([range(config.num_iter), avg_rewards, std_rewards],
              'rewards_{}_config_{}_agent_{}'.format(
                  dynamic_environments[args.env_ind], args.config_num, agent_num))
# read in the agent's policy
policy = loadModel(file_str)

# set policy parameters to ensure we test correctly
# (these are used by the rollout function internally)
policy.adversarial = test_config.adversarial
policy.eps = test_config.eps
policy.probability = test_config.probability
policy.use_dynamics = test_config.use_dynamics
policy.random = test_config.random
policy.observable_noise = test_config.observable_noise
policy.use_max_norm = test_config.use_max_norm

cum_rewards = []
for i in range(num_rollouts):
    rollout_dict = rollout(env=env, agent=policy, max_path_length=env.horizon)
    cum_rewards.append(np.sum(rollout_dict["rewards"]))
rollouts.append(cum_rewards)

mean_rewards[test_config_num] = np.mean(rollouts)
std_rewards[test_config_num] = np.std(rollouts)

print("mean_rewards")
print(mean_rewards)
print("std_rewards")
print(std_rewards)

saveModel([mean_rewards, std_rewards],
          'rollouts_{}_config_{}'.format(
              dynamic_environments[args.env_ind], args.config_num))
# read in the agent's policy
policy = loadModel(fname)

o = env.reset()
original_dynamics = o[org_env_size:]
assert len(original_dynamics) == 2

for i in range(num_param_evals):
    for j in range(num_param_evals):
        new_dynamics = original_dynamics.copy()
        new_dynamics[0] = percentages[i] * original_dynamics[0]
        new_dynamics[1] = percentages[j] * original_dynamics[1]
        policy.set_dynamics(new_dynamics)
        policy.adversarial = False

        # curriculum is just the nominal config, no adversarial perturbations
        curriculum = [phi_configs[0]]

        # average over several rollouts
        cum_rewards = np.zeros(num_rollouts)
        for k in range(num_rollouts):
            rollout_dict = rollout(env=env, agent=policy,
                                   max_path_length=env.horizon,
                                   curriculum=curriculum)
            cum_rewards[k] = np.sum(rollout_dict["rewards"])
        results[i, j] = np.mean(cum_rewards)

saveModel(results, "epopt_{}".format(f_suffix))
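# One possible way to visualize the evaluation grid computed above: mean return as a
# heatmap over the two scaled dynamics parameters. The use of plt.imshow, the axis
# labels, and the output file name are illustrative assumptions; only results,
# percentages, and f_suffix come from the code above.
import matplotlib.pyplot as plt

plt.figure()
plt.imshow(results, origin='lower', aspect='auto',
           extent=[percentages[0], percentages[-1], percentages[0], percentages[-1]])
plt.colorbar(label='mean cumulative reward')
plt.xlabel('scale on dynamics parameter 1')
plt.ylabel('scale on dynamics parameter 0')
plt.title('Mean return over dynamics grid')
plt.savefig('epopt_grid_{}.png'.format(f_suffix))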
    baseline=baseline,
    batch_size=config.batch_size,
    max_path_length=env.horizon,
    n_itr=n_itr,
    discount=config.discount,
    step_size=config.step_size,
    gae_lambda=config.gae_lambda,
    num_workers=config.num_workers,
    plot_learning_curve=config.plot_learning_curve,
    trial=agent_num,
)
avg_rewards, std_rewards = algo.train()
print("training completed!")

saveModel(
    algo.policy,
    'policy_{}_config_{}_agent_{}'.format(
        dynamic_environments[args.env_ind], args.config_num, agent_num))

# save rewards per model over the iterations
# also plot the rewards
if config.plot_learning_curve:
    saveModel([range(n_itr), avg_rewards, std_rewards],
              'rewards_{}_config_{}_agent_{}'.format(
                  dynamic_environments[args.env_ind], args.config_num, agent_num))

    plt.figure()
    plt.plot(range(n_itr), avg_rewards)
    plt.title('Learning Curve')
    plt.savefig('mr_{}_config_{}_agent_{}.png'.format(
        dynamic_environments[args.env_ind], args.config_num, agent_num))