def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # TODO: replace init_gym with one of my own functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    # TODO: replace wrappers.Monitor with a class of my own that controls the simulation;
    # the wrapper is probably of no use for this example
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # Seed the policy with the optimal trajectories
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)      # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
    add_gae(trajectories, gamma, lam)      # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function
    # Not sure this is still necessary:
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)      # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
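# The training loops in this file poll killer.kill_now to offer a clean exit on
# Ctrl-C. The GracefulKiller helper itself is not shown here; the sketch below is
# an assumption of what such a class looks like (built on the standard signal
# module), not the original implementation.
import signal


class GracefulKiller:
    """Flips kill_now to True when the process receives SIGINT or SIGTERM."""

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True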
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj, load_model): env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj) run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps) episode = 0 for _ in range(200): trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps) num_traj = len(trajectories) episode += len(trajectories) add_value(trajectories, val_func) add_disc_sum_rew(trajectories, gamma) add_gae(trajectories, gamma, lam) observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) policy.update(load_model, observes, actions, advantages, use_lr_adjust, ada_kl_penalty, c=0.) # update policy val_func.fit(observes, disc_sum_rew) # Save models policy.save_policy() val_func.save_val_func() logger.log("saved model")
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name, False) if time_state: obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H-%M-%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim, env_name) val_func = NNValueFunction(obs_dim, env_name, True) arg = [obs_dim, act_dim, kl_targ, time_state, env_name] policy = Policy(obs_dim, act_dim, kl_targ, env_name, True) episode = 0 # to create new file at beginning of trial #f= open("coor_state.txt","w") #f.close while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, arg, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout scaler.save() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b_%d_%H_%M_%S") # create unique directories #logger = Logger(logname=env_name, now=now) #aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('videos', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, scenario, num_agents, action_dim, timesteps): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() # env, obs_dim, act_dim = init_gym(env_name) env = make_env(scenario) obs_dims = env.observation_space act_dims = [env.action_space[0].n for i in range(env.n)] obs_dims = [obs_dim.shape[0] + 1 for obs_dim in obs_dims] # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=scenario, now=now) aigym_path = os.path.join('/tmp', scenario, now) # env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dims) val_func = NNValueFunction(obs_dims[0]+act_dims[0], hid1_mult) policys = [] for i in range(num_agents): policys.append(Policy(i, obs_dims[i], act_dims[0], kl_targ, hid1_mult, policy_logvar, num_agents-1, timesteps)) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policys, scaler, logger, act_dims[0], timesteps, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policys, scaler, logger, act_dims[0],timesteps, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, intents, act_trajs, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: # log_batch_stats(observes, actions,intents, act_trajs, advantages, disc_sum_rew, logger, episode) for i, policy in enumerate(policys): policy.update(observes[i], actions[i], intents[i], act_trajs[i], advantages[i], logger) # update policy val_func.fit(observes[i]+intents[i], disc_sum_rew[i], logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() for policy in policys: policy.close_sess() val_func.close_sess()
def main(arglist): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() # env, obs_dim, act_dim = init_gym(aenv_name) env = make_env(arglist.scenario, arglist) obs_dim = env.observation_space[0].shape[0] act_dim = env.action_space[0].n obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories aigym_path = os.path.join('/tmp', arglist.scenario, now) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, arglist.hid1_mult) trainers, loggers = get_trainers(env, arglist.num_adversaries, obs_dim, act_dim, arglist) # run a few episodes of untrained policy to initialize scaler: run_policy(env, trainers, scaler, loggers, arglist.max_episode_len , episodes=5) episode = 0 while episode < arglist.num_episodes: trajectories = run_policy(env, trainers, scaler, loggers, arglist.max_episode_len , episodes=arglist.b_size) episode += len(trajectories[0]) print("episode: {}".format(episode)) add_value(trajectories, val_func) add_disc_sum_rew(trajectories, arglist.gamma) add_gae(trajectories, arglist.gamma, arglist.lam) observations, actions, advantages, disc_sum_rews = build_train_set(trajectories) log_batch_stats(observations, actions, advantages, disc_sum_rews, loggers, episode) for i in range(len(trainers)): trainers[i].update(observations[i], actions[i], advantages[i], loggers[i]) val_func.fit(observations[i], disc_sum_rews[i], loggers[i]) loggers[i].write(display=True) if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False if episode % arglist.save_rate == 0: print("Episode {} complete".format(episode)) # score = play(env, policy1, policy2) for i in range(len(loggers)): loggers[i].close() trainers[i].close_sess() val_func.close_sess()
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name, render) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H-%M-%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim, env_name) val_func = NNValueFunction(obs_dim, env_name) policy = Policy(obs_dim, act_dim, kl_targ, env_name) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 #capture = False while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) """if episode > 600 and not capture: env.ScreenCapture(5) capture = True""" add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout scaler.save() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(num_episodes, gamma, lam, kl_targ, batch_size, env_name="Hopper-v2"): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = (datetime.datetime.utcnow() - datetime.timedelta(hours=4)).strftime("%b-%d_%H:%M:%S") # create dictionaries based on ets time logger = Logger(logname=env_name, now=now) plotter = Plot(plotname=env_name+"-Fig", now=now) aigym_path = os.path.join('/tmp', env_name, now) # env = wrappers.Monitor(env, aigym_path, force=True) # recording, dir?? scaler = Scaler(obs_dim) # obs_dim=377 val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) # kl target=0.003 by default # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, plotter, episodes=5, plot=False) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, plotter, episodes=batch_size) episode += len(trajectories) # length of trajectories equals batch size which by default is 20 plotter.updateEpisodes(episode) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger, plotter) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() plotter.plot() # plt.show() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size): """ Main training loop """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/home/vatsal', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    ''' Main training loop

    Args:
        env_name: Robot model name
        num_episodes: maximum number of episodes to run (int)
        gamma: reward discount factor (float)
        lam: lambda for Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    '''
    env, obs_dim, act_dim = init_env(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    acumulator = BestAcumulator()
    # TODO: add the sampling part once everything works
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, acumulator)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, batch_size, acumulator)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)      # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
    acumulator.save(pathFolder)
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)      # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Every 20000 episodes, save the models (value_func, policy, scaler) and average rewards
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            f = open("models/scaler-" + str(episode) + ".pkl", 'wb')
            pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            f2 = open("models/rewards-" + str(episode) + ".pkl", 'wb')
            pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
            f2.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # Show animation at the end of training
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
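# The loop above pickles the scaler and the running reward list every 20000
# episodes. A minimal sketch of reading those artifacts back later (the episode
# number and file names below are assumptions that mirror the dump calls above):
import pickle

with open("models/scaler-20000.pkl", "rb") as f:
    scaler = pickle.load(f)
with open("models/rewards-20000.pkl", "rb") as f:
    avg_rew_list = pickle.load(f)
scale, offset = scaler.get()
scale[-1], offset[-1] = 1.0, 0.0  # leave the time-step feature unscaled, as in the render loop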
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)
    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    env, obs_dim, act_dim = init_gym(env_name)
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = init_episode
    while episode <= num_episodes:
        if episode % 1000 == 0:  # use ==, not 'is', for integer comparison
            # record one episode
            record(env_name, aigym_path, policy, scaler)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)      # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # record one last episode
    record(env_name, aigym_path, policy, scaler)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, net_size_factor, noise_bias, weight, use_ppoclip): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_single" logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) # env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) if weight == "None": val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor) policy = None if use_ppoclip == "False": policy = Policy(obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor, noise_bias=noise_bias) elif use_ppoclip == "True": policy = PolicyClip(obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor, noise_bias=noise_bias) #assert False, "Not tested" else: assert False, "Unreachable" else: token = weight.split(".") token[-3] = token[-3][:-5] + "value" weight_2 = ".".join(token) val_func = NNValueFunctionContinue(weight_2, obs_dim, net_size_factor=net_size_factor) policy = PolicyContinue(weight, obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor, noise_bias=noise_bias) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger, scaler) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() # with open("test_dump", 'w') as f: # pickle.dump(policy, f) policy.close_sess() val_func.close_sess()
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj, load_model): env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) # scaler = Scaler(obs_dim) logger.log("loading scaler") with open('models/scaler/scaler.pkl', 'rb') as input: scaler = pickle.load(input) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj) logger.log("loading model") load_dir = "models/" policy.load_model(load_dir) load_v = False #whether load value function baseline or train from scratch; no big impact on stein if load_v == True: val_func.load_val_model(load_dir) episode = 0 trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) num_traj = len(trajectories) logger.log("Avg Length %d total Length %d"%( \ np.mean(traj_len_list), \ np.sum(traj_len_list))) episode += len(trajectories) #Split data into validation and training data random.shuffle(trajectories) t_trajectories = trajectories[:int(len(trajectories) / 2)] v_trajectories = trajectories[int(len(trajectories) / 2):] refit_v = True # if fit value function baseline once again before evaluating; no big impact on stein if refit_v == True: tt_trajectories = copy.deepcopy(t_trajectories) add_value(tt_trajectories, val_func) add_disc_sum_rew(tt_trajectories, gamma) add_gae(tt_trajectories, gamma, lam) tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set( tt_trajectories) logger.log("refit value function baseline") val_func.fit(tt_observes, tt_disc_sum_rew) # update value function logger.log("done") # build training data after refit v add_value(t_trajectories, val_func) add_disc_sum_rew(t_trajectories, gamma) add_gae(t_trajectories, gamma, lam) t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set( t_trajectories) # build validation data after refit v add_value(v_trajectories, val_func) add_disc_sum_rew(v_trajectories, gamma) add_gae(v_trajectories, gamma, lam) v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set( v_trajectories) sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\ max_timesteps, env_name, phi_obj, seed, max_timesteps) if not os.path.exists(sub_folder): os.mkdir(sub_folder) # save original gradient mc_grad_info = policy.get_batch_gradient(v_observes, v_actions, v_advantages, c=0.) mc_grad_info['traj_lens'] = traj_len_list with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes), 'wb') as fp: pickle.dump(mc_grad_info, fp) d = Dataset(dict(ob=t_observes, ac=t_actions, atarg=t_advantages, vtarg=t_disc_sum_rew), shuffle=True) for _ in range(phi_epochs): # optim_epochs for batch in d.iterate_once(128): # optim_batchsize policy.update(load_model, batch['ob'], batch['ac'], batch['atarg'], use_lr_adjust, ada_kl_penalty, c=1) # update policy stein_grad_info = policy.get_batch_gradient(v_observes, \ v_actions, v_advantages, c=1.) 
stein_grad_info['traj_lens'] = traj_len_list with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes), 'wb') as fp: pickle.dump(stein_grad_info, fp)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs, policy_hid_list, valfunc_hid_list, gpu_pct): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ # killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) env.seed(111 + mpi_util.rank) mpi_util.set_global_seeds(111 + mpi_util.rank) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) if mpi_util.rank == 0: now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories aigym_path = os.path.join('/tmp', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) logger = Logger(logname=env_name, now=now) policy = Policy(obs_dim, act_dim, kl_targ) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) if mpi_util.rank == 0: # run a few episodes (on node 0) of untrained policy to initialize scaler: trajectories = run_policy(env, policy, scaler, episodes=5) unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories]) scaler.update( unscaled) # update running statistics for scaling observations # broadcast policy weights, scaler, val_func (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func) worker_batch_size = int(batch_size / mpi_util.nworkers) # HACK if (worker_batch_size * mpi_util.nworkers != batch_size): print("batch_size:", batch_size, " is not divisible by nworkers:", mpi_util.nworkers) exit(1) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size) trajectories = mpi_util.gather_trajectories(trajectories) if mpi_util.rank == 0: # concatentate trajectories into one list trajectories = list(itertools.chain.from_iterable(trajectories)) print("did a batch of ", len(trajectories), " trajectories") print([t['rewards'].sum() for t in trajectories]) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: logger.log({ '_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]), 'Steps': np.sum([t['observes'].shape[0] for t in trajectories]) }) log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function unscaled = np.concatenate( [t['unscaled_obs'] for t in trajectories]) scaler.update( unscaled) # update running statistics for scaling observations logger.write( display=True) # write logger results to file and stdout # if mpi_util.rank == 0 and killer.kill_now: # if input('Terminate training (y/[n])? ') == 'y': # break # killer.kill_now = False # broadcast policy weights, scaler, val_func (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val( policy, scaler, val_func) if mpi_util.rank == 0: logger.close() policy.close_sess() if mpi_util.rank == 0: val_func.close_sess()
class Policy(): def __init__(self, name, obs_dim, act_dim, n_ways, batch_size, log_path, gamma=0.995, lam=0.98, kl_targ=0.003, hid1_mult=10, policy_logvar=1.0): self.name = name self.obs_dim, self.act_dim = obs_dim, act_dim self.n_ways = n_ways self.batch_size = batch_size self.gamma = gamma self.lam = lam self.kl_targ = kl_targ self.hid1_mult = hid1_mult self.policy_logvar = policy_logvar self.logger = Logger(logname=os.path.join(log_path, name), now=datetime.utcnow().strftime("%b_%d_%H_%M_%S")) self.scaler = Scaler(self.obs_dim) self.val_func = NNValueFunction(self.obs_dim, hid1_mult=10) self.trpo_net = TrpoNet(name, self.obs_dim, self.act_dim, n_ways=n_ways, kl_targ=kl_targ, hid1_mult=hid1_mult, policy_logvar=policy_logvar) self.trajectories = [] self.episode = 0 def update_scaler(self, unscaled): self.scaler.update( unscaled) # update running statistics for scaling observations def update(self, unscaled_obs, actions, rewards, env_idx=-1, trainWeight=False): scale, offset = self.scaler.get() scale[-1] = 1.0 offset[-1] = 0.0 observes = (unscaled_obs - offset) * scale trajectory = { 'observes': observes, 'actions': actions, 'rewards': rewards, 'unscaled_obs': unscaled_obs } self.trajectories.append(trajectory) if len(self.trajectories) > self.batch_size: unscaled = np.concatenate( [t['unscaled_obs'] for t in self.trajectories]) self.scaler.update( unscaled) # update running statistics for scaling observations self.logger.log({ '_{}_MeanReward'.format(self.name): np.mean([t['rewards'].sum() for t in self.trajectories]), '_{}_steps'.format(self.name): unscaled.shape[0] / self.batch_size }) trajs = copy.deepcopy(self.trajectories) self.trajectories = [] self.episode += len(trajs) self._add_value(trajs, self.val_func) # add estimated values to episodes self._add_disc_sum_rew( trajs, self.gamma) # calculated discounted sum of Rs self._add_gae(trajs, self.gamma, self.lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = self._build_train_set( trajs) self._log_batch_stats(observes, actions, advantages, disc_sum_rew, self.logger, self.episode) self.trpo_net.update(observes, actions, advantages, env_idx, self.logger, trainWeight=trainWeight) # update policy self.val_func.fit(observes, disc_sum_rew, self.logger) # update value function self.logger.write(display=False) def act(self, unscaled_obs): scale, offset = self.scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature #print(self.name,unscaled_obs.shape,len(offset)) obs = (unscaled_obs - offset) * scale action = self.trpo_net.sample(obs).reshape((1, -1)).astype(np.float32) return action def addway(self): self.n_ways += 1 var_dict = self.trpo_net.get_vars() new_pi = TrpoNet(self.name, self.obs_dim, self.act_dim, self.n_ways, self.kl_targ, self.hid1_mult, self.policy_logvar) new_pi.set_vars(var_dict) self.trpo_net.close_sess() self.trpo_net = new_pi gc.collect() def close_session(self): self.val_func.close_sess() self.trpo_net.close_sess() def _discount(self, x, gamma): """ Calculate discounted forward sum of a sequence at each point """ return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1] def _add_value(self, trajectories, val_func): """ Adds estimated value to all time steps of all trajectories Args: trajectories: as returned by run_policy() val_func: object with predict() method, takes observations and returns predicted state value Returns: None (mutates trajectories dictionary to add 'values') """ for 
trajectory in trajectories: observes = trajectory['observes'] values = val_func.predict(observes) trajectory['values'] = values def _add_disc_sum_rew(self, trajectories, gamma): """ Adds discounted sum of rewards to all time steps of all trajectories Args: trajectories: as returned by run_policy() gamma: discount Returns: None (mutates trajectories dictionary to add 'disc_sum_rew') """ for trajectory in trajectories: if gamma < 0.999: # don't scale for gamma ~= 1 rewards = trajectory['rewards'] * (1 - gamma) else: rewards = trajectory['rewards'] disc_sum_rew = self._discount(rewards, gamma) trajectory['disc_sum_rew'] = disc_sum_rew def _add_gae(self, trajectories, gamma, lam): """ Add generalized advantage estimator. https://arxiv.org/pdf/1506.02438.pdf Args: trajectories: as returned by run_policy(), must include 'values' key from add_value(). gamma: reward discount lam: lambda (see paper). lam=0 : use TD residuals lam=1 : A = Sum Discounted Rewards - V_hat(s) Returns: None (mutates trajectories dictionary to add 'advantages') """ for trajectory in trajectories: if gamma < 0.999: # don't scale for gamma ~= 1 rewards = trajectory['rewards'] * (1 - gamma) else: rewards = trajectory['rewards'] values = trajectory['values'] # temporal differences tds = rewards - values + np.append(values[1:] * gamma, 0) advantages = self._discount(tds, gamma * lam) trajectory['advantages'] = advantages def _build_train_set(self, trajectories): """ Args: trajectories: trajectories after processing by add_disc_sum_rew(), add_value(), and add_gae() Returns: 4-tuple of NumPy arrays observes: shape = (N, obs_dim) actions: shape = (N, act_dim) advantages: shape = (N,) disc_sum_rew: shape = (N,) """ observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) disc_sum_rew = np.concatenate( [t['disc_sum_rew'] for t in trajectories]) advantages = np.concatenate([t['advantages'] for t in trajectories]) # normalize advantages advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) return observes, actions, advantages, disc_sum_rew def _log_batch_stats(self, observes, actions, advantages, disc_sum_rew, logger, episode): """ Log various batch statistics """ logger.log({ '_mean_obs': np.mean(observes), '_min_obs': np.min(observes), '_max_obs': np.max(observes), '_std_obs': np.mean(np.var(observes, axis=0)), '_mean_act': np.mean(actions), '_min_act': np.min(actions), '_max_act': np.max(actions), '_std_act': np.mean(np.var(actions, axis=0)), '_mean_adv': np.mean(advantages), '_min_adv': np.min(advantages), '_max_adv': np.max(advantages), '_std_adv': np.var(advantages), '_mean_discrew': np.mean(disc_sum_rew), '_min_discrew': np.min(disc_sum_rew), '_max_discrew': np.max(disc_sum_rew), '_std_discrew': np.var(disc_sum_rew), '_Episode': episode })
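# _add_gae above implements Generalized Advantage Estimation (see the arXiv link in
# its docstring) from per-step rewards and value predictions. A self-contained toy
# example of the same arithmetic, written in plain NumPy/SciPy outside the class;
# the (1 - gamma) reward rescaling applied above for gamma < 0.999 is omitted here
# for clarity:
import numpy as np
import scipy.signal


def discount(x, gamma):
    # discounted forward sum at each time step, same trick as Policy._discount
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


rewards = np.array([1.0, 1.0, 1.0])
values = np.array([0.5, 0.4, 0.3])  # V_hat(s_t) from the value network
gamma, lam = 0.995, 0.98

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with V after the last step = 0
tds = rewards - values + np.append(values[1:] * gamma, 0)
advantages = discount(tds, gamma * lam)   # A_t = sum_k (gamma * lam)^k * delta_{t+k}
disc_sum_rew = discount(rewards, gamma)   # regression target for the value function
print(advantages, disc_sum_rew)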
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, animate, evaluate, load_ckpt): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) # Observations here are the previously applied torques + current angles of joints obs_dim = 8 + 8 + 5 # Actions are 1 torque value per oscillator act_dim = 8 now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) if evaluate: print("Evaluating: ") eval_agent(env, policy, logger, obs_dim, act_dim, 15) exit() if load_ckpt: print("Loading last ckpt: ") policy.restore_weights() # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, 5, animate) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, batch_size, animate) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.save_weights() policy.close_sess() val_func.save_weights() print("Saved policy and VF weights.") val_func.close_sess()
obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) act_dim = env.action_space.shape[0] # sess = tf.Session() policy = Policy(obs_dim, act_dim) val_func = NNValueFunction(obs_dim) # sess.run(tf.compat.v1.initializers.global_variables()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim) run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) observes, actions, advantages, disc_sum_rew = build_train_set( trajectories, val_func, gamma, lam) policy.update(observes, actions, advantages, logger) val_func.fit(observes, disc_sum_rew, logger) logger.log({ '_Episode': episode, }) logger.write(display=True)
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj, load_model, type): env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ,epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj, type=type) run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) # run a few to init scaler episode = 0 for i in range(2000): print("sampling and training at %s iteration\n"%(i)) trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) num_traj = len(trajectories) episode += len(trajectories) add_value(trajectories, val_func) add_disc_sum_rew(trajectories, gamma) add_gae(trajectories, gamma, lam) observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) policy.update(load_model, observes, actions, advantages, use_lr_adjust, ada_kl_penalty, c=0.) # update policy val_func.fit(observes, disc_sum_rew) # Save models policy.save_policy() val_func.save_val_func() refine_scaler = False if refine_scaler == True: run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) # run a few to refine scaler with open('models/scaler/scaler.pkl', 'wb') as output: pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL) logger.log("saved model")
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar,
         print_results, act_dim, obs_dim, final_pol_test, **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env = init_env(env_name, **kwargs)
    # add 1 to obs dimension for time step feature (see run_episode())
    obs_dim += 1
    tz = timezone('America/Montreal')  # Montreal Timezone
    dt = datetime.now(tz)
    # Create unique directories
    now = dt.strftime('%Y-%m-%d %H_%M_%S')
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
        dir = './log-files/' + env_name + '/' + now + '/'
    while episode < num_episodes:
        trajectories, tot_stuck = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew, np.sqrt(scaler.var_rew))
        # calculate advantage
        add_gae(trajectories, gamma, lam, scaler.mean_rew, np.sqrt(scaler.var_rew))
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew, unscaled_observes = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if raw_input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("log-learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        print('running simulations')
        tr, tot_stuck = run_policy(env, policy, scaler, logger, episodes=final_pol_test)
        print('done')
        sum_rewww = [t['rewards'].sum() for t in tr]
        sum_rewww += [tot_stuck]
        print('total stucks', sum_rewww[-1])
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('Standard PPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("standard_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
    logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs, policy_hid_list, valfunc_hid_list, gpu_pct): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ # killer = GracefulKiller() if mpi_util.nworkers > 1: batch_size = batch_size // mpi_util.nworkers if batch_size % mpi_util.nworkers == 0 else batch_size // mpi_util.nworkers + 1 # spread the desired batch_size across processes env, obs_dim, act_dim = init_gym(env_name) mpi_util.set_global_seeds(111 + mpi_util.rank) env.seed(111 + mpi_util.rank) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) if mpi_util.rank == 0: env = wrappers.Monitor(env, aigym_path, force=True, write_upon_reset=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, valfunc_hid_list) policy = Policy(obs_dim, act_dim, kl_targ, policy_hid_list) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: mpi_util.timeit( '--------------------------' ) # let's time everything so we can see where the work is being done trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) mpi_util.timeit('run_policy') # episode += len(trajectories) episode += mpi_util.all_sum(len(trajectories)) mpi_util.timeit('mpi_util.all_sum') add_value(trajectories, val_func) # add estimated values to episodes mpi_util.timeit('add_value') add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs mpi_util.timeit('add_disc_sum_rew') add_gae(trajectories, gamma, lam) # calculate advantage mpi_util.timeit('add_gae') # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) mpi_util.timeit('build_train_set') # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) mpi_util.timeit('log_batch_stats') if mpi_util.rank == 0: policy.update(observes, actions, advantages, logger) # update policy mpi_util.timeit('policy.update') val_func.fit(observes, disc_sum_rew, logger) # update value function mpi_util.timeit('val_func.fit') mpi_util.rank0_bcast_wts( val_func.sess, val_func.g, 'val' ) # doubt if value network is used during rollouts but it only takes a few milliseconds anyhow mpi_util.timeit('mpi_util.rank0_bcast_wts(val_func') mpi_util.rank0_bcast_wts(policy.sess, policy.g, 'policy') mpi_util.timeit('mpi_util.rank0_bcast_wts(policy') if mpi_util.rank == 0: logger.write( display=True) # write logger results to file and stdout # if killer.kill_now: # if input('Terminate training (y/[n])? ') == 'y': # break # killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
class Central_agent: def __init__(self): with tf.name_scope("central_agent"): self.val_func = NNValueFunction(obs_dim, hid1_mult) self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) self.num_tuple = 0 def update_parameter_server(self, episode, trajectories, name): self.num_tuple += len(trajectories) if len(trajectories) < batch_size: return # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = self.build_train_set( trajectories) # add various stats to training log: self.log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) self.policy.update(observes, actions, advantages, logger) # update policy self.val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout print([ 'thread_name: ' + name + ', episode: ' + str(episode) + ', tuples: ' + str(self.num_tuple) ]) if ((episode % (batch_size * 3) == 0)): # & (name == "local_thread3")): #print(['stop']) self.policy.save(episode, filename1) self.val_func.save(episode, filename2) def build_train_set(self, trajectories): observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) disc_sum_rew = np.concatenate( [t['disc_sum_rew'] for t in trajectories]) advantages = np.concatenate([t['advantages'] for t in trajectories]) advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) return observes, actions, advantages, disc_sum_rew def log_batch_stats(self, observes, actions, advantages, disc_sum_rew, logger, episode): logger.log({ '_mean_obs': np.mean(observes), '_min_obs': np.min(observes), '_max_obs': np.max(observes), '_std_obs': np.mean(np.var(observes, axis=0)), '_mean_act': np.mean(actions), '_min_act': np.min(actions), '_max_act': np.max(actions), '_std_act': np.mean(np.var(actions, axis=0)), '_mean_adv': np.mean(advantages), '_min_adv': np.min(advantages), '_max_adv': np.max(advantages), '_std_adv': np.var(advantages), '_mean_discrew': np.mean(disc_sum_rew), '_min_discrew': np.min(disc_sum_rew), '_max_discrew': np.max(disc_sum_rew), '_std_discrew': np.var(disc_sum_rew), '_Episode': episode })
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_iterations: maximum number of iterations to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance coef: coefficient of Stein control variate use_lr_adjust: whether adjust lr based on kl ada_kl_penalty: whether adjust kl penalty max_timesteps: maximum time steps per trajectory reg_scale: regularization coefficient policy_size: policy network size phi_obj: FitQ or MinVar """ env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, c_ph=coef, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, batch_size=1000, max_timesteps=max_timesteps) for _ in range(num_iterations): logger.log("\n#Training Iter %d" % (_)) logger.log("Draw Samples..") trajectories = run_policy(env, policy, scaler, batch_size=batch_size, max_timesteps=max_timesteps) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew) logger.log("Starting Training...") policy.update(observes, actions, advantages, \ use_lr_adjust, ada_kl_penalty) # update policy val_func.fit(observes, disc_sum_rew) # update value function logger.log('--------------------------------\n') policy.close_sess() val_func.close_sess()
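# A minimal sketch (an assumption) of the set_global_seeds() helper called
# above, following the common pattern of seeding Python, NumPy and TensorFlow
# together so a run is repeatable for a given seed.
import random
import numpy as np
import tensorflow as tf

def set_global_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)  # TF1-style graph-level seed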
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, print_results, risk_targ): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now_utc = datetime.utcnow() # create unique directories now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str( now_utc.year) + '_' + str( ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str( now_utc.second) # adjust for Montreal Time Zone logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, risk_targ, 'CVaR', batch_size, 1) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 kl_terms = np.array([]) beta_terms = np.array([]) if print_results: rew_graph = np.array([]) mean_rew_graph = np.array([]) #big_li_rew_nodisc0 = np.array([]) while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes #predicted_values_0 = [t['values'][0] for t in trajectories] add_disc_sum_rew( trajectories, gamma, scaler.mean_rew, np.sqrt(scaler.var_rew)) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam, scaler.mean_rew, np.sqrt(scaler.var_rew)) # calculate advantage nodisc0 = -0.0001 * np.array( [t['rewards'].sum() for t in trajectories]) # scaled for gradients print(nodisc0) disc0 = [t['disc_sum_rew'][0] for t in trajectories] print('scaled sum rewards', nodisc0) observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) lamb = policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout kl_terms = np.append(kl_terms, policy.check_kl) x1 = list(range(1, (len(kl_terms) + 1))) rewards = plt.plot(x1, kl_terms) plt.title('RAPPO') plt.xlabel("Episode") plt.ylabel("KL Divergence") plt.savefig("KL_curve.png") plt.close("KL_curve.png") beta_terms = np.append(beta_terms, policy.beta) x2 = list(range(1, (len(beta_terms) + 1))) mean_rewards = plt.plot(x2, beta_terms) plt.title('RAPPO') plt.xlabel("Batch") plt.ylabel("Beta Lagrange Multiplier") plt.savefig("lagrange_beta_curve.png") plt.close("lagrange_beta_curve.png") if killer.kill_now: if input('Terminate training (y/[n])? 
') == 'y': break killer.kill_now = False if print_results: rew_graph = np.append(rew_graph, disc0) x1 = list(range(1, (len(rew_graph) + 1))) rewards = plt.plot(x1, rew_graph) plt.title('RAPPO') plt.xlabel("Episode") plt.ylabel("Discounted sum of rewards") plt.savefig("learning_curve.png") plt.close() mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0)) x2 = list(range(1, (len(mean_rew_graph) + 1))) mean_rewards = plt.plot(x2, mean_rew_graph) plt.title('RAPPO') plt.xlabel("Batch") plt.ylabel("Mean of Last Batch") plt.savefig("learning_curve2.png") plt.close() if print_results: tr = run_policy(env, policy, scaler, logger, episodes=1000) sum_rewww = [t['rewards'].sum() for t in tr] hist_dat = np.array(sum_rewww) fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2) plt.title('RAPPO') plt.xlabel("Sum of Rewards") plt.ylabel("Frequency") plt.savefig("RA_ppo.png") plt.close() with open('sum_rew_final_policy.pkl', 'wb') as f: pickle.dump(sum_rewww, f) logger.final_log() logger.close() policy.close_sess() val_func.close_sess()
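# A small follow-up sketch (hypothetical, not in the source) showing how the
# pickled episode returns saved above could be reloaded to inspect the left
# tail that the CVaR objective targets; alpha is an assumed risk level.
import pickle
import numpy as np

with open('sum_rew_final_policy.pkl', 'rb') as f:
    sum_rew = np.array(pickle.load(f))

alpha = 0.05
var = np.quantile(sum_rew, alpha)        # value-at-risk at level alpha
cvar = sum_rew[sum_rew <= var].mean()    # mean of the worst alpha fraction
print('VaR(%.2f) = %.2f, CVaR(%.2f) = %.2f' % (alpha, var, alpha, cvar))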
def add_value(trajectories, val_func): ''' Adds estimated value to all time steps of all trajectories Args: trajectories: as returned by run_policy() val_func: object with predict() method, takes observations and returns predicted state value Returns: None (mutates trajectories dictionary to add 'values') ''' for trajectory in trajectories: observes = trajectory['observes'] values = val_func.predict(observes) trajectory['values'] = values def add_gae(trajectories, gamma, lam): ''' Add generalized advantage estimator. https://arxiv.org/pdf/1506.02438.pdf Args: trajectories: as returned by run_policy must include 'values' key from add_values(). gamma: reward discount lam: lambda (see paper). lam=0 : use TD residuals lam=1 : A = Sum Discounted Rewards - V_hat(s) Returns: None (mutates trajectories dictionary to add 'advantages') ''' for trajectory in trajectories: if gamma < 0.999: # don't scale for gamma ~= 1 rewards = trajectory['rewards'] * (1 - gamma) else: rewards = trajectory['rewards'] values = trajectory['values'] # temporal differences tds = rewards - values + np.append(values[1:] * gamma, 0) advantages = discount(tds, gamma * lam) trajectory['advantages'] = advantages def build_train_set(trajectories): ''' Args: trajectories after processing by add_disc_sum_rew(), add_value() and add_gae() Returns: 4-tuple of NumPy arrays observes: shape = (N, obs_dim) actions: shape = (N, act_dim) advantages: shape = (N,) disc_sum_rew: shape = (N,) ''' observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories]) advantages = np.concatenate([t['advantages'] for t in trajectories]) # normalize advantages advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) return observes, actions, advantages, disc_sum_rew def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode): """ Log various batch statistics """ logger.log({'_mean_obs': np.mean(observes), '_min_obs': np.min(observes), '_max_obs': np.max(observes), '_std_obs': np.mean(np.var(observes, axis=0)), '_mean_act': np.mean(actions), '_min_act': np.min(actions), '_max_act': np.max(actions), '_std_act': np.mean(np.var(actions, axis=0)), '_mean_adv': np.mean(advantages), '_min_adv': np.min(advantages), '_max_adv': np.max(advantages), '_std_adv': np.var(advantages), '_mean_discrew': np.mean(disc_sum_rew), '_min_discrew': np.min(disc_sum_rew), '_max_discrew': np.max(disc_sum_rew), '_std_discrew': np.var(disc_sum_rew), '_Episode': episode }) def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size): ''' Main training loop Args: env_name: Robot model name num_episodes: maximum umber of episodes to run (int) gamma: reward discount factor (float) lam: lambda for Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)] bath_size: number of episodes per policy training batch ''' env, obs_dim, act_dim = init_env(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_") # create unique directories logger = Logger(logname=env_name, now=now) pathFolder = logger.pathFolder scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) #TODO agregar la parte de sampling una vez que todo ande # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) 
episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculate discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout logger.close() policy.close_sess(pathFolder) val_func.close_sess(pathFolder)
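# A minimal sketch (assuming SciPy is available) of the discount() and
# add_disc_sum_rew() helpers referenced above but not shown; discount() computes
# a reverse discounted cumulative sum, the usual building block for both the
# discounted returns and the GAE advantages.
import numpy as np
import scipy.signal

def discount(x, gamma):
    """y[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ..."""
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def add_disc_sum_rew(trajectories, gamma):
    """Add discounted sum of rewards to all time steps of all trajectories."""
    for trajectory in trajectories:
        if gamma < 0.999:  # same reward scaling as add_gae() above
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        trajectory['disc_sum_rew'] = discount(rewards, gamma)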
def main(): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ env_name = 'HumanoidasimoMRD4_2-v1' #env_name='Humanoid-v1' num_episodes = 5000000 gamma = 0.995 lam = 0.98 kl_targ = 0.003 batch_size = 32 hid1_mult = 10 policy_logvar = -1.0 killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join( '/home/initial/eclipse-workspace4/test/trpo-master/src/result', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult, filename2) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, filename=filename1) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if ((episode % (batch_size * 3) == 0)): # & (name == "local_thread3")): #print(['stop']) policy.save(episode, filename1) val_func.save(episode, filename2) #loger.flush() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
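# A minimal sketch (an assumption) of the GracefulKiller used throughout these
# scripts: it traps SIGINT/SIGTERM and only raises a flag, so the training loop
# can finish the current batch and ask before terminating.
import signal

class GracefulKiller:
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True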
def main(env_name, max_time_steps, time_steps_batch, time_steps_mini_batch, gamma, lamda, kl_targ, clipping_range, pol_loss_type, init_pol_logvar, animate,\ save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\ time_step_to_load, now_to_load): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' max_time_steps: maximum number of time steps to run gamma: reward discount factor (float) lamda: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) clipping_range: max value to clip the policy gradient ratio pol_loss_type: string determining which type of loss to use for the Policy Network time_steps_batch: number of time steps per policy training batch init_pol_logvar: natural log of initial policy variance save_video: Boolean determining if videos of the agent will be saved save_rate: Int determining how often to save videos for num_episodes_sim: Number of episodes to simulate/save videos for task_params: list of parameters to modify each environment for a different task task_name: name user assigns to the task being used to modify the environment """ # **************** Environment Initialization and Paths *************** task_params_str = ''.join(str(e) + ', ' for e in task_params) num_tasks = len(task_params) envs = [None] * num_tasks scalers = [None] * num_tasks loggers = [None] * num_tasks print("\n\n------ PATHS: ------") start_time = datetime.now() if time_step_to_load == None: now = start_time.strftime( "%b-%d_%H:%M:%S" ) # If NOT loading from Checkpoint -> used to create unique directories else: assert now_to_load != None,\ "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load) now = now_to_load logs_path = os.path.join('log-files', env_name, task_name, task_params_str, now) for task in range(num_tasks): # Create task specific environment envs[task], obs_dim, act_dim = init_gym(env_name, task_param=task_params[task]) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) # Create task specific Paths and logger object loggers[task] = Logger(logname= [env_name, task_name, task_params_str], now=now, \ logname_file= "_{}_{}".format(task_name, task_params[task])) if time_step_to_load == None: # If NOT loading from Checkpoint scalers[task] = Scaler(obs_dim) # Auxiliary saver (becase logger sometimes fails or takes to much time) with open( logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'w') as f: f.write("_TimeStep" + " " + "_MeanReward") aigym_path = os.path.join('./videos', env_name, task_name, task_params_str, now) # videos folders agent_path = os.path.join('agents', env_name, task_name, task_params_str, now) # agent / policy folders if time_step_to_load == None: # If NOT loading from Checkpoint os.makedirs(agent_path) with open(agent_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:])) # save commandline command with open(logs_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:])) # save commandline command print("\nPath for Saved Videos : {}".format(aigym_path)) print("Path for Saved Agents: {}\n".format(agent_path)) # **************** Initialize Policy, Value Networks and Scaler *************** print("\n\n------ NEURAL NETWORKS: ------") dims_core_hid.insert( 0, obs_dim ) # Modify dims list to have the size of the layer 'n-1' at position '0' dims_head_hid.insert(0, dims_head_hid[-1]) val_func 
= NNValueFunction(obs_dim, dims_core_hid, dims_head_hid, num_tasks, time_steps_mini_batch) policy = Policy(obs_dim, act_dim, dims_core_hid, dims_head_hid, num_tasks, time_steps_mini_batch, pol_loss_type=pol_loss_type) # Load from Checkpoint: # Validate intented time step to load OR get last time step number if no target time step was provided if time_step_to_load != None: load_agent_path = agent_path # agent / policy folders saved_ep_list = [ file.split(".")[0].split("_")[-1] for file in os.listdir(load_agent_path) if "policy" in file ] if time_step_to_load == -1: # Get last saved time step time_step_to_load = sorted( [int(ep_string) for ep_string in saved_ep_list])[-1] else: # Validate if time_step_to_load was indeed saved assert str(time_step_to_load) in saved_ep_list,\ "\n\nWARNING: Time Step you want to load ({}) was not stored during trainning".format(time_step_to_load) # Load Policy Network's Ops and Variables & Load Scaler Object policy.tf_saver.restore( policy.sess, "{}/policy_ep_{}".format(load_agent_path, time_step_to_load)) val_func.tf_saver.restore( val_func.sess, "{}/val_func_ep_{}".format(load_agent_path, time_step_to_load)) scalers = pickle.load( open( "{}/scalers_ep_{}.p".format(load_agent_path, time_step_to_load), 'rb')) print("\n\n ---- CHECKPOINT LOAD: Time Step Loaded **{}**".format( time_step_to_load)) # Delete extra epochs that where logged to the auxiliary logs for task in range(num_tasks): aux_log_path = logs_path + '/aux_{}_{}.txt'.format( task_name, task_params[task]) aux_log = pd.read_table(aux_log_path, delim_whitespace=True) idx_to_cut = aux_log.index[aux_log["_TimeStep"] == time_step_to_load].tolist()[0] aux_log[0:idx_to_cut + 1].to_csv(aux_log_path, header=True, index=False, sep=' ', mode='w') # overwrite trimmed aux_log # If NOT loading from Checkpoint: run some time steps to initialize scalers and create Tensor board dirs elif time_step_to_load == None: for task in range(num_tasks): run_policy(envs[task], policy, scalers[task], loggers[task], time_steps_batch=int(time_steps_batch / 3), task=task) # Tensor Board writer os.makedirs(agent_path + '/tensor_board/policy') os.makedirs(agent_path + '/tensor_board/valFunc') tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy', graph=policy.g) tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc', graph=val_func.g) # **************** Start Training *************** print("\n\n------ TRAINNING: ------") animate = True if animate == "True" else False save_video = True if save_video == "True" else False saver_offset = save_rate killer = GracefulKiller() if time_step_to_load == None: time_step = 0 else: time_step = time_step_to_load # Time steps are counted across all tasks i.e. 
N time steps indicates each tasks has been runned for N times while time_step < max_time_steps and not killer.kill_now: # **************** Obtain data (train set) *************** observes_all = [None] * num_tasks actions_all = [None] * num_tasks advantages_all = [None] * num_tasks disc_sum_rew_all = [None] * num_tasks time_step += time_steps_batch for task in range(num_tasks): # Obtain 'time_steps_batch' trajectories and add additional intermediate calculations trajectories = run_policy(envs[task], policy, scalers[task], loggers[task], time_steps_batch=time_steps_batch, task=task, animate=animate) add_value(trajectories, val_func, task) # add estimated values to trajectories add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lamda) # calculate advantage # Concatenate all time steps into single NumPy arrays observes_all[task], actions_all[task], advantages_all[ task], disc_sum_rew_all[task] = build_train_set(trajectories) # print("Observes Shape: {}".format(observes_all[task].shape)) # print("Actions Shape: {}\n\n".format(actions_all[task].shape)) # print("Advantage Shape: {}\n\n".format(advantages_all[task].shape)) # Logging Stats log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \ loggers[task], time_step) # **************** Update Policy and Value Networks *************** # print ("*************************************") for task in range(num_tasks): pol_summary = policy.update(task, observes_all[task], actions_all[task], advantages_all[task], loggers[task]) # update policy val_summary = val_func.fit(task, observes_all[task], disc_sum_rew_all[task], loggers[task]) # update value function # Auxiliary saver (because logger sometimes fails or takes to much time) with open( logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'a') as f: f.write("\n" + str(loggers[task].log_entry['_TimeStep']) + " " + str(loggers[task].log_entry['_MeanReward'])) loggers[task].write( display=False) # write logger results to file and stdout tb_pol_writer.add_summary(pol_summary, global_step=time_step) tb_val_writer.add_summary(val_summary, global_step=time_step) # **************** Storing NN and Videos *************** # Store Policy, Value Network and Scaler: every 'save_rate' or in first/last time steps if time_step >= saver_offset or time_step >= max_time_steps or time_step <= time_steps_batch * 1.5 or killer.kill_now: # TODO: Make saving agent/video a method so that it can be called in killer.kill_now saver_offset += save_rate policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format( agent_path, time_step)) # Save Policy Network val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format( agent_path, time_step)) # Save Value Network pickle.dump( scalers, open("{}/scalers_ep_{}.p".format(agent_path, time_step), 'wb')) print("---- Saved Agent at Time Step {} ----".format(time_step)) # Save video of current agent/policy if save_video: print( "---- Saving Video at Time Step {} ----".format(time_step)) for task in range(num_tasks): _ = sim_agent(envs[task], policy, task, scalers[task], num_episodes_sim, save_video=True, out_dir=aigym_path + "/vid_ts_{}/{}_{}".format( time_step, task_name, task_params[task])) envs[task].close() # closes window open by monitor wrapper envs[task], _, _ = init_gym( env_name, task_param=task_params[task] ) # Recreate env as it was killed print("\n\n") # If Ctrl + C is Pressed, ask user if Trainning shall be terminated if killer.kill_now: if input('Terminate 
training (y/[n])? ') == 'y': break killer.kill_now = False # **************** Terminate Variables ************** for task in range(num_tasks): envs[task].close() loggers[task].close() policy.close_sess() val_func.close_sess() # Save elapsed time end_time = datetime.now() elapsed_time = end_time - start_time delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60) delta_str = "Elapsed Time: {} min {} seconds".format( delta_time[0], delta_time[1]) # save elapsed time, 'a' to append not overwrite with open(agent_path + '/commandline_args.txt', 'a') as f: f.write('\n\n' + delta_str) with open(logs_path + '/commandline_args.txt', 'a') as f: f.write('\n\n' + delta_str)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, save): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories env_id = env_name + id_generator() logger = Logger(logname=env_id, now=now) aigym_path = os.path.join('/tmp', env_id) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=lambda episode_id: False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 if env_name == 'Swimmer-v1': score_window = 100 solution_score = 360 elif env_name == 'HalfCheetah-v1': score_window = 100 solution_score = 4800 else: assert False # assert score_window % batch_size == 0 rewards = collections.deque(maxlen=int(np.rint(score_window / batch_size))) while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function mean_reward = logger.log_entry['_MeanReward'] logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False rewards.append(mean_reward) ''' if np.mean(rewards) >= solution_score: episode = episode - score_window break ''' logger.close() policy.close_sess() val_func.close_sess() # return episode return -np.mean(rewards)
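# A small worked sketch (hypothetical numbers) of the commented-out
# early-stopping check above: the deque keeps the last score_window/batch_size
# batch means, so its average approximates the mean reward over the last
# score_window episodes and can be compared against solution_score.
import collections
import numpy as np

score_window, batch_size, solution_score = 100, 20, 4800
rewards = collections.deque(maxlen=int(np.rint(score_window / batch_size)))

for batch_mean in [4700., 4800., 4850., 4900., 4950.]:
    rewards.append(batch_mean)
    if len(rewards) == rewards.maxlen and np.mean(rewards) >= solution_score:
        print('solved')  # mean of last 5 batches (~100 episodes) reached 4840
        break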
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, **kwargs): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ memory = deque([]) memory_size = kwargs['memory_size'] killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) target_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # kl_targ = 0? explore_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, target_policy, scaler, logger, episodes=5, fix_drct_dist=0) run_policy(env, explore_policy, scaler, logger, episodes=5, fix_drct_dist=0) episode = 0 fix_drct_dist_range = (0.3, 0) while episode < num_episodes: # save model if episode % 200 == 0: save_path = target_policy.saver.save( target_policy.sess, "/home/csc63182/testspace/models/halfcheetah-trpo/model-%d.ckpt" % (episode)) # run a few episodes fix_drct_dist = ( (episode * fix_drct_dist_range[1]) + (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes target_trajectories = run_policy(env, target_policy, scaler, logger, episodes=batch_size, fix_drct_dist=0) explore_trajectories = run_policy(env, explore_policy, scaler, logger, episodes=batch_size, fix_drct_dist=fix_drct_dist) # Add to memory n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1) trajectories = target_trajectories + explore_trajectories[:n_explore] episode += batch_size memory += trajectories while len(memory) > memory_size: memory.popleft() # train explore network add_value(explore_trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(explore_trajectories, gamma) # calculated discounted sum of Rs add_gae(explore_trajectories, gamma, lam) # calculate advantage observes, actions, advantages, disc_sum_rew = build_train_set( explore_trajectories) explore_policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function # train target network # re-sample trajectories trajectories = sample(memory, batch_size) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) target_policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if 
killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False # NOTE: rewards_record is not filled in the loop above; it is assumed to hold the per-batch mean rewards collected during training with open('rewards_%s.txt' % kwargs['log_postfix'], 'w') as f: for reward in rewards_record: f.write('%f\n' % reward) plt.plot((np.arange(len(rewards_record)) + 1) * batch_size, rewards_record) plt.savefig('learning_curve_%s.png' % kwargs['log_postfix']) logger.close() explore_policy.close_sess() target_policy.close_sess() val_func.close_sess()
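# A short worked sketch (hypothetical numbers, not from the source) of the
# exploration schedule used above: fix_drct_dist anneals linearly from 0.3 to 0
# and the number of explore-policy trajectories mixed into each batch shrinks
# as training progresses.
num_episodes, batch_size = 1000, 20
fix_drct_dist_range = (0.3, 0)

for episode in (0, 500, 1000):
    fix_drct_dist = ((episode * fix_drct_dist_range[1]) +
                     (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes
    n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1)
    print(episode, fix_drct_dist, n_explore)
# 0    -> 0.30, 19 explore trajectories
# 500  -> 0.15,  9
# 1000 -> 0.00,  0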
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ print('Start time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.now().strftime("%b-%d_%H:%M:%S") # create unique directories (use local time instead of Greenwich/UTC: utcnow changed to now) testname = now+'-'+TestNote logger = Logger(logname=env_name, now=testname) monitor_path = os.path.join('log-files', env_name, testname, 'monitor') env = wrappers.Monitor(env, monitor_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 print('Start time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function # save models if not episode % (num_episodes / 10): policy_save_path = os.path.join('log-files', env_name, testname, 'checkpoint') policy.save_model(env_name + "-" + str(episode), policy_save_path) logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess() print('End time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
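# A quick check (hypothetical numbers) of the "save models" condition above:
# since episode grows in steps of batch_size, the modulo test only fires on
# episodes that happen to be multiples of num_episodes / 10.
num_episodes, batch_size = 1000, 32
saves = [e for e in range(batch_size, num_episodes + 1, batch_size)
         if not e % (num_episodes / 10)]
print(saves)  # [800]; with batch_size = 20 it would save every 100 episodes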
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs, policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate, submit): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ # killer = GracefulKiller() env, obs_dim, act_dim = init_osim(animate) env.seed(111 + mpi_util.rank) mpi_util.set_global_seeds(111 + mpi_util.rank) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories if mpi_util.rank == 0: #aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) logger = Logger(logname=env_name, now=now) episode = 0 checkpoint = Checkpoint("saves", now) # restore from checkpoint? if restore_path: (policy, val_func, scaler, episode, obs_dim, act_dim, kl_targ) = checkpoint.restore(restore_path) else: policy = Policy(obs_dim, act_dim, kl_targ) val_func = NNValueFunction(obs_dim) scaler = Scaler(obs_dim) if mpi_util.rank == 0: # run a few episodes (on node 0) of untrained policy to initialize scaler: trajectories = run_policy(env, policy, scaler, episodes=5) unscaled = np.concatenate( [t['unscaled_obs'] for t in trajectories]) scaler.update( unscaled) # update running statistics for scaling observations # broadcast policy weights, scaler, val_func (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val( policy, scaler, val_func) if mpi_util.rank == 0: checkpoint.save(policy, val_func, scaler, episode) if animate: observes, actions, rewards, unscaled_obs = run_episode(env, policy, scaler, animate=animate) exit(0) if submit: # Settings #remote_base = 'http://grader.crowdai.org:1729' remote_base = 'http://grader.crowdai.org:1730' token = 'a83412a94593cae3a491f3ee28ff44e1' client = Client(remote_base) # Create environment observation = client.env_create(token) step = 0.0 observes, actions, rewards, unscaled_obs = [], [], [], [] scale, offset = scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature # Run a single step # # The grader runs 3 simulations of at most 1000 steps each. 
We stop after the last one while True: obs = np.array(observation).astype(np.float32).reshape((1, -1)) print("OBSERVATION TYPE:", type(obs), obs.shape) print(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature unscaled_obs.append(obs) obs = (obs - offset) * scale # center and scale observations observes.append(obs) action = policy.sample(obs).astype(np.float32).reshape((-1, 1)) print("ACTION TYPE:", type(action), action.shape) print(action) actions.append(action) [observation, reward, done, info] = client.env_step(action.tolist()) print("step:", step, "reward:", reward) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) step += 1e-3 # increment time step feature if done: print( "================================== RESTARTING =================================" ) observation = client.env_reset() step = 0.0 observes, actions, rewards, unscaled_obs = [], [], [], [] scale, offset = scaler.get() scale[-1] = 1.0 # don't scale time step feature offset[-1] = 0.0 # don't offset time step feature if not observation: break client.submit() exit(0) ###### worker_batch_size = int(batch_size / mpi_util.nworkers) # HACK if (worker_batch_size * mpi_util.nworkers != batch_size): print("batch_size:", batch_size, " is not divisible by nworkers:", mpi_util.nworkers) exit(1) batch = 0 while episode < num_episodes: if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0: checkpoint.save(policy, val_func, scaler, episode) batch = batch + 1 trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size) trajectories = mpi_util.gather_trajectories(trajectories) if mpi_util.rank == 0: # concatentate trajectories into one list trajectories = list(itertools.chain.from_iterable(trajectories)) print("did a batch of ", len(trajectories), " trajectories") print([t['rewards'].sum() for t in trajectories]) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: logger.log({ '_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]), 'Steps': np.sum([t['observes'].shape[0] for t in trajectories]) }) log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function unscaled = np.concatenate( [t['unscaled_obs'] for t in trajectories]) scaler.update( unscaled) # update running statistics for scaling observations logger.write( display=True) # write logger results to file and stdout # if mpi_util.rank == 0 and killer.kill_now: # if input('Terminate training (y/[n])? ') == 'y': # break # killer.kill_now = False # broadcast policy weights, scaler, val_func (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val( policy, scaler, val_func) if mpi_util.rank == 0: logger.close() policy.close_sess() if mpi_util.rank == 0: val_func.close_sess()
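# A rough sketch (an assumption, not the project's mpi_util) of gathering each
# worker's trajectory list on rank 0 with mpi4py, matching how the gathered
# list of lists is flattened with itertools.chain above.
from mpi4py import MPI

comm = MPI.COMM_WORLD

def gather_trajectories(trajectories):
    # one list entry per worker on rank 0, None on all other ranks
    return comm.gather(trajectories, root=0)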