def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    # Main training loop: set up env/policy/value-function, then collect policy
    # batches until num_episodes is reached or the user confirms termination.
    # NOTE(review): gamma and lam are accepted but never used in this variant —
    # the value-fit/GAE steps are absent here; confirm against callers.
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()  # reload running observation statistics from a previous run
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)  # start screen capture once, on the first batch
            capture = True
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        if killer.kill_now:
            # a termination signal was caught: confirm before stopping
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    # Main training loop: build env/policy/value-function and run policy
    # batches until num_episodes is reached or the user confirms termination.
    # NOTE(review): gamma and lam are unused here — no GAE/value-update steps
    # appear in this variant; confirm this is intentional.
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()  # reload running observation statistics from a previous run
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)  # start screen capture once, on the first batch
            capture = True
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        if killer.kill_now:
            # a termination signal was caught: confirm before stopping
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, clipping_range):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        clipping_range: passed through to Policy (PPO-style clipping parameter)
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    env.reset()
    env.render()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, clipping_range)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        # NOTE(review): the entire training step below is commented out and
        # `episode` is never incremented, so this loop only spins on the kill
        # check — looks like leftover debugging state; confirm before use.
        # trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        # episode += len(trajectories)
        # add_value(trajectories, val_func)  # add estimated values to episodes
        # add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        # add_gae(trajectories, gamma, lam)  # calculate advantage
        # # concatenate all episodes into single NumPy arrays
        # observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # # add various stats to training log:
        # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        # policy.update(observes, actions, advantages, logger)  # update policy
        # val_func.fit(observes, disc_sum_rew, logger)  # update value function
        # logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if episode % 100 == 0:
            policy.save_sess()  # periodic checkpoint of the policy session
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Evaluation loop: load a trained policy checkpoint, run one rollout,
    and log per-step (observation, action, reward, x, y) rows.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
            (here reused as the rollout's max_path_length)
        TestNote: free-form tag appended to the log-directory name
    """
    print('Testing Period:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    env.set_goals(0)
    # create unique directories; (translated) utcnow was Greenwich time!!! — changed to local now
    now = datetime.now().strftime("%b-%d_%H:%M:%S")
    testname = now+'-'+TestNote
    logger = Logger(logname=env_name, now=testname)
    aigym_path = os.path.join('log-Test-files', env_name, testname)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # load a pre-trained checkpoint (hard-coded absolute path)
    policy.load_model('/home/drl/PycharmProjects/warker_test/log-files/My3LineDirect-v1/Jan-10_07:51:34-A003-SpecGoal-itr15000-g0ExpNo5/checkpoint/My3LineDirect-v1-15000.ckpt')
    episode = 0
    observes, actions, rewards, unscaled_obs, states_x, states_y = rollout(env, policy, scaler, max_path_length=batch_size, animate=True)
    # append reward/x/y as extra columns next to observations and actions
    tmp = np.vstack((rewards, states_x, states_y))
    tmp1 = np.transpose(tmp)
    data = np.concatenate((observes, actions, tmp1), axis=1)
    trajectory = {}
    # NOTE(review): loop nesting reconstructed from a collapsed source line;
    # log/write are assumed to run once per row j — confirm against the logger API.
    for j in range(data.shape[0]):
        for i in range(data.shape[1]):
            trajectory[i] = data[j][i]  # dict keyed by column index
        logger.log(trajectory)
        logger.write(display=False)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    #TODO Change init_gym for one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # create unique directories; ':' replaced so the name is filesystem-safe
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_")
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    # Change wrappers.Monitor for a class of mine that controls the simulation
    # (translated) I think the wrapper is useless for my example
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # (translated) This seeds the policy with the known optimum
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)  # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function
    # (translated) Not sure whether this is still necessary
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        render: whether to render the environment (forwarded to init_gym)
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    #capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        """if episode > 600 and not capture: env.ScreenCapture(5) capture = True"""
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()  # persist running observation statistics every batch
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def restore(self, restore_path): name = restore_path.split("/")[ -1] # remove preceding/path/components/to/name # unpickle and restore scaler - NOTE: this file includes some other variables too mypath = "saves/scaler/" + name print("restoring scaler checkpoint from:", mypath) with open(mypath + ".scaler", 'rb') as f: (scaler, episode, obs_dim, act_dim, kl_targ, self.init_time) = pickle.load(f) # policy mypath = "saves/policy/" + name print("restoring policy checkpoint from:", mypath) policy = Policy(obs_dim, act_dim, kl_targ, restore_path=mypath) print("restored policy:") Checkpoint.dump_vars(policy.g) # val_func mypath = "saves/val_func/" + name print("restoring val_func checkpoint from:", mypath) val_func = NNValueFunction(obs_dim, restore_path=mypath) print("restored val_func:") Checkpoint.dump_vars(val_func.g) print("finished restore.") return (policy, val_func, scaler, episode, obs_dim, act_dim, kl_targ)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ env_name = 'hsr' killer = GracefulKiller() #env, obs_dim, act_dim = init_gym(env_name) env = pr2_agent("r_arm") obs_dim = 10 act_dim = 7 obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 #while episode < num_episodes: '''
def restore_old(self, policy, val_func, scaler, restore_path):
    """Legacy restore: rebuild Policy/NNValueFunction graphs in restore mode,
    load tf.saved_model checkpoints into their sessions, and unpickle the
    scaler (plus episode counter).

    The incoming policy/val_func arguments are only read for their dimensions
    and kl_targ; fresh objects are constructed and returned.

    Returns:
        (policy, val_func, scaler, episode)

    NOTE(review): the numbered prints and Checkpoint.dump_vars calls look like
    diagnostic leftovers from debugging the graph-restore sequence.
    """
    #mypath = self.checkpoints_dir+"/"+restore_path
    mypath = restore_path
    print("restoring checkpoint from:", mypath)
    from policy import Policy
    from value_function import NNValueFunction
    # rebuild the policy graph in restore mode, then load the saved model into it
    policy = Policy(policy.obs_dim, policy.act_dim, policy.kl_targ, restore_flag=True)
    with policy.g.as_default():
        print("0000000A")
        Checkpoint.dump_vars(policy.g)
        tf.saved_model.loader.load(policy.sess, [tf.saved_model.tag_constants.TRAINING], mypath + ".policy")
        print("1111111A")
        Checkpoint.dump_vars(policy.g)
        policy._placeholders()  # re-create placeholder handles on the restored graph
        print("YYYY:", policy.obs_ph)
    # same sequence for the value function graph
    val_func = NNValueFunction(val_func.obs_dim, restore_flag=True)
    with val_func.g.as_default():
        print("2222222A")
        Checkpoint.dump_vars(val_func.g)
        tf.saved_model.loader.load(val_func.sess, [tf.saved_model.tag_constants.TRAINING], mypath + ".val_func")
        print("3333333A")
        Checkpoint.dump_vars(val_func.g)
        val_func._placeholders()
        print("YYYY:", val_func.obs_ph)
    # unpickle and restore scaler
    with open(mypath + ".scaler", 'rb') as f:
        (scaler, episode) = pickle.load(f)
    print("FINISHED RESTORE")
    return (policy, val_func, scaler, episode)
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj, load_model):
    """Train policy and value function for a fixed 200 outer iterations, then
    save both models.

    NOTE(review): `coef` is accepted but unused in this body, and `logger` at
    the end refers to a module-level logger (no local is created) — confirm
    both upstream. Loop/save nesting reconstructed from a collapsed source
    line; the model save is assumed to run once, after the loop.
    """
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)  # seed env as well as global RNGs for reproducibility
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)
    # initial rollout to initialize the observation scaler
    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)
    episode = 0
    for _ in range(200):
        trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)
        num_traj = len(trajectories)
        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        policy.update(load_model, observes, actions, advantages,
                      use_lr_adjust, ada_kl_penalty, c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)
    # Save models
    policy.save_policy()
    val_func.save_val_func()
    logger.log("saved model")
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    # time_state is a module-level flag (not visible in this block)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]  # bundled config for run_policy
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)
    episode = 0
    # to create new file at beginning of trial
    #f= open("coor_state.txt","w")
    #f.close
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, arg, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()  # persist running observation statistics every batch
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, no_of_updates):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        no_of_updates: forwarded to NNValueFunction and Policy
    """
    env, obs_dim, act_dim = init_gym(env_name)
    #env._max_episode_steps = 150
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b_%d_%H_%M_%S")  # create unique directories
    #logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('videos', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)  # record videos under videos/
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult, no_of_updates)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, no_of_updates)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    # NOTE(review): no GracefulKiller here — this variant has no interactive
    # interrupt handling; it runs until num_episodes is reached.
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, scenario, num_agents, action_dim, timesteps):
    """ Main training loop for a multi-agent particle-env scenario: one Policy
    per agent, a single shared value function.

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        scenario: particle-environment scenario name
        num_agents: number of agent policies to create
        action_dim: unused here; action dims are read from env.action_space
        timesteps: forwarded to Policy and run_policy
    """
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(env_name)
    env = make_env(scenario)
    obs_dims = env.observation_space
    act_dims = [env.action_space[0].n for i in range(env.n)]
    obs_dims = [obs_dim.shape[0] + 1 for obs_dim in obs_dims]  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=scenario, now=now)
    aigym_path = os.path.join('/tmp', scenario, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dims)
    # shared value function over observation + action dims of agent 0
    val_func = NNValueFunction(obs_dims[0]+act_dims[0], hid1_mult)
    policys = []
    for i in range(num_agents):
        policys.append(Policy(i, obs_dims[i], act_dims[0], kl_targ, hid1_mult, policy_logvar, num_agents-1, timesteps))
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policys, scaler, logger, act_dims[0], timesteps, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policys, scaler, logger, act_dims[0], timesteps, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays (per-agent lists)
        observes, actions, intents, act_trajs, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        # log_batch_stats(observes, actions,intents, act_trajs, advantages, disc_sum_rew, logger, episode)
        for i, policy in enumerate(policys):
            policy.update(observes[i], actions[i], intents[i], act_trajs[i], advantages[i], logger)  # update policy
            val_func.fit(observes[i]+intents[i], disc_sum_rew[i], logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    for policy in policys:
        policy.close_sess()
    val_func.close_sess()
def __init__(self, name, obs_dim, act_dim, n_ways, batch_size, log_path,
             gamma=0.995, lam=0.98, kl_targ=0.003, hid1_mult=10, policy_logvar=1.0):
    """Set up one training agent: logger, observation scaler, value function
    and TRPO policy network.

    Args:
        name: agent/run identifier (used for the log directory and TrpoNet)
        obs_dim: observation dimension
        act_dim: action dimension
        n_ways: number of ways/heads forwarded to TrpoNet
        batch_size: number of episodes per policy training batch
        log_path: root directory under which this agent's logs are written
        gamma: reward discount factor
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy updates
        hid1_mult: hidden-layer-1 size multiplier (of obs dimension) for
            both the value function and the policy network
        policy_logvar: natural log of initial policy variance
    """
    self.name = name
    self.obs_dim, self.act_dim = obs_dim, act_dim
    self.n_ways = n_ways
    self.batch_size = batch_size
    self.gamma = gamma
    self.lam = lam
    self.kl_targ = kl_targ
    self.hid1_mult = hid1_mult
    self.policy_logvar = policy_logvar
    self.logger = Logger(logname=os.path.join(log_path, name),
                         now=datetime.utcnow().strftime("%b_%d_%H_%M_%S"))
    self.scaler = Scaler(self.obs_dim)
    # Fix: forward hid1_mult instead of the hard-coded 10, so a caller-supplied
    # multiplier actually reaches the value function (TrpoNet below already
    # received it). Default is still 10, so existing callers are unaffected.
    self.val_func = NNValueFunction(self.obs_dim, hid1_mult=hid1_mult)
    self.trpo_net = TrpoNet(name, self.obs_dim, self.act_dim, n_ways=n_ways,
                            kl_targ=kl_targ, hid1_mult=hid1_mult,
                            policy_logvar=policy_logvar)
    self.trajectories = []  # buffer of collected episodes awaiting a policy update
    self.episode = 0  # running episode counter
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
            (used only for log naming; the env itself comes from init_osim())
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_osim()  # OpenSim environment, not a gym.make call
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            # portable_input is a project helper (input() replacement)
            if portable_input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(arglist):
    """ Main training loop for a multi-agent scenario; all hyperparameters are
    read from the arglist namespace.

    Args (fields of arglist):
        scenario: particle-environment scenario name
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        b_size: number of episodes per policy training batch
        hid1_mult: hid1 size for value_f (multiplier of obs dimension)
        num_adversaries, max_episode_len, save_rate: trainer/rollout settings
    """
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(aenv_name)
    env = make_env(arglist.scenario, arglist)
    obs_dim = env.observation_space[0].shape[0]
    act_dim = env.action_space[0].n
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('/tmp', arglist.scenario, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, arglist.hid1_mult)
    # one trainer + one logger per agent
    trainers, loggers = get_trainers(env, arglist.num_adversaries, obs_dim, act_dim, arglist)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, trainers, scaler, loggers, arglist.max_episode_len, episodes=5)
    episode = 0
    while episode < arglist.num_episodes:
        trajectories = run_policy(env, trainers, scaler, loggers, arglist.max_episode_len, episodes=arglist.b_size)
        episode += len(trajectories[0])  # trajectories is per-agent; count agent 0's episodes
        print("episode: {}".format(episode))
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, arglist.gamma)
        add_gae(trajectories, arglist.gamma, arglist.lam)
        observations, actions, advantages, disc_sum_rews = build_train_set(trajectories)
        log_batch_stats(observations, actions, advantages, disc_sum_rews, loggers, episode)
        for i in range(len(trainers)):
            trainers[i].update(observations[i], actions[i], advantages[i], loggers[i])
            val_func.fit(observations[i], disc_sum_rews[i], loggers[i])
            loggers[i].write(display=True)
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if episode % arglist.save_rate == 0:
            print("Episode {} complete".format(episode))
            # score = play(env, policy1, policy2)
    for i in range(len(loggers)):
        loggers[i].close()
        trainers[i].close_sess()
    val_func.close_sess()
def main(num_episodes, gamma, lam, kl_targ, batch_size, env_name="Hopper-v2"):
    """ Main training loop

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v2'
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # create unique directories named by US Eastern time (UTC-4)
    now = (datetime.datetime.utcnow() - datetime.timedelta(hours=4)).strftime("%b-%d_%H:%M:%S")
    logger = Logger(logname=env_name, now=now)
    plotter = Plot(plotname=env_name+"-Fig", now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)  # recording, dir??
    scaler = Scaler(obs_dim)  # obs_dim=377
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)  # kl target=0.003 by default
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, plotter, episodes=5, plot=False)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, plotter, episodes=batch_size)
        episode += len(trajectories)  # length of trajectories equals batch size which by default is 20
        plotter.updateEpisodes(episode)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger, plotter)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    plotter.plot()
    # plt.show()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """Run the TRPO training loop on `env_name` until `num_episodes`
    episodes have been collected, recording videos and per-batch logs."""
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # extra dimension carries the time-step feature (see run_episode())
    stamp = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # timestamp makes the run directory unique
    logger = Logger(logname=env_name, now=stamp)
    monitor_dir = os.path.join('/home/vatsal', env_name, stamp)
    env = wrappers.Monitor(env, monitor_dir, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # a handful of warm-up episodes seeds the observation scaler
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        batch = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(batch)
        add_value(batch, val_func)        # per-step value estimates
        add_disc_sum_rew(batch, gamma)    # discounted returns
        add_gae(batch, gamma, lam)        # advantages
        # flatten all episodes into single NumPy arrays
        obs, acts, advs, returns = build_train_set(batch)
        log_batch_stats(obs, acts, advs, returns, logger, episode)
        policy.update(obs, acts, advs, logger)     # policy step
        val_func.fit(obs, returns, logger)         # value-function step
        logger.write(display=True)                 # flush batch stats to file/stdout
        if killer.kill_now:
            # a signal arrived: offer a clean exit, otherwise resume training
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    '''
    Main training loop
    Args:
        env_name: Robot model name
        num_episodes: maximum number of episodes to run (int)
        gamma: reward discount factor (float)
        lam: lambda for Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    '''
    env, obs_dim, act_dim = init_env(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # create unique directories; ':' replaced so the name is filesystem-safe
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_")
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    acumulator = BestAcumulator()  # tracks the best trajectories seen so far
    #TODO (translated) add the sampling part once everything works
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, acumulator)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, batch_size, acumulator)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to train log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
    acumulator.save(pathFolder)  # persist the best-trajectory accumulator with the run
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
def main():
    """Main training loop with hard-coded hyperparameters.

    Trains a PPO policy/value pair on the 'HumanoidasimoMRD4_2-v1' model and
    periodically checkpoints both networks.
    """
    # --- hard-coded experiment configuration ---
    env_name = 'HumanoidasimoMRD4_2-v1'
    num_episodes = 5000000   # maximum number of episodes to run
    gamma = 0.995            # reward discount factor
    lam = 0.98               # GAE lambda
    kl_targ = 0.003          # D_KL target for policy update
    batch_size = 32          # episodes per policy training batch
    hid1_mult = 10           # hid1 size multiplier (of obs dimension)
    policy_logvar = -1.0     # natural log of initial policy variance

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # extra obs dimension for the time-step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # unique directory names
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join(
        '/home/initial/eclipse-workspace4/test/trpo-master/src/result', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    # NOTE(review): filename1/filename2 are module-level globals not visible in
    # this chunk — confirm they are defined before main() runs.
    val_func = NNValueFunction(obs_dim, hid1_mult, filename2)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    filename=filename1)
    # a few episodes of the untrained policy initialize the scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # estimated values
        add_disc_sum_rew(trajectories, gamma)  # discounted sum of rewards
        add_gae(trajectories, gamma, lam)      # advantages
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)          # update value function
        logger.write(display=True)
        # checkpoint both networks every 3 batches
        if episode % (batch_size * 3) == 0:
            policy.save(episode, filename1)
            val_func.save(episode, filename2)
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust,
                ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale,
                phi_lr, phi_hs, policy_size, phi_obj, load_model):
    """Evaluate MC vs. Stein control-variate gradients for a saved policy.

    Loads a pre-trained policy and scaler, rolls out `num_episodes`
    trajectories, splits them 50/50 into train/validation halves, optionally
    refits the value baseline, then pickles the batch-gradient statistics
    before (c=0) and after (c=1) the Stein phi update.
    """
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # time-step feature
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    logger.log("loading scaler")
    # FIX: renamed context variable from 'input' — it shadowed the builtin.
    with open('models/scaler/scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs,
                    policy_size=policy_size, phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj)
    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    # whether to load the value-function baseline or train from scratch;
    # no big impact on stein
    load_v = False
    if load_v:
        val_func.load_val_model(load_dir)
    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)
    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d" % (
        np.mean(traj_len_list),
        np.sum(traj_len_list)))
    episode += len(trajectories)
    # Split data into validation and training halves
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]
    # refit the value baseline once before evaluating; no big impact on stein
    refit_v = True
    if refit_v:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(
            tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")
    # build training data after refit v
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(
        t_trajectories)
    # build validation data after refit v
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set(
        v_trajectories)
    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d" % (
        max_timesteps, env_name, phi_obj, seed, max_timesteps)
    if not os.path.exists(sub_folder):
        # FIX: path is nested — os.mkdir fails when the parent directory is
        # missing; makedirs creates the whole chain.
        os.makedirs(sub_folder)
    # save the original (Monte Carlo, c=0) gradient statistics
    mc_grad_info = policy.get_batch_gradient(v_observes, v_actions, v_advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(mc_grad_info, fp)
    d = Dataset(dict(ob=t_observes, ac=t_actions, atarg=t_advantages,
                     vtarg=t_disc_sum_rew), shuffle=True)
    for _ in range(phi_epochs):          # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model, batch['ob'], batch['ac'], batch['atarg'],
                          use_lr_adjust, ada_kl_penalty, c=1)  # update policy
    # gradient statistics with the Stein control variate (c=1)
    stein_grad_info = policy.get_batch_gradient(v_observes, v_actions,
                                                v_advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """Main training loop for risk-aware (CVaR) PPO.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: whether to plot learning curves and run a final eval
        risk_targ: CVaR risk constraint target
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str(
        now_utc.year) + '_' + str(
            ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str(
                now_utc.second)  # adjust for Montreal Time Zone
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))  # discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        print(nodisc0)
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        # plot KL divergence per batch
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, (len(kl_terms) + 1)))
        plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        # FIX: plt.close("KL_curve.png") passed a *filename* as a figure label,
        # closing nothing and leaking a figure per batch.
        plt.close()
        # plot the beta Lagrange multiplier per batch
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, (len(beta_terms) + 1)))
        plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close()  # FIX: same filename-as-label bug as above
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        # final evaluation: histogram of episode returns under the final policy
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
    logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct):
    """Main MPI training loop: workers roll out, rank 0 trains and broadcasts.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch (all workers)
        nprocs, policy_hid_list, valfunc_hid_list, gpu_pct: unused here;
            kept for caller compatibility
    """
    env, obs_dim, act_dim = init_gym(env_name)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    if mpi_util.rank == 0:
        now = datetime.utcnow().strftime(
            "%b-%d_%H:%M:%S")  # create unique directories
        aigym_path = os.path.join('/tmp', env_name, now)
        env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)
    policy = Policy(obs_dim, act_dim, kl_targ)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    if mpi_util.rank == 0:
        # run a few episodes (on node 0) of untrained policy to initialize scaler:
        trajectories = run_policy(env, policy, scaler, episodes=5)
        unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
        scaler.update(unscaled)  # update running statistics for scaling observations
    # broadcast policy weights, scaler, val_func
    (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
        policy, scaler, val_func)
    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if worker_batch_size * mpi_util.nworkers != batch_size:
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)
    episode = 0
    # NOTE(review): `episode` is only advanced on rank 0; confirm mpi_util's
    # broadcast (or an external mechanism) also terminates the worker ranks.
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)
        if mpi_util.rank == 0:  # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])
            episode += len(trajectories)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            logger.log({
                '_MeanReward':
                np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps':
                np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                            episode)
            policy.update(observes, actions, advantages, logger)  # update policy
            val_func.fit(observes, disc_sum_rew, logger)  # update value function
            unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
            scaler.update(unscaled)  # update running stats for scaling observations
            logger.write(display=True)  # write logger results to file and stdout
        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)
    if mpi_util.rank == 0:
        logger.close()
    # FIX: both sessions exist on every rank, but val_func.close_sess() was
    # guarded by rank 0 while policy.close_sess() was not — close both everywhere.
    policy.close_sess()
    val_func.close_sess()
def __init__(self):
    """Build the shared ('central') value and policy networks."""
    # NOTE(review): obs_dim, hid1_mult, act_dim, kl_targ and policy_logvar are
    # module-level globals not visible in this chunk — confirm they exist
    # before instantiation.
    with tf.name_scope("central_agent"):
        self.val_func = NNValueFunction(obs_dim, hid1_mult)
        self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    self.num_tuple = 0  # running count of trajectories consumed
class Central_agent:
    """Parameter server: owns the shared policy/value networks and applies
    batched updates submitted by worker threads."""

    def __init__(self):
        """Build the shared networks under a dedicated name scope."""
        with tf.name_scope("central_agent"):
            self.val_func = NNValueFunction(obs_dim, hid1_mult)
            self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                                 policy_logvar)
        self.num_tuple = 0  # running count of trajectories consumed

    def update_parameter_server(self, episode, trajectories, name):
        """Apply one policy/value update from a worker's trajectory batch.

        Skips the update when fewer than `batch_size` trajectories arrived;
        checkpoints both networks every 3 batches.
        """
        self.num_tuple += len(trajectories)
        if len(trajectories) < batch_size:
            return
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = self.build_train_set(
            trajectories)
        # add various stats to training log:
        self.log_batch_stats(observes, actions, advantages, disc_sum_rew,
                             logger, episode)
        self.policy.update(observes, actions, advantages, logger)  # update policy
        self.val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        print([
            'thread_name: ' + name + ', episode: ' + str(episode) +
            ', tuples: ' + str(self.num_tuple)
        ])
        if episode % (batch_size * 3) == 0:
            self.policy.save(episode, filename1)
            self.val_func.save(episode, filename2)

    def build_train_set(self, trajectories):
        """Concatenate trajectories into flat arrays; normalize advantages."""
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # zero-mean / unit-variance advantages (epsilon guards division by zero)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
        return observes, actions, advantages, disc_sum_rew

    def log_batch_stats(self, observes, actions, advantages, disc_sum_rew,
                        logger, episode):
        """Record summary statistics of one training batch."""
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
def main(env_name, max_time_steps, time_steps_batch, time_steps_mini_batch,
         gamma, lamda, kl_targ, clipping_range, pol_loss_type, init_pol_logvar,
         animate, save_video, save_rate, num_episodes_sim, task_params,
         task_name, dims_core_hid, dims_head_hid, act_func_name,
         time_step_to_load, now_to_load):
    """Multi-task PPO training loop with checkpoint save/restore.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        max_time_steps: maximum number of time steps to run
        time_steps_batch: number of time steps per policy training batch
        time_steps_mini_batch: mini-batch size for network updates
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        clipping_range: max value to clip the policy gradient ratio
        pol_loss_type: which loss to use for the Policy network
        init_pol_logvar: natural log of initial policy variance
        animate: "True"/"False" string — render episodes while training
        save_video: "True"/"False" string — save videos of the agent
        save_rate: how often (in time steps) to checkpoint/save videos
        num_episodes_sim: number of episodes to simulate/save videos for
        task_params: per-task environment modification parameters
        task_name: user-assigned name of the task family
        dims_core_hid / dims_head_hid: hidden-layer sizes (core / heads)
        act_func_name: activation function name (unused here; kept for callers)
        time_step_to_load: checkpoint to resume from (None = fresh run,
            -1 = latest saved)
        now_to_load: timestamp directory of the experiment to resume
    """
    # **************** Environment Initialization and Paths ***************
    task_params_str = ''.join(str(e) + ', ' for e in task_params)
    num_tasks = len(task_params)
    envs = [None] * num_tasks
    scalers = [None] * num_tasks
    loggers = [None] * num_tasks
    print("\n\n------ PATHS: ------")
    start_time = datetime.now()
    if time_step_to_load is None:  # fresh run -> unique directory from timestamp
        now = start_time.strftime("%b-%d_%H:%M:%S")
    else:
        assert now_to_load is not None,\
            "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load)
        now = now_to_load
    logs_path = os.path.join('log-files', env_name, task_name, task_params_str,
                             now)
    for task in range(num_tasks):
        # Create task specific environment
        envs[task], obs_dim, act_dim = init_gym(env_name,
                                                task_param=task_params[task])
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
        # Create task specific Paths and logger object
        loggers[task] = Logger(
            logname=[env_name, task_name, task_params_str], now=now,
            logname_file="_{}_{}".format(task_name, task_params[task]))
        if time_step_to_load is None:  # If NOT loading from Checkpoint
            scalers[task] = Scaler(obs_dim)
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(
                    logs_path + '/aux_{}_{}.txt'.format(task_name,
                                                        task_params[task]),
                    'w') as f:
                f.write("_TimeStep" + " " + "_MeanReward")
    aigym_path = os.path.join('./videos', env_name, task_name, task_params_str,
                              now)  # videos folders
    agent_path = os.path.join('agents', env_name, task_name, task_params_str,
                              now)  # agent / policy folders
    if time_step_to_load is None:  # If NOT loading from Checkpoint
        os.makedirs(agent_path)
        with open(agent_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command
        with open(logs_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command
    print("\nPath for Saved Videos : {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))
    # ********* Initialize Policy, Value Networks and Scaler *********
    print("\n\n------ NEURAL NETWORKS: ------")
    # Modify dims lists so position 0 holds the size of layer 'n-1'
    dims_core_hid.insert(0, obs_dim)
    dims_head_hid.insert(0, dims_head_hid[-1])
    val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid, num_tasks,
                               time_steps_mini_batch)
    policy = Policy(obs_dim, act_dim, dims_core_hid, dims_head_hid, num_tasks,
                    time_steps_mini_batch, pol_loss_type=pol_loss_type)
    # Load from Checkpoint:
    # Validate intended time step OR get last time step number if none provided
    if time_step_to_load is not None:
        load_agent_path = agent_path  # agent / policy folders
        saved_ep_list = [
            file.split(".")[0].split("_")[-1]
            for file in os.listdir(load_agent_path) if "policy" in file
        ]
        if time_step_to_load == -1:  # Get last saved time step
            time_step_to_load = sorted(
                [int(ep_string) for ep_string in saved_ep_list])[-1]
        else:  # Validate if time_step_to_load was indeed saved
            assert str(time_step_to_load) in saved_ep_list,\
                "\n\nWARNING: Time Step you want to load ({}) was not stored during trainning".format(time_step_to_load)
        # Load Policy Network's Ops and Variables & Load Scaler Object
        policy.tf_saver.restore(
            policy.sess,
            "{}/policy_ep_{}".format(load_agent_path, time_step_to_load))
        val_func.tf_saver.restore(
            val_func.sess,
            "{}/val_func_ep_{}".format(load_agent_path, time_step_to_load))
        scalers = pickle.load(
            open(
                "{}/scalers_ep_{}.p".format(load_agent_path, time_step_to_load),
                'rb'))
        print("\n\n ---- CHECKPOINT LOAD: Time Step Loaded **{}**".format(
            time_step_to_load))
        # Delete extra epochs that were logged to the auxiliary logs
        for task in range(num_tasks):
            aux_log_path = logs_path + '/aux_{}_{}.txt'.format(
                task_name, task_params[task])
            aux_log = pd.read_table(aux_log_path, delim_whitespace=True)
            idx_to_cut = aux_log.index[aux_log["_TimeStep"] ==
                                       time_step_to_load].tolist()[0]
            aux_log[0:idx_to_cut + 1].to_csv(
                aux_log_path, header=True, index=False, sep=' ',
                mode='w')  # overwrite trimmed aux_log
    # If NOT loading from Checkpoint: run some time steps to initialize scalers
    # and create Tensor board dirs
    elif time_step_to_load is None:
        for task in range(num_tasks):
            run_policy(envs[task], policy, scalers[task], loggers[task],
                       time_steps_batch=int(time_steps_batch / 3), task=task)
        # Tensor Board writer
        os.makedirs(agent_path + '/tensor_board/policy')
        os.makedirs(agent_path + '/tensor_board/valFunc')
    # NOTE(review): indentation was lost upstream — writers are assumed to be
    # created on every run (checkpoint resume included); confirm against repo.
    tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy',
                                          graph=policy.g)
    tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc',
                                          graph=val_func.g)
    # **************** Start Training ***************
    print("\n\n------ TRAINNING: ------")
    animate = animate == "True"        # CLI passes strings, not booleans
    save_video = save_video == "True"
    saver_offset = save_rate
    killer = GracefulKiller()
    if time_step_to_load is None:
        time_step = 0
    else:
        time_step = time_step_to_load
    # Time steps are counted across all tasks i.e. N time steps indicates each
    # task has been run for N steps
    while time_step < max_time_steps and not killer.kill_now:
        # **************** Obtain data (train set) ***************
        observes_all = [None] * num_tasks
        actions_all = [None] * num_tasks
        advantages_all = [None] * num_tasks
        disc_sum_rew_all = [None] * num_tasks
        time_step += time_steps_batch
        for task in range(num_tasks):
            # Obtain 'time_steps_batch' trajectories + intermediate calculations
            trajectories = run_policy(envs[task], policy, scalers[task],
                                      loggers[task],
                                      time_steps_batch=time_steps_batch,
                                      task=task, animate=animate)
            add_value(trajectories, val_func, task)  # estimated values
            add_disc_sum_rew(trajectories, gamma)    # discounted sum of Rs
            add_gae(trajectories, gamma, lamda)      # advantage
            # Concatenate all time steps into single NumPy arrays
            observes_all[task], actions_all[task], advantages_all[task], \
                disc_sum_rew_all[task] = build_train_set(trajectories)
            # Logging Stats
            log_batch_stats(observes_all[task], actions_all[task],
                            advantages_all[task], disc_sum_rew_all[task],
                            loggers[task], time_step)
        # **************** Update Policy and Value Networks ***************
        for task in range(num_tasks):
            pol_summary = policy.update(task, observes_all[task],
                                        actions_all[task],
                                        advantages_all[task],
                                        loggers[task])  # update policy
            val_summary = val_func.fit(task, observes_all[task],
                                       disc_sum_rew_all[task],
                                       loggers[task])  # update value function
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(
                    logs_path + '/aux_{}_{}.txt'.format(task_name,
                                                        task_params[task]),
                    'a') as f:
                f.write("\n" + str(loggers[task].log_entry['_TimeStep']) + " " +
                        str(loggers[task].log_entry['_MeanReward']))
            loggers[task].write(display=False)  # write logger results to file
            tb_pol_writer.add_summary(pol_summary, global_step=time_step)
            tb_val_writer.add_summary(val_summary, global_step=time_step)
        # **************** Storing NN and Videos ***************
        # Store every 'save_rate' steps or in first/last time steps
        if time_step >= saver_offset or time_step >= max_time_steps \
                or time_step <= time_steps_batch * 1.5 or killer.kill_now:
            # TODO: Make saving agent/video a method callable on killer.kill_now
            saver_offset += save_rate
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(
                agent_path, time_step))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(
                agent_path, time_step))  # Save Value Network
            pickle.dump(
                scalers,
                open("{}/scalers_ep_{}.p".format(agent_path, time_step), 'wb'))
            print("---- Saved Agent at Time Step {} ----".format(time_step))
            # Save video of current agent/policy
            if save_video:
                print("---- Saving Video at Time Step {} ----".format(
                    time_step))
                for task in range(num_tasks):
                    _ = sim_agent(envs[task], policy, task, scalers[task],
                                  num_episodes_sim, save_video=True,
                                  out_dir=aigym_path +
                                  "/vid_ts_{}/{}_{}".format(
                                      time_step, task_name,
                                      task_params[task]))
                    envs[task].close()  # closes window opened by monitor wrapper
                    envs[task], _, _ = init_gym(
                        env_name, task_param=task_params[task]
                    )  # Recreate env as it was killed
            print("\n\n")
        # If Ctrl + C is pressed, ask user whether training shall be terminated
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # **************** Terminate Variables **************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()
    # Save elapsed time (FIX: removed stray no-op `timedelta(0, 8, 562000)`)
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(delta_time[0],
                                                         delta_time[1])
    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
    with open(logs_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, act_dim, obs_dim, final_pol_test,
         **kwargs):
    """Main training loop (standard PPO).

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: whether to plot learning curves and run a final eval
        act_dim / obs_dim: action / observation dimensions
        final_pol_test: number of evaluation episodes for the final policy
        **kwargs: forwarded to init_env()
    """
    killer = GracefulKiller()
    env = init_env(env_name, **kwargs)
    # add 1 to obs dimension for time step feature (see run_episode())
    obs_dim += 1
    tz = timezone('America/Montreal')  # Montreal Timezone
    dt = datetime.now(tz)
    now = dt.strftime('%Y-%m-%d %H_%M_%S')  # Create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    while episode < num_episodes:
        trajectories, tot_stuck = run_policy(env, policy, scaler, logger,
                                             episodes=batch_size)
        episode += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew, unscaled_observes = \
            build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            # FIX: was raw_input (Python 2) — NameError under Python 3
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            plt.plot(x1, rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("log-learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            plt.plot(x2, mean_rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        # final evaluation: histogram of episode returns under the final policy
        print('running simulations')
        tr, tot_stuck = run_policy(env, policy, scaler, logger,
                                   episodes=final_pol_test)
        print('done')
        sum_rewww = [t['rewards'].sum() for t in tr]
        sum_rewww += [tot_stuck]
        print('total stucks', sum_rewww[-1])
        hist_dat = np.array(sum_rewww)
        plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('Standard PPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("standard_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
    logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs, policy_hid_list, valfunc_hid_list, gpu_pct): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ # killer = GracefulKiller() if mpi_util.nworkers > 1: batch_size = batch_size // mpi_util.nworkers if batch_size % mpi_util.nworkers == 0 else batch_size // mpi_util.nworkers + 1 # spread the desired batch_size across processes env, obs_dim, act_dim = init_gym(env_name) mpi_util.set_global_seeds(111 + mpi_util.rank) env.seed(111 + mpi_util.rank) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) if mpi_util.rank == 0: env = wrappers.Monitor(env, aigym_path, force=True, write_upon_reset=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, valfunc_hid_list) policy = Policy(obs_dim, act_dim, kl_targ, policy_hid_list) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: mpi_util.timeit( '--------------------------' ) # let's time everything so we can see where the work is being done trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) mpi_util.timeit('run_policy') # episode += len(trajectories) episode += mpi_util.all_sum(len(trajectories)) mpi_util.timeit('mpi_util.all_sum') add_value(trajectories, val_func) # add estimated values to episodes mpi_util.timeit('add_value') add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs mpi_util.timeit('add_disc_sum_rew') 
add_gae(trajectories, gamma, lam) # calculate advantage mpi_util.timeit('add_gae') # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) mpi_util.timeit('build_train_set') # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) mpi_util.timeit('log_batch_stats') if mpi_util.rank == 0: policy.update(observes, actions, advantages, logger) # update policy mpi_util.timeit('policy.update') val_func.fit(observes, disc_sum_rew, logger) # update value function mpi_util.timeit('val_func.fit') mpi_util.rank0_bcast_wts( val_func.sess, val_func.g, 'val' ) # doubt if value network is used during rollouts but it only takes a few milliseconds anyhow mpi_util.timeit('mpi_util.rank0_bcast_wts(val_func') mpi_util.rank0_bcast_wts(policy.sess, policy.g, 'policy') mpi_util.timeit('mpi_util.rank0_bcast_wts(policy') if mpi_util.rank == 0: logger.write( display=True) # write logger results to file and stdout # if killer.kill_now: # if input('Terminate training (y/[n])? ') == 'y': # break # killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, save):
    """ Main training loop.

    Args:
        env_name: OpenAI Gym environment name; only 'Swimmer-v1' and
            'HalfCheetah-v1' are supported (see score-window table below)
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        save: unused in this function -- TODO confirm caller's intent

    Returns:
        Negative mean of the last score_window worth of batch mean rewards
        (lower is better, e.g. for a hyper-parameter optimizer).

    Raises:
        ValueError: if env_name is not one of the supported environments.
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    env_id = env_name + id_generator()
    logger = Logger(logname=env_id, now=now)
    aigym_path = os.path.join('/tmp', env_id)
    # monitor without recording video
    env = wrappers.Monitor(env, aigym_path, force=True,
                           video_callable=lambda episode_id: False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    if env_name == 'Swimmer-v1':
        score_window = 100
        solution_score = 360
    elif env_name == 'HalfCheetah-v1':
        score_window = 100
        solution_score = 4800
    else:
        # BUG FIX: was `assert False`, which is silently stripped under
        # `python -O`; raise an explicit error for unsupported environments.
        raise ValueError('unsupported environment: ' + env_name)
    # assert score_window % batch_size == 0
    # rolling window of batch mean rewards covering ~score_window episodes
    rewards = collections.deque(maxlen=int(np.rint(score_window / batch_size)))
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        mean_reward = logger.log_entry['_MeanReward']
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        rewards.append(mean_reward)
        # early-stopping toggle, deliberately disabled (uses solution_score):
        '''
        if np.mean(rewards) >= solution_score:
            episode = episode - score_window
            break
        '''
    logger.close()
    policy.close_sess()
    val_func.close_sess()
    # return episode
    return -np.mean(rewards)
def add_value(trajectories, val_func):
    """Attach predicted state values to every time step of every trajectory.

    Args:
        trajectories: list of trajectory dicts, as returned by run_policy()
        val_func: object with a predict() method that maps observations to
            estimated state values

    Returns:
        None (each trajectory dict gains a 'values' entry)
    """
    for traj in trajectories:
        traj['values'] = val_func.predict(traj['observes'])


def add_gae(trajectories, gamma, lam):
    """Add the generalized advantage estimator to each trajectory.

    https://arxiv.org/pdf/1506.02438.pdf

    Args:
        trajectories: trajectory dicts that already contain 'values'
            (see add_value())
        gamma: reward discount
        lam: GAE lambda. lam=0 gives plain TD residuals;
            lam=1 gives (sum of discounted rewards) - V_hat(s)

    Returns:
        None (each trajectory dict gains an 'advantages' entry)
    """
    for traj in trajectories:
        # rewards are scaled down unless gamma is essentially 1
        if gamma < 0.999:
            rewards = traj['rewards'] * (1 - gamma)
        else:
            rewards = traj['rewards']
        values = traj['values']
        # one-step temporal-difference residuals; terminal next-value is 0
        tds = rewards - values + np.append(values[1:] * gamma, 0)
        traj['advantages'] = discount(tds, gamma * lam)


def build_train_set(trajectories):
    """Flatten per-episode arrays into single training arrays.

    Args:
        trajectories: trajectory dicts processed by add_disc_sum_rew(),
            add_value() and add_gae()

    Returns:
        4-tuple of NumPy arrays:
            observes: shape = (N, obs_dim)
            actions: shape = (N, act_dim)
            advantages: shape = (N,), normalized to ~zero mean / unit std
            disc_sum_rew: shape = (N,)
    """
    def concat(key):
        # stack one field across all episodes
        return np.concatenate([t[key] for t in trajectories])

    observes = concat('observes')
    actions = concat('actions')
    disc_sum_rew = concat('disc_sum_rew')
    advantages = concat('advantages')
    # normalize advantages (epsilon guards against zero std)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew


def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                    episode):
    """Record summary statistics for one training batch.

    Note: the '_std_*' entries follow the original convention -- mean
    per-feature variance for observations/actions, and plain variance
    (np.var) for advantages and discounted rewards.
    """
    logger.log({'_mean_obs': np.mean(observes),
                '_min_obs': np.min(observes),
                '_max_obs': np.max(observes),
                '_std_obs': np.mean(np.var(observes, axis=0)),
                '_mean_act': np.mean(actions),
                '_min_act': np.min(actions),
                '_max_act': np.max(actions),
                '_std_act': np.mean(np.var(actions, axis=0)),
                '_mean_adv': np.mean(advantages),
                '_min_adv': np.min(advantages),
                '_max_adv': np.max(advantages),
                '_std_adv': np.var(advantages),
                '_mean_discrew': np.mean(disc_sum_rew),
                '_min_discrew': np.min(disc_sum_rew),
                '_max_discrew': np.max(disc_sum_rew),
                '_std_discrew': np.var(disc_sum_rew),
                '_Episode': episode})


def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """Main training loop.

    Args:
        env_name: robot model name
        num_episodes: maximum number of episodes to run (int)
        gamma: reward discount factor (float)
        lam: lambda for Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    env, obs_dim, act_dim = init_env(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # ':' is replaced because it is illegal in Windows directory names
    run_stamp = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=run_stamp)
    pathFolder = logger.pathFolder
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # TODO: add the sampling part once everything works
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  numEpisodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # estimated state values
        add_disc_sum_rew(trajectories, gamma)  # discounted reward sums
        add_gae(trajectories, gamma, lam)      # advantages
        # flatten all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # batch statistics for the training log
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # policy step
        val_func.fit(observes, disc_sum_rew, logger)  # value-function step
        logger.write(display=True)  # flush results to file and stdout
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        TestNote: free-form tag appended to the run's directory name
    """
    print('Start time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # create unique directories; local time on purpose (was utcnow, i.e. GMT)
    # NOTE(review): '%H:%M:%S' puts ':' in directory names, which is illegal on
    # Windows -- confirm target platform before changing the format string.
    now = datetime.now().strftime("%b-%d_%H:%M:%S")
    testname = now + '-' + TestNote
    logger = Logger(logname=env_name, now=testname)
    monitor_path = os.path.join('log-files', env_name, testname, 'monitor')
    env = wrappers.Monitor(env, monitor_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    print('Start time:\n')
    # BUG FIX: the strftime() result was computed and discarded here, so this
    # second timestamp never appeared; print it.
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    # BUG FIX: the original test `not episode % (num_episodes / 10)` used a
    # float divisor, so checkpoints were only ever saved when num_episodes was
    # an exact multiple of 10 (and num_episodes < 10 raised
    # ZeroDivisionError). Use an integer interval clamped to at least 1.
    save_interval = max(1, num_episodes // 10)
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        # save models roughly 10 times over the course of training
        if episode % save_interval == 0:
            policy_save_path = os.path.join('log-files', env_name, testname,
                                            'checkpoint')
            policy.save_model(env_name + "-" + str(episode), policy_save_path)
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume):
    """ Main training loop with checkpoint/resume and periodic recording.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        weights_path: checkpoint directory to initialize the policy from
        init_episode: episode counter to start from (overridden when resuming)
        experiment_name: sub-directory name for logs/results
        resume: when True, restore from the latest checkpoint in the
            experiment's results directory and continue counting from it
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)
    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        # checkpoint files are named '<prefix>-<episode>'; recover the counter
        init_episode = int(
            os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    env, obs_dim, act_dim = init_gym(env_name)
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = init_episode
    while episode <= num_episodes:
        # BUG FIX: was `episode % 1000 is 0` -- `is` compares object identity,
        # not value, and is unreliable for ints; use `== 0`.
        # NOTE(review): this only fires when the counter lands exactly on a
        # multiple of 1000 (episode grows by len(trajectories) per batch) --
        # confirm that batch sizes keep it aligned.
        if episode % 1000 == 0:
            # record one episode and snapshot the policy weights
            record(env_name, aigym_path, policy, scaler)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # record one last episode
    record(env_name, aigym_path, policy, scaler)
    logger.close()
    policy.close_sess()
    val_func.close_sess()