def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: if True, plot learning curves and save final-policy statistics
        risk_targ: CVaR risk target passed to the risk-aware Policy
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()
    # create unique directory name; the fixed -4 hour shift approximates Montreal time
    now = '{}-{}-{}_{}.{}.{}'.format(
        now_utc.day, now_utc.strftime('%b'), now_utc.year,
        (now_utc.hour - 4) % 24, now_utc.minute, now_utc.second)
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])

    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        # track the KL divergence of each policy update
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, len(kl_terms) + 1))
        plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        plt.close()

        # track the beta Lagrange multiplier of the CVaR constraint
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, len(beta_terms) + 1))
        plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close()

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, len(rew_graph) + 1))
            plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()

            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, len(mean_rew_graph) + 1))
            plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()

    if print_results:
        # evaluate the final policy and save a histogram of episode returns
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)

    logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
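# Illustrative launcher for the risk-aware (CVaR) variant above. This is a
# minimal sketch, not part of the original script: the hyperparameter values
# below are assumptions chosen only to show how main() is called.
if __name__ == '__main__':
    main(env_name='Hopper-v1',
         num_episodes=20000,
         gamma=0.995,
         lam=0.98,
         kl_targ=0.003,
         batch_size=20,
         hid1_mult=10,
         policy_logvar=-1.0,
         print_results=True,
         risk_targ=0.1)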
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, act_dim, obs_dim, final_pol_test, **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: if True, plot learning curves and save final-policy statistics
        act_dim: action space dimension
        obs_dim: observation space dimension
        final_pol_test: number of episodes used to evaluate the final policy
        **kwargs: extra keyword arguments forwarded to init_env()
    """
    killer = GracefulKiller()
    env = init_env(env_name, **kwargs)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    tz = timezone('America/Montreal')  # Montreal time zone
    dt = datetime.now(tz)
    now = dt.strftime('%Y-%m-%d %H_%M_%S')  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    log_dir = './log-files/' + env_name + '/' + now + '/'

    while episode < num_episodes:
        trajectories, tot_stuck = run_policy(env, policy, scaler, logger,
                                             episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew, unscaled_observes = \
            build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, len(rew_graph) + 1))
            plt.plot(x1, rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("log-learning_curve.png")
            plt.close()

            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, len(mean_rew_graph) + 1))
            plt.plot(x2, mean_rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()

    if print_results:
        # evaluate the final policy and save a histogram of episode returns
        print('running simulations')
        tr, tot_stuck = run_policy(env, policy, scaler, logger,
                                   episodes=final_pol_test)
        print('done')
        sum_rewww = [t['rewards'].sum() for t in tr]
        sum_rewww += [tot_stuck]  # append the count of stuck episodes as the last entry
        print('total stucks', sum_rewww[-1])
        hist_dat = np.array(sum_rewww)
        plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('Standard PPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("standard_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)

    logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
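# Illustrative command-line entry point for the standard-PPO variant above.
# This is a minimal sketch, not part of the original script: the flag names
# and defaults are assumptions mirroring main()'s signature.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a policy with standard PPO')
    parser.add_argument('env_name', type=str, help='OpenAI Gym environment name')
    parser.add_argument('--num_episodes', type=int, default=1000)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--lam', type=float, default=0.98)
    parser.add_argument('--kl_targ', type=float, default=0.003)
    parser.add_argument('--batch_size', type=int, default=20)
    parser.add_argument('--hid1_mult', type=int, default=10)
    parser.add_argument('--policy_logvar', type=float, default=-1.0)
    parser.add_argument('--print_results', action='store_true')
    parser.add_argument('--act_dim', type=int, required=True)
    parser.add_argument('--obs_dim', type=int, required=True)
    parser.add_argument('--final_pol_test', type=int, default=1000)
    args = parser.parse_args()
    main(**vars(args))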