Example #1
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: if True, save learning-curve plots and a final reward histogram
        risk_targ: risk (CVaR) target passed to the Policy constructor
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = '{}-{}-{}_{}.{}.{}'.format(
        now_utc.day, now_utc.strftime('%b'), now_utc.year,
        (now_utc.hour - 4) % 24, now_utc.minute,
        now_utc.second)  # adjust UTC hour for Montreal Time Zone
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    #big_li_rew_nodisc0 = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        #predicted_values_0 = [t['values'][0] for t in trajectories]
        add_disc_sum_rew(
            trajectories, gamma, scaler.mean_rew,
            np.sqrt(scaler.var_rew))  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        print('scaled sum rewards', nodisc0)
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, len(kl_terms) + 1))
        plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        plt.close()
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, len(beta_terms) + 1))
        plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
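
Example #1 builds its Policy with risk_targ, the string 'CVaR' and a batch size, and plots policy.beta after every update, which suggests a risk-constrained variant of PPO in which a Lagrange multiplier beta is adapted toward a CVaR target. The Policy internals are not shown in this listing, so the snippet below is only a minimal sketch of that idea; cvar, update_beta, alpha and beta_lr are illustrative names and values, not taken from the example.

import numpy as np

def cvar(returns, alpha=0.05):
    """Empirical CVaR_alpha: mean of the worst alpha-fraction of returns."""
    returns = np.asarray(returns, dtype=np.float64)
    cutoff = np.quantile(returns, alpha)          # alpha-quantile (the VaR level)
    tail = returns[returns <= cutoff]             # worst-case tail of the return distribution
    return tail.mean() if tail.size else float(cutoff)

def update_beta(beta, returns, risk_targ, beta_lr=0.01):
    """One dual-ascent step on the multiplier of a CVaR(returns) >= risk_targ constraint."""
    violation = risk_targ - cvar(returns)         # positive while the policy is too risky
    return max(0.0, beta + beta_lr * violation)   # keep the multiplier non-negative

Under such a rule beta grows while the empirical CVaR of recent returns stays below risk_targ and relaxes back toward zero once the constraint holds, which is the kind of behaviour the "lagrange_beta_curve.png" plot above would make visible.
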
Example #2
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, act_dim, obs_dim, final_pol_test,
         **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: if True, save learning-curve plots and a final reward histogram
        act_dim: action space dimension
        obs_dim: observation space dimension
        final_pol_test: number of episodes for the final policy evaluation run
        **kwargs: extra arguments forwarded to init_env()
    """
    killer = GracefulKiller()
    env = init_env(env_name, **kwargs)
    # add 1 to obs dimension for time step feature (see run_episode())
    obs_dim += 1
    tz = timezone('America/Montreal')  # Montreal Timezone
    dt = datetime.now(tz)  # Create unique directories
    now = dt.strftime('%Y-%m-%d %H_%M_%S')
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
        log_dir = './log-files/' + env_name + '/' + now + '/'
    while episode < num_episodes:
        trajectories, tot_stuck = run_policy(env,
                                             policy,
                                             scaler,
                                             logger,
                                             episodes=batch_size)
        episode += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculated discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew, unscaled_observes = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            plt.plot(x1, rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("log-learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            plt.plot(x2, mean_rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        print('running simulations')
        tr, tot_stuck = run_policy(env,
                                   policy,
                                   scaler,
                                   logger,
                                   episodes=final_pol_test)
        print('done')
        sum_rewww = [t['rewards'].sum() for t in tr]
        sum_rewww += [tot_stuck]
        print('total stucks', sum_rewww[-1])
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('Standard PPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("standard_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
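
Both examples delegate return and advantage computation to add_disc_sum_rew and add_gae, whose bodies are not included in this listing. For reference, a minimal sketch of the standard discounted-return and Generalized Advantage Estimation recursions they presumably implement (using gamma and lam as in the calls above, and omitting the reward-scaling arguments the real functions receive) could look like this; the trajectory dict keys match the ones read elsewhere in the examples, everything else is an assumption.

import numpy as np

def add_disc_sum_rew(trajectories, gamma):
    """Attach discounted returns G_t = r_t + gamma * G_{t+1} to each trajectory."""
    for traj in trajectories:
        rewards = traj['rewards']
        disc = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            disc[t] = running
        traj['disc_sum_rew'] = disc

def add_gae(trajectories, gamma, lam):
    """Attach GAE advantages A_t = delta_t + gamma * lam * A_{t+1},
    with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)."""
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        adv = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            next_value = values[t + 1] if t + 1 < len(values) else 0.0
            delta = rewards[t] + gamma * next_value - values[t]
            running = delta + gamma * lam * running
            adv[t] = running
        traj['advantages'] = adv

In the training loops these per-trajectory arrays are then concatenated across episodes by build_train_set before the policy and value-function updates.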