Example #1
def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ,
         batch_size):
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)
            capture = True
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
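The GracefulKiller used in almost every example here is never shown in this listing. A minimal sketch, assuming it simply traps SIGINT/SIGTERM and raises a flag that the training loop polls:

import signal

class GracefulKiller:
    """Sketch: set kill_now when SIGINT/SIGTERM arrives so the loop can exit cleanly."""
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True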
Example #2
def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True) 
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)
            capture = True
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
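Each example adds one to obs_dim for a time-step feature handled inside run_episode(), which is not part of this listing. A minimal illustration of what that feature amounts to, assuming the common convention of appending a small scaled step counter to each raw observation (the helper name and the 1e-3 increment are assumptions):

import numpy as np

def augment_with_time(obs, step, increment=1e-3):
    """Illustrative sketch: append a scaled time-step feature to a raw observation."""
    obs = np.asarray(obs, dtype=np.float64).reshape(1, -1)
    return np.append(obs, [[step * increment]], axis=1)  # the extra column behind "obs_dim += 1"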
Example #3
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, clipping_range):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    env.reset()
    env.render()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    clipping_range)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0

    while episode < num_episodes:
        # trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        # episode += len(trajectories)
        # add_value(trajectories, val_func)  # add estimated values to episodes
        # add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        # add_gae(trajectories, gamma, lam)  # calculate advantage
        # # concatenate all episodes into single NumPy arrays
        # observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # # add various stats to training log:
        # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        # policy.update(observes, actions, advantages, logger)  # update policy
        # val_func.fit(observes, disc_sum_rew, logger)  # update value function
        # logger.write(display=True)  # write logger results to file and stdout

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        if episode % 100 == 0:
            policy.save_sess()

    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #4
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    print('Testing Period:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))



    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    env.set_goals(0)

    now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories (utcnow gave Greenwich/UTC time, so changed utcnow to now)
    testname = now+'-'+TestNote
    logger = Logger(logname=env_name, now=testname)
    aigym_path = os.path.join('log-Test-files', env_name, testname)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    policy.load_model('/home/drl/PycharmProjects/warker_test/log-files/My3LineDirect-v1/Jan-10_07:51:34-A003-SpecGoal-itr15000-g0ExpNo5/checkpoint/My3LineDirect-v1-15000.ckpt')
    episode = 0

    observes, actions, rewards, unscaled_obs, states_x, states_y = rollout(
        env, policy, scaler, max_path_length=batch_size, animate=True)
    tmp = np.vstack((rewards, states_x, states_y))
    tmp1 = np.transpose(tmp)
    data = np.concatenate((observes, actions, tmp1), axis=1)
    trajectory = {}
    for j in range(data.shape[0]):
        for i in range(data.shape[1]):
            trajectory[i] = data[j][i]
        logger.log(trajectory)
        logger.write(display=False)


    logger.close()
    policy.close_sess()
    val_func.close_sess()

    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
Example #5
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    #TODO Change init_gym for one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    # Change wrappers.Monitor for a class of mine that controls the simulation
    # I don't think the wrapper is of any use for my example
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)

    # This is to seed the policy with the optimum
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)  # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function

    # Not sure whether this is still necessary
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
Example #6
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    #capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        """if episode > 600 and not capture:
               env.ScreenCapture(5)
               capture = True"""
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
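add_disc_sum_rew() and add_gae() are called throughout these loops but never shown. A minimal sketch consistent with how they are used (trajectory dicts carrying 'rewards' and 'values' arrays), assuming the standard discounted-filter formulation; the discount() helper name and the exact reward scaling are assumptions:

import numpy as np
import scipy.signal

def discount(x, gamma):
    """Discounted cumulative sum: y[t] = x[t] + gamma * y[t+1] (sketch)."""
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def add_disc_sum_rew(trajectories, gamma):
    """Attach the discounted return-to-go used as the value-function target (sketch)."""
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)

def add_gae(trajectories, gamma, lam):
    """Attach GAE advantages; assumes add_value() already stored 'values' (sketch)."""
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        # one-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        tds = rewards - values + np.append(values[1:] * gamma, 0.0)
        traj['advantages'] = discount(tds, gamma * lam)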
Example #7
    def restore(self, restore_path):
        name = restore_path.split("/")[
            -1]  # remove preceding/path/components/to/name

        # unpickle and restore scaler - NOTE: this file includes some other variables too
        mypath = "saves/scaler/" + name
        print("restoring scaler checkpoint from:", mypath)
        with open(mypath + ".scaler", 'rb') as f:
            (scaler, episode, obs_dim, act_dim, kl_targ,
             self.init_time) = pickle.load(f)

        # policy
        mypath = "saves/policy/" + name
        print("restoring policy checkpoint from:", mypath)
        policy = Policy(obs_dim, act_dim, kl_targ, restore_path=mypath)
        print("restored policy:")
        Checkpoint.dump_vars(policy.g)

        # val_func
        mypath = "saves/val_func/" + name
        print("restoring val_func checkpoint from:", mypath)
        val_func = NNValueFunction(obs_dim, restore_path=mypath)
        print("restored val_func:")
        Checkpoint.dump_vars(val_func.g)

        print("finished restore.")
        return (policy, val_func, scaler, episode, obs_dim, act_dim, kl_targ)
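A hypothetical call to the restore() method above; only the last component of restore_path is used, and it is looked up under saves/scaler, saves/policy and saves/val_func. The Checkpoint construction and the run name are assumptions:

ckpt = Checkpoint()  # assumed: restore() belongs to the Checkpoint class referenced above
policy, val_func, scaler, episode, obs_dim, act_dim, kl_targ = ckpt.restore(
    "saves/policy/Hopper-v1_run01")  # hypothetical run name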
Example #8
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    env_name = 'hsr'
    killer = GracefulKiller()
    #env, obs_dim, act_dim = init_gym(env_name)
    env = pr2_agent("r_arm")
    obs_dim = 10
    act_dim = 7
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    #while episode < num_episodes:
    '''
Example #9
    def restore_old(self, policy, val_func, scaler, restore_path):
        #mypath = self.checkpoints_dir+"/"+restore_path
        mypath = restore_path

        print("restoring checkpoint from:", mypath)

        from policy import Policy
        from value_function import NNValueFunction

        policy = Policy(policy.obs_dim,
                        policy.act_dim,
                        policy.kl_targ,
                        restore_flag=True)
        with policy.g.as_default():
            print("0000000A")
            Checkpoint.dump_vars(policy.g)
            tf.saved_model.loader.load(policy.sess,
                                       [tf.saved_model.tag_constants.TRAINING],
                                       mypath + ".policy")
            print("1111111A")
            Checkpoint.dump_vars(policy.g)
        policy._placeholders()
        print("YYYY:", policy.obs_ph)

        val_func = NNValueFunction(val_func.obs_dim, restore_flag=True)
        with val_func.g.as_default():
            print("2222222A")
            Checkpoint.dump_vars(val_func.g)
            tf.saved_model.loader.load(val_func.sess,
                                       [tf.saved_model.tag_constants.TRAINING],
                                       mypath + ".val_func")
            print("3333333A")
            Checkpoint.dump_vars(val_func.g)
        val_func._placeholders()
        print("YYYY:", val_func.obs_ph)

        # unpickle and restore scaler
        with open(mypath + ".scaler", 'rb') as f:
            (scaler, episode) = pickle.load(f)

        print("FINISHED RESTORE")
        return (policy, val_func, scaler, episode)
Example #10
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0
    for _ in range(200):
        trajectories, traj_len_list = run_policy(env,
                                                 policy,
                                                 scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model,
                      observes,
                      actions,
                      advantages,
                      use_lr_adjust,
                      ada_kl_penalty,
                      c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    logger.log("saved model")
Example #11
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0

    # to create new file at beginning of trial
    #f= open("coor_state.txt","w")
    #f.close

    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  arg,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #12
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, no_of_updates):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    env, obs_dim, act_dim = init_gym(env_name)
    #env._max_episode_steps = 150
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b_%d_%H_%M_%S")  # create unique directories
    #logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)

    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('videos', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult, no_of_updates)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    no_of_updates)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

    logger.close()
    policy.close_sess()
    val_func.close_sess()
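The Scaler object threaded through these loops maintains running observation statistics; a minimal sketch, assuming Welford-style running mean/variance and a get() method returning (scale, offset) such that scaled_obs = (obs - offset) * scale. Variants that take env_name or support resume()/save() are not modeled here:

import numpy as np

class Scaler:
    """Running mean/variance estimator for observation scaling (sketch)."""
    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.m = 0  # number of samples folded in so far

    def update(self, x):
        """Fold a batch of observations x with shape (n, obs_dim) into the running stats."""
        n = x.shape[0]
        if self.m == 0:
            self.means = x.mean(axis=0)
            self.vars = x.var(axis=0)
        else:
            new_means = (self.means * self.m + x.sum(axis=0)) / (self.m + n)
            self.vars = (self.m * (self.vars + self.means ** 2) +
                         np.sum(x ** 2, axis=0)) / (self.m + n) - new_means ** 2
            self.vars = np.maximum(0.0, self.vars)  # guard against tiny negative values
            self.means = new_means
        self.m += n

    def get(self):
        """Return (scale, offset); callers compute (obs - offset) * scale."""
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means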
Example #13
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar,
        scenario,  num_agents, action_dim, timesteps):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(env_name)
    env = make_env(scenario)
    obs_dims = env.observation_space
    act_dims = [env.action_space[0].n for i in range(env.n)]
   
    obs_dims = [obs_dim.shape[0] + 1 for obs_dim in obs_dims]  # add 1 to obs dimension for time step feature (see run_episode())
  
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=scenario, now=now)
    aigym_path = os.path.join('/tmp', scenario, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dims)
    val_func = NNValueFunction(obs_dims[0]+act_dims[0], hid1_mult)
    policys = []

    for i in range(num_agents):
        policys.append(Policy(i, obs_dims[i], act_dims[0], kl_targ, hid1_mult, policy_logvar, num_agents-1, timesteps))
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policys, scaler, logger,  act_dims[0], timesteps, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policys, scaler, logger, act_dims[0],timesteps, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, intents, act_trajs,  advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        # log_batch_stats(observes, actions,intents, act_trajs,  advantages, disc_sum_rew, logger, episode)
        for i, policy in enumerate(policys):

            policy.update(observes[i], actions[i], intents[i], act_trajs[i], advantages[i], logger)  # update policy
            val_func.fit(observes[i]+intents[i], disc_sum_rew[i], logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    for policy in policys:
        policy.close_sess()
    val_func.close_sess()
Example #14
    def __init__(self,
                 name,
                 obs_dim,
                 act_dim,
                 n_ways,
                 batch_size,
                 log_path,
                 gamma=0.995,
                 lam=0.98,
                 kl_targ=0.003,
                 hid1_mult=10,
                 policy_logvar=1.0):
        self.name = name
        self.obs_dim, self.act_dim = obs_dim, act_dim
        self.n_ways = n_ways
        self.batch_size = batch_size
        self.gamma = gamma
        self.lam = lam
        self.kl_targ = kl_targ
        self.hid1_mult = hid1_mult
        self.policy_logvar = policy_logvar
        self.logger = Logger(logname=os.path.join(log_path, name),
                             now=datetime.utcnow().strftime("%b_%d_%H_%M_%S"))

        self.scaler = Scaler(self.obs_dim)
        self.val_func = NNValueFunction(self.obs_dim, hid1_mult=10)
        self.trpo_net = TrpoNet(name,
                                self.obs_dim,
                                self.act_dim,
                                n_ways=n_ways,
                                kl_targ=kl_targ,
                                hid1_mult=hid1_mult,
                                policy_logvar=policy_logvar)

        self.trajectories = []
        self.episode = 0
Example #15
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_osim()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        if killer.kill_now:
            if portable_input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
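add_value() and build_train_set() appear in most of these training loops but are not listed. A minimal sketch consistent with their call sites, assuming the value network exposes a predict() method and that advantages are normalized before the policy update:

import numpy as np

def add_value(trajectories, val_func):
    """Attach value-function predictions to each trajectory (assumes val_func.predict(); sketch)."""
    for traj in trajectories:
        traj['values'] = val_func.predict(traj['observes'])

def build_train_set(trajectories):
    """Concatenate all episodes into flat arrays and normalize the advantages (sketch)."""
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew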
Example #16
def main(arglist):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(aenv_name)
    env = make_env(arglist.scenario, arglist)
    obs_dim = env.observation_space[0].shape[0]
    act_dim = env.action_space[0].n
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('/tmp', arglist.scenario, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, arglist.hid1_mult)
    trainers, loggers = get_trainers(env, arglist.num_adversaries, obs_dim, act_dim, arglist)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, trainers, scaler, loggers, arglist.max_episode_len , episodes=5)
    episode = 0
    while episode < arglist.num_episodes:
        trajectories = run_policy(env, trainers, scaler, loggers, arglist.max_episode_len ,  episodes=arglist.b_size)
        episode += len(trajectories[0])
        print("episode: {}".format(episode))
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, arglist.gamma)
        add_gae(trajectories, arglist.gamma, arglist.lam)
        observations, actions, advantages, disc_sum_rews = build_train_set(trajectories)
        log_batch_stats(observations, actions, advantages, disc_sum_rews, loggers, episode)
        for i in range(len(trainers)):
            trainers[i].update(observations[i], actions[i], advantages[i], loggers[i])
            val_func.fit(observations[i], disc_sum_rews[i], loggers[i])  
            loggers[i].write(display=True)  

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if episode % arglist.save_rate == 0:
            print("Episode {} complete".format(episode))
            
        # score = play(env, policy1, policy2)   
    for i in range(len(loggers)):
        loggers[i].close()
        trainers[i].close_sess()
        val_func.close_sess()     
Example #17
def main(num_episodes, gamma, lam, kl_targ, batch_size, env_name="Hopper-v2"):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = (datetime.datetime.utcnow() - datetime.timedelta(hours=4)).strftime("%b-%d_%H:%M:%S")  # create unique directories based on EST time (UTC-4)
    logger = Logger(logname=env_name, now=now)
    plotter = Plot(plotname=env_name+"-Fig", now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)  # recording, dir??
    scaler = Scaler(obs_dim)        # obs_dim=377
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)  # kl target=0.003 by default
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, plotter, episodes=5, plot=False)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, plotter, episodes=batch_size)
        episode += len(trajectories)    # length of trajectories equals batch size which by default is 20
        plotter.updateEpisodes(episode)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger, plotter)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    plotter.plot()
    # plt.show()

    policy.close_sess()
    val_func.close_sess()
Example #18
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/home/vatsal', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
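A hypothetical command-line entry point for a main() with the six-argument signature above; the flag names and defaults are assumptions modeled on typical TRPO/PPO training scripts:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train a policy with PPO/TRPO (sketch)')
    parser.add_argument('env_name', type=str, help='OpenAI Gym environment name')
    parser.add_argument('-n', '--num_episodes', type=int, default=1000)
    parser.add_argument('-g', '--gamma', type=float, default=0.995)
    parser.add_argument('-l', '--lam', type=float, default=0.98)
    parser.add_argument('-k', '--kl_targ', type=float, default=0.003)
    parser.add_argument('-b', '--batch_size', type=int, default=20)
    args = parser.parse_args()
    main(**vars(args))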
Example #19
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
	'''
	Main training loop 

	Args:
		env_name: Robot model name
		num_episodes: maximum number of episodes to run (int)
		gamma: reward discount factor (float)
		lam: lambda for Generalized Advantage Estimate
		kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
		batch_size: number of episodes per policy training batch
	'''
	env, obs_dim, act_dim = init_env(env_name)
	obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode())
	now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_")  # create unique directories
	logger = Logger(logname=env_name, now=now)
	pathFolder = logger.pathFolder
	scaler = Scaler(obs_dim)
	val_func = NNValueFunction(obs_dim)
	policy = Policy(obs_dim, act_dim, kl_targ)
	acumulator = BestAcumulator()
	#TODO add the sampling part once everything is working

	# run a few episodes of untrained policy to initialize scaler:
	run_policy(env, policy, scaler, logger, 5, acumulator)
	episode = 0
	while episode < num_episodes:
		trajectories = run_policy(env, policy, scaler, logger, batch_size, acumulator)
		episode += len(trajectories)
		add_value(trajectories, val_func) # add estimated values to episodes
		add_disc_sum_rew(trajectories, gamma) # calculate discounted sum of Rs
		add_gae(trajectories, gamma, lam) # calculate advantage
		# concatenate all episodes into single NumPy arrays
		observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
		# add various stats to train log:
		log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
		policy.update(observes, actions, advantages, logger)  # update policy
		val_func.fit(observes, disc_sum_rew, logger)  # update value function
		logger.write(display=True)  # write logger results to file and stdout
	acumulator.save(pathFolder)
	logger.close()
	policy.close_sess(pathFolder)
	val_func.close_sess(pathFolder)
Example #20
File: messy.py  Project: jasonsiver/DRL
def main():
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    env_name = 'HumanoidasimoMRD4_2-v1'
    #env_name='Humanoid-v1'
    num_episodes = 5000000
    gamma = 0.995
    lam = 0.98
    kl_targ = 0.003
    batch_size = 32
    hid1_mult = 10
    policy_logvar = -1.0

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join(
        '/home/initial/eclipse-workspace4/test/trpo-master/src/result',
        env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult, filename2)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    filename=filename1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if ((episode %
             (batch_size * 3) == 0)):  # & (name == "local_thread3")):
            #print(['stop'])
            policy.save(episode, filename1)
            val_func.save(episode, filename2)
            #loger.flush()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #21
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    # scaler = Scaler(obs_dim)
    logger.log("loading scaler")
    with open('models/scaler/scaler.pkl', 'rb') as input:
        scaler = pickle.load(input)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    load_v = False  #whether load value function baseline or train from scratch; no big impact on stein
    if load_v == True:
        val_func.load_val_model(load_dir)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)

    #Split data into validation and training data
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]

    refit_v = True  # if fit value function baseline once again before evaluating; no big impact on stein
    if refit_v == True:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(
            tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")

    # build training data after refit v
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(
        t_trajectories)

    # build validation data after refit v
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set(
        v_trajectories)

    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        max_timesteps, env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # save original gradient
    mc_grad_info = policy.get_batch_gradient(v_observes,
                                             v_actions,
                                             v_advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    d = Dataset(dict(ob=t_observes,
                     ac=t_actions,
                     atarg=t_advantages,
                     vtarg=t_disc_sum_rew),
                shuffle=True)
    for _ in range(phi_epochs):  # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model,
                          batch['ob'],
                          batch['ac'],
                          batch['atarg'],
                          use_lr_adjust,
                          ada_kl_penalty,
                          c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(v_observes, \
                    v_actions, v_advantages, c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
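Dataset and iterate_once(), used in the evaluation loop above, are not defined in this listing. A minimal stand-in that shuffles once and yields fixed-size mini-batch dictionaries (the real class is likely OpenAI Baselines' Dataset; this is only a sketch):

import numpy as np

class Dataset:
    """Shuffle-once mini-batch iterator over a dict of equally sized arrays (sketch)."""
    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        idx = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(idx)
        for start in range(0, self.n, batch_size):
            sel = idx[start:start + batch_size]
            yield {k: v[sel] for k, v in self.data_map.items()}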
Example #22
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str(
        now_utc.year) + '_' + str(
            ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str(
                now_utc.second)  # adjust for Montreal Time Zone
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    #big_li_rew_nodisc0 = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        #predicted_values_0 = [t['values'][0] for t in trajectories]
        add_disc_sum_rew(
            trajectories, gamma, scaler.mean_rew,
            np.sqrt(scaler.var_rew))  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        print(nodisc0)
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        lamb = policy.update(observes, actions, advantages,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, (len(kl_terms) + 1)))
        rewards = plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        plt.close("KL_curve.png")
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, (len(beta_terms) + 1)))
        mean_rewards = plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close("lagrange_beta_curve.png")
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
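The plotting code in the example above repeats the same plot/save/close sequence several times; a small hypothetical helper that factors out that repetition (purely illustrative, not part of the original code):

import matplotlib.pyplot as plt

def save_curve(values, title, xlabel, ylabel, filename):
    """Plot a 1-D series against its index and write it to disk (sketch)."""
    plt.figure()
    plt.plot(range(1, len(values) + 1), values)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.close()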
Example #23
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()

    env, obs_dim, act_dim = init_gym(env_name)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)

    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    if mpi_util.rank == 0:
        now = datetime.utcnow().strftime(
            "%b-%d_%H:%M:%S")  # create unique directories
        aigym_path = os.path.join('/tmp', env_name, now)
        env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    policy = Policy(obs_dim, act_dim, kl_targ)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)

    if mpi_util.rank == 0:
        # run a few episodes (on node 0) of untrained policy to initialize scaler:
        trajectories = run_policy(env, policy, scaler, episodes=5)

        unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
        scaler.update(
            unscaled)  # update running statistics for scaling observations

    # broadcast policy weights, scaler, val_func
    (policy, scaler,
     val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func)

    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)

    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatentate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)

            # add various stats to training log:
            logger.log({
                '_MeanReward':
                np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps':
                np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode)

            policy.update(observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function

            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations

            logger.write(
                display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

    if mpi_util.rank == 0: logger.close()
    policy.close_sess()
    if mpi_util.rank == 0: val_func.close_sess()
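
Note: mpi_util.broadcast_policy_scaler_val above is project-specific and not shown in these examples. As a rough illustration of the rank-0 broadcast idea it wraps (an assumption, not the project's actual helper), a minimal mpi4py sketch could look like this:

# Illustrative sketch only -- not the project's mpi_util module.
# Rank 0 broadcasts a picklable object (e.g. the Scaler or network weights);
# every other rank replaces its local copy with rank 0's version.
from mpi4py import MPI

comm = MPI.COMM_WORLD


def broadcast_from_rank0(obj):
    return comm.bcast(obj, root=0)

# e.g. on every rank: scaler = broadcast_from_rank0(scaler)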
Example #24
    def __init__(self):
        with tf.name_scope("central_agent"):
            self.val_func = NNValueFunction(obs_dim, hid1_mult)
            self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                                 policy_logvar)
            self.num_tuple = 0
Example #25
class Central_agent:
    def __init__(self):
        with tf.name_scope("central_agent"):
            self.val_func = NNValueFunction(obs_dim, hid1_mult)
            self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                                 policy_logvar)
            self.num_tuple = 0

    def update_parameter_server(self, episode, trajectories, name):
        self.num_tuple += len(trajectories)
        if len(trajectories) < batch_size:
            return

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = self.build_train_set(
            trajectories)
        # add various stats to training log:
        self.log_batch_stats(observes, actions, advantages, disc_sum_rew,
                             logger, episode)
        self.policy.update(observes, actions, advantages,
                           logger)  # update policy
        self.val_func.fit(observes, disc_sum_rew,
                          logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        print([
            'thread_name: ' + name + ', episode: ' + str(episode) +
            ', tuples: ' + str(self.num_tuple)
        ])
        if episode % (batch_size * 3) == 0:  # & (name == "local_thread3")
            #print(['stop'])
            self.policy.save(episode, filename1)
            self.val_func.save(episode, filename2)

    def build_train_set(self, trajectories):
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate(
            [t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-6)
        return observes, actions, advantages, disc_sum_rew

    def log_batch_stats(self, observes, actions, advantages, disc_sum_rew,
                        logger, episode):
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
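
Note: Central_agent reads obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, batch_size, logger, filename1 and filename2 from module scope. A hypothetical worker-side loop (env, scaler, run_policy, add_value, add_disc_sum_rew and add_gae are assumed from the surrounding script and are not shown here) might drive it like this:

# Hypothetical usage sketch; relies on module-level globals defined elsewhere in the script.
central = Central_agent()
episode = 0
while episode < num_episodes:
    trajectories = run_policy(env, central.policy, scaler, logger, episodes=batch_size)
    episode += len(trajectories)
    add_value(trajectories, central.val_func)  # trajectories need 'values' ...
    add_disc_sum_rew(trajectories, gamma)      # ... 'disc_sum_rew' ...
    add_gae(trajectories, gamma, lam)          # ... and 'advantages' before the update
    central.update_parameter_server(episode, trajectories, name='local_thread1')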
Example #26
def main(env_name, max_time_steps, time_steps_batch, time_steps_mini_batch, gamma, lamda, kl_targ, clipping_range, pol_loss_type, init_pol_logvar, animate,\
        save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\
        time_step_to_load, now_to_load):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        max_time_steps: maximum number of time steps to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        clipping_range: max value to clip the policy gradient ratio
        pol_loss_type: string determining which type of loss to use for the Policy Network
        time_steps_batch: number of time steps per policy training batch
        init_pol_logvar: natural log of initial policy variance
        save_video: Boolean determining if videos of the agent will be saved
        save_rate: int determining how often (in time steps) to save videos
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
    """

    # ****************  Environment Initialization and Paths  ***************
    task_params_str = ''.join(str(e) + ', ' for e in task_params)
    num_tasks = len(task_params)
    envs = [None] * num_tasks
    scalers = [None] * num_tasks
    loggers = [None] * num_tasks

    print("\n\n------ PATHS: ------")
    start_time = datetime.now()
    if time_step_to_load is None:
        now = start_time.strftime(
            "%b-%d_%H:%M:%S"
        )  # If NOT loading from Checkpoint -> used to create unique directories
    else:
        assert now_to_load is not None,\
            "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load)
        now = now_to_load
    logs_path = os.path.join('log-files', env_name, task_name, task_params_str,
                             now)

    for task in range(num_tasks):
        # Create task specific environment
        envs[task], obs_dim, act_dim = init_gym(env_name,
                                                task_param=task_params[task])
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

        # Create task specific Paths and logger object
        loggers[task] = Logger(logname= [env_name, task_name, task_params_str], now=now, \
                               logname_file= "_{}_{}".format(task_name, task_params[task]))

        if time_step_to_load is None:  # If NOT loading from Checkpoint
            scalers[task] = Scaler(obs_dim)

            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(
                    logs_path +
                    '/aux_{}_{}.txt'.format(task_name, task_params[task]),
                    'w') as f:
                f.write("_TimeStep" + "  " + "_MeanReward")

    aigym_path = os.path.join('./videos', env_name, task_name, task_params_str,
                              now)  # videos folders
    agent_path = os.path.join('agents', env_name, task_name, task_params_str,
                              now)  # agent / policy folders
    if time_step_to_load is None:  # If NOT loading from Checkpoint
        os.makedirs(agent_path)
        with open(agent_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command
        with open(logs_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command

    print("\nPath for Saved Videos : {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))

    # ****************  Initialize Policy, Value Networks and Scaler  ***************
    print("\n\n------ NEURAL NETWORKS: ------")
    dims_core_hid.insert(
        0, obs_dim
    )  # Modify dims list to have the size of the layer 'n-1' at position '0'
    dims_head_hid.insert(0, dims_head_hid[-1])

    val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid,
                               num_tasks, time_steps_mini_batch)
    policy = Policy(obs_dim,
                    act_dim,
                    dims_core_hid,
                    dims_head_hid,
                    num_tasks,
                    time_steps_mini_batch,
                    pol_loss_type=pol_loss_type)

    # Load from Checkpoint:
    # Validate intended time step to load OR get last time step number if no target time step was provided
    if time_step_to_load is not None:
        load_agent_path = agent_path  # agent / policy folders
        saved_ep_list = [
            file.split(".")[0].split("_")[-1]
            for file in os.listdir(load_agent_path) if "policy" in file
        ]

        if time_step_to_load == -1:  # Get last saved time step
            time_step_to_load = sorted(
                [int(ep_string) for ep_string in saved_ep_list])[-1]

        else:  # Validate if time_step_to_load was indeed saved
            assert str(time_step_to_load) in saved_ep_list,\
            "\n\nWARNING: Time Step you want to load ({}) was not stored during trainning".format(time_step_to_load)

        # Load Policy Network's Ops and Variables & Load Scaler Object
        policy.tf_saver.restore(
            policy.sess, "{}/policy_ep_{}".format(load_agent_path,
                                                  time_step_to_load))
        val_func.tf_saver.restore(
            val_func.sess, "{}/val_func_ep_{}".format(load_agent_path,
                                                      time_step_to_load))
        scalers = pickle.load(
            open(
                "{}/scalers_ep_{}.p".format(load_agent_path,
                                            time_step_to_load), 'rb'))
        print("\n\n ---- CHECKPOINT LOAD:  Time Step Loaded **{}**".format(
            time_step_to_load))

        # Delete extra epochs that were logged to the auxiliary logs
        for task in range(num_tasks):
            aux_log_path = logs_path + '/aux_{}_{}.txt'.format(
                task_name, task_params[task])
            aux_log = pd.read_table(aux_log_path, delim_whitespace=True)
            idx_to_cut = aux_log.index[aux_log["_TimeStep"] ==
                                       time_step_to_load].tolist()[0]
            aux_log[0:idx_to_cut +
                    1].to_csv(aux_log_path,
                              header=True,
                              index=False,
                              sep=' ',
                              mode='w')  # overwrite trimmed aux_log

    # If NOT loading from Checkpoint: run some time steps to initialize scalers and create Tensor board dirs
    elif time_step_to_load is None:
        for task in range(num_tasks):
            run_policy(envs[task],
                       policy,
                       scalers[task],
                       loggers[task],
                       time_steps_batch=int(time_steps_batch / 3),
                       task=task)

        # Tensor Board writer
        os.makedirs(agent_path + '/tensor_board/policy')
        os.makedirs(agent_path + '/tensor_board/valFunc')

    tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy',
                                          graph=policy.g)
    tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc',
                                          graph=val_func.g)

    # ****************  Start Training  ***************
    print("\n\n------ TRAINNING: ------")
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_offset = save_rate
    killer = GracefulKiller()

    if time_step_to_load is None: time_step = 0
    else: time_step = time_step_to_load

    # Time steps are counted across all tasks, i.e. N time steps means each task has been run for N time steps
    while time_step < max_time_steps and not killer.kill_now:

        # ****************  Obtain data (train set)  ***************
        observes_all = [None] * num_tasks
        actions_all = [None] * num_tasks
        advantages_all = [None] * num_tasks
        disc_sum_rew_all = [None] * num_tasks

        time_step += time_steps_batch
        for task in range(num_tasks):

            # Obtain 'time_steps_batch' trajectories and add additional intermediate calculations
            trajectories = run_policy(envs[task],
                                      policy,
                                      scalers[task],
                                      loggers[task],
                                      time_steps_batch=time_steps_batch,
                                      task=task,
                                      animate=animate)

            add_value(trajectories, val_func,
                      task)  # add estimated values to trajectories
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lamda)  # calculate advantage

            # Concatenate all time steps into single NumPy arrays
            observes_all[task], actions_all[task], advantages_all[
                task], disc_sum_rew_all[task] = build_train_set(trajectories)

            # print("Observes Shape: {}".format(observes_all[task].shape))
            # print("Actions Shape: {}\n\n".format(actions_all[task].shape))
            # print("Advantage Shape: {}\n\n".format(advantages_all[task].shape))

            # Logging Stats
            log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \
                            loggers[task], time_step)

        # ****************  Update Policy and Value Networks  ***************
        # print ("*************************************")
        for task in range(num_tasks):
            pol_summary = policy.update(task, observes_all[task],
                                        actions_all[task],
                                        advantages_all[task],
                                        loggers[task])  # update policy
            val_summary = val_func.fit(task, observes_all[task],
                                       disc_sum_rew_all[task],
                                       loggers[task])  # update value function
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(
                    logs_path +
                    '/aux_{}_{}.txt'.format(task_name, task_params[task]),
                    'a') as f:
                f.write("\n" + str(loggers[task].log_entry['_TimeStep']) +
                        "  " + str(loggers[task].log_entry['_MeanReward']))
            loggers[task].write(
                display=False)  # write logger results to file

            tb_pol_writer.add_summary(pol_summary, global_step=time_step)
            tb_val_writer.add_summary(val_summary, global_step=time_step)

        # ****************  Storing NN and Videos  ***************
        # Store Policy, Value Network and Scaler: every 'save_rate'  or in first/last time steps
        if time_step >= saver_offset or time_step >= max_time_steps or time_step <= time_steps_batch * 1.5 or killer.kill_now:
            # TODO: Make saving agent/video a method so that it can be called in killer.kill_now
            saver_offset += save_rate
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(
                agent_path, time_step))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(
                agent_path, time_step))  # Save Value Network
            pickle.dump(
                scalers,
                open("{}/scalers_ep_{}.p".format(agent_path, time_step), 'wb'))
            print("---- Saved Agent at Time Step {} ----".format(time_step))

            # Save video of current agent/policy
            if save_video:
                print(
                    "---- Saving Video at Time Step {} ----".format(time_step))
                for task in range(num_tasks):
                    _ = sim_agent(envs[task],
                                  policy,
                                  task,
                                  scalers[task],
                                  num_episodes_sim,
                                  save_video=True,
                                  out_dir=aigym_path +
                                  "/vid_ts_{}/{}_{}".format(
                                      time_step, task_name, task_params[task]))
                    envs[task].close()  # closes window opened by the Monitor wrapper
                    envs[task], _, _ = init_gym(
                        env_name, task_param=task_params[task]
                    )  # Recreate env as it was killed
            print("\n\n")

            # If Ctrl+C is pressed, ask the user whether training should be terminated
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

    # ****************  Terminate Variables  **************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()

    # Save elapsed time
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(
        delta_time[0], delta_time[1])
    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
    with open(logs_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
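
Note: GracefulKiller is used by most of these training loops but never defined in the snippets. A minimal sketch of the kind of signal handler they assume (an assumption; the original class may differ in detail) is:

# Minimal sketch: turn SIGINT/SIGTERM into a flag the training loop can poll between batches.
import signal


class GracefulKiller:
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True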
Example #27
File: train.py  Project: nwaftp23/ppo
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, act_dim, obs_dim, final_pol_test,
         **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env = init_env(env_name, **kwargs)
    # add 1 to obs dimension for time step feature (see run_episode())
    obs_dim += 1
    tz = timezone('America/Montreal')  # Montreal Timezone
    dt = datetime.now(tz)  # Create unique directories
    now = dt.strftime('%Y-%m-%d %H_%M_%S')
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
        dir = './log-files/' + env_name + '/' + now + '/'
    while episode < num_episodes:
        trajectories, tot_stuck = run_policy(env,
                                             policy,
                                             scaler,
                                             logger,
                                             episodes=batch_size)
        episode += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew, unscaled_observes = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if raw_input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("log-learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        print('running simulations')
        tr, tot_stuck = run_policy(env,
                                   policy,
                                   scaler,
                                   logger,
                                   episodes=final_pol_test)
        print('done')
        sum_rewww = [t['rewards'].sum() for t in tr]
        sum_rewww += [tot_stuck]
        print('total stucks', sum_rewww[-1])
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('Standard PPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("standard_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #28
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()
    if mpi_util.nworkers > 1:
        batch_size = batch_size // mpi_util.nworkers if batch_size % mpi_util.nworkers == 0 else batch_size // mpi_util.nworkers + 1  # spread the desired batch_size across processes
    env, obs_dim, act_dim = init_gym(env_name)
    mpi_util.set_global_seeds(111 + mpi_util.rank)
    env.seed(111 + mpi_util.rank)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    if mpi_util.rank == 0:
        env = wrappers.Monitor(env,
                               aigym_path,
                               force=True,
                               write_upon_reset=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, valfunc_hid_list)
    policy = Policy(obs_dim, act_dim, kl_targ, policy_hid_list)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        mpi_util.timeit(
            '--------------------------'
        )  # let's time everything so we can see where the work is being done
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        mpi_util.timeit('run_policy')
        # episode += len(trajectories)
        episode += mpi_util.all_sum(len(trajectories))
        mpi_util.timeit('mpi_util.all_sum')
        add_value(trajectories, val_func)  # add estimated values to episodes
        mpi_util.timeit('add_value')
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        mpi_util.timeit('add_disc_sum_rew')
        add_gae(trajectories, gamma, lam)  # calculate advantage
        mpi_util.timeit('add_gae')
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        mpi_util.timeit('build_train_set')
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        mpi_util.timeit('log_batch_stats')
        if mpi_util.rank == 0:
            policy.update(observes, actions, advantages,
                          logger)  # update policy
            mpi_util.timeit('policy.update')
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function
            mpi_util.timeit('val_func.fit')
        mpi_util.rank0_bcast_wts(
            val_func.sess, val_func.g, 'val'
        )  # doubt if value network is used during rollouts but it only takes a few milliseconds anyhow
        mpi_util.timeit('mpi_util.rank0_bcast_wts(val_func')
        mpi_util.rank0_bcast_wts(policy.sess, policy.g, 'policy')
        mpi_util.timeit('mpi_util.rank0_bcast_wts(policy')
        if mpi_util.rank == 0:
            logger.write(
                display=True)  # write logger results to file and stdout
        # if killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
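
Note: mpi_util.timeit above is used only for coarse per-step profiling. One plausible minimal stand-in with the same call pattern (an assumption, not the project's implementation) simply prints the seconds elapsed since the previous call:

# Assumed minimal delta timer mirroring the mpi_util.timeit(label) call pattern.
import time

_last_time = [time.time()]


def timeit(label):
    now = time.time()
    print('{}: {:.3f}s'.format(label, now - _last_time[0]))
    _last_time[0] = now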
Example #29
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, save):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    env_id = env_name + id_generator()
    logger = Logger(logname=env_id, now=now)
    aigym_path = os.path.join('/tmp', env_id)
    env = wrappers.Monitor(env,
                           aigym_path,
                           force=True,
                           video_callable=lambda episode_id: False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0

    if env_name == 'Swimmer-v1':
        score_window = 100
        solution_score = 360
    elif env_name == 'HalfCheetah-v1':
        score_window = 100
        solution_score = 4800
    else:
        assert False

    # assert score_window % batch_size == 0
    rewards = collections.deque(maxlen=int(np.rint(score_window / batch_size)))
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        mean_reward = logger.log_entry['_MeanReward']
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        rewards.append(mean_reward)
        '''
        if np.mean(rewards) >= solution_score:
            episode = episode - score_window
            break
        '''

    logger.close()
    policy.close_sess()
    val_func.close_sess()

    # return episode
    return -np.mean(rewards)
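
Note: because this variant returns -np.mean(rewards), it is presumably meant to be driven by a black-box hyperparameter optimizer that minimizes its return value. A hypothetical wrapper using Optuna (the optimizer choice and parameter ranges are assumptions, not part of the original code) could look like this:

# Hypothetical tuning driver: minimize the negative mean reward returned by main().
import optuna


def objective(trial):
    kl_targ = trial.suggest_float('kl_targ', 1e-3, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [5, 10, 20])
    return main('Swimmer-v1', num_episodes=5000, gamma=0.995, lam=0.98,
                kl_targ=kl_targ, batch_size=batch_size, hid1_mult=10,
                policy_logvar=-1.0, save=False)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)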
Example #30
def add_value(trajectories, val_func):
    '''
    Adds estimated value to all time steps of all trajectories

    Args:
        trajectories: as returned by run_policy()
        val_func: object with predict() method, takes observations and returns predicted state value

    Returns:
        None (mutates trajectories dictionary to add 'values')
    '''
    for trajectory in trajectories:
        observes = trajectory['observes']
        values = val_func.predict(observes)
        trajectory['values'] = values

def add_gae(trajectories, gamma, lam):
    '''
    Add generalized advantage estimator.
    https://arxiv.org/pdf/1506.02438.pdf

    Args:
        trajectories: as returned by run_policy(); must include 'values' key from add_value().
        gamma: reward discount
        lam: lambda (see paper).
            lam=0 : use TD residuals
            lam=1 : A = Sum Discounted Rewards - V_hat(s)

    Returns:
        None (mutates trajectories dictionary to add 'advantages')
    '''
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        values = trajectory['values']
        # temporal differences
        tds = rewards - values + np.append(values[1:] * gamma, 0)
        advantages = discount(tds, gamma * lam)
        trajectory['advantages'] = advantages

def build_train_set(trajectories):
    '''
    Args:
        trajectories: after processing by add_disc_sum_rew(), add_value() and add_gae()

    Returns: 4-tuple of NumPy arrays
        observes: shape = (N, obs_dim)
        actions: shape = (N, act_dim)
        advantages: shape = (N,)
        disc_sum_rew: shape = (N,)
    '''
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    # normalize advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

    return observes, actions, advantages, disc_sum_rew
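
Note: add_gae() above calls a discount() helper, and every training loop also calls add_disc_sum_rew(); neither is shown in these snippets. The sketch below is consistent with the conventions used here (the scipy.signal.lfilter trick is a common way to compute a reverse discounted sum; treat the exact implementation as an assumption):

import numpy as np
import scipy.signal


def discount(x, gamma):
    ''' Discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1] '''
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


def add_disc_sum_rew(trajectories, gamma):
    ''' Adds 'disc_sum_rew' to each trajectory (assumed to mirror add_gae()'s reward scaling) '''
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        trajectory['disc_sum_rew'] = discount(rewards, gamma)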

def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode):
    """ Log various batch statistics """
    logger.log({'_mean_obs': np.mean(observes),
                '_min_obs': np.min(observes),
                '_max_obs': np.max(observes),
                '_std_obs': np.mean(np.var(observes, axis=0)),
                '_mean_act': np.mean(actions),
                '_min_act': np.min(actions),
                '_max_act': np.max(actions),
                '_std_act': np.mean(np.var(actions, axis=0)),
                '_mean_adv': np.mean(advantages),
                '_min_adv': np.min(advantages),
                '_max_adv': np.max(advantages),
                '_std_adv': np.var(advantages),
                '_mean_discrew': np.mean(disc_sum_rew),
                '_min_discrew': np.min(disc_sum_rew),
                '_max_discrew': np.max(disc_sum_rew),
                '_std_discrew': np.var(disc_sum_rew),
                '_Episode': episode
                })

def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    '''
    Main training loop

    Args:
        env_name: Robot model name
        num_episodes: maximum number of episodes to run (int)
        gamma: reward discount factor (float)
        lam: lambda for Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    '''
    env, obs_dim, act_dim = init_env(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)

    # TODO: add the sampling part once everything works

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, numEpisodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
Example #31
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    print('Start time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))


    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories (changed utcnow to now; utcnow would give Greenwich/UTC time)
    testname = now+'-'+TestNote
    logger = Logger(logname=env_name, now=testname)
    monitor_path = os.path.join('log-files', env_name, testname, 'monitor')
    env = wrappers.Monitor(env, monitor_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0

    print('Training loop start time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function

        # save models
        if not episode % (num_episodes / 10):
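            # NOTE: this only fires when episode lands exactly on a multiple of num_episodes / 10,
            # so checkpoints can be skipped entirely if batch_size does not divide that value evenly.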
            policy_save_path = os.path.join('log-files', env_name, testname, 'checkpoint')
            policy.save_model(env_name + "-" + str(episode), policy_save_path)


        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False


    logger.close()
    policy.close_sess()
    val_func.close_sess()

    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
Example #32
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)

    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(
            os.path.basename(ckpt.model_checkpoint_path).split('-')[1])

    env, obs_dim, act_dim = init_gym(env_name)
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = init_episode
    while episode <= num_episodes:
        if episode % 1000 == 0:
            # record one episode
            record(env_name, aigym_path, policy, scaler)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # record one last episode
    record(env_name, aigym_path, policy, scaler)
    logger.close()
    policy.close_sess()
    val_func.close_sess()