Example #1
def main():
    """ Main training loop

    Hyperparameters (hardcoded below):
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    env_name = 'HumanoidasimoMRD4_2-v1'
    #env_name='Humanoid-v1'
    num_episodes = 5000000
    gamma = 0.995
    lam = 0.98
    kl_targ = 0.003
    batch_size = 32
    hid1_mult = 10
    policy_logvar = -1.0

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join(
        '/home/initial/eclipse-workspace4/test/trpo-master/src/result',
        env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    # filename1 / filename2 are checkpoint-path globals defined elsewhere in the module
    val_func = NNValueFunction(obs_dim, hid1_mult, filename2)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    filename=filename1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of rewards
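        # Background (not this repo's code): GAE estimates the advantage as
        #   A_t = sum_{l >= 0} (gamma * lam)^l * delta_{t+l},
        # where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) is the TD residual.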
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if episode % (batch_size * 3) == 0:
            policy.save(episode, filename1)
            val_func.save(episode, filename2)
            # logger.flush()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
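
The listings call several helpers (discount, add_disc_sum_rew, add_gae) that are not shown. Below is a minimal sketch of what they plausibly compute, assuming each trajectory is a dict of per-step NumPy arrays with 'rewards' and 'values' keys (as build_train_set in Example #3 suggests); it is an illustration, not the repository's exact code:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # Discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1]
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


def add_disc_sum_rew(trajectories, gamma):
    # Attach the discounted return G_t to every step of every episode
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)


def add_gae(trajectories, gamma, lam):
    # Generalized Advantage Estimation (Schulman et al., 2016)
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        # TD residuals, with V(s_{T+1}) taken as 0 at episode end
        tds = rewards + np.append(gamma * values[1:], 0.0) - values
        traj['advantages'] = discount(tds, gamma * lam)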
Example #2
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop
    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Every 20000 episodes, save the models (value_func, policy, scaler) and average rewards
        if episode % 20000 == 0:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            with open("models/scaler-" + str(episode) + ".pkl", 'wb') as f:
                pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            with open("models/rewards-" + str(episode) + ".pkl", 'wb') as f:
                pickle.dump(deepcopy(avg_rew_list), f, pickle.HIGHEST_PROTOCOL)
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # Show animation at the end of training (runs until interrupted; note that
    # the close_sess() calls below are unreachable while this loop runs)
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
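
Example #2's demo loop calls scaler.get() and normalizes observations as (obs - offset) * scale. One plausible Scaler keeps running per-dimension statistics over everything it has seen; here is a sketch under that assumption (the 0.1 floor and the copy() are illustrative choices, not confirmed details):

import numpy as np


class Scaler(object):
    """Running mean/variance of observations; get() returns (scale, offset)
    so that normalized = (obs - offset) * scale."""

    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.count = 0

    def update(self, x):
        # Pool the batch statistics into the running totals
        n = x.shape[0]
        if self.count == 0:
            self.means = x.mean(axis=0)
            self.vars = x.var(axis=0)
        else:
            new_means = (self.means * self.count + x.sum(axis=0)) / (self.count + n)
            self.vars = (self.count * (self.vars + self.means ** 2) +
                         (x ** 2).sum(axis=0)) / (self.count + n) - new_means ** 2
            self.vars = np.maximum(0.0, self.vars)  # guard against round-off
            self.means = new_means
        self.count += n

    def get(self):
        # Return copies, so callers may tweak entries (the demo loop above
        # overrides the time-step feature) without corrupting the statistics
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means.copy()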
Example #3
class Central_agent:
    def __init__(self):
        with tf.name_scope("central_agent"):
            self.val_func = NNValueFunction(obs_dim, hid1_mult)
            self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                                 policy_logvar)
            self.num_tuple = 0

    def update_parameter_server(self, episode, trajectories, name):
        self.num_tuple += len(trajectories)
        if len(trajectories) < batch_size:
            return

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = self.build_train_set(
            trajectories)
        # add various stats to training log:
        self.log_batch_stats(observes, actions, advantages, disc_sum_rew,
                             logger, episode)
        self.policy.update(observes, actions, advantages,
                           logger)  # update policy
        self.val_func.fit(observes, disc_sum_rew,
                          logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        print('thread_name: ' + name + ', episode: ' + str(episode) +
              ', tuples: ' + str(self.num_tuple))
        if episode % (batch_size * 3) == 0:  # & (name == "local_thread3")
            self.policy.save(episode, filename1)
            self.val_func.save(episode, filename2)

    def build_train_set(self, trajectories):
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate(
            [t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-6)
        return observes, actions, advantages, disc_sum_rew

    def log_batch_stats(self, observes, actions, advantages, disc_sum_rew,
                        logger, episode):
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
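
Central_agent reads module-level globals (obs_dim, act_dim, batch_size, logger, filename1, ...), which suggests it is shared by several worker threads in an A3C-style layout. Below is a hypothetical driver; every name other than Central_agent and the helpers already seen above is an assumption:

central_agent = Central_agent()

def local_thread(name, env, scaler):
    # Hypothetical worker: roll out a batch of episodes with the shared
    # policy, annotate it, then hand it to the central learner.
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, central_agent.policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, central_agent.val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)
        central_agent.update_parameter_server(episode, trajectories, name)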
Example #4
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path,
         out_path, thread_count, animation_mode, gait_name, gait_length,
         gaits_config_path, reward_mask, log_rewards, gait_reward_weight,
         g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        (remaining args configure the gait, reward weighting, logging, and rendering)
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # timestamp used to build unique directory names
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path,
                       gait_name=gait_name,
                       gait_cycle_len=gait_length,
                       out_path=logger.path,
                       log_rewards=log_rewards,
                       render_mode=animation_mode,
                       reward_mask=reward_mask,
                       contact_reward=gait_reward_weight,
                       g_colab=g_colab,
                       progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)

    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)

    log_train_info(logger, num_episodes, start_time_str, gait_name,
                   gait_length, batch_size, restore_path, reward_mask,
                   gait_reward_weight, progress_reward_weight,
                   phase_time_limit)

    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)

        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list,
                                                   policy,
                                                   scaler,
                                                   logger,
                                                   episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env,
                                          policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size)
            sim_time = datetime.now() - sim_time

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sum of rewards
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages,
                          logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function
            val_time = datetime.now() - val_time

            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode, train_time, sim_time, policy_time,
                            val_time)
            logger.write(
                display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))

            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env,
                               policy,
                               scaler,
                               logger,
                               episodes=1,
                               animate=True,
                               anim_name='episode_{}'.format(episode))
            if episode % 5000 == 0:
                os.rename(
                    os.path.join(logger.path, 'value_dump'),
                    os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(
                    os.path.join(logger.path, 'policy_dump'),
                    os.path.join(logger.path, 'policy_dump_' + str(episode)))
                # if episode == 20000:
                #     reward_mask = 63
                #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
                #                        out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
                #                        reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab)
                print("Progress Enabled")
            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(
                    env,
                    policy,
                    scaler,
                    logger,
                    episodes=1,
                    animate=True,
                    anim_name='final_episode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e

        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()
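
All four examples also lean on two small helpers that are easy to reconstruct. init_gym wraps gym.make and reports the space dimensions (Examples #1 and #4 pass env_name; Example #2's variant takes no argument), and GracefulKiller turns SIGINT/SIGTERM into a flag so training stops at a batch boundary instead of mid-update. Minimal sketches, assuming the classic Gym API:

import signal

import gym


def init_gym(env_name):
    # Build the environment and report observation/action dimensions
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    return env, obs_dim, act_dim


class GracefulKiller(object):
    """Set kill_now on SIGINT/SIGTERM instead of dying immediately."""

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True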