import os
import numpy as np
# `load`, `compose_params`, and the RENDERS directory are project-level helpers assumed to be in scope.

def test_composition(low_name, high_name, env_name, g, k=None, num_ep=100):
    params = compose_params(low_name, high_name, env_name, k=k)
    model, env = load(high_name, params, best=True)
    print("COMPOSED PARAMS", params)
    print("ENV", env)
    ep_rewards = list()
    rewards = list()
    gif_frames = list()
    obs = env.reset()
    for _ in range(g):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if 'frames' in info:
            gif_frames.extend(info['frames'])
        # env.render()
        # frame = env.render(mode='rgb_array')
        # gif_frames.append(frame)
        rewards.append(reward)
        if done:
            ep_rewards.append(sum(rewards))
            print("REWARD", sum(rewards), len(rewards), "Ep to go:", num_ep, "cur avg", np.mean(ep_rewards))
            num_ep -= 1
            rewards = []
            if num_ep == 0:
                break
            obs = env.reset()

    # print('AVG EP REWARD', np.mean(ep_rewards))

    import imageio
    render_path = os.path.join(RENDERS, 'composition_' + low_name + '2.gif')
    os.makedirs(os.path.dirname(render_path), exist_ok=True)
    print("saving to ", render_path)
    imageio.mimsave(render_path, gif_frames[::4], subrectangles=True, duration=0.05)
    print("completed saving")
    def __init__(self, env, policy, params=None, discrim_early_low_term=False, best=True, discrim_time_limit=None, discrim_online=True):
        utils.EzPickle.__init__(**locals())
        if not isinstance(env, Hierarchical):
            raise ValueError("Must be created with a Hierarchical Environment")
        # Get the discriminator params
        if not params:
            from bot_transfer.utils.loader import ModelParams
            discrim_params = ModelParams.load(policy)
        else:
            discrim_params = params
        discrim_params['env'] = discrim_params['env'].split('_')[0]
        # Avoid possible recursion
        if 'discrim_policy' in discrim_params['env_wrapper_args']:
            del discrim_params['env_wrapper_args']['discrim_policy']
        if not discrim_time_limit:
            discrim_time_limit = discrim_params['time_limit']
        discrim_params['time_limit'] = None
        # get the discriminator Environment
        from bot_transfer.utils.loader import get_env
        self.expert_env = get_env(discrim_params)
        if not isinstance(self.expert_env, Hierarchical):
            raise ValueError("Expert Environment must also be Hierarchical")
        # Get the expert policy
        if isinstance(policy, str):
            from bot_transfer.utils.loader import load
            expert_model, _ = load(policy, discrim_params, load_env=False, best=best)
        else:
            expert_model = policy

        self.env = env
        self.observation_space = self.env.agent_state_space()
        self.action_space = self.env.action_space()
        # option for expert env to auto terminate when we reach the goal.
        self.expert_env.early_low_termination = discrim_early_low_term
        self.expert_pred_fn = expert_model.predict
        self.expert_state = self.expert_env.agent_state_func(self.expert_env.state())
        self.expert_time_limit = discrim_time_limit
        self.expert_time_step = 0
        self.prev_agent_obs = None
        self.prev_expert_obs = None
        self.discrim_online = discrim_online

# This fragment begins after the imports and the argparse parser (flags for the
# low/high runs, the env, and the KL options) have been set up.
args = parser.parse_args()

params = compose_params(args.low,
                        args.high,
                        env_name=args.env,
                        k=args.high_level_skip)
assert params['alg'] == 'SAC', "Only SAC supported for finetuning"

params['timesteps'] = args.timesteps
params['alg_args']['learning_starts'] = 0
if args.learning_rate is not None:
    params['alg_args']['learning_rate'] = args.learning_rate
if args.sample_goals is not None:
    params['env_args']['sample_goals'] = args.sample_goals

# KL args: these select the KL-regularized variant of the algorithm (the 'KL' prefix
# added to params['alg'] below), using the loaded high-level run as the reference policy.
params['alg_args']['kl_policy'] = args.high
if args.kl_type is not None:
    params['alg_args']['kl_type'] = args.kl_type
if args.kl_coef is not None:
    params['alg_args']['kl_coef'] = args.kl_coef
if args.kl_stop is not None:
    params['alg_args']['kl_stop'] = args.kl_stop
if args.kl_decay is not None:
    params['alg_args']['kl_decay'] = args.kl_decay

params['alg'] = 'KL' + params['alg']
model, _ = load(args.high, params, load_env=False, best=False)
if isinstance(model, stable_baselines.SAC):
    model.learning_starts = 0
train(params, model=model)
Example #4
import bot_transfer
import argparse
import stable_baselines
from bot_transfer.utils.loader import ModelParams, load
# `train` is the project's training entry point; its import is not shown in this snippet.

parser = argparse.ArgumentParser()

parser.add_argument("--low", "-l", type=str, required=True)
parser.add_argument("--high", "-m", type=str, required=True)
parser.add_argument("--time-limit", "-k", type=int)
parser.add_argument("--finetune-time-limit", "-f", type=int)
parser.add_argument("--timesteps", "-t", type=int, default=2000)

args = parser.parse_args()

params = ModelParams.load(args.low)
assert params['env'].endswith("_Low")

params['env_wrapper_args']['policy'] = args.high

if args.finetune_time_limit:
    params['env_wrapper_args'][
        'finetune_time_limit'] = args.finetune_time_limit
if args.time_limit:
    params['time_limit'] = args.time_limit

params['env'] = '_'.join([params['env'].split('_')[0], 'LowFinetune'])

model, _ = load(args.low, params, load_env=False, best=True)
if isinstance(model, stable_baselines.SAC):
    model.learning_starts = 0
train(params, model=model)
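
# Example invocation (the script filename and run directories are placeholders):
#
#   python finetune_low.py \
#       --low runs/point_mass_low_seed0 \
#       --high runs/point_mass_high_seed0 \
#       --timesteps 2000
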
def composition_sweep(low_names, high_names, env_name=None, k=None, num_ep=100, success=False):
    import time
    results_path = os.path.join(os.getcwd(), 'data', 'comparison.results' + str(int(round(time.time() * 10000000))))
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    f = open(results_path, 'w+')
    for high_name in high_names:
        data = list()
        for low_name in low_names:

            # Determine if we are dealing with a single seed directory or multiple seeds.
            try:
                ModelParams.load(low_name)
                ModelParams.load(high_name)
                multi_seed = False
            except ValueError:
                multi_seed = True

            if multi_seed:
                file_location_low = os.path.join(BASE, low_name) if not low_name.startswith('/') else low_name
                file_location_high = os.path.join(BASE, high_name) if not high_name.startswith('/') else high_name
                low_runs = sorted([os.path.join(low_name, run) for run in os.listdir(file_location_low) if not run.endswith('.log')])
                high_runs = sorted([os.path.join(high_name, run) for run in os.listdir(file_location_high) if not run.endswith('.log')])
                print(low_runs)
                print(high_runs)
                assert len(low_runs) == len(high_runs)
                run_list = zip(low_runs, high_runs)
            else:
                run_list = [(low_name, high_name)]

            seed_rewards = list()
            f.write("----------------------------------------------\n")
            for run_low, run_high in run_list:
                print("Composing", run_low, "with", run_high)
                params = compose_params(run_low, run_high, env_name=env_name, k=k)
                print("COMPOSED PARAMS", params)
                
                ep_rewards = list()
                model, env = load(run_high, params, best=True)
                obs = env.reset()
                for _ in range(num_ep):
                    rewards = list()
                    while True:
                        action, _states = model.predict(obs)
                        obs, reward, done, info = env.step(action)
                        rewards.append(reward)
                        if done:
                            if success:
                                val_to_add = 1.0 if sum(rewards) > 0 else 0.0
                            else:
                                val_to_add = sum(rewards)
                            ep_rewards.append(val_to_add)
                            obs = env.reset()
                            break
                env.close()
                del model
                del env
                seed_rew_mean = np.mean(ep_rewards)
                seed_rewards.append(seed_rew_mean)

                print("==============================")
                print("Run:", run_low, run_high, ":", seed_rew_mean)
                write_str = run_low + "\t" + run_high + "\t" + str(seed_rew_mean) + "\n"
                f.write(write_str)

            data.append((low_name, np.mean(seed_rewards), np.std(seed_rewards)))

        # print the resulting output
        print("=================================================")
        print("Results for high policy" + high_name)
        for name, score, std in data:
            print('{:<60} {:.2f} {:.2f}'.format(name[-55:], score, std))
        # Write it to a file
        f.write("==================== FINAL RESULTS ==========================\n")
        f.write("== Results for High Level: " + high_name + "\n")
        for name, score, std in data:
            f.write('{:<60} {:.2f} {:.2f}\n'.format(name[-55:], score, std))

    f.close()
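
# A minimal usage sketch (run directories and env name are placeholders; when the
# names point to multi-seed directories, low and high runs are paired in sorted order):
#
#   composition_sweep(["runs/point_mass_low"],
#                     ["runs/point_mass_high"],
#                     env_name="PointMass_High",
#                     num_ep=50,
#                     success=True)   # report success rate instead of mean return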