Example #1
def main():
    env = gym.make(ENV_NAME)
    env = Monitor(env, f"./videos/{ENV_PREFIX}", force=True)
    policy, baseline = load_meta_learner_params(META_POLICY_PATH,
                                                BASELINE_PATH, env)
    sampler = BatchSampler(env_name=ENV_NAME, batch_size=20, num_workers=2)
    learner = MetaLearner(sampler, policy, baseline, optimizer=None)

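    # For each held-out task, repeatedly reload the meta-learned parameters,
    # adapt them on freshly sampled episodes, and record the post-adaptation return.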
    for task in TEST_TASKS:
        returns = []

        for i in range(1, EVAL_STEPS + 1):
            for grad_steps in GRAD_STEPS:
                if i % 10 == 0:
                    print(f"Evaluation-step: {i}")

                env.reset_task(task)
                learner.policy, learner.baseline = load_meta_learner_params(
                    META_POLICY_PATH, BASELINE_PATH, env)

                # Sample a batch of transitions
                sampler.reset_task(task)
                episodes = sampler.sample(learner.policy)
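                # Inner-loop adaptation: take `grad_steps` gradient updates on
                # the sampled episodes before evaluating the adapted policy.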
                for _ in range(grad_steps):
                    new_params = learner.adapt(episodes)
                    learner.policy.set_params_with_name(new_params)
                returns.append(evaluate(env, task, learner.policy))

        print("========EVAL RESULTS=======")
        print(f"Task: {task}")
        print(f"Returns: {returns}")
        print(f"Average Return: {np.mean(returns)}")
        print("===========================")
Example #2
def eval(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    log_folder = './logs/{0}'.format(args.output_folder)
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

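    # Note: param_bounds is only defined for AntPos-v0; any other env_name would
    # raise a NameError when the TreeLSTM below is constructed.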
    if args.env_name == 'AntPos-v0':
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}

    tree = TreeLSTM(args.tree_hidden_layer,
                    len(param_bounds.keys()),
                    args.cluster_0,
                    args.cluster_1,
                    device=args.device)

    if continuous_actions:
        policy = NormalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                 int(np.prod(sampler.envs.action_space.shape)),
                                 hidden_sizes=(args.hidden_size, ) *
                                 args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    policy.eval()
    tree.eval()

    all_tasks = []
    # torch.autograd.set_detect_anomaly(True)
    reward_list = []
    for batch in range(args.num_batches + 1):
        print("starting iteration {}".format(batch))
        try:
            policy.load_state_dict(
                torch.load(
                    os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))
            tree = torch.load(
                os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
            tree.eval()
        except Exception:
            with open(
                    './logs/{0}/reward_list_eval.pkl'.format(
                        args.output_folder), 'wb') as pf:
                pickle.dump(reward_list, pf)

            print(reward_list)
            return

        # tree.load_state_dict(torch.load(os.path.join(save_folder,
        #                        'tree-{0}.pt'.format(batch))))

        tasks = sampler.sample_tasks(args.meta_batch_size)

        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        with open('./logs/{0}/task_list_eval.pkl'.format(args.output_folder),
                  'wb') as pf:
            pickle.dump(all_tasks, pf)

        print("evaluating...".format(batch))
        all_rewards = []
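        # Roll out the policy on each sampled task and average the episode returns.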
        for task in tasks:
            print(task["position"])
            episodes = sampler.sample(policy, task, tree=tree)
            # print("training...".format(batch))

            # tr = [ep.rewards for ep in episodes]
            # tr = np.mean([torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr])
            all_rewards.append(total_rewards(episodes.rewards))

        reward_list.append(np.mean(all_rewards))

    with open('./logs/{0}/reward_list_eval.pkl'.format(args.output_folder),
              'wb') as pf:
        pickle.dump(reward_list, pf)

    print(reward_list)
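
As with the first example, this function depends on components defined elsewhere in the project (BatchSampler, TreeLSTM, NormalMLPPolicy, CategoricalMLPPolicy, total_rewards). The args object is typically built with argparse; the sketch below is a hypothetical parser covering only the fields the snippet reads, with illustrative defaults rather than the original script's values.

import argparse

# Hypothetical argument parser: the attribute names match what eval(args)
# reads, but every default here is an illustrative assumption.
parser = argparse.ArgumentParser(
    description="Evaluate a meta-RL policy with a TreeLSTM task encoder")
parser.add_argument('--env-name', type=str, default='AntPos-v0')
parser.add_argument('--output-folder', type=str, default='antpos-eval')
parser.add_argument('--fast-batch-size', type=int, default=20)
parser.add_argument('--num-workers', type=int, default=8)
parser.add_argument('--tree-hidden-layer', type=int, default=64)
parser.add_argument('--cluster-0', type=int, default=4)
parser.add_argument('--cluster-1', type=int, default=4)
parser.add_argument('--device', type=str, default='cpu')
parser.add_argument('--hidden-size', type=int, default=100)
parser.add_argument('--num-layers', type=int, default=2)
parser.add_argument('--num-batches', type=int, default=200)
parser.add_argument('--meta-batch-size', type=int, default=20)

args = parser.parse_args()
eval(args)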