# numpy is used throughout these excerpts; the environment classes, task samplers,
# expert replay buffers, PostCondMLPPolicyWrapper, MakeDeterministic, and rollout_path
# are assumed to come from the surrounding project and are not redefined here.
import numpy as np


def gather_eval_data(
        policy,
        encoder,
        env,
        expert_buffer_for_eval_tasks=None,
        num_diff_context_per_task=8,
        context_size_min=1,
        context_size_max=12,
        num_rollouts_per_context=20,
        deterministic=True,
        params_sampler=None,
    ):
    policy.eval()
    encoder.eval()

    all_success_transitions = []
    all_no_op_transitions = []

    task_num = 0
    for task_params, obs_task_params in params_sampler:
        print('\n\tEvaluating task {}...'.format(task_num))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for _ in range(num_diff_context_per_task):
            print('new context transition')

            transition_success_rate = []
            transition_no_op_rate = []
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                task_id,
                context_size_max
            )
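            # Sweep the context size from context_size_min to context_size_max,
            # conditioning on the first i of the sampled expert trajectories and
            # recording success / no-op rates for each size.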
            for i in range(context_size_min, context_size_max+1):
                print('next size')
                correct = []
                incorrect = []
                no_op = []

                new_list_of_trajs = list_of_trajs[:i]
                print(len(new_list_of_trajs))
                post_dist = encoder([new_list_of_trajs])
                z = post_dist.mean
                z = z.cpu().data.numpy()[0]

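                # Condition the policy on the posterior mean of the latent
                # inferred from the current context trajectories.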
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
            
                for _ in range(num_rollouts_per_context):
                    max_path_length = 50
                    within_correct, within_incorrect = rollout_path(
                        env,
                        task_params,
                        obs_task_params,
                        post_cond_policy,
                        max_path_length
                    )
                    correct.append(within_correct)
                    incorrect.append(within_incorrect)
                    no_op.append(not (within_correct or within_incorrect))
                
                transition_success_rate.append(np.mean(correct))
                transition_no_op_rate.append(np.mean(no_op))
                # task_rets.append(np.sum(stacked_path['rewards']))
            all_success_transitions.append(transition_success_rate)
            all_no_op_transitions.append(transition_no_op_rate)

            print(transition_success_rate)
            print(transition_no_op_rate)
        
        if task_num == 32: break


        # print('Returns: %.1f +/- %.1f' % (np.mean(task_rets), np.std(task_rets)))
        # all_statistics[task_id] = task_rets
    
    return {
        'all_success_transitions': all_success_transitions,
        'all_no_op_transitions': all_no_op_transitions,
    }
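
# Every evaluator in this file conditions a base MLP policy on a latent code z via
# PostCondMLPPolicyWrapper, whose definition lives elsewhere in the project. The class
# below is only a minimal sketch of the interface these functions rely on (a .policy
# attribute, a .deterministic flag, and action selection from the z-augmented
# observation); the name _PostCondPolicySketch and its internals are assumptions,
# not the project's actual implementation.
class _PostCondPolicySketch:
    def __init__(self, policy, z):
        self.policy = policy          # base policy exposing get_action(obs, deterministic=...)
        self.z = np.asarray(z)        # latent inferred from the expert context
        self.deterministic = True

    def get_action(self, obs):
        # Append the latent to the raw observation before querying the base policy.
        obs_z = np.concatenate([np.asarray(obs), self.z], axis=-1)
        return self.policy.get_action(obs_z, deterministic=self.deterministic)
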
def gather_eval_data(alg,
                     sample_from_prior=False,
                     num_rollouts_per_task=8,
                     context_sizes=[4],
                     deterministic=True,
                     eval_expert=False,
                     just_loading_policy=False,
                     render=False):
    if not eval_expert: alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    params_sampler = EvalParamsSampler()
    if not just_loading_policy:
        env = alg.env
    else:
        env = AntRandDirec2DEnv()

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        # print('\tEvaluating task %.4f...' % obs_task_params)
        print('\n\tEvaluating task {}'.format(obs_task_params))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            # evaluate all posterior sample trajs with same initial state
            env_seed = np.random.randint(0, high=10000)

            if sample_from_prior: raise NotImplementedError
            # z = post_dist.sample()
            # z = z.cpu().data.numpy()[0]
            # if sample_from_prior:
            #     z = np.random.normal(size=z.shape)
            if eval_expert:
                if just_loading_policy:
                    post_cond_policy = PostCondMLPPolicyWrapper(
                        alg, obs_task_params)
                else:
                    post_cond_policy = alg.get_eval_policy(obs_task_params)
            else:
                post_cond_policy = alg.get_eval_policy(task_id,
                                                       mode='meta_test')
            post_cond_policy.policy.eval()
            post_cond_policy.deterministic = deterministic

            # reset the env seed
            env.seed(seed=env_seed)
            _rets = []
            _min_dists = []
            _last_100 = []
            for _ in range(num_rollouts_per_task):
                if just_loading_policy:
                    # max_path_length = 200
                    # max_path_length = 300
                    max_path_length = 100
                else:
                    max_path_length = alg.max_path_length
                stacked_path = rollout_path(env, task_params, obs_task_params,
                                            post_cond_policy, max_path_length,
                                            eval_expert, render)
                obs = np.array(
                    [d['obs'] for d in stacked_path['observations']])
                # This excerpt appears truncated: _rets, _min_dists, and _last_100
                # are initialized above but never filled. Accumulating returns and
                # storing them per context size mirrors the sibling variants in this
                # file and is an assumption about the intended bookkeeping.
                _rets.append(np.sum(stacked_path['rewards']))

            _cont_size_dict['rets'] = _rets
            _task_dict[context_size] = _cont_size_dict

        all_statistics[task_id] = _task_dict
    return all_statistics
def gather_eval_data(
    policy,
    encoder,
    env,
    num_diff_context=4,
    num_rollouts_per_context=4,
    deterministic=True,
    expert_buffer_for_eval_tasks=None,
    params_sampler=None,
    eval_non_meta_policy=False
    ):
    policy.eval()
    if not eval_non_meta_policy:
        encoder.eval()

    all_statistics = {}
    task_num = 0

    for task_params, obs_task_params in params_sampler:
        task_rets = []
        # print('\tEvaluating task %.4f...' % obs_task_params)
        # print('\n\tEvaluating task {}...'.format(obs_task_params))
        print('\n\tEvaluating task {}...'.format(task_num))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for _ in range(num_diff_context):
            if not eval_non_meta_policy:
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                    task_id,
                    1
                )
                post_dist = encoder([list_of_trajs])
                z = post_dist.mean
                z = z.cpu().data.numpy()[0]

                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
            else:
                if deterministic:
                    print('DETERMINISTIC')
                    post_cond_policy = MakeDeterministic(policy)
                else:
                    post_cond_policy = policy
            
            for _ in range(num_rollouts_per_context):
                max_path_length = 1000

                stacked_path = rollout_path(
                    env,
                    task_params,
                    obs_task_params,
                    post_cond_policy,
                    max_path_length,
                    task_num
                )
                task_rets.append(np.sum(stacked_path['rewards']))

        print('Returns: %.1f +/- %.1f' % (np.mean(task_rets), np.std(task_rets)))
        all_statistics[task_id] = task_rets
    return all_statistics
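
# The dict returned above maps task_id -> list of episode returns. The helper below is
# a hypothetical convenience (not part of the project) showing one way to turn that
# structure into per-task mean/std summaries for logging or plotting.
def _summarize_task_returns(all_statistics):
    summary = {}
    for task_id, rets in all_statistics.items():
        rets = np.asarray(rets, dtype=np.float64)
        summary[task_id] = {'mean': float(np.mean(rets)), 'std': float(np.std(rets))}
    return summary
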
def gather_eval_data(alg,
                     sample_from_prior=False,
                     num_rollouts_per_task=8,
                     context_sizes=[4],
                     deterministic=True,
                     num_diff_context=1):
    alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    params_sampler = alg.test_task_params_sampler
    expert_buffer_for_eval_tasks = alg.test_context_expert_replay_buffer
    env = alg.env

    _all_rets = []

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        print('\tEvaluating task %.4f...' % obs_task_params)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)
            # list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
            #     task_id,
            #     context_size
            # )

            # # evaluate all posterior sample trajs with same initial state
            # env_seed = np.random.randint(0, high=10000)

            if sample_from_prior: raise NotImplementedError
            # z = post_dist.sample()
            # z = z.cpu().data.numpy()[0]
            # if sample_from_prior:
            #     z = np.random.normal(size=z.shape)

            #
            # post_cond_policy = alg.get_eval_policy(task_id, mode='meta_test')
            # post_cond_policy.policy.eval()
            # post_cond_policy.deterministic = deterministic
            #

            # reset the env seed
            _vels = []
            # _std_vels = []
            _run_costs = []
            _rets = []
            # env.seed(seed=env_seed)

            for c_idx in range(num_diff_context):
                list_of_trajs = alg.test_context_expert_replay_buffer.sample_trajs_from_task(
                    task_id, context_size)
                alg.encoder.eval()
                post_dist = alg.encoder([list_of_trajs])
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                # post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                post_cond_policy = PostCondMLPPolicyWrapper(alg.policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
                for _ in range(num_rollouts_per_task):
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                alg.max_path_length)

                    # compute mean vel, return, run cost per traj
                    _vels.extend([d['vel'] for d in stacked_path['env_infos']])
                    # _std_vels.append(np.std([d['vel'] for d in stacked_path['env_infos']]))
                    _run_costs.append(
                        np.sum([
                            d['run_cost'] for d in stacked_path['env_infos']
                        ]))
                    _rets.append(np.sum(stacked_path['rewards']))

            _cont_size_dict['_vels'] = _vels
            # _cont_size_dict['std_vels'] = _std_vels
            _cont_size_dict['run_costs'] = _run_costs
            _cont_size_dict['rets'] = _rets
            _task_dict[context_size] = _cont_size_dict

            print('\t\tVel: %.4f +/- %.4f' % (np.mean(_vels), np.std(_vels)))
            _all_rets.extend(_rets)

        all_statistics[task_id] = _task_dict
    print('\nReturns: %.4f +/- %.4f' % (np.mean(_all_rets), np.std(_all_rets)))
    return all_statistics
def gather_eval_data(
    alg,
    sample_from_prior=False,
    num_rollouts_per_task=8,
    context_sizes=[4],
    num_diff_context=1,
    deterministic=True,
    eval_expert=False,
    just_loading_policy=False,
    render=False,
    use_separate_expert_buffer=False,
    expert_buffer_for_eval_tasks=None,
):
    if not eval_expert: alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    # params_sampler = alg.test_task_params_sampler
    # params_sampler = alg.train_task_params_sampler
    params_sampler = AntRandGoalExpertTestSampler()
    if not just_loading_policy:
        env = alg.env
    else:
        env = AntRandGoalEnv()

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        # print('\tEvaluating task %.4f...' % obs_task_params)
        print('\n\tEvaluating task {}'.format(obs_task_params))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            # evaluate all posterior sample trajs with same initial state
            env_seed = np.random.randint(0, high=10000)
            # reset the env seed
            env.seed(seed=env_seed)
            _rets = []
            _min_dists = []
            _last_100 = []

            for _ in range(num_diff_context):
                if sample_from_prior: raise NotImplementedError
                # z = post_dist.sample()
                # z = z.cpu().data.numpy()[0]
                # if sample_from_prior:
                #     z = np.random.normal(size=z.shape)
                if eval_expert:
                    if just_loading_policy:
                        post_cond_policy = PostCondMLPPolicyWrapper(
                            alg, obs_task_params)
                    else:
                        post_cond_policy = alg.get_eval_policy(obs_task_params)
                else:
                    if use_separate_expert_buffer:
                        list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                            task_id, context_size)
                        post_dist = alg.encoder([list_of_trajs])
                        z = post_dist.mean
                        z = z.cpu().data.numpy()[0]
                        # post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                        post_cond_policy = PostCondMLPPolicyWrapper(
                            alg.policy, z)
                    else:
                        post_cond_policy = alg.get_eval_policy(
                            task_id, mode='meta_test')
                    # post_cond_policy = alg.get_eval_policy(task_id, mode='meta_train')
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic

                for _ in range(num_rollouts_per_task):
                    if just_loading_policy:
                        max_path_length = 100
                    else:
                        max_path_length = alg.max_path_length
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                max_path_length, eval_expert,
                                                render)
                    obs = np.array(
                        [d['obs'] for d in stacked_path['observations']])
                    # print(np.max(obs, axis=0))
                    # print(np.min(obs, axis=0))
                    # print(np.mean(obs, axis=0))
                    # print(np.std(obs, axis=0))
                    # print(obs.shape)
                    # print(np.max(obs))
                    # print(np.min(obs))

                    _rets.append(np.sum(stacked_path['rewards']))
                    rew_frw = [
                        d['reward_forward'] for d in stacked_path['env_infos']
                    ]
                    _min_dists.append(-np.max(rew_frw))
                    _last_100.append(np.mean(rew_frw[-100:]))

            _cont_size_dict['rets'] = _rets
            _cont_size_dict['min_dists'] = _min_dists
            _cont_size_dict['last_100'] = _last_100
            _task_dict[context_size] = _cont_size_dict

            print('\t\t\tMin Dist: %.4f +/- %.4f' %
                  (np.mean(_min_dists), np.std(_min_dists)))
            print(_min_dists)

        all_statistics[task_id] = _task_dict
    return all_statistics
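
# The nested dict returned above is keyed task_id -> context_size -> metric lists
# ('rets', 'min_dists', 'last_100'). The helper below is hypothetical and only
# illustrates how such results could be pooled across tasks to see how performance
# varies with the amount of context, e.g.
# _pool_metric_by_context_size(stats, metric='min_dists').
def _pool_metric_by_context_size(all_statistics, metric='min_dists'):
    pooled = {}
    for task_dict in all_statistics.values():
        for context_size, metrics in task_dict.items():
            pooled.setdefault(context_size, []).extend(metrics[metric])
    # Return (mean, std) per context size, sorted by context size.
    return {
        cs: (float(np.mean(vals)), float(np.std(vals)))
        for cs, vals in sorted(pooled.items())
    }
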
def gather_eval_data(alg,
                     num_rollouts_per_context=8,
                     deterministic=True,
                     num_diff_context=1,
                     eval_params_sampler=None,
                     expert_buffer_for_eval_tasks=None,
                     evaluating_expert=False,
                     eval_deterministic=True,
                     eval_no_task_info=False):
    context_sizes = [1]
    if not evaluating_expert:
        alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    # env = alg.env
    env = Walker2DRandomDynamicsEnv()

    _means = []
    _stds = []

    for task_params, obs_task_params in eval_params_sampler:
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_rets = []
        print('\tEvaluating task {}...'.format(obs_task_params))
        print(task_params)
        task_num += 1
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}

            for c_idx in range(num_diff_context):
                if not evaluating_expert:
                    if eval_no_task_info:
                        print('Evaluating with no task information!')
                        new_task_params = {}
                        for k in task_params:
                            new_task_params[k] = np.ones(task_params[k].shape)
                        raise NotImplementedError()
                    else:
                        # Use the explicitly passed eval buffer when provided;
                        # otherwise fall back to the one attached to alg (the
                        # expert_buffer_for_eval_tasks argument was otherwise
                        # unused in the original excerpt).
                        buffer = (expert_buffer_for_eval_tasks
                                  if expert_buffer_for_eval_tasks is not None
                                  else alg.expert_buffer_for_eval_tasks)
                        list_of_trajs = buffer.sample_trajs_from_task(
                            task_id, context_size)
                    alg.encoder.eval()
                    post_dist = alg.encoder([list_of_trajs])
                    z = post_dist.sample()
                    z = z.cpu().data.numpy()[0]
                    # post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                    post_cond_policy = PostCondMLPPolicyWrapper(
                        alg.main_policy, z)
                    post_cond_policy.policy.eval()
                else:
                    # if eval_no_task_info:
                    #     print('Evaluating with no task information!')
                    #     post_cond_policy = alg.get_eval_policy(0.0*np.ones(obs_task_params.shape))
                    # else:
                    #     post_cond_policy = alg.get_eval_policy(np.ones(obs_task_params))

                    # For evaluating a standard walker expert
                    # post_cond_policy = alg.policy
                    # post_cond_policy = alg.eval_policy
                    post_cond_policy = MakeDeterministic(alg.policy)

                post_cond_policy.deterministic = eval_deterministic
                context_returns = []
                for _ in range(num_rollouts_per_context):
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                alg.max_path_length)
                    context_returns.append(np.sum(stacked_path['rewards']))
                task_rets.extend(context_returns)

        all_statistics[task_id] = task_rets
        print('\nReturns: %.4f +/- %.4f' %
              (np.mean(task_rets), np.std(task_rets)))
        _means.append(np.mean(task_rets))
        _stds.append(np.std(task_rets))
    for i in range(len(_means)):
        print('%.4f +/- %.4f' % (_means[i], _stds[i]))
    return all_statistics