Example #1
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS.copy()  # copy so the shared module-level defaults are not mutated
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render)
    }

    for name in ['max_episode_steps', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = EpisodeRollout(params['make_env'], policy, dims, logger,
                               **eval_params)
    evaluator.seed(seed)
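    # Attach a viewer to the underlying simulator so the evaluation rollouts can be rendered on screen.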
    gym = evaluator.envs._gym
    sim = evaluator.envs._sim
    viewer = gym.create_viewer(sim, gymapi.DEFAULT_VIEWER_WIDTH,
                               gymapi.DEFAULT_VIEWER_HEIGHT)
    evaluator.viewer = viewer

    # Run evaluation.
    evaluator.clear_history()
    all_episodes = []
    for _ in range(n_test_rollouts):

        # if friction_idx == len(friction_arr):
        #     friction_idx = 0
        # if _ % 5 == 0:
        #     evaluator.seed(seed)
        #     friction = friction_arr[friction_idx]
        #     friction_idx += 1
        #     evaluator.set_physics(param=friction)
        episode = evaluator.generate_rollouts()
        all_episodes.append(episode)

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
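
A minimal sketch of how a script like Example #1 is typically exposed on the command line. The click options below are assumptions that mirror main()'s signature; they are not part of the original example.

# Hypothetical CLI wrapper (assumption): exposes the arguments main() expects.
import click

@click.command()
@click.option('--policy_file', type=str, default=None, help='Path to the pickled policy to evaluate.')
@click.option('--seed', type=int, default=0, help='RNG seed for the evaluation rollouts.')
@click.option('--n_test_rollouts', type=int, default=10, help='Number of evaluation episodes to run.')
@click.option('--render', type=int, default=1, help='Render the rollouts (1) or not (0).')
def cli(policy_file, seed, n_test_rollouts, render):
    main(policy_file, seed, n_test_rollouts, render)

if __name__ == '__main__':
    cli()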
Example #2
def configure_everything(rank,
                         seed,
                         num_cpu,
                         env,
                         trial_id,
                         n_epochs,
                         reward_function,
                         policy_encoding,
                         feedback_strategy,
                         policy_architecture,
                         goal_invention,
                         reward_checkpoint,
                         rl_positive_ratio,
                         p_partner_availability,
                         imagination_method,
                         git_commit='',
                         display=True):
    # Seed everything
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # goal invention one epoch later than stated
    epoch = int(goal_invention.split('_')[-1]) + 1
    goal_invention = 'from_epoch_{}'.format(epoch)

    # Prepare params.
    params = DEFAULT_CONFIG

    # Env generating function
    def make_env():
        return gym.make(params['conditions']['env_name'], display=display)

    # Get info from environment and configure dimensions dict
    tmp_env = make_env()
    tmp_env.reset()
    params['env_params'] = tmp_env.unwrapped.params
    params['learning_params']['T'] = tmp_env._max_episode_steps
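    # Discount factor chosen so the effective horizon 1 / (1 - gamma) equals the episode length T.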
    params['learning_params']['gamma'] = 1. - 1. / params['learning_params']['T']
    params['reward_function']['n_objs'] = params['env_params']['max_nb_objects']
    params['make_env'] = make_env

    train_descriptions, test_descriptions, extra_descriptions = generate_all_descriptions(
        params['env_params'])
    # compute imagined goals to get the list of all possible goals
    goal_generator = SentenceGeneratorHeuristic(train_descriptions,
                                                test_descriptions,
                                                sentences=None,
                                                method='CGH')
    goal_generator.update_model(train_descriptions + test_descriptions)
    imagined_descriptions = goal_generator.generate_sentences()
    all_descriptions = train_descriptions + test_descriptions + tuple(
        imagined_descriptions)

    # train_descriptions, test_descriptions, all_descriptions = get_descriptions(ENV_ID)
    # assert sorted(train_descriptions) == sorted(train_descriptions_env)
    # assert sorted(test_descriptions) == sorted(test_descriptions_env)
    params.update(date_time=str(datetime.datetime.now()),
                  train_descriptions=train_descriptions,
                  test_descriptions=test_descriptions,
                  extra_descriptions=extra_descriptions,
                  all_descriptions=all_descriptions,
                  git_commit=git_commit)

    # Configure logging
    if rank == 0:
        logdir = find_save_path('../../data/expe/' + env + "/", trial_id)
        logger.configure(dir=logdir)
        os.makedirs(logdir + 'tmp/', exist_ok=True)
        os.makedirs(logdir + 'reward_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'policy_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'goal_info/', exist_ok=True)
        if params['experiment_params']['save_obs']:
            os.makedirs(logdir + 'save_obs/', exist_ok=True)
    else:
        logdir = None
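    # Broadcast the log directory so every MPI worker writes into the same experiment folder.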
    logdir = MPI.COMM_WORLD.bcast(logdir, root=0)

    # Update conditions parameters from arguments or variables defined in train.py
    params['conditions'].update(
        env_name=env,
        policy_architecture=policy_architecture,
        reward_function=reward_function,
        goal_invention=goal_invention,
        imagination_method=imagination_method,
        feedback_strategy=feedback_strategy,
        rl_positive_ratio=rl_positive_ratio,
        reward_checkpoint=reward_checkpoint,
        policy_encoding=policy_encoding,
        p_social_partner_availability=p_partner_availability)

    # checks
    if params['conditions']['policy_architecture'] in [
            'modular_attention', 'attention'
    ]:
        error_msg = 'You need an lstm policy encoding and a learned or pretrained reward if you use {}'.format(
            params['conditions']['policy_architecture'])
        assert params['conditions']['policy_encoding'] == 'lstm', error_msg
        assert params['conditions']['reward_function'] in [
            'pretrained', 'learned_lstm'
        ], error_msg
    elif params['conditions']['reward_function'] == 'oracle':
        error_msg = 'You cannot use an lstm policy encoding if you use an oracle reward'
        assert params['conditions']['policy_encoding'] != 'lstm', error_msg
        error_msg = 'You can only use a flat_concat policy architecture if you use an oracle reward'
        assert params['conditions']['policy_architecture'] == 'flat_concat', error_msg

    # Update experiment parameters from arguments or variables defined in train.py
    params['experiment_params'].update(
        n_epochs=n_epochs,
        trial_id=trial_id,
        logdir=logdir,
        seed=seed,
        n_cpus=num_cpu,
        n_test_rollouts=len(params['train_descriptions']),
    )
    params['reward_function'].update(
        reward_positive_ratio=params['conditions']['reward_positive_ratio'])
    # Define social partner params
    params['social_partner_params'] = dict(
        feedback_strategy=feedback_strategy,
        p_availability=p_partner_availability)

    if params['conditions']['policy_encoding'] == 'lstm':
        dim_encoding = params['reward_function']['num_hidden_lstm']
    else:
        raise NotImplementedError

    inds_objs = tmp_env.unwrapped.inds_objs  # indices of object in state
    for i in range(len(inds_objs)):
        inds_objs[i] = inds_objs[i].tolist()
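    # Dimensions used to build the networks: observation size, goal encoding size, action size and per-object info.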
    dims = dict(obs=tmp_env.observation_space.shape[0],
                g_encoding=dim_encoding,
                g_id=1,
                acts=tmp_env.action_space.shape[0],
                g_str=None,
                nb_obj=tmp_env.unwrapped.nb_obj,
                inds_objs=inds_objs)
    params['dims'] = dims

    # configure learning params and interactions
    if params['learning_params']['algo'] == 'ddpg':
        params['learning_params']['network_class'] += 'DDPG'
    else:
        raise NotImplementedError

    params['training_rollout_params'] = dict(
        exploit=False,
        use_target_net=False,
        compute_Q=False,
        eval_bool=False,
    )
    params['evaluation_rollout_params'] = dict(
        exploit=True,
        use_target_net=params['learning_params']['test_with_polyak'],
        compute_Q=True,
        eval_bool=True)

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        params['training_rollout_params'][name] = params['learning_params'][name]
        params['evaluation_rollout_params'][name] = params['learning_params'][name]
    params['evaluation_rollout_params']['rollout_batch_size'] = 1

    params['repo_path'] = REPO_PATH
    params['lstm_reward_checkpoint_path'] = REPO_PATH + '/src/data/lstm_checkpoints/{}'.format(
        params['conditions']['reward_checkpoint'])
    params['or_params_path'] = dict()
    for n_obj in [3]:
        params['or_params_path'][n_obj] = REPO_PATH + '/src/data/or_function/or_params_{}objs.pk'.format(n_obj)

    # Save parameter dict

    if rank == 0:
        json_dict = clean_dict_for_json(params)
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(json_dict, f)
        for key in sorted(params.keys()):
            logger.info('{}: {}'.format(key, params[key]))

    return params, rank_seed
Example #3
def launch(env,
           logdir,
           n_epochs,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           random_physics,
           lower_bound,
           upper_bound,
           randomise_every_n_epoch,
           override_params={},
           save_policies=True,
           mdn_prior=None):

    now = datetime.datetime.now()
    logdir += "/" + env + "/" + str(now.strftime("%Y-%m-%d-%H:%M"))

    # Configure logging
    if logdir or logger.get_dir() is None:
        logger.configure(dir=logdir)

    logdir = logger.get_dir()

    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    params = set_params(env, replay_strategy, override_params)

    config.log_params(params, logger=logger)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'render': params['render'],
        'max_episode_steps': params['max_episode_steps'],
        'random_physics': random_physics,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'randomise_every_n_epoch': randomise_every_n_epoch
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'render': params['render'],
        'max_episode_steps': params['max_episode_steps'],
        'random_physics': False,
        'rnd_phys_lower_bound': lower_bound,
        'rnd_phys_upper_bound': upper_bound,
        'randomise_every_n_epoch': randomise_every_n_epoch
    }

    dims = config.configure_dims(params)

    her = HindisghtExperienceReplay(params['make_env'], replay_strategy,
                                    params['replay_k'])

    sample_her_transitions = her.make_sample_her_transitions()
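    # sample_her_transitions relabels stored transitions with achieved goals (hindsight) when sampling training batches.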

    # Seed everything.
    rank_seed = seed + 1000000
    set_global_seeds(rank_seed)

    # DDPG agent
    ddpg_params = params['ddpg_params']

    ddpg_params.update({
        'input_dims': dims.copy(),  # agent takes observations as input
        'max_episode_steps': params['max_episode_steps'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - params['gamma'])) if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': params['rollout_batch_size'],
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': params['gamma'],
    })

    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }

    policy = DDPG(reuse=False, **ddpg_params, use_mpi=True)

    for name in ['max_episode_steps', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = EpisodeRollout(params['make_env'], policy, dims, logger,
                                    **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = EpisodeRollout(params['make_env'], policy, dims, logger,
                               **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          mdn_prior=mdn_prior)
Example #4
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    if not render:
        env = 'PlaygroundNavigation-v1'
    else:
        env = 'PlaygroundNavigationRender-v1'
    params, rank_seed = config.configure_everything(
        rank=0,
        seed=seed,
        num_cpu=params['experiment_params']['n_cpus'],
        env=env,
        trial_id=0,
        n_epochs=10,
        reward_function=params['conditions']['reward_function'],
        policy_encoding=params['conditions']['policy_encoding'],
        feedback_strategy=params['conditions']['feedback_strategy'],
        policy_architecture=params['conditions']['policy_architecture'],
        goal_invention=params['conditions']['goal_invention'],
        reward_checkpoint=params['conditions']['reward_checkpoint'],
        rl_positive_ratio=params['conditions']['rl_positive_ratio'],
        p_partner_availability=params['conditions']['p_social_partner_availability'],
        imagination_method=params['conditions']['imagination_method'],
        git_commit='')

    policy_language_model, reward_language_model = config.get_language_models(
        params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])
    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(
            PATH +
            'reward_checkpoints/reward_func_checkpoint_{}'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)
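    # Register every known description as a discovered goal so the sampler can evaluate all of them.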
    goal_sampler.update_discovered_goals(params['all_descriptions'],
                                         episode_count=0,
                                         epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    policy.load_params(POLICY_FILE)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      render=render,
                                      **params['evaluation_rollout_params'])
    evaluation_worker.seed(seed)

    # Run evaluation.
    evaluation_worker.clear_history()

    env_params = evaluation_worker.env.unwrapped.params
    train_descriptions, test_descriptions, _ = generate_all_descriptions(
        env_params)
    train_descriptions = list(train_descriptions)
    test_descriptions = list(test_descriptions)
    np.random.shuffle(test_descriptions)
    np.random.shuffle(train_descriptions)
    successes_test_descr = []
    for d in test_descriptions:
        successes_test_descr.append([])
        print(d)
        for i in range(n_test_rollouts):
            goal_str = [d]
            goal_encoding = [policy_language_model.encode(goal_str[0])]
            goal_id = [0]
            ep = evaluation_worker.generate_rollouts(
                exploit=True,
                imagined=False,
                goals_str=goal_str,
                goals_encodings=goal_encoding,
                goals_ids=goal_id)
            out = get_reward_from_state(ep[0]['obs'][-1], goal_str[0],
                                        env_params)
            successes_test_descr[-1].append(out == 1)
        print('Success rate {}: {}'.format(d,
                                           np.mean(successes_test_descr[-1])))
    print('Global success rate: {}'.format(np.mean(successes_test_descr)))
Example #5
def plot_generalization(path, freq=10):
    first = True
    trial_folder = path
    for trial in os.listdir(path):
        print(trial)
        # if os.path.exists(path + '/' + trial + '/adaptation_success_rates_food.txt'):

        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)

        goal_invention = int(
            params['conditions']['goal_invention'].split('_')[-1])
        env_id = params['conditions']['env_id']
        if 'plant' not in env_id:
            test_plants = plants.copy() + ['plant', 'living_thing']
            test_plants.remove('flower')
            test_descriptions = [
                'Grow {} {}'.format(c, p) for c in thing_colors + ['any']
                for p in test_plants
            ]
        else:
            if 'big' in env_id:
                test_plants = [
                    'algae', 'bonsai', 'tree', 'bush', 'plant', 'living_thing'
                ]
            else:
                test_plants = ['tree', 'bush', 'plant', 'living_thing']
            test_descriptions = [
                'Grow {} {}'.format(c, p) for c in thing_colors + ['any']
                for p in test_plants
            ]

        first_epoch = True

        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(
                rank=rank,
                seed=seed,
                num_cpu=params['experiment_params']['n_cpus'],
                env=env,
                trial_id=0,
                n_epochs=10,
                reward_function=params['conditions']['reward_function'],
                curriculum_replay_target=params['conditions']['curriculum_replay_target'],
                curriculum_target=params['conditions']['curriculum_target'],
                policy_encoding=params['conditions']['policy_encoding'],
                bias_buffer=params['conditions']['bias_buffer'],
                feedback_strategy=params['conditions']['feedback_strategy'],
                goal_sampling_policy=params['conditions']['goal_sampling_policy'],
                policy_architecture=params['conditions']['policy_architecture'],
                goal_invention=params['conditions']['goal_invention'],
                reward_checkpoint=params['conditions']['reward_checkpoint'],
                rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                p_partner_availability=params['conditions']['p_social_partner_availability'],
                power_rarity=2,
                git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(
                params)

            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(
                policy_language_model=policy_language_model,
                reward_language_model=reward_language_model,
                goal_dim=policy_language_model.goal_dim,
                one_hot_encoder=onehot_encoder,
                **params['goal_sampler'],
                params=params)

            reward_function = config.get_reward_function(goal_sampler, params)

        else:

            def make_env():
                return gym.make(params['conditions']['env_name'])

            params['make_env'] = make_env

        # Load policy.
        success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
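        # success_rates[description, epoch, metric] with metric 0 = food, 1 = water.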
        for ind_ep, epoch in enumerate(epochs):
            print('\n\n\t\t EPOCH', epoch)
            if first:
                first = False
                reuse = False
            else:
                reuse = True

            if params['conditions']['reward_function'] == 'learned_lstm':
                reward_function.restore_from_checkpoint(
                    trial_folder +
                    'reward_checkpoints/reward_func_checkpoint_{}'.format(
                        epoch))
            policy_language_model.set_reward_function(reward_function)
            if reward_language_model is not None:
                reward_language_model.set_reward_function(reward_function)

            goal_sampler.update_discovered_goals(params['all_descriptions'],
                                                 episode_count=0,
                                                 epoch=0)

            with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                with open(policy_folder + 'policy_{}.pkl'.format(epoch),
                          'rb') as f:
                    policy = pickle.load(f)

            evaluation_worker = RolloutWorker(
                make_env=params['make_env'],
                policy=policy,
                reward_function=reward_function,
                params=params,
                render=RENDER,
                **params['evaluation_rollout_params'])
            evaluation_worker.seed(seed)

            # Run evaluation.
            evaluation_worker.clear_history()
            successes_per_descr = np.zeros([len(test_descriptions), 2])
            for ind_inst, instruction in enumerate(test_descriptions):
                # instruction = 'Grasp any fly'
                success_instruction = []
                goal_str = [instruction]
                goal_encoding = [policy_language_model.encode(goal_str[0])]
                goal_id = [0]

                for i in range(N_REPET):
                    ep = evaluation_worker.generate_rollouts(
                        exploit=True,
                        imagined=False,
                        goals_str=goal_str,
                        goals_encodings=goal_encoding,
                        goals_ids=goal_id)
                    for t in range(ep[0]['obs'].shape[0]):
                        metric_food = food_on_furniture(
                            ep[0]['obs'][t], goal_str[0])
                        if metric_food:
                            # print('\n\n Touched food')
                            break
                    for t in range(ep[0]['obs'].shape[0]):
                        metric_water = water_on_furniture(
                            ep[0]['obs'][t], goal_str[0])
                        if metric_water:
                            # print('\n \n Touched water')
                            break
                    success_instruction.append([metric_food, metric_water])
                success_instruction = np.array(success_instruction)
                success_rate_inst = np.mean(success_instruction, axis=0)
                successes_per_descr[ind_inst] = success_rate_inst
                print('\t Success rate {}: food {}, water {}'.format(
                    goal_str[0], success_rate_inst[0], success_rate_inst[1]))
                success_rates[ind_inst, ind_ep, :] = success_rate_inst
            np.savetxt(trial_folder + 'adaptation_success_rates_water.txt',
                       success_rates[:, :, 1])
            np.savetxt(trial_folder + 'adaptation_success_rates_food.txt',
                       success_rates[:, :, 0])

        # success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        # success_rates[:, :, 0] = np.loadtxt(trial_folder + 'adaptation_success_rates_food.txt')
        # success_rates[:, :, 1] = np.loadtxt(trial_folder + 'adaptation_success_rates_water.txt')

        line, err_min, err_max = get_stat_func(LINE, ERR)
        # plot
        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        for i in range(2):
            plt.plot(np.array(episodes) / 1000,
                     line(success_rates)[:, i],
                     linewidth=10,
                     color=colors[i])
            plt.fill_between(np.array(episodes) / 1000,
                             err_min(success_rates)[:, i],
                             err_max(success_rates)[:, i],
                             color=colors[i],
                             alpha=0.2)
        # plt.vlines(goal_invention * 0.6, ymin=0, ymax=1, linestyles='--', color='k', linewidth=5)
        leg = plt.legend(['food', 'water'], frameon=False)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'adaptation_success_rates.pdf'),
                    bbox_extra_artists=(lab, lab2, leg),
                    bbox_inches='tight',
                    dpi=50)  # add leg
Example #6
def run_generalization_study(path, freq=10):
    first = True

    for t_id, trial in enumerate(os.listdir(path)):
        print(trial)
        t_init = time.time()
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)

        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        test_descriptions = params['test_descriptions']

        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(rank=rank,
                                                            seed=seed,
                                                            num_cpu=params['experiment_params']['n_cpus'],
                                                            env=env,
                                                            trial_id=0,
                                                            n_epochs=10,
                                                            reward_function=params['conditions']['reward_function'],
                                                            policy_encoding=params['conditions']['policy_encoding'],
                                                            bias_buffer=params['conditions']['bias_buffer'],
                                                            feedback_strategy=params['conditions']['feedback_strategy'],
                                                            policy_architecture=params['conditions']['policy_architecture'],
                                                            goal_invention=params['conditions']['goal_invention'],
                                                            reward_checkpoint=params['conditions']['reward_checkpoint'],
                                                            rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                                                            p_partner_availability=params['conditions']['p_social_partner_availability'],
                                                            git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(params)
            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                                       reward_language_model=reward_language_model,
                                       goal_dim=policy_language_model.goal_dim,
                                       one_hot_encoder=onehot_encoder,
                                       **params['goal_sampler'],
                                       params=params)


            reward_function = config.get_reward_function(goal_sampler, params)
        else:
            def make_env():
                return gym.make(params['conditions']['env_name'])

            params['make_env'] = make_env
        loaded = False
        success_rates = np.zeros([len(test_descriptions), len(epochs)])
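        # success_rates[description, epoch]: mean success over N_REPET rollouts per test instruction.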
        if params['conditions']['reward_function'] == 'pretrained':
            reward_function.load_params(trial_folder + 'params_reward')
        if not loaded:
            # Load policy.
            t_init = time.time()

            for ind_ep, epoch in enumerate(epochs):
                print(time.time() - t_init)
                t_init = time.time()

                print('\n\n\t\t EPOCH', epoch)
                if first:
                    first = False
                    reuse = False
                else:
                    reuse = True

                if params['conditions']['reward_function'] == 'learned_lstm':
                    reward_function.restore_from_checkpoint(trial_folder + 'reward_checkpoints/reward_func_checkpoint_{}'.format(epoch))

                policy_language_model.set_reward_function(reward_function)
                if reward_language_model is not None:
                    reward_language_model.set_reward_function(reward_function)

                goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

                with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                    with open(policy_folder + 'policy_{}.pkl'.format(epoch), 'rb') as f:
                        policy = pickle.load(f)

                evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                                  policy=policy,
                                                  reward_function=reward_function,
                                                  params=params,
                                                  render=RENDER,
                                                  **params['evaluation_rollout_params'])
                evaluation_worker.seed(seed)

                # Run evaluation.
                evaluation_worker.clear_history()
                successes_per_descr = np.zeros([len(test_descriptions)])
                for ind_inst, instruction in enumerate(test_descriptions):
                    # instruction = 'Grasp any fly'
                    success_instruction = []
                    goal_str = [instruction]
                    goal_encoding = [policy_language_model.encode(goal_str[0])]
                    goal_id = [0]
                    for i in range(N_REPET):
                        ep = evaluation_worker.generate_rollouts(exploit=True,
                                                                 imagined=False,
                                                                 goals_str=goal_str,
                                                                 goals_encodings=goal_encoding,
                                                                 goals_ids=goal_id)
                        success = get_reward_from_state(state=ep[0]['obs'][-1], goal=instruction)
                        success_instruction.append(success)
                    success_rate_inst = np.mean(success_instruction)
                    successes_per_descr[ind_inst] = success_rate_inst
                    print('\t Success rate {}: {}'.format(goal_str[0], success_rate_inst))
                    success_rates[ind_inst, ind_ep] = success_rate_inst
                np.savetxt(trial_folder + 'generalization_success_rates.txt', success_rates)
Example #7
def plot_generalization(path, freq):

    for trial in os.listdir(path):
        print(trial)
        t_init = time.time()
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)

        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        test_descriptions = params['test_descriptions']

        success_rates = np.loadtxt(path + '/' + trial + '/generalization_success_rates.txt')

        line, err_min, err_max = get_stat_func(LINE, ERR)
        first = False
        # plot
        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        plt.plot(np.array(episodes) / 1000, line(success_rates), linewidth=10)
        plt.fill_between(np.array(episodes) / 1000, err_min(success_rates), err_max(success_rates), alpha=0.2)
        if goal_invention < 100:
            plt.vlines(goal_invention * 0.6, ymin=0, ymax=1, linestyles='--', color='k', linewidth=5)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'generalization_test_set_policy.pdf'), bbox_extra_artists=(lab, lab2), bbox_inches='tight',
                    dpi=50)  # add leg

        # plot per group
        inds_per_types = []
        descr_per_type = []
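        # Group test-description indices by word type so success rates can be plotted per type.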
        for i_type, type in enumerate(types_words):
            inds_per_types.append([])
            descr_per_type.append([])
            for i_d, descr in enumerate(test_descriptions):
                for type_w in type:
                    if type_w in descr:
                        inds_per_types[-1].append(i_d)
                        descr_per_type[-1].append(descr)
            inds_per_types[-1] = np.array(inds_per_types[-1])
        for i in range(len(type_legends)):
            print('Type {}:'.format(i + 1), descr_per_type[i])

        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        for i in range(len(types_words)):
            to_plot = success_rates[np.array(inds_per_types[i]), :]
            plt.plot(np.array(episodes) / 1000, line(to_plot), linewidth=8, c=colors[i])
            plt.fill_between(np.array(episodes) / 1000, err_min(to_plot), err_max(to_plot), color=colors[i], alpha=0.2)
        if goal_invention < 100:
            plt.vlines(goal_invention * 0.6, ymin=0, ymax=1, linestyles='--', color='k', linewidth=5)
        leg = plt.legend(type_legends, frameon=False)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'generalization_test_set_policy_per_type.pdf'), bbox_extra_artists=(lab, lab2), bbox_inches='tight',
                    dpi=50)  # add leg