Example #1
def configure_everything(rank,
                         seed,
                         num_cpu,
                         env,
                         trial_id,
                         n_epochs,
                         reward_function,
                         policy_encoding,
                         feedback_strategy,
                         policy_architecture,
                         goal_invention,
                         reward_checkpoint,
                         rl_positive_ratio,
                         p_partner_availability,
                         imagination_method,
                         git_commit='',
                         display=True):
    # Seed everything
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # start goal invention one epoch later than requested
    epoch = int(goal_invention.split('_')[-1]) + 1
    goal_invention = 'from_epoch_{}'.format(epoch)

    # Prepare params.
    params = DEFAULT_CONFIG

    # Env generating function
    def make_env():
        return gym.make(params['conditions']['env_name'], display=display)

    # Get info from environment and configure dimensions dict
    tmp_env = make_env()
    tmp_env.reset()
    params['env_params'] = tmp_env.unwrapped.params
    params['learning_params']['T'] = tmp_env._max_episode_steps
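    # discount set so the effective horizon 1 / (1 - gamma) equals the episode length T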
    params['learning_params']['gamma'] = 1. - 1. / params['learning_params']['T']
    params['reward_function']['n_objs'] = params['env_params']['max_nb_objects']
    params['make_env'] = make_env

    train_descriptions, test_descriptions, extra_descriptions = generate_all_descriptions(
        params['env_params'])
    # compute imagined goals to get the list of all possible goals
    goal_generator = SentenceGeneratorHeuristic(train_descriptions,
                                                test_descriptions,
                                                sentences=None,
                                                method='CGH')
    goal_generator.update_model(train_descriptions + test_descriptions)
    imagined_descriptions = goal_generator.generate_sentences()
    all_descriptions = train_descriptions + test_descriptions + tuple(
        imagined_descriptions)

    # train_descriptions, test_descriptions, all_descriptions = get_descriptions(ENV_ID)
    # assert sorted(train_descriptions) == sorted(train_descriptions_env)
    # assert sorted(test_descriptions) == sorted(test_descriptions_env)
    params.update(date_time=str(datetime.datetime.now()),
                  train_descriptions=train_descriptions,
                  test_descriptions=test_descriptions,
                  extra_descriptions=extra_descriptions,
                  all_descriptions=all_descriptions,
                  git_commit=git_commit)

    # Configure logging
    if rank == 0:
        logdir = find_save_path('../../data/expe/' + env + "/", trial_id)
        logger.configure(dir=logdir)
        os.makedirs(logdir + 'tmp/', exist_ok=True)
        os.makedirs(logdir + 'reward_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'policy_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'goal_info/', exist_ok=True)
        if params['experiment_params']['save_obs']:
            os.makedirs(logdir + 'save_obs/', exist_ok=True)
    else:
        logdir = None
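    # share the log directory chosen by rank 0 with all other MPI workers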
    logdir = MPI.COMM_WORLD.bcast(logdir, root=0)

    # Update conditions parameters from arguments or variables defined in train.py
    params['conditions'].update(
        env_name=env,
        policy_architecture=policy_architecture,
        reward_function=reward_function,
        goal_invention=goal_invention,
        imagination_method=imagination_method,
        feedback_strategy=feedback_strategy,
        rl_positive_ratio=rl_positive_ratio,
        reward_checkpoint=reward_checkpoint,
        policy_encoding=policy_encoding,
        p_social_partner_availability=p_partner_availability)

    # checks
    if params['conditions']['policy_architecture'] in [
            'modular_attention', 'attention'
    ]:
        error_msg = 'You need an lstm policy encoding and an lstm-based reward function if you use {}'.format(
            params['conditions']['policy_architecture'])
        assert params['conditions']['policy_encoding'] == 'lstm', error_msg
        assert params['conditions']['reward_function'] in [
            'pretrained', 'learned_lstm'
        ], error_msg
    elif params['conditions']['reward_function'] == 'oracle':
        error_msg = 'You cannot use an lstm policy encoding if you use an oracle reward'
        assert params['conditions']['policy_encoding'] != 'lstm', error_msg
        error_msg = 'You can only use a flat_concat policy architecture if you use an oracle reward'
        assert params['conditions']['policy_architecture'] == 'flat_concat', error_msg

    # Update experiment parameters from arguments or variables defined in train.py
    params['experiment_params'].update(
        n_epochs=n_epochs,
        trial_id=trial_id,
        logdir=logdir,
        seed=seed,
        n_cpus=num_cpu,
        n_test_rollouts=len(params['train_descriptions']),
    )
    params['reward_function'].update(
        reward_positive_ratio=params['conditions']['reward_positive_ratio'])
    # Define social partner params
    params['social_partner_params'] = dict(
        feedback_strategy=feedback_strategy,
        p_availability=p_partner_availability)

    if params['conditions']['policy_encoding'] == 'lstm':
        dim_encoding = params['reward_function']['num_hidden_lstm']
    else:
        raise NotImplementedError

    inds_objs = tmp_env.unwrapped.inds_objs  # indices of the objects in the state
    for i in range(len(inds_objs)):
        inds_objs[i] = inds_objs[i].tolist()
    dims = dict(obs=tmp_env.observation_space.shape[0],
                g_encoding=dim_encoding,
                g_id=1,
                acts=tmp_env.action_space.shape[0],
                g_str=None,
                nb_obj=tmp_env.unwrapped.nb_obj,
                inds_objs=inds_objs)
    params['dims'] = dims

    # configure learning params and interactions
    if params['learning_params']['algo'] == 'ddpg':
        params['learning_params']['network_class'] += 'DDPG'
    else:
        raise NotImplementedError

    params['training_rollout_params'] = dict(
        exploit=False,
        use_target_net=False,
        compute_Q=False,
        eval_bool=False,
    )
    params['evaluation_rollout_params'] = dict(
        exploit=True,
        use_target_net=params['learning_params']['test_with_polyak'],
        compute_Q=True,
        eval_bool=True)

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        params['training_rollout_params'][name] = params['learning_params'][name]
        params['evaluation_rollout_params'][name] = params['learning_params'][name]
    params['evaluation_rollout_params']['rollout_batch_size'] = 1

    params['repo_path'] = REPO_PATH
    params['lstm_reward_checkpoint_path'] = REPO_PATH + '/src/data/lstm_checkpoints/{}'.format(
        params['conditions']['reward_checkpoint'])
    params['or_params_path'] = dict()
    for n_obj in [3]:
        params['or_params_path'][
            n_obj] = REPO_PATH + '/src/data/or_function/or_params_{}objs.pk'.format(
                n_obj)

    # Save parameter dict

    if rank == 0:
        json_dict = clean_dict_for_json(params)
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(json_dict, f)
        for key in sorted(params.keys()):
            logger.info('{}: {}'.format(key, params[key]))

    return params, rank_seed
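
A minimal standalone sketch of the per-rank seeding and the goal-invention epoch shift done at the top of configure_everything; the numbers below are illustrative only, not values from the project.

seed, rank = 42, 3
rank_seed = seed + 1000000 * rank               # each MPI rank gets a distinct seed (illustrative values)
goal_invention = 'from_epoch_50'
epoch = int(goal_invention.split('_')[-1]) + 1  # invention actually starts one epoch later
print(rank_seed, 'from_epoch_{}'.format(epoch))  # 3000042 from_epoch_51
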
Example #2
                if sentence not in train_descriptions:
                    new_sentence.append(sentence)
            self.new_sentence_generate = tuple(new_sentence)
            return tuple(new_sentence)
        # new sentences will be generated randomly from the environment directly


if __name__ == '__main__':

    from src.playground_env.descriptions import generate_all_descriptions

    env_params = get_env_params()
    train_descriptions, test_descriptions, extra_descriptions = generate_all_descriptions(
        env_params)

    p = env_params.copy()
    # Get the list of admissible attributes and split them into name attributes (types and categories) and adjective attributes.
    name_attributes = env_params['name_attributes']
    adjective_attributes = env_params['adjective_attributes']
    adj_list = list(adjective_attributes)
    adj_list.append('any')
    adjective_attributes = tuple(adj_list)
    action = env_params['admissible_actions']
    generator = simple_conjuction_based_heuristic(train_descriptions,
                                                  test_descriptions,
                                                  None,
                                                  method='SCBH')
    new_descriptions = generator.generate_sentences()
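
A small sketch of the filtering idea in the generate_sentences fragment of the previous example: candidate sentences already present in the training descriptions are dropped and the rest are returned as a tuple. The sentences below are hypothetical placeholders, not actual environment descriptions.

train_descriptions = ('Grasp red cat', 'Grow blue dog')           # hypothetical
candidates = ['Grasp red cat', 'Grasp any cat', 'Grow red dog']   # hypothetical
new_sentences = tuple(s for s in candidates if s not in train_descriptions)
print(new_sentences)  # ('Grasp any cat', 'Grow red dog')
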
Example #3
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    if not render:
        env = 'PlaygroundNavigation-v1'
    else:
        env = 'PlaygroundNavigationRender-v1'
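    # rebuild the experiment configuration from the saved parameters of the evaluated run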
    params, rank_seed = config.configure_everything(
        rank=0,
        seed=seed,
        num_cpu=params['experiment_params']['n_cpus'],
        env=env,
        trial_id=0,
        n_epochs=10,
        reward_function=params['conditions']['reward_function'],
        policy_encoding=params['conditions']['policy_encoding'],
        feedback_strategy=params['conditions']['feedback_strategy'],
        policy_architecture=params['conditions']['policy_architecture'],
        goal_invention=params['conditions']['goal_invention'],
        reward_checkpoint=params['conditions']['reward_checkpoint'],
        rl_positive_ratio=params['conditions']['rl_positive_ratio'],
        p_partner_availability=params['conditions']['p_social_partner_availability'],
        imagination_method=params['conditions']['imagination_method'],
        git_commit='')

    policy_language_model, reward_language_model = config.get_language_models(
        params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])
    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(
            PATH +
            'reward_checkpoints/reward_func_checkpoint_{}'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)
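    # register every description as a discovered goal before evaluation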
    goal_sampler.update_discovered_goals(params['all_descriptions'],
                                         episode_count=0,
                                         epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    policy.load_params(POLICY_FILE)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      render=render,
                                      **params['evaluation_rollout_params'])
    evaluation_worker.seed(seed)

    # Run evaluation.
    evaluation_worker.clear_history()

    env_params = evaluation_worker.env.unwrapped.params
    train_descriptions, test_descriptions, _ = generate_all_descriptions(
        env_params)
    train_descriptions = list(train_descriptions)
    test_descriptions = list(test_descriptions)
    np.random.shuffle(test_descriptions)
    np.random.shuffle(train_descriptions)
    successes_test_descr = []
    for d in test_descriptions:
        successes_test_descr.append([])
        print(d)
        for i in range(n_test_rollouts):
            goal_str = [d]
            goal_encoding = [policy_language_model.encode(goal_str[0])]
            goal_id = [0]
            ep = evaluation_worker.generate_rollouts(
                exploit=True,
                imagined=False,
                goals_str=goal_str,
                goals_encodings=goal_encoding,
                goals_ids=goal_id)
            out = get_reward_from_state(ep[0]['obs'][-1], goal_str[0],
                                        env_params)
            successes_test_descr[-1].append(out == 1)
        print('Success rate {}: {}'.format(d,
                                           np.mean(successes_test_descr[-1])))
    print('Global success rate: {}'.format(np.mean(successes_test_descr)))
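
A short sketch of the success bookkeeping in the loop above, with made-up outcomes: one inner list of booleans per test description, averaged per description and then globally.

import numpy as np

successes_test_descr = [[True, False, True], [True, True, True]]   # made-up outcomes
per_description = [np.mean(s) for s in successes_test_descr]        # [0.667, 1.0]
print('Global success rate: {}'.format(np.mean(successes_test_descr)))  # 0.8333...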