def configure_everything(rank, seed, num_cpu, env, trial_id, n_epochs, reward_function, policy_encoding,
                         feedback_strategy, policy_architecture, goal_invention, reward_checkpoint,
                         rl_positive_ratio, p_partner_availability, imagination_method, git_commit='',
                         display=True):
    # Seed everything
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # goal invention starts one epoch later than stated
    epoch = int(goal_invention.split('_')[-1]) + 1
    goal_invention = 'from_epoch_{}'.format(epoch)

    # Prepare params.
    params = DEFAULT_CONFIG

    # Env generating function
    def make_env():
        return gym.make(params['conditions']['env_name'], display=display)

    # Get info from environment and configure dimensions dict
    tmp_env = make_env()
    tmp_env.reset()
    params['env_params'] = tmp_env.unwrapped.params
    params['learning_params']['T'] = tmp_env._max_episode_steps
    # discount factor scaled to the episode horizon
    params['learning_params']['gamma'] = 1. - 1. / params['learning_params']['T']
    params['reward_function']['n_objs'] = params['env_params']['max_nb_objects']
    params['make_env'] = make_env

    train_descriptions, test_descriptions, extra_descriptions = generate_all_descriptions(params['env_params'])

    # compute imagined goals to get the list of all possible goals
    goal_generator = SentenceGeneratorHeuristic(train_descriptions, test_descriptions, sentences=None, method='CGH')
    goal_generator.update_model(train_descriptions + test_descriptions)
    imagined_descriptions = goal_generator.generate_sentences()
    all_descriptions = train_descriptions + test_descriptions + tuple(imagined_descriptions)

    # train_descriptions, test_descriptions, all_descriptions = get_descriptions(ENV_ID)
    # assert sorted(train_descriptions) == sorted(train_descriptions_env)
    # assert sorted(test_descriptions) == sorted(test_descriptions_env)

    params.update(date_time=str(datetime.datetime.now()),
                  train_descriptions=train_descriptions,
                  test_descriptions=test_descriptions,
                  extra_descriptions=extra_descriptions,
                  all_descriptions=all_descriptions,
                  git_commit=git_commit)

    # Configure logging
    if rank == 0:
        logdir = find_save_path('../../data/expe/' + env + "/", trial_id)
        logger.configure(dir=logdir)
        os.makedirs(logdir + 'tmp/', exist_ok=True)
        os.makedirs(logdir + 'reward_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'policy_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'goal_info/', exist_ok=True)
        if params['experiment_params']['save_obs']:
            os.makedirs(logdir + 'save_obs/', exist_ok=True)
    else:
        logdir = None
    logdir = MPI.COMM_WORLD.bcast(logdir, root=0)

    # Update conditions parameters from arguments or variables defined in train.py
    params['conditions'].update(env_name=env,
                                policy_architecture=policy_architecture,
                                reward_function=reward_function,
                                goal_invention=goal_invention,
                                imagination_method=imagination_method,
                                feedback_strategy=feedback_strategy,
                                rl_positive_ratio=rl_positive_ratio,
                                reward_checkpoint=reward_checkpoint,
                                policy_encoding=policy_encoding,
                                p_social_partner_availability=p_partner_availability)

    # checks
    if params['conditions']['policy_architecture'] in ['modular_attention', 'attention']:
        error_msg = 'You need an lstm policy encoding and a learned reward if you use {}'.format(params['conditions']['policy_architecture'])
        assert params['conditions']['policy_encoding'] == 'lstm', error_msg
        assert params['conditions']['reward_function'] in ['pretrained', 'learned_lstm'], error_msg
    elif params['conditions']['reward_function'] == 'oracle':
        error_msg = 'You cannot use an lstm policy encoding if you use an oracle reward'
        assert params['conditions']['policy_encoding'] != 'lstm', error_msg
        error_msg = 'You can only use a flat_concat policy architecture if you use an oracle reward'
        assert params['conditions']['policy_architecture'] == 'flat_concat', error_msg

    # Update experiment parameters from arguments or variables defined in train.py
    params['experiment_params'].update(n_epochs=n_epochs,
                                       trial_id=trial_id,
                                       logdir=logdir,
                                       seed=seed,
                                       n_cpus=num_cpu,
                                       n_test_rollouts=len(params['train_descriptions']))

    params['reward_function'].update(reward_positive_ratio=params['conditions']['reward_positive_ratio'])

    # Define social partner params
    params['social_partner_params'] = dict(feedback_strategy=feedback_strategy,
                                           p_availability=p_partner_availability)

    if params['conditions']['policy_encoding'] == 'lstm':
        dim_encoding = params['reward_function']['num_hidden_lstm']
    else:
        raise NotImplementedError

    inds_objs = tmp_env.unwrapped.inds_objs  # indices of objects in the state vector
    for i in range(len(inds_objs)):
        inds_objs[i] = inds_objs[i].tolist()
    dims = dict(obs=tmp_env.observation_space.shape[0],
                g_encoding=dim_encoding,
                g_id=1,
                acts=tmp_env.action_space.shape[0],
                g_str=None,
                nb_obj=tmp_env.unwrapped.nb_obj,
                inds_objs=inds_objs)
    params['dims'] = dims

    # configure learning params and interactions
    if params['learning_params']['algo'] == 'ddpg':
        params['learning_params']['network_class'] += 'DDPG'
    else:
        raise NotImplementedError

    params['training_rollout_params'] = dict(exploit=False,
                                             use_target_net=False,
                                             compute_Q=False,
                                             eval_bool=False)
    params['evaluation_rollout_params'] = dict(exploit=True,
                                               use_target_net=params['learning_params']['test_with_polyak'],
                                               compute_Q=True,
                                               eval_bool=True)
    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        params['training_rollout_params'][name] = params['learning_params'][name]
        params['evaluation_rollout_params'][name] = params['learning_params'][name]
    params['evaluation_rollout_params']['rollout_batch_size'] = 1

    params['repo_path'] = REPO_PATH
    params['lstm_reward_checkpoint_path'] = REPO_PATH + '/src/data/lstm_checkpoints/{}'.format(params['conditions']['reward_checkpoint'])
    params['or_params_path'] = dict()
    for n_obj in [3]:
        params['or_params_path'][n_obj] = REPO_PATH + '/src/data/or_function/or_params_{}objs.pk'.format(n_obj)

    # Save parameter dict
    if rank == 0:
        json_dict = clean_dict_for_json(params)
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(json_dict, f)
        for key in sorted(params.keys()):
            logger.info('{}: {}'.format(key, params[key]))

    return params, rank_seed
            if sentence not in train_descriptions:
                new_sentence.append(sentence)
        self.new_sentence_generate = tuple(new_sentence)
        return tuple(new_sentence)


# new sentences are generated randomly from the environment directly
if __name__ == '__main__':
    from src.playground_env.descriptions import generate_all_descriptions

    env_params = get_env_params()
    train_descriptions, test_descriptions, extra_descriptions = generate_all_descriptions(env_params)
    p = env_params.copy()

    # Get the list of admissible attributes and split it into name attributes (types and categories)
    # and adjective attributes.
    name_attributes = env_params['name_attributes']
    adjective_attributes = env_params['adjective_attributes']
    adj_list = list(adjective_attributes)
    adj_list.append('any')
    adjective_attributes = tuple(adj_list)
    action = env_params['admissible_actions']

    generator = simple_conjuction_based_heuristic(train_descriptions, test_descriptions, None, method='SCBH')
    new_descriptions = generator.generate_sentences()
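
    # --- Illustrative sketch (not in the original script); assumes `new_descriptions`
    # holds the sentences produced by the heuristic above. A quick sanity check is to
    # count how many of them are genuinely new, i.e. absent from both known sets.
    known_descriptions = set(train_descriptions) | set(test_descriptions)
    novel_descriptions = [s for s in new_descriptions if s not in known_descriptions]
    print('{} sentences generated, {} not in train/test descriptions'.format(len(new_descriptions),
                                                                             len(novel_descriptions)))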
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    if not render:
        env = 'PlaygroundNavigation-v1'
    else:
        env = 'PlaygroundNavigationRender-v1'
    params, rank_seed = config.configure_everything(rank=0,
                                                    seed=seed,
                                                    num_cpu=params['experiment_params']['n_cpus'],
                                                    env=env,
                                                    trial_id=0,
                                                    n_epochs=10,
                                                    reward_function=params['conditions']['reward_function'],
                                                    policy_encoding=params['conditions']['policy_encoding'],
                                                    feedback_strategy=params['conditions']['feedback_strategy'],
                                                    policy_architecture=params['conditions']['policy_architecture'],
                                                    goal_invention=params['conditions']['goal_invention'],
                                                    reward_checkpoint=params['conditions']['reward_checkpoint'],
                                                    rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                                                    p_partner_availability=params['conditions']['p_social_partner_availability'],
                                                    imagination_method=params['conditions']['imagination_method'],
                                                    git_commit='')

    policy_language_model, reward_language_model = config.get_language_models(params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])
    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(PATH + 'reward_checkpoints/reward_func_checkpoint_{}'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)

    goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)
    policy.load_params(POLICY_FILE)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      render=render,
                                      **params['evaluation_rollout_params'])
    evaluation_worker.seed(seed)

    # Run evaluation.
    evaluation_worker.clear_history()
    env_params = evaluation_worker.env.unwrapped.params
    train_descriptions, test_descriptions, _ = generate_all_descriptions(env_params)
    train_descriptions = list(train_descriptions)
    test_descriptions = list(test_descriptions)
    np.random.shuffle(test_descriptions)
    np.random.shuffle(train_descriptions)

    successes_test_descr = []
    for d in test_descriptions:
        successes_test_descr.append([])
        print(d)
        for i in range(n_test_rollouts):
            goal_str = [d]
            goal_encoding = [policy_language_model.encode(goal_str[0])]
            goal_id = [0]
            ep = evaluation_worker.generate_rollouts(exploit=True,
                                                     imagined=False,
                                                     goals_str=goal_str,
                                                     goals_encodings=goal_encoding,
                                                     goals_ids=goal_id)
            out = get_reward_from_state(ep[0]['obs'][-1], goal_str[0], env_params)
            successes_test_descr[-1].append(out == 1)
        print('Success rate {}: {}'.format(d, np.mean(successes_test_descr[-1])))
    print('Global success rate: {}'.format(np.mean(successes_test_descr)))
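

# --- Illustrative sketch (not in the original script) ---
# A hypothetical command-line entry point for `main`; the flag names and defaults
# below are assumptions, not taken from the original repository.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--policy_file', type=str, default=POLICY_FILE)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_test_rollouts', type=int, default=10)
    parser.add_argument('--render', action='store_true')
    args = parser.parse_args()

    main(policy_file=args.policy_file,
         seed=args.seed,
         n_test_rollouts=args.n_test_rollouts,
         render=args.render)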