def get_modules_for_notebook(path, params):
    EPOCH = 'best'
    POLICY_FILE = path + 'policy_checkpoints/policy_{}.pkl'.format(EPOCH)

    policy_language_model, reward_language_model = config.get_language_models(params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])

    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(path + 'reward_checkpoints/reward_func_{}_checkpoint'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)

    goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    policy.load_params(POLICY_FILE)

    return policy_language_model, reward_language_model, policy, reward_function, goal_sampler
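
# A minimal, hypothetical usage sketch for a notebook (not part of the original code):
# it assumes `trial_path` points at a trial folder containing params.json,
# policy_checkpoints/ and reward_checkpoints/, i.e. the layout read above.
def example_load_trial_for_notebook(trial_path):
    import json
    with open(trial_path + 'params.json') as f:
        params = json.load(f)
    (policy_language_model, reward_language_model,
     policy, reward_function, goal_sampler) = get_modules_for_notebook(trial_path, params)
    # From here the modules can be queried interactively, e.g. encoding a goal string
    # with policy_language_model.encode(...) before rolling out the policy.
    return policy, policy_language_model, reward_function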
def launch(**kwargs):
    # Fork for multi-CPU MPI implementation.
    rank = fork(kwargs['num_cpu'])

    # Configure everything and log parameters
    params, rank_seed = config.configure_everything(rank, **kwargs)

    # Define language models
    policy_language_model, reward_language_model = config.get_language_models(params)

    # Define the one-hot encoder (vocabulary of words and max_seq_length)
    onehot_encoder = config.get_one_hot_encoder(params['train_descriptions'] + params['test_descriptions'])

    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    # Define reward functions
    reward_function = config.get_reward_function(goal_sampler=goal_sampler, params=params)
    oracle_reward_function = config.get_oracle_reward_function(goal_sampler, params)
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)

    # Define the goal sampler for evaluation
    eval_goal_sampler = EvalGoalSampler(policy_language_model=policy_language_model,
                                        one_hot_encoder=onehot_encoder,
                                        params=params)

    # Give the reward function to the goal sampler to track metrics
    goal_sampler.store_reward_function(reward_function)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    # Define the social partner
    social_partner = SocialPartner(oracle_reward_function=oracle_reward_function,
                                   goal_sampler=goal_sampler,
                                   **params['social_partner_params'],
                                   params=params)

    # Define the data processor
    data_processor = DataProcessor(reward_function=reward_function,
                                   oracle_reward_function=oracle_reward_function,
                                   goal_sampler=goal_sampler,
                                   params=params)

    # Define the workers that interact with the environment (training and evaluation)
    training_worker = RolloutWorker(make_env=params['make_env'],
                                    policy=policy,
                                    reward_function=reward_function,
                                    params=params,
                                    **params['training_rollout_params'])
    training_worker.seed(rank_seed)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      **params['evaluation_rollout_params'],
                                      render=False)
    evaluation_worker.seed(rank_seed * 10)

    stats_logger = StatsLogger(goal_sampler=goal_sampler,
                               data_processor=data_processor,
                               training_worker=training_worker,
                               evaluation_worker=evaluation_worker,
                               reward_function=reward_function,
                               policy=policy,
                               params=params)

    train(logdir=params['experiment_params']['logdir'],
          policy=policy,
          training_worker=training_worker,
          goal_sampler=goal_sampler,
          eval_goal_sampler=eval_goal_sampler,
          evaluation_worker=evaluation_worker,
          social_partner=social_partner,
          n_epochs=params['experiment_params']['n_epochs'],
          n_test_rollouts=params['experiment_params']['n_test_rollouts'],
          n_cycles=params['experiment_params']['n_cycles'],
          n_batches=params['experiment_params']['n_batches'],
          reward_function=reward_function,
          stats_logger=stats_logger,
          data_processor=data_processor,
          params=params)
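
# A hypothetical entry point for launch() (not part of the original file). launch()
# forwards its keyword arguments to config.configure_everything, so the exact set of
# accepted keys is repo-specific; the values below are illustrative assumptions only.
def example_launch():
    launch(num_cpu=1,
           env='PlaygroundNavigation-v1',
           trial_id=0,
           n_epochs=10,
           git_commit='')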
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    if not render:
        env = 'PlaygroundNavigation-v1'
    else:
        env = 'PlaygroundNavigationRender-v1'
    params, rank_seed = config.configure_everything(rank=0,
                                                    seed=seed,
                                                    num_cpu=params['experiment_params']['n_cpus'],
                                                    env=env,
                                                    trial_id=0,
                                                    n_epochs=10,
                                                    reward_function=params['conditions']['reward_function'],
                                                    policy_encoding=params['conditions']['policy_encoding'],
                                                    feedback_strategy=params['conditions']['feedback_strategy'],
                                                    policy_architecture=params['conditions']['policy_architecture'],
                                                    goal_invention=params['conditions']['goal_invention'],
                                                    reward_checkpoint=params['conditions']['reward_checkpoint'],
                                                    rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                                                    p_partner_availability=params['conditions']['p_social_partner_availability'],
                                                    imagination_method=params['conditions']['imagination_method'],
                                                    git_commit='')

    policy_language_model, reward_language_model = config.get_language_models(params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])

    # Define the goal sampler for training
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(PATH + 'reward_checkpoints/reward_func_checkpoint_{}'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)

    goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

    # Define learning algorithm
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)

    policy.load_params(POLICY_FILE)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      render=render,
                                      **params['evaluation_rollout_params'])
    evaluation_worker.seed(seed)

    # Run evaluation.
    evaluation_worker.clear_history()
    env_params = evaluation_worker.env.unwrapped.params
    train_descriptions, test_descriptions, _ = generate_all_descriptions(env_params)
    train_descriptions = list(train_descriptions)
    test_descriptions = list(test_descriptions)
    # Shuffle in place so the evaluation order is randomized.
    np.random.shuffle(test_descriptions)
    np.random.shuffle(train_descriptions)
    successes_test_descr = []
    for d in test_descriptions:
        successes_test_descr.append([])
        print(d)
        for i in range(n_test_rollouts):
            goal_str = [d]
            goal_encoding = [policy_language_model.encode(goal_str[0])]
            goal_id = [0]
            ep = evaluation_worker.generate_rollouts(exploit=True,
                                                     imagined=False,
                                                     goals_str=goal_str,
                                                     goals_encodings=goal_encoding,
                                                     goals_ids=goal_id)
            out = get_reward_from_state(ep[0]['obs'][-1], goal_str[0], env_params)
            successes_test_descr[-1].append(out == 1)
        print('Success rate {}: {}'.format(d, np.mean(successes_test_descr[-1])))
    print('Global success rate: {}'.format(np.mean(successes_test_descr)))
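
# A hypothetical invocation of the evaluation entry point above (illustrative only).
# PARAMS_FILE, PATH, EPOCH and POLICY_FILE are assumed to be module-level constants
# pointing at an existing trial, as in the original script; note that main() loads
# the policy from POLICY_FILE and ignores its policy_file argument.
def example_evaluate_checkpoint():
    main(policy_file=POLICY_FILE,
         seed=0,
         n_test_rollouts=20,
         render=False)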
def plot_generalization(path, freq=10):
    first = True
    trial_folder = path
    for trial in os.listdir(path):
        print(trial)
        # if os.path.exists(path + '/' + trial + '/adaptation_success_rates_food.txt'):
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)
        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        env_id = params['conditions']['env_id']
        if 'plant' not in env_id:
            test_plants = plants.copy() + ['plant', 'living_thing']
            test_plants.remove('flower')
            test_descriptions = ['Grow {} {}'.format(c, p) for c in thing_colors + ['any'] for p in test_plants]
        else:
            if 'big' in env_id:
                test_plants = ['algae', 'bonsai', 'tree', 'bush', 'plant', 'living_thing']
            else:
                test_plants = ['tree', 'bush', 'plant', 'living_thing']
            test_descriptions = ['Grow {} {}'.format(c, p) for c in thing_colors + ['any'] for p in test_plants]

        first_epoch = True
        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(rank=rank,
                                                            seed=seed,
                                                            num_cpu=params['experiment_params']['n_cpus'],
                                                            env=env,
                                                            trial_id=0,
                                                            n_epochs=10,
                                                            reward_function=params['conditions']['reward_function'],
                                                            curriculum_replay_target=params['conditions']['curriculum_replay_target'],
                                                            curriculum_target=params['conditions']['curriculum_target'],
                                                            policy_encoding=params['conditions']['policy_encoding'],
                                                            bias_buffer=params['conditions']['bias_buffer'],
                                                            feedback_strategy=params['conditions']['feedback_strategy'],
                                                            goal_sampling_policy=params['conditions']['goal_sampling_policy'],
                                                            policy_architecture=params['conditions']['policy_architecture'],
                                                            goal_invention=params['conditions']['goal_invention'],
                                                            reward_checkpoint=params['conditions']['reward_checkpoint'],
                                                            rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                                                            p_partner_availability=params['conditions']['p_social_partner_availability'],
                                                            power_rarity=2,
                                                            git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(params)

            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                                       reward_language_model=reward_language_model,
                                       goal_dim=policy_language_model.goal_dim,
                                       one_hot_encoder=onehot_encoder,
                                       **params['goal_sampler'],
                                       params=params)

            reward_function = config.get_reward_function(goal_sampler, params)
        else:
            def make_env():
                return gym.make(params['conditions']['env_name'])
            params['make_env'] = make_env

        # Load policy.
        success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        for ind_ep, epoch in enumerate(epochs):
            print('\n\n\t\t EPOCH', epoch)
            if first:
                first = False
                reuse = False
            else:
                reuse = True

            if params['conditions']['reward_function'] == 'learned_lstm':
                reward_function.restore_from_checkpoint(trial_folder + 'reward_checkpoints/reward_func_checkpoint_{}'.format(epoch))
            policy_language_model.set_reward_function(reward_function)
            if reward_language_model is not None:
                reward_language_model.set_reward_function(reward_function)
            goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

            with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                with open(policy_folder + 'policy_{}.pkl'.format(epoch), 'rb') as f:
                    policy = pickle.load(f)

            evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                              policy=policy,
                                              reward_function=reward_function,
                                              params=params,
                                              render=RENDER,
                                              **params['evaluation_rollout_params'])
            evaluation_worker.seed(seed)

            # Run evaluation.
            evaluation_worker.clear_history()
            successes_per_descr = np.zeros([len(test_descriptions), 2])
            for ind_inst, instruction in enumerate(test_descriptions):
                # instruction = 'Grasp any fly'
                success_instruction = []
                goal_str = [instruction]
                goal_encoding = [policy_language_model.encode(goal_str[0])]
                goal_id = [0]
                for i in range(N_REPET):
                    ep = evaluation_worker.generate_rollouts(exploit=True,
                                                             imagined=False,
                                                             goals_str=goal_str,
                                                             goals_encodings=goal_encoding,
                                                             goals_ids=goal_id)
                    for t in range(ep[0]['obs'].shape[0]):
                        metric_food = food_on_furniture(ep[0]['obs'][t], goal_str[0])
                        if metric_food:
                            # print('\n\n Touched food')
                            break
                    for t in range(ep[0]['obs'].shape[0]):
                        metric_water = water_on_furniture(ep[0]['obs'][t], goal_str[0])
                        if metric_water:
                            # print('\n \n Touched water')
                            break
                    success_instruction.append([metric_food, metric_water])
                success_instruction = np.array(success_instruction)
                success_rate_inst = np.mean(success_instruction, axis=0)
                successes_per_descr[ind_inst] = success_rate_inst
                print('\t Success rate {}: food {}, water {}'.format(goal_str[0], success_rate_inst[0], success_rate_inst[1]))
                success_rates[ind_inst, ind_ep, :] = success_rate_inst

        np.savetxt(trial_folder + 'adaptation_success_rates_water.txt', success_rates[:, :, 1])
        np.savetxt(trial_folder + 'adaptation_success_rates_food.txt', success_rates[:, :, 0])
        # success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        # success_rates[:, :, 0] = np.loadtxt(trial_folder + 'adaptation_success_rates_food.txt')
        # success_rates[:, :, 1] = np.loadtxt(trial_folder + 'adaptation_success_rates_water.txt')

        line, err_min, err_max = get_stat_func(LINE, ERR)

        # Plot
        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        for i in range(2):
            plt.plot(np.array(episodes) / 1000,
                     line(success_rates)[:, i],
                     linewidth=10,
                     color=colors[i])
            plt.fill_between(np.array(episodes) / 1000,
                             err_min(success_rates)[:, i],
                             err_max(success_rates)[:, i],
                             color=colors[i],
                             alpha=0.2)
        # plt.vlines(goal_invention * 0.6, ymin=0, ymax=1, linestyles='--', color='k', linewidth=5)
        leg = plt.legend(['food', 'water'], frameon=False)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'adaptation_success_rates.pdf'),
                    bbox_extra_artists=(lab, lab2, leg),
                    bbox_inches='tight',
                    dpi=50)
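
# A hypothetical call to the plotting routine above (illustrative only): `experiment_path`
# is assumed to be an experiment directory whose sub-folders are individual trials, each
# holding progress.csv, params.json and policy_checkpoints/. Per trial, the routine writes
# adaptation_success_rates_food.txt, adaptation_success_rates_water.txt and
# adaptation_success_rates.pdf.
def example_plot_adaptation(experiment_path):
    plot_generalization(experiment_path, freq=10)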
def run_generalization_study(path, freq=10):
    first = True
    for t_id, trial in enumerate(os.listdir(path)):
        print(trial)
        t_init = time.time()
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)
        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        test_descriptions = params['test_descriptions']

        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(rank=rank,
                                                            seed=seed,
                                                            num_cpu=params['experiment_params']['n_cpus'],
                                                            env=env,
                                                            trial_id=0,
                                                            n_epochs=10,
                                                            reward_function=params['conditions']['reward_function'],
                                                            policy_encoding=params['conditions']['policy_encoding'],
                                                            bias_buffer=params['conditions']['bias_buffer'],
                                                            feedback_strategy=params['conditions']['feedback_strategy'],
                                                            policy_architecture=params['conditions']['policy_architecture'],
                                                            goal_invention=params['conditions']['goal_invention'],
                                                            reward_checkpoint=params['conditions']['reward_checkpoint'],
                                                            rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                                                            p_partner_availability=params['conditions']['p_social_partner_availability'],
                                                            git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(params)

            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                                       reward_language_model=reward_language_model,
                                       goal_dim=policy_language_model.goal_dim,
                                       one_hot_encoder=onehot_encoder,
                                       **params['goal_sampler'],
                                       params=params)

            reward_function = config.get_reward_function(goal_sampler, params)
        else:
            def make_env():
                return gym.make(params['conditions']['env_name'])
            params['make_env'] = make_env

        loaded = False
        success_rates = np.zeros([len(test_descriptions), len(epochs)])
        if params['conditions']['reward_function'] == 'pretrained':
            reward_function.load_params(trial_folder + 'params_reward')

        if not loaded:
            # Load policy.
            t_init = time.time()
            for ind_ep, epoch in enumerate(epochs):
                print(time.time() - t_init)
                t_init = time.time()

                print('\n\n\t\t EPOCH', epoch)
                if first:
                    first = False
                    reuse = False
                else:
                    reuse = True

                if params['conditions']['reward_function'] == 'learned_lstm':
                    reward_function.restore_from_checkpoint(trial_folder + 'reward_checkpoints/reward_func_checkpoint_{}'.format(epoch))
                policy_language_model.set_reward_function(reward_function)
                if reward_language_model is not None:
                    reward_language_model.set_reward_function(reward_function)
                goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

                with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                    with open(policy_folder + 'policy_{}.pkl'.format(epoch), 'rb') as f:
                        policy = pickle.load(f)

                evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                                  policy=policy,
                                                  reward_function=reward_function,
                                                  params=params,
                                                  render=RENDER,
                                                  **params['evaluation_rollout_params'])
                evaluation_worker.seed(seed)

                # Run evaluation.
                evaluation_worker.clear_history()
                successes_per_descr = np.zeros([len(test_descriptions)])
                for ind_inst, instruction in enumerate(test_descriptions):
                    # instruction = 'Grasp any fly'
                    success_instruction = []
                    goal_str = [instruction]
                    goal_encoding = [policy_language_model.encode(goal_str[0])]
                    goal_id = [0]
                    for i in range(N_REPET):
                        ep = evaluation_worker.generate_rollouts(exploit=True,
                                                                 imagined=False,
                                                                 goals_str=goal_str,
                                                                 goals_encodings=goal_encoding,
                                                                 goals_ids=goal_id)
                        success = get_reward_from_state(state=ep[0]['obs'][-1], goal=instruction)
                        success_instruction.append(success)
                    success_rate_inst = np.mean(success_instruction)
                    successes_per_descr[ind_inst] = success_rate_inst
                    print('\t Success rate {}: {}'.format(goal_str[0], success_rate_inst))
                    success_rates[ind_inst, ind_ep] = success_rate_inst

            np.savetxt(trial_folder + 'generalization_success_rates.txt', success_rates)
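
# A hypothetical driver (illustrative only) that runs the generalization study and then
# plots the adaptation curves; it assumes the same experiment folder layout as above and
# that RENDER, N_REPET, LINE and ERR are defined at module level, as in the original scripts.
def example_generalization_study(experiment_path):
    run_generalization_study(experiment_path, freq=10)   # writes generalization_success_rates.txt per trial
    plot_generalization(experiment_path, freq=10)        # writes adaptation_success_rates.pdf per trial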