def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }
    for name in ['max_episode_steps', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = EpisodeRollout(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Attach a viewer for rendering.
    gym = evaluator.envs._gym
    sim = evaluator.envs._sim
    viewer = gym.create_viewer(sim, gymapi.DEFAULT_VIEWER_WIDTH, gymapi.DEFAULT_VIEWER_HEIGHT)
    evaluator.viewer = viewer

    # Run evaluation.
    evaluator.clear_history()
    all_episodes = []
    for _ in range(n_test_rollouts):
        # Optional friction randomization (left disabled):
        # if friction_idx == len(friction_arr):
        #     friction_idx = 0
        # if _ % 5 == 0:
        #     evaluator.seed(seed)
        # friction = friction_arr[friction_idx]
        # friction_idx += 1
        # evaluator.set_physics(param=friction)
        episode = evaluator.generate_rollouts()
        all_episodes.append(episode)

    # Record logs.
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
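
# --- Hypothetical entry point (not part of the original script) ---
# A minimal argparse sketch for invoking main() above; the original repo may
# expose this via click or its own CLI instead. Flag names simply mirror the
# function signature and are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Evaluate a pickled policy.')
    parser.add_argument('--policy_file', type=str, required=True,
                        help='Path to the pickled policy to evaluate.')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_test_rollouts', type=int, default=10)
    parser.add_argument('--render', type=int, default=1)
    args = parser.parse_args()
    main(**vars(args))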
def configure_everything(rank, seed, num_cpu, env, trial_id, n_epochs, reward_function,
                         policy_encoding, feedback_strategy, policy_architecture,
                         goal_invention, reward_checkpoint, rl_positive_ratio,
                         p_partner_availability, imagination_method, git_commit='',
                         display=True):
    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Goal invention starts one epoch later than stated.
    epoch = int(goal_invention.split('_')[-1]) + 1
    goal_invention = 'from_epoch_{}'.format(epoch)

    # Prepare params.
    params = DEFAULT_CONFIG

    # Env-generating function.
    def make_env():
        return gym.make(params['conditions']['env_name'], display=display)

    # Get info from the environment and configure the dimensions dict.
    tmp_env = make_env()
    tmp_env.reset()
    params['env_params'] = tmp_env.unwrapped.params
    params['learning_params']['T'] = tmp_env._max_episode_steps
    params['learning_params']['gamma'] = 1. - 1. / params['learning_params']['T']
    params['reward_function']['n_objs'] = params['env_params']['max_nb_objects']
    params['make_env'] = make_env

    train_descriptions, test_descriptions, extra_descriptions = generate_all_descriptions(
        params['env_params'])

    # Compute imagined goals to get the list of all possible goals.
    goal_generator = SentenceGeneratorHeuristic(train_descriptions,
                                                test_descriptions,
                                                sentences=None,
                                                method='CGH')
    goal_generator.update_model(train_descriptions + test_descriptions)
    imagined_descriptions = goal_generator.generate_sentences()
    all_descriptions = train_descriptions + test_descriptions + tuple(imagined_descriptions)
    # train_descriptions, test_descriptions, all_descriptions = get_descriptions(ENV_ID)
    # assert sorted(train_descriptions) == sorted(train_descriptions_env)
    # assert sorted(test_descriptions) == sorted(test_descriptions_env)

    params.update(date_time=str(datetime.datetime.now()),
                  train_descriptions=train_descriptions,
                  test_descriptions=test_descriptions,
                  extra_descriptions=extra_descriptions,
                  all_descriptions=all_descriptions,
                  git_commit=git_commit)

    # Configure logging.
    if rank == 0:
        logdir = find_save_path('../../data/expe/' + env + "/", trial_id)
        logger.configure(dir=logdir)
        os.makedirs(logdir + 'tmp/', exist_ok=True)
        os.makedirs(logdir + 'reward_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'policy_checkpoints/', exist_ok=True)
        os.makedirs(logdir + 'goal_info/', exist_ok=True)
        if params['experiment_params']['save_obs']:
            os.makedirs(logdir + 'save_obs/', exist_ok=True)
    else:
        logdir = None
    logdir = MPI.COMM_WORLD.bcast(logdir, root=0)

    # Update condition parameters from arguments or variables defined in train.py.
    params['conditions'].update(
        env_name=env,
        policy_architecture=policy_architecture,
        reward_function=reward_function,
        goal_invention=goal_invention,
        imagination_method=imagination_method,
        feedback_strategy=feedback_strategy,
        rl_positive_ratio=rl_positive_ratio,
        reward_checkpoint=reward_checkpoint,
        policy_encoding=policy_encoding,
        p_social_partner_availability=p_partner_availability)

    # Checks.
    if params['conditions']['policy_architecture'] in ['modular_attention', 'attention']:
        error_msg = 'You need an lstm policy encoding and reward if you use {}'.format(
            params['conditions']['policy_architecture'])
        assert params['conditions']['policy_encoding'] == 'lstm', error_msg
        assert params['conditions']['reward_function'] in ['pretrained', 'learned_lstm'], error_msg
    elif params['conditions']['reward_function'] == 'oracle':
        error_msg = 'You cannot use an lstm policy encoding if you use an oracle reward'
        assert params['conditions']['policy_encoding'] != 'lstm', error_msg
        error_msg = 'You can only use a flat_concat policy architecture if you use an oracle reward'
        assert params['conditions']['policy_architecture'] == 'flat_concat', error_msg

    # Update experiment parameters from arguments or variables defined in train.py.
    params['experiment_params'].update(
        n_epochs=n_epochs,
        trial_id=trial_id,
        logdir=logdir,
        seed=seed,
        n_cpus=num_cpu,
        n_test_rollouts=len(params['train_descriptions']),
    )

    params['reward_function'].update(
        reward_positive_ratio=params['conditions']['reward_positive_ratio'])

    # Define social partner params.
    params['social_partner_params'] = dict(
        feedback_strategy=feedback_strategy,
        p_availability=p_partner_availability)

    if params['conditions']['policy_encoding'] == 'lstm':
        dim_encoding = params['reward_function']['num_hidden_lstm']
    else:
        raise NotImplementedError

    inds_objs = tmp_env.unwrapped.inds_objs  # indices of objects in the state
    for i in range(len(inds_objs)):
        inds_objs[i] = inds_objs[i].tolist()
    dims = dict(obs=tmp_env.observation_space.shape[0],
                g_encoding=dim_encoding,
                g_id=1,
                acts=tmp_env.action_space.shape[0],
                g_str=None,
                nb_obj=tmp_env.unwrapped.nb_obj,
                inds_objs=inds_objs)
    params['dims'] = dims

    # Configure learning params and interactions.
    if params['learning_params']['algo'] == 'ddpg':
        params['learning_params']['network_class'] += 'DDPG'
    else:
        raise NotImplementedError

    params['training_rollout_params'] = dict(
        exploit=False,
        use_target_net=False,
        compute_Q=False,
        eval_bool=False,
    )
    params['evaluation_rollout_params'] = dict(
        exploit=True,
        use_target_net=params['learning_params']['test_with_polyak'],
        compute_Q=True,
        eval_bool=True)

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        params['training_rollout_params'][name] = params['learning_params'][name]
        params['evaluation_rollout_params'][name] = params['learning_params'][name]
    params['evaluation_rollout_params']['rollout_batch_size'] = 1

    params['repo_path'] = REPO_PATH
    params['lstm_reward_checkpoint_path'] = REPO_PATH + '/src/data/lstm_checkpoints/{}'.format(
        params['conditions']['reward_checkpoint'])
    params['or_params_path'] = dict()
    for n_obj in [3]:
        params['or_params_path'][n_obj] = REPO_PATH + '/src/data/or_function/or_params_{}objs.pk'.format(
            n_obj)

    # Save the parameter dict.
    if rank == 0:
        json_dict = clean_dict_for_json(params)
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(json_dict, f)
        for key in sorted(params.keys()):
            logger.info('{}: {}'.format(key, params[key]))

    return params, rank_seed
def launch(env, logdir, n_epochs, seed, replay_strategy, policy_save_interval, clip_return,
           random_physics, lower_bound, upper_bound, randomise_every_n_epoch,
           override_params={}, save_policies=True, mdn_prior=None):
    now = datetime.datetime.now()
    logdir += "/" + env + "/" + str(now.strftime("%Y-%m-%d-%H:%M"))

    # Configure logging.
    if logdir or logger.get_dir() is None:
        logger.configure(dir=logdir)
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    params = set_params(env, replay_strategy, override_params)
    config.log_params(params, logger=logger)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'render': params['render'],
        'max_episode_steps': params['max_episode_steps'],
        'random_physics': random_physics,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'randomise_every_n_epoch': randomise_every_n_epoch,
    }
    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'render': params['render'],
        'max_episode_steps': params['max_episode_steps'],
        'random_physics': False,
        'rnd_phys_lower_bound': lower_bound,
        'rnd_phys_upper_bound': upper_bound,
        'randomise_every_n_epoch': randomise_every_n_epoch,
    }

    dims = config.configure_dims(params)

    her = HindisghtExperienceReplay(params['make_env'], replay_strategy, params['replay_k'])
    sample_her_transitions = her.make_sample_her_transitions()

    # Seed everything.
    rank_seed = seed + 1000000
    set_global_seeds(rank_seed)

    # DDPG agent.
    ddpg_params = params['ddpg_params']
    ddpg_params.update({
        'input_dims': dims.copy(),  # input dimensions of the agent
        'max_episode_steps': params['max_episode_steps'],
        'clip_pos_returns': True,  # clip positive returns
        'clip_return': (1. / (1. - params['gamma'])) if clip_return else np.inf,  # max abs of return
        'rollout_batch_size': params['rollout_batch_size'],
        'subtract_goals': simple_goal_subtract,
        'sample_transitions': sample_her_transitions,
        'gamma': params['gamma'],
    })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=False, **ddpg_params, use_mpi=True)

    for name in ['max_episode_steps', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = EpisodeRollout(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = EpisodeRollout(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          mdn_prior=mdn_prior)
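
# --- Hypothetical entry point (the original repo likely wires this up via click) ---
# A minimal argparse sketch for launch(); flag names mirror the function signature
# and the defaults are assumptions, not the repo's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='FetchReach-v1')
    parser.add_argument('--logdir', type=str, default='logs')
    parser.add_argument('--n_epochs', type=int, default=50)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--replay_strategy', type=str, default='future')
    parser.add_argument('--policy_save_interval', type=int, default=5)
    parser.add_argument('--clip_return', type=int, default=1)
    parser.add_argument('--random_physics', action='store_true')
    parser.add_argument('--lower_bound', type=float, default=0.5)
    parser.add_argument('--upper_bound', type=float, default=1.5)
    parser.add_argument('--randomise_every_n_epoch', type=int, default=1)
    args = parser.parse_args()
    launch(**vars(args))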
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load params (PARAMS_FILE, PATH, EPOCH and POLICY_FILE are module-level constants).
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    if not render:
        env = 'PlaygroundNavigation-v1'
    else:
        env = 'PlaygroundNavigationRender-v1'
    params, rank_seed = config.configure_everything(
        rank=0,
        seed=seed,
        num_cpu=params['experiment_params']['n_cpus'],
        env=env,
        trial_id=0,
        n_epochs=10,
        reward_function=params['conditions']['reward_function'],
        policy_encoding=params['conditions']['policy_encoding'],
        feedback_strategy=params['conditions']['feedback_strategy'],
        policy_architecture=params['conditions']['policy_architecture'],
        goal_invention=params['conditions']['goal_invention'],
        reward_checkpoint=params['conditions']['reward_checkpoint'],
        rl_positive_ratio=params['conditions']['rl_positive_ratio'],
        p_partner_availability=params['conditions']['p_social_partner_availability'],
        imagination_method=params['conditions']['imagination_method'],
        git_commit='')

    policy_language_model, reward_language_model = config.get_language_models(params)

    onehot_encoder = config.get_one_hot_encoder(params['all_descriptions'])

    # Define the goal sampler for training.
    goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                               reward_language_model=reward_language_model,
                               goal_dim=policy_language_model.goal_dim,
                               one_hot_encoder=onehot_encoder,
                               params=params)

    reward_function = config.get_reward_function(goal_sampler, params)
    if params['conditions']['reward_function'] == 'learned_lstm':
        reward_function.restore_from_checkpoint(
            PATH + 'reward_checkpoints/reward_func_checkpoint_{}'.format(EPOCH))
    policy_language_model.set_reward_function(reward_function)
    if reward_language_model is not None:
        reward_language_model.set_reward_function(reward_function)

    goal_sampler.update_discovered_goals(params['all_descriptions'], episode_count=0, epoch=0)

    # Define the learning algorithm.
    policy = config.configure_learning_algo(reward_function=reward_function,
                                            goal_sampler=goal_sampler,
                                            params=params)
    policy.load_params(POLICY_FILE)

    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                      policy=policy,
                                      reward_function=reward_function,
                                      params=params,
                                      render=render,
                                      **params['evaluation_rollout_params'])
    evaluation_worker.seed(seed)

    # Run evaluation.
    evaluation_worker.clear_history()
    env_params = evaluation_worker.env.unwrapped.params
    train_descriptions, test_descriptions, _ = generate_all_descriptions(env_params)
    # Shuffle in place (shuffling a temporary list, as before, had no effect).
    train_descriptions = list(train_descriptions)
    test_descriptions = list(test_descriptions)
    np.random.shuffle(test_descriptions)
    np.random.shuffle(train_descriptions)

    successes_test_descr = []
    for d in test_descriptions:
        successes_test_descr.append([])
        print(d)
        for i in range(n_test_rollouts):
            goal_str = [d]
            goal_encoding = [policy_language_model.encode(goal_str[0])]
            goal_id = [0]
            ep = evaluation_worker.generate_rollouts(exploit=True,
                                                     imagined=False,
                                                     goals_str=goal_str,
                                                     goals_encodings=goal_encoding,
                                                     goals_ids=goal_id)
            out = get_reward_from_state(ep[0]['obs'][-1], goal_str[0], env_params)
            successes_test_descr[-1].append(out == 1)
        print('Success rate {}: {}'.format(d, np.mean(successes_test_descr[-1])))
    print('Global success rate: {}'.format(np.mean(successes_test_descr)))
def plot_generalization(path, freq=10):
    first = True
    trial_folder = path
    for trial in os.listdir(path):
        print(trial)
        # if os.path.exists(path + '/' + trial + '/adaptation_success_rates_food.txt'):
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params.
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)
        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        env_id = params['conditions']['env_id']

        if 'plant' not in env_id:
            test_plants = plants.copy() + ['plant', 'living_thing']
            test_plants.remove('flower')
            test_descriptions = ['Grow {} {}'.format(c, p)
                                 for c in thing_colors + ['any']
                                 for p in test_plants]
        else:
            if 'big' in env_id:
                test_plants = ['algae', 'bonsai', 'tree', 'bush', 'plant', 'living_thing']
            else:
                test_plants = ['tree', 'bush', 'plant', 'living_thing']
            test_descriptions = ['Grow {} {}'.format(c, p)
                                 for c in thing_colors + ['any']
                                 for p in test_plants]

        first_epoch = True
        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(
                rank=rank,
                seed=seed,
                num_cpu=params['experiment_params']['n_cpus'],
                env=env,
                trial_id=0,
                n_epochs=10,
                reward_function=params['conditions']['reward_function'],
                curriculum_replay_target=params['conditions']['curriculum_replay_target'],
                curriculum_target=params['conditions']['curriculum_target'],
                policy_encoding=params['conditions']['policy_encoding'],
                bias_buffer=params['conditions']['bias_buffer'],
                feedback_strategy=params['conditions']['feedback_strategy'],
                goal_sampling_policy=params['conditions']['goal_sampling_policy'],
                policy_architecture=params['conditions']['policy_architecture'],
                goal_invention=params['conditions']['goal_invention'],
                reward_checkpoint=params['conditions']['reward_checkpoint'],
                rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                p_partner_availability=params['conditions']['p_social_partner_availability'],
                power_rarity=2,
                git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(params)

            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                                       reward_language_model=reward_language_model,
                                       goal_dim=policy_language_model.goal_dim,
                                       one_hot_encoder=onehot_encoder,
                                       **params['goal_sampler'],
                                       params=params)
            reward_function = config.get_reward_function(goal_sampler, params)
        else:
            def make_env():
                return gym.make(params['conditions']['env_name'])
            params['make_env'] = make_env

        # Load policy.
        success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        for ind_ep, epoch in enumerate(epochs):
            print('\n\n\t\t EPOCH', epoch)
            if first:
                first = False
                reuse = False
            else:
                reuse = True

            if params['conditions']['reward_function'] == 'learned_lstm':
                reward_function.restore_from_checkpoint(
                    trial_folder + 'reward_checkpoints/reward_func_checkpoint_{}'.format(epoch))
            policy_language_model.set_reward_function(reward_function)
            if reward_language_model is not None:
                reward_language_model.set_reward_function(reward_function)

            goal_sampler.update_discovered_goals(params['all_descriptions'],
                                                 episode_count=0,
                                                 epoch=0)

            with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                with open(policy_folder + 'policy_{}.pkl'.format(epoch), 'rb') as f:
                    policy = pickle.load(f)

                evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                                  policy=policy,
                                                  reward_function=reward_function,
                                                  params=params,
                                                  render=RENDER,
                                                  **params['evaluation_rollout_params'])
                evaluation_worker.seed(seed)

                # Run evaluation.
                evaluation_worker.clear_history()
                successes_per_descr = np.zeros([len(test_descriptions), 2])
                for ind_inst, instruction in enumerate(test_descriptions):
                    success_instruction = []
                    goal_str = [instruction]
                    goal_encoding = [policy_language_model.encode(goal_str[0])]
                    goal_id = [0]
                    for i in range(N_REPET):
                        ep = evaluation_worker.generate_rollouts(exploit=True,
                                                                 imagined=False,
                                                                 goals_str=goal_str,
                                                                 goals_encodings=goal_encoding,
                                                                 goals_ids=goal_id)
                        # Check whether food / water ever touched the furniture during the episode.
                        for t in range(ep[0]['obs'].shape[0]):
                            metric_food = food_on_furniture(ep[0]['obs'][t], goal_str[0])
                            if metric_food:
                                break
                        for t in range(ep[0]['obs'].shape[0]):
                            metric_water = water_on_furniture(ep[0]['obs'][t], goal_str[0])
                            if metric_water:
                                break
                        success_instruction.append([metric_food, metric_water])
                    success_instruction = np.array(success_instruction)
                    success_rate_inst = np.mean(success_instruction, axis=0)
                    successes_per_descr[ind_inst] = success_rate_inst
                    print('\t Success rate {}: food {}, water {}'.format(
                        goal_str[0], success_rate_inst[0], success_rate_inst[1]))
                    success_rates[ind_inst, ind_ep, :] = success_rate_inst

        np.savetxt(trial_folder + 'adaptation_success_rates_water.txt', success_rates[:, :, 1])
        np.savetxt(trial_folder + 'adaptation_success_rates_food.txt', success_rates[:, :, 0])
        # To reload previously saved rates instead of re-running the rollouts:
        # success_rates = np.zeros([len(test_descriptions), len(epochs), 2])
        # success_rates[:, :, 0] = np.loadtxt(trial_folder + 'adaptation_success_rates_food.txt')
        # success_rates[:, :, 1] = np.loadtxt(trial_folder + 'adaptation_success_rates_water.txt')

        line, err_min, err_max = get_stat_func(LINE, ERR)

        # Plot.
        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        for i in range(2):
            plt.plot(np.array(episodes) / 1000, line(success_rates)[:, i],
                     linewidth=10, color=colors[i])
            plt.fill_between(np.array(episodes) / 1000,
                             err_min(success_rates)[:, i],
                             err_max(success_rates)[:, i],
                             color=colors[i], alpha=0.2)
        # plt.vlines(goal_invention * 0.6, ymin=0, ymax=1, linestyles='--', color='k', linewidth=5)
        leg = plt.legend(['food', 'water'], frameon=False)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'adaptation_success_rates.pdf'),
                    bbox_extra_artists=(lab, lab2, leg),
                    bbox_inches='tight',
                    dpi=50)
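
# --- Hypothetical helper sketch (the real get_stat_func lives elsewhere in the repo) ---
# From the usage above, get_stat_func(LINE, ERR) is assumed to return three callables
# that aggregate per-description success rates over axis 0: a central-tendency line
# plus lower/upper error envelopes. A minimal sketch consistent with that usage:
def get_stat_func(line='mean', err='std'):
    if line == 'mean':
        line_f = lambda a: np.nanmean(a, axis=0)
    elif line == 'median':
        line_f = lambda a: np.nanmedian(a, axis=0)
    else:
        raise NotImplementedError
    if err == 'std':
        err_min = lambda a: line_f(a) - np.nanstd(a, axis=0)
        err_max = lambda a: line_f(a) + np.nanstd(a, axis=0)
    elif err == 'sem':  # standard error of the mean
        err_min = lambda a: line_f(a) - np.nanstd(a, axis=0) / np.sqrt(a.shape[0])
        err_max = lambda a: line_f(a) + np.nanstd(a, axis=0) / np.sqrt(a.shape[0])
    else:
        raise NotImplementedError
    return line_f, err_min, err_max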
def run_generalization_study(path, freq=10):
    first = True
    for t_id, trial in enumerate(os.listdir(path)):
        print(trial)
        t_init = time.time()
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params.
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)
        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        test_descriptions = params['test_descriptions']

        rank = 0
        if first:
            if not RENDER:
                env = 'PlaygroundNavigation-v1'
            else:
                env = 'PlaygroundNavigationRender-v1'
            params, rank_seed = config.configure_everything(
                rank=rank,
                seed=seed,
                num_cpu=params['experiment_params']['n_cpus'],
                env=env,
                trial_id=0,
                n_epochs=10,
                reward_function=params['conditions']['reward_function'],
                policy_encoding=params['conditions']['policy_encoding'],
                bias_buffer=params['conditions']['bias_buffer'],
                feedback_strategy=params['conditions']['feedback_strategy'],
                policy_architecture=params['conditions']['policy_architecture'],
                goal_invention=params['conditions']['goal_invention'],
                reward_checkpoint=params['conditions']['reward_checkpoint'],
                rl_positive_ratio=params['conditions']['rl_positive_ratio'],
                p_partner_availability=params['conditions']['p_social_partner_availability'],
                git_commit='')

            policy_language_model, reward_language_model = config.get_language_models(params)

            onehot_encoder = config.get_one_hot_encoder()
            goal_sampler = GoalSampler(policy_language_model=policy_language_model,
                                       reward_language_model=reward_language_model,
                                       goal_dim=policy_language_model.goal_dim,
                                       one_hot_encoder=onehot_encoder,
                                       **params['goal_sampler'],
                                       params=params)
            reward_function = config.get_reward_function(goal_sampler, params)
        else:
            def make_env():
                return gym.make(params['conditions']['env_name'])
            params['make_env'] = make_env

        loaded = False
        success_rates = np.zeros([len(test_descriptions), len(epochs)])
        if params['conditions']['reward_function'] == 'pretrained':
            reward_function.load_params(trial_folder + 'params_reward')

        if not loaded:
            # Load policy.
            t_init = time.time()
            for ind_ep, epoch in enumerate(epochs):
                print(time.time() - t_init)
                t_init = time.time()
                print('\n\n\t\t EPOCH', epoch)
                if first:
                    first = False
                    reuse = False
                else:
                    reuse = True

                if params['conditions']['reward_function'] == 'learned_lstm':
                    reward_function.restore_from_checkpoint(
                        trial_folder + 'reward_checkpoints/reward_func_checkpoint_{}'.format(epoch))
                policy_language_model.set_reward_function(reward_function)
                if reward_language_model is not None:
                    reward_language_model.set_reward_function(reward_function)

                goal_sampler.update_discovered_goals(params['all_descriptions'],
                                                     episode_count=0,
                                                     epoch=0)

                with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                    with open(policy_folder + 'policy_{}.pkl'.format(epoch), 'rb') as f:
                        policy = pickle.load(f)

                    evaluation_worker = RolloutWorker(make_env=params['make_env'],
                                                      policy=policy,
                                                      reward_function=reward_function,
                                                      params=params,
                                                      render=RENDER,
                                                      **params['evaluation_rollout_params'])
                    evaluation_worker.seed(seed)

                    # Run evaluation.
                    evaluation_worker.clear_history()
                    successes_per_descr = np.zeros([len(test_descriptions)])
                    for ind_inst, instruction in enumerate(test_descriptions):
                        success_instruction = []
                        goal_str = [instruction]
                        goal_encoding = [policy_language_model.encode(goal_str[0])]
                        goal_id = [0]
                        for i in range(N_REPET):
                            ep = evaluation_worker.generate_rollouts(exploit=True,
                                                                     imagined=False,
                                                                     goals_str=goal_str,
                                                                     goals_encodings=goal_encoding,
                                                                     goals_ids=goal_id)
                            success = get_reward_from_state(state=ep[0]['obs'][-1],
                                                            goal=instruction)
                            success_instruction.append(success)
                        success_rate_inst = np.mean(success_instruction)
                        successes_per_descr[ind_inst] = success_rate_inst
                        print('\t Success rate {}: {}'.format(goal_str[0], success_rate_inst))
                        success_rates[ind_inst, ind_ep] = success_rate_inst

            np.savetxt(trial_folder + 'generalization_success_rates.txt', success_rates)
def plot_generalization(path, freq):
    for trial in os.listdir(path):
        print(trial)
        t_init = time.time()
        trial_folder = path + '/' + trial + '/'
        policy_folder = trial_folder + 'policy_checkpoints/'
        params_file = trial_folder + 'params.json'

        data = pd.read_csv(os.path.join(trial_folder, 'progress.csv'))
        all_epochs = data['epoch']
        all_episodes = data['episode']
        epochs = []
        episodes = []
        for epoch, episode in zip(all_epochs, all_episodes):
            if epoch % freq == 0:
                epochs.append(epoch)
                episodes.append(int(episode))

        # Load params.
        with open(params_file) as json_file:
            params = json.load(json_file)
        seed = params['experiment_params']['seed']
        set_global_seeds(seed)
        goal_invention = int(params['conditions']['goal_invention'].split('_')[-1])
        test_descriptions = params['test_descriptions']

        success_rates = np.loadtxt(path + '/' + trial + '/generalization_success_rates.txt')
        line, err_min, err_max = get_stat_func(LINE, ERR)
        first = False

        # Plot success rates averaged over all test descriptions.
        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        plt.plot(np.array(episodes) / 1000, line(success_rates), linewidth=10)
        plt.fill_between(np.array(episodes) / 1000,
                         err_min(success_rates), err_max(success_rates), alpha=0.2)
        if goal_invention < 100:
            plt.vlines(goal_invention * 0.6, ymin=0, ymax=1,
                       linestyles='--', color='k', linewidth=5)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        plt.savefig(os.path.join(trial_folder, 'generalization_test_set_policy.pdf'),
                    bbox_extra_artists=(lab, lab2),
                    bbox_inches='tight',
                    dpi=50)  # add leg

        # Plot per group of description types.
        inds_per_types = []
        descr_per_type = []
        for i_type, type in enumerate(types_words):
            inds_per_types.append([])
            descr_per_type.append([])
            for i_d, descr in enumerate(test_descriptions):
                for type_w in type:
                    if type_w in descr:
                        inds_per_types[-1].append(i_d)
                        descr_per_type[-1].append(descr)
            inds_per_types[-1] = np.array(inds_per_types[-1])
        for i in range(len(type_legends)):
            print('Type {}:'.format(i + 1), descr_per_type[i])

        fig = plt.figure(figsize=(22, 15), frameon=False)
        ax = fig.add_subplot(111)
        ax.spines['top'].set_linewidth(6)
        ax.spines['right'].set_linewidth(6)
        ax.spines['bottom'].set_linewidth(6)
        ax.spines['left'].set_linewidth(6)
        ax.tick_params(width=4, direction='in', length=10, labelsize='small')
        for i in range(len(types_words)):
            to_plot = success_rates[np.array(inds_per_types[i]), :]
            plt.plot(np.array(episodes) / 1000, line(to_plot), linewidth=8, c=colors[i])
            plt.fill_between(np.array(episodes) / 1000,
                             err_min(to_plot), err_max(to_plot),
                             color=colors[i], alpha=0.2)
        if goal_invention < 100:
            plt.vlines(goal_invention * 0.6, ymin=0, ymax=1,
                       linestyles='--', color='k', linewidth=5)
        leg = plt.legend(type_legends, frameon=False)
        lab = plt.xlabel('Episodes (x$10^3$)')
        plt.ylim([-0.01, 1.01])
        plt.yticks([0.25, 0.50, 0.75, 1])
        lab2 = plt.ylabel('Average success rate')
        # Include the legend in the tight bounding box (implements the old "add leg" TODO).
        plt.savefig(os.path.join(trial_folder, 'generalization_test_set_policy_per_type.pdf'),
                    bbox_extra_artists=(lab, lab2, leg),
                    bbox_inches='tight',
                    dpi=50)
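
# --- Hypothetical driver (not in the original file) ---
# A minimal sketch of how the two functions above might be chained; it assumes the
# module-level constants used by them (RENDER, N_REPET, LINE, ERR, etc.) are defined
# elsewhere in the module, and the experiment directory below is a placeholder.
if __name__ == '__main__':
    study_path = '../../data/expe/PlaygroundNavigation-v1'  # hypothetical experiment dir
    run_generalization_study(study_path, freq=10)  # roll out checkpoints, save success rates
    plot_generalization(study_path, freq=10)       # reload the saved rates and plot them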