def find_all_feasible_reject_states( env, distance_threshold=0.1, ): # test reject see how many are feasible uniform_state_generator = UniformStateGenerator( state_size=len(env.current_start), bounds=env.start_generator.bounds) any_starts = StateCollection(distance_threshold=distance_threshold) k = 0 while any_starts.size < 1e6: state = uniform_state_generator.update() obs = env.reset(init_state=state) action = np.zeros(env.action_dim) next_obs, _, done, env_info = env.step(action) if not np.linalg.norm(next_obs - obs) == 0: print("CONTACT! obs changed:", obs, next_obs) elif done and not env_info['gaol_reached']: print("outside range") else: any_starts.append(state) print("any_starts: ", any_starts.size, " out of ", k) k += 1
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) tf_session = tf.Session() inner_env = normalize(AntMazeEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) # GAN logger.log("Instantiating the GAN...") gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} for key, value in gan_configs.items(): if value is tf.train.AdamOptimizer: gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize']) if value is tflearn.initializations.truncated_normal: gan_configs[key] = tflearn.initializations.truncated_normal(stddev=gan_configs[key + '_stddev']) gan = StateGAN( state_size=v['goal_size'], evaluater_size=v['num_labels'], state_range=v['goal_range'], state_center=v['goal_center'], state_noise_level=v['goal_noise_level'], generator_layers=v['gan_generator_layers'], discriminator_layers=v['gan_discriminator_layers'], noise_size=v['gan_noise_size'], tf_session=tf_session, configs=gan_configs, ) logger.log("pretraining the GAN...") if v['smart_init']: feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32) # make them all good goals plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) dis_loss, gen_loss = gan.pretrain(states=feasible_goals, outer_iters=v['gan_outer_iters']) print("Loss of Gen and Dis: ", gen_loss, dis_loss) else: gan.pretrain_uniform() # log first samples form the GAN initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) logger.log("Labeling the goals") labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) # Sample GAN logger.log("Sampling goals from the GAN") raw_goals, _ = 
gan.sample_states_with_noise(v['num_new_goals']) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample(v['num_old_goals']) goals = np.vstack([raw_goals, old_goals]) else: goals = raw_goals # if needed label the goals before any update if v['label_with_variation']: old_labels, old_rewards = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=False, return_rew=True) # itr_label = outer_iter # use outer_iter to log everything or "last" to log only the last # with ExperimentLogger(log_dir, itr_label, snapshot_mode='last', hold_outter_log=True): with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] elif v['label_with_variation']: labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', old_rewards=old_rewards, full_path=True) else: logger.log("labeling starts manually") labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) #logger.log("Labeling the goals") #labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) if v['label_with_variation']: # this will use only the performance variation for labeling labels = np.array(labels[:, -1], dtype=int).reshape((-1, 1)) else: labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.log("Training the GAN") gan.train( goals, labels, v['gan_outer_iters'], ) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! 
filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1] all_goals.append(filtered_raw_goals) if v['add_on_policy']: logger.log("sampling on policy") feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:] all_goals.append(feasible_goals)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] samples_per_cell = 10 # for the oracle rejection sampling # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v['constant_baseline']: logger.log("Using constant baseline") baseline = ConstantBaseline(env_spec=env.spec, value=1.0) else: logger.log("Using linear baseline") baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) # use goal for plot report.new_row() all_starts = StateCollection(distance_threshold=v['coll_eps']) seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], horizon=v['brownian_horizon'], variance=v['brownian_variance']) labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.save() if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], 
max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=False, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) logger.log("Labeling the starts") #labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) start_classes, text_labels = convert_label(labels) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.dump_tabular(with_prefix=False) report.new_row() # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] all_starts.append(filtered_raw_starts) if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: # add a tone of noise if all the states I had ended up being high_reward! seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum(start_classes == 1): # if more low reward than high reward seed_starts = all_starts.sample(300) # sample them from the replay else: seed_starts = generate_starts(env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts elif v['seed_with'] == 'on_policy': seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts'])
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] samples_per_cell = 10 # for the oracle rejection sampling logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, goal_generator=fixed_goal_generator, obs2start_transform=lambda x: x[:v['start_size']], obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['start_range'], center=v['start_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['start_center'], limit=v['start_range']) report.new_row() all_starts = StateCollection(distance_threshold=v['coll_eps']) total_rollouts = 0 for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") starts = np.array([]).reshape((-1, v['start_size'])) k = 0 while starts.shape[0] < v['num_new_starts']: print('good starts collected: ', starts.shape[0]) logger.log("Sampling and labeling the starts: %d" % k) k += 1 unif_starts = sample_unif_feas(env, samples_per_cell=samples_per_cell) if v['start_size'] > 2: unif_starts = np.array([np.concatenate([start, np.random.uniform(-v['start_range'], v['start_range'], 2)]) for start in unif_starts]) labels = label_states(unif_starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') # plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['start_range'], # center=v['start_center'], maze_id=v['maze_id']) logger.log("Converting the labels") init_classes, text_labels = convert_label(labels) starts = np.concatenate([starts, unif_starts[init_classes == 2]]).reshape((-1, v['start_size'])) if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) # report.new_row() with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) 
logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], gae_lambda=v['gae_lambda'], plot=False, ) algo.train() logger.log('Generating the Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['start_range'], center=v['start_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) logger.log("Labeling the starts") labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # rollouts used for labeling (before TRPO itrs): num_empty_spaces = len(unwrap_maze(env).find_empty_space()) logger.record_tabular('LabelingRollouts', k * v['n_traj'] * samples_per_cell * num_empty_spaces) total_rollouts += k * v['n_traj'] * samples_per_cell * num_empty_spaces logger.record_tabular('TotalLabelingRollouts', total_rollouts) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] all_starts.append(filtered_raw_starts)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] unif_samples = 300 # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) total_rollouts = 0 for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling goals") goals = np.array([]).reshape((-1, v['goal_size'])) k = 0 while goals.shape[0] < v['num_new_goals']: print('good goals collected: ', goals.shape[0]) logger.log("Sampling and labeling the goals: %d" % k) k += 1 unif_goals = np.random.uniform( np.array(v['goal_center']) - np.array(v['goal_range']), np.array(v['goal_center']) + np.array(v['goal_range']), size=(unif_samples, v['goal_size'])) labels = label_states(unif_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') logger.log("Converting the labels") init_classes, text_labels = convert_label(labels) goals = np.concatenate([goals, unif_goals[init_classes == 2]]).reshape( (-1, v['goal_size'])) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample( v['num_old_goals']) #todo: replay noise? 
goals = np.vstack([goals, old_goals]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # rollouts used for labeling (before TRPO itrs): logger.record_tabular('LabelingRollouts', k * v['n_traj'] * unif_samples) total_rollouts += k * v['n_traj'] * unif_samples logger.record_tabular('TotalLabelingRollouts', total_rollouts) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [ goal for goal, label in zip(goals, labels) if label[0] == 1 ] all_goals.append(filtered_raw_goals)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) logger.log('Saving to report') report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) # Use asymmetric self-play to run Alice to generate starts for Bob. # Use a double horizon because the horizon is shared between Alice and Bob. env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'], alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold'], start_generation=False) policy_alice = GaussianMLPPolicy( env_spec=env_alice.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! 
output_gain=v['output_gain_alice'], init_std=v['policy_init_std_alice'], ) baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec) if v['baseline'] == 'g_mlp': baseline_alice = GaussianMLPBaseline(env_spec=env_alice.spec) algo_alice = TRPO( env=env_alice, policy=policy_alice, baseline=baseline_alice, batch_size=v['pg_batch_size_alice'], max_path_length=v['alice_horizon'], n_itr=v['inner_iters_alice'], step_size=0.01, plot=False, ) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) raw_goals, t_alices = generate_states_alice(env_alice=env_alice, algo_alice=algo_alice, num_new_states=v['num_new_goals'], log_dir=log_dir, start_generation=False) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample(v['num_old_goals']) goals = np.vstack([raw_goals, old_goals]) else: goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) all_paths = algo.train() [goals, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached') with logger.tabular_prefix('Outer_'): logger.record_tabular('t_alices', np.mean(t_alices)) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) # logger.log("Labeling the goals") # labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1] all_goals.append(filtered_raw_goals) if v['add_on_policy']: logger.log("sampling on policy") feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:] all_goals.append(feasible_goals)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # goal generators logger.log("Initializing the goal generators and the inner env...") inner_env = normalize( PointEnv(dim=v['goal_size'], state_bounds=v['state_bounds'])) # inner_env = normalize(PendulumEnv()) center = np.zeros(v['goal_size']) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=center) feasible_goal_ub = np.array(v['state_bounds'])[:v['goal_size']] # print("the feasible_goal_ub is: ", feasible_goal_ub) uniform_feasible_goal_generator = UniformStateGenerator( state_size=v['goal_size'], bounds=[-1 * feasible_goal_ub, feasible_goal_ub]) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[:int(len(x) / 2)], terminal_eps=v['terminal_eps'], only_feasible=v['only_feasible'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], terminate_env=True, goal_weight=v['goal_weight'], ) # this goal_generator will be updated by a uniform after policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(32, 32), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) n_traj = 3 logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() if log_dir is None: log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug" debug = True else: debug = False inner_log_dir = osp.join(log_dir, 'inner_iters') report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) # img = plot_policy_reward( # policy, env, v['goal_range'], # horizon=v['horizon'], grid_size=5, # fname='{}/policy_reward_init.png'.format(log_dir), # ) # report.add_image(img, 'policy performance initialization\n') # GAN # logger.log("Instantiating the GAN...") # tf_session = tf.Session() # gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} # # gan = StateGAN( # state_size=v['goal_size'], # evaluater_size=v['num_labels'], # state_range=v['goal_range'], # state_noise_level=v['goal_noise_level'], # generator_layers=v['gan_generator_layers'], # discriminator_layers=v['gan_discriminator_layers'], # noise_size=v['gan_noise_size'], # tf_session=tf_session, # configs=gan_configs, # ) # final_gen_loss = 11 # k = -1 # while final_gen_loss > 10: # k += 1 # gan.gan.initialize() # img = plot_gan_samples(gan, v['goal_range'], '{}/start.png'.format(log_dir)) # report.add_image(img, 'GAN re-initialized %i' % k) # logger.log("pretraining the GAN...") # if v['smart_init']: # initial_goals = generate_initial_goals(env, policy, v['goal_range'], horizon=v['horizon']) # if np.size(initial_goals[0]) == 2: # plt.figure() # plt.scatter(initial_goals[:, 0], initial_goals[:, 1], marker='x') # plt.xlim(-v['goal_range'], v['goal_range']) # plt.ylim(-v['goal_range'], v['goal_range']) # img = save_image() # report.add_image(img, 'goals sampled to pretrain GAN: {}'.format(np.shape(initial_goals))) # dis_loss, gen_loss = gan.pretrain( # initial_goals, outer_iters=30 # # initial_goals, outer_iters=30, generator_iters=10, discriminator_iters=200, # ) # final_gen_loss = gen_loss # logger.log("Loss at the end of {}th trial: {}gen, {}disc".format(k, gen_loss, dis_loss)) # 
else: # gan.pretrain_uniform() # final_gen_loss = 0 # logger.log("Plotting GAN samples") # img = plot_gan_samples(gan, v['goal_range'], '{}/start.png'.format(log_dir)) # report.add_image(img, 'GAN pretrained %i: %i gen_itr, %i disc_itr' % (k, 10 + k, 200 - k * 10)) # # report.add_image(img, 'GAN pretrained %i: %i gen_itr, %i disc_itr' % (k, 10, 200)) # report.save() # report.new_row() sagg_riac = SaggRIAC( state_size=v['goal_size'], state_range=v['goal_range'], #state_center=v['goal_center'], max_goals=v['max_goals']) all_goals = StateCollection(v['coll_eps']) logger.log("Starting the outer iterations") for outer_iter in range(v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) # Train GAN logger.log("Sampling goals...") #raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals']) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: # sampler uniformly 2000 old goals and add them to the training pool (50/50) old_goals = all_goals.sample(v['num_old_goals']) # print("old_goals: {}, raw_goals: {}".format(old_goals, raw_goals)) goals = np.vstack([raw_goals, old_goals]) else: # print("no goals in all_goals: sample fresh ones") goals = raw_goals logger.log("Perform TRPO with UniformListStateGenerator...") with ExperimentLogger(inner_log_dir, '_last', snapshot_mode='last', hold_outter_log=True): # set goal generator to uniformly sample from selected all_goals env.update_goal_generator(UniformListStateGenerator(goals)) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], discount=0.995, step_size=0.01, plot=False, ) all_paths = algo.train() [goals_with_labels, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached') if v['use_competence_ratio']: [goals, rewards ] = compute_rewards_from_paths(all_paths, key='competence', as_goal=True, env=env, terminal_eps=v['terminal_eps']) else: [goals, rewards] = compute_rewards_from_paths(all_paths, key='rewards', as_goal=True, env=env) # logger.log("Plot performance policy on full grid...") # img = plot_policy_reward( # policy, env, v['goal_range'], # horizon=v['horizon'], # max_reward=v['max_reward'], # grid_size=10, # # fname='{}/policy_reward_{}.png'.format(log_config.plot_dir, outer_iter), # ) # report.add_image(img, 'policy performance\n itr: {}'.format(outer_iter)) # report.save() # report.add_image( # plot_generator_samples(gan, env), 'policy_rewards_{}'.format(outer_iter) # ) report.add_image(plot_policy_performance(policy, env, v['horizon']), 'gan_samples_{}'.format(outer_iter)) goal_classes, text_labels = convert_label(labels) total_goals = labels.shape[0] goal_class_frac = OrderedDict( ) # this needs to be an ordered dict!! 
(for the log tabular) for k in text_labels.keys(): frac = np.sum(goal_classes == k) / total_goals logger.record_tabular('GenGoal_frac_' + text_labels[k], frac) goal_class_frac[text_labels[k]] = frac img = plot_labeled_samples( samples=goals_with_labels, sample_classes=goal_classes, text_labels=text_labels, limit=v['goal_range'] + 1, bounds=env.feasible_goal_space.bounds, # '{}/sampled_goals_{}.png'.format(log_dir, outer_iter), # if i don't give the file it doesn't save ) summary_string = '' for key, value in goal_class_frac.items(): summary_string += key + ' frac: ' + str(value) + '\n' report.add_image(img, 'itr: {}\nLabels of generated goals:\n{}'.format( outer_iter, summary_string), width=500) # log feasibility of generated goals # feasible = np.array([1 if env.feasible_goal_space.contains(goal) else 0 for goal in goals], dtype=int) # feasibility_rate = np.mean(feasible) # logger.record_tabular('GenGoalFeasibilityRate', feasibility_rate) # img = plot_labeled_samples( # samples=goals, sample_classes=feasible, text_labels={0: 'Infeasible', 1: "Feasible"}, # markers={0: 'v', 1: 'o'}, limit=v['goal_range'] + 1, bounds=env.feasible_goal_space.bounds, # # '{}/sampled_goals_{}.png'.format(log_dir, outer_iter), # if i don't give the file it doesn't save # ) # report.add_image(img, 'feasibility of generated goals: {}\n itr: {}'.format(feasibility_rate, outer_iter), # width=500) ###### try single label for good goals if v['num_labels'] == 1: labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # logger.log("Training GAN...") # gan.train( # goals, labels, # v['gan_outer_iters'], # ) logger.log("Updating SAGG-RIAC") sagg_riac.add_states(goals, rewards) # Find final states "accidentally" reached by the agent. # final_goals = compute_final_states_from_paths(all_paths, as_goal=True, env=env) # sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew']) logger.log("Evaluating performance on Unif and Fix Goal Gen...") with logger.tabular_prefix('UnifFeasGoalGen_'): env.update_goal_generator(uniform_feasible_goal_generator) evaluate_goal_env(env, policy=policy, horizon=v['horizon'], n_goals=50, fig_prefix='UnifFeasGoalGen_', report=report, n_traj=n_traj) logger.dump_tabular(with_prefix=False) report.save() report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [ goal for goal, label in zip(goals, labels) if label[0] == 1 ] all_goals.append(filtered_raw_goals)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v["baseline"] == "MLP": baseline = GaussianMLPBaseline(env_spec=env.spec) else: baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 all_starts = StateCollection(distance_threshold=v['coll_eps']) # seed_starts: from which we will be performing brownian motion exploration seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts']) def plot_states(states, report, itr, summary_string, **kwargs): states = np.array(states) if states.size == 0: states = np.zeros((1, 2)) img = plot_labeled_samples( states, np.zeros(len(states), dtype='uint8'), markers={0: 'o'}, text_labels={0: "all"}, **kwargs) report.add_image(img, 'itr: {}\n{}'.format(itr, summary_string), width=500) for outer_iter in range(1, v['outer_iters']): report.new_row() logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") plot_states( seed_starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string="seed starts") starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], horizon=v['brownian_horizon'], variance=v['brownian_variance']) plot_states( starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string="brownian starts") sampled_from_buffer = [] if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: sampled_from_buffer = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, sampled_from_buffer]) plot_states( sampled_from_buffer, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string="states sampled from buffer") labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='all starts before update\n') with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the 
environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', as_goal=False, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states( starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) start_classes, text_labels = convert_label(labels) plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base="all starts after update\n") with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] all_starts.append(filtered_raw_starts) if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: logger.log("Only goods A") seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum(start_classes == 1): # if more low reward than high reward logger.log("Only goods B") seed_starts = all_starts.sample(300) # sample them from the replay else: logger.log("Only goods C") # add a ton of noise if all the states I had ended up being high_reward seed_starts = generate_starts( env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts elif v['seed_with'] == 'on_policy': seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts']) logger.log('Generating Heatmap...') plot_policy_means( policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center']) _, _, states, returns, successes = test_and_plot_policy2( policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) eval_state_path = osp.join(log_dir, "eval_states.json") if not osp.exists(eval_state_path): with open(eval_state_path, 'w') as f: json.dump(np.array(states).tolist(), f) with open(osp.join(log_dir, 'eval_pos_per_state_mean_return.csv'), 'a') as f: writer = csv.writer(f) row = [outer_iter] + list(returns) writer.writerow(row) with open(osp.join(log_dir, 'eval_pos_per_state_mean_success.csv'), 'a') as f: writer = csv.writer(f) row = [outer_iter] + list(successes) writer.writerow(row) logger.dump_tabular() report.save() if outer_iter == 1 or outer_iter % 5 == 0 and v.get('scratch_dir', False): command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], '')) print("Running command:\n{}".format(command)) subprocess.run(command.split(), check=True) if v.get('scratch_dir', False): command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], '')) 
print("Running command:\n{}".format(command)) subprocess.run(command.split(), check=True)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntEnv( com_bound=v['goal_range'])) # todo: this does not take in goal_center! uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], goal_weight=v['goal_weight'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) report.new_row() sagg_riac = SaggRIAC(state_size=v['goal_size'], state_range=v['goal_range'], state_center=v['goal_center'], max_goals=v['max_goals'], max_history=v['max_history']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals']) goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals, persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) all_paths = algo.train() if v['use_competence_ratio']: [goals, rewards ] = compute_rewards_from_paths(all_paths, key='competence', as_goal=True, env=env, terminal_eps=v['terminal_eps']) else: [goals, rewards] = compute_rewards_from_paths(all_paths, key='rewards', as_goal=True, env=env) [goals_with_labels, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals_with_labels, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) report.save() logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) sagg_riac.plot_regions_interest(report=report) 
sagg_riac.plot_regions_states(report=report) logger.log("Updating SAGG-RIAC") sagg_riac.add_states(goals, rewards) # Find final states "accidentally" reached by the agent. final_goals = compute_final_states_from_paths(all_paths, as_goal=True, env=env) sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew']) logger.dump_tabular(with_prefix=False) report.new_row()
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntMazeEnv(maze_id=v['maze_id'])) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) report.new_row() for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling goals from the GAN") goals = np.random.uniform( np.array(v['goal_center']) - np.array(v['goal_range']), np.array(v['goal_center']) + np.array(v['goal_range']), size=(300, v['goal_size'])) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") if v['unif_goals']: env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) else: env.update_goal_generator(FixedStateGenerator(v['final_goal'])) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) algo.train() logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) logger.log("Labeling the goals") labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) logger.dump_tabular(with_prefix=False) report.new_row()
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntMazeEnv(maze_id=v['maze_id']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])
    report.new_row()

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='On-policy Goals:\n')

        if v['only_on_policy']:
            goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['num_new_goals'], replace=False), :]
        else:
            logger.log("Training the GAN")
            gan.pretrain(feasible_goals, v['gan_outer_iters'])

            # Sample GAN
            logger.log("Sampling goals from the GAN")
            raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

            if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
                old_goals = all_goals.sample(v['num_old_goals'])
                goals = np.vstack([raw_goals, old_goals])
            else:
                goals = raw_goals

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            algo.train()

        logger.log("Labeling the goals")
        labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'],
                                                    horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']), :]
            all_goals.append(feasible_goals)
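# --- A minimal, numpy-only sketch of the replay-buffer step in the loop above. These helpers
# --- are assumptions for illustration (not part of the codebase): fresh GAN samples are stacked
# --- with a random subset of previously stored goals, and only goals whose first label column
# --- is 1 are kept for future replay.
import numpy as np

def mix_goals_with_replay(raw_goals, old_goal_buffer, num_old_goals, rng=np.random):
    """Stack freshly sampled goals with a random subset of previously stored goals."""
    if len(old_goal_buffer) == 0:
        return np.asarray(raw_goals)
    idx = rng.choice(len(old_goal_buffer), size=min(num_old_goals, len(old_goal_buffer)), replace=False)
    return np.vstack([raw_goals, np.asarray(old_goal_buffer)[idx]])

def keep_successful_goals(goals, labels):
    """Keep only the goals whose first label column is 1 (the ones worth replaying later)."""
    return [goal for goal, label in zip(goals, labels) if label[0] == 1]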
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # goal generators
    logger.log("Initializing the goal generators and the inner env...")
    inner_env = normalize(PointEnv(dim=v['goal_size'], state_bounds=v['state_bounds']))
    print("the state_bounds are: ", v['state_bounds'])

    center = np.zeros(v['goal_size'])
    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=center)
    feasible_goal_ub = np.array(v['state_bounds'])[:v['goal_size']]
    print("the feasible_goal_ub is: ", feasible_goal_ub)
    uniform_feasible_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                            bounds=[-1 * feasible_goal_ub, feasible_goal_ub])

    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[:int(len(x) / 2)],
        terminal_eps=v['terminal_eps'],
        only_feasible=v['only_feasible'],
        distance_metric=v['distance_metric'],
        terminate_env=True, goal_weight=v['goal_weight'],
    )  # this goal_generator will be updated by a uniform after

    if v['sample_unif_feas']:
        env.update_goal_generator(uniform_feasible_goal_generator)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=False,
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    n_traj = 3

    # feasible_goals = generate_initial_goals(env, policy, v['goal_range'], horizon=v['horizon'], size=10000)  # v['horizon'])
    # print(feasible_goals)
    # uniform_list_goal_generator = UniformListStateGenerator(goal_list=feasible_goals.tolist())
    # env.update_goal_generator(uniform_list_goal_generator)
    # env.update_goal_generator(fixed_goal_generator)

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    inner_log_dir = osp.join(log_dir, 'inner_iters')
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    logger.log("Starting the outer iterations")
    for outer_iter in range(v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        logger.log("Perform TRPO with UniformListStateGenerator...")
        with ExperimentLogger(inner_log_dir, outer_iter, snapshot_mode='last', hold_outter_log=True):
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                discount=0.995,
                step_size=0.01,
                plot=False,
            )

            algo.train()

        report.add_image(plot_policy_performance(policy, env, v['horizon']))

        # log some more on how the policy performs on the feasible and the general goal distribution
        old_goal_generator = env.goal_generator
        logger.log("Evaluating performance on Unif and Fix Goal Gen...")
        with logger.tabular_prefix('UnifFeasGoalGen_'):
            env.update_goal_generator(uniform_feasible_goal_generator)
            evaluate_goal_env(env, policy=policy, horizon=v['horizon'], n_goals=50,
                              fig_prefix='UnifFeasGoalGen_itr%d' % outer_iter,
                              report=report, n_traj=n_traj)

        # back to old goal generator
        with logger.tabular_prefix("UnifGoalGen_"):
            env.update_goal_generator(old_goal_generator)
            evaluate_goal_env(env, policy=policy, horizon=v['horizon'], n_goals=50,
                              fig_prefix='UnifGoalGen_itr%d' % outer_iter,
                              report=report, n_traj=n_traj)

        logger.dump_tabular(with_prefix=False)

        report.save()
        report.new_row()

    with logger.tabular_prefix('FINALUnifFeasGoalGen_'):
        env.update_goal_generator(uniform_feasible_goal_generator)
        evaluate_goal_env(env, policy=policy, horizon=v['horizon'], n_goals=5e3,
                          fig_prefix='FINAL1UnifFeasGoalGen_', report=report, n_traj=n_traj)
        evaluate_goal_env(env, policy=policy, horizon=v['horizon'], n_goals=5e3,
                          fig_prefix='FINAL2UnifFeasGoalGen_', report=report, n_traj=n_traj)
    logger.dump_tabular(with_prefix=False)
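# --- Quick check of the obs2goal_transform used in this PointEnv experiment, assuming (for
# --- illustration only) that the observation concatenates position and velocity: the transform
# --- keeps the first half, i.e. the position that gets compared against the goal.
import numpy as np

obs = np.array([0.5, -1.0, 0.02, 0.03])   # illustrative [x, y, vx, vy] observation
goal_part = obs[:int(len(obs) / 2)]       # -> array([ 0.5, -1. ])
print(goal_part)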
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], length=v['maze_length']))
    # inner_env = normalize(PointEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    # initialize all logging arrays on itr0
    outer_iter = 0

    # TODO - show initial states for Alice

    report.new_row()

    ring_spacing = 1
    init_iter = 2

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceFakeEnv(env, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'],
                             alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold'],
                             ring_spacing=ring_spacing, init_iter=init_iter)

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # if outer_iter > 10:
        #     init_iter = 5
        # env_alice.set_iter(init_iter)
        # import pdb; pdb.set_trace()

        print("Init iter: " + str(init_iter))
        env_alice = AliceFakeEnv(env, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'],
                                 alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold'],
                                 ring_spacing=ring_spacing, init_iter=init_iter)
        algo_alice.env = env_alice
        # env_alice.set_iter(outer_iter)

        starts, t_alices = generate_starts_alice(env_alice=env_alice, algo_alice=algo_alice,
                                                 start_states=[v['start_goal']],
                                                 num_new_starts=v['num_new_starts'], log_dir=log_dir)

        # Make fake labels
        labels = np.ones([len(starts), 2])
        radius = init_iter * ring_spacing
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n', radius=radius)
        report.save()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.dump_tabular(with_prefix=False)
        report.new_row()
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"

    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=sampling_res, report=report,
                      limit=v['goal_range'], center=v['goal_center'])
    test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
                         n_traj=v['n_traj'], itr=outer_iter, report=report,
                         center=v['goal_center'], limit=v['goal_range'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy, max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'], alice_bonus=v['alice_bonus'], gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts, t_alices = generate_starts_alice(env_alice=env_alice, algo_alice=algo_alice,
                                                 start_states=[v['start_goal']],
                                                 num_new_starts=v['num_new_starts'], log_dir=log_dir)

        labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'],
                              key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=v['step_size'],
                discount=v['discount'],
                plot=False,
            )

            # We don't use these labels anyway, so we might as well take them from training.
            # trpo_paths = algo.train()
            algo.train()

        # logger.log("labeling starts with trpo rollouts")
        # [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
        #                                            as_goal=False, env=env)
        # paths = [path for paths in trpo_paths for path in paths]

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=sampling_res, report=report,
                          limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
                             n_traj=v['n_traj'], itr=outer_iter, report=report,
                             center=v['goal_center'], limit=v['goal_range'])

        logger.log("Labeling the starts")
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'],
                              key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        if len(filtered_raw_starts) == 0:  # add a ton of noise if all the states I had ended up being high reward!
            logger.log("Bad Alice! All goals are high reward!")
            # seed_starts = filtered_raw_starts
        # else:
        #     seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'],
        #                                   variance=v['brownian_variance'] * 10)
        all_starts.append(filtered_raw_starts)
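# --- A hedged sketch (not the project's label_states implementation) of how two label columns
# --- like the ones combined above could arise: estimate a success rate per start, mark whether
# --- it lies above R_min and below R_max, and take the logical AND to flag starts of
# --- intermediate difficulty. Thresholds and column meanings here are assumptions.
import numpy as np

def label_by_success_rate(success_rates, r_min=0.1, r_max=0.9):
    """Return per-state labels [above r_min, below r_max] and their AND as an intermediate-difficulty flag."""
    success_rates = np.asarray(success_rates)
    labels = np.column_stack([success_rates > r_min, success_rates < r_max]).astype(int)
    intermediate = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))
    return labels, intermediate

labels, intermediate = label_by_success_rate([0.0, 0.5, 1.0])
# intermediate -> [[0], [1], [0]]: only the middle start is neither hopeless nor trivially easy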