def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
    """Record gather-level returns, then hand trimmed paths to the wrapped env.

    The summed ``env_infos['outer_rew']`` of every path is logged under the
    tabular prefix ``log_prefix + '_'``. Each path is then shallow-copied and
    its observations are cut down to the first ``flat_dim`` columns of the
    wrapped env's observation space; those copies are passed to
    ``self.wrapped_env.log_diagnostics`` under the ``'wrapped_'`` prefix.

    ``*args`` and ``**kwargs`` are accepted but not forwarded.
    """
    with logger.tabular_prefix(log_prefix + '_'):
        outer_returns = [sum(p['env_infos']['outer_rew']) for p in paths]
        logger.record_tabular_misc_stat('Return', outer_returns, placement='front')

    stripped_paths = []
    for p in paths:
        # Shallow copy: only the 'observations' entry is rebound below, so the
        # caller's path dict keeps its full observations.
        trimmed = dict(p.items())
        full_obs = trimmed['observations']
        # Column slicing breaks if the robot observations are not flat vectors.
        trimmed['observations'] = full_obs[:, :self.wrapped_env.observation_space.flat_dim]
        stripped_paths.append(trimmed)

    with logger.tabular_prefix('wrapped_'):
        first = paths[0]
        if 'env_infos' in first.keys() and 'inner_rew' in first['env_infos'].keys():
            inner_totals = [np.sum(p['env_infos']['inner_rew']) for p in paths]
            logger.record_tabular('AverageReturn', np.mean(inner_totals))
        # see swimmer_env.py for a sketch of the maze plotting!
        self.wrapped_env.log_diagnostics(stripped_paths)
def log_diagnostics(self, paths):
    """Log batch diagnostics plus per-latent and hierarchy statistics.

    Always delegates to ``BatchPolopt.log_diagnostics`` and to
    ``self.sampler.log_diagnostics``. When the policy has a truthy
    ``latent_dim`` it can additionally (a) group paths by the latent used at
    their first step and log true-return statistics per latent, and (b) log a
    'Hierarchy' count of consecutive short rollouts that ran to full length.
    Without latents it only logs an optional deterministic rollout.

    NOTE(review): the original indentation was lost. The nesting below
    (``log_hierarchy`` as a sibling of the per-latent block, ``else`` paired
    with ``if self.policy.latent_dim``) is a reconstruction -- confirm against
    the upstream file.

    :param paths: list of path dicts; the per-latent branch reads
        ``path['agent_infos']['latents']`` and ``path['true_rewards']``.
    """
    BatchPolopt.log_diagnostics(self, paths)
    self.sampler.log_diagnostics(paths)
    if self.policy.latent_dim:
        if self.log_individual_latents and not self.policy.resample:  # this is only valid for finite discrete latents!!
            all_latent_avg_returns = []
            clustered_by_latents = collections.OrderedDict()  # this could be done within the distribution to be more general, but ugly
            # Pre-create one (possibly empty) bucket per latent index so every latent gets logged.
            for lat_key in range(self.policy.latent_dim):
                clustered_by_latents[lat_key] = []
            for path in paths:
                # Only the latent of the first step is used to classify the whole path.
                lat = path['agent_infos']['latents'][0]
                lat_key = int(from_onehot(lat))  # from_onehot returns an axis less than the input.
                clustered_by_latents[lat_key].append(path)
            # NOTE: this loop rebinds the name `paths` (the method argument) to each bucket in turn.
            for latent_key, paths in clustered_by_latents.items():  # what to do if this is empty?? set a default!
                with logger.tabular_prefix(str(latent_key)), logger.prefix(str(latent_key)):
                    if paths:
                        undiscounted_rewards = [sum(path["true_rewards"]) for path in paths]
                    else:
                        # Empty bucket: log a single 0 so mean/std/max are defined.
                        undiscounted_rewards = [0]
                    all_latent_avg_returns.append(np.mean(undiscounted_rewards))
                    logger.record_tabular('Avg_TrueReturn', np.mean(undiscounted_rewards))
                    logger.record_tabular('Std_TrueReturn', np.std(undiscounted_rewards))
                    logger.record_tabular('Max_TrueReturn', np.max(undiscounted_rewards))
                    if self.log_deterministic:
                        # One extra rollout with this latent fixed and the policy std set to 0.
                        lat = from_index(latent_key, self.policy.latent_dim)
                        with self.policy.fix_latent(lat), self.policy.set_std_to_0():
                            path_det = rollout(self.env, self.policy, self.max_path_length)
                            logger.record_tabular('Deterministic_TrueReturn', np.sum(path_det["rewards"]))
            # Spread of the per-latent average returns.
            with logger.tabular_prefix('all_lat_'), logger.prefix('all_lat_'):
                logger.record_tabular('MaxAvgReturn', np.max(all_latent_avg_returns))
                logger.record_tabular('MinAvgReturn', np.min(all_latent_avg_returns))
                logger.record_tabular('StdAvgReturn', np.std(all_latent_avg_returns))
        if self.log_hierarchy:
            max_in_path_length = 10
            completed_in_paths = 0
            path = rollout(self.env, self.policy, max_path_length=max_in_path_length, animated=False)
            if len(path['rewards']) == max_in_path_length:
                completed_in_paths += 1
                # Chain up to 49 more 10-step rollouts without resetting the start;
                # stop at the first one that ends early.
                for t in range(1, 50):
                    path = rollout(self.env, self.policy, max_path_length=10, animated=False, reset_start_rollout=False)
                    if len(path['rewards']) < 10:
                        break
                    completed_in_paths += 1
            logger.record_tabular('Hierarchy', completed_in_paths)
    else:
        if self.log_deterministic:
            with self.policy.set_std_to_0():
                path = rollout(self.env, self.policy, self.max_path_length)
            logger.record_tabular('Deterministic_TrueReturn', np.sum(path["rewards"]))
def evaluate_performance(env):
    """Evaluate the current policy from fixed start positions and log the outcome.

    Reads ``v``, ``policy``, ``sampling_res``, ``outer_iter``, ``report`` and
    ``uniform_start_generator`` from an enclosing scope that is not visible
    here (presumably this is a closure inside a ``run_task`` -- confirm).

    With ``v['unif_starts']`` set, the policy is tested from each position in
    ``four_rooms`` and the averaged reward/success are logged under the
    ``'Outer_'`` prefix. Otherwise a single test from ``[0, 0]`` is run.
    The env's start generator is restored to ``uniform_start_generator``
    before returning.

    :param env: environment exposing ``update_start_generator``.
    :return: ``scs`` -- in the ``unif_starts`` branch this is the success value
        of the LAST position tested, not the mean that is logged.
    """
    four_rooms = np.array([[-2, -2], [-13, -13]])
    if v['unif_starts']:
        mean_rewards, successes = [], []
        for pos in four_rooms:
            # Pin the start state to this position for the duration of the test.
            env.update_start_generator(FixedStateGenerator(np.array(pos)))
            mr, scs = test_and_plot_policy(policy, env, horizon=v['horizon'], max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], using_gym=True, noise=v['action_noise'], n_processes=8, log=False)
            mean_rewards.append(mr)
            successes.append(scs)
        # log=False was passed above, so the aggregate is recorded here instead.
        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('iter', outer_iter)
            logger.record_tabular('MeanRewards', np.mean(mean_rewards))
            logger.record_tabular('Success', np.mean(successes))
    else:
        env.update_start_generator(FixedStateGenerator(np.array([0, 0])))
        _, scs = test_and_plot_policy(policy, env, horizon=v['horizon'], max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], using_gym=True, noise=v['action_noise'], n_processes=8)
    # NOTE(review): original indentation was lost; new_row() is assumed to run for both branches.
    report.new_row()
    # Undo the fixed start generator installed above.
    env.update_start_generator(uniform_start_generator)
    return scs
def test_and_plot_policy(policy, env, as_goals=True, visualize=True, sampling_res=1, n_traj=1, max_reward=1, itr=0,
                         report=None, center=None, limit=None, bounds=None):
    """Test the policy, plot a success heatmap, log the means and return them.

    Runs ``test_policy``, unwraps ``env`` through ``wrapped_env`` links until
    an object carrying ``_maze_id`` is found (or the chain ends), draws the
    success heatmap with ``plot_heatmap`` and captures it with ``save_image``.
    The mean reward and mean success are recorded under the ``'Outer_'``
    tabular prefix and, when ``report`` is given, the image is added to it.

    ``max_reward`` is accepted but not used in this body.

    :return: ``(mean_rewards, success)``.
    """
    avg_totRewards, avg_success, states, spacing, avg_time = test_policy(
        policy, env, as_goals, visualize,
        center=center, sampling_res=sampling_res, n_traj=n_traj, bounds=bounds)

    # Peel wrappers until something exposes `_maze_id` or there is nothing left to unwrap.
    node = env
    while True:
        if hasattr(node, '_maze_id') or not hasattr(node, 'wrapped_env'):
            break
        node = node.wrapped_env
    maze_id = getattr(node, '_maze_id', None)

    plot_heatmap(avg_success, states, spacing=spacing, show_heatmap=False,
                 maze_id=maze_id, center=center, limit=limit)
    reward_img = save_image()
    # (the avg_time heatmap is computed by test_policy but its plotting is disabled)

    mean_rewards = np.mean(avg_totRewards)
    success = np.mean(avg_success)

    with logger.tabular_prefix('Outer_'):
        for column, value in (('iter', itr), ('MeanRewards', mean_rewards), ('Success', success)):
            logger.record_tabular(column, value)

    if report is not None:
        caption = 'policy performance\n itr: {} \nmean_rewards: {} \nsuccess: {}'.format(
            itr, mean_rewards, success)
        report.add_image(reward_img, caption)

    return mean_rewards, success
def log_diagnostics(self, all_paths):
    """Fan diagnostics out to every (env, policy, baseline) partition.

    ``all_paths`` is zipped with ``self.env_partitions``,
    ``self.local_policies`` and ``self.local_baselines``; for the n-th group
    the env, policy and baseline each log that group's paths under the
    tabular prefix ``str(n)``.
    """
    groups = zip(self.env_partitions, self.local_policies, self.local_baselines, all_paths)
    for index, group in enumerate(groups):
        env, policy, baseline, paths = group
        with logger.tabular_prefix(str(index)):
            # Order matters for column placement: env first, then policy, then baseline.
            for component in (env, policy, baseline):
                component.log_diagnostics(paths)
def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
    """Log the gather reward, then let the wrapped env log on robot-only observations.

    Under ``log_prefix + '_'`` the per-path sum of ``env_infos['outer_rew']``
    is recorded as 'Return'. Under ``'wrapped_'`` the mean per-path sum of
    ``env_infos['inner_rew']`` is recorded as 'AverageReturn' (only if the
    first path carries that key), followed by the wrapped env's own
    diagnostics on copies of the paths whose observations keep only the first
    ``flat_dim`` columns.

    ``*args`` and ``**kwargs`` are accepted but not forwarded.
    """

    def _without_extra_obs(path):
        # Shallow copy so the caller's dict keeps the full observations.
        clone = {key: value for key, value in path.items()}
        # this breaks if the obs of the robot are d>1 dimensional (not a vector)
        clone['observations'] = \
            clone['observations'][:, :self.wrapped_env.observation_space.flat_dim]
        return clone

    with logger.tabular_prefix(log_prefix + '_'):
        gather_returns = []
        for path in paths:
            gather_returns.append(sum(path['env_infos']['outer_rew']))
        logger.record_tabular_misc_stat('Return', gather_returns, placement='front')

    stripped_paths = [_without_extra_obs(path) for path in paths]

    with logger.tabular_prefix('wrapped_'):
        head = paths[0]
        if 'env_infos' in head.keys() and 'inner_rew' in head['env_infos'].keys():
            per_path_inner = [np.sum(path['env_infos']['inner_rew']) for path in paths]
            wrapped_return = np.mean(per_path_inner)
            logger.record_tabular('AverageReturn', wrapped_return)
        self.wrapped_env.log_diagnostics(stripped_paths)
def _evaluate(self, epoch):
    """Perform evaluation for the current policy.

    We always use the most recent policy, but for computational
    efficiency we sometimes use a stale version of the metapolicy.
    During evaluation, our policy expects an un-augmented observation.

    Does nothing when ``self._eval_n_episodes < 1``. Every
    ``self._find_best_skill_interval`` epochs the single-option policy is
    refreshed via ``self._get_best_single_option_policy()``; it is then rolled
    out for ``self._eval_n_episodes`` episodes and return / episode-length
    statistics are recorded under the ``'best_single_option_policy/'`` prefix.
    Finally a random batch from the pool is passed to ``self.log_diagnostics``.

    NOTE(review): original indentation was lost; which statements sit inside
    the prefix / deterministic contexts is a reconstruction -- confirm.

    :param epoch: The epoch number.
    :return: None
    """
    if self._eval_n_episodes < 1:
        return
    if epoch % self._find_best_skill_interval == 0:
        self._single_option_policy = self._get_best_single_option_policy()
    # Single-element list: only the best single-option policy is evaluated.
    for (policy, policy_name) in [(self._single_option_policy, 'best_single_option_policy')]:
        with logger.tabular_prefix(policy_name + '/'), logger.prefix(policy_name + '/'):
            # The deterministic context is taken on self._policy, while the
            # rollouts use the loop's `policy` object.
            with self._policy.deterministic(self._eval_deterministic):
                if self._eval_render:
                    paths = rollouts(self._eval_env, policy, self._max_path_length, self._eval_n_episodes, render=True, render_mode='rgb_array')
                else:
                    paths = rollouts(self._eval_env, policy, self._max_path_length, self._eval_n_episodes)
            total_returns = [path['rewards'].sum() for path in paths]
            episode_lengths = [len(p['rewards']) for p in paths]
            logger.record_tabular('return-average', np.mean(total_returns))
            logger.record_tabular('return-min', np.min(total_returns))
            logger.record_tabular('return-max', np.max(total_returns))
            logger.record_tabular('return-std', np.std(total_returns))
            logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
            logger.record_tabular('episode-length-min', np.min(episode_lengths))
            logger.record_tabular('episode-length-max', np.max(episode_lengths))
            logger.record_tabular('episode-length-std', np.std(episode_lengths))
            self._eval_env.log_diagnostics(paths)
    # Algorithm-level diagnostics are computed on a fresh random batch from the pool.
    batch = self._pool.random_batch(self._batch_size)
    self.log_diagnostics(batch)
def log_diagnostics(self, paths, prefix=''):
    """Log how far the 'com' entry moved over each path, then plot visitations.

    For every path the Euclidean norm of ``com[-1] - com[0]`` (taken from
    ``path['env_infos']['com']``) is computed; the average, max, min and std
    over paths are recorded under ``prefix``. ``self.plot_visitations`` is
    then called with the same paths and prefix.
    """
    progs = []
    for path in paths:
        com = path["env_infos"]['com']
        progs.append(np.linalg.norm(com[-1] - com[0]))

    summaries = (
        ('AverageForwardProgress', np.mean),
        ('MaxForwardProgress', np.max),
        ('MinForwardProgress', np.min),
        ('StdForwardProgress', np.std),
    )
    with logger.tabular_prefix(prefix):
        for column, reducer in summaries:
            logger.record_tabular(column, reducer(progs))

    self.plot_visitations(paths, prefix=prefix)
def log_diagnostics(self, paths, *args, **kwargs):
    """Log maze-level returns, then delegate to the wrapped env on stripped paths.

    Under ``'Maze_'`` the per-path sum of ``env_infos['outer_rew']`` is
    recorded as 'Return'. Each path is shallow-copied; if its observations
    have a 1-D shape they are first passed through ``np.concatenate``, then
    only the first ``flat_dim`` columns of the wrapped env's observation space
    are kept. Under ``'wrapped_'`` the mean per-path sum of
    ``env_infos['inner_rew']`` is recorded as 'SuccessRate' and the wrapped
    env logs its own diagnostics, receiving ``*args`` / ``**kwargs``.
    """
    with logger.tabular_prefix('Maze_'):
        outer_returns = [sum(p['env_infos']['outer_rew']) for p in paths]
        logger.record_tabular_misc_stat('Return', outer_returns, placement='front')

    stripped_paths = []
    for p in paths:
        trimmed = dict(p.items())
        obs = trimmed['observations']
        if len(obs.shape) == 1:
            # A 1-D container of per-step observations: join them into one array first.
            obs = np.concatenate(obs)
        # this breaks if the obs of the robot are d>1 dimensional (not a vector)
        trimmed['observations'] = obs[:, :self.wrapped_env.observation_space.flat_dim]
        stripped_paths.append(trimmed)

    with logger.tabular_prefix('wrapped_'):
        inner_totals = [np.sum(p['env_infos']['inner_rew']) for p in paths]
        logger.record_tabular('SuccessRate', np.mean(inner_totals))
        self.wrapped_env.log_diagnostics(stripped_paths, *args, **kwargs)
def train(self, sess=None):
    """Run the training loop over all local samplers.

    If no session is supplied, one is created with GPU memory growth enabled,
    entered as the default session, and all variables are initialised. If a
    session is supplied, only the variables it reports as uninitialised are
    initialised.

    Each iteration: every sampler in ``self.local_samplers`` obtains samples,
    the samples are processed under a per-sampler tabular prefix, diagnostics
    are logged, the policy is optimised, a snapshot is saved and the tabular
    row is dumped. The worker is shut down after the last iteration.

    :param sess: optional existing TensorFlow session.
    """
    if sess is not None:
        pending_names = sess.run(tf.report_uninitialized_variables())
        pending_vars = [tf.get_variable(name) for name in pending_names]
        sess.run(tf.initialize_variables(pending_vars))
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        sess.__enter__()
        sess.run(tf.initialize_all_variables())

    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            all_paths = [sampler.obtain_samples(itr) for sampler in self.local_samplers]

            logger.log("Processing samples...")
            all_samples_data = []
            for index, sampler in enumerate(self.local_samplers):
                with logger.tabular_prefix(str(index)):
                    processed = sampler.process_samples(itr, all_paths[index])
                    all_samples_data.append(processed)

            logger.log("Logging diagnostics...")
            self.log_diagnostics(all_paths)

            logger.log("Optimizing policy...")
            self.optimize_policy(itr, all_samples_data)

            logger.log("Saving snapshot...")
            snapshot = self.get_itr_snapshot(itr, all_samples_data)
            logger.save_itr_params(itr, snapshot)
            logger.log("Saved")

            now = time.time()
            logger.record_tabular('Time', now - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
    self.shutdown_worker()
def train(self, sess=None):
    """Run the batch policy-optimisation loop.

    If ``sess`` is ``None`` a new ``tf.Session`` is created, entered as the
    default session, and closed again when training ends. A caller-supplied
    session is used as-is and left open. Global variables are initialised in
    either case.

    Each iteration obtains and processes samples, logs diagnostics, optimises
    the policy, optionally processes test samples under the ``"Test"`` tabular
    prefix, saves a snapshot (including the raw paths when
    ``self.store_paths`` is set), dumps the tabular row and optionally renders
    a rollout.

    The worker is shut down and a self-created session is closed even when an
    iteration raises; the exception then propagates to the caller.

    :param sess: optional existing TensorFlow session.
    """
    created_session = sess is None
    if created_session:
        sess = tf.Session()
        # Entered so the session is the default for everything run below.
        sess.__enter__()

    try:
        sess.run(tf.global_variables_initializer())
        self.start_worker()
        try:
            start_time = time.time()
            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                with logger.prefix('itr #%d | ' % itr):
                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr)
                    logger.log("Processing samples...")
                    samples_data = self.process_samples(itr, paths)
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(paths)
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, samples_data)

                    if self.test_env is not None:
                        logger.log("Obtaining test samples...")
                        test_paths = self.test_sampler.obtain_samples(itr)
                        # Processing is done for its logging side effect; the result is discarded.
                        with logger.tabular_prefix("Test"):
                            self.test_sampler.process_samples(itr, test_paths)

                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)
                    if self.store_paths:
                        params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                    logger.record_tabular('Time', time.time() - start_time)
                    logger.record_tabular('ItrTime', time.time() - itr_start_time)
                    logger.dump_tabular(with_prefix=False)

                    if self.plot:
                        rollout(self.env, self.policy, animated=True,
                                max_path_length=self.max_path_length)
                        if self.pause_for_plot:
                            input("Plotting evaluation run: Press Enter to "
                                  "continue...")
        finally:
            # Only reached once start_worker() succeeded, so there is always something to shut down.
            self.shutdown_worker()
    finally:
        if created_session:
            sess.close()
def trainExperts(self, num_training_itrs):
    """Train the expert policies and return the last batch of paths per expert.

    For each iteration every sampler in ``self.local_samplers`` collects
    paths, the samples are processed under a per-sampler tabular prefix,
    diagnostics are logged and ``self.optimize_expert_policies`` is called.
    Snapshot saving and Time/ItrTime logging are disabled in this loop.

    After training, each path of the final batch is annotated in place with:

    * ``'expert_actions'``: a copy of ``'actions'`` clipped to [-1, 1];
    * ``'agent_infos'``: zero-filled ``mean`` and ``log_std`` lists with one
      independent row per step.

    :param num_training_itrs: number of training iterations; ``0`` (or less)
        returns an empty dict.
    :return: dict mapping sampler index to that sampler's list of paths.
    """
    # Bound up front so zero iterations return {} instead of raising NameError.
    all_paths = []
    for itr in range(num_training_itrs):
        print('############itr_' + str(itr) + '################')
        all_paths = [sampler.obtain_samples(itr) for sampler in self.local_samplers]

        # NOTE(review): the per-iteration condition around this flag is commented
        # out in the original, so processing always logs -- confirm this is intended.
        log = True

        all_samples_data = []
        for n, (sampler, paths) in enumerate(zip(self.local_samplers, all_paths)):
            with logger.tabular_prefix(str(n)):
                all_samples_data.append(
                    sampler.process_samples(itr, paths, log=log))

        logger.log("Logging diagnostics...")
        self.log_diagnostics(all_paths, prefix='')
        logger.log("Optimizing policy...")
        self.optimize_expert_policies(itr, all_samples_data)
        logger.dump_tabular(with_prefix=False)

    for expert_paths in all_paths:
        for path in expert_paths:
            actions = path['actions']
            n_steps = len(actions)
            action_dim = len(actions[0]) if n_steps else 0
            path['expert_actions'] = np.clip(deepcopy(actions), -1.0, 1.0)
            # Build each row separately: `[[0.0] * d] * n` would alias one row n times,
            # so writing to one step would silently change all of them.
            path['agent_infos'] = dict(
                mean=[[0.0] * action_dim for _ in range(n_steps)],
                log_std=[[0.0] * action_dim for _ in range(n_steps)])

    return dict(enumerate(all_paths))
def evaluate_performance_plane(test_env):
    """Log the success rate at several distance thresholds.

    Labels ``goals`` on ``test_env`` with the current ``policy`` (both, like
    ``v`` and ``outer_iter``, come from a scope not visible here) and, for
    each epsilon in a fixed list, records the mean of
    ``goal_reached_by_eps(paths, eps)`` as ``Success_<eps>`` under the
    ``'Outer_'`` prefix.

    :return: the mean success for the LAST epsilon in the list (0.7).
    """
    thresholds = [0.1, 0.2, 0.3, 0.5, 0.7]
    labels, paths = label_states(
        goals, test_env, policy, v['horizon'],
        n_traj=v['n_traj'], key='goal_reached', n_processes=8,
        using_gym=True, noise=0, full_path=True)

    with logger.tabular_prefix('Outer_'):
        logger.record_tabular('iter', outer_iter)
        for eps in thresholds:
            reached = goal_reached_by_eps(paths, eps)
            successes = np.mean(reached)
            logger.record_tabular('Success_%3.1f' % eps, successes)

    return np.mean(successes)
def fit_with_samples(self, paths, samples_data):
    """Fit the value function on processed samples and log the loss.

    Builds the optimiser inputs from ``samples_data`` (observations, returns,
    valids), updates the running statistics via ``self.f_update_stats`` and
    runs ``self.optimizer.optimize`` while collecting the per-epoch loss
    through a callback. Depending on ``self.log_loss_before`` /
    ``self.log_loss_after`` either the exact loss or the first / last epoch
    loss is recorded, all under the ``"Vf."`` tabular prefix.

    ``paths`` is accepted but not used in this body.
    """
    observations = samples_data["observations"]
    returns = samples_data["returns"]
    valids = samples_data["valids"]
    inputs = [observations, returns, valids]

    self.f_update_stats(returns, valids)

    with logger.prefix("Vf | "), logger.tabular_prefix("Vf."):
        if self.log_loss_before:
            logger.log("Computing loss before training")
            loss_before, _ = self.optimizer.loss_diagnostics(inputs)
            logger.log("Computed")

        epoch_losses = []

        def _collect(loss, diagnostics, *args, **kwargs):
            # Returning True on every call, as the original callback does.
            epoch_losses.append(loss)
            return True

        self.optimizer.optimize(inputs, callback=_collect)

        if self.log_loss_after:
            logger.log("Computing loss after training")
            loss_after, _ = self.optimizer.loss_diagnostics(inputs)
            logger.log("Computed")

        # When the exact loss was not computed, fall back to the nearest epoch loss.
        if not self.log_loss_before:
            logger.record_tabular('FirstEpoch.Loss', epoch_losses[0])
        else:
            logger.record_tabular('LossBefore', loss_before)

        if not self.log_loss_after:
            logger.record_tabular('LastEpoch.Loss', epoch_losses[-1])
        else:
            logger.record_tabular('LossAfter', loss_after)

        if self.log_loss_before and self.log_loss_after:
            logger.record_tabular('dLoss', loss_before - loss_after)
def run_task(v):
    """Run the Arm3d-disc start-curriculum experiment with Alice-generated starts.

    Seeds the RNGs, creates the HTML report, builds the goal/start exploration
    env, the Bob policy and baseline, and the Alice env/policy/TRPO. Each
    outer iteration then:

    1. asks ``generate_starts_alice`` for new starts (optionally mixed with
       old starts sampled from the replay collection);
    2. trains Bob with TRPO on a uniform generator over those starts;
    3. labels the starts (from the TRPO paths or by fresh rollouts) and logs
       the class fractions;
    4. evaluates on 1000 starts sampled from the pre-computed feasible set;
    5. appends the starts whose combined label is 1 to the replay collection.

    NOTE(review): original indentation was lost; in particular what sits
    inside the ``ExperimentLogger`` context is a reconstruction -- confirm.

    :param v: variant dict with the experiment hyper-parameters read below.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))
    report.save()

    inner_env = normalize(Arm3dDiscEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # NOTE(review): the start generator is also built from v['ultimate_goal'] -- confirm intended.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    feasible_states_path = osp.join(config.PROJECT_PATH, load_dir, 'disc_all_feasible_states_min.pkl')
    # Context manager so the file handle is closed once unpickling is done.
    # NOTE: pickle executes arbitrary code on load; this file must come from a trusted source.
    with open(feasible_states_path, 'rb') as feasible_states_file:
        all_feasible_starts = pickle.load(feasible_states_file)
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    # brownian_starts = StateCollection(distance_threshold=v['regularize_starts'])
    # with env.set_kill_outside():
    # seed_starts = generate_starts(env, starts=[v['start_goal']], horizon=10,  # this is smaller as they are seeds!
    #                               variance=v['brownian_variance'], subsample=v['num_new_starts'])  # , animated=True, speedup=1)
    #
    # with env.set_kill_outside():
    #     find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10)

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    env_alice = AliceEnv(env, env, policy, v['horizon'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # with env.set_kill_outside():
        #     starts = generate_starts(env, starts=seed_starts, horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        # regularization of the brownian starts
        # brownian_starts.empty()
        # brownian_starts.append(starts)
        # starts = brownian_starts.sample(size=v['num_new_starts'])

        starts = generate_starts_alice(env_bob=env, env_alice=env_alice, policy_bob=policy,
                                       policy_alice=policy_alice, algo_alice=algo_alice,
                                       start_states=[v['start_goal']], num_new_starts=v['num_new_starts'],
                                       alice_factor=v['alice_factor'], log_dir=log_dir)

        # Mix in previously successful starts once the replay collection is non-empty.
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                                                       as_goal=False, env=env)
            # Flatten the per-iteration lists of paths into one list.
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.record_tabular('starts', starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict()  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        # Collapse the two label columns into one 0/1 column: 1 only when both hold.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(1000)
            mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=1,
                                                 key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        logger.log("Appending good goals to replay and generating seeds")
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)
def run_task(v):
    """Run the Ant-maze goal-curriculum experiment driven by a goal GAN.

    Seeds the RNGs, creates the HTML report and a TensorFlow session, builds
    the goal-exploration env, policy and baseline, plots an initial heatmap,
    then instantiates and pretrains a ``StateGAN``. Each outer iteration
    samples goals from the GAN (optionally mixed with replayed goals), trains
    the policy with TRPO on them, labels the goals, plots, trains the GAN on
    the labelled goals and appends the goals labelled 1 to the replay
    collection.

    NOTE(review): ``sampling_res`` is read below but never assigned in this
    function; it must be provided by an outer scope -- confirm.
    NOTE(review): original indentation was lost; block nesting (in particular
    what sits inside the ``ExperimentLogger`` context) is a reconstruction.

    :param v: variant dict with the experiment hyper-parameters read below.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    # This session is handed to the GAN below; it is not closed in this function.
    tf_session = tf.Session()

    inner_env = normalize(AntMazeEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env, goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    # Collect every variant entry whose key contains 'GAN_', dropping the first 4 characters of the key.
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    # Entries holding a class/factory (rather than an instance) are instantiated here,
    # using the sibling '<key>_stepSize' / '<key>_stddev' entries. Only existing keys
    # are reassigned, so the dict size does not change during iteration.
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )
    logger.log("pretraining the GAN...")
    if v['smart_init']:
        # Pretrain on goals produced by generate_initial_goals, all labelled as good.
        feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        dis_loss, gen_loss = gan.pretrain(states=feasible_goals, outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform()

    # log first samples form the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

    plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])
    report.new_row()

    # Replay collection of goals that were labelled 1 in earlier iterations.
    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling goals from the GAN")
        raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            old_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals

        # if needed label the goals before any update
        if v['label_with_variation']:
            old_labels, old_rewards = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=False, return_rew=True)

        # itr_label = outer_iter  # use outer_iter to log everything or "last" to log only the last
        # with ExperimentLogger(log_dir, itr_label, snapshot_mode='last', hold_outter_log=True):
        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        # Three labelling modes: reuse the TRPO paths, label against the pre-update
        # rewards (old_rewards), or label with fresh rollouts.
        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [goals, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                                                       as_goal=True, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        elif v['label_with_variation']:
            labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', old_rewards=old_rewards, full_path=True)
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

        #logger.log("Labeling the goals")
        #labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Reduce the labels to a single 0/1 column before training the GAN.
        if v['label_with_variation']:  # this will use only the performance variation for labeling
            labels = np.array(labels[:, -1], dtype=int).reshape((-1, 1))
        else:
            labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        gan.train(
            goals, labels,
            v['gan_outer_iters'],
        )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
def run_task(v):
    """Run the Ant goal-reaching experiment with rejection-sampled uniform goals.

    Every outer iteration repeatedly draws ``unif_samples`` goals uniformly
    inside ``goal_center +/- goal_range``, labels them with the current policy
    and keeps the ones whose class returned by ``convert_label`` equals 2,
    until at least ``v['num_new_goals']`` goals are collected. The policy is
    then trained with TRPO on those goals (optionally stacked with goals
    sampled from the replay collection), and the goals labelled from the TRPO
    rollouts with label 1 are appended to the replay collection.

    Args:
        v: variant mapping with the experiment hyper-parameters (``seed``,
            ``goal_size``, ``goal_range``, ``goal_center``, ``horizon``, ...).
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v else 0
    unif_samples = 300  # uniform candidates drawn per rejection round

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    goal_size = v['goal_size']
    # Corners of the box the uniform goal candidates are drawn from.
    goal_low = np.array(v['goal_center']) - np.array(v['goal_range'])
    goal_high = np.array(v['goal_center']) + np.array(v['goal_range'])

    inner_env = normalize(AntEnv())
    uniform_goal_generator = UniformStateGenerator(
        state_size=goal_size,
        bounds=v['goal_range'],
        center=v['goal_center'],
    )
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    # Linear baseline unless the Gaussian MLP one is explicitly requested.
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # Heatmap of the untrained policy is logged as iteration 0.
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy, env, max_reward=v['max_reward'],
                         sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report,
                         limit=v['goal_range'], center=v['goal_center'],
                         bounds=v['goal_range'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling goals")

        # Rejection sampling: keep drawing uniform batches until enough goals
        # of class 2 have been accumulated.
        goals = np.array([]).reshape((-1, goal_size))
        n_rounds = 0
        while goals.shape[0] < v['num_new_goals']:
            print('good goals collected: ', goals.shape[0])
            logger.log("Sampling and labeling the goals: %d" % n_rounds)
            n_rounds += 1
            unif_goals = np.random.uniform(
                goal_low, goal_high, size=(unif_samples, goal_size))
            labels = label_states(unif_goals, env, policy, v['horizon'],
                                  n_traj=v['n_traj'], key='goal_reached')
            logger.log("Converting the labels")
            init_classes, text_labels = convert_label(labels)
            accepted = unif_goals[init_classes == 2]
            goals = np.concatenate([goals, accepted]).reshape((-1, goal_size))

        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            replayed_goals = all_goals.sample(v['num_old_goals'])  # todo: replay noise?
            goals = np.vstack([goals, replayed_goals])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            goal_generator = UniformListStateGenerator(
                goals.tolist(),
                persistence=v['persistence'],
                with_replacement=v['with_replacement'],
            )
            env.update_goal_generator(goal_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )
            trpo_paths = algo.train()

        logger.log("labeling starts with trpo rollouts")
        goals, labels = label_states_from_paths(
            trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
            as_goal=True, env=env)
        # Flatten the per-iteration lists of rollouts into a single list.
        paths = [path for batch in trpo_paths for path in batch]
        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy, env, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report,
                             limit=v['goal_range'], center=v['goal_center'],
                             bounds=v['goal_range'])
        plot_labeled_states(goals, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'])

        # Collapse the two label columns into one 0/1 column.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # Rollouts used for labeling (before the TRPO iterations).
        labeling_rollouts = n_rounds * v['n_traj'] * unif_samples
        logger.record_tabular('LabelingRollouts', labeling_rollouts)
        total_rollouts += labeling_rollouts
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # Replay buffer only receives goals labelled 1 (not the low reward ones).
        filtered_raw_goals = [
            goal for goal, flag in zip(goals, labels[:, 0]) if flag == 1
        ]
        all_goals.append(filtered_raw_goals)
def plot_visitations(self, paths, mesh_density=20, visit_prefix='', visit_axis_bound=None, maze=None, scaling=2):
    """Plot which grid cells the center of mass visited and log coverage stats.

    The plane is discretised into cells of side ``1 / mesh_density`` covering
    ``[-furthest, furthest]`` on both axes. If the paths carry non-empty
    ``'latents'`` or ``'selectors'`` agent infos, cells are coloured by the
    latent index that visited them (cells shared by several latents get the
    'Repetitions' colour); otherwise a plain visitation-count map is drawn.
    The figure is saved as ``<visit_prefix>visitation.png`` in the snapshot
    directory and 'VisitationTotal' / 'VisitationOverlap' are recorded in the
    tabular log under ``visit_prefix``.

    Args:
        paths: list of rollouts; each must provide ``path['env_infos']['com']``
            indexable as ``[:, 0]`` / ``[:, 1]``.
        mesh_density: number of grid cells per unit length.
        visit_prefix: prefix for the saved file name, plot title and log keys.
        visit_axis_bound: lower bound on the plotted half-width; defaults to
            ``self.visit_axis_bound`` when None.
        maze: currently unused (kept for interface compatibility).
        scaling: currently unused (kept for interface compatibility).

    Raises:
        KeyError: if the first path has no ``'com'`` entry in ``env_infos``.
    """
    if 'env_infos' not in paths[0] or 'com' not in paths[0]['env_infos']:
        raise KeyError(
            "No 'com' key in your path['env_infos']: please change you step function")
    fig, ax = plt.subplots()

    # Grid the space and check how much of it the policy is covering.
    # `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
    x_max = int(np.ceil(np.max(np.abs(
        np.concatenate([path["env_infos"]['com'][:, 0] for path in paths])))))
    y_max = int(np.ceil(np.max(np.abs(
        np.concatenate([path["env_infos"]['com'][:, 1] for path in paths])))))
    furthest = max(x_max, y_max)
    print('THE FUTHEST IT WENT COMPONENT-WISE IS: x_max={}, y_max={}'.format(
        x_max, y_max))
    if visit_axis_bound is None:
        visit_axis_bound = self.visit_axis_bound
    if visit_axis_bound and visit_axis_bound >= furthest:
        furthest = max(furthest, visit_axis_bound)

    delta = 1. / mesh_density
    y, x = np.mgrid[-furthest:furthest + delta:delta,
                    -furthest:furthest + delta:delta]
    grid_shape = (2 * furthest * mesh_density + 1,
                  2 * furthest * mesh_density + 1)

    def _com_cells(path):
        # Grid indices of every center-of-mass position along one path.
        com_x = np.ceil(
            (np.array(path['env_infos']['com'][:, 0]) + furthest) * mesh_density
        ).astype(int)
        com_y = np.ceil(
            (np.array(path['env_infos']['com'][:, 1]) + furthest) * mesh_density
        ).astype(int)
        return list(zip(com_x, com_y))

    agent_infos = paths[0]['agent_infos'] if 'agent_infos' in paths[0] else {}
    has_latents = 'latents' in agent_infos and np.size(agent_infos['latents'])
    has_selectors = 'selectors' in agent_infos and np.size(agent_infos['selectors'])

    if has_latents or has_selectors:
        selectors_name = 'selectors' if 'selectors' in agent_infos else 'latents'
        # keys: latent index, values: np.array with the number of visitations
        dict_visit = collections.OrderedDict()
        # NOTE(review): the number of latents is hard-coded; it was previously
        # derived from the size of the latent vector - confirm 6 is intended.
        num_latents = 6
        for i in range(num_latents):
            dict_visit[i] = np.zeros(grid_shape)

        # keep track of the overlap
        overlap = 0
        for path in paths:
            # list of all latents by index
            lats = [np.argmax(lat, axis=-1)
                    for lat in path['agent_infos'][selectors_name]]
            coms = _com_cells(path)
            # When argmax yields arrays instead of scalars, flatten them into
            # one sequence of indices. np.integer covers every integer width.
            if not isinstance(lats[0], np.integer):
                lats = np.concatenate(lats)
            for i, com in enumerate(coms):
                if i >= len(lats):
                    break
                if lats[i] >= num_latents:
                    # Latent index outside the tracked range: report, skip.
                    print("lats", lats)
                else:
                    dict_visit[lats[i]][com] += 1

        # fix the colors for each latent
        num_colors = num_latents + 2  # +2 for 'No visitation' and 'Repetitions'
        cmap = plt.get_cmap('nipy_spectral', num_colors)
        # Matrix whose entries are the latent that visited the cell, 0 for
        # nothing, or num_colors - 1 when several latents visited it.
        visitation_by_lat = np.zeros(grid_shape)
        for i, visit in dict_visit.items():
            lat_visit = np.where(visit == 0, visit, i + 1)  # 0 or i+1
            visitation_by_lat += lat_visit
            # Count cells this latent visits that were already marked by an
            # earlier one. (Summing np.where(cond) would add up index values,
            # not count cells.)
            overlap += np.count_nonzero(
                (lat_visit > 0) & (visitation_by_lat > lat_visit))
            visitation_by_lat = np.where(visitation_by_lat <= i + 1,
                                         visitation_by_lat,
                                         num_colors - 1)  # mark overlaps

        map_plot = ax.pcolormesh(x, y, visitation_by_lat, cmap=cmap,
                                 vmin=0.1, vmax=num_latents + 1)
        color_len = (num_colors - 1.) / num_colors
        ticks = np.arange(color_len / 2., num_colors - 1, color_len)
        cbar = fig.colorbar(map_plot, ticks=ticks)
        latent_tick_labels = ['latent: ' + str(i) for i in dict_visit]
        cbar.ax.set_yticklabels(
            ['No visitation'] + latent_tick_labels + ['Repetitions'])
        # still log the total visitation
        visitation_all = reduce(np.add, list(dict_visit.values()))
    else:
        visitation_all = np.zeros(grid_shape)
        for path in paths:
            for com in _com_cells(path):
                visitation_all[com] += 1

        plt.pcolormesh(x, y, visitation_all, vmax=mesh_density)
        # sum of all visitations larger than 1
        overlap = np.sum(np.where(visitation_all > 1, visitation_all, 0))

    ax.set_xlim([x[0][0], x[0][-1]])
    ax.set_ylim([y[0][0], y[-1][0]])

    log_dir = logger.get_snapshot_dir()
    exp_name = log_dir.split('/')[-1] if log_dir else '?'
    ax.set_title(visit_prefix + 'visitation: ' + exp_name)

    if log_dir is None:
        # NOTE(review): machine-specific fallback directory; consider making
        # it configurable.
        log_dir = '/home/wr1/rllab/data/local/transfer/'
    # savefig saves the current figure, i.e. the one created above
    plt.savefig(osp.join(log_dir, visit_prefix + 'visitation.png'))
    plt.close()

    with logger.tabular_prefix(visit_prefix):
        total_visitation = np.count_nonzero(visitation_all)
        logger.record_tabular('VisitationTotal', total_visitation)
        logger.record_tabular('VisitationOverlap', overlap)

    # Release all matplotlib state.
    plt.cla()
    plt.clf()
    plt.close('all')
    gc.collect()
def run_task(v):
    """Generate start states with an Alice policy in the point maze and plot them.

    Builds the point-maze goal/start exploration environment, an ``AliceFakeEnv``
    wrapper with its own Gaussian MLP policy and TRPO learner, and on every
    outer iteration rebuilds the Alice environment, generates new starts with
    ``generate_starts_alice``, plots them (with all-ones labels) in the HTML
    report and logs the mean of the returned ``t_alices``.

    Args:
        v: variant mapping with the experiment hyper-parameters.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v else 2
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    maze_env = PointMazeEnv(maze_id=v['maze_id'], length=v['maze_length'])
    inner_env = normalize(maze_env)

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(
        state_size=v['start_size'],
        bounds=v['start_range'],
        center=v['start_center'],
    )

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    # initialize all logging arrays on itr0
    outer_iter = 0

    # TODO - show initial states for Alice
    report.new_row()

    ring_spacing = 1
    init_iter = 2

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    # The same keyword arguments are reused each time the Alice env is rebuilt.
    alice_env_kwargs = dict(
        max_path_length=v['alice_horizon'],
        alice_factor=v['alice_factor'],
        alice_bonus=v['alice_bonus'],
        gamma=1,
        stop_threshold=v['stop_threshold'],
        ring_spacing=ring_spacing,
        init_iter=init_iter,
    )
    env_alice = AliceFakeEnv(env, **alice_env_kwargs)

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    # Radius handed to the plot; constant because neither factor changes.
    radius = init_iter * ring_spacing

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        print("Init iter: " + str(init_iter))

        # A fresh Alice environment is built and swapped into the learner on
        # every outer iteration.
        env_alice = AliceFakeEnv(env, **alice_env_kwargs)
        algo_alice.env = env_alice

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir,
        )

        # Make fake labels
        labels = np.ones([len(starts), 2])
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n',
                            radius=radius)
        report.save()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.dump_tabular(with_prefix=False)
        report.new_row()
def train(self, sess=None):
    """Run the training loop over all local samplers.

    For every iteration from ``self.start_itr`` to ``self.n_itr`` this obtains
    and processes samples from each sampler in ``self.local_samplers``, logs
    diagnostics, optionally optimizes the policy, optionally evaluates on the
    test sampler, records per-sampler and aggregate success rates, and saves a
    snapshot.

    Args:
        sess: an existing TensorFlow session. When None, a new session with
            GPU memory growth enabled is created and entered, and all
            variables are initialized. Otherwise only the variables reported
            as uninitialized in ``sess`` are initialized.
    """
    if sess is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        # Entered but intentionally not exited here, so the session stays the
        # default one after training returns.
        sess.__enter__()
        sess.run(tf.initialize_all_variables())
    else:
        uninitialized = [
            tf.get_variable(name)
            for name in sess.run(tf.report_uninitialized_variables())
        ]
        sess.run(tf.initialize_variables(uninitialized))

    self.start_worker()
    try:
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                all_paths = [
                    sampler.obtain_samples(itr)
                    for sampler in self.local_samplers
                ]

                logger.log("Processing samples...")
                all_samples_data = []
                for n, (sampler, paths) in enumerate(
                        zip(self.local_samplers, all_paths)):
                    # Tabular stats of sampler n are prefixed with its index.
                    with logger.tabular_prefix(str(n)):
                        all_samples_data.append(
                            sampler.process_samples(itr, paths))

                logger.log("Logging diagnostics...")
                self.log_diagnostics(all_paths)

                if self.should_optimize_policy:
                    logger.log("Optimizing policy...")
                    self.optimize_policy(itr, all_samples_data)

                if self.test_env is not None:
                    logger.log("Obtaining test samples...")
                    test_paths = self.test_sampler.obtain_samples(itr)
                    with logger.tabular_prefix("Test"):
                        test_samples = self.test_sampler.process_samples(
                            itr, test_paths)
                    logger.record_tabular(
                        "TestSuccessRate",
                        np.mean(test_samples["env_infos"]["success"]))

                # Success rate per sampler and pooled over all samplers.
                successes = 0.0
                trials = 0.0
                for i, samples_data in enumerate(all_samples_data):
                    success = samples_data["env_infos"]["success"]
                    logger.record_tabular("SuccessRate{}".format(i),
                                          np.mean(success))
                    successes += np.sum(success)
                    trials += success.shape[0]
                # Without any trial the rate is undefined; log NaN instead of
                # raising ZeroDivisionError.
                success_rate = successes / trials if trials > 0 else float('nan')
                logger.record_tabular("SuccessRate", success_rate)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, all_samples_data)
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
    finally:
        # Always release the workers, even when an iteration raises.
        self.shutdown_worker()
def run_task(v):
    """Train the Arm3d key-insertion policy on starts generated by brownian motion.

    Seed starts are generated from ``v['start_goal']``; on every outer
    iteration new starts are produced from the current seeds with
    ``generate_starts``, regularised through a ``StateCollection``, optionally
    mixed with replayed starts, and used to train the policy with TRPO. The
    starts are then labelled (from the TRPO rollouts or manually), the policy
    is evaluated on starts sampled from the pickled collection of feasible
    states, and the seeds for the next iteration are chosen according to
    ``v['seed_with']`` ('only_goods', 'all_previous' or 'on_policy').

    Args:
        v: variant mapping with the experiment hyper-parameters.

    Raises:
        ValueError: if ``v['baseline']`` is neither 'linear' nor 'g_mlp'.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['start_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        # the goal is read from the last goal_size coordinates
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=v['policy_hidden_sizes'],
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['baseline'] == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        # Fail with a clear message instead of an unbound `baseline` below.
        raise ValueError(
            "Unknown baseline %r: expected 'linear' or 'g_mlp'" % (v['baseline'],))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v['pg_batch_size'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters'],
        step_size=0.01,
        discount=v['discount'],
        plot=False,
    )

    # Load the state collection from data_upload.
    # NOTE(review): pickle executes arbitrary code on load; only use files
    # from a trusted source.
    load_dir = 'data_upload/state_collections/'
    feasible_path = osp.join(config.PROJECT_PATH, load_dir,
                             'all_feasible_states.pkl')
    with open(feasible_path, 'rb') as f:  # close the handle deterministically
        all_feasible_starts = pickle.load(f)
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    brownian_starts = StateCollection(distance_threshold=v['regularize_starts'])

    logger.log(
        'Generating seed starts from the goal (horizon 10, subsample 600 of them)')
    with algo.env.set_kill_outside(radius=v['kill_radius']):
        seed_starts = generate_starts(
            env, starts=[v['start_goal']],
            horizon=10,  # this is smaller as they are seeds!
            variance=v['brownian_variance'],
            subsample=v['num_new_starts'])

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with algo.env.set_kill_outside(radius=v['kill_radius']):
            starts = generate_starts(algo.env, starts=seed_starts,
                                     horizon=v['brownian_horizon'],
                                     variance=v['brownian_variance'])
        # regularization of the brownian starts
        brownian_starts.empty()
        brownian_starts.append(starts)
        starts = brownian_starts.sample(size=v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 50 * (outer_iter // 50 + 1),
                              snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            algo.env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo.current_itr = 0
            trpo_paths = algo.train(already_init=outer_iter > 1)

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            starts, labels = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                as_goal=False, env=algo.env)
            paths = [path for batch in trpo_paths for path in batch]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, algo.env, policy, v['horizon'],
                                         as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            algo.env.log_diagnostics(paths)

        logger.record_tabular('brownian_starts', brownian_starts.size)

        # Log the fraction of evaluated starts falling in each label class.
        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)

        # Collapse the two label columns into one 0/1 column.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_4med_"):
            unif_starts = all_feasible_starts.sample(500)
            # Zero-pad only the feature axis up to start_size. A bare (0, n)
            # pad width would be applied to every axis and append n all-zero
            # start states as extra rows.
            missing_dims = v['start_size'] - unif_starts.shape[1]
            unif_starts = np.pad(unif_starts, ((0, 0), (0, missing_dims)),
                                 'constant')
            mean_reward, paths = evaluate_states(
                unif_starts, algo.env, policy, v['horizon'], n_traj=1,
                key='goal_reached', as_goals=False, full_path=True)
            algo.env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer):
        if v['seed_with'] == 'only_goods':
            logger.log("Appending good goals to replay and generating seeds")
            filtered_raw_starts = [
                start for start, label in zip(starts, labels) if label[0] == 1
            ]
            all_starts.append(filtered_raw_starts)
            if len(filtered_raw_starts) > 0:
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):
                # more low reward than high reward: sample from the replay
                seed_starts = all_starts.sample(300)
            else:
                # add a ton of noise if all the states ended up high reward
                with algo.env.set_kill_outside(radius=v['kill_radius']):
                    seed_starts = generate_starts(
                        algo.env, starts=starts,
                        horizon=int(v['horizon'] * 10),
                        subsample=v['num_new_starts'],
                        variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            logger.log("Appending all goals to replay and generating seeds")
            all_starts.append(starts)
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            all_starts.append(starts)
            with algo.env.set_kill_outside(radius=v['kill_radius']):
                seed_starts = generate_starts(algo.env, policy,
                                              horizon=v['horizon'],
                                              subsample=v['num_new_starts'])
def run_task(v):
    """Train the AntMaze policy on uniformly sampled feasible starts (baseline).

    The environment start generator is set once to a uniform generator over
    the pickled collection of feasible starts. Every outer iteration trains
    the policy with TRPO, then evaluates it on 100 starts sampled from that
    collection and on a fixed list of maze positions, plotting the labelled
    starts and writing the mean success of both evaluations to the report.

    Args:
        v: variant mapping with the experiment hyper-parameters.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): machine-specific fallback directory.
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=2)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # NOTE(review): the initial start generator is also built from
    # v['ultimate_goal']; it is replaced by the uniform generator below.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is true
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # NOTE(review): pickle executes arbitrary code on load; only use files
    # from a trusted source.
    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    feasible_path = osp.join(config.PROJECT_PATH, load_dir,
                             'good_all_feasible_starts.pkl')
    with open(feasible_path, 'rb') as f:  # close the handle deterministically
        all_feasible_starts = pickle.load(f)
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    # Thresholds passed to compute_labels.
    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    uniform_start_generator = UniformListStateGenerator(
        state_list=all_feasible_starts.state_list)

    # Fixed evaluation set: (x, y) maze positions in reversed order, each
    # followed by the same 13 remaining state coordinates.
    maze_xy = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2],
               [4, 3], [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    state_tail = [0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1]
    init_pos = np.array([xy + state_tail for xy in maze_xy])

    env.update_start_generator(uniform_start_generator)

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )
            algo.train()

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(
                unif_starts, env, policy, v['horizon'], n_traj=3,
                key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards, old_rewards=old_rewards,
                min_reward=min_reward, max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report,
                                itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(
                init_pos, env, policy, v['horizon'], n_traj=5,
                key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards, old_rewards=old_rewards,
                min_reward=min_reward, max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos, labels, report=report,
                                itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        # mean_reward here is the one from the fixed evaluation set above.
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
def run_task(v):
    """Train Bob on the Ant maze from start states proposed by Alice.

    Builds the goal/start exploration environment, Bob's policy and baseline,
    and an ``AliceEnv`` with its own policy, baseline and TRPO instance. Each
    outer iteration asks ``generate_starts_alice`` for new starts, optionally
    filters them and mixes in starts sampled from the replay buffer, trains
    Bob with TRPO, labels the starts from the training paths and appends the
    starts whose two label columns are both set to the replay buffer. Bob is
    then evaluated on 100 sampled feasible starts and on a fixed list of maze
    positions; results go to the HTML report and the tabular log.

    Args:
        v: Mapping of experiment hyper-parameters. Every key indexed below
            (``'seed'``, ``'horizon'``, ``'outer_iters'``, ...) is required.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): machine-specific fallback kept for compatibility.
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # create Alice
    env_alice = AliceEnv(
        env_alice=env,
        env_bob=env,
        policy_bob=policy,
        max_path_length=v['alice_horizon'],
        alice_factor=v['alice_factor'],
        alice_bonus=v['alice_bonus'],
        gamma=1,
        stop_threshold=v['stop_threshold'],
    )

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    # Alice's baseline is fitted on paths collected in env_alice, so it must be
    # built from Alice's spec (same as policy_alice), not from Bob's env.
    if v["baseline"] == "MLP":
        baseline_alice = GaussianMLPBaseline(env_spec=env_alice.spec)
    else:
        baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    # load the state collection from data_upload
    all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2])

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb') as f:
        all_feasible_starts = pickle.load(f)
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # Fixed evaluation positions: (x, y) cells, each extended below with the
    # same 13 trailing state values.
    init_pos = [[0, 0],
                [1, 0],
                [2, 0],
                [3, 0],
                [4, 0],
                [4, 1],
                [4, 2],
                [4, 3],
                [4, 4],
                [3, 4],
                [2, 4],
                [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1])
    init_pos = np.array(init_pos)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir,
        )

        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env, starts=starts, max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log("Labeling the starts")
        [starts, labels] = label_states_from_paths(
            trpo_paths,
            n_traj=v['n_traj'],
            key='goal_reached',  # using the min n_traj
            as_goal=False,
            env=env,
        )
        # labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        # Collapse the two label columns into a single 0/1 column.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        if not filtered_raw_starts:  # add a tone of noise if all the states I had ended up being high_reward!
            logger.log("Bad Alice! All goals are high reward!")

        all_starts.append(filtered_raw_starts)

        # Useful plotting and metrics (basic test set)
        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'],
                                                 key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward,
                                    max_reward=max_reward, improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(init_pos, env, policy, v['horizon'], n_traj=5,
                                                 key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward,
                                    max_reward=max_reward, improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
def run_task(v):
    """Train a policy on the Ant maze with Brownian-motion start generation.

    Seed starts are generated from ``v['start_goal']`` with ``generate_starts``
    and optionally filtered for feasibility. Each outer iteration generates new
    starts from the current seeds, optionally mixes in starts sampled from the
    replay buffer, trains the policy with TRPO, labels the starts from the
    training paths, stores the starts whose two label columns are both set,
    and picks the next seeds according to ``v['seed_with']``. Unless
    ``v['debug']`` is set, the policy is then evaluated on a fixed list of maze
    positions; per-position results are appended to a CSV file in the log
    directory. If ``v['scratch_dir']`` is set, the log directory is rsync'ed
    there on iteration 1, every 5th iteration, and once after the loop.

    Args:
        v: Mapping of experiment hyper-parameters. ``'scratch_dir'`` is
            optional; every other key indexed below is required.

    Raises:
        ValueError: If ``v['seed_with']`` is neither ``'only_goods'`` nor
            ``'all_previous'``.
        subprocess.CalledProcessError: If an rsync invocation fails.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    # Only referenced by the commented-out debugging snippets below.
    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    save_dir = 'data/debug/'
    # with open(os.path.join(config.PROJECT_PATH, save_dir, "test.pkl"), 'wb') as handle:
    #     pickle.dump({}, handle)

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2])

    # initial brownian horizon and size are pretty important
    logger.log("Brownian horizon: {}".format(v['initial_brownian_horizon']))
    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=15000,
        variance=v['brownian_variance'],
        animated=False,
    )

    if v['filter_bad_starts']:
        logger.log("Prefilter seed starts: {}".format(len(seed_starts)))
        seed_starts = parallel_check_feasibility(
            env=env, starts=seed_starts, max_path_length=v['feasibility_path_length'])
        logger.log("Filtered seed starts: {}".format(len(seed_starts)))

    # can also filter these starts optionally

    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb'))
    # logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # Fixed evaluation positions: (x, y) cells, each extended below with the
    # same 13 trailing state values.
    init_pos = [[0, 0],
                [1, 0],
                [2, 0],
                [3, 0],
                [4, 0],
                [4, 1],
                [4, 2],
                [4, 3],
                [4, 4],
                [3, 4],
                [2, 4],
                [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1])
    init_pos = np.array(init_pos)

    with open(osp.join(log_dir, 'init_pos.json'), 'w') as f:
        json.dump(init_pos.tolist(), f)

    scratch_dir = v.get('scratch_dir', False)

    for outer_iter in range(1, v['outer_iters'] + 1):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        report.save()

        # generate starts from the previous seed starts, which are defined below
        starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], size=2000,
                                 horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        # note: this messes with the balance between starts and old_starts!
        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env, starts=starts, max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            # with open(os.path.join(config.PROJECT_PATH, save_dir, "qval{}.pkl".format(outer_iter)), 'wb') as handle:
            #     pickle.dump(all_starts.q_vals, handle)
            # with open(os.path.join(config.PROJECT_PATH, save_dir, "preval{}.pkl".format(outer_iter)), 'wb') as handle:
            #     pickle.dump(all_starts.prev_vals, handle)
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # plot starts before training
        # takes too much time
        # labels = label_states(starts, env, policy, v['horizon'],
        #                       as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        # plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
        #                     center=v['goal_center'], maze_id=v['maze_id'],
        #                     summary_string_base='initial starts labels:\n')

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("Labeling the starts")

        [starts, labels] = label_states_from_paths(trpo_paths, n_traj=v['n_traj'], key='goal_reached',
                                                   as_goal=False, env=env)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        # Collapse the two label columns into a single 0/1 column.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
                logger.log("We have {} good starts!".format(len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("More bad starts than good starts, sampling seeds from replay buffer")
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2,
                                              subsample=v['num_new_starts'], size=10000,
                                              variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
            filtered_raw_starts = starts  # no filtering done
        else:
            raise ValueError("Unknown 'seed_with' option: {!r}".format(v['seed_with']))

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        if not v["debug"]:
            # with logger.tabular_prefix("Uniform_"):
            #     unif_starts = all_feasible_starts.sample(100)
            #     mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached',
            #                                          as_goals=False, full_path=True)
            #     env.log_diagnostics(paths)
            #     mean_rewards = mean_reward.reshape(-1, 1)
            #     labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward,
            #                             improvement_threshold=improvement_threshold)
            #     logger.log("Starts labelled")
            #     plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
            #                         center=v['goal_center'], maze_id=v['maze_id'],
            #                         summary_string_base='initial starts labels:\n')
            #     report.add_text("Uniform Success: " + str(np.mean(mean_reward)))

            with logger.tabular_prefix("Fixed_"):
                mean_reward, paths = evaluate_states(init_pos, env, policy, v['horizon'], n_traj=5,
                                                     key='goal_reached', as_goals=False, full_path=True)

                # newline='' lets the csv module control line endings itself.
                with open(osp.join(log_dir, 'init_pos_per_state_mean_return.csv'), 'a', newline='') as f:
                    writer = csv.writer(f)
                    row = [outer_iter] + list(mean_reward)
                    writer.writerow(row)

                env.log_diagnostics(paths)
                mean_rewards = mean_reward.reshape(-1, 1)
                labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward,
                                        max_reward=max_reward, improvement_threshold=improvement_threshold)
                logger.log("Starts labelled")
                plot_labeled_states(init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                    center=v['goal_center'], maze_id=v['maze_id'],
                                    summary_string_base='initial starts labels:\n')
                report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

            report.new_row()
            report.save()
            logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
            logger.dump_tabular()

        # Sync only when a scratch dir is configured. The original condition
        # parsed as `A or (B and C)`, so iteration 1 tried to sync (and index
        # v['scratch_dir']) even when no scratch dir was set.
        if scratch_dir and (outer_iter == 1 or outer_iter % 5 == 0):
            # Pass argv as a list so paths containing whitespace stay intact.
            command = ['rsync', '-a', '--delete',
                       os.path.join(log_dir, ''), os.path.join(scratch_dir, '')]
            print("Running command:\n{}".format(' '.join(command)))
            subprocess.run(command, check=True)

    # Final sync (without --delete) once all outer iterations are done.
    if scratch_dir:
        command = ['rsync', '-a', os.path.join(log_dir, ''), os.path.join(scratch_dir, '')]
        print("Running command:\n{}".format(' '.join(command)))
        subprocess.run(command, check=True)
def run_task(v):
    """Train Bob on goal reaching with goals proposed by Alice (Ant, no maze).

    Sets up the goal exploration environment, Bob's policy/baseline and an
    ``AliceEnv`` with its own policy, baseline and TRPO instance. Every outer
    iteration obtains goals from ``generate_states_alice``, optionally stacks
    goals sampled from the replay buffer on top, trains Bob with TRPO, labels
    the goals from the training paths, plots heatmap and labelled goals into
    the HTML report, and appends the goals whose two label columns are both
    set (plus, if ``v['add_on_policy']``, goals from ``generate_initial_goals``)
    to the replay buffer.

    Args:
        v: Mapping of experiment hyper-parameters. ``'sampling_res'`` is
            optional (defaults to 0); all other keys indexed below are required.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v.keys() else 0

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    def build_baseline(spec):
        # 'g_mlp' selects the Gaussian MLP baseline; anything else is linear.
        if v['baseline'] == 'g_mlp':
            return GaussianMLPBaseline(env_spec=spec)
        return LinearFeatureBaseline(env_spec=spec)

    def draw_heatmap(itr):
        # Same heatmap call is used before training and after every outer iteration.
        test_and_plot_policy(
            policy,
            env,
            max_reward=v['max_reward'],
            sampling_res=sampling_res,
            n_traj=v['n_traj'],
            itr=itr,
            report=report,
            limit=v['goal_range'],
            center=v['goal_center'],
            bounds=v['goal_range'],
        )

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(
        state_size=v['goal_size'],
        bounds=v['goal_range'],
        center=v['goal_center'],
    )
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = build_baseline(env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    draw_heatmap(outer_iter)
    logger.log('Saving to report')

    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(
        env_alice=env,
        env_bob=env,
        policy_bob=policy,
        max_path_length=v['alice_horizon'],
        alice_factor=v['alice_factor'],
        alice_bonus=v['alice_bonus'],
        gamma=1,
        stop_threshold=v['stop_threshold'],
        start_generation=False,
    )

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = build_baseline(env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        raw_goals, t_alices = generate_states_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            num_new_states=v['num_new_goals'],
            log_dir=log_dir,
            start_generation=False,
        )

        # Fresh goals only, unless the replay buffer has something to add.
        goals = raw_goals
        if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
            replayed_goals = all_goals.sample(v['num_old_goals'])
            goals = np.vstack([raw_goals, replayed_goals])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            goal_generator = UniformListStateGenerator(
                goals.tolist(),
                persistence=v['persistence'],
                with_replacement=v['with_replacement'],
            )
            env.update_goal_generator(goal_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            all_paths = algo.train()

        [goals, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached')

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        draw_heatmap(outer_iter)

        # logger.log("Labeling the goals")
        # labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(goals, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Collapse the two label columns into a single 0/1 column.
        both_set = np.logical_and(labels[:, 0], labels[:, 1])
        labels = both_set.astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [g for g, flag in zip(goals, labels) if flag[0] == 1]
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(
                env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
def run_task(v):
    """Train a policy on the point maze with Brownian-motion start generation.

    Sets up the goal/start exploration environment, policy and baseline, plots
    the initial policy, and seeds the start set from ``v['ultimate_goal']``.
    Every outer iteration generates starts from the current seeds, labels and
    plots them, optionally stacks starts sampled from the replay buffer on
    top, trains with TRPO, re-labels the starts (from the TRPO paths or by
    fresh rollouts, depending on ``v['use_trpo_paths']``), logs and plots, adds
    the starts whose two label columns are both set to the replay buffer, and
    chooses the next seeds according to ``v['seed_with']``.

    Args:
        v: Mapping of experiment hyper-parameters. ``'sampling_res'`` is
            optional (defaults to 2); all other keys indexed below are required.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v.keys() else 2
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    def draw_policy_plots(itr):
        # The policy-means plot and the success heatmap are always drawn as a pair.
        plot_policy_means(policy, env, sampling_res=2, report=report,
                          limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(
            policy,
            env,
            as_goals=False,
            max_reward=v['max_reward'],
            sampling_res=sampling_res,
            n_traj=v['n_traj'],
            itr=itr,
            report=report,
            center=v['goal_center'],
            limit=v['goal_range'],  # use goal for plot
        )

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(
        state_size=v['start_size'],
        bounds=v['start_range'],
        center=v['start_center'],
    )

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['constant_baseline']:
        logger.log("Using constant baseline")
        baseline = ConstantBaseline(env_spec=env.spec, value=1.0)
    else:
        logger.log("Using linear baseline")
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    draw_policy_plots(outer_iter)
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts = generate_starts(
            env,
            starts=seed_starts,
            subsample=v['num_new_starts'],
            horizon=v['brownian_horizon'],
            variance=v['brownian_variance'],
        )
        labels = label_states(starts, env, policy, v['horizon'],
                              as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            replayed_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, replayed_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            start_generator = UniformListStateGenerator(
                starts.tolist(),
                persistence=v['persistence'],
                with_replacement=v['with_replacement'],
            )
            env.update_start_generator(start_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=env,
            )
            # Flatten the per-iteration path lists into one list of paths.
            paths = [single for batch in trpo_paths for single in batch]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False,
                                         n_traj=v['n_traj'], key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        draw_policy_plots(outer_iter)

        logger.log("Labeling the starts")
        #labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')

        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        start_classes, text_labels = convert_label(labels)

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        # Collapse the two label columns into a single 0/1 column.
        both_set = np.logical_and(labels[:, 0], labels[:, 1])
        labels = both_set.astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [s for s, flag in zip(starts, labels) if flag[0] == 1]
        all_starts.append(filtered_raw_starts)

        # Choose the seeds for the next iteration; an unrecognised option
        # leaves seed_starts unchanged.
        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a tone of noise if all the states I had ended up being high_reward!
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                seed_starts = generate_starts(
                    env,
                    starts=starts,
                    horizon=int(v['horizon'] * 10),
                    subsample=v['num_new_starts'],
                    variance=v['brownian_variance'] * 10,
                )
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'],
                                          subsample=v['num_new_starts'])
def process_samples(self, itr, paths):
    """Turn sampled paths into training data, refit the baseline and log stats.

    For every path this stores ``path["advantages"]`` (discounted cumulative
    sum of the TD residuals, with factor ``self.discount * self.gae_lambda``)
    and ``path["returns"]`` (discounted cumulative sum of the rewards). The
    per-path arrays are then concatenated, the advantages are optionally
    centered and/or shifted to be positive, the baseline is refitted, and
    summary statistics are recorded under the ``Low_`` tabular prefix.

    Baseline predictions are taken before the baseline is refitted, so the
    logged explained variance refers to the previous fit.

    Args:
        itr: Iteration number, recorded as ``Iteration``.
        paths: List of path dicts providing ``"observations"``, ``"actions"``,
            ``"rewards"``, ``"env_infos"`` and ``"agent_infos"``; each dict is
            mutated in place.

    Returns:
        dict with keys ``observations``, ``actions``, ``rewards``, ``returns``,
        ``advantages``, ``env_infos``, ``agent_infos`` and ``paths``.
    """
    if hasattr(self.baseline, "predict_n"):
        predictions = self.baseline.predict_n(paths)
    else:
        predictions = [self.baseline.predict(p) for p in paths]

    baseline_chunks = []
    return_chunks = []
    for p, prediction in zip(paths, predictions):
        # Pad with a terminal value of 0 so every step has a "next" baseline.
        padded = np.append(prediction, 0)
        bootstrapped = p["rewards"] + self.discount * padded[1:]
        td_residuals = bootstrapped - padded[:-1]
        p["advantages"] = special.discount_cumsum(
            td_residuals, self.discount * self.gae_lambda)
        p["returns"] = special.discount_cumsum(p["rewards"], self.discount)
        baseline_chunks.append(padded[:-1])
        return_chunks.append(p["returns"])

    ev = special.explained_variance_1d(
        np.concatenate(baseline_chunks),
        np.concatenate(return_chunks),
    )

    # if not self.algo.policy.recurrent:
    def stacked(key):
        # Concatenate one per-path array field across all paths.
        return tensor_utils.concat_tensor_list([p[key] for p in paths])

    def stacked_dict(key):
        # Concatenate one per-path dict-of-arrays field across all paths.
        return tensor_utils.concat_tensor_dict_list([p[key] for p in paths])

    observations = stacked("observations")
    actions = stacked("actions")
    rewards = stacked("rewards")
    returns = stacked("returns")
    advantages = stacked("advantages")
    env_infos = stacked_dict("env_infos")
    agent_infos = stacked_dict("agent_infos")

    if self.center_adv:
        advantages = util.center_advantages(advantages)

    if self.positive_adv:
        advantages = util.shift_advantages_to_positive(advantages)

    average_discounted_return = np.mean([p["returns"][0] for p in paths])
    undiscounted_returns = [sum(p["rewards"]) for p in paths]
    ent = np.mean(self.policy.distribution.entropy(agent_infos))

    samples_data = {
        "observations": observations,
        "actions": actions,
        "rewards": rewards,
        "returns": returns,
        "advantages": advantages,
        "env_infos": env_infos,
        "agent_infos": agent_infos,
        "paths": paths,
    }

    logger.log("fitting baseline...")
    if hasattr(self.baseline, 'fit_with_samples'):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)
    logger.log("fitted")

    stats = [
        ('Iteration', itr),
        ('AverageDiscountedReturn', average_discounted_return),
        ('AverageReturn', np.mean(undiscounted_returns)),
        ('ExplainedVariance', ev),
        ('NumTrajs', len(paths)),
        ('Entropy', ent),
        ('Perplexity', np.exp(ent)),
        ('StdReturn', np.std(undiscounted_returns)),
        ('MaxReturn', np.max(undiscounted_returns)),
        ('MinReturn', np.min(undiscounted_returns)),
    ]
    with logger.tabular_prefix('Low_'):
        for stat_name, stat_value in stats:
            logger.record_tabular(stat_name, stat_value)

    return samples_data
def run_task(v):
    """Run the ant-maze experiment that trains on a fixed list of start states.

    Builds the environment, policy and baseline from the variant ``v``. Each
    outer iteration then:

    1. asks ``Online_TCSL`` for a distribution over the fixed start list,
    2. trains TRPO with starts drawn from that distribution,
    3. labels the starts from the training paths and updates the generator's
       Q-values,
    4. evaluates the policy on 100 sampled feasible starts and on the fixed
       start list, adding plots to the HTML report.

    :param v: mapping of experiment settings. Keys read here: 'seed',
        'ultimate_goal', 'start_size', 'terminal_eps', 'distance_metric',
        'extend_dist_rew', 'inner_weight', 'goal_weight', 'learn_std',
        'adaptive_std', 'output_gain', 'policy_init_std', 'baseline',
        'coll_eps', 'outer_iters', 'pg_batch_size', 'horizon', 'inner_iters',
        'discount', 'n_traj', 'goal_range', 'goal_center', 'maze_id',
        'seed_with', 'num_new_starts', 'brownian_variance'.
    :raises ValueError: if ``v['seed_with']`` is neither 'only_goods' nor
        'all_previous' (checked after training, once per outer iteration).
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        # NOTE(review): hard-coded, machine-specific fallback directory.
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    # The start generator is replaced inside the outer loop before training.
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Replay buffer of good starts; the transform keeps the first two columns.
    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])

    # can also filter these starts optionally
    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    feasible_starts_path = osp.join(config.PROJECT_PATH, load_dir,
                                    'good_all_feasible_starts.pkl')
    # NOTE(review): unpickling can execute arbitrary code; this presumes the
    # file is a trusted project artifact - confirm.
    # The context manager closes the file handle once loading is done.
    with open(feasible_starts_path, 'rb') as starts_file:
        all_feasible_starts = pickle.load(starts_file)
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    # hardest to easiest
    init_pos = [[0, 0],
                [1, 0],
                [2, 0],
                [3, 0],
                [4, 0],
                [4, 1],
                [4, 2],
                [4, 3],
                [4, 4],
                [3, 4],
                [2, 4],
                [1, 4]
                ][::-1]
    # Extend every (x, y) pair in place with the same 13 trailing values.
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
    array_init_pos = np.array(init_pos)
    init_pos = [tuple(pos) for pos in init_pos]
    online_start_generator = Online_TCSL(init_pos)

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")
        report.save()

        # generate starts from the previous seed starts, which are defined below
        dist = online_start_generator.get_distribution()  # added
        logger.log(np.array_str(online_start_generator.get_q()))
        # how to log Q values?
        # with logger.tabular_prefix("General: "):
        #     logger.record_tabular("Q values:", online_start_generator.get_q())
        logger.log(np.array_str(dist))

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            # TODO: might be faster to sample if we just create a roughly
            # representative UniformListStateGenerator?
            env.update_start_generator(
                ListStateGenerator(
                    init_pos, dist
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        logger.log("Labeling the starts")
        [starts, labels, mean_rewards, updated] = label_states_from_paths(
            trpo_paths, n_traj=v['n_traj'], key='goal_reached',  # using the min n_traj
            as_goal=False, env=env, return_mean_rewards=True,
            order_of_states=init_pos)

        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'])

        online_start_generator.update_q(np.array(mean_rewards), np.array(updated))  # added
        # Collapse the two label columns into one 0/1 column.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]

        # NOTE(review): seed_starts is not read again in this function; the
        # branches are kept because the calls they make may have side effects.
        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:
                logger.log("We have {} good starts!".format(len(filtered_raw_starts)))
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("More bad starts than good starts, sampling seeds from replay buffer")
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                # add a ton of noise if all the states I had ended up being high_reward!
                logger.log("More good starts than bad starts, resampling")
                seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2,
                                              subsample=v['num_new_starts'], size=10000,
                                              variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        else:
            # ValueError is still an Exception, so broad handlers keep working.
            raise ValueError(
                "Unknown seed_with option: {!r}".format(v['seed_with']))

        all_starts.append(filtered_raw_starts)

        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'],
                                                 key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward,
                                    max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(array_init_pos, env, policy, v['horizon'], n_traj=5,
                                                 key='goal_reached', as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward,
                                    max_reward=max_reward,
                                    improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(array_init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()
        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
    """Record the summed 'penalty' env-info of all paths as a tabular entry.

    :param paths: iterable of path dicts; ``path['env_infos']['penalty']``
        must expose a ``.sum()`` method.
    :param log_prefix: tabular prefix under which "Penalty" is recorded.
    """
    total_penalty = 0
    for single_path in paths:
        total_penalty = total_penalty + single_path['env_infos']['penalty'].sum()

    with logger.tabular_prefix(log_prefix):
        logger.record_tabular("Penalty", total_penalty)
def run_task(v):
    """Run the point-maze experiment that grows starts by brownian motion.

    Builds the environment, policy and baseline from the variant ``v``. Each
    outer iteration then generates new starts from the current seed starts,
    optionally mixes in starts sampled from the replay buffer, trains TRPO on
    them, relabels the starts, picks the next seed starts and writes plots
    and per-state evaluation rows to the snapshot directory. When
    ``v['scratch_dir']`` is set, the snapshot directory is mirrored there
    with rsync on iteration 1, on every 5th iteration and once at the end.

    :param v: mapping of experiment settings. Besides the environment and
        policy hyperparameters, the keys read here include 'seed',
        'outer_iters', 'replay_buffer', 'num_old_starts', 'num_new_starts',
        'use_trpo_paths', 'seed_with', 'brownian_horizon',
        'brownian_variance' and the optional 'sampling_res' and
        'scratch_dir'.
    :raises subprocess.CalledProcessError: if an rsync call exits non-zero.
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # seed_starts: from which we will be performing brownian motion exploration
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts'])

    def plot_states(states, report, itr, summary_string, **kwargs):
        # Plot a set of states with a single marker and add it to the report.
        states = np.array(states)
        if states.size == 0:
            # Substitute one dummy point so an empty set can still be plotted.
            states = np.zeros((1, 2))
        img = plot_labeled_samples(
            states, np.zeros(len(states), dtype='uint8'), markers={0: 'o'}, text_labels={0: "all"}, **kwargs)
        report.add_image(img, 'itr: {}\n{}'.format(itr, summary_string), width=500)

    def sync_to_scratch():
        # Mirror the snapshot directory into v['scratch_dir'] with rsync.
        # Joining with '' appends a trailing separator to each path. The
        # arguments go to rsync as a list, so paths containing whitespace
        # are not split into separate arguments.
        source = os.path.join(log_dir, '')
        target = os.path.join(v['scratch_dir'], '')
        command = 'rsync -a {} {}'.format(source, target)
        print("Running command:\n{}".format(command))
        subprocess.run(['rsync', '-a', source, target], check=True)

    for outer_iter in range(1, v['outer_iters']):
        report.new_row()

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        plot_states(
            seed_starts, report=report, itr=outer_iter, limit=v['goal_range'],
            center=v['goal_center'], maze_id=v['maze_id'],
            summary_string="seed starts")

        starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'],
                                 horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        plot_states(
            starts, report=report, itr=outer_iter, limit=v['goal_range'],
            center=v['goal_center'], maze_id=v['maze_id'],
            summary_string="brownian starts")

        sampled_from_buffer = []
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            sampled_from_buffer = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, sampled_from_buffer])

        # Plotted even when empty; plot_states handles the empty case.
        plot_states(
            sampled_from_buffer, report=report, itr=outer_iter, limit=v['goal_range'],
            center=v['goal_center'], maze_id=v['maze_id'],
            summary_string="states sampled from buffer")

        labels = label_states(starts, env, policy, v['horizon'],
                              as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base='all starts before update\n')

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached', as_goal=False, env=env)
            # Flatten the per-iteration path lists into one list of paths.
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(
                starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'],
                key='goal_reached', full_path=True)

        start_classes, text_labels = convert_label(labels)

        plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
                            center=v['goal_center'], maze_id=v['maze_id'],
                            summary_string_base="all starts after update\n")

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        # Collapse the two label columns into one 0/1 column.
        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)

        # Any other 'seed_with' value leaves seed_starts unchanged.
        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:
                logger.log("Only goods A")
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                logger.log("Only goods B")
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                logger.log("Only goods C")
                # add a ton of noise if all the states I had ended up being high_reward
                seed_starts = generate_starts(
                    env, starts=starts, horizon=int(v['horizon'] * 10),
                    subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'],
                                          subsample=v['num_new_starts'])

        logger.log('Generating Heatmap...')
        plot_policy_means(
            policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'],
            center=v['goal_center'])

        _, _, states, returns, successes = test_and_plot_policy2(
            policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res,
            n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'],
            limit=v['goal_range'])

        # The evaluation states are written only if the file does not exist yet.
        eval_state_path = osp.join(log_dir, "eval_states.json")
        if not osp.exists(eval_state_path):
            with open(eval_state_path, 'w') as f:
                json.dump(np.array(states).tolist(), f)

        # newline='' lets the csv writer control line endings itself.
        with open(osp.join(log_dir, 'eval_pos_per_state_mean_return.csv'), 'a', newline='') as f:
            writer = csv.writer(f)
            row = [outer_iter] + list(returns)
            writer.writerow(row)

        with open(osp.join(log_dir, 'eval_pos_per_state_mean_success.csv'), 'a', newline='') as f:
            writer = csv.writer(f)
            row = [outer_iter] + list(successes)
            writer.writerow(row)

        logger.dump_tabular()
        report.save()

        # Parenthesised on purpose: without the parentheses `and` binds
        # tighter than `or`, so iteration 1 would try to sync even when no
        # scratch_dir is configured.
        if (outer_iter == 1 or outer_iter % 5 == 0) and v.get('scratch_dir', False):
            sync_to_scratch()

    # Final sync after the last outer iteration.
    if v.get('scratch_dir', False):
        sync_to_scratch()
def run_task(v):
    """Run the point-maze experiment whose starts come from asymmetric self-play.

    Builds the main environment and policy plus an ``AliceEnv`` with its own
    policy and TRPO instance. Each outer iteration generates new starts with
    ``generate_starts_alice``, optionally mixes in starts sampled from the
    replay buffer, trains TRPO on them, plots heatmaps and start labels into
    the HTML report, and appends the starts whose two label columns are both
    true to the replay buffer.

    :param v: mapping of experiment settings (seed, environment, policy,
        TRPO and Alice hyperparameters; 'sampling_res' is optional and
        defaults to 2).
    """
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v['sampling_res'] if 'sampling_res' in v.keys() else 2
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report_path = osp.join(log_dir, 'report.html')
    report = HTMLReport(report_path, images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    goal_gen = FixedStateGenerator(state=v['ultimate_goal'])
    start_gen = UniformStateGenerator(
        state_size=v['start_size'],
        bounds=v['start_range'],
        center=v['start_center'],
    )

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=start_gen,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=goal_gen,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    def draw_heatmaps(itr):
        # Add the policy-means plot and the policy test plot to the report.
        plot_policy_means(policy, env, sampling_res=sampling_res, report=report,
                          limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=itr, report=report, center=v['goal_center'],
                             limit=v['goal_range'])

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    draw_heatmaps(outer_iter)
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(
        env_alice=env,
        env_bob=env,
        policy_bob=policy,
        max_path_length=v['alice_horizon'],
        alice_factor=v['alice_factor'],
        alice_bonus=v['alice_bonus'],
        gamma=1,
        stop_threshold=v['stop_threshold'],
    )

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different
        # variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir,
        )

        labels = label_states(starts, env, policy, v['horizon'], as_goals=False,
                              n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        # Mix in previously stored starts once the buffer is non-empty.
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            replayed = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, replayed])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment start generator")
            list_generator = UniformListStateGenerator(
                starts.tolist(),
                persistence=v['persistence'],
                with_replacement=v['with_replacement'],
            )
            env.update_start_generator(list_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=v['step_size'],
                discount=v['discount'],
                plot=False,
            )

            # The training paths are not used for labeling here, so the
            # return value of train() is discarded.
            algo.train()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        draw_heatmaps(outer_iter)

        logger.log("Labeling the starts")
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False,
                              n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])

        # Collapse the two label columns into one 0/1 column.
        both_true = np.logical_and(labels[:, 0], labels[:, 1])
        labels = both_true.astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = []
        for start, label in zip(starts, labels):
            if label[0] == 1:
                filtered_raw_starts.append(start)

        if not filtered_raw_starts:
            logger.log("Bad Alice! All goals are high reward!")

        all_starts.append(filtered_raw_starts)