def __call__(self):
    results = OrderedDict()
    for name, indices in [
            ('train_tasks', self.train_task_indices),
            ('test_tasks', self.test_task_indices),
    ]:
        final_returns, online_returns, idx_to_final_context = self.algorithm._do_eval(
            indices, -1)
        results['eval/adaptation/{}/final_returns Mean'.format(
            name)] = np.mean(final_returns)
        results['eval/adaptation/{}/all_returns Mean'.format(
            name)] = np.mean(online_returns)
        if 'train' in name:
            z_dist_log = self.algorithm._get_z_distribution_log(
                idx_to_final_context)
            append_log(results, z_dist_log,
                       prefix='trainer/{}/'.format(name))
    paths = []
    for idx in self.train_task_indices:
        paths += self._get_init_from_buffer_path(idx)
    results[
        'eval/init_from_buffer/train_tasks/all_returns Mean'] = np.mean(
            eval_util.get_average_returns(paths))
    return results

def _do_eval(self, indices, epoch):
    final_returns = []
    online_returns = []
    for idx in indices:
        all_rets = []
        all_success = []
        for r in range(self.num_evals):
            paths = self.collect_paths(idx, epoch, r)
            all_rets.append(
                [eval_util.get_average_returns([p]) for p in paths])
            all_success.append(eval_util.get_success_rate(paths))
        success_rate = np.mean(all_success)
        # NOTE: the key uses self.task_idx, which is assumed to be set to idx by collect_paths
        self.eval_statistics['Success_test_task{}'.format(
            self.task_idx)] = success_rate
        final_returns.append(np.mean([a[-1] for a in all_rets]))
        # record online returns for the first n trajectories
        n = min([len(a) for a in all_rets])
        all_rets = [a[:n] for a in all_rets]
        all_rets = np.mean(np.stack(all_rets), axis=0)  # avg return per nth rollout
        online_returns.append(all_rets)
        self.eval_statistics['AverageReturn_test_task{}'.format(
            self.task_idx)] = all_rets
    n = min([len(t) for t in online_returns])
    online_returns = [t[:n] for t in online_returns]
    return final_returns, online_returns

def _get_returns_init_from_offline_buffer(self, indices):
    train_returns = []
    for idx in indices:
        self.env.reset_task(idx)
        paths = []
        for _ in range(self.num_steps_per_eval // self.max_path_length):
            init_context = self._reward_decoder_buffer.sample_context(
                idx, self.embedding_batch_size)
            init_context = ptu.from_numpy(init_context)
            p, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length,
                accum_context=False,
                max_trajs=1,
                resample_latent_period=0,
                update_posterior_period=0,
                initial_context=init_context,
                task_idx=idx,
            )
            paths += p
        if self.sparse_rewards:
            for p in paths:
                # np.stack expects a sequence, not a generator
                sparse_rewards = np.stack(
                    [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards
        train_returns.append(eval_util.get_average_returns(paths))
    return train_returns

def evaluate(self, epoch, eval_paths=None):
    statistics = OrderedDict()
    statistics.update(self.eval_statistics)

    logger.log("Collecting samples for evaluation")
    if eval_paths:
        test_paths = eval_paths
    else:
        test_paths = self.get_eval_paths()

    statistics.update(
        eval_util.get_generic_path_information(
            test_paths,
            stat_prefix="Test",
        ))
    if len(self._exploration_paths) > 0:
        statistics.update(
            eval_util.get_generic_path_information(
                self._exploration_paths,
                stat_prefix="Exploration",
            ))
    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(test_paths, logger=logger)
    if hasattr(self.env, "get_diagnostics"):
        statistics.update(self.env.get_diagnostics(test_paths))

    average_returns = eval_util.get_average_returns(test_paths)
    statistics['AverageReturn'] = average_returns
    for key, value in statistics.items():
        logger.record_tabular(key, value)
    self.need_to_update_eval_statistics = True

def _do_eval(self, indices, epoch):
    final_returns = []
    online_returns = []
    for idx in indices:
        runs, all_rets = [], []
        for r in range(self.num_evals):
            paths = self.collect_paths(idx, epoch, r)
            all_rets.append(
                [eval_util.get_average_returns([p]) for p in paths])
            runs.append(paths)
        all_rets = np.mean(np.stack(all_rets), axis=0)  # avg return per nth rollout
        final_returns.append(all_rets[-1])
        online_returns.append(all_rets)
    return final_returns, online_returns

def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ statistics = OrderedDict() try: statistics.update(self.eval_statistics) self.eval_statistics = None except: print('No Stats to Eval') logger.log("Collecting samples for evaluation") test_paths = self.eval_sampler.obtain_samples() statistics.update( eval_util.get_generic_path_information( test_paths, stat_prefix="Test", )) statistics.update( eval_util.get_generic_path_information( self._exploration_paths, stat_prefix="Exploration", )) if hasattr(self.env, "log_diagnostics"): self.env.log_diagnostics(test_paths) if hasattr(self.env, "log_statistics"): statistics.update(self.env.log_statistics(test_paths)) if epoch % self.freq_log_visuals == 0: if hasattr(self.env, "log_visuals"): self.env.log_visuals(test_paths, epoch, logger.get_snapshot_dir()) average_returns = eval_util.get_average_returns(test_paths) statistics['AverageReturn'] = average_returns for key, value in statistics.items(): logger.record_tabular(key, value) best_statistic = statistics[self.best_key] if best_statistic > self.best_statistic_so_far: self.best_statistic_so_far = best_statistic if self.save_best and epoch >= self.save_best_starting_from_epoch: data_to_save = {'epoch': epoch, 'statistics': statistics} data_to_save.update(self.get_epoch_snapshot(epoch)) logger.save_extra_data(data_to_save, 'best.pkl') print('\n\nSAVED BEST\n\n')
def collect_data_for_embedding_online_with_logging(self, idx, epoch):
    self.task_idx = idx
    dprint('Task:', idx)
    self.env.reset_task(idx)

    n_exploration_episodes = 10
    n_inference_episodes = 10
    all_init_paths = []
    all_inference_paths = []

    self.enc_replay_buffer.clear_buffer(idx)
    for i in range(n_exploration_episodes):
        initial_z = self.sample_z_from_prior()
        init_paths = self.obtain_eval_paths(idx, z=initial_z, eval_task=True)
        all_init_paths += init_paths
        self.enc_replay_buffer.add_paths(idx, init_paths)
    dprint('enc_replay_buffer.task_buffers[idx]._size',
           self.enc_replay_buffer.task_buffers[idx]._size)

    for i in range(n_inference_episodes):
        paths = self.obtain_eval_paths(idx, eval_task=True)
        all_inference_paths += paths
        # add the newly collected inference paths to the encoder buffer
        self.enc_replay_buffer.add_paths(idx, paths)

    # save evaluation rollouts for vis
    # all paths
    with open(
            self.output_dir +
            "/proto-sac-point-mass-fb-16z-init-task{}-{}.pkl".format(
                idx, epoch), 'wb+') as f:
        pickle.dump(all_init_paths, f, pickle.HIGHEST_PROTOCOL)
    with open(
            self.output_dir +
            "/proto-sac-point-mass-fb-16z-inference-task{}-{}.pkl".format(
                idx, epoch), 'wb+') as f:
        pickle.dump(all_inference_paths, f, pickle.HIGHEST_PROTOCOL)

    average_inference_returns = [
        eval_util.get_average_returns(paths)
        for paths in all_inference_paths
    ]
    self.eval_statistics['AverageInferenceReturns_test_task{}'.format(
        idx)] = average_inference_returns

def _do_eval(self, goal_set, epoch):
    final_returns = []
    final_achieved = []
    for goal in goal_set:
        all_rets = []
        all_achieved = []
        for r in range(self.num_evals):
            paths = self.collect_paths(goal, epoch, r)
            all_rets.append(
                [eval_util.get_average_returns([p]) for p in paths])
            all_achieved.append(
                [eval_util.get_average_achieved([p]) for p in paths])
        final_returns.append(np.mean([a[-1] for a in all_rets]))
        final_achieved.append(np.mean([a[-1] for a in all_achieved]))
    return final_returns, final_achieved

def _do_eval(self, indices, epoch):
    final_returns = []
    online_returns = []
    for idx in indices:
        all_rets = []
        for r in range(self.num_evals):
            paths = self.collect_paths(idx, epoch, r)
            all_rets.append(
                [eval_util.get_average_returns([p]) for p in paths])
        final_returns.append(np.mean([a[-1] for a in all_rets]))
        # record online returns for the first n trajectories
        n = min([len(a) for a in all_rets])
        all_rets = [a[:n] for a in all_rets]
        all_rets = np.mean(np.stack(all_rets), axis=0)  # avg return per nth rollout
        online_returns.append(all_rets)
    n = min([len(t) for t in online_returns])
    online_returns = [t[:n] for t in online_returns]
    return final_returns, online_returns

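# Illustrative aside (not from the original repo): a tiny, self-contained numpy
# sketch of the aggregation used in _do_eval above. Per-run lists of per-trajectory
# returns are truncated to a common length, stacked, and averaged so that entry n is
# the mean return of the n-th rollout across eval runs. The numbers are made up.
import numpy as np

all_rets = [[1.0, 2.0, 3.0],        # per-trajectory returns from eval run 0
            [2.0, 4.0, 6.0, 8.0]]   # per-trajectory returns from eval run 1
n = min(len(a) for a in all_rets)
all_rets = [a[:n] for a in all_rets]
per_rollout_mean = np.mean(np.stack(all_rets), axis=0)
print(per_rollout_mean)  # [1.5 3.  4.5]
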
def eval_alg(policy,
             env,
             num_eval_rollouts,
             eval_deterministic=False,
             max_path_length=1000):
    if eval_deterministic:
        policy = MakeDeterministic(policy)
    eval_sampler = InPlacePathSampler(
        env=env,
        policy=policy,
        max_samples=max_path_length * (num_eval_rollouts + 1),
        max_path_length=max_path_length,
        policy_uses_pixels=False,
        policy_uses_task_params=False,
        concat_task_params_to_policy_obs=False)
    test_paths = eval_sampler.obtain_samples()
    average_returns = get_average_returns(test_paths)
    return average_returns

def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        with open('expert_demos_listing.yaml', 'r') as f:
            # an explicit Loader avoids the yaml.load deprecation warning
            listings = yaml.load(f.read(), Loader=yaml.SafeLoader)
        expert_demos_path = listings[variant['expert_name']]['file_paths'][
            variant['expert_idx']]
        buffer_save_dict = joblib.load(expert_demos_path)
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(env,
                               policy,
                               variant['num_eval_steps'],
                               variant['max_path_length'],
                               no_terminal=variant['no_terminal'],
                               render=variant['render'],
                               render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()
    average_returns = eval_util.get_average_returns(test_paths)
    print(average_returns)
    return 1

def log_statistics(self, paths, split=''):
    self.eval_statistics.update(
        eval_util.get_generic_path_information(
            paths,
            stat_prefix="{}_task{}".format(split, self.task_idx),
        ))
    # TODO(KR) what are these?
    self.eval_statistics.update(
        eval_util.get_generic_path_information(
            self._exploration_paths,
            stat_prefix="Exploration_task{}".format(self.task_idx),
        )
    )  # something is wrong with these exploration paths i'm pretty sure...
    average_returns = eval_util.get_average_returns(paths)
    self.eval_statistics['AverageReturn_{}_task{}'.format(
        split, self.task_idx)] = average_returns
    goal = self.env._goal
    dprint('GoalPosition_{}_task'.format(split))
    dprint(goal)
    self.eval_statistics['GoalPosition_{}_task{}'.format(
        split, self.task_idx)] = goal

def evaluate(self, epoch):
    statistics = OrderedDict()
    statistics.update(self.eval_statistics)
    self.eval_statistics = statistics

    # old_device = ptu.device
    # ptu.device = torch.device('cpu')
    # self.policy.cnn_enc.to(ptu.device)
    # self.policy.task_enc.to(ptu.device)
    # self.policy.qf1.to(ptu.device)
    # self.policy_dataset = PolicyDataset(self, eval_task=False)
    # self.policy_loader = iter(torch.utils.data.DataLoader(
    #     self.policy_dataset, batch_size=1, shuffle=False, pin_memory=True,
    #     sampler=None, batch_sampler=None, num_workers=10,
    #     worker_init_fn=None, collate_fn=lambda x: x))
    import time
    # for i in range(10):
    #     t0 = time.time()
    #     paths = self.policy_loader.next()
    #     print((time.time() - t0))
    # import pdb; pdb.set_trace()

    ### train tasks
    dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
    train_avg_returns = []
    train_avg_succ = []
    train_avg_len = []
    for idx in self.train_tasks:
        dprint('task {} encoder RB size'.format(idx),
               self.enc_replay_buffer.task_buffers[idx]._size)
        paths = self.collect_paths(idx, epoch, eval_task=False)
        t0 = time.time()
        # paths = self.policy_loader.next()[0]
        # import pdb; pdb.set_trace()
        train_avg_returns.append(eval_util.get_average_returns(paths))
        train_avg_succ.append(
            [sum([j['succ'] for j in i['env_infos']]) for i in paths])
        train_avg_len.append([len(i['env_infos']) for i in paths])
        print((time.time() - t0))
    # import pdb; pdb.set_trace()
    # ptu.device = old_device
    # self.policy.cnn_enc.to(ptu.device)
    # self.policy.task_enc.to(ptu.device)
    # self.policy.qf1.to(ptu.device)

    ### test tasks
    dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
    test_avg_returns = []
    test_avg_succ = []
    test_avg_len = []
    # This is calculating the embedding online, because every iteration
    # we clear the encoding buffer for the test tasks.
    for idx in np.random.choice(self.eval_tasks, self.num_evals,
                                replace=False):
        print('eval task', idx)
        self.task_idx = idx
        self.env.reset_task(idx)

        # collect data for computing embedding if needed
        if self.eval_embedding_source in ['online', 'initial_pool']:
            pass
        elif self.eval_embedding_source == 'online_exploration_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # task embedding sampled from prior and held fixed
            self.collect_data_sampling_from_prior(
                num_samples=self.num_steps_per_task,
                resample_z_every_n=self.max_path_length,
                eval_task=True)
        elif self.eval_embedding_source == 'online_on_policy_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # half the data from z sampled from prior, the other half from z sampled from posterior
            self.collect_data_online(idx=idx,
                                     num_samples=self.num_steps_per_task,
                                     eval_task=True)
        else:
            raise Exception("Invalid option for computing eval embedding")

        dprint('task {} encoder RB size'.format(idx),
               self.eval_enc_replay_buffer.task_buffers[idx]._size)
        test_paths = self.collect_paths(idx, epoch, eval_task=True)
        test_avg_returns.append(eval_util.get_average_returns(test_paths))
        test_avg_succ.append(
            [sum([j['succ'] for j in i['env_infos']]) for i in test_paths])
        test_avg_len.append([len(i['env_infos']) for i in test_paths])

    if self.use_information_bottleneck:
        z_mean = np.mean(np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
        z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
        self.eval_statistics['Z mean eval'] = z_mean
        self.eval_statistics['Z variance eval'] = z_sig

    # TODO(KR) what does this do
    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(test_paths)

    avg_train_return = np.mean(train_avg_returns)
    avg_test_return = np.mean(test_avg_returns)
    avg_train_succ = np.mean(train_avg_succ, axis=0)
    avg_test_succ = np.mean(test_avg_succ, axis=0)
    avg_train_len = np.mean(train_avg_len, axis=0)
    avg_test_len = np.mean(test_avg_len, axis=0)
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
    for i, s in enumerate(avg_train_succ):
        self.eval_statistics['Succ_train_tasks_%s' % i] = s
    for i, s in enumerate(avg_test_succ):
        self.eval_statistics['Succ_test_tasks_%s' % i] = s
    for i, s in enumerate(avg_train_len):
        self.eval_statistics['Len_train_tasks_%s' % i] = s
    for i, s in enumerate(avg_test_len):
        self.eval_statistics['Len_test_tasks_%s' % i] = s

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(test_paths)

    if self.plotter:
        self.plotter.draw()

def evaluate(self, epoch):
    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()

    ### sample trajectories from prior for debugging / visualization
    if self.dump_eval_paths:
        # 100 arbitrarily chosen for visualizations of point_robot trajectories
        # just want stochasticity of z, not the policy
        self.agent.clear_z()
        prior_paths, _ = self.sampler.obtain_samples(
            deterministic=self.eval_deterministic,
            max_samples=self.max_path_length * 20,
            accum_context=False,
            resample_latent_period=self.exploration_resample_latent_period,
            update_posterior_period=self.exploration_update_posterior_period,
            # following PEARL protocol
        )
        logger.save_extra_data(
            prior_paths,
            file_name='eval_trajectories/prior-epoch{}'.format(epoch))

    ### train tasks
    if self._num_tasks_to_eval_on >= len(self.train_task_indices):
        indices = self.train_task_indices
    else:
        # eval on a subset of train tasks in case num train tasks is huge
        indices = np.random.choice(self.offline_train_task_indices,
                                   self._num_tasks_to_eval_on)
    # logger.log('evaluating on {} train tasks'.format(len(indices)))

    ### eval train tasks with posterior sampled from the training replay buffer
    train_returns = []
    for idx in indices:
        self.env.reset_task(idx)
        paths = []
        for _ in range(self.num_steps_per_eval // self.max_path_length):
            # init_context = self.sample_context(idx)
            if self.use_meta_learning_buffer:
                init_context = self.meta_replay_buffer._sample_contexts(
                    [idx], self.embedding_batch_size)
            else:
                init_context = self.enc_replay_buffer.sample_context(
                    idx, self.embedding_batch_size)
            if self.eval_data_collector:
                p = self.eval_data_collector.collect_new_paths(
                    num_steps=self.max_path_length,  # TODO: also cap num trajs
                    max_path_length=self.max_path_length,
                    discard_incomplete_paths=False,
                    accum_context=False,
                    resample_latent_period=0,
                    update_posterior_period=0,
                    initial_context=init_context,
                    task_idx=idx,
                )
            else:
                init_context = ptu.from_numpy(init_context)
                # TODO: replace with sampler
                # self.agent.infer_posterior(context)
                p, _ = self.sampler.obtain_samples(
                    deterministic=self.eval_deterministic,
                    max_samples=self.max_path_length,
                    accum_context=False,
                    max_trajs=1,
                    resample_latent_period=0,
                    update_posterior_period=0,
                    initial_context=init_context,
                    task_idx=idx,
                )
            paths += p
        if self.sparse_rewards:
            for p in paths:
                # np.stack expects a sequence, not a generator
                sparse_rewards = np.stack(
                    [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards
        train_returns.append(eval_util.get_average_returns(paths))
    train_returns_offline_buffer = self._get_returns_init_from_offline_buffer(
        indices)
    # train_returns = np.mean(train_returns)

    ### eval train tasks with on-policy data to match eval of test tasks
    train_final_returns, train_online_returns, train_task_to_final_context = (
        self._do_eval(indices, epoch))
    # logger.log('train online returns')
    # logger.log(train_online_returns)

    ### test tasks
    # logger.log('evaluating on {} test tasks'.format(len(self.eval_task_indices)))
    test_final_returns, test_online_returns, test_task_to_final_context = (
        self._do_eval(self.eval_task_indices, epoch))
    # logger.log('test online returns')
    # logger.log(test_online_returns)

    # save the final posterior
    self.agent.log_diagnostics(self.eval_statistics)
    z_dist_log = self._get_z_distribution_log(train_task_to_final_context)
    append_log(self.eval_statistics, z_dist_log,
               prefix='trainer/train_tasks/')

    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(paths, prefix=None)

    avg_train_online_return = np.mean(np.stack(train_online_returns), axis=0)
    avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
    self.eval_statistics.update(
        eval_util.create_stats_ordered_dict(
            'eval/init_from_offline_buffer/train_tasks/all_returns',
            train_returns_offline_buffer,
        ))
    self.eval_statistics.update(
        eval_util.create_stats_ordered_dict(
            'eval/init_from_buffer/train_tasks/all_returns',
            train_returns,
        ))
    self.eval_statistics.update(
        eval_util.create_stats_ordered_dict(
            'eval/adaptation/train_tasks/final_returns',
            train_final_returns,
        ))
    self.eval_statistics.update(
        eval_util.create_stats_ordered_dict(
            'eval/adaptation/test_tasks/final_returns',
            test_final_returns,
        ))
    self.eval_statistics.update(
        eval_util.create_stats_ordered_dict(
            'eval/adaptation/train_tasks/all_returns',
            avg_train_online_return,
        ))
    self.eval_statistics.update(
        eval_util.create_stats_ordered_dict(
            'eval/adaptation/test_tasks/all_returns',
            avg_test_online_return,
        ))

    if len(self.fake_task_idx_to_z) > 0:
        self_generated_indices = np.random.choice(
            np.array(list(self.fake_task_idx_to_z.keys())),
            self._num_tasks_to_eval_on,
        )
        self_generated_final_returns, self_generated_online_returns, _ = self._do_eval(
            self_generated_indices, epoch)
        avg_self_generated_return = np.mean(
            np.stack(self_generated_online_returns))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/adaptation/generated_tasks/final_returns',
                self_generated_final_returns,
            ))
        self.eval_statistics.update(
            eval_util.create_stats_ordered_dict(
                'eval/adaptation/generated_tasks/all_returns',
                avg_self_generated_return,
            ))

    try:
        import os
        import psutil
        process = psutil.Process(os.getpid())
        self.eval_statistics['RAM Usage (Mb)'] = int(
            process.memory_info().rss / 1000000)
    except ImportError:
        pass

    logger.save_extra_data(avg_train_online_return,
                           file_name='online-train-epoch{}'.format(epoch))
    logger.save_extra_data(avg_test_online_return,
                           file_name='online-test-epoch{}'.format(epoch))

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(paths)

    if self.plotter:
        self.plotter.draw()

def get_custom_generic_path_information(paths, path_length, reward_scale,
                                        stat_prefix=''):
    """
    Get an OrderedDict with a bunch of statistic names and values.

    Differs from the normal rlkit utility function in the following ways:
    - grabs normalized reward / return values, where reward is normalized to 1.0
    - grabs the cumulative reward accumulated up to the @path_length timestep
    """
    statistics = OrderedDict()
    returns = [sum(path["rewards"]) for path in paths]
    # Grab returns accumulated up to the specified timestep
    expl_returns = [sum(path["rewards"][:path_length]) for path in paths]
    rewards = np.vstack([path["rewards"] for path in paths])
    # norm_rewards = [path["rewards"] / reward_scale for path in paths]
    statistics.update(
        eval_util.create_stats_ordered_dict('Rewards', rewards,
                                            stat_prefix=stat_prefix))
    statistics.update(
        eval_util.create_stats_ordered_dict('Returns', returns,
                                            stat_prefix=stat_prefix))
    # Add extra stats
    statistics.update(
        eval_util.create_stats_ordered_dict('ExplReturns', expl_returns,
                                            stat_prefix=stat_prefix))
    actions = [path["actions"] for path in paths]
    if len(actions[0].shape) == 1:
        actions = np.hstack([path["actions"] for path in paths])
    else:
        actions = np.vstack([path["actions"] for path in paths])
    statistics.update(
        eval_util.create_stats_ordered_dict('Actions', actions,
                                            stat_prefix=stat_prefix))
    statistics['Num Paths'] = len(paths)
    statistics[stat_prefix + 'Average Returns'] = eval_util.get_average_returns(paths)

    for info_key in ['env_infos', 'agent_infos']:
        if info_key in paths[0]:
            all_env_infos = [
                ppp.list_of_dicts__to__dict_of_lists(p[info_key])
                for p in paths
            ]
            for k in all_env_infos[0].keys():
                final_ks = np.array([info[k][-1] for info in all_env_infos])
                first_ks = np.array([info[k][0] for info in all_env_infos])
                all_ks = np.concatenate([info[k] for info in all_env_infos])
                statistics.update(
                    eval_util.create_stats_ordered_dict(
                        stat_prefix + k,
                        final_ks,
                        stat_prefix='{}/final/'.format(info_key),
                    ))
                statistics.update(
                    eval_util.create_stats_ordered_dict(
                        stat_prefix + k,
                        first_ks,
                        stat_prefix='{}/initial/'.format(info_key),
                    ))
                statistics.update(
                    eval_util.create_stats_ordered_dict(
                        stat_prefix + k,
                        all_ks,
                        stat_prefix='{}/'.format(info_key),
                    ))
    return statistics

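# Illustrative aside (not from the original repo): a minimal sketch of the path
# dictionary layout these statistics helpers assume -- per-timestep "rewards" and
# "actions" arrays plus a list of per-timestep info dicts. All values and the
# "dist" key are made up for illustration.
import numpy as np

toy_path = {
    "rewards": np.array([[0.1], [0.5], [1.0]]),                   # shape (T, 1)
    "actions": np.array([[0.2, -0.1], [0.0, 0.3], [0.1, 0.1]]),   # shape (T, act_dim)
    "env_infos": [{"dist": 0.9}, {"dist": 0.4}, {"dist": 0.0}],   # hypothetical key
}
# the undiscounted return that get_average_returns-style helpers sum up:
print(float(np.sum(toy_path["rewards"])))  # 1.6
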
def get_traffic_path_information(paths, stat_prefix=''):
    """
    Get an OrderedDict with a bunch of statistic names and values.
    """
    statistics = OrderedDict()
    returns = [sum(path["rewards"]) for path in paths]
    rewards = np.vstack([path["rewards"] for path in paths])
    statistics.update(
        create_stats_ordered_dict('Rewards', rewards,
                                  stat_prefix=stat_prefix))
    statistics.update(
        create_stats_ordered_dict('Returns', returns,
                                  stat_prefix=stat_prefix))
    actions = [path["actions"] for path in paths]
    if len(actions[0].shape) == 1:
        actions = np.hstack([path["actions"] for path in paths])
    else:
        actions = np.vstack([path["actions"] for path in paths])
    statistics.update(
        create_stats_ordered_dict('Actions', actions,
                                  stat_prefix=stat_prefix))
    statistics['Num Paths'] = len(paths)
    statistics[stat_prefix + 'Average Returns'] = get_average_returns(paths)

    num_collision, num_block, num_outroad, num_success, num_timeout = 0, 0, 0, 0, 0
    log_path = logger.get_snapshot_dir()
    for pid, path in enumerate(paths):
        event = path["env_infos"][-1]['event']
        if event == 'collision':
            num_collision += 1
        elif event == 'block':
            num_block += 1
        elif event == 'outroad':
            num_outroad += 1
        elif event == 'goal':
            num_success += 1
        else:
            num_timeout += 1
    statistics['Num Collision'] = num_collision
    statistics['Num Block'] = num_block
    statistics['Num Outroad'] = num_outroad
    statistics['Num Success'] = num_success
    statistics['Num Timeout'] = num_timeout

    for info_key in ['agent_infos']:
        if info_key in paths[0]:
            all_env_infos = [
                ppp.list_of_dicts__to__dict_of_lists(p[info_key])
                for p in paths
            ]
            for k in all_env_infos[0].keys():
                final_ks = np.array([info[k][-1] for info in all_env_infos])
                first_ks = np.array([info[k][0] for info in all_env_infos])
                all_ks = np.concatenate([info[k] for info in all_env_infos])
                statistics.update(
                    create_stats_ordered_dict(
                        stat_prefix + k,
                        final_ks,
                        stat_prefix='{}/final/'.format(info_key),
                    ))
                statistics.update(
                    create_stats_ordered_dict(
                        stat_prefix + k,
                        first_ks,
                        stat_prefix='{}/initial/'.format(info_key),
                    ))
                statistics.update(
                    create_stats_ordered_dict(
                        stat_prefix + k,
                        all_ks,
                        stat_prefix='{}/'.format(info_key),
                    ))
    return statistics

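# Illustrative aside (not from the original repo): the event bookkeeping above reads
# the 'event' string from the last env_info of each path and tallies one counter per
# outcome; a collections.Counter over toy data gives the same counts. Values are made up.
from collections import Counter

toy_paths = [
    {"env_infos": [{"event": "goal"}]},
    {"env_infos": [{"event": "collision"}]},
    {"env_infos": [{"event": "goal"}]},
]
event_counts = Counter(p["env_infos"][-1]["event"] for p in toy_paths)
print(event_counts["goal"], event_counts["collision"])  # 2 1
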
def evaluate(self, epoch):
    statistics = OrderedDict()
    statistics.update(self.eval_statistics)
    self.eval_statistics = statistics

    ### train tasks
    dprint('evaluating on {} train tasks'.format(len(self.train_tasks)))
    train_avg_returns = []
    for idx in self.train_tasks:
        dprint('task {} encoder RB size'.format(idx),
               self.enc_replay_buffer.task_buffers[idx]._size)
        paths = self.collect_paths(idx, epoch, eval_task=False)
        train_avg_returns.append(eval_util.get_average_returns(paths))

    ### test tasks
    dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
    test_avg_returns = []
    # This is calculating the embedding online, because every iteration
    # we clear the encoding buffer for the test tasks.
    for idx in self.eval_tasks:
        self.task_idx = idx
        self.env.reset_task(idx)

        # collect data for computing embedding if needed
        if self.eval_embedding_source in ['online', 'initial_pool']:
            pass
        elif self.eval_embedding_source == 'online_exploration_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # task embedding sampled from prior and held fixed
            self.collect_data_sampling_from_prior(
                num_samples=self.num_steps_per_task,
                resample_z_every_n=self.max_path_length,
                eval_task=True)
        elif self.eval_embedding_source == 'online_on_policy_trajectories':
            self.eval_enc_replay_buffer.task_buffers[idx].clear()
            # half the data from z sampled from prior, the other half from z sampled from posterior
            self.collect_data_online(idx=idx,
                                     num_samples=self.num_steps_per_task,
                                     eval_task=True)
        else:
            raise Exception("Invalid option for computing eval embedding")

        dprint('task {} encoder RB size'.format(idx),
               self.eval_enc_replay_buffer.task_buffers[idx]._size)
        test_paths = self.collect_paths(idx, epoch, eval_task=True)
        test_avg_returns.append(eval_util.get_average_returns(test_paths))

    if self.use_information_bottleneck:
        z_mean = np.mean(np.abs(ptu.get_numpy(self.policy.z_dists[0].mean)))
        z_sig = np.mean(ptu.get_numpy(self.policy.z_dists[0].variance))
        self.eval_statistics['Z mean eval'] = z_mean
        self.eval_statistics['Z variance eval'] = z_sig

    # TODO(KR) what does this do
    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(test_paths)

    avg_train_return = np.mean(train_avg_returns)
    avg_test_return = np.mean(test_avg_returns)
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(test_paths)

    if self.plotter:
        self.plotter.draw()

def evaluate(self, epoch):
    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()

    ### sample trajectories from prior for debugging / visualization
    if self.dump_eval_paths:
        # 100 arbitrarily chosen for visualizations of point_robot trajectories
        # just want stochasticity of z, not the policy
        self.agent.clear_z()
        prior_paths, _ = self.sampler.obtain_samples(
            deterministic=self.eval_deterministic,
            max_samples=self.max_path_length * 20,
            accum_context=False,
            resample=1,
            testing=True)
        logger.save_extra_data(
            prior_paths,
            path='eval_trajectories/prior-epoch{}'.format(epoch))

    ### train tasks
    # eval on a subset of train tasks for speed
    indices = np.random.choice(self.train_tasks, len(self.eval_tasks))
    eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))

    ### eval train tasks with posterior sampled from the training replay buffer
    train_returns = []
    for idx in indices:
        self.task_idx = idx
        self.env.reset_task(idx)
        paths = []
        for _ in range(self.num_steps_per_eval // self.max_path_length):
            context = self.sample_context(idx)
            self.agent.infer_posterior(context)
            p, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length,
                accum_context=False,
                max_trajs=1,
                resample=np.inf,
                testing=True)
            paths += p
        if self.sparse_rewards:
            for p in paths:
                # np.stack expects a sequence, not a generator
                sparse_rewards = np.stack(
                    [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards
        train_returns.append(eval_util.get_average_returns(paths))
    train_returns = np.mean(train_returns)

    ### eval train tasks with on-policy data to match eval of test tasks
    train_final_returns, train_online_returns = self._do_eval(indices, epoch)
    eval_util.dprint('train online returns')
    eval_util.dprint(train_online_returns)

    ### test tasks
    eval_util.dprint('evaluating on {} test tasks'.format(
        len(self.eval_tasks)))
    test_final_returns, test_online_returns = self._do_eval(
        self.eval_tasks, epoch)
    eval_util.dprint('test online returns')
    eval_util.dprint(test_online_returns)

    # save the final posterior
    self.agent.log_diagnostics(self.eval_statistics)

    if hasattr(self.env, "log_diagnostics"):
        self.env.log_diagnostics(paths, prefix=None)

    avg_train_return = np.mean(train_final_returns)
    avg_test_return = np.mean(test_final_returns)
    avg_train_online_return = np.mean(np.stack(train_online_returns), axis=0)
    avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
    self.eval_statistics['AverageTrainReturn_all_train_tasks'] = train_returns
    self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
    self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
    logger.save_extra_data(avg_train_online_return,
                           path='online-train-epoch{}'.format(epoch))
    logger.save_extra_data(avg_test_online_return,
                           path='online-test-epoch{}'.format(epoch))

    for key, value in self.eval_statistics.items():
        logger.record_tabular(key, value)
    self.eval_statistics = None

    if self.render_eval_paths:
        self.env.render_paths(paths)

    if self.plotter:
        self.plotter.draw()

def evaluate(self, epoch): """ Evaluate the policy, e.g. save/print progress. :param epoch: :return: """ statistics = OrderedDict() try: statistics.update(self.eval_statistics) self.eval_statistics = None except: print('No Stats to Eval') logger.log("Collecting samples for evaluation") test_paths = [] sampled_task_params = self.test_task_params_sampler.sample_unique( self.num_eval_tasks) for i in range(self.num_eval_tasks): env = self.env_factory(sampled_task_params[i]) for _ in range(self.num_rollouts_per_task_per_eval): test_paths.append( rollout( self.env, self.get_eval_policy(sampled_task_params[i]), self.max_path_length, no_terminal=self.no_terminal, render=self.render, render_kwargs=self.render_kwargs, )) statistics.update( eval_util.get_generic_path_information( test_paths, stat_prefix="Test", )) statistics.update( eval_util.get_generic_path_information( self._exploration_paths, stat_prefix="Exploration", )) if hasattr(self.env, "log_diagnostics"): self.env.log_diagnostics(test_paths) if hasattr(self.env, "log_statistics"): statistics.update(self.env.log_statistics(test_paths)) if epoch % self.freq_log_visuals == 0: if hasattr(self.env, "log_visuals"): self.env.log_visuals(test_paths, epoch, logger.get_snapshot_dir()) average_returns = eval_util.get_average_returns(test_paths) statistics['AverageReturn'] = average_returns for key, value in statistics.items(): logger.record_tabular(key, value) best_statistic = statistics[self.best_key] if best_statistic > self.best_statistic_so_far: self.best_statistic_so_far = best_statistic if self.save_best and epoch >= self.save_best_starting_from_epoch: data_to_save = {'epoch': epoch, 'statistics': statistics} data_to_save.update(self.get_epoch_snapshot(epoch)) logger.save_extra_data(data_to_save, 'best.pkl') print('\n\nSAVED BEST\n\n')