def gen_rollout(self, obs_task_params, task_params, z, num_rollouts):
    # set up the post cond policy
    z = z.cpu().data.numpy()
    post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
    post_cond_policy.policy.eval()

    # generate some rollouts
    successes = []
    for roll_num in range(num_rollouts):
        observation = self.env.reset(
            task_params=task_params, obs_task_params=obs_task_params)
        terminal = False
        timestep = 0
        cur_success = False
        while (not terminal) and timestep < self.alg.max_path_length:
            agent_obs = observation['obs']
            action, agent_info = post_cond_policy.get_action(agent_obs)
            next_ob, raw_reward, terminal, env_info = self.env.step(action)
            if env_info['is_success']:
                cur_success = True
            if self.alg.no_terminal:
                terminal = False
            observation = next_ob
            timestep += 1
        successes.append(float(cur_success))
    return successes
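# NOTE: PostCondMLPPolicyWrapper is used throughout this file but defined elsewhere in
# the repo. As a rough, hypothetical sketch (not the repo's actual implementation), a
# posterior-conditioned wrapper can be assumed to append the latent z to every
# observation before querying the wrapped policy:
import numpy as np

class SimplePostCondPolicyWrapper:
    """Condition a policy on a fixed latent z by appending z to each observation."""

    def __init__(self, policy, z, deterministic=False):
        self.policy = policy
        self.z = np.asarray(z)
        # stored for parity with the real wrapper's interface; unused in this sketch
        self.deterministic = deterministic

    def get_action(self, obs):
        # append the task latent to the raw observation, then defer to the wrapped policy
        obs_with_z = np.concatenate([obs, self.z], axis=-1)
        return self.policy.get_action(obs_with_z)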
def get_eval_policy(self, task_identifier, mode='meta_test'):
    if self.wrap_absorbing:
        raise NotImplementedError('wrap absorbing')

    if mode == 'meta_train':
        rb = self.train_context_expert_replay_buffer
    else:
        rb = self.test_context_expert_replay_buffer

    eval_context_size = np.random.randint(self.min_context_size, self.max_context_size + 1)
    list_of_trajs = rb.sample_trajs_from_task(
        task_identifier,
        eval_context_size if self.few_shot_version else self.num_context_trajs_for_eval,
    )
    # list_of_trajs = rb.sample_trajs_from_task(
    #     task_identifier,
    #     3 if self.few_shot_version else self.num_context_trajs_for_eval,
    # )

    if self.use_target_enc:
        enc_to_use = self.target_enc
    else:
        enc_to_use = self.encoder

    # remember the encoder's train/eval mode and restore it after encoding
    enc_mode = enc_to_use.training
    enc_to_use.eval()
    post_dist = enc_to_use([list_of_trajs])
    enc_to_use.train(enc_mode)

    z = post_dist.sample()
    # z = post_dist.mean
    z = z.cpu().data.numpy()[0]

    if self.use_target_policy:
        return PostCondMLPPolicyWrapper(self.target_policy, z, deterministic=self.eval_deterministic)
    return PostCondMLPPolicyWrapper(self.policy, z, deterministic=self.eval_deterministic)
def get_exploration_policy(self, task_identifier):
    if self.wrap_absorbing:
        raise NotImplementedError('wrap absorbing')

    if self.few_shot_version:
        # TODO: this if/else can be cleaned up; both branches just sample a context
        this_context_size = np.random.randint(self.min_context_size, self.max_context_size + 1)
        list_of_trajs = self.train_context_expert_replay_buffer.sample_trajs_from_task(
            task_identifier, this_context_size)
        mask = None
    else:
        list_of_trajs = self.train_context_expert_replay_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
        )
        mask = None

    if self.use_target_enc:
        enc_to_use = self.target_enc
    else:
        enc_to_use = self.encoder

    mode = enc_to_use.training
    enc_to_use.eval()
    post_dist = enc_to_use([list_of_trajs], mask)
    enc_to_use.train(mode)

    z = post_dist.sample()
    # z = post_dist.mean
    z = z.cpu().data.numpy()[0]

    if self.use_target_policy:
        return PostCondMLPPolicyWrapper(self.target_policy, z)
    return PostCondMLPPolicyWrapper(self.policy, z)
def get_eval_policy(self, task_identifier, mode='meta_test'):
    if task_identifier not in self.context_buffer.task_replay_buffers:
        # no context for this task yet: generate some rollouts with the prior policy
        eval_context_buffer = MetaEnvReplayBuffer(
            self.context_buffer_size_per_task,
            self.training_env,
            policy_uses_pixels=self.policy_uses_pixels,
        )
        n_steps_total = 0
        steps_needed = self.num_context_trajs_for_exploration * self.max_path_length
        task_params = self.training_env.task_id_to_task_params(task_identifier)
        obs_task_params = self.training_env.task_id_to_obs_task_params(task_identifier)
        while n_steps_total < steps_needed:
            first_obs = self.training_env.reset(
                task_params=task_params, obs_task_params=obs_task_params)
            task_id = self.training_env.task_identifier

            z = self.prior_dist.sample()
            z = z.cpu().data.numpy()[0]
            post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
            new_path = rollout(
                self.training_env,
                post_cond_policy,
                max_path_length=min(
                    self.max_path_length + 1,
                    steps_needed - n_steps_total + 1),
                do_not_reset=True,
                first_obs=first_obs)
            n_steps_total += len(new_path['observations'])
            eval_context_buffer.add_path(new_path, task_id)

        list_of_trajs = eval_context_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
            samples_per_traj=self.samples_per_traj)
        mask = None
    else:
        list_of_trajs = self.context_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
        )
        mask = None

    enc_to_use = self.encoder
    # remember the encoder's train/eval mode and restore it after encoding
    enc_mode = enc_to_use.training
    enc_to_use.eval()
    post_dist = enc_to_use([list_of_trajs], mask)
    enc_to_use.train(enc_mode)

    z = post_dist.sample()
    z = z.cpu().data.numpy()[0]
    return PostCondMLPPolicyWrapper(self.main_policy, z)
def gather_eval_data(policy, np_encoder, expert_buffer_for_eval_tasks):
    # return all the metrics we would need for evaluating the models;
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()
    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=16)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0

    algorithm_all_percent_good_reach = []
    algorithm_all_percent_solved = []
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_num in range(4):
            print('\t\tTry with new context number %d...' % context_num)
            # get a context
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(task_id, 1)
            post_dist = np_encoder([list_of_trajs])

            all_good_reach_for_context = [0 for _ in range(20)]
            all_solved_for_context = [0 for _ in range(20)]
            for post_sample_num in range(4):
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)

                # reset the env seed
                env.seed(seed=ENV_EVAL_SEED)
                for t in range(20):
                    stacked_path = rollout_path(env, task_params, obs_task_params, post_cond_policy)
                    stats = env.log_statistics([stacked_path])
                    if stats['Percent_Good_Reach'] > 0:
                        all_good_reach_for_context[t] = 1.0
                    if stats['Percent_Solved'] > 0:
                        all_solved_for_context[t] = 1.0

            algorithm_all_percent_good_reach.append(np.mean(all_good_reach_for_context))
            algorithm_all_percent_solved.append(np.mean(all_solved_for_context))

    return {
        'algorithm_all_percent_good_reach': algorithm_all_percent_good_reach,
        'algorithm_all_percent_solved': algorithm_all_percent_solved
    }
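# NOTE: rollout_path is assumed to be a small helper defined elsewhere in the repo.
# A minimal, hypothetical sketch of what these eval loops expect from it (reset the env
# to the given task, roll out the posterior-conditioned policy, and return a dict of
# stacked transitions); the real helper and its signature may differ:
def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length=100):
    observations, rewards, env_infos = [], [], []
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    for _ in range(max_path_length):
        action, _ = post_cond_policy.get_action(observation['obs'])
        next_observation, reward, terminal, env_info = env.step(action)
        observations.append(observation)
        rewards.append(reward)
        env_infos.append(env_info)
        observation = next_observation
        if terminal:
            break
    return {'observations': observations, 'rewards': rewards, 'env_infos': env_infos}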
def get_exploration_policy(self, task_identifier):
    list_of_trajs = self.train_context_expert_replay_buffer.sample_trajs_from_task(
        task_identifier,
        self.num_context_trajs_for_exploration,
    )
    post_dist = self.encoder([list_of_trajs])
    # z = post_dist.sample()
    z = post_dist.mean
    z = z.cpu().data.numpy()[0]
    return PostCondMLPPolicyWrapper(self.main_policy, z)
def get_eval_policy(self, task_identifier, mode='meta_test'):
    if mode == 'meta_train':
        rb = self.train_context_expert_replay_buffer
    else:
        rb = self.test_context_expert_replay_buffer

    list_of_trajs = rb.sample_trajs_from_task(
        task_identifier,
        self.num_context_trajs_for_eval,
    )
    post_dist = self.encoder([list_of_trajs])
    # z = post_dist.sample()
    z = post_dist.mean
    z = z.cpu().data.numpy()[0]
    return PostCondMLPPolicyWrapper(self.main_policy, z)
def gather_eval_data(policy, np_encoder, expert_buffer_for_eval_tasks):
    # return all the metrics we would need for evaluating the models;
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()
    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=16)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in range(1, 7):
            print('\t\tEvaluating context size %d...' % context_size)
            paths_for_context_size = []
            for _ in range(3):
                # get a context
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                    task_id, context_size)
                post_dist = np_encoder([list_of_trajs])

                for _ in range(3):
                    # sample from the posterior and get the PostCondPolicy
                    z = post_dist.sample()
                    z = z.cpu().data.numpy()[0]
                    post_cond_policy = PostCondMLPPolicyWrapper(policy, z)

                    for _ in range(4):
                        stacked_path = rollout_path(env, task_params, obs_task_params, post_cond_policy)
                        paths_for_context_size.append(stacked_path)

            stats_for_context_size = env.log_statistics(paths_for_context_size)
            all_statistics[context_size] = {
                'Percent_Good_Reach': stats_for_context_size['Percent_Good_Reach'],
                'Percent_Solved': stats_for_context_size['Percent_Solved']
            }
    return all_statistics
def get_exploration_policy(self, task_identifier):
    list_of_trajs = self.context_buffer.sample_trajs_from_task(
        task_identifier,
        self.num_context_trajs_for_exploration,
        samples_per_traj=self.samples_per_traj)
    mask = None

    enc_to_use = self.encoder
    mode = enc_to_use.training
    enc_to_use.eval()
    post_dist = enc_to_use([list_of_trajs], mask)
    enc_to_use.train(mode)

    z = post_dist.sample()
    z = z.cpu().data.numpy()[0]
    return PostCondMLPPolicyWrapper(self.main_policy, z)
def pretrain(self):
    print('Generating initial contexts')
    # fill the context buffer for each training task
    for task_params, obs_task_params in self.train_task_params_sampler:
        print('task')
        n_steps_total = 0
        while n_steps_total < self.context_buffer_size_per_task:
            first_obs = self.training_env.reset(
                task_params=task_params, obs_task_params=obs_task_params)
            task_id = self.training_env.task_identifier

            z = self.prior_dist.sample()
            z = z.cpu().data.numpy()[0]
            post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
            new_path = rollout(
                self.training_env,
                post_cond_policy,
                max_path_length=min(
                    self.max_path_length + 1,
                    self.context_buffer_size_per_task - n_steps_total + 1),
                do_not_reset=True,
                first_obs=first_obs)
            n_steps_total += len(new_path['observations'])

            if self.add_context_rollouts_to_replay_buffer:
                self.replay_buffer.add_path(new_path, task_id)
            self.context_buffer.add_path(new_path, task_id)

    print('Generating initial replay buffer rollouts')
    super().pretrain()
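# NOTE: illustrative-only example of the kind of prior the loops above sample z from.
# self.prior_dist is assumed to be a standard normal over the latent; the latent_dim
# value here is made up for the sketch.
import torch

latent_dim = 32  # assumed latent size, purely illustrative
prior_dist = torch.distributions.Normal(
    loc=torch.zeros(1, latent_dim), scale=torch.ones(1, latent_dim))
z = prior_dist.sample()        # shape (1, latent_dim)
z = z.cpu().data.numpy()[0]    # same conversion used in pretrain() and _do_training()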
def gather_eval_data(alg,
                     num_rollouts_per_context=8,
                     deterministic=True,
                     num_diff_context=1,
                     eval_params_sampler=None,
                     expert_buffer_for_eval_tasks=None,
                     evaluating_expert=False,
                     eval_deterministic=True,
                     eval_no_task_info=False):
    context_sizes = [1]
    if not evaluating_expert:
        alg.encoder.eval()

    all_statistics = {}
    task_num = 0
    # env = alg.env
    env = Walker2DRandomDynamicsEnv()

    _means = []
    _stds = []
    for task_params, obs_task_params in eval_params_sampler:
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_rets = []
        print('\tEvaluating task {}...'.format(obs_task_params))
        print(task_params)
        task_num += 1
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            for c_idx in range(num_diff_context):
                if not evaluating_expert:
                    if eval_no_task_info:
                        print('Evaluating with no task information!')
                        new_task_params = {}
                        for k in task_params:
                            new_task_params[k] = np.ones(task_params[k].shape)
                        raise NotImplementedError()
                    else:
                        list_of_trajs = alg.expert_buffer_for_eval_tasks.sample_trajs_from_task(
                            task_id, context_size)
                        alg.encoder.eval()
                        post_dist = alg.encoder([list_of_trajs])
                        z = post_dist.sample()
                        z = z.cpu().data.numpy()[0]
                        post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                        post_cond_policy.policy.eval()
                else:
                    # if eval_no_task_info:
                    #     print('Evaluating with no task information!')
                    #     post_cond_policy = alg.get_eval_policy(0.0 * np.ones(obs_task_params.shape))
                    # else:
                    #     post_cond_policy = alg.get_eval_policy(np.ones(obs_task_params))
                    # For evaluating a standard walker expert
                    # post_cond_policy = alg.policy
                    # post_cond_policy = alg.eval_policy
                    post_cond_policy = MakeDeterministic(alg.policy)
                post_cond_policy.deterministic = eval_deterministic

                context_returns = []
                for _ in range(num_rollouts_per_context):
                    stacked_path = rollout_path(env, task_params, obs_task_params,
                                                post_cond_policy, alg.max_path_length)
                    context_returns.append(np.sum(stacked_path['rewards']))
                task_rets.extend(context_returns)

        all_statistics[task_id] = task_rets
        print('\nReturns: %.4f +/- %.4f' % (np.mean(task_rets), np.std(task_rets)))
        _means.append(np.mean(task_rets))
        _stds.append(np.std(task_rets))

    for i in range(len(_means)):
        print('%.4f +/- %.4f' % (_means[i], _stds[i]))
    return all_statistics
def gather_eval_data(
        policy,
        encoder,
        env,
        expert_buffer_for_eval_tasks=None,
        num_diff_context_per_task=8,
        context_size_min=1,
        context_size_max=12,
        num_rollouts_per_context=20,
        deterministic=True,
        params_sampler=None,
):
    policy.eval()
    encoder.eval()

    all_success_transitions = []
    all_no_op_transitions = []

    task_num = 0
    for task_params, obs_task_params in params_sampler:
        print('\n\tEvaluating task {}...'.format(task_num))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for _ in range(num_diff_context_per_task):
            print('new context transition')
            transition_success_rate = []
            transition_no_op_rate = []
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                task_id, context_size_max)
            for i in range(context_size_min, context_size_max + 1):
                print('next size')
                correct = []
                incorrect = []
                no_op = []

                new_list_of_trajs = list_of_trajs[:i]
                print(len(new_list_of_trajs))
                post_dist = encoder([new_list_of_trajs])
                z = post_dist.mean
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic

                for _ in range(num_rollouts_per_context):
                    max_path_length = 50
                    within_correct, within_incorrect = rollout_path(
                        env, task_params, obs_task_params, post_cond_policy, max_path_length)
                    correct.append(within_correct)
                    incorrect.append(within_incorrect)
                    no_op.append(not (within_correct or within_incorrect))

                transition_success_rate.append(np.mean(correct))
                transition_no_op_rate.append(np.mean(no_op))

            all_success_transitions.append(transition_success_rate)
            all_no_op_transitions.append(transition_no_op_rate)
            print(transition_success_rate)
            print(transition_no_op_rate)

        if task_num == 32:
            break

    return {
        'all_success_transitions': all_success_transitions,
        'all_no_op_transitions': all_no_op_transitions,
    }
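# NOTE: illustrative-only usage of the dict returned by the gather_eval_data above.
# policy, encoder, env, eval_buffer, and params_sampler are assumed to be constructed
# elsewhere; this just averages the per-context-size success/no-op rates over all
# contexts and tasks.
results = gather_eval_data(
    policy, encoder, env,
    expert_buffer_for_eval_tasks=eval_buffer,
    params_sampler=params_sampler)
success = np.array(results['all_success_transitions'])  # (num_contexts_total, num_context_sizes)
no_op = np.array(results['all_no_op_transitions'])
print('mean success per context size:', success.mean(axis=0))
print('mean no-op rate per context size:', no_op.mean(axis=0))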
def gather_eval_data(policy,
                     np_encoder,
                     expert_buffer_for_eval_tasks,
                     max_k=8,
                     sample_from_prior=False):
    # return all the metrics we would need for evaluating the models;
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()
    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=NUM_EVAL_TASKS)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0

    '''
    algorithm = [tasks]
    task = [contexts]
    context = [post_samples]  # post samples run on the same set of trajs
    post_samples = [trajs]
    trajs in {0, 1}
    '''
    algorithm_good_reach = []
    algorithm_solved = []
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        task_good_reach = []
        task_solved = []
        for context_num in range(4):
            print('\t\tTry with new context, number %d...' % context_num)
            # get a single-trajectory context
            list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(task_id, 1)
            post_dist = np_encoder([list_of_trajs])

            context_good_reach = []
            context_solved = []
            # evaluate all posterior-sample trajectories with the same initial state
            env_seed = np.random.randint(0, high=10000)
            for post_sample_num in range(1, max_k + 1):
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                if sample_from_prior:
                    z = np.random.normal(size=z.shape)
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)

                # reset the env seed
                env.seed(seed=env_seed)
                post_good_reach = []
                post_solved = []
                for t in range(20):
                    stacked_path = rollout_path(env, task_params, obs_task_params, post_cond_policy)
                    stats = env.log_statistics([stacked_path])
                    if stats['Percent_Good_Reach'] > 0:
                        post_good_reach.append(1.0)
                    else:
                        post_good_reach.append(0.0)
                    if stats['Percent_Solved'] > 0:
                        post_solved.append(1.0)
                    else:
                        post_solved.append(0.0)
                context_good_reach.append(post_good_reach)
                context_solved.append(post_solved)
            task_good_reach.append(context_good_reach)
            task_solved.append(context_solved)
        algorithm_good_reach.append(task_good_reach)
        algorithm_solved.append(task_solved)

    return {
        'algorithm_good_reach': algorithm_good_reach,
        'algorithm_solved': algorithm_solved
    }
def gather_eval_data(policy,
                     np_encoder,
                     expert_buffer_for_eval_tasks,
                     max_context_size=6,
                     sample_from_prior=False):
    # return all the metrics we would need for evaluating the models;
    # for each trajectory we need to know 1) was it successful 2) was it a good reach
    # policy.cuda()
    # np_encoder.cuda()
    policy.eval()
    np_encoder.eval()

    params_sampler = _BaseParamsSampler(random=52269, num_colors=NUM_EVAL_TASKS)
    env = EvalEnv()

    all_statistics = {}
    task_num = 0
    if sample_from_prior:
        max_context_size = 1

    all_good_reach = defaultdict(list)
    all_solved = defaultdict(list)
    all_no_op_fail = defaultdict(list)
    for task_params, obs_task_params in params_sampler:
        print('\tEvaluating task %d...' % task_num)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in range(1, max_context_size + 1):
            print('\t\tEvaluating context size %d...' % context_size)
            paths_for_context_size = []
            for _ in range(NUM_CONTEXT_SAMPLES):
                # get a context
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                    task_id, context_size)
                post_dist = np_encoder([list_of_trajs])

                for _ in range(NUM_POST_SAMPLES):
                    # use the posterior mean and get the PostCondPolicy
                    # z = post_dist.sample()
                    z = post_dist.mean
                    z = z.cpu().data.numpy()[0]
                    if sample_from_prior:
                        z = np.random.normal(size=z.shape)
                    post_cond_policy = PostCondMLPPolicyWrapper(policy, z)

                    for _ in range(NUM_ROLLOUTS_PER_POST_SAMPLE):
                        stacked_path = rollout_path(env, task_params, obs_task_params, post_cond_policy)
                        paths_for_context_size.append(stacked_path)

            stats_for_context_size = env.log_statistics(paths_for_context_size)
            all_good_reach[context_size].append(stats_for_context_size['Percent_Good_Reach'])
            all_solved[context_size].append(stats_for_context_size['Percent_Solved'])
            all_no_op_fail[context_size].append(stats_for_context_size['Percent_NoOp_Fail'])

    return {
        'algorithm_good_reach': all_good_reach,
        'algorithm_solved': all_solved,
        'algorithm_no_op_fail': all_no_op_fail
    }
def gather_eval_data(alg,
                     sample_from_prior=False,
                     num_rollouts_per_task=8,
                     context_sizes=[4],
                     deterministic=True,
                     eval_expert=False,
                     just_loading_policy=False,
                     render=False):
    if not eval_expert:
        alg.encoder.eval()

    all_statistics = {}
    task_num = 0
    params_sampler = EvalParamsSampler()
    if not just_loading_policy:
        env = alg.env
    else:
        env = AntRandDirec2DEnv()

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        print('\n\tEvaluating task {}'.format(obs_task_params))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            # evaluate all posterior-sample trajectories with the same initial state
            env_seed = np.random.randint(0, high=10000)

            if sample_from_prior:
                raise NotImplementedError
                # z = post_dist.sample()
                # z = z.cpu().data.numpy()[0]
                # if sample_from_prior:
                #     z = np.random.normal(size=z.shape)

            if eval_expert:
                if just_loading_policy:
                    post_cond_policy = PostCondMLPPolicyWrapper(alg, obs_task_params)
                else:
                    post_cond_policy = alg.get_eval_policy(obs_task_params)
            else:
                post_cond_policy = alg.get_eval_policy(task_id, mode='meta_test')
            post_cond_policy.policy.eval()
            post_cond_policy.deterministic = deterministic

            # reset the env seed
            env.seed(seed=env_seed)

            _rets = []
            _min_dists = []
            _last_100 = []
            for _ in range(num_rollouts_per_task):
                if just_loading_policy:
                    # max_path_length = 200
                    # max_path_length = 300
                    max_path_length = 100
                else:
                    max_path_length = alg.max_path_length
                stacked_path = rollout_path(env, task_params, obs_task_params,
                                            post_cond_policy, max_path_length,
                                            eval_expert, render)
                obs = np.array([d['obs'] for d in stacked_path['observations']])

        all_statistics[task_id] = _task_dict
    return all_statistics
def gather_eval_data(
        policy,
        encoder,
        env,
        num_diff_context=4,
        num_rollouts_per_context=4,
        deterministic=True,
        expert_buffer_for_eval_tasks=None,
        params_sampler=None,
        eval_non_meta_policy=False
):
    policy.eval()
    if not eval_non_meta_policy:
        encoder.eval()

    all_statistics = {}
    task_num = 0
    for task_params, obs_task_params in params_sampler:
        task_rets = []
        print('\n\tEvaluating task {}...'.format(task_num))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for _ in range(num_diff_context):
            if not eval_non_meta_policy:
                list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(task_id, 1)
                post_dist = encoder([list_of_trajs])
                z = post_dist.mean
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic
            else:
                if deterministic:
                    print('DETERMINISTIC')
                    post_cond_policy = MakeDeterministic(policy)
                else:
                    post_cond_policy = policy

            for _ in range(num_rollouts_per_context):
                max_path_length = 1000
                stacked_path = rollout_path(env, task_params, obs_task_params,
                                            post_cond_policy, max_path_length, task_num)
                task_rets.append(np.sum(stacked_path['rewards']))

        print('Returns: %.1f +/- %.1f' % (np.mean(task_rets), np.std(task_rets)))
        all_statistics[task_id] = task_rets
    return all_statistics
def gather_eval_data(alg,
                     sample_from_prior=False,
                     num_rollouts_per_task=8,
                     context_sizes=[4],
                     deterministic=True,
                     num_diff_context=1):
    alg.encoder.eval()

    all_statistics = {}
    task_num = 0
    params_sampler = alg.test_task_params_sampler
    expert_buffer_for_eval_tasks = alg.test_context_expert_replay_buffer
    env = alg.env

    _all_rets = []
    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        print('\tEvaluating task %.4f...' % obs_task_params)
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            if sample_from_prior:
                raise NotImplementedError

            _vels = []
            _run_costs = []
            _rets = []
            for c_idx in range(num_diff_context):
                list_of_trajs = alg.test_context_expert_replay_buffer.sample_trajs_from_task(
                    task_id, context_size)
                alg.encoder.eval()
                post_dist = alg.encoder([list_of_trajs])
                z = post_dist.sample()
                z = z.cpu().data.numpy()[0]
                post_cond_policy = PostCondMLPPolicyWrapper(alg.policy, z)
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic

                for _ in range(num_rollouts_per_task):
                    stacked_path = rollout_path(env, task_params, obs_task_params,
                                                post_cond_policy, alg.max_path_length)
                    # record per-step velocities, total run cost, and return for each trajectory
                    _vels.extend([d['vel'] for d in stacked_path['env_infos']])
                    _run_costs.append(
                        np.sum([d['run_cost'] for d in stacked_path['env_infos']]))
                    _rets.append(np.sum(stacked_path['rewards']))

            _cont_size_dict['_vels'] = _vels
            _cont_size_dict['run_costs'] = _run_costs
            _cont_size_dict['rets'] = _rets
            _task_dict[context_size] = _cont_size_dict
            print('\t\tVel: %.4f +/- %.4f' % (np.mean(_vels), np.std(_vels)))
            _all_rets.extend(_rets)

        all_statistics[task_id] = _task_dict

    print('\nReturns: %.4f +/- %.4f' % (np.mean(_all_rets), np.std(_all_rets)))
    return all_statistics
def _do_training(self, epoch):
    # sample a mini-batch of tasks
    task_batch = self.train_task_params_sampler.sample_unique(
        self.num_tasks_used_per_update)

    # reset the context buffer for these tasks
    for task_params, obs_task_params in task_batch:
        self.training_env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = self.training_env.task_identifier
        self.context_buffer.task_replay_buffers[task_id]._size = 0
        self.context_buffer.task_replay_buffers[task_id]._top = 0

    # generate contexts for each task in the minibatch
    for task_params, obs_task_params in task_batch:
        n_steps_total = 0
        while n_steps_total < self.context_buffer_size_per_task:
            first_obs = self.training_env.reset(
                task_params=task_params, obs_task_params=obs_task_params)
            task_id = self.training_env.task_identifier

            z = self.prior_dist.sample()
            z = z.cpu().data.numpy()[0]
            post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
            new_path = rollout(
                self.training_env,
                post_cond_policy,
                max_path_length=min(
                    self.max_path_length + 1,
                    self.context_buffer_size_per_task - n_steps_total + 1),
                do_not_reset=True,
                first_obs=first_obs)
            n_steps_total += len(new_path['observations'])

            if self.add_context_rollouts_to_replay_buffer:
                self.replay_buffer.add_path(new_path, task_id)
            self.context_buffer.add_path(new_path, task_id)

    # # generate rollouts using the posteriors
    # for task_params, obs_task_params in task_batch:
    #     n_steps_total = 0
    #     while n_steps_total < self.num_posterior_steps_per_task:
    #         first_obs = self.training_env.reset(
    #             task_params=task_params, obs_task_params=obs_task_params)
    #         task_id = self.training_env.task_identifier
    #         post_cond_policy = self.get_posterior_policy(task_id)
    #         new_path = rollout(
    #             self.training_env,
    #             post_cond_policy,
    #             max_path_length=min(self.max_path_length, self.num_context_steps - self.max_path_length),
    #             do_not_reset=True,
    #             first_obs=first_obs
    #         )
    #         n_steps_total += len(new_path['observations'])
    #         self.replay_buffer.add_path(new_path, task_id)

    # now do some training
    for t in range(self.num_update_loops_per_train_call):
        self._do_update(epoch)
def gather_eval_data(
        alg,
        sample_from_prior=False,
        num_rollouts_per_task=8,
        context_sizes=[4],
        num_diff_context=1,
        deterministic=True,
        eval_expert=False,
        just_loading_policy=False,
        render=False,
        use_separate_expert_buffer=False,
        expert_buffer_for_eval_tasks=None,
):
    if not eval_expert:
        alg.encoder.eval()

    all_statistics = {}
    task_num = 0
    # params_sampler = alg.test_task_params_sampler
    # params_sampler = alg.train_task_params_sampler
    params_sampler = AntRandGoalExpertTestSampler()
    if not just_loading_policy:
        env = alg.env
    else:
        env = AntRandGoalEnv()

    for task_params, obs_task_params in params_sampler:
        _task_dict = {}
        print('\n\tEvaluating task {}'.format(obs_task_params))
        task_num += 1
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}
            print('\t\tTry with context size: %d...' % context_size)

            # evaluate all posterior-sample trajectories with the same initial state
            env_seed = np.random.randint(0, high=10000)
            # reset the env seed
            env.seed(seed=env_seed)

            _rets = []
            _min_dists = []
            _last_100 = []
            for _ in range(num_diff_context):
                if sample_from_prior:
                    raise NotImplementedError
                    # z = post_dist.sample()
                    # z = z.cpu().data.numpy()[0]
                    # if sample_from_prior:
                    #     z = np.random.normal(size=z.shape)

                if eval_expert:
                    if just_loading_policy:
                        post_cond_policy = PostCondMLPPolicyWrapper(alg, obs_task_params)
                    else:
                        post_cond_policy = alg.get_eval_policy(obs_task_params)
                else:
                    if use_separate_expert_buffer:
                        list_of_trajs = expert_buffer_for_eval_tasks.sample_trajs_from_task(
                            task_id, context_size)
                        post_dist = alg.encoder([list_of_trajs])
                        z = post_dist.mean
                        z = z.cpu().data.numpy()[0]
                        post_cond_policy = PostCondMLPPolicyWrapper(alg.policy, z)
                    else:
                        post_cond_policy = alg.get_eval_policy(task_id, mode='meta_test')
                        # post_cond_policy = alg.get_eval_policy(task_id, mode='meta_train')
                post_cond_policy.policy.eval()
                post_cond_policy.deterministic = deterministic

                for _ in range(num_rollouts_per_task):
                    if just_loading_policy:
                        max_path_length = 100
                    else:
                        max_path_length = alg.max_path_length
                    stacked_path = rollout_path(env, task_params, obs_task_params,
                                                post_cond_policy, max_path_length,
                                                eval_expert, render)
                    obs = np.array([d['obs'] for d in stacked_path['observations']])

                    _rets.append(np.sum(stacked_path['rewards']))
                    rew_frw = [d['reward_forward'] for d in stacked_path['env_infos']]
                    _min_dists.append(-np.max(rew_frw))
                    _last_100.append(np.mean(rew_frw[-100:]))

            _cont_size_dict['rets'] = _rets
            _cont_size_dict['min_dists'] = _min_dists
            _cont_size_dict['last_100'] = _last_100
            _task_dict[context_size] = _cont_size_dict
            print('\t\t\tMin Dist: %.4f +/- %.4f' % (np.mean(_min_dists), np.std(_min_dists)))
            print(_min_dists)

        all_statistics[task_id] = _task_dict
    return all_statistics