def rollout(env, agent, max_path_length=10000, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a, o)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout_policy(agent, env, max_path_length=200, speedup=1,
                   get_image_observations=False, animated=False):
    """
    Mostly taken from https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    Generate a rollout for a given policy
    """
    observations = []
    im_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    path_length = 0
    while path_length <= max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if get_image_observations:
            if not animated:
                pixel_array = env.render(mode="rgb_array")
            else:
                pixel_array = env.render()
            if pixel_array is None and not animated:
                # Not convinced that behaviour works for all environments, so until
                # such a time as I'm convinced of this, drop into a debug shell
                print("Problem! Couldn't get pixels! Dropping into debug shell.")
                import pdb; pdb.set_trace()
            im_observations.append(pixel_array)
        if d:
            rewards.append(r)
            break
        else:
            rewards.append(r)
    if animated:
        env.render(close=True)
    im_observations = tensor_utils.stack_tensor_list(im_observations)
    observations = tensor_utils.stack_tensor_list(observations)
    rewards = tensor_utils.stack_tensor_list(rewards)
    return dict(
        observations=observations,
        im_observations=im_observations,
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=rewards,
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            save_video=True, video_filename='sim_out.mp4', reset_arg=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0])
                images.append(np.flipud(np.array(pil_image)))
    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20 * speedup)
            else:
                clip.write_videofile(video_filename, fps=20 * speedup)
        # return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
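The rollout helpers above all share the same contract: they drive a policy through one environment episode and return a path dictionary of stacked arrays. A minimal usage sketch follows, assuming an rllab-style environment and Gaussian MLP policy; the environment name and constructor arguments are illustrative assumptions, not taken from the snippets above.

```python
# Hypothetical usage sketch for the rollout helpers above (assumes rllab-style
# classes; names and arguments are illustrative, not from the snippets).
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(GymEnv("Pendulum-v0"))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

# Collect a single trajectory of at most 200 steps.
path = rollout(env, policy, max_path_length=200, animated=False)
print(path["rewards"].sum(), path["observations"].shape)
```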
def ed_dec_rollout(env, agents, max_path_length=np.inf, animated=False, speedup=1): if (agents.recurrent): assert isinstance( agents, GSMDPRecurrentPolicy), 'Recurrent policy is not a GSMDP class' """Decentralized rollout""" n_agents = len(env.agents) observations = [[] for _ in range(n_agents)] actions = [[] for _ in range(n_agents)] rewards = [[] for _ in range(n_agents)] agent_infos = [[] for _ in range(n_agents)] env_infos = [[] for _ in range(n_agents)] offset_t_sojourn = [[] for _ in range(n_agents)] olist = env.reset() assert len(olist) == n_agents, "{} != {}".format(len(olist), n_agents) agents.reset(dones=[True for _ in range(n_agents)]) path_length = 0 if animated: env.render() while path_length < max_path_length: agents_to_act = [ i for i, j in enumerate(olist) if j != [None] * len(j) ] if (not agents.recurrent): alist, agent_info_list = agents.get_actions( [olist[i] for i in agents_to_act]) agent_info_list = tensor_utils.split_tensor_dict_list( agent_info_list) else: alist, agent_info_list = agents.get_actions(olist) alist = [a for a in alist if a != None] agent_info_list = tensor_utils.split_tensor_dict_list( agent_info_list) agent_info_list = [ ainfo for i, ainfo in enumerate(agent_info_list) if i in agents_to_act ] next_actions = [None] * n_agents # will fill in in the loop # For each agent for ind, o in enumerate([olist[j] for j in agents_to_act]): # ind refers to non-None indicies # i refers to indices with Nones i = agents_to_act[ind] observations[i].append(env.observation_space.flatten(o)) # observations[i].append(o) # REMOVE THIS AND UNCOMMENT THE ABOVE LINE actions[i].append(env.action_space.flatten(alist[ind])) next_actions[i] = alist[ind] if agent_info_list is None: agent_infos[i].append({}) else: agent_infos[i].append(agent_info_list[ind]) # take next actions next_olist, rlist, d, env_info = env.step(np.asarray(next_actions)) # update sojourn time (we should associate ts from next_olist to r, not current) for i, r in enumerate(rlist): if r is None: continue # skip reward if agent has not acted yet if (len(observations[i]) > 0): rewards[i].append(r) offset_t_sojourn[i].append( env.observation_space.flatten(next_olist[i])[-1]) env_infos[i].append(env_info) path_length = max([len(o) for o in observations]) if d: break olist = next_olist if animated: env.render() timestep = 0.05 time.sleep(timestep / speedup) if (path_length == max_path_length): # probably have some paths that aren't the right length for ind, o in enumerate(observations): r = rewards[ind] if (len(o) > len(r)): assert(len(o) <= (len(r) + 1)), \ 'len(o) %d, len(r) %d' % (len(o), len(r)) # delete last elem of obs, actions, agent infos del observations[ind][-1] del actions[ind][-1] del agent_infos[ind][-1] if animated: env.render() # remove empty agent trajectories observations = [o for o in observations if len(o) > 0] actions = [a for a in actions if len(a) > 0] rewards = [r for r in rewards if len(r) > 0] agent_infos = [i for i in agent_infos if len(i) > 0] env_infos = [e for e in env_infos if len(e) > 0] offset_t_sojourn = [o for o in offset_t_sojourn if len(o) > 0] if (any( map(lambda x: x < n_agents, [ len(observations), len(actions), len(rewards), len(agent_infos), len(env_infos) ]))): print('\nWARNING: \n') print('n_agents: ', n_agents) print('len(observations): ', len(observations)) print('len(actions): ', len(actions)) print('len(rewards): ', len(rewards)) print('len(agent_infos): ', len(agent_infos)) print('len(env_infos): ', len(env_infos)) return [ dict( 
observations=tensor_utils.stack_tensor_list(observations[i]), actions=tensor_utils.stack_tensor_list(actions[i]), rewards=tensor_utils.stack_tensor_list(rewards[i]), agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]), env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]), offset_t_sojourn=tensor_utils.stack_tensor_list( offset_t_sojourn[i]), ) for i in range(len(observations)) ]
def obtain_samples(self, itr, determ=False): # logger.log("Obtaining samples for iteration %d..." % itr) paths = [] n_samples = 0 obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.algo.batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.algo.batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) if determ: actions = agent_infos['mean'] policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths.append( dict( observations=self.env_spec.observation_space. flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() # logger.record_tabular("PolicyExecTime", policy_time) # logger.record_tabular("EnvExecTime", env_time) # logger.record_tabular("ProcessExecTime", process_time) return paths
def rollout(env, agent, line_params, max_path_length=np.inf, animated=False):
    """
    Modified rollout function from rllab.sampler.utils to run arbitrary
    straight trajectories.
    """
    observations = []
    rewards = []
    actions = []
    agent_infos = []
    env_infos = []
    projected_trajectory = []
    x0, y0, angle = line_params
    env.reset()
    agent.reset()
    # Force start state to be zeros
    # Note: Because env is an instance of NormalizedEnv, there is no
    # way of writing a custom function that I can use to set the
    # initial state. Consequently we just force set it here.
    start_yaw = angle
    start_state = np.array([x0, y0, start_yaw, 0, 0, 0])
    env._wrapped_env._state = start_state
    o = np.zeros(5)
    path_length = 0
    if animated:
        env.render()
    print('--------------------')
    while path_length < max_path_length:
        print('')
        state = env._wrapped_env._state
        print('State = ', state)
        projected_o = StraightEnv.project_line(state, x0, y0, angle)
        print('Projected state = ', projected_o)
        _, agent_info = agent.get_action(projected_o[1:])
        a = agent_info['mean']
        print('Computed action = ', a)
        next_o, r, d, env_info = env.step(a)
        print('Next observation = ', next_o)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        projected_trajectory.append(projected_o)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    print('--------------------')
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    ), projected_trajectory
def process_samples(self, itr, paths, update_baseline=True): baselines = [] returns = [] if hasattr(self.algo.baseline, "predict_n"): all_path_baselines = self.algo.baseline.predict_n(paths) else: all_path_baselines = [ self.algo.baseline.predict(path) for path in paths ] for idx, path in enumerate(paths): path_baselines = np.append(all_path_baselines[idx], 0) deltas = path["rewards"] + \ self.algo.discount * path_baselines[1:] - \ path_baselines[:-1] path["advantages"] = special.discount_cumsum( deltas, self.algo.discount * self.algo.gae_lambda) path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount) baselines.append(path_baselines[:-1]) returns.append(path["returns"]) if hasattr(self.algo, 'epopt_epsilon'): if self.algo.epopt_epsilon < 1.0 and self.algo.epopt_after_iter <= itr: # prune the paths target_path_size = len(paths) * self.algo.epopt_epsilon sorted_indices = np.argsort( [path["returns"][0] for path in paths]) idx = 0 si_idx = 0 while True: if sorted_indices[si_idx] > target_path_size: paths.pop(idx) idx -= 1 idx += 1 si_idx += 1 if idx >= len(paths): break ev = special.explained_variance_1d(np.concatenate(baselines), np.concatenate(returns)) if not self.algo.policy.recurrent: observations = tensor_utils.concat_tensor_list( [path["observations"] for path in paths]) actions = tensor_utils.concat_tensor_list( [path["actions"] for path in paths]) rewards = tensor_utils.concat_tensor_list( [path["rewards"] for path in paths]) returns = tensor_utils.concat_tensor_list( [path["returns"] for path in paths]) advantages = tensor_utils.concat_tensor_list( [path["advantages"] for path in paths]) env_infos = tensor_utils.concat_tensor_dict_list( [path["env_infos"] for path in paths]) agent_infos = tensor_utils.concat_tensor_dict_list( [path["agent_infos"] for path in paths]) if self.algo.center_adv: advantages = util.center_advantages(advantages) if self.algo.positive_adv: advantages = util.shift_advantages_to_positive(advantages) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [] ct = 0 for path in paths: if path['env_infos']['dyn_model_id'][-1] == 0: undiscounted_returns.append(sum(path["rewards"])) if path['env_infos']['dyn_model_id'][-1] == 1: ct += 1 print('path count with fake dynamics: ', ct, len(undiscounted_returns), len(paths)) ent = np.mean(self.algo.policy.distribution.entropy(agent_infos)) samples_data = dict( observations=observations, actions=actions, rewards=rewards, returns=returns, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, paths=paths, ) else: max_path_length = max([len(path["advantages"]) for path in paths]) # make all paths the same length (pad extra advantages with 0) obs = [path["observations"] for path in paths] obs = tensor_utils.pad_tensor_n(obs, max_path_length) if self.algo.center_adv: raw_adv = np.concatenate( [path["advantages"] for path in paths]) adv_mean = np.mean(raw_adv) adv_std = np.std(raw_adv) + 1e-8 adv = [(path["advantages"] - adv_mean) / adv_std for path in paths] else: adv = [path["advantages"] for path in paths] adv = np.asarray( [tensor_utils.pad_tensor(a, max_path_length) for a in adv]) actions = [path["actions"] for path in paths] actions = tensor_utils.pad_tensor_n(actions, max_path_length) rewards = [path["rewards"] for path in paths] rewards = tensor_utils.pad_tensor_n(rewards, max_path_length) returns = [path["returns"] for path in paths] returns = tensor_utils.pad_tensor_n(returns, max_path_length) agent_infos = [path["agent_infos"] for path 
in paths] agent_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos ]) env_infos = [path["env_infos"] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos ]) valids = [np.ones_like(path["returns"]) for path in paths] valids = tensor_utils.pad_tensor_n(valids, max_path_length) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] ent = np.sum( self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids) samples_data = dict( observations=obs, actions=actions, advantages=adv, rewards=rewards, returns=returns, valids=valids, agent_infos=agent_infos, env_infos=env_infos, paths=paths, ) if update_baseline: logger.log("fitting baseline...") if hasattr(self.algo.baseline, 'fit_with_samples'): self.algo.baseline.fit_with_samples(paths, samples_data) else: self.algo.baseline.fit(paths) logger.log("fitted") logger.record_tabular('Iteration', itr) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular('ExplainedVariance', ev) logger.record_tabular('NumTrajs', len(paths)) logger.record_tabular('Entropy', ent) logger.record_tabular('Perplexity', np.exp(ent)) logger.record_tabular('StdReturn', np.std(undiscounted_returns)) logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) logger.record_tabular('MinReturn', np.min(undiscounted_returns)) return samples_data
def obtain_agent_info_offpolicy(self, itr, expert_trajs_dir=None, offpol_trajs=None, treat_as_expert_traj=False, log_prefix=''): assert expert_trajs_dir is None, "deprecated" start = time.time() if offpol_trajs is None: assert expert_trajs_dir is not None, "neither offpol_trajs nor expert_trajs_dir is provided" if self.use_pooled_goals: for t, taskidx in enumerate(self.goals_idxs_for_itr_dict[itr]): assert np.array_equal( self.goals_pool[taskidx], self.goals_to_use_dict[itr][t]), "fail" offpol_trajs = { t: joblib.load(expert_trajs_dir + str(taskidx) + self.expert_trajs_suffix + ".pkl") for t, taskidx in enumerate( self.goals_idxs_for_itr_dict[itr]) } else: offpol_trajs = joblib.load(expert_trajs_dir + str(itr) + self.expert_trajs_suffix + ".pkl") offpol_trajs = { tasknum: offpol_trajs[tasknum] for tasknum in range(self.meta_batch_size) } # some initial rearrangement tasknums = offpol_trajs.keys( ) # tasknums is range(self.meta_batch_size) as can be seen above for t in tasknums: for path in offpol_trajs[t]: if 'expert_actions' not in path.keys( ) and treat_as_expert_traj: # print("copying expert actions, you should do this only 1x per metaitr") path['expert_actions'] = np.clip(deepcopy(path['actions']), -1.0, 1.0) if treat_as_expert_traj: path['agent_infos'] = dict( mean=[[0.0] * len(path['actions'][0])] * len(path['actions']), log_std=[[0.0] * len(path['actions'][0])] * len(path['actions'])) else: path['agent_infos'] = [None] * len(path['rewards']) if not treat_as_expert_traj: print("debug12, running offpol on own previous samples") running_path_idx = {t: 0 for t in tasknums} running_intra_path_idx = {t: 0 for t in tasknums} while max([running_path_idx[t] for t in tasknums ]) > -0.5: # we cycle until all indices are -1 observations = [ offpol_trajs[t][running_path_idx[t]]['observations'][ running_intra_path_idx[t]] for t in tasknums ] actions, agent_infos = self.policy.get_actions(observations) agent_infos = split_tensor_dict_list(agent_infos) for t, action, agent_info in zip(itertools.count(), actions, agent_infos): offpol_trajs[t][running_path_idx[t]]['agent_infos'][ running_intra_path_idx[t]] = agent_info # INDEX JUGGLING: if -0.5 < running_intra_path_idx[t] < len(offpol_trajs[t][ running_path_idx[t]]['rewards']) - 1: # if we haven't reached the end: running_intra_path_idx[t] += 1 else: if -0.5 < running_path_idx[t] < len( offpol_trajs[t]) - 1: # we wrap up the agent_infos offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \ stack_tensor_dict_list(offpol_trajs[t][running_path_idx[t]]['agent_infos']) # if we haven't reached the last path: running_intra_path_idx[t] = 0 running_path_idx[t] += 1 elif running_path_idx[t] == len(offpol_trajs[t]) - 1: offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \ stack_tensor_dict_list(offpol_trajs[t][running_path_idx[t]]['agent_infos']) running_intra_path_idx[t] = -1 running_path_idx[t] = -1 else: # otherwise we set the running index to -1 to signal a stop running_intra_path_idx[t] = -1 running_path_idx[t] = -1 total_time = time.time() - start # logger.record_tabular(log_prefix+"TotalExecTime", total_time) return offpol_trajs
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, init_state=None, no_action=False, using_gym=False, noise=0, o=None, plan=None): observations = [] actions = [] rewards = [] agent_infos = [] env_infos = [] dones = [] # no_action = True if o is None: if init_state is not None: o = env.reset(init_state) else: o = env.reset() agent.reset() path_length = 0 if animated: env.render() while path_length < max_path_length: if not using_gym: a, agent_info = agent.get_action(o) else: if hasattr(agent, 'relative_goals') and agent.relative_goals: ag = env_info['xy_pos'] if len( env_infos) > 0 else env.init_goal_obs goal = plan( ag, env.current_goal) if plan is not None else env.current_goal a = agent.get_actions([o], ag, goal, noise_eps=noise) agent_infos = None else: a = agent.get_actions([o], env.transform_to_goal_space(o), env.current_goal, noise_eps=noise) # a = agent.get_actions([o], np.zeros_like(env.current_goal), np.zeros_like(env.current_goal), noise_eps=noise) agent_infos = None if no_action: a = np.zeros_like(a) next_o, r, d, env_info = env.step(a) observations.append(env.observation_space.flatten(o)) rewards.append(r) actions.append(env.action_space.flatten(a)) if agent_infos is not None: agent_infos.append(agent_info) env_infos.append(env_info) dones.append(d) path_length += 1 if d: break o = next_o if animated: env.render() timestep = 0.05 time.sleep(timestep / speedup) if animated: env.render(close=False) return dict( observations=tensor_utils.stack_tensor_list(observations), actions=tensor_utils.stack_tensor_list(actions), rewards=tensor_utils.stack_tensor_list(rewards), # agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos) if agent_infos is not None else None, env_infos=tensor_utils.stack_tensor_dict_list(env_infos), dones=np.asarray(dones), last_obs=o, )
def process_samples(self, itr, paths): if self.normalize_reward: # Update reward mean/std Q. rewards = [] for i in xrange(len(paths)): rewards.append(paths[i]['rewards']) rewards_flat = np.hstack(rewards) self._reward_mean.append(np.mean(rewards_flat)) self._reward_std.append(np.std(rewards_flat)) # Normalize rewards. reward_mean = np.mean(np.asarray(self._reward_mean)) reward_std = np.mean(np.asarray(self._reward_std)) for i in xrange(len(paths)): paths[i]['rewards'] = (paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8) if itr > 0: kls = [] for i in xrange(len(paths)): kls.append(paths[i]['KL']) kls_flat = np.hstack(kls) logger.record_tabular('Expl_MeanKL', np.mean(kls_flat)) logger.record_tabular('Expl_StdKL', np.std(kls_flat)) logger.record_tabular('Expl_MinKL', np.min(kls_flat)) logger.record_tabular('Expl_MaxKL', np.max(kls_flat)) # Perform normlization of the intrinsic rewards. if self.use_kl_ratio: if self.use_kl_ratio_q: # Update kl Q self.kl_previous.append(np.median(np.hstack(kls))) previous_mean_kl = np.mean(np.asarray(self.kl_previous)) for i in xrange(len(kls)): kls[i] = kls[i] / previous_mean_kl # Add KL ass intrinsic reward to external reward for i in xrange(len(paths)): paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i] # Discount eta self.eta *= self.eta_discount else: logger.record_tabular('Expl_MeanKL', 0.) logger.record_tabular('Expl_StdKL', 0.) logger.record_tabular('Expl_MinKL', 0.) logger.record_tabular('Expl_MaxKL', 0.) baselines = [] returns = [] for path in paths: path_baselines = np.append(self.baseline.predict(path), 0) deltas = path["rewards"] + \ self.discount * path_baselines[1:] - \ path_baselines[:-1] path["advantages"] = special.discount_cumsum( deltas, self.discount * self.gae_lambda) path["returns"] = special.discount_cumsum(path["rewards_orig"], self.discount) baselines.append(path_baselines[:-1]) returns.append(path["returns"]) if not self.policy.recurrent: observations = tensor_utils.concat_tensor_list( [path["observations"] for path in paths]) actions = tensor_utils.concat_tensor_list( [path["actions"] for path in paths]) rewards = tensor_utils.concat_tensor_list( [path["rewards"] for path in paths]) advantages = tensor_utils.concat_tensor_list( [path["advantages"] for path in paths]) env_infos = tensor_utils.concat_tensor_dict_list( [path["env_infos"] for path in paths]) agent_infos = tensor_utils.concat_tensor_dict_list( [path["agent_infos"] for path in paths]) if self.center_adv: advantages = util.center_advantages(advantages) if self.positive_adv: advantages = util.shift_advantages_to_positive(advantages) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [ sum(path["rewards_orig"]) for path in paths ] ent = np.mean(self.policy.distribution.entropy(agent_infos)) ev = special.explained_variance_1d(np.concatenate(baselines), np.concatenate(returns)) samples_data = dict( observations=observations, actions=actions, rewards=rewards, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, paths=paths, ) else: max_path_length = max([len(path["advantages"]) for path in paths]) # make all paths the same length (pad extra advantages with 0) obs = [path["observations"] for path in paths] obs = np.array( [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs]) if self.center_adv: raw_adv = np.concatenate( [path["advantages"] for path in paths]) adv_mean = np.mean(raw_adv) adv_std = np.std(raw_adv) + 1e-8 adv = [(path["advantages"] - adv_mean) / adv_std for path in paths] 
else: adv = [path["advantages"] for path in paths] adv = np.array( [tensor_utils.pad_tensor(a, max_path_length) for a in adv]) actions = [path["actions"] for path in paths] actions = np.array( [tensor_utils.pad_tensor(a, max_path_length) for a in actions]) rewards = [path["rewards"] for path in paths] rewards = np.array( [tensor_utils.pad_tensor(r, max_path_length) for r in rewards]) agent_infos = [path["agent_infos"] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos ]) env_infos = [path["env_infos"] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos ]) valids = [np.ones_like(path["returns"]) for path in paths] valids = np.array( [tensor_utils.pad_tensor(v, max_path_length) for v in valids]) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] ent = np.mean(self.policy.distribution.entropy(agent_infos)) ev = special.explained_variance_1d(np.concatenate(baselines), np.concatenate(returns)) samples_data = dict( observations=obs, actions=actions, advantages=adv, rewards=rewards, valids=valids, agent_infos=agent_infos, env_infos=env_infos, paths=paths, ) logger.log("fitting baseline...") self.baseline.fit(paths) logger.log("fitted") logger.record_tabular('Iteration', itr) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular('ExplainedVariance', ev) logger.record_tabular('NumTrajs', len(paths)) logger.record_tabular('Entropy', ent) logger.record_tabular('Perplexity', np.exp(ent)) logger.record_tabular('StdReturn', np.std(undiscounted_returns)) logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) logger.record_tabular('MinReturn', np.min(undiscounted_returns)) return samples_data
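Both process_samples variants above compute GAE advantages by running special.discount_cumsum over the TD residuals deltas = r[t] + discount * V(s[t+1]) - V(s[t]). A minimal sketch of that helper, assuming it mirrors rllab's scipy-based implementation of rllab.misc.special.discount_cumsum:

```python
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # y[t] = sum_{k >= t} discount**(k - t) * x[k]
    # (assumed to match rllab.misc.special.discount_cumsum).
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


# Advantage estimation as used above:
#   path["advantages"] = discount_cumsum(deltas, discount * gae_lambda)
#   path["returns"]    = discount_cumsum(path["rewards"], discount)
```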
def obtain_samples(self, itr, num_samples=None, log=True, log_prefix='RandomSampler-'): if num_samples is None: num_samples = self.algo.batch_size paths = [] n_samples_collected = 0 obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(num_samples) env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples_collected < num_samples: # random actions t = time.time() actions = np.stack([ self.vec_env.action_space.sample() for _ in range(len(obses)) ], axis=0) policy_time = time.time() - t agent_infos = {} t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths.append( dict( observations=self.env_spec.observation_space. flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples_collected += len(running_paths[idx]["rewards"]) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() if log: logger.record_tabular(log_prefix + "PolicyExecTime", policy_time) logger.record_tabular(log_prefix + "EnvExecTime", env_time) logger.record_tabular(log_prefix + "ProcessExecTime", process_time) return paths
def sample_paths(N, policy, baseline, env_mode='train', T=1e6, gamma=1, mujoco_env=True, normalized_env=False, env=None): # Directly specifying env works only when sampling in series # set random seed (needed for multiprocessing) np.random.seed() if env == None: env = get_environment(env_mode) T = min(T, env.horizon) T = max(1, T) # sometimes, env is not initialized correctly in multiprocessing # this is just a sanity check and step size should essentially be zero. print("####### Worker started #######") paths = [] for ep in range(N): observations = [] actions = [] rewards = [] agent_infos = [] env_infos = [] qpos = [] qvel = [] o = env.reset() if mujoco_env == True: if normalized_env: qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1)) qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1)) else: qpos.append(env.env.model.data.qpos.reshape(-1)) qvel.append(env.env.model.data.qvel.reshape(-1)) done = False t = 0 while t < T and done != True: a, agent_info = policy.get_action(o) next_o, r, done, env_info = env.step(a) observations.append(env.observation_space.flatten(o)) actions.append(env.action_space.flatten(a)) rewards.append(r) agent_infos.append(agent_info) env_infos.append(env_info) if mujoco_env == True: if normalized_env: qpos.append( env.wrapped_env.env.model.data.qpos.reshape(-1)) qvel.append( env.wrapped_env.env.model.data.qvel.reshape(-1)) else: qpos.append(env.env.model.data.qpos.reshape(-1)) qvel.append(env.env.model.data.qvel.reshape(-1)) o = next_o t += 1 # make a path dictionary # Also store the path belief and env data used in the trajectory #try: # path_belief = env.env.belief #except Exception as e: # path_belief = str(e) # path_model = env.env qpos_flat = tensor_utils.stack_tensor_list(qpos) qvel_flat = tensor_utils.stack_tensor_list(qvel) path = dict( observations=tensor_utils.stack_tensor_list(observations), actions=tensor_utils.stack_tensor_list(actions), rewards=tensor_utils.stack_tensor_list(rewards), agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(env_infos), qpos=qpos_flat, qvel=qvel_flat, #path_belief=path_belief, #path_model=path_model, ) # TODO: Storing the path model is too space inefficient. Need to find alternative # compute returns using the path path_baseline = baseline.predict(path) advantages = [] returns = [] return_so_far = 0 for t in range(len(rewards) - 1, -1, -1): return_so_far = rewards[t] + gamma * return_so_far returns.append(return_so_far) advantage = return_so_far - path_baseline[t] advantages.append(advantage) # advantages and returns are stored backward in time advantages = np.array(advantages[::-1]) returns = np.array(returns[::-1]) # normalize advantages advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8) path["advantages"] = advantages path["returns"] = returns paths.append(path) #print "Env body_mass : ", env.env.model.body_mass[1] print("====== Worker finished ======") return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='', extra_input=None, extra_input_dim=None, preupdate=False, save_img_obs=False): # reset_args: arguments to pass to the environments to reset # return_dict: whether or not to return a dictionary or list form of paths logger.log("Obtaining samples for iteration %d..." % itr) if extra_input is not None: if extra_input == "onehot_exploration": if preupdate: print("debug, using extra_input onehot") def expand_obs(obses, path_nums): extra = [special.to_onehot(path_num % extra_input_dim, extra_input_dim) for path_num in path_nums] return np.concatenate((obses, extra), axis=1) else: print("debug, using extra_input zeros") def expand_obs(obses, path_nums): extra = [np.zeros(extra_input_dim) for path_num in path_nums] return np.concatenate((obses, extra),axis=1) elif extra_input == "onehot_hacked": if preupdate: print("debug, using extra_input onehot") def expand_obs(obses, path_nums): extra = [special.to_onehot(3, extra_input_dim) for path_num in path_nums] return np.concatenate((obses, extra), axis=1) else: print("debug, using extra_input zeros") def expand_obs(obses, path_nums): extra = [np.zeros(extra_input_dim) for path_num in path_nums] return np.concatenate((obses, extra),axis=1) elif extra_input == "gaussian_exploration": if preupdate: print("debug, using extra_input gaussian") def expand_obs(obses, path_nums): extra = [np.random.normal(0.,1.,size=(extra_input_dim,)) for path_num in path_nums] return np.concatenate((obses, extra), axis=1) else: print("debug, using extra_input zeros") def expand_obs(obses, path_nums): extra = [np.zeros(extra_input_dim) for path_num in path_nums] return np.concatenate((obses, extra), axis=1) else: def expand_obs(obses, path_nums): return obses else: def expand_obs(obses, path_nums): return obses #paths = [] paths = {} for i in range(self.vec_env.num_envs): paths[i] = [] # if the reset args are not list/numpy, we set the same args for each env if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray): assert False, "debug, should we be using this?" 
print("WARNING, will vectorize reset_args") reset_args = [reset_args]*self.vec_env.num_envs n_samples = 0 path_nums = [0] * self.vec_env.num_envs # keeps track on which rollout we are for each environment instance obses = self.vec_env.reset(reset_args) obses = expand_obs(obses, path_nums) dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) # print("debug, agent_infos", agent_infos) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args) # TODO: instead of receive obs from env, we'll receive it from the policy as a feed_dict next_obses = expand_obs(next_obses,path_nums) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths[idx].append(dict( observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) # TODO: let's also add the incomplete running_paths to paths running_paths[idx] = None path_nums[idx] += 1 process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses # adding the incomplete paths # for idx in range(self.vec_env.num_envs): # if running_paths[idx] is not None: # paths[idx].append(dict( # observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]), # actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]), # rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]), # env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), # agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), # )) pbar.stop() # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time) # logger.record_tabular(log_prefix + "EnvExecTime", env_time) # logger.record_tabular(log_prefix + "ProcessExecTime", process_time) if not return_dict: flatten_list = lambda l: [item for sublist in l for item in sublist] paths = flatten_list(paths.values()) #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()]) return paths
def rarl_rollout(env, agent1, agent2, policy_num, max_path_length=np.inf,
                 animated=False, speedup=1, always_return_paths=False):
    # logger.log("rollout~~~~~~~~~~~~~~~~~~~")
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent1.reset()
    agent2.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a1, agent1_info = agent1.get_action(o)
        a2, agent2_info = agent2.get_action(o)
        action_true = np.append(a1, a2)
        Action = {}
        Action['action'] = np.append(a1, a2)
        # Action['dist1'] = agent1_info
        # Action['dist2'] = agent2_info
        Action['policy_num'] = policy_num
        next_o, r, d, env_info = env.step(Action)
        # print(' ')
        # print('policy_num: ', policy_num, ' a1: ', a1, ' a2: ', a2, ' reward: ', r)
        if policy_num == 1:
            observations.append(agent1._env_spec.observation_space.flatten(o))
            rewards.append(r)
            actions.append(agent1._env_spec.action_space.flatten(a1))
            agent_infos.append(agent1_info)
        else:
            observations.append(agent2._env_spec.observation_space.flatten(o))
            rewards.append(r)
            actions.append(agent2._env_spec.action_space.flatten(a2))
            agent_infos.append(agent2_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def obtain_samples(self, itr):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        self.algo.policy.reset(dones)
        actions, agent_infos = self.algo.policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in xrange(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in xrange(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
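The vectorized samplers above all lean on tensor_utils to convert between a list of per-step info dicts and a dict of stacked arrays. A simplified sketch of the assumed behaviour of those helpers follows; the real rllab implementation also handles nested dicts, so this is only an approximation for reading the code.

```python
import numpy as np


def stack_tensor_list(tensor_list):
    # [x_1, ..., x_T] -> array of shape (T, ...)
    return np.array(tensor_list)


def stack_tensor_dict_list(tensor_dict_list):
    # [{"k": x_1}, ..., {"k": x_T}] -> {"k": array of shape (T, ...)}
    keys = tensor_dict_list[0].keys()
    return {k: np.array([d[k] for d in tensor_dict_list]) for k in keys}


def split_tensor_dict_list(tensor_dict):
    # {"k": array of shape (N, ...)} -> [{"k": x_1}, ..., {"k": x_N}]
    # Returns None for an empty dict, which is why the samplers above
    # substitute [dict(), ...] when the result is None.
    if not tensor_dict:
        return None
    keys = list(tensor_dict.keys())
    n = len(tensor_dict[keys[0]])
    return [{k: tensor_dict[k][i] for k in keys} for i in range(n)]
```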
def rollout_torch(env, agent, max_path_length=np.inf, animated=False, speedup=1, always_return_paths=False, extra_clip=False, terminate_only_max_path=False): observations = [] next_observations = [] normalized_observations = [] normalized_next_observations = [] unscaled_actions = [] actions = [] rewards = [] agent_infos = [] env_infos = [] mask = [] o = env.reset() try: agent.reset() except AttributeError: pass path_length = 0 t = 0 def handle_obs(o): # get list with bools if output of env is normalized if isinstance(env, TorchModel): normalized_obs = env.normalized_output else: normalized_obs = [False] * len(o) unnormalized_idx = [i for i, x in enumerate(normalized_obs) if not x] normalized_idx = [i for i, x in enumerate(normalized_obs) if x] lb, ub = env.observation_space.bounds # normalize the unnormalized idx normalized_unnormalized_val = ( 2 * (o[unnormalized_idx] - lb[unnormalized_idx]) / (ub[unnormalized_idx] - lb[unnormalized_idx])) - 1 normalized_unnormalized_val = np.clip(normalized_unnormalized_val, -1, 1) # unnormalize the normalized idx unnormalized_normalized_val = lb[normalized_idx] + ( o[normalized_idx] + 1.) * 0.5 * (ub[normalized_idx] - lb[normalized_idx]) unnormalized_normalized_val = np.clip(unnormalized_normalized_val, lb[normalized_idx], ub[normalized_idx]) # put everything together normalized_obs = np.zeros(o.shape) normalized_obs[normalized_idx] = o[normalized_idx] normalized_obs[unnormalized_idx] = normalized_unnormalized_val unnormalized_obs = np.zeros(o.shape) unnormalized_obs[unnormalized_idx] = o[unnormalized_idx] unnormalized_obs[normalized_idx] = unnormalized_normalized_val # do extra clipping since original values could be out of bounds if extra_clip: normalized_obs = np.clip(normalized_obs, -1, 1) unnormalized_obs = np.clip(unnormalized_obs, lb, ub) # TODO: build own function for this # select the right observations for the agent normalized_policy_input = agent.normalized_input normalized_policy_input_idx = [ i for i, x in enumerate(normalized_policy_input) if x ] unnormalized_policy_input_idx = [ i for i, x in enumerate(normalized_policy_input) if not x ] policy_input = np.zeros(o.shape) policy_input[normalized_policy_input_idx] = normalized_obs[ normalized_policy_input_idx] policy_input[unnormalized_policy_input_idx] = unnormalized_obs[ unnormalized_policy_input_idx] agent_obs_torch_var = (torch.from_numpy(policy_input.astype( np.float32))).unsqueeze(0) # select the right observations for the env if isinstance(env, TorchModel): normalized_env_input = env.normalized_input_obs else: normalized_env_input = [False] * len(o) normalized_env_input_idx = [ i for i, x in enumerate(normalized_env_input) if x ] unnormalized_env_input_idx = [ i for i, x in enumerate(normalized_env_input) if not x ] env_input = np.zeros(o.shape) env_input[normalized_env_input_idx] = normalized_obs[ normalized_env_input_idx] env_input[unnormalized_env_input_idx] = unnormalized_obs[ unnormalized_env_input_idx] env_obs_torch_var = (torch.from_numpy(env_input.astype(np.float32))) return normalized_obs, unnormalized_obs, agent_obs_torch_var, env_obs_torch_var def handle_action(a): normalized_a = agent.normalized_output # scale only the normalized action outputs unnormalized_idx = [i for i, x in enumerate(normalized_a) if not x] normalized_idx = [i for i, x in enumerate(normalized_a) if x] lb, ub = env.action_space.bounds # normalize the unnormalized idx normalized_unnormalized_val = ( 2 * (a[unnormalized_idx] - lb[unnormalized_idx]) / (ub[unnormalized_idx] - lb[unnormalized_idx])) - 1 
normalized_unnormalized_val = np.clip(normalized_unnormalized_val, -1, 1) # unnormalize the normalized idx unnormalized_normalized_val = lb[normalized_idx] + ( a[normalized_idx] + 1.) * 0.5 * (ub[normalized_idx] - lb[normalized_idx]) unnormalized_normalized_val = np.clip(unnormalized_normalized_val, lb[normalized_idx], ub[normalized_idx]) # put everything together normalized_a = np.zeros(a.shape) normalized_a[normalized_idx] = a[normalized_idx] normalized_a[unnormalized_idx] = normalized_unnormalized_val unnormalized_a = np.zeros(a.shape) unnormalized_a[unnormalized_idx] = a[unnormalized_idx] unnormalized_a[normalized_idx] = unnormalized_normalized_val # do extra clipping since original values could be out of bounds if extra_clip: normalized_a = np.clip(normalized_a, -1, 1) unnormalized_a = np.clip(unnormalized_a, lb, ub) unscaled_a = normalized_a action = unnormalized_a # select the right actions for the env if isinstance(env, TorchModel): normalized_env_input = env.normalized_input_a else: normalized_env_input = [False] * len(a) normalized_env_input_idx = [ i for i, x in enumerate(normalized_env_input) if x ] unnormalized_env_input_idx = [ i for i, x in enumerate(normalized_env_input) if not x ] env_input = np.zeros(a.shape) env_input[normalized_env_input_idx] = normalized_a[ normalized_env_input_idx] env_input[unnormalized_env_input_idx] = unnormalized_a[ unnormalized_env_input_idx] env_a_np_var = env_input return action, unscaled_a, env_a_np_var if animated: env.render() while path_length < max_path_length: # TODO: it might be the case that the env is not giving a numpy array normalized_o, o, agent_obs_torch, env_obs_torch = handle_obs(o) a, agent_info = agent.select_action(agent_obs_torch, t) #print(a, agent_obs_torch) a, unscaled_a, env_a_torch = handle_action(a) if isinstance(env, TorchModel): #print(env_a_torch, env_obs_torch, o) #print(a, unscaled_a, env_a_torch) next_orig_o, r, d, env_info = env.step(env_a_torch, env_obs_torch, o) else: next_orig_o, r, d, env_info = env.step(a) normalized_next_o, next_o, _, _ = handle_obs(next_orig_o) observations.append(env.observation_space.flatten(o)) normalized_observations.append( env.observation_space.flatten(normalized_o)) next_observations.append(next_o) normalized_next_observations.append(normalized_next_o) rewards.append(r) actions.append(env.action_space.flatten(a)) unscaled_actions.append(env.action_space.flatten(unscaled_a)) agent_infos.append(agent_info) env_infos.append(env_info) if animated: env.render() timestep = 0.05 time.sleep(timestep / speedup) print(o, r, a, next_o) path_length += 1 if d and not terminate_only_max_path: mask.append(0) break elif path_length == max_path_length: mask.append(0) # add termination when we reached max time break elif not d: mask.append(1) else: mask.append(0) o = next_orig_o t += 1 if animated: try: env.close() except AttributeError: pass if animated and not always_return_paths: return return dict( observations=tensor_utils.stack_tensor_list(observations), actions=tensor_utils.stack_tensor_list(actions), rewards=tensor_utils.stack_tensor_list(rewards), next_observations=tensor_utils.stack_tensor_list(next_observations), agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(env_infos), mask=tensor_utils.stack_tensor_list(mask), normalized_observations=tensor_utils.stack_tensor_list( normalized_observations), normalized_next_observations=tensor_utils.stack_tensor_list( normalized_next_observations), 
unscaled_actions=tensor_utils.stack_tensor_list(unscaled_actions), )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            save_video=True, video_filename='sim_out.mp4', reset_arg=None,
            use_maml=False, maml_task_index=None, maml_num_tasks=None,
            use_rl2=False, new_trial=True):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    if use_rl2:
        agent.reset(new_trial=new_trial)
    else:
        agent.reset()
    path_length = 0
    if animated:
        env1 = env
        while hasattr(env1, "wrapped_env"):
            env1 = env1.wrapped_env
        if hasattr(env1, "viewer_setup"):
            env1.viewer_setup()
        env.render()
    while path_length < max_path_length:
        if not use_maml and not use_rl2:
            a, agent_info = agent.get_action(observation=o)
        else:
            a, agent_info = agent.get_action_single_env(
                observation=o, idx=maml_task_index, num_tasks=maml_num_tasks)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0])
                images.append(np.flipud(np.array(pil_image)))
    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20 * speedup)
            else:
                clip.write_videofile(video_filename, fps=20 * speedup)
        # return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def obtain_samples(self, itr, oracle_policy): logger.log("Obtaining samples for iteration %d..." % itr) paths = [] agent_only_paths = [] oracle_only_paths = [] n_samples = 0 obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs agent_only_running_paths = [None] * self.vec_env.num_envs oracle_only_running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.algo.batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.algo.batch_size: t = time.time() policy.reset(dones) agent_actions, binary_actions, agent_infos = policy.get_actions( obses) oracle_actions, oracle_agent_infos = oracle_policy.get_actions( obses) sigma = np.round(binary_actions) actions_1 = np.array([ sigma[0, 0] * agent_actions[0, :] + sigma[0, 1] * oracle_actions[0, :] ]) actions_2 = np.array([ sigma[1, 0] * agent_actions[1, :] + sigma[1, 1] * oracle_actions[1, :] ]) actions = np.concatenate((actions_1, actions_2), axis=0) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step( actions, itr) agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths.append( dict( observations=self.env_spec.observation_space. flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None if sigma[0, 0] == 1 or sigma[1, 0] == 1: for idx, observation, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if agent_only_running_paths[idx] is None: agent_only_running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) agent_only_running_paths[idx]["observations"].append( observation) agent_only_running_paths[idx]["actions"].append(action) agent_only_running_paths[idx]["rewards"].append(reward) agent_only_running_paths[idx]["env_infos"].append(env_info) agent_only_running_paths[idx]["agent_infos"].append( agent_info) if done: agent_only_paths.append( dict( observations=self.env_spec.observation_space. 
flatten_n(agent_only_running_paths[idx] ["observations"]), actions=self.env_spec.action_space.flatten_n( agent_only_running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( agent_only_running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( agent_only_running_paths[idx] ["env_infos"]), agent_infos=tensor_utils. stack_tensor_dict_list( agent_only_running_paths[idx] ["agent_infos"]), )) n_samples += len( agent_only_running_paths[idx]["rewards"]) agent_only_running_paths[idx] = None """ To get paths taken by the oracle """ # elif sigma[0] == 0. or sigma[1] == 0.: # for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, # rewards, env_infos, agent_infos, # dones): # if oracle_only_running_paths[idx] is None: # oracle_only_running_paths[idx] = dict( # observations=[], # actions=[], # rewards=[], # env_infos=[], # agent_infos=[], # ) # oracle_only_running_paths[idx]["observations"].append(observation) # oracle_only_running_paths[idx]["actions"].append(action) # oracle_only_running_paths[idx]["rewards"].append(reward) # oracle_only_running_paths[idx]["env_infos"].append(env_info) # oracle_only_running_paths[idx]["agent_infos"].append(agent_info) # if done: # oracle_only_paths.append(dict( # observations=self.env_spec.observation_space.flatten_n(oracle_only_running_paths[idx]["observations"]), # actions=self.env_spec.action_space.flatten_n(oracle_only_running_paths[idx]["actions"]), # rewards=tensor_utils.stack_tensor_list(oracle_only_running_paths[idx]["rewards"]), # env_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["env_infos"]), # agent_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["agent_infos"]), # )) # n_samples += len(oracle_only_running_paths[idx]["rewards"]) # oracle_only_running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() logger.record_tabular("PolicyExecTime", policy_time) logger.record_tabular("EnvExecTime", env_time) logger.record_tabular("ProcessExecTime", process_time) #return paths, agent_only_paths, oracle_only_paths return paths, agent_only_paths
def rollout(env, agent, max_path_length=np.inf, reset_start_rollout=True, keep_rendered_rgbs=False, animated=False, speedup=1): """ :param reset_start_rollout: whether to reset the env when calling this function :param keep_rendered_rgbs: whether to keep a list of all rgb_arrays (for future video making) """ observations = [] actions = [] rewards = [] agent_infos = [] env_infos = [] terminated = [] if reset_start_rollout: o = env.reset() # otherwise it will never advance!! else: if isinstance(env, NormalizedEnv): o = env.wrapped_env.get_current_obs() else: o = env.get_current_obs() agent.reset() path_length = 0 if animated: env.render() if keep_rendered_rgbs: # will return a new entry to the path dict with all rendered images rendered_rgbs = [env.render(mode='rgb_array')] while path_length < max_path_length: # print("next_o", len(o)) # print("env", env) a, agent_info = agent.get_action(o) next_o, r, d, env_info = env.step(a) # print("next_obs", next_o.shape) # print("env", env) observations.append(env.observation_space.flatten(o)) rewards.append(r) actions.append(env.action_space.flatten(a)) agent_infos.append(agent_info) env_infos.append(env_info) path_length += 1 if d: terminated.append(1) break terminated.append(0) o = next_o if keep_rendered_rgbs: # will return a new entry to the path dict with all rendered images rendered_rgbs.append(env.render(mode='rgb_array')) if animated: env.render() timestep = 0.05 time.sleep(timestep / speedup) # if animated: # this is off as in the case of being an inner rollout, it will close the outer renderer! # env.render(close=True) path_dict = dict( observations=tensor_utils.stack_tensor_list(observations), actions=tensor_utils.stack_tensor_list(actions), rewards=tensor_utils.stack_tensor_list(rewards), agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(env_infos), # here it concatenates all lower-level paths! # termination indicates if the rollout was terminated or if we simply reached the limit of steps: important # when BOTH happend at the same time, to still be able to know it was the done (for hierarchized envs) terminated=tensor_utils.stack_tensor_list(terminated), ) if keep_rendered_rgbs: path_dict['rendered_rgbs'] = tensor_utils.stack_tensor_list(rendered_rgbs) return path_dict
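# Hedged usage sketch (assumptions: `env` and `agent` are pre-built rllab-style objects
# compatible with the rollout variant directly above, and moviepy is available; this
# mirrors the save_video branch of the earlier rollout, it is not part of the source):
def save_rollout_video(env, agent, filename='rollout.mp4', horizon=500):
    path = rollout(env, agent, max_path_length=horizon,
                   reset_start_rollout=True, keep_rendered_rgbs=True)
    import moviepy.editor as mpy
    clip = mpy.ImageSequenceClip(list(path['rendered_rgbs']), fps=20)
    clip.write_videofile(filename, fps=20)
    return path['rewards'].sum(), bool(path['terminated'][-1])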
def process_samples(self, itr, paths, prefix='', log=True, fast_process=False, testitr=False, metalearn_baseline=False , isExpertTraj = False): baselines = [] returns = [] if testitr: metalearn_baseline = False train_baseline = (itr in BASELINE_TRAINING_ITRS) if not fast_process: for idx, path in enumerate(paths): path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount) if not fast_process and not metalearn_baseline: if log: pass #logger.log("fitting baseline...") if hasattr(self.algo.baseline, 'fit_with_samples'): self.algo.baseline.fit_with_samples(paths, samples_data) # TODO: doesn't seem like this is ever used else: # print("debug21 baseline before fitting",self.algo.baseline.predict(paths[0])[0:2], "...",self.algo.baseline.predict(paths[0])[-3:-1]) # print("debug23 predloss before fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths])) self.algo.baseline.fit(paths, log=log) # print("debug25 predloss AFTER fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths])) # print("debug22 returns ",paths[0]['returns'][0:2], "...",paths[0]['returns'][-3:-1]) # print("debug24 baseline after fitting",self.algo.baseline.predict(paths[0])[0:2], "...", self.algo.baseline.predict(paths[0])[-3:-1]) if log: pass #logger.log("fitted") if 'switch_to_init_dist' in dir(self.algo.baseline): self.algo.baseline.switch_to_init_dist() if train_baseline: self.algo.baseline.fit_train_baseline(paths) if hasattr(self.algo.baseline, "predict_n"): all_path_baselines = self.algo.baseline.predict_n(paths) else: all_path_baselines = [self.algo.baseline.predict(path) for path in paths] for idx, path in enumerate(paths): if not fast_process and not metalearn_baseline: # if idx==0: # print("debug22", all_path_baselines[idx]) # print("debug23", path['returns']) path_baselines = np.append(all_path_baselines[idx], 0) deltas = path["rewards"] + \ self.algo.discount * path_baselines[1:] - \ path_baselines[:-1] path["advantages"] = special.discount_cumsum( deltas, self.algo.discount * self.algo.gae_lambda) baselines.append(path_baselines[:-1]) if not fast_process: returns.append(path["returns"]) if "expert_actions" not in path.keys(): if ("expert_actions" in path["env_infos"].keys()): path["expert_actions"] = path["env_infos"]["expert_actions"] else: # assert False, "you shouldn't need expert_actions" path["expert_actions"] = np.array([[None]*len(path['actions'][0])] * len(path['actions'])) if not fast_process and not metalearn_baseline: # TODO: we want the ev eventually ev = special.explained_variance_1d( np.concatenate(baselines), np.concatenate(returns) ) l2 = np.linalg.norm(np.array(baselines)-np.array(returns)) if not self.algo.policy.recurrent: observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths]) actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths]) if not fast_process: rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths]) returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths]) if "env_infos" in paths[0].keys(): env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths]) if not fast_process and not metalearn_baseline: advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths]) # print("debug, advantages are", advantages,) # print("debug, shape of advantages is", type(advantages), np.shape(advantages)) expert_actions = 
tensor_utils.concat_tensor_list([path["expert_actions"] for path in paths]) agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths]) if not fast_process and not metalearn_baseline: if self.algo.center_adv: advantages = util.center_advantages(advantages) if self.algo.positive_adv: advantages = util.shift_advantages_to_positive(advantages) if "meta_predict" in dir(self.algo.baseline): # print("debug, advantages are", advantages, ) advantages = advantages + self.algo.baseline.meta_predict(observations) print("debug, metalearned baseline constant is", self.algo.baseline.meta_predict(observations)[0:2],"...",self.algo.baseline.meta_predict(observations)[-3:-1]) # print("debug, metalearned baseline constant shape is", np.shape(self.algo.baseline.meta_predict(observations))) # print("debug, advantages are", advantages[0:2],"...", advantages[-3:-1]) # print("debug, advantages shape is", np.shape(advantages)) # average_discounted_return = \ # np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path.get("rewards",[0])) for path in paths] # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos)) if fast_process: samples_data = dict( observations=observations, actions=actions, agent_infos=agent_infos, paths=paths, expert_actions=expert_actions, ) elif metalearn_baseline: samples_data = dict( observations=observations, actions=actions, rewards=rewards, returns=returns, agent_infos=agent_infos, paths=paths, expert_actions=expert_actions, ) if 'agent_infos_orig' in paths[0].keys(): agent_infos_orig = tensor_utils.concat_tensor_dict_list([path["agent_infos_orig"] for path in paths]) samples_data["agent_infos_orig"] = agent_infos_orig else: samples_data = dict( observations=observations, actions=actions, rewards=rewards, returns=returns, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, paths=paths, expert_actions=expert_actions, ) if 'agent_infos_orig' in paths[0].keys(): agent_infos_orig = tensor_utils.concat_tensor_dict_list([path["agent_infos_orig"] for path in paths]) samples_data["agent_infos_orig"] = agent_infos_orig else: max_path_length = max([len(path["advantages"]) for path in paths]) # make all paths the same length (pad extra advantages with 0) obs = [path["observations"] for path in paths] obs = tensor_utils.pad_tensor_n(obs, max_path_length) if self.algo.center_adv: raw_adv = np.concatenate([path["advantages"] for path in paths]) adv_mean = np.mean(raw_adv) adv_std = np.std(raw_adv) + 1e-8 adv = [(path["advantages"] - adv_mean) / adv_std for path in paths] else: adv = [path["advantages"] for path in paths] adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv]) actions = [path["actions"] for path in paths] actions = tensor_utils.pad_tensor_n(actions, max_path_length) rewards = [path["rewards"] for path in paths] rewards = tensor_utils.pad_tensor_n(rewards, max_path_length) returns = [path["returns"] for path in paths] returns = tensor_utils.pad_tensor_n(returns, max_path_length) agent_infos = [path["agent_infos"] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list( [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos] ) env_infos = [path["env_infos"] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list( [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos] ) valids = [np.ones_like(path["returns"]) for path in paths] valids = tensor_utils.pad_tensor_n(valids, max_path_length) average_discounted_return = \ 
np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path.get("rewards",[0])) for path in paths] # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids) samples_data = dict( observations=obs, actions=actions, advantages=adv, rewards=rewards, returns=returns, valids=valids, agent_infos=agent_infos, env_infos=env_infos, paths=paths, ) if log: # logger.record_tabular('Iteration', itr) # logger.record_tabular('AverageDiscountedReturn', # average_discounted_return) logger.record_tabular(prefix + 'NumTrajs', len(paths)) if testitr and prefix == "1": # TODO make this functional for more than 1 iteration self.memory["AverageReturnLastTest"]=np.mean(undiscounted_returns) self.memory["AverageReturnBestTest"]=max(self.memory["AverageReturnLastTest"],self.memory["AverageReturnBestTest"]) if self.memory["AverageReturnBestTest"] == 0.0: self.memory["AverageReturnBestTest"] = self.memory["AverageReturnLastTest"] if not testitr and prefix == '1': logger.record_tabular(prefix + 'AverageExpertReturn', np.mean(undiscounted_returns)) #if testitr: logger.record_tabular(prefix + 'AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular(prefix + 'StdReturn', np.std(undiscounted_returns)) logger.record_tabular(prefix + 'MaxReturn', np.max(undiscounted_returns)) logger.record_tabular(prefix + 'MinReturn', np.min(undiscounted_returns)) if not fast_process and not metalearn_baseline: logger.record_tabular(prefix + 'ExplainedVariance', ev) logger.record_tabular(prefix + 'BaselinePredLoss', l2) # logger.record_tabular(prefix + 'Entropy', ent) # logger.record_tabular(prefix + 'Perplexity', np.exp(ent)) # if "env_infos" in paths[0].keys() and "success_left" in paths[0]["env_infos"].keys(): # logger.record_tabular(prefix + 'success_left', eval_success_left(paths)) # logger.record_tabular(prefix + 'success_right', eval_success_right(paths)) # else: # logger.record_tabular(prefix + 'success_left', -1.0) # logger.record_tabular(prefix + 'success_right', -1.0) # if metalearn_baseline: # if hasattr(self.algo.baseline, "revert"): # self.algo.baseline.revert() return samples_data
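# The advantage computation in process_samples above is standard GAE(lambda):
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then a discounted cumulative sum with
# factor gamma * lambda. A self-contained sketch; discount_cumsum here is a plain-loop
# stand-in for rllab's special.discount_cumsum (which uses scipy's lfilter):
import numpy as np

def discount_cumsum(x, discount):
    """Backward discounted cumulative sum: out[t] = sum_k discount**k * x[t + k]."""
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae_advantages(rewards, baselines, discount, gae_lambda):
    """rewards, baselines: (T,) arrays; returns the (T,) GAE advantages."""
    v = np.append(baselines, 0.0)                  # bootstrap the terminal value with 0
    deltas = rewards + discount * v[1:] - v[:-1]   # one-step TD errors
    return discount_cumsum(deltas, discount * gae_lambda)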
def obtain_samples(self, itr, max_path_length, batch_size, max_n_trajs=None): logger.log("Obtaining samples for iteration %d..." % itr) paths = [] n_samples = 0 dones = np.asarray([True] * self.vec_env.n_envs) obses = self.vec_env.reset(dones) running_paths = [None] * self.vec_env.n_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.policy import time while n_samples < batch_size: t = time.time() if hasattr(self.vec_env, "handle_policy_reset"): self.vec_env.handle_policy_reset(policy, dones) else: policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step( actions, max_path_length=max_path_length) if np.any(dones): new_obses = self.vec_env.reset(dones) reset_idx = 0 for idx, done in enumerate(dones): if done: next_obses[idx] = new_obses[reset_idx] reset_idx += 1 env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.n_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.n_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths.append( dict( observations=self.env_spec.observation_space. flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None if max_n_trajs is not None and len(paths) >= max_n_trajs: break process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() logger.record_tabular("PolicyExecTime", policy_time) logger.record_tabular("EnvExecTime", env_time) logger.record_tabular("ProcessExecTime", process_time) return paths
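# split_tensor_dict_list converts a dict of stacked per-env arrays into one dict per
# environment. A simplified stand-in (the rllab version also handles nested dicts),
# shown only to clarify the data layout the vectorized samplers rely on:
import numpy as np

def split_tensor_dict_list_simple(tensor_dict):
    """dict of (n_envs, ...) arrays -> list of n_envs dicts of (...) arrays."""
    if not tensor_dict:
        return None
    n = len(next(iter(tensor_dict.values())))
    return [{k: v[i] for k, v in tensor_dict.items()} for i in range(n)]

# agent_infos = {'mean': np.zeros((3, 2)), 'log_std': np.ones((3, 2))}
# split_tensor_dict_list_simple(agent_infos)[0] -> {'mean': array([0., 0.]), 'log_std': array([1., 1.])}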
def rollout_hide(env, agents, max_path_length=np.inf, animated=False, speedup=1, always_return_paths=False, mode=None, hide_tmax=None, init_state=None, init_goal=None, return_states_as_list=False): ## HIDE AGENT # animated = True # Reset the model configuration # print('Init goal: ', init_goal) if env.spec.id[:6] == 'Blocks': env.reset() obs = env.env.env.reload_model(pose=init_state, goal=init_goal) else: env.reset() obs = env.env.env.reload_model(pose=init_state, goal=init_goal) # time.sleep(1) # if animated: # print('rollout: HIDE') # frame_skip_prev = env.env.unwrapped.frame_skip # env.env.unwrapped.frame_skip = 20 hide_observations = [] hide_states = [] hide_actions = [] hide_rewards = [] hide_agent_infos = [] hide_env_infos = [] # Hide is capable of stopping so let's set stop if available # WARNING: It is important to do all this stuff after reset, since # blocks dependent stuff could be reset from config file as well if mode is not None: if mode == 'seek_force_only': env.env.env.use_stop = True env.env.env.add_mnist_reward(False) env.env.env.use_mnist_stop_criteria(False) elif mode == 'reach_center_and_stop': env.env.env.use_stop = True env.env.env.use_distance2center_stop_criteria = False prev_set_limit = env.env.unwrapped.step_limit if hide_tmax is not None: env.env.unwrapped.step_limit = hide_tmax agents['hide'].reset() hide_path_length = 0 if animated: env.render() while hide_path_length < max_path_length: a, agent_info = agents['hide'].get_action(obs) if animated: env.render() # need to do it before the step, to match states to observations in the vector hide_states.append(env.env.unwrapped.get_all_pose()) obs_next, r, d, env_info = env.step(a) # print('action:', a) hide_observations.append(obs) hide_rewards.append(r) hide_actions.append(env.action_space.flatten(a)) hide_agent_infos.append(agent_info) hide_env_infos.append(env_info) hide_path_length += 1 obs = obs_next if d: print('Hide | path_length:', hide_path_length) break if mode is not None: if mode == 'seek_force_only': env.env.env.use_stop = False env.env.env.add_mnist_reward(True) env.env.env.use_mnist_stop_criteria(True) elif mode == 'reach_center_and_stop': env.env.env.use_stop = False env.env.env.use_distance2center_stop_criteria = True if hide_tmax is not None: env.env.unwrapped.step_limit = prev_set_limit if not return_states_as_list: hide_states = tensor_utils.stack_tensor_list(hide_states) hide_paths = dict( observations=e2e_tensor_utils.stack_tensor_list(hide_observations), actions=tensor_utils.stack_tensor_list(hide_actions), rewards=tensor_utils.stack_tensor_list(hide_rewards), agent_infos=tensor_utils.stack_tensor_dict_list(hide_agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(hide_env_infos), states=hide_states, ) # print('Episode done:', hide_path_length) return hide_paths
def obtain_samples(self, itr, init_state=None, reset_args=None, return_dict=False, log_prefix=''): # reset_args: arguments to pass to the environments to reset # return_dict: whether or not to return a dictionary or list form of paths logger.log("Obtaining samples for iteration %d..." % itr) #paths = [] paths = {} for i in range(self.vec_env.num_envs): paths[i] = [] # if the reset args are not list/numpy, we set the same args for each env if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray): reset_args = [reset_args] * self.vec_env.num_envs if init_state is not None: init_state = [init_state] * self.vec_env.num_envs n_samples = 0 obses = self.vec_env.reset(init_state, reset_args) dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs else: n_samples = 0 obses = self.vec_env.reset(reset_args) dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.algo.batch_size) policy_time = 0 env_time = 0 process_time = 0 #policy = self.algo.policy import time while n_samples < self.algo.max_path_length: t = time.time() #self.env_spec.reset(reset_args = reset_args) #policy.reset(dones) actions, agent_infos = self.get_MPC_action(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step( actions, reset_args) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths[idx].append( dict( observations=running_paths[idx]["observations"], actions=running_paths[idx]["actions"], rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() logger.record_tabular(log_prefix + "PolicyExecTime", policy_time) logger.record_tabular(log_prefix + "EnvExecTime", env_time) logger.record_tabular(log_prefix + "ProcessExecTime", process_time) if not return_dict: flatten_list = lambda l: [ item for sublist in l for item in sublist ] paths = flatten_list(paths.values()) #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()]) return paths
def rollout_hide_seek(env, agents, max_path_length=np.inf, animated=False, speedup=1, always_return_paths=False, mode=None, hide_tmax=None): # animated = True ## HIDE AGENT #Reset the model configuration env.reset() obs = env.env.env.reload_model() last_goal = env.env.unwrapped.get_all_pose() # print('-----------------------------------------------------') # print('goal hide: ', env.env.env.goal, 'obs:', obs) # if animated: # print('rollout: HIDE') # print('Frame skip = ', env.env.unwrapped.frame_skip) # frame_skip_prev = env.env.unwrapped.frame_skip # env.env.unwrapped.frame_skip = 20 hide_observations = [] hide_actions = [] hide_rewards = [] hide_agent_infos = [] hide_env_infos = [] # Hide is capable of stopping so let's set stop if available # WARNING: It is important to do all this stuff after reset, since # blocks dependent stuff could be reset from config file as well if mode is not None: if mode == 'seek_force_only': env.env.env.use_stop = True env.env.env.add_mnist_reward(False) env.env.env.use_mnist_stop_criteria(False) elif mode == 'reach_center_and_stop': env.env.env.use_stop = True env.env.env.use_distance2center_stop_criteria = False prev_set_limit = env.env.unwrapped.step_limit if hide_tmax is not None: env.env.unwrapped.step_limit = hide_tmax # print('rollout: hide step_limit = ', env.env.unwrapped.step_limit) agents['hide'].reset() hide_path_length = 0 if animated: env.render() # print('rollout: HIDE') while hide_path_length < max_path_length: a, agent_info = agents['hide'].get_action(obs) # print('hide action: ', a) if animated: env.render() obs_next, r, d, env_info = env.step(a) hide_observations.append(obs) hide_rewards.append(r) hide_actions.append(env.action_space.flatten(a)) hide_agent_infos.append(agent_info) hide_env_infos.append(env_info) hide_path_length += 1 last_pose = env.env.unwrapped.get_all_pose() # last_goal = copy.deepcopy(env.env.env.goal) obs = obs_next # print('hide obs: ', obs_next) # time.sleep(0.5) # if r > 0: # print('!!!!!!!!!!!!!! 
r:', r, 'stop crit: ', env.env.unwrapped.use_distance2center_stop_criteria) if d: break # print('step hide') # print('-------------------------') # print('goal hide last: ', env.env.env.goal, 'obs:', obs[1]) hide_paths = dict( observations=e2e_tensor_utils.stack_tensor_list(hide_observations), actions=tensor_utils.stack_tensor_list(hide_actions), rewards=tensor_utils.stack_tensor_list(hide_rewards), agent_infos=tensor_utils.stack_tensor_dict_list(hide_agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(hide_env_infos), ) if animated: time.sleep(1) ############################################## # SEEK AGENT # print('last obs: ', obs[1]) if env.spec.id[:6] != 'Blocks' or env.spec.id[:12] == 'BlocksSimple': # Avoiding randomization for blocks env env.reset() #must do reset for reacher otherwise it feaks out obs = env.env.env.reload_model(pose=last_pose, goal=last_goal) # print('goal seek: ', env.env.env.goal, 'obs:', obs) # print('Timelen max = ', env.env.unwrapped.step_limit) # print('Prev limit = ', prev_set_limit) if animated: print('rollout: SEEK') # env.env.unwrapped.frame_skip = 10 # print('rollout: SEEK') if mode is not None: if mode == 'seek_force_only': env.env.env.use_stop = False env.env.env.add_mnist_reward(True) env.env.env.use_mnist_stop_criteria(True) elif mode == 'reach_center_and_stop': env.env.env.use_stop = False env.env.env.use_distance2center_stop_criteria = True if hide_tmax is not None: env.env.unwrapped.step_limit = prev_set_limit seek_observations = [] seek_actions = [] seek_rewards = [] seek_agent_infos = [] seek_env_infos = [] # obs = env.reset() agents['seek'].reset() seek_path_length = 0 if animated: env.render() while seek_path_length < max_path_length: # if seek_path_length < 2: print('seek obs: ', obs) a, agent_info = agents['seek'].get_action(obs) if animated: # print('Seek obs: ', obs, 'action:', a) # print('action:', a) env.render() obs_next, r, d, env_info = env.step(a) seek_observations.append(obs) seek_rewards.append(r) seek_actions.append(env.action_space.flatten(a)) seek_agent_infos.append(agent_info) seek_env_infos.append(env_info) seek_path_length += 1 if d: # print('break ...') break obs = obs_next # print('step seek') # if animated: # env.env.unwrapped.frame_skip = frame_skip_prev if animated and not always_return_paths: return seek_paths = dict( observations=e2e_tensor_utils.stack_tensor_list(seek_observations), actions=tensor_utils.stack_tensor_list(seek_actions), rewards=tensor_utils.stack_tensor_list(seek_rewards), agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos), ) hide_paths['actions'] = hide_paths['actions'].astype(glob_config.dtype) seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype) hide_paths['rewards'] = hide_paths['rewards'].astype(glob_config.dtype) seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype) return {'hide': hide_paths, 'seek': seek_paths}
def rollout_w_truth(env, agent, max_path_length=np.inf, animated=False, save_gif=False, speedup=1, mean=np.zeros(2), std=np.ones(2), seed=-1, **kwargs): observations = [] actions = [] rewards = [] agent_infos = [] env_infos = [] o = env.reset(seed=seed) truth = defaultdict(list) ef = env.wrapped_env.j.rollout_ego_features(env.wrapped_env.simparams) for d in ef: for key, val in d.items(): truth[key].append(val) agent.reset() path_length = 0 if animated: env.render() if save_gif: initial_simparams0 = env.wrapped_env.copy_simparams() initial_simparams1 = env.wrapped_env.copy_simparams() while path_length < max_path_length: a, agent_info = agent.get_action(o) a = (a * std) + mean next_o, r, d, env_info = env.step(a) observations.append(env.observation_space.flatten(o)) rewards.append(r) actions.append(env.action_space.flatten(a)) agent_infos.append(agent_info) env_infos.append(env_info) path_length += 1 if d: break o = next_o if animated: env.render() timestep = 0.05 time.sleep(timestep / speedup) if save_gif: actions = [ np.clip(action, *env.wrapped_env.j.action_space_bounds(initial_simparams0)) for action in actions ] env.wrapped_env.save_gif(initial_simparams0, np.column_stack(actions), kwargs['filename'], truth_simparams=initial_simparams1) return dict( observations=tensor_utils.stack_tensor_list(observations), actions=tensor_utils.stack_tensor_list(actions), rewards=tensor_utils.stack_tensor_list(rewards), agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(env_infos), ), truth
def rollout_seek(env, agents, max_path_length=np.inf, animated=False, speedup=1, always_return_paths=False, mode=None): ############################################## # SEEK AGENT # env.env.unwrapped.reload_model(pose=last_pose) seek_observations = [] seek_actions = [] seek_rewards = [] seek_agent_infos = [] seek_env_infos = [] if mode == 'mnist_stop': env.env.env.use_stop = False env.env.env.use_mnist_reward(True) env.env.env.use_mnist_stop_criteria(True) else: env.env.env.use_stop = False obs = env.reset() agents['seek'].reset() seek_path_length = 0 # print('obs: ', obs[1]) if animated: env.render() while seek_path_length < max_path_length: a, agent_info = agents['seek'].get_action(obs) if animated: env.render() obs_next, r, d, env_info = env.step(a) seek_observations.append(obs) seek_rewards.append(r) seek_actions.append(env.action_space.flatten(a)) seek_agent_infos.append(agent_info) seek_env_infos.append(env_info) seek_path_length += 1 obs = obs_next if d: break print('SEEK Test | path_length:', seek_path_length) if animated and not always_return_paths: return seek_paths = dict( observations=e2e_tensor_utils.stack_tensor_list(seek_observations), actions=tensor_utils.stack_tensor_list(seek_actions), rewards=tensor_utils.stack_tensor_list(seek_rewards), agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos), ) seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype) seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype) return {'seek': seek_paths}
def process_samples(self, itr, paths): baselines = [] returns = [] if hasattr(self.algo.baseline, "predict_n"): all_path_baselines = self.algo.baseline.predict_n(paths) else: all_path_baselines = [ self.algo.baseline.predict(path) for path in paths ] for idx, path in enumerate(paths): path_baselines = np.append(all_path_baselines[idx], 0) deltas = path["rewards"] + \ self.algo.discount * path_baselines[1:] - \ path_baselines[:-1] path["advantages"] = special.discount_cumsum( deltas, self.algo.discount * self.algo.gae_lambda) path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount) baselines.append(path_baselines[:-1]) returns.append(path["returns"]) ev = special.explained_variance_1d(np.concatenate(baselines), np.concatenate(returns)) if not self.algo.policy.recurrent: observations = tensor_utils.concat_tensor_list( [path["observations"] for path in paths]) actions = tensor_utils.concat_tensor_list( [path["actions"] for path in paths]) rewards = tensor_utils.concat_tensor_list( [path["rewards"] for path in paths]) returns = tensor_utils.concat_tensor_list( [path["returns"] for path in paths]) advantages = tensor_utils.concat_tensor_list( [path["advantages"] for path in paths]) env_infos = tensor_utils.concat_tensor_dict_list( [path["env_infos"] for path in paths]) agent_infos = tensor_utils.concat_tensor_dict_list( [path["agent_infos"] for path in paths]) if self.algo.center_adv: advantages = util.center_advantages(advantages) if self.algo.positive_adv: advantages = util.shift_advantages_to_positive(advantages) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] ent = np.mean(self.algo.policy.distribution.entropy(agent_infos)) samples_data = dict( observations=observations, actions=actions, rewards=rewards, returns=returns, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, paths=paths, ) else: max_path_length = max([len(path["advantages"]) for path in paths]) # make all paths the same length (pad extra advantages with 0) obs = [path["observations"] for path in paths] obs = tensor_utils.pad_tensor_n(obs, max_path_length) if self.algo.center_adv: raw_adv = np.concatenate( [path["advantages"] for path in paths]) adv_mean = np.mean(raw_adv) adv_std = np.std(raw_adv) + 1e-8 adv = [(path["advantages"] - adv_mean) / adv_std for path in paths] else: adv = [path["advantages"] for path in paths] adv = np.asarray( [tensor_utils.pad_tensor(a, max_path_length) for a in adv]) actions = [path["actions"] for path in paths] actions = tensor_utils.pad_tensor_n(actions, max_path_length) rewards = [path["rewards"] for path in paths] rewards = tensor_utils.pad_tensor_n(rewards, max_path_length) returns = [path["returns"] for path in paths] returns = tensor_utils.pad_tensor_n(returns, max_path_length) agent_infos = [path["agent_infos"] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos ]) env_infos = [path["env_infos"] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list([ tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos ]) valids = [np.ones_like(path["returns"]) for path in paths] valids = tensor_utils.pad_tensor_n(valids, max_path_length) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] ent = np.sum( self.algo.policy.distribution.entropy(agent_infos) * valids) / 
np.sum(valids) samples_data = dict( observations=obs, actions=actions, advantages=adv, rewards=rewards, returns=returns, valids=valids, agent_infos=agent_infos, env_infos=env_infos, paths=paths, ) logger.log("fitting baseline...") if hasattr(self.algo.baseline, 'fit_with_samples'): self.algo.baseline.fit_with_samples(paths, samples_data) else: self.algo.baseline.fit(paths) logger.log("fitted") logger.record_tabular('Iteration', itr) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular('ExplainedVariance', ev) logger.record_tabular('NumTrajs', len(paths)) logger.record_tabular('Entropy', ent) logger.record_tabular('Perplexity', np.exp(ent)) logger.record_tabular('StdReturn', np.std(undiscounted_returns)) logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) logger.record_tabular('MinReturn', np.min(undiscounted_returns)) return samples_data
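# In the recurrent branch above, paths are right-padded to a common length and a
# `valids` mask marks the real entries; per-step statistics (such as the entropy) are
# then masked averages. A minimal sketch with a stand-in for tensor_utils.pad_tensor:
import numpy as np

def pad_tensor(x, max_len):
    """Right-pad a 1-D array with zeros up to max_len."""
    out = np.zeros(max_len, dtype=x.dtype)
    out[:len(x)] = x
    return out

path_rewards = [np.array([1.0, 1.0]), np.array([2.0, 2.0, 2.0])]
max_len = max(len(r) for r in path_rewards)
rewards = np.stack([pad_tensor(r, max_len) for r in path_rewards])              # (2, 3)
valids = np.stack([pad_tensor(np.ones_like(r), max_len) for r in path_rewards])
masked_mean_reward = np.sum(rewards * valids) / np.sum(valids)                  # 1.6; padding ignored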
def rollout_debug(env, agents, max_path_length=np.inf, animated=False, speedup=1, always_return_paths=False): ############################################## # SEEK AGENT # env.env.unwrapped.reload_model(pose=last_pose) animated = True always_return_paths = True seek_observations = [] seek_actions = [] seek_rewards = [] seek_agent_infos = [] seek_env_infos = [] env.env.env.use_stop = False env.env.env.use_mnist_reward(True) env.env.env.use_mnist_stop_criteria(True) obs = env.reset() agents['seek'].reset() seek_path_length = 0 if animated: env.render() while seek_path_length < max_path_length: a, agent_info = agents['seek'].get_action(obs) if animated: env.render() obs_next, r, d, env_info = env.step(a) seek_observations.append(obs) seek_rewards.append(r) seek_actions.append(env.action_space.flatten(a)) seek_agent_infos.append(agent_info) seek_env_infos.append(env_info) seek_path_length += 1 obs = obs_next print('Distance = ', env_info['act_min_dist'], ' Max_dist = ', env_info['act_dist_max']) time.sleep(0.5) if d: break # print('step seek') if animated and not always_return_paths: return seek_paths = dict( observations=e2e_tensor_utils.stack_tensor_list(seek_observations), actions=tensor_utils.stack_tensor_list(seek_actions), rewards=tensor_utils.stack_tensor_list(seek_rewards), agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos), ) seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype) seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype) return {'seek': seek_paths}
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, controller=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        # import ipdb; ipdb.set_trace()
        # To test if the weights are correct, we need to use our local
        # get_action function
        # controller = control.StraightController()
        a, agent_info = controller.get_action(o.T)
        # a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if isinstance(env.observation_space, list):
            n = len(env.shadow_envs)
            observations.append([
                env.shadow_envs[i].observation_space.flatten_n(o[i]) for i in range(n)
            ])
            rewards.append(r)
            actions.append([
                env.shadow_envs[i].action_space.flatten_n(a[i]) for i in range(n)
            ])
        else:
            observations.append(env.observation_space.flatten(o))
            rewards.append(r)
            actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=True)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
def rollout_brownian(env, agents, max_path_length=np.inf, animated=False, speedup=1,
                     always_return_paths=False, mode=None, hide_tmax=None):
    ##############################################
    ## HIDE AGENT
    env.reset()
    # Randomly sample one start pose from agents['hide'].starts.
    start_pose, start_pose_id = agents['hide'].sample_one_start()
    start_pose = np.array(start_pose)
    # Sample one goal: from agents['hide'].starts with probability p,
    # or from agents['hide'].starts_old with probability 1 - p.
    goal, goal_id = agents['hide'].sample_one_goal()
    obs = env.env.env.reload_model(pose=start_pose, goal=goal)
    # print("++++++++++++++++++++++++++++++++++++")
    # print('start_pose:', start_pose[0][0:2], ' goal:', goal[0][0:2])
    # print('start_pose:', np.array(obs[0][0:2]) * 2.4, ' goal:', np.array(obs[0][-2:]) * 2.4)

    ##############################################
    ## SEEK AGENT
    # print('rollout: Student')
    if animated:
        env.render()
        # env.env.unwrapped.frame_skip = 10
    if mode is not None:
        if mode == 'seek_force_only':
            env.env.env.use_stop = False
            env.env.env.add_mnist_reward(True)
            env.env.env.use_mnist_stop_criteria(True)
        elif mode == 'reach_center_and_stop':
            env.env.env.use_stop = False
            env.env.env.use_distance2center_stop_criteria = True

    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []

    agents['seek'].reset()
    seek_path_length = 0
    if animated:
        env.render()
    while seek_path_length < max_path_length:
        # if seek_path_length < 2: print('seek obs: ', obs)
        a, agent_info = agents['seek'].get_action(obs)
        # print('action:', a)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1
        if d:
            print('SEEK| path_length:', len(seek_rewards))
            break
        obs = obs_next
        # print('step seek')

    ## Record whether the goal was reached within the step limit.
    step_limit = env.env.unwrapped.step_limit
    goal_reached = float(seek_path_length < step_limit)
    # Store the reward for the sampled start (reverse mode) or goal.
    if agents['hide'].reverse_mode:
        agents['hide'].rewards[start_pose_id].append(goal_reached)
    else:
        agents['hide'].rewards[goal_id].append(goal_reached)

    if animated and not always_return_paths:
        return

    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )
    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)
    return {'seek': seek_paths}
def ed_simpy_dec_rollout(env, agents, max_path_length=np.inf, animated=False, speedup=1):
    """Decentralized rollout."""
    if agents.recurrent:
        assert isinstance(agents, GSMDPRecurrentPolicy), 'Recurrent policy is not a GSMDP class'
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    offset_t_sojourn = [[] for _ in range(n_agents)]
    agents.reset(dones=[True for _ in range(n_agents)])

    agent_policies = [None] * n_agents
    for i in range(n_agents):
        # Bind the current value of i via a default argument; a bare closure over i
        # would be evaluated lazily and every policy would end up using the final index.
        agent_policies[i] = lambda obs, i=i: get_actions_wrapper(agents, i, n_agents, obs)
        # if (not agents.recurrent):
        #     agent_policies[i] = lambda obs: agents.get_actions([obs])
        # else:
        #     agent_policies[i] = lambda obs: agents.get_actions(obs_to_ith_loc(obs, i, n_agents))

    observations, actions, rewards, agent_infos, env_infos, offset_t_sojourn = \
        env.wrapped_env.reset_and_sim(agent_policies)

    # remove empty agent trajectories
    observations = [o for o in observations if len(o) > 0]
    actions = [a for a in actions if len(a) > 0]
    rewards = [r for r in rewards if len(r) > 0]
    agent_infos = [i for i in agent_infos if len(i) > 0]
    env_infos = [e for e in env_infos if len(e) > 0]
    offset_t_sojourn = [o for o in offset_t_sojourn if len(o) > 0]

    if any(map(lambda x: x < n_agents,
               [len(observations), len(actions), len(rewards), len(agent_infos), len(env_infos)])):
        print('\nWARNING: \n')
        print('n_agents: ', n_agents)
        print('len(observations): ', len(observations))
        print('len(actions): ', len(actions))
        print('len(rewards): ', len(rewards))
        print('len(agent_infos): ', len(agent_infos))
        print('len(env_infos): ', len(env_infos))

    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
            offset_t_sojourn=tensor_utils.stack_tensor_list(offset_t_sojourn[i]),
        ) for i in range(len(observations))
    ]
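# The per-agent closure above binds i through a default argument. Without it, Python's
# late binding would make every lambda see the final loop value of i. A minimal
# illustration of the difference (not from the source):
broken = [lambda: i for i in range(3)]
fixed = [lambda i=i: i for i in range(3)]
# [f() for f in broken] -> [2, 2, 2]
# [f() for f in fixed]  -> [0, 1, 2]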
def obtain_samples(self, itr, reset_args=None, task_idxs=None, return_dict=False, log_prefix=''): # reset_args: arguments to pass to the environments to reset # return_dict: whether or not to return a dictionary or list form of paths logger.log("Obtaining samples for iteration %d..." % itr) #paths = [] paths = {} for i in range(self.vec_env.num_envs): paths[i] = [] # if the reset args are not list/numpy, we set the same args for each env if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray): reset_args = [reset_args] * self.vec_env.num_envs n_samples = 0 curr_noises = [ np.random.normal(0, 1, size=(self.latent_dim, )) for _ in range(self.vec_env.num_envs) ] #curr_noises = [np.ones(size = (self.latent_dim)) for _ in range(self.vec_env.num_envs)] obses = self.vec_env.reset(reset_args) dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.algo.batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.algo.batch_size: t = time.time() #print(obses.shape,task_idxs.shape,curr_noises[0].shape) policy.reset(dones) #TODO: What the hell does this do? actions, agent_infos = policy.get_actions(obses, task_idxs, curr_noises) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step( actions, reset_args) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done, noise in zip( itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones, curr_noises): if running_paths[idx] is None: running_paths[idx] = dict(observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], noises=[]) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) running_paths[idx]["noises"].append(noise) if done: paths[idx].append( dict( observations=self.env_spec.observation_space. flatten_n(running_paths[idx]["observations"]), noises=self.flatten_n( running_paths[idx]["noises"]), actions=self.env_spec.action_space.flatten_n( running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list( running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None curr_noises[idx] = np.random.normal( 0, 1, size=(self.latent_dim, )) #curr_noises[idx] = np.ones(size=(self.latent_dim)) process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() logger.record_tabular(log_prefix + "PolicyExecTime", policy_time) logger.record_tabular(log_prefix + "EnvExecTime", env_time) logger.record_tabular(log_prefix + "ProcessExecTime", process_time) if not return_dict: flatten_list = lambda l: [ item for sublist in l for item in sublist ] paths = flatten_list(paths.values()) #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()]) return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''): # reset_args: arguments to pass to the environments to reset # return_dict: whether or not to return a dictionary or list form of paths logger.log("Obtaining samples for iteration %d..." % itr) #paths = [] paths = {} for i in range(self.vec_env.num_envs): paths[i] = [] # if the reset args are not list/numpy, we set the same args for each env if reset_args is not None and (type(reset_args) != list and type(reset_args)!=np.ndarray): reset_args = [reset_args]*self.vec_env.num_envs n_samples = 0 obses = self.vec_env.reset(reset_args) dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(self.algo.batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < self.algo.batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]["observations"].append(observation) running_paths[idx]["actions"].append(action) running_paths[idx]["rewards"].append(reward) running_paths[idx]["env_infos"].append(env_info) running_paths[idx]["agent_infos"].append(agent_info) if done: paths[idx].append(dict( observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]), actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]), rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]), env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), )) n_samples += len(running_paths[idx]["rewards"]) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() logger.record_tabular(log_prefix+"PolicyExecTime", policy_time) logger.record_tabular(log_prefix+"EnvExecTime", env_time) logger.record_tabular(log_prefix+"ProcessExecTime", process_time) if not return_dict: flatten_list = lambda l: [item for sublist in l for item in sublist] paths = flatten_list(paths.values()) #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()]) return paths
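# Hedged end-to-end sketch of how these vectorized samplers are typically driven.
# `sampler`, `algo`, and `n_itr` are hypothetical names, not from the source:
def training_loop(sampler, algo, n_itr):
    for itr in range(n_itr):
        paths = sampler.obtain_samples(itr)                 # list of per-trajectory dicts
        samples_data = sampler.process_samples(itr, paths)  # stacked (or padded) tensors
        # samples_data typically carries at least: observations, actions, rewards,
        # returns, advantages, agent_infos, env_infos, paths
        algo.optimize_policy(itr, samples_data)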
def process_samples(self, itr, paths): baselines = [] returns = [] for path in paths: path_baselines = np.append(self.algo.baseline.predict(path), 0) deltas = path["rewards"] + \ self.algo.discount * path_baselines[1:] - \ path_baselines[:-1] path["advantages"] = special.discount_cumsum( deltas, self.algo.discount * self.algo.gae_lambda) path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount) baselines.append(path_baselines[:-1]) returns.append(path["returns"]) if not self.algo.policy.recurrent: observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths]) actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths]) rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths]) advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths]) env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths]) agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths]) if self.algo.center_adv: advantages = util.center_advantages(advantages) if self.algo.positive_adv: advantages = util.shift_advantages_to_positive(advantages) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] ent = np.mean(self.algo.policy.distribution.entropy(agent_infos)) ev = special.explained_variance_1d( np.concatenate(baselines), np.concatenate(returns) ) samples_data = dict( observations=observations, actions=actions, rewards=rewards, advantages=advantages, env_infos=env_infos, agent_infos=agent_infos, paths=paths, ) else: max_path_length = max([len(path["advantages"]) for path in paths]) # make all paths the same length (pad extra advantages with 0) obs = [path["observations"] for path in paths] obs = np.array([tensor_utils.pad_tensor(ob, max_path_length) for ob in obs]) if self.algo.center_adv: raw_adv = np.concatenate([path["advantages"] for path in paths]) adv_mean = np.mean(raw_adv) adv_std = np.std(raw_adv) + 1e-8 adv = [(path["advantages"] - adv_mean) / adv_std for path in paths] else: adv = [path["advantages"] for path in paths] adv = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in adv]) actions = [path["actions"] for path in paths] actions = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in actions]) rewards = [path["rewards"] for path in paths] rewards = np.array([tensor_utils.pad_tensor(r, max_path_length) for r in rewards]) agent_infos = [path["agent_infos"] for path in paths] agent_infos = tensor_utils.stack_tensor_dict_list( [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos] ) env_infos = [path["env_infos"] for path in paths] env_infos = tensor_utils.stack_tensor_dict_list( [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos] ) valids = [np.ones_like(path["returns"]) for path in paths] valids = np.array([tensor_utils.pad_tensor(v, max_path_length) for v in valids]) average_discounted_return = \ np.mean([path["returns"][0] for path in paths]) undiscounted_returns = [sum(path["rewards"]) for path in paths] ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids) ev = special.explained_variance_1d( np.concatenate(baselines), np.concatenate(returns) ) samples_data = dict( observations=obs, actions=actions, advantages=adv, rewards=rewards, valids=valids, agent_infos=agent_infos, env_infos=env_infos, paths=paths, ) logger.log("fitting baseline...") 
self.algo.baseline.fit(paths) logger.log("fitted") logger.record_tabular('Iteration', itr) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) logger.record_tabular('ExplainedVariance', ev) logger.record_tabular('NumTrajs', len(paths)) logger.record_tabular('Entropy', ent) logger.record_tabular('Perplexity', np.exp(ent)) logger.record_tabular('StdReturn', np.std(undiscounted_returns)) logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) logger.record_tabular('MinReturn', np.min(undiscounted_returns)) return samples_data
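# Explained variance, logged above, measures how much of the return variance the fitted
# baseline accounts for: ev = 1 - Var(returns - baselines) / Var(returns). A small
# stand-in for special.explained_variance_1d:
import numpy as np

def explained_variance_1d(ypred, y):
    """~1 when the baseline fits the empirical returns well, <= 0 when it is useless."""
    vary = np.var(y)
    if vary == 0:
        return 0.0
    return 1.0 - np.var(y - ypred) / vary

# explained_variance_1d(np.array([1.1, 1.9, 3.2, 3.8]), np.array([1.0, 2.0, 3.0, 4.0])) -> 0.98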
def rollout_policy(agent, env, max_path_length=200, reward_extractor=None, speedup=1, get_image_observations=False, num_frames=4, concat_timesteps=True, animated=False): """ Mostly taken from https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164 Generate a rollout for a given policy """ observations = [] im_observations = [] actions = [] rewards = [] agent_infos = [] env_infos = [] o = env.reset() path_length = 0 while path_length <= max_path_length: a, agent_info = agent.get_action(o) next_o, r, d, env_info = env.step(a) observations.append(env.observation_space.flatten(o)) actions.append(env.action_space.flatten(a)) agent_infos.append(agent_info) env_infos.append(env_info) path_length += 1 o = next_o if get_image_observations: if not animated: pixel_array = env.render(mode="rgb_array") else: pixel_array = env.render() if pixel_array is None and not animated: # Not convinced that behaviour works for all environments, so until # such a time as I'm convinced of this, drop into a debug shell print( "Problem! Couldn't get pixels! Dropping into debug shell.") import pdb pdb.set_trace() im_observations.append(pixel_array) if d: rewards.append(r) break else: rewards.append(r) # if animated: # env.render(close=True) im_observations = tensor_utils.stack_tensor_list(im_observations) observations = tensor_utils.stack_tensor_list(observations) if reward_extractor is not None: #TODO: remove/replace this if concat_timesteps: true_rewards = tensor_utils.stack_tensor_list(rewards) obs_pls_three = np.zeros( (observations.shape[0], num_frames, observations.shape[1])) # import pdb; pdb.set_trace() for iter_step in range(0, obs_pls_three.shape[0]): for i in range(num_frames): idx_plus_three = min(iter_step + num_frames, obs_pls_three.shape[0] - 1) obs_pls_three[iter_step, i, :] = observations[idx_plus_three, :] rewards = reward_extractor.get_reward(obs_pls_three) else: true_rewards = tensor_utils.stack_tensor_list(rewards) rewards = reward_extractor.get_reward(observations) else: rewards = tensor_utils.stack_tensor_list(rewards) true_rewards = rewards return dict( observations=observations, im_observations=im_observations, actions=tensor_utils.stack_tensor_list(actions), rewards=rewards, true_rewards=true_rewards, agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), env_infos=tensor_utils.stack_tensor_dict_list(env_infos), )
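# Note on the frame-stacking block above: every slot i of obs_pls_three[iter_step] is
# filled with the single observation at min(iter_step + num_frames, T - 1). If the intent
# is a sliding window over the most recent num_frames observations, a more conventional
# (hypothetical) variant would look like this:
import numpy as np

def stack_recent_frames(observations, num_frames):
    """observations: (T, obs_dim) -> (T, num_frames, obs_dim); early frames repeat obs[0]."""
    T, obs_dim = observations.shape
    stacked = np.zeros((T, num_frames, obs_dim))
    for t in range(T):
        for i in range(num_frames):
            stacked[t, i] = observations[max(t - num_frames + 1 + i, 0)]
    return stacked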
def sample_paths(N, policy, baseline, env_mode='train', T=1e6, gamma=1, normalized_env=False, env=None):
    # Directly specifying env works only when sampling in series
    # set random seed (needed for multiprocessing)
    np.random.seed()
    if env is None:
        env = get_environment(env_mode)
    T = min(T, env.horizon)
    T = max(1, T)
    # sometimes, env is not initialized correctly in multiprocessing
    # this is just a sanity check and step size should essentially be zero.
    print("####### Worker started #######")
    paths = []
    for ep in range(N):
        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []
        qpos = []
        qvel = []
        o = env.reset()
        if normalized_env:
            qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
            qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
        else:
            qpos.append(env.env.model.data.qpos.reshape(-1))
            qvel.append(env.env.model.data.qvel.reshape(-1))
        done = False
        t = 0
        while t < T and not done:
            a, agent_info = policy.get_action(o)
            next_o, r, done, env_info = env.step(a)
            observations.append(env.observation_space.flatten(o))
            actions.append(env.action_space.flatten(a))
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            if normalized_env:
                qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
                qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
            else:
                qpos.append(env.env.model.data.qpos.reshape(-1))
                qvel.append(env.env.model.data.qvel.reshape(-1))
            o = next_o
            t += 1

        # make a path dictionary
        # Also store the path belief and env data used in the trajectory
        try:
            path_belief = env.env.belief
        except Exception as e:
            path_belief = str(e)
        # path_model = env.env
        path = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            qpos=tensor_utils.stack_tensor_list(qpos),
            qvel=tensor_utils.stack_tensor_list(qvel),
            # path_belief=path_belief,
            # path_model=path_model,
        )
        # TODO: Storing the path model is too space inefficient. Need to find an alternative.

        # compute returns using the path
        path_baseline = baseline.predict(path)
        advantages = []
        returns = []
        return_so_far = 0
        for t in range(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + gamma * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)
        # advantages and returns are stored backward in time
        advantages = np.array(advantages[::-1])
        returns = np.array(returns[::-1])
        # normalize advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        path["advantages"] = advantages
        path["returns"] = returns
        paths.append(path)

    # print("Env body_mass : ", env.env.model.body_mass[1])
    print("====== Worker finished ======")
    return paths
def process_samples(self, itr, paths):
    if self.normalize_reward:
        # Update reward mean/std Q.
        rewards = []
        for i in range(len(paths)):
            rewards.append(paths[i]['rewards'])
        rewards_flat = np.hstack(rewards)
        self._reward_mean.append(np.mean(rewards_flat))
        self._reward_std.append(np.std(rewards_flat))

        # Normalize rewards.
        reward_mean = np.mean(np.asarray(self._reward_mean))
        reward_std = np.mean(np.asarray(self._reward_std))
        for i in range(len(paths)):
            paths[i]['rewards'] = (
                paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8)

    if itr > 0:
        kls = []
        for i in range(len(paths)):
            kls.append(paths[i]['KL'])
        kls_flat = np.hstack(kls)

        logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
        logger.record_tabular('Expl_StdKL', np.std(kls_flat))
        logger.record_tabular('Expl_MinKL', np.min(kls_flat))
        logger.record_tabular('Expl_MaxKL', np.max(kls_flat))

        # Perform normalization of the intrinsic rewards.
        if self.use_kl_ratio:
            if self.use_kl_ratio_q:
                # Update KL Q.
                self.kl_previous.append(np.median(np.hstack(kls)))
                previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                for i in range(len(kls)):
                    kls[i] = kls[i] / previous_mean_kl

        # Add KL as intrinsic reward to external reward.
        for i in range(len(paths)):
            paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]

        # Discount eta.
        self.eta *= self.eta_discount

    else:
        logger.record_tabular('Expl_MeanKL', 0.)
        logger.record_tabular('Expl_StdKL', 0.)
        logger.record_tabular('Expl_MinKL', 0.)
        logger.record_tabular('Expl_MaxKL', 0.)

    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.baseline.predict(path), 0)
        deltas = path["rewards"] + \
            self.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path["returns"] = special.discount_cumsum(
            path["rewards_orig"], self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if not self.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.center_adv:
            advantages = util.center_advantages(advantages)

        if self.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [
            sum(path["rewards_orig"]) for path in paths]

        ent = np.mean(self.policy.distribution.entropy(agent_infos))

        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # Make all paths the same length (pad extra advantages with 0).
        obs = [path["observations"] for path in paths]
        obs = np.array(
            [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

        if self.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [
                (path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

        rewards = [path["rewards"] for path in paths]
        rewards = np.array(
            [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos]
        )

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos]
        )

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array(
            [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.policy.distribution.entropy(agent_infos))

        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    self.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
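process_samples leans on special.discount_cumsum for both the GAE advantages and the discounted returns. A minimal stand-in with the same behavior is sketched below, assuming the rllab convention of discounting forward in time from each index; it is offered only to make the computation concrete.

# Minimal stand-in for special.discount_cumsum (rllab convention):
# out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # Run a first-order IIR filter over the reversed signal, then reverse back.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

# e.g. discount_cumsum(np.array([1., 1., 1.]), 0.9) -> [2.71, 1.9, 1.]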