def obtain_samples(self, itr):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
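# The loop above leans on tensor_utils to convert between a "dict of stacked
# arrays" and a "list of per-env dicts". Below is a minimal self-contained
# sketch of the assumed semantics (not the rllab implementation itself, which
# also handles nested dicts); the *_sketch names are hypothetical:
import numpy as np

def split_tensor_dict_list_sketch(tensor_dict):
    # {'mean': (N, d) array} -> [{'mean': row_0}, ..., {'mean': row_{N-1}}]
    if not tensor_dict:
        return None
    n = len(next(iter(tensor_dict.values())))
    return [{k: v[i] for k, v in tensor_dict.items()} for i in range(n)]

def stack_tensor_dict_list_sketch(dict_list):
    # inverse direction: [{'mean': row}, ...] -> {'mean': (N, d) array}
    return {k: np.asarray([d[k] for d in dict_list]) for k in dict_list[0]}

infos = {'mean': np.zeros((3, 2)), 'log_std': np.ones((3, 2))}
per_env = split_tensor_dict_list_sketch(infos)
assert len(per_env) == 3 and per_env[0]['mean'].shape == (2,)
restacked = stack_tensor_dict_list_sketch(per_env)
assert restacked['mean'].shape == (3, 2)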
def obtain_samples(self, itr, reset_args=None, task_idxs=None,
                   return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return paths as a dict (keyed by env index) or a flat list
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # if reset_args is not a list/ndarray, use the same args for every env
    if reset_args is not None and (type(reset_args) != list
                                   and type(reset_args) != np.ndarray):
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    # one latent noise vector per env, held fixed for the length of an episode
    curr_noises = [
        np.random.normal(0, 1, size=(self.latent_dim, ))
        for _ in range(self.vec_env.num_envs)
    ]
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)  # TODO: clarify what resetting the policy does here
        actions, agent_infos = policy.get_actions(obses, task_idxs, curr_noises)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done, noise in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones, curr_noises):
            if running_paths[idx] is None:
                running_paths[idx] = dict(observations=[],
                                          actions=[],
                                          rewards=[],
                                          env_infos=[],
                                          agent_infos=[],
                                          noises=[])
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            running_paths[idx]["noises"].append(noise)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    noises=self.flatten_n(running_paths[idx]["noises"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
                # resample the latent noise for this env's next episode
                curr_noises[idx] = np.random.normal(0, 1, size=(self.latent_dim, ))
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
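# Each env above keeps a fixed latent noise vector for the duration of one
# episode and resamples it when the episode ends. A tiny runnable sketch of
# that per-episode resampling; latent_dim, num_envs, and the done pattern are
# made up for illustration:
import numpy as np

latent_dim, num_envs = 3, 2
curr_noises = [np.random.normal(0, 1, size=(latent_dim,)) for _ in range(num_envs)]
dones = [True, False]
for idx, done in enumerate(dones):
    if done:
        # only finished envs get a fresh latent sample
        curr_noises[idx] = np.random.normal(0, 1, size=(latent_dim,))
assert all(n.shape == (latent_dim,) for n in curr_noises)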
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                   extra_input=None, extra_input_dim=None, preupdate=False,
                   save_img_obs=False, numTrajs_perTask=None):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return paths as a dict (keyed by env index) or a flat list
    # logger.log("Obtaining samples for iteration %d..." % itr)
    # expand_obs appends an exploration code (one-hot, Gaussian, or zeros) to
    # each observation, depending on extra_input and whether this is a
    # pre-update rollout.
    if extra_input is not None:
        if extra_input == "onehot_exploration":
            if preupdate:
                print("debug, using extra_input onehot")
                def expand_obs(obses, path_nums):
                    extra = [special.to_onehot(path_num % extra_input_dim, extra_input_dim)
                             for path_num in path_nums]
                    return np.concatenate((obses, extra), axis=1)
            else:
                print("debug, using extra_input zeros")
                def expand_obs(obses, path_nums):
                    extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                    return np.concatenate((obses, extra), axis=1)
        elif extra_input == "onehot_hacked":
            if preupdate:
                print("debug, using extra_input onehot")
                def expand_obs(obses, path_nums):
                    extra = [special.to_onehot(3, extra_input_dim) for path_num in path_nums]
                    return np.concatenate((obses, extra), axis=1)
            else:
                print("debug, using extra_input zeros")
                def expand_obs(obses, path_nums):
                    extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                    return np.concatenate((obses, extra), axis=1)
        elif extra_input == "gaussian_exploration":
            if preupdate:
                print("debug, using extra_input gaussian")
                def expand_obs(obses, path_nums):
                    extra = [np.random.normal(0., 1., size=(extra_input_dim,))
                             for path_num in path_nums]
                    return np.concatenate((obses, extra), axis=1)
            else:
                print("debug, using extra_input zeros")
                def expand_obs(obses, path_nums):
                    extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                    return np.concatenate((obses, extra), axis=1)
        else:
            def expand_obs(obses, path_nums):
                return obses
    else:
        def expand_obs(obses, path_nums):
            return obses
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # if reset_args is not a list/ndarray, use the same args for every env
    if reset_args is not None and (type(reset_args) != list
                                   and type(reset_args) != np.ndarray):
        assert False, "debug, should we be using this?"
        print("WARNING, will vectorize reset_args")
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    # path_nums[i] tracks which rollout env instance i is currently on
    path_nums = [0] * self.vec_env.num_envs
    obses = self.vec_env.reset(reset_args)
    obses = expand_obs(obses, path_nums)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    # pbar = ProgBarCounter(self.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        # TODO: instead of receiving obs from the env, receive it from the policy as a feed_dict
        next_obses = expand_obs(next_obses, path_nums)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                # TODO: also add the incomplete running_paths to paths
                running_paths[idx] = None
                path_nums[idx] += 1
        process_time += time.time() - t
        # pbar.inc(len(obses))
        obses = next_obses
    # pbar.stop()
    # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                   preupdate=False, save_img_obs=False, contexts=None):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return paths as a dict (keyed by env index) or a flat list
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # if reset_args is not a list/ndarray, use the same args for every env
    if reset_args is not None and (type(reset_args) != list
                                   and type(reset_args) != np.ndarray):
        assert False, "debug, should we be using this?"
        print("WARNING, will vectorize reset_args")
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    # path_nums[i] tracks which rollout env instance i is currently on
    path_nums = [0] * self.vec_env.num_envs
    obses = self.vec_env.reset(reset_args)
    # when contexts are given, condition the post-update policy by appending
    # them to the observations
    if contexts is not None:
        obses = np.concatenate([obses, contexts], axis=1)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    if contexts is not None:
        policy = self.algo.post_policy
    else:
        policy = self.algo.policy
    import time
    while n_samples < self.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        # TODO: instead of receiving obs from the env, receive it from the policy as a feed_dict
        if contexts is not None:
            next_obses = np.concatenate([next_obses, contexts], axis=1)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                # TODO: also add the incomplete running_paths to paths
                running_paths[idx] = None
                path_nums[idx] += 1
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    # adding the incomplete paths (currently disabled):
    # for idx in range(self.vec_env.num_envs):
    #     if running_paths[idx] is not None:
    #         paths[idx].append(dict(
    #             observations=self.env_spec.observation_space.flatten_n(
    #                 running_paths[idx]["observations"]),
    #             actions=self.env_spec.action_space.flatten_n(
    #                 running_paths[idx]["actions"]),
    #             rewards=tensor_utils.stack_tensor_list(
    #                 running_paths[idx]["rewards"]),
    #             env_infos=tensor_utils.stack_tensor_dict_list(
    #                 running_paths[idx]["env_infos"]),
    #             agent_infos=tensor_utils.stack_tensor_dict_list(
    #                 running_paths[idx]["agent_infos"]),
    #         ))
    pbar.stop()
    # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return paths as a dict (keyed by env index) or a flat list
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # if reset_args is not a list/ndarray, use the same args for every env
    if reset_args is not None and (type(reset_args) != list
                                   and type(reset_args) != np.ndarray):
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
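# The return_dict flag above switches between {env_idx: [path, ...]} and a
# flat [path, ...]. A tiny runnable sketch of that flattening, with
# hypothetical string placeholders standing in for path dicts:
paths_by_env = {0: ['path_a', 'path_b'], 1: ['path_c']}
flatten_list = lambda l: [item for sublist in l for item in sublist]
flat = flatten_list(paths_by_env.values())
assert flat == ['path_a', 'path_b', 'path_c']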
def obtain_samples(self, itr):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        self.algo.policy.reset(dones)
        actions, agent_infos = self.algo.policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
def ed_dec_rollout(env, agents, max_path_length=np.inf, animated=False, speedup=1):
    """Decentralized rollout"""
    if agents.recurrent:
        assert isinstance(agents, GSMDPRecurrentPolicy), \
            'Recurrent policy is not a GSMDP class'
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    offset_t_sojourn = [[] for _ in range(n_agents)]
    olist = env.reset()
    assert len(olist) == n_agents, "{} != {}".format(len(olist), n_agents)
    agents.reset(dones=[True for _ in range(n_agents)])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        # agents whose observation is not all-None are due to act
        agents_to_act = [i for i, j in enumerate(olist) if j != [None] * len(j)]
        if not agents.recurrent:
            alist, agent_info_list = agents.get_actions(
                [olist[i] for i in agents_to_act])
            agent_info_list = tensor_utils.split_tensor_dict_list(agent_info_list)
        else:
            alist, agent_info_list = agents.get_actions(olist)
            alist = [a for a in alist if a is not None]
            agent_info_list = tensor_utils.split_tensor_dict_list(agent_info_list)
            agent_info_list = [
                ainfo for i, ainfo in enumerate(agent_info_list)
                if i in agents_to_act
            ]
        next_actions = [None] * n_agents  # filled in in the loop below
        # for each acting agent:
        for ind, o in enumerate([olist[j] for j in agents_to_act]):
            # ind indexes the acting (non-None) agents; i indexes all agents
            i = agents_to_act[ind]
            observations[i].append(env.observation_space.flatten(o))
            actions[i].append(env.action_space.flatten(alist[ind]))
            next_actions[i] = alist[ind]
            if agent_info_list is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(agent_info_list[ind])
        # take the next actions
        next_olist, rlist, d, env_info = env.step(np.asarray(next_actions))
        # update sojourn time (associate the timestamp from next_olist with r,
        # not with the current observation)
        for i, r in enumerate(rlist):
            if r is None:
                continue  # skip the reward if this agent has not acted yet
            if len(observations[i]) > 0:
                rewards[i].append(r)
                offset_t_sojourn[i].append(
                    env.observation_space.flatten(next_olist[i])[-1])
                env_infos[i].append(env_info)
        path_length = max([len(o) for o in observations])
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if path_length == max_path_length:
        # some trajectories may be one observation longer than their rewards
        for ind, o in enumerate(observations):
            r = rewards[ind]
            if len(o) > len(r):
                assert len(o) <= len(r) + 1, \
                    'len(o) %d, len(r) %d' % (len(o), len(r))
                # drop the last observation, action, and agent_info
                del observations[ind][-1]
                del actions[ind][-1]
                del agent_infos[ind][-1]
    if animated:
        env.render()
    # remove empty agent trajectories
    observations = [o for o in observations if len(o) > 0]
    actions = [a for a in actions if len(a) > 0]
    rewards = [r for r in rewards if len(r) > 0]
    agent_infos = [i for i in agent_infos if len(i) > 0]
    env_infos = [e for e in env_infos if len(e) > 0]
    offset_t_sojourn = [o for o in offset_t_sojourn if len(o) > 0]
    if any(map(lambda x: x < n_agents,
               [len(observations), len(actions), len(rewards),
                len(agent_infos), len(env_infos)])):
        print('\nWARNING: \n')
        print('n_agents: ', n_agents)
        print('len(observations): ', len(observations))
        print('len(actions): ', len(actions))
        print('len(rewards): ', len(rewards))
        print('len(agent_infos): ', len(agent_infos))
        print('len(env_infos): ', len(env_infos))
    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
            offset_t_sojourn=tensor_utils.stack_tensor_list(offset_t_sojourn[i]),
        ) for i in range(len(observations))
    ]
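# ed_dec_rollout returns one path dict per non-empty agent trajectory. A
# hedged sketch of downstream use -- computing a discounted return per agent.
# The field layout matches the dicts built above, but the paths and numbers
# here are fabricated stand-ins:
import numpy as np

def discounted_return(rewards, gamma=0.99):
    # sum_t gamma^t * r_t
    return float(np.sum(rewards * gamma ** np.arange(len(rewards))))

fake_paths = [
    {'rewards': np.array([1.0, 0.0, 1.0])},
    {'rewards': np.array([0.5, 0.5])},
]
returns = [discounted_return(p['rewards']) for p in fake_paths]
assert len(returns) == len(fake_paths)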
def obtain_agent_info_offpolicy(self, itr, expert_trajs_dir=None,
                                offpol_trajs=None, treat_as_expert_traj=False,
                                log_prefix=''):
    assert expert_trajs_dir is None, "deprecated"
    start = time.time()
    if offpol_trajs is None:
        assert expert_trajs_dir is not None, \
            "neither offpol_trajs nor expert_trajs_dir is provided"
        if self.use_pooled_goals:
            for t, taskidx in enumerate(self.goals_idxs_for_itr_dict[itr]):
                assert np.array_equal(self.goals_pool[taskidx],
                                      self.goals_to_use_dict[itr][t]), "fail"
            offpol_trajs = {
                t: joblib.load(expert_trajs_dir + str(taskidx) +
                               self.expert_trajs_suffix + ".pkl")
                for t, taskidx in enumerate(self.goals_idxs_for_itr_dict[itr])
            }
        else:
            offpol_trajs = joblib.load(expert_trajs_dir + str(itr) +
                                       self.expert_trajs_suffix + ".pkl")
            offpol_trajs = {
                tasknum: offpol_trajs[tasknum]
                for tasknum in range(self.meta_batch_size)
            }
    # some initial rearrangement
    tasknums = offpol_trajs.keys()  # range(self.meta_batch_size), as seen above
    for t in tasknums:
        for path in offpol_trajs[t]:
            if 'expert_actions' not in path.keys() and treat_as_expert_traj:
                # copying expert actions should happen only once per meta-iteration
                path['expert_actions'] = np.clip(
                    deepcopy(path['actions']), -1.0, 1.0)
            if treat_as_expert_traj:
                path['agent_infos'] = dict(
                    mean=[[0.0] * len(path['actions'][0])] * len(path['actions']),
                    log_std=[[0.0] * len(path['actions'][0])] * len(path['actions']))
            else:
                path['agent_infos'] = [None] * len(path['rewards'])
    if not treat_as_expert_traj:
        print("debug12, running offpol on own previous samples")
        # walk all tasks' paths in lockstep, one timestep per policy query,
        # filling in fresh agent_infos; an index of -1 signals a finished task
        running_path_idx = {t: 0 for t in tasknums}
        running_intra_path_idx = {t: 0 for t in tasknums}
        while max([running_path_idx[t] for t in tasknums]) > -0.5:
            # cycle until all indices are -1
            observations = [
                offpol_trajs[t][running_path_idx[t]]['observations'][running_intra_path_idx[t]]
                for t in tasknums
            ]
            actions, agent_infos = self.policy.get_actions(observations)
            agent_infos = split_tensor_dict_list(agent_infos)
            for t, action, agent_info in zip(itertools.count(), actions, agent_infos):
                offpol_trajs[t][running_path_idx[t]]['agent_infos'][
                    running_intra_path_idx[t]] = agent_info
                # index juggling:
                if -0.5 < running_intra_path_idx[t] < \
                        len(offpol_trajs[t][running_path_idx[t]]['rewards']) - 1:
                    # haven't reached the end of the current path yet
                    running_intra_path_idx[t] += 1
                else:
                    if -0.5 < running_path_idx[t] < len(offpol_trajs[t]) - 1:
                        # wrap up this path's agent_infos and move to the next path
                        offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \
                            stack_tensor_dict_list(
                                offpol_trajs[t][running_path_idx[t]]['agent_infos'])
                        running_intra_path_idx[t] = 0
                        running_path_idx[t] += 1
                    elif running_path_idx[t] == len(offpol_trajs[t]) - 1:
                        # last path of this task: wrap up and mark the task done
                        offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \
                            stack_tensor_dict_list(
                                offpol_trajs[t][running_path_idx[t]]['agent_infos'])
                        running_intra_path_idx[t] = -1
                        running_path_idx[t] = -1
                    else:
                        # otherwise set the running indices to -1 to signal a stop
                        running_intra_path_idx[t] = -1
                        running_path_idx[t] = -1
    total_time = time.time() - start
    # logger.record_tabular(log_prefix + "TotalExecTime", total_time)
    return offpol_trajs
def obtain_samples(self, itr, max_path_length, batch_size, max_n_trajs=None):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    dones = np.asarray([True] * self.vec_env.n_envs)
    obses = self.vec_env.reset(dones)
    running_paths = [None] * self.vec_env.n_envs
    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.policy
    import time
    while n_samples < batch_size:
        t = time.time()
        # some vectorized envs manage policy resets themselves
        if hasattr(self.vec_env, "handle_policy_reset"):
            self.vec_env.handle_policy_reset(policy, dones)
        else:
            policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(
            actions, max_path_length=max_path_length)
        # selectively reset the finished envs and splice the fresh
        # observations back into next_obses, in done order
        if np.any(dones):
            new_obses = self.vec_env.reset(dones)
            reset_idx = 0
            for idx, done in enumerate(dones):
                if done:
                    next_obses[idx] = new_obses[reset_idx]
                    reset_idx += 1
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.n_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
                if max_n_trajs is not None and len(paths) >= max_n_trajs:
                    break
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
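# The selective-reset block above maps the k-th freshly reset observation
# onto the k-th env whose done flag is set. A small numpy demonstration of
# that index bookkeeping, with made-up observations:
import numpy as np

dones = np.array([False, True, False, True])
next_obses = np.zeros((4, 2))                     # stale post-step observations
new_obses = np.array([[1.0, 1.0], [2.0, 2.0]])    # one fresh row per done env
reset_idx = 0
for idx, done in enumerate(dones):
    if done:
        next_obses[idx] = new_obses[reset_idx]
        reset_idx += 1
assert np.array_equal(next_obses[1], [1.0, 1.0])
assert np.array_equal(next_obses[3], [2.0, 2.0])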
def obtain_samples(self, itr, num_samples=None, log=True, log_prefix='RandomSampler-'):
    if num_samples is None:
        num_samples = self.algo.batch_size
    paths = []
    n_samples_collected = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(num_samples)
    policy_time = 0
    env_time = 0
    process_time = 0
    import time
    while n_samples_collected < num_samples:
        # sample uniformly random actions instead of querying the policy
        t = time.time()
        actions = np.stack([
            self.vec_env.action_space.sample() for _ in range(len(obses))
        ], axis=0)
        policy_time += time.time() - t
        agent_infos = {}
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples_collected += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    if log:
        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    return paths
def obtain_samples(self, itr, oracle_policy):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    agent_only_paths = []
    oracle_only_paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    agent_only_running_paths = [None] * self.vec_env.num_envs
    oracle_only_running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        agent_actions, binary_actions, agent_infos = policy.get_actions(obses)
        oracle_actions, oracle_agent_infos = oracle_policy.get_actions(obses)
        # round the gating outputs and blend agent and oracle actions per env
        # (hardcoded for two environments)
        sigma = np.round(binary_actions)
        actions_1 = np.array([
            sigma[0, 0] * agent_actions[0, :] + sigma[0, 1] * oracle_actions[0, :]
        ])
        actions_2 = np.array([
            sigma[1, 0] * agent_actions[1, :] + sigma[1, 1] * oracle_actions[1, :]
        ])
        actions = np.concatenate((actions_1, actions_2), axis=0)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, itr)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        # separately record transitions where the agent (not the oracle) acted
        if sigma[0, 0] == 1 or sigma[1, 0] == 1:
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if agent_only_running_paths[idx] is None:
                    agent_only_running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                agent_only_running_paths[idx]["observations"].append(observation)
                agent_only_running_paths[idx]["actions"].append(action)
                agent_only_running_paths[idx]["rewards"].append(reward)
                agent_only_running_paths[idx]["env_infos"].append(env_info)
                agent_only_running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    agent_only_paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(
                            agent_only_running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(
                            agent_only_running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(
                            agent_only_running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            agent_only_running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(
                            agent_only_running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(agent_only_running_paths[idx]["rewards"])
                    agent_only_running_paths[idx] = None
        # To get paths taken by the oracle:
        # elif sigma[0] == 0. or sigma[1] == 0.:
        #     ... (mirror of the agent-only block above, accumulating into
        #     oracle_only_running_paths / oracle_only_paths)
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    # return paths, agent_only_paths, oracle_only_paths
    return paths, agent_only_paths
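# The gating above rounds binary_actions into sigma and mixes agent and
# oracle actions per env (hardcoded to two envs). A numpy sketch of that
# blending with made-up gating weights and action vectors:
import numpy as np

sigma = np.round(np.array([[0.9, 0.1], [0.2, 0.8]]))   # -> [[1, 0], [0, 1]]
agent_actions = np.array([[1.0, 1.0], [2.0, 2.0]])
oracle_actions = np.array([[-1.0, -1.0], [-2.0, -2.0]])
actions = np.stack([
    sigma[i, 0] * agent_actions[i] + sigma[i, 1] * oracle_actions[i]
    for i in range(2)
])
assert np.array_equal(actions[0], [1.0, 1.0])    # env 0 uses the agent
assert np.array_equal(actions[1], [-2.0, -2.0])  # env 1 defers to the oracle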