# Shared imports for the sampler variants below. Framework-specific names
# (BaseSampler, Sampler, VecEnvExecutor, ProgBarCounter, logger, tensor_utils,
# GymEnv, Discrete, Box, RNNCriticReplayPool, overrides, etc.) come from the
# surrounding rllab-style codebases these snippets were taken from and are
# assumed to be importable there.
import itertools
import pickle
import time

import numpy as np


class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                #env=pickle.loads(pickle.dumps(self.algo.env)),
                #n = n_envs,
                max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths
        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        policy = self.algo.policy

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        #logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        #logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        #logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths

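# --- Usage sketch (illustrative, not part of the sampler above) ---
# obtain_samples() returns either a flat list of path dicts or, with
# return_dict=True, a {env_index: [path, ...]} dict. Each path carries stacked
# "observations", "actions", "rewards", "env_infos" and "agent_infos" arrays.
# The helper below is a minimal, hypothetical example of consuming that
# structure; the synthetic paths stand in for real rollouts.
import numpy as np

def path_returns(paths, discount=0.99):
    """Compute (undiscounted, discounted) returns for a list of path dicts."""
    undiscounted, discounted = [], []
    for path in paths:
        rewards = np.asarray(path["rewards"])
        undiscounted.append(rewards.sum())
        discounted.append(np.sum(rewards * discount ** np.arange(len(rewards))))
    return undiscounted, discounted

if __name__ == "__main__":
    fake_paths = [dict(rewards=np.ones(5)), dict(rewards=np.arange(3.0))]
    print(path_returns(fake_paths))
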
class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None, batch_size=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs
        # if batch_size is not None:
        #     self.batch_size = batch_size
        # else:
        self.batch_size = self.algo.batch_size
        print("vectorized sampler initiated")

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                #env=pickle.loads(pickle.dumps(self.algo.env)),
                #n = n_envs,
                max_path_length=self.algo.max_path_length
            )
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                       preupdate=False, save_img_obs=False, contexts=None):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths
        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            assert False, "debug, should we be using this?"
            print("WARNING, will vectorize reset_args")
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        path_nums = [0] * self.vec_env.num_envs  # keeps track of which rollout we are on for each environment instance
        obses = self.vec_env.reset(reset_args)
        # contexts may be an array, so compare against None explicitly
        if contexts is not None:
            obses = np.concatenate([obses, contexts], axis=1)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        if contexts is not None:
            policy = self.algo.post_policy
        else:
            policy = self.algo.policy

        while n_samples < self.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            # print("debug, agent_infos", agent_infos)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            # TODO: instead of receiving obs from the env, we'll receive it from the policy as a feed_dict
            if contexts is not None:
                next_obses = np.concatenate([next_obses, contexts], axis=1)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    # TODO: let's also add the incomplete running_paths to paths
                    running_paths[idx] = None
                    path_nums[idx] += 1

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        # adding the incomplete paths
        # for idx in range(self.vec_env.num_envs):
        #     if running_paths[idx] is not None:
        #         paths[idx].append(dict(
        #             observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
        #             actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
        #             rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
        #             env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
        #             agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
        #         ))

        pbar.stop()

        # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths

class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        policy = self.algo.policy

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        terminals=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["terminals"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        terminals=tensor_utils.stack_tensor_list(running_paths[idx]["terminals"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths

class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                #env=pickle.loads(pickle.dumps(self.algo.env)),
                #n = n_envs,
                max_path_length=self.algo.max_path_length
            )
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths
        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        policy = self.algo.policy

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths

class VectorizedSampler(BaseSampler):
    def start_worker(self):
        estimated_envs = int(self.algo.batch_size / self.algo.max_path_length)
        estimated_envs = max(1, min(estimated_envs, 100))
        self.vec_env = VecEnvExecutor(
            self.algo.env,
            n=estimated_envs,
            max_path_length=self.algo.max_path_length
        )
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in xrange(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in xrange(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths

class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self, include_joint_coords=False):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs,
                max_path_length=self.algo.max_path_length,
                include_joint_coords=include_joint_coords)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                max_path_length=self.algo.max_path_length,
                include_joint_coords=include_joint_coords)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples_for_visualization(self, include_joint_coords=False):
        tf_env = self.algo.env
        if hasattr(tf_env.wrapped_env, "stats_recorder"):
            setattr(tf_env.wrapped_env.stats_recorder, "done", None)
        import builtins
        builtins.visualize = True
        print("\nAbout to start video...")
        obs_dim = self.env_spec.observation_space.shape[0]
        obs = tf_env.reset()
        obs = self._add_joint_coords_to_obs(obs, include_joint_coords)

        horizon = 1000
        for horizon_num in range(1, horizon + 1):
            # action, _ = self.algo.policy.get_action(obs[:obs_dim])
            action, _ = self.algo.policy.get_action(obs)
            next_obs, reward, done, _info = tf_env.step(action, use_states=obs)
            obs = self._add_joint_coords_to_obs(next_obs, include_joint_coords)
            if done or horizon_num == horizon:
                break
        builtins.visualize = False

    def obtain_samples(self, itr, include_joint_coords=False):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        obses = self._add_joint_coords_to_obses(obses, include_joint_coords)
        obs_dim = self.env_spec.observation_space.shape[0]
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        policy = self.algo.policy

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, use_states=obses)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = self._add_joint_coords_to_obses(next_obses, include_joint_coords)

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths, n_samples

    def _add_joint_coords_to_obses(self, obses, include_joint_coords):
        if include_joint_coords:
            try:
                inner_env = self._get_inner_env()
                extended_obses = []
                for obs in obses:
                    extended_obses.append(
                        self._add_joint_coords_to_obs(obs, include_joint_coords, inner_env))
                return np.array(extended_obses)
            except AttributeError:
                inner_envs = self._get_inner_envs()
                extended_obses = []
                for obs_i in range(len(obses)):
                    extended_obses.append(
                        self._add_joint_coords_to_obs(obses[obs_i], include_joint_coords, inner_envs[obs_i]))
                return np.array(extended_obses)
        return obses

    def _add_joint_coords_to_obs(self, obs, include_joint_coords, inner_env=None):
        if include_joint_coords:
            if not inner_env:
                inner_env = self._get_inner_env()
            if hasattr(inner_env, "get_geom_xpos"):
                return np.append(obs, inner_env.get_geom_xpos().flatten())
            else:
                return np.append(obs, inner_env.env.get_geom_xpos().flatten())
        return obs

    def _get_inner_env(self):
        env = self.vec_env.vec_env
        while hasattr(env, "env"):
            env = env.env
        if hasattr(env.wrapped_env, '_wrapped_env'):
            return env.wrapped_env._wrapped_env
        else:
            return env.wrapped_env.env.unwrapped

    def _get_inner_envs(self):
        inner_envs = []
        for env in self.vec_env.envs:
            while hasattr(env, "env"):
                env = env.env
            if hasattr(env.wrapped_env, '_wrapped_env'):
                inner_envs.append(env.wrapped_env._wrapped_env)
            else:
                inner_envs.append(env.wrapped_env.env.unwrapped)
        return inner_envs

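# --- Observation-augmentation sketch (illustrative, not from the original code) ---
# The variant above widens each observation by appending the flattened geom
# positions reported by the inner MuJoCo-style env via get_geom_xpos(). The
# stub and helper below only illustrate that concatenation; their names are
# assumptions made for this example.
import numpy as np

class _FakeInnerEnv:
    """Stub exposing the get_geom_xpos() interface the sampler relies on."""
    def get_geom_xpos(self):
        return np.zeros((4, 3))  # e.g. 4 geoms, each with an (x, y, z) position

def add_joint_coords(obs, inner_env, include_joint_coords=True):
    """Append flattened geom positions to a single observation vector."""
    if not include_joint_coords:
        return obs
    return np.append(obs, inner_env.get_geom_xpos().flatten())

extended = add_joint_coords(np.ones(10), _FakeInnerEnv())
assert extended.shape == (22,)  # 10 original dims + 4 geoms * 3 coordinates
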
class RNNCriticSampler(object):
    def __init__(self, policy, env, n_envs, replay_pool_size, max_path_length, sampling_method,
                 save_rollouts=False, save_rollouts_observations=True, save_env_infos=False,
                 env_str=None, replay_pool_params={}):
        self._policy = policy
        self._n_envs = n_envs
        assert (self._n_envs == 1)  # b/c policy reset

        self._replay_pools = [
            RNNCriticReplayPool(
                env.spec,
                env.horizon,
                policy.N,
                policy.gamma,
                replay_pool_size // n_envs,
                obs_history_len=policy.obs_history_len,
                sampling_method=sampling_method,
                save_rollouts=save_rollouts,
                save_rollouts_observations=save_rollouts_observations,
                save_env_infos=save_env_infos,
                replay_pool_params=replay_pool_params)
            for _ in range(n_envs)
        ]

        try:
            envs = [pickle.loads(pickle.dumps(env)) for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
        except:
            envs = [create_env(env_str) for _ in range(self._n_envs)] if self._n_envs > 1 else [env]
        ### need to seed each environment if it is GymEnv
        seed = get_seed()
        if seed is not None and isinstance(utils.inner_env(env), GymEnv):
            for i, env in enumerate(envs):
                utils.inner_env(env).env.seed(seed + i)
        self._vec_env = VecEnvExecutor(envs=envs, max_path_length=max_path_length)
        self._curr_observations = self._vec_env.reset()

    @property
    def n_envs(self):
        return self._n_envs

    ##################
    ### Statistics ###
    ##################

    @property
    def statistics(self):
        return RNNCriticReplayPool.statistics_pools(self._replay_pools)

    def __len__(self):
        return sum([len(rp) for rp in self._replay_pools])

    ####################
    ### Add to pools ###
    ####################

    def step(self, step, take_random_actions=False, explore=True):
        """ Takes one step in each simulator and adds to respective replay pools """
        ### store last observations and get encoded
        encoded_observations = []
        for i, (replay_pool, observation) in enumerate(zip(self._replay_pools, self._curr_observations)):
            replay_pool.store_observation(step + i, observation)
            encoded_observations.append(replay_pool.encode_recent_observation())

        ### get actions
        if take_random_actions:
            actions = [self._vec_env.action_space.sample() for _ in range(self._n_envs)]
            est_values = [np.nan] * self._n_envs
            if isinstance(self._vec_env.action_space, Discrete):
                logprobs = [-np.log(self._vec_env.action_space.flat_dim)] * self._n_envs
            elif isinstance(self._vec_env.action_space, Box):
                low = self._vec_env.action_space.low
                high = self._vec_env.action_space.high
                logprobs = [-np.sum(np.log(high - low))] * self._n_envs
            else:
                raise NotImplementedError
        else:
            actions, est_values, logprobs, _ = self._policy.get_actions(
                steps=list(range(step, step + self._n_envs)),
                current_episode_steps=self._vec_env.current_episode_steps,
                observations=encoded_observations,
                explore=explore)

        ### take step
        next_observations, rewards, dones, env_infos = self._vec_env.step(actions)
        if np.any(dones):
            self._policy.reset_get_action()

        ### add to replay pool
        for replay_pool, action, reward, done, env_info, est_value, logprob in \
                zip(self._replay_pools, actions, rewards, dones, env_infos, est_values, logprobs):
            replay_pool.store_effect(action, reward, done, env_info, est_value, logprob)

        self._curr_observations = next_observations

    #####################
    ### Add offpolicy ###
    #####################

    def _rollouts_file(self, folder, itr):
        return os.path.join(folder, 'itr_{0:d}_rollouts.pkl'.format(itr))

    def add_offpolicy(self, offpolicy_folder, num_offpolicy):
        step = 0
        itr = 0
        replay_pools = itertools.cycle(self._replay_pools)
        done_adding = False
        while os.path.exists(self._rollouts_file(offpolicy_folder, itr)):
            rollouts = joblib.load(self._rollouts_file(offpolicy_folder, itr))['rollouts']
            itr += 1
            for rollout, replay_pool in zip(rollouts, replay_pools):
                r_len = len(rollout['dones'])
                if step + r_len >= num_offpolicy:
                    diff = num_offpolicy - step
                    for k in ('observations', 'actions', 'rewards', 'dones', 'logprobs'):
                        rollout[k] = rollout[k][:diff]
                    done_adding = True
                    r_len = len(rollout['dones'])
                replay_pool.store_rollout(step, rollout)
                step += r_len
                if done_adding:
                    break
            if done_adding:
                break

    #########################
    ### Sample from pools ###
    #########################

    def can_sample(self):
        return np.any([replay_pool.can_sample() for replay_pool in self._replay_pools])

    def sample(self, batch_size):
        return RNNCriticReplayPool.sample_pools(
            self._replay_pools, batch_size,
            only_completed_episodes=self._policy.only_completed_episodes)

    ###############
    ### Logging ###
    ###############

    def log(self, prefix=''):
        RNNCriticReplayPool.log_pools(self._replay_pools, prefix=prefix)

    def get_recent_paths(self):
        return RNNCriticReplayPool.get_recent_paths_pools(self._replay_pools)

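# --- Driving-loop sketch (assumed usage, not taken from the original repo) ---
# RNNCriticSampler exposes step(), can_sample() and sample(); an off-policy
# trainer typically interleaves them as below. The warm-up length, training
# cadence, and the train_on_batch callback are placeholders for this example.
def run_collection_loop(sampler, total_steps, batch_size, train_on_batch,
                        n_random_steps=1000, train_every=1):
    """Alternate environment stepping with replay-pool sampling."""
    for step in range(0, total_steps, sampler.n_envs):
        # warm up the pools with random actions before trusting the policy
        sampler.step(step, take_random_actions=(step < n_random_steps))
        if step % train_every == 0 and sampler.can_sample():
            train_on_batch(sampler.sample(batch_size))
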
class VectorizedSampler(BaseSampler):
    def __init__(self, algo, n_envs=None):
        super(VectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs,
                #env=pickle.loads(pickle.dumps(self.algo.env)),
                #n = n_envs,
                max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    def get_random_action(self, observation, idx=None):
        # this function takes a numpy array of observations and outputs randomly sampled actions.
        # idx: index corresponding to the task/updated policy.
        #flat_obs = self.env_spec.observation_space.flatten(observation)
        #f_dist = self._cur_f_dist
        mean = np.random.uniform(low=-1.0, high=1.0, size=[self.n_envs, self.algo.a_size])
        #mean, log_std = [x[0] for x in f_dist([flat_obs])]
        action = mean
        return action, dict(mean=mean)

    def get_MPC_action(self, obs):
        # query the MPC policy once per environment instance and batch the results
        action_list = []
        for i in range(self.n_envs):
            action, _ = self.algo.policy.get_action(obs[i], i)
            action_list.append(action)
        return np.asarray(action_list).reshape(
            [-1, self.algo.env.action_space.shape[0]]), dict(means=action_list)

    def obtain_samples(self, itr, init_state=None, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths
        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        if init_state is not None:
            init_state = [init_state] * self.vec_env.num_envs
            n_samples = 0
            obses = self.vec_env.reset(init_state, reset_args)
            dones = np.asarray([True] * self.vec_env.num_envs)
            running_paths = [None] * self.vec_env.num_envs
        else:
            n_samples = 0
            obses = self.vec_env.reset(reset_args)
            dones = np.asarray([True] * self.vec_env.num_envs)
            running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        #policy = self.algo.policy

        # note: this variant stops after max_path_length samples rather than batch_size
        while n_samples < self.algo.max_path_length:
            t = time.time()
            #self.env_spec.reset(reset_args = reset_args)
            #policy.reset(dones)
            actions, agent_infos = self.get_MPC_action(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=running_paths[idx]["observations"],
                        actions=running_paths[idx]["actions"],
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths

class VectorizedSampler(Sampler):
    def __init__(self, env, policy, n_envs, vec_env=None):
        self.env = env
        self.policy = policy
        self.n_envs = n_envs
        self.vec_env = vec_env
        self.env_spec = env.spec

    def start_worker(self):
        if self.vec_env is None:
            n_envs = self.n_envs
            if getattr(self.env, 'vectorized', False):
                self.vec_env = self.env.vec_env_executor(n_envs=n_envs)
            else:
                envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]
                self.vec_env = VecEnvExecutor(envs=envs)

    def shutdown_worker(self):
        self.vec_env.terminate()

    def obtain_samples(self, itr, max_path_length, batch_size, max_n_trajs=None):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        dones = np.asarray([True] * self.vec_env.n_envs)
        obses = self.vec_env.reset(dones)
        running_paths = [None] * self.vec_env.n_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        policy = self.policy

        while n_samples < batch_size:
            t = time.time()
            if hasattr(self.vec_env, "handle_policy_reset"):
                self.vec_env.handle_policy_reset(policy, dones)
            else:
                policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, max_path_length=max_path_length)
            if np.any(dones):
                new_obses = self.vec_env.reset(dones)
                reset_idx = 0
                for idx, done in enumerate(dones):
                    if done:
                        next_obses[idx] = new_obses[reset_idx]
                        reset_idx += 1
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.n_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            # stop early once enough complete trajectories have been collected
            if max_n_trajs is not None and len(paths) >= max_n_trajs:
                break

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths

class RandomVectorizedSampler(RandomBaseSampler, VectorizedSampler):
    def __init__(self, algo, n_envs=None):
        self.algo = algo
        VectorizedSampler.__init__(self, algo, n_envs=n_envs)

    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    def shutdown_worker(self):
        self.vec_env.terminate()

    @overrides
    def obtain_samples(self, itr, num_samples=None, log=True, log_prefix='RandomSampler-'):
        if num_samples is None:
            num_samples = self.algo.batch_size

        paths = []
        n_samples_collected = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(num_samples)
        env_time = 0
        process_time = 0
        policy = self.algo.policy

        while n_samples_collected < num_samples:
            # random actions
            t = time.time()
            actions = np.stack([self.vec_env.action_space.sample() for _ in range(len(obses))], axis=0)
            policy_time = time.time() - t
            agent_infos = {}

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples_collected += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        if log:
            logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
            logger.record_tabular(log_prefix + "EnvExecTime", env_time)
            logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        return paths