class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)],
            queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(
                    device=device)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            # Observation layout: obs[:, :2] is the robot position, obs[:, 4:6] is the goal.
            # Pedestrian (hidden) observations are queried separately from the vectorized envs.
            new_hid_observations = self.envs.get_peds()
            episodes.append(observations, new_hid_observations, actions,
                            rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
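# The BatchSampler variants in this section rely on a `make_env` factory and a
# queue-backed SubprocVecEnv that are defined elsewhere in their projects. Below is a
# minimal sketch of the usual factory pattern, given only for reference: it assumes a
# plain gym environment, and the exact signature differs across the variants (some pass
# a `test_env` flag or a worker index instead of just the env name).
import gym

def make_env(env_name):
    def _make_env():
        # Each worker process calls this thunk to build its own env instance.
        return gym.make(env_name)
    return _make_env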
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 2):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)],
            queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(
                    device=device)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(
                actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks

    def sample_target_task(self, N):
        tasks = self._env.unwrapped.sample_target_task(N)
        return tasks
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=None, test_env=False):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers or mp.cpu_count() - 1
        self.test_env = test_env

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            # Use the resolved worker count, not the raw argument (which may be None).
            [make_env(env_name, test_env=test_env) for _ in range(self.num_workers)],
            queue=self.queue)
        self._env = make_env(env_name, test_env=test_env)()

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                observations_tensor = torch.from_numpy(observations).to(
                    device=device, dtype=torch.float32)
                actions_tensor = policy(observations_tensor,
                                        params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, infos = self.envs.step(actions)
            # info keys: reachDist, pickRew, epRew, goalDist, success, goal, task_name
            # NOTE: last infos will be absent if batch_size % num_workers != 0
            episodes.append(observations, actions, rewards, batch_ids, infos)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks, task2prob=None):
        tasks = self._env.unwrapped.sample_tasks(num_tasks, task2prob)
        return tasks
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)],
            queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.9):
        episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            # observations and actions must share the same leading (batch) dimension
            observations_tensor = observations
            # observations_tensor = observations.reshape(observations.shape[0], -1)
            actions_tensor = policy(observations_tensor, params=params).sample()
            # pull the sampled actions back to host memory
            with tf.device('/CPU:0'):
                actions = actions_tensor.numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)],
            queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, task, tree=None, params=None, gamma=0.95,
               device='cpu'):
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                inputs = torch.from_numpy(observations).float().to(device=device)
                # Embed the task description with the tree model.
                if self.env_name == 'AntPos-v0':
                    _, embedding = tree.forward(
                        torch.from_numpy(task["position"]).float().to(device=device))
                if self.env_name == 'AntVel-v1':
                    _, embedding = tree.forward(
                        torch.from_numpy(np.array([task["velocity"]])).float().to(device=device))
                # Concatenate the task embedding onto every observation.
                observations_tensor = torch.t(
                    torch.stack([
                        torch.cat([
                            torch.from_numpy(np.array(teo)).to(device=device),
                            embedding[0]
                        ], 0) for teo in inputs
                    ], 1))
                actions_tensor = policy(observations_tensor,
                                        task=task,
                                        params=params,
                                        enhanced=False).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
            episodes.append(observations_tensor.cpu().numpy(), actions,
                            rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
class BatchSampler:
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count()):
        """
        :param env_name: name of the gym environment to sample from
        :param batch_size: fast batch size, i.e. number of trajectories per task
        :param num_workers: number of environment worker processes
        """
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        # list of environment factory functions, one per worker
        env_factorys = [make_env(env_name) for _ in range(num_workers)]
        # This is the main process manager; it is in charge of the num_workers
        # sub-processes interacting with the environment.
        self.envs = SubprocVecEnv(env_factorys, queue_=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95, device='cpu'):
        """
        :param policy: policy network returning an action distribution
        :param params: optional (adapted) policy parameters
        :param gamma: discount factor
        :param device: torch device for the policy forward pass
        :return: a BatchEpisodes object holding the sampled trajectories
        """
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        # loop until all episodes are done and the queue is empty
        while (not all(dones)) or (not self.queue.empty()):
            # for reinforcement learning, the forward pass requires no gradient
            with torch.no_grad():
                # convert observations to a tensor on the target device,
                # run the policy there, then move the actions back to the CPU
                observations_tensor = torch.from_numpy(observations).to(device=device)
                # forward via the policy network, which returns a distribution
                # such as Categorical(logits=logits)
                actions_tensor = policy(observations_tensor, params=params).sample()
                actions = actions_tensor.cpu().numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
            # append observations NOT new_observations, batch_ids NOT new_batch_ids
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
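# A minimal usage sketch for the sampler above. The environment id and the `policy`
# object are placeholders, not part of the original code: `policy` is assumed to be a
# callable that maps an observation tensor (and optional `params`) to a torch
# distribution with a .sample() method, as the sample() loop expects.
sampler = BatchSampler('HalfCheetahDir-v1', batch_size=20, num_workers=4)
tasks = sampler.sample_tasks(num_tasks=10)
for task in tasks:
    sampler.reset_task(task)                       # broadcast the task to every worker env
    episodes = sampler.sample(policy, gamma=0.95)  # one BatchEpisodes of 20 trajectories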
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1,
                 args=None):
        self.env_name = env_name
        self.batch_size = batch_size  # NOTE: number of trajectories in each env
        self.num_workers = num_workers
        self.args = args

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(args, i_worker) for i_worker in range(num_workers)],
            queue=self.queue)
        self._env = make_env(args, i_worker=99)()

    def sample(self, policy, params=None, prey=None, gamma=0.95, device='cpu'):
        """Sample the number of trajectories defined by "self.batch_size". The size of
        each trajectory is defined by the Gym env registration defined at:
        ./maml_rl/envs/__init__.py
        """
        assert prey is not None
        episodes = BatchEpisodes(batch_size=self.batch_size,
                                 gamma=gamma,
                                 device=device)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, worker_ids = self.envs.reset()  # TODO reset needs to be fixed
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            with torch.no_grad():
                # Get observations
                predator_observations, prey_observations = self.split_observations(observations)
                predator_observations_torch = torch.from_numpy(predator_observations).to(device=device)
                prey_observations_torch = torch.from_numpy(prey_observations).to(device=device)

                # Get actions
                predator_actions = policy(predator_observations_torch, params=params).sample()
                predator_actions = predator_actions.cpu().numpy()

                prey_actions = prey.select_deterministic_action(prey_observations_torch)
                prey_actions = prey_actions.cpu().numpy()

            actions = np.concatenate([predator_actions, prey_actions], axis=1)
            new_observations, rewards, dones, new_worker_ids, _ = self.envs.step(copy.deepcopy(actions))
            assert np.sum(dones[:, 0]) == np.sum(dones[:, 1])
            dones = dones[:, 0]

            # Get new observations
            new_predator_observations, _ = self.split_observations(new_observations)

            # Get rewards
            predator_rewards = rewards[:, 0]

            episodes.append(predator_observations, predator_actions,
                            predator_rewards, worker_ids)
            observations, worker_ids = new_observations, new_worker_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks, test=False):
        if test is False:
            i_agents = np.random.randint(low=0, high=16, size=(num_tasks,))
        else:
            i_agents = np.random.randint(low=16, high=21, size=(num_tasks,))
        tasks = [{"i_agent": i_agent} for i_agent in i_agents]
        return tasks

    def split_observations(self, observations):
        predator_observations = []
        prey_observations = []
        for obs in observations:
            assert len(obs) == 2
            predator_observations.append(obs[0])
            prey_observations.append(obs[1])
        return \
            np.asarray(predator_observations, dtype=np.float32), \
            np.asarray(prey_observations, dtype=np.float32)