def start(self):
    """Worker loop: execute real or imagined (predicted) env steps.

    Reads ``(actions, step_type)`` tuples from ``self.step_queue`` until a
    stop signal (``actions is None``) arrives.  Results for all envs owned
    by this worker are pushed to ``self.result_queue``, and every queue
    message is acknowledged with ``task_done()``.
    """
    timing = AttrDict({'copying': 0, 'prediction': 0})
    while True:
        actions, step_type = self.step_queue.get()

        if actions is None:  # stop signal
            for i, e in enumerate(self.envs):
                log.info('Closing env %d', self.env_indices[i])
                e.close()
            log.info('Stop worker %r...', self.env_indices)
            # BUGFIX: acknowledge the stop message as well — without this a
            # producer blocked in step_queue.join() would hang forever.
            # (The MsgType-based worker variant calls task_done() before
            # break on TERMINATE; this makes the two consistent.)
            self.step_queue.task_done()
            break

        if step_type == StepType.REAL:
            envs = self.envs
            self.imagined_envs = None  # real step invalidates imagined copies
        else:  # step_type == StepType.IMAGINED
            if self.imagined_envs is None:
                # initializing new prediction, let's report timing for the previous one
                if timing.prediction > 0 and self._verbose:
                    log.debug(
                        'Multi-env copy took %.6f s, prediction took %.6f s',
                        timing.copying, timing.prediction,
                    )
                timing.prediction = 0
                timing.copying = time.time()

                self.imagined_envs = []
                # we expect a list of actions for every environment in this worker (list of lists)
                assert len(actions) == len(self.envs)
                for env_idx in range(len(actions)):
                    # one deep copy of the real env per action to be imagined
                    for _ in actions[env_idx]:
                        imagined_env = copy.deepcopy(self.envs[env_idx])
                        self.imagined_envs.append(imagined_env)
                timing.copying = time.time() - timing.copying

            envs = self.imagined_envs
            # flatten list-of-lists of actions to match the flat env list
            actions = np.asarray(actions).flatten()

        assert len(envs) == len(actions)

        # Collect obs, reward, and 'done' for each env (discard info)
        prediction_start = time.time()
        results = [env.step(action) for env, action in zip(envs, actions)]

        # pack results per-env
        results = np.split(np.array(results), len(self.envs))

        if step_type == StepType.IMAGINED:
            timing.prediction += time.time() - prediction_start

        # If this is a real step and the env is done, reset
        if step_type == StepType.REAL:
            for i, result in enumerate(results):
                obs, reward, done, info = result[0]
                if done:
                    obs = self.envs[i].reset()
                results[i] = (obs, reward, done, info)  # collapse dimension of size 1

        self.result_queue.put(results)
        self.step_queue.task_done()
def start(self):
    """Worker loop: service messages from ``self.task_queue`` until TERMINATE.

    Handles INIT/TERMINATE lifecycle messages plus RESET, INFO, STEP_REAL,
    STEP_REAL_RESET and STEP_IMAGINED workloads; puts per-env results on
    ``self.result_queue`` and acknowledges every message with ``task_done()``.
    """
    real_envs = []
    # deep copies of real envs used for one round of imagined rollouts;
    # None whenever no prediction is in progress
    imagined_envs = None
    timing = AttrDict({'copying': 0, 'prediction': 0})
    while True:
        actions, msg_type = safe_get(self.task_queue)

        if msg_type == MsgType.INIT:
            self._init(real_envs)
            self.task_queue.task_done()
            continue

        if msg_type == MsgType.TERMINATE:
            self._terminate(real_envs, imagined_envs)
            self.task_queue.task_done()
            break

        # handling actual workload
        envs = real_envs
        if msg_type == MsgType.RESET or msg_type == MsgType.STEP_REAL or msg_type == MsgType.STEP_REAL_RESET:
            # a real step or reset invalidates any imagined copies
            if imagined_envs is not None:
                for imagined_env in imagined_envs:
                    imagined_env.close()
                imagined_envs = None
        elif msg_type == MsgType.INFO:
            pass  # INFO queries real envs and leaves imagined copies untouched
        else:
            # STEP_IMAGINED: step deep-copied envs instead of the real ones
            if imagined_envs is None:
                # initializing new prediction, let's report timing for the previous one
                if timing.prediction > 0 and self._verbose:
                    log.debug(
                        'Multi-env copy took %.6f s, prediction took %.6f s',
                        timing.copying, timing.prediction,
                    )
                timing.prediction = 0
                timing.copying = time.time()

                imagined_envs = []
                # we expect a list of actions for every environment in this worker (list of lists)
                assert len(actions) == len(real_envs)
                for env_idx in range(len(actions)):
                    # one deep copy per action to be imagined for this env
                    for _ in actions[env_idx]:
                        imagined_env = copy.deepcopy(real_envs[env_idx])
                        imagined_envs.append(imagined_env)
                timing.copying = time.time() - timing.copying

            envs = imagined_envs
            # flatten list-of-lists of actions to match the flat env list
            actions = np.asarray(actions).flatten()

        if msg_type == MsgType.RESET:
            results = [env.reset() for env in envs]
        elif msg_type == MsgType.INFO:
            results = [self._get_info(env) for env in envs]
        else:
            assert len(envs) == len(actions)

            reset = [False] * len(actions)
            if msg_type == MsgType.STEP_REAL_RESET:
                # each element is an (action, reset_flag) pair
                actions, reset = zip(*actions)

            # Collect obs, reward, done, and info
            prediction_start = time.time()
            results = [env.step(action) for env, action in zip(envs, actions)]
            self.timestep += 1

            # pack results per-env
            results = np.split(np.array(results), len(real_envs))

            if msg_type == MsgType.STEP_IMAGINED:
                timing.prediction += time.time() - prediction_start

            # If this is a real step and the env is done, reset
            if msg_type == MsgType.STEP_REAL or msg_type == MsgType.STEP_REAL_RESET:
                for i, result in enumerate(results):
                    obs, reward, done, info = result[0]
                    # NOTE(review): in the multi-agent case `done` is presumably a
                    # per-agent sequence — episode ends only when ALL agents are
                    # done; confirm against the env implementation.
                    if self.is_multiagent and all(done):
                        is_done = True
                    elif not self.is_multiagent and done:
                        is_done = True
                    else:
                        is_done = False
                    if is_done or reset[i]:
                        obs = real_envs[i].reset()
                        if not self.is_multiagent:
                            info = self._get_info(real_envs[i])  # info for the new episode
                    results[i] = (obs, reward, done, info)

        self.result_queue.put(results)
        self.task_queue.task_done()