def _worker(self, root_dir, parameters, device_queue):
    """Run a single hyper-parameter-search trial in this process.

    Grabs a device id from ``device_queue``, pins the process to it via
    ``CUDA_VISIBLE_DEVICES``, applies ``parameters`` as gin bindings, and
    runs ``train_eval``. The device id is always returned to the queue —
    even if the trial crashes — so other trials can reuse the slot.

    Args:
        root_dir (str): directory for this trial's checkpoints/logs.
        parameters (dict): gin binding name -> value for this trial.
        device_queue (multiprocessing.Queue): pool of available device ids.

    Raises:
        Exception: re-raises whatever ``train_eval`` (or setup) raised,
            after logging the full traceback.
    """
    device = None
    try:
        # Sleep for random seconds to avoid crowded launching.
        time.sleep(random.uniform(0, 3))
        device = device_queue.get()
        if self._conf.use_gpu:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(device)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""  # run on cpu
        if torch.cuda.is_available():
            alf.set_default_device("cuda")
        logging.set_verbosity(logging.INFO)
        logging.info("Search parameters %s" % parameters)
        with gin.unlock_config():
            gin.parse_config(
                ['%s=%s' % (k, v) for k, v in parameters.items()])
            gin.parse_config(
                "TrainerConfig.confirm_checkpoint_upon_crash=False")
        train_eval(FLAGS.ml_type, root_dir)
    except Exception:
        logging.info(traceback.format_exc())
        # Bare raise preserves the original traceback (``raise e`` would
        # re-anchor it here).
        raise
    finally:
        # BUGFIX: previously the device was only returned on success, so a
        # crashing trial leaked its device slot and starved later trials.
        if device is not None:
            device_queue.put(device)
def delayed_dequeue():
    """Pop from the enclosing ``ring_buffer`` twice, pausing before each pop.

    Runs in a subprocess, so keep tensors on cpu — otherwise the spawn
    start method would be needed.
    """
    alf.set_default_device("cpu")
    # Buffer contents evolve as: 6(deleted),7,8,9 -> 10,7,8,9 after the
    # producer refills -> 10,7(deleted),8,9 after the second pop.
    for _ in range(2):
        sleep(0.04)
        ring_buffer.dequeue()
def _worker(self, conn, env_constructor, env_id=None, flatten=False):
    """The process waits for actions and sends back environment results.

    Args:
        conn (multiprocessing.connection): Connection for communication
            to the main process.
        env_constructor (Callable): callable environment creator.
        env_id: forwarded to ``env_constructor`` to identify this
            environment instance.
        flatten (bool): whether to assume flattened actions and time_steps
            during communication to avoid overhead.

    Raises:
        KeyError: When receiving a message of unknown type.
    """
    try:
        # Subprocess must stay on cpu; cuda would require the spawn
        # start method.
        alf.set_default_device("cpu")
        env = env_constructor(env_id)
        action_spec = env.action_spec()
        conn.send(self._READY)  # Ready.
        # Message loop: serve _ACCESS/_CALL/_CLOSE requests until the
        # parent closes the pipe or sends _CLOSE.
        while True:
            try:
                # Only block for short times to have keyboard exceptions be raised.
                if not conn.poll(0.1):
                    continue
                message, payload = conn.recv()
            except (EOFError, KeyboardInterrupt):
                break
            if message == self._ACCESS:
                # Attribute read on the wrapped environment.
                name = payload
                result = getattr(env, name)
                conn.send((self._RESULT, result))
                continue
            if message == self._CALL:
                # Method invocation on the wrapped environment.
                name, args, kwargs = payload
                if flatten and name == 'step':
                    # Parent sent a flat action list; restore its structure.
                    args = [nest.pack_sequence_as(action_spec, args[0])]
                result = getattr(env, name)(*args, **kwargs)
                if flatten and name in ['step', 'reset']:
                    result = nest.flatten(result)
                    # Tensors must not cross the process boundary here.
                    assert all([
                        not isinstance(x, torch.Tensor) for x in result
                    ]), ("Tensor result is not allowed: %s" % name)
                conn.send((self._RESULT, result))
                continue
            if message == self._CLOSE:
                assert payload is None
                env.close()
                break
            raise KeyError(
                'Received message of unknown type {}'.format(message))
    except Exception:  # pylint: disable=broad-except
        # Ship the full stacktrace to the parent so it can re-raise there.
        etype, evalue, tb = sys.exc_info()
        stacktrace = ''.join(traceback.format_exception(etype, evalue, tb))
        message = 'Error in environment process: {}'.format(stacktrace)
        logging.error(message)
        conn.send((self._EXCEPTION, stacktrace))
    finally:
        conn.close()
def __init__(self, *args):
    """Set up the fixture's ``data_spec`` used by the ring-buffer tests."""
    super().__init__(*args)
    alf.set_default_device("cpu")  # spawn forking is required to use cuda.

    def scalar_spec(dtype):
        # Fresh zero-dim TensorSpec for each scalar field.
        return alf.TensorSpec(shape=(), dtype=dtype)

    self.data_spec = DataItem(
        env_id=scalar_spec(torch.int64),
        x=alf.TensorSpec(shape=(self.dim, ), dtype=torch.float32),
        t=scalar_spec(torch.int32),
        o={
            "a": scalar_spec(torch.float32),
            "g": scalar_spec(torch.float32)
        },
        reward=scalar_spec(torch.float32))
def delayed_enqueue(ring_buffer, batch):
    """Enqueue ``batch`` into ``ring_buffer`` after a short pause.

    Runs in a subprocess, so tensors stay on cpu (cuda would require the
    spawn start method).
    """
    alf.set_default_device("cpu")
    sleep(0.04)
    env_ids = batch.env_id
    ring_buffer.enqueue(batch, env_ids)