def set_device(device):
    """
    Select the global device on which Paddle operators will run.

    Paddle can run calculations on several kinds of devices (CPU, GPU, XPU,
    NPU, MLU and IPU); each one is named by a string identifier.

    Parameters:
        device(str): Identifier of the device to run on. It can be ``cpu``,
            ``gpu``, ``xpu``, ``npu``, ``mlu``, ``gpu:x``, ``xpu:x``,
            ``npu:x``, ``mlu:x`` and ``ipu``, where ``x`` is the index of the
            GPUs, XPUs, NPUs or MLUs.

    Returns:
        The place object produced by ``_convert_to_place(device)``, after it
        has been handed to ``framework._set_expected_place``.

    Examples:

     .. code-block:: python

        import paddle

        paddle.device.set_device("cpu")
        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
        data = paddle.stack([x1,x2], axis=1)
    """
    # Parsing and validation of the identifier is delegated to the helper.
    expected_place = _convert_to_place(device)
    framework._set_expected_place(expected_place)
    return expected_place
def _thread_loop(self, legacy_expected_place):
    """
    Reader-thread body: fetch mini-batches and push them to the blocking queue.

    Each iteration takes the next indices from ``self._sampler_iter``, fetches
    the batch, flattens it (recording its structure in
    ``self._structure_infos``), packs every field into a
    ``core.LoDTensorArray`` and pushes that array onto
    ``self._blocking_queue``. The loop stops as soon as
    ``self._thread_done_event`` is set.

    Args:
        legacy_expected_place: place forwarded to ``_set_expected_place`` so
            this thread uses the same place as the thread that started it.
    """
    # NOTE(zhiqiu): Set the expected place for new thread as the same as father
    # thread, and it will call platform::SetDeviceId() in c++ internally.
    # If we do not set cudaDeviceId in new thread, the default cudaDeviceId
    # will be 0, which may cost hundreds of MB of GPU memory on CUDAPlace(0)
    # if calling some cuda APIs in this thread.
    _set_expected_place(legacy_expected_place)

    while not self._thread_done_event.is_set():
        try:
            indices = next(self._sampler_iter)
            # read one mini-batch from the dataset
            batch = self._dataset_fetcher.fetch(indices,
                                                self._thread_done_event)
        except StopIteration:
            # sampler exhausted: regular end of the loop
            self._exit_thread_expectedly()
            return

        if batch is None or self._thread_done_event.is_set():
            break

        # flatten the batch and remember how to rebuild its structure
        batch, structure = _flatten_batch(batch)
        self._structure_infos.append(structure)

        if self._thread_done_event.is_set():
            break

        try:
            # pack every field as a LoDTensor inside one LoDTensorArray
            tensor_array = core.LoDTensorArray()
            for item in batch:
                if isinstance(item, (paddle.Tensor, core.eager.Tensor)):
                    item = item.value().get_tensor()
                elif not isinstance(item, core.LoDTensor):
                    lod_tensor = core.LoDTensor()
                    lod_tensor.set(item, core.CPUPlace())
                    item = lod_tensor
                tensor_array.append(item)

            if self._thread_done_event.is_set():
                break

            try:
                self._blocking_queue.push(tensor_array)
            except:
                # any failure while pushing is handled as an expected exit
                self._exit_thread_expectedly()
        except:
            # everything else is an unexpected exit; re-raise after cleanup
            self._exit_thread_unexpectedly()
            six.reraise(*sys.exc_info())

    self._exit_thread_expectedly()
def set_device(device):
    """
    Paddle supports running calculations on various types of devices, including CPU and GPU.
    They are represented by string identifiers. This function can specify the global device
    which the OP will run.

    Parameters:
        device(str): This parameter determines the specific running device.
            It can be ``cpu``, ``gpu`` or ``gpu:x`` (for example ``gpu:0``),
            where ``x`` is the index of the GPU. Matching is case-insensitive.
            When ``device`` is ``cpu``, the program is running on the cpu.
            When ``device`` is ``gpu``, the program is running on the gpu
            whose id is ``ParallelEnv().dev_id``.

    Returns:
        The place object (``core.CPUPlace`` or ``core.CUDAPlace``) that was
        passed to ``framework._set_expected_place``.

    Raises:
        ValueError: If ``device`` is not one of the accepted forms, or if a
            GPU device is requested while PaddlePaddle is not compiled with
            CUDA.

    Examples:

     .. code-block:: python

        import paddle
        paddle.disable_static()
        paddle.set_device("cpu")
        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
        data = paddle.stack([x1,x2], axis=1)
    """
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be 'gpu', "
                "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    else:
        # The pattern is anchored at the end so that malformed identifiers
        # such as 'gpu:0abc' are rejected here with the descriptive message
        # instead of failing later inside int().
        gpu_match = re.match(r'gpu:(\d+)$', lower_device)
        if not gpu_match:
            raise ValueError(
                "The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'"
            )
        if not core.is_compiled_with_cuda():
            # Report the device string the caller passed, not the Match object.
            raise ValueError(
                "The device should not be {}, since PaddlePaddle is "
                "not compiled with CUDA".format(device))
        place = core.CUDAPlace(int(gpu_match.group(1)))
    framework._set_expected_place(place)
    return place
def _thread_loop(self, legacy_expected_place):
    """
    Reader-thread body: forward batches from ``self._get_data()`` to the queue.

    Until ``self._thread_done_event`` is set, each batch obtained from
    ``self._get_data()`` is packed into a ``core.LoDTensorArray`` and pushed
    onto ``self._blocking_queue``. A ``None`` batch triggers
    ``self._exit_thread_expectedly()``; a ``_ResumeIteration`` marker only
    decrements ``self._resume_worker_cnt``. ``self._rcvd_idx`` is incremented
    after every packing attempt, whether it succeeded or not.

    Args:
        legacy_expected_place: place forwarded to ``_set_expected_place`` so
            this thread uses the same place as the thread that started it.
    """
    # NOTE(zhiqiu): Set the expected place for new thread as the same as father
    # thread, and it will call platform::SetDeviceId() in c++ internally.
    # If we do not set cudaDeviceId in new thread, the default cudaDeviceId
    # will be 0, which may cost hundreds of MB of GPU memory on CUDAPlace(0)
    # if calling some cuda APIs in this thread.
    _set_expected_place(legacy_expected_place)

    while not self._thread_done_event.is_set():
        batch = self._get_data()

        # The event may have been set while waiting for data; let the loop
        # condition decide whether to stop.
        if self._thread_done_event.is_set():
            continue

        if batch is None:
            self._exit_thread_expectedly()
            continue

        if isinstance(batch, _ResumeIteration):
            assert self._resume_worker_cnt > 0
            self._resume_worker_cnt -= 1
            continue

        try:
            # pack the batch as a LoDTensorArray
            tensor_array = core.LoDTensorArray()
            if self._use_shared_memory:
                for shared_tensor in batch:
                    tensor_array.append(shared_tensor)
            else:
                # LoDTensor not in shared memory is not serializable and
                # cannot be created in workers, so convert the fields here.
                for item in batch:
                    if isinstance(item, (paddle.Tensor, core.eager.Tensor)):
                        item = item.value().get_tensor()
                    elif not isinstance(item, core.LoDTensor):
                        lod_tensor = core.LoDTensor()
                        lod_tensor.set(item, core.CPUPlace())
                        item = lod_tensor
                    tensor_array.append(item)

            pushed = self._blocking_queue.push(tensor_array)
            if not pushed:
                self._blocking_queue.close()
        except Exception:
            self._exit_thread_unexpectedly()
            six.reraise(*sys.exc_info())
        finally:
            self._rcvd_idx += 1
def _thread_loop(self, legacy_expected_place):
    """
    Reader-thread body: iterate the sampler and feed the blocking queue.

    For every index group yielded by ``self._sampler_iter`` the batch is
    fetched, flattened (its structure is appended to
    ``self._structure_infos``), packed into a ``core.LoDTensorArray`` and
    pushed onto ``self._blocking_queue``. The loop ends when the sampler is
    exhausted or when a push returns a falsy value; the queue is then closed
    and ``self._thread`` is reset to ``None``.

    Args:
        legacy_expected_place: place forwarded to ``_set_expected_place`` so
            this thread uses the same place as the thread that started it.
    """
    try:
        # NOTE(zhiqiu): Set the expected place for new thread as the same as
        # father thread, and it will call platform::SetDeviceId() in c++
        # internally. If we do not set cudaDeviceId in new thread, the default
        # cudaDeviceId will be 0, which may cost hundreds of MB of GPU memory
        # on CUDAPlace(0) if calling some cuda APIs in this thread.
        _set_expected_place(legacy_expected_place)

        for indices in self._sampler_iter:
            # read one mini-batch, then flatten it and record its structure
            fetched = self._dataset_fetcher.fetch(indices)
            flat_batch, structure = _flatten_batch(fetched)
            self._structure_infos.append(structure)

            # pack the fields as a LoDTensorArray
            tensor_array = core.LoDTensorArray()
            for item in flat_batch:
                if isinstance(item, core.LoDTensor):
                    tensor_array.append(item)
                    continue
                lod_tensor = core.LoDTensor()
                lod_tensor.set(item, core.CPUPlace())
                tensor_array.append(lod_tensor)

            if not self._blocking_queue.push(tensor_array):
                break

        self._blocking_queue.close()
        self._thread = None
    except StopIteration:
        self._blocking_queue.close()
    except Exception:
        # kill the queue, then propagate the original exception
        self._blocking_queue.kill()
        self._thread = None
        logging.warning("DataLoader reader thread raised an exception.")
        six.reraise(*sys.exc_info())
def init_parallel_env():
    """
    Set up the parallel training environment for dynamic graph mode.

    .. note::
        Only the GPU parallel training environment is supported for now, and
        NCCL is used for communication.

    The function checks that Paddle is compiled with CUDA and that the
    required environment variables are set, fills a ``ParallelStrategy`` from
    ``ParallelEnv()`` and, when there are at least two ranks, sets the
    expected place to the selected CUDA device and initializes the NCCL
    parallel context.

    Returns:
        None

    Raises:
        NotImplementedError: If Paddle is not compiled with CUDA.
        ValueError: If a required environment variable is not set.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. enable dynamic mode
                paddle.disable_static()

                # 2. initialize parallel environment
                dist.init_parallel_env()

                # 3. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 4. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss = dp_layer.scale_loss(loss)
                loss.backward()
                dp_layer.apply_collective_grads()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """
    # 1. gpu check
    if not core.is_compiled_with_cuda():
        raise NotImplementedError(
            "Cannot initialize parallel environment in CPU-only version, now only "
            "supports initializing the GPU parallel environment. Please recompile "
            "or reinstall paddle with GPU support.")

    # 2. check env: every variable below must be set (checked in this order)
    required_env_vars = (
        "FLAGS_selected_gpus",
        "PADDLE_TRAINER_ID",
        "PADDLE_CURRENT_ENDPOINT",
        "PADDLE_TRAINERS_NUM",
        "PADDLE_TRAINER_ENDPOINTS",
    )
    for env_name in required_env_vars:
        if os.environ.get(env_name, None) is None:
            raise ValueError("paddle.distributed initialize error, "
                             "environment variable %s is needed, but not set." %
                             env_name)

    # 3. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = ParallelEnv().world_size
    strategy.local_rank = ParallelEnv().rank
    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
    strategy.current_endpoint = ParallelEnv().current_endpoint

    # nothing to initialize when fewer than two ranks take part
    if strategy.nranks < 2:
        return

    # NOTE(chenweihang): [ why config global place here? ]
    # the dygraph mode will be set to default mode,
    # users will not call `dygraph.guard` or `enable_dygraph`
    # directly, if they want to switch default place,
    # they need to call a function to change default place,
    # here just set correctly place to users
    cuda_place = core.CUDAPlace(ParallelEnv().device_id)
    _set_expected_place(cuda_place)

    # init nccl context
    nccl_context = core.NCCLParallelContext(strategy, cuda_place)
    parallel_helper._set_parallel_ctx(nccl_context)
    parallel_helper._init_parallel_ctx()
def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.

    .. note::
        Now initialize both `NCCL` and `GLOO` contexts for communication.

    This function takes no arguments. The communication backend is read from
    the environment variable ``PADDLE_DISTRI_BACKEND`` (default ``'auto'``).
    When it is ``'auto'`` and the CPU-only path is not selected, the backend
    is chosen from what Paddle is compiled with, in this order: ``'nccl'``
    (cuda), ``'bkcl'`` (xpu), ``'hccl'`` (npu), ``'cncl'`` (mlu).

    Returns:
        ``None`` (after a warning) when the world size is smaller than 2.
        When the backend is in ``_valid_backend_list`` and dygraph mode is
        active, the default process group: the one already registered under
        ``_default_group_name`` if present, otherwise a newly created
        ``Group``. On the remaining (parallel-context) path the returned
        value is ``None``.

    Raises:
        NotImplementedError: If the CPU-only path is not selected and Paddle
            is not compiled with cuda, xpu, npu or mlu.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss.backward()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """

    # 0. get env & check world size
    global _global_parallel_env
    # when call init_parallel_env, need update `_global_parallel_env`
    _global_parallel_env = ParallelEnv()
    parallel_env = _global_parallel_env
    # if not parallel, `init_parallel_env` do nothing
    if parallel_env.world_size < 2:
        warnings.warn(
            "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
        )
        return
    # NOTE(xiongkun): support cpu gloo only, add this environment variable to
    #                 enable cpu only gloo parallel training
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. device check: unless running cpu-only, paddle must be compiled with
    #    one of cuda / xpu / npu / mlu
    if not (is_cpu_only or core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()):
        raise NotImplementedError(
            "If you want to use CPU-only version, please use 'gloo' as backend"
        )

    # 2. check env; 'auto' is resolved to the backend of the compiled device
    if not is_cpu_only and core.is_compiled_with_cuda():
        _check_var_exists("FLAGS_selected_gpus")
        backend = "nccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_xpu():
        _check_var_exists('FLAGS_selected_xpus')
        backend = "bkcl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_npu():
        _check_var_exists('FLAGS_selected_npus')
        backend = "hccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_mlu():
        _check_var_exists('FLAGS_selected_mlus')
        backend = "cncl" if backend == "auto" else backend

    _check_var_exists("PADDLE_TRAINER_ID")
    _check_var_exists("PADDLE_CURRENT_ENDPOINT")
    _check_var_exists("PADDLE_TRAINERS_NUM")
    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")

    # NOTE(chenweihang): [ why config global place here? ]
    # the dygraph mode will be set to default mode,
    # users will not call `dygraph.guard` or `enable_dygraph`
    # directly, if they want to switch default place,
    # they need to call a function to change default place,
    # here just set correctly place to users
    if is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():
        place = core.CUDAPlace(parallel_env.device_id)
    elif core.is_compiled_with_xpu():
        place = core.XPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_npu():
        place = core.NPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_mlu():
        place = core.MLUPlace(parallel_env.device_id)

    _set_expected_place(place)

    group = None
    if backend in _valid_backend_list and in_dygraph_mode():
        # reuse the default group when it has already been created
        if _default_group_name in _get_group_map_by_name():
            return _get_group_map_by_name()[_default_group_name]
        _set_default_backend(backend)
        rank = int(os.getenv("PADDLE_TRAINER_ID"))
        world_size = int(os.getenv("PADDLE_TRAINERS_NUM"))
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group.")
        # The master endpoint is resolved in this order:
        # MASTER_ADDR + MASTER_PORT, then PADDLE_MASTER, then the first entry
        # of PADDLE_TRAINER_ENDPOINTS.
        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = ":".join([master_addr, master_port
                              ]) if master_addr and master_port else None
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:
            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0]
        assert endpoints, (
            "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
            "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
            "and 'export MASTER_PORT=54612'. Or you can start your training "
            "with paddle.distributed.run module.")
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(master_addr,
                                      master_port,
                                      is_master,
                                      world_size,
                                      stop_check_timeout=stop_check_timeout)
        _set_default_store(default_store)
        pg = _new_process_group_impl(backend,
                                     default_store,
                                     rank,
                                     world_size,
                                     _default_group_name,
                                     pg_options=None)
        ranks = list(range(world_size))
        group = Group(rank,
                      world_size,
                      id=0,
                      ranks=ranks,
                      pg=pg,
                      name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)
        _set_group_map(0, group)
        parallel_helper._set_parallel_ctx(True)

        paddle.distributed.barrier(group=group)
        return group

    # distinct host parts of the trainer endpoints
    node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
    # 3: init gloo context (step 1: http server start)
    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
    if is_cpu_only or init_gloo or backend == "heter":
        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
        manager = Manager()
        # global dict to store status
        http_server_d = manager.dict()
        http_server_d["running"] = False
        if parallel_env.rank == 0:
            # The scope for worker used by http server is '_worker'
            size = {'_worker': parallel_env.world_size}
            if backend == "heter":
                size = {'_worker': len(node_num)}
            http_server = Process(target=_start_kv_server,
                                  args=(int(ep_rank_0[1]), http_server_d, size))
            http_server.daemon = True
            http_server_d["running"] = True
            http_server.start()

    # 4. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = parallel_env.world_size
    strategy.local_rank = parallel_env.rank
    strategy.trainer_endpoints = parallel_env.trainer_endpoints
    strategy.current_endpoint = parallel_env.current_endpoint
    strategy.nrings = parallel_env.nrings

    # init gloo or heter or nccl or bkcl or hccl or cncl context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place))
    elif (backend == "heter"):
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id))
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place))
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place))
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place))
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place))

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
        other_endpoints.remove(strategy.current_endpoint)
        if not is_cpu_only and strategy.local_rank == 0:
            wait_server_ready(other_endpoints)

    parallel_helper._init_parallel_ctx()

    # 5: init gloo context (step 2: gloo init)
    # dividing init_gloo into two part because nccl and gloo
    # are separately looking for free ports which sometimes
    # leads to port-conflict.
    if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
        # compare to init_gloo, we don't need to
        # init gloo, because we do this in _init_parallel_ctx;
        http_server_d["running"] = False
        http_server.join()

    elif init_gloo:
        wait_server_ready([parallel_env.trainer_endpoints[0]])
        gloo_strategy = core.GlooParallelStrategy()
        gloo_strategy.rank = parallel_env.rank
        gloo_strategy.rank_num = parallel_env.world_size
        gloo_strategy.ip_address = ep_rank_0[0]
        gloo_strategy.ip_port = int(ep_rank_0[1])
        default_init_timeout_seconds = 3600
        default_run_timeout_seconds = 9999999
        gloo_strategy.init_seconds = default_init_timeout_seconds
        gloo_strategy.run_seconds = default_run_timeout_seconds
        gloo = core.GlooParallelContext(gloo_strategy)
        gloo.init()
        if parallel_env.rank == 0:
            http_server_d["running"] = False
            http_server.join()
    return group
def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.

    .. note::
        Now initialize both `NCCL` and `GLOO` contexts for communication.

    When the world size is smaller than 2 the function only emits a warning
    and returns. Otherwise it checks that Paddle is compiled with CUDA and
    that the required environment variables are set, starts the key-value
    http server on rank 0, initializes the NCCL parallel context on the
    selected CUDA device and finally initializes the gloo context.

    Returns:
        None

    Raises:
        NotImplementedError: If Paddle is not compiled with CUDA.
        ValueError: If a required environment variable is not set.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss.backward()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """

    # 0. get env & check world size
    global _global_parallel_env
    # when call init_parallel_env, need update `_global_parallel_env`
    _global_parallel_env = ParallelEnv()
    parallel_env = _global_parallel_env
    # if not parallel, `init_parallel_env` do nothing
    if parallel_env.world_size < 2:
        warnings.warn(
            "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
        )
        return

    # 1. gpu check
    if not core.is_compiled_with_cuda():
        raise NotImplementedError(
            "Cannot initialize parallel environment in CPU-only version, now only "
            "supports initializing the GPU parallel environment. Please recompile "
            "or reinstall paddle with GPU support.")

    # 2. check env: every variable below must be set (checked in this order)
    for var_name in ("FLAGS_selected_gpus", "PADDLE_TRAINER_ID",
                     "PADDLE_CURRENT_ENDPOINT", "PADDLE_TRAINERS_NUM",
                     "PADDLE_TRAINER_ENDPOINTS"):
        if os.environ.get(var_name) is None:
            raise ValueError("paddle.distributed initialize error, "
                             "environment variable %s is needed, but not set." %
                             var_name)

    # 3: init gloo context (step 1: http server start)
    # Only the endpoint of rank 0 is needed: it hosts the key-value server.
    ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
    manager = Manager()
    # global dict to store status
    http_server_d = manager.dict()
    http_server_d["running"] = False
    if parallel_env.rank == 0:
        # The scope for worker used by http server is '_worker'
        size = {'_worker': parallel_env.world_size}
        http_server = Process(
            target=_start_kv_server,
            args=(int(ep_rank_0[1]), http_server_d, size))
        http_server.daemon = True
        http_server_d["running"] = True
        http_server.start()

    # 4. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = parallel_env.world_size
    strategy.local_rank = parallel_env.rank
    strategy.trainer_endpoints = parallel_env.trainer_endpoints
    strategy.current_endpoint = parallel_env.current_endpoint

    # NOTE(chenweihang): [ why config global place here? ]
    # the dygraph mode will be set to default mode,
    # users will not call `dygraph.guard` or `enable_dygraph`
    # directly, if they want to switch default place,
    # they need to call a function to change default place,
    # here just set correctly place to users
    place = core.CUDAPlace(parallel_env.device_id)
    _set_expected_place(place)

    # init nccl context
    parallel_helper._set_parallel_ctx(core.NCCLParallelContext(strategy, place))
    parallel_helper._init_parallel_ctx()

    # 5: init gloo context (step 2: gloo init)
    # dividing init_gloo into two part because nccl and gloo
    # are separately looking for free ports which sometimes
    # leads to port-conflict.
    wait_server_ready([parallel_env.trainer_endpoints[0]])

    gloo_strategy = core.GlooParallelStrategy()
    gloo_strategy.rank = parallel_env.rank
    gloo_strategy.rank_num = parallel_env.world_size
    gloo_strategy.ip_address = ep_rank_0[0]
    gloo_strategy.ip_port = int(ep_rank_0[1])
    default_init_timeout_seconds = 3600
    default_run_timeout_seconds = 9999999
    gloo_strategy.init_seconds = default_init_timeout_seconds
    gloo_strategy.run_seconds = default_run_timeout_seconds
    gloo = core.GlooParallelContext(gloo_strategy)
    gloo.init()
    if parallel_env.rank == 0:
        # gloo is initialized; the key-value server is no longer needed
        http_server_d["running"] = False
        http_server.join()
def set_device(device):
    """
    Paddle supports running calculations on various types of devices, including CPU, GPU and XPU.
    They are represented by string identifiers. This function can specify the global device
    which the OP will run.

    Parameters:
        device(str): This parameter determines the specific running device.
            It can be ``cpu``, ``gpu``, ``xpu``, ``gpu:x`` and ``xpu:x``,
            where ``x`` is the index of the GPUs or XPUs. Matching is
            case-insensitive. Plain ``gpu`` uses ``ParallelEnv().dev_id``;
            plain ``xpu`` uses the first id listed in the environment
            variable ``FLAGS_selected_xpus`` (``0`` when it is not set).

    Returns:
        The place object (``core.CPUPlace``, ``core.CUDAPlace`` or
        ``core.XPUPlace``) that was passed to
        ``framework._set_expected_place``.

    Raises:
        ValueError: If ``device`` is not one of the accepted forms, or if a
            GPU/XPU device is requested while PaddlePaddle is not compiled
            with CUDA/XPU respectively.

    Examples:

     .. code-block:: python

        import paddle

        paddle.set_device("cpu")
        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
        data = paddle.stack([x1,x2], axis=1)
    """
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be 'gpu', "
                "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    elif lower_device == 'xpu':
        if not core.is_compiled_with_xpu():
            raise ValueError(
                "The device should not be 'xpu', "
                "since PaddlePaddle is not compiled with XPU")
        selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
        device_id = int(selected_xpus[0])
        place = core.XPUPlace(device_id)
    else:
        # The pattern is anchored at the end so that malformed identifiers
        # such as 'gpu:0abc' are rejected here with the descriptive message
        # instead of failing later inside int().
        indexed_device = re.match(r'(gpu|xpu):(\d+)$', lower_device)
        if not indexed_device:
            raise ValueError(
                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu' or 'xpu:x'"
            )
        device_kind = indexed_device.group(1)
        device_id = int(indexed_device.group(2))
        if device_kind == 'gpu':
            if not core.is_compiled_with_cuda():
                # Report the device string the caller passed, not the Match
                # object.
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with CUDA".format(device))
            place = core.CUDAPlace(device_id)
        else:
            if not core.is_compiled_with_xpu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with XPU".format(device))
            place = core.XPUPlace(device_id)
    framework._set_expected_place(place)
    return place