def get_places(self): places = [core.CPUPlace()] if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) if core.is_compiled_with_xpu(): places.append(core.XPUPlace(0)) return places
def XPUPlace(dev_id): """ Return a Baidu Kunlun Place Parameters: dev_id(int): Baidu Kunlun device id Examples: .. code-block:: python import paddle place = paddle.device.XPUPlace(0) """ return core.XPUPlace(dev_id)
def test_case(self): x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") dim = fluid.data(name="dim", shape=[1], dtype="int32") shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") scale_tensor = fluid.data(name="scale_tensor", shape=[1], dtype="float32") out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12]) out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim]) out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor) out4 = fluid.layers.resize_bilinear(x, out_shape=[4, 4], actual_shape=actual_size) out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor) x_data = np.random.random((2, 3, 6, 6)).astype("float32") dim_data = np.array([12]).astype("int32") shape_data = np.array([12, 12]).astype("int32") actual_size_data = np.array([12, 12]).astype("int32") scale_data = np.array([2.0]).astype("float32") place = core.XPUPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) results = exe.run(fluid.default_main_program(), feed={ "x": x_data, "dim": dim_data, "shape_tensor": shape_data, "actual_size": actual_size_data, "scale_tensor": scale_data }, fetch_list=[out1, out2, out3, out4, out5], return_numpy=True) expect_res = bilinear_interp_np(x_data, out_h=12, out_w=12, align_corners=True) for res in results: self.assertTrue(np.allclose(res, expect_res))
def _convert_to_place(device): lower_device = device.lower() if lower_device == 'cpu': place = core.CPUPlace() elif lower_device == 'gpu': if not core.is_compiled_with_cuda(): raise ValueError("The device should not be 'gpu', " "since PaddlePaddle is not compiled with CUDA") place = core.CUDAPlace(ParallelEnv().dev_id) elif lower_device == 'xpu': if not core.is_compiled_with_xpu(): raise ValueError("The device should not be 'xpu', " "since PaddlePaddle is not compiled with XPU") selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") device_id = int(selected_xpus[0]) place = core.XPUPlace(device_id) elif lower_device == 'npu': if not core.is_compiled_with_npu(): raise ValueError("The device should not be 'npu', " "since PaddlePaddle is not compiled with NPU") selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") device_id = int(selected_npus[0]) place = core.NPUPlace(device_id) elif lower_device == 'ipu': if not core.is_compiled_with_ipu(): raise ValueError( "The device should not be 'ipu', " \ "since PaddlePaddle is not compiled with IPU") place = core.IPUPlace() elif lower_device == 'mlu': if not core.is_compiled_with_mlu(): raise ValueError("The device should not be 'mlu', " "since PaddlePaddle is not compiled with MLU") selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") device_id = int(selected_mlus[0]) place = core.MLUPlace(device_id) elif device in core.get_all_custom_device_type(): place = core.CustomPlace(device, 0) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) avaliable_mlu_device = re.match(r'mlu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device: device_info_list = device.split(':', 1) device_type = device_info_list[0] if device_type in core.get_all_custom_device_type(): device_id = device_info_list[1] device_id = int(device_id) place = core.CustomPlace(device_type, device_id) else: raise ValueError( "The device must be a string which is like 'cpu', {}". format(', '.join("'{}', '{}:x'".format(x, x) for x in ['gpu', 'xpu', 'npu', 'mlu'] + core.get_all_custom_device_type()))) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): raise ValueError( "The device should not be {}, since PaddlePaddle is " "not compiled with CUDA".format(avaliable_gpu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.CUDAPlace(device_id) if avaliable_xpu_device: if not core.is_compiled_with_xpu(): raise ValueError( "The device should not be {}, since PaddlePaddle is " "not compiled with XPU".format(avaliable_xpu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.XPUPlace(device_id) if avaliable_npu_device: if not core.is_compiled_with_npu(): raise ValueError( "The device should not be {}, since PaddlePaddle is " "not compiled with NPU".format(avaliable_npu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.NPUPlace(device_id) if avaliable_mlu_device: if not core.is_compiled_with_mlu(): raise ValueError( "The device should not be {}, since PaddlePaddle is " "not compiled with mlu".format(avaliable_mlu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.MLUPlace(device_id) return place
def init_parallel_env(): """ Initialize parallel training environment in dynamic graph mode. .. note:: Now initialize both `NCCL` and `GLOO` contexts for communication. Args: backend (string): A string represents the backend used by DataParallel, should be one of 'gloo'(for cpu), 'nccl'(for cuda), 'bkcl'(for xpu), 'auto'(auto detect). The auto detection prefer 'nccl', 'bkcl' than 'gloo'. Returns: None Examples: .. code-block:: python # required: gpu import paddle import paddle.nn as nn import paddle.optimizer as opt import paddle.distributed as dist class LinearNet(nn.Layer): def __init__(self): super(LinearNet, self).__init__() self._linear1 = nn.Linear(10, 10) self._linear2 = nn.Linear(10, 1) def forward(self, x): return self._linear2(self._linear1(x)) def train(): # 1. initialize parallel environment dist.init_parallel_env() # 2. create data parallel layer & optimizer layer = LinearNet() dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() adam = opt.Adam( learning_rate=0.001, parameters=dp_layer.parameters()) # 3. run layer inputs = paddle.randn([10, 10], 'float32') outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) loss.backward() adam.step() adam.clear_grad() if __name__ == '__main__': dist.spawn(train) """ # 0. get env & check world size global _global_parallel_env # when call init_parallel_env, need update `_global_parallel_env` _global_parallel_env = ParallelEnv() parallel_env = _global_parallel_env # if not parallel, `init_parallel_env` do nothing if parallel_env.world_size < 2: warnings.warn( "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything." ) return # NOTE(xiongkun): support cpu gloo only, add this environment variable to # enable cpu only gloo prarllel training) backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto') is_cpu_only = _is_cpuonly(backend) # 1. gpu xpu check, must be gpu or xpu, if not (is_cpu_only or core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or core.is_compiled_with_npu() or core.is_compiled_with_mlu()): raise NotImplementedError( "If you want to use CPU-only version, please use 'gloo' as backend" ) if not is_cpu_only and core.is_compiled_with_cuda(): _check_var_exists("FLAGS_selected_gpus") backend = "nccl" if backend == "auto" else backend elif not is_cpu_only and core.is_compiled_with_xpu(): _check_var_exists('FLAGS_selected_xpus') backend = "bkcl" if backend == "auto" else backend elif not is_cpu_only and core.is_compiled_with_npu(): _check_var_exists('FLAGS_selected_npus') backend = "hccl" if backend == "auto" else backend elif not is_cpu_only and core.is_compiled_with_mlu(): _check_var_exists('FLAGS_selected_mlus') backend = "cncl" if backend == "auto" else backend _check_var_exists("PADDLE_TRAINER_ID") _check_var_exists("PADDLE_CURRENT_ENDPOINT") _check_var_exists("PADDLE_TRAINERS_NUM") _check_var_exists("PADDLE_TRAINER_ENDPOINTS") # NOTE(chenweihang): [ why config global place here? ] # the dygraph mode will be set to default mode, # users will not call `dygraph.guard` or `enable_dygraph` # directly, if they want to switch default place, # they need to call a function to change default place, # here just set correctly place to users if is_cpu_only: place = core.CPUPlace() elif core.is_compiled_with_cuda(): place = core.CUDAPlace(parallel_env.device_id) elif core.is_compiled_with_xpu(): place = core.XPUPlace(parallel_env.device_id) elif core.is_compiled_with_npu(): place = core.NPUPlace(parallel_env.device_id) elif core.is_compiled_with_mlu(): place = core.MLUPlace(parallel_env.device_id) _set_expected_place(place) group = None if backend in _valid_backend_list and in_dygraph_mode(): if _default_group_name in _get_group_map_by_name(): return _get_group_map_by_name()[_default_group_name] _set_default_backend(backend) rank = int(os.getenv("PADDLE_TRAINER_ID")) world_size = int(os.getenv("PADDLE_TRAINERS_NUM")) assert rank >= 0 and world_size > rank and world_size > 1, ( "rank must be non-negative and world_size must be the " "maximum rank plus one. Moreover, at least two processes are " "required to create a process group.") master_addr = os.getenv("MASTER_ADDR", None) master_port = os.getenv("MASTER_PORT", None) endpoints = ":".join([master_addr, master_port ]) if master_addr and master_port else None if endpoints is None: endpoints = os.getenv("PADDLE_MASTER", None) if endpoints is None: endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0] assert endpoints, ( "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' " "must be specified, for example 'export MASTER_ADDR=127.0.0.1' " "and 'export MASTER_ADDR=54612'. Or you can start your training" "with paddle.distributed.run module.") master_addr, master_port = endpoints.split(":") master_port = int(master_port) is_master = rank == 0 stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900")) default_store = core.TCPStore(master_addr, master_port, is_master, world_size, stop_check_timeout=stop_check_timeout) _set_default_store(default_store) pg = _new_process_group_impl(backend, default_store, rank, world_size, _default_group_name, pg_options=None) ranks = list(range(world_size)) group = Group(rank, world_size, id=0, ranks=ranks, pg=pg, name=_default_group_name) _set_group_map_by_name(_default_group_name, group) _set_group_map(0, group) parallel_helper._set_parallel_ctx(True) paddle.distributed.barrier(group=group) return group node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints]) # 3: init gloo context (step 1: httpsever start) init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0")) if is_cpu_only or init_gloo or backend == "heter": ep_rank_0 = parallel_env.trainer_endpoints[0].split(":") manager = Manager() # glboal dict to store status http_server_d = manager.dict() http_server_d["running"] = False if parallel_env.rank == 0: # The scope for worker used by http server is '_worker' size = {'_worker': parallel_env.world_size} if backend == "heter": size = {'_worker': len(node_num)} http_server = Process(target=_start_kv_server, args=(int(ep_rank_0[1]), http_server_d, size)) http_server.daemon = True http_server_d["running"] = True http_server.start() # 4. init NCCL ParallelStrategy strategy = ParallelStrategy() if parallel_helper._is_parallel_ctx_initialized(): warnings.warn("The parallel environment has been initialized.") strategy.nranks = parallel_env.world_size strategy.local_rank = parallel_env.rank strategy.trainer_endpoints = parallel_env.trainer_endpoints strategy.current_endpoint = parallel_env.current_endpoint strategy.nrings = parallel_env.nrings # init nccl or hccl or bkcl or heter context if is_cpu_only: parallel_helper._set_parallel_ctx( core.GLOOParallelContext(strategy, place)) elif (backend == "heter"): parallel_helper._set_parallel_ctx( core.HeterParallelContext(strategy, parallel_env.device_id)) elif core.is_compiled_with_cuda(): parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) elif core.is_compiled_with_xpu(): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place)) elif core.is_compiled_with_npu(): parallel_helper._set_parallel_ctx( core.HCCLParallelContext(strategy, place)) elif core.is_compiled_with_mlu(): parallel_helper._set_parallel_ctx( core.CNCLParallelContext(strategy, place)) if backend != "heter": other_endpoints = strategy.trainer_endpoints[:] other_endpoints.remove(strategy.current_endpoint) if not is_cpu_only and strategy.local_rank == 0: wait_server_ready(other_endpoints) parallel_helper._init_parallel_ctx() # 5: init gloo context (step 2: gloo init) # dividing init_gloo into two part beacause nccl and gloo # are separately looking for free ports which sometimes # leads to port-conflict. if (is_cpu_only or backend == "heter") and parallel_env.rank == 0: # compare to init_gloo, we don't need to # init gloo, because we do this in _init_parallel_ctx; http_server_d["running"] = False http_server.join() elif init_gloo: wait_server_ready([parallel_env.trainer_endpoints[0]]) gloo_strategy = core.GlooParallelStrategy() gloo_strategy.rank = parallel_env.rank gloo_strategy.rank_num = parallel_env.world_size gloo_strategy.ip_address = ep_rank_0[0] gloo_strategy.ip_port = int(ep_rank_0[1]) default_init_timeout_seconds = 3600 default_run_timeout_seconds = 9999999 gloo_strategy.init_seconds = default_init_timeout_seconds gloo_strategy.run_seconds = default_run_timeout_seconds gloo = core.GlooParallelContext(gloo_strategy) gloo.init() if parallel_env.rank == 0: http_server_d["running"] = False http_server.join() return group
def test_check_grad_ingore_y(self): place = core.XPUPlace(0) self.check_grad_with_place(place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
def test_check_grad_normal(self): place = core.XPUPlace(0) self.check_grad_with_place(place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_output(self): place = core.XPUPlace(0) self.check_output_with_place(place, atol=2e-1)
def set_device(device): """ Paddle supports running calculations on various types of devices, including CPU, GPU and XPU. They are represented by string identifiers. This function can specify the global device which the OP will run. Parameters: device(str): This parameter determines the specific running device. It can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the index of the GPUs or XPUs. Examples: .. code-block:: python import paddle paddle.set_device("cpu") x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32') x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32') data = paddle.stack([x1,x2], axis=1) """ lower_device = device.lower() if lower_device == 'cpu': place = core.CPUPlace() elif lower_device == 'gpu': if not core.is_compiled_with_cuda(): raise ValueError( "The device should not be 'gpu', " \ "since PaddlePaddle is not compiled with CUDA") place = core.CUDAPlace(ParallelEnv().dev_id) elif lower_device == 'xpu': if not core.is_compiled_with_xpu(): raise ValueError( "The device should not be 'xpu', " \ "since PaddlePaddle is not compiled with XPU") place = core.XPUPlace(ParallelEnv().dev_id) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device: raise ValueError( "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu' or 'xpu:x'" ) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): raise ValueError( "The device should not be {}, since PaddlePaddle is " \ "not compiled with CUDA".format(avaliable_gpu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.CUDAPlace(device_id) if avaliable_xpu_device: if not core.is_compiled_with_xpu(): raise ValueError( "The device should not be {}, since PaddlePaddle is " \ "not compiled with XPU".format(avaliable_xpu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.XPUPlace(device_id) framework._set_expected_place(place) return place
def test_check_grad(self): place = core.XPUPlace(0) self.check_grad_with_place(place, ["X"], "Out", max_relative_error=0.05)
def test_check_grad(self): if self.has_xpu(): place = core.XPUPlace(0) self.check_grad_with_place(place, set(['X']), 'Out') return
def test_check_output(self): if self.has_xpu(): place = core.XPUPlace(0) self.check_output_with_place(place) return
def init_parallel_env(): """ Initialize parallel training environment in dynamic graph mode. .. note:: Now initialize both `NCCL` and `GLOO` contexts for communication. Returns: None Examples: .. code-block:: python import paddle import paddle.nn as nn import paddle.optimizer as opt import paddle.distributed as dist class LinearNet(nn.Layer): def __init__(self): super(LinearNet, self).__init__() self._linear1 = nn.Linear(10, 10) self._linear2 = nn.Linear(10, 1) def forward(self, x): return self._linear2(self._linear1(x)) def train(): # 1. initialize parallel environment dist.init_parallel_env() # 2. create data parallel layer & optimizer layer = LinearNet() dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() adam = opt.Adam( learning_rate=0.001, parameters=dp_layer.parameters()) # 3. run layer inputs = paddle.randn([10, 10], 'float32') outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) loss.backward() adam.step() adam.clear_grad() if __name__ == '__main__': dist.spawn(train) """ # 0. get env & check world size global _global_parallel_env # when call init_parallel_env, need update `_global_parallel_env` _global_parallel_env = ParallelEnv() parallel_env = _global_parallel_env # if not parallel, `init_parallel_env` do nothing if parallel_env.world_size < 2: warnings.warn( "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything." ) return # 1. gpu xpu check, must be gpu or xpu if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): raise NotImplementedError( "Cannot initialize parallel environment in CPU-only version, now only " "supports initializing the GPU and XPU parallel environment. Please recompile " "or reinstall paddle with GPU or XPU support.") # 2. check env def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: raise ValueError( "paddle.distributed initialize error, " "environment variable %s is needed, but not set." % var_name) if core.is_compiled_with_cuda(): _check_var_exists("FLAGS_selected_gpus") elif core.is_compiled_with_xpu(): _check_var_exists('FLAGS_selected_xpus') _check_var_exists("PADDLE_TRAINER_ID") _check_var_exists("PADDLE_CURRENT_ENDPOINT") _check_var_exists("PADDLE_TRAINERS_NUM") _check_var_exists("PADDLE_TRAINER_ENDPOINTS") # 3: init gloo context (step 1: httpsever start) init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0")) if init_gloo: ep_rank_0 = parallel_env.trainer_endpoints[0].split(":") ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":") manager = Manager() # glboal dict to store status http_server_d = manager.dict() http_server_d["running"] = False if parallel_env.rank == 0: # The scope for worker used by http server is '_worker' size = {'_worker': parallel_env.world_size} http_server = Process(target=_start_kv_server, args=(int(ep_rank_0[1]), http_server_d, size)) http_server.daemon = True http_server_d["running"] = True http_server.start() # 4. init NCCL ParallelStrategy strategy = ParallelStrategy() if parallel_helper._is_parallel_ctx_initialized(): warnings.warn("The parallel environment has been initialized.") strategy.nranks = parallel_env.world_size strategy.local_rank = parallel_env.rank strategy.trainer_endpoints = parallel_env.trainer_endpoints strategy.current_endpoint = parallel_env.current_endpoint strategy.nrings = parallel_env.nrings # NOTE(chenweihang): [ why config global place here? ] # the dygraph mode will be set to default mode, # users will not call `dygraph.guard` or `enable_dygraph` # directly, if they want to switch default place, # they need to call a function to change default place, # here just set correctly place to users if core.is_compiled_with_cuda(): place = core.CUDAPlace(parallel_env.device_id) elif core.is_compiled_with_xpu(): place = core.XPUPlace(parallel_env.device_id) _set_expected_place(place) # init nccl or bkcl context if core.is_compiled_with_cuda(): parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) elif core.is_compiled_with_xpu(): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place)) parallel_helper._init_parallel_ctx() # 5: init gloo context (step 2: gloo init) # dividing init_gloo into two part beacause nccl and gloo # are separately looking for free ports which sometimes # leads to port-conflict. if init_gloo: wait_server_ready([parallel_env.trainer_endpoints[0]]) gloo_strategy = core.GlooParallelStrategy() gloo_strategy.rank = parallel_env.rank gloo_strategy.rank_num = parallel_env.world_size gloo_strategy.ip_address = ep_rank_0[0] gloo_strategy.ip_port = int(ep_rank_0[1]) default_init_timeout_seconds = 3600 default_run_timeout_seconds = 9999999 gloo_strategy.init_seconds = default_init_timeout_seconds gloo_strategy.run_seconds = default_run_timeout_seconds gloo = core.GlooParallelContext(gloo_strategy) gloo.init() if parallel_env.rank == 0: http_server_d["running"] = False http_server.join()
def test_w_is_selected_rows(self): place = core.XPUPlace(0) # if core.is_float16_supported(place): for inplace in [True, False]: self.check_with_place(place, inplace)
def test_check_grad(self): place = core.XPUPlace(0) # if core.is_float16_supported(place): self.check_grad_with_place(place, ['x0'], 'Out', max_relative_error=0.15)
def test_check_output(self): place = core.XPUPlace(0) # if core.is_float16_supported(place): self.check_output_with_place(place, atol=2e-2)