Example #1
    def __init__(self):
        self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

        # imperative mode only supports a single gpu, xpu, npu or mlu card
        if core.is_compiled_with_cuda():
            selected_gpus = os.getenv("FLAGS_selected_gpus", "0").split(",")
            self._device_id = int(selected_gpus[0])
        elif core.is_compiled_with_xpu():
            selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
            self._device_id = int(selected_xpus[0])
        elif core.is_compiled_with_npu():
            selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
            self._device_id = int(selected_npus[0])
        elif core.is_compiled_with_mlu():
            selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
            self._device_id = int(selected_mlus[0])

        self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                            "").split(",")
        self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
        self._nrings = int(os.getenv("FLAGS_nccl_nrings", "1"))
        assert self._nrings > 0, \
            "nccl_nrings must be an integer greater than 0."
        assert self._nrings < 9, \
            "nccl_nrings should be less than 9, which is enough in most scenarios."
Example #2
 def test_check_grad(self):
     if self.need_check_grad:
         if core.is_compiled_with_xpu():
             paddle.enable_static()
             place = paddle.XPUPlace(0)
             self.check_grad_with_place(place,
                                        set(['Input', 'Filter']), 'Output')
Example #3
 def get_places(self):
     places = [core.CPUPlace()]
     if core.is_compiled_with_cuda():
         places.append(core.CUDAPlace(0))
     if core.is_compiled_with_xpu():
         places.append(core.XPUPlace(0))
     return places
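The place list returned by a helper like the one above is usually iterated so that one test body runs on every available device. Below is a minimal sketch under that assumption; the `get_places()` call assumes the helper above is in scope, and the small program built inside the loop is illustrative only:

import numpy as np
import paddle
from paddle.fluid import core

paddle.enable_static()
for place in get_places():  # assumes the helper above is available
    main = paddle.static.Program()
    startup = paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data(name='x', shape=[2, 3], dtype='float32')
        y = paddle.add(x, x)
    exe = paddle.static.Executor(place)
    exe.run(startup)
    out, = exe.run(main,
                   feed={'x': np.ones([2, 3], dtype='float32')},
                   fetch_list=[y])
    # the same numeric check is applied on CPU, CUDA and XPU places
    np.testing.assert_allclose(out, np.full([2, 3], 2.0, dtype='float32'))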
Example #4
    def set_strategy(cls, enable_inplace, enable_sequential_execution,
                     fuse_all_optimizer_ops, fuse_all_reduce_ops,
                     fuse_elewise_add_act_ops, fuse_relu_depthwise_conv,
                     use_fast_executor, use_ir_memory_optimize, use_reduce,
                     use_device):
        exec_strategy = fluid.ExecutionStrategy()
        if use_fast_executor:
            exec_strategy.use_experimental_executor = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
        build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
        build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
        build_strategy.memory_optimize = use_ir_memory_optimize
        build_strategy.enable_inplace = enable_inplace
        build_strategy.enable_sequential_execution = enable_sequential_execution

        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
            build_strategy.remove_unnecessary_lock = True
        if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
            build_strategy.fuse_elewise_add_act_ops = False
            build_strategy.fuse_relu_depthwise_conv = False
            build_strategy.fuse_all_optimizer_ops = False
            build_strategy.memory_optimize = False
            build_strategy.enable_inplace = False
            build_strategy.enable_sequential_execution = False

        return build_strategy, exec_strategy
Example #5
 def test_check_grad_stopgrad_dscale_dbias(self):
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         place = paddle.XPUPlace(0)
         self.check_grad_with_place(place, ['X'],
                                    'Out',
                                    no_grad_set=set(['Scale', 'Bias']))
Example #6
    def _compare_reduce_and_allreduce(self,
                                      model,
                                      use_device,
                                      delta1=1e-6,
                                      delta2=1e-4):
        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
            return

        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
            return

        img, label = init_data()

        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
            model,
            feed_dict={
                "image": img,
                "label": label
            },
            use_device=use_device,
            use_reduce=False)

        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
            model,
            feed_dict={
                "image": img,
                "label": label
            },
            use_device=use_device,
            use_reduce=True)

        for loss in zip(all_reduce_first_loss, reduce_first_loss):
            self.assertAlmostEqual(loss[0], loss[1], delta=delta1)
        for loss in zip(all_reduce_last_loss, reduce_last_loss):
            self.assertAlmostEqual(loss[0], loss[1], delta=delta2)
Example #7
    def _init_communicator(self, program, current_endpoint, endpoints, rank,
                           ring_id, wait_port):
        nranks = len(endpoints)
        other_endpoints = endpoints[:]
        other_endpoints.remove(current_endpoint)
        if rank == 0 and wait_port:
            wait_server_ready(other_endpoints)

        block = program.global_block()
        if core.is_compiled_with_cuda():
            comm_id_var = block.create_var(
                name=unique_name.generate('nccl_id'),
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            block.append_op(type='c_gen_nccl_id',
                            inputs={},
                            outputs={'Out': comm_id_var},
                            attrs={
                                'rank': rank,
                                'endpoint': current_endpoint,
                                'other_endpoints': other_endpoints,
                                OP_ROLE_KEY: OpRole.Forward
                            })
            block.append_op(type='c_comm_init',
                            inputs={'X': comm_id_var},
                            outputs={},
                            attrs={
                                'nranks': nranks,
                                'rank': rank,
                                'ring_id': ring_id,
                                OP_ROLE_KEY: OpRole.Forward
                            })
        elif core.is_compiled_with_xpu():
            comm_id_var = block.create_var(
                name=unique_name.generate('bkcl_id'),
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            block.append_op(type='c_gen_bkcl_id',
                            inputs={},
                            outputs={'Out': comm_id_var},
                            attrs={
                                'rank': rank,
                                'endpoint': current_endpoint,
                                'other_endpoints': other_endpoints,
                                OP_ROLE_KEY: OpRole.Forward
                            })
            block.append_op(type='c_comm_init',
                            inputs={'X': comm_id_var},
                            outputs={},
                            attrs={
                                'nranks': nranks,
                                'rank': rank,
                                'ring_id': ring_id,
                                OP_ROLE_KEY: OpRole.Forward
                            })
        else:
            raise ValueError(
                "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
            )
Example #8
 def test_check_grad(self):
     if (hasattr(self, "no_need_check_grad")
             and self.no_need_check_grad == True):
         return
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         self.check_grad_with_place(self.place, {'Input', 'Filter'},
                                    'Output')
Example #9
 def test_check_grad(self):
     # TODO(wangzhongpu): support mkldnn op in dygraph mode
     if self.dtype == np.float16:
         return
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         place = paddle.XPUPlace(0)
         self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output')
Example #10
 def test_check_grad(self):
     if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
                                     self.no_need_check_grad == True):
         return
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         place = paddle.XPUPlace(0)
         self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output')
Example #11
 def test_check_grad(self):
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         place = paddle.XPUPlace(0)
         self.check_grad_with_place(place,
                                    {'Input', 'Offset', 'Mask', 'Filter'},
                                    'Output',
                                    max_relative_error=0.06)
Example #12
 def test_check_grad_no_input(self):
     if (hasattr(self, "no_need_check_grad")
             and self.no_need_check_grad == True):
         return
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         self.check_grad_with_place(self.place, ['Filter'],
                                    'Output',
                                    no_grad_set=set(['Input']))
Example #13
 def test_check_grad(self):
     # TODO(wangzhongpu): support mkldnn op in dygraph mode
     if (hasattr(self, "no_need_check_grad")
             and self.no_need_check_grad == True):
         return
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         self.check_grad_with_place(self.place, {'Input', 'Filter'},
                                    'Output')
Example #14
 def test_xpu(self):
     if core.is_compiled_with_xpu():
         with fluid.dygraph.guard():
             out = paddle.to_tensor([1, 2])
             device = paddle.get_device()
             self.assertEqual(
                 isinstance(framework._current_expected_place(),
                            core.XPUPlace), True)
             self.assertTrue(out.place.is_xpu_place())
             self.assertEqual(device, "xpu:0")
Example #15
def _is_cpuonly(backend):
    check_backend(backend)
    if backend in [
            'auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'
    ] and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
           or core.is_compiled_with_npu() or core.is_compiled_with_mlu()):

        # the backend is 'auto' or a device backend and a CUDA/XPU/NPU/MLU build is
        # available, so use the default device logic and return False
        return False
    else:
        return True
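The same compile-time checks are commonly used at call sites to pick a matching communication backend. A minimal sketch under that assumption (the helper name `pick_backend` is hypothetical, not a Paddle API):

from paddle.fluid import core


def pick_backend():
    # 'nccl' for CUDA builds, 'bkcl' for XPU builds, otherwise the
    # CPU-only 'gloo' backend.
    if core.is_compiled_with_cuda():
        return 'nccl'
    if core.is_compiled_with_xpu():
        return 'bkcl'
    return 'gloo'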
Example #16
 def test_xpu_device(self):
     if core.is_compiled_with_xpu():
         out1 = paddle.zeros(shape=[1, 3], dtype='float32')
         out2 = paddle.ones(shape=[1, 3], dtype='float32')
         out3 = paddle.concat(x=[out1, out2], axis=0)
         paddle.set_device('xpu:0')
         exe = paddle.fluid.Executor()
         exe.run(paddle.fluid.default_startup_program())
         res = exe.run(fetch_list=[out3])
         device = paddle.get_device()
         self.assertEqual(isinstance(exe.place, core.XPUPlace), True)
         self.assertEqual(device, "xpu:0")
Example #17
def is_compiled_with_xpu():
    """
    Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun

    Returns (bool): whether paddle was built with WITH_XPU=ON

    Examples:
        .. code-block:: python

            import paddle
            support_xpu = paddle.device.is_compiled_with_xpu()
    """
    return core.is_compiled_with_xpu()
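A short sketch of the same check used through the public `paddle.device` API as a test guard, mirroring the `skipIf` pattern seen in the examples below; the test class and body are illustrative only:

import unittest

import paddle


@unittest.skipIf(not paddle.device.is_compiled_with_xpu(),
                 "core is not compiled with XPU")
class TestRunsOnXPU(unittest.TestCase):

    def test_matmul(self):
        # only reached when the wheel was built with WITH_XPU=ON
        paddle.set_device('xpu:0')
        x = paddle.ones([2, 2], dtype='float32')
        y = paddle.matmul(x, x)
        self.assertEqual(y.shape, [2, 2])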
Example #18
    def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
            return
        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
            return
        img, label = init_data()

        self.check_network_convergence(fc_with_batchnorm,
                                       feed_dict={
                                           "image": img,
                                           "label": label
                                       },
                                       use_device=use_device,
                                       use_fast_executor=use_fast_executor)
Example #19
def _set_trainer_env(env_dict):
    # NOTE(chenweihang): [ Why do we need to set FLAGS_selected_gpus or FLAGS_selected_xpus here? ]
    # When the child process starts, it inherits the configuration of the
    # main process and sets the FLAGS once, but the environment variables have
    # not been set yet at that point, so FLAGS_selected_gpus or FLAGS_selected_xpus
    # stays the same as in the main process (usually empty); therefore we update the flags manually here.
    if core.is_compiled_with_cuda():
        set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']})
    elif core.is_compiled_with_xpu():
        set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']})
    else:
        raise ValueError("PaddlePaddle should be compiled with XPU or CUDA.")
    for var_name in env_dict:
        os.environ[var_name] = env_dict[var_name]
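For reference, a hypothetical `env_dict` for a single XPU trainer; the keys mirror those built by `_prepare_trainer_env` in a later example, and the values are placeholders:

env_dict = {
    "FLAGS_selected_xpus": "0",
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
    "PADDLE_TRAINERS_NUM": "1",
    "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170",
}
# sets FLAGS_selected_xpus on an XPU build, then exports the remaining variables
_set_trainer_env(env_dict)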
Example #20
    def check_simple_fc_convergence(self, use_device, use_reduce=False):
        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
            return

        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
            return

        img, label = init_data()

        self.check_network_convergence(simple_fc_net,
                                       feed_dict={
                                           "image": img,
                                           "label": label
                                       },
                                       use_device=use_device,
                                       use_reduce=use_reduce)
Example #21
def _prepare_trainer_env(cluster, trainer):
    if core.is_compiled_with_xpu():
        proc_env = {
            "FLAGS_selected_xpus":
            "%s" % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
        }
    elif core.is_compiled_with_cuda():
        proc_env = {
            "FLAGS_selected_gpus":
            "%s" % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
        }
    else:
        raise ValueError("PaddlePaddle should be compiled with XPU or CUDA.")
    return proc_env
Example #22
def _convert_to_place(device):
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError("The device should not be 'gpu', "
                             "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    elif lower_device == 'xpu':
        if not core.is_compiled_with_xpu():
            raise ValueError("The device should not be 'xpu', "
                             "since PaddlePaddle is not compiled with XPU")
        selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
        device_id = int(selected_xpus[0])
        place = core.XPUPlace(device_id)
    elif lower_device == 'npu':
        if not core.is_compiled_with_npu():
            raise ValueError("The device should not be 'npu', "
                             "since PaddlePaddle is not compiled with NPU")
        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
        device_id = int(selected_npus[0])
        place = core.NPUPlace(device_id)
    elif lower_device == 'ipu':
        if not core.is_compiled_with_ipu():
            raise ValueError(
                "The device should not be 'ipu', " \
                "since PaddlePaddle is not compiled with IPU")
        place = core.IPUPlace()
    elif lower_device == 'mlu':
        if not core.is_compiled_with_mlu():
            raise ValueError("The device should not be 'mlu', "
                             "since PaddlePaddle is not compiled with MLU")
        selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
        device_id = int(selected_mlus[0])
        place = core.MLUPlace(device_id)
    elif device in core.get_all_custom_device_type():
        place = core.CustomPlace(device, 0)
    else:
        avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
        avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
        avaliable_npu_device = re.match(r'npu:\d+', lower_device)
        avaliable_mlu_device = re.match(r'mlu:\d+', lower_device)
        if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device:
            device_info_list = device.split(':', 1)
            device_type = device_info_list[0]
            if device_type in core.get_all_custom_device_type():
                device_id = device_info_list[1]
                device_id = int(device_id)
                place = core.CustomPlace(device_type, device_id)
            else:
                raise ValueError(
                    "The device must be a string which is like 'cpu', {}".
                    format(', '.join("'{}', '{}:x'".format(x, x)
                                     for x in ['gpu', 'xpu', 'npu', 'mlu'] +
                                     core.get_all_custom_device_type())))
        if avaliable_gpu_device:
            if not core.is_compiled_with_cuda():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with CUDA".format(avaliable_gpu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.CUDAPlace(device_id)
        if avaliable_xpu_device:
            if not core.is_compiled_with_xpu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with XPU".format(avaliable_xpu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.XPUPlace(device_id)
        if avaliable_npu_device:
            if not core.is_compiled_with_npu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with NPU".format(avaliable_npu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.NPUPlace(device_id)
        if avaliable_mlu_device:
            if not core.is_compiled_with_mlu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with mlu".format(avaliable_mlu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.MLUPlace(device_id)
    return place
Example #23
def set_device(device):
    """
    Paddle supports running computations on various types of devices, including CPU, GPU and XPU.
    They are represented by string identifiers. This function specifies the global device
    on which the OP will run.

    Parameters:
        device(str): This parameter determines the specific running device.
            It can be ``cpu``, ``gpu:x`` or ``xpu:x``, where ``x`` is the
            index of the GPU or XPU.

    Examples:

     .. code-block:: python
            
        import paddle

        paddle.set_device("cpu")
        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
        data = paddle.stack([x1,x2], axis=1)
    """
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be 'gpu', " \
                "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    elif lower_device == 'xpu':
        if not core.is_compiled_with_xpu():
            raise ValueError(
                "The device should not be 'xpu', " \
                "since PaddlePaddle is not compiled with XPU")
        place = core.XPUPlace(ParallelEnv().dev_id)
    else:
        avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
        avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
        if not avaliable_gpu_device and not avaliable_xpu_device:
            raise ValueError(
                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu' or 'xpu:x'"
            )
        if avaliable_gpu_device:
            if not core.is_compiled_with_cuda():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is " \
                    "not compiled with CUDA".format(avaliable_gpu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.CUDAPlace(device_id)
        if avaliable_xpu_device:
            if not core.is_compiled_with_xpu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is " \
                    "not compiled with XPU".format(avaliable_xpu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.XPUPlace(device_id)
    framework._set_expected_place(place)
    return place
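A minimal sketch of guarding `set_device` with the compile-time check so that the same script runs on both CPU-only and XPU builds; the tensor math is illustrative:

import paddle

device = 'xpu:0' if paddle.device.is_compiled_with_xpu() else 'cpu'
paddle.set_device(device)

x = paddle.ones([2, 3], dtype='float32')
print(paddle.get_device(), x.place)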
Example #24
 def test_check_output(self):
     # TODO(wangzhongpu): support mkldnn op in dygraph mode
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         self.check_output_with_place(place=self.place)
Example #25
 def test_check_output(self):
     if core.is_compiled_with_xpu():
         paddle.enable_static()
         self.check_output_with_place(self.place)
Example #26
 def test_check_preset_envs(self):
     if core.is_compiled_with_xpu():
         os.environ["FLAGS_selected_xpus"] = "0"
         place_list = static.xpu_places()
         self.assert_places_equal([fluid.XPUPlace(0)], place_list)
Example #27
def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.

    .. note::
        Now initialize both `NCCL` and `GLOO` contexts for communication.

    Args:
        backend (string): A string representing the backend used by DataParallel.
            It should be one of 'gloo' (for cpu), 'nccl' (for cuda), 'bkcl' (for xpu) or 'auto' (auto detect).
            Auto detection prefers 'nccl' and 'bkcl' over 'gloo'.

    Returns:
        None
        
    Examples:
        .. code-block:: python
            # required: gpu
            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)
                    
                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)
                
                loss.backward()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """

    # 0. get env & check world size
    global _global_parallel_env
    # when call init_parallel_env, need update `_global_parallel_env`
    _global_parallel_env = ParallelEnv()
    parallel_env = _global_parallel_env
    # if not parallel, `init_parallel_env` do nothing
    if parallel_env.world_size < 2:
        warnings.warn(
            "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
        )
        return
    # NOTE(xiongkun): support cpu gloo only, add this environment variable to
    #                 enable CPU-only gloo parallel training.
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. gpu xpu check, must be gpu or xpu,
    if not (is_cpu_only or core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()):
        raise NotImplementedError(
            "If you want to use CPU-only version, please use 'gloo' as backend"
        )

    if not is_cpu_only and core.is_compiled_with_cuda():
        _check_var_exists("FLAGS_selected_gpus")
        backend = "nccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_xpu():
        _check_var_exists('FLAGS_selected_xpus')
        backend = "bkcl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_npu():
        _check_var_exists('FLAGS_selected_npus')
        backend = "hccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_mlu():
        _check_var_exists('FLAGS_selected_mlus')
        backend = "cncl" if backend == "auto" else backend

    _check_var_exists("PADDLE_TRAINER_ID")
    _check_var_exists("PADDLE_CURRENT_ENDPOINT")
    _check_var_exists("PADDLE_TRAINERS_NUM")
    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")

    # NOTE(chenweihang): [ why configure the global place here? ]
    # dygraph mode is the default mode, so users will not call
    # `dygraph.guard` or `enable_dygraph` directly; if they want to
    # switch the default place they need to call a function to change
    # it, so here we simply set the correct place for users.
    if is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():
        place = core.CUDAPlace(parallel_env.device_id)
    elif core.is_compiled_with_xpu():
        place = core.XPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_npu():
        place = core.NPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_mlu():
        place = core.MLUPlace(parallel_env.device_id)

    _set_expected_place(place)

    group = None
    if backend in _valid_backend_list and in_dygraph_mode():
        if _default_group_name in _get_group_map_by_name():
            return _get_group_map_by_name()[_default_group_name]
        _set_default_backend(backend)
        rank = int(os.getenv("PADDLE_TRAINER_ID"))
        world_size = int(os.getenv("PADDLE_TRAINERS_NUM"))
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group.")
        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = ":".join([master_addr, master_port
                              ]) if master_addr and master_port else None
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:
            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0]
        assert endpoints, (
            "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
            "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
            "and 'export MASTER_ADDR=54612'. Or you can start your training"
            "with paddle.distributed.run module.")
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(master_addr,
                                      master_port,
                                      is_master,
                                      world_size,
                                      stop_check_timeout=stop_check_timeout)
        _set_default_store(default_store)
        pg = _new_process_group_impl(backend,
                                     default_store,
                                     rank,
                                     world_size,
                                     _default_group_name,
                                     pg_options=None)
        ranks = list(range(world_size))
        group = Group(rank,
                      world_size,
                      id=0,
                      ranks=ranks,
                      pg=pg,
                      name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)
        _set_group_map(0, group)
        parallel_helper._set_parallel_ctx(True)

        paddle.distributed.barrier(group=group)
        return group

    node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
    # 3: init gloo context (step 1: http server start)
    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
    if is_cpu_only or init_gloo or backend == "heter":
        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
        manager = Manager()
        # global dict to store status
        http_server_d = manager.dict()
        http_server_d["running"] = False
        if parallel_env.rank == 0:
            # The scope for worker used by http server is '_worker'
            size = {'_worker': parallel_env.world_size}
            if backend == "heter":
                size = {'_worker': len(node_num)}
            http_server = Process(target=_start_kv_server,
                                  args=(int(ep_rank_0[1]), http_server_d,
                                        size))
            http_server.daemon = True
            http_server_d["running"] = True
            http_server.start()

    # 4. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = parallel_env.world_size
    strategy.local_rank = parallel_env.rank
    strategy.trainer_endpoints = parallel_env.trainer_endpoints
    strategy.current_endpoint = parallel_env.current_endpoint
    strategy.nrings = parallel_env.nrings

    # init nccl or hccl or bkcl or heter context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place))
    elif (backend == "heter"):
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id))
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place))
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place))
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place))
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place))

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
        other_endpoints.remove(strategy.current_endpoint)
        if not is_cpu_only and strategy.local_rank == 0:
            wait_server_ready(other_endpoints)

    parallel_helper._init_parallel_ctx()

    # 5: init gloo context (step 2: gloo init)
    # dividing init_gloo into two parts because nccl and gloo
    # look for free ports separately, which sometimes
    # leads to port conflicts.
    if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
        # compared with the init_gloo path, we don't need to
        # init gloo here, because that is done in _init_parallel_ctx;
        http_server_d["running"] = False
        http_server.join()

    elif init_gloo:
        wait_server_ready([parallel_env.trainer_endpoints[0]])
        gloo_strategy = core.GlooParallelStrategy()
        gloo_strategy.rank = parallel_env.rank
        gloo_strategy.rank_num = parallel_env.world_size
        gloo_strategy.ip_address = ep_rank_0[0]
        gloo_strategy.ip_port = int(ep_rank_0[1])
        default_init_timeout_seconds = 3600
        default_run_timeout_seconds = 9999999
        gloo_strategy.init_seconds = default_init_timeout_seconds
        gloo_strategy.run_seconds = default_run_timeout_seconds
        gloo = core.GlooParallelContext(gloo_strategy)
        gloo.init()
        if parallel_env.rank == 0:
            http_server_d["running"] = False
            http_server.join()
    return group
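A hedged sketch of launching the `train()` function from the docstring above on two XPU cards via `spawn`; on an XPU build the 'auto' backend read from `PADDLE_DISTRI_BACKEND` resolves to 'bkcl', and the `xpus` option corresponds to the `xpus` key handled in the next example:

import os

import paddle.distributed as dist

if __name__ == '__main__':
    # 'auto' is already the default; shown here only for clarity
    os.environ.setdefault('PADDLE_DISTRI_BACKEND', 'auto')
    # assumes train() from the docstring above is defined in this module
    dist.spawn(train, nprocs=2, xpus='0,1')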
Example #28
def _get_subprocess_env_list(nprocs, options):
    # construct processes env list
    processes_env_list = []

    # get args from kwargs
    args = ParallelEnvArgs()

    # deal with `ips`
    args.cluster_node_ips = options.get('ips', None)
    if args.cluster_node_ips is None:
        args.cluster_node_ips = options.get('cluster_node_ips', None)
        if args.cluster_node_ips is None:
            args.cluster_node_ips = "127.0.0.1"

    # deal with `gpus` or `xpus`
    # set default selected devices(gpus or xpus)
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because FLAGS_selected_gpus or FLAGS_selected_xpus may be used elsewhere;
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to `0,1,2,3`, it may cause errors
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu or xpu card id
    if core.is_compiled_with_cuda():
        args.selected_devices = options.get('gpus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_cuda_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`CUDA_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `gpus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected gpu card %s cannot found in "
                                     "CUDA_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))

    elif core.is_compiled_with_xpu():
        args.selected_devices = options.get('xpus', None)
        if args.selected_devices is None:
            args.selected_devices = options.get('selected_devices', None)
        env_devices = os.getenv("XPU_VISIBLE_DEVICES", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x) for x in six.moves.range(core.get_xpu_device_count())
            ]
        else:
            env_devices_list = env_devices.split(',')
        if args.selected_devices is None:
            if len(env_devices_list) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`XPU_VISIBLE_DEVICES` is correctly configured." %
                    (len(env_devices_list), nprocs))
            args.selected_devices = ",".join(
                [str(env_devices_list[x]) for x in range(0, nprocs)])
        else:
            selected_device_list = args.selected_devices.split(',')
            if len(selected_device_list) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `xpus` arguments are passed." %
                    (len(selected_device_list), nprocs))
            for card_id in selected_device_list:
                if card_id not in env_devices_list:
                    raise ValueError("The selected xpu card %s cannot found in "
                                     "XPU_VISIBLE_DEVICES (%s)." %
                                     (card_id, ",".join(env_devices_list)))

    # set other inner args
    args.node_ip = options.get('node_ip', None)
    if args.node_ip is None:
        args.node_ip = _get_node_ip(args.cluster_node_ips)

    args.started_port = options.get('started_port', None)

    args.use_paddlecloud = options.get('use_paddlecloud', None)
    if args.use_paddlecloud is None:
        args.use_paddlecloud = use_paddlecloud()

    # get cluster and pod config
    cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    for trainer in pod.trainers:
        processes_env_list.append(_prepare_trainer_env(cluster, trainer))

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list
Example #29
 def test_check_no_preset_envs(self):
     if core.is_compiled_with_xpu():
         place_list = static.xpu_places(0)
         self.assert_places_equal([fluid.XPUPlace(0)], place_list)
Example #30
        if core.is_float16_supported(place):
            self.check_grad_with_place(place, ['Y'],
                                       'Out',
                                       max_relative_error=0.5,
                                       no_grad_set=set("X"))

    def test_check_grad_ingore_y(self):
        place = core.CUDAPlace(0)
        if core.is_float16_supported(place):
            self.check_grad_with_place(place, ['X'],
                                       'Out',
                                       max_relative_error=0.9,
                                       no_grad_set=set('Y'))


@unittest.skipIf(not core.is_compiled_with_xpu(),
                 "core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
    def init_dtype_type(self):
        self.dtype = np.float32

    def test_check_output(self):
        place = core.XPUPlace(0)
        self.check_output_with_place(place, atol=1e-1)

    def test_check_grad_normal(self):
        place = core.XPUPlace(0)
        self.check_grad_with_place(place, ['X', 'Y'],
                                   'Out',
                                   max_relative_error=0.5)