def set_cuda_rng_state(state_list):
    """
    Restore the state of every CUDA random generator.

    Args:
        state_list(list|tuple): One generator state per CUDA device, indexed
            by device id, as obtained from get_cuda_rng_state().

    Returns:
        None.

    Raises:
        ValueError: If the number of states differs from the CUDA device
            count reported by ``core``.

    Examples:
        .. code-block:: python

            import paddle
            sts = paddle.get_cuda_rng_state()
            paddle.set_cuda_rng_state(sts)

    """
    # Builds without CUDA support have no CUDA generators to restore.
    if not core.is_compiled_with_cuda():
        return

    device_total = core.get_cuda_device_count()
    if len(state_list) != device_total:
        raise ValueError(
            "Length of cuda state list shoule be equal to the cuda device count"
        )
    for device_id in range(device_total):
        generator = core.default_cuda_generator(device_id)
        generator.set_state(state_list[device_id])
def _get_batch_size(self, use_cuda, use_parallel_executor): batch_size_times = 1 if use_parallel_executor: batch_size_times = core.get_cuda_device_count( ) if use_cuda else int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) return self.base_batch_size * batch_size_times
def seed(seed):
    """
    Seed the global default generators used for random number generation.

    The value is converted with ``int()`` and applied to the default CUDA
    generator of every device (when compiled with CUDA) and to the default
    CPU generator.

    Args:
        seed(int): The random seed to set. A large integer is recommended.

    Returns:
        Generator: The value returned by ``manual_seed`` on the default CPU
        generator.

    Examples:
        .. code-block:: python

            import paddle
            gen = paddle.seed(102)

    """
    # TODO(zhiqiu): 1. remove program.random_seed when all random-related op upgrade
    # 2. support gpu generator by global device

    seed_value = int(seed)

    if core.is_compiled_with_cuda():
        for device_id in range(core.get_cuda_device_count()):
            cuda_generator = core.default_cuda_generator(device_id)
            cuda_generator.manual_seed(seed_value)

    cpu_generator = core.default_cpu_generator()
    return cpu_generator.manual_seed(seed_value)
def get_gpus(selected_gpus):
    """Resolve the list of GPU ids to use.

    Args:
        selected_gpus (str|None): Comma-separated GPU ids, e.g. ``"4,5"``.
            Whitespace around each id is ignored. ``None`` selects every
            device reported by ``core.get_cuda_device_count()``.

    Returns:
        list: When ``selected_gpus`` is ``None`` or ``CUDA_VISIBLE_DEVICES``
        is unset/empty, a list of id strings. Otherwise a list of integer
        positions of each selected id inside ``CUDA_VISIBLE_DEVICES``
        (e.g. ``CUDA_VISIBLE_DEVICES=4,5,6,7`` and ``selected_gpus="4,5"``
        gives ``[0, 1]``).

    Raises:
        AssertionError: If a selected id is not listed in
            ``CUDA_VISIBLE_DEVICES``.
    """
    if selected_gpus is None:
        from paddle.fluid import core
        gpus_num = core.get_cuda_device_count()
        return [str(x) for x in range(0, gpus_num)]

    # Strip once and reuse the stripped ids everywhere, so that the
    # membership check and the index lookup below agree on inputs such as
    # "4, 5" (previously the check used the unstripped token and failed).
    requested = [x.strip() for x in selected_gpus.split(',')]

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if cuda_visible_devices is None or cuda_visible_devices == "":
        return requested

    # change selected_gpus into relative values
    # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
    # therefore selected_gpus=0,1,2,3
    cuda_visible_devices_list = cuda_visible_devices.split(',')
    for x in requested:
        assert x in cuda_visible_devices_list, "Can't find "\
            "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
            % (x, cuda_visible_devices)
    gpus = [cuda_visible_devices_list.index(x) for x in requested]
    logger.info("Change selected_gpus into reletive values. --ips:{} "
                "will change into relative_ips:{} according to your "
                "CUDA_VISIBLE_DEVICES:{}".format(
                    selected_gpus, gpus, cuda_visible_devices_list))
    return gpus
def test_get_default_nprocs(self):
    """Check that the default process count follows the active device.

    On 'cpu' it must equal ``multiprocessing.cpu_count()``; on 'gpu' it must
    equal ``core.get_cuda_device_count()``.
    """
    # Expected counts are wrapped in lambdas so each one is evaluated only
    # after the matching device has been selected.
    cases = (
        ('cpu', lambda: multiprocessing.cpu_count()),
        ('gpu', lambda: core.get_cuda_device_count()),
    )
    for device_name, expected_count in cases:
        paddle.set_device(device_name)
        nprocs = _get_default_nprocs()
        self.assertEqual(nprocs, expected_count())
def _get_default_nprocs():
    """Return the default number of processes for the current device.

    The device string from ``get_device()`` is matched by substring, in the
    order 'gpu', 'xpu', 'cpu'.

    Raises:
        RuntimeError: If the device string contains none of those tags.
    """
    device = get_device()

    # Ordered (tag, counter) pairs; the first tag found in `device` wins.
    counters = (
        ('gpu', lambda: core.get_cuda_device_count()),
        ('xpu', lambda: core.get_xpu_device_count()),
        ('cpu', lambda: multiprocessing.cpu_count()),
    )
    for tag, count_devices in counters:
        if tag in device:
            return count_devices()

    raise RuntimeError(
        "`paddle.distributed.spawn` does not support parallel training on device `{}` now."
        .format(device))
def main(self,
         use_cuda=True,
         use_parallel_executor=False,
         use_double_buffer=False):
    """Build a small FC network, feed it from a background thread and
    collect the fetched (input, label) pairs for validation.

    Args:
        use_cuda (bool): Run on ``fluid.CUDAPlace(0)`` instead of the CPU.
            Requires a CUDA-enabled build (asserted below).
        use_parallel_executor (bool): Run the main program through
            ``fluid.ParallelExecutor`` instead of the startup executor.
        use_double_buffer (bool): Forwarded to ``simple_fc_net``.
    """
    # use_cuda is only legal when the build has CUDA support.
    assert not use_cuda or use_cuda and core.is_compiled_with_cuda()

    self.use_cuda = use_cuda
    self.use_parallel_executor = use_parallel_executor
    self.use_double_buffer = use_double_buffer

    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        in_data, label, loss, optimizer, feed_queue = simple_fc_net(
            in_size=self.in_size,
            class_num=self.class_num,
            hidden_sizes=self.hidden_sizes,
            batch_size=self.batch_size,
            queue_capacity=self.queue_capacity,
            use_double_buffer=self.use_double_buffer)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        startup_exe = fluid.Executor(place)
        startup_exe.run(startup_program)

        if use_parallel_executor:
            main_exe = fluid.ParallelExecutor(use_cuda, loss_name=loss.name)
            # batch_size_times records how many devices take part in a run:
            # the CUDA device count, or CPU_NUM (default: CPU count) on CPU.
            if use_cuda:
                self.batch_size_times = core.get_cuda_device_count()
            else:
                self.batch_size_times = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        else:
            main_exe = startup_exe
            self.batch_size_times = 1

        # Feed the queue from a separate thread while this thread runs the
        # executor.
        reader = self.random_reader()
        thread = threading.Thread(
            target=feed_data, args=(feed_queue, reader))
        thread.start()

        self.outputs = []
        for _ in range(self.iterations):
            fetches = main_exe.run(fetch_list=[in_data.name, label.name])
            fetches = [as_numpy(fetch) for fetch in fetches]
            self.outputs.append(fetches)

        feed_queue.close()
        self.validate()
def _build_program(self,
                   place,
                   layout,
                   seed,
                   sync_bn=False,
                   only_forward=False):
    """Build the conv2d -> batch_norm -> sigmoid -> reduce_sum program.

    Args:
        place: Not used inside this method.
        layout: Passed to ``batch_norm`` as ``data_layout``.
        seed: Random seed assigned to both the main and startup programs.
        sync_bn (bool): When false, the summed output is divided by the
            CUDA device count.
        only_forward (bool): When true, batch norm runs with ``is_test`` and
            no backward pass is appended.

    Returns:
        tuple: ``(main_program, startup_program, [out, conv, bn])``.
    """
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    main_prog.random_seed = seed
    startup_prog.random_seed = seed

    # cuDNN convolution is enabled only for float16 inputs.
    cudnn_enabled = self.dtype == np.float16

    with fluid.unique_name.guard(), fluid.program_guard(main_prog,
                                                        startup_prog):
        input_var = fluid.layers.data(
            name='input',
            shape=self.dshape,
            dtype=self.dtype,
            append_batch_size=False)
        conv = fluid.layers.conv2d(
            input=input_var,
            num_filters=32,
            filter_size=1,
            param_attr=fluid.ParamAttr(name='conv2d_weight'),
            bias_attr=False,
            use_cudnn=cudnn_enabled)
        bn = fluid.layers.batch_norm(
            conv,
            param_attr=fluid.ParamAttr(name='bn_scale'),
            bias_attr=fluid.ParamAttr(name='bn_bias'),
            moving_mean_name='bn_moving_mean',
            moving_variance_name='bn_moving_variance',
            data_layout=layout,
            is_test=only_forward)

        # ROCm builds cast to float32; all other builds cast to float64.
        cast_dtype = 'float32' if core.is_compiled_with_rocm() else 'float64'
        bn = fluid.layers.cast(bn, cast_dtype)

        activated = fluid.layers.sigmoid(bn)
        out = fluid.layers.reduce_sum(activated)
        if not sync_bn:
            out = out / core.get_cuda_device_count()
        if not only_forward:
            optimizer = fluid.optimizer.SGD(learning_rate=0.0)
            optimizer.backward(out)

    return main_prog, startup_prog, [out, conv, bn]
def device_count():
    '''
    Return the number of GPUs available.

    Falls back to 0 when ``core`` does not expose ``get_cuda_device_count``.

    Returns:
        int: the number of GPUs available.

    Examples:
        .. code-block:: python

            import paddle

            paddle.device.cuda.device_count()

    '''
    if not hasattr(core, 'get_cuda_device_count'):
        return 0
    return core.get_cuda_device_count()
def get_cuda_rng_state():
    """
    Collect the random state of every CUDA generator.

    Args:
        None.

    Returns:
        list: One generator state per CUDA device, in device-index order.
        The list is empty when not compiled with CUDA.

    Examples:
        .. code-block:: python

            import paddle
            sts = paddle.get_cuda_rng_state()

    """
    if not core.is_compiled_with_cuda():
        return []
    return [
        core.default_cuda_generator(device_id).get_state()
        for device_id in range(core.get_cuda_device_count())
    ]
def run_main(self, place, with_data_parallel):
    """Build a program with two while loops that accumulate array elements,
    train it for a few steps and check the fetched sum against the fed data.

    Args:
        place: Device place to execute on.
        with_data_parallel (bool): Compile the main program with
            ``with_data_parallel`` and replicate each fed sample once per
            device.
    """
    self.place = place
    self.with_data_parallel = with_data_parallel

    # Skip silently when a CUDA place is requested on a non-CUDA build.
    if not core.is_compiled_with_cuda() and isinstance(
            self.place, core.CUDAPlace):
        return

    # Number of replicas each input is fed with in data-parallel mode.
    if isinstance(self.place, core.CUDAPlace):
        device_cnt = core.get_cuda_device_count(
        ) if self.with_data_parallel else 1
    else:
        device_cnt = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count())
        ) if self.with_data_parallel else 1

    d0 = layers.data(
        "d0", shape=[10], append_batch_size=False, dtype='float32')
    d1 = layers.data(
        "d1", shape=[10], append_batch_size=False, dtype='float32')
    d2 = layers.data(
        "d2", shape=[10], append_batch_size=False, dtype='float32')

    i = layers.zeros(shape=[1], dtype='int64')
    i.stop_gradient = True

    # mem_array starts with a zero vector; data_array holds d0, d1, d2 at
    # consecutive indices.
    init = layers.zeros(shape=[10], dtype='float32')
    mem_array = layers.array_write(x=init, i=i)
    data_array = layers.array_write(x=d0, i=i)

    i = layers.increment(i)
    layers.array_write(d1, i, array=data_array)

    i = layers.increment(i)
    layers.array_write(d2, i, array=data_array)

    # Loop counters and bounds: i runs while i < 1, j (starting at 1) runs
    # while j < 3.
    i = layers.zeros(shape=[1], dtype='int64')
    i.stop_gradient = True

    array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
    array_len.stop_gradient = True
    cond = layers.less_than(x=i, y=array_len)

    j = layers.fill_constant(shape=[1], dtype='int64', value=1)
    j.stop_gradient = True

    array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
    array_len2.stop_gradient = True
    cond2 = layers.less_than(x=j, y=array_len2)

    while_op = layers.While(cond=cond)
    while_op2 = layers.While(cond=cond2)

    with while_op.block():
        # mem_array[i + 1] = data_array[i] + mem_array[i]
        d = layers.array_read(array=data_array, i=i)
        prev = layers.array_read(array=mem_array, i=i)
        d = layers.reshape(d, shape=[10])
        prev = layers.reshape(prev, shape=[10])
        result = layers.sums(input=[d, prev])

        i = layers.increment(x=i, in_place=True)
        layers.array_write(result, i=i, array=mem_array)
        layers.less_than(x=i, y=array_len, cond=cond)

        # NOTE(review): the second loop is reconstructed as nested inside
        # the first one — TODO confirm against the original layout.
        with while_op2.block():
            d2 = layers.array_read(array=data_array, i=j)
            prev2 = layers.array_read(array=mem_array, i=j)
            d2 = layers.reshape(d2, shape=[10])
            prev2 = layers.reshape(prev2, shape=[10])
            result2 = layers.sums(input=[d2, prev2])

            j = layers.increment(x=j, in_place=True)
            layers.array_write(result2, i=j, array=mem_array)
            layers.less_than(x=j, y=array_len2, cond=cond2)

    sum_result = layers.array_read(array=mem_array, i=j)
    sum_result.persistable = True
    tmp = layers.unsqueeze(sum_result, axes=[0])
    tmp = layers.expand(tmp, expand_times=[10, 1])
    fc = layers.fc(tmp, size=256)
    loss = layers.mean(sum_result)

    optim = fluid.optimizer.Adam(learning_rate=1e-3)
    optim.minimize(loss)

    exe = Executor(self.place)
    exe.run(fluid.default_startup_program())

    prog = fluid.default_main_program()
    if self.with_data_parallel:
        prog = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=loss.name)

    for _ in range(5):
        d = []
        for i in range(3):
            tmp = numpy.random.random(size=[10]).astype('float32')
            if not self.with_data_parallel:
                d.append(tmp)
            else:
                # One copy of the sample per participating device.
                d.append(numpy.array([tmp] * device_cnt))

        outs = exe.run(program=prog,
                       feed={'d0': d[0],
                             'd1': d[1],
                             'd2': d[2]},
                       fetch_list=[sum_result])
        # The fetched accumulator must equal the sum of everything fed.
        self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
def _get_device_count(self, use_cuda): return core.get_cuda_device_count() if use_cuda else int( os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
def _compare(self, place, layout, only_forward):
    """Compare plain batch norm with synchronized batch norm.

    The same input is run through a program built without sync BN on a
    plain executor, and through a program built with sync BN compiled with
    data parallelism; every fetched value except the first is required to
    match within ``self.atol``.

    Args:
        place: Device place used for the tensor and both executors.
        layout: Data layout forwarded to ``_build_program``.
        only_forward (bool): Skip backward-related fetches when true.
    """
    seed = 10
    os.environ['FLAGS_cudnn_deterministic'] = "1"
    scope = core.Scope()
    # Random input scaled into [-2, 2).
    data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
    data = create_or_get_tensor(scope, "input",
                                OpTest.np_dtype_to_fluid_dtype(data), place)

    # Single-GPU, N = 32 per GPU
    main, startup, outs = self._build_program(place, layout, seed, False,
                                              only_forward)
    exe = fluid.Executor(place)
    exe.run(startup)
    fetch_names = [v.name for v in outs] + [
        'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
    ]
    if not only_forward:
        others = [
            'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
            'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
        ]
        fetch_names += others
    bn_fetches = exe.run(program=main,
                         feed={'input': data},
                         fetch_list=fetch_names)

    #####################################################################
    # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
    assert core.get_cuda_device_count() > 1
    main, startup, outs = self._build_program(place, layout, seed, True,
                                              only_forward)
    exe = fluid.Executor(place)
    exe.run(startup)
    fetch_names = [v.name for v in outs] + [
        'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
    ]
    if not only_forward:
        others = [
            'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
            'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
        ]
        fetch_names += others
    # Mark every fetched variable persistable before compiling the program.
    for nm in fetch_names:
        fv = fluid.framework._get_var(str(nm), program=main)
        fv.persistable = True
    build_strategy = fluid.BuildStrategy()
    build_strategy.sync_batch_norm = True
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    comp_prog = compiler.CompiledProgram(main).with_data_parallel(
        outs[0].name if not only_forward else None,
        build_strategy=build_strategy)
    sync_bn_fetches = exe.run(program=comp_prog,
                              feed={'input': data},
                              fetch_list=fetch_names)

    # Index 0 (the first entry of `outs`) is not compared.
    for i in six.moves.xrange(1, len(sync_bn_fetches)):
        bn_val = bn_fetches[i]
        sync_bn_val = sync_bn_fetches[i]
        # When shapes differ, keep only the leading slice of the sync BN
        # value that lines up with the plain BN value.
        if sync_bn_val.shape != bn_val.shape:
            sync_bn_val = sync_bn_val[:bn_val.shape[0]]
        self.assertTrue(
            np.allclose(
                bn_val, sync_bn_val, atol=self.atol),
            "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN     " +
            str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
def _resolve_spawn_devices(options, nprocs, option_key, env_name,
                           count_devices, device_label):
    """Return the comma-separated device ids the spawned processes will use.

    Args:
        options (dict): Spawn options; ``option_key`` is looked up first,
            then ``'selected_devices'``.
        nprocs (int): Number of processes to spawn.
        option_key (str): Backend-specific option name ('gpus', 'xpus',
            'mlus').
        env_name (str): Name of the ``*_VISIBLE_DEVICES`` environment
            variable.
        count_devices (callable): Returns the device count; only called when
            the environment variable is unset or empty.
        device_label (str): Device kind used in error messages.

    Raises:
        RuntimeError: No devices were selected and fewer than ``nprocs``
            devices are visible.
        ValueError: The selection does not contain exactly ``nprocs`` ids,
            or contains an id that is not visible.
    """
    selected_devices = options.get(option_key, None)
    if selected_devices is None:
        selected_devices = options.get('selected_devices', None)

    env_devices = os.getenv(env_name, None)
    if env_devices is None or env_devices == "":
        env_devices_list = [str(x) for x in range(count_devices())]
    else:
        env_devices_list = env_devices.split(',')

    if selected_devices is None:
        # Default: take the first `nprocs` visible devices.
        if len(env_devices_list) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`%s` is correctly configured." %
                (len(env_devices_list), nprocs, env_name))
        return ",".join(
            [str(env_devices_list[x]) for x in range(0, nprocs)])

    selected_device_list = selected_devices.split(',')
    if len(selected_device_list) != nprocs:
        raise ValueError(
            "The number of selected devices(%s) is not equal to "
            "the number of spawn processes(%d), please ensure that the "
            "correct `nprocs` and `%s` arguments are passed." %
            (len(selected_device_list), nprocs, option_key))
    for card_id in selected_device_list:
        if card_id not in env_devices_list:
            raise ValueError("The selected %s card %s cannot found in "
                             "%s (%s)." % (device_label, card_id, env_name,
                                           ",".join(env_devices_list)))
    return selected_devices


def _get_subprocess_env_list(nprocs, options):
    """Build the environment for each spawned process.

    Args:
        nprocs (int): Number of processes to spawn.
        options (dict): Spawn options. ``options['backend']`` is filled in
            (mutated) when it is missing or ``'auto'``.

    Returns:
        list: One entry per trainer of the resolved pod, produced by
        ``_prepare_trainer_env``.

    Raises:
        RuntimeError, ValueError: See ``_resolve_spawn_devices``.
        AssertionError: For unsupported options under the gloo backend.
    """
    # NOTE (xiongkun03) Why put backend deduction here ?
    # Because _get_subprocess_env_list is used by many testcases.
    # So for compatibility, we put backend deduction here

    # logic for handle backend option
    if 'backend' not in options or options['backend'] == 'auto':
        options['backend'] = _get_default_backend()
    check_backend(options['backend'])
    block_windows_and_macos(options['backend'])

    # construct processes env list
    processes_env_list = []

    # get args from kwargs
    args = ParallelEnvArgs()

    # deal with `ips`
    args.cluster_node_ips = options.get('ips', None)
    if args.cluster_node_ips is None:
        args.cluster_node_ips = options.get('cluster_node_ips', None)
        if args.cluster_node_ips is None:
            args.cluster_node_ips = "127.0.0.1"

    # deal with `gpus` or `xpus`
    # set default selected devices(gpus or xpus)
    # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because the FLAGS_selected_gpus or FLAGS_selected_xpus may be used in other place,
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu or xpu card id
    #
    # backend -> (option key, env variable, lazy device counter, label).
    # The counters are wrapped in lambdas so `core` is only queried for the
    # active backend, and only when the env variable is not set.
    device_specs = {
        'nccl': ('gpus', "CUDA_VISIBLE_DEVICES",
                 lambda: core.get_cuda_device_count(), 'gpu'),
        'bkcl': ('xpus', "XPU_VISIBLE_DEVICES",
                 lambda: core.get_xpu_device_count(), 'xpu'),
        'cncl': ('mlus', "MLU_VISIBLE_DEVICES",
                 lambda: core.get_mlu_device_count(), 'mlu'),
    }
    backend = options['backend']
    if backend in device_specs:
        option_key, env_name, count_devices, device_label = device_specs[
            backend]
        args.selected_devices = _resolve_spawn_devices(
            options, nprocs, option_key, env_name, count_devices,
            device_label)
    elif backend == 'gloo':
        # TODO check gpu / xpu flag must not exist
        warnings.warn(
            "Your model will be trained under CPUONLY mode by using GLOO,"
            "because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device."
        )
        args.paddle_cpuonly = True
        args.selected_devices = None
        args.ips = args.cluster_node_ips
        assert options.get(
            'use_paddlecloud',
            None) is None, "CPUONLY spawn doesn't support use paddle cloud"
        # The message placeholder is now actually filled in with the
        # offending ips string (it used to be emitted as a literal "%s").
        assert len(
            args.cluster_node_ips.split(',')
        ) <= 1, "CPUONLY spawn only support single trainer, that is len(ips)=1, but got %s." % args.cluster_node_ips
        assert _get_trainers_num(
        ) == 1, "CPUONLY spawn doesn't support multi-trainer"

    # set other inner args
    args.node_ip = options.get('node_ip', None)
    if args.node_ip is None:
        args.node_ip = _get_node_ip(args.cluster_node_ips)

    args.started_port = options.get('started_port', None)

    args.use_paddlecloud = options.get('use_paddlecloud', None)
    if args.use_paddlecloud is None:
        args.use_paddlecloud = use_paddlecloud()

    # get cluster and pod config
    if options['backend'] == 'gloo':
        devices_per_proc = [x for x in range(0, nprocs)]
        cluster, pod = get_cluster_from_args(args, DeviceMode.CPU,
                                             devices_per_proc)
    else:
        cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    for trainer in pod.trainers:
        processes_env_list.append(
            _prepare_trainer_env(cluster, trainer, options['backend']))

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list
def main(self,
         use_cuda=True,
         use_parallel_executor=False,
         use_double_buffer=False,
         use_feed_list=False,
         use_decorate_paddle_reader=False):
    """Build a small FC network, feed it either through ``py_reader`` or a
    background thread, and collect fetched (input, label) pairs until the
    reader is exhausted.

    Args:
        use_cuda (bool): Run on ``fluid.CUDAPlace(0)`` instead of the CPU.
            Requires a CUDA-enabled build (asserted below).
        use_parallel_executor (bool): Compile the main program with
            ``with_data_parallel``.
        use_double_buffer (bool): Forwarded to ``simple_fc_net``.
        use_feed_list (bool): Forwarded to ``simple_fc_net``; also selects
            which ``py_reader`` decorate method is used.
        use_decorate_paddle_reader (bool): Feed through ``py_reader``
            instead of a manual feeding thread.
    """
    # use_cuda is only legal when the build has CUDA support.
    assert not use_cuda or use_cuda and core.is_compiled_with_cuda()

    self.use_cuda = use_cuda
    self.use_parallel_executor = use_parallel_executor
    self.use_double_buffer = use_double_buffer
    self.use_feed_list = use_feed_list
    self.use_decorate_paddle_reader = use_decorate_paddle_reader

    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        in_data, label, loss, optimizer, feed_queue, py_reader = simple_fc_net(
            in_size=self.in_size,
            class_num=self.class_num,
            hidden_sizes=self.hidden_sizes,
            batch_size=self.batch_size,
            queue_capacity=self.queue_capacity,
            use_double_buffer=self.use_double_buffer,
            use_feed_list=self.use_feed_list)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(startup_program)

        train_cp = main_program
        if use_parallel_executor:
            train_cp = compiler.CompiledProgram(
                main_program).with_data_parallel(loss_name=loss.name)
            # batch_size_times records how many devices take part in a run:
            # the CUDA device count, or CPU_NUM (default: CPU count) on CPU.
            if use_cuda:
                self.batch_size_times = core.get_cuda_device_count()
            else:
                self.batch_size_times = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        else:
            self.batch_size_times = 1

        reader = self.tensor_reader(use_decorate_paddle_reader)
        batch_reader = paddle.batch(reader, batch_size=self.batch_size)

        self.inputs = []
        self.outputs = []

        if use_decorate_paddle_reader:
            if use_feed_list:
                py_reader.decorate_paddle_reader(batch_reader)
            else:
                py_reader.decorate_sample_list_generator(batch_reader)
            py_reader.start()
        else:
            # Manual feeding: push batches into the queue from a daemon
            # thread while this thread runs the executor.
            thread = threading.Thread(
                target=feed_data, args=(feed_queue, batch_reader))
            thread.daemon = True
            thread.start()

        # Run until the executor signals end-of-data.
        try:
            while True:
                fetches = exe.run(train_cp,
                                  fetch_list=[in_data.name, label.name])
                fetches = [as_numpy(fetch) for fetch in fetches]
                self.outputs.append(fetches)
        except fluid.core.EOFException:
            pass

        feed_queue.close()
        self.validate()
        # Wait for whichever feeder was started above to finish.
        if use_decorate_paddle_reader:
            py_reader.exited = True
            py_reader.thread.join()
        else:
            thread.join()
def _get_subprocess_env_list(nprocs, options):
    """Build the environment for each spawned process.

    Device selection is keyed on the build: CUDA builds read the `gpus`
    option and ``CUDA_VISIBLE_DEVICES``; XPU builds read `xpus` and
    ``XPU_VISIBLE_DEVICES``. Card ids are absolute.

    Args:
        nprocs (int): Number of processes to spawn.
        options (dict): Spawn options.

    Returns:
        list: One entry per trainer of the resolved pod, produced by
        ``_prepare_trainer_env``.

    Raises:
        RuntimeError: No devices were selected and fewer than ``nprocs``
            devices are visible.
        ValueError: The selection does not contain exactly ``nprocs`` ids,
            or contains an id that is not visible.
    """
    # get args from kwargs
    args = ParallelEnvArgs()

    # `ips` wins over `cluster_node_ips`; fall back to localhost.
    cluster_ips = options.get('ips', None)
    if cluster_ips is None:
        cluster_ips = options.get('cluster_node_ips', None)
    if cluster_ips is None:
        cluster_ips = "127.0.0.1"
    args.cluster_node_ips = cluster_ips

    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus or FLAGS_selected_xpus directly? ]
    # because the FLAGS_selected_gpus or FLAGS_selected_xpus may be used in other place,
    # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    #
    # (option key, card kind, env variable, lazy device counter)
    if core.is_compiled_with_cuda():
        device_spec = ('gpus', 'gpu', "CUDA_VISIBLE_DEVICES",
                       lambda: core.get_cuda_device_count())
    elif core.is_compiled_with_xpu():
        device_spec = ('xpus', 'xpu', "XPU_VISIBLE_DEVICES",
                       lambda: core.get_xpu_device_count())
    else:
        device_spec = None

    if device_spec is not None:
        option_key, card_kind, env_name, count_devices = device_spec

        requested = options.get(option_key, None)
        if requested is None:
            requested = options.get('selected_devices', None)

        env_value = os.getenv(env_name, None)
        if env_value:
            visible = env_value.split(',')
        else:
            # Env variable unset or empty: every device is visible.
            visible = [str(x) for x in six.moves.range(count_devices())]

        if requested is None:
            # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
            if len(visible) < nprocs:
                raise RuntimeError(
                    "the number of visible devices(%d) is less than the number "
                    "of spawn processes(%d), please ensure that the correct "
                    "`nprocs` argument is passed or the environment variable "
                    "`%s` is correctly configured." %
                    (len(visible), nprocs, env_name))
            requested = ",".join(
                str(visible[idx]) for idx in range(0, nprocs))
        else:
            requested_ids = requested.split(',')
            if len(requested_ids) != nprocs:
                raise ValueError(
                    "The number of selected devices(%s) is not equal to "
                    "the number of spawn processes(%d), please ensure that the "
                    "correct `nprocs` and `%s` arguments are passed." %
                    (len(requested_ids), nprocs, option_key))
            for card_id in requested_ids:
                if card_id not in visible:
                    raise ValueError(
                        "The selected %s card %s cannot found in "
                        "%s (%s)." %
                        (card_kind, card_id, env_name, ",".join(visible)))
        args.selected_devices = requested

    # set other inner args
    node_ip = options.get('node_ip', None)
    if node_ip is None:
        node_ip = _get_node_ip(args.cluster_node_ips)
    args.node_ip = node_ip

    args.started_port = options.get('started_port', None)

    paddlecloud_flag = options.get('use_paddlecloud', None)
    if paddlecloud_flag is None:
        paddlecloud_flag = use_paddlecloud()
    args.use_paddlecloud = paddlecloud_flag

    # get cluster and pod config
    cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    processes_env_list = [
        _prepare_trainer_env(cluster, trainer) for trainer in pod.trainers
    ]

    # [Debug] print config
    args.print_config = options.get('print_config', False)
    if args.print_config:
        _print_arguments(args)

    return processes_env_list
def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
    """
    Start multiple processes with ``spawn`` method for parallel training.

    .. note::
        ``spawn`` now only supports GPU collective mode.

    Args:
        func (function): The target function is called by spawned process.
            This function need to be able to pickled, so it must be defined
            at the top level of a module.
        args (tuple, optional): Arguments passed to ``func``.
        nprocs (int, optional): Number of processed to start. Default: -1.
            when nprocs is -1, the available device will be obtained from
            the environment variable when the model is executed: If use GPU,
            the currently available device ID is obtained from the environment
            variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available
            CPU number is obtained from the environment variable CPU_NUM.
            For example, export CPU_NUM=4, if the environment variable is not set,
            the spawn method will add default value to the environment variable
            and set its value to 1.
        join (bool, optional): Perform a blocking join on all spawned processes.
            Default: True.
        daemon (bool, optional): The spawned processes' daemon flag. Default: False.
        **options(dict, optional): Other initial parallel execution environment
            configuration options. The following options are currently supported:
            (1) start_method (string): the way to start a process.
            The start method can be ``spawn`` , ``fork`` , ``forkserver`` .
            Because the CUDA runtime does not support the ``fork`` start method,
            when use CUDA in subprocesses, we should start process by ``spawn``
            or ``forkserver`` method. Default: "spawn" ;
            (2) gpus (string): The training process will run on the
            selected gpus, such as "0,1,2,3". Default: None;
            (3) ips (string): Paddle cluster nodes ips, such as
            "192.168.0.16,192.168.0.17". Default: "127.0.0.1" .

    Returns:
        ``MultiprocessContext`` object, it hold the spawned processes.

    Raises:
        ValueError: If ``nprocs`` is -1 and the current device type is not
            one of cpu, gpu or xpu.

    Examples:
        .. code-block:: python

            from __future__ import print_function

            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train(print_result=False):
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                if print_result is True:
                    print("loss:", loss.numpy())

                loss.backward()

                adam.step()
                adam.clear_grad()

            # Usage 1: only pass function.
            # If your training method no need any argument, and
            # use all visible devices for parallel training.
            if __name__ == '__main__':
                dist.spawn(train)

            # Usage 2: pass function and arguments.
            # If your training method need some arguments, and
            # use all visible devices for parallel training.
            if __name__ == '__main__':
                dist.spawn(train, args=(True,))

            # Usage 3: pass function, arguments and nprocs.
            # If your training method need some arguments, and
            # only use part of visible devices for parallel training.
            # If your machine hold 8 cards {0,1,2,3,4,5,6,7},
            # this case will use cards {0,1}; If you set
            # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
            # cards {4,5}
            if __name__ == '__main__':
                dist.spawn(train, args=(True,), nprocs=2)

            # Usage 4: pass function, arguments, nprocs and gpus.
            # If your training method need some arguments, and
            # only use part of visible devices for parallel training,
            # but you can't set your machine's environment variable
            # CUDA_VISIBLE_DEVICES, such as it is None or all cards
            # {0,1,2,3,4,5,6,7}, you can pass `gpus` to
            # select the GPU cards you want to use. For example,
            # this case will use cards {4,5} if your machine hold 8 cards.
            if __name__ == '__main__':
                dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')
    """
    # NOTE(chenweihang): [ why only supports python3.4+ ? ]
    # Python supported setting the child process startup method
    # since 3.4. The previous version can only use the default startup
    # method, while the default startup method of Unix is fork, which
    # cannot support CUDA runtime multi-process
    _py_supported_check()

    # Give an error hint when the users enter a configuration option
    # that does not exist
    _options_valid_check(options)

    # get default nprocs
    if nprocs == -1:
        device = get_device()
        # The device string can carry a card index suffix (e.g. "gpu:0"),
        # so dispatch on the device type before the ':' rather than on the
        # whole string; plain "cpu"/"gpu"/"xpu" still match as before.
        device_type = device.split(':')[0]
        if device_type == 'cpu':
            # TODO: not supports cpu parallel now
            nprocs = _cpu_num()
        elif device_type == 'gpu':
            nprocs = core.get_cuda_device_count()
        elif device_type == 'xpu':
            nprocs = core.get_xpu_device_count()
        else:
            raise ValueError(
                "`device` should be a string of `cpu`, 'gpu' or 'xpu', but got {}".
                format(device))

    # NOTE(chenweihang): [ why need get cluster info before run? ]
    # when using `paddle.distributed.spawn` start parallel training,
    # we should get cluster info before starting subprocess, and pass
    # correct info to each subprocess
    procs_env_list = _get_subprocess_env_list(nprocs, options)

    # start processes
    # NOTE(chenweihang): [ why default start method is spawn? ]
    # The CUDA runtime does not support the fork start method,
    # either the spawn or forkserver start method are required
    # to use CUDA in subprocesses.
    start_method = options.get('start_method', None)
    if start_method is None:
        start_method = 'spawn'

    mp = multiprocessing.get_context(start_method)

    error_queues = []
    return_queues = []
    processes = []
    for i in range(nprocs):
        # Each process gets its own queues for reporting errors and
        # return values back to the parent.
        error_queue = mp.SimpleQueue()
        return_queue = mp.SimpleQueue()
        process = mp.Process(
            target=_func_wrapper,
            args=(func, args, error_queue, return_queue, procs_env_list[i]))
        process.daemon = daemon
        process.start()
        error_queues.append(error_queue)
        return_queues.append(return_queue)
        processes.append(process)

    context = MultiprocessContext(processes, error_queues, return_queues)
    if not join:
        return context

    # loop until all process end
    while not context.join():
        pass

    # finally return context
    return context
def _get_subprocess_env_list(nprocs, options):
    """Build the environment for each spawned GPU process.

    Args:
        nprocs (int): Number of processes to spawn.
        options (dict): Spawn options (`cluster_node_ips`, `node_ip`,
            `selected_gpus`, `started_port`, `use_paddlecloud`,
            `print_config`).

    Returns:
        list: One entry per trainer of the resolved pod, produced by
        ``_prepare_trainer_env``.

    Raises:
        ValueError: `cluster_node_ips` is given without `node_ip`, or a
            selected card is not listed among the visible devices.
        RuntimeError: No cards were selected and fewer than ``nprocs``
            devices are visible.
    """
    # get args from kwargs
    args = ParallelEnvArgs()

    # set default `node_ip` and `cluster_node_ips`
    cluster_ips = options.get('cluster_node_ips', None)
    node_ip = options.get('node_ip', None)
    args.cluster_node_ips = cluster_ips
    args.node_ip = node_ip
    if cluster_ips is not None and node_ip is None:
        raise ValueError("please input current node ip, "
                         "cannot only give `cluster_node_ips`.")
    default_node_ip = "127.0.0.1"
    if node_ip is None:
        args.node_ip = default_node_ip
    if cluster_ips is None:
        args.cluster_node_ips = default_node_ip

    # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ]
    # because the FLAGS_selected_gpus may be used in other place,
    # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error
    # when using `ParallelEnv`
    # NOTE(chenweihang): use absolute gpu card id
    args.selected_gpus = options.get('selected_gpus', None)

    visible_env = os.getenv("CUDA_VISIBLE_DEVICES", None)
    if visible_env:
        visible_cards = visible_env.split(',')
    else:
        # Env variable unset or empty: every CUDA device is visible.
        visible_cards = [
            str(x) for x in six.moves.range(core.get_cuda_device_count())
        ]

    if args.selected_gpus is None:
        # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3"
        if len(visible_cards) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`CUDA_VISIBLE_DEVICES` is correctly configured." %
                (len(visible_cards), nprocs))
        args.selected_gpus = ",".join(
            str(visible_cards[idx]) for idx in range(0, nprocs))
    else:
        for card_id in args.selected_gpus.split(','):
            if card_id in visible_cards:
                continue
            raise ValueError("The selected gpu card %s cannot found in "
                             "CUDA_VISIBLE_DEVICES (%s)." %
                             (card_id, ",".join(visible_cards)))

    # set other arguments
    args.started_port = options.get('started_port', None)
    args.use_paddlecloud = options.get('use_paddlecloud', False)
    args.print_config = options.get('print_config', False)

    # reuse code of launch.py
    cluster, pod = get_cluster_and_pod(args)

    # prepare subprocess env list
    processes_env_list = [
        _prepare_trainer_env(cluster, trainer) for trainer in pod.trainers
    ]

    # print config
    if args.print_config:
        _print_arguments(args)

    return processes_env_list