def _start_workers(self, num_workers): logger.debug(f"start_workers: Setting %d workers." % num_workers) worker_config = self.config.copy() batch_size_per_worker = self._configure_and_split_batch(num_workers) if batch_size_per_worker: worker_config[BATCH_SIZE] = batch_size_per_worker params = dict(model_creator=self.model_creator, data_creator=self.data_creator, optimizer_creator=self.optimizer_creator, loss_creator=self.loss_creator, scheduler_creator=self.scheduler_creator, training_operator_cls=self.training_operator_cls, config=worker_config, use_fp16=self.use_fp16, use_gpu=self.use_gpu, use_tqdm=self.use_tqdm, apex_args=self.apex_args, scheduler_step_freq=self.scheduler_step_freq) if num_workers == 1: # Start local worker self.local_worker = TorchRunner(**params) if self.initialization_hook: self.apply_all_workers(self.initialization_hook) self.local_worker.setup() else: params.update(backend=self.backend, add_dist_sampler=self.add_dist_sampler, wrap_ddp=self.wrap_ddp) # Start local worker self.local_worker = LocalDistributedRunner(num_cpus=1, num_gpus=int( self.use_gpu), **params) # Generate actor class RemoteRunner = ray.remote(num_cpus=1, num_gpus=int( self.use_gpu))(DistributedTorchRunner) # Start workers self.remote_workers = [ RemoteRunner.remote(**params) for i in range(num_workers - 1) ] if self.initialization_hook: self.apply_all_workers(self.initialization_hook) # Compute URL for initializing distributed PyTorch ip = ray.services.get_node_ip_address() port = self.local_worker.find_free_port() address = "tcp://{ip}:{port}".format(ip=ip, port=port) remote_setups = [ worker.setup.remote(address, i + 1, num_workers) for i, worker in enumerate(self.remote_workers) ] self.local_worker.setup(address, 0, num_workers) # Get setup tasks in order to throw errors on failure ray.get(remote_setups)
def __init__(self,
             *,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator=None,
             scheduler_creator=None,
             training_operator_cls=None,
             initialization_hook=None,
             config=None,
             num_workers=1,
             use_gpu="auto",
             backend="auto",
             use_fp16=False,
             use_tqdm=False,
             apex_args=None,
             add_dist_sampler=True,
             scheduler_step_freq="batch",
             num_replicas=None,
             batch_size=None,
             data_loader_args=None):
    if num_workers > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_workers=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    if not (callable(model_creator) and callable(optimizer_creator)
            and callable(data_creator)):
        raise ValueError(
            "Must provide a callable model_creator, optimizer_creator, "
            "and data_creator.")

    if num_replicas is not None:
        raise DeprecationWarning(
            "num_replicas is deprecated. Use num_workers instead.")

    if batch_size is not None:
        raise DeprecationWarning(
            "batch_size is deprecated. Use config={'batch_size': N} to "
            "specify a batch size for each worker or "
            "config={ray.util.sgd.utils.BATCH_SIZE: N} to specify a "
            "batch size to be used across all workers.")

    if data_loader_args:
        raise ValueError(
            "data_loader_args is deprecated. You can return a "
            "torch.utils.data.DataLoader in data_creator. Ray will "
            "automatically set a DistributedSampler if a DataLoader is "
            "returned and num_workers > 1.")

    self.model_creator = model_creator
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.data_creator = data_creator
    self.scheduler_creator = scheduler_creator
    self.training_operator_cls = training_operator_cls

    if not training_operator_cls and not loss_creator:
        raise ValueError("If a loss_creator is not provided, you must "
                         "provide a custom training operator.")

    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config
    if use_gpu == "auto":
        use_gpu = torch.cuda.is_available()

    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.debug("Using {} as backend.".format(backend))
    self.backend = backend
    self.use_gpu = use_gpu
    self.max_replicas = num_workers
    self.use_fp16 = use_fp16
    self.use_tqdm = use_tqdm
    self.add_dist_sampler = add_dist_sampler

    if apex_args and not isinstance(apex_args, dict):
        raise ValueError("apex_args needs to be a dict object.")
    self.apex_args = apex_args
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")

    self.local_worker = DeactivatedRunner()
    self.remote_workers = []

    _validate_scheduler_step_freq(scheduler_step_freq)
    self.scheduler_step_freq = scheduler_step_freq

    if not ray.is_initialized() and self.max_replicas > 1:
        logger.info("Automatically initializing single-node Ray. To use "
                    "multi-node training, be sure to run `ray.init("
                    "address='auto')` before instantiating the Trainer.")
        ray.init()
    self._start_workers(self.max_replicas)
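# Usage sketch (not part of the trainer): one plausible way to construct the
# creator-function-based trainer defined above. The constructor arguments are
# taken from the signature above; the creator signatures themselves
# (config dict in, model/dataloader/optimizer out, with optimizer_creator
# receiving the model) are assumptions for illustration.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def model_creator(config):
    # A trivial regression model.
    return nn.Linear(1, 1)


def data_creator(config):
    dataset = TensorDataset(torch.randn(64, 1), torch.randn(64, 1))
    return DataLoader(dataset, batch_size=config.get("batch_size", 32))


def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))


trainer = TorchTrainer(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss,
    num_workers=2,
    use_gpu=False,
    config={"lr": 1e-2, "batch_size": 32})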
def _start_workers(self, num_workers): logger.debug(f"start_workers: Setting %d workers." % num_workers) worker_config = self.config.copy() batch_size_per_worker = self._configure_and_split_batch(num_workers) if batch_size_per_worker: worker_config[BATCH_SIZE] = batch_size_per_worker params = dict( training_operator_cls=self.training_operator_cls, config=worker_config, serialize_data_creation=self.serialize_data_creation, use_fp16=self.use_fp16, use_gpu=self.use_gpu, use_tqdm=self.use_tqdm, apex_args=self.apex_args, scheduler_step_freq=self.scheduler_step_freq) if num_workers == 1: # Start local worker self.local_worker = TorchRunner(**params) if self.initialization_hook: self.apply_all_workers(self.initialization_hook) self.local_worker.setup_operator() else: params.update( backend=self.backend, add_dist_sampler=self.add_dist_sampler, wrap_ddp=self.wrap_ddp) # Start local worker self.local_worker = LocalDistributedRunner( num_cpus=self.num_cpus_per_worker, num_gpus=int(self.use_gpu), **params) # Generate actor class RemoteRunner = ray.remote( num_cpus=self.num_cpus_per_worker, num_gpus=int(self.use_gpu))(DistributedTorchRunner) # Start workers self.remote_workers = [ RemoteRunner.remote(**params) for i in range(num_workers - 1) ] if self.initialization_hook: self.apply_all_workers(self.initialization_hook) # Compute URL for initializing distributed PyTorch address = setup_address() # Setup the process group among all workers. remote_pgroup_setups = [ worker.setup_process_group.remote(address, i + 1, num_workers, timedelta(self.timeout_s)) for i, worker in enumerate(self.remote_workers) ] self.local_worker.setup_process_group(address, 0, num_workers, timedelta(self.timeout_s)) # Get setup tasks in order to throw errors on failure ray.get(remote_pgroup_setups) # Runs code that requires all creator functions to have run. remote_operator_setups = [ worker.setup_operator.remote() for worker in self.remote_workers ] self.local_worker.setup_operator() # Get setup tasks in order to throw errors on failure ray.get(remote_operator_setups)
def __init__(
        self,
        *,
        training_operator_cls,
        initialization_hook=None,
        config=None,
        num_workers=1,
        num_cpus_per_worker=1,
        use_gpu="auto",
        backend="auto",
        wrap_ddp=True,
        timeout_s=NCCL_TIMEOUT_S,
        use_fp16=False,
        use_tqdm=False,
        apex_args=None,
        add_dist_sampler=True,
        scheduler_step_freq=None,
        # Deprecated Args.
        num_replicas=None,
        batch_size=None,
        model_creator=None,
        data_creator=None,
        optimizer_creator=None,
        scheduler_creator=None,
        loss_creator=None,
        serialize_data_creation=None,
        data_loader_args=None,
):
    if (model_creator or data_creator or optimizer_creator
            or scheduler_creator or loss_creator):
        raise DeprecationWarning(
            "Creator functions are deprecated. You should create a "
            "custom TrainingOperator, override setup, and register all "
            "training state there. See TrainingOperator for more info. "
            "If you would still like to use creator functions, you can "
            "do CustomOperator = TrainingOperator.from_creators("
            "model_creator, ...) and pass in CustomOperator into "
            "TorchTrainer.")

    if num_workers > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_workers=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    if num_replicas is not None:
        raise DeprecationWarning(
            "num_replicas is deprecated. Use num_workers instead.")

    if batch_size is not None:
        raise DeprecationWarning(
            "batch_size is deprecated. Use config={'batch_size': N} to "
            "specify a batch size for each worker or "
            "config={ray.util.sgd.utils.BATCH_SIZE: N} to specify a "
            "batch size to be used across all workers.")

    if serialize_data_creation is True:
        if log_once("serialize_data_creation"):
            logging.warning(
                "serialize_data_creation is deprecated and will be "
                "ignored. If you require serialized data loading you "
                "should implement this in TrainingOperator.setup. "
                "You may find FileLock useful here.")

    if data_loader_args:
        raise DeprecationWarning(
            "data_loader_args is deprecated. You can return a "
            "torch.utils.data.DataLoader in data_creator. Ray will "
            "automatically set a DistributedSampler if a DataLoader is "
            "returned and num_workers > 1.")

    self.training_operator_cls = training_operator_cls
    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config
    if use_gpu == "auto":
        use_gpu = torch.cuda.is_available()

    _remind_gpu_usage(use_gpu)

    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.debug(f"Using {backend} as backend.")
    self.backend = backend
    self.num_cpus_per_worker = num_cpus_per_worker
    self.use_gpu = use_gpu
    self.max_replicas = num_workers

    self.serialize_data_creation = serialize_data_creation
    self.wrap_ddp = wrap_ddp
    self.timeout_s = timeout_s
    self.use_fp16 = use_fp16
    self.use_tqdm = use_tqdm
    self.add_dist_sampler = add_dist_sampler

    if apex_args and not isinstance(apex_args, dict):
        raise ValueError("apex_args needs to be a dict object.")
    self.apex_args = apex_args
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")

    self.local_worker = DeactivatedRunner()
    self.remote_workers = []

    if scheduler_step_freq:
        _validate_scheduler_step_freq(scheduler_step_freq)
    self.scheduler_step_freq = scheduler_step_freq

    if not ray.is_initialized() and self.max_replicas > 1:
        logger.info("Automatically initializing single-node Ray. To use "
                    "multi-node training, be sure to run `ray.init("
                    "address='auto')` before instantiating the Trainer.")
        ray.init()
    self._start_workers(self.max_replicas)
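# Usage sketch (not part of the trainer): constructing the operator-based
# trainer defined above. TrainingOperator.from_creators is the migration path
# named in the deprecation message; the keyword argument names passed to it
# here are assumptions, and model_creator/data_creator/optimizer_creator are
# placeholders for user-defined creator functions like those sketched earlier.
CustomOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss)

trainer = TorchTrainer(
    training_operator_cls=CustomOperator,
    num_workers=2,
    num_cpus_per_worker=1,
    use_gpu=False,
    config={"batch_size": 64})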
def _start_workers(self, num_workers): logger.debug(f"start_workers: Setting {num_workers} workers.") worker_config = self.config.copy() batch_size_per_worker = self._configure_and_split_batch(num_workers) if batch_size_per_worker: worker_config[BATCH_SIZE] = batch_size_per_worker params = dict( model_creator=self.model_creator, data_creator=self.data_creator, optimizer_creator=self.optimizer_creator, loss_creator=self.loss_creator, scheduler_creator=self.scheduler_creator, training_operator_cls=self.training_operator_cls, config=worker_config, use_fp16=self.use_fp16, use_gpu=True, use_tqdm=self.use_tqdm, apex_args=self.apex_args, scheduler_step_freq=self.scheduler_step_freq, ) if num_workers == 1: # Start local worker self.local_worker = TorchRunner(**params) self.apply_all_workers(_set_device_from_fluid_res) if self.initialization_hook: self.apply_all_workers(self.initialization_hook) self.local_worker.setup() else: params.update( backend=self.backend, add_dist_sampler=self.add_dist_sampler, wrap_ddp=self.wrap_ddp, ) # Start local worker self.local_worker = LocalDistributedRunner(**params) # Start remote workers # assert num_workers == len(self.extra_assigned_worker_res) + 1 self.remote_workers = [] for res_name, res_val in self.extra_assigned_worker_res: # Generate actor class RemoteRunner = ray.remote(num_cpus=1, num_gpus=res_val, resources={res_name: res_val })(DistributedTorchRunner) self.remote_workers.append(RemoteRunner.remote(**params)) self.apply_all_workers(_set_device_from_fluid_res) if self.initialization_hook: self.apply_all_workers(self.initialization_hook) # Compute URL for initializing distributed PyTorch ip = ray.services.get_node_ip_address() port = self.local_worker.find_free_port() address = "tcp://{ip}:{port}".format(ip=ip, port=port) # Runs the creator functions. remote_component_setup = [ worker.setup_components.remote() for i, worker in enumerate(self.remote_workers) ] self.local_worker.setup_components() # Get setup tasks in order to throw errors on failure ray.get(remote_component_setup) # Setup the process group among all workers. remote_pgroup_setups = [ worker.setup_process_group.remote(address, i + 1, num_workers) for i, worker in enumerate(self.remote_workers) ] self.local_worker.setup_process_group(address, 0, num_workers) # Get setup tasks in order to throw errors on failure ray.get(remote_pgroup_setups) # Runs code that requires all creator functions to have run. remote_operator_setups = [ worker.setup_ddp_and_operator.remote() for worker in self.remote_workers ] self.local_worker.setup_ddp_and_operator() # Get setup tasks in order to throw errors on failure ray.get(remote_operator_setups)
def __init__(self,
             *,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator=None,
             scheduler_creator=None,
             training_operator_cls=None,
             initialization_hook=None,
             config=None,
             use_gpu="auto",
             backend="auto",
             wrap_ddp=True,
             serialize_data_creation=True,
             use_fp16=False,
             use_tqdm=False,
             apex_args=None,
             add_dist_sampler=True,
             scheduler_step_freq=None):
    if not (callable(model_creator) and callable(optimizer_creator)
            and callable(data_creator)):
        raise ValueError(
            "Must provide a callable model_creator, optimizer_creator, "
            "and data_creator.")

    self.model_creator = model_creator
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.data_creator = data_creator
    self.scheduler_creator = scheduler_creator
    self.training_operator_cls = training_operator_cls

    if not training_operator_cls and not loss_creator:
        raise ValueError("If a loss_creator is not provided, you must "
                         "provide a custom training operator.")

    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config
    if use_gpu == "auto":
        use_gpu = torch.cuda.is_available()

    _remind_gpu_usage(use_gpu)

    num_workers = self._check_potential_workers_size()

    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.debug("Using {} as backend.".format(backend))
    self.backend = backend
    self.use_gpu = use_gpu

    self.serialize_data_creation = serialize_data_creation
    self.wrap_ddp = wrap_ddp
    self.use_fp16 = use_fp16
    self.use_tqdm = use_tqdm
    self.add_dist_sampler = add_dist_sampler

    if apex_args and not isinstance(apex_args, dict):
        raise ValueError("apex_args needs to be a dict object.")
    self.apex_args = apex_args
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")

    self.local_worker = DeactivatedRunner()
    self.remote_workers = []

    if scheduler_creator:
        _validate_scheduler_step_freq(scheduler_step_freq)
    self.scheduler_step_freq = scheduler_step_freq

    if not ray.is_initialized() and num_workers > 1:
        logger.info("Automatically initializing single-node Ray. To use "
                    "multi-node training, be sure to run `ray.init("
                    "address='auto')` before instantiating the Trainer.")
        ray.init()
    self._start_workers(num_workers)