def machine_params(cls, mode="train", gpu_id="default", n_train_processes="default", **kwargs):
    if mode == "train":
        if n_train_processes == "default":
            nprocesses = cls.NUM_TRAIN_SAMPLERS
        else:
            nprocesses = n_train_processes
    elif mode == "valid":
        nprocesses = 0
    elif mode == "test":
        nprocesses = min(100 if torch.cuda.is_available() else 8, cls.NUM_TEST_TASKS)
    else:
        raise NotImplementedError("mode must be 'train', 'valid', or 'test'.")

    if gpu_id == "default":
        devices = [] if cls.GPU_ID is None else [cls.GPU_ID]
    else:
        devices = [gpu_id]

    return MachineParams(nprocesses=nprocesses, devices=devices)

def machine_params(cls, mode="train", **kwargs) -> MachineParams:
    """Return the number of processes and gpu_ids to use with training."""
    num_gpus = cuda.device_count()
    has_gpu = num_gpus != 0

    sampler_devices = None
    if mode == "train":
        nprocesses = cls.num_train_processes() if torch.cuda.is_available() else 1
        devices = (
            list(range(min(nprocesses, num_gpus)))
            if has_gpu
            else [torch.device("cpu")]
        )
    elif mode == "valid":
        devices = [num_gpus - 1] if has_gpu else [torch.device("cpu")]
        nprocesses = 2 if has_gpu else 0
    else:
        nprocesses = 20 if has_gpu else 1
        devices = (
            list(range(min(nprocesses, num_gpus)))
            if has_gpu
            else [torch.device("cpu")]
        )

    nprocesses = split_processes_onto_devices(
        nprocesses=nprocesses, ndevices=len(devices)
    )

    return MachineParams(
        nprocesses=nprocesses,
        devices=devices,
        sampler_devices=sampler_devices,
        sensor_preprocessor_graph=cls.resnet_preprocessor_graph(mode=mode)
        if cls.USE_RESNET_CNN
        else None,
    )

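# The helper below is NOT the library implementation; it is a minimal sketch of
# what a process-splitting utility such as `split_processes_onto_devices` (used
# above) or `evenly_distribute_count_into_bins` (used in later configs) is
# assumed to do: turn a total worker count into a per-device list whose entries
# differ by at most one and sum to the original count.
from typing import List


def split_processes_onto_devices_sketch(nprocesses: int, ndevices: int) -> List[int]:
    assert ndevices > 0, "need at least one device"
    base, extra = divmod(nprocesses, ndevices)
    # Earlier devices absorb the remainder, e.g. (20, 3) -> [7, 7, 6].
    return [base + (1 if i < extra else 0) for i in range(ndevices)]
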
def machine_params(self, mode="train", **kwargs):
    if mode == "train":
        workers_per_device = 1
        gpu_ids = (
            []
            if not torch.cuda.is_available()
            else self.TRAINING_GPUS * workers_per_device
        )
        nprocesses = (
            1
            if not torch.cuda.is_available()
            else evenly_distribute_count_into_bins(self.NUM_PROCESSES, len(gpu_ids))
        )
    elif mode == "valid":
        nprocesses = 1
        gpu_ids = [] if not torch.cuda.is_available() else self.VALIDATION_GPUS
    elif mode == "test":
        nprocesses = 1
        gpu_ids = [] if not torch.cuda.is_available() else self.TESTING_GPUS
    else:
        raise NotImplementedError("mode must be 'train', 'valid', or 'test'.")

    sensor_preprocessor_graph = (
        SensorPreprocessorGraph(
            source_observation_spaces=SensorSuite(self.SENSORS).observation_spaces,
            preprocessors=self.PREPROCESSORS,
        )
        if mode == "train"
        or (
            (isinstance(nprocesses, int) and nprocesses > 0)
            or (isinstance(nprocesses, Sequence) and sum(nprocesses) > 0)
        )
        else None
    )

    return MachineParams(
        nprocesses=nprocesses,
        devices=gpu_ids,
        sensor_preprocessor_graph=sensor_preprocessor_graph,
    )

def worker_devices(self, mode: str):
    machine_params: MachineParams = MachineParams.instance_from(
        self.config.machine_params(mode)
    )
    devices = machine_params.devices

    assert all_equal(devices) or all(
        d.index >= 0 for d in devices
    ), f"Cannot have a mix of CPU and GPU devices (`devices == {devices}`)"

    get_logger().info(
        "Using {} {} workers on devices {}".format(len(devices), mode, devices)
    )
    return devices

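# `all_equal` above is assumed to be a small utility reporting whether every
# entry of a sequence is identical, so the assertion reads: either all workers
# share one device (e.g. all CPU) or every device is a non-negative GPU index.
# A minimal sketch under that assumption:
def all_equal_sketch(values) -> bool:
    return all(v == values[0] for v in values[1:]) if len(values) > 0 else True
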
def machine_params(self, mode="train", **kwargs):
    sampler_devices: Sequence[int] = []
    if mode == "train":
        workers_per_device = 1
        gpu_ids = (
            []
            if not torch.cuda.is_available()
            else self.TRAIN_GPU_IDS * workers_per_device
        )
        nprocesses = (
            1
            if not torch.cuda.is_available()
            else evenly_distribute_count_into_bins(self.NUM_PROCESSES, len(gpu_ids))
        )
        sampler_devices = self.SAMPLER_GPU_IDS
    elif mode == "valid":
        nprocesses = 1
        gpu_ids = [] if not torch.cuda.is_available() else self.VALID_GPU_IDS
    elif mode == "test":
        nprocesses = 5 if torch.cuda.is_available() else 1
        gpu_ids = [] if not torch.cuda.is_available() else self.TEST_GPU_IDS
    else:
        raise NotImplementedError("mode must be 'train', 'valid', or 'test'.")

    sensors = [*self.SENSORS]
    if mode != "train":
        sensors = [s for s in sensors if not isinstance(s, ExpertActionSensor)]

    sensor_preprocessor_graph = (
        SensorPreprocessorGraph(
            source_observation_spaces=SensorSuite(sensors).observation_spaces,
            preprocessors=self.preprocessors(),
        )
        if mode == "train"
        or (
            (isinstance(nprocesses, int) and nprocesses > 0)
            or (isinstance(nprocesses, Sequence) and sum(nprocesses) > 0)
        )
        else None
    )

    return MachineParams(
        nprocesses=nprocesses,
        devices=gpu_ids,
        sampler_devices=sampler_devices
        if mode == "train"
        else gpu_ids,  # ignored with > 1 gpu_ids
        sensor_preprocessor_graph=sensor_preprocessor_graph,
    )

def machine_params(self, mode="train", **kwargs):
    sampler_devices: Sequence[torch.device] = []
    devices: Sequence[torch.device]
    if mode == "train":
        workers_per_device = 1
        devices = (
            [torch.device("cpu")]
            if not torch.cuda.is_available()
            else cast(Tuple, self.train_gpu_ids) * workers_per_device
        )
        nprocesses = evenly_distribute_count_into_bins(
            self.num_train_processes, max(len(devices), 1)
        )
        sampler_devices = self.sampler_devices
    elif mode == "valid":
        nprocesses = 1
        devices = (
            [torch.device("cpu")]
            if not torch.cuda.is_available()
            else self.val_gpu_ids
        )
    elif mode == "test":
        nprocesses = 10 if torch.cuda.is_available() else 1
        devices = (
            [torch.device("cpu")]
            if not torch.cuda.is_available()
            else self.test_gpu_ids
        )
    else:
        raise NotImplementedError("mode must be 'train', 'valid', or 'test'.")

    sensors = [*self.SENSORS]
    if mode != "train":
        sensors = [s for s in sensors if not isinstance(s, ExpertActionSensor)]

    sensor_preprocessor_graph = (
        SensorPreprocessorGraph(
            source_observation_spaces=SensorSuite(sensors).observation_spaces,
            preprocessors=self.preprocessors(),
        )
        if mode == "train"
        or (
            (isinstance(nprocesses, int) and nprocesses > 0)
            or (isinstance(nprocesses, Sequence) and sum(nprocesses) > 0)
        )
        else None
    )

    return MachineParams(
        nprocesses=nprocesses,
        devices=devices,
        sampler_devices=sampler_devices
        if mode == "train"
        else devices,  # ignored with > 1 gpu_ids
        sensor_preprocessor_graph=sensor_preprocessor_graph,
    )

def machine_params(self, mode="train", **kwargs):
    sampler_devices: Sequence[int] = []
    if mode == "train":
        workers_per_device = 1
        gpu_ids = (
            []
            if not torch.cuda.is_available()
            else self.TRAIN_GPU_IDS * workers_per_device
        )
        nprocesses = (
            1
            if not torch.cuda.is_available()
            else self.split_num_processes(len(gpu_ids))
        )
        sampler_devices = self.TRAIN_GPU_IDS
    elif mode == "valid":
        nprocesses = 1
        gpu_ids = [] if not torch.cuda.is_available() else self.VALID_GPU_IDS
    elif mode == "test":
        nprocesses = 7
        gpu_ids = [] if not torch.cuda.is_available() else self.TEST_GPU_IDS
    else:
        raise NotImplementedError("mode must be 'train', 'valid', or 'test'.")

    # Disable parallelization for validation process
    if mode == "valid":
        for prep in self.PREPROCESSORS:
            prep.kwargs["parallel"] = False

    sensor_preprocessor_graph = (
        SensorPreprocessorGraph(
            source_observation_spaces=SensorSuite(self.SENSORS).observation_spaces,
            preprocessors=self.PREPROCESSORS,
        )
        if mode == "train"
        or (
            (isinstance(nprocesses, int) and nprocesses > 0)
            or (isinstance(nprocesses, Sequence) and sum(nprocesses) > 0)
        )
        else None
    )

    return MachineParams(
        nprocesses=nprocesses,
        devices=gpu_ids,
        sampler_devices=sampler_devices
        if mode == "train"
        else gpu_ids,  # ignored with > 1 gpu_ids
        sensor_preprocessor_graph=sensor_preprocessor_graph,
    )

def start_train(
    self,
    checkpoint: Optional[str] = None,
    restart_pipeline: bool = False,
    max_sampler_processes_per_worker: Optional[int] = None,
):
    if not self.disable_config_saving:
        self.save_project_state()

    devices = self.worker_devices("train")
    num_workers = len(devices)

    # Be extra careful to ensure that all models start
    # with the same initializations.
    set_seed(self.seed)
    initial_model_state_dict = self.config.create_model(
        sensor_preprocessor_graph=MachineParams.instance_from(
            self.config.machine_params(self.mode)
        ).sensor_preprocessor_graph
    ).state_dict()

    distributed_port = 0
    if num_workers > 1:
        distributed_port = find_free_port()

    for trainer_it in range(num_workers):
        train: BaseProcess = self.mp_ctx.Process(
            target=self.train_loop,
            kwargs=dict(
                id=trainer_it,
                checkpoint=checkpoint,
                restart_pipeline=restart_pipeline,
                experiment_name=self.experiment_name,
                config=self.config,
                results_queue=self.queues["results"],
                checkpoints_queue=self.queues["checkpoints"]
                if self.running_validation
                else None,
                checkpoints_dir=self.checkpoint_dir(),
                seed=self.seed,
                deterministic_cudnn=self.deterministic_cudnn,
                mp_ctx=self.mp_ctx,
                num_workers=num_workers,
                device=devices[trainer_it],
                distributed_port=distributed_port,
                max_sampler_processes_per_worker=max_sampler_processes_per_worker,
                initial_model_state_dict=initial_model_state_dict,
            ),
        )
        train.start()
        self.processes["train"].append(train)

    get_logger().info(
        "Started {} train processes".format(len(self.processes["train"]))
    )

    # Validation
    if self.running_validation:
        device = self.worker_devices("valid")[0]
        self.init_visualizer("valid")
        valid: BaseProcess = self.mp_ctx.Process(
            target=self.valid_loop,
            args=(0,),
            kwargs=dict(
                config=self.config,
                results_queue=self.queues["results"],
                checkpoints_queue=self.queues["checkpoints"],
                seed=12345,  # TODO allow the same order for randomly sampled tasks? Is this of any use anyway?
                deterministic_cudnn=self.deterministic_cudnn,
                deterministic_agents=self.deterministic_agents,
                mp_ctx=self.mp_ctx,
                device=device,
                max_sampler_processes_per_worker=max_sampler_processes_per_worker,
            ),
        )
        valid.start()
        self.processes["valid"].append(valid)

        get_logger().info(
            "Started {} valid processes".format(len(self.processes["valid"]))
        )
    else:
        get_logger().info(
            "No processes allocated to validation, no validation will be run."
        )

    self.log(self.local_start_time_str, num_workers)

    return self.local_start_time_str

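# `find_free_port` is only needed when several training workers must agree on a
# port for distributed communication. Its exact implementation is not shown
# here; a common sketch is to bind a socket to port 0 and let the OS choose a
# free port.
import socket


def find_free_port_sketch() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]
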
def init_visualizer(self, mode: str):
    if not self.disable_tensorboard:
        # Note: Avoid instantiating anything in machine_params (use Builder if needed)
        machine_params = MachineParams.instance_from(
            self.config.machine_params(mode)
        )
        self.visualizer = machine_params.visualizer

def running_validation(self):
    return (
        sum(
            MachineParams.instance_from(
                self.config.machine_params("valid")
            ).nprocesses
        )
        > 0
    )

def start_train(
    self,
    checkpoint: Optional[str] = None,
    restart_pipeline: bool = False,
    max_sampler_processes_per_worker: Optional[int] = None,
):
    self._initialize_start_train_or_start_test()

    if not self.disable_config_saving:
        self.save_project_state()

    devices = self.worker_devices(TRAIN_MODE_STR)
    num_workers = len(devices)

    # Be extra careful to ensure that all models start
    # with the same initializations.
    set_seed(self.seed)
    initial_model_state_dict = self.config.create_model(
        sensor_preprocessor_graph=MachineParams.instance_from(
            self.config.machine_params(self.mode)
        ).sensor_preprocessor_graph
    ).state_dict()

    distributed_port = 0
    if num_workers > 1:
        distributed_port = find_free_port()

    model_hash = None
    for trainer_it in range(num_workers):
        training_kwargs = dict(
            id=trainer_it,
            checkpoint=checkpoint,
            restart_pipeline=restart_pipeline,
            experiment_name=self.experiment_name,
            config=self.config,
            results_queue=self.queues["results"],
            checkpoints_queue=self.queues["checkpoints"]
            if self.running_validation
            else None,
            checkpoints_dir=self.checkpoint_dir(),
            seed=self.seed,
            deterministic_cudnn=self.deterministic_cudnn,
            mp_ctx=self.mp_ctx,
            num_workers=num_workers,
            device=devices[trainer_it],
            distributed_port=distributed_port,
            max_sampler_processes_per_worker=max_sampler_processes_per_worker,
            initial_model_state_dict=initial_model_state_dict
            if model_hash is None
            else model_hash,
        )
        train: BaseProcess = self.mp_ctx.Process(
            target=self.train_loop,
            kwargs=training_kwargs,
        )
        try:
            train.start()
        except ValueError as e:
            # If the `initial_model_state_dict` is too large we sometimes
            # run into errors passing it with multiprocessing. In such cases
            # we instead hash the state_dict and confirm, in each engine worker,
            # that this hash equals the hash of the model the engine worker
            # instantiates.
            if e.args[0] == "too many fds":
                model_hash = md5_hash_of_state_dict(initial_model_state_dict)
                training_kwargs["initial_model_state_dict"] = model_hash
                train = self.mp_ctx.Process(
                    target=self.train_loop,
                    kwargs=training_kwargs,
                )
                train.start()
            else:
                raise e
        self.processes[TRAIN_MODE_STR].append(train)

    get_logger().info(
        "Started {} train processes".format(len(self.processes[TRAIN_MODE_STR]))
    )

    # Validation
    if self.running_validation:
        device = self.worker_devices("valid")[0]
        self.init_visualizer("valid")
        valid: BaseProcess = self.mp_ctx.Process(
            target=self.valid_loop,
            args=(0,),
            kwargs=dict(
                config=self.config,
                results_queue=self.queues["results"],
                checkpoints_queue=self.queues["checkpoints"],
                seed=12345,  # TODO allow the same order for randomly sampled tasks? Is this of any use anyway?
                deterministic_cudnn=self.deterministic_cudnn,
                deterministic_agents=self.deterministic_agents,
                mp_ctx=self.mp_ctx,
                device=device,
                max_sampler_processes_per_worker=max_sampler_processes_per_worker,
            ),
        )
        valid.start()
        self.processes["valid"].append(valid)

        get_logger().info(
            "Started {} valid processes".format(len(self.processes["valid"]))
        )
    else:
        get_logger().info(
            "No processes allocated to validation, no validation will be run."
        )

    self.log_and_close(self.local_start_time_str, num_workers)

    return self.local_start_time_str

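# `md5_hash_of_state_dict` is used above as a lightweight stand-in for the full
# state dict when it is too large to pass through multiprocessing. The real
# helper is not shown; the sketch below assumes every value in the state dict is
# a tensor and hashes a deterministic serialization of parameter names and
# values, so each worker can recompute and compare the hash.
import hashlib


def md5_hash_of_state_dict_sketch(state_dict) -> str:
    md5 = hashlib.md5()
    for name in sorted(state_dict):
        tensor = state_dict[name]
        md5.update(name.encode("utf-8"))
        md5.update(tensor.detach().cpu().numpy().tobytes())
    return md5.hexdigest()
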