def initialize(self, amp_id: int, num_losses: int, use_amp: bool, amp_opt_level: str,
               device: torch.device):
    self._amp_id = amp_id
    self._use_amp = use_amp

    if APEX_AVAILABLE and self._use_amp:
        # Register the model and optimizer with apex.amp for mixed precision.
        self._model, self._optimizer = amp.initialize(
            self._model, self._optimizer, opt_level=amp_opt_level, num_losses=num_losses)
        if on_multiple_gpus(get_devices()):
            self._model = ApexDDP(self._model, delay_allreduce=True)
    elif on_multiple_gpus(get_devices()):
        # Without apex AMP, fall back to PyTorch's native DistributedDataParallel
        # so multi-GPU runs still synchronize gradients.
        self._model = DDP(self._model, device_ids=[device])
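# A hedged sketch of the backward step that would pair with the AMP setup above:
# apex requires wrapping the backward pass in amp.scale_loss, selecting the
# per-loss scaler by the amp_id registered in initialize(). The method name
# _backward is an illustrative assumption, not taken from this codebase.
def _backward(self, loss):
    if APEX_AVAILABLE and self._use_amp:
        # loss_id picks one of the num_losses scalers created at init time.
        with amp.scale_loss(loss, self._optimizer, loss_id=self._amp_id) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()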
def _initialize_ddp_process_group(self):
    if on_multiple_gpus(self._devices):
        if NCCL_AVAILABLE:
            # Fill in the env:// rendezvous variables only when the launcher
            # has not already set them.
            if os.environ.get("MASTER_ADDR") is None:
                os.environ["MASTER_ADDR"] = "127.0.0.1"
            if os.environ.get("MASTER_PORT") is None:
                os.environ["MASTER_PORT"] = str(self._get_random_free_port())
            if os.environ.get("WORLD_SIZE") is None:
                os.environ["WORLD_SIZE"] = str(self._world_size)
            torch.distributed.init_process_group(
                backend="nccl",
                init_method="env://",
                world_size=int(os.environ["WORLD_SIZE"]),
                rank=self._local_rank)
        else:
            raise Exception("NCCL not available and required for multi-GPU training.")
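# The method above assumes a _get_random_free_port helper that is not shown here.
# A minimal sketch, using the common bind-to-port-0 trick so the OS assigns an
# available port for the MASTER_PORT rendezvous; illustrative only.
import socket

def _get_random_free_port(self) -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("127.0.0.1", 0))  # Port 0 asks the OS for any free port.
        return sock.getsockname()[1]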
                                            dataset_configs["ABIDE"].test_patch_size,
                                            dataset_configs["ABIDE"].test_step,
                                            test_image=ABIDE_reconstruction._target_images[0]))

# Concat datasets.
if len(dataset_configs) > 1:
    train_dataset = torch.utils.data.ConcatDataset(train_datasets)
    valid_dataset = torch.utils.data.ConcatDataset(valid_datasets)
    test_dataset = torch.utils.data.ConcatDataset(test_datasets)
else:
    train_dataset = train_datasets[0]
    valid_dataset = valid_datasets[0]
    test_dataset = test_datasets[0]

# Create samplers.
if on_multiple_gpus(run_config.devices):
    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset, run_config.world_size, run_config.local_rank)
    valid_sampler = torch.utils.data.DistributedSampler(
        valid_dataset, run_config.world_size, run_config.local_rank)
    test_sampler = torch.utils.data.DistributedSampler(
        test_dataset, run_config.world_size, run_config.local_rank)
else:
    train_sampler, valid_sampler, test_sampler = None, None, None

# Create loaders.
dataloaders = list(map(lambda dataset, sampler: DataLoader(dataset,
                                                           training_config.batch_size,
                                                           sampler=sampler,
                                                           shuffle=sampler is None,
                                                           num_workers=args.num_workers,
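# Usage note (illustrative): with a DistributedSampler, set_epoch must be called
# at the start of every epoch so each replica reshuffles consistently across
# epochs. num_epochs and train_dataloader are placeholder names, not identifiers
# from this codebase.
for epoch in range(num_epochs):
    if train_sampler is not None:
        train_sampler.set_epoch(epoch)
    for batch in train_dataloader:
        ...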
def test_on_multiple_gpu_should_return_true_with_multiple_GPU_devices(self):
    assert_that(on_multiple_gpus(self._multiple_gpus_devices), is_(True))
def test_on_multiple_gpu_should_return_false_with_single_GPU_device(self):
    assert_that(on_multiple_gpus(self._single_gpu_device), is_(False))
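# A minimal sketch of the on_multiple_gpus helper these tests exercise, assuming
# it simply reports whether more than one CUDA device was requested; the real
# implementation may differ.
from typing import List

import torch

def on_multiple_gpus(devices: List[torch.device]) -> bool:
    return len([device for device in devices if device.type == "cuda"]) > 1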