def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError(f'Invalid launcher type: {launcher}')
def init_dist(backend='nccl', **kwargs):
    '''initialization for distributed training'''
    # if mp.get_start_method(allow_none=True) is None:
    if mp.get_start_method(allow_none=True) != 'spawn':  # returns the name of the start method used for starting processes
        mp.set_start_method('spawn', force=True)  # 'spawn' is the default on Windows
    rank = int(os.environ['RANK'])  # process rank from the system environment
    num_gpus = torch.cuda.device_count()  # number of GPUs available
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)  # initializes the default distributed process group
def init_dist(backend='nccl', **kwargs):
    '''initialization for distributed training'''
    # if mp.get_start_method(allow_none=True) is None:
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn')
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)
def init_dist(backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    # os.environ['MASTER_ADDR'] = '10.1.114.10'
    # os.environ['MASTER_PORT'] = '29500'
    # os.environ['RANK'] = '0'
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)
def init_dist(launcher, backend='nccl', **kwargs):
    '''initialization for distributed training'''
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError('Invalid launcher type: {}'.format(launcher))
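# A hypothetical usage sketch (not one of the collected snippets): dispatcher-style
# init_dist functions like the one above are typically driven by a --launcher flag in
# the training entry point. The argument names and the 'none' fallback below are
# illustrative assumptions, not part of any snippet in this collection.
import argparse
import torch.distributed as dist

def parse_args():
    parser = argparse.ArgumentParser(description='distributed training entry point')
    parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none')
    parser.add_argument('--dist-backend', default='nccl')
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()
    if args.launcher != 'none':
        # init_dist as defined in the snippet above (assumed importable here)
        init_dist(args.launcher, backend=args.dist_backend)
        print('initialized rank {} of {}'.format(dist.get_rank(), dist.get_world_size()))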
def init_dist(backend, **kwargs):
    """initialization for distributed training"""
    # These packages have globals that screw with Windows, so only import them if needed.
    import torch.distributed as dist
    import torch.multiprocessing as mp
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn')
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)
def init_dist(backend='nccl', master_ip='127.0.0.1', port=29500):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    os.environ['MASTER_ADDR'] = master_ip
    os.environ['MASTER_PORT'] = str(port)
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend)
    return rank, world_size
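# A hypothetical usage sketch (not one of the collected snippets): the variant above
# assumes RANK and WORLD_SIZE are already exported by the launcher (e.g. torchrun or
# torch.distributed.launch) and only fills in MASTER_ADDR/MASTER_PORT itself. The
# all_reduce sanity check below is an illustrative assumption.
import torch
import torch.distributed as dist

def main():
    # RANK and WORLD_SIZE are assumed to be set in the environment by the launcher.
    rank, world_size = init_dist(backend='nccl', master_ip='127.0.0.1', port=29500)
    device = torch.device('cuda', torch.cuda.current_device())
    tensor = torch.ones(1, device=device)
    dist.all_reduce(tensor)  # sanity check: each rank should end up with world_size
    if rank == 0:
        print('world_size={}, all_reduce result={}'.format(world_size, tensor.item()))

if __name__ == '__main__':
    main()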
def init_dist(launcher='pytorch', backend='nccl', **kwargs):
    if dist.is_initialized():
        return torch.cuda.current_device()
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    gpu_id = rank % num_gpus
    torch.cuda.set_device(gpu_id)
    dist.init_process_group(backend=backend, **kwargs)
    return gpu_id
def init_distributed_mode():
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    is_slurm_job = "SLURM_JOB_ID" in os.environ
    if is_slurm_job:
        _init_dist_slurm()
    else:
        _init_dist_pytorch()
    return get_dist_info()
def init_dist(launcher, backend="nccl", **kwargs): if mp.get_start_method(allow_none=True) is None: mp.set_start_method("spawn") if launcher == "pytorch": _init_dist_pytorch(backend, **kwargs) elif launcher == "mpi": _init_dist_mpi(backend, **kwargs) elif launcher == "slurm": _init_dist_slurm(backend, **kwargs) else: raise ValueError("Invalid launcher type: {}".format(launcher))
def init_environment(cfg):
    @master_only
    def _pprint_cfg():
        pprint(dict(cfg))

    if mp.get_start_method(allow_none=True) != "forkserver":
        mp.set_start_method("forkserver")
    colorama.init()
    _init_dist_and_device(cfg)
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    _pprint_cfg()
def init_dist(backend='nccl', rank=0):
    '''initialization for distributed training'''
    # if mp.get_start_method(allow_none=True) is None:
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn')
    # rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend,
                            init_method="tcp://127.0.0.1:23571",
                            world_size=num_gpus,
                            rank=rank)
def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(local_rank % num_gpus)
    dist.init_process_group(backend=backend,
                            init_method='tcp://127.0.0.1:%d' % tcp_port,
                            rank=local_rank,
                            world_size=num_gpus)
    rank = dist.get_rank()
    return num_gpus, rank
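# A hypothetical usage sketch (not one of the collected snippets): the TCP-based
# init_dist_pytorch above expects one process per visible GPU, each passing its own
# local_rank, so on a single node it can be driven with torch.multiprocessing.spawn.
# The port number and the worker function below are illustrative assumptions.
import torch
import torch.multiprocessing as mp

def _worker(local_rank):
    # init_dist_pytorch as defined in the snippet above; 18888 is an arbitrary free port.
    num_gpus, rank = init_dist_pytorch(tcp_port=18888, local_rank=local_rank)
    print('rank {} of {} initialized'.format(rank, num_gpus))

if __name__ == '__main__':
    mp.spawn(_worker, nprocs=torch.cuda.device_count())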
def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError('Invalid launcher type: {}'.format(launcher))
def main():
    args = Options().parse()
    torch.backends.cudnn.benchmark = True
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    init_dist('pytorch', backend=args.dist_backend)
    logger = get_root_logger('INFO')
    args.lr = args.lr * dist.get_world_size()
    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)
    main_worker(args)
def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device('cpu') num_action = 2 num_state = 4 num_process = 5 global_Actor = NeuralNet.ActorNet(inputs=num_state, outputs=num_action, num_hidden_layers=2, hidden_dim=8).to(device) #summary(global_Actor, input_size=(10,num_state)) global_Critic = NeuralNet.CriticNet(inputs=num_state, outputs=1, num_hidden_layers=2, hidden_dim=8).to(device) #summary(global_Critic, input_size=(10,num_state)) batch_size = 64 GAMMA = 0.95 max_episodes = 5000 max_step = 1000 global_Actor.share_memory() global_Critic.share_memory() processes = [] processes_socket = [] processes_agent = [] mp.set_start_method('spawn') print("MP start method:", mp.get_start_method()) ip = '110.76.78.109' port = 1111 for rank in range(num_process): processes_socket.append(0) processes_socket[rank] = ClientSocket.MySocket(port, 'f', 'ffff?f') processes_agent.append(0) processes_agent[rank] = Agent.Brain(GlobalActorNet=global_Actor, GlobalCriticNet=global_Critic, device=device, socket=processes_socket[rank], num_action=num_action, max_episodes=max_episodes, max_step=max_step, batch_size=batch_size, GAMMA=GAMMA) p = mp.Process(target=processes_agent[rank].train, args=()) p.start() processes.append(p) for p in processes: p.join()
def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(local_rank % num_gpus)
    dist.init_process_group(backend=backend,
                            init_method='tcp://127.0.0.1:%d' % tcp_port,
                            rank=local_rank,
                            world_size=num_gpus)
    assert batch_size % num_gpus == 0, \
        'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, num_gpus)
    batch_size_each_gpu = batch_size // num_gpus
    rank = dist.get_rank()
    return batch_size_each_gpu, rank
def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
    elif launcher == 'infimpi':
        set_environment_variables_for_nccl_backend(ompi_size() == ompi_local_size())
        _init_dist_infimpi(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError('Invalid launcher type: {}'.format(launcher))
def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    # os.environ['MASTER_PORT'] = str(tcp_port)
    # os.environ['MASTER_ADDR'] = 'localhost'
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(local_rank % num_gpus)
    dist.init_process_group(
        backend=backend,
        # init_method='tcp://127.0.0.1:%d' % tcp_port,
        # rank=local_rank,
        # world_size=num_gpus
    )
    rank = dist.get_rank()
    return num_gpus, rank
def __init__(
    self,
    env,
    policy,
    num_workers: int,
    *,
    min_rollouts: int = None,
    min_steps: int = None,
    show_progress_bar: bool = True,
    seed: int = NO_SEED_PASSED,
):
    """
    Constructor

    :param env: environment to sample from
    :param policy: policy to act in the environment (can also be an exploration strategy)
    :param num_workers: number of parallel samplers
    :param min_rollouts: minimum number of complete rollouts to sample
    :param min_steps: minimum total number of steps to sample
    :param show_progress_bar: if `True`, display a progress bar using `tqdm`
    :param seed: seed value for the random number generators, pass `None` for no seeding;
                 defaults to the last seed that was set with `pyrado.set_seed`
    """
    Serializable._init(self, locals())
    super().__init__(min_rollouts=min_rollouts, min_steps=min_steps)

    self.env = env
    self.policy = policy
    self.show_progress_bar = show_progress_bar

    # Set method to spawn if using cuda
    if mp.get_start_method(allow_none=True) != "spawn":
        mp.set_start_method("spawn", force=True)

    # Create parallel pool. We use one thread per env because it's easier.
    self.pool = SamplerPool(num_workers)

    if seed is NO_SEED_PASSED:
        seed = pyrado.get_base_seed()
    self._seed = seed

    # Initialize with -1 such that we start with the 0-th sample. Incrementing after sampling
    # may cause issues when the sampling crashes and the sample count is not incremented.
    self._sample_count = -1

    # Distribute environments. We use pickle to make sure a copy is created for n_envs=1
    self.pool.invoke_all(_ps_init, pickle.dumps(self.env), pickle.dumps(self.policy))
def test_dataloader(self):
    dataset = Dataset(
        data=[{"img": np.array([[[0.0, 1.0], [2.0, 3.0]]])},
              {"img": np.array([[[0.0, 1.0], [2.0, 3.0]]])}],
        transform=IntensityStatsd(keys="img", ops=["max", "mean"], key_prefix="orig"),
    )
    # set num workers = 0 for mac / win
    num_workers = 2 if sys.platform == "linux" else 0
    dataloader = DataLoader(dataset=dataset, num_workers=num_workers, batch_size=2)
    orig_method = mp.get_start_method()
    mp.set_start_method("spawn", force=True)
    for d in dataloader:
        meta = d[PostFix.meta("img")]
        np.testing.assert_allclose(meta["orig_max"], [3.0, 3.0], atol=1e-3)
        np.testing.assert_allclose(meta["orig_mean"], [1.5, 1.5], atol=1e-3)
    # restore the mp method
    mp.set_start_method(orig_method, force=True)
def init_dist(opt, local_rank):
    """Adopted from BasicSR."""
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl')
    rank, world_size = get_dist_info()
    opt.update({
        'dist': True,
        'device': 'cuda',
        'local_rank': local_rank,
        'world_size': world_size,
        'rank': rank
    })
def setup_multi_processes(cfg, workers_per_gpu):
    """Setup multi-processing environment variables."""
    logger = get_root_logger()

    # set multi-process start method
    if platform.system() != 'Windows':
        mp_start_method = cfg.get('mp_start_method', None)
        current_method = mp.get_start_method(allow_none=True)
        if mp_start_method in ('fork', 'spawn', 'forkserver'):
            logger.info(
                f'Multi-processing start method `{mp_start_method}` is '
                f'different from the previous setting `{current_method}`. '
                f'It will be force set to `{mp_start_method}`.')
            mp.set_start_method(mp_start_method, force=True)
        else:
            logger.info(f'Multi-processing start method is `{mp_start_method}`')

    # disable opencv multithreading to avoid system being overloaded
    opencv_num_threads = cfg.get('opencv_num_threads', None)
    if isinstance(opencv_num_threads, int):
        logger.info(f'OpenCV num_threads is `{opencv_num_threads}`')
        cv2.setNumThreads(opencv_num_threads)
    else:
        logger.info(f'OpenCV num_threads is `{cv2.getNumThreads()}`')

    if workers_per_gpu > 1:
        # setup OMP threads
        # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py  # noqa
        omp_num_threads = cfg.get('omp_num_threads', None)
        if 'OMP_NUM_THREADS' not in os.environ:
            if isinstance(omp_num_threads, int):
                logger.info(f'OMP num threads is {omp_num_threads}')
                os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)
        else:
            logger.info(f'OMP num threads is {os.environ["OMP_NUM_THREADS"]}')

        # setup MKL threads
        if 'MKL_NUM_THREADS' not in os.environ:
            mkl_num_threads = cfg.get('mkl_num_threads', None)
            if isinstance(mkl_num_threads, int):
                logger.info(f'MKL num threads is {mkl_num_threads}')
                os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
        else:
            logger.info(f'MKL num threads is {os.environ["MKL_NUM_THREADS"]}')
def setup_multi_processes(cfg):
    """Setup multi-processing environment variables."""
    # set multi-process start method as `fork` to speed up the training
    if platform.system() != 'Windows':
        mp_start_method = cfg.get('mp_start_method', 'fork')
        current_method = mp.get_start_method(allow_none=True)
        if current_method is not None and current_method != mp_start_method:
            warnings.warn(
                f'Multi-processing start method `{mp_start_method}` is '
                f'different from the previous setting `{current_method}`. '
                f'It will be force set to `{mp_start_method}`. You can change '
                f'this behavior by changing `mp_start_method` in your config.')
        mp.set_start_method(mp_start_method, force=True)

    # disable opencv multithreading to avoid system being overloaded
    opencv_num_threads = cfg.get('opencv_num_threads', 0)
    cv2.setNumThreads(opencv_num_threads)

    # setup OMP threads
    # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py  # noqa
    workers_per_gpu = cfg.data.get('workers_per_gpu', 1)
    if 'train_dataloader' in cfg.data:
        workers_per_gpu = max(
            cfg.data.train_dataloader.get('workers_per_gpu', 1), workers_per_gpu)
    if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
        omp_num_threads = 1
        warnings.warn(
            f'Setting OMP_NUM_THREADS environment variable for each process '
            f'to be {omp_num_threads} in default, to avoid your system being '
            f'overloaded, please further tune the variable for optimal '
            f'performance in your application as needed.')
        os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)

    # setup MKL threads
    if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
        mkl_num_threads = 1
        warnings.warn(
            f'Setting MKL_NUM_THREADS environment variable for each process '
            f'to be {mkl_num_threads} in default, to avoid your system being '
            f'overloaded, please further tune the variable for optimal '
            f'performance in your application as needed.')
        os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
def initialSettings(port, backend='nccl'):
    method = mp.get_start_method(allow_none=True)
    if method is None:
        mp.set_start_method('spawn')
    logger.info('multiprocessing start method:{}'.format(method))
    procId = int(os.environ.get('SLURM_PROCID'))
    numOfTasks = int(os.environ.get('SLURM_NTASKS'))
    nodeList = os.environ.get('SLURM_JOB_NODELIST')
    numOfGPUs = torch.cuda.device_count()
    torch.cuda.set_device(procId % numOfGPUs)
    if '[' in nodeList:
        beg = nodeList.find('[')
        pos1 = nodeList.find('-', beg)
        if pos1 < 0:
            pos1 = 1000
        pos2 = nodeList.find(',', beg)
        if pos2 < 0:
            pos2 = 1000
        nodeList = nodeList[:min(pos1, pos2)].replace('[', '')
    addr = nodeList[8:].replace('-', '.')
    os.environ['MASTER_PORT'] = port
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(numOfTasks)
    os.environ['RANK'] = str(procId)
    if backend == 'nccl':
        distributed.init_process_group(backend='nccl')
    else:
        distributed.init_process_group(backend='gloo', rank=procId, world_size=numOfTasks)
    rank = distributed.get_rank()
    worldSize = distributed.get_world_size()
    return rank, worldSize
def main():
    parser = utils.prepare_parser()
    parser = utils.add_dgp_parser(parser)
    config = vars(parser.parse_args())
    utils.dgp_update_config(config)
    print(config)
    rank = 0
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)
    if config['dist']:
        rank, world_size = dist_init(config['port'])
    # Seed RNG
    utils.seed_rng(rank + config['seed'])
    # Setup cudnn.benchmark for free speed
    torch.backends.cudnn.benchmark = True
    # train
    trainer = Trainer(config)
    trainer.run()
def init_dist(distributed=True, backend='nccl', master_ip='tcp://127.0.0.1', port=6501):
    if not distributed:
        return
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    global gpu_id
    os.environ['MASTER_ADDR'] = master_ip
    os.environ['MASTER_PORT'] = str(port)
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    num_gpus = torch.cuda.device_count()
    gpu_id = rank % num_gpus
    torch.cuda.set_device(gpu_id)
    dist_url = master_ip + ':' + str(port)
    dist.init_process_group(backend=backend, init_method=dist_url,
                            world_size=world_size, rank=rank)
    print("dist initialized. master_ip: %s, port: %s, rank: %d/%d" %
          (master_ip, str(port), rank, world_size))
    return rank, world_size
def main(): # get local rank from distributed launcher parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int) parser.add_argument('--world_size', type=int) args = parser.parse_args() print('what is the rank of the current program: ') print(args.local_rank) print('world size: ') print(args.world_size) # initialize dist if mp.get_start_method(allow_none=True) is None: mp.set_start_method('spawn') torch.cuda.set_device(int(args.local_rank)) dist.init_process_group(backend='nccl', init_method='env://') # define dataset dataset = DentalDataset( num_class=NUM_CLASS, ann_file=ANN_FILE, img_prefix=IMG_PREFIX, img_scale=IMG_SCALE, img_norm_cfg=IMG_TRANSFORM_CONFIG, multiscale_mode='value', flip_ratio=FLIP_RATIO, with_label=False, extra_aug=None, test_mode=True, ) # sampler for make number of samples % number of gpu == 0 sampler = NewDistributedSampler(dataset=dataset, num_replicas=args.world_size, images_per_gpu=IMGS_PER_GPU, rank=args.local_rank, shuffle=False) # data loader. Note this is the code for one (each) gpu. data_loader = DataLoader( dataset=dataset, batch_size=IMGS_PER_GPU, # when sampler is given, shuffle must be False. shuffle=False, sampler=sampler, batch_sampler=None, num_workers=WORKERS_PER_GPU, collate_fn=partial(collate, samples_per_gpu=IMGS_PER_GPU), pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, ) # define the model and restore checkpoint model = SSDDetector( # basic input_size=IMG_SCALE, num_classes=NUM_CLASS, in_channels=(512, 1024, 512, 256, 256), use_dropout=False, dropout_rate=None, # anchor generate anchor_ratios=([1 / 2.0, 1.0, 2.0], [1 / 3.0, 1 / 2.0, 1.0, 2.0, 3.0], [1 / 3.0, 1 / 2.0, 1.0, 2.0, 3.0], [1 / 3.0, 1 / 2.0, 1.0, 2.0, 3.0], [1 / 2.0, 1.0, 2.0]), anchor_strides=((16, 16), (16, 16), (30, 30), (60, 60), (100, 100)), basesizes=((12, 12), (16, 16), (24, 24), (30, 30), (36, 36)), allowed_border=-1, # regression target_means=(.0, .0, .0, .0), target_stds=(0.1, 0.1, 0.2, 0.2), # box assign pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0., gt_max_assign_all=False, # sampling sampling=False, # balancing the loss neg_pos_ratio=3, # loss smoothl1_beta=1., # inference nms nms_pre=-1, score_thr=0.02, min_size=100.0, max_scale_ratio=10.0, nms_cfg=['nms', 0.45, None], max_per_img=200, # device device=None, ) model.cuda(args.local_rank) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) if hasattr(model, 'module'): model = model.module # load checkpoint loc = 'cuda:{}'.format(args.local_rank) checkpoint = torch.load(CHECKPOINT_FILE, map_location=loc) # optimizer.state_dict -> state, param_groups # state -> var series number -> step / exp_avg / exp_avg_sq # param_groups -> lr / betas / eps / weight_decay / amsgrad / params model.load_state_dict(checkpoint['state_dict'], strict=True) # enable dropout during inference model.eval() # for m in model.modules(): # if m.__class__.__name__.startswith('Dropout'): # m.train() # results and progress bar results = [] if args.local_rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) # enumerate all data for i, data_pair in enumerate(data_loader): data_pair_img = data_pair['img'].data[0].cuda(args.local_rank, non_blocking=True) data_pair_img_meta = data_pair['img_meta'].data[0] with torch.no_grad(): result = model( is_test=True, img=data_pair_img, img_meta=data_pair_img_meta, rescale=True, ) results.extend(result) # update program bar only if it is rank 0. 
if args.local_rank == 0: for _ in range(IMGS_PER_GPU * args.world_size): prog_bar.update() # collect results from all gpus results = collect_results(result_part=results, dataset_real_size=len(dataset), tmpdir=TMPDIR) # write results to file # [Number of images, Number of classes, (k, 5)]. # 5 for t, l, b, r, and prob. if args.local_rank == 0: print('\nwriting results to {}'.format(OUT_FILE)) mmcv.dump(results, OUT_FILE)
def __init__(self, args, model, optimizer, train_loader, val_loader, input_train_transform,
             input_val_transform, output_transform, losses, scheduler=None):
    # Allow multiple processes to access tensors on GPU. Add checking for multiple continuous runs.
    if multiprocessing.get_start_method(allow_none=True) is None:
        multiprocessing.set_start_method(method='spawn')

    self.logger = get_logger(name=__name__, save_file=args.log_path / args.run_name)

    # Checking whether inputs are correct.
    assert isinstance(model, nn.Module), '`model` must be a Pytorch Module.'
    assert isinstance(optimizer, optim.Optimizer), '`optimizer` must be a Pytorch Optimizer.'
    assert isinstance(train_loader, DataLoader) and isinstance(val_loader, DataLoader), \
        '`train_loader` and `val_loader` must be Pytorch DataLoader objects.'
    assert callable(input_train_transform) and callable(input_val_transform), \
        'input_transforms must be callable functions.'
    # I think this would be best practice.
    assert isinstance(output_transform, nn.Module), '`output_transform` must be a Pytorch Module.'

    # 'losses' is expected to be a dictionary.
    # Even composite losses should be a single loss module with a dictionary output.
    losses = nn.ModuleDict(losses)

    if scheduler is not None:
        if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
            self.metric_scheduler = True
        elif isinstance(scheduler, optim.lr_scheduler._LRScheduler):
            self.metric_scheduler = False
        else:
            raise TypeError('`scheduler` must be a Pytorch Learning Rate Scheduler.')

    # Display interval of 0 means no display of validation images on TensorBoard.
    if args.max_images <= 0:
        self.display_interval = 0
    else:
        self.display_interval = int(len(val_loader.dataset) // (args.max_images * args.batch_size))

    self.checkpointer = CheckpointManager(model, optimizer, mode='min',
                                          save_best_only=args.save_best_only,
                                          ckpt_dir=args.ckpt_path, max_to_keep=args.max_to_keep)

    # loading from checkpoint if specified.
    if vars(args).get('prev_model_ckpt'):
        self.checkpointer.load(load_dir=args.prev_model_ckpt, load_optimizer=False)

    self.model = model
    self.optimizer = optimizer
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.input_train_transform = input_train_transform
    self.input_val_transform = input_val_transform
    self.output_transform = output_transform
    self.losses = losses
    self.scheduler = scheduler

    self.verbose = args.verbose
    self.num_epochs = args.num_epochs
    self.smoothing_factor = args.smoothing_factor
    self.use_slice_metrics = args.use_slice_metrics

    self.img_lambda = torch.tensor(args.img_lambda, dtype=torch.float32, device=args.device)
    self.writer = SummaryWriter(str(args.log_path))
def main(): # get local rank from distributed launcher parser = argparse.ArgumentParser() parser.add_argument("--local_rank", type=int) args = parser.parse_args() print('what is the rank of the current program: ') print(args.local_rank) # initialize dist if mp.get_start_method(allow_none=True) is None: mp.set_start_method('spawn') rank = int(args.local_rank) torch.cuda.set_device(rank) dist.init_process_group(backend='nccl', init_method='env://') # define dataset dataset = DentalClassDataset( ann_file=ann_file, img_prefix=img_prefix, img_scale=img_scale, img_norm_cfg=img_transform_cfg, multiscale_mode='value', # select a scale, rather than random from a range. flip_ratio=flip_ratio, with_label=False, extra_aug=None, test_mode=True, ) # sampler for make number of samples % number of gpu == 0 rank, world_size = get_dist_info() sampler = NewDistributedSampler( dataset=dataset, num_replicas=world_size, images_per_gpu=imgs_per_gpu, rank=rank, shuffle=False ) # data loader. Note this is the code for one (each) gpu. batch_size = imgs_per_gpu num_workers = workers_per_gpu data_loader = DataLoader( dataset=dataset, batch_size=batch_size, # when sampler is given, shuffle must be False. shuffle=False, sampler=sampler, batch_sampler=None, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, ) # define the model and restore checkpoint model = VGGClassifier( with_bn=False, num_classes=len(dataset.CLASSES), num_stages=5, dilations=(1, 1, 1, 1, 1), out_indices=(30,), frozen_stages=-1, bn_eval=True, bn_frozen=False, ceil_mode=True, with_last_pool=True, dimension_before_fc=(10, 15), dropout_rate=0.5, pos_loss_weights=torch.tensor((15, 8), dtype=torch.float32, device=torch.device('cuda', rank)), ) checkpoint = load_checkpoint( model=model, filename=checkpoint_file, map_location='cpu', strict=False, logger=None ) # define classes model.CLASSES = checkpoint['meta']['CLASSES'] # parallelize model model = model.cuda() model = MMDistributedDataParallel( module=model, dim=0, broadcast_buffers=True, bucket_cap_mb=25 ) model.eval() # results and progress bar results = [] dataset = data_loader.dataset if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) # enumerate all data for i, data in enumerate(data_loader): with torch.no_grad(): result = model(is_test=True, rescale=True, **data) results.extend(result) # update program bar only if it is rank 0. if rank == 0: batch_size = data['img'].size(0) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all gpus results = collect_results( result_part=results, dataset_real_size=len(dataset), tmpdir=tmpdir ) # write results to file # [Number of images, Number of classes, (k, 5)]. # 5 for t, l, b, r, and prob. if rank == 0: print('\nwriting results to {}'.format(out_file)) mmcv.dump(results, out_file+'.pickle')