def multiprocessing_context(self, multiprocessing_context):
    """Validate and store the multiprocessing context used to spawn DataLoader
    worker processes.

    Accepts either a start-method name (string) or a ready-made
    ``multiprocessing`` context object. A string is resolved to a context via
    ``multiprocessing.get_context``. ``None`` keeps the platform default.

    Raises:
        ValueError: if multi-process loading is disabled (num_workers == 0),
            if the Python build does not support start methods, if the string
            is not a valid start method, or if the value is neither a string
            nor a context object.
    """
    if multiprocessing_context is not None:
        # A custom context only makes sense when workers are actually spawned.
        if self.num_workers > 0:
            # `_supports_context` is set on Python >= 3.4 builds that expose
            # per-context start methods.
            if not multiprocessing._supports_context:
                raise ValueError(
                    'multiprocessing_context relies on Python >= 3.4, with '
                    'support for different start methods')
            if isinstance(multiprocessing_context, string_classes):
                # Resolve a start-method name to an actual context object.
                valid_start_methods = multiprocessing.get_all_start_methods(
                )
                if multiprocessing_context not in valid_start_methods:
                    raise ValueError((
                        'multiprocessing_context option '
                        'should specify a valid start method in {}, but got '
                        'multiprocessing_context={}').format(
                            valid_start_methods, multiprocessing_context))
                multiprocessing_context = multiprocessing.get_context(
                    multiprocessing_context)
            # Whatever we hold now must be a real context object.
            if not isinstance(multiprocessing_context,
                              python_multiprocessing.context.BaseContext):
                raise ValueError((
                    'multiprocessing_context option should be a valid context '
                    'object or a string specifying the start method, but got '
                    'multiprocessing_context={}'
                ).format(multiprocessing_context))
        else:
            raise ValueError(
                ('multiprocessing_context can only be used with '
                 'multi-process loading (num_workers > 0), but got '
                 'num_workers={}').format(self.num_workers))
    self.__multiprocessing_context = multiprocessing_context
def __init__(
    self,
    strategy: Strategy,
    start_method: Literal["spawn", "fork", "forkserver"] = "spawn",
) -> None:
    """Store the strategy and the worker start method.

    Raises:
        ValueError: if ``start_method`` is not supported on this platform.
    """
    self._strategy = strategy
    self._start_method = start_method
    available = mp.get_all_start_methods()
    if self._start_method not in available:
        raise ValueError(
            f"The start method '{self._start_method}' is not available on this platform. Available methods are:"
            f" {', '.join(mp.get_all_start_methods())}")
def get_multiprocessing_context(self, start_method: str):
    """Return the multiprocessing context matching ``start_method``.

    'fork' is the cheapest option but it is only available on Unix and is
    not usable when the policy runs on GPU, so it is only honoured when
    ``self.policy.device`` is ``"cpu"``. 'forkserver' is the second choice
    (Unix only); 'spawn' works everywhere.

    :param start_method: requested start method ('fork', 'forkserver' or 'spawn').
    :return: a ``multiprocessing`` context object for the chosen method.
    :raises ValueError: if the requested method is unavailable on this system.
    """
    # Fork requires both OS support and a CPU-resident policy.
    fork_available = 'fork' in multiprocessing.get_all_start_methods() and self.policy.device == "cpu"
    forkserver_available = 'forkserver' in multiprocessing.get_all_start_methods()
    if fork_available and start_method == 'fork':
        start_method_used = 'fork'
    elif forkserver_available and start_method == 'forkserver':
        start_method_used = 'forkserver'
    elif start_method == 'spawn':
        start_method_used = 'spawn'
    else:
        # Raise a specific exception type rather than bare Exception;
        # ValueError subclasses Exception, so existing callers still catch it.
        raise ValueError('Please provide a valid start method. Options are for this system: {}'.format(
            multiprocessing.get_all_start_methods()
        ))
    return multiprocessing.get_context(start_method_used)
def create_dataset_sampler_loader(file_path, cuda, batch_size, hvd):
    """Build the MNIST dataset together with its distributed sampler and loader.

    :param file_path: path to the MNIST data file.
    :param cuda: whether CUDA is in use (enables pinned memory + workers).
    :param batch_size: per-rank batch size for the DataLoader.
    :param hvd: the Horovod module/handle providing size() and rank().
    :return: (dataset, sampler, loader) tuple.
    """
    # DataLoader extras are only worthwhile on CUDA.
    if cuda:
        kwargs = {'num_workers': 1, 'pin_memory': True}
    else:
        kwargs = {}
    # When supported, spawn dataloader workers with 'forkserver' instead of
    # 'fork' to prevent issues with Infiniband implementations that are not
    # fork-safe.
    wants_forkserver = (
        kwargs.get('num_workers', 0) > 0
        and hasattr(mp, '_supports_context')
        and 'forkserver' in mp.get_all_start_methods()
    )
    if wants_forkserver:
        kwargs['multiprocessing_context'] = 'forkserver'
    dataset = MNISTDataset(file_path)
    # Horovod: partition the training data across ranks.
    sampler = Data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=sampler, **kwargs)
    return dataset, sampler, loader
def multiprocessing_context(self, multiprocessing_context):
    """Validate ``multiprocessing_context`` and store it.

    A start-method name (string) is resolved to a context object via
    ``multiprocessing.get_context``; any other non-None value must already
    be a ``BaseContext`` instance.

    Raises:
        ValueError: if the string is not a valid start method.
        TypeError: if the value is neither a string nor a context object.
    """
    ctx = multiprocessing_context
    if ctx is not None:
        if isinstance(ctx, string_classes):
            methods = multiprocessing.get_all_start_methods()
            if ctx not in methods:
                raise ValueError(
                    ('multiprocessing_context option '
                     'should specify a valid start method in {}, but got '
                     'multiprocessing_context={}').format(methods, ctx))
            ctx = multiprocessing.get_context(ctx)
        if not isinstance(ctx, python_multiprocessing.context.BaseContext):
            raise TypeError((
                'multiprocessing_context option should be a valid context '
                'object or a string specifying the start method, but got '
                'multiprocessing_context={}'
            ).format(ctx))
    self.__multiprocessing_context = ctx
def __init__(self, env_fns, env_mem_usages=None, start_method=None):
    """Spawn one worker process per environment and wire up command pipes.

    :param env_fns: list of callables, each creating one environment.
    :param env_mem_usages: optional per-env values forwarded to the worker;
        when omitted the env index is forwarded instead.
    :param start_method: multiprocessing start method; defaults to
        'forkserver' when available, otherwise 'spawn'.
    """
    self.waiting = False
    self.closed = False
    n_envs = len(env_fns)
    if start_method is None:
        # Fork is not a thread safe method (see issue #217)
        # but is more user friendly (does not require to wrap the code in
        # a `if __name__ == "__main__":`)
        forkserver_available = 'forkserver' in multiprocessing.get_all_start_methods()
        start_method = 'forkserver' if forkserver_available else 'spawn'
    ctx = multiprocessing.get_context(start_method)
    self.remotes, self.work_remotes = zip(
        *[ctx.Pipe(duplex=True) for _ in range(n_envs)])
    self.processes = []
    for i, (work_remote, remote, env_fn) in enumerate(
            zip(self.work_remotes, self.remotes, env_fns)):
        # Forward the memory usage when given, otherwise the env index.
        # (idiom fix: was `if not env_mem_usages is None`)
        if env_mem_usages is not None:
            args = (work_remote, remote, CloudpickleWrapper(env_fn),
                    env_mem_usages[i])
        else:
            args = (work_remote, remote, CloudpickleWrapper(env_fn), i)
        # daemon=True: if the main process crashes, we should not cause things to hang
        # pytype:disable=attribute-error
        process = ctx.Process(target=profiled_worker, args=args, daemon=True)
        process.start()
        self.processes.append(process)
        # Close the worker's end of the pipe in the parent process.
        work_remote.close()
    # Query the first env for its spaces; envs are assumed homogeneous.
    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def evaluate(args):
    """Distributed (Horovod) sliding-window evaluation of a 3D UNet on
    synthetic Nifti image/segmentation pairs, reporting a mean Dice metric.

    NOTE(review): assumes one GPU per Horovod local rank — confirm deployment.
    """
    # initialize Horovod library
    hvd.init()
    # Horovod limits CPU threads to be used per worker
    torch.set_num_threads(1)
    if hvd.local_rank() == 0 and not os.path.exists(args.dir):
        # create 16 random image, mask pairs for evaluation
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(16):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))
    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    # sorted glob keeps img/seg files pairwise aligned by index
    val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)]
    # define transforms for image and segmentation
    val_transforms = Compose(
        [
            LoadImaged(keys=["img", "seg"]),
            AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
            ScaleIntensityd(keys="img"),
            EnsureTyped(keys=["img", "seg"]),
        ]
    )
    # create an evaluation data loader
    val_ds = Dataset(data=val_files, transform=val_transforms)
    # create an evaluation data sampler (no shuffling for deterministic eval)
    val_sampler = DistributedSampler(val_ds, shuffle=False, num_replicas=hvd.size(), rank=hvd.rank())
    # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent
    # issues with Infiniband implementations that are not fork-safe
    multiprocessing_context = None
    if hasattr(mp, "_supports_context") and mp._supports_context and "forkserver" in mp.get_all_start_methods():
        multiprocessing_context = "forkserver"
    # sliding window inference needs to input 1 image in every iteration
    val_loader = DataLoader(
        val_ds,
        batch_size=1,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=val_sampler,
        multiprocessing_context=multiprocessing_context,
    )
    dice_metric = DiceMetric(include_background=True, reduction="mean",
                             get_not_nans=False)
    # post-processing: sigmoid activation then 0.5 threshold to binarise
    post_trans = Compose([EnsureType(), Activations(sigmoid=True), AsDiscrete(threshold=0.5)])
    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{hvd.local_rank()}")
    torch.cuda.set_device(device)
    model = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    if hvd.rank() == 0:
        # load model parameters for evaluation (rank 0 only; then broadcast)
        model.load_state_dict(torch.load("final_model.pth"))
    # Horovod broadcasts parameters from rank 0 to all other ranks
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    model.eval()
    with torch.no_grad():
        for val_data in val_loader:
            val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device)
            # define sliding window size and batch size for windows inference
            roi_size = (96, 96, 96)
            sw_batch_size = 4
            val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model)
            val_outputs = [post_trans(i) for i in decollate_batch(val_outputs)]
            dice_metric(y_pred=val_outputs, y=val_labels)
        # aggregate the final mean dice result across all iterations
        metric = dice_metric.aggregate().item()
        dice_metric.reset()
        if hvd.rank() == 0:
            print("evaluation metric:", metric)
def main():
    """LAMB/ResNet-50 ImageNet training driver using Horovod for data
    parallelism, with gradient accumulation (`num_batches_per_step`),
    checkpoint resume, LR warmup/decay, and optional DALI data loading.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--evaluate', action='store_true')
    parser.add_argument('--suffix', default='')
    parser.add_argument('--seed', type=int)
    parser.add_argument('--num_epochs', type=int)
    parser.add_argument('--total_batch_size', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--dataset_path')
    parser.add_argument('--num_workers', type=int)
    parser.add_argument('--num_threads', type=int)
    parser.add_argument('--base_lr', type=float)
    parser.add_argument('--lr_scaling')
    parser.add_argument('--weight_decay', type=float)
    parser.add_argument('--warmup_epochs', type=float)
    parser.add_argument('--bias_correction', default=None, action='store_true')
    parser.add_argument('--save_checkpoint', default=None, action='store_true')
    parser.add_argument('--dali', default=None, action='store_true')
    args = parser.parse_args()
    ##################
    # Update configs #
    ##################
    # Fill any CLI option left unset from the `configs` defaults.
    for k, v in configs.items():
        if getattr(args, k) is None:
            setattr(args, k, v)
    for k, v in vars(args).items():
        printr(f'[{k}] = {v}')
    if args.device is not None and args.device != 'cpu':
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        cudnn.benchmark = True
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.device == 'cuda':
        # Deterministic cuDNN overrides the benchmark flag set above.
        cudnn.deterministic = True
        cudnn.benchmark = False
    # Number of gradient-accumulation micro-batches per optimizer step.
    num_batches_per_step = args.total_batch_size // (args.batch_size * hvd.size())
    if num_batches_per_step * args.batch_size * hvd.size(
    ) != args.total_batch_size:
        raise ValueError(
            f'total_batch_size({args.total_batch_size}) is not integer multiples of batch_size({args.batch_size}) * GPUs({hvd.size()})'
        )
    save_path = f'runs/lamb-{args.total_batch_size}{args.suffix}.np{hvd.size()}'
    printr(f'[save_path] = {save_path}')
    # Per-rank checkpoint file names (epoch-stamped, latest, best).
    checkpoint_path = os.path.join(save_path, 'checkpoints')
    checkpoint_path_fmt = os.path.join(checkpoint_path,
                                       f'e{"{epoch}"}-r{hvd.rank()}.pth')
    latest_pth_path = os.path.join(checkpoint_path,
                                   f'latest-r{hvd.rank()}.pth')
    best_pth_path = os.path.join(checkpoint_path, f'best-r{hvd.rank()}.pth')
    os.makedirs(checkpoint_path, exist_ok=True)
    if args.evaluate:
        # Evaluation mode loads the best checkpoint instead of the latest.
        latest_pth_path = best_pth_path
    #####################################################################
    # Initialize DataLoaders, Model, Criterion, LRScheduler & Optimizer #
    #####################################################################
    printr(f'\n==> creating dataset from "{args.dataset_path}"')
    if args.dali:
        dataset = DaliImageNet(args.dataset_path,
                               batch_size=args.batch_size,
                               train_batch_size=args.batch_size *
                               num_batches_per_step,
                               shard_id=hvd.rank(),
                               num_shards=hvd.size(),
                               num_workers=args.num_workers)
    else:
        dataset = ImageNetFolder(args.dataset_path)
    # Horovod: limit # of CPU threads to be used per worker.
    loader_kwargs = {
        'num_workers': args.num_workers,
        'pin_memory': True
    } if args.device == 'cuda' else {}
    # When supported, use 'forkserver' to spawn dataloader workers
    # instead of 'fork' to prevent issues with Infiniband implementations
    # that are not fork-safe
    if (loader_kwargs.get('num_workers', 0) > 0
            and hasattr(mp, '_supports_context') and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        loader_kwargs['multiprocessing_context'] = 'forkserver'
    printr(f'\n==> loading dataset "{loader_kwargs}""')
    torch.set_num_threads(args.num_threads)
    if args.dali:
        # DALI pipelines act as their own loaders; no samplers needed.
        samplers, loaders = {split: None for split in dataset}, dataset
    else:
        samplers, loaders = {}, {}
        for split in dataset:
            # Horovod: use DistributedSampler to partition data among workers.
            # Manually specify `num_replicas=hvd.size()` and `rank=hvd.rank()`.
            samplers[split] = torch.utils.data.distributed.DistributedSampler(
                dataset[split], num_replicas=hvd.size(), rank=hvd.rank())
            loaders[split] = torch.utils.data.DataLoader(
                dataset[split],
                batch_size=args.batch_size *
                (num_batches_per_step if split == 'train' else 1),
                sampler=samplers[split],
                drop_last=(num_batches_per_step > 1 and split == 'train'),
                **loader_kwargs)
    printr(f'\n==> creating model "resnet50"')
    model = models.resnet50()
    model = model.to(args.device)
    criterion = LabelSmoothLoss(smoothing=0.1).to(args.device)
    # Horovod: scale learning rate by the number of GPUs.
    lr = args.base_lr
    if args.lr_scaling == 'sqrt':
        lr *= math.sqrt(num_batches_per_step * hvd.size())
    elif args.lr_scaling == 'linear':
        lr *= num_batches_per_step * hvd.size()
    printr(f'\n==> creating optimizer LAMB with LR = {lr}')
    optimizer = create_lamb_optimizer(model,
                                      lr,
                                      weight_decay=args.weight_decay,
                                      bias_correction=args.bias_correction)
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        backward_passes_per_step=num_batches_per_step,
        op=hvd.Average)
    # resume from checkpoint
    last_epoch, best_metric = -1, None
    if os.path.exists(latest_pth_path):
        printr(f'\n[resume_path] = {latest_pth_path}')
        checkpoint = torch.load(latest_pth_path)
        if 'model' in checkpoint:
            model.load_state_dict(checkpoint.pop('model'))
        if 'optimizer' in checkpoint:
            optimizer.load_state_dict(checkpoint.pop('optimizer'))
        last_epoch = checkpoint.get('epoch', last_epoch)
        best_metric = checkpoint.get('meters', {}).get(f'{METRIC}_best',
                                                       best_metric)
        # Horovod: broadcast parameters.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    else:
        printr('\n==> train from scratch')
        # Horovod: broadcast parameters & optimizer state.
        printr('\n==> broadcasting paramters and optimizer state')
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    num_steps_per_epoch = len(loaders['train'])
    warmup_lr_epochs = getattr(args, 'warmup_epochs', 0)
    # Resume scheduler at the right step when restarting mid-run.
    last = max((last_epoch - warmup_lr_epochs + 1) * num_steps_per_epoch - 2,
               -1)
    decay_steps = args.num_epochs * num_steps_per_epoch
    warmup_steps = warmup_lr_epochs
    if warmup_lr_epochs > 0:
        warmup_steps *= num_steps_per_epoch
    scheduler = lr_scheduler.PolynomialWarmup(optimizer,
                                              decay_steps,
                                              warmup_steps,
                                              end_lr=0.0,
                                              power=1.0,
                                              last_epoch=last)
    ############
    # Training #
    ############
    training_meters = make_meters()
    meters = evaluate(model,
                      device=args.device,
                      meters=training_meters,
                      loader=loaders['test'],
                      split='test',
                      dali=args.dali)
    for k, meter in meters.items():
        printr(f'[{k}] = {meter:.2f}')
    if args.evaluate or last_epoch >= args.num_epochs:
        return
    if hvd.rank() == 0:
        # TensorBoard logging only on rank 0.
        from torch.utils.tensorboard import SummaryWriter
        timestamp = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        tensorboard_path = os.path.join(save_path, timestamp)
        writer = SummaryWriter(tensorboard_path)
    else:
        writer = None
    for current_epoch in range(last_epoch + 1, args.num_epochs):
        printr(
            f'\n==> training epoch {current_epoch + 1}/{args.num_epochs}')
        train(model=model,
              loader=loaders['train'],
              device=args.device,
              epoch=current_epoch,
              sampler=samplers['train'],
              criterion=criterion,
              optimizer=optimizer,
              scheduler=scheduler,
              batch_size=args.batch_size,
              num_batches_per_step=num_batches_per_step,
              num_steps_per_epoch=num_steps_per_epoch,
              warmup_lr_epochs=warmup_lr_epochs,
              schedule_lr_per_epoch=False,
              writer=writer,
              quiet=hvd.rank() != 0,
              dali=args.dali)
        meters = dict()
        for split, loader in loaders.items():
            if split != 'train':
                meters.update(
                    evaluate(model,
                             loader=loader,
                             device=args.device,
                             meters=training_meters,
                             split=split,
                             quiet=hvd.rank() != 0,
                             dali=args.dali))
        best = False
        if best_metric is None or best_metric < meters[METRIC]:
            best_metric, best = meters[METRIC], True
        meters[f'{METRIC}_best'] = best_metric
        if writer is not None:
            # X-axis in TensorBoard = total images seen so far.
            num_inputs = ((current_epoch + 1) * num_steps_per_epoch *
                          num_batches_per_step * args.batch_size * hvd.size())
            print('')
            for k, meter in meters.items():
                print(f'[{k}] = {meter:.2f}')
                writer.add_scalar(k, meter, num_inputs)
        checkpoint = {
            'epoch': current_epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'meters': meters
        }
        # save checkpoint
        if args.save_checkpoint:
            checkpoint_path = checkpoint_path_fmt.format(epoch=current_epoch)
            torch.save(checkpoint, checkpoint_path)
            shutil.copyfile(checkpoint_path, latest_pth_path)
            if best:
                shutil.copyfile(checkpoint_path, best_pth_path)
            if current_epoch >= 3:
                # Keep only a sliding window of the last 3 epoch checkpoints.
                os.remove(checkpoint_path_fmt.format(epoch=current_epoch - 3))
            printr(f'[save_path] = {checkpoint_path}')
def main():
    """CIFAR-10 training driver with Horovod data parallelism, pluggable
    gradient compression, checkpoint resume, and TensorBoard logging on
    rank 0.
    """
    global args, best_prec1, best_prec5
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # horovod initialize
    hvd.init()
    log = None
    if hvd.rank() == 0:
        # Only rank 0 writes TensorBoard logs.
        log = SummaryWriter(log_dir=args.log_dir)
        print('The Training Model is %s' % args.arch)
    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))
    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'
    # Per-local-rank data directory avoids concurrent download clashes.
    train_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(),
                                     train=True,
                                     transform=transforms.Compose([
                                         transforms.RandomCrop(32, padding=4),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                     ]),
                                     download=True)
    val_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(),
                                   train=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                       normalize,
                                   ]))
    # Horovod Partition the training data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler,
        **kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=val_sampler,
        **kwargs)
    # model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    # NOTE(review): if args.arch matches none of the branches below, `model`
    # is never bound and the code fails later with NameError — confirm the
    # accepted arch values are constrained upstream.
    if args.arch in resnet.__dict__:
        model = resnet.__dict__[args.arch]()
    elif args.arch == 'alexnet':
        model = models.AlexNet()
    elif args.arch == 'vgg16':
        model = models.VGG16()
    if hvd.rank() == 0:
        numel = sum(p.numel() for p in model.parameters())
        print('Total params: {:d}'.format(numel))
    lr_scaler = hvd.size()
    if args.cuda:
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.evaluate, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.half:
        model.half()
        criterion.half()
    base_optimizer = torch.optim.SGD(model.parameters(),
                                     args.lr * lr_scaler,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer,
    #     milestones=[100, 150], last_epoch=args.start_epoch - 1)
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)
    # Compression
    # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size())
    # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size())
    compression = Allreduce(NoneCompressor(), NoneMemory())
    # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size())
    # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size())
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        base_optimizer, compression,
        named_parameters=model.named_parameters())
    if hvd.rank() == 0:
        log.add_scalar('train/accuracy', 0., 0)
        log.add_scalar('test/accuracy', 0., 0)
    for epoch in range(args.start_epoch + 1, args.epochs + 1):
        adjust_learning_rate(optimizer, epoch, size=lr_scaler)
        if hvd.rank() == 0:
            print('current lr {:.5e}'.format(
                optimizer.param_groups[0]['lr']))
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log=log)
        # evaluate on validation set
        prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log)
        # remember best prec@1 and save checkpoint
        best_prec1 = max(prec1, best_prec1)
        best_prec5 = max(prec5, best_prec5)
        if hvd.rank() == 0:
            print('Best Pred@1:{:.2f}%, Prec@5:{:.2f}%\n'.format(
                best_prec1, best_prec5))
        # if epoch > 0 and epoch % args.save_every == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
        #
        #     save_checkpoint({
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'model.th'))
    if hvd.rank() == 0:
        log.close()
def main():
    """MNIST training driver with Horovod data parallelism; logs accuracy
    every 10 epochs and saves the model, args, and epoch times on rank 0.
    """
    args = parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    local_rank = hvd.local_rank()
    world_size = hvd.size()
    if args.cuda:
        device = torch.device(f'cuda:{local_rank}')
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(device)
        torch.cuda.manual_seed(args.seed)
    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'
    # Horovod: use DistributedSampler to partition the training data.
    data = prepare_datasets(args,
                            rank=local_rank,
                            num_workers=world_size,
                            data='mnist')
    model = Net()
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1
    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()
    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)
    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    loss_fn = nn.CrossEntropyLoss()
    epoch_times = []
    for epoch in range(1, args.epochs + 1):
        t0 = time.time()
        train(epoch,
              data['training'],
              rank=local_rank,
              model=model,
              loss_fn=loss_fn,
              optimizer=optimizer,
              args=args,
              scaler=None)
        # Skip the first two (warmup) epochs when timing.
        if epoch > 2:
            epoch_times.append(time.time() - t0)
        if epoch % 10 == 0:
            if hvd.local_rank() == 0:
                accuracy = evaluate(model=model,
                                    test_loader=data['testing'].loader)
                logger.log('-' * 75)
                logger.log(f'Epoch: {epoch}, Accuracy: {accuracy}')
                logger.log('-' * 75)
    if local_rank == 0:
        # Rank 0 persists timings, model weights, and run arguments.
        epoch_times_str = ', '.join(str(x) for x in epoch_times)
        logger.log('Epoch times:')
        logger.log(epoch_times_str)
        outdir = os.path.join(os.getcwd(), 'results_mnist',
                              f'size{world_size}')
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        modeldir = os.path.join(outdir, 'saved_models')
        modelfile = os.path.join(modeldir, 'hvd_model_mnist.pth')
        if not os.path.isdir(modeldir):
            os.makedirs(modeldir)
        logger.log(f'Saving model to: {modelfile}')
        torch.save(model.state_dict(), modelfile)
        args_file = os.path.join(outdir, f'args_size{world_size}.json')
        logger.log(f'Saving args to: {args_file}.')
        with open(args_file, 'at') as f:
            json.dump(args.__dict__, f, indent=4)
        times_file = os.path.join(outdir,
                                  f'epoch_times_size{world_size}.csv')
        logger.log(f'Saving epoch times to: {times_file}')
        with open(times_file, 'a') as f:
            f.write(epoch_times_str + '\n')
# Script fragment: CUDA setup plus MNIST dataset/sampler construction for a
# Horovod run. NOTE(review): the enclosing function/scope is not visible here.
if args.cuda:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(args.seed)
# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(1)
kwargs = {"num_workers": 1, "pin_memory": True} if args.cuda else {}
# When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
# issues with Infiniband implementations that are not fork-safe
if (
    kwargs.get("num_workers", 0) > 0
    and hasattr(mp, "_supports_context")
    and mp._supports_context
    and "forkserver" in mp.get_all_start_methods()
):
    kwargs["multiprocessing_context"] = "forkserver"
# Per-rank data directory avoids concurrent download clashes.
train_dataset = datasets.MNIST(
    "data-%d" % hvd.rank(),
    train=True,
    download=True,
    transform=transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    ),
)
# Horovod: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank()
)
torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' data_dir = args.data_dir or f'data-{hvd.rank()}' train_dataset = \ datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler,
def pretrain(
    run_name: str,
    #
    # Data
    train_filepath: str = DEFAULT_CSNJS_TRAIN_FILEPATH,
    spm_filepath: str = DEFAULT_SPM_UNIGRAM_FILEPATH,
    num_workers=1,
    limit_dataset_size=-1,
    max_length=1024,
    subword_regularization_alpha: float = 0,
    program_mode="contrastive",
    loss_mode="infonce",  # infonce, mlm, or hybrid
    min_alternatives=1,
    #
    # Model
    resume_path: str = "",
    encoder_type: str = "transformer",
    lstm_project_mode: str = "hidden",
    n_encoder_layers: int = 6,
    d_model: int = 512,
    n_head: int = 8,
    #
    # Optimization
    num_epochs: int = 100,
    save_every: int = 1,
    batch_size: int = 256,
    lr: float = 8e-4,
    weight_decay: float = 0,
    adam_betas=(0.9, 0.98),
    warmup_steps: int = 5000,
    num_steps: int = 600000,
    #
    # Horovod
    use_adasum: bool = False,
    fp16_allreduce: bool = False,
    gradient_predivide_factor: float = 1.0,
    #
    # Computational
    use_cuda: bool = True,
    seed: int = 0,
):
    # NOTE: no docstring on purpose — `config = locals()` below snapshots the
    # parameter namespace; keeping the body header minimal makes that obvious.
    hvd.init()
    logger.info("L:", n_encoder_layers, type(n_encoder_layers))
    logger.info("H:", d_model, type(d_model))
    logger.info("A:", n_head, type(n_head))
    run_name = str(run_name)  # support numerical run ids
    slurm_job_id = os.environ.get("SLURM_JOB_ID")
    slurm_job_hostname = os.environ.get("SLURM_JOB_NODELIST")
    # Snapshot of all parameters + locals defined so far, used as run config.
    config = locals()
    logger.info(f"Config = \n{config}")
    logger.info("Training configuration: {}".format(config))
    logger.info(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")
    logger.info(f"CUDA_DEVICE_ORDER = '{os.environ.get('CUDA_DEVICE_ORDER')}'")
    # Only certain (program_mode, loss_mode) combinations are meaningful.
    assert program_mode in ["contrastive", "identity", "augmentation"]
    assert loss_mode == "infonce" or loss_mode == "mlm" or loss_mode == "hybrid"
    assert not (program_mode == "contrastive" and loss_mode == "mlm")
    assert not (program_mode != "contrastive" and
                (loss_mode == "hybrid" or loss_mode == "infonce"))
    assert not use_cuda or torch.cuda.is_available()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    run_dir = RUN_DIR / "{}_{}".format(run_name, int(time.time()))
    run_dir.mkdir(exist_ok=True, parents=True)
    config["run_dir"] = str(run_dir.resolve())
    logger.add(str((run_dir / "train.log").resolve()))
    logger.info(f"Saving logs, model checkpoints to {run_dir}")
    # Create training dataset and dataloader
    assert train_filepath.endswith(".pickle") or train_filepath.endswith(".gz")
    # Setup distributed
    gpu = hvd.local_rank()
    ngpus_per_node = 1
    chief_node = gpu == 0
    assert gpu is not None
    if chief_node:
        # Only the chief process reports to Weights & Biases.
        if config["loss_mode"] == "mlm":
            project = "bert-pretrain"
        elif config["loss_mode"] == "infonce":
            project = "moco-pretrain"
        elif config["loss_mode"] == "hybrid":
            project = "hybrid"
        wandb.init(name=config["run_name"],
                   config=config,
                   job_type="training",
                   project=project,
                   entity="ml4code")
    logger.info("Use GPU: {} for training".format(gpu))
    torch.cuda.set_device(gpu)
    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)
    kwargs = {"num_workers": 1, "pin_memory": True}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get("num_workers", 0) > 0
            and hasattr(mp, "_supports_context") and mp._supports_context
            and "forkserver" in mp.get_all_start_methods()):
        kwargs["multiprocessing_context"] = "forkserver"
    sp = spm.SentencePieceProcessor()
    sp.Load(config["spm_filepath"])
    pad_id = sp.PieceToId("[PAD]")
    logger.info("pad_id {}", pad_id)
    assert pad_id == 0  # hard coded in pad_collate
    mask_id = sp.PieceToId("[MASK]")
    # Create model
    if config["loss_mode"] == "infonce":
        # TODO(ajay): Support n_head argument, check how d_model is being used (why not in encoder config dict?)
        model = CodeMoCo(
            sp.GetPieceSize(),
            pad_id=pad_id,
            d_model=config["d_model"],
            encoder_config=dict(
                encoder_type=config["encoder_type"],
                lstm_project_mode=config["lstm_project_mode"],
                n_encoder_layers=config["n_encoder_layers"],
            ),
        )
        logger.info(
            f"Created CodeMoCo model with {count_parameters(model)} params")
    elif config["loss_mode"] == "mlm":
        model = CodeMLM(
            sp.GetPieceSize(),
            pad_id=pad_id,
            encoder_type=config["encoder_type"],
            n_encoder_layers=config["n_encoder_layers"],
            d_model=config["d_model"],
            n_head=config["n_head"],
            d_ff=4 * config["d_model"],
        )
        logger.info(
            f"Created CodeMLM model with {count_parameters(model)} params")
    elif config["loss_mode"] == "hybrid":
        model = CodeContrastiveMLM(
            sp.GetPieceSize(),
            pad_id=pad_id,
            n_encoder_layers=config["n_encoder_layers"],
            d_model=config["d_model"],
            n_head=config["n_head"],
            d_ff=4 * config["d_model"],
            use_horovod=True,
        )
        logger.info(
            f"Created CodeContrastiveMLM model with {count_parameters(model)} params"
        )
    else:
        raise ValueError(f"Bad loss mode {config['loss_mode']}")
    assert config["use_cuda"]
    model.cuda()
    # When using a single GPU per process and per
    # DistributedDataParallel, we need to divide the batch size
    # ourselves based on the total number of GPUs we have
    # config["batch_size"] = int(config["batch_size"] / ngpus_per_node)
    # config["num_workers"] = int((config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # define optimizer
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not config["use_adasum"] else 1
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if config["use_adasum"] and hvd.nccl_built():
        lr_scaler = hvd.local_size()
    # Horovod: scale learning rate by lr_scaler.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config["lr"] * lr_scaler,
                                 betas=config["adam_betas"],
                                 eps=1e-6,
                                 weight_decay=config["weight_decay"])
    sched = get_linear_schedule_with_warmup(optimizer,
                                            config["warmup_steps"],
                                            config["num_steps"])
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if config[
        "fp16_allreduce"] else hvd.Compression.none
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if config["use_adasum"] else hvd.Average,
        gradient_predivide_factor=config["gradient_predivide_factor"],
    )
    # Load checkpoint
    if config["resume_path"]:
        logger.info(f"Loading parameters from {config['resume_path']}")
        # configure map_location properly: remap rank-0 saved tensors onto
        # this rank's device
        map_location = {"cuda:%d" % 0: "cuda:%d" % hvd.rank()}
        checkpoint = torch.load(config["resume_path"],
                                map_location=map_location)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        start_epoch = checkpoint["epoch"] + 1
        start_global_step = checkpoint["global_step"]
    else:
        start_epoch = 1
        start_global_step = 0
    # Setup data
    train_dataset = PrecomputedDataset(
        config["train_filepath"],
        min_alternatives=config["min_alternatives"],
        program_mode=config["program_mode"],
        limit_size=config["limit_dataset_size"],
        sp=sp,
        subword_regularization_alpha=config["subword_regularization_alpha"],
        max_length=config["max_length"],
    )
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
        collate_fn=pad_collate_contrastive
        if config["program_mode"] == "contrastive" else pad_collate,
        drop_last=True,
        sampler=train_sampler,
        **kwargs,
    )
    # Train
    # Fast-forward the LR schedule to the resumed global step.
    global_step = 0
    while global_step < start_global_step:
        sched.step()
        global_step += 1
    for epoch in tqdm.trange(start_epoch,
                             config["num_epochs"] + 1,
                             desc="training",
                             unit="epoch",
                             leave=False):
        logger.info(f"Starting epoch {epoch}\n")
        train_sampler.set_epoch(epoch)
        model.train()
        pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}")
        for batch in pbar:
            optimizer.zero_grad()
            if config["loss_mode"] == "infonce":
                train_metrics = training_step(model,
                                              batch,
                                              use_cuda=config["use_cuda"])
            elif config["loss_mode"] == "mlm":
                # replace tokens randomly with tokens from _ (8)
                train_metrics = training_step_mlm(
                    sp,
                    model,
                    batch,
                    pad_id=pad_id,
                    mask_id=mask_id,
                    vocab_start_idx=8,
                    vocab_end_idx=7999,
                    use_cuda=config["use_cuda"])
            elif config["loss_mode"] == "hybrid":
                train_metrics = training_step_hybrid(
                    sp,
                    model,
                    batch,
                    mask_id=mask_id,
                    pad_id=pad_id,
                    vocab_start_idx=0,
                    vocab_end_idx=7999,
                    use_cuda=config["use_cuda"])
            else:
                raise ValueError("Bad loss type")
            loss = train_metrics["loss"]
            loss.backward()
            optimizer.step()
            sched.step()
            global_step += 1
            pbar.set_description(
                f"epoch {epoch} gpu {gpu} step {global_step} loss {loss.item():.4f}"
            )
            if chief_node:
                wandb.log(dict(lr=sched.get_last_lr()[0]))
                wandb.log(dict(epoch=epoch, **train_metrics["log"]),
                          step=global_step)
                # Save checkpoint
                if config["save_every"] and global_step % config[
                        "save_every"] == 0:
                    checkpoint = {
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "epoch": epoch,
                        "global_step": global_step,
                        "config": config,
                    }
                    model_file = os.path.join(
                        config["run_dir"],
                        f"ckpt_pretrain_ep{epoch:04d}_step{global_step:07d}.pth"
                    )
                    logger.info(f"Saving checkpoint to {model_file}...")
                    torch.save(checkpoint, model_file)
                    wandb.save(str(model_file))
                    logger.info("Done.")
import torch.multiprocessing as mp
from torch.autograd import Variable
import torch
import numpy as np
import time

# NOTE(review): this snippet looks truncated — processes are created with
# daemon=True but never started or joined; confirm against the full source.

# Show which multiprocessing start methods this platform supports.
print(mp.get_all_start_methods())


def worker(tensor):
    """Mutate the shared tensor in-place forever, once per second."""
    while True:
        tensor += 0.01
        time.sleep(1)


def main():
    # 'spawn' is required when sharing CUDA tensors across processes.
    mp.set_start_method('spawn')
    tt = np.ones((100, 100, 100))
    t = torch.from_numpy(tt)
    t = t.share_memory_()  # share 1
    # Fixed: `async` is a reserved keyword since Python 3.7 (SyntaxError);
    # PyTorch renamed the argument to `non_blocking` in torch 0.4.
    t = t.cuda(non_blocking=True)
    # t=t.share_memory_() #share 2. did not work.
    processes = []
    for i in range(10):
        p = mp.Process(target=worker, args=(t, ))
        p.daemon = True
def train(args):
    """Train a 3D UNet on synthetic Nifti data with Horovod data parallelism.

    Rank 0 of each node generates the synthetic dataset (if missing); every
    rank then trains on its shard for 5 epochs and rank 0 saves the weights
    to ``final_model.pth``.
    """
    # initialize Horovod library
    hvd.init()
    # Horovod limits CPU threads to be used per worker
    torch.set_num_threads(1)

    # disable logging for processes except 0 on every node
    if hvd.local_rank() != 0:
        f = open(os.devnull, "w")
        sys.stdout = sys.stderr = f
    elif not os.path.exists(args.dir):
        # create 40 random image, mask pairs on master node for training
        print(
            f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(40):
            im, seg = create_test_image_3d(128,
                                           128,
                                           128,
                                           num_seg_classes=1,
                                           channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    train_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    train_transforms = Compose([
        LoadImaged(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys="img"),
        RandCropByPosNegLabeld(keys=["img", "seg"],
                               label_key="seg",
                               spatial_size=[96, 96, 96],
                               pos=1,
                               neg=1,
                               num_samples=4),
        RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
        ToTensord(keys=["img", "seg"]),
    ])

    # create a training data loader
    train_ds = Dataset(data=train_files, transform=train_transforms)
    # create a training data sampler (each rank sees a disjoint shard)
    train_sampler = DistributedSampler(train_ds,
                                       num_replicas=hvd.size(),
                                       rank=hvd.rank())
    # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent
    # issues with Infiniband implementations that are not fork-safe
    multiprocessing_context = None
    if hasattr(
            mp, "_supports_context"
    ) and mp._supports_context and "forkserver" in mp.get_all_start_methods():
        multiprocessing_context = "forkserver"
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=train_sampler,
        multiprocessing_context=multiprocessing_context,
    )

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{hvd.local_rank()}")
    torch.cuda.set_device(device)
    model = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss_function = monai.losses.DiceLoss(sigmoid=True).to(device)
    optimizer = torch.optim.Adam(model.parameters(), 1e-3)
    # Horovod broadcasts parameters & optimizer state so all ranks start equal
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # Horovod wraps optimizer with DistributedOptimizer (allreduce on step)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    # start a typical PyTorch training
    epoch_loss_values = list()
    for epoch in range(5):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{5}")
        model.train()
        epoch_loss = 0
        step = 0
        # re-seed the sampler shuffle each epoch
        train_sampler.set_epoch(epoch)
        for batch_data in train_loader:
            step += 1
            inputs, labels = batch_data["img"].to(
                device), batch_data["seg"].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_len = len(train_ds) // train_loader.batch_size
            print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
        epoch_loss /= step
        epoch_loss_values.append(epoch_loss)
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
    print(f"train completed, epoch losses: {epoch_loss_values}")
    if hvd.rank() == 0:
        # all processes should see same parameters as they all start from same
        # random parameters and gradients are synchronized in backward passes,
        # therefore, saving it in one process is sufficient
        torch.save(model.state_dict(), "final_model.pth")
def fit(self, input_data=None, input_labels=None, loss="", opt=""):
    """Train the wrapped PyTorch model on the given tensors.

    With ``self.cuda`` set, training is distributed with Horovod (sampler
    sharding, parameter/optimizer broadcast, DistributedOptimizer); otherwise
    a plain single-process DataLoader is used.

    Args:
        input_data: input feature tensor; ``None`` aborts the call silently.
        input_labels: target tensor aligned with ``input_data``.
        loss: one of ``"nll_loss"``, ``"mse_loss"``, ``"cross_entropy"``,
            ``"l1_loss"``; any other value falls back to ``F.nll_loss``.
        opt: optimizer name; only ``"SGD"`` is currently implemented and it
            is also the fallback for unknown names.
    """
    if not self.use_model:  # use_model: nothing to train without a model
        return
    # Check input data.
    if input_data is None or input_labels is None:
        return
    if self.model_onnx:
        print("Cannot use onnx type to fit model")
        return

    # Make TensorDataset and DataLoader for PyTorch.
    train_dataset = TensorDataset(input_data, input_labels)

    # Resolve the loss function; unknown names fall back to NLL loss
    # (same behavior as the original if/elif chain).
    loss_func = {
        "nll_loss": F.nll_loss,
        "mse_loss": F.mse_loss,
        "cross_entropy": F.cross_entropy,
        "l1_loss": F.l1_loss,
    }.get(loss, F.nll_loss)

    if self.cuda:
        ##### HOROVOD #####
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
        kwargs = {'num_workers': 1, 'pin_memory': True}
        # When supported, use 'forkserver' to spawn dataloader workers instead
        # of 'fork' to prevent issues with Infiniband implementations that are
        # not fork-safe.
        if (kwargs.get('num_workers', 0) > 0
                and hasattr(mp, '_supports_context') and mp._supports_context
                and 'forkserver' in mp.get_all_start_methods()):
            kwargs['multiprocessing_context'] = 'forkserver'
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.batch_size, sampler=train_sampler,
            **kwargs)
        # Set optimizer.
        if self.use_optimizer:
            optimizer = self.optimizer
        else:
            # Fixed: `lr_scaler` was only assigned on the Adasum+NCCL path and
            # was then read via the misspelled name `lr_scalar`, so this branch
            # always raised NameError. Default scaling matches the other
            # Horovod setups in this file: scale by world size unless Adasum.
            lr_scaler = hvd.size() if not args.use_adasum else 1
            if args.use_adasum and hvd.nccl_built():
                lr_scaler = hvd.local_size()
            # NOTE(review): the original "SGD" and fallback branches built
            # identical optimizers, so they are collapsed; only SGD supported.
            optimizer = optim.SGD(self.model.parameters(),
                                  lr=self.lr * lr_scaler,
                                  momentum=self.momentum)
        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        # Horovod: (optional) compression algorithm.
        #compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
        compression = hvd.Compression.none
        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=self.model.named_parameters(),
            compression=compression,
            op=hvd.Average)
            #op=hvd.Adasum if args.use_adasum else hvd.Average)
    else:
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size)
        if self.use_optimizer:
            optimizer = self.optimizer
        else:
            # Fixed: the original compared the `optim` MODULE against "SGD"
            # (always False) — the parameter `opt` was intended. Both branches
            # were identical anyway, so they are collapsed here.
            optimizer = optim.SGD(self.model.parameters(),
                                  lr=self.lr,
                                  momentum=self.momentum)

    if self.debug:
        # Print model's state_dict.
        print("Model's state_dict:")
        for param_tensor in self.model.state_dict():
            print(param_tensor, "\t",
                  self.model.state_dict()[param_tensor].size())
        # Print optimizer's state_dict.
        print("Optimizer's state_dict:")
        for var_name in optimizer.state_dict():
            print(var_name, "\t", optimizer.state_dict()[var_name])

    losses = []
    nums = []
    accs = []
    for epoch in range(self.epochs):
        self.model.train()
        # Horovod: set epoch to sampler for shuffling.
        if self.cuda:
            train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if self.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = self.model(data)
            loss = loss_func(output, target)
            acc = self.accuracy(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % self.log_interval == 0:
                if self.cuda:
                    if hvd.rank() == 0:
                        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}'.format(
                            epoch+1, batch_idx * len(data), len(train_sampler),
                            100. * batch_idx / len(train_loader), loss.item(), acc*100))
                else:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}'.format(
                        epoch+1, batch_idx * len(data), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), loss.item(), acc*100))
def _setup(self):
    """
    Setup the inner attributes of the interface, initializing horovod and the callbacks handler. This method must
    be called before train and evaluate.

    Side effects: may move the model between CPU/GPU, replace the data
    loaders with sharded versions, and wrap the optimizer with Horovod's
    DistributedOptimizer.
    """
    # Setup horovod:
    if self._use_horovod:
        # Import horovod lazily so it is only required in distributed runs:
        self._hvd = importlib.import_module("horovod.torch")
        # Initialize horovod:
        self._hvd.init()
        # Limit the number of CPU threads to be used per worker:
        torch.set_num_threads(1)
    # Setup additional multiprocessing related key word arguments for the data loaders initialization:
    mp_data_loader_kwargs = {}
    # Setup cuda:
    if self._use_cuda and torch.cuda.is_available():
        if self._use_horovod:
            # Pin this worker to its local-rank GPU:
            torch.cuda.set_device(self._hvd.local_rank())
            mp_data_loader_kwargs["num_workers"] = 1
            mp_data_loader_kwargs["pin_memory"] = True
        # Move the model and the stored objects to the GPU:
        self._objects_to_cuda()
    elif self._model_in_cuda:
        # Move the model back to the CPU:
        self._model = self._model.cpu()
        self._model_in_cuda = False
    # Initialize a callbacks handler:
    if self._use_horovod:
        # Keep only the callbacks that opt in for this horovod rank:
        self._callbacks_handler = CallbacksHandler(callbacks=[
            callback for callback in self._callbacks
            if callback.on_horovod_check(rank=self._hvd.rank())
        ])
    else:
        self._callbacks_handler = CallbacksHandler(
            callbacks=self._callbacks)
    # Prepare horovod for the run if needed:
    if self._use_horovod:
        # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent issues with
        # Infiniband implementations that are not fork-safe
        if (mp_data_loader_kwargs.get("num_workers", 0) > 0
                and hasattr(mp, "_supports_context") and mp._supports_context
                and "forkserver" in mp.get_all_start_methods()):
            mp_data_loader_kwargs["multiprocessing_context"] = "forkserver"
        # Partition dataset among workers using distributed samplers:
        if self._training_set is not None:
            self._training_sampler = DistributedSampler(
                self._training_set.dataset,
                num_replicas=self._hvd.size(),
                rank=self._hvd.rank(),
            )
            self._training_set = self._insert_sampler_to_data_loader(
                data_loader=self._training_set,
                sampler=self._training_sampler,
                multiprocessing_kwargs=mp_data_loader_kwargs,
            )
        if self._validation_set is not None:
            self._validation_sampler = DistributedSampler(
                self._validation_set.dataset,
                num_replicas=self._hvd.size(),
                rank=self._hvd.rank(),
            )
            self._validation_set = self._insert_sampler_to_data_loader(
                data_loader=self._validation_set,
                sampler=self._validation_sampler,
                multiprocessing_kwargs=mp_data_loader_kwargs,
            )
        # Broadcast parameters and optimizer state:
        self._hvd.broadcast_parameters(self._model.state_dict(),
                                       root_rank=0)
        if self._optimizer is not None:
            self._hvd.broadcast_optimizer_state(self._optimizer,
                                                root_rank=0)
            # Add Horovod Distributed Optimizer:
            self._optimizer = self._hvd.DistributedOptimizer(
                self._optimizer,
                named_parameters=self._model.named_parameters())
    # Setup the callbacks functions:
    self._callbacks_handler.on_setup(
        model=self._model,
        training_set=self._training_set,
        validation_set=self._validation_set,
        loss_function=self._loss_function,
        optimizer=self._optimizer,
        metric_functions=self._metric_functions,
        scheduler=self._scheduler,
    )
def main():
    """Horovod training driver: parse configs, build data/model/optimizer
    (optionally with DGC gradient compression), resume from the latest
    checkpoint if present, then run the train/evaluate/checkpoint loop.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--configs', nargs='+')
    parser.add_argument('--devices', default='gpu')
    parser.add_argument('--evaluate', action='store_true')
    parser.add_argument('--suffix', default='')
    args, opts = parser.parse_known_args()

    ##################
    # Update configs #
    ##################
    printr(f'==> loading configs from {args.configs}')
    Config.update_from_modules(*args.configs)
    # Remaining CLI tokens override config values:
    Config.update_from_arguments(*opts)

    if args.devices is not None and args.devices != 'cpu':
        configs.device = 'cuda'
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        cudnn.benchmark = True
    else:
        configs.device = 'cpu'

    if 'seed' in configs and configs.seed is not None:
        random.seed(configs.seed)
        np.random.seed(configs.seed)
        torch.manual_seed(configs.seed)
        if configs.device == 'cuda' and configs.get('deterministic', True):
            # Deterministic cuDNN trades speed for reproducibility:
            cudnn.deterministic = True
            cudnn.benchmark = False

    configs.train.num_batches_per_step = \
        configs.train.get('num_batches_per_step', 1)

    configs.train.save_path = get_save_path(*args.configs) \
        + f'{args.suffix}.np{hvd.size()}'
    printr(f'[train.save_path] = {configs.train.save_path}')
    # Per-rank checkpoint paths; the epoch slot is filled at save time:
    checkpoint_path = os.path.join(configs.train.save_path, 'checkpoints')
    configs.train.checkpoint_path = os.path.join(
        checkpoint_path, f'e{"{epoch}"}-r{hvd.rank()}.pth')
    configs.train.latest_pth_path = os.path.join(checkpoint_path,
                                                 f'latest-r{hvd.rank()}.pth')
    configs.train.best_pth_path = os.path.join(checkpoint_path,
                                               f'best-r{hvd.rank()}.pth')
    os.makedirs(checkpoint_path, exist_ok=True)
    if args.evaluate:
        # Evaluation loads the best checkpoint instead of the latest:
        configs.train.latest_pth_path = configs.train.best_pth_path

    printr(configs)

    #####################################################################
    # Initialize DataLoaders, Model, Criterion, LRScheduler & Optimizer #
    #####################################################################
    printr(f'\n==> creating dataset "{configs.dataset}"')
    dataset = configs.dataset()
    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(configs.data.num_threads_per_worker)
    loader_kwargs = {
        'num_workers': configs.data.num_threads_per_worker,
        'pin_memory': True
    } if configs.device == 'cuda' else {}
    # When supported, use 'forkserver' to spawn dataloader workers
    # instead of 'fork' to prevent issues with Infiniband implementations
    # that are not fork-safe
    if (loader_kwargs.get('num_workers', 0) > 0
            and hasattr(mp, '_supports_context') and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        loader_kwargs['multiprocessing_context'] = 'forkserver'
    printr(f'\n==> loading dataset "{loader_kwargs}""')
    samplers, loaders = {}, {}
    for split in dataset:
        # Horovod: use DistributedSampler to partition data among workers.
        # Manually specify `num_replicas=hvd.size()` and `rank=hvd.rank()`.
        samplers[split] = torch.utils.data.distributed.DistributedSampler(
            dataset[split], num_replicas=hvd.size(), rank=hvd.rank())
        loaders[split] = torch.utils.data.DataLoader(
            dataset[split],
            batch_size=configs.train.batch_size *
            (configs.train.num_batches_per_step if split == 'train' else 1),
            sampler=samplers[split],
            drop_last=(configs.train.num_batches_per_step > 1
                       and split == 'train'),
            **loader_kwargs)

    printr(f'\n==> creating model "{configs.model}"')
    model = configs.model()
    model = model.cuda()

    criterion = configs.train.criterion().to(configs.device)
    # Horovod: scale learning rate by the number of GPUs.
    configs.train.base_lr = configs.train.optimizer.lr
    configs.train.optimizer.lr *= (configs.train.num_batches_per_step *
                                   hvd.size())

    printr(f'\n==> creating optimizer "{configs.train.optimizer}"')
    if configs.train.optimize_bn_separately:
        # BatchNorm parameters get no weight decay:
        optimizer = configs.train.optimizer([
            dict(params=get_common_parameters(model)),
            dict(params=get_bn_parameters(model), weight_decay=0)
        ])
    else:
        optimizer = configs.train.optimizer(model.parameters())

    # Horovod: (optional) compression algorithm.
    printr(f'\n==> creating compression "{configs.train.compression}"')
    if configs.train.dgc:
        printr(f'\n==> initializing dgc compression')
        configs.train.compression.memory = configs.train.compression.memory()
        compression = configs.train.compression()
        compression.memory.initialize(model.named_parameters())
        # Only compress multi-dimensional (weight) tensors, not biases/BN:
        cpr_parameters = {}
        for name, param in model.named_parameters():
            if param.dim() > 1:
                cpr_parameters[name] = param
        compression.initialize(cpr_parameters.items())
    else:
        compression = configs.train.compression()

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        backward_passes_per_step=configs.train.num_batches_per_step,
        op=hvd.Average)

    # resume from checkpoint
    last_epoch, best_metric = -1, None
    if os.path.exists(configs.train.latest_pth_path):
        printr(f'\n[resume_path] = {configs.train.latest_pth_path}')
        checkpoint = torch.load(configs.train.latest_pth_path)
        if 'model' in checkpoint:
            model.load_state_dict(checkpoint.pop('model'))
        if 'optimizer' in checkpoint:
            optimizer.load_state_dict(checkpoint.pop('optimizer'))
        if configs.train.dgc and 'compression' in checkpoint:
            compression.memory.load_state_dict(checkpoint.pop('compression'))
        last_epoch = checkpoint.get('epoch', last_epoch)
        best_metric = checkpoint.get('meters', {}).get(
            f'{configs.train.metric}_best', best_metric)
        # Horovod: broadcast parameters.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    else:
        printr('\n==> train from scratch')

    # Horovod: broadcast parameters & optimizer state.
    printr('\n==> broadcasting paramters and optimizer state')
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    num_steps_per_epoch = len(loaders['train'])
    if 'scheduler' in configs.train and configs.train.scheduler is not None:
        # Rewind the scheduler to match the resumed epoch/step, accounting
        # for the warmup epochs that the scheduler should not see:
        if configs.train.schedule_lr_per_epoch:
            last = max(last_epoch - configs.train.warmup_lr_epochs - 1, -1)
        else:
            last = max((last_epoch - configs.train.warmup_lr_epochs + 1) *
                       num_steps_per_epoch - 2, -1)
        scheduler = configs.train.scheduler(optimizer, last_epoch=last)
    else:
        scheduler = None

    ############
    # Training #
    ############
    # Baseline evaluation before any (further) training:
    meters = evaluate(model,
                      device=configs.device,
                      meters=configs.train.meters,
                      loader=loaders['test'],
                      split='test')
    for k, meter in meters.items():
        printr(f'[{k}] = {meter:2f}')
    if args.evaluate or last_epoch >= configs.train.num_epochs:
        return

    if hvd.rank() == 0:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(configs.train.save_path)
    else:
        writer = None

    for current_epoch in range(last_epoch + 1, configs.train.num_epochs):
        printr(f'\n==> training epoch {current_epoch}'
               f'/{configs.train.num_epochs}')

        if configs.train.dgc:
            # DGC ramps the compression ratio over the warmup epochs:
            compression.warmup_compress_ratio(current_epoch)

        train(model=model,
              loader=loaders['train'],
              device=configs.device,
              epoch=current_epoch,
              sampler=samplers['train'],
              criterion=criterion,
              optimizer=optimizer,
              scheduler=scheduler,
              batch_size=configs.train.batch_size,
              num_batches_per_step=configs.train.num_batches_per_step,
              num_steps_per_epoch=num_steps_per_epoch,
              warmup_lr_epochs=configs.train.warmup_lr_epochs,
              schedule_lr_per_epoch=configs.train.schedule_lr_per_epoch,
              writer=writer,
              quiet=hvd.rank() != 0)

        meters = dict()
        for split, loader in loaders.items():
            if split != 'train':
                meters.update(
                    evaluate(model,
                             loader=loader,
                             device=configs.device,
                             meters=configs.train.meters,
                             split=split,
                             quiet=hvd.rank() != 0))

        best = False
        if 'metric' in configs.train and configs.train.metric is not None:
            if best_metric is None or best_metric < meters[
                    configs.train.metric]:
                best_metric, best = meters[configs.train.metric], True
            meters[configs.train.metric + '_best'] = best_metric

        if writer is not None:
            # Use the cumulative number of seen inputs as the x-axis:
            num_inputs = ((current_epoch + 1) * num_steps_per_epoch *
                          configs.train.num_batches_per_step *
                          configs.train.batch_size * hvd.size())
            print('')
            for k, meter in meters.items():
                print(f'[{k}] = {meter:2f}')
                writer.add_scalar(k, meter, num_inputs)

        checkpoint = {
            'epoch': current_epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'meters': meters,
            'compression': compression.memory.state_dict() \
                if configs.train.dgc else None
        }

        # save checkpoint (and keep a rolling window of the last 3 epochs)
        checkpoint_path = \
            configs.train.checkpoint_path.format(epoch=current_epoch)
        torch.save(checkpoint, checkpoint_path)
        shutil.copyfile(checkpoint_path, configs.train.latest_pth_path)
        if best:
            shutil.copyfile(checkpoint_path, configs.train.best_pth_path)
        if current_epoch >= 3:
            os.remove(
                configs.train.checkpoint_path.format(epoch=current_epoch - 3))
        printr(f'[save_path] = {checkpoint_path}')
def evaluate(args):
    """Evaluate a trained 3D UNet on synthetic Nifti data with Horovod.

    Rank 0 loads ``final_model.pth`` and broadcasts the weights; every rank
    runs sliding-window inference on its shard and the Dice sums/counts are
    allreduced across ranks.
    """
    # initialize Horovod library
    hvd.init()
    # Horovod limits CPU threads to be used per worker
    torch.set_num_threads(1)

    if hvd.local_rank() == 0 and not os.path.exists(args.dir):
        # create 16 random image, mask pairs for evaluation
        print(
            f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(16):
            im, seg = create_test_image_3d(128,
                                           128,
                                           128,
                                           num_seg_classes=1,
                                           channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    val_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys=["img", "seg"]),
        ToTensord(keys=["img", "seg"]),
    ])
    # create a evaluation data loader
    val_ds = Dataset(data=val_files, transform=val_transforms)
    # create a evaluation data sampler (each rank sees a disjoint shard)
    val_sampler = DistributedSampler(val_ds,
                                     num_replicas=hvd.size(),
                                     rank=hvd.rank())
    # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent
    # issues with Infiniband implementations that are not fork-safe
    multiprocessing_context = None
    if hasattr(
            mp, "_supports_context"
    ) and mp._supports_context and "forkserver" in mp.get_all_start_methods():
        multiprocessing_context = "forkserver"
    # sliding window inference need to input 1 image in every iteration
    val_loader = DataLoader(
        val_ds,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        sampler=val_sampler,
        multiprocessing_context=multiprocessing_context,
    )
    dice_metric = DiceMetric(include_background=True,
                             to_onehot_y=False,
                             sigmoid=True,
                             reduction="mean")

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{hvd.local_rank()}")
    model = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    if hvd.rank() == 0:
        # load model parameters for evaluation
        model.load_state_dict(torch.load("final_model.pth"))
    # Horovod broadcasts parameters (so every rank evaluates rank 0's weights)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    model.eval()
    with torch.no_grad():
        # define PyTorch Tensor to record metrics result at each GPU
        # the first value is `sum` of all dice metric, the second value is `count` of not_nan items
        metric = torch.zeros(2, dtype=torch.float, device=device)
        for val_data in val_loader:
            val_images, val_labels = val_data["img"].to(
                device), val_data["seg"].to(device)
            # define sliding window size and batch size for windows inference
            roi_size = (96, 96, 96)
            sw_batch_size = 4
            val_outputs = sliding_window_inference(val_images, roi_size,
                                                   sw_batch_size, model)
            value = dice_metric(y_pred=val_outputs, y=val_labels).squeeze()
            metric[0] += value * dice_metric.not_nans
            metric[1] += dice_metric.not_nans
        # synchronizes all processes and reduce results
        print(
            f"metric in rank {hvd.rank()}: sum={metric[0].item()}, count={metric[1].item()}"
        )
        avg_metric = hvd.allreduce(metric, name="mean_dice")
        if hvd.rank() == 0:
            print(
                f"average metric: sum={avg_metric[0].item()}, count={avg_metric[1].item()}"
            )
            print("evaluation metric:",
                  (avg_metric[0] / avg_metric[1]).item())
def main():
    """Train ResNet-50 with the "Light" collective-communication library
    (Horovod-style API), on ImageNet if ``args.train_dir`` is given,
    otherwise on a synthetic batch.
    """
    # base information.
    log('Model: %s' % 'ResNet50')
    log('Batch size: %d' % args.batch_size)
    device = 'GPU' if args.cuda else 'CPU'
    log('Number of %ss: %d' % (device, light.cc.size()))

    torch.manual_seed(args.seed)
    if args.cuda:
        # light: pin GPU to local rank.
        torch.cuda.set_device(light.cc.local_rank())
        torch.cuda.manual_seed(args.seed)
        cudnn.benchmark = True

    #light: By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = args.batches_per_allreduce * light.cc.size() \
        if not args.use_adasum else 1

    # Set up standard ResNet-50 model. & Move model to GPU.
    model = models.resnet50()
    if args.cuda:
        model.cuda()
        if args.use_adasum and light.cc.nccl_built():
            # Change point 8: with GPU Adasum, scale LR by the number of
            # Light processes on this node instead of the world size.
            lr_scaler = args.batches_per_allreduce * light.cc.local_size()

    optimizer = optim.SGD(model.parameters(),
                          lr=args.base_lr * lr_scaler,
                          momentum=args.momentum,
                          weight_decay=args.wd)

    # Change point 9: select Light's fp16 gradient compression (or none).
    compression = light.cc.Compression.fp16 if args.fp16_allreduce \
        else light.cc.Compression.none
    # Change point 10: wrap the optimizer with Light's distributed
    # optimizer so it gains allreduce capability.
    optimizer = light.cc.get_distributed_optimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=light.cc.Adasum if args.use_adasum else light.cc.Average)
    # Change point 11: broadcast initial model/optimizer state from rank 0
    # via Light's broadcast-variable method.
    light.cc.broadcast_variable(model.state_dict(), root_rank=0)
    light.cc.broadcast_variable(optimizer, root_rank=0)

    if args.train_dir is not None:
        log('Use imagenet dataset.')
        # limit # of CPU threads to be used per worker.
        torch.set_num_threads(4)
        kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
        # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
        # issues with Infiniband implementations that are not fork-safe
        if (kwargs.get('num_workers', 0) > 0
                and hasattr(mp, '_supports_context') and mp._supports_context
                and 'forkserver' in mp.get_all_start_methods()):
            kwargs['multiprocessing_context'] = 'forkserver'
        train_dataset = datasets.ImageFolder(
            args.train_dir,
            transform=transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ]))
        # Change point 12: partition training data across nodes with a
        # distributed sampler; `num_replicas=light.cc.size()` and
        # `rank=light.cc.rank()` must be specified manually.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=light.cc.size(), rank=light.cc.rank())
        train_loader = light.io.get_data_loader(
            train_dataset,
            batch_size=args.batch_size * args.batches_per_allreduce,
            sampler=train_sampler,
            **kwargs)
        train_imagenet(model, optimizer, train_loader, train_sampler)
    else:
        log('Use synthetic dataset.')
        data = torch.randn(args.batch_size, 3, 224, 224)
        target = torch.LongTensor(args.batch_size).random_() % 1000
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        train_synthetic_(model, optimizer, data, target)
def main(args):
    """Run SVI on a synthetic regression model, optionally distributed
    with Horovod (sharded sampler, broadcast parameters, wrapped optimizer,
    allreduced loss); rank 0 optionally saves model+guide to ``args.outfile``.
    """
    # Create a model, synthetic data, and a guide.
    pyro.set_rng_seed(args.seed)
    model = Model(args.size)
    covariates = torch.randn(args.size)
    data = model(covariates)
    guide = AutoNormal(model)

    if args.horovod:
        # Initialize Horovod and set PyTorch globals.
        import horovod.torch as hvd
        hvd.init()
        torch.set_num_threads(1)
        if args.cuda:
            torch.cuda.set_device(hvd.local_rank())
    if args.cuda:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    # Capture the current default device (CPU or the pinned GPU):
    device = torch.tensor(0).device

    if args.horovod:
        # Initialize parameters and broadcast to all workers.
        guide(covariates[:1], data[:1])  # Initializes model and guide.
        hvd.broadcast_parameters(guide.state_dict(), root_rank=0)
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # Create an ELBO loss and a Pyro optimizer.
    elbo = Trace_ELBO()
    optim = Adam({"lr": args.learning_rate})

    if args.horovod:
        # Wrap the basic optimizer in a distributed optimizer.
        optim = HorovodOptimizer(optim)

    # Create a dataloader.
    dataset = torch.utils.data.TensorDataset(covariates, data)
    if args.horovod:
        # Horovod requires a distributed sampler.
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset, hvd.size(), hvd.rank())
    else:
        sampler = torch.utils.data.RandomSampler(dataset)
    config = {"batch_size": args.batch_size, "sampler": sampler}
    if args.cuda:
        config["num_workers"] = 1
        config["pin_memory"] = True
        # Try to use forkserver to spawn workers instead of fork.
        if (hasattr(mp, "_supports_context") and mp._supports_context
                and "forkserver" in mp.get_all_start_methods()):
            config["multiprocessing_context"] = "forkserver"
    dataloader = torch.utils.data.DataLoader(dataset, **config)

    # Run stochastic variational inference.
    svi = SVI(model, guide, optim, elbo)
    for epoch in range(args.num_epochs):
        if args.horovod:
            # Set rng seeds on distributed samplers. This is required.
            sampler.set_epoch(epoch)

        for step, (covariates_batch, data_batch) in enumerate(dataloader):
            loss = svi.step(covariates_batch.to(device),
                            data_batch.to(device))
            if args.horovod:
                # Optionally average loss metric across workers.
                # You can do this with arbitrary torch.Tensors.
                loss = torch.tensor(loss)
                loss = hvd.allreduce(loss, "loss")
                loss = loss.item()
                # Print only on the rank=0 worker.
                if step % 100 == 0 and hvd.rank() == 0:
                    print("epoch {} step {} loss = {:0.4g}".format(
                        epoch, step, loss))
            else:
                if step % 100 == 0:
                    print("epoch {} step {} loss = {:0.4g}".format(
                        epoch, step, loss))

    if args.horovod:
        # After we're done with the distributed parts of the program,
        # we can shutdown all but the rank=0 worker.
        hvd.shutdown()
        if hvd.rank() != 0:
            return

    if args.outfile:
        print("saving to {}".format(args.outfile))
        torch.save({"model": model, "guide": guide}, args.outfile)
def main():
    """Train and evaluate the MNIST ``Net`` with Horovod data parallelism.

    Parses hyperparameters from the module-level ``parser``, pins each worker
    to its local GPU, partitions train/test data across ranks, and logs test
    metrics to TensorBoard on rank 0.

    NOTE(review): assumes CUDA is available (unconditional ``.cuda()`` calls)
    and that ``parser``, ``train``, ``test`` and ``Net`` are defined elsewhere
    in this module -- confirm against the rest of the file.
    """
    args = parser.parse_args()

    # Horovod: initialize library; seed all workers identically.
    seed = 42
    hvd.init()
    torch.manual_seed(seed)

    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True}
    # When supported, use 'forkserver' to spawn dataloader workers instead of
    # 'fork' to prevent issues with Infiniband implementations that are not
    # fork-safe.  Guarded on num_workers > 0 for consistency with the other
    # entry points in this file (DataLoader rejects a context otherwise).
    if (kwargs.get('num_workers', 0) > 0
            and hasattr(mp, '_supports_context') and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    data_dir = args.data_dir
    # FileLock so concurrent workers on one node don't race on the download.
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = \
        datasets.MNIST(data_dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()
    loss_function = nn.CrossEntropyLoss()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if args.use_adasum and hvd.nccl_built():
        lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.Adam(model.parameters(), lr=args.base_lr * lr_scaler)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # Profile training with TensorBoard.
    logs = "logs/pytorch-" + datetime.now().strftime("%Y%m%d-%H%M%S")
    writer = SummaryWriter(log_dir=logs)
    for epoch in range(1, args.epochs + 1):
        train(epoch, model, train_sampler, train_loader, optimizer,
              loss_function, args)
        test_loss, test_accuracy = test(model, test_loader, test_sampler)
        if hvd.rank() == 0:
            # Fix: pass global_step so each epoch gets its own point on the
            # TensorBoard x-axis instead of all epochs overwriting step 0.
            writer.add_scalars("Test", {
                "loss": test_loss,
                "acc.": test_accuracy
            }, global_step=epoch)
    writer.close()
def main():
    """Train an MNIST model with DistributedDataParallel and save it on rank 0.

    Parses CLI args from the module-level ``parser``, builds the MNIST
    dataloader with a ``DistributedSampler``, wraps the model in DDP, runs
    ``args.epochs`` epochs, and checkpoints the state dict from rank 0.

    NOTE(review): ``dist.init_process_group`` is commented out below, yet
    ``dist.get_rank()`` / ``dist.get_world_size()`` are called -- this
    function appears to rely on the process group being initialized
    elsewhere (e.g. by a launcher). Confirm before reuse.
    """
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.set_device(args.local_rank)
        torch.cuda.manual_seed(args.seed)
        # dist.init_process_group('nccl')
    # else:
    #     dist.init_process_group('mpi')
    cudnn.benchmark = True
    torch.set_num_threads(4)
    # Only rank 0 creates the checkpoint directory (shared filesystem assumed).
    if dist.get_rank() == 0 and not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    # dataset
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    # Use 'forkserver' instead of 'fork' for dataloader workers where
    # available; fork is unsafe with some Infiniband stacks. Guarded on
    # num_workers > 0 because DataLoader rejects a context otherwise.
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=[0.5], std=[0.5])])
    dataset = datasets.MNIST(root=args.train_dir,
                             transform=transform,
                             train=True,
                             download=True)
    # Partition the data across ranks so each worker sees a disjoint shard.
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         sampler=sampler,
                                         **kwargs)

    model = Model()
    optimizer = torch.optim.Adam(model.parameters())
    if args.cuda:
        model.cuda()
    if args.cuda:
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
    else:
        model = DDP(model)

    # train
    for epoch in range(args.epochs):
        model.train()
        # Reshuffle the sampler's partition each epoch (required for DDP).
        sampler.set_epoch(epoch)
        loss = 0.0
        acc = 0.0
        for ind, data in enumerate(loader):
            x, y = data
            x, y = Variable(x), Variable(y)  # Variable is a no-op wrapper on modern torch
            if args.cuda:
                x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            output = model(x)
            acc = accuracy(output, y)
            loss = F.cross_entropy(output, y)
            # NOTE(review): `data` is the (x, y) pair, so len(data) == 2 and
            # this divisor is ceil(2 / batch_size) -- likely intended to be
            # the batch size or dataset length. Confirm before changing.
            loss.div_(math.ceil(float(len(data)) / args.batch_size))
            loss.backward()
            optimizer.step()
            print(
                f"Train Epoch: {epoch}, batch: {ind}, loss: {loss}, accuracy: {100. * acc}"
            )
    if dist.get_rank() == 0:
        torch.save(model.state_dict(),
                   os.path.join(args.checkpoint_dir, "model_parameter.pkl"))
def main(args):
    """Train the MNIST ``Net`` with Horovod, optionally in mixed precision.

    Sets up Horovod, partitioned dataloaders, and a distributed SGD
    optimizer, then runs ``args.epochs`` epochs using either the AMP
    training loop (``train_mixed_precision``) or the plain one
    (``train_epoch``), evaluating after every epoch.

    NOTE(review): assumes ``Net``, ``parser``-derived ``args`` fields,
    ``FileLock``, ``datasets``/``transforms`` and ``hvd`` are provided by
    the enclosing module.
    """

    def train_mixed_precision(epoch, scaler):
        # One epoch of AMP training.  The synchronize/unscale_/
        # skip_synchronize sequence is the Horovod-documented way to combine
        # GradScaler with DistributedOptimizer: finish allreduces, unscale,
        # then step without a second (implicit) synchronize.
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = F.nll_loss(output, target)
            scaler.scale(loss).backward()
            # Make sure all async allreduces are done
            optimizer.synchronize()
            # In-place unscaling of all gradients before weights update
            scaler.unscale_(optimizer)
            with optimizer.skip_synchronize():
                scaler.step(optimizer)
            # Update scaler in case of overflow/underflow
            scaler.update()
            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print(
                    'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLoss Scale: {}'
                    .format(epoch, batch_idx * len(data), len(train_sampler),
                            100. * batch_idx / len(train_loader), loss.item(),
                            scaler.get_scale()))

    def train_epoch(epoch):
        # One epoch of full-precision training.
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))

    def metric_average(val, name):
        # Allreduce-average a scalar metric across all Horovod workers.
        tensor = torch.tensor(val)
        avg_tensor = hvd.allreduce(tensor, name=name)
        return avg_tensor.item()

    def test():
        # Evaluate on this worker's test shard, then average across workers.
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            # NOTE(review): size_average=False is deprecated in favor of
            # reduction='sum' on modern torch; behavior is equivalent here.
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    else:
        if args.use_mixed_precision:
            raise ValueError(
                "Mixed precision is only supported with cuda enabled.")

    if (args.use_mixed_precision
            and LooseVersion(torch.__version__) < LooseVersion('1.6.0')):
        raise ValueError("""Mixed precision is using torch.cuda.amp.autocast(),
                            which requires torch >= 1.6.0""")

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    data_dir = args.data_dir or './data'
    # FileLock: only one process per node downloads MNIST.
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = \
        datasets.MNIST(data_dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    if args.use_mixed_precision:
        # Initialize scaler in global scale
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(1, args.epochs + 1):
        if args.use_mixed_precision:
            train_mixed_precision(epoch, scaler)
        else:
            train_epoch(epoch)
        # Keep test in full precision since computation is relatively light.
        test()
    # NOTE(review): this fragment's opening `try:` (and its enclosing scope)
    # lies before the visible portion of the file -- the dangling
    # `except ImportError:` below belongs to it. Presumably the try guards
    # the tensorboard imports; confirm against the full file.
    # Prefer the built-in SummaryWriter on torch >= 1.2, else tensorboardX.
    if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
        from torch.utils.tensorboard import SummaryWriter
    else:
        from tensorboardX import SummaryWriter
    # Only rank 0 writes TensorBoard logs.
    log_writer = SummaryWriter(args.log_dir) if hvd.rank() == 0 else None
except ImportError:
    log_writer = None

# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(4)

kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
# When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
# issues with Infiniband implementations that are not fork-safe
if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
        and mp._supports_context
        and 'forkserver' in mp.get_all_start_methods()):
    kwargs['multiprocessing_context'] = 'forkserver'

# ImageNet-style training pipeline: random crop/flip + standard ImageNet
# normalization constants.
train_dataset = \
    datasets.ImageFolder(args.train_dir,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
# Horovod: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=hvd.size()` and `rank=hvd.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
def run():
    """Train the flood-filling-network (FFN) model with Horovod.

    Builds the FFN model and a distributed SGD optimizer, optionally resumes
    from a checkpoint, then runs an iteration-based training loop over
    batches from ``get_batch``, tracking precision/recall/accuracy and
    periodically checkpointing and logging to TensorBoard on rank 0.

    NOTE(review): relies on module-level names (``args``, ``FFN_no_norm``,
    ``BatchCreator``, ``get_batch``, ``fixed_offsets``, ``load_obj``,
    ``pickle_obj``, ``logit``, ``hvd``) defined elsewhere in the file.
    """
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    """model_init"""
    model = FFN_no_norm(in_channels=4, out_channels=1,
                        input_size=args.input_size, delta=args.delta,
                        depth=args.depth)

    #hvd ddl
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1
    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters(),
                                         compression=compression,
                                         op=hvd.Adasum if args.use_adasum else hvd.Average)

    """resume"""
    # NOTE(review): `resume` is only bound inside this branch, but the
    # training loop below writes resume['resume_step'] unconditionally --
    # running without --resume would raise NameError at cnt % 1000 == 0.
    # Confirm whether a fresh run is expected to pass --resume.
    if args.resume is not None:
        model.load_state_dict(torch.load(args.resume))
        if os.path.exists(args.save_path + 'resume_step.pkl'):
            resume = load_obj(args.save_path + 'resume_step.pkl')
        else:
            resume = {'resume_step': args.resume_step}
        args.resume_step = resume['resume_step']
        print('resume_step', args.resume_step)

    if args.tb == None:
        tb = SummaryWriter('./tensorboard/'+args.tag+'tb_train_log_fov:{}_delta:{}_depth:{}.pth'
                           .format(list(args.input_size)[0], list(args.delta)[0], args.depth))
    else:
        tb = SummaryWriter(args.tb)

    """data_load"""
    train_dataset= BatchCreator(args.train_data_dir, args.input_size,
                                delta=args.delta,train=True)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, sampler=train_sampler, **kwargs)

    # Infinite iterator yielding (seeds, images, labels, offsets) batches.
    batch_it = get_batch(train_loader, args.batch_size, args.input_size,
                         partial(fixed_offsets, fov_moves=train_dataset.shifts))

    """
    for index in range(files_total):
        input_h5data_dict = [(abs_path_training_data + sorted_files_train_data)]
        print(input_h5data_dict)
        train_dataset_dict = BatchCreator(input_h5data_dict, args.input_size, delta=args.delta, train=True)
        train_sampler_dict = torch.utils.data.distributed.DistributedSampler(train_dataset_dict, num_replicas=world_size,
                                                                             rank=rank, shuffle=True)
        train_loader_dict = DataLoader(train_dataset_dict, num_workers=0, sampler=train_sampler_dict , pin_memory=True)
        batch_it_dict = get_batch(train_loader_dict, args.batch_size, args.input_size,
                                  partial(fixed_offsets, fov_moves=train_dataset_dict.shifts))
    """

    """optimizer"""
    """
    if args.opt == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=1e-3)
    """
    # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.step, gamma=args.gamma, last_epoch=-1)

    """train_loop"""
    t_last = time.time()
    cnt = 0
    # Running confusion-matrix counters, reset on every rank-0 checkpoint.
    tp = fp = tn = fn = 0
    best_loss = np.inf
    model.train()

    while cnt < args.iter:
        cnt += 1
        # resume_tb: persist the current global step every 1000 iterations.
        if cnt % 1000 == 0:
            resume['resume_step'] = cnt + args.resume_step
            pickle_obj(resume, 'resume_step', args.save_path)

        """
        index_batch = (cnt % train_num)
        train_sampler_dict[index_batch].set_epoch(cnt)
        seeds, images, labels, offsets = next(batch_it_dict[index_batch])
        print(input_h5data_dict[index_batch])
        """
        train_sampler.set_epoch(cnt)
        seeds, images, labels, offsets = next(batch_it)

        # train
        t_curr = time.time()
        labels = labels.cuda()
        torch_seed = torch.from_numpy(seeds)
        # Concatenate image and current seed along the channel axis
        # (presumably dim=1 is channels -- TODO confirm in BatchCreator).
        input_data = torch.cat([images, torch_seed], dim=1)
        input_data = Variable(input_data.cuda())

        logits = model(input_data)
        # FFN seed update: add predicted logits to the previous seed state.
        updated = torch_seed.cuda() + logits

        optimizer.zero_grad()
        loss = F.binary_cross_entropy_with_logits(updated, labels)
        loss.backward()
        # torch.nn.utils.clip_grad_value_(model.parameters(), args.clip_grad_thr)
        optimizer.step()

        # Write the updated seed back into the numpy buffer in place so the
        # next iteration starts from it.
        seeds[...] = updated.detach().cpu().numpy()

        # Threshold at logit(0.8) to binarize predictions for the metrics.
        pred_mask = (updated >= logit(0.8)).detach().cpu().numpy()
        true_mask = (labels > 0.5).cpu().numpy()
        true_bg = np.logical_not(true_mask)
        pred_bg = np.logical_not(pred_mask)
        tp += (true_mask & pred_mask).sum()
        fp += (true_bg & pred_mask).sum()
        fn += (true_mask & pred_bg).sum()
        tn += (true_bg & pred_bg).sum()
        # max(..., 1) guards against division by zero before any positives.
        precision = 1.0 * tp / max(tp + fp, 1)
        recall = 1.0 * tp / max(tp + fn, 1)
        accuracy = 1.0 * (tp + tn) / (tp + tn + fp + fn)
        print('[rank_{}:, Iter_{}:, loss: {:.4}, Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%]\r'.format(hvd.rank(), cnt, loss.item(), precision * 100, recall * 100, accuracy * 100))
        # scheduler.step()

        """model_saving_(iter)"""
        if (cnt % args.save_interval) == 0 and hvd.rank() == 0:
            # Reset counters so the next window's metrics are fresh.
            tp = fp = tn = fn = 0
            # t_last = t_curr
            # best_loss = loss.item()
            input_size_r = list(args.input_size)
            delta_r = list(args.delta)
            torch.save(model.state_dict(), os.path.join(args.save_path, (
                str(args.tag) + 'ffn_model_fov:{}_delta:{}_depth:{}.pth'.format(input_size_r[0], delta_r[0], args.depth))))
            torch.save(model.state_dict(), os.path.join(args.save_path, (
                str(args.tag) + 'ffn_model_fov:{}_delta:{}_depth:{}_recall{}_.pth'.format(input_size_r[0], delta_r[0], args.depth,recall*100))))
            print('Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%, Model saved!'.format(
                precision * 100, recall * 100, accuracy * 100))

        # Skip the first buffer_step iterations of TensorBoard logging
        # (warm-up), then log at an offset global step.
        buffer_step = 3000
        resume_step = args.resume_step - buffer_step
        if cnt > buffer_step:
            tb.add_scalar("Loss", loss.item(), cnt + resume_step)
            tb.add_scalar("Precision", precision * 100, cnt + resume_step)
            tb.add_scalar("Recall", recall * 100, cnt + resume_step)
            tb.add_scalar("Accuracy", accuracy * 100, cnt + resume_step)