def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    # args.gpu = 0
    if args.distributed:
        # args.gpu = args.rank % torch.cuda.device_count()
        # torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()

    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
        # args.lr *= n_dev
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev
        # args.lr *= n_dev

    global param_copy
    if args.fp16:
        # keep an fp32 "master" copy of the weights for the optimizer
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_sampler = (
        torch.utils.data.distributed.DistributedSampler(train_dataset)
        if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(int(args.sz * 1.14)),
                transforms.CenterCrop(args.sz),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
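# ---------------------------------------------------------------------------
# Note: `param_copy` above holds fp32 "master" weights while the model itself
# runs in fp16. The train() function is not shown in this snippet; the sketch
# below shows the optimizer step such a loop would need. This is a minimal
# sketch under that assumption; the helper name is illustrative, not part of
# this file.

import torch


def master_params_step(optimizer, model_params, master_params, loss_scale=1.0):
    # Copy fp16 gradients into the fp32 master copies (undoing any loss
    # scaling), step the optimizer on the fp32 params, then copy the updated
    # weights back into the fp16 model.
    for master, model_p in zip(master_params, model_params):
        if master.grad is None:
            master.grad = torch.empty_like(master.data)
        master.grad.data.copy_(model_p.grad.data)
        master.grad.data.div_(loss_scale)
    optimizer.step()
    for master, model_p in zip(master_params, model_params):
        model_p.data.copy_(master.data)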
def main():
    print("~~epoch\thours\ttop1Accuracy\n")
    start_time = datetime.now()

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        model = models.__dict__[args.arch](pretrained=True)
    else:
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    best_prec1 = 0
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    train_loader, val_loader, train_sampler = get_loaders(traindir, valdir)

    if args.evaluate:
        # `epoch` was undefined on this path; report at the resumed epoch
        return validate(val_loader, model, criterion, args.start_epoch, start_time)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # for the last few epochs, train at a larger image size with a
        # smaller batch and rebuild the loaders accordingly
        if epoch == args.epochs - 6:
            args.sz = 288
            args.batch_size = 128
            train_loader, val_loader, train_sampler, val_sampler = get_loaders(
                traindir, valdir, use_val_sampler=False, min_scale=0.5)
            if args.distributed:
                train_sampler.set_epoch(epoch)
                val_sampler.set_epoch(epoch)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break

        prec1 = validate(val_loader, model, criterion, epoch, start_time)

        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
                                               transform=transform,
                                               num_workers=num_workers)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())

    epoch = 0
    if args.checkpoint:
        load_checkpoint(model, optimizer, args.checkpoint)
        # recover the epoch number from a checkpoint name like "model_42.pth"
        tag = args.checkpoint.split('_')[-1].split('.')[0]
        if tag.isnumeric():
            epoch = int(tag)

    if args.half:
        model = network_to_half(model)
    model = model.cuda()
    model.name = f'{args.model}_{args.tag}'

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.5623)

    # training
    # fast-forward the scheduler to the resumed epoch
    for i in range(epoch):
        scheduler.step()
    for i in range(epoch, 400):
        epoch += 1
        scheduler.step()
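# ---------------------------------------------------------------------------
# Note on the resume logic above: stepping the scheduler `epoch` times before
# training replays the decay schedule, so the learning rate after resuming
# matches a fresh run. With StepLR(step_size=40, gamma=0.5623) this is
# equivalent to the closed form below (a standalone sketch with illustrative
# values, not part of this file):

base_lr = 0.001
gamma, step_size, epoch = 0.5623, 40, 120
lr = base_lr * gamma ** (epoch // step_size)  # = base_lr * 0.5623**3 ~ 0.178 * base_lr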
    elif args.net == 'mobilenet':
        net = MobileNetV2()
        # net = DPN92()
    elif args.net == 'shufflenet':
        net = ShuffleNetv2()
    elif args.net == 'efficientnet':
        net = Efficientnet()
    else:
        # was `print("{} not found").format(args.net)`, which calls .format on None
        print("{} not found".format(args.net))

    net = net.to(device)
    print(net)

    if args.fp16:
        from fp16util import network_to_half
        net = network_to_half(net)

    if device == 'cuda':
        # net = torch.nn.DataParallel(net)  # make parallel
        # can't use DataParallel for onnx export; see
        # https://github.com/pytorch/pytorch/issues/13397
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('./checkpoint/{}-ckpt.t7'.format(args.net))
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']
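# ---------------------------------------------------------------------------
# `network_to_half` is imported from fp16util above but not shown. A common
# implementation (a sketch following the NVIDIA fp16 example utilities; the
# actual fp16util in this repo may differ) converts the model to fp16 while
# keeping BatchNorm layers in fp32, since batch statistics are numerically
# fragile in half precision:

import torch
import torch.nn as nn


class tofp16(nn.Module):
    """Casts the incoming batch to fp16 before it reaches the model."""

    def forward(self, input):
        return input.half()


def BN_convert_float(module):
    # Recursively restore BatchNorm layers to fp32 after model.half().
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module


def network_to_half(network):
    return nn.Sequential(tofp16(), BN_convert_float(network.half()))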
def run_benchmarking(net, batch_size, iterations, run_fp16, dataparallel,
                     distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
        num_devices = (len(device_ids) if device_ids is not None
                       else torch.cuda.device_count())
    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=device_ids)
        num_devices = (len(device_ids) if device_ids is not None
                       else torch.cuda.device_count())
    else:
        num_devices = 1

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    target = torch.randint(
        0, 1, size=(batch_size, ),
        device='cuda')  # torch.arange(batch_size, device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    # guard: distributed_parameters defaults to None in the non-distributed case
    distributed_parameters = distributed_parameters or {}
    rank = distributed_parameters.get('rank', -1)
    world_size = distributed_parameters.get('world_size', 1)
    process_report = {
        'model': net,
        'rank': rank,
        'num_device': num_devices,
        'batch_size': batch_size,
        'batch_time': time_per_batch,
        'speed': batch_size / time_per_batch
    }
    with open(f'{tmp}/process_report_{rank}.json', 'w') as report:
        json.dump(process_report, report)
    if rank == 0:
        overall_report = {
            'world_size': world_size,
            'batch_size': batch_size * world_size,
            'batch_time': time_per_batch,
            'speed': batch_size * world_size / time_per_batch
        }
        with open(f'{tmp}/overall_report.json', 'w') as report:
            json.dump(overall_report, report)
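# ---------------------------------------------------------------------------
# The per-process JSON reports written above can be aggregated once all ranks
# finish. A minimal sketch, assuming the same `tmp` directory and the file
# layout produced by run_benchmarking (the function name is illustrative):

import glob
import json


def aggregate_reports(tmp):
    reports = []
    for path in glob.glob(f'{tmp}/process_report_*.json'):
        with open(path) as f:
            reports.append(json.load(f))
    # per-rank speeds sum to the aggregate throughput across the job
    total_speed = sum(r['speed'] for r in reports)
    print('ranks: {}, aggregate throughput [img/sec]: {:.1f}'.format(
        len(reports), total_speed))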
def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()

    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
        # args.lr *= n_dev
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev
        # args.lr *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [param.clone().type(torch.cuda.FloatTensor).detach()
                      for param in model.parameters()]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset)
                     if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(int(args.sz * 1.14)),
            transforms.CenterCrop(args.sz),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
def main():
    global args, best_prec1
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    args.distributed = args.world_size > 1
    print("INFO: args.distributed value is: {} and value of world_size is {}".format(
        args.distributed, args.world_size))

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.gpu is not None:
        model = model.cuda(args.gpu)
    elif args.distributed:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    if args.fp16:
        model = network_to_half(model)

    global param_copy
    if args.fp16:
        param_copy = [param.clone().type(torch.cuda.FloatTensor).detach()
                      for param in model.parameters()]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(param_copy, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--fp16', type=int, default=0, required=False,
                        help='undergo fp16 training')
    parser.add_argument('--scale_factor', type=float, default=1,
                        help='Loss scale factor for fp16 training')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    if args.fp16:
        assert torch.backends.cudnn.enabled

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    if args.fp16:
        print("INFO: training the network for fp16")
        model = network_to_half(model)

    global param_copy
    if args.fp16:
        # keep fp32 master weights for the optimizer
        param_copy = [
            param.clone().type(torch.cuda.FloatTensor).detach()
            for param in model.parameters()
        ]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    optimizer = optim.SGD(param_copy, lr=args.lr, momentum=args.momentum)
    print(model)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
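# ---------------------------------------------------------------------------
# The --scale_factor flag above implies static loss scaling inside train(),
# which is not shown in this snippet. A minimal sketch of the backward pass
# (hypothetical; F.nll_loss assumes the usual log_softmax output of the MNIST
# Net, and the unscaled gradients would then be copied to the fp32 master
# params as in the earlier master-weight sketch):

import torch.nn.functional as F


def scaled_backward(model, output, target, scale_factor):
    loss = F.nll_loss(output, target)
    # Scale the loss so small fp16 gradients do not underflow to zero.
    (loss * scale_factor).backward()
    # Gradients must be divided by the same factor before the optimizer step.
    for p in model.parameters():
        if p.grad is not None:
            p.grad.data.div_(scale_factor)
    return loss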
def run_benchmarking(net, batch_size, iterations, run_fp16, dataparallel,
                     distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    print('Total parameters:', count_parameters(network))
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
        num_devices = (len(device_ids) if device_ids is not None
                       else torch.cuda.device_count())
    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=device_ids)
        num_devices = (len(device_ids) if device_ids is not None
                       else torch.cuda.device_count())
    else:
        num_devices = 1

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " +
              str(distributed_parameters['rank']) + "--------")
    print("Num devices: {}".format(num_devices))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        print("")
        print("--------Overall (all ranks) "
              "(assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(num_devices * world_size))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size / time_per_batch))
def run_benchmarking(local_rank, ngpus, net, batch_size, iterations, run_fp16,
                     dataparallel, distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print("INFO: Running dataparallel on devices: {}".format(str(devices_to_run_on)))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif distributed_dataparallel:
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)]
        print("INFO: Rank {} running distributed_dataparallel on devices: {}".format(
            distributed_parameters['rank'], str(devices_to_run_on)))
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=devices_to_run_on)
        # each rank benchmarks its share of the global batch
        batch_size = int(batch_size / ngpus)

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
        if i % 10 == 0:
            print(time.asctime(time.localtime(time.time())) +
                  " INFO: iteration " + str(i) + " completed.")
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " +
              str(distributed_parameters['rank']) + "--------")
        print("Num devices: 1")
    else:
        print("Num devices: {}".format(ngpus))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        print("")
        print("--------Overall (all ranks) "
              "(assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(world_size))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size / time_per_batch))
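# ---------------------------------------------------------------------------
# This variant of run_benchmarking takes a local_rank and splits batch_size
# across ngpus, so it is designed to be launched once per GPU. A minimal
# launch sketch using torch.multiprocessing.spawn, which passes the process
# index as the first argument; the net/batch/iteration values and the
# distributed_parameters keys are illustrative assumptions:

import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    ngpus = torch.cuda.device_count()
    distributed_parameters = {
        'rank': 0,
        'world_size': ngpus,
        'dist_backend': 'nccl',
        'dist_url': 'tcp://127.0.0.1:4332',
    }
    # spawn calls run_benchmarking(local_rank, *args) in each worker process
    mp.spawn(run_benchmarking,
             args=(ngpus, 'resnet50', 256, 100, False, False, True,
                   None, distributed_parameters),
             nprocs=ngpus)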
def run_benchmarking(local_rank, ngpus, net, batch_size, iterations, prof_step,
                     amp_opt_level, run_fp16, dataparallel,
                     distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if net == "shufflenet":
        # was `model.apply(weight_init)`, but no `model` exists in this scope
        network.apply(weight_init)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print("INFO: Running dataparallel on devices: {}".format(str(devices_to_run_on)))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif distributed_dataparallel:
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)]
        print("INFO: Rank {} running distributed_dataparallel on devices: {}".format(
            distributed_parameters['rank'], str(devices_to_run_on)))
        network = torch.nn.parallel.DistributedDataParallel(
            network, device_ids=devices_to_run_on)
        # each rank benchmarks its share of the global batch
        batch_size = int(batch_size / ngpus)

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()

    if net in models:
        # number of classes is 1000 for imagenet
        target = torch.randint(0, 1000, (batch_size, ), device="cuda")
    elif net in segmentation_models:
        # number of classes is 21 for segmentation
        target = torch.randint(0, 21, (batch_size, ), device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)
    if amp_opt_level:
        network, optimizer = apex.amp.initialize(network, optimizer,
                                                 opt_level="O%d" % amp_opt_level)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target, amp_opt_level)
    forwardbackward(inp, optimizer, network, target, amp_opt_level)

    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        if i == prof_step:
            forwardbackward(inp, optimizer, network, target, amp_opt_level, i)
        else:
            forwardbackward(inp, optimizer, network, target, amp_opt_level)
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    if run_fp16:
        dtype = 'FP16'
    elif amp_opt_level == 1:
        dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.'
    elif amp_opt_level == 2:
        dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.'
    elif amp_opt_level == 3:
        dtype = 'AMP-O3: Pure FP16 training.'
    elif amp_opt_level == 4:
        dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.'
    elif amp_opt_level == 5:
        dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.'
    else:
        dtype = 'FP32'

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " +
              str(distributed_parameters['rank']) + "--------")
        print("Num devices: 1")
    else:
        print("Num devices: {}".format(ngpus))
    print("Dtype: {}".format(dtype))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        print("")
        print("--------Overall (all ranks) "
              "(assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(world_size))
        print("Dtype: {}".format(dtype))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size / time_per_batch))
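# ---------------------------------------------------------------------------
# forwardbackward() is not shown in these snippets. A minimal sketch of what
# it plausibly does when an AMP opt level is active, using the real
# apex.amp.scale_loss API; the cross-entropy criterion is an assumption, and
# the optional profiling argument from the benchmark loop above is omitted:

import torch
import apex


def forwardbackward(inp, optimizer, network, target, amp_opt_level=None):
    optimizer.zero_grad()
    out = network(inp)
    loss = torch.nn.functional.cross_entropy(out, target)
    if amp_opt_level:
        # amp scales the loss and unscales gradients around backward()
        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()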