import json
import time

import torch
import apex  # optional: only the AMP variant further below uses apex.amp


# Variant: single entry point that writes per-rank and overall JSON throughput reports.
def run_benchmarking(net, batch_size, iterations, run_fp16, dataparallel,
                     distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
        num_devices = len(device_ids) if device_ids is not None else torch.cuda.device_count()
    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=device_ids)
        num_devices = len(device_ids) if device_ids is not None else torch.cuda.device_count()
    else:
        num_devices = 1

    # inception_v3 expects 299x299 inputs; the other models take 224x224.
    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    # Random ImageNet-style labels (1000 classes). The original `randint(0, 1, ...)`
    # produced all-zero labels; an alternative left in the source was
    # torch.arange(batch_size, device="cuda").
    target = torch.randint(0, 1000, size=(batch_size,), device="cuda")

    # For FP16 runs the optimizer updates an FP32 master copy of the parameters.
    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)
    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    # distributed_parameters may be None for single-device runs.
    distributed_parameters = distributed_parameters or {}
    rank = distributed_parameters.get('rank', -1)
    world_size = distributed_parameters.get('world_size', 1)
    # `tmp` is assumed to be a module-level output directory for the JSON reports.
    process_report = {
        'model': net,
        'rank': rank,
        'num_device': num_devices,
        'batch_size': batch_size,
        'batch_time': time_per_batch,
        'speed': batch_size / time_per_batch,
    }
    with open(f'{tmp}/process_report_{rank}.json', 'w') as report:
        json.dump(process_report, report)
    if rank == 0:
        overall_report = {
            'world_size': world_size,
            'batch_size': batch_size * world_size,
            'batch_time': time_per_batch,
            'speed': batch_size * world_size / time_per_batch,
        }
        with open(f'{tmp}/overall_report.json', 'w') as report:
            json.dump(overall_report, report)
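# --- Hedged sketch, not from the source: a minimal forwardbackward() helper matching
# the four-argument call sites above (the AMP variant further below passes extra
# arguments). One benchmark step is a forward pass, a cross-entropy loss, a backward
# pass, and an optimizer step.
def forwardbackward(inp, optimizer, network, target):
    optimizer.zero_grad()
    out = network(inp)
    # inception_v3 returns (logits, aux_logits) in training mode; score the main head.
    if isinstance(out, tuple):
        out = out[0]
    loss = torch.nn.functional.cross_entropy(out, target)
    loss.backward()
    optimizer.step()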
# Variant: one process per GPU; the local_rank-first signature matches
# torch.multiprocessing.spawn's calling convention.
def run_benchmarking(local_rank, ngpus, net, batch_size, iterations, run_fp16,
                     dataparallel, distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print("INFO: Running dataparallel on devices: {}".format(devices_to_run_on))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif distributed_dataparallel:
        # Each spawned worker offsets the base rank by its local rank.
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [device_ids[local_rank] if device_ids else local_rank]
        print("INFO: Rank {} running distributed_dataparallel on devices: {}".format(
            distributed_parameters['rank'], devices_to_run_on))
        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
        # Each DDP process works on its share of the global mini-batch.
        batch_size = int(batch_size / ngpus)

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)
    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
        if i % 10 == 0:
            print(time.asctime(time.localtime(time.time())) +
                  " INFO: iteration " + str(i) + " completed.")
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " + str(distributed_parameters['rank']) + "--------")
        print("Num devices: 1")
    else:
        print("Num devices: {}".format(ngpus))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        print("")
        print("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(world_size))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size / time_per_batch))
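# --- Hedged usage sketch, not from the source: launching the variant above with
# torch.multiprocessing.spawn, which prepends local_rank (0..ngpus-1) to args for
# each worker. The dictionary keys passed to rendezvous() ('dist_backend',
# 'dist_url') are assumptions; match them to your rendezvous() implementation.
import torch.multiprocessing as mp

def launch_multiprocess(ngpus, net, batch_size, iterations, run_fp16=False):
    distributed_parameters = {
        'rank': 0,                            # base rank; each worker adds local_rank
        'world_size': ngpus,
        'dist_backend': 'nccl',               # assumed key name
        'dist_url': 'tcp://127.0.0.1:20000',  # assumed key name
    }
    mp.spawn(run_benchmarking,
             args=(ngpus, net, batch_size, iterations, run_fp16,
                   False, True, None, distributed_parameters),
             nprocs=ngpus)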
# Variant: single-process entry point that also reports the model's parameter count.
def run_benchmarking(net, batch_size, iterations, run_fp16, dataparallel,
                     distributed_dataparallel, device_ids=None,
                     distributed_parameters=None):
    if device_ids:
        torch.cuda.set_device("cuda:%d" % device_ids[0])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    print('Total parameters:', count_parameters(network))
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        network = torch.nn.DataParallel(network, device_ids=device_ids)
        num_devices = len(device_ids) if device_ids is not None else torch.cuda.device_count()
    elif distributed_dataparallel:
        rendezvous(distributed_parameters)
        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=device_ids)
        num_devices = len(device_ids) if device_ids is not None else torch.cuda.device_count()
    else:
        num_devices = 1

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    target = torch.arange(batch_size, device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target)
    forwardbackward(inp, optimizer, network, target)
    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        forwardbackward(inp, optimizer, network, target)
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " + str(distributed_parameters['rank']) + "--------")
    print("Num devices: {}".format(num_devices))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        print("")
        print("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(num_devices * world_size))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size / time_per_batch))
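# --- Hedged sketch, not from the source: count_parameters() as called above is
# presumably the usual count of trainable parameter elements.
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)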
# Variant: adds a profiled iteration (prof_step) and NVIDIA Apex AMP opt levels.
def run_benchmarking(local_rank, ngpus, net, batch_size, iterations, prof_step,
                     amp_opt_level, run_fp16, dataparallel, distributed_dataparallel,
                     device_ids=None, distributed_parameters=None):
    if device_ids:
        assert ngpus == len(device_ids)
        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
    else:
        torch.cuda.set_device("cuda:0")

    network = get_network(net)
    if net == "shufflenet":
        # The original applied weight_init to an undefined `model`; the freshly
        # built `network` is the intended target.
        network.apply(weight_init)
    if run_fp16:
        network = network_to_half(network)

    if dataparallel:
        devices_to_run_on = device_ids if device_ids else list(range(ngpus))
        print("INFO: Running dataparallel on devices: {}".format(devices_to_run_on))
        network = torch.nn.DataParallel(network, device_ids=devices_to_run_on)
    elif distributed_dataparallel:
        distributed_parameters['rank'] += local_rank
        rendezvous(distributed_parameters)
        devices_to_run_on = [device_ids[local_rank] if device_ids else local_rank]
        print("INFO: Rank {} running distributed_dataparallel on devices: {}".format(
            distributed_parameters['rank'], devices_to_run_on))
        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
        # Each DDP process works on its share of the global mini-batch.
        batch_size = int(batch_size / ngpus)

    if net == "inception_v3":
        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
    else:
        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
    if run_fp16:
        inp = inp.half()
    if net in models:
        # number of classes is 1000 for imagenet
        target = torch.randint(0, 1000, (batch_size,), device="cuda")
    elif net in segmentation_models:
        # number of classes is 21 for segmentation
        target = torch.randint(0, 21, (batch_size,), device="cuda")

    param_copy = network.parameters()
    if run_fp16:
        param_copy = get_param_copy(network)
    optimizer = torch.optim.SGD(param_copy, lr=0.01, momentum=0.9)
    if amp_opt_level:
        network, optimizer = apex.amp.initialize(network, optimizer,
                                                 opt_level="O%d" % amp_opt_level)

    ## warmup.
    print("INFO: running forward and backward for warmup.")
    forwardbackward(inp, optimizer, network, target, amp_opt_level)
    forwardbackward(inp, optimizer, network, target, amp_opt_level)
    time.sleep(1)
    torch.cuda.synchronize()

    ## benchmark.
    print("INFO: running the benchmark..")
    tm = time.time()
    for i in range(iterations):
        if i == prof_step:
            # The extra argument asks forwardbackward() to profile this iteration.
            forwardbackward(inp, optimizer, network, target, amp_opt_level, i)
        else:
            forwardbackward(inp, optimizer, network, target, amp_opt_level)
    torch.cuda.synchronize()
    tm2 = time.time()
    time_per_batch = (tm2 - tm) / iterations

    if run_fp16:
        dtype = 'FP16'
    elif amp_opt_level == 1:
        dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.'
    elif amp_opt_level == 2:
        dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.'
    elif amp_opt_level == 3:
        dtype = 'AMP-O3: Pure FP16 training.'
    elif amp_opt_level == 4:
        dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.'
    elif amp_opt_level == 5:
        dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.'
    else:
        dtype = 'FP32'

    print("OK: finished running benchmark..")
    print("--------------------SUMMARY--------------------------")
    print("Microbenchmark for network : {}".format(net))
    if distributed_dataparallel:
        print("--------This process: rank " + str(distributed_parameters['rank']) + "--------")
        print("Num devices: 1")
    else:
        print("Num devices: {}".format(ngpus))
    print("Dtype: {}".format(dtype))
    print("Mini batch size [img] : {}".format(batch_size))
    print("Time per mini-batch : {}".format(time_per_batch))
    print("Throughput [img/sec] : {}".format(batch_size / time_per_batch))
    if distributed_dataparallel:
        print("")
        print("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
        world_size = distributed_parameters['world_size']
        print("Num devices: {}".format(world_size))
        print("Dtype: {}".format(dtype))
        print("Mini batch size [img] : {}".format(batch_size * world_size))
        print("Time per mini-batch : {}".format(time_per_batch))
        print("Throughput [img/sec] : {}".format(batch_size * world_size / time_per_batch))
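# --- Hedged sketches, not from the source: plausible implementations of the helpers
# every variant above assumes. rendezvous() wires up torch.distributed (the key
# names in distributed_parameters are assumptions); network_to_half() and
# get_param_copy() are the usual FP16 recipe of a half-precision model driven by an
# FP32 master copy of the parameters.
def rendezvous(distributed_parameters):
    torch.distributed.init_process_group(
        backend=distributed_parameters['dist_backend'],
        init_method=distributed_parameters['dist_url'],
        rank=distributed_parameters['rank'],
        world_size=distributed_parameters['world_size'])

def network_to_half(network):
    # Whole-module conversion; production FP16 code usually keeps batchnorm in FP32.
    return network.half()

def get_param_copy(network):
    # FP32 master copy for the optimizer; syncing grads/weights between the FP16
    # model and this copy (normally done inside forwardbackward()) is elided here.
    param_copy = [p.detach().clone().float() for p in network.parameters()]
    for p in param_copy:
        p.requires_grad = True
    return param_copy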