def test_model(model, forward_only=False):
    """Benchmark `model` and print mean/std latencies, peak GPU memory and
    throughput.

    Runs N_RUNS groups of RUN_ITERS timed iterations (module-level
    constants), using the module-level INP, TARGET and criterion. When
    `forward_only` is true, autograd is disabled and only the forward pass
    is timed.
    """
    optimizer = torch.optim.SGD(model.parameters(), 0.01,
                                momentum=0.9, weight_decay=1e-4)
    fwd_avgs = []
    full_avgs = []
    with cudnn.flags(enabled=True, benchmark=True), \
            torch.set_grad_enabled(not forward_only):
        ev_start = torch.cuda.Event(enable_timing=True)
        ev_fwd = torch.cuda.Event(enable_timing=True)
        ev_end = torch.cuda.Event(enable_timing=True)

        def run_once():
            # One timed iteration -> (forward msec, forward+backward msec).
            ev_start.record()
            output = model(INP)
            ev_fwd.record()
            if forward_only:
                torch.cuda.synchronize()
                # No backward pass: the forward time fills both slots.
                fwd_ms = ev_start.elapsed_time(ev_fwd)
                return fwd_ms, fwd_ms
            loss = criterion(output, TARGET)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ev_end.record()
            torch.cuda.synchronize()
            return ev_start.elapsed_time(ev_fwd), ev_start.elapsed_time(ev_end)

        # Warmup runs: these include cudnn algorithm benchmarking and are
        # usually much slower than the following ones.
        for _ in range(2):
            run_once()
        # cudnn benchmarking allocates a lot of scratch memory; reset the
        # peak counter so we report the max alloc of the fastest algorithm.
        torch.cuda.reset_max_memory_allocated(0)
        for _ in range(N_RUNS):
            fwd_meter = AverageMeter()
            full_meter = AverageMeter()
            for _ in range(RUN_ITERS):
                t_fwd, t_full = run_once()
                fwd_meter.update(t_fwd)
                full_meter.update(t_full)
            fwd_avgs.append(fwd_meter.avg)
            full_avgs.append(full_meter.avg)
    fwd_avgs = np.array(fwd_avgs)
    full_avgs = np.array(full_avgs)
    print(
        "Mean of {} runs {} iters each BS={}:\n\t {:.2f}+-{:.2f} msecs Forward. {:.2f}+-{:.2f} msecs Backward. Max memory: {:.2f}Mb. {:.2f} imgs/sec"
        .format(
            N_RUNS,
            RUN_ITERS,
            BS,
            fwd_avgs.mean(),
            fwd_avgs.std(),
            (full_avgs - fwd_avgs).mean(),
            (full_avgs - fwd_avgs).std(),
            torch.cuda.max_memory_allocated(0) / 2**20,
            BS * 1000 / full_avgs.mean(),
        ))
    del optimizer
    del model
def func(*inputs):
    """conv2d wrapper closing over the enclosing conv parameters.

    Unpacks `inputs` as (x, weight, bias) when the enclosing `use_bias`
    flag is set, otherwise as (x, weight) with no bias.
    """
    if use_bias:
        x_in, w, b = inputs
    else:
        x_in, w = inputs
        b = None
    # cudnn is disabled during this forward to avoid finite difference
    # imprecision issues.
    with cudnn.flags(enabled=False):
        return F.conv2d(x_in, w, b, stride, padding, dilation, groups)
def test_resnet_baseline(self):
    """Benchmark the baseline ResNet: time `iterations` forward/backward
    passes per timed block, for `total_iters` blocks, and print GPU and
    CPU wall times to stderr.

    GPU time comes from CUDA events; CPU time from time.time(). cudnn
    benchmark mode is on, so the early blocks double as warmup while
    convolution algorithms are selected.
    """
    N = 100
    total_iters = 20  # warmup + benchmark timed blocks
    iterations = 4    # fwd/bwd passes per timed block
    # NOTE(review): target has N // 5 elements while the input batch is N —
    # looks intentional for this DataParallel benchmark, but verify.
    target = Variable(torch.randn(N // 5).fill_(1)).type("torch.LongTensor")
    x = Variable(torch.randn(N, 3, 224, 224).fill_(1.0), requires_grad=True)
    model = resnet_baseline.load_resnet()
    model = DataParallel(model)
    # switch the model to train mode
    model.train()
    # convert the model and input to cuda
    model = model.cuda()
    input_var = x.cuda()
    target_var = target.cuda()
    # declare the optimizer and criterion
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 0.01,
                                momentum=0.9, weight_decay=1e-4)
    # Gradients are zeroed once only; they accumulate across iterations
    # (this benchmark measures backward cost, not training quality).
    optimizer.zero_grad()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    with cudnn.flags(enabled=True, benchmark=True):
        for i in range(total_iters):
            start.record()
            start_cpu = time.time()
            for j in range(iterations):
                output = model(input_var)
                loss = criterion(output, target_var)
                loss.backward()
                optimizer.step()
            end_cpu = time.time()
            end.record()
            torch.cuda.synchronize()
            gpu_msec = start.elapsed_time(end)
            # Fix: `file=sys.stderr` was previously passed to str.format(),
            # which silently ignores extra keyword arguments, so output went
            # to stdout. It belongs to print().
            print(
                "Baseline resnet ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)"
                .format(i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000),
                file=sys.stderr)
def test_vnet_optim(self):
    """Benchmark the optimized VNet: time `iterations` forward/backward
    passes per timed block and print GPU/CPU wall times to stderr.
    """
    # optimized configuration
    N = 8
    total_iters = 20  # warmup + benchmark timed blocks
    iterations = 1
    target = torch.ones(N, 1, 128, 128, 64).type("torch.LongTensor")
    x = torch.ones(N, 1, 128, 128, 64, requires_grad=True)
    model = vnet_optim.VNet(elu=False, nll=True)
    # equal class weights for the two-class NLL loss
    bg_weight = 0.5
    fg_weight = 0.5
    weights = torch.FloatTensor([bg_weight, fg_weight])
    weights = weights.cuda()
    model.train()
    # convert the model and input to cuda
    model = model.cuda()
    input_var = x.cuda()
    target_var = target.cuda()
    # flatten targets to one class index per voxel for nll_loss
    target = target_var.view(target_var.numel())
    # declare the optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-1,
                                momentum=0.99, weight_decay=1e-8)
    # Gradients are zeroed once only; they accumulate across iterations
    # (benchmark measures backward cost, not training quality).
    optimizer.zero_grad()
    model.apply(self.weights_init)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    with cudnn.flags(enabled=True, benchmark=True):
        for i in range(total_iters):
            start.record()
            start_cpu = time.time()
            for j in range(iterations):
                output = model(input_var)
                loss = F.nll_loss(output, target, weight=weights)
                loss.backward()
                optimizer.step()
            end_cpu = time.time()
            end.record()
            torch.cuda.synchronize()
            gpu_msec = start.elapsed_time(end)
            # Fix: `file=sys.stderr` was previously inside .format(...), where
            # str.format silently ignored it; moved to print() so the output
            # actually goes to stderr.
            print(
                "Optimized vnet ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)".format(
                    i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000),
                file=sys.stderr)
def test_densenet_optim(self):
    """Benchmark the optimized (chunked/checkpointed) DenseNet-264: time
    `iterations` forward/backward passes per timed block and print GPU/CPU
    wall times to stderr.
    """
    N = 32
    chunks = 4        # forward is evaluated in `chunks` segments
    total_iters = 20  # warmup + benchmark timed blocks
    iterations = 1
    x = torch.ones(N, 3, 224, 224, requires_grad=True)
    target = torch.ones(N).type("torch.LongTensor")
    model = densenet_optim.densenet264()
    # switch the model to train mode
    model.train()
    # convert the model and input to cuda
    model = model.cuda()
    input_var = x.cuda()
    target_var = target.cuda()
    # declare the optimizer and criterion
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 0.01,
                                momentum=0.9, weight_decay=1e-4)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    with cudnn.flags(enabled=True, benchmark=True):
        for i in range(total_iters):
            start.record()
            start_cpu = time.time()
            for j in range(iterations):
                output = model(input_var, chunks=chunks)
                loss = criterion(output, target_var)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            end_cpu = time.time()
            end.record()
            torch.cuda.synchronize()
            gpu_msec = start.elapsed_time(end)
            # Fix: `file=sys.stderr` was previously inside .format(...), where
            # str.format silently ignored it; moved to print() so the output
            # actually goes to stderr.
            print(
                "Optimized densenet ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)".format(
                    i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000),
                file=sys.stderr)
def test_wlm_optim(self):
    """Benchmark the optimized word-language-model RNN: time `iterations`
    forward/backward passes per timed block and print GPU/CPU wall times
    to stderr.

    The model's own backward() is invoked (the loss is computed inside the
    chunked model); the module-level criterion is declared for parity with
    the other benchmarks.
    """
    total_iters = 20
    iterations = 1
    chunks = 4
    model_name = 'LSTM'
    ntokens = 33278
    emsize = 200
    nhid = 200
    nlayers = 1
    dropout = 0.2
    tied = False
    batchsize = 20
    bptt = 7000
    # NOTE(review): Variable(..., volatile=...) is a pre-0.4 PyTorch API —
    # kept as-is for compatibility with the rest of this benchmark suite.
    data = Variable(torch.LongTensor(bptt, batchsize).fill_(1), volatile=False)
    target_var = torch.ones(bptt * batchsize).type("torch.LongTensor")
    targets = target_var.cuda()
    input_data = data.cuda()
    model = wlm_optim.RNNModel(model_name, ntokens, emsize, nhid,
                               nlayers, dropout, tied)
    model = model.cuda()
    model.train()
    criterion = nn.CrossEntropyLoss().cuda()
    hidden = model.init_hidden(batchsize)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    with cudnn.flags(enabled=True, benchmark=True):
        for i in range(total_iters):
            start.record()
            start_cpu = time.time()
            for j in range(iterations):
                # detach the hidden state from the previous iteration's graph
                hidden = self.repackage_hidden(hidden)
                output, hidden = model(input_data, hidden, targets, chunks=chunks)
                model.backward(output)
            end_cpu = time.time()
            end.record()
            torch.cuda.synchronize()
            gpu_msec = start.elapsed_time(end)
            # Fix: `file=sys.stderr` was previously inside .format(...), where
            # str.format silently ignored it; moved to print() so the output
            # actually goes to stderr.
            print(
                "Optimized WLM ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)".format(
                    i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000),
                file=sys.stderr)
torch.distributed.barrier() logging.info("apex sync") if use_amp == 0: # horovod optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # switch the model to train mode model.train() # convert the model and input to cuda input_var = x.cuda(device) target_var = target.cuda(device) # declare the optimizer and criterion criterion = nn.CrossEntropyLoss().cuda(device) with cudnn.flags(enabled=True, benchmark=True): for i in range(total_iters): logging.info(f"local_rank {local_rank} iteration {i}") for j in range(iterations): output = model(input_var, chunks=chunks) loss = criterion(output, target_var) logging.info(f"local_rank {local_rank} loss {loss}") #logging.info(f"local_rank {local_rank} loss requires_grad {loss.requires_grad}") #logging.info(f"local_rank {local_rank} loss grad_fn {loss.grad_fn}") optimizer.zero_grad() if use_amp == 1: with amp.scale_loss( loss, optimizer, delay_unscale=False) as scaled_loss: scaled_loss.backward() if use_amp == 0:
def forward(self, log_probs, targets, input_lengths, target_lengths):
    """Delegate to the parent loss's forward with cudnn disabled for the
    duration of the call."""
    with cudnn.flags(enabled=False):
        return super().forward(
            log_probs=log_probs,
            targets=targets,
            input_lengths=input_lengths,
            target_lengths=target_lengths,
        )