Example #1
def test_model(model, forward_only=False):
    optimizer = torch.optim.SGD(model.parameters(),
                                0.01,
                                momentum=0.9,
                                weight_decay=1e-4)
    f_times = []
    fb_times = []
    with cudnn.flags(enabled=True,
                     benchmark=True), torch.set_grad_enabled(not forward_only):
        start = torch.cuda.Event(enable_timing=True)
        f_end = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        def run_once():
            start.record()
            output = model(INP)
            f_end.record()
            if forward_only:
                torch.cuda.synchronize()
                # forward-only mode: report the forward time in both result slots
                return start.elapsed_time(f_end), start.elapsed_time(f_end)
            loss = criterion(output, TARGET)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            end.record()
            torch.cuda.synchronize()
            return start.elapsed_time(f_end), start.elapsed_time(end)

        # warmup runs: cudnn benchmark mode makes the first iterations much slower than the following ones
        for _ in range(2):
            run_once()
        # cudnn benchmarking allocates a lot of extra memory, so reset the
        # peak-memory counter to capture only the usage of the fastest algorithm
        torch.cuda.reset_max_memory_allocated(0)
        for _ in range(N_RUNS):
            f_meter = AverageMeter()
            fb_meter = AverageMeter()
            for _ in range(RUN_ITERS):
                f_t, fb_t = run_once()
                f_meter.update(f_t)
                fb_meter.update(fb_t)
            f_times.append(f_meter.avg)
            fb_times.append(fb_meter.avg)
        f_times = np.array(f_times)
        fb_times = np.array(fb_times)
    print(
        "Mean of {} runs {} iters each BS={}:\n\t {:.2f}+-{:.2f} msecs Forward. {:.2f}+-{:.2f} msecs Backward. Max memory: {:.2f}Mb. {:.2f} imgs/sec"
        .format(
            N_RUNS,
            RUN_ITERS,
            BS,
            f_times.mean(),
            f_times.std(),
            (fb_times - f_times).mean(),
            (fb_times - f_times).std(),
            torch.cuda.max_memory_allocated(0) / 2**20,
            BS * 1000 / fb_times.mean(),
        ))
    del optimizer
    del model
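
test_model relies on several module-level names that this listing does not show: INP, TARGET, criterion, AverageMeter, N_RUNS, RUN_ITERS and BS. A minimal sketch of supporting definitions that would make the snippet runnable; every shape and constant below is an assumption, not part of the original example:

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn

class AverageMeter:
    """Tracks the running average of the values passed to update()."""
    def __init__(self):
        self.sum, self.count = 0.0, 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)

# Hypothetical benchmark configuration; the original values are not shown.
BS, N_RUNS, RUN_ITERS = 32, 5, 10
INP = torch.randn(BS, 3, 224, 224, device="cuda")
TARGET = torch.randint(0, 1000, (BS,), device="cuda")
criterion = nn.CrossEntropyLoss().cuda()
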
Example #2
def func(*inputs):
    if use_bias:
        lx, lweight, lbias = inputs
    else:
        lx, lweight = inputs
        lbias = None
    # Disable cudnn during the forward pass to avoid finite-difference imprecision issues.
    with cudnn.flags(enabled=False):
        out = F.conv2d(lx, lweight, lbias, stride, padding, dilation, groups)
    return out
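
The "finite difference" comment suggests this closure is fed to torch.autograd.gradcheck, which compares analytic gradients against numeric ones in double precision and is too sensitive for cudnn's approximate algorithms. A sketch of that presumed usage; the conv parameters and tensor shapes are illustrative assumptions (in the original they are captured from an enclosing scope):

import torch
from torch.autograd import gradcheck

# Assumed values for the variables func() closes over.
stride, padding, dilation, groups, use_bias = 1, 1, 1, 1, True

# gradcheck needs float64 inputs with requires_grad=True.
x = torch.randn(2, 4, 8, 8, dtype=torch.double, requires_grad=True)
weight = torch.randn(6, 4, 3, 3, dtype=torch.double, requires_grad=True)
bias = torch.randn(6, dtype=torch.double, requires_grad=True)
inputs = (x, weight, bias) if use_bias else (x, weight)
assert gradcheck(func, inputs)  # raises if analytic and numeric gradients diverge
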
Example #3
    def test_resnet_baseline(self):
        N = 100
        total_iters = 20  # (warmup + benchmark)
        iterations = 4

        target = Variable(torch.randn(N).fill_(1)).type("torch.LongTensor")
        x = Variable(torch.randn(N, 3, 224, 224).fill_(1.0),
                     requires_grad=True)
        # x = Variable(torch.randn(N, 3, 32, 32).fill_(1.0), requires_grad=True)
        # model = resnet_baseline.resnet200()
        # model = resnet_baseline.resnet101()
        model = resnet_baseline.load_resnet()
        model = DataParallel(model)
        # model = resnet_baseline.resnet1001()

        # switch the model to train mode
        model.train()

        # convert the model and input to cuda
        model = model.cuda()
        input_var = x.cuda()
        target_var = target.cuda()

        # declare the optimizer and criterion
        criterion = nn.CrossEntropyLoss().cuda()
        optimizer = torch.optim.SGD(model.parameters(),
                                    0.01,
                                    momentum=0.9,
                                    weight_decay=1e-4)
        optimizer.zero_grad()

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        with cudnn.flags(enabled=True, benchmark=True):
            for i in range(total_iters):
                start.record()
                start_cpu = time.time()
                for j in range(iterations):
                    output = model(input_var)
                    loss = criterion(output, target_var)
                    loss.backward()
                    optimizer.step()

                end_cpu = time.time()
                end.record()
                torch.cuda.synchronize()
                gpu_msec = start.elapsed_time(end)
                print(
                    "Baseline resnet ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)"
                    .format(i, gpu_msec * 1000,
                            (end_cpu - start_cpu) * 1000000),
                    file=sys.stderr)
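
The method above shows the standard CUDA event timing pattern: elapsed_time(end) is only meaningful once both events have actually executed on the GPU, which is why torch.cuda.synchronize() precedes the read. A self-contained sketch of just that pattern:

import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

a = torch.randn(1024, 1024, device="cuda")
start.record()
b = a @ a  # the work being timed
end.record()
torch.cuda.synchronize()  # wait until both events have completed
print("{:.3f} msecs".format(start.elapsed_time(end)))
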
    def test_vnet_optim(self):
        # optimized
        N = 8
        total_iters = 20    # (warmup + benchmark)
        iterations = 1

        # baseline
        # N = 4
        # total_iters = 10    # (warmup + benchmark)
        # iterations = 2

        target = torch.ones(N, 1, 128, 128, 64).type("torch.LongTensor")
        x = torch.ones(N, 1, 128, 128, 64, requires_grad=True)
        model = vnet_optim.VNet(elu=False, nll=True)
        bg_weight = 0.5
        fg_weight = 0.5
        weights = torch.FloatTensor([bg_weight, fg_weight])
        weights = weights.cuda()
        model.train()

        # convert the model and input to cuda
        model = model.cuda()
        input_var = x.cuda()
        target_var = target.cuda()
        target = target_var.view(target_var.numel())

        # declare the optimizer and criterion
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.99, weight_decay=1e-8)
        optimizer.zero_grad()
        model.apply(self.weights_init)
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        with cudnn.flags(enabled=True, benchmark=True):
            for i in range(total_iters):
                start.record()
                start_cpu = time.time()
                for j in range(iterations):
                    output = model(input_var)
                    loss = F.nll_loss(output, target, weight=weights)
                    loss.backward()
                    optimizer.step()

                end_cpu = time.time()
                end.record()
                torch.cuda.synchronize()
                gpu_msec = start.elapsed_time(end)
                print("Optimized vnet ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)".format(
                    i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000,
                    file=sys.stderr))
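
model.apply(self.weights_init) above calls a helper that the listing does not include. A plausible sketch of such an initializer for VNet's 3D convolutions; this is an assumption about the missing code, not the original implementation:

import torch.nn as nn

def weights_init(m):
    # Assumed scheme: Kaiming weights and zero biases for every Conv3d.
    if isinstance(m, nn.Conv3d):
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
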
    def test_densenet_optim(self):
        N = 32
        # N = 72
        chunks = 4
        total_iters = 20    # (warmup + benchmark)
        iterations = 1

        x = torch.ones(N, 3, 224, 224, requires_grad=True)
        target = torch.ones(N).type("torch.LongTensor")

        # model = densenet_optimized.densenet100()
        # model = densenet_optimized.densenet121()
        # model = densenet_optimized.densenet201()
        model = densenet_optim.densenet264()

        # switch the model to train mode
        model.train()

        # convert the model and input to cuda
        model = model.cuda()
        input_var = x.cuda()
        target_var = target.cuda()

        # declare the optimizer and criterion
        criterion = nn.CrossEntropyLoss().cuda()
        optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9, weight_decay=1e-4)

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        with cudnn.flags(enabled=True, benchmark=True):
            for i in range(total_iters):
                start.record()
                start_cpu = time.time()
                for j in range(iterations):
                    output = model(input_var, chunks=chunks)
                    loss = criterion(output, target_var)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                end_cpu = time.time()
                end.record()
                torch.cuda.synchronize()
                gpu_msec = start.elapsed_time(end)
                print("Optimized densenet ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)".format(
                    i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000,
                    file=sys.stderr))
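
The chunks argument passed to the "optimized" models suggests they recompute intermediate activations during the backward pass (gradient checkpointing), trading extra compute for a smaller memory footprint. A minimal sketch of the same idea using the stock torch.utils.checkpoint API with a stand-in model:

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

blocks = [nn.Sequential(nn.Conv2d(16, 16, 3, padding=1), nn.ReLU())
          for _ in range(8)]
model = nn.Sequential(*blocks)
x = torch.randn(4, 16, 64, 64, requires_grad=True)
# Run the model as 4 segments; only segment-boundary activations are kept,
# everything else is recomputed during backward().
out = checkpoint_sequential(model, 4, x)
out.sum().backward()
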
    def test_wlm_optim(self):
        total_iters = 20
        iterations = 1
        chunks = 4

        model_name = 'LSTM'
        ntokens = 33278
        emsize = 200
        nhid = 200
        nlayers = 1
        dropout = 0.2
        tied = False
        batchsize = 20
        bptt = 7000

        data = Variable(torch.LongTensor(bptt, batchsize).fill_(1), volatile=False)
        # data = torch.ones(bptt, batchsize, volatile=False).type("torch.LongTensor")
        target_var = torch.ones(bptt * batchsize).type("torch.LongTensor")
        targets = target_var.cuda()
        input_data = data.cuda()

        model = wlm_optim.RNNModel(model_name, ntokens, emsize, nhid, nlayers, dropout, tied)
        model = model.cuda()
        model.train()
        criterion = nn.CrossEntropyLoss().cuda()
        hidden = model.init_hidden(batchsize)

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        with cudnn.flags(enabled=True, benchmark=True):
            for i in range(total_iters):
                start.record()
                start_cpu = time.time()
                for j in range(iterations):
                    hidden = self.repackage_hidden(hidden)
                    output, hidden = model(input_data, hidden, targets, chunks=chunks)
                    model.backward(output)

                end_cpu = time.time()
                end.record()
                torch.cuda.synchronize()
                gpu_msec = start.elapsed_time(end)
                print("Optimized WLM ({:2d}): ({:8.3f} usecs gpu) ({:8.3f} usecs cpu)".format(
                    i, gpu_msec * 1000, (end_cpu - start_cpu) * 1000000,
                    file=sys.stderr))
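
self.repackage_hidden above detaches the recurrent state from the previous iteration's graph so that backpropagation stops at the iteration boundary; the helper itself is not shown in this listing. The standard version from PyTorch's word_language_model example looks like this:

import torch

def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detaching them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)
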
Example #7
    torch.distributed.barrier()
    logging.info("apex sync")
if use_amp == 0:
    # horovod
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# switch the model to train mode
model.train()
# convert the model and input to cuda
input_var = x.cuda(device)
target_var = target.cuda(device)
# declare the optimizer and criterion
criterion = nn.CrossEntropyLoss().cuda(device)
with cudnn.flags(enabled=True, benchmark=True):
    for i in range(total_iters):
        logging.info(f"local_rank {local_rank}   iteration {i}")
        for j in range(iterations):
            output = model(input_var, chunks=chunks)
            loss = criterion(output, target_var)
            logging.info(f"local_rank {local_rank}   loss   {loss}")
            # logging.info(f"local_rank {local_rank}   loss  requires_grad  {loss.requires_grad}")
            # logging.info(f"local_rank {local_rank}   loss grad_fn  {loss.grad_fn}")
            optimizer.zero_grad()
            if use_amp == 1:
                with amp.scale_loss(
                        loss, optimizer,
                        delay_unscale=False) as scaled_loss:
                    scaled_loss.backward()
            if use_amp == 0:
                loss.backward()  # assumed: plain backward for the horovod path (the source snippet is truncated here)
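
The amp.scale_loss context above only works after the model and optimizer have been registered with apex's amp.initialize, which this fragment does not show. A minimal sketch of that setup; the opt_level is an assumption:

from apex import amp

# Must run once before the training loop when use_amp == 1.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
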
Example #8
File: patch.py  Project: alex-epp/ocr
def forward(self, log_probs, targets, input_lengths, target_lengths):
    with cudnn.flags(enabled=False):
        return super().forward(log_probs=log_probs,
                               targets=targets,
                               input_lengths=input_lengths,
                               target_lengths=target_lengths)
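
This override forces nn.CTCLoss onto the native implementation instead of cudnn's, presumably to avoid cudnn's restrictions on CTC inputs (or its nondeterminism) at some speed cost. A sketch of how the patched class might be declared and exercised; the class name, shapes and constants are assumptions:

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

class PatchedCTCLoss(nn.CTCLoss):
    def forward(self, log_probs, targets, input_lengths, target_lengths):
        with cudnn.flags(enabled=False):
            return super().forward(log_probs=log_probs, targets=targets,
                                   input_lengths=input_lengths,
                                   target_lengths=target_lengths)

T, N, C, S = 50, 4, 20, 10  # time steps, batch, classes, max target length
ctc = PatchedCTCLoss(blank=0)
log_probs = torch.randn(T, N, C).log_softmax(2)
targets = torch.randint(1, C, (N, S), dtype=torch.long)
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), S, dtype=torch.long)
loss = ctc(log_probs, targets, input_lengths, target_lengths)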