# Requires at module level:
#   import torch
#   import torch.cuda.profiler as profiler
#   import pyprof2
def iterate(self, src, tgt, update=True, training=True):
    """
    Performs one iteration of training/validation.

    :param src: batch of examples from the source language
    :param tgt: batch of examples from the target language
    :param update: if True, the optimizer updates the weights
    :param training: if True, the optimizer step is executed
    """
    # Register pyprof2's NVTX hooks (idempotent; typically called once at startup).
    pyprof2.init()

    src, src_length = src
    tgt, tgt_length = tgt
    src = src.to(self.device)
    tgt = tgt.to(self.device)
    src_length = src_length.to(self.device)

    num_toks = {}
    num_toks['tgt'] = int(sum(tgt_length - 1))
    num_toks['src'] = int(sum(src_length))

    with torch.autograd.profiler.emit_nvtx():
        profiler.start()

        if self.batch_first:
            output = self.model(src, src_length, tgt[:, :-1])
            tgt_labels = tgt[:, 1:]
            T, B = output.size(1), output.size(0)
        else:
            output = self.model(src, src_length, tgt[:-1])
            tgt_labels = tgt[1:]
            T, B = output.size(0), output.size(1)

        loss = self.criterion(output.view(T * B, -1),
                              tgt_labels.contiguous().view(-1))

        loss_per_batch = loss.item()
        loss /= (B * self.iter_size)

        if training:
            self.fp_optimizer.step(loss, self.optimizer, self.scheduler,
                                   update)

        loss_per_token = loss_per_batch / num_toks['tgt']
        loss_per_sentence = loss_per_batch / B

        profiler.stop()

    # Profile exactly one iteration, then bail out.
    print('You can stop now')
    exit()

    return loss_per_token, loss_per_sentence, num_toks
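
# The pattern above (emit_nvtx() wrapping profiler.start()/stop(), one
# profiled iteration, then exit) is the core pyprof2 recipe. Below is a
# minimal, self-contained sketch of the same idiom on a toy model -- an
# illustration, not part of the GNMT example; it assumes pyprof2 and a
# CUDA device are available.
import torch
import torch.cuda.profiler as profiler
import pyprof2

pyprof2.init()  # register NVTX hooks before building the model

model = torch.nn.Linear(64, 64).cuda()
x = torch.rand(32, 64).cuda()

with torch.autograd.profiler.emit_nvtx():  # annotate ops with NVTX ranges
    profiler.start()                       # begin CUDA profiling
    y = model(x)
    y.sum().backward()
    profiler.stop()                        # end profiling after one step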
# Imports assumed by this example:
import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torchvision.models as models
import fused_adam_cuda
from apex.optimizers import FusedAdam

import pyprof2


def main():
    # parseArgs() and the model table d (name -> input size and constructor
    # options) are defined elsewhere in this script.
    args = parseArgs()

    pyprof2.init()
    pyprof2.wrap(fused_adam_cuda, 'adam')

    N = args.b
    C = 3
    H = d[args.m]['H']
    W = d[args.m]['W']
    opts = d[args.m]['opts']
    classes = 1000

    net = getattr(models, args.m)
    net = net(**opts).cuda().half()
    net.train()

    x = torch.rand(N, C, H, W).cuda().half()
    target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    if args.o == "sgd":
        optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    elif args.o == "adam":
        optimizer = FusedAdam(net.parameters())
        #optimizer = FP16_Optimizer(optimizer)
    else:
        assert False

    # Warm up without the profiler
    for i in range(2):
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Profile exactly one training step
    with torch.autograd.profiler.emit_nvtx():
        profiler.start()
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        profiler.stop()
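
# This example leans on a parseArgs() helper and a model table d that live
# elsewhere in the script. A hypothetical minimal version, only to make
# their expected shape clear (the real definitions may differ):
import argparse

d = {
    'resnet50': {'H': 224, 'W': 224, 'opts': {}},
    'inception_v3': {'H': 299, 'W': 299, 'opts': {'aux_logits': False}},
}

def parseArgs():
    parser = argparse.ArgumentParser(
        description='Profile one training step of a torchvision model.')
    parser.add_argument('-m', default='resnet50',
                        help='torchvision model name (a key of d)')
    parser.add_argument('-b', type=int, default=32, help='batch size')
    parser.add_argument('-o', default='adam', choices=['sgd', 'adam'],
                        help='optimizer')
    return parser.parse_args()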
#!/usr/bin/env python3
"""
This file checks all Python operators.
"""

import sys
import torch
import torch.cuda.profiler as profiler
import operator
import inspect

# Import and initialize pyprof2
import pyprof2
pyprof2.init()

X = 1024
Y = 1024

fa = torch.rand(X, Y).cuda()
fb = torch.rand(X, Y).cuda()
fc = torch.rand(X, Y).cuda()
ia = torch.randint(0, 100, (X, Y)).cuda()
ib = torch.randint(0, 100, (X, Y)).cuda()
sa = torch.ones(1, 1).cuda()
sb = torch.ones(1, 1).cuda()
ba = fa.byte()
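
# The operator checks themselves are not shown above. Under the same
# emit_nvtx()/profiler.start()/stop() pattern as the other examples, they
# would exercise each operator on the prepared tensors -- a sketch of the
# general shape, not the file's actual body:
with torch.autograd.profiler.emit_nvtx():
    profiler.start()

    # arithmetic operators on float tensors
    fc = fa + fb
    fc = fa - fb
    fc = fa * fb
    fc = fa / fb
    fc = fa ** 2

    # comparison operators produce boolean masks
    mask = fa < fb
    mask = fa == fb

    # bitwise operators on integer tensors
    ic = ia & ib
    ic = ia | ib

    # the operator module dispatches to the same tensor methods
    fc = operator.add(fa, fb)
    fc = operator.mul(fa, sb)  # broadcasts the (1, 1) scalar tensor

    profiler.stop()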