import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torchvision.models as models
import fused_adam_cuda
from apex.optimizers import FusedAdam
import pyprof2

# Note: parseArgs() and the per-model configuration table d (input height,
# width and constructor options) are defined elsewhere in the full script;
# the imports above are inferred from the code below.

def main():
    args = parseArgs()

    pyprof2.init()
    pyprof2.wrap(fused_adam_cuda, 'adam')

    N = args.b
    C = 3
    H = d[args.m]['H']
    W = d[args.m]['W']
    opts = d[args.m]['opts']
    classes = 1000

    net = getattr(models, args.m)
    net = net(**opts).cuda().half()
    net.train()

    x = torch.rand(N, C, H, W).cuda().half()
    target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    if args.o == "sgd":
        optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    elif args.o == "adam":
        optimizer = FusedAdam(net.parameters())
        #optimizer = FP16_Optimizer(optimizer)
    else:
        assert False

    # Warm up without the profiler
    for i in range(2):
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Profile a single training iteration
    with torch.autograd.profiler.emit_nvtx():
        profiler.start()
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        profiler.stop()
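# --- Added note (workflow assumption, not part of the original example) ---
# Scripts instrumented this way are typically launched under nvprof with
# profiling disabled at startup, so that only the region between
# profiler.start() and profiler.stop() is captured, for example:
#
#   nvprof -f -o net.sql --profile-from-start off python net.py
#
# The resulting .sql file can then be post-processed with the pyprof2
# parse/prof tools to extract per-kernel information.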
import torch
import torch.cuda.profiler as profiler
import pyprof2

def foo(x, y):
    return torch.sigmoid(x) + y

x = torch.zeros(4, 4).cuda()
y = torch.ones(4, 4).cuda()

# JIT the function using tracing.
# This returns an object of type ScriptModule with a forward method.
traced_foo = torch.jit.trace(foo, (x, y))

# Initialize pyprof2 after the JIT step
pyprof2.init()

# Assign a name to the object "traced_foo"
traced_foo.__dict__['__name__'] = "foo"

# Hook up the forward function to pyprof2
pyprof2.wrap(traced_foo, 'forward')

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    z = traced_foo(x, y)
    profiler.stop()

print(z)
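# A similar pattern should apply to functions compiled with @torch.jit.script
# rather than traced. The sketch below is an assumption (untested here) that
# pyprof2.wrap handles a scripted function the same way as a traced one.
import torch
import torch.cuda.profiler as profiler
import pyprof2

@torch.jit.script
def scripted_foo(x, y):
    return torch.sigmoid(x) + y

# Initialize pyprof2 after the JIT step
pyprof2.init()

# Assign a name and hook up the forward function, as with the traced version
scripted_foo.__name__ = "scripted_foo"
pyprof2.wrap(scripted_foo, 'forward')

x = torch.zeros(4, 4).cuda()
y = torch.ones(4, 4).cuda()

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    z = scripted_foo(x, y)
    profiler.stop()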
import torch
import fused_adam_cuda
from apex.optimizers import FusedAdam, FP16_Optimizer
import pyprof2

pyprof2.init()
pyprof2.wrap(fused_adam_cuda, 'adam')

model = torch.nn.Linear(10, 20).cuda().half()
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = FusedAdam(model.parameters())
optimizer = FP16_Optimizer(optimizer)

x = torch.ones(32, 10).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(20).cuda()

y = model(x)
loss = criterion(y, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
import torch
import fused_layer_norm_cuda
from apex.normalization import FusedLayerNorm
import pyprof2

pyprof2.init()
pyprof2.wrap(fused_layer_norm_cuda, 'forward')
pyprof2.wrap(fused_layer_norm_cuda, 'backward')
pyprof2.wrap(fused_layer_norm_cuda, 'forward_affine')
pyprof2.wrap(fused_layer_norm_cuda, 'backward_affine')

input = torch.randn(20, 5, 10, 10).cuda()

# With learnable parameters
m = FusedLayerNorm(input.size()[1:]).cuda()
output = m(input)

# Without learnable parameters
m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
output = m(input)

# Normalize over the last two dimensions
m = FusedLayerNorm([10, 10]).cuda()
output = m(input)

# Normalize over the last dimension of size 10
m = FusedLayerNorm(10).cuda()
output = m(input)
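# Optional continuation (assumption, mirroring the profiling pattern used in
# the other examples): bracket one of the layer-norm calls with
# profiler.start()/profiler.stop() inside an emit_nvtx() region to capture
# only that call. Uses m and input defined in the example above.
import torch.cuda.profiler as profiler

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    output = m(input)
    profiler.stop()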
#!/usr/bin/env python3

import torch
import torch.cuda.profiler as profiler
import pyprof2

pyprof2.init()

class Foo(torch.nn.Module):
    def __init__(self, size):
        super(Foo, self).__init__()
        self.n = torch.nn.Parameter(torch.ones(size))
        self.m = torch.nn.Parameter(torch.ones(size))

    def forward(self, input):
        return self.n * input + self.m

# Hook the forward function to pyprof2
pyprof2.wrap(Foo, 'forward')

foo = Foo(4)
foo.cuda()
x = torch.ones(4).cuda()

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    z = foo(x)
    profiler.stop()
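# Built-in PyTorch operations are picked up by pyprof2.init() alone; the
# explicit pyprof2.wrap calls in these examples are only needed for custom
# modules and fused CUDA extensions. A minimal sketch (assumption: no wrap
# call is required for standard ops such as torch.mm and torch.relu):
import torch
import torch.cuda.profiler as profiler
import pyprof2

pyprof2.init()

a = torch.randn(4, 4).cuda()
b = torch.randn(4, 4).cuda()

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    c = torch.relu(torch.mm(a, b))
    profiler.stop()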
import torch
from apex.optimizers import FusedAdam
import amp_C
import pyprof2

pyprof2.init()

# Wrap the custom fused multi-tensor Adam implementation
pyprof2.wrap(amp_C, 'multi_tensor_adam')

inp = 1024
hid = 2048
out = 4096
batch = 128

# Model
model = torch.nn.Sequential(
    torch.nn.Linear(inp, hid).cuda().half(),
    torch.nn.ReLU(),
    torch.nn.Linear(hid, out).cuda().half())

# Loss
criterion = torch.nn.CrossEntropyLoss().cuda()

# Adam optimizer
optimizer = FusedAdam(model.parameters())

# Input
x = torch.ones(batch, inp).cuda().half()

# Target
target = torch.empty(batch, dtype=torch.long).random_(out).cuda()

with torch.autograd.profiler.emit_nvtx():
    y = model(x)
    loss = criterion(y, target)
    optimizer.zero_grad()
    loss.backward()
    # optimizer.step() is what invokes the wrapped multi_tensor_adam kernel
    optimizer.step()