def main(args):
    """Benchmark one ``nn.Conv2d`` layer and print its measured throughput.

    Fields read from ``args``:
        platform: ``"gpu"`` or ``"npu"``; any other value selects the CPU.
        compute_type: ``"forward"``, ``"backward"`` or ``"calibrate"``.
        kernel_shape: indexed as ``[kernel_h, kernel_w, in_channels,
            out_channels]``.
        input_tensor_shape: four sizes handed unchanged to ``torch.randn``.
        dtype: ``"float16"`` casts input and layer to half precision.
        stride, num_warmups, num_iterations: passed to ``nn.Conv2d`` and the
            warm-up / timed loops respectively.

    Raises:
        ValueError: if ``args.compute_type`` is not a supported value.
    """
    import time  # function-scope import: only the CPU timing fallback needs it

    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
        # Previously left unbound, which crashed at the first synchronize().
        # None selects the wall-clock timing path below.
        device_func = None
    print("Running on device {}".format(device))

    # select compute type
    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError(
            "Error, compute_type should be either forward or backward or calibrate")

    kernel_shape = args.kernel_shape
    kernel_h, kernel_w = kernel_shape[0], kernel_shape[1]
    in_channels, out_channels = kernel_shape[2], kernel_shape[3]

    # requires_grad=True indicates that we want to compute gradients during
    # the backward pass
    needs_grad = args.compute_type == "backward"
    weights = torch.randn(out_channels, in_channels, kernel_h, kernel_w,
                          device=device, requires_grad=needs_grad)
    biases = torch.randn(out_channels, device=device, requires_grad=needs_grad)

    input_tensor_shape = tuple(args.input_tensor_shape)
    input_tensor = torch.randn(input_tensor_shape[0], input_tensor_shape[1],
                               input_tensor_shape[2], input_tensor_shape[3],
                               device=device)
    if args.dtype == "float16":
        input_tensor = input_tensor.half()

    # init the conv2d kernel; pass both kernel dimensions so the module's
    # kernel_size matches the weight tensor for non-square kernels too
    conv2d = nn.Conv2d(in_channels=in_channels,
                       out_channels=out_channels,
                       kernel_size=(kernel_h, kernel_w),
                       stride=args.stride)
    conv2d.weight = torch.nn.Parameter(weights)
    conv2d.bias = torch.nn.Parameter(biases)
    # move the kernel to device
    conv2d.to(device)
    if args.dtype == "float16":
        conv2d = conv2d.half()

    conv2d.eval()
    flops, mem, params = nnstats.get_flops_mem(conv2d, input_tensor_shape)
    # presumably converts an element count into bytes -- confirm against nnstats
    if args.dtype == 'float16':
        mem = mem * 2
    elif args.dtype == 'float32':
        mem = mem * 4

    # Backward is counted as 3x the forward FLOPs. Forward and calibrate keep
    # the forward count (the original stored an unused flop_sec = 0.0 for
    # calibrate, which was always overwritten below).
    if args.compute_type == "backward":
        flops = flops * 3

    # warm up
    for _ in range(args.num_warmups):
        compfunc(input_tensor, conv2d)
    if device_func is not None:
        device_func.synchronize()

    # bench
    if device_func is not None:
        start_event = device_func.Event(enable_timing=True)
        end_event = device_func.Event(enable_timing=True)
        start_event.record()
        for _ in range(args.num_iterations):
            compfunc(input_tensor, conv2d)
        end_event.record()
        device_func.synchronize()
        # elapsed_time() is in milliseconds; convert to seconds
        elapsed_time = start_event.elapsed_time(end_event) / 1000
    else:
        wall_start = time.perf_counter()
        for _ in range(args.num_iterations):
            compfunc(input_tensor, conv2d)
        elapsed_time = time.perf_counter() - wall_start

    # Guard against a zero interval (e.g. num_iterations == 0).
    if elapsed_time > 0:
        flop_sec = flops * args.num_iterations / elapsed_time
        example_per_sec = input_tensor_shape[0] * args.num_iterations / elapsed_time
    else:
        flop_sec = 0.0
        example_per_sec = 0.0

    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)
    # NOTE(review): this divides FLOP/s (not FLOPs) by mem; kept as-is so the
    # reported number does not change -- confirm the intended definition.
    if mem > 0:
        arithemetic_intensity = flop_sec / mem
    else:
        arithemetic_intensity = 0

    print("-----performance----")
    print("\n")
    print(f"device time: {elapsed_time:.6f}")
    print(f"flops: {flop_sec}")
    print(f"memory: {mem}")
    print(f"example_per_sec: {example_per_sec:.3f}")
    print(f"arithemetic intensity: {arithemetic_intensity}")
    print(f"parameter size: {params}")
    print(f"flops_scaled: {flop_sec_scaled} {flop_sec_unit}")
    print(f"memory_scaled: {mem_scaled} {mem_unit}")
import logging

from torch import nn  # `nn` is used below but no visible import provided it
from torchvision.models import resnet18, densenet121

import nnstats
import nnutils

# Ad-hoc check: estimate FLOPs and memory traffic for a single Conv2d layer
# and log the human-readable (unit-scaled) values at DEBUG level.

# linear = nn.Linear(in_features=64, out_features=128).eval()
# module_info = nns.crawl_module(linear, (1, 64))
# print(module_info)

conv2d = nn.Conv2d(in_channels=3,
                   out_channels=64,
                   kernel_size=7,
                   stride=2,
                   padding=3,
                   bias=False).eval()

# NOTE(review): this passes a third positional argument and unpacks two
# values, while the benchmark entry points call get_flops_mem(model, shape)
# and some unpack three -- confirm against the current nnstats API.
flops, memory = nnstats.get_flops_mem(conv2d, (2, 3, 224, 224), 0)
logging.debug(nnutils.unit_scale(flops))
logging.debug(nnutils.unit_scale(memory))

# rnn = nn.RNN(input_size=100, hidden_size=256).eval()
# flops, memory = nnstats.get_flops_mem(rnn, (4, 1, 100), 0)
# print(flops)
# print(nnutils.unit_scale(flops))

# stats = crawler.crawl_module(conv2d, (128, 3, 224, 224))
# module_info = nnutils.aggregate_info(stats, 0)
# print(nnutils.flops_dmas(module_info))

# resnet18 = resnet18().eval()
# print(nnstats.get_flops_dmas(resnet18, (1, 3, 224, 224)))
# densenet121 = densenet121().eval()
def main(args):
    """Benchmark one ``nn.Conv2d`` layer, reporting warm-up and timed-run stats.

    Fields read from ``args``:
        dtype: ``"float16"`` or ``"float32"``; used for every tensor created.
        platform: ``"gpu"`` or ``"npu"``; any other value selects the CPU.
        compute_type: ``"forward"``, ``"backward"`` or ``"calibrate"``.
        kernel_shape: indexed as ``[kernel_h, kernel_w, in_channels,
            out_channels]``.
        input_tensor_shape: four sizes handed unchanged to ``torch.randn``.
        stride, num_warmups, num_iterations: passed to ``nn.Conv2d`` and the
            warm-up / timed loops respectively.

    Raises:
        ValueError: for an unsupported ``dtype`` or ``compute_type``.
    """
    # datatype selection
    if args.dtype == 'float16':
        tensor_type = torch.float16
    elif args.dtype == 'float32':
        tensor_type = torch.float32
    else:
        # ValueError (a subclass of Exception) matches the compute_type check
        raise ValueError('data type can only be float16 or float32')

    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
        # Previously left unbound, which crashed when creating the timing
        # events. None selects the wall-clock timing path below.
        device_func = None
    print("Running on device {}".format(device))

    # select compute type
    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError(
            "Error, compute_type should be either forward or backward or calibrate"
        )

    kernel_shape = args.kernel_shape
    kernel_h, kernel_w = kernel_shape[0], kernel_shape[1]
    in_channels, out_channels = kernel_shape[2], kernel_shape[3]

    # requires_grad=True indicates that we want to compute gradients during
    # the backward pass
    needs_grad = args.compute_type == "backward"
    weights = torch.randn(out_channels, in_channels, kernel_h, kernel_w,
                          device=device, dtype=tensor_type,
                          requires_grad=needs_grad)
    biases = torch.randn(out_channels, device=device, dtype=tensor_type,
                         requires_grad=needs_grad)

    input_tensor_shape = args.input_tensor_shape
    # The shape is used exactly as given (no transpose is performed), so
    # dimension 1 must hold the channels that nn.Conv2d consumes.
    input_image = torch.randn(input_tensor_shape[0], input_tensor_shape[1],
                              input_tensor_shape[2], input_tensor_shape[3],
                              device=device, dtype=tensor_type)

    # init the conv2d kernel; pass both kernel dimensions so the module's
    # kernel_size matches the weight tensor for non-square kernels too
    conv2d = nn.Conv2d(in_channels=in_channels,
                       out_channels=out_channels,
                       kernel_size=(kernel_h, kernel_w),
                       stride=args.stride)
    conv2d.weight = torch.nn.Parameter(weights)
    conv2d.bias = torch.nn.Parameter(biases)
    # move the kernel to device
    conv2d.to(device)

    # start session
    print("warming up for {} steps".format(args.num_warmups))
    conv2d.eval()
    # Use the real input shape; the original hard-coded (64, 3, 224, 224),
    # which mis-reported FLOPs for every other configuration.
    flops, mem = nnstats.get_flops_mem(conv2d, tuple(input_tensor_shape))
    # Backward is counted as 3x the forward FLOPs; forward and calibrate keep
    # the forward count.
    if args.compute_type == "backward":
        flops = flops * 3
    print(f"{flops}, {mem}")

    # Time only the warm-up iterations (not the stats crawl above) and wait
    # for the device so asynchronous work is included.
    start = time.time()
    for _ in range(args.num_warmups):
        compfunc(input_image, conv2d)
    if device_func is not None:
        device_func.synchronize()
    end = time.time()
    print("done")
    duration = end - start
    # num_warmups may be 0; avoid dividing by it
    per_iter = duration / float(args.num_warmups) if args.num_warmups else 0.0
    print('Warmup {:.2f} seconds, {:.2f} seconds/iter'.format(
        duration, per_iter))

    print("running for {} steps".format(args.num_iterations))
    start = time.time()
    if device_func is not None:
        start_event = device_func.Event(enable_timing=True)
        end_event = device_func.Event(enable_timing=True)
        start_event.record()
        for _ in range(args.num_iterations):
            compfunc(input_image, conv2d)
        end_event.record()
        device_func.synchronize()
        # elapsed_time() is in milliseconds; convert to seconds
        elapsed_time = start_event.elapsed_time(end_event) / 1000
    else:
        wall_start = time.perf_counter()
        for _ in range(args.num_iterations):
            compfunc(input_image, conv2d)
        elapsed_time = time.perf_counter() - wall_start
    end = time.time()
    print("done")
    duration = end - start

    # Guard against a zero interval (e.g. num_iterations == 0).
    if elapsed_time > 0:
        flop_sec = flops * args.num_iterations / elapsed_time
    else:
        flop_sec = 0.0
    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)

    print(f"time.time {duration:.6f} seconds cuda.time {elapsed_time:.6f}")
    print(
        f"FLOPS: {flop_sec_scaled:.6f} {flop_sec_unit}, memory access: {mem_scaled:.6f} {mem_unit}"
    )
def main(args):
    """Benchmark a single-layer RNN / LSTM / GRU and print its throughput.

    Fields read from ``args``:
        platform: ``"gpu"`` or ``"npu"``; any other value selects the CPU.
        input_tensor_shape: three sizes, ``(seq_len, batch, input_size)``.
        dtype: ``"float16"`` casts input and module to half precision.
        hidden_size: hidden size of the recurrent module.
        rnn_type: ``"lstm"``, ``"rnn"`` or ``"gru"``.
        compute_type: ``"forward"``, ``"backward"`` or ``"calibrate"``.
        num_warmups, num_iterations: warm-up and timed loop counts.

    Raises:
        ValueError: for an unsupported ``rnn_type`` or ``compute_type``.
    """
    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
        # Previously left unbound, which crashed at the first synchronize().
        # None selects the wall-clock timing path below.
        device_func = None
    print("Running on device {}".format(device))

    # the input format is (seq_len, batch, input_size)
    input_tensor_shape = tuple(args.input_tensor_shape)
    input_tensor = torch.randn(input_tensor_shape[0], input_tensor_shape[1],
                               input_tensor_shape[2], device=device)
    if args.dtype == 'float16':
        input_tensor = input_tensor.half()

    input_size = input_tensor_shape[2]
    hidden_size = args.hidden_size

    # init rnn kernel
    if args.rnn_type == 'lstm':
        myRNN = nn.LSTM(input_size, hidden_size)
    elif args.rnn_type == 'rnn':
        myRNN = nn.RNN(input_size, hidden_size)
    elif args.rnn_type == 'gru':
        myRNN = nn.GRU(input_size, hidden_size)
    else:
        raise ValueError("Error of input cell_type, please choose one from [rnn, lstm, gru]")
    myRNN.to(device)
    if args.dtype == 'float16':
        myRNN = myRNN.half()

    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError("Error, compute_type should be either forward or backward or calibrate")

    flops, mem, params = nnstats.get_flops_mem(myRNN, input_tensor_shape)
    # presumably converts an element count into bytes -- confirm against nnstats
    if args.dtype == 'float16':
        mem = mem * 2
    elif args.dtype == 'float32':
        mem = mem * 4
    # NOTE(review): unlike the conv2d/model benchmarks, FLOPs are not scaled
    # by 3 for compute_type == "backward" here -- confirm this is intended.
    print(f"float point operations: {flops}")

    # warm up
    for _ in range(args.num_warmups):
        compfunc(input_tensor, myRNN)
    if device_func is not None:
        device_func.synchronize()

    # bench
    if device_func is not None:
        start_event = device_func.Event(enable_timing=True)
        end_event = device_func.Event(enable_timing=True)
        start_event.record()
        for _ in range(args.num_iterations):
            compfunc(input_tensor, myRNN)
        end_event.record()
        device_func.synchronize()
        # elapsed_time() is in milliseconds; convert to seconds
        elapsed_time = start_event.elapsed_time(end_event) / 1000
    else:
        wall_start = time.perf_counter()
        for _ in range(args.num_iterations):
            compfunc(input_tensor, myRNN)
        elapsed_time = time.perf_counter() - wall_start

    # Guard against a zero interval (e.g. num_iterations == 0).
    if elapsed_time > 0:
        flop_sec = flops * args.num_iterations / elapsed_time
        # dimension 1 is the batch size for (seq_len, batch, input_size) input
        example_per_sec = input_tensor_shape[1] * args.num_iterations / elapsed_time
    else:
        flop_sec = 0.0
        example_per_sec = 0.0

    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)
    # NOTE(review): this divides FLOP/s (not FLOPs) by mem; kept as-is so the
    # reported number does not change -- confirm the intended definition.
    if mem > 0:
        arithemetic_intensity = flop_sec / mem
    else:
        arithemetic_intensity = 0

    print("-----performance----")
    print(" ")
    print(f"device time: {elapsed_time:.6f}")
    print(f"flops: {flop_sec}")
    print(f"memory: {mem}")
    print(f"example_per_sec: {example_per_sec:.3f}")
    print(f"arithemetic intensity: {arithemetic_intensity}")
    print(f"parameter size: {params}")
    print(f"flops_scaled: {flop_sec_scaled} {flop_sec_unit}")
    print(f"memory_scaled: {mem_scaled} {mem_unit}")
def main(args):
    """Benchmark a torchvision model (inference or training step) and print stats.

    Fields read from ``args``:
        platform: ``"gpu"`` or ``"npu"``; any other value selects the CPU.
        device_id: device index appended to ``cuda:`` / ``npu:``.
        input_tensor_shape: shape handed to ``get_synthetic_data``; index 0 is
            used as the batch size for the examples/sec figure.
        arch, pretrained: select and construct the ``torchvision.models`` entry.
        dtype: ``"float16"`` casts model and input to half precision.
        compute_type: ``"forward"`` runs ``forward``; ``"backward"`` runs
            ``train``; any other value leaves both loops empty.
        amp, opt_level, loss_scale: optional ``amp.initialize`` settings.
        num_warmups, num_iterations: warm-up and timed loop counts.
    """
    import time  # function-scope import: only the CPU timing fallback needs it

    # Format the id instead of concatenating so an int device_id works too.
    if args.platform == "gpu":
        device = torch.device(f'cuda:{args.device_id}')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device(f'npu:{args.device_id}')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
        # Previously left unbound, which crashed at the first synchronize().
        # None selects the wall-clock timing path below.
        device_func = None
    print("Running on device {}".format(device))

    input_tensor_shape = tuple(args.input_tensor_shape)
    (input_tensor, label) = get_synthetic_data(input_tensor_shape)
    input_tensor = input_tensor.to(device)
    label = label.to(device)

    model = torchvision.models.__dict__[args.arch](args.pretrained)
    flops, mem, params = nnstats.get_flops_mem(model, input_tensor_shape)

    # presumably mem is an element count converted to bytes here -- confirm
    # against nnstats
    if args.dtype == 'float16':
        model = model.half()
        input_tensor = input_tensor.half()
        mem = mem * 2
    elif args.dtype == 'float32':
        mem = mem * 4

    # Backward is counted as 3x the forward FLOPs; other modes keep the
    # forward count.
    if args.compute_type == "backward":
        flops = flops * 3

    # Move the model first, then build the optimizer, so the optimizer is
    # guaranteed to reference the on-device parameters.
    model = model.to(device)
    optimizer = optim.Adam(model.parameters())
    if args.amp:
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.opt_level,
            verbosity=0,
            loss_scale=None if args.loss_scale == -1 else args.loss_scale)
    criterion = nn.CrossEntropyLoss().to(device)

    def _run_step():
        # One benchmark iteration; a no-op for any other compute_type, which
        # matches the original loops.
        if args.compute_type == "forward":
            forward(input_tensor, model, args)
        elif args.compute_type == "backward":
            train(input_tensor, label, model, criterion, optimizer, args)

    # warm up
    for _ in range(args.num_warmups):
        _run_step()
    if device_func is not None:
        device_func.synchronize()

    # bench
    if device_func is not None:
        start_event = device_func.Event(enable_timing=True)
        end_event = device_func.Event(enable_timing=True)
        start_event.record()
        for _ in range(args.num_iterations):
            _run_step()
        end_event.record()
        device_func.synchronize()
        # elapsed_time() is in milliseconds; convert to seconds
        elapsed_time = start_event.elapsed_time(end_event) / 1000
    else:
        wall_start = time.perf_counter()
        for _ in range(args.num_iterations):
            _run_step()
        elapsed_time = time.perf_counter() - wall_start

    # Guard against a zero interval (empty loop or num_iterations == 0).
    if elapsed_time > 0:
        flop_sec = flops * args.num_iterations / elapsed_time
        example_per_sec = input_tensor_shape[0] * args.num_iterations / elapsed_time
    else:
        flop_sec = 0.0
        example_per_sec = 0.0

    # Same mem > 0 guard as the kernel-level benchmarks; unguarded before.
    # NOTE(review): this divides FLOP/s (not FLOPs) by mem -- confirm the
    # intended definition.
    if mem > 0:
        arithemetic_intensity = flop_sec / mem
    else:
        arithemetic_intensity = 0

    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)

    print(f"flops: {flop_sec}")
    print(f"time: {elapsed_time:.3f}")
    print(f"flops_scaled: {flop_sec_scaled} {flop_sec_unit}")
    print(f"arithemetic intensity: {arithemetic_intensity}")
    print(f"example_per_sec: {example_per_sec:.3f}")
    print(f"params: {params}")
def main(args):
    """Benchmark one ``nn.Linear`` layer and print timing, FLOP/s and memory.

    Fields read from ``args``:
        dtype: ``"float16"`` or ``"float32"``; used for every tensor created.
        platform: ``"gpu"`` or ``"npu"``; any other value selects the CPU.
        compute_type: ``"forward"``, ``"backward"`` or ``"calibrate"``.
        kernel_shape: indexed as ``[in_features, out_features]``.
        input_tensor_shape: two sizes handed unchanged to ``torch.randn``.
        num_warmups, num_iterations: warm-up and timed loop counts.

    Raises:
        ValueError: for an unsupported ``dtype`` or ``compute_type``.
    """
    # datatype selection
    if args.dtype == 'float16':
        tensor_type = torch.float16
    elif args.dtype == 'float32':
        tensor_type = torch.float32
    else:
        # ValueError (a subclass of Exception) matches the compute_type check
        raise ValueError('data type can only be float16 or float32')

    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
        # Previously left unbound, which crashed when creating the timing
        # events. None selects the wall-clock timing path below.
        device_func = None
    print("Running on device {}".format(device))

    # select compute type
    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError(
            "Error, compute_type should be either forward or backward or calibrate"
        )

    kernel_shape = args.kernel_shape
    in_features, out_features = kernel_shape[0], kernel_shape[1]

    # requires_grad=True indicates that we want to compute gradients during
    # the backward pass
    needs_grad = args.compute_type == "backward"
    weights = torch.randn(out_features, in_features, device=device,
                          dtype=tensor_type, requires_grad=needs_grad)
    biases = torch.randn(out_features, device=device, dtype=tensor_type,
                         requires_grad=needs_grad)

    input_tensor_shape = tuple(args.input_tensor_shape)
    input_tensor = torch.randn(input_tensor_shape[0], input_tensor_shape[1],
                               device=device, dtype=tensor_type)

    # init the Linear kernel (eval mode set once here)
    linear = nn.Linear(in_features=in_features,
                       out_features=out_features).eval()
    linear.weight = torch.nn.Parameter(weights)
    linear.bias = torch.nn.Parameter(biases)
    # move the kernel to device
    linear.to(device)

    flops, mem = nnstats.get_flops_mem(linear, input_tensor_shape)
    # Backward is counted as 3x the forward FLOPs; forward and calibrate keep
    # the forward count.
    if args.compute_type == "backward":
        flops = flops * 3

    # warm up, then drain the device queue so leftover warm-up work does not
    # leak into the timed region
    for _ in range(args.num_warmups):
        compfunc(input_tensor, linear)
    if device_func is not None:
        device_func.synchronize()

    start = time.time()
    if device_func is not None:
        start_event = device_func.Event(enable_timing=True)
        end_event = device_func.Event(enable_timing=True)
        start_event.record()
        for _ in range(args.num_iterations):
            compfunc(input_tensor, linear)
        end_event.record()
        device_func.synchronize()  # Wait for the events to be recorded!
        # elapsed_time() is in milliseconds; convert to seconds
        elapsed_time = start_event.elapsed_time(end_event) / 1000
    else:
        wall_start = time.perf_counter()
        for _ in range(args.num_iterations):
            compfunc(input_tensor, linear)
        elapsed_time = time.perf_counter() - wall_start
    end = time.time()
    duration = end - start

    # Guard against a zero interval (e.g. num_iterations == 0).
    if elapsed_time > 0:
        flop_sec = flops * args.num_iterations / elapsed_time
    else:
        flop_sec = 0.0

    print(f"time.time {duration:.6f} seconds device.time {elapsed_time:.6f}")
    print(f"FLOPS: {flop_sec}")
    print(f"memory: {mem}")