Example #1
def main(args):

    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
    print("Running on device {}".format(device))

    # select compute type
    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError(
            "Error, compute_type should be either forward or backward or calibrate")

    kernel_shape = args.kernel_shape
    # requires_grad=True indicates that we want to compute gradients during the backward pass
    if args.compute_type == "backward":
        weights = torch.randn(kernel_shape[3], kernel_shape[2], kernel_shape[0],
                              kernel_shape[1], device=device, requires_grad=True)
        biases = torch.randn(
            kernel_shape[3], device=device, requires_grad=True)
    else:
        weights = torch.randn(kernel_shape[3], kernel_shape[2], kernel_shape[0],
                              kernel_shape[1], device=device)
        biases = torch.randn(kernel_shape[3], device=device)

    input_tensor_shape = tuple(args.input_tensor_shape)
    input_tensor = torch.randn(input_tensor_shape[0], input_tensor_shape[1],
                               input_tensor_shape[2], input_tensor_shape[3], device=device)
    
    if args.dtype == "float16":
        input_tensor = input_tensor.half()

    # init the conv2d kernel
    conv2d = nn.Conv2d(in_channels=kernel_shape[2], out_channels=kernel_shape[3], kernel_size=kernel_shape[0], stride=args.stride)
    conv2d.weight = torch.nn.Parameter(weights)
    conv2d.bias = torch.nn.Parameter(biases)
    # move the kernel to device
    conv2d.to(device)
    if args.dtype == "float16":
        conv2d = conv2d.half()

    # warm up
    conv2d.eval()
    flops, mem, params = nnstats.get_flops_mem(conv2d, input_tensor_shape)

    if args.dtype == 'float16':
        mem = mem * 2
    elif args.dtype == 'float32':
        mem = mem * 4

    if args.compute_type == "forward":
        # forward pass only
        flops = flops
    elif args.compute_type == "backward":
        # assume the backward pass costs roughly twice the forward pass,
        # so forward + backward is counted as 3x the forward FLOPs
        flops = flops * 3
    else:
        # calibrate mode performs no useful FLOPs
        flop_sec = 0.0
    for i in range(args.num_warmups):
        compfunc(input_tensor, conv2d)
    device_func.synchronize()
    
    # bench
    start_event = device_func.Event(enable_timing=True)
    end_event = device_func.Event(enable_timing=True)
    start_event.record()

    for i in range(args.num_iterations):
        compfunc(input_tensor, conv2d)

    end_event.record()
    device_func.synchronize()
    elapsed_time = start_event.elapsed_time(end_event) / 1000

    flop_sec = flops * args.num_iterations / elapsed_time
    example_per_sec = input_tensor_shape[0] * args.num_iterations / elapsed_time
    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)
    if mem > 0:
        arithmetic_intensity = flop_sec / mem
    else:
        arithmetic_intensity = 0

    print("----- performance -----")
    print()
    print(f"device time: {elapsed_time:.6f} s")
    print(f"flops: {flop_sec}")
    print(f"memory: {mem}")
    print(f"example_per_sec: {example_per_sec:.3f}")
    print(f"arithmetic intensity: {arithmetic_intensity}")
    print(f"parameter size: {params}")
    print(f"flops_scaled: {flop_sec_scaled} {flop_sec_unit}")
    print(f"memory_scaled: {mem_scaled} {mem_unit}")
Example #2
import logging

import torch
import torch.nn as nn  # nn.Conv2d is used below but was not imported
from torchvision.models import resnet18, densenet121

import nnstats
import nnutils

# linear = nn.Linear(in_features=64, out_features=128).eval()
# module_info = nns.crawl_module(linear, (1, 64))
# print(module_info)
conv2d = nn.Conv2d(in_channels=3,
                   out_channels=64,
                   kernel_size=7,
                   stride=2,
                   padding=3,
                   bias=False).eval()
flops, memory = nnstats.get_flops_mem(conv2d, (2, 3, 224, 224), 0)
logging.debug(nnutils.unit_scale(flops))
logging.debug(nnutils.unit_scale(memory))

# rnn = nn.RNN(input_size=100, hidden_size=256).eval()
# flops, memory = nnstats.get_flops_mem(rnn, (4, 1, 100), 0)
# print(flops)
# print(nnutils.unit_scale(flops))

# stats = crawler.crawl_module(conv2d, (128, 3, 224, 224))
# module_info = nnutils.aggregate_info(stats, 0)
# print(nnutils.flops_dmas(module_info))

# resnet18 = resnet18().eval()
# print(nnstats.get_flops_dmas(resnet18, (1, 3, 224, 224)))

# densenet121 = densenet121().eval()
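
The numbers returned by get_flops_mem for the convolution above can be sanity-checked against the usual closed-form estimate (2 FLOPs per multiply-accumulate per output element). A small standalone check, written here for illustration and not part of nn-bench:

def conv2d_flops(n, c_in, h, w, c_out, k, stride, padding):
    # output spatial size for a square kernel
    h_out = (h + 2 * padding - k) // stride + 1
    w_out = (w + 2 * padding - k) // stride + 1
    # one multiply + one add per weight per output element
    return 2 * n * c_out * h_out * w_out * c_in * k * k

# the 7x7, stride-2, padding-3 layer above at batch size 2:
# 2 * 2 * 64 * 112 * 112 * 3 * 49 ≈ 0.47 GFLOPs
print(conv2d_flops(2, 3, 224, 224, 64, 7, 2, 3))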
Example #3
def main(args):

    # datatype selection
    if args.dtype == 'float16':
        tensor_type = torch.float16
    elif args.dtype == 'float32':
        tensor_type = torch.float32
    else:
        raise Exception('data type can only be float16 or float32')

    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
    print("Running on device {}".format(device))

    # select compute type
    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError(
            "Error, compute_type should be either forward or backward or calibrate"
        )

    kernel_shape = args.kernel_shape
    # requires_grad=True indicates that we want to compute gradients during the backward pass
    if args.compute_type == "backward":
        weights = torch.randn(kernel_shape[3],
                              kernel_shape[2],
                              kernel_shape[0],
                              kernel_shape[1],
                              device=device,
                              dtype=tensor_type,
                              requires_grad=True)
        biases = torch.randn(kernel_shape[3],
                             device=device,
                             dtype=tensor_type,
                             requires_grad=True)
    else:
        weights = torch.randn(kernel_shape[3],
                              kernel_shape[2],
                              kernel_shape[0],
                              kernel_shape[1],
                              device=device,
                              dtype=tensor_type)
        biases = torch.randn(kernel_shape[3], device=device, dtype=tensor_type)

    input_tensor_shape = args.input_tensor_shape
    # the input is generated directly in NCHW order, which is what PyTorch expects
    input_image = torch.randn(input_tensor_shape[0],
                              input_tensor_shape[1],
                              input_tensor_shape[2],
                              input_tensor_shape[3],
                              device=device,
                              dtype=tensor_type)

    # init the conv2d kernel
    conv2d = nn.Conv2d(in_channels=kernel_shape[2],
                       out_channels=kernel_shape[3],
                       kernel_size=kernel_shape[0],
                       stride=args.stride)
    conv2d.weight = torch.nn.Parameter(weights)
    conv2d.bias = torch.nn.Parameter(biases)
    # move the kernel to device
    conv2d.to(device)

    # start session
    print("warming up for {} steps".format(args.num_warmups))
    start = time.time()
    conv2d.eval()
    # estimate FLOPs/memory for the shape actually being benchmarked
    flops, mem = nnstats.get_flops_mem(conv2d, tuple(input_tensor_shape))
    if args.compute_type == "forward":
        flops = flops
    elif args.compute_type == "backward":
        # assume forward + backward costs roughly 3x the forward FLOPs
        flops = flops * 3
    else:
        flop_sec = 0.0
    print(f"{flops}, {mem}")
    for i in range(args.num_warmups):
        compfunc(input_image, conv2d)
    end = time.time()
    print("done")
    duration = end - start
    print('Warmup {:.2f} seconds, {:.2f} seconds/iter'.format(
        duration, duration / float(args.num_warmups)))

    print("running for {} steps".format(args.num_iterations))
    start = time.time()
    start_event = device_func.Event(enable_timing=True)
    end_event = device_func.Event(enable_timing=True)
    start_event.record()

    for i in range(args.num_iterations):
        compfunc(input_image, conv2d)

    end_event.record()
    device_func.synchronize()
    elapsed_time = start_event.elapsed_time(end_event) / 1000
    end = time.time()
    print("done")

    duration = end - start
    flop_sec = flops * args.num_iterations / elapsed_time
    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)

    print(f"time.time {duration:.6f} seconds cuda.time {elapsed_time:.6f}")
    print(
        f"FLOPS: {flop_sec_scaled:.6f} {flop_sec_unit}, memory access: {mem_scaled:.6f} {mem_unit}"
    )
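
nnutils.unit_scale is used throughout these examples to turn raw counts into K/M/G/T figures but is never shown. A plausible minimal implementation (an assumption about its behaviour, not the actual nnutils code):

def unit_scale(value):
    # rescale a raw count into a human-readable magnitude and unit prefix
    units = ['', 'K', 'M', 'G', 'T', 'P']
    scaled = float(value)
    idx = 0
    while abs(scaled) >= 1000.0 and idx < len(units) - 1:
        scaled /= 1000.0
        idx += 1
    return scaled, units[idx]

# e.g. unit_scale(472055808) -> (472.055808, 'M')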
Example #4
def main(args): 
    
    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
    print("Running on device {}".format(device))
    
    # the input format is (seq_len, batch, input_size)
    input_tensor_shape = tuple(args.input_tensor_shape)
    input_tensor = torch.randn(input_tensor_shape[0], input_tensor_shape[1], input_tensor_shape[2], device=device)

    if args.dtype == 'float16':
        input_tensor = input_tensor.half()

    input_size = input_tensor_shape[2]
    hidden_size = args.hidden_size

    # init rnn kernel
    if args.rnn_type == 'lstm':
        myRNN = nn.LSTM(input_size, hidden_size)
    elif args.rnn_type == 'rnn':
        myRNN = nn.RNN(input_size, hidden_size)
    elif args.rnn_type == 'gru':
        myRNN = nn.GRU(input_size, hidden_size)
    else:
        raise ValueError("Error of input cell_type, please choose one from [rnn, lstm, gru]")

    myRNN.to(device)
    if args.dtype == 'float16':
        myRNN = myRNN.half()
    
    if args.compute_type=="forward":
        compfunc = run_forward  
    elif args.compute_type=="backward":
        compfunc = run_backward
    elif args.compute_type=="calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError("Error, compute_type should be either forward or backward or calibrate")
    
    start = time.time()

    flops, mem, params = nnstats.get_flops_mem(myRNN, input_tensor_shape)

    if args.dtype == 'float16':
        mem = mem * 2
    elif args.dtype == 'float32':
        mem = mem * 4
    
    print(f"float point operations: {flops}")
    for i in range(args.num_warmups):
        compfunc(input_tensor, myRNN)
    device_func.synchronize()
    # torch.cuda.reset_max_memory_allocated()
    end = time.time()
    duration = end-start
    
    start_event = device_func.Event(enable_timing=True)
    end_event = device_func.Event(enable_timing=True)
    start_event.record()
            
    for i in range(args.num_iterations):
        compfunc(input_tensor, myRNN)
        
    end_event.record()
    # max_mem = torch.cuda.max_memory_allocated()
    device_func.synchronize()
        
    end = time.time()
    elapsed_time = start_event.elapsed_time(end_event) / 1000

    flop_sec = flops * args.num_iterations / elapsed_time
    example_per_sec = input_tensor_shape[1] * args.num_iterations / elapsed_time

    duration = end - start
    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)
    # max_mem_scaled, max_mem_unit = nnutils.unit_scale(max_mem)
    if mem > 0:
        arithmetic_intensity = flop_sec / mem
    else:
        arithmetic_intensity = 0

    print("----- performance -----")
    print()
    print(f"device time: {elapsed_time:.6f} s")
    print(f"flops: {flop_sec}")
    print(f"memory: {mem}")
    print(f"example_per_sec: {example_per_sec:.3f}")
    print(f"arithmetic intensity: {arithmetic_intensity}")
    print(f"parameter size: {params}")
    print(f"flops_scaled: {flop_sec_scaled} {flop_sec_unit}")
    print(f"memory_scaled: {mem_scaled} {mem_unit}")
Example #5
def main(args):

    if args.platform == "gpu":
        device = torch.device('cuda:' + args.device_id)
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:' + args.device_id)
        device_func = torch.npu
    else:
        device = torch.device('cpu')
    print("Running on device {}".format(device))

    input_tensor_shape = tuple(args.input_tensor_shape)
    (input_tensor, label) = get_synthetic_data(input_tensor_shape)
    input_tensor = input_tensor.to(device)
    label = label.to(device)

    model = torchvision.models.__dict__[args.arch](args.pretrained)
    flops, mem, params = nnstats.get_flops_mem(model, input_tensor_shape)

    if args.dtype == 'float16':
        model = model.half()
        input_tensor = input_tensor.half()
        mem = mem * 2
    elif args.dtype == 'float32':
        mem = mem * 4

    if args.compute_type == "forward":
        flops = flops
    elif args.compute_type == "backward":
        flops = flops * 3
    else:
        flop_sec = 0.0

    # define optimizer
    optimizer = optim.Adam(model.parameters())
    model = model.to(device)
    if args.amp:
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.opt_level,
            verbosity=0,
            loss_scale=None if args.loss_scale == -1 else args.loss_scale)

    criterion = nn.CrossEntropyLoss().to(device)

    # warm up
    for i in range(args.num_warmups):
        if args.compute_type == "forward":
            predicted = forward(input_tensor, model, args)
        elif args.compute_type == "backward":
            loss = train(input_tensor, label, model, criterion, optimizer,
                         args)
    device_func.synchronize()

    # bench
    # start time
    start_event = device_func.Event(enable_timing=True)
    end_event = device_func.Event(enable_timing=True)
    start_event.record()

    for i in range(args.num_iterations):
        if args.compute_type == "forward":
            predicted = forward(input_tensor, model, args)
        elif args.compute_type == "backward":
            loss = train(input_tensor, label, model, criterion, optimizer,
                         args)

    # end time
    end_event.record()
    device_func.synchronize()
    elapsed_time = start_event.elapsed_time(end_event) / 1000
    flop_sec = flops * args.num_iterations / elapsed_time
    arithmetic_intensity = flop_sec / mem
    example_per_sec = input_tensor_shape[0] * args.num_iterations / elapsed_time
    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)

    print(f"flops: {flop_sec}")
    print(f"time: {elapsed_time:.3f}")
    print(f"flops_scaled: {flop_sec_scaled} {flop_sec_unit}")
    print(f"arithemetic intensity: {arithemetic_intensity}")
    print(f"example_per_sec: {example_per_sec:.3f}")
    print(f"params: {params}")
Example #6
File: linear.py  Project: ADCHYX/nn-bench
def main(args):

    # datatype selection
    if args.dtype == 'float16':
        tensor_type = torch.float16
    elif args.dtype == 'float32':
        tensor_type = torch.float32
    else:
        raise Exception('data type can only be float16 or float32')

    if args.platform == "gpu":
        device = torch.device('cuda:0')
        device_func = torch.cuda
    elif args.platform == "npu":
        device = torch.device('npu:0')
        device_func = torch.npu
    else:
        device = torch.device('cpu')
    print("Running on device {}".format(device))

    # select compute type
    if args.compute_type == "forward":
        compfunc = run_forward
    elif args.compute_type == "backward":
        compfunc = run_backward
    elif args.compute_type == "calibrate":
        compfunc = run_calibrate
    else:
        raise ValueError(
            "Error, compute_type should be either forward or backward or calibrate"
        )

    kernel_shape = args.kernel_shape
    # requires_grad=True indicates that we want to compute gradients during the backward pass
    if args.compute_type == "backward":
        weights = torch.randn(kernel_shape[1],
                              kernel_shape[0],
                              device=device,
                              dtype=tensor_type,
                              requires_grad=True)
        biases = torch.randn(kernel_shape[1],
                             device=device,
                             dtype=tensor_type,
                             requires_grad=True)
    else:
        weights = torch.randn(kernel_shape[1],
                              kernel_shape[0],
                              device=device,
                              dtype=tensor_type)
        biases = torch.randn(kernel_shape[1], device=device, dtype=tensor_type)

    input_tensor_shape = tuple(args.input_tensor_shape)

    input_tensor = torch.randn(input_tensor_shape[0],
                               input_tensor_shape[1],
                               device=device,
                               dtype=tensor_type)

    # init the Linear kernel
    linear = nn.Linear(in_features=kernel_shape[0],
                       out_features=kernel_shape[1]).eval()
    linear.weight = torch.nn.Parameter(weights)
    linear.bias = torch.nn.Parameter(biases)
    # move the kernel to device
    linear.to(device)

    # start session
    # print("warming up for {} steps".format(args.num_warmups))
    start = time.time()
    linear.eval()
    flops, mem = nnstats.get_flops_mem(linear, input_tensor_shape)

    if args.compute_type == "forward":
        flops = flops
    elif args.compute_type == "backward":
        flops = flops * 3
    else:
        flop_sec = 0.0

    for i in range(args.num_warmups):
        compfunc(input_tensor, linear)
    end = time.time()
    # print("done")
    duration = end - start
    # print('Warmup {:.2f} seconds, {:.2f} seconds/iter'.format(duration,
    #                                                           duration/float(args.num_warmups)))

    # print("running for {} steps".format(args.num_iterations))
    start = time.time()
    start_event = device_func.Event(enable_timing=True)
    end_event = device_func.Event(enable_timing=True)
    start_event.record()

    # cupy.cuda.profiler.start()
    for i in range(args.num_iterations):
        compfunc(input_tensor, linear)

    end_event.record()
    device_func.synchronize()  # Wait for the events to be recorded!
    end = time.time()
    elapsed_time = start_event.elapsed_time(end_event) / 1000

    # print("done")

    flop_sec = flops * args.num_iterations / elapsed_time

    duration = end - start
    flop_sec_scaled, flop_sec_unit = nnutils.unit_scale(flop_sec)
    mem_scaled, mem_unit = nnutils.unit_scale(mem)

    print(f"time.time {duration:.6f} seconds device.time {elapsed_time:.6f}")
    print(f"FLOPS: {flop_sec}")
    print(f"memory: {mem}")