def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        seq_length = point["seq_length"]
        in_features = point["in_features"]
        hidden_units = point["hidden_units"]
        num_layers = point["num_layers"]
        # out_features = point["out_features"]
        bias = int(point["bias"]) == 1
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        init_mem = None
        if use_cuda:
            init_mem = get_first_gpu_memory_usage()

        inputs = torch.arange(
            seq_length * batch_size * in_features,
            dtype=dtype,
            device=device,
            requires_grad=True,
        ).view((seq_length, batch_size, in_features))

        layer = torch.nn.GRU(
            in_features,
            hidden_units,
            num_layers,
            # out_features,
            bias=bias,
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs, init_mem=init_mem)

        # See Dey (2017), GRU has 3*(n^2 + m*n + n) trainable parameters across
        # 6x matrices and 3x bias vectors (size 1xn), where m = input dim, n = hidden dim
        #
        # Or, consider only matrix-vector mults, using 3x combined matrices:
        # [x, h]*A, for A=W_z, W_r, W all (m+n) x n, vector is 1x(m+n)
        # ---> 3*(m+n)*n MACs
        # ---> 3*(2*(m+n) - 1)*n FLOPs
        total_flop = (3 * seq_length * batch_size
                      * (2 * (in_features + hidden_units) - 1) * hidden_units)

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        print("runtime=", time.time() - start, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
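# Illustrative check of the GRU FLOP formula used above. The helper below is
# hypothetical (not called by run()) and the sizes are made-up examples:
# with m = in_features = 4, n = hidden_units = 8, seq_length = 2, batch_size = 3,
#   total_flop = 3 * 2 * 3 * (2 * (4 + 8) - 1) * 8 = 3312
# i.e. 3 gate matmuls of a 1x(m+n) vector with an (m+n)xn matrix per timestep
# and batch element.
def _gru_flop_example():
    m, n, seq, batch = 4, 8, 2, 3  # example sizes only
    flops = 3 * seq * batch * (2 * (m + n) - 1) * n
    assert flops == 3312
    return flops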
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(
            batch_size * image_size * image_size * in_channels,
            dtype=dtype,
            device=device,
        ).view((batch_size, in_channels, image_size, image_size))

        layer = torch.nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            # padding="same"
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs)

        # flops, params = get_model_complexity_info(
        #     layer, tuple(inputs.shape[1:]), as_strings=False
        # )
        # print(flops)
        outputs = layer(inputs)
        total_flop = (kernel_size * kernel_size * in_channels * out_channels
                      * outputs.shape[-1] * outputs.shape[-2] * batch_size)
        print(outputs.shape)

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        # KGF: random addition...
        # logger.exception("exception raised")
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        height = point["height"]
        width = point["width"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(
            batch_size * height * width * in_channels, dtype=dtype, device=device
        ).view((batch_size, in_channels, height, width))

        layer = torch.nn.Conv2d(
            in_channels, out_channels, (kernel_size, kernel_size), stride=1, padding=1
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs)

        # flops, params = get_model_complexity_info(layer,
        #     tuple(inputs.shape[1:]), as_strings=False)
        # print('ptflops=', flops * batch_size)
        outputs = layer(inputs)
        # print('shapes: ', inputs.shape, outputs.shape)
        total_flop = (
            kernel_size
            * kernel_size
            * in_channels
            * out_channels
            * outputs.shape[-1]
            * outputs.shape[-2]
            * batch_size
        )

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
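# Worked example of the Conv2d FLOP estimate used above. The helper is
# hypothetical (not used by run()) and the numbers are illustrative only:
# with kernel_size=3, stride=1, padding=1 a 32x32 input keeps its spatial size,
# since (32 - 3 + 2*1)/1 + 1 = 32, so for in_channels=3, out_channels=16,
# batch_size=2 the estimate is 3*3 * 3 * 16 * 32 * 32 * 2 = 884,736.
def _conv2d_flop_example():
    kernel_size, in_channels, out_channels = 3, 3, 16  # example sizes only
    height = width = 32
    batch_size = 2
    out_h = (height - kernel_size + 2 * 1) // 1 + 1  # padding=1, stride=1
    out_w = (width - kernel_size + 2 * 1) // 1 + 1
    flops = (kernel_size * kernel_size * in_channels * out_channels
             * out_h * out_w * batch_size)
    assert (out_h, out_w, flops) == (32, 32, 884736)
    return flops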
def run(point):
    start = time.time()
    # memorymon = mp.Process(target=print_mem_cpu)
    # memorymon.start()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(batch_size * image_size**3 * in_channels,
                              dtype=dtype,
                              device=device).view(
                                  (batch_size, in_channels, image_size,
                                   image_size, image_size))
        layer = torch.nn.Conv3d(in_channels,
                                out_channels,
                                kernel_size,
                                stride=1,
                                padding=1).to(device, dtype=dtype)
        # layer.eval()

        ave_time = benchmark_forward(layer, inputs)

        outputs = layer(inputs)
        total_flop = (kernel_size**3 * in_channels * out_channels *
                      outputs.shape[-1] * outputs.shape[-2] * outputs.shape[-3] *
                      batch_size)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        # memorymon.terminate()
        # memorymon.join()
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        # memorymon.terminate()
        # memorymon.join()
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(batch_size * image_size * in_channels,
                              dtype=dtype,
                              device=device).view(
                                  (batch_size, in_channels, image_size))
        layer = torch.nn.Conv1d(
            in_channels, out_channels, kernel_size, stride=1, padding=1
        ).to(
            # KGF: unlike linear_run.py, dtype=float causes:
            # RuntimeError: Input type (torch.cuda.FloatTensor) and
            # weight type (torch.cuda.DoubleTensor) should be the same
            # device, float)
            device,
            dtype=dtype,
        )  # torch.float32 = torch.float

        ave_time = benchmark_forward(layer, inputs)

        outputs = layer(inputs)
        total_flop = (kernel_size * in_channels * out_channels *
                      outputs.shape[-1] * batch_size)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(
            batch_size * image_size * image_size * in_channels,
            dtype=dtype,
            device=device,
        ).view((batch_size, in_channels, image_size, image_size))
        layer = torch.nn.Conv2d(in_channels, out_channels,
                                (kernel_size, kernel_size),
                                stride=1).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs)

        outputs = layer(inputs)
        total_flop = (kernel_size * kernel_size * in_channels * out_channels *
                      outputs.shape[-1] * outputs.shape[-2] * batch_size)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        in_features = point["in_features"]
        out_features = point["out_features"]
        bias = int(point["bias"]) == 1
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        # KGF: check GPU memory usage baseline after loading PyTorch, but before any
        # inputs, model, etc. are defined
        init_mem = None
        if use_cuda:
            init_mem = get_first_gpu_memory_usage()

        # KGF: attempt to max-out V100 utilization in nvidia-smi for a sustained time:
        # batch_size *= 100

        inputs = torch.arange(batch_size * in_features,
                              dtype=dtype,
                              device=device,
                              requires_grad=True).view(
                                  (batch_size, in_features))
        # .type(dtype)

        # manually computing flops from the formulas given here:
        # https://machinethink.net/blog/how-fast-is-my-model/
        #
        # FC layer: x*W, x is 1xI vector, W is IxJ matrix
        # ----> I*J MACs
        # dot product (specifically) of n MACs = n multiplications, n-1 adds
        # FC layer computes J dot products:
        # ----> (2I-1)*J FLOPs
        total_flop = batch_size * (2 * in_features - 1) * out_features

        layer = torch.nn.Linear(in_features, out_features, bias=bias).to(
            device, dtype=dtype)  # dtype=float != torch.float

        ave_time = benchmark_forward(layer, inputs, init_mem=init_mem)

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        print("runtime=", time.time() - start, "ave_flops=", ave_flops)

        # inputs = torch.arange(
        #     batch_size * in_features, dtype=dtype, device=device,
        # )
        # flop, params = profile(layer, inputs=(inputs,))
        # print("-------")
        # print("THOP")
        # print("-------")
        # # E.g. for "in_features": 8153, "out_features": 7533, no bias
        # 8153*7533 = 61,416,549 trainable parameters
        # (# of trainable params = number of MACs, in this case)
        # ----> 6871*(2*8153 -1)*7533 = 843934457115 FLOP
        # But THOP returns:
        # 843934466048.0 flop 61416548.0 parameters
        # = 8933 more FLOP, 1 fewer trainable parameter
        # BUG: https://github.com/Lyken17/pytorch-OpCounter/issues/71
        # print(f"{flop} flop {params} parameters")
        # KGF: note, THOP documentation is inconsistent; claims MACs, but must be FLOPs
        # flop, params = clever_format([flop, params], "%.3f")
        # print(f"{flop} flop {params} parameters")
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
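# Small sanity check of the parameter/MAC bookkeeping above. The helper is
# hypothetical (not called by run()) and the default sizes are illustrative:
# a bias-free Linear layer with in_features=I and out_features=J holds exactly
# I*J trainable weights, and its forward pass is J dot products of length I,
# i.e. (2*I - 1)*J FLOPs per sample. For the sizes quoted in the comment above
# (I=8153, J=7533) that is 61,416,549 weights and 122,825,565 FLOPs per sample.
def _linear_flop_example(I=64, J=32):
    import torch
    layer = torch.nn.Linear(I, J, bias=False)
    n_params = sum(p.numel() for p in layer.parameters())
    assert n_params == I * J
    return (2 * I - 1) * J  # per-sample FLOP estimate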
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv1_kern = point["conv1_kern"]
        pool_size_1 = point["pool_size_1"]
        pool_size_2 = point["pool_size_2"]
        pool_size_5 = point["pool_size_5"]
        conv2_out_chan = point["conv2_out_chan"]
        conv2_kern = point["conv2_kern"]
        conv3_out_chan = point["conv3_out_chan"]
        conv3_kern = point["conv3_kern"]
        conv4_out_chan = point["conv4_out_chan"]
        conv4_kern = point["conv4_kern"]
        conv5_out_chan = point["conv5_out_chan"]
        conv5_kern = point["conv5_kern"]
        adaptive_pool_dim = point["adaptive_pool_dim"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        fc3_out = point["fc3_out"]
        print(point)
        import torch
        import torch.nn as nn

        device, dtype = load_cuda_vs_knl(point)

        class AlexNet(nn.Module):
            def __init__(
                self,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv1_kern,
                pool_size_1,
                pool_size_2,
                pool_size_5,
                conv2_out_chan,
                conv2_kern,
                conv3_out_chan,
                conv3_kern,
                conv4_out_chan,
                conv4_kern,
                conv5_out_chan,
                conv5_kern,
                adaptive_pool_dim,
                fc1_out,
                fc2_out,
                fc3_out,
            ):
                super(AlexNet, self).__init__()
                self.flop = 0
                self.features = nn.Sequential(
                    # 1st conv
                    nn.Conv2d(
                        conv1_in_chan,
                        conv1_out_chan,
                        kernel_size=conv1_kern,
                        stride=4,
                        padding=2,
                    ),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=pool_size_1, stride=2),
                    # 2nd conv
                    nn.Conv2d(
                        conv1_out_chan,
                        conv2_out_chan,
                        kernel_size=conv2_kern,
                        padding=2,
                    ),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=pool_size_2, stride=2),
                    # 3rd conv
                    nn.Conv2d(
                        conv2_out_chan,
                        conv3_out_chan,
                        kernel_size=conv3_kern,
                        padding=1,
                    ),
                    nn.ReLU(inplace=True),
                    # 4th conv
                    nn.Conv2d(
                        conv3_out_chan,
                        conv4_out_chan,
                        kernel_size=conv4_kern,
                        padding=1,
                    ),
                    nn.ReLU(inplace=True),
                    # 5th conv
                    nn.Conv2d(
                        conv4_out_chan,
                        conv5_out_chan,
                        kernel_size=conv5_kern,
                        padding=1,
                    ),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=pool_size_5, stride=2),
                )

                # FLOP calculations for convolutional layers:
                # 1st conv2d
                layer_input_size = image_size
                print(layer_input_size)
                self.flop += (
                    conv1_kern ** 2
                    * conv1_in_chan
                    * conv1_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                # output size: (((W - K + 2P)/S) + 1)
                layer_input_size = int(((image_size - conv1_kern + 2 * 2) / 4) + 1)
                print(layer_input_size)
                layer_input_size = int(((layer_input_size - pool_size_1) / 2) + 1)
                print(layer_input_size)

                # 2nd conv2d
                self.flop += (
                    conv2_kern ** 2
                    * conv1_out_chan
                    * conv2_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv2_kern + 2 * 2) / 1) + 1
                )
                print(layer_input_size)
                layer_input_size = int(((layer_input_size - pool_size_2) / 2) + 1)
                print(layer_input_size)

                # 3rd conv2d
                self.flop += (
                    conv3_kern ** 2
                    * conv2_out_chan
                    * conv3_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv3_kern + 2 * 1) / 1) + 1
                )
                print(layer_input_size)

                # 4th conv2d
                self.flop += (
                    conv4_kern ** 2
                    * conv3_out_chan
                    * conv4_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv4_kern + 2 * 1) / 1) + 1
                )
                print(layer_input_size)

                # 5th conv2d
                self.flop += (
                    conv5_kern ** 2
                    * conv4_out_chan
                    * conv5_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv5_kern + 2 * 1) / 1) + 1
                )
                print(layer_input_size)
                layer_input_size = int(((layer_input_size - pool_size_5) / 2) + 1)
                print(layer_input_size)

                self.avgpool = nn.AdaptiveAvgPool2d(
                    (adaptive_pool_dim, adaptive_pool_dim)
                )
                self.classifier = nn.Sequential(
                    # linear 1
                    nn.Dropout(),
                    nn.Linear(conv5_out_chan * adaptive_pool_dim ** 2, fc1_out),
                    nn.ReLU(inplace=True),
                    # linear 2
                    nn.Dropout(),
                    nn.Linear(fc1_out, fc2_out),
                    nn.ReLU(inplace=True),
                    # linear 3
                    nn.Linear(fc2_out, fc3_out),
                )

                # FLOP calculations for linear layers
                # 1st linear layer
                self.flop += (
                    (2 * (conv5_out_chan * adaptive_pool_dim ** 2) - 1)
                    * fc1_out
                    * batch_size
                )
                # 2nd linear layer
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                # 3rd linear layer
                self.flop += (2 * fc2_out - 1) * fc3_out * batch_size

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                x = self.features(x)
                x = self.avgpool(x)
                x = torch.flatten(x, 1)
                x = self.classifier(x)
                return x

        inputs = torch.arange(
            batch_size * image_size ** 2 * conv1_in_chan, dtype=dtype, device=device
        ).view((batch_size, conv1_in_chan, image_size, image_size))

        # create and move model to GPU
        net = AlexNet(
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv1_kern,
            pool_size_1,
            pool_size_2,
            pool_size_5,
            conv2_out_chan,
            conv2_kern,
            conv3_out_chan,
            conv3_kern,
            conv4_out_chan,
            conv4_kern,
            conv5_out_chan,
            conv5_kern,
            adaptive_pool_dim,
            fc1_out,
            fc2_out,
            fc3_out,
        ).to(device, dtype=dtype)
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
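# Illustrative trace of the size bookkeeping used above, evaluated with the
# classic AlexNet hyperparameters (224x224 input, 11x11 conv at stride 4 with
# padding 2, 3x3 max pools at stride 2). The helper is hypothetical and the
# values are examples, not benchmark inputs:
#   conv1: (224 - 11 + 2*2)/4 + 1 -> 55      pool1: (55 - 3)/2 + 1 -> 27
#   conv2: (27 - 5 + 2*2)/1 + 1  -> 27       pool2: (27 - 3)/2 + 1 -> 13
def _alexnet_size_chain_example():
    size = 224
    size = int(((size - 11 + 2 * 2) / 4) + 1)  # conv1: kernel 11, stride 4, pad 2
    size = int(((size - 3) / 2) + 1)           # max pool: kernel 3, stride 2
    size = int(((size - 5 + 2 * 2) / 1) + 1)   # conv2: kernel 5, stride 1, pad 2
    size = int(((size - 3) / 2) + 1)           # max pool: kernel 3, stride 2
    assert size == 13
    return size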
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        seq_length = point["seq_length"]
        in_features = point["in_features"]
        hidden_units = point["hidden_units"]
        num_layers = point["num_layers"]
        # out_features = point["out_features"]
        bias = int(point["bias"]) == 1
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        init_mem = None
        if use_cuda:
            init_mem = get_first_gpu_memory_usage()

        inputs = torch.arange(
            seq_length * batch_size * in_features,
            dtype=dtype,
            device=device,
            requires_grad=True,
        ).view((seq_length, batch_size, in_features))

        layer = torch.nn.LSTM(
            in_features,
            hidden_units,
            num_layers,
            # out_features,
            bias=bias,
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs, init_mem=init_mem)

        # https://stats.stackexchange.com/questions/328926/how-many-parameters-are-in-a-gated-recurrent-unit-gru-recurrent-neural-network
        # See Dey (2017), LSTM has 4*(n^2 + m*n + n) trainable parameters across
        # 8x matrices and 4x bias vectors (size 1xn), where m = input dim, n = hidden dim
        #
        # Or, consider only matrix-vector mults, using 4x combined matrices:
        # [x, h]*A, for A=W_i, W_c, W_o, W_f all (m+n) x n, vector is 1x(m+n)
        # ---> 4*(m+n)*n MACs
        # ---> 4*(2*(m+n) - 1)*n FLOPs
        total_flop = (4 * seq_length * batch_size
                      * (2 * (in_features + hidden_units) - 1) * hidden_units)
        # Compare to incorrect LSTM answer from:
        # https://github.com/NVIDIA-developer-blog/code-samples/issues/7
        # which assumes input dim = hidden dim, and uses wrong matmul --> FLOPs formula

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        print("runtime=", time.time() - start, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
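# Optional sanity check on the LSTM parameter bookkeeping above. The helper is
# hypothetical (not called by run()) and the sizes are illustrative. Note that
# torch.nn.LSTM stores separate input-hidden and hidden-hidden bias vectors, so
# a single layer holds 4*(m*n + n*n + 2*n) trainable parameters, i.e. 4*n more
# than the single-bias count quoted in the comment above.
def _lstm_param_count_example(m=16, n=32):
    import torch
    layer = torch.nn.LSTM(m, n, 1, bias=True)
    n_params = sum(p.numel() for p in layer.parameters())
    assert n_params == 4 * (m * n + n * n + 2 * n)
    return n_params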
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv1_kern = point["conv1_kern"]
        pool_size = point["pool_size"]
        conv2_out_chan = point["conv2_out_chan"]
        conv2_kern = point["conv2_kern"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        fc3_out = point["fc3_out"]
        n_conv_block = point["n_conv_block"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        class Net(torch.nn.Module):
            def __init__(
                self,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv1_kern,
                pool_size,
                conv2_out_chan,
                conv2_kern,
                fc1_out,
                fc2_out,
                fc3_out,
                n_conv_block,
            ):
                super(Net, self).__init__()
                self.flop = 0
                self.conv1 = torch.nn.Conv2d(conv1_in_chan, conv1_out_chan,
                                             conv1_kern).to(device, dtype=dtype)
                self.flop += (conv1_kern**2 * conv1_in_chan * conv1_out_chan *
                              image_size**2 * batch_size)
                self.pool = torch.nn.MaxPool2d(pool_size,
                                               pool_size).to(device, dtype=dtype)
                self.conv1_size = image_size - conv1_kern + 1
                self.maxpool1_size = int((self.conv1_size - pool_size) / pool_size + 1)
                # self.flop += image_size ** 2 * conv1_out_chan * batch_size
                self.conv2 = torch.nn.Conv2d(conv1_out_chan, conv2_out_chan,
                                             conv2_kern).to(device, dtype=dtype)
                self.flop += (conv2_kern**2 * conv1_out_chan * conv2_out_chan *
                              int(image_size / pool_size)**2 * batch_size)
                # account for loop of convolutions:
                self.flop = self.flop * n_conv_block
                self.conv2_size = self.maxpool1_size - conv2_kern + 1
                self.maxpool2_size = int((self.conv2_size - pool_size) / pool_size + 1)
                self.view_size = (conv2_out_chan * self.maxpool2_size *
                                  self.maxpool2_size)
                self.fc1 = torch.nn.Linear(self.view_size,
                                           fc1_out).to(device, dtype=dtype)
                self.flop += (2 * self.view_size - 1) * fc1_out * batch_size
                self.fc2 = torch.nn.Linear(fc1_out, fc2_out).to(device, dtype=dtype)
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                self.fc3 = torch.nn.Linear(fc2_out, fc3_out).to(device, dtype=dtype)
                self.flop += (2 * fc2_out - 1) * fc3_out * batch_size

            def forward(self, x):
                block_output = torch.zeros(
                    inputs.shape[0] * n_conv_block,
                    self.view_size,
                    device=device,
                    dtype=dtype,
                )
                for i in range(n_conv_block):
                    # Will need to use this sort of strategy when we are using real
                    # datasets, not one dummy batch:
                    # batch = inputs[i * batch_size:(i + 1) * batch_size]
                    batch = inputs
                    x = self.pool(torch.nn.functional.relu(self.conv1(batch)))
                    x = self.pool(torch.nn.functional.relu(self.conv2(x)))
                    x = x.view(-1, self.view_size)
                    block_output[i * batch_size:(i + 1) * batch_size] = x
                x = torch.nn.functional.relu(self.fc1(block_output))
                x = torch.nn.functional.relu(self.fc2(x))
                x = self.fc3(x)
                return x

        inputs = torch.arange(batch_size * image_size**2 * conv1_in_chan,
                              dtype=dtype,
                              device=device).view((batch_size, conv1_in_chan,
                                                   image_size, image_size))

        net = Net(
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv1_kern,
            pool_size,
            conv2_out_chan,
            conv2_kern,
            fc1_out,
            fc2_out,
            fc3_out,
            n_conv_block,
        )
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv1_kern = point["conv1_kern"]
        pool_size = point["pool_size"]
        conv2_out_chan = point["conv2_out_chan"]
        conv2_kern = point["conv2_kern"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        fc3_out = point["fc3_out"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        class Net(torch.nn.Module):
            def __init__(
                self,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv1_kern,
                pool_size,
                conv2_out_chan,
                conv2_kern,
                fc1_out,
                fc2_out,
                fc3_out,
            ):
                super(Net, self).__init__()
                self.flop = 0
                self.conv1 = torch.nn.Conv2d(
                    conv1_in_chan, conv1_out_chan, conv1_kern
                ).to(device, dtype=dtype)
                self.flop += (
                    conv1_kern ** 2
                    * conv1_in_chan
                    * conv1_out_chan
                    * image_size ** 2
                    * batch_size
                )
                print(self.flop)
                self.pool = torch.nn.MaxPool2d(pool_size, pool_size).to(
                    device, dtype=dtype
                )
                self.conv1_size = image_size - conv1_kern + 1
                self.maxpool1_size = int((self.conv1_size - pool_size) / pool_size + 1)
                self.flop += image_size ** 2 * conv1_out_chan * batch_size
                self.conv2 = torch.nn.Conv2d(
                    conv1_out_chan, conv2_out_chan, conv2_kern
                ).to(device, dtype=dtype)
                self.flop += (
                    conv2_kern ** 2
                    * conv1_out_chan
                    * conv2_out_chan
                    * int(image_size / pool_size) ** 2
                    * batch_size
                )
                print(self.flop)
                self.conv2_size = self.maxpool1_size - conv2_kern + 1
                self.maxpool2_size = int((self.conv2_size - pool_size) / pool_size + 1)
                self.view_size = (
                    conv2_out_chan * self.maxpool2_size * self.maxpool2_size
                )
                self.fc1 = torch.nn.Linear(self.view_size, fc1_out).to(
                    device, dtype=dtype
                )
                self.flop += (2 * self.view_size - 1) * fc1_out * batch_size
                self.fc2 = torch.nn.Linear(fc1_out, fc2_out).to(device, dtype=dtype)
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                self.fc3 = torch.nn.Linear(fc2_out, fc3_out).to(device, dtype=dtype)
                self.flop += (2 * fc2_out - 1) * fc3_out * batch_size

            def forward(self, x):
                x = self.pool(torch.nn.functional.relu(self.conv1(x)))
                x = self.pool(torch.nn.functional.relu(self.conv2(x)))
                x = x.view(-1, self.view_size)
                x = torch.nn.functional.relu(self.fc1(x))
                x = torch.nn.functional.relu(self.fc2(x))
                x = self.fc3(x)
                return x

        inputs = torch.arange(
            batch_size * image_size ** 2 * conv1_in_chan, dtype=dtype, device=device
        ).view((batch_size, conv1_in_chan, image_size, image_size))

        net = Net(
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv1_kern,
            pool_size,
            conv2_out_chan,
            conv2_kern,
            fc1_out,
            fc2_out,
            fc3_out,
        )
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        num_classes = point["num_classes"]
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv_kern = point["conv_kern"]
        pool_size = point["pool_size"]
        conv2_out_chan = point["conv2_out_chan"]
        conv3_out_chan = point["conv3_out_chan"]
        conv4_out_chan = point["conv4_out_chan"]
        conv5_out_chan = point["conv5_out_chan"]
        adaptive_pool_dim = point["adaptive_pool_dim"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        print(point)
        import torch
        import torch.nn as nn
        from typing import cast, Dict, List, Union

        device, dtype = load_cuda_vs_knl(point)

        class VGG(nn.Module):
            def __init__(
                self,
                features,
                num_classes,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv2_out_chan,
                conv3_out_chan,
                conv4_out_chan,
                conv5_out_chan,
                conv_kern,
                pool_size,
                adaptive_pool_dim,
                fc1_out,
                fc2_out,
            ) -> None:
                super(VGG, self).__init__()
                self.flop = 0
                self.features = features

                # FLOP calculations for convolutional layers:
                layer_input_size = image_size

                # 1st block of convolutional layers
                for i in range(2):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv1_in_chan * conv1_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv1_out_chan * conv1_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 2nd block of convolutional layers
                for i in range(2):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv1_out_chan * conv2_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv2_out_chan * conv2_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 3rd block of convolutional layers
                for i in range(3):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv2_out_chan * conv3_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv3_out_chan * conv3_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 4th block of convolutional layers
                for i in range(3):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv3_out_chan * conv4_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv4_out_chan * conv4_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 5th block of convolutional layers
                for i in range(3):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv4_out_chan * conv5_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv5_out_chan * conv5_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                self.avgpool = nn.AdaptiveAvgPool2d(
                    (adaptive_pool_dim, adaptive_pool_dim))
                self.classifier = nn.Sequential(
                    nn.Linear(
                        conv5_out_chan * adaptive_pool_dim * adaptive_pool_dim,
                        fc1_out),
                    nn.ReLU(True),
                    nn.Dropout(),
                    nn.Linear(fc1_out, fc2_out),
                    nn.ReLU(True),
                    nn.Dropout(),
                    nn.Linear(fc2_out, num_classes),
                )

                # FLOP calculations for linear layers
                # 1st linear layer
                self.flop += ((2 * (conv5_out_chan * adaptive_pool_dim**2) - 1) *
                              fc1_out * batch_size)
                # 2nd linear layer
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                # 3rd linear layer
                self.flop += (2 * fc2_out - 1) * num_classes * batch_size

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                x = self.features(x)
                x = self.avgpool(x)
                x = torch.flatten(x, 1)
                x = self.classifier(x)
                return x

        def make_layers(cfg: List[Union[str, int]],
                        batch_norm: bool = False) -> nn.Sequential:
            layers: List[nn.Module] = []
            in_channels = conv1_in_chan
            for v in cfg:
                if v == 'M':
                    layers += [nn.MaxPool2d(kernel_size=pool_size, stride=2)]
                else:
                    v = cast(int, v)
                    conv2d = nn.Conv2d(in_channels, v, kernel_size=conv_kern,
                                       padding=1)
                    if batch_norm:
                        layers += [
                            conv2d,
                            nn.BatchNorm2d(v),
                            nn.ReLU(inplace=True)
                        ]
                    else:
                        layers += [conv2d, nn.ReLU(inplace=True)]
                    in_channels = v
            return nn.Sequential(*layers)

        cfgs: Dict[str, List[Union[str, int]]] = {
            'VGG16': [
                conv1_out_chan, conv1_out_chan, 'M',
                conv2_out_chan, conv2_out_chan, 'M',
                conv3_out_chan, conv3_out_chan, conv3_out_chan, 'M',
                conv4_out_chan, conv4_out_chan, conv4_out_chan, 'M',
                conv5_out_chan, conv5_out_chan, conv5_out_chan, 'M'
            ],
        }

        inputs = torch.arange(batch_size * image_size**2 * conv1_in_chan,
                              dtype=dtype,
                              device=device).view((batch_size, conv1_in_chan,
                                                   image_size, image_size))

        # create and move model to GPU
        # "version D" is VGG-16
        net = VGG(
            make_layers(cfgs['VGG16'], batch_norm=True),
            num_classes,
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv2_out_chan,
            conv3_out_chan,
            conv4_out_chan,
            conv5_out_chan,
            conv_kern,
            pool_size,
            adaptive_pool_dim,
            fc1_out,
            fc2_out,
        ).to(device, dtype=dtype)
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        out_channels = point["out_channels"]

        batch_size_1 = point["batch_size"]
        image_size_1 = point["image_size"]
        in_channels_1 = point["in_channels"]
        kernel_size_1 = point["kernel_size"]

        batch_size_2 = point["batch_size"] + 1
        image_size_2 = point["image_size"] + 1
        in_channels_2 = point["in_channels"] + 1
        kernel_size_2 = point["kernel_size"] + 1

        batch_size_3 = point["batch_size"] + 2
        image_size_3 = point["image_size"] + 2
        in_channels_3 = point["in_channels"] + 2
        kernel_size_3 = point["kernel_size"] + 2

        batch_size_4 = point["batch_size"] + 3
        image_size_4 = point["image_size"] + 3
        in_channels_4 = point["in_channels"] + 3
        kernel_size_4 = point["kernel_size"] + 3

        batch_size_5 = point["batch_size"] + 4
        image_size_5 = point["image_size"] + 4
        in_channels_5 = point["in_channels"] + 4
        kernel_size_5 = point["kernel_size"] + 4

        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs_1 = torch.arange(
            batch_size_1 * image_size_1 * image_size_1 * in_channels_1,
            dtype=dtype,
            device=device,
        ).view((batch_size_1, in_channels_1, image_size_1, image_size_1))
        inputs_2 = torch.arange(
            batch_size_2 * image_size_2 * image_size_2 * in_channels_2,
            dtype=dtype,
            device=device,
        ).view((batch_size_2, in_channels_2, image_size_2, image_size_2))
        inputs_3 = torch.arange(
            batch_size_3 * image_size_3 * image_size_3 * in_channels_3,
            dtype=dtype,
            device=device,
        ).view((batch_size_3, in_channels_3, image_size_3, image_size_3))
        inputs_4 = torch.arange(
            batch_size_4 * image_size_4 * image_size_4 * in_channels_4,
            dtype=dtype,
            device=device,
        ).view((batch_size_4, in_channels_4, image_size_4, image_size_4))
        inputs_5 = torch.arange(
            batch_size_5 * image_size_5 * image_size_5 * in_channels_5,
            dtype=dtype,
            device=device,
        ).view((batch_size_5, in_channels_5, image_size_5, image_size_5))

        layer_1 = torch.nn.Conv2d(
            in_channels_1, out_channels, kernel_size_1, stride=1
        ).to(device, dtype=dtype)
        layer_2 = torch.nn.Conv2d(
            in_channels_2, out_channels, kernel_size_2, stride=1
        ).to(device, dtype=dtype)
        layer_3 = torch.nn.Conv2d(
            in_channels_3, out_channels, kernel_size_3, stride=1
        ).to(device, dtype=dtype)
        layer_4 = torch.nn.Conv2d(
            in_channels_4, out_channels, kernel_size_4, stride=1
        ).to(device, dtype=dtype)
        layer_5 = torch.nn.Conv2d(
            in_channels_5, out_channels, kernel_size_5, stride=1
        ).to(device, dtype=dtype)

        # TODO
        ave_time_1 = benchmark_forward(layer_1, inputs_1)
        ave_time_2 = benchmark_forward(layer_2, inputs_2)
        ave_time_3 = benchmark_forward(layer_3, inputs_3)
        ave_time_4 = benchmark_forward(layer_4, inputs_4)
        ave_time_5 = benchmark_forward(layer_5, inputs_5)

        outputs_1 = layer_1(inputs_1)
        outputs_2 = layer_2(inputs_2)
        outputs_3 = layer_3(inputs_3)
        outputs_4 = layer_4(inputs_4)
        outputs_5 = layer_5(inputs_5)

        total_flop_1 = (
            kernel_size_1 * kernel_size_1 * in_channels_1 * out_channels
            * outputs_1.shape[-1] * outputs_1.shape[-2] * batch_size_1
        )
        total_flop_2 = (
            kernel_size_2 * kernel_size_2 * in_channels_2 * out_channels
            * outputs_2.shape[-1] * outputs_2.shape[-2] * batch_size_2
        )
        total_flop_3 = (
            kernel_size_3 * kernel_size_3 * in_channels_3 * out_channels
            * outputs_3.shape[-1] * outputs_3.shape[-2] * batch_size_3
        )
        total_flop_4 = (
            kernel_size_4 * kernel_size_4 * in_channels_4 * out_channels
            * outputs_4.shape[-1] * outputs_4.shape[-2] * batch_size_4
        )
        total_flop_5 = (
            kernel_size_5 * kernel_size_5 * in_channels_5 * out_channels
            * outputs_5.shape[-1] * outputs_5.shape[-2] * batch_size_5
        )

        print("OUTPUT SHAPES: 1,2,3,4,5 (respectively)")
        print(outputs_1.shape)
        print(outputs_2.shape)
        print(outputs_3.shape)
        print(outputs_4.shape)
        print(outputs_5.shape)

        # TODO
        print("flop_1 = ", total_flop_1, "ave_time_1 = ", ave_time_1)
        ave_flops_1 = total_flop_1 / ave_time_1 * batch_size_1
        runtime_1 = time.time() - start
        print("runtime_1=", runtime_1, "ave_flops_1=", ave_flops_1)

        print("flop_2 = ", total_flop_2, "ave_time_2 = ", ave_time_2)
        ave_flops_2 = total_flop_2 / ave_time_2 * batch_size_2
        runtime_2 = time.time() - start
        print("runtime_2=", runtime_2, "ave_flops_2=", ave_flops_2)

        print("flop_3 = ", total_flop_3, "ave_time_3 = ", ave_time_3)
        ave_flops_3 = total_flop_3 / ave_time_3 * batch_size_3
        runtime_3 = time.time() - start
        print("runtime_3=", runtime_3, "ave_flops_3=", ave_flops_3)

        print("flop_4 = ", total_flop_4, "ave_time_4 = ", ave_time_4)
        ave_flops_4 = total_flop_4 / ave_time_4 * batch_size_4
        runtime_4 = time.time() - start
        print("runtime_4=", runtime_4, "ave_flops_4=", ave_flops_4)

        print("flop_5 = ", total_flop_5, "ave_time_5 = ", ave_time_5)
        ave_flops_5 = total_flop_5 / ave_time_5 * batch_size_5
        runtime_5 = time.time() - start
        print("runtime_5=", runtime_5, "ave_flops_5=", ave_flops_5)

        # TODO return tuple
        return ave_flops_1
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        # KGF: random addition...
        # logger.exception("exception raised")
        return 0.0