def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        seq_length = point["seq_length"]
        in_features = point["in_features"]
        hidden_units = point["hidden_units"]
        num_layers = point["num_layers"]
        # out_features = point["out_features"]
        bias = int(point["bias"]) == 1
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        init_mem = None
        if use_cuda:
            init_mem = get_first_gpu_memory_usage()

        inputs = torch.arange(
            seq_length * batch_size * in_features,
            dtype=dtype,
            device=device,
            requires_grad=True,
        ).view((seq_length, batch_size, in_features))

        layer = torch.nn.GRU(
            in_features,
            hidden_units,
            num_layers,
            # out_features,
            bias=bias,
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs, init_mem=init_mem)

        # See Dey (2017), GRU has 3*(n^2 + m*n + n) trainable parameters across
        # 6x matrices and 3x bias vectors (size 1xn), where m = input dim, n = hidden dim
        #
        # Or, consider only matrix-vector mults, using 3x combined matrices:
        # [x, h]*A, for A=W_z, W_r, W all (m+n) x n, vector is 1x(m+n)
        # ---> 3*(m+n)*n MACs
        # ---> 3*(2*(m+n) - 1)*n FLOPs
        total_flop = (3 * seq_length * batch_size
                      * (2 * (in_features + hidden_units) - 1) * hidden_units)

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        print("runtime=", time.time() - start, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
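# Illustrative check of the GRU FLOP formula used above. The helper below is
# hypothetical (not called by run()) and the sizes are made-up examples:
# with m = in_features = 4, n = hidden_units = 8, seq_length = 2, batch_size = 3,
#   total_flop = 3 * 2 * 3 * (2 * (4 + 8) - 1) * 8 = 3312
# i.e. 3 gate matmuls of a 1x(m+n) vector with an (m+n)xn matrix per timestep
# and batch element.
def _gru_flop_example():
    m, n, seq, batch = 4, 8, 2, 3  # example sizes only
    flops = 3 * seq * batch * (2 * (m + n) - 1) * n
    assert flops == 3312
    return flops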
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(
            batch_size * image_size * image_size * in_channels,
            dtype=dtype,
            device=device,
        ).view((batch_size, in_channels, image_size, image_size))

        layer = torch.nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            # padding="same"
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs)

        # flops, params = get_model_complexity_info(
        #     layer, tuple(inputs.shape[1:]), as_strings=False
        # )
        # print(flops)
        outputs = layer(inputs)
        total_flop = (kernel_size * kernel_size * in_channels * out_channels
                      * outputs.shape[-1] * outputs.shape[-2] * batch_size)
        print(outputs.shape)

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        # KGF: random addition...
        # logger.exception("exception raised")
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        height = point["height"]
        width = point["width"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(
            batch_size * height * width * in_channels, dtype=dtype, device=device
        ).view((batch_size, in_channels, height, width))

        layer = torch.nn.Conv2d(
            in_channels, out_channels, (kernel_size, kernel_size), stride=1, padding=1
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs)

        # flops, params = get_model_complexity_info(layer,
        #     tuple(inputs.shape[1:]), as_strings=False)
        # print('ptflops=', flops * batch_size)
        outputs = layer(inputs)
        # print('shapes: ', inputs.shape, outputs.shape)
        total_flop = (
            kernel_size
            * kernel_size
            * in_channels
            * out_channels
            * outputs.shape[-1]
            * outputs.shape[-2]
            * batch_size
        )

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
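# Worked example of the Conv2d FLOP estimate used above. The helper is
# hypothetical (not used by run()) and the numbers are illustrative only:
# with kernel_size=3, stride=1, padding=1 a 32x32 input keeps its spatial size,
# since (32 - 3 + 2*1)/1 + 1 = 32, so for in_channels=3, out_channels=16,
# batch_size=2 the estimate is 3*3 * 3 * 16 * 32 * 32 * 2 = 884,736.
def _conv2d_flop_example():
    kernel_size, in_channels, out_channels = 3, 3, 16  # example sizes only
    height = width = 32
    batch_size = 2
    out_h = (height - kernel_size + 2 * 1) // 1 + 1  # padding=1, stride=1
    out_w = (width - kernel_size + 2 * 1) // 1 + 1
    flops = (kernel_size * kernel_size * in_channels * out_channels
             * out_h * out_w * batch_size)
    assert (out_h, out_w, flops) == (32, 32, 884736)
    return flops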
def run(point):
    start = time.time()
    # memorymon = mp.Process(target=print_mem_cpu)
    # memorymon.start()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(batch_size * image_size**3 * in_channels,
                              dtype=dtype,
                              device=device).view(
                                  (batch_size, in_channels, image_size,
                                   image_size, image_size))
        layer = torch.nn.Conv3d(in_channels,
                                out_channels,
                                kernel_size,
                                stride=1,
                                padding=1).to(device, dtype=dtype)
        # layer.eval()

        ave_time = benchmark_forward(layer, inputs)

        outputs = layer(inputs)
        total_flop = (kernel_size**3 * in_channels * out_channels *
                      outputs.shape[-1] * outputs.shape[-2] * outputs.shape[-3] *
                      batch_size)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        # memorymon.terminate()
        # memorymon.join()
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        # memorymon.terminate()
        # memorymon.join()
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(batch_size * image_size * in_channels,
                              dtype=dtype,
                              device=device).view(
                                  (batch_size, in_channels, image_size))
        layer = torch.nn.Conv1d(
            in_channels, out_channels, kernel_size, stride=1, padding=1
        ).to(
            # KGF: unlike linear_run.py, dtype=float causes:
            # RuntimeError: Input type (torch.cuda.FloatTensor) and
            # weight type (torch.cuda.DoubleTensor) should be the same
            # device, float)
            device,
            dtype=dtype,
        )  # torch.float32 = torch.float

        ave_time = benchmark_forward(layer, inputs)

        outputs = layer(inputs)
        total_flop = (kernel_size * in_channels * out_channels *
                      outputs.shape[-1] * batch_size)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        in_channels = point["in_channels"]
        out_channels = point["out_channels"]
        kernel_size = point["kernel_size"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs = torch.arange(
            batch_size * image_size * image_size * in_channels,
            dtype=dtype,
            device=device,
        ).view((batch_size, in_channels, image_size, image_size))
        layer = torch.nn.Conv2d(in_channels, out_channels,
                                (kernel_size, kernel_size),
                                stride=1).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs)

        outputs = layer(inputs)
        total_flop = (kernel_size * kernel_size * in_channels * out_channels *
                      outputs.shape[-1] * outputs.shape[-2] * batch_size)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        in_features = point["in_features"]
        out_features = point["out_features"]
        bias = int(point["bias"]) == 1
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        # KGF: check GPU memory usage baseline after loading PyTorch, but before any
        # inputs, model, etc. are defined
        init_mem = None
        if use_cuda:
            init_mem = get_first_gpu_memory_usage()

        # KGF: attempt to max-out V100 utilization in nvidia-smi for a sustained time:
        # batch_size *= 100

        inputs = torch.arange(batch_size * in_features,
                              dtype=dtype,
                              device=device,
                              requires_grad=True).view(
                                  (batch_size, in_features))
        # .type(dtype)

        # manually computing flops from the formulas given here:
        # https://machinethink.net/blog/how-fast-is-my-model/
        #
        # FC layer: x*W, x is 1xI vector, W is IxJ matrix
        # ----> I*J MACs
        # dot product (specifically) of n MACs = n multiplications, n-1 adds
        # FC layer computes J dot products:
        # ----> (2I-1)*J FLOPs
        total_flop = batch_size * (2 * in_features - 1) * out_features

        layer = torch.nn.Linear(in_features, out_features, bias=bias).to(
            device, dtype=dtype)  # dtype=float != torch.float

        ave_time = benchmark_forward(layer, inputs, init_mem=init_mem)

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        print("runtime=", time.time() - start, "ave_flops=", ave_flops)

        # inputs = torch.arange(
        #     batch_size * in_features, dtype=dtype, device=device,
        # )
        # flop, params = profile(layer, inputs=(inputs,))
        # print("-------")
        # print("THOP")
        # print("-------")
        # # E.g. for "in_features": 8153, "out_features": 7533, no bias
        # 8153*7533 = 61,416,549 trainable parameters
        # (# of trainable params = number of MACs, in this case)
        # ----> 6871*(2*8153 -1)*7533 = 843934457115 FLOP
        # But THOP returns:
        # 843934466048.0 flop 61416548.0 parameters
        # = 8933 more FLOP, 1 fewer trainable parameter
        # BUG: https://github.com/Lyken17/pytorch-OpCounter/issues/71
        # print(f"{flop} flop {params} parameters")
        # KGF: note, THOP documentation is inconsistent; claims MACs, but must be FLOPs
        # flop, params = clever_format([flop, params], "%.3f")
        # print(f"{flop} flop {params} parameters")
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
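# Small sanity check of the parameter/MAC bookkeeping above. The helper is
# hypothetical (not called by run()) and the default sizes are illustrative:
# a bias-free Linear layer with in_features=I and out_features=J holds exactly
# I*J trainable weights, and its forward pass is J dot products of length I,
# i.e. (2*I - 1)*J FLOPs per sample. For the sizes quoted in the comment above
# (I=8153, J=7533) that is 61,416,549 weights and 122,825,565 FLOPs per sample.
def _linear_flop_example(I=64, J=32):
    import torch
    layer = torch.nn.Linear(I, J, bias=False)
    n_params = sum(p.numel() for p in layer.parameters())
    assert n_params == I * J
    return (2 * I - 1) * J  # per-sample FLOP estimate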
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv1_kern = point["conv1_kern"]
        pool_size_1 = point["pool_size_1"]
        pool_size_2 = point["pool_size_2"]
        pool_size_5 = point["pool_size_5"]
        conv2_out_chan = point["conv2_out_chan"]
        conv2_kern = point["conv2_kern"]
        conv3_out_chan = point["conv3_out_chan"]
        conv3_kern = point["conv3_kern"]
        conv4_out_chan = point["conv4_out_chan"]
        conv4_kern = point["conv4_kern"]
        conv5_out_chan = point["conv5_out_chan"]
        conv5_kern = point["conv5_kern"]
        adaptive_pool_dim = point["adaptive_pool_dim"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        fc3_out = point["fc3_out"]
        print(point)
        import torch
        import torch.nn as nn

        device, dtype = load_cuda_vs_knl(point)

        class AlexNet(nn.Module):
            def __init__(
                self,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv1_kern,
                pool_size_1,
                pool_size_2,
                pool_size_5,
                conv2_out_chan,
                conv2_kern,
                conv3_out_chan,
                conv3_kern,
                conv4_out_chan,
                conv4_kern,
                conv5_out_chan,
                conv5_kern,
                adaptive_pool_dim,
                fc1_out,
                fc2_out,
                fc3_out,
            ):
                super(AlexNet, self).__init__()
                self.flop = 0
                self.features = nn.Sequential(
                    # 1st conv
                    nn.Conv2d(
                        conv1_in_chan,
                        conv1_out_chan,
                        kernel_size=conv1_kern,
                        stride=4,
                        padding=2,
                    ),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=pool_size_1, stride=2),
                    # 2nd conv
                    nn.Conv2d(
                        conv1_out_chan,
                        conv2_out_chan,
                        kernel_size=conv2_kern,
                        padding=2,
                    ),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=pool_size_2, stride=2),
                    # 3rd conv
                    nn.Conv2d(
                        conv2_out_chan,
                        conv3_out_chan,
                        kernel_size=conv3_kern,
                        padding=1,
                    ),
                    nn.ReLU(inplace=True),
                    # 4th conv
                    nn.Conv2d(
                        conv3_out_chan,
                        conv4_out_chan,
                        kernel_size=conv4_kern,
                        padding=1,
                    ),
                    nn.ReLU(inplace=True),
                    # 5th conv
                    nn.Conv2d(
                        conv4_out_chan,
                        conv5_out_chan,
                        kernel_size=conv5_kern,
                        padding=1,
                    ),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=pool_size_5, stride=2),
                )

                # FLOP calculations for convolutional layers:
                # 1st conv2d
                layer_input_size = image_size
                print(layer_input_size)
                self.flop += (
                    conv1_kern ** 2
                    * conv1_in_chan
                    * conv1_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                # output size: (((W - K + 2P)/S) + 1)
                layer_input_size = int(((image_size - conv1_kern + 2 * 2) / 4) + 1)
                print(layer_input_size)
                layer_input_size = int(((layer_input_size - pool_size_1) / 2) + 1)
                print(layer_input_size)

                # 2nd conv2d
                self.flop += (
                    conv2_kern ** 2
                    * conv1_out_chan
                    * conv2_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv2_kern + 2 * 2) / 1) + 1
                )
                print(layer_input_size)
                layer_input_size = int(((layer_input_size - pool_size_2) / 2) + 1)
                print(layer_input_size)

                # 3rd conv2d
                self.flop += (
                    conv3_kern ** 2
                    * conv2_out_chan
                    * conv3_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv3_kern + 2 * 1) / 1) + 1
                )
                print(layer_input_size)

                # 4th conv2d
                self.flop += (
                    conv4_kern ** 2
                    * conv3_out_chan
                    * conv4_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv4_kern + 2 * 1) / 1) + 1
                )
                print(layer_input_size)

                # 5th conv2d
                self.flop += (
                    conv5_kern ** 2
                    * conv4_out_chan
                    * conv5_out_chan
                    * layer_input_size ** 2
                    * batch_size
                )
                layer_input_size = int(
                    ((layer_input_size - conv5_kern + 2 * 1) / 1) + 1
                )
                print(layer_input_size)
                layer_input_size = int(((layer_input_size - pool_size_5) / 2) + 1)
                print(layer_input_size)

                self.avgpool = nn.AdaptiveAvgPool2d(
                    (adaptive_pool_dim, adaptive_pool_dim)
                )
                self.classifier = nn.Sequential(
                    # linear 1
                    nn.Dropout(),
                    nn.Linear(conv5_out_chan * adaptive_pool_dim ** 2, fc1_out),
                    nn.ReLU(inplace=True),
                    # linear 2
                    nn.Dropout(),
                    nn.Linear(fc1_out, fc2_out),
                    nn.ReLU(inplace=True),
                    # linear 3
                    nn.Linear(fc2_out, fc3_out),
                )

                # FLOP calculations for linear layers
                # 1st linear layer
                self.flop += (
                    (2 * (conv5_out_chan * adaptive_pool_dim ** 2) - 1)
                    * fc1_out
                    * batch_size
                )
                # 2nd linear layer
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                # 3rd linear layer
                self.flop += (2 * fc2_out - 1) * fc3_out * batch_size

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                x = self.features(x)
                x = self.avgpool(x)
                x = torch.flatten(x, 1)
                x = self.classifier(x)
                return x

        inputs = torch.arange(
            batch_size * image_size ** 2 * conv1_in_chan, dtype=dtype, device=device
        ).view((batch_size, conv1_in_chan, image_size, image_size))

        # create and move model to GPU
        net = AlexNet(
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv1_kern,
            pool_size_1,
            pool_size_2,
            pool_size_5,
            conv2_out_chan,
            conv2_kern,
            conv3_out_chan,
            conv3_kern,
            conv4_out_chan,
            conv4_kern,
            conv5_out_chan,
            conv5_kern,
            adaptive_pool_dim,
            fc1_out,
            fc2_out,
            fc3_out,
        ).to(device, dtype=dtype)
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
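# Illustrative trace of the size bookkeeping used above, evaluated with the
# classic AlexNet hyperparameters (224x224 input, 11x11 conv at stride 4 with
# padding 2, 3x3 max pools at stride 2). The helper is hypothetical and the
# values are examples, not benchmark inputs:
#   conv1: (224 - 11 + 2*2)/4 + 1 -> 55      pool1: (55 - 3)/2 + 1 -> 27
#   conv2: (27 - 5 + 2*2)/1 + 1  -> 27       pool2: (27 - 3)/2 + 1 -> 13
def _alexnet_size_chain_example():
    size = 224
    size = int(((size - 11 + 2 * 2) / 4) + 1)  # conv1: kernel 11, stride 4, pad 2
    size = int(((size - 3) / 2) + 1)           # max pool: kernel 3, stride 2
    size = int(((size - 5 + 2 * 2) / 1) + 1)   # conv2: kernel 5, stride 1, pad 2
    size = int(((size - 3) / 2) + 1)           # max pool: kernel 3, stride 2
    assert size == 13
    return size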
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        seq_length = point["seq_length"]
        in_features = point["in_features"]
        hidden_units = point["hidden_units"]
        num_layers = point["num_layers"]
        # out_features = point["out_features"]
        bias = int(point["bias"]) == 1
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        init_mem = None
        if use_cuda:
            init_mem = get_first_gpu_memory_usage()

        inputs = torch.arange(
            seq_length * batch_size * in_features,
            dtype=dtype,
            device=device,
            requires_grad=True,
        ).view((seq_length, batch_size, in_features))

        layer = torch.nn.LSTM(
            in_features,
            hidden_units,
            num_layers,
            # out_features,
            bias=bias,
        ).to(device, dtype=dtype)

        ave_time = benchmark_forward(layer, inputs, init_mem=init_mem)

        # https://stats.stackexchange.com/questions/328926/how-many-parameters-are-in-a-gated-recurrent-unit-gru-recurrent-neural-network
        # See Dey (2017), LSTM has 4*(n^2 + m*n + n) trainable parameters across
        # 8x matrices and 4x bias vectors (size 1xn), where m = input dim, n = hidden dim
        #
        # Or, consider only matrix-vector mults, using 4x combined matrices:
        # [x, h]*A, for A=W_i, W_c, W_o, W_f all (m+n) x n, vector is 1x(m+n)
        # ---> 4*(m+n)*n MACs
        # ---> 4*(2*(m+n) - 1)*n FLOPs
        total_flop = (4 * seq_length * batch_size
                      * (2 * (in_features + hidden_units) - 1) * hidden_units)
        # Compare to incorrect LSTM answer from:
        # https://github.com/NVIDIA-developer-blog/code-samples/issues/7
        # which assumes input dim = hidden dim, and uses wrong matmul --> FLOPs formula

        print("flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        print("runtime=", time.time() - start, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
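# Optional sanity check on the LSTM parameter bookkeeping above. The helper is
# hypothetical (not called by run()) and the sizes are illustrative. Note that
# torch.nn.LSTM stores separate input-hidden and hidden-hidden bias vectors, so
# a single layer holds 4*(m*n + n*n + 2*n) trainable parameters, i.e. 4*n more
# than the single-bias count quoted in the comment above.
def _lstm_param_count_example(m=16, n=32):
    import torch
    layer = torch.nn.LSTM(m, n, 1, bias=True)
    n_params = sum(p.numel() for p in layer.parameters())
    assert n_params == 4 * (m * n + n * n + 2 * n)
    return n_params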
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv1_kern = point["conv1_kern"]
        pool_size = point["pool_size"]
        conv2_out_chan = point["conv2_out_chan"]
        conv2_kern = point["conv2_kern"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        fc3_out = point["fc3_out"]
        n_conv_block = point["n_conv_block"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        class Net(torch.nn.Module):
            def __init__(
                self,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv1_kern,
                pool_size,
                conv2_out_chan,
                conv2_kern,
                fc1_out,
                fc2_out,
                fc3_out,
                n_conv_block,
            ):
                super(Net, self).__init__()
                self.flop = 0
                self.conv1 = torch.nn.Conv2d(conv1_in_chan, conv1_out_chan,
                                             conv1_kern).to(device, dtype=dtype)
                self.flop += (conv1_kern**2 * conv1_in_chan * conv1_out_chan *
                              image_size**2 * batch_size)
                self.pool = torch.nn.MaxPool2d(pool_size,
                                               pool_size).to(device, dtype=dtype)
                self.conv1_size = image_size - conv1_kern + 1
                self.maxpool1_size = int((self.conv1_size - pool_size) / pool_size + 1)
                # self.flop += image_size ** 2 * conv1_out_chan * batch_size
                self.conv2 = torch.nn.Conv2d(conv1_out_chan, conv2_out_chan,
                                             conv2_kern).to(device, dtype=dtype)
                self.flop += (conv2_kern**2 * conv1_out_chan * conv2_out_chan *
                              int(image_size / pool_size)**2 * batch_size)
                # account for loop of convolutions:
                self.flop = self.flop * n_conv_block
                self.conv2_size = self.maxpool1_size - conv2_kern + 1
                self.maxpool2_size = int((self.conv2_size - pool_size) / pool_size + 1)
                self.view_size = (conv2_out_chan * self.maxpool2_size *
                                  self.maxpool2_size)
                self.fc1 = torch.nn.Linear(self.view_size,
                                           fc1_out).to(device, dtype=dtype)
                self.flop += (2 * self.view_size - 1) * fc1_out * batch_size
                self.fc2 = torch.nn.Linear(fc1_out, fc2_out).to(device, dtype=dtype)
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                self.fc3 = torch.nn.Linear(fc2_out, fc3_out).to(device, dtype=dtype)
                self.flop += (2 * fc2_out - 1) * fc3_out * batch_size

            def forward(self, x):
                block_output = torch.zeros(
                    inputs.shape[0] * n_conv_block,
                    self.view_size,
                    device=device,
                    dtype=dtype,
                )
                for i in range(n_conv_block):
                    # Will need to use this sort of strategy when we are using real
                    # datasets, not one dummy batch:
                    # batch = inputs[i * batch_size:(i + 1) * batch_size]
                    batch = inputs
                    x = self.pool(torch.nn.functional.relu(self.conv1(batch)))
                    x = self.pool(torch.nn.functional.relu(self.conv2(x)))
                    x = x.view(-1, self.view_size)
                    block_output[i * batch_size:(i + 1) * batch_size] = x
                x = torch.nn.functional.relu(self.fc1(block_output))
                x = torch.nn.functional.relu(self.fc2(x))
                x = self.fc3(x)
                return x

        inputs = torch.arange(batch_size * image_size**2 * conv1_in_chan,
                              dtype=dtype,
                              device=device).view((batch_size, conv1_in_chan,
                                                   image_size, image_size))

        net = Net(
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv1_kern,
            pool_size,
            conv2_out_chan,
            conv2_kern,
            fc1_out,
            fc2_out,
            fc3_out,
            n_conv_block,
        )
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv1_kern = point["conv1_kern"]
        pool_size = point["pool_size"]
        conv2_out_chan = point["conv2_out_chan"]
        conv2_kern = point["conv2_kern"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        fc3_out = point["fc3_out"]
        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        class Net(torch.nn.Module):
            def __init__(
                self,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv1_kern,
                pool_size,
                conv2_out_chan,
                conv2_kern,
                fc1_out,
                fc2_out,
                fc3_out,
            ):
                super(Net, self).__init__()
                self.flop = 0
                self.conv1 = torch.nn.Conv2d(
                    conv1_in_chan, conv1_out_chan, conv1_kern
                ).to(device, dtype=dtype)
                self.flop += (
                    conv1_kern ** 2
                    * conv1_in_chan
                    * conv1_out_chan
                    * image_size ** 2
                    * batch_size
                )
                print(self.flop)
                self.pool = torch.nn.MaxPool2d(pool_size, pool_size).to(
                    device, dtype=dtype
                )
                self.conv1_size = image_size - conv1_kern + 1
                self.maxpool1_size = int((self.conv1_size - pool_size) / pool_size + 1)
                self.flop += image_size ** 2 * conv1_out_chan * batch_size
                self.conv2 = torch.nn.Conv2d(
                    conv1_out_chan, conv2_out_chan, conv2_kern
                ).to(device, dtype=dtype)
                self.flop += (
                    conv2_kern ** 2
                    * conv1_out_chan
                    * conv2_out_chan
                    * int(image_size / pool_size) ** 2
                    * batch_size
                )
                print(self.flop)
                self.conv2_size = self.maxpool1_size - conv2_kern + 1
                self.maxpool2_size = int((self.conv2_size - pool_size) / pool_size + 1)
                self.view_size = (
                    conv2_out_chan * self.maxpool2_size * self.maxpool2_size
                )
                self.fc1 = torch.nn.Linear(self.view_size, fc1_out).to(
                    device, dtype=dtype
                )
                self.flop += (2 * self.view_size - 1) * fc1_out * batch_size
                self.fc2 = torch.nn.Linear(fc1_out, fc2_out).to(device, dtype=dtype)
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                self.fc3 = torch.nn.Linear(fc2_out, fc3_out).to(device, dtype=dtype)
                self.flop += (2 * fc2_out - 1) * fc3_out * batch_size

            def forward(self, x):
                x = self.pool(torch.nn.functional.relu(self.conv1(x)))
                x = self.pool(torch.nn.functional.relu(self.conv2(x)))
                x = x.view(-1, self.view_size)
                x = torch.nn.functional.relu(self.fc1(x))
                x = torch.nn.functional.relu(self.fc2(x))
                x = self.fc3(x)
                return x

        inputs = torch.arange(
            batch_size * image_size ** 2 * conv1_in_chan, dtype=dtype, device=device
        ).view((batch_size, conv1_in_chan, image_size, image_size))

        net = Net(
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv1_kern,
            pool_size,
            conv2_out_chan,
            conv2_kern,
            fc1_out,
            fc2_out,
            fc3_out,
        )
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        num_classes = point["num_classes"]
        batch_size = point["batch_size"]
        image_size = point["image_size"]
        conv1_in_chan = point["conv1_in_chan"]
        conv1_out_chan = point["conv1_out_chan"]
        conv_kern = point["conv_kern"]
        pool_size = point["pool_size"]
        conv2_out_chan = point["conv2_out_chan"]
        conv3_out_chan = point["conv3_out_chan"]
        conv4_out_chan = point["conv4_out_chan"]
        conv5_out_chan = point["conv5_out_chan"]
        adaptive_pool_dim = point["adaptive_pool_dim"]
        fc1_out = point["fc1_out"]
        fc2_out = point["fc2_out"]
        print(point)
        import torch
        import torch.nn as nn
        from typing import cast, Dict, List, Union

        device, dtype = load_cuda_vs_knl(point)

        class VGG(nn.Module):
            def __init__(
                self,
                features,
                num_classes,
                batch_size,
                image_size,
                conv1_in_chan,
                conv1_out_chan,
                conv2_out_chan,
                conv3_out_chan,
                conv4_out_chan,
                conv5_out_chan,
                conv_kern,
                pool_size,
                adaptive_pool_dim,
                fc1_out,
                fc2_out,
            ) -> None:
                super(VGG, self).__init__()
                self.flop = 0
                self.features = features

                # FLOP calculations for convolutional layers:
                layer_input_size = image_size

                # 1st block of convolutional layers
                for i in range(2):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv1_in_chan * conv1_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv1_out_chan * conv1_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 2nd block of convolutional layers
                for i in range(2):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv1_out_chan * conv2_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv2_out_chan * conv2_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 3rd block of convolutional layers
                for i in range(3):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv2_out_chan * conv3_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv3_out_chan * conv3_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 4th block of convolutional layers
                for i in range(3):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv3_out_chan * conv4_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv4_out_chan * conv4_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                # 5th block of convolutional layers
                for i in range(3):
                    if i == 1:
                        self.flop += (conv_kern**2 * conv4_out_chan * conv5_out_chan *
                                      layer_input_size**2 * batch_size)
                    else:
                        self.flop += (conv_kern**2 * conv5_out_chan * conv5_out_chan *
                                      layer_input_size**2 * batch_size)
                    layer_input_size = int(
                        ((layer_input_size - conv_kern + 2 * 1) / 1) + 1)
                # Reshape for max pool layer:
                layer_input_size = int(((layer_input_size - pool_size) / 2) + 1)

                self.avgpool = nn.AdaptiveAvgPool2d(
                    (adaptive_pool_dim, adaptive_pool_dim))
                self.classifier = nn.Sequential(
                    nn.Linear(
                        conv5_out_chan * adaptive_pool_dim * adaptive_pool_dim,
                        fc1_out),
                    nn.ReLU(True),
                    nn.Dropout(),
                    nn.Linear(fc1_out, fc2_out),
                    nn.ReLU(True),
                    nn.Dropout(),
                    nn.Linear(fc2_out, num_classes),
                )

                # FLOP calculations for linear layers
                # 1st linear layer
                self.flop += ((2 * (conv5_out_chan * adaptive_pool_dim**2) - 1) *
                              fc1_out * batch_size)
                # 2nd linear layer
                self.flop += (2 * fc1_out - 1) * fc2_out * batch_size
                # 3rd linear layer
                self.flop += (2 * fc2_out - 1) * num_classes * batch_size

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                x = self.features(x)
                x = self.avgpool(x)
                x = torch.flatten(x, 1)
                x = self.classifier(x)
                return x

        def make_layers(cfg: List[Union[str, int]],
                        batch_norm: bool = False) -> nn.Sequential:
            layers: List[nn.Module] = []
            in_channels = conv1_in_chan
            for v in cfg:
                if v == 'M':
                    layers += [nn.MaxPool2d(kernel_size=pool_size, stride=2)]
                else:
                    v = cast(int, v)
                    conv2d = nn.Conv2d(in_channels, v, kernel_size=conv_kern,
                                       padding=1)
                    if batch_norm:
                        layers += [
                            conv2d,
                            nn.BatchNorm2d(v),
                            nn.ReLU(inplace=True)
                        ]
                    else:
                        layers += [conv2d, nn.ReLU(inplace=True)]
                    in_channels = v
            return nn.Sequential(*layers)

        cfgs: Dict[str, List[Union[str, int]]] = {
            'VGG16': [
                conv1_out_chan, conv1_out_chan, 'M',
                conv2_out_chan, conv2_out_chan, 'M',
                conv3_out_chan, conv3_out_chan, conv3_out_chan, 'M',
                conv4_out_chan, conv4_out_chan, conv4_out_chan, 'M',
                conv5_out_chan, conv5_out_chan, conv5_out_chan, 'M'
            ],
        }

        inputs = torch.arange(batch_size * image_size**2 * conv1_in_chan,
                              dtype=dtype,
                              device=device).view((batch_size, conv1_in_chan,
                                                   image_size, image_size))

        # create and move model to GPU
        # "version D" is VGG-16
        net = VGG(
            make_layers(cfgs['VGG16'], batch_norm=True),
            num_classes,
            batch_size,
            image_size,
            conv1_in_chan,
            conv1_out_chan,
            conv2_out_chan,
            conv3_out_chan,
            conv4_out_chan,
            conv5_out_chan,
            conv_kern,
            pool_size,
            adaptive_pool_dim,
            fc1_out,
            fc2_out,
        ).to(device, dtype=dtype)
        total_flop = net.flop

        ave_time = benchmark_forward(net, inputs)

        print("total_flop = ", total_flop, "ave_time = ", ave_time)
        ave_flops = total_flop / ave_time
        runtime = time.time() - start
        print("runtime=", runtime, "ave_flops=", ave_flops)
        return ave_flops
    except Exception as e:
        import traceback
        print("received exception: ", str(e), "for point: ", point)
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        return 0.0
def run(point):
    start = time.time()
    try:
        out_channels = point["out_channels"]

        batch_size_1 = point["batch_size"]
        image_size_1 = point["image_size"]
        in_channels_1 = point["in_channels"]
        kernel_size_1 = point["kernel_size"]

        batch_size_2 = point["batch_size"] + 1
        image_size_2 = point["image_size"] + 1
        in_channels_2 = point["in_channels"] + 1
        kernel_size_2 = point["kernel_size"] + 1

        batch_size_3 = point["batch_size"] + 2
        image_size_3 = point["image_size"] + 2
        in_channels_3 = point["in_channels"] + 2
        kernel_size_3 = point["kernel_size"] + 2

        batch_size_4 = point["batch_size"] + 3
        image_size_4 = point["image_size"] + 3
        in_channels_4 = point["in_channels"] + 3
        kernel_size_4 = point["kernel_size"] + 3

        batch_size_5 = point["batch_size"] + 4
        image_size_5 = point["image_size"] + 4
        in_channels_5 = point["in_channels"] + 4
        kernel_size_5 = point["kernel_size"] + 4

        print(point)
        import torch

        device, dtype = load_cuda_vs_knl(point)

        inputs_1 = torch.arange(
            batch_size_1 * image_size_1 * image_size_1 * in_channels_1,
            dtype=dtype,
            device=device,
        ).view((batch_size_1, in_channels_1, image_size_1, image_size_1))
        inputs_2 = torch.arange(
            batch_size_2 * image_size_2 * image_size_2 * in_channels_2,
            dtype=dtype,
            device=device,
        ).view((batch_size_2, in_channels_2, image_size_2, image_size_2))
        inputs_3 = torch.arange(
            batch_size_3 * image_size_3 * image_size_3 * in_channels_3,
            dtype=dtype,
            device=device,
        ).view((batch_size_3, in_channels_3, image_size_3, image_size_3))
        inputs_4 = torch.arange(
            batch_size_4 * image_size_4 * image_size_4 * in_channels_4,
            dtype=dtype,
            device=device,
        ).view((batch_size_4, in_channels_4, image_size_4, image_size_4))
        inputs_5 = torch.arange(
            batch_size_5 * image_size_5 * image_size_5 * in_channels_5,
            dtype=dtype,
            device=device,
        ).view((batch_size_5, in_channels_5, image_size_5, image_size_5))

        layer_1 = torch.nn.Conv2d(
            in_channels_1, out_channels, kernel_size_1, stride=1
        ).to(device, dtype=dtype)
        layer_2 = torch.nn.Conv2d(
            in_channels_2, out_channels, kernel_size_2, stride=1
        ).to(device, dtype=dtype)
        layer_3 = torch.nn.Conv2d(
            in_channels_3, out_channels, kernel_size_3, stride=1
        ).to(device, dtype=dtype)
        layer_4 = torch.nn.Conv2d(
            in_channels_4, out_channels, kernel_size_4, stride=1
        ).to(device, dtype=dtype)
        layer_5 = torch.nn.Conv2d(
            in_channels_5, out_channels, kernel_size_5, stride=1
        ).to(device, dtype=dtype)

        # TODO
        ave_time_1 = benchmark_forward(layer_1, inputs_1)
        ave_time_2 = benchmark_forward(layer_2, inputs_2)
        ave_time_3 = benchmark_forward(layer_3, inputs_3)
        ave_time_4 = benchmark_forward(layer_4, inputs_4)
        ave_time_5 = benchmark_forward(layer_5, inputs_5)

        outputs_1 = layer_1(inputs_1)
        outputs_2 = layer_2(inputs_2)
        outputs_3 = layer_3(inputs_3)
        outputs_4 = layer_4(inputs_4)
        outputs_5 = layer_5(inputs_5)

        total_flop_1 = (
            kernel_size_1 * kernel_size_1 * in_channels_1 * out_channels
            * outputs_1.shape[-1] * outputs_1.shape[-2] * batch_size_1
        )
        total_flop_2 = (
            kernel_size_2 * kernel_size_2 * in_channels_2 * out_channels
            * outputs_2.shape[-1] * outputs_2.shape[-2] * batch_size_2
        )
        total_flop_3 = (
            kernel_size_3 * kernel_size_3 * in_channels_3 * out_channels
            * outputs_3.shape[-1] * outputs_3.shape[-2] * batch_size_3
        )
        total_flop_4 = (
            kernel_size_4 * kernel_size_4 * in_channels_4 * out_channels
            * outputs_4.shape[-1] * outputs_4.shape[-2] * batch_size_4
        )
        total_flop_5 = (
            kernel_size_5 * kernel_size_5 * in_channels_5 * out_channels
            * outputs_5.shape[-1] * outputs_5.shape[-2] * batch_size_5
        )

        print("OUTPUT SHAPES: 1,2,3,4,5 (respectively)")
        print(outputs_1.shape)
        print(outputs_2.shape)
        print(outputs_3.shape)
        print(outputs_4.shape)
        print(outputs_5.shape)

        # TODO
        print("flop_1 = ", total_flop_1, "ave_time_1 = ", ave_time_1)
        ave_flops_1 = total_flop_1 / ave_time_1 * batch_size_1
        runtime_1 = time.time() - start
        print("runtime_1=", runtime_1, "ave_flops_1=", ave_flops_1)

        print("flop_2 = ", total_flop_2, "ave_time_2 = ", ave_time_2)
        ave_flops_2 = total_flop_2 / ave_time_2 * batch_size_2
        runtime_2 = time.time() - start
        print("runtime_2=", runtime_2, "ave_flops_2=", ave_flops_2)

        print("flop_3 = ", total_flop_3, "ave_time_3 = ", ave_time_3)
        ave_flops_3 = total_flop_3 / ave_time_3 * batch_size_3
        runtime_3 = time.time() - start
        print("runtime_3=", runtime_3, "ave_flops_3=", ave_flops_3)

        print("flop_4 = ", total_flop_4, "ave_time_4 = ", ave_time_4)
        ave_flops_4 = total_flop_4 / ave_time_4 * batch_size_4
        runtime_4 = time.time() - start
        print("runtime_4=", runtime_4, "ave_flops_4=", ave_flops_4)

        print("flop_5 = ", total_flop_5, "ave_time_5 = ", ave_time_5)
        ave_flops_5 = total_flop_5 / ave_time_5 * batch_size_5
        runtime_5 = time.time() - start
        print("runtime_5=", runtime_5, "ave_flops_5=", ave_flops_5)

        # TODO return tuple
        return ave_flops_1
    except Exception as e:
        import traceback
        print("received exception: ", str(e))
        print(traceback.print_exc())
        print("runtime=", time.time() - start)
        # KGF: random addition...
        # logger.exception("exception raised")
        return 0.0