Пример #1
0
    def test_depthwise_convolution(self, batch_size, gc, dc):
        """Compare a grouped 1x1 Conv on ``dc[1]`` against ``dc[0]``.

        The same random inputs are run three ways inside the scratch
        workspace ``_device_check_``:

        1. the operator on ``dc[0]`` (reference result),
        2. the ``dc[1]`` operator after a one-op net holding it has been
           passed through ``optimizeForMKLDNN``,
        3. the untouched ``dc[1]`` operator.

        Results 2 and 3 must each match the reference within
        ``atol=rtol=0.01``.

        Args:
            batch_size: Leading dimension of the input blob ``X``.
            gc: Not used by this test.
            dc: Indexable collection of device options; ``dc[0]`` is the
                reference device and ``dc[1]`` the device under test.
        """
        def make_conv(device_option):
            # Same operator definition for both devices; only the device
            # option differs.
            return core.CreateOperator("Conv", ["X", "w", "b"], ["Y"],
                                       stride=1,
                                       pad=0,
                                       kernel=1,
                                       group=4,
                                       device_option=device_option)

        def feed_inputs(device_option):
            workspace.FeedBlob('X', X, device_option)
            workspace.FeedBlob('w', w, device_option)
            workspace.FeedBlob('b', b, device_option)

        def expect_close(expected, actual, case):
            if not np.allclose(expected, actual, atol=0.01, rtol=0.01):
                # Dump both tensors and the worst deviation before failing.
                print(actual.flatten())
                print(expected.flatten())
                print(np.max(np.abs(actual - expected)))
                self.fail("{}: output differs from the reference".format(case))

        op = make_conv(dc[0])
        op1 = make_conv(dc[1])
        X = np.random.rand(batch_size, 544, 14, 14).astype(np.float32)
        w = np.random.rand(544, 136, 1, 1).astype(np.float32)
        b = np.random.rand(544).astype(np.float32)

        # Remember the caller's workspace so it is restored even when an
        # assertion fails part-way through.
        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        try:
            # Reference run on dc[0].
            feed_inputs(dc[0])
            workspace.RunOperatorOnce(op)
            Y0 = workspace.FetchBlob('Y')

            # dc[1] operator after the net has been through the optimizer.
            workspace.ResetWorkspace()
            feed_inputs(dc[1])
            net = core.Net("net")
            old_net = caffe2_pb2.NetDef()
            old_net.op.extend([op1])
            net.Proto().CopyFrom(old_net)
            optimizeForMKLDNN(net)
            workspace.RunOperatorOnce(net.Proto().op[0])
            expect_close(Y0, workspace.FetchBlob('Y'), "optimized Conv")

            # dc[1] operator exactly as created.
            workspace.ResetWorkspace()
            feed_inputs(dc[1])
            workspace.RunOperatorOnce(op1)
            expect_close(Y0, workspace.FetchBlob('Y'), "plain Conv")
        finally:
            workspace.SwitchWorkspace(old_ws_name)
Пример #2
0
    def test_convolution_affch_folding(
            self, stride, pad, kernel, size, input_channels,
            output_channels, batch_size, use_bias, group,
            inplace, gc, dc):
        """Check that Conv + AffineChannel is folded into one Conv on ``dc[1]``.

        The two operators are first run one after the other on ``dc[1]`` to
        get the reference output. A net holding the same pair is then passed
        through ``optimizeForMKLDNN``; the test asserts that exactly one
        ``Conv`` operator remains and that running it reproduces the
        reference within ``atol=rtol=0.01``.

        Args:
            stride, pad, kernel, group: Forwarded to the ``Conv`` operator.
            size: Height and width of the square input.
            input_channels: Input channels per group.
            output_channels: Output channels per group.
            batch_size: Leading dimension of the input blob.
            use_bias: When true the Conv also consumes the bias blob ``b0``.
            inplace: When true AffineChannel writes back into ``X1``;
                otherwise it writes to ``Y``.
            gc: Not used by this test.
            dc: Indexable collection of device options; only ``dc[1]`` is
                used.
        """
        out_blob = 'X1' if inplace else "Y"
        conv = core.CreateOperator(
            "Conv",
            ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
            ["X1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[1]
        )
        affch = core.CreateOperator(
            "AffineChannel",
            ["X1", "scale", "bias"],
            [out_blob],
            device_option=dc[1]
        )

        # The filter bank below has output_channels * group filters, so that
        # is the channel count AffineChannel receives.
        conv_out_channels = output_channels * group

        X = np.random.rand(
            batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
        w = np.random.rand(
                conv_out_channels, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(conv_out_channels).astype(np.float32) - 0.5
        # One scale/bias entry per Conv output channel. Sizing these with
        # output_channels alone only matches the Conv output when group == 1.
        scale = np.random.rand(conv_out_channels).astype(np.float32) + 0.5
        bias = np.random.rand(conv_out_channels).astype(np.float32) - 0.5

        def feed_inputs():
            # b0 is fed unconditionally; it is simply not consumed when
            # use_bias is false.
            workspace.FeedBlob('X0', X, dc[1])
            workspace.FeedBlob('w0', w, dc[1])
            workspace.FeedBlob('b0', b, dc[1])
            workspace.FeedBlob('scale', scale, dc[1])
            workspace.FeedBlob('bias', bias, dc[1])

        def on_test_device(op):
            clone = caffe2_pb2.OperatorDef()
            clone.CopyFrom(op)
            clone.device_option.CopyFrom(dc[1])
            return clone

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        try:
            # Reference: run the two operators separately.
            feed_inputs()
            workspace.RunOperatorOnce(conv)
            workspace.RunOperatorOnce(affch)
            Y = workspace.FetchBlob(out_blob)

            # Folded: let the optimizer rewrite the two-op net.
            workspace.ResetWorkspace()
            old_net = caffe2_pb2.NetDef()
            old_net.op.extend([on_test_device(conv), on_test_device(affch)])
            feed_inputs()
            net = core.Net("net")
            net.Proto().CopyFrom(old_net)
            optimizeForMKLDNN(net)
            self.assertEqual(len(net.Proto().op), 1)
            self.assertEqual(net.Proto().op[0].type, "Conv")
            workspace.RunOperatorOnce(net.Proto().op[0])
            Y1 = workspace.FetchBlob(out_blob)
            if not np.allclose(Y, Y1, atol=0.01, rtol=0.01):
                print(Y.flatten())
                print(Y1.flatten())
                print(np.max(np.abs(Y - Y1)))
                self.fail("folded Conv output differs from Conv + AffineChannel")
        finally:
            # Restore the caller's workspace even if an assertion failed.
            workspace.SwitchWorkspace(old_ws_name)
Пример #3
0
    def test_convolution_grouped_sum_relu_fusion(self, stride, pad, kernel, size,
                             input_channels, output_channels,
                             batch_size, use_bias, group, gc, dc):
        """Compare Conv, Conv, Sum, Relu on ``dc[0]`` with the optimized net on ``dc[1]``.

        Two grouped convolutions are summed in place into ``S0`` and passed
        through an in-place Relu. The reference is produced by running the
        four operators one by one on ``dc[0]``. Copies of the operators
        retargeted to ``dc[1]`` are then placed in a net, the net is handed
        to ``optimizeForMKLDNN`` and run, and the first output of its last
        operator must match the reference within ``atol=rtol=0.01``.

        ``gc`` is not used by this test.
        """
        def grouped_conv(blob_names, out_name):
            # The bias blob is the third input and is dropped when
            # use_bias is false.
            return core.CreateOperator(
                "Conv",
                blob_names if use_bias else blob_names[:2],
                [out_name],
                stride=stride,
                pad=pad,
                kernel=kernel,
                group=group,
                device_option=dc[0]
            )

        def centered(*shape):
            return np.random.rand(*shape).astype(np.float32) - 0.5

        def feed_all(device_option):
            for blob_name, value in named_inputs:
                workspace.FeedBlob(blob_name, value, device_option)

        def on_test_device(op):
            clone = caffe2_pb2.OperatorDef()
            clone.CopyFrom(op)
            clone.device_option.CopyFrom(dc[1])
            return clone

        conv_S0 = grouped_conv(["SX0", "Sw0", "Sb0"], "S0")
        conv = grouped_conv(["X0", "w0", "b0"], "Y0")
        sum_op = core.CreateOperator(
            "Sum", ["S0", "Y0"], ["S0"], device_option=dc[0])
        relu = core.CreateOperator(
            "Relu", ["S0"], ["S0"], device_option=dc[0])
        pipeline = [conv_S0, conv, sum_op, relu]

        input_shape = (batch_size, input_channels * group, size, size)
        filter_shape = (output_channels * group, input_channels, kernel, kernel)
        SX = centered(*input_shape)
        Sw = centered(*filter_shape)
        Sb = centered(output_channels * group)
        X = centered(*input_shape)
        w = centered(*filter_shape)
        b = centered(output_channels * group)
        named_inputs = (
            ('SX0', SX), ('Sw0', Sw), ('Sb0', Sb),
            ('X0', X), ('w0', w), ('b0', b),
        )

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

        # Reference pass on dc[0], one operator at a time.
        feed_all(dc[0])
        for op in pipeline:
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        # Optimized pass on dc[1].
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([on_test_device(op) for op in pipeline])
        feed_all(dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        workspace.RunNetOnce(net.Proto())
        # Conv/Sum fusion may rename the output tensor, so the name is read
        # back from the last operator of the optimized net.
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            for item in (S2.flatten(), S0.flatten(), np.max(np.abs(S2 - S0))):
                print(item)
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Пример #4
0
    def test_convolution_relu_fusion(self, stride, pad, kernel, size,
                             input_channels, output_channels,
                             batch_size, use_bias, group, gc, dc):
        """Check Conv + Relu fusion, hand-written and produced by the optimizer.

        The reference is Conv followed by an in-place Relu on ``dc[0]``.
        It is compared (``atol=rtol=0.01``) against:

        * a hand-built ``ConvFusion`` operator with ``fusion_type=1`` on
          ``dc[1]``;
        * the single operator left after ``optimizeForMKLDNN`` processes a
          net holding the Conv and Relu retargeted to ``dc[1]``. The test
          asserts that operator is of type ``ConvFusion``.

        ``gc`` is not used by this test.
        """
        def conv_inputs(x_name, w_name, b_name):
            return [x_name, w_name, b_name] if use_bias else [x_name, w_name]

        def feed(blob_names, device_option):
            for blob_name, value in zip(blob_names, (X, w, b)):
                workspace.FeedBlob(blob_name, value, device_option)

        def on_test_device(op):
            clone = caffe2_pb2.OperatorDef()
            clone.CopyFrom(op)
            clone.device_option.CopyFrom(dc[1])
            return clone

        def expect_close(expected, actual):
            if np.allclose(expected, actual, atol=0.01, rtol=0.01):
                return
            print(actual.flatten())
            print(expected.flatten())
            print(np.max(np.abs(actual - expected)))
            self.assertTrue(False)

        conv = core.CreateOperator(
            "Conv", conv_inputs("X0", "w0", "b0"), ["Y0"],
            stride=stride, pad=pad, kernel=kernel, group=group,
            device_option=dc[0])
        relu = core.CreateOperator(
            "Relu", ["Y0"], ["Y0"], device_option=dc[0])

        # Hand-written fusion of Conv + ReLU.
        conv_fusion = core.CreateOperator(
            "ConvFusion", conv_inputs("X1", "w1", "b1"), ["Y1"],
            stride=stride, pad=pad, kernel=kernel, group=group,
            fusion_type=1,
            device_option=dc[1])

        X = np.random.rand(
            batch_size, input_channels * group, size, size
        ).astype(np.float32) - 0.5
        w = np.random.rand(
            output_channels * group, input_channels, kernel, kernel
        ).astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

        # Reference: separate Conv and Relu on dc[0].
        feed(('X0', 'w0', 'b0'), dc[0])
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(relu)
        Y0 = workspace.FetchBlob('Y0')

        # Manual fusion on dc[1].
        workspace.ResetWorkspace()
        feed(('X1', 'w1', 'b1'), dc[1])
        workspace.RunOperatorOnce(conv_fusion)
        expect_close(Y0, workspace.FetchBlob('Y1'))

        # Automatic fusion of Conv + ReLU by the optimizer.
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([on_test_device(conv), on_test_device(relu)])
        feed(('X0', 'w0', 'b0'), dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        fused_ops = net.Proto().op
        self.assertTrue(len(fused_ops) == 1)
        self.assertTrue(fused_ops[0].type == "ConvFusion")
        workspace.RunOperatorOnce(fused_ops[0])
        expect_close(Y0, workspace.FetchBlob('Y0'))

        workspace.SwitchWorkspace(old_ws_name)
Пример #5
0
    def test_convolution_sum_fusion(self, stride, pad, kernel, size,
                             input_channels, output_channels,
                             batch_size, use_bias, group, sum_add, gc, dc):
        """Exercise Conv + Sum fusion, hand-built and via ``optimizeForMKLDNN``.

        A reference result is produced on ``dc[0]`` by running MaxPool, Conv
        and an in-place ``sum_add`` operator one by one. It is compared
        (``atol=rtol=0.01``) against these runs on ``dc[1]``:

        1. manual fusion: MaxPool followed by a hand-built ``ConvFusion``
           with ``fusion_type=2``;
        2. optimizer, op order [MaxPool, Conv, Sum]: expects 2 ops, the
           second being ``ConvFusion``;
        3. optimizer, op order [Conv, MaxPool, Sum]: same expectation;
        4. optimizer, [Conv, Dropout(Y0), MaxPool, Sum]: the Conv output has
           a second consumer, so all 4 ops are expected to remain;
        5. optimizer, [Conv, MaxPool, Sum -> S3, Dropout(S0)]: the MaxPool
           output has a second consumer, so all 4 ops are expected to remain;
        6. a separate 1x1 pipeline [Relu, Conv, in-place Relu, MaxPool, Sum]
           with its own inputs and its own ``dc[0]`` reference, where the
           in-place Relu rewrites the Conv input blob; all 5 ops are
           expected to remain.

        Everything runs inside the scratch workspace ``_device_check_``; the
        caller's workspace is restored on exit and never reset.

        Args:
            stride, pad, kernel, group: Forwarded to the Conv operators of
                cases 1-5.
            size: Height and width of the square Conv input.
            input_channels: Input channels per group.
            output_channels: Output channels per group.
            batch_size: Leading dimension of the input blobs.
            use_bias: When true the Conv operators also consume a bias blob.
            sum_add: Operator type used for the element-wise addition in
                cases 1-5.
            gc: Not used by this test.
            dc: Indexable collection of device options; ``dc[0]`` is the
                reference device and ``dc[1]`` the device under test.
        """
        def centered(*shape):
            return np.random.rand(*shape).astype(np.float32) - 0.5

        def feed(named_blobs, device_option):
            for blob_name, value in named_blobs:
                workspace.FeedBlob(blob_name, value, device_option)

        def on_test_device(op):
            clone = caffe2_pb2.OperatorDef()
            clone.CopyFrom(op)
            clone.device_option.CopyFrom(dc[1])
            return clone

        def optimize(ops):
            # Wrap the operators in a net, optimize it and return its proto.
            net_def = caffe2_pb2.NetDef()
            net_def.op.extend(ops)
            net = core.Net("net")
            net.Proto().CopyFrom(net_def)
            optimizeForMKLDNN(net)
            return net.Proto()

        def expect_close(expected, actual, case):
            if not np.allclose(expected, actual, atol=0.01, rtol=0.01):
                print(actual.flatten())
                print(expected.flatten())
                print(np.max(np.abs(actual - expected)))
                self.fail("{}: output differs from the reference".format(case))

        pool_S0 = core.CreateOperator(
            "MaxPool",
            ["SX0"],
            ["S0"],
            stride=2,
            pad=0,
            kernel=2,
            device_option=dc[0]
        )
        conv = core.CreateOperator(
            "Conv",
            ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
            ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0]
        )
        sum_op = core.CreateOperator(
            sum_add,
            ["S0", "Y0"],
            ["S0"],
            device_option=dc[0]
        )

        # Manual fusion for Conv + Sum
        # NOTE(review): pool_S0 above carries no `group` argument while this
        # MaxPool does; presumably MaxPool ignores it - confirm before
        # removing.
        pool_S1 = core.CreateOperator(
            "MaxPool",
            ["SX1"],
            ["S1"],
            stride=2,
            pad=0,
            kernel=2,
            group=group,
            device_option=dc[1]
        )
        conv_fusion = core.CreateOperator(
            "ConvFusion",
            ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
            ["S1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=2,
            device_option=dc[1]
        )

        # The MaxPool above halves the spatial size (kernel 2, stride 2), so
        # its input is made twice as large as this per-side size.
        pool_input_size = int(
            math.ceil(float(size + 2 * pad - kernel + 1) / stride)) * 2
        SX = centered(
            batch_size, output_channels * group,
            pool_input_size, pool_input_size)
        X = centered(batch_size, input_channels * group, size, size)
        w = centered(output_channels * group, input_channels, kernel, kernel)
        b = centered(output_channels * group)

        inputs_0 = (('SX0', SX), ('X0', X), ('w0', w), ('b0', b))
        inputs_1 = (('SX1', SX), ('X1', X), ('w1', w), ('b1', b))

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        try:
            # Reference on dc[0].
            feed(inputs_0, dc[0])
            workspace.RunOperatorOnce(pool_S0)
            workspace.RunOperatorOnce(conv)
            workspace.RunOperatorOnce(sum_op)
            S0 = workspace.FetchBlob('S0')

            # Case 1: manual fusion on dc[1].
            workspace.ResetWorkspace()
            feed(inputs_1, dc[1])
            workspace.RunOperatorOnce(pool_S1)
            workspace.RunOperatorOnce(conv_fusion)
            expect_close(S0, workspace.FetchBlob('S1'), "manual fusion")

            # Auto fusion for Conv + Sum
            pool_S0_old = on_test_device(pool_S0)
            conv_old = on_test_device(conv)
            sum_old = on_test_device(sum_op)

            # Cases 2 and 3: Conv + Sum should be fused for both
            # [PreNode, Conv, Sum] and [Conv, PreNode, Sum].
            fused_orders = (
                ("[PreNode, Conv, Sum]", [pool_S0_old, conv_old, sum_old]),
                ("[Conv, PreNode, Sum]", [conv_old, pool_S0_old, sum_old]),
            )
            for case, ops in fused_orders:
                workspace.ResetWorkspace()
                feed(inputs_0, dc[1])
                proto = optimize(ops)
                self.assertEqual(len(proto.op), 2)
                self.assertEqual(proto.op[1].type, "ConvFusion")
                workspace.RunNetOnce(proto)
                # The output tensor name will be changed by optimization
                # sometimes when applying conv sum fusion
                expect_close(
                    S0, workspace.FetchBlob(proto.op[-1].output[0]), case)

            # Case 4: Conv + Sum should not be fused for
            # [Conv, midOp, preNode, Sum]; Conv output is used by midOp.
            dropout = core.CreateOperator(
                "Dropout",
                ["Y0"],
                ["Y_dropout"],
                ratio=0.5,
                is_test=True,
                device_option=dc[1]
            )
            workspace.ResetWorkspace()
            feed(inputs_0, dc[1])
            proto = optimize([conv_old, dropout, pool_S0_old, sum_old])
            self.assertEqual(len(proto.op), 4)
            workspace.RunNetOnce(proto)
            expect_close(
                S0, workspace.FetchBlob(proto.op[-1].output[0]),
                "[Conv, midOp, preNode, Sum]")

            # Case 5: Conv + Sum should not be fused for
            # [Conv, preNode, Sum, midOp]; preNode output is used by midOp.
            sum1 = core.CreateOperator(
                sum_add,
                ["S0", "Y0"],
                ["S3"],
                device_option=dc[1]
            )
            dropout = core.CreateOperator(
                "Dropout",
                ["S0"],
                ["Y_dropout"],
                ratio=0.5,
                is_test=True,
                device_option=dc[1]
            )
            workspace.ResetWorkspace()
            feed(inputs_0, dc[1])
            proto = optimize([conv_old, pool_S0_old, sum1, dropout])
            self.assertEqual(len(proto.op), 4)
            workspace.RunNetOnce(proto)
            # The Sum is second to last here; Dropout is the final operator.
            expect_close(
                S0, workspace.FetchBlob(proto.op[-2].output[0]),
                "[Conv, preNode, Sum, midOp]")

            # Case 6: Conv + Sum should not be fused for
            # [Conv, midOp, preNode, Sum] when the midOp output has the same
            # name as the Conv input.
            relu_0 = core.CreateOperator(
                "Relu",
                ["X0"],
                ["X1"],
                device_option=dc[0]
            )
            conv_1x1 = core.CreateOperator(
                "Conv",
                ["X1", "w0", "b0"] if use_bias else ["X1", "w0"],
                ["Y0"],
                stride=1,
                pad=0,
                kernel=1,
                device_option=dc[0]
            )
            relu_1 = core.CreateOperator(
                "Relu",
                ["X1"],
                ["X1"],
                device_option=dc[0]
            )
            pool = core.CreateOperator(
                "MaxPool",
                ["X1"],
                ["S0"],
                stride=1,
                pad=0,
                kernel=1,
                device_option=dc[0]
            )
            plain_sum = core.CreateOperator(
                "Sum",
                ["S0", "Y0"],
                ["S0"],
                device_option=dc[0]
            )
            pipeline = [relu_0, conv_1x1, relu_1, pool, plain_sum]

            X = centered(batch_size, input_channels, size, size)
            w = centered(input_channels, input_channels, 1, 1)
            b = centered(input_channels)
            inputs_1x1 = (('X0', X), ('w0', w), ('b0', b))

            # Stay in the scratch workspace: resetting the caller's
            # workspace here would wipe blobs this test does not own.
            workspace.ResetWorkspace()
            feed(inputs_1x1, dc[0])
            for op in pipeline:
                workspace.RunOperatorOnce(op)
            S0 = workspace.FetchBlob('S0')

            workspace.ResetWorkspace()
            feed(inputs_1x1, dc[1])
            proto = optimize([on_test_device(op) for op in pipeline])
            self.assertEqual(len(proto.op), 5)
            workspace.RunNetOnce(proto)
            expect_close(
                S0, workspace.FetchBlob(proto.op[-1].output[0]),
                "midOp overwrites Conv input")
        finally:
            # Restore the caller's workspace even if an assertion failed.
            workspace.SwitchWorkspace(old_ws_name)
Пример #6
0
    def test_preConvert(self, input_channels, batch_size):
        """Run a small training net twice and compare every output blob.

        A conv -> conv_transpose -> fc -> softmax model with a
        label-cross-entropy loss and gradient operators is built under an
        IDEEP device scope. After parameter initialisation, the non-first
        inputs of every operator that precedes the Softmax are captured. The
        net is passed through ``optimizeForMKLDNN(..., training_mode=True)``
        and run in the current workspace. The same net object is then run
        again in the ``_device_check_`` workspace with the same ``X``,
        ``label`` and captured blobs, and every operator output of the first
        run must match the second within ``atol=0.001, rtol=0.0001``.

        Args:
            input_channels: Channel count of the input blob ``X``.
            batch_size: Leading dimension of ``X`` and length of ``label``.
        """
        def AddModel(model, data):
            """Add the forward layers to ``model`` and return the softmax blob."""
            conv_out = brew.conv(
                model, data, 'conv1',
                dim_in=input_channels, dim_out=10,
                kernel=3, stride=1, pad=1,
                training_mode=1)
            deconv_out = brew.conv_transpose(
                model, conv_out, 'deconv1',
                dim_in=10, dim_out=10,
                kernel=2, stride=2, pad=0,
                training_mode=1)
            fc_out = brew.fc(
                model, deconv_out, 'fc1',
                dim_in=10 * 56 * 56, dim_out=3)
            return brew.softmax(model, fc_out, 'softmax')

        def AddTrainingOperators(model, softmax, label):
            """Add the loss and gradient operators to ``model``."""
            # Cross entropy between the softmax scores and the labels.
            xent = model.LabelCrossEntropy([softmax, label], 'xent')
            # Averaged loss over the batch.
            loss = model.AveragedLoss(xent, "loss")
            # Gradient operators are derived from the averaged loss.
            model.AddGradientOperators([loss])

        arg_scope = {"order": "NCHW", 'no_bias': False}
        # Everything below is created under the IDEEP device scope.
        device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0)
        with core.DeviceScope(device_opt):
            train_model = model_helper.ModelHelper(name="test_train",
                                                   arg_scope=arg_scope)
            # Model definition (conv, conv_transpose, fc, softmax) plus loss.
            softmax = AddModel(train_model, "X")
            AddTrainingOperators(train_model, softmax, "label")

            X = np.random.rand(
                batch_size, input_channels, 28, 28
            ).astype(np.float32) - 0.5
            label = np.random.randint(3, size=batch_size).astype(np.int32)

            old_ws_name = workspace.CurrentWorkspace()
            workspace.FeedBlob('X', X)
            workspace.FeedBlob('label', label)
            workspace.RunNetOnce(train_model.param_init_net)

            # Capture every non-first input of the operators that come
            # before the Softmax.
            captured_inputs = {}
            for op in train_model.net.Proto().op:
                if op.type == "Softmax":
                    break
                for blob_name in op.input[1:]:
                    captured_inputs[blob_name] = workspace.FetchBlob(blob_name)

            # First run: optimized net in the current workspace.
            workspace.CreateNet(train_model.net, overwrite=True)
            optimizeForMKLDNN(train_model.net, training_mode=True)
            workspace.RunNet(train_model.net)
            first_run = {
                blob_name: workspace.FetchBlob(blob_name)
                for op in train_model.net.Proto().op
                for blob_name in op.output
            }

            # Second run: same net and inputs in the scratch workspace.
            workspace.SwitchWorkspace("_device_check_", True)
            workspace.FeedBlob('X', X)
            workspace.FeedBlob('label', label)
            for blob_name, value in captured_inputs.items():
                workspace.FeedBlob(blob_name, value)
            workspace.CreateNet(train_model.net, overwrite=True)
            workspace.RunNet(train_model.net)
            second_run = {
                blob_name: workspace.FetchBlob(blob_name)
                for blob_name in first_run
            }

            for blob_name, first_value in first_run.items():
                second_value = second_run[blob_name]
                if np.allclose(first_value, second_value,
                               atol=0.001, rtol=0.0001):
                    continue
                print("blob {} error".format(blob_name))
                print(np.max(np.abs(first_value - second_value)))
                self.assertTrue(False)

            workspace.ResetWorkspace()
            workspace.SwitchWorkspace(old_ws_name)
Пример #7
0
    def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size,
                             input_channels, output_channels,
                             batch_size, use_bias, group, sum_add, gc, dc):
        """Check Conv + Sum + Relu fusion, hand-written and from the optimizer.

        The reference is produced on ``dc[0]`` by two convolutions, an
        in-place ``sum_add`` operator writing ``S0`` and an in-place Relu.
        It is compared (``atol=rtol=0.01``) against:

        * a plain Conv followed by a hand-built ``ConvFusion`` with
          ``fusion_type=3`` on ``dc[1]``;
        * the net left after ``optimizeForMKLDNN`` processes the four
          operators retargeted to ``dc[1]``. The test asserts that two
          operators remain and that the second is a ``ConvFusion``.

        ``gc`` is not used by this test.
        """
        def pick(with_bias, without_bias):
            return with_bias if use_bias else without_bias

        def conv_like(op_type, inputs, out_name, device_option, **extra):
            return core.CreateOperator(
                op_type, inputs, [out_name],
                stride=stride, pad=pad, kernel=kernel, group=group,
                device_option=device_option, **extra) if not extra else \
                core.CreateOperator(
                    op_type, inputs, [out_name],
                    stride=stride, pad=pad, kernel=kernel, group=group,
                    **dict(extra, device_option=device_option))

        def centered(*shape):
            return np.random.rand(*shape).astype(np.float32) - 0.5

        def feed(blob_names, device_option):
            for blob_name, value in zip(blob_names, (SX, Sw, Sb, X, w, b)):
                workspace.FeedBlob(blob_name, value, device_option)

        def on_test_device(op):
            clone = caffe2_pb2.OperatorDef()
            clone.CopyFrom(op)
            clone.device_option.CopyFrom(dc[1])
            return clone

        def expect_close(expected, actual):
            if np.allclose(expected, actual, atol=0.01, rtol=0.01):
                return
            print(actual.flatten())
            print(expected.flatten())
            print(np.max(np.abs(actual - expected)))
            self.assertTrue(False)

        conv_S0 = conv_like(
            "Conv", pick(["SX0", "Sw0", "Sb0"], ["SX0", "Sw0"]), "S0", dc[0])
        conv = conv_like(
            "Conv", pick(["X0", "w0", "b0"], ["X0", "w0"]), "Y0", dc[0])
        sum_op = core.CreateOperator(
            sum_add, ["S0", "Y0"], ["S0"], device_option=dc[0])
        relu = core.CreateOperator(
            "Relu", ["S0"], ["S0"], device_option=dc[0])
        reference_ops = [conv_S0, conv, sum_op, relu]

        # Hand-written fusion of Conv + Sum + ReLU.
        conv_S1 = conv_like(
            "Conv", pick(["SX1", "Sw1", "Sb1"], ["SX1", "Sw1"]), "S1", dc[1])
        conv_fusion = conv_like(
            "ConvFusion",
            pick(["X1", "w1", "b1", "S1"], ["X1", "w1", "S1"]),
            "S1", dc[1], fusion_type=3)

        input_shape = (batch_size, input_channels * group, size, size)
        filter_shape = (output_channels * group, input_channels, kernel, kernel)
        SX = centered(*input_shape)
        Sw = centered(*filter_shape)
        Sb = centered(output_channels * group)
        X = centered(*input_shape)
        w = centered(*filter_shape)
        b = centered(output_channels * group)

        names_0 = ('SX0', 'Sw0', 'Sb0', 'X0', 'w0', 'b0')
        names_1 = ('SX1', 'Sw1', 'Sb1', 'X1', 'w1', 'b1')

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

        # Reference on dc[0], one operator at a time.
        feed(names_0, dc[0])
        for op in reference_ops:
            workspace.RunOperatorOnce(op)
        S0 = workspace.FetchBlob('S0')

        # Manual fusion on dc[1].
        workspace.ResetWorkspace()
        feed(names_1, dc[1])
        workspace.RunOperatorOnce(conv_S1)
        workspace.RunOperatorOnce(conv_fusion)
        expect_close(S0, workspace.FetchBlob('S1'))

        # Automatic fusion of Conv + Sum + ReLU by the optimizer.
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([on_test_device(op) for op in reference_ops])
        feed(names_0, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        remaining_ops = net.Proto().op
        self.assertTrue(len(remaining_ops) == 2)
        self.assertTrue(remaining_ops[1].type == "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        expect_close(S0, workspace.FetchBlob('S0'))

        workspace.SwitchWorkspace(old_ws_name)
Пример #8
0
def Run(args, extra_args):
    """Run inference (or an op-by-op cosimulation) for the model in ``args``.

    The function loads the init/predict nets of ``args.model``, retargets
    them to ``args.device`` when the model allows it, optionally applies the
    MKL-DNN graph optimization, runs warm-up iterations and then iterates
    over the input batches (real images or dummy data), logging throughput
    and, where labels are available, accuracy.

    Args:
        args: Parsed command-line namespace. Attributes read here include
            ``model``, ``images_path``, ``dummydata``, ``dummyvalue``,
            ``label_file``, ``validation_file``, ``annotations``,
            ``iterations``, ``warmup_iterations``, ``optimization``,
            ``batch_size``, ``crop_size``, ``onnx_model``, ``int8_model``,
            ``int8_cosim``, ``cosim``, ``device``, ``noptimize``,
            ``profile`` and ``post_images_path``.
        extra_args: Not used by this function.

    Returns:
        None. Results are reported through ``logging`` and, on the regular
        (non-cosim) path, passed to ``cc2.SaveOutput``.
    """
    if not m.IsSupported(args.model):
        logging.error("Not supported model: {}".format(args.model))
        m.ShowModels()
        return
    images_path = None
    if args.images_path:
        images_path = os.path.abspath(args.images_path)
    elif "CAFFE2_INF_IMG_PATH" in os.environ:
        images_path = os.path.abspath(os.environ["CAFFE2_INF_IMG_PATH"])
    # images_path is still None when neither the argument nor the environment
    # variable is set; os.path.isdir(None) would raise instead of reporting.
    if not args.dummydata and (images_path is None
                               or not os.path.isdir(images_path)):
        logging.error("Can not find image path {}.".format(images_path))
        return
    labels = None
    validation = None
    if args.label_file:
        labels = cc2.LoadLabels(args.label_file)
    elif args.validation_file:
        validation = cc2.LoadValidation(args.validation_file)
    elif "CAFFE2_INF_LABEL_FILE" in os.environ:
        labels = cc2.LoadLabels(os.environ["CAFFE2_INF_LABEL_FILE"])
    elif "CAFFE2_INF_VAL_FILE" in os.environ:
        validation = cc2.LoadValidation(os.environ["CAFFE2_INF_VAL_FILE"])
    else:
        logging.warning("No validation or label file!")
    if args.annotations:
        # apath is only consumed further down under the same condition.
        apath = args.annotations
    elif args.model == 'faster-rcnn' or args.model == 'ssd':
        logging.error(
            "currently only support fasterrcnn and ssd for voc dataset, so will just collect performance"
        )
    iterations = args.iterations if args.iterations else sys.maxsize
    warmup_iter = args.warmup_iterations if args.warmup_iterations > 0 else 0
    optimization = []
    if args.optimization:
        optimization = [opt.strip() for opt in args.optimization.split(',')]
    batch_size = 1
    if args.batch_size:
        batch_size = int(args.batch_size)
        if batch_size <= 0:
            logging.error("Invalid batch size {}. Exit!".format(batch_size))
            return
    logging.warning("Run Caffe2 in inference mode with args:\n{}".format(
        vars(args)))
    model_info = m.GetModelInfo(args.model)
    logging.warning("The inference inputs of {0} model:\n{1}".format(
        args.model, {str(k): str(v)
                     for k, v in model_info.items()}))

    crop_size = int(model_info["crop_size"])
    if args.crop_size:
        crop_size = args.crop_size

    need_normalize = False
    if model_info["need_normalize"]:
        need_normalize = True

    # Default scalar mean; replaced below when the model describes its own.
    mean = 128
    image_mean = str(model_info["image_mean"])
    if image_mean != 'None' and image_mean.split('.')[-1] == "binaryproto":
        from inference.caffe.proto import caffe_pb2
        from inference import caffe
        from inference.caffe import io
        blob = caffe_pb2.BlobProto()
        # Context manager so the mean file is closed deterministically.
        with open(image_mean, 'rb') as mean_file:
            data = mean_file.read()
        blob.ParseFromString(data)
        mean = np.array(io.blobproto_to_array(blob))

    # Split the stringified value so a missing (None) image_mean does not
    # raise AttributeError before the 'None' check below is reached.
    mean_tmp = (image_mean.split('/')[-1]).split(' ')
    if image_mean != 'None' and len(mean_tmp) == 3:
        if need_normalize:
            # np.float was only an alias of the builtin float (float64) and
            # no longer exists in current NumPy releases.
            mean = np.zeros([3, crop_size, crop_size], dtype=np.float64)
            mean[0, :, :] = float(mean_tmp[0])  # 104
            mean[1, :, :] = float(mean_tmp[1])  # 117
            mean[2, :, :] = float(mean_tmp[2])  # 124
        else:
            mean = np.zeros([3, crop_size, crop_size], dtype=np.int32)
            mean[0, :, :] = int(mean_tmp[0])  # 104
            mean[1, :, :] = int(mean_tmp[1])  # 117
            mean[2, :, :] = int(mean_tmp[2])  # 124

    scale = [1]
    if str(model_info["scale"]) != '':
        scale = (model_info["scale"]).split(' ')
    rescale_size = 256
    if str(model_info["rescale_size"]) != '':
        rescale_size = int(model_info["rescale_size"])
    color_format = "BGR"
    if str(model_info["color_format"]) != '':
        color_format = model_info["color_format"]

    # Everything from here to model_elapsed_time is counted as model loading.
    model_start_time = timeit.default_timer()
    if args.onnx_model:
        init_def, predict_def = cc2.OnnxToCaffe2(model_info["onnx_model"])
    else:
        if args.int8_model or args.int8_cosim:
            init_file = model_info["init_net_int8"]
            predict_file = model_info["predict_net_int8"]
        else:
            init_file = model_info["init_net"]
            predict_file = model_info["predict_net"]
        with open(init_file, "rb") as i:
            print(model_info)
            if model_info["model_type"] == "prototext" or init_file.split(
                    '.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                init_def = ptxt.Parse(i.read(), caffe2_pb2.NetDef())
            else:
                init_def = caffe2_pb2.NetDef()
                init_def.ParseFromString(i.read())
        with open(predict_file, "rb") as p:
            print(model_info["model_type"])
            if model_info["model_type"] == "prototext" or predict_file.split(
                    '.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                predict_def = ptxt.Parse(p.read(), caffe2_pb2.NetDef())
            else:
                predict_def = caffe2_pb2.NetDef()
                predict_def.ParseFromString(p.read())
        if args.int8_cosim:
            # The int8 cosimulation compares against the regular predict net.
            with open(model_info["predict_net"], "rb") as p:
                if model_info["model_type"] == "prototext" or model_info[
                        "predict_net"].split('.')[-1] == "pbtxt":
                    import google.protobuf.text_format as ptxt
                    cosim_predict_def = ptxt.Parse(p.read(),
                                                   caffe2_pb2.NetDef())
                else:
                    cosim_predict_def = caffe2_pb2.NetDef()
                    cosim_predict_def.ParseFromString(p.read())
    #cc2.SaveAsOnnxModel(init_def, predict_def, (1, 3, crop_size, crop_size),
    #            model_info["model_name"] + "_onnx.pb")

    dev_map = {
        "cpu": caffe2_pb2.CPU,
        "gpu": caffe2_pb2.CUDA,
        "cuda": caffe2_pb2.CUDA,
        "mkldnn": caffe2_pb2.MKLDNN,
        "opengl": caffe2_pb2.OPENGL,
        "opencl": caffe2_pb2.OPENCL,
        "ideep": caffe2_pb2.IDEEP,
    }
    device_opts = caffe2_pb2.DeviceOption()
    if args.device.lower() in dev_map:
        device_opts.device_type = dev_map[args.device.lower()]
    else:
        logging.error("Wrong device {}. Exit!".format(args.device))
        return
    device_opts_cpu = caffe2_pb2.DeviceOption()
    device_opts_cpu.device_type = caffe2_pb2.CPU
    if model_info["allow_device_override"]:
        # faster-rcnn on gpu keeps its init net on the CPU device option.
        if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            cc2.UpdateDeviceOption(device_opts_cpu, init_def)
        else:
            cc2.UpdateDeviceOption(device_opts, init_def)
    if model_info["allow_device_override"]:
        cc2.UpdateDeviceOption(device_opts, predict_def)
    # search params shape to replace the 0 with 1 when ideep and throw warning
    if args.device.lower() == 'ideep':
        cc2.FillZeroParamsWithOne(init_def)

    if os.environ.get('DEBUGMODE') == "1":
        # SerializeToString() yields bytes, so the .pb dumps need binary mode.
        with open("{0}_origin_init_net.pb".format(model_info["model_name"]),
                  "wb") as fid:
            fid.write(init_def.SerializeToString())
        with open("{}_origin_init_net.pbtxt".format(model_info["model_name"]),
                  "w") as fid:
            fid.write(str(init_def))
        with open("{0}_origin_predict_net.pb".format(model_info["model_name"]),
                  "wb") as fid:
            fid.write(predict_def.SerializeToString())
        with open(
                "{}_origin_predict_net.pbtxt".format(model_info["model_name"]),
                "w") as fid:
            fid.write(str(predict_def))

    if model_info["model_type"] == "caffe legacy":
        cc2.MergeScaleBiasInBN(predict_def)
        cc2.RemoveUselessExternalInput(predict_def)
        if args.int8_cosim:
            cc2.MergeScaleBiasInBN(cosim_predict_def)
            cc2.RemoveUselessExternalInput(cosim_predict_def)

    if model_info["model_type"] == "torch legacy":
        cc2.OptimizeTorchModel(init_def, predict_def, model_info, device_opts)
        if os.environ.get('DEBUGMODE') == "1":
            with open(
                    "{0}_opt_predict_net.pb".format(model_info["model_name"]),
                    "wb") as fid:
                fid.write(predict_def.SerializeToString())
            with open(
                    "{}_opt_predict_net.pbtxt".format(
                        model_info["model_name"]), "w") as fid:
                fid.write(str(predict_def))

    # Placeholder input/label blobs fed before the init net is run.
    init_data = np.random.rand(batch_size, 3, crop_size,
                               crop_size).astype(np.float32)
    init_label = np.ones((batch_size), dtype=np.int32)
    # Time observer handle; only set on the regular path when profiling.
    ob = None
    if args.cosim:
        # Two workspaces: the requested device and a CPU reference copy.
        def_ws_name = ws.CurrentWorkspace()
        inf_ws_name = "__inf_ws__"
        ws.SwitchWorkspace(inf_ws_name, True)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        ws.RunNetOnce(init_def)
        cosim_ws_name = "__cosim_ws__"
        ws.SwitchWorkspace(cosim_ws_name, True)
        device_cosim = caffe2_pb2.DeviceOption()
        device_cosim.device_type = dev_map["cpu"]
        cosim_init_def = copy.deepcopy(init_def)
        cc2.UpdateDeviceOption(device_cosim, cosim_init_def)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_cosim)
        ws.RunNetOnce(cosim_init_def)
        cosim_predict_def = copy.deepcopy(predict_def)
        cc2.UpdateDeviceOption(device_cosim, cosim_predict_def)
    elif args.int8_cosim:
        # Two workspaces: the int8 net and the fp32 net, both optimized.
        inf_ws_name = "__int8_ws__"
        ws.SwitchWorkspace(inf_ws_name, True)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        ws.RunNetOnce(init_def)

        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(predict_def)
        tf.optimizeForMKLDNN(net)
        predict_def = net.Proto()

        cosim_ws_name = "__fp32_ws__"
        ws.SwitchWorkspace(cosim_ws_name, True)
        ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), init_data,
                    device_opts)
        ws.RunNetOnce(init_def)
        cc2.UpdateDeviceOption(device_opts, cosim_predict_def)

        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(cosim_predict_def)
        tf.optimizeForMKLDNN(net)
        cosim_predict_def = net.Proto()
    else:
        # ApplyOptimizations(init_def, predict_def, model_info, optimization)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)

        if os.environ.get('DEBUGMODE') == "1":
            cc2.SetOpName(predict_def)

        ws.RunNetOnce(init_def)
        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(predict_def)
        if args.device.lower() == 'ideep' and not args.noptimize:
            logging.warning('Optimizing module {} ....................'.format(
                model_info["model_name"]))
            tf.optimizeForMKLDNN(net)
        predict_def = net.Proto()

        # ws.CreateNet(predict_def)
        if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            new_predict_def, _ = core.InjectCrossDeviceCopies(
                core.Net(predict_def))
            net = core.Net(new_predict_def._net)
            #ws.CreateNet(new_predict_def._net)
            predict_def = new_predict_def._net

        if os.environ.get('DEBUGMODE') == "1":
            with open("{0}_opt_init_net.pb".format(model_info["model_name"]),
                      "wb") as fid:
                fid.write(init_def.SerializeToString())
            with open("{}_opt_init_net.pbtxt".format(model_info["model_name"]),
                      "w") as fid:
                fid.write(str(init_def))
            with open(
                    "{0}_opt_predict_net.pb".format(model_info["model_name"]),
                    "wb") as fid:
                fid.write(predict_def.SerializeToString())
            with open(
                    "{}_opt_predict_net.pbtxt".format(
                        model_info["model_name"]), "w") as fid:
                fid.write(str(predict_def))

        if args.profile or predict_def.op[-1].type == 'Accuracy':
            #predict_model = model_helper.ModelHelper("predict")
            #predict_model.net = core.Net(predict_def)
            #predict_model.net.name = predict_def.name
            if predict_def.op[-1].type == 'Accuracy':
                # Pre-feed the label blob and every Accuracy output blob.
                label = net.AddExternalInput('label')
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(label, init_label, device_opts)
                else:
                    ws.FeedBlob(label, init_label, device_opts_cpu)
                for i, op in enumerate(predict_def.op):
                    if op.type == 'Accuracy':
                        if args.device.lower() == 'gpu':
                            print(device_opts.device_type)
                            ws.FeedBlob(str(predict_def.op[i].output[0]),
                                        init_label, device_opts)
                        else:
                            ws.FeedBlob(str(predict_def.op[i].output[0]),
                                        init_label, device_opts_cpu)
            #if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            #    ws.CreateNet(net, True)
            #else:
            ws.CreateNet(net)
            if args.profile:
                #ob = predict_model.net.AddObserver("TimeObserver")
                ob = net.AddObserver("TimeObserver")
        else:
            #if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            #    ws.CreateNet(net, True)
            #else:
            ws.CreateNet(net)

    model_elapsed_time = timeit.default_timer() - model_start_time

    outputs = []
    accuracy_top1 = []
    accuracy_top5 = []
    img_time = 0
    comp_time = 0
    processed_images = 0
    images = []
    # NOTE(review): this rebinding discards whatever was loaded from the label
    # file above, so ParsePossResults below only ever sees this list - confirm
    # that is intended.
    labels = []
    fnames = []
    if args.dummydata:
        init_label = np.ones((batch_size), dtype=np.int32)
        if args.dummyvalue != "random":
            imgs = np.full((batch_size, 3, crop_size, crop_size),
                           float(args.dummyvalue),
                           dtype=np.float32)
        else:
            imgs = np.random.rand(batch_size, 3, crop_size,
                                  crop_size).astype(np.float32)
        # The same dummy batch/label objects are reused for every iteration.
        for i in range(iterations):
            labels.append(init_label)
            images.append(imgs)
    else:
        process_data_start_time = timeit.default_timer()
        images, fnames = cc2.ImageProc.BatchImages(images_path, batch_size,
                                                   iterations)
        process_data_elapsed_time = timeit.default_timer(
        ) - process_data_start_time
        logging.warning(
            "processdata time = {}".format(process_data_elapsed_time))
    logging.warning("Start warmup {} iterations...".format(warmup_iter))
    forchw = 1
    if 'style-transfer' in args.model:
        forchw = 0
    # wi - warmup_iter counts 0, 1, 2, ... while warmup_iter counts down.
    wi = warmup_iter - 1
    while warmup_iter and not args.cosim:
        warmup_iter -= 1
        if args.dummydata:
            imgs = images[wi - warmup_iter]
            oshape = (crop_size, crop_size, 3)
        else:
            # Warm up on a randomly chosen real batch.
            r = randint(0, len(images) - 1)
            if model_info["model_type"] == "mlperf legacy vgg":
                imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfVGG(
                    images[r])
            elif model_info["model_type"] == "mlperf legacy mb":
                imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfMB(
                    images[r])
            else:
                imgs, oshape = cc2.ImageProc.PreprocessImages(
                    images[r], crop_size, rescale_size, mean, scale, forchw,
                    need_normalize, color_format)
            #imgs, oshape = cc2.ImageProc.PreprocessImagesByThreading(
            #    images[r], crop_size, rescale_size, mean, scale, forchw)
        if args.model == 'faster-rcnn':
            # init_def_update=copy.deepcopy(init_def)
            # cc2.UpdateImgInfo(oshape, init_def_update, predict_def, crop_size)
            # ws.RunNetOnce(init_def_update)
            im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def,
                                                  crop_size)
            if args.device.lower() == 'gpu':
                ws.FeedBlob(im_info_name, blob, device_opts_cpu)
            else:
                ws.FeedBlob(im_info_name, blob, device_opts)
        if 'style-transfer' in args.model or (args.model == 'faster-rcnn' and
                                              args.device.lower() == 'gpu'):
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs)
        else:
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
        if predict_def.op[-1].type == 'Accuracy' and args.dummydata:
            init_label = labels[wi - warmup_iter]
            ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                        device_opts_cpu)
            ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                        device_opts_cpu)
        # NOTE(review): validation is None unless a validation file was
        # loaded, in which case len() raises - confirm the expected inputs.
        elif predict_def.op[-1].type == 'Accuracy' and len(validation) > 0:
            batch_fname = fnames[r]
            init_label = np.ones((len(fnames[r])), dtype=np.int32)
            for j in range(len(fnames[r])):
                init_label[j] = validation[batch_fname[j]]

            if args.device.lower() == 'gpu':
                ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                            device_opts)
                ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                            device_opts)
            else:
                ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                            device_opts_cpu)
                ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                            device_opts_cpu)

        #if args.profile or predict_def.op[-1].type == 'Accuracy':
        #    ws.RunNet(net)
        #else:
        ws.RunNet(net)
    logging.warning("Start running performance")
    for k, raw in enumerate(images):
        processed_images += len(raw)
        img_start_time = timeit.default_timer()
        if args.dummydata:
            imgs = raw
            oshape = (crop_size, crop_size)
        else:
            if model_info["model_type"] == "mlperf legacy vgg":
                imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfVGG(raw)
            elif model_info["model_type"] == "mlperf legacy mb":
                imgs, oshape = cc2.ImageProc.PreprocessImagesMLPerfMB(raw)
            else:
                imgs, oshape = cc2.ImageProc.PreprocessImages(
                    raw, crop_size, rescale_size, mean, scale, forchw,
                    need_normalize, color_format)
            #imgs, oshape = cc2.ImageProc.PreprocessImagesByThreading(raw, crop_size, rescale_size, mean, scale, forchw)
        # im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def, crop_size)
        # ws.FeedBlob(im_info_name, blob, device_opts)
        # x = ws.FetchBlob(im_info_name)

        init_label = None
        if predict_def.op[-1].type == 'Accuracy' and args.dummydata:
            init_label = labels[k]
        # NOTE(review): same len(None) hazard as in the warm-up loop when no
        # validation file was loaded.
        elif predict_def.op[-1].type == 'Accuracy' and len(validation) > 0:
            batch_fname = fnames[k]
            init_label = np.ones((len(fnames[k])), dtype=np.int32)
            for j in range(len(fnames[k])):
                init_label[j] = validation[batch_fname[j]]

        if args.model == 'faster-rcnn':
            # init_def_update=copy.deepcopy(init_def)
            # cc2.UpdateImgInfo(oshape, init_def_update, predict_def, crop_size)
            im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def,
                                                  crop_size)

            if args.cosim:
                ws.SwitchWorkspace(inf_ws_name, True)
                # ws.RunNetOnce(init_def_update)
                ws.FeedBlob(im_info_name, blob, device_opts)
                ws.SwitchWorkspace(cosim_ws_name, True)
                # cosim_init_def_update=copy.deepcopy(cosim_init_def)
                # cc2.UpdateImgInfo(oshape, cosim_init_def_update, cosim_predict_def, crop_size)
                # ws.RunNetOnce(cosim_init_def_update)
                ws.FeedBlob(im_info_name, blob, device_cosim)
            else:
                # ws.RunNetOnce(init_def_update)
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(im_info_name, blob, device_opts_cpu)
                else:
                    ws.FeedBlob(im_info_name, blob, device_opts)
        # logging.info("output blob is: {}".format(x))
        # imgs = ImageProc.PreprocessImages(raw, crop_size, mean)
        img_elapsed_time = timeit.default_timer() - img_start_time
        img_time += img_elapsed_time
        if args.cosim or args.int8_cosim:
            # Feed the same batch to both workspaces, then run the two nets
            # operator by operator and compare the outputs after each op.
            ws.SwitchWorkspace(cosim_ws_name)
            if args.cosim:
                ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), imgs,
                            device_cosim)
            else:
                ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), imgs,
                            device_opts)
            ws.SwitchWorkspace(inf_ws_name)
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
            for i in range(len(predict_def.op)):
                ws.SwitchWorkspace(inf_ws_name)
                inf_inputs = []
                for inp in predict_def.op[i].input:
                    inf_inputs.append(ws.FetchBlob(str(inp)))
                ws.RunOperatorOnce(predict_def.op[i])
                inf_results = []
                for res in predict_def.op[i].output:
                    inf_results.append(ws.FetchBlob(str(res)))
                ws.SwitchWorkspace(cosim_ws_name)
                cosim_inputs = []
                for inp in cosim_predict_def.op[i].input:
                    cosim_inputs.append(ws.FetchBlob(str(inp)))
                ws.RunOperatorOnce(cosim_predict_def.op[i])
                cosim_results = []
                for res in cosim_predict_def.op[i].output:
                    cosim_results.append(ws.FetchBlob(str(res)))
                if len(inf_inputs) != len(cosim_inputs):
                    logging.error("Wrong number of inputs")
                if len(inf_results) != len(cosim_results):
                    logging.error("Wrong number of outputs")
                    return
                if args.cosim:
                    tol = {'atol': 1e-02, 'rtol': 1e-03}
                else:
                    tol = {'atol': 5, 'rtol': 1e-01}
                logging.warning("begin to check op[{}] {} input".format(
                    i, predict_def.op[i].type))
                # The per-input comparison is currently disabled; the loop
                # only skips over inputs. It uses its own index name so the
                # outer batch index k is not overwritten.
                for in_idx in range(len(inf_inputs)):
                    if predict_def.op[i].input[in_idx][0] == '_':
                        continue
                logging.warning("pass checking op[{0}] {1} input".format(
                    i, predict_def.op[i].type))
                logging.warning("begin to check op[{0}] {1} output".format(
                    i, predict_def.op[i].type))
                for j, _ in enumerate(inf_results):
                    # Outputs whose blob name starts with '_' are not compared.
                    if predict_def.op[i].output[j][0] == '_':
                        continue
                    if args.cosim:
                        if not cc2.assert_allclose(inf_results[j],
                                                   cosim_results[j], **tol):
                            logging.error(
                                "failed checking op[{0}] {1} output".format(
                                    i, predict_def.op[i].type))
                            exit()
                    if args.int8_cosim:
                        cc2.assert_allclose(inf_results[j], cosim_results[j],
                                            **tol)
                        cc2.assert_compare(inf_results[j], cosim_results[j],
                                           1e-01, 'ALL')
                logging.warning("pass checking op[{0}] {1} output".format(
                    i, predict_def.op[i].type))
        else:
            if 'style-transfer' in args.model or (args.model == 'faster-rcnn'
                                                  and args.device.lower()
                                                  == 'gpu'):
                ws.FeedBlob(str(predict_def.op[0].input[0]), imgs)
            else:
                ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
            if predict_def.op[-1].type == 'Accuracy':
                # Feed labels to the last Accuracy op and to a second one if
                # it sits one or two ops before the end of the net.
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                                device_opts)
                    if predict_def.op[-2].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-2].input[1]),
                                    init_label, device_opts)
                    elif predict_def.op[-3].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-3].input[1]),
                                    init_label, device_opts)
                else:
                    ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                                device_opts_cpu)
                    if predict_def.op[-2].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-2].input[1]),
                                    init_label, device_opts_cpu)
                    elif predict_def.op[-3].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-3].input[1]),
                                    init_label, device_opts_cpu)

            # Only the net execution itself is counted as computing time.
            comp_start_time = timeit.default_timer()
            #if args.profile or predict_def.op[-1].type == 'Accuracy':
            #    ws.RunNet(net)
            #else:
            ws.RunNet(net)

            comp_elapsed_time = timeit.default_timer() - comp_start_time
            comp_time += comp_elapsed_time
            output = ws.FetchBlob(str(predict_def.op[-1].output[0]))
            if predict_def.op[-2].type == 'Accuracy':
                output2 = ws.FetchBlob(str(predict_def.op[-2].output[0]))
            elif predict_def.op[-3].type == 'Accuracy':
                output2 = ws.FetchBlob(str(predict_def.op[-3].output[0]))
            elif predict_def.op[-1].type == 'BoxWithNMSLimit':
                output2 = ws.FetchBlob(str(predict_def.op[-1].output[1]))
                output3 = ws.FetchBlob(str(predict_def.op[-1].output[2]))
            logging.warning(
                "[{0:.2%}] Output shape: {1}, computing in {2:.10f}"
                " seconds, processing {3} images in {4:.10f} seconds.".format(
                    ((k + 1) / len(images)), output.shape, comp_elapsed_time,
                    len(raw), img_elapsed_time))
            if predict_def.op[-1].type == 'BoxWithNMSLimit':
                outputs.append([output, output2, output3])
            elif predict_def.op[-1].type != 'Accuracy':
                outputs.append(output)
                #logging.info(output)
            else:
                # The last op's output is stored as top-5, the earlier
                # Accuracy op's output as top-1.
                accuracy_top1.append(output2)
                accuracy_top5.append(output)
            if args.profile:
                logging.warning("observer time = {}".format(ob.average_time()))
                logging.warning("observer time = {}".format(
                    ob.average_time_children()))

        del imgs
        if k >= (iterations - 1):
            logging.warning(
                "Exit after running {} iterations".format(iterations))
            break
    # ob is only created on the regular path; the cosim paths never attach
    # an observer, so there is nothing to remove there.
    if args.profile and ob is not None:
        net.RemoveObserver(ob)

    if args.cosim:
        ws.SwitchWorkspace(def_ws_name)
        logging.warning("Cosim passed Ran 1 test OK")
        return
    if comp_time <= 0:
        logging.error("The total time is invalid!")
        return
    info_str = ""
    if len(accuracy_top1) > 0:
        # The net ended in Accuracy ops: average the per-batch accuracies.
        mean_accuracy_top1 = 0
        mean_accuracy_top5 = 0
        for i, _ in enumerate(accuracy_top1):
            mean_accuracy_top1 += accuracy_top1[i] * batch_size
            mean_accuracy_top5 += accuracy_top5[i] * batch_size
        mean_accuracy_top1 /= batch_size * len(accuracy_top1)
        mean_accuracy_top5 /= batch_size * len(accuracy_top5)
        info_str += "\nAccuracy: {:.5%}".format(mean_accuracy_top1)
        info_str += "\nTop5Accuracy: {:.5%}".format(mean_accuracy_top5)
        total_image = processed_images
        logging.critical(
            "\nImages per second: {0:.10f}\nTotal computing time:"
            " {1:.10f} seconds\nTotal image processing time: {2:.10f} seconds\n"
            "Total model loading time: {3:.10f} seconds\nTotal images: {4}{5}".
            format(total_image / comp_time, comp_time, img_time,
                   model_elapsed_time, total_image, info_str))
        return
    if args.annotations:
        logging.info(" the total length of outputs is {}".format(len(outputs)))
        logging.critical("result is ={}".format(
            cc2.prepare_and_compute_map_data(outputs, fnames, apath)))
    info_str = ""
    accuracy = None
    top5accuracy = None
    summary = None
    total_image = processed_images
    if model_info["output_type"] == "segmentation" or args.dummydata:
        total_image = processed_images
    elif model_info["output_type"] == "possibility":
        label_offset = 0
        if model_info["model_type"] == "mlperf legacy mb":
            label_offset = -1
        results, total_image = cc2.ParsePossOutputs(outputs, label_offset)
        summary = cc2.ParsePossResults(results, labels, validation, fnames)
        if not summary:
            logging.error("Failed to parse the results!")
            return
        elif total_image <= 0 or len(summary) != total_image:
            logging.error("No available results!")
            return
        if validation:
            accuracy = 0
            top5accuracy = 0
            for res in summary:
                # A top-1 "Pass" also counts towards top-5.
                if res[1] == "Pass":
                    accuracy += 1
                    top5accuracy += 1
                elif res[1] == "Top5Pass":
                    top5accuracy += 1
            accuracy = accuracy / total_image
            top5accuracy = top5accuracy / total_image
            info_str += "\nAccuracy: {:.5%}".format(accuracy)
            info_str += "\nTop5Accuracy: {:.5%}".format(top5accuracy)
    elif model_info["output_type"] == "argmax":
        results, total_image = cc2.ParsePossOutputsArgMax(outputs, -1)
        summary = cc2.ParsePossResults(results, labels, validation, fnames)
        if not summary:
            logging.error("Failed to parse the results!")
            return
        elif total_image <= 0 or len(summary) != total_image:
            logging.error("No available results!")
            return
        if validation:
            accuracy = 0
            for res in summary:
                if res[1] == "Pass":
                    accuracy += 1
            accuracy = accuracy / total_image
            info_str += "\nAccuracy: {:.5%}".format(accuracy)
    elif model_info["output_type"] == "post image":
        results, total_image = cc2.ParsePostOutputs(outputs)
        if args.post_images_path:
            cc2.SavePostImages(results, args.post_images_path, fnames)
    logging.critical(
        "\nImages per second: {0:.10f}\nTotal computing time:"
        " {1:.10f} seconds\nTotal image processing time: {2:.10f} seconds\n"
        "Total model loading time: {3:.10f} seconds\nTotal images: {4}{5}".
        format(total_image / comp_time, comp_time, img_time,
               model_elapsed_time, total_image, info_str))
    cc2.SaveOutput(args, summary, accuracy, top5accuracy, comp_time,
                   total_image, img_time, model_elapsed_time)
Пример #9
0
def Calibration(args, extra_args):
    """
    Run INT8 calibration for a model and save the quantized nets.

    The function loads the model description via ``m.GetModelInfo``, builds
    the image preprocessing parameters, loads the init/predict nets, feeds
    preprocessed image batches through a ``Calibrator`` and finally writes
    the quantized init/predict nets (binary and text form) under
    ``args.output_file``.

    Args:
        args: parsed command-line options. The attributes read here are
            ``model``, ``images_path``, ``batch_size``, ``iterations``,
            ``crop_size``, ``onnx_model``, ``device``, ``calibration_file``,
            ``single_iter_calib``, ``iter_calib``, ``noptimize``,
            ``calib_algo`` and ``output_file``.
        extra_args: not used by this function.

    Returns:
        None. On an unsupported model, an invalid batch size, an unknown
        device or an empty image set, an error is logged and the function
        returns early without writing anything.
    """

    if not m.IsSupported(args.model):
        logging.error("Not supported model: {}".format(args.model))
        m.ShowModels()
        return

    # The command-line path wins over the CAFFE2_INF_IMG_PATH environment
    # variable; when neither is set images_path stays None.
    images_path = None
    if args.images_path:
        images_path = os.path.abspath(args.images_path)
    elif "CAFFE2_INF_IMG_PATH" in os.environ:
        images_path = os.path.abspath(os.environ["CAFFE2_INF_IMG_PATH"])

    batch_size = 1
    if args.batch_size:
        batch_size = int(args.batch_size)
        if batch_size <= 0:
            logging.error("Invalid batch size {}. Exit!".format(batch_size))
            return

    iterations = args.iterations if args.iterations else sys.maxsize
    logging.warning("Run Caffe2 in inference mode with args:\n{}"
                    .format(vars(args)))

    model_info = m.GetModelInfo(args.model)
    logging.warning("The inference inputs of {0} model:\n{1}"
                    .format(
                        args.model,
                        {str(k): str(v) for k, v in model_info.items()}
                        ))

    crop_size = int(model_info["crop_size"])
    if args.crop_size:
        crop_size = args.crop_size

    need_normalize = bool(model_info["need_normalize"])

    # Default scalar mean; replaced by a per-channel (3, crop, crop) array
    # when the model info provides "image_mean".
    mean = 128
    if str(model_info["image_mean"]) != 'None':
        mean_tmp = ((model_info["image_mean"]).split('/')[-1]).split(' ')
        # np.float64 replaces the removed ``np.float`` alias (same dtype).
        if need_normalize:
            mean_dtype, to_number = np.float64, float
        else:
            mean_dtype, to_number = np.int32, int
        mean = np.zeros([3, crop_size, crop_size], dtype=mean_dtype)
        for channel in range(3):
            mean[channel, :, :] = to_number(mean_tmp[channel])

    scale = [1]
    if str(model_info["scale"]) != '':
        scale = (model_info["scale"]).split(' ')
    rescale_size = 256
    if str(model_info["rescale_size"]) != '':
        rescale_size = int(model_info["rescale_size"])
    color_format = "BGR"
    if str(model_info["color_format"]) != '':
        color_format = model_info["color_format"]

    def _load_net_def(path):
        # Parse a NetDef from `path`: text format when the model type is
        # "prototext" or the file extension is "pbtxt", binary otherwise.
        with open(path, 'rb') as net_file:
            if model_info["model_type"] == "prototext" or \
                    path.split('.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                return ptxt.Parse(net_file.read(), caffe2_pb2.NetDef())
            net_def = caffe2_pb2.NetDef()
            net_def.ParseFromString(net_file.read())
            return net_def

    if args.onnx_model:
        init_def, predict_def = cc2.OnnxToCaffe2(model_info["onnx_model"])
    else:
        init_def = _load_net_def(model_info["init_net"])
        predict_def = _load_net_def(model_info["predict_net"])

    if model_info["model_type"] == "caffe legacy":
        cc2.MergeScaleBiasInBN(predict_def)
        cc2.RemoveUselessExternalInput(predict_def)

    dev_map = {
        "cpu": caffe2_pb2.CPU,
        "gpu": caffe2_pb2.CUDA,
        "cuda": caffe2_pb2.CUDA,
        "mkldnn": caffe2_pb2.MKLDNN,
        "opengl": caffe2_pb2.OPENGL,
        "opencl": caffe2_pb2.OPENCL,
        "ideep": caffe2_pb2.IDEEP,
    }
    device_name = args.device.lower()
    device_opts = caffe2_pb2.DeviceOption()
    if device_name in dev_map:
        device_opts.device_type = dev_map[device_name]
    else:
        logging.error("Wrong device {}. Exit!".format(args.device))
        return

    logging.warning("Start running calibration")

    if args.calibration_file:
        images, _ = cc2.ImageProc.BatchImagesByName(
            images_path, args.calibration_file, batch_size, iterations)
    else:
        images, _ = cc2.ImageProc.BatchImages(
            images_path, batch_size, iterations)

    # Without any batch there is nothing to calibrate on, and the
    # kl_divergence branch below would loop forever doubling an empty list.
    if len(images) == 0:
        logging.error("No calibration images found. Exit!")
        return

    def data_gen():
        # Yield one preprocessed batch at a time so that only a single
        # batch of decoded images is alive at any moment.
        images_calib = images
        if args.single_iter_calib:
            images_calib = [images[args.iter_calib]]
        for raw in images_calib:
            if model_info["model_type"] == "mlperf legacy vgg":
                imgs, _ = cc2.ImageProc.PreprocessImagesMLPerfVGG(raw)
            elif model_info["model_type"] == "mlperf legacy mb":
                imgs, _ = cc2.ImageProc.PreprocessImagesMLPerfMB(raw)
            else:
                imgs, _ = cc2.ImageProc.PreprocessImages(
                    raw, crop_size, rescale_size, mean, scale, 1,
                    need_normalize, color_format)
            yield imgs
            del imgs

    cc2.UpdateDeviceOption(device_opts, init_def)
    workspace.RunNetOnce(init_def)

    cc2.UpdateDeviceOption(device_opts, predict_def)
    net = core.Net(model_info["model_name"])
    net.Proto().CopyFrom(predict_def)
    if device_name == 'ideep' and not args.noptimize:
        logging.warning('Optimizing module {} ....................'
                        .format(model_info["model_name"]))
        tf.optimizeForMKLDNN(net)
    predict_def = net.Proto()
    if predict_def.op[-1].type == 'Accuracy':
        # Feed placeholder labels (and Accuracy outputs) so the net can run
        # during calibration even though no real labels are supplied.
        init_label = np.ones((batch_size), dtype=np.int32)
        label = net.AddExternalInput('label')
        workspace.FeedBlob(label, init_label, device_opts)
        for op in predict_def.op:
            if op.type == 'Accuracy':
                workspace.FeedBlob(str(op.output[0]), init_label, device_opts)

    from inference.calibrator import Calibrator, KLCalib, AbsmaxCalib, EMACalib
    # The --calib_algo style option overrides the INT8CALIB environment
    # variable; any unrecognised (or missing) value falls back to absmax.
    kind = os.environ.get('INT8CALIB')
    if args.calib_algo:
        kind = args.calib_algo
    if kind == "moving_average":
        ema_alpha = 0.5
        algorithm = EMACalib(ema_alpha)
    elif kind == "kl_divergence":
        # NOTE(review): per the original author's note, KL calibration uses
        # the first `kl_iter_num_for_range` batches to find min/max and the
        # remaining ones to build the histogram -- confirm against KLCalib.
        # The batch list is extended with itself until it holds at least
        # twice that many entries.
        kl_iter_num_for_range = 500
        while len(images) < 2*kl_iter_num_for_range:
            images += images
        algorithm = KLCalib(kl_iter_num_for_range)
    else:
        algorithm = AbsmaxCalib()
    logging.warning('Use {} calibration method....................'.format(kind))

    # data_gen() yields a single batch in single-iteration mode, so report
    # that as the total instead of the full batch count.
    length = 1 if args.single_iter_calib else len(images)
    calib = Calibrator(algorithm, device_opts)
    for i, data in enumerate(data_gen(), 1):
        workspace.FeedBlob(predict_def.op[0].input[0], data, device_opts)
        logging.warning("in progress {}/{}(batch/batch total)".format(i, length))
        calib.RunCalibIter(workspace, predict_def)

    predict_quantized, init_quantized = calib.DepositQuantizedModule(workspace, predict_def)

    cc2.SaveModel(args.output_file + '/init_net_int8.pb', init_quantized,
                  args.output_file + '/predict_net_int8.pb', predict_quantized)
    cc2.SaveModelPtxt(args.output_file + '/predict_net_int8.pbtxt', predict_quantized)
    cc2.SaveModelPtxt(args.output_file + '/init_net_int8.pbtxt', init_quantized)