Example #1
def deploy(model, prefix, data=None, size=224, batch_size=128, fp16=False, int8=False, reload=False, backend='trt', **kwargs):
    r"""
    Args:
        model(nn.Module): full ImageNet model that outputs logits
    """
    import logging
    import time
    from collections.abc import Iterable
    from ml import hub
    from ml import deploy as deployer
    if isinstance(size, int):
        H, W = size, size
    elif isinstance(size, Iterable):
        # A 1-tuple (s,) expands to (s, s); a 2-tuple is taken as (H, W).
        H, W = size * 2 if len(size) == 1 else size
    else:
        raise ValueError(f"Expected int or (H, W) iterable for size, got {size!r}")
    
    strict_type_constraints = kwargs.pop('strict_type_constraints', False)
    int8_calib_batch_size = kwargs.pop('int8_calib_batch_size', max(batch_size, 512 * 4))
    int8_calib_max_data = kwargs.pop('int8_calib_max_data', 5000 * 4)
    int8_calib_data_path = kwargs.pop('int8_calib_data_path', data)
    name = f"{prefix}-bs{batch_size}_{W}x{H}{fp16 and '-fp16' or ''}{int8 and '-int8' or ''}{strict_type_constraints and '-strict' or ''}"
    cache = f"ILSVRC2012-val-{int8_calib_batch_size}-{int8_calib_max_data}"
    int8_calib_cache_file = kwargs.pop('int8_calib_cache_file', f"{hub.get_dir()}/{cache}.cache") 
    workspace_size = kwargs.pop('workspace_size', 1 << 30)
    
    t = time.time()
    engine = deployer.build(name,
                            model,
                            [(3, -1, -1)],
                            backend=backend,
                            reload=reload,
                            batch_size=batch_size,
                            workspace_size=workspace_size,
                            dynamic_axes={'input_0': {0: 'batch_size', 2: 'height', 3: 'width'}},
                            min_shapes=[(3, 224, 224)],
                            max_shapes=[(3, H, W)],
                            fp16=fp16,
                            int8=int8,
                            strict_type_constraints=strict_type_constraints,
                            int8_calib_cache_file=int8_calib_cache_file,
                            int8_calib_batch_size=int8_calib_batch_size,
                            int8_calib_max_data=int8_calib_max_data,
                            int8_calib_data_path=int8_calib_data_path,
                            int8_calib_preprocess_func=None)
    logging.info(f"deployed {name} with backend={backend} in {time.time() - t:.3f}s")
    # Wrap the built engine together with the torch classifier head for end-to-end inference.
    imagenet = ImageNetTRT(engine, model.classifier)
    return imagenet
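
A minimal usage sketch for the helper above, assuming a torchvision classifier that exposes the `classifier` head the wrapper expects (the model choice and arguments are illustrative, not from the source):

import torchvision

# Hypothetical: deploy MobileNetV3 (which has a `classifier` attribute) as an
# FP16 TensorRT engine with a max batch size of 32.
model = torchvision.models.mobilenet_v3_large(pretrained=True).eval()
imagenet = deploy(model, 'mobilenet_v3', size=(224, 224), batch_size=32, fp16=True)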
Example #2
def test_deploy_onnx(benchmark, backbone_x101_32x8d_wsl, dev, batch, B):
    engine = deploy.build('resnext101_32x8d_wsl',
                          backbone_x101_32x8d_wsl, [batch.shape[1:]],
                          backend='onnx',
                          reload=True)

    outputs = benchmark(engine.predict, batch[:B])
    spatial_feats, scene_feats = outputs[-2][:B], outputs[-1][:B]
    assert spatial_feats.shape == (B, 2048, 23, 40)
    assert scene_feats.shape == (B, 2048)
    with th.no_grad():
        torch_outputs = backbone_x101_32x8d_wsl(batch[:B].to(dev))
    for i, (torch_output, output) in enumerate(zip(torch_outputs, outputs)):
        np.testing.assert_allclose(torch_output.cpu().numpy(),
                                   output,
                                   rtol=1e-03,
                                   atol=3e-04)
        th.testing.assert_allclose(torch_output,
                                   th.from_numpy(output).to(dev),
                                   rtol=1e-03,
                                   atol=3e-04)
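
Here `benchmark` comes from the pytest-benchmark plugin, while `backbone_x101_32x8d_wsl`, `batch`, `dev`, and `B` are fixtures defined elsewhere in the suite (module-level imports such as `torch as th`, `numpy as np`, and the suite's `deploy` are likewise assumed). A hedged sketch of what the simpler fixtures might look like:

import pytest
import torch as th

@pytest.fixture
def dev():
    # The ONNX/TensorRT comparisons assume a CUDA-capable device.
    return th.device('cuda' if th.cuda.is_available() else 'cpu')

@pytest.fixture
def batch():
    # Dummy NCHW frames; 736x1280 matches the asserted (B, 2048, 23, 40)
    # spatial features at the backbone's x32 stride.
    return th.rand(2, 3, 736, 1280)

@pytest.fixture
def B():
    # Number of frames actually fed to the engine.
    return 2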
Example #3
def test_deploy_onnx(benchmark, batch, detector, dev, B):
    module = detector.module
    module.model[-1].export = True
    engine = deploy.build('yolo5x',
                          detector, [batch.shape[1:]],
                          backend='onnx',
                          reload=True)

    outputs = benchmark(engine.predict, batch[:B])
    # The first three outputs are the per-scale detection predictions; the rest are feature maps.
    meta_preds, features = outputs[0:3], outputs[3:]
    with th.no_grad():
        torch_meta_preds, torch_features = detector(batch[:B].to(dev))

    for torch_preds, preds in zip(torch_meta_preds, meta_preds):
        np.testing.assert_allclose(torch_preds.cpu().numpy(),
                                   preds,
                                   rtol=1e-03,
                                   atol=3e-04)
        th.testing.assert_allclose(torch_preds,
                                   th.from_numpy(preds).to(dev),
                                   rtol=1e-03,
                                   atol=3e-04)
    for torch_feats, feats in zip(torch_features, features):
        np.testing.assert_allclose(torch_feats.cpu().numpy(),
                                   feats,
                                   rtol=1e-03,
                                   atol=3e-04)
        th.testing.assert_allclose(torch_feats,
                                   th.from_numpy(feats).to(dev),
                                   rtol=1e-03,
                                   atol=3e-04)
Example #4
    def deploy(self,
               name='yolo5x',
               batch_size=10,
               spec=(3, 640, 640),
               fp16=True,
               backend='trt',
               reload=False,
               **kwargs):
        r"""Deploy optimized runtime backend.
        Args:
            batch_size(int): max batch size
            spec(Tuple[int]): preprocessed frame shape which must be fixed through the batch
            amp(bool): mixed precision with FP16
            kwargs:
                dynamix_axes: dynamic axes for each input ==> {'input_0': {0: 'batch_size', 2: 'height'}}
                min_shapes: min input shapes ==> [(3, 320, 640)]
                max_shapes: max input shapes ==> [(3, 640, 640)]
        """
        from ml import deploy
        module = self.module
        # avoids warning for dynamic ifs
        module.model[-1].onnx_dynamic = True
        # FIXME: workaround for invalid values with different batch size in tensorrt
        # tensorrt output is not consistent with in place operations
        module.model[-1].inplace = False
        int8 = kwargs.get('int8', False)
        strict = kwargs.get('strict', False)
        if int8:
            import logging
            import os
            from pathlib import Path
            from ml import hub
            from ml.vision.datasets.coco import download

            def preprocessor(size=(384, 640)):
                from PIL import Image
                from torchvision import transforms
                trans = transforms.Compose(
                    [transforms.Resize(size),
                     transforms.ToTensor()])

                H, W = size

                def preprocess(image_path, *shape):
                    r'''Preprocess one image for TensorRT INT8 calibration.
                    Args:
                        image_path(str): path to image
                        shape: target shape (unused; the Resize transform fixes the size)
                    '''
                    image = Image.open(image_path)
                    logging.debug(
                        f"image.size={image.size}, mode={image.mode}")
                    image = image.convert('RGB')
                    C = len(image.mode)
                    im = trans(image)
                    assert im.shape == (C, H, W)
                    return im

                return preprocess

            int8_calib_max = kwargs.get('int8_calib_max', 5000)
            int8_calib_batch_size = kwargs.get('int8_calib_batch_size',
                                               max(batch_size, 64))
            cache = f'{name}-COCO2017-val-{int8_calib_max}-{int8_calib_batch_size}.cache'
            cache_path = Path(os.path.join(hub.get_dir(), cache))
            kwargs['int8_calib_cache'] = str(cache_path)
            kwargs['int8_calib_data'] = download(split='val2017', reload=False)
            kwargs['int8_calib_preprocess_func'] = preprocessor()
            kwargs['int8_calib_max'] = int8_calib_max
            kwargs['int8_calib_batch_size'] = int8_calib_batch_size

        device = next(self.module.parameters()).device
        # FIXME: cuda + onnx_dynamic: causes the onnx export to fail: https://github.com/ultralytics/yolov5/issues/5439
        self.to('cpu')
        self.engine = deploy.build(
            f"{name}-bs{batch_size}_{spec[-2]}x{spec[-1]}{fp16 and '_fp16' or ''}{int8 and '_int8' or ''}{strict and '_strict' or ''}",
            self, [spec],
            backend=backend,
            reload=reload,
            batch_size=batch_size,
            fp16=fp16,
            strict_type_constraints=strict,
            **kwargs)
        self.to(device)
        # TODO: avoid storing dummy modules to keep track of module device
        self.dummy = module.model[-1]
        del self.module
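
A hedged call sketch for the method above, wiring in the kwargs its docstring documents (shapes follow the docstring's own examples; `detector` stands for an instance of the surrounding class):

# Build a TensorRT engine with dynamic batch and height axes; width stays fixed.
detector.deploy(name='yolo5x',
                batch_size=10,
                spec=(3, -1, 640),  # -1 marks the dynamic height dimension
                fp16=True,
                backend='trt',
                dynamic_axes={'input_0': {0: 'batch_size', 2: 'height'}},
                min_shapes=[(3, 320, 640)],
                max_shapes=[(3, 640, 640)])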
Example #5
def test_deploy_trt(benchmark, batch, detector, dev, B, fp16, int8, strict,
                    name):
    # FIXME: pytorch CUDA initialization must happen ahead of pycuda
    import torchvision.transforms.functional as TF  # assumed source of the TF alias below
    module = detector.module
    module.model[-1].export = True
    batch = TF.resize(batch, (384, 640)).float()
    h, w = batch.shape[2:]
    kwargs = {}
    if int8:
        import os
        from pathlib import Path
        from ml import hub
        from ml.vision.datasets.coco import download

        def preprocessor(size=(384, 640)):
            from PIL import Image
            from torchvision import transforms
            trans = transforms.Compose(
                [transforms.Resize(size),
                 transforms.ToTensor()])

            H, W = size

            def preprocess(image_path, *shape):
                r'''Preprocess one image for TensorRT INT8 calibration.
                Args:
                    image_path(str): path to image
                    shape: target shape (unused; the Resize transform fixes the size)
                '''
                image = Image.open(image_path)
                logging.debug(f"image.size={image.size}, mode={image.mode}")
                image = image.convert('RGB')
                C = len(image.mode)
                im = trans(image)
                assert im.shape == (C, H, W)
                return im

            return preprocess

        int8_calib_max = 5000
        int8_calib_batch_size = 64
        cache = f'{name}-COCO2017-val-{int8_calib_max}-{int8_calib_batch_size}.cache'
        cache_path = Path(os.path.join(hub.get_dir(), cache))
        kwargs['int8_calib_cache'] = str(cache_path)
        kwargs['int8_calib_data'] = download(split='val2017', reload=False)
        kwargs['int8_calib_preprocess_func'] = preprocessor()
        kwargs['int8_calib_max'] = int8_calib_max
        kwargs['int8_calib_batch_size'] = int8_calib_batch_size

    engine = deploy.build(
        f"yolo5x-bs{B}_{h}x{w}{fp16 and '_fp16' or ''}{int8 and '_int8' or ''}",
        detector, [batch.shape[1:]],
        backend='trt',
        reload=False,
        batch_size=B,
        fp16=fp16,
        int8=int8,
        strict_type_constraints=strict,
        **kwargs)

    preds, *features = benchmark(engine.predict, batch[:B].to(dev), sync=True)
    assert len(features) == 3
    with th.no_grad():
        with th.cuda.amp.autocast(enabled=fp16):
            torch_preds, torch_features = detector(batch[:B].to(dev))
    logging.info(
        f"outputs trt norm={preds.norm().item()}, torch norm={torch_preds.norm().item()}"
    )
    if fp16 or int8:
        # Reduced-precision outputs drift too far for a tight tolerance check.
        pass
        # th.testing.assert_allclose(torch_preds.float(), preds.float(), rtol=2e-02, atol=4e-02)
    else:
        th.testing.assert_allclose(torch_preds.float(),
                                   preds.float(),
                                   rtol=1e-03,
                                   atol=4e-04)
        for torch_feats, feats in zip(torch_features, features):
            th.testing.assert_allclose(torch_feats.float(),
                                       feats.float(),
                                       rtol=1e-03,
                                       atol=4e-04)
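
The `fp16`, `int8`, `strict`, and `name` arguments are presumably parametrized fixtures from the suite's conftest; one hedged way to provide them (an assumption, not the source's actual configuration):

import pytest

@pytest.fixture(params=[False, True])
def fp16(request):
    return request.param

@pytest.fixture(params=[False, True])
def int8(request):
    return request.param

@pytest.fixture(params=[False, True])
def strict(request):
    return request.param

@pytest.fixture
def name():
    return 'yolo5x'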
Example #6
def test_deploy_trt(benchmark, batch, backbone_x101_32x8d_wsl, dev, B, fp16,
                    int8, strict, min_inp_size, max_inp_size):
    from ml import hub
    # dynamic/static input
    minH, minW = min_inp_size
    maxH, maxW = max_inp_size
    min_shapes = [(3, minH, minW)]
    max_shapes = [(3, maxH, maxW)]
    spec = [[3, minH, minW]]
    dynamic_axes = {'input_0': {0: 'batch_size'}}
    if maxH != minH:
        spec[0][1] = -1
        dynamic_axes['input_0'][2] = 'height'
    if maxW != minW:
        spec[0][2] = -1
        dynamic_axes['input_0'][3] = 'width'

    # INT8 calibration configs
    int8_calib_data = 'data/ILSVRC2012/val'
    int8_calib_max = 5000 * 4
    int8_calib_batch_size = max(B, 512 * 4)
    prefix = 'x101_32x8d_wsl'
    name = f"{prefix}-bs{B}_{maxW}x{maxH}{fp16 and '_fp16' or ''}{int8 and '_int8' or ''}{strict and '_strict' or ''}"
    cache = f"ILSVRC2012-val-{int8_calib_batch_size}-{int8_calib_max}"
    int8_calib_cache = f"{hub.get_dir()}/{cache}.cache"
    engine = deploy.build(name,
                          backbone_x101_32x8d_wsl,
                          spec=spec,
                          backend='trt',
                          reload=False,
                          batch_size=B,
                          dynamic_axes=dynamic_axes,
                          min_shapes=min_shapes,
                          max_shapes=max_shapes,
                          fp16=fp16,
                          int8=int8,
                          strict_type_constraints=strict,
                          int8_calib_cache=int8_calib_cache,
                          int8_calib_data=int8_calib_data,
                          int8_calib_max=int8_calib_max,
                          int8_calib_batch_size=int8_calib_batch_size)

    outputs = benchmark(engine.predict, batch[:B].to(dev), sync=True)
    spatial_feats, scene_feats = outputs[-2], outputs[-1]
    assert len(outputs) == 5
    # spatial_feats shape varies with the (dynamic) input size, so it is not asserted
    assert scene_feats.shape == (B, 2048)
    with th.no_grad():
        with th.cuda.amp.autocast(enabled=fp16):
            torch_outputs = backbone_x101_32x8d_wsl(batch[:B].to(dev))
    for i, (torch_output, output) in enumerate(zip(torch_outputs, outputs)):
        logging.info(
            f"output[{i}] shape={tuple(output.shape)}, trt norm={output.norm()}, torch norm={torch_output.norm()}"
        )
        if int8:
            # INT8 quantization error dominates, with or without FP16.
            th.testing.assert_allclose(torch_output, output, rtol=15.2, atol=15.2)
        elif fp16:
            th.testing.assert_allclose(torch_output, output, rtol=1.9, atol=1.9)
        else:
            th.testing.assert_allclose(torch_output, output, rtol=1e-03, atol=3e-04)
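
Because the engine above is built with an optimization profile spanning min_shapes to max_shapes, the same engine should serve any spatial size in that range; a hedged follow-up (illustrative, not part of the original test):

# Hypothetical extra inference at an intermediate size inside the profile.
mid_batch = th.rand(B, 3, (minH + maxH) // 2, (minW + maxW) // 2).to(dev)
mid_outputs = engine.predict(mid_batch)
assert mid_outputs[-1].shape == (B, 2048)  # scene features are size-invariant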