def deploy(model, prefix, data=None, size=224, batch_size=128, fp16=False, int8=False, reload=False, backend='trt', **kwargs):
    r"""Build an inference engine for a full ImageNet model and wrap it as `ImageNetTRT`.

    Args:
        model(nn.Module): full ImageNet model that outputs logits
        prefix(str): leading part of the engine name; batch size, resolution and
            precision suffixes are appended to it
        data: default value of `int8_calib_data_path`
        size(int | Iterable[int]): upper bound of the input resolution, either a
            single int for a square input or `(H, W)`; a one-element sequence
            is treated as square
        batch_size(int): forwarded to the builder and encoded in the engine name
        fp16(bool): forwarded to the builder; adds `-fp16` to the engine name
        int8(bool): forwarded to the builder; adds `-int8` to the engine name
        reload(bool): forwarded to the builder
        backend(str): backend handed to the builder and reported in the log
    Kwargs:
        strict_type_constraints(bool): default False; adds `-strict` to the engine name
        workspace_size(int): default `1 << 30`
        int8_calib_batch_size(int): default `max(batch_size, 512 * 4)`
        int8_calib_max_data(int): default `5000 * 4`
        int8_calib_data_path: default `data`
        int8_calib_cache_file(str): default is a `.cache` file under `hub.get_dir()`
        Any other keyword argument is accepted but not forwarded to the builder.
    Returns:
        `ImageNetTRT` constructed from the built engine and `model.classifier`
    Raises:
        TypeError: if `size` is neither an int nor an iterable
        ValueError: if `size` is an iterable without exactly one or two elements
    """
    from collections.abc import Iterable
    from ml import hub
    from ml import deploy as deployer

    if isinstance(size, int):
        H = W = size
    elif isinstance(size, Iterable):
        dims = tuple(size)
        if len(dims) == 1:
            dims *= 2
        if len(dims) != 2:
            raise ValueError(f"size must have one or two elements, got {len(dims)}")
        H, W = dims
    else:
        # Previously fell through and failed later with an unbound H/W.
        raise TypeError(f"size must be an int or an iterable of ints, got {type(size).__name__}")

    strict_type_constraints = kwargs.pop('strict_type_constraints', False)
    int8_calib_batch_size = kwargs.pop('int8_calib_batch_size', max(batch_size, 512 * 4))
    int8_calib_max_data = kwargs.pop('int8_calib_max_data', 5000 * 4)
    int8_calib_data_path = kwargs.pop('int8_calib_data_path', data)

    # NOTE: the engine name encodes the resolution as width x height.
    fp16_tag = '-fp16' if fp16 else ''
    int8_tag = '-int8' if int8 else ''
    strict_tag = '-strict' if strict_type_constraints else ''
    name = f"{prefix}-bs{batch_size}_{W}x{H}{fp16_tag}{int8_tag}{strict_tag}"

    cache = f"ILSVRC2012-val-{int8_calib_batch_size}-{int8_calib_max_data}"
    int8_calib_cache_file = kwargs.pop('int8_calib_cache_file', f"{hub.get_dir()}/{cache}.cache")
    workspace_size = kwargs.pop('workspace_size', 1 << 30)

    t = time.time()
    engine = deployer.build(
        name,
        model,
        [(3, -1, -1)],
        # Honor the requested backend instead of always building with 'trt',
        # so the build matches what is logged below.
        backend=backend,
        reload=reload,
        batch_size=batch_size,
        workspace_size=workspace_size,
        dynamic_axes={'input_0': {0: 'batch_size', 2: 'height', 3: 'width'}},
        min_shapes=[(3, 224, 224)],
        max_shapes=[(3, H, W)],
        fp16=fp16,
        int8=int8,
        strict_type_constraints=strict_type_constraints,
        int8_calib_cache_file=int8_calib_cache_file,
        int8_calib_batch_size=int8_calib_batch_size,
        int8_calib_max_data=int8_calib_max_data,
        int8_calib_data_path=int8_calib_data_path,
        int8_calib_preprocess_func=None,
    )
    logging.info(f"deployed {name} with backend={backend} in {time.time() - t:.3f}s")
    return ImageNetTRT(engine, model.classifier)
def test_deploy_onnx(benchmark, backbone_x101_32x8d_wsl, dev, batch, B):
    """Check that the ONNX build of the backbone matches its PyTorch outputs.

    Builds the engine with `backend='onnx'`, benchmarks `engine.predict` on the
    first `B` samples, verifies the shapes of the last two outputs and then
    compares every output against the PyTorch model within a fixed tolerance.
    """
    model = backbone_x101_32x8d_wsl
    tolerance = dict(rtol=1e-03, atol=3e-04)

    engine = deploy.build(
        'resnext101_32x8d_wsl',
        model,
        [batch.shape[1:]],
        backend='onnx',
        reload=True,
    )
    outputs = benchmark(engine.predict, batch[:B])

    spatial_feats = outputs[-2][:B]
    scene_feats = outputs[-1][:B]
    assert spatial_feats.shape == (B, 2048, 23, 40)
    assert scene_feats.shape == (B, 2048)

    with th.no_grad():
        torch_outputs = model(batch[:B].to(dev))

    for expected, actual in zip(torch_outputs, outputs):
        # Compare once on the host as numpy arrays and once on `dev` as tensors.
        np.testing.assert_allclose(expected.cpu().numpy(), actual, **tolerance)
        th.testing.assert_allclose(expected, th.from_numpy(actual).to(dev), **tolerance)
def test_deploy_onnx(benchmark, batch, detector, dev, B):
    """Check that the ONNX build of the detector matches its PyTorch outputs.

    The first three engine outputs are compared with the detector's predictions
    and the remaining ones with its features, all within a fixed tolerance.
    """
    tolerance = dict(rtol=1e-03, atol=3e-04)

    def assert_matches(expected, actual):
        # Compare once on the host as numpy arrays and once on `dev` as tensors.
        np.testing.assert_allclose(expected.cpu().numpy(), actual, **tolerance)
        th.testing.assert_allclose(expected, th.from_numpy(actual).to(dev), **tolerance)

    # Switch the last layer of the wrapped model into export mode before building.
    detector.module.model[-1].export = True

    engine = deploy.build(
        'yolo5x',
        detector,
        [batch.shape[1:]],
        backend='onnx',
        reload=True,
    )
    outputs = benchmark(engine.predict, batch[:B])
    meta_preds = outputs[0:3]
    features = outputs[3:]

    with th.no_grad():
        torch_meta_preds, torch_features = detector(batch[:B].to(dev))

    for expected, actual in zip(torch_meta_preds, meta_preds):
        assert_matches(expected, actual)
    for expected, actual in zip(torch_features, features):
        assert_matches(expected, actual)
def deploy(self, name='yolo5x', batch_size=10, spec=(3, 640, 640), fp16=True, backend='trt', reload=False, **kwargs):
    r"""Deploy optimized runtime backend.

    The built engine is stored in `self.engine`. On success the last layer of
    the wrapped model is kept as `self.dummy` and `self.module` is deleted.

    Args:
        name(str): prefix of the engine name; batch size, resolution and
            precision suffixes are appended to it
        batch_size(int): max batch size
        spec(Tuple[int]): preprocessed frame shape which must be fixed through the batch
        fp16(bool): mixed precision with FP16
        backend(str): backend handed to the builder
        reload(bool): forwarded to the builder
        kwargs:
            int8(bool): enable INT8; calibration settings are filled in when set
            strict(bool): forwarded to the builder as `strict_type_constraints`
            int8_calib_max(int): default 5000, only read when `int8` is set
            int8_calib_batch_size(int): default `max(batch_size, 64)`, only read when `int8` is set
            dynamic_axes: dynamic axes for each input ==> {'input_0': {0: 'batch_size', 2: 'height'}}
            min_shapes: min input shapes ==> [(3, 320, 640)]
            max_shapes: max input shapes ==> [(3, 640, 640)]
    """
    from ml import deploy

    module = self.module
    head = module.model[-1]
    # avoids warning for dynamic ifs
    head.onnx_dynamic = True
    # FIXME: workaround for invalid values with different batch size in tensorrt
    # tensorrt output is not consistent with in place operations
    head.inplace = False

    int8 = kwargs.get('int8', False)
    strict = kwargs.get('strict', False)
    if int8:
        from ml import hub
        from ml.vision.datasets.coco import download

        def preprocessor(size=(384, 640)):
            """Return a function that loads an image as a `(C, H, W)` tensor resized to `size`."""
            from PIL import Image
            from torchvision import transforms

            trans = transforms.Compose([transforms.Resize(size), transforms.ToTensor()])
            H, W = size

            def preprocess(image_path, *shape):
                r'''Preprocessing for TensorRT calibration

                Args:
                    image_path(str): path to image
                    shape: extra positional arguments, accepted but unused
                '''
                image = Image.open(image_path)
                logging.debug(f"image.size={image.size}, mode={image.mode}")
                image = image.convert('RGB')
                C = len(image.mode)
                im = trans(image)
                assert im.shape == (C, H, W)
                return im

            return preprocess

        int8_calib_max = kwargs.get('int8_calib_max', 5000)
        int8_calib_batch_size = kwargs.get('int8_calib_batch_size', max(batch_size, 64))
        cache = f'{name}-COCO2017-val-{int8_calib_max}-{int8_calib_batch_size}.cache'
        cache_path = Path(os.path.join(hub.get_dir(), cache))
        kwargs['int8_calib_cache'] = str(cache_path)
        kwargs['int8_calib_data'] = download(split='val2017', reload=False)
        kwargs['int8_calib_preprocess_func'] = preprocessor()
        kwargs['int8_calib_max'] = int8_calib_max
        kwargs['int8_calib_batch_size'] = int8_calib_batch_size

    fp16_tag = '_fp16' if fp16 else ''
    int8_tag = '_int8' if int8 else ''
    strict_tag = '_strict' if strict else ''
    engine_name = f"{name}-bs{batch_size}_{spec[-2]}x{spec[-1]}{fp16_tag}{int8_tag}{strict_tag}"

    device = next(module.parameters()).device
    # FIXME: cuda + onnx_dynamic: causes the onnx export to fail: https://github.com/ultralytics/yolov5/issues/5439
    self.to('cpu')
    try:
        self.engine = deploy.build(
            engine_name,
            self,
            [spec],
            backend=backend,
            reload=reload,
            batch_size=batch_size,
            fp16=fp16,
            strict_type_constraints=strict,
            **kwargs)
    finally:
        # Move back to the original device even when the build fails, so a
        # failed deployment does not leave the model stranded on the CPU.
        self.to(device)

    # TODO: avoid storing dummy modules to keep track of module device
    self.dummy = head
    del self.module
def test_deploy_trt(benchmark, batch, detector, dev, B, fp16, int8, strict, name):
    """Check that the TensorRT build of the detector matches its PyTorch outputs.

    The batch is resized to 384x640 before building. With `int8` set, the
    calibration cache, data and preprocessing function are passed to the
    builder. Outputs are compared numerically only at full precision.
    """
    # FIXME pytorch cuda initialization must be ahead of pycuda
    detector.module.model[-1].export = True

    batch = TF.resize(batch, (384, 640)).float()
    h, w = batch.shape[2:]

    kwargs = {}
    if int8:
        import os
        from pathlib import Path
        from ml import hub
        from ml.vision.datasets.coco import download

        def preprocessor(size=(384, 640)):
            """Return a function that loads an image as a `(C, H, W)` tensor resized to `size`."""
            from PIL import Image
            from torchvision import transforms

            height, width = size
            pipeline = transforms.Compose([
                transforms.Resize(size),
                transforms.ToTensor(),
            ])

            def preprocess(image_path, *shape):
                """Load `image_path` as RGB for TensorRT calibration; `shape` is unused."""
                image = Image.open(image_path)
                logging.debug(f"image.size={image.size}, mode={image.mode}")
                image = image.convert('RGB')
                channels = len(image.mode)
                tensor = pipeline(image)
                assert tensor.shape == (channels, height, width)
                return tensor

            return preprocess

        int8_calib_max = 5000
        int8_calib_batch_size = 64
        cache = f'{name}-COCO2017-val-{int8_calib_max}-{int8_calib_batch_size}.cache'
        cache_path = Path(os.path.join(hub.get_dir(), cache))
        kwargs['int8_calib_cache'] = str(cache_path)
        kwargs['int8_calib_data'] = download(split='val2017', reload=False)
        kwargs['int8_calib_preprocess_func'] = preprocessor()
        kwargs['int8_calib_max'] = int8_calib_max
        kwargs['int8_calib_batch_size'] = int8_calib_batch_size

    fp16_tag = '_fp16' if fp16 else ''
    int8_tag = '_int8' if int8 else ''
    engine = deploy.build(
        f"yolo5x-bs{B}_{h}x{w}{fp16_tag}{int8_tag}",
        detector,
        [batch.shape[1:]],
        backend='trt',
        reload=False,
        batch_size=B,
        fp16=fp16,
        int8=int8,
        strict_type_constraints=strict,
        **kwargs)

    preds, *features = benchmark(engine.predict, batch[:B].to(dev), sync=True)
    assert len(features) == 3

    with th.no_grad(), th.cuda.amp.autocast(enabled=fp16):
        torch_preds, torch_features = detector(batch[:B].to(dev))

    logging.info(
        f"outputs trt norm={preds.norm().item()}, torch norm={torch_preds.norm().item()}"
    )

    # Reduced precision is only logged above; a looser comparison
    # (rtol=2e-02, atol=4e-02) was disabled in the original.
    if not (fp16 or int8):
        tolerance = dict(rtol=1e-03, atol=4e-04)
        th.testing.assert_allclose(torch_preds.float(), preds.float(), **tolerance)
        # NOTE(review): the feature comparison is kept in the full-precision
        # branch; confirm this matches the intended nesting.
        for expected, actual in zip(torch_features, features):
            th.testing.assert_allclose(expected.float(), actual.float(), **tolerance)
def test_deploy_trt(benchmark, batch, backbone_x101_32x8d_wsl, dev, B, fp16, int8, strict, min_inp_size, max_inp_size):
    """Check that the TensorRT build of the backbone matches its PyTorch outputs.

    Height and width become dynamic axes whenever the min and max input sizes
    differ. The comparison tolerance is chosen from the precision flags.
    """
    from ml import hub

    model = backbone_x101_32x8d_wsl

    # dynamic/static input
    minH, minW = min_inp_size
    maxH, maxW = max_inp_size
    dynamic_height = maxH != minH
    dynamic_width = maxW != minW

    spec = [[3, -1 if dynamic_height else minH, -1 if dynamic_width else minW]]
    input_axes = {0: 'batch_size'}
    if dynamic_height:
        input_axes[2] = 'height'
    if dynamic_width:
        input_axes[3] = 'width'
    dynamic_axes = {'input_0': input_axes}

    # int 8 configs
    int8_calib_data = 'data/ILSVRC2012/val'
    int8_calib_max = 5000 * 4
    int8_calib_batch_size = max(B, 512 * 4)

    prefix = 'x101_32x8d_wsl'
    fp16_tag = '_fp16' if fp16 else ''
    int8_tag = '_int8' if int8 else ''
    strict_tag = '_strict' if strict else ''
    name = f"{prefix}-bs{B}_{maxW}x{maxH}{fp16_tag}{int8_tag}{strict_tag}"
    cache = f"ILSVRC2012-val-{int8_calib_batch_size}-{int8_calib_max}"
    int8_calib_cache = f"{hub.get_dir()}/{cache}.cache"

    engine = deploy.build(
        name,
        model,
        spec=spec,
        backend='trt',
        reload=False,
        batch_size=B,
        dynamic_axes=dynamic_axes,
        min_shapes=[(3, minH, minW)],
        max_shapes=[(3, maxH, maxW)],
        fp16=fp16,
        int8=int8,
        strict_type_constraints=strict,
        int8_calib_cache=int8_calib_cache,
        int8_calib_data=int8_calib_data,
        int8_calib_max=int8_calib_max,
        int8_calib_batch_size=int8_calib_batch_size)

    outputs = benchmark(engine.predict, batch[:B].to(dev), sync=True)
    spatial_feats, scene_feats = outputs[-2], outputs[-1]
    assert len(outputs) == 5
    assert scene_feats.shape == (B, 2048)

    with th.no_grad(), th.cuda.amp.autocast(enabled=fp16):
        torch_outputs = model(batch[:B].to(dev))

    # INT8 uses the loosest tolerance regardless of fp16; FP16 alone is tighter.
    if int8:
        rtol = atol = 15.2
    elif fp16:
        rtol = atol = 1.9
    else:
        rtol, atol = 1e-03, 3e-04

    for i, (torch_output, output) in enumerate(zip(torch_outputs, outputs)):
        logging.info(
            f"output[{i}] shape={tuple(output.shape)}, trt norm={output.norm()}, torch norm={torch_output.norm()}"
        )
        th.testing.assert_allclose(torch_output, output, rtol=rtol, atol=atol)