Example #1
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')
    # Read IR
    log.info('Reading Mask-RCNN model {}'.format(args.mask_rcnn_model))
    mask_rcnn_net = ie.read_network(args.mask_rcnn_model)

    model_required_inputs = {'image'}
    if set(mask_rcnn_net.input_info) == model_required_inputs:
        required_output_keys = {'boxes', 'labels', 'masks', 'text_features.0'}
        n, c, h, w = mask_rcnn_net.input_info['image'].input_data.shape
        assert n == 1, 'Only batch 1 is supported by the demo application'
    else:
        raise RuntimeError(
            'Demo supports only topologies with the following input keys: '
            f'{model_required_inputs}.')
    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        f'Demo supports only topologies with the following output keys: {required_output_keys}. ' \
        f'Found: {mask_rcnn_net.outputs.keys()}.'

    log.info('Reading Text Recognition Encoder model {}'.format(
        args.text_enc_model))
    text_enc_net = ie.read_network(args.text_enc_model)

    log.info('Reading Text Recognition Decoder model {}'.format(
        args.text_dec_model))
    text_dec_net = ie.read_network(args.text_dec_model)

    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    log.info('The Mask-RCNN model {} is loaded to {}'.format(
        args.mask_rcnn_model, args.device))

    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    log.info('The Text Recognition Encoder model {} is loaded to {}'.format(
        args.text_enc_model, args.device))

    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)
    log.info('The Text Recognition Decoder model {} is loaded to {}'.format(
        args.text_dec_model, args.device))

    hidden_shape = text_dec_net.input_info[
        args.trd_input_prev_hidden].input_data.shape

    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    if args.no_track:
        tracker = None
    else:
        tracker = StaticIOUTracker()

    if args.delay:
        delay = args.delay
    else:
        # 1 ms for video/camera streams so playback keeps going; 0 for still
        # images so cv2.waitKey blocks until a key is pressed.
        delay = int(cap.get_type() in ('VIDEO', 'CAMERA'))

    visualizer = Visualizer(['__background__', 'text'],
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)

    frames_processed = 0

    metrics = PerformanceMetrics()
    video_writer = cv2.VideoWriter()

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    presenter = monitors.Presenter(args.utilization_monitors, 45,
                                   (frame.shape[1] // 4, frame.shape[0] // 8))
    if args.output and not video_writer.open(
            args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
        (frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    while frame is not None:
        if not args.keep_aspect_ratio:
            # Resize the image to a target size.
            scale_x = w / frame.shape[1]
            scale_y = h / frame.shape[0]
            input_image = cv2.resize(frame, (w, h))
        else:
            # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
            scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1])
            input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

        input_image_size = input_image.shape[:2]
        input_image = np.pad(input_image,
                             ((0, h - input_image_size[0]),
                              (0, w - input_image_size[1]), (0, 0)),
                             mode='constant',
                             constant_values=0)
        # Change data layout from HWC to CHW.
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
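        # reshape adds the batch axis (n == 1), turning the CHW data into the
        # NCHW layout the network input expects.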

        # Run the net.
        outputs = mask_rcnn_exec_net.infer({'image': input_image})

        # Parse detection results of the current request
        boxes = outputs['boxes'][:, :4]
        scores = outputs['boxes'][:, 4]
        classes = outputs['labels'].astype(np.uint32)
        raw_masks = outputs['masks']
        text_features = outputs['text_features.0']

        # Filter out detections with low confidence.
        detections_filter = scores > args.prob_threshold
        scores = scores[detections_filter]
        classes = classes[detections_filter]
        boxes = boxes[detections_filter]
        raw_masks = raw_masks[detections_filter]
        text_features = text_features[detections_filter]

        boxes[:, 0::2] /= scale_x
        boxes[:, 1::2] /= scale_y
        masks = []
        for box, cls, raw_mask in zip(boxes, classes, raw_masks):
            mask = segm_postprocess(box, raw_mask, frame.shape[0],
                                    frame.shape[1])
            masks.append(mask)

        texts = []
        for feature in text_features:
            feature = text_enc_exec_net.infer({'input': feature})['output']
            feature = np.reshape(feature,
                                 (feature.shape[0], feature.shape[1], -1))
            feature = np.transpose(feature, (0, 2, 1))

            hidden = np.zeros(hidden_shape)
            prev_symbol_index = np.ones((1, )) * SOS_INDEX

            text = ''
            text_confidence = 1.0
            for i in range(MAX_SEQ_LEN):
                decoder_output = text_dec_exec_net.infer({
                    args.trd_input_prev_symbol: prev_symbol_index,
                    args.trd_input_prev_hidden: hidden,
                    args.trd_input_encoder_outputs: feature,
                })
                symbols_distr = decoder_output[args.trd_output_symbols_distr]
                symbols_distr_softmaxed = softmax(symbols_distr, axis=1)[0]
                prev_symbol_index = int(np.argmax(symbols_distr, axis=1))
                text_confidence *= symbols_distr_softmaxed[prev_symbol_index]
                if prev_symbol_index == EOS_INDEX:
                    break
                text += args.alphabet[prev_symbol_index]
                hidden = decoder_output[args.trd_output_cur_hidden]

            texts.append(text if text_confidence >= args.tr_threshold else '')

        if len(boxes) and args.raw_output_message:
            log.debug(
                '  -------------------------- Frame # {} --------------------------  '
                .format(frames_processed))
            log.debug(
                '  Class ID | Confidence |     XMIN |     YMIN |     XMAX |     YMAX '
            )
            for box, cls, score, mask in zip(boxes, classes, scores, masks):
                log.debug(
                    '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                    .format(cls, score, *box))

        # Get instance track IDs.
        masks_tracks_ids = None
        if tracker is not None:
            masks_tracks_ids = tracker(masks, classes)

        presenter.drawGraphs(frame)

        # Visualize masks.
        frame = visualizer(frame, boxes, classes, scores, masks, texts,
                           masks_tracks_ids)
        metrics.update(start_time, frame)

        frames_processed += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        frames_processed <= args.output_limit):
            video_writer.write(frame)

        if not args.no_show:
            # Show the resulting image and handle keyboard input.
            cv2.imshow('Results', frame)
            key = cv2.waitKey(delay)
            esc_code = 27
            if key == esc_code:
                break
            presenter.handleKey(key)

        start_time = perf_counter()
        frame = cap.read()

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
    cv2.destroyAllWindows()
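
Note: softmax, segm_postprocess, open_images_capture, StaticIOUTracker and Visualizer come from the demo's helper modules and are not shown in these snippets. The softmax applied over the decoder's symbol axis can be reproduced with a minimal NumPy sketch (an assumption about that helper, not the demo's exact import, which is likely scipy.special.softmax):

import numpy as np

def softmax(x, axis=None):
    # Numerically stable softmax; mirrors the call softmax(symbols_distr, axis=1).
    shifted = x - np.max(x, axis=axis, keepdims=True)  # guard exp against overflow
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)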
Example #2
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    mask_rcnn_model_xml = args.mask_rcnn_model
    mask_rcnn_model_bin = os.path.splitext(mask_rcnn_model_xml)[0] + '.bin'

    text_enc_model_xml = args.text_enc_model
    text_enc_model_bin = os.path.splitext(text_enc_model_xml)[0] + '.bin'

    text_dec_model_xml = args.text_dec_model
    text_dec_model_bin = os.path.splitext(text_dec_model_xml)[0] + '.bin'

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('Creating Inference Engine...')
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')
    # Read IR
    log.info('Loading network files:\n\t{}\n\t{}'.format(
        mask_rcnn_model_xml, mask_rcnn_model_bin))
    mask_rcnn_net = IENetwork(model=mask_rcnn_model_xml,
                              weights=mask_rcnn_model_bin)

    log.info('Loading network files:\n\t{}\n\t{}'.format(
        text_enc_model_xml, text_enc_model_bin))
    text_enc_net = IENetwork(model=text_enc_model_xml,
                             weights=text_enc_model_bin)

    log.info('Loading network files:\n\t{}\n\t{}'.format(
        text_dec_model_xml, text_dec_model_bin))
    text_dec_net = IENetwork(model=text_dec_model_xml,
                             weights=text_dec_model_bin)

    if 'CPU' in args.device:
        supported_layers = ie.query_network(mask_rcnn_net, 'CPU')
        not_supported_layers = [
            layer for layer in mask_rcnn_net.layers.keys()
            if layer not in supported_layers
        ]
        if len(not_supported_layers) != 0:
            log.error(
                'Following layers are not supported by the plugin for specified device {}:\n {}'
                .format(args.device, ', '.join(not_supported_layers)))
            log.error(
                "Please try to specify cpu extensions library path in sample's command line parameters using -l "
                "or --cpu_extension command line argument")
            sys.exit(1)

    required_input_keys = {'im_data', 'im_info'}
    assert required_input_keys == set(mask_rcnn_net.inputs.keys()), \
        'Demo supports only topologies with the following input keys: {}'.format(', '.join(required_input_keys))
    required_output_keys = {
        'boxes', 'scores', 'classes', 'raw_masks', 'text_features'
    }
    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        'Demo supports only topologies with the following output keys: {}'.format(', '.join(required_output_keys))

    n, c, h, w = mask_rcnn_net.inputs['im_data'].shape
    assert n == 1, 'Only batch 1 is supported by the demo application'

    log.info('Loading IR to the plugin...')
    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)

    hidden_shape = text_dec_net.inputs[args.trd_input_prev_hidden].shape

    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    try:
        input_source = int(args.input_source)
    except ValueError:
        input_source = args.input_source

    if os.path.isdir(input_source):
        cap = FolderCapture(input_source)
    else:
        cap = cv2.VideoCapture(input_source)

    if not cap.isOpened():
        raise RuntimeError('Failed to open "{}"'.format(args.input_source))
    if isinstance(cap, cv2.VideoCapture):
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    if args.no_track:
        tracker = None
    else:
        tracker = StaticIOUTracker()

    visualizer = Visualizer(['__background__', 'text'],
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)

    render_time = 0

    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if not args.keep_aspect_ratio:
            # Resize the image to a target size.
            scale_x = w / frame.shape[1]
            scale_y = h / frame.shape[0]
            input_image = cv2.resize(frame, (w, h))
        else:
            # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
            scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1])
            input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

        input_image_size = input_image.shape[:2]
        input_image = np.pad(input_image,
                             ((0, h - input_image_size[0]),
                              (0, w - input_image_size[1]), (0, 0)),
                             mode='constant',
                             constant_values=0)
        # Change data layout from HWC to CHW.
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
        input_image_info = np.asarray(
            [[input_image_size[0], input_image_size[1], 1]], dtype=np.float32)
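        # im_info is [height, width, scale]: the unpadded input size plus a
        # scale factor of 1, matching the 'im_info' input of this topology.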

        # Run the net.
        inf_start = time.time()
        outputs = mask_rcnn_exec_net.infer({
            'im_data': input_image,
            'im_info': input_image_info
        })

        # Parse detection results of the current request
        boxes = outputs['boxes']
        scores = outputs['scores']
        classes = outputs['classes'].astype(np.uint32)
        raw_masks = outputs['raw_masks']
        text_features = outputs['text_features']

        # Filter out detections with low confidence.
        detections_filter = scores > args.prob_threshold
        scores = scores[detections_filter]
        classes = classes[detections_filter]
        boxes = boxes[detections_filter]
        raw_masks = raw_masks[detections_filter]
        text_features = text_features[detections_filter]

        boxes[:, 0::2] /= scale_x
        boxes[:, 1::2] /= scale_y
        masks = []
        for box, cls, raw_mask in zip(boxes, classes, raw_masks):
            raw_cls_mask = raw_mask[cls, ...]
            mask = segm_postprocess(box, raw_cls_mask, frame.shape[0],
                                    frame.shape[1])
            masks.append(mask)

        texts = []
        for feature in text_features:
            feature = text_enc_exec_net.infer({'input': feature})['output']
            feature = np.reshape(feature,
                                 (feature.shape[0], feature.shape[1], -1))
            feature = np.transpose(feature, (0, 2, 1))

            hidden = np.zeros(hidden_shape)
            prev_symbol_index = np.ones((1, )) * SOS_INDEX

            text = ''
            for i in range(MAX_SEQ_LEN):
                decoder_output = text_dec_exec_net.infer({
                    args.trd_input_prev_symbol: prev_symbol_index,
                    args.trd_input_prev_hidden: hidden,
                    args.trd_input_encoder_outputs: feature,
                })
                symbols_distr = decoder_output[args.trd_output_symbols_distr]
                prev_symbol_index = int(np.argmax(symbols_distr, axis=1))
                if prev_symbol_index == EOS_INDEX:
                    break
                text += args.alphabet[prev_symbol_index]
                hidden = decoder_output[args.trd_output_cur_hidden]

            texts.append(text)

        inf_end = time.time()
        inf_time = inf_end - inf_start

        render_start = time.time()

        if len(boxes) and args.raw_output_message:
            log.info('Detected boxes:')
            log.info(
                '  Class ID | Confidence |     XMIN |     YMIN |     XMAX |     YMAX '
            )
            for box, cls, score, mask in zip(boxes, classes, scores, masks):
                log.info(
                    '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                    .format(cls, score, *box))

        # Get instance track IDs.
        masks_tracks_ids = None
        if tracker is not None:
            masks_tracks_ids = tracker(masks, classes)

        # Visualize masks.
        frame = visualizer(frame, boxes, classes, scores, masks, texts,
                           masks_tracks_ids)

        # Draw performance stats.
        inf_time_message = 'Inference and post-processing time: {:.3f} ms'.format(
            inf_time * 1000)
        render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(
            render_time * 1000)
        cv2.putText(frame, inf_time_message, (15, 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
        cv2.putText(frame, render_time_message, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)

        # Print performance counters.
        if args.perf_counts:
            perf_counts = mask_rcnn_exec_net.requests[0].get_perf_counts()
            log.info('Performance counters:')
            print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                'name', 'layer_type', 'exec_type', 'status', 'real_time, us'))
            for layer, stats in perf_counts.items():
                print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                    layer, stats['layer_type'], stats['exec_type'],
                    stats['status'], stats['real_time']))

        if not args.no_show:
            # Show resulting image.
            cv2.imshow('Results', frame)
        render_end = time.time()
        render_time = render_end - render_start

        if not args.no_show:
            key = cv2.waitKey(args.delay)
            esc_code = 27
            if key == esc_code:
                break

    cv2.destroyAllWindows()
    cap.release()
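
Note: FolderCapture is defined in the demo's support code, not in this snippet. A minimal sketch of a stand-in, assuming only the read()/isOpened()/release() subset of the cv2.VideoCapture interface used above is needed (the class layout here is illustrative, not the demo's actual implementation):

import os
import cv2

class FolderCapture:
    # Hypothetical stand-in: iterates over image files in a directory,
    # mimicking the cv2.VideoCapture methods this demo relies on.
    IMAGE_EXTENSIONS = ('.bmp', '.jpg', '.jpeg', '.png')

    def __init__(self, folder):
        self.file_paths = sorted(
            os.path.join(folder, name) for name in os.listdir(folder)
            if name.lower().endswith(self.IMAGE_EXTENSIONS))
        self.position = 0

    def read(self):
        # Returns (ret, frame), like cv2.VideoCapture.read().
        if self.position >= len(self.file_paths):
            return False, None
        frame = cv2.imread(self.file_paths[self.position])
        self.position += 1
        return frame is not None, frame

    def isOpened(self):
        return self.position < len(self.file_paths)

    def release(self):
        self.file_paths = []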
Example #3
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s',
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()

    # Plugin initialization for specified device and load extensions library if specified.
    log.info('Creating Inference Engine...')
    ie = IECore()
    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')
    # Read IR
    log.info('Loading Mask-RCNN network')
    mask_rcnn_net = ie.read_network(
        args.mask_rcnn_model,
        os.path.splitext(args.mask_rcnn_model)[0] + '.bin')

    log.info('Loading encoder part of text recognition network')
    text_enc_net = ie.read_network(
        args.text_enc_model,
        os.path.splitext(args.text_enc_model)[0] + '.bin')

    log.info('Loading decoder part of text recognition network')
    text_dec_net = ie.read_network(
        args.text_dec_model,
        os.path.splitext(args.text_dec_model)[0] + '.bin')

    model_required_inputs = {'image'}
    old_model_required_inputs = {'im_data', 'im_info'}
    if set(mask_rcnn_net.input_info) == model_required_inputs:
        old_model = False
        required_output_keys = {'boxes', 'labels', 'masks', 'text_features.0'}
        n, c, h, w = mask_rcnn_net.input_info['image'].input_data.shape
    elif set(mask_rcnn_net.input_info) == old_model_required_inputs:
        old_model = True
        required_output_keys = {
            'boxes', 'scores', 'classes', 'raw_masks', 'text_features'
        }
        n, c, h, w = mask_rcnn_net.input_info['im_data'].input_data.shape
        args.alphabet = '  0123456789abcdefghijklmnopqrstuvwxyz'
        args.tr_threshold = 0
    else:
        raise RuntimeError(
            'Demo supports only topologies with the following input keys: '
            f'{model_required_inputs} or {old_model_required_inputs}.')

    assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \
        f'Demo supports only topologies with the following output keys: {required_output_keys}. ' \
        f'Found: {mask_rcnn_net.outputs.keys()}.'

    assert n == 1, 'Only batch 1 is supported by the demo application'

    log.info('Loading IR to the plugin...')
    mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net,
                                         device_name=args.device,
                                         num_requests=2)
    text_enc_exec_net = ie.load_network(network=text_enc_net,
                                        device_name=args.device)
    text_dec_exec_net = ie.load_network(network=text_dec_net,
                                        device_name=args.device)

    hidden_shape = text_dec_net.input_info[
        args.trd_input_prev_hidden].input_data.shape

    del mask_rcnn_net
    del text_enc_net
    del text_dec_net

    input_source = args.input_source
    if os.path.isdir(input_source):
        cap = FolderCapture(input_source)
    else:
        try:
            # Only the int() conversion can raise ValueError, so keep the
            # cv2 calls out of the try block.
            input_source = int(input_source)
        except ValueError:
            cap = cv2.VideoCapture(input_source)
        else:
            cap = cv2.VideoCapture(input_source)
            cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    if not cap.isOpened():
        raise RuntimeError('Failed to open "{}"'.format(input_source))

    ret, frame = cap.read()
    if not ret:
        raise RuntimeError("Can't read an image from the input")

    if args.no_track:
        tracker = None
    else:
        tracker = StaticIOUTracker()

    visualizer = Visualizer(['__background__', 'text'],
                            show_boxes=args.show_boxes,
                            show_scores=args.show_scores)

    render_time = 0

    presenter = monitors.Presenter(args.utilization_monitors, 45,
                                   (frame.shape[1] // 4, frame.shape[0] // 8))
    log.info('Starting inference...')
    print(
        "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key"
    )
    while ret:
        if not args.keep_aspect_ratio:
            # Resize the image to a target size.
            scale_x = w / frame.shape[1]
            scale_y = h / frame.shape[0]
            input_image = cv2.resize(frame, (w, h))
        else:
            # Resize the image to keep the same aspect ratio and to fit it to a window of a target size.
            scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1])
            input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y)

        input_image_size = input_image.shape[:2]
        input_image = np.pad(input_image,
                             ((0, h - input_image_size[0]),
                              (0, w - input_image_size[1]), (0, 0)),
                             mode='constant',
                             constant_values=0)
        # Change data layout from HWC to CHW.
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w)).astype(np.float32)
        input_image_info = np.asarray(
            [[input_image_size[0], input_image_size[1], 1]], dtype=np.float32)
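        # im_info ([height, width, scale=1]) is computed unconditionally, but
        # it is only consumed by the older 'im_data'/'im_info' topology below.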

        # Run the net.
        inf_start = time.time()
        if old_model:
            outputs = mask_rcnn_exec_net.infer({
                'im_data': input_image,
                'im_info': input_image_info
            })
        else:
            outputs = mask_rcnn_exec_net.infer({'image': input_image})

        # Parse detection results of the current request
        if old_model:
            boxes = outputs['boxes']
            scores = outputs['scores']
            classes = outputs['classes'].astype(np.uint32)
            raw_masks = outputs['raw_masks']
            text_features = outputs['text_features']
        else:
            boxes = outputs['boxes'][:, :4]
            scores = outputs['boxes'][:, 4]
            classes = outputs['labels'].astype(np.uint32)
            raw_masks = outputs['masks']
            text_features = outputs['text_features.0']

        # Filter out detections with low confidence.
        detections_filter = scores > args.prob_threshold
        scores = scores[detections_filter]
        classes = classes[detections_filter]
        boxes = boxes[detections_filter]
        raw_masks = raw_masks[detections_filter]
        text_features = text_features[detections_filter]

        boxes[:, 0::2] /= scale_x
        boxes[:, 1::2] /= scale_y
        masks = []
        for box, cls, raw_mask in zip(boxes, classes, raw_masks):
            if old_model:
                raw_mask = raw_mask[cls, ...]
            mask = segm_postprocess(box, raw_mask, frame.shape[0],
                                    frame.shape[1])
            masks.append(mask)

        texts = []
        for feature in text_features:
            feature = text_enc_exec_net.infer({'input': feature})['output']
            feature = np.reshape(feature,
                                 (feature.shape[0], feature.shape[1], -1))
            feature = np.transpose(feature, (0, 2, 1))

            hidden = np.zeros(hidden_shape)
            prev_symbol_index = np.ones((1, )) * SOS_INDEX

            text = ''
            text_confidence = 1.0
            for i in range(MAX_SEQ_LEN):
                decoder_output = text_dec_exec_net.infer({
                    args.trd_input_prev_symbol: prev_symbol_index,
                    args.trd_input_prev_hidden: hidden,
                    args.trd_input_encoder_outputs: feature,
                })
                symbols_distr = decoder_output[args.trd_output_symbols_distr]
                symbols_distr_softmaxed = softmax(symbols_distr, axis=1)[0]
                prev_symbol_index = int(np.argmax(symbols_distr, axis=1))
                text_confidence *= symbols_distr_softmaxed[prev_symbol_index]
                if prev_symbol_index == EOS_INDEX:
                    break
                text += args.alphabet[prev_symbol_index]
                hidden = decoder_output[args.trd_output_cur_hidden]

            texts.append(text if text_confidence >= args.tr_threshold else '')

        inf_end = time.time()
        inf_time = inf_end - inf_start

        render_start = time.time()

        if len(boxes) and args.raw_output_message:
            log.info('Detected boxes:')
            log.info(
                '  Class ID | Confidence |     XMIN |     YMIN |     XMAX |     YMAX '
            )
            for box, cls, score, mask in zip(boxes, classes, scores, masks):
                log.info(
                    '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '
                    .format(cls, score, *box))

        # Get instance track IDs.
        masks_tracks_ids = None
        if tracker is not None:
            masks_tracks_ids = tracker(masks, classes)

        presenter.drawGraphs(frame)

        # Visualize masks.
        frame = visualizer(frame, boxes, classes, scores, masks, texts,
                           masks_tracks_ids)

        # Draw performance stats.
        inf_time_message = 'Inference and post-processing time: {:.3f} ms'.format(
            inf_time * 1000)
        render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(
            render_time * 1000)
        cv2.putText(frame, inf_time_message, (15, 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
        cv2.putText(frame, render_time_message, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)

        # Print performance counters.
        if args.perf_counts:
            perf_counts = mask_rcnn_exec_net.requests[0].get_perf_counts()
            log.info('Performance counters:')
            print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                'name', 'layer_type', 'exec_type', 'status', 'real_time, us'))
            for layer, stats in perf_counts.items():
                print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(
                    layer, stats['layer_type'], stats['exec_type'],
                    stats['status'], stats['real_time']))

        if not args.no_show:
            # Show resulting image.
            cv2.imshow('Results', frame)
        render_end = time.time()
        render_time = render_end - render_start

        if not args.no_show:
            key = cv2.waitKey(args.delay)
            esc_code = 27
            if key == esc_code:
                break
            presenter.handleKey(key)

        ret, frame = cap.read()

    print(presenter.reportMeans())
    cv2.destroyAllWindows()
    cap.release()
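
Design note: all three examples decode text greedily, feeding the argmax symbol from each step back into the decoder as the previous symbol until EOS_INDEX or MAX_SEQ_LEN is reached. The same loop factored into a standalone helper, as a sketch (the function name decode_text is illustrative, not part of the demo):

def decode_text(text_dec_exec_net, feature, hidden_shape, args):
    # Greedy decoding, as in the loops above: feed the argmax symbol back
    # into the decoder until EOS or the maximum sequence length is hit.
    hidden = np.zeros(hidden_shape)
    prev_symbol_index = np.ones((1,)) * SOS_INDEX
    text = ''
    confidence = 1.0
    for _ in range(MAX_SEQ_LEN):
        decoder_output = text_dec_exec_net.infer({
            args.trd_input_prev_symbol: prev_symbol_index,
            args.trd_input_prev_hidden: hidden,
            args.trd_input_encoder_outputs: feature,
        })
        symbols_distr = decoder_output[args.trd_output_symbols_distr]
        prev_symbol_index = int(np.argmax(symbols_distr, axis=1))
        confidence *= softmax(symbols_distr, axis=1)[0][prev_symbol_index]
        if prev_symbol_index == EOS_INDEX:
            break
        text += args.alphabet[prev_symbol_index]
        hidden = decoder_output[args.trd_output_cur_hidden]
    return text, confidence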