Example #1
from typing import Tuple, Union

from PIL import Image
from torchvision import transforms

from kraken.lib.exceptions import KrakenInputException
from kraken.lib.lineest import CenterNormalizer, dewarp


def generate_input_transforms(batch: int, height: int, width: int, channels: int, pad: int) -> transforms.Compose:
    """
    Generates a torchvision transformation converting a PIL.Image into a
    tensor usable in a network forward pass.

    Args:
        batch (int): mini-batch size
        height (int): height of input image in pixels
        width (int): width of input image in pixels
        channels (int): color channels of input
        pad (int): Amount of padding on horizontal ends of image

    Returns:
        A torchvision transformation composition converting the input image to
        the appropriate tensor.
    """
    scale = 0  # type: Union[Tuple[int, int], int]
    if height == 1 and width == 0 and channels > 3:
        perm = (1, 0, 2)
        scale = channels
        mode = 'L'
    # arbitrary (or fixed) height and width and channels 1 or 3 => needs a
    # summarizing network (or a not yet implemented scale operation) to move
    # height to the channel dimension.
    elif height > 1 and width == 0 and channels in (1, 3):
        perm = (0, 1, 2)
        scale = height
        mode = 'RGB' if channels == 3 else 'L'
    # fixed height and width image => bicubic scaling of the input image, disable padding
    elif height > 0 and width > 0 and channels in (1, 3):
        perm = (0, 1, 2)
        pad = 0
        scale = (height, width)
        mode = 'RGB' if channels == 3 else 'L'
    elif height == 0 and width == 0 and channels in (1, 3):
        perm = (0, 1, 2)
        pad = 0
        scale = 0
        mode = 'RGB' if channels == 3 else 'L'
    else:
        raise KrakenInputException('Invalid input spec (variable height and fixed width not supported)')

    out_transforms = []
    out_transforms.append(transforms.Lambda(lambda x: x.convert(mode)))
    if scale:
        if isinstance(scale, int):
            if mode not in ['1', 'L']:
                raise KrakenInputException('Invalid mode {} for line dewarping'.format(mode))
            lnorm = CenterNormalizer(scale)
            out_transforms.append(transforms.Lambda(lambda x: dewarp(lnorm, x)))
            out_transforms.append(transforms.Lambda(lambda x: x.convert(mode)))
        elif isinstance(scale, tuple):
            out_transforms.append(transforms.Resize(scale, Image.LANCZOS))
    if pad:
        out_transforms.append(transforms.Pad((pad, 0), fill=255))
    out_transforms.append(transforms.ToTensor())
    # invert
    out_transforms.append(transforms.Lambda(lambda x: x.max() - x))
    out_transforms.append(transforms.Lambda(lambda x: x.permute(*perm)))
    return transforms.Compose(out_transforms)
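
A minimal usage sketch for the composition above. The spec values and the
file name 'line.png' are assumptions for illustration only:

from PIL import Image

# Hypothetical spec: fixed line height of 48px, variable width, grayscale,
# 16px of horizontal padding (hits the height > 1 / width == 0 branch).
tf = generate_input_transforms(batch=1, height=48, width=0, channels=1, pad=16)
im = Image.open('line.png')  # hypothetical line image
tensor = tf(im)              # inverted float tensor, permuted per the spec
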
Example #2
    def __init__(self,
                 images=None,
                 split=lambda x: os.path.splitext(x)[0],
                 suffix='.gt.txt',
                 normalization=None,
                 reorder=True,
                 partition=0.9,
                 pad=16):
        """
        Reads a list of image-text pairs and creates a ground truth set.

        Args:
            images (list): List of file paths of line images
            split (func): Function for generating the base name without
                          extensions from paths
            suffix (str): Suffix to attach to image base name for text
                          retrieval
            normalization (str): Unicode normalization for gt
            reorder (bool): Whether to rearrange code points in "display"/LTR
                            order
            partition (float): Ground truth data partition ratio between
                               train/test set.
            pad (int): Padding to add to the left and right of each image
        """
        self.lnorm = CenterNormalizer()
        self.training_set = []
        self.test_set = []
        self.alphabet = set()

        if not images:
            return
        for line in images:
            self.add(line, split, suffix, normalization, reorder, pad)

        self.repartition(partition)

        self.alphabet = sorted(set(''.join(t for _, t in self.training_set)))
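
A hypothetical construction sketch. The snippet only shows the __init__, so
the class name GroundTruthContainer and the file paths are assumptions:

import glob

gt = GroundTruthContainer(images=glob.glob('train/*.png'),  # assumed paths
                          normalization='NFD',  # normalize gt text to NFD
                          partition=0.9,
                          pad=16)
print(len(gt.training_set), len(gt.test_set))  # 90/10 split after repartition
print(''.join(gt.alphabet))                    # sorted training alphabet
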
Example #3
def generate_input_transforms(batch: int,
                              height: int,
                              width: int,
                              channels: int,
                              pad: int,
                              valid_norm: bool = True,
                              force_binarization=False) -> transforms.Compose:
    """
    Generates a torchvision transformation converting a PIL.Image into a
    tensor usable in a network forward pass.

    Args:
        batch (int): mini-batch size
        height (int): height of input image in pixels
        width (int): width of input image in pixels
        channels (int): color channels of input
        pad (int): Amount of padding on horizontal ends of image
        valid_norm (bool): Enables/disables baseline normalization as a valid
                           preprocessing step. If disabled we will fall back to
                           standard scaling.
        force_binarization (bool): Forces binarization of input images using
                                   the nlbin algorithm.

    Returns:
        A torchvision transformation composition converting the input image to
        the appropriate tensor.
    """
    scale = (height, width)  # type: Tuple[int, int]
    center_norm = False
    mode = 'RGB' if channels == 3 else 'L'
    if height == 1 and width == 0 and channels > 3:
        perm = (1, 0, 2)
        scale = (channels, 0)
        if valid_norm:
            center_norm = True
        mode = 'L'
    elif height > 1 and width == 0 and channels in (1, 3):
        perm = (0, 1, 2)
        if valid_norm and channels == 1:
            center_norm = True
    elif height == 0 and width > 1 and channels in (1, 3):
        perm = (0, 1, 2)
    # fixed height and width image => bicubic scaling of the input image, disable padding
    elif height > 0 and width > 0 and channels in (1, 3):
        perm = (0, 1, 2)
        pad = 0
    elif height == 0 and width == 0 and channels in (1, 3):
        perm = (0, 1, 2)
        pad = 0
    else:
        raise KrakenInputException(
            'Invalid input spec {}, {}, {}, {}, {}'.format(
                batch, height, width, channels, pad))
    if mode != 'L' and force_binarization:
        raise KrakenInputException(
            'Invalid input spec {}, {}, {}, {} in'
            ' combination with forced binarization.'.format(
                batch, height, width, channels))

    out_transforms = []
    out_transforms.append(transforms.Lambda(lambda x: x.convert(mode)))

    if force_binarization:
        out_transforms.append(transforms.Lambda(lambda x: nlbin(x)))
    # dummy transforms to ensure we can determine color mode of input material
    # from first two transforms. It's stupid but it works.
    out_transforms.append(transforms.Lambda(lambda x: x))
    if scale != (0, 0):
        if center_norm:
            lnorm = CenterNormalizer(scale[0])
            out_transforms.append(
                transforms.Lambda(lambda x: dewarp(lnorm, x)))
            out_transforms.append(transforms.Lambda(lambda x: x.convert(mode)))
        else:
            out_transforms.append(
                transforms.Lambda(
                    lambda x: _fixed_resize(x, scale, Image.LANCZOS)))
    if pad:
        out_transforms.append(transforms.Pad((pad, 0), fill=255))
    out_transforms.append(transforms.ToTensor())
    # invert
    out_transforms.append(transforms.Lambda(lambda x: x.max() - x))
    out_transforms.append(transforms.Lambda(lambda x: x.permute(*perm)))
    return transforms.Compose(out_transforms)
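
A sketch of the extended interface; note that force_binarization is only
valid for single-channel ('L') input per the guard above:

# Variable-width grayscale lines normalized to height 48; nlbin binarization
# runs before scaling, and valid_norm permits centerline dewarping.
tf = generate_input_transforms(batch=1, height=48, width=0, channels=1,
                               pad=16, valid_norm=True,
                               force_binarization=True)
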
Example #4
File: rpred.py Project: asgundogdu/kraken
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available, one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """

    lnorm = getattr(network, 'lnorm', CenterNormalizer())

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        line = lstm.prepare_line(line, pad)
        pred = network.predictString(line)

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                pos.append((coords[0] + int(
                    (start - pad) * scale), coords[1], coords[0] + int(
                        (end - pad / 2) * scale), coords[3]))
            else:
                pos.append((coords[0], coords[1] + int(
                    (start - pad) * scale), coords[2], coords[1] + int(
                        (end - pad / 2) * scale)))
            conf.append(c)
        if bidi_reordering:
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
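
A minimal driver sketch. Here `network` stands for an already-loaded
SegRecognizer; the page image and box coordinates are assumptions:

from PIL import Image

im = Image.open('page.png')  # hypothetical page scan
bounds = {'text_direction': 'horizontal-lr',
          'boxes': [(100, 200, 1500, 260)]}  # one line as (x0, y0, x1, y1)
for record in rpred(network, im, bounds, pad=16):
    print(record.prediction)  # recognized text of the line
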
Example #5
File: rpred.py Project: asgundogdu/kraken
def mm_rpred(nets,
             im,
             bounds,
             pad=16,
             line_normalization=True,
             bidi_reordering=True):
    """
    Multi-model version of kraken.rpred.rpred.

    Takes a dictionary of ISO15924 script identifiers->models and a
    script-annotated segmentation to dynamically select appropriate models
    for these lines.

    Args:
        nets (dict): A dict mapping ISO15924 identifiers to SegRecognizer
                     objects. Recommended to be a defaultdict.
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list
                       of lists of coordinates (script, (x0, y0, x1, y1)) of
                       a text line in the image and an entry 'text_direction'
                       containing 'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available, one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    for line in bounds['boxes']:
        rec = ocr_record('', [], [])
        for script, (box, coords) in zip(
                map(lambda x: x[0], line),
                extract_boxes(
                    im, {
                        'text_direction': bounds['text_direction'],
                        'boxes': map(lambda x: x[1], line)
                    })):
            # check if boxes are non-zero in any dimension
            if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
                continue
            raw_line = pil2array(box)
            # check if line is non-zero
            if np.amax(raw_line) == np.amin(raw_line):
                continue
            if line_normalization:
                # fail gracefully and return no recognition result in case the
                # input line can not be normalized.
                try:
                    lnorm = getattr(nets[script], 'lnorm', CenterNormalizer())
                    box = dewarp(lnorm, box)
                except Exception:
                    continue
            line = pil2array(box)
            line = lstm.prepare_line(line, pad)
            pred = nets[script].predictString(line)
            # calculate recognized LSTM locations of characters
            scale = len(raw_line.T) / (len(nets[script].outputs) - 2 * pad)
            result = lstm.translate_back_locations(nets[script].outputs)
            pos = []
            conf = []

            for _, start, end, c in result:
                if bounds['text_direction'].startswith('horizontal'):
                    pos.append((coords[0] + int(
                        (start - pad) * scale), coords[1], coords[0] + int(
                            (end - pad / 2) * scale), coords[3]))
                else:
                    pos.append((coords[0], coords[1] + int(
                        (start - pad) * scale), coords[2], coords[1] + int(
                            (end - pad / 2) * scale)))
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if bidi_reordering:
            yield bidi_record(rec)
        else:
            yield rec
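
A sketch of the script-annotated bounds layout consumed by mm_rpred. The
models latin_net and arabic_net are hypothetical SegRecognizer instances,
and `im` is a page image as in the Example #4 sketch:

from collections import defaultdict

# defaultdict supplies a fallback recognizer for unannotated scripts, as the
# docstring recommends.
nets = defaultdict(lambda: latin_net)
nets['Arab'] = arabic_net
bounds = {'text_direction': 'horizontal-lr',
          'boxes': [[('Latn', (100, 200, 700, 260)),
                     ('Arab', (700, 200, 1500, 260))]]}  # one line, two runs
for record in mm_rpred(nets, im, bounds):
    print(record.prediction)
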
Example #6
File: rpred.py Project: ronysteel/kraken
def rpred(network,
          im,
          bounds,
          pad=16,
          line_normalization=True,
          bidi_reordering=True):
    """
    Uses an RNN to recognize text

    Args:
        network (kraken.lib.lstm.SegRecognizer): A SegRecognizer object
        im (PIL.Image): Image to extract text from
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the image
                       and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        pad (int): Extra blank padding to the left and right of text line
        line_normalization (bool): Dewarp line using the line estimator
                                   contained in the network. If no normalizer
                                   is available, one using the default
                                   parameters is created. Be aware that you
                                   may have to scale lines manually to the
                                   target line height if disabled.
        bidi_reordering (bool): Reorder classes in the ocr_record according to
                                the Unicode bidirectional algorithm for correct
                                display.
    Yields:
        An ocr_record containing the recognized text, absolute character
        positions, and confidence values for each character.
    """
    im_str = get_im_str(im)
    logger.info(u'Running recognizer on {} with {} lines'.format(
        im_str, len(bounds['boxes'])))
    logger.debug(u'Loading line normalizer')
    lnorm = getattr(network, 'lnorm', CenterNormalizer())
    if not is_bitonal(im):
        logger.info(u'Image is grayscale. Adjusting normalizer parameters')
        lnorm.range = 2

    for box, coords in extract_boxes(im, bounds):
        # check if boxes are non-zero in any dimension
        if sum(coords[::2]) == 0 or coords[3] - coords[1] == 0:
            logger.warning(
                u'bbox {} with zero dimension. Emitting empty record.'.format(
                    coords))
            yield ocr_record('', [], [])
            continue
        raw_line = pil2array(box)
        # check if line is non-zero
        if np.amax(raw_line) == np.amin(raw_line):
            logger.warning(
                u'Empty line {}. Emitting empty record.'.format(coords))
            yield ocr_record('', [], [])
            continue
        if line_normalization:
            # fail gracefully and return no recognition result in case the
            # input line can not be normalized.
            try:
                box = dewarp(lnorm, box)
            except Exception:
                logger.warning(
                    u'Dewarping for bbox {} failed. Emitting empty record.'.
                    format(coords))
                yield ocr_record('', [], [])
                continue
        line = pil2array(box)
        logger.debug(u'Preparing line.')
        line = lstm.prepare_line(line, pad)
        logger.debug(u'Performing forward pass.')
        pred = network.predictString(line)
        logger.info(u'Prediction: {}'.format(pred))

        # calculate recognized LSTM locations of characters
        scale = len(raw_line.T) / (len(network.outputs) - 2 * pad)
        logger.debug(u'Extracting labels.')
        result = lstm.translate_back_locations(network.outputs)
        pos = []
        conf = []

        for _, start, end, c in result:
            if bounds['text_direction'].startswith('horizontal'):
                xmin = coords[0] + int(max((start - pad) * scale, 0))
                xmax = coords[0] + max(
                    int(min((end - pad) * scale, coords[2] - coords[0])), 1)
                pos.append((xmin, coords[1], xmax, coords[3]))
            else:
                ymin = coords[1] + int(max((start - pad) * scale, 0))
                ymax = coords[1] + max(
                    int(min((end - pad) * scale, coords[3] - coords[1])), 1)
                pos.append((coords[0], ymin, coords[2], ymax))
            conf.append(c)
        if bidi_reordering:
            logger.debug(u'BiDi reordering record.')
            yield bidi_record(ocr_record(pred, pos, conf))
        else:
            yield ocr_record(pred, pos, conf)
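
This variant differs from Example #4 mainly in its logging and in clamping
character positions to the line bounding box. A sketch of surfacing the
diagnostics, reusing network/im/bounds from the Example #4 sketch (the
logger name 'kraken' is an assumption based on the package name):

import logging

logging.basicConfig(format='%(levelname)s %(message)s')
logging.getLogger('kraken').setLevel(logging.DEBUG)  # assumed logger name
for record in rpred(network, im, bounds):
    print(record.prediction)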