Example 1: extracting rectified line polygons via a small CLI
def cli(format_type, model, repolygonize, files):
    """
    A small script extracting rectified line polygons as defined in either ALTO or
    PageXML files or run a model to do the same.
    """
    if len(files) == 0:
        ctx = click.get_current_context()
        click.echo(ctx.get_help())
        ctx.exit()

    from PIL import Image
    from os.path import splitext
    from kraken import blla
    from kraken.lib import dataset, segmentation, vgsl, xml

    if model is None:
        for doc in files:
            click.echo(f'Processing {doc} ', nl=False)
            data = xml.preparse_xml_data([doc], format_type, repolygonize=repolygonize)
            if len(data) > 0:
                bounds = {'type': 'baselines', 'lines': [{'boundary': t['boundary'], 'baseline': t['baseline'], 'text': t['text']} for t in data]}
                for idx, (im, box) in enumerate(segmentation.extract_polygons(Image.open(data[0]['image']), bounds)):
                    click.echo('.', nl=False)
                    im.save('{}.{}.jpg'.format(splitext(data[0]['image'])[0], idx))
                    with open('{}.{}.gt.txt'.format(splitext(data[0]['image'])[0], idx), 'w') as fp:
                        fp.write(box['text'])
    else:
        net = vgsl.TorchVGSLModel.load_model(model)
        for doc in files:
            click.echo(f'Processing {doc} ', nl=False)
            full_im = Image.open(doc)
            bounds = blla.segment(full_im, model=net)
            for idx, (im, box) in enumerate(segmentation.extract_polygons(full_im, bounds)):
                click.echo('.', nl=False)
                im.save('{}.{}.jpg'.format(splitext(doc)[0], idx))
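
For context, a minimal sketch of how this function might be wired up as a click command; the option names and defaults below are assumptions for illustration, not the project's actual CLI definition:

import click

@click.command()
@click.option('-f', '--format-type', default='xml',
              help='Input format of the files (e.g. alto or page).')  # assumed option
@click.option('-m', '--model', default=None,
              help='Optional path to a segmentation model.')          # assumed option
@click.option('--repolygonize/--no-repolygonize', default=False,
              help='Recompute line polygons from the baselines.')     # assumed option
@click.argument('files', nargs=-1)
def cli(format_type, model, repolygonize, files):
    ...  # body as in the example above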
Example 2: dataset __getitem__ with optional augmentation
    def __getitem__(self, index: int) -> Dict[str, Any]:
        if self.preload:
            x, y = self.training_set[index]
            if self.aug:
                # the augmenter expects an HWC numpy array; convert back to CHW after
                x = x.permute((1, 2, 0)).numpy()
                o = self.aug(image=x)
                x = torch.tensor(o['image'].transpose(2, 0, 1))
            return {'image': x, 'target': y}
        else:
            item = self.training_set[index]
            try:
                logger.debug(f'Attempting to load {item[0]}')
                im = item[0][0]
                if not isinstance(im, Image.Image):
                    im = Image.open(im)
                im, _ = next(extract_polygons(im, {'type': 'baselines', 'lines': [{'baseline': item[0][1], 'boundary': item[0][2]}]}))
                im = self.head_transforms(im)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
                if self.aug:
                    im = im.permute((1, 2, 0)).numpy()
                    o = self.aug(image=im)
                    im = torch.tensor(o['image'].transpose(2, 0, 1))
                return {'image': im, 'target': item[1]}
            except Exception:
                # pick a replacement sample and recurse on that same index
                idx = np.random.randint(0, len(self.training_set))
                logger.debug(traceback.format_exc())
                logger.info(f'Failed. Replacing with sample {idx}')
                return self[idx]
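
Both branches ultimately feed extract_polygons the same baseline-style bounds dictionary. A minimal standalone sketch, assuming a hypothetical page image page.png and made-up coordinates:

from PIL import Image
from kraken.lib.segmentation import extract_polygons

im = Image.open('page.png')  # hypothetical page image
bounds = {
    'type': 'baselines',
    'lines': [{
        'baseline': [[100, 210], [600, 210]],   # polyline under the text
        'boundary': [[90, 180], [610, 180],     # polygon enclosing the line
                     [610, 240], [90, 240]],
    }],
}
for line_im, line in extract_polygons(im, bounds):
    line_im.save('line_0.png')  # rectified crop of the line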
Example 3: extracting text lines from an XML record into PNG blobs
def _extract_line(xml_record):
    lines = []
    try:
        im = Image.open(xml_record['image'])
    except FileNotFoundError:
        return lines, None, None
    if is_bitonal(im):
        im = im.convert('1')
    line_counts = Counter({'all': 0, 'train': 0, 'validation': 0, 'test': 0})
    seg_key = 'lines' if 'lines' in xml_record else 'boxes'
    recs = xml_record.pop(seg_key)
    for idx, rec in enumerate(recs):
        try:
            line_im, line = next(
                extract_polygons(im, {
                    **xml_record, seg_key: [rec]
                }))
        except KrakenInputException:
            logger.warning(f'Invalid line {idx} in {xml_record["image"]}')
            continue
        if not line['text']:
            continue
        fp = io.BytesIO()
        line_im.save(fp, format='png')
        if line['split']:
            line_counts[line['split']] += 1
        else:
            line_counts['all'] += 1
        lines.append({'text': line['text'], 'im': fp.getvalue()})
    return lines, im.mode, line_counts
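
The PNG bytes stored in each record round-trip through PIL. A quick sketch, assuming lines is the list returned by _extract_line above:

import io
from PIL import Image

record = lines[0]  # one {'text': ..., 'im': ...} entry
line_im = Image.open(io.BytesIO(record['im']))
print(record['text'], line_im.size)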
Example 4: baseline line recognition (tag-based model selection)
    def _recognize_baseline_line(self, line):
        if self.tags_ignore is not None:
            for tag in line['lines'][0]['tags'].values():
                if tag in self.tags_ignore:
                    logger.info(
                        f'Ignoring line segment with tags {line["lines"][0]["tags"]} based on {tag}.'
                    )
                    return ocr_record('', [], [], line['lines'][0])

        try:
            box, coords = next(extract_polygons(self.im, line))
        except KrakenInputException as e:
            logger.warning(f'Extracting line failed: {e}')
            return ocr_record('', [], [], line['lines'][0])

        self.box = box

        tag, net = self._resolve_tags_to_model(coords['tags'], self.nets)
        # check if boxes are non-zero in any dimension
        if 0 in box.size:
            logger.warning(
                f'bbox {coords} with zero dimension. Emitting empty record.')
            return ocr_record('', [], [], coords)
        # try conversion into tensor
        try:
            line = self.ts[tag](box)
        except Exception:
            return ocr_record('', [], [], coords)
        # check if line is non-zero
        if line.max() == line.min():
            return ocr_record('', [], [], coords)

        preds = net.predict(line.unsqueeze(0))[0]
        # calculate recognized LSTM locations of characters
        # scale between network output and network input
        self.net_scale = line.shape[2] / net.outputs.shape[2]
        # scale between network input and original line
        self.in_scale = box.size[0] / (line.shape[2] - 2 * self.pad)

        # XXX: fix bounding box calculation in ocr_record for multi-codepoint labels.
        pred = ''.join(x[0] for x in preds)
        pos = []
        conf = []
        for _, start, end, c in preds:
            pos.append(
                compute_polygon_section(
                    coords['baseline'], coords['boundary'],
                    self._scale_val(start, 0, self.box.size[0]),
                    self._scale_val(end, 0, self.box.size[0])))
            conf.append(c)
        if self.bidi_reordering:
            logger.debug('BiDi reordering record.')
            return bidi_record(ocr_record(pred, pos, conf, coords),
                               base_dir=self.bidi_reordering
                               if self.bidi_reordering in ('L', 'R') else None)
        else:
            logger.debug('Emitting raw record')
            return ocr_record(pred, pos, conf, coords)
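
This variant calls self._scale_val without showing it. A plausible definition, inferred from the local _scale_val in Example 5 below but reading the instance attributes set above; an assumption, not verbatim source:

    def _scale_val(self, val, min_val, max_val):
        # map a network-output column back to a pixel offset in the original
        # line, clamped to [min_val, max_val - 1]
        return int(round(min(max(((val * self.net_scale) - self.pad) * self.in_scale,
                                 min_val), max_val - 1)))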
Example 5: baseline line recognition (script-based model selection)
    def _recognize_baseline_line(self, line):
        try:
            box, coords = next(extract_polygons(self.im, line))
        except KrakenInputException as e:
            logger.warning(f'Extracting line failed: {e}')
            return ocr_record('', [], [], line['lines'][0])

        script = coords['script']
        # check if boxes are non-zero in any dimension
        if 0 in box.size:
            logger.warning(
                'bbox {} with zero dimension. Emitting empty record.'.format(
                    coords))
            return ocr_record('', [], [], coords)
        # try conversion into tensor
        try:
            line = self.ts[script](box)
        except Exception:
            return ocr_record('', [], [], coords)
        # check if line is non-zero
        if line.max() == line.min():
            return ocr_record('', [], [], coords)

        preds = self.nets[script].predict(line.unsqueeze(0))[0]
        # calculate recognized LSTM locations of characters
        # scale between network output and network input
        net_scale = line.shape[2] / self.nets[script].outputs.shape[2]
        # scale between network input and original line
        in_scale = box.size[0] / (line.shape[2] - 2 * self.pad)

        def _scale_val(val, min_val, max_val):
            return int(
                round(
                    min(
                        max(((val * net_scale) - self.pad) * in_scale,
                            min_val), max_val - 1)))

        # XXX: fix bounding box calculation in ocr_record for multi-codepoint labels.
        pred = ''.join(x[0] for x in preds)
        pos = []
        conf = []
        for _, start, end, c in preds:
            pos.append(
                compute_polygon_section(coords['baseline'], coords['boundary'],
                                        _scale_val(start, 0, box.size[0]),
                                        _scale_val(end, 0, box.size[0])))
            conf.append(c)
        if self.bidi_reordering:
            logger.debug('BiDi reordering record.')
            rec = bidi_record(ocr_record(pred, pos, conf, coords))
            return rec
        else:
            logger.debug('Emitting raw record')
            return ocr_record(pred, pos, conf, coords)
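
To make the two scale factors concrete, a worked numeric sketch with made-up values:

pad = 16
line_width = 1200                 # box.size[0]: pixels in the original line
net_in = 2432                     # line.shape[2]: columns fed to the network
net_out = 608                     # outputs.shape[2]: columns the network emits
net_scale = net_in / net_out      # 4.0, output column -> input column
in_scale = line_width / (net_in - 2 * pad)  # 0.5, input column -> pixel
start = 100                       # a character starting at output column 100...
print(int(round(((start * net_scale) - pad) * in_scale)))  # ...sits at pixel 192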
Example 6: adding a line to a ground-truth dataset
    def add(self, image: Union[str, Image.Image], text: str,
            baseline: List[Tuple[int, int]], boundary: List[Tuple[int, int]],
            *args, **kwargs):
        """
        Adds a line to the dataset.

        Args:
            image (str or Image.Image): Path to the whole page image or the image itself.
            text (str): Transcription of the line.
            baseline (list): A list of coordinates [[x0, y0], ..., [xn, yn]].
            boundary (list): A polygon mask for the line.
        """
        for func in self.text_transforms:
            text = func(text)
        if not text:
            raise KrakenInputException(
                'Text line is empty after transformations')
        if not baseline:
            raise KrakenInputException('No baseline given for line')
        if not boundary:
            raise KrakenInputException('No boundary given for line')
        if self.preload:
            im = image if isinstance(image, Image.Image) else Image.open(image)
            try:
                im, _ = next(
                    extract_polygons(
                        im, {
                            'type': 'baselines',
                            'lines': [{
                                'baseline': baseline,
                                'boundary': boundary
                            }]
                        }))
            except IndexError:
                raise KrakenInputException(
                    'Patch extraction failed for baseline')
            try:
                im = self.head_transforms(im)
                if not is_bitonal(im):
                    self.im_mode = im.mode
                im = self.tail_transforms(im)
            except ValueError:
                raise KrakenInputException(
                    f'Image transforms failed on {image}')
            self._images.append(im)
        else:
            self._images.append((image, baseline, boundary))
        self._gt.append(text)
        self.alphabet.update(text)
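
A minimal usage sketch; the class name PolygonGTDataset is an inference from the add() signature (kraken's baseline dataset), and the path and coordinates are made up:

from kraken.lib.dataset import PolygonGTDataset

ds = PolygonGTDataset(preload=True)  # constructor arguments may differ by version
ds.add(image='page.png',             # hypothetical page image
       text='example transcription',
       baseline=[[100, 210], [600, 210]],
       boundary=[[90, 180], [610, 180], [610, 240], [90, 240]])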
Example 7: parsing a sample for parallelized dataset loading
    def parse(self, image: Union[str, Image.Image], text: str, baseline: List[Tuple[int, int]], boundary: List[Tuple[int, int]], *args, **kwargs):
        """
        Parses a sample for the dataset and returns it.

        This function is mainly used for parallelized loading of training data.

        Args:
            image (str or Image.Image): Path to the whole page image or the image itself.
            text (str): Transcription of the line.
            baseline (list): A list of coordinates [[x0, y0], ..., [xn, yn]].
            boundary (list): A polygon mask for the line.
        """
        for func in self.text_transforms:
            text = func(text)
        if not text:
            raise KrakenInputException('Text line is empty after transformations')
        if not baseline:
            raise KrakenInputException('No baseline given for line')
        if not boundary:
            raise KrakenInputException('No boundary given for line')
        if self.preload:
            im = image if isinstance(image, Image.Image) else Image.open(image)
            try:
                im, _ = next(extract_polygons(im, {'type': 'baselines', 'lines': [{'baseline': baseline, 'boundary': boundary}]}))
            except IndexError:
                raise KrakenInputException('Patch extraction failed for baseline')
            try:
                im = self.head_transforms(im)
                im = self.tail_transforms(im)
            except ValueError:
                raise KrakenInputException(f'Image transforms failed on {image}')
            self._images.append(im)
            return {'text': text, 'image': im, 'baseline': baseline, 'boundary': boundary, 'im_mode': im.mode, 'preload': True, 'preparse': True}
        else:
            return {'text': text, 'image': image, 'baseline': baseline, 'boundary': boundary, 'preload': False, 'preparse': True}
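
Since parse() exists for parallelized loading, one plausible driver (an assumption, not the project's actual training loop) fans samples out over a multiprocessing pool:

from multiprocessing import Pool

samples = [('page.png', 'example transcription',          # hypothetical sample
            [[100, 210], [600, 210]],
            [[90, 180], [610, 180], [610, 240], [90, 240]])]

# assumes the dataset object ds is picklable so the bound method can be shipped
with Pool(2) as pool:
    parsed = pool.starmap(ds.parse, samples)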
Example 8: box line recognition (script-based)
    def _recognize_box_line(self, line):
        flat_box = [point for box in line['boxes'][0] for point in box[1]]
        xmin, xmax = min(flat_box[::2]), max(flat_box[::2])
        ymin, ymax = min(flat_box[1::2]), max(flat_box[1::2])
        rec = ocr_record('', [], [], [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin]])
        for script, (box, coords) in zip(map(lambda x: x[0], line['boxes'][0]),
                                         extract_polygons(self.im, {'text_direction': line['text_direction'],
                                                                    'boxes': map(lambda x: x[1], line['boxes'][0])})):
            # skip if script is set to ignore
            if self.script_ignore is not None and script in self.script_ignore:
                logger.info('Ignoring {} line segment.'.format(script))
                continue
            # check if boxes are non-zero in any dimension
            if 0 in box.size:
                logger.warning('bbox {} with zero dimension. Emitting empty record.'.format(coords))
                continue
            # try conversion into tensor
            try:
                logger.debug('Preparing run.')
                line = self.ts[script](box)
            except Exception:
                logger.warning('Conversion of line {} failed. Skipping.'.format(coords))
                continue

            # check if line is non-zero
            if line.max() == line.min():
                logger.warning('Empty run. Skipping.')
                continue

            logger.debug('Forward pass with model {}'.format(script))
            preds = self.nets[script].predict(line.unsqueeze(0))[0]

            # calculate recognized LSTM locations of characters
            logger.debug('Convert to absolute coordinates')
            # scale between network output and network input
            net_scale = line.shape[2] / self.nets[script].outputs.shape[2]
            # scale between network input and original line
            in_scale = box.size[0] / (line.shape[2] - 2 * self.pad)

            def _scale_val(val, min_val, max_val):
                return int(round(min(max(((val * net_scale) - self.pad) * in_scale,
                                         min_val), max_val)))

            pred = ''.join(x[0] for x in preds)
            pos = []
            conf = []

            for _, start, end, c in preds:
                if self.bounds['text_direction'].startswith('horizontal'):
                    xmin = coords[0] + _scale_val(start, 0, box.size[0])
                    xmax = coords[0] + _scale_val(end, 0, box.size[0])
                    pos.append([[xmin, coords[1]], [xmin, coords[3]], [xmax, coords[3]], [xmax, coords[1]]])
                else:
                    ymin = coords[1] + _scale_val(start, 0, box.size[1])
                    ymax = coords[1] + _scale_val(end, 0, box.size[1])
                    pos.append([[coords[0], ymin], [coords[2], ymin], [coords[2], ymax], [coords[0], ymax]])
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if self.bidi_reordering:
            logger.debug('BiDi reordering record.')
            return bidi_record(rec)
        else:
            logger.debug('Emitting raw record')
            return rec
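
For legacy box segmentation, extract_polygons takes flat boxes instead of baselines. A standalone sketch with made-up coordinates:

from PIL import Image
from kraken.lib.segmentation import extract_polygons

im = Image.open('page.png')  # hypothetical page image
bounds = {
    'text_direction': 'horizontal-lr',
    'boxes': [(90, 180, 610, 240)],  # one (x0, y0, x1, y1) box per line
}
for line_im, coords in extract_polygons(im, bounds):
    print(coords, line_im.size)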
Example 9: box line recognition (tag-based)
    def _recognize_box_line(self, line):
        flat_box = [point for box in line['boxes'][0] for point in box[1]]
        xmin, xmax = min(flat_box[::2]), max(flat_box[::2])
        ymin, ymax = min(flat_box[1::2]), max(flat_box[1::2])
        rec = ocr_record(
            '', [], [],
            [[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin]])
        for tag, (box, coords) in zip(
                map(lambda x: x[0], line['boxes'][0]),
                extract_polygons(
                    self.im, {
                        'text_direction': line['text_direction'],
                        'boxes': map(lambda x: x[1], line['boxes'][0])
                    })):
            self.box = box
            # skip if tag is set to ignore
            if self.tags_ignore is not None and tag in self.tags_ignore:
                logger.warning(f'Ignoring {tag} line segment.')
                continue
            # check if boxes are non-zero in any dimension
            if 0 in box.size:
                logger.warning(
                    f'bbox {coords} with zero dimension. Emitting empty record.'
                )
                continue
            # try conversion into tensor
            try:
                logger.debug('Preparing run.')
                line = self.ts[tag](box)
            except Exception:
                logger.warning(
                    f'Conversion of line {coords} failed. Skipping.')
                continue

            # check if line is non-zero
            if line.max() == line.min():
                logger.warning('Empty run. Skipping.')
                continue

            _, net = self._resolve_tags_to_model({'type': tag}, self.nets)

            logger.debug(f'Forward pass with model {tag}.')
            preds = net.predict(line.unsqueeze(0))[0]

            # calculate recognized LSTM locations of characters
            logger.debug('Convert to absolute coordinates')
            # scale between network output and network input
            self.net_scale = line.shape[2] / net.outputs.shape[2]
            # scale between network input and original line
            self.in_scale = box.size[0] / (line.shape[2] - 2 * self.pad)

            pred = ''.join(x[0] for x in preds)
            pos = []
            conf = []

            for _, start, end, c in preds:
                if self.bounds['text_direction'].startswith('horizontal'):
                    xmin = coords[0] + self._scale_val(start, 0,
                                                       self.box.size[0])
                    xmax = coords[0] + self._scale_val(end, 0,
                                                       self.box.size[0])
                    pos.append([[xmin, coords[1]], [xmin, coords[3]],
                                [xmax, coords[3]], [xmax, coords[1]]])
                else:
                    ymin = coords[1] + self._scale_val(start, 0,
                                                       self.box.size[1])
                    ymax = coords[1] + self._scale_val(end, 0,
                                                       self.box.size[1])
                    pos.append([[coords[0], ymin], [coords[2], ymin],
                                [coords[2], ymax], [coords[0], ymax]])
                conf.append(c)
            rec.prediction += pred
            rec.cuts.extend(pos)
            rec.confidences.extend(conf)
        if self.bidi_reordering:
            logger.debug('BiDi reordering record.')
            return bidi_record(rec,
                               base_dir=self.bidi_reordering
                               if self.bidi_reordering in ('L', 'R') else None)
        else:
            logger.debug('Emitting raw record')
            return rec
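
Examples 8 and 9 differ mainly in API vintage: the per-line key was renamed from script to tag, _scale_val moved from a local closure to an instance method reading self.net_scale and self.in_scale, and bidi_record gained a base_dir argument. The underlying flow of extract_polygons, tensor conversion, predict, and coordinate rescaling is the same.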