def add_page(self, im, segmentation=None, records=None):
    """
    Adds an image to the transcription interface, optionally filling in
    information from a list of ocr_record objects.

    Args:
        im (PIL.Image): Input image. It is serialized as PNG and embedded
                        into the page as a base64 data URI.
        segmentation (list): Line bounding boxes, each indexable as
                             (x0, y0, x1, y1). Only consulted when
                             `records` is empty or None.
        records (list): A list of ocr_record objects. Each record's
                        `prediction` is split on whitespace into segments
                        and `cuts` is sliced by character offset to get
                        the bounding box of each segment.

    Raises:
        KrakenInputException if neither records nor a segmentation is
        given. The check runs before any counter is advanced, so a failed
        call leaves the interface state untouched.
    """
    # Validate first: otherwise page_idx would be consumed (and the image
    # encoded) for a page that is never appended.
    if not records and not segmentation:
        raise KrakenInputException('Neither segmentations nor records given')

    def _line_entry(bbox, recognition=None):
        # Geometry is expressed in percent of the image size. All four
        # coordinates are truncated to int so that left/top/width/height
        # agree with the 'bbox' string emitted alongside them.
        x0, y0, x1, y1 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        entry = {'index': self.line_idx}
        if recognition is not None:
            entry['recognition'] = recognition
        entry['left'] = 100 * x0 / im.size[0]
        entry['top'] = 100 * y0 / im.size[1]
        entry['width'] = 100 * (x1 - x0) / im.size[0]
        entry['height'] = 100 * (y1 - y0) / im.size[1]
        entry['bbox'] = '{}, {}, {}, {}'.format(x0, y0, x1, y1)
        return entry

    page = {}
    with BytesIO() as fd:
        im.save(fd, format='png', optimize=True)
        encoded = base64.b64encode(fd.getvalue()).decode('ascii')
    page['index'] = self.page_idx
    self.page_idx += 1
    page['img'] = 'data:image/png;base64,' + encoded
    page['lines'] = []

    if records:
        for record in records:
            # The capturing group keeps the whitespace runs, so even
            # indices are text segments and odd indices the separators.
            splits = regex.split(r'(\s+)', record.prediction)
            line_bbox = max_bbox(record.cuts)
            line_offset = 0
            segments = []
            for segment, whitespace in zip_longest(splits[0::2], splits[1::2]):
                if segment:
                    seg_bbox = max_bbox(record.cuts[line_offset:line_offset + len(segment)])
                    segments.append({'bbox': '{}, {}, {}, {}'.format(*seg_bbox),
                                     'text': segment,
                                     'index': self.seg_idx})
                    self.seg_idx += 1
                    line_offset += len(segment)
                # whitespace is None for the final segment (zip_longest fill)
                if whitespace:
                    line_offset += len(whitespace)
            page['lines'].append(_line_entry(line_bbox, segments))
            self.line_idx += 1
    else:
        for bbox in segmentation:
            page['lines'].append(_line_entry(bbox))
            self.line_idx += 1

    self.pages.append(page)
def detect_scripts(im, bounds, model=None):
    """
    Detects scripts in a segmented page.

    Classifies lines returned by the page segmenter into runs of
    scripts/writing systems.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the
                       image and an entry 'text_direction' containing
                       'horizontal-tb/vertical-lr/rl'. It is handed to
                       `rpred` unchanged; only 'text_direction' is read
                       here.
        model (str): Location of the script classification model or None
                     (any falsy value) for the packaged 'script.clstm'.

    Returns:
        {'text_direction': '$dir', 'boxes': [[(script, (x1, y1, x2, y2)),...]]}:
        A dictionary containing the text direction and a list of lists of
        reading order sorted bounding boxes under the key 'boxes' with each
        list containing the script segmentation of a single line. Script is
        a ISO15924 4 character identifier.

    Raises:
        Nothing is raised directly in this function; exceptions from
        `models.load_clstm` and `rpred` propagate. The original
        documentation lists KrakenInputException (image not binarized or
        invalid text direction) and KrakenInvalidModelException (no clstm
        module available).
    """
    if not model:
        model = pkg_resources.resource_filename(__name__, 'script.clstm')
    rnn = models.load_clstm(model)
    # load numerical to 4 char identifier map
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)

    # Labels that are replaced by the neighbouring run ("inherited scripts"
    # in the original comment).
    # NOTE(review): the original listed this same code point twice; if a
    # second, different label was intended it has to be added here.
    markers = (u'\U000f03e6',)

    def _fill_from_previous(chars):
        # Replace every marker label by the label emitted just before it.
        # Markers at the very start have no predecessor and are kept.
        out = []
        for c in chars:
            if out and c in markers:
                out.append(out[-1])
            else:
                out.append(c)
        return out

    preds = []
    for pred in rpred(rnn, im, bounds):
        # substitute inherited scripts with neighboring runs
        forward = _fill_from_previous(pred.prediction)
        # do a reverse run to fix leading inherited scripts
        backward = _fill_from_previous(reversed(forward))
        pred.prediction = u''.join(reversed(backward))
        # group by grapheme
        t = []
        for k, g in groupby(pred, key=lambda x: x[0]):
            # convert to ISO15924 numerical identifier
            k = ord(k) - 0xF0000
            b = max_bbox(x[1] for x in g)
            t.append((n2s[str(k)], b))
        preds.append(t)
    return {'boxes': preds, 'text_direction': bounds['text_direction']}
def detect_scripts(im, bounds, model=pkg_resources.resource_filename(
        __name__, 'script.mlmodel'), valid_scripts=None):
    """
    Detects scripts in a segmented page.

    NOTE: this function is currently disabled. Its first statement
    unconditionally raises NotImplementedError, so none of the processing
    described below is executed. The implementation is kept underneath the
    raise so it can be re-enabled by removing that statement.

    Classifies lines returned by the page segmenter into runs of
    scripts/writing systems.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the
                       image and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        model (str): Location of the script classification model. Defaults
                     to the packaged 'script.mlmodel' resource; the default
                     path is resolved once, when this definition is
                     evaluated. None is not translated to the default.
        valid_scripts (list): List of valid scripts (ISO15924 identifiers
                              as found in the values of 'iso15924.json').

    Returns:
        {'script_detection': True, 'text_direction': '$dir', 'boxes':
        [[(script, (x1, y1, x2, y2)),...]]}: A dictionary containing the text
        direction and a list of lists of reading order sorted bounding boxes
        under the key 'boxes' with each list containing the script
        segmentation of a single line. Script is a ISO15924 4 character
        identifier. (Only once the function is re-enabled.)

    Raises:
        NotImplementedError: always, for as long as the function is
                             disabled.
    """
    raise NotImplementedError(
        'Temporarily unavailable. Please open a github ticket if you want this fixed sooner.'
    )

    # ------------------------------------------------------------------
    # Unreachable while the raise above is in place. Retained as the
    # implementation to restore.
    # ------------------------------------------------------------------
    im_str = get_im_str(im)
    logger.info(u'Detecting scripts with {} in {} lines on {}'.format(
        model, len(bounds['boxes']), im_str))
    logger.debug(u'Loading detection model {}'.format(model))
    rnn = models.load_any(model)
    # load numerical to 4 char identifier map
    logger.debug(u'Loading label to identifier map')
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)
    # convert allowed scripts to labels
    val_scripts = []
    if valid_scripts:
        logger.debug(
            u'Converting allowed scripts list {}'.format(valid_scripts))
        val_scripts = [chr(int(k) + 0xF0000)
                       for k, v in n2s.items() if v in valid_scripts]
    else:
        # normalise None so the membership tests below work
        valid_scripts = []

    def _subs(m, s, r=False):
        # Replace labels by the label emitted just before them. With
        # r=False the labels *in* m are replaced, with r=True the labels
        # *not in* m are. A label without predecessor is always kept.
        out = []
        for c in s:
            if out and (c in m) != r:
                out.append(out[-1])
            else:
                out.append(c)
        return u''.join(out)

    inherited = [u'\U000f03e2', u'\U000f03e6']
    it = rpred(rnn, im, bounds, bidi_reordering=False)
    preds = []
    logger.debug(u'Running detection')
    for pred, bbox in zip(it, bounds['boxes']):
        # substitute inherited scripts with neighboring runs
        logger.debug(u'Substituting scripts')
        p = _subs(inherited, pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(reversed(_subs(inherited, reversed(p))))
        # group by valid scripts. two steps: 1. substitute common confusions
        # (Latin->Fraktur and Syriac->Arabic) if given in script list.
        if 'Arab' in valid_scripts and 'Syrc' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f0087', u'\U000f00a0')
        if 'Latn' in valid_scripts and 'Latf' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f00d9', u'\U000f00d7')
        # next merge adjacent scripts
        if val_scripts:
            pred.prediction = _subs(val_scripts, pred.prediction, r=True)

        # group by grapheme
        t = []
        logger.debug(u'Merging detections')
        # if line contains only a single script return whole line bounding box
        if len(set(pred.prediction)) == 1:
            logger.debug('Only one script on line. Emitting whole line bbox')
            k = ord(pred.prediction[0]) - 0xF0000
            t.append((n2s[str(k)], bbox))
        else:
            for k, g in groupby(pred, key=lambda x: x[0]):
                # convert to ISO15924 numerical identifier
                k = ord(k) - 0xF0000
                b = max_bbox(x[1] for x in g)
                t.append((n2s[str(k)], b))
        preds.append(t)
    return {
        'boxes': preds,
        'text_direction': bounds['text_direction'],
        'script_detection': True
    }
def detect_scripts(im, bounds, model=pkg_resources.resource_filename(__name__, 'script.mlmodel'), valid_scripts=None):
    """
    Detects scripts in a segmented page.

    NOTE: this function is currently disabled. Its first statement
    unconditionally raises NotImplementedError, so none of the processing
    described below is executed. The implementation is kept underneath the
    raise so it can be re-enabled by removing that statement.

    Classifies lines returned by the page segmenter into runs of
    scripts/writing systems.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        bounds (dict): A dictionary containing a 'boxes' entry with a list of
                       coordinates (x0, y0, x1, y1) of a text line in the
                       image and an entry 'text_direction' containing
                       'horizontal-lr/rl/vertical-lr/rl'.
        model (str): Location of the script classification model. Defaults
                     to the packaged 'script.mlmodel' resource; the default
                     path is resolved once, when this definition is
                     evaluated. None is not translated to the default.
        valid_scripts (list): List of valid scripts (ISO15924 identifiers
                              as found in the values of 'iso15924.json').

    Returns:
        {'script_detection': True, 'text_direction': '$dir', 'boxes':
        [[(script, (x1, y1, x2, y2)),...]]}: A dictionary containing the text
        direction and a list of lists of reading order sorted bounding boxes
        under the key 'boxes' with each list containing the script
        segmentation of a single line. Script is a ISO15924 4 character
        identifier. (Only once the function is re-enabled.)

    Raises:
        NotImplementedError: always, for as long as the function is
                             disabled.
    """
    raise NotImplementedError('Temporarily unavailable. Please open a github ticket if you want this fixed sooner.')

    # ------------------------------------------------------------------
    # Unreachable while the raise above is in place. Retained as the
    # implementation to restore.
    # ------------------------------------------------------------------
    im_str = get_im_str(im)
    logger.info(u'Detecting scripts with {} in {} lines on {}'.format(model, len(bounds['boxes']), im_str))
    logger.debug(u'Loading detection model {}'.format(model))
    rnn = models.load_any(model)
    # load numerical to 4 char identifier map
    logger.debug(u'Loading label to identifier map')
    with pkg_resources.resource_stream(__name__, 'iso15924.json') as fp:
        n2s = json.load(fp)
    # convert allowed scripts to labels
    val_scripts = []
    if valid_scripts:
        logger.debug(u'Converting allowed scripts list {}'.format(valid_scripts))
        val_scripts = [chr(int(k) + 0xF0000) for k, v in n2s.items() if v in valid_scripts]
    else:
        # normalise None so the membership tests below work
        valid_scripts = []

    def _subs(m, s, r=False):
        # Replace labels by the label emitted just before them. With
        # r=False the labels *in* m are replaced, with r=True the labels
        # *not in* m are. A label without predecessor is always kept.
        out = []
        for c in s:
            if out and (c in m) != r:
                out.append(out[-1])
            else:
                out.append(c)
        return u''.join(out)

    inherited = [u'\U000f03e2', u'\U000f03e6']
    it = rpred(rnn, im, bounds, bidi_reordering=False)
    preds = []
    logger.debug(u'Running detection')
    for pred, bbox in zip(it, bounds['boxes']):
        # substitute inherited scripts with neighboring runs
        logger.debug(u'Substituting scripts')
        p = _subs(inherited, pred.prediction)
        # do a reverse run to fix leading inherited scripts
        pred.prediction = ''.join(reversed(_subs(inherited, reversed(p))))
        # group by valid scripts. two steps: 1. substitute common confusions
        # (Latin->Fraktur and Syriac->Arabic) if given in script list.
        if 'Arab' in valid_scripts and 'Syrc' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f0087', u'\U000f00a0')
        if 'Latn' in valid_scripts and 'Latf' not in valid_scripts:
            pred.prediction = pred.prediction.replace(u'\U000f00d9', u'\U000f00d7')
        # next merge adjacent scripts
        if val_scripts:
            pred.prediction = _subs(val_scripts, pred.prediction, r=True)

        # group by grapheme
        t = []
        logger.debug(u'Merging detections')
        # if line contains only a single script return whole line bounding box
        if len(set(pred.prediction)) == 1:
            logger.debug('Only one script on line. Emitting whole line bbox')
            k = ord(pred.prediction[0]) - 0xF0000
            t.append((n2s[str(k)], bbox))
        else:
            for k, g in groupby(pred, key=lambda x: x[0]):
                # convert to ISO15924 numerical identifier
                k = ord(k) - 0xF0000
                b = max_bbox(x[1] for x in g)
                t.append((n2s[str(k)], b))
        preds.append(t)
    return {'boxes': preds, 'text_direction': bounds['text_direction'], 'script_detection': True}