def bidi_record(record):
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record): Record to reorder. Its
            ``prediction`` attribute is the text fed to the algorithm, and
            iterating the record must yield one item per code point whose
            elements 1 and 2 are carried over as cut and confidence.

    Returns:
        kraken.rpred.ocr_record: A new record holding the code points in
        display order (mirrored where the algorithm decided so) together
        with the cuts and confidences in that same order.
    """
    storage = bd.get_empty_storage()
    base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)

    # Tag every character entry with its source element so that cut and
    # confidence travel with the character through the reordering below.
    # NOTE(review): this assumes storage['chars'] still holds one entry per
    # element of `record` at this point -- confirm for predictions containing
    # explicit embedding/override control characters.
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j

    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    chars = storage['chars']
    # Read the code point from 'ch' rather than from the attached record:
    # the record keeps the original character, so using it would discard
    # any mirroring applied to 'ch' by apply_mirroring above.
    prediction = u''.join(ch['ch'] for ch in chars)
    cuts = [ch['record'][1] for ch in chars]
    confidences = [ch['record'][2] for ch in chars]
    return ocr_record(prediction, cuts, confidences)
def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Run the BiDi resolution and reordering stages on `text` and return the
    populated storage dict.

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with
    the algorithm.

    Returns the storage dict. Every entry of ``storage["chars"]`` carries an
    extra ``"index"`` key holding the position of that character in `text`,
    assigned before the later stages run.
    """
    storage = get_empty_storage()

    # An explicit direction wins over the level detected from the text.
    base_level = (
        get_base_level(text, upper_is_rtl)
        if base_dir is None
        else PARAGRAPH_LEVELS[base_dir]
    )
    storage['base_level'] = base_level
    storage['base_dir'] = 'LR'[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)

    # Record the logical position of each character while the entries still
    # line up one-to-one with `text`.
    char_infos = storage["chars"]
    assert len(text) == len(char_infos)
    for position, (expected, info) in enumerate(zip(text, char_infos)):
        assert expected == info["ch"]
        info["index"] = position

    # Remaining stages, applied in order to the same storage.
    stages = (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
    )
    for stage in stages:
        stage(storage, debug)

    return storage
def get_display_mod(unicode_or_str, encoding='utf-8', upper_is_rtl=False,
                    base_dir=None, debug=False):
    """Accepts unicode or string. In case it's a string, `encoding`
    is needed as it works on unicode ones (default:"utf-8").

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with
    the algorithm.

    Returns the value produced by `print_storage_chars` for the processed
    storage, either as unicode or `encoding` encoded string (matching the
    kind of input that was passed in).
    """
    storage = bidi.get_empty_storage()

    # The algorithm works on unicode text; remember whether the input had to
    # be decoded so the result can be encoded back the same way.
    decoded = not isinstance(unicode_or_str, six.text_type)
    text = unicode_or_str.decode(encoding) if decoded else unicode_or_str

    # An explicit direction wins over the level detected from the text.
    base_level = (
        bidi.get_base_level(text, upper_is_rtl)
        if base_dir is None
        else bidi.PARAGRAPH_LEVELS[base_dir]
    )
    storage['base_level'] = base_level
    storage['base_dir'] = 'LR'[base_level]

    bidi.get_embedding_levels(text, storage, upper_is_rtl, debug)

    # Unlike the function this was adapted from, the mirroring step
    # (bidi.apply_mirroring) is intentionally not part of the pipeline, and
    # the display value comes from print_storage_chars instead of joining
    # the 'ch' entries directly.
    stages = (
        bidi.explicit_embed_and_overrides,
        bidi.resolve_weak_types,
        bidi.resolve_neutral_types,
        bidi.resolve_implicit_levels,
        bidi.reorder_resolved_levels,
    )
    for stage in stages:
        stage(storage, debug)

    display = print_storage_chars(storage)

    return display.encode(encoding) if decoded else display
def bidi_record(record: ocr_record, base_dir=None) -> ocr_record:
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record): Record to reorder.
        base_dir: 'L' or 'R' to force the base direction. Any other value
            (including the default None) lets the base level be determined
            from ``record.prediction``.

    Returns:
        kraken.rpred.ocr_record: A new record with reordered prediction,
        cuts and confidences. Line information and ``tags`` are carried over
        from `record`, and ``base_dir`` is set to the argument as passed.
    """
    storage = bd.get_empty_storage()

    if base_dir in ('L', 'R'):
        base_level = {'L': 0, 'R': 1}[base_dir]
    else:
        base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = 'LR'[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)

    # Tag each character entry with its source element so cut and confidence
    # follow the character through the reordering.
    for position, element in enumerate(record):
        storage['chars'][position]['record'] = element

    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)

    reordered = storage['chars']
    # Use 'ch' for the text: the code point may have been mirrored.
    prediction = ''.join(entry['ch'] for entry in reordered)
    cuts = [entry['record'][1] for entry in reordered]
    confidences = [entry['record'][2] for entry in reordered]

    # Carry over whole line information.
    if record.type == 'baselines':
        line = {'boundary': record.line, 'baseline': record.baseline}
    else:
        line = record.line

    result = ocr_record(prediction, cuts, confidences, line)
    result.tags = record.tags
    result.base_dir = base_dir
    return result