def pagexmllineseg(xmlfile, text_direction='horizontal-lr', outputfile=None):
    """Segment the text regions of a PAGE XML file into text lines.

    Opens ``xmlfile``, upgrades the point notation of older PageXML
    versions, runs line segmentation on every TextRegion and writes the
    resulting XML — with the schema URL rewritten to the 2017-07-15
    version — to ``outputfile``.

    Args:
        xmlfile (str): Path to the input PAGE XML file.
        text_direction (str): Principal text direction handed to the
            segmenter (e.g. 'horizontal-lr').
        outputfile (str): Path to write the result to; output is written
            to the input file if this is None.
    """
    if not outputfile:
        outputfile = xmlfile
    root = etree.parse(xmlfile).getroot()
    ns = {"ns": root.nsmap[None]}

    # convert point notation from older pagexml versions:
    # <Point x=".." y=".."/> children are folded into a 'points' attribute
    for c in root.xpath("//ns:Coords[not(@points)]", namespaces=ns):
        cc = []
        for point in c.xpath("./ns:Point", namespaces=ns):
            cx = point.attrib["x"]
            cy = point.attrib["y"]
            c.remove(point)
            cc.append(cx + "," + cy)
        c.attrib["points"] = " ".join(cc)

    # collect region type and polygon coordinates keyed by region id
    coordmap = {}
    for r in root.xpath('//ns:TextRegion', namespaces=ns):
        rid = r.attrib["id"]
        coordmap[rid] = {"type": r.attrib["type"]}
        coordmap[rid]["coords"] = []
        # also match un-namespaced Coords for robustness
        for c in r.xpath("./ns:Coords", namespaces=ns) + r.xpath("./Coords"):
            coordstrings = [x.split(",") for x in c.attrib["points"].split()]
            coordmap[rid]["coords"] += [[int(x[0]), int(x[1])]
                                        for x in coordstrings]

    filename = root.xpath('//ns:Page', namespaces=ns)[0].attrib["imageFilename"]
    im = Image.open(filename)
    for c in sorted(coordmap):
        coords = coordmap[c]['coords']
        cropped = cutout(im, coords)
        # top-left corner of the region, used to translate line-local
        # polygon points back into page coordinates
        offset = (min(x[0] for x in coords), min(x[1] for x in coords))
        if cropped is not None:  # fix: compare to None with 'is not'
            if not binarization.is_bitonal(cropped):
                cropped = binarization.nlbin(cropped)
            lines = segment(cropped, text_direction=text_direction,
                            maxcolseps=0)['lines']
        else:
            lines = []
        for n, l in enumerate(lines):
            # polygon points are (row, col); swap to (x, y) and add offset
            coords = ((x[1] + offset[0], x[0] + offset[1]) for x in l.polygon)
            coordstrg = " ".join(str(x[0]) + "," + str(x[1]) for x in coords)
            textregion = root.xpath('//ns:TextRegion[@id="' + c + '"]',
                                    namespaces=ns)[0]
            linexml = etree.SubElement(
                textregion, "TextLine",
                attrib={"id": c + "_l{:03d}".format(n + 1)})
            etree.SubElement(linexml, "Coords",
                             attrib={"points": coordstrg})
    xmlstring = etree.tounicode(root.getroottree()).replace(
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19",
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15")
    with open(outputfile, "w") as f:
        f.write(xmlstring)
def transcription(ctx, text_direction, scale, maxcolseps, black_colseps,
                  font, font_style, prefill, output, images):
    """Generate a transcription interface document from page images.

    Binarizes (if needed) and segments each input image, optionally
    prefills the transcription with predictions from an RNN model, and
    writes the assembled transcription interface to ``output``.

    Progress is reported either as timestamped messages (verbose mode)
    or via a terminal spinner with a green check mark on completion.

    Args:
        ctx: click context; ``ctx.meta['verbose']`` selects echo vs spinner.
        text_direction, scale, maxcolseps, black_colseps: forwarded to
            ``pageseg.segment``.
        font, font_style: forwarded to the TranscriptionInterface.
        prefill: optional path of a recognition model used to prefill.
        output: writable file object for the result.
        images: iterable of open image file objects.
    """
    st_time = time.time()
    ti = transcribe.TranscriptionInterface(font, font_style)
    if prefill:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Loading model {}'.format(time.time() - st_time, prefill))
        else:
            spin('Loading RNN')
        prefill = models.load_any(prefill.encode('utf-8'))
        # '\b\u2713' overwrites the spinner glyph with a check mark;
        # '\033[?25h' re-enables the terminal cursor
        if not ctx.meta['verbose']:
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
    for fp in images:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Reading {}'.format(time.time() - st_time, fp.name))
        else:
            spin('Reading images')
        im = Image.open(fp)
        # binarize only when the source image isn't already bi-level
        if not binarization.is_bitonal(im):
            if ctx.meta['verbose'] > 0:
                click.echo(u'[{:2.4f}] Binarizing page'.format(time.time() - st_time))
            im = binarization.nlbin(im)
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Segmenting page'.format(time.time() - st_time))
        res = pageseg.segment(im, text_direction, scale, maxcolseps, black_colseps)
        if prefill:
            # run recognition over the segmented lines to prefill the UI
            it = rpred.rpred(prefill, im, res)
            preds = []
            for pred in it:
                if ctx.meta['verbose'] > 0:
                    click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, pred.prediction))
                else:
                    spin('Recognizing')
                preds.append(pred)
            if ctx.meta['verbose'] > 0:
                click.echo(u'Execution time: {}s'.format(time.time() - st_time))
            else:
                click.secho(u'\b\u2713', fg='green', nl=False)
                click.echo('\033[?25h\n', nl=False)
            ti.add_page(im, res, records=preds)
        else:
            ti.add_page(im, res)
        fp.close()
        if not ctx.meta['verbose']:
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
    if ctx.meta['verbose'] > 0:
        click.echo(u'[{:2.4f}] Writing transcription to {}'.format(time.time() - st_time, output.name))
    else:
        spin('Writing output')
    ti.write(output)
    if not ctx.meta['verbose']:
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
def transcription(ctx, font, font_style, prefill, output, images):
    """Generate a transcription interface document from page images.

    Binarizes (if needed) and segments each input image, optionally
    prefills the transcription with predictions from an RNN model, and
    writes the assembled transcription interface to ``output``.

    Args:
        ctx: click context; ``ctx.meta['verbose']`` selects echo vs spinner.
        font, font_style: forwarded to the TranscriptionInterface.
        prefill: optional recognition model spec used to prefill.
        output: writable file object for the result.
        images: iterable of open image file objects.
    """
    st_time = time.time()
    # NOTE(review): other transcription() variants in this file use the
    # 'transcribe' module — confirm 'transcrib' is the intended import here.
    ti = transcrib.TranscriptionInterface(font, font_style)
    if prefill:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Loading model {}'.format(time.time() - st_time, prefill))
        else:
            spin('Loading RNN')
        prefill = models.load_any(prefill)
        # '\b\u2713' overwrites the spinner with a check mark;
        # '\033[?25h' re-enables the terminal cursor
        if not ctx.meta['verbose']:
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
    for fp in images:
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Reading {}'.format(time.time() - st_time, fp.name))
        else:
            spin('Reading images')
        im = Image.open(fp)
        if not binarization.is_bitonal(im):
            if ctx.meta['verbose'] > 0:
                click.echo(u'[{:2.4f}] Binarizing page'.format(time.time() - st_time))
            im = binarization.nlbin(im)
        if ctx.meta['verbose'] > 0:
            click.echo(u'[{:2.4f}] Segmenting page'.format(time.time() - st_time))
        res = pageseg.segment(im)
        if prefill:
            it = rpred.rpred(prefill, im, res)
            preds = []
            for pred in it:
                if ctx.meta['verbose'] > 0:
                    click.echo(u'[{:2.4f}] {}'.format(time.time() - st_time, pred.prediction))
                else:
                    spin('Recognizing')
                preds.append(pred)
            if ctx.meta['verbose'] > 0:
                click.echo(u'Execution time: {}s'.format(time.time() - st_time))
            else:
                click.secho(u'\b\u2713', fg='green', nl=False)
                click.echo('\033[?25h\n', nl=False)
            # fix: pass the segmentation alongside the records — the
            # original dropped 'res' here, unlike the non-prefill branch
            # and the other transcription() variants in this file
            ti.add_page(im, res, records=preds)
        else:
            ti.add_page(im, res)
        if not ctx.meta['verbose']:
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
    if ctx.meta['verbose'] > 0:
        click.echo(u'[{:2.4f}] Writing transcription to {}'.format(time.time() - st_time, output.name))
    else:
        spin('Writing output')
    ti.write(output)
    if not ctx.meta['verbose']:
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
def segment(im, scale=None, black_colseps=False):
    """Segment a page into text lines.

    Splits a bi-level page image into text lines and returns the absolute
    bounding box of each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        scale (float): Scale of the image
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        [(x1, y1, x2, y2),...]: A list of tuples containing the bounding
        boxes of the segmented lines in reading order.

    Raises:
        KrakenInputException if the input image is not binarized
    """
    if im.mode != '1' and not is_bitonal(im):
        raise KrakenInputException('Image is not bi-level')

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    arr = pil2array(im)
    threshold = 0.5 * (np.amin(arr) + np.amax(arr))
    binary = 1 - np.array(arr > threshold, 'i')
    if not scale:
        scale = estimate_scale(binary)
    binary = remove_hlines(binary, scale)
    # column separators: either detect black separator lines directly or
    # compute whitespace separators
    if black_colseps:
        colseps, binary = compute_black_colseps(binary, scale)
    else:
        colseps = compute_white_colseps(binary, scale)
    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    detected = compute_lines(llabels * binary, scale)
    # sort line bounding boxes into reading order
    ordering = reading_order([line.bounds for line in detected])
    sorted_bounds = [detected[idx].bounds for idx in topsort(ordering)]
    return [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in sorted_bounds]
def segment(im, scale=None, black_colseps=False):
    """Split a binarized page image into text lines.

    Returns the absolute coordinates of every detected line, sorted into
    reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        scale (float): Scale of the image
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        [(x1, y1, x2, y2),...]: Bounding boxes of the segmented lines in
        reading order.

    Raises:
        KrakenInputException if the input image is not binarized
    """
    if im.mode != '1' and not is_bitonal(im):
        raise KrakenInputException('Image is not bi-level')

    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    page = pil2array(im)
    binary = np.array(page > 0.5 * (np.amin(page) + np.amax(page)), 'i')
    binary = 1 - binary
    scale = scale or estimate_scale(binary)
    binary = remove_hlines(binary, scale)
    if black_colseps:
        colseps, binary = compute_black_colseps(binary, scale)
    else:
        colseps = compute_white_colseps(binary, scale)
    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    # propagate seed labels over the box map, then fill uncovered pixels
    # from the spread of the seeds
    labels = morph.propagate_labels(boxmap, seeds, conflict=0)
    labels = np.where(labels > 0, labels,
                      morph.spread_labels(seeds, maxdist=scale) * binary)
    found = compute_lines(labels * binary, scale)
    order = reading_order([f.bounds for f in found])
    bounds = [found[i].bounds for i in topsort(order)]
    return [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in bounds]
def transcription(ctx, text_direction, scale, maxcolseps, black_colseps,
                  font, font_style, prefill, output, images, segment_page):
    """Generate a transcription interface document from page images.

    Binarizes (if needed) and optionally segments each input image,
    optionally prefills the transcription with predictions from an RNN
    model, and writes the assembled transcription interface to ``output``.

    Args:
        ctx: click context (unused here; logging goes through ``logger``).
        text_direction, scale, maxcolseps, black_colseps: forwarded to
            ``pageseg.segment`` when ``segment_page`` is set.
        font, font_style: forwarded to the TranscriptionInterface.
        prefill: optional path of a recognition model used to prefill.
        output: writable file object for the result.
        images: iterable of open image file objects.
        segment_page (bool): segment each page; otherwise the whole page
            is treated as a single line-less box.
    """
    ti = transcribe.TranscriptionInterface(font, font_style)
    if prefill:
        logger.info('Loading model {}'.format(prefill))
        spin('Loading RNN')
        prefill = models.load_any(prefill.encode('utf-8'))
        # '\b\u2713' overwrites the spinner with a check mark;
        # '\033[?25h' re-enables the terminal cursor
        message(u'\b\u2713', fg='green', nl=False)
        message('\033[?25h\n', nl=False)
    for fp in images:
        logger.info('Reading {}'.format(fp.name))
        spin('Reading images')
        im = Image.open(fp)
        # binarize only when the source image isn't already bi-level
        if not binarization.is_bitonal(im):
            logger.info(u'Binarizing page')
            im = binarization.nlbin(im)
        if segment_page:
            logger.info(u'Segmenting page')
            res = pageseg.segment(im, text_direction, scale, maxcolseps,
                                  black_colseps)
        else:
            # no segmentation requested: a single box covering the page
            res = {'text_direction': 'horizontal-tb',
                   'boxes': [(0, 0) + im.size]}
        if prefill:
            # run recognition over the segmentation to prefill the UI
            it = rpred.rpred(prefill, im, res)
            preds = []
            for pred in it:
                logger.info('{}'.format(pred.prediction))
                spin('Recognizing')
                preds.append(pred)
            message(u'\b\u2713', fg='green', nl=False)
            message('\033[?25h\n', nl=False)
            ti.add_page(im, res, records=preds)
        else:
            ti.add_page(im, res)
        fp.close()
        message(u'\b\u2713', fg='green', nl=False)
        message('\033[?25h\n', nl=False)
    logger.info(u'Writing transcription to {}'.format(output.name))
    spin('Writing output')
    ti.write(output)
    message(u'\b\u2713', fg='green', nl=False)
    message('\033[?25h\n', nl=False)
def segment(im, text_direction='horizontal-lr', scale=None, maxcolseps=2,
            black_colseps=False):
    """
    Segments a page into text lines.

    Segments a page into text lines and returns the absolute coordinates of
    each line in reading order.

    Args:
        im (PIL.Image): A bi-level page of mode '1' or 'L'
        text_direction (str): Principal direction of the text
                              (horizontal-lr/rl/vertical-lr/rl)
        scale (float): Scale of the image
        maxcolseps (int): Maximum number of whitespace column separators
        black_colseps (bool): Whether column separators are assumed to be
                              vertical black lines or not

    Returns:
        {'text_direction': '$dir', 'boxes': [(x1, y1, x2, y2),...]}: A
        dictionary containing the text direction and a list of reading order
        sorted bounding boxes under the key 'boxes'.

    Raises:
        KrakenInputException if the input image is not binarized or the text
        direction is invalid.
    """
    if im.mode != '1' and not is_bitonal(im):
        raise KrakenInputException('Image is not bi-level')
    # rotate input image for vertical lines so the rest of the pipeline can
    # operate on horizontal text; 'offset' records the rotation origin so
    # the boxes can be mapped back at the end
    if text_direction.startswith('horizontal'):
        angle = 0
        offset = (0, 0)
    elif text_direction == 'vertical-lr':
        angle = 270
        offset = (0, im.size[1])
    elif text_direction == 'vertical-rl':
        angle = 90
        offset = (im.size[0], 0)
    else:
        raise KrakenInputException('Invalid text direction')
    im = im.rotate(angle, expand=True)
    # honestly I've got no idea what's going on here. In theory a simple
    # np.array(im, 'i') should suffice here but for some reason the
    # tostring/fromstring magic in pil2array alters the array in a way that is
    # needed for the algorithm to work correctly.
    a = pil2array(im)
    binary = np.array(a > 0.5 * (np.amin(a) + np.amax(a)), 'i')
    binary = 1 - binary
    if not scale:
        scale = estimate_scale(binary)
    binary = remove_hlines(binary, scale)
    # emptyish images will cause exceptions here.
    try:
        if black_colseps:
            colseps, binary = compute_black_colseps(binary, scale, maxcolseps)
        else:
            colseps = compute_white_colseps(binary, scale, maxcolseps)
    except ValueError:
        # nothing segmentable on the page: return an empty result
        return {'text_direction': text_direction, 'boxes': []}
    bottom, top, boxmap = compute_gradmaps(binary, scale)
    seeds = compute_line_seeds(binary, bottom, top, colseps, scale)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = morph.spread_labels(seeds, maxdist=scale)
    llabels = np.where(llabels > 0, llabels, spread * binary)
    segmentation = llabels * binary
    lines = compute_lines(segmentation, scale)
    # last two chars of text_direction ('lr'/'rl'/...) select the reading
    # order direction
    order = reading_order([l.bounds for l in lines], text_direction[-2:])
    lsort = topsort(order)
    lines = [lines[i].bounds for i in lsort]
    lines = [(s2.start, s1.start, s2.stop, s1.stop) for s1, s2 in lines]
    # rotate boxes back into the original (unrotated) page coordinates
    return {
        'text_direction': text_direction,
        'boxes': rotate_lines(lines, 360 - angle, offset).tolist(),
        'script_detection': False
    }
def pagexmllineseg(xmlfile, imgpath, text_direction='horizontal-lr',
                   scale=None):
    """Segment the TextRegions of a PAGE XML file into TextLines.

    Opens ``xmlfile``, upgrades the point notation of older PageXML
    versions, segments every TextRegion and returns the updated document
    — with the schema URL rewritten to the 2017-07-15 version — together
    with the number of TextLine elements it contains.

    Args:
        xmlfile (str): Path to the input PAGE XML file.
        imgpath (str): Directory containing the page image referenced by
            the XML's ``imageFilename`` attribute.
        text_direction (str): Principal text direction for segmentation.
        scale (float or dict): Segmentation scale; a dict maps region
            types to scales, with 'other' acting as a fallback.

    Returns:
        tuple: ``(xmlstring, no_lines_segm)`` — the serialized XML and
        the TextLine count.
    """
    root = etree.parse(xmlfile).getroot()
    ns = {"ns": root.nsmap[None]}
    # convert point notation from older pagexml versions:
    # <Point x=".." y=".."/> children are folded into a 'points' attribute
    for c in root.xpath("//ns:Coords[not(@points)]", namespaces=ns):
        cc = []
        for point in c.xpath("./ns:Point", namespaces=ns):
            cx = point.attrib["x"]
            cy = point.attrib["y"]
            c.remove(point)
            cc.append(cx + "," + cy)
        c.attrib["points"] = " ".join(cc)

    # collect region type, raw coordinate string and parsed polygon
    # coordinates keyed by region id
    coordmap = {}
    for r in root.xpath('//ns:TextRegion', namespaces=ns):
        rid = r.attrib["id"]
        coordmap[rid] = {"type": r.attrib["type"]}
        coordmap[rid]["coords"] = []
        # also match un-namespaced Coords for robustness
        for c in r.xpath("./ns:Coords", namespaces=ns) + r.xpath("./Coords"):
            coordmap[rid]["coordstring"] = c.attrib["points"]
            coordstrings = [x.split(",") for x in c.attrib["points"].split()]
            coordmap[rid]["coords"] += [[int(x[0]), int(x[1])]
                                        for x in coordstrings]

    filename = root.xpath('//ns:Page', namespaces=ns)[0]\
        .attrib["imageFilename"]
    filename = imgpath + "/" + filename
    im = Image.open(filename)
    for c in sorted(coordmap):
        # resolve the per-region scale: a dict maps region types to
        # scales, 'other' is the fallback entry
        if isinstance(scale, dict):  # fix: isinstance over type() ==
            if coordmap[c]['type'] in scale:
                rscale = scale[coordmap[c]['type']]
            elif "other" in scale:
                rscale = scale["other"]
            else:
                rscale = None
        else:
            rscale = scale
        coords = coordmap[c]['coords']
        # degenerate polygons (fewer than 3 points) cannot be cut out
        if len(coords) < 3:
            continue
        cropped = cutout(im, coords)
        # top-left corner of the region, used to translate line-local
        # polygon points back into page coordinates
        offset = (min(x[0] for x in coords), min(x[1] for x in coords))
        if cropped is not None:
            if not binarization.is_bitonal(cropped):
                try:
                    cropped = binarization.nlbin(cropped)
                except SystemError:
                    # binarization failed for this region: skip it
                    continue
            if coordmap[c]["type"] == "drop-capital":
                # treat the whole region as a single line (sentinel value;
                # the region's own coordstring is emitted below)
                lines = [1]
            else:
                lines = segment(cropped, text_direction=text_direction,
                                scale=rscale, maxcolseps=-1)
                lines = lines["lines"] if "lines" in lines else []
        else:
            lines = []
        for n, l in enumerate(lines):
            if coordmap[c]["type"] == "drop-capital":
                coordstrg = coordmap[c]["coordstring"]
            else:
                # polygon points are (row, col); swap to (x, y) and offset
                linecoords = ((x[1] + offset[0], x[0] + offset[1])
                              for x in l.polygon)
                coordstrg = " ".join(str(x[0]) + "," + str(x[1])
                                     for x in linecoords)
            textregion = root.xpath('//ns:TextRegion[@id="' + c + '"]',
                                    namespaces=ns)[0]
            linexml = etree.SubElement(
                textregion, "TextLine",
                attrib={"id": "{}_l{:03d}".format(c, n + 1)})
            etree.SubElement(linexml, "Coords",
                             attrib={"points": coordstrg})
    xmlstring = etree.tounicode(root.getroottree()).replace(
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19",
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15")
    no_lines_segm = int(root.xpath("count(//TextLine)"))
    return xmlstring, no_lines_segm