"> found pages separator line at %f (image space position) / %f (page space position)" % (sep_line_img_x, sep_line_page_x)) # split the scanned double page at the separator line split_images = iproc_obj.split_image(sep_line_img_x) # split the textboxes at the separator line split_texts = split_page_texts(p, sep_line_page_x) split_texts_and_images.append((p, split_texts, split_images)) # generate a new XML and "pages" dict structure from the split pages split_pages_xmlfile = os.path.join( OUTPUTPATH, INPUT_XML[:INPUT_XML.rindex('.')] + '.split.xml') print("> saving split pages XML to '%s'" % split_pages_xmlfile) split_tree, split_root, split_pages = create_split_pages_dict_structure( split_texts_and_images, save_to_output_path=split_pages_xmlfile) # we don't need the original double pages any more, we'll work with 'split_pages' del pages #%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages hori_lines_clusters = {} pages_image_scaling = { } # scaling of the scanned page image in relation to the OCR page dimensions for each page for p_num, p in split_pages.items(): # get the image file of the scanned page imgfilebasename = p['image'][:p['image'].rindex('.')] imgfile = os.path.join(OUTPUTPATH, p['image'])
# NOTE(review): the statements up to the `append` call use the per-page names
# `p`, `iproc_obj`, `sep_line_img_x` and `page_scaling_x`, so they presumably
# form the tail of a loop over the scanned double pages whose header lies
# before this section -- confirm and indent them to that loop's body level.

# position of the separator line in page space: the image space position
# divided by the horizontal scaling factor
sep_line_page_x = sep_line_img_x / page_scaling_x
print("> found pages separator line at %f (image space position) / %f (page space position)"
      % (sep_line_img_x, sep_line_page_x))

# split the scanned double page at the separator line
split_images = iproc_obj.split_image(sep_line_img_x)

# split the textboxes at the separator line
split_texts = split_page_texts(p, sep_line_page_x)

# collect the original page together with its split texts and split images
split_texts_and_images.append((p, split_texts, split_images))

# generate a new XML and "pages" dict structure from the split pages
# (os.path.splitext strips the extension and, unlike slicing at
# str.rindex('.'), does not raise ValueError for a name without a dot)
split_pages_xmlfile = os.path.join(OUTPUTPATH, os.path.splitext(INPUT_XML)[0] + '.split.xml')
print("> saving split pages XML to '%s'" % split_pages_xmlfile)
split_tree, split_root, split_pages = create_split_pages_dict_structure(
    split_texts_and_images, save_to_output_path=split_pages_xmlfile)

# we don't need the original double pages any more, we'll work with 'split_pages'
del pages

#%% Detect clusters of horizontal lines using the image processing module and rotate back or deskew pages

hori_lines_clusters = {}
pages_image_scaling = {}  # scaling of the scanned page image in relation to the OCR page dimensions for each page

for p_num, p in split_pages.items():
    # get the image file of the scanned page
    imgfilebasename = os.path.splitext(p['image'])[0]  # image file name without its extension
    imgfile = os.path.join(OUTPUTPATH, p['image'])

    print("page %d: detecting lines in image file '%s'..." % (p_num, imgfile))