def create_text_file_from_page(page: "Page", path_to_save_file=None):
    """Write the text content of `page` to a plain-text file.

    The non-empty text lines of each article are written one per line;
    consecutive articles are separated by a line of 100 '#' characters.

    :param page: Page object (only its ``get_article_dict()`` is used).
    :param path_to_save_file: destination path of the text file.
    :raises ValueError: if `path_to_save_file` is None.
    """
    if path_to_save_file is None:
        # BUG FIX: previously open(None) raised an opaque TypeError; fail with a clear message.
        raise ValueError("path_to_save_file must be a valid file path, got None.")
    article_dict = page.get_article_dict()
    # Explicit encoding: the old code used the platform default, which breaks
    # non-ASCII OCR text on platforms where that default is not UTF-8.
    with open(path_to_save_file, 'w', encoding='utf-8') as f:
        for i, textlines in enumerate(article_dict.values()):
            for tl in textlines:
                if tl.text:  # skip lines with empty/None text
                    f.write(tl.text + "\n")
            # article separator, but not after the last article
            if i != len(article_dict) - 1:
                f.write('\n' + '#' * 100 + '\n\n')
def create_text_files_from_page_list(page_list, path_to_save_folder=None):
    """Dump the text content of every PageXml file in `page_list` to a .txt file.

    :param page_list: list of PageXml file paths.
    :param path_to_save_folder: folder for the output files; if falsy, each
        .txt file is written next to its PageXml file (path + ".txt").
    """
    for page_path in page_list:
        base_name = os.path.basename(page_path)
        if path_to_save_folder:
            target_path = os.path.join(path_to_save_folder, base_name + '.txt')
        else:
            target_path = page_path + ".txt"
        create_text_file_from_page(Page(page_path), target_path)
def save_results_in_pagexml(path_to_pagexml, text_region_txtline_dict):
    """Write the given text regions back into the PageXml file (in place).

    :param path_to_pagexml: file path
    :param text_region_txtline_dict: dictionary {text region id: (list of boundary_points,
        list of corresponding text lines, reading order of the region)}
    """
    page_file = Page(path_to_pagexml)

    regions = []
    for region_id, region_data in text_region_txtline_dict.items():
        boundary = region_data[0]
        textlines = region_data[1]
        reading_order = region_data[2]

        # assign a reading order to the text lines of this region
        txtlines_set_reading_order(lst_of_txtlines=textlines)

        # build the text region carrying its own reading-order index
        regions.append(TextRegion(_id=region_id,
                                  region_type="paragraph",
                                  custom={"readingOrder": {"index": reading_order}},
                                  points=boundary,
                                  text_lines=textlines))

    page_file.set_text_regions(text_regions=regions, overwrite=True)
    page_file.write_page_xml(path_to_pagexml)
def get_article_rectangles_from_surr_polygons(page, use_max_rect_size=True, max_d=0,
                                              max_rect_size_scale=1 / 50, max_d_scale=1 / 20):
    """Given the PageXml file `page` return the corresponding article subregions as a list of
    ArticleRectangle objects. Also returns the width and height of the image (NOT of the PrintSpace).

    :param page: Either the path to the PageXml file or a Page object.
    :type page: Union[str, Page]
    :param use_max_rect_size: whether to use a max rectangle size for the article rectangles or not
    :type use_max_rect_size: bool
    :param max_d: passed through to ``create_subregions_from_surrounding_polygon``; if 0 it is
        derived from the PrintSpace height via `max_d_scale`.
    :param max_rect_size_scale: fraction of the PrintSpace height used as max rectangle size
        (only relevant if `use_max_rect_size` is True).
    :param max_d_scale: fraction of the PrintSpace height used as fallback for `max_d`.
    :return: the article subregion list, the height and the width of the image
    """
    # Idiom fix: isinstance instead of `type(...) ==` (also accepts Page subclasses).
    if isinstance(page, str):
        page = Page(page)
    assert isinstance(page, Page), f"Type must be Page, got {type(page)} instead."

    ps_coords = page.get_print_space_coords()
    ps_poly = Points(ps_coords).to_polygon()
    # Maybe check if the surrounding Rectangle of the polygon has corners given by ps_poly
    ps_rectangle = ps_poly.get_bounding_box()
    # First ArticleRectangle to consider: the whole PrintSpace with all text lines of the page.
    ps_rectangle = ArticleRectangle(ps_rectangle.x, ps_rectangle.y, ps_rectangle.width, ps_rectangle.height,
                                    page.get_textlines())

    max_rect_size = int(max_rect_size_scale * ps_rectangle.height) if use_max_rect_size else 0
    if not max_d:
        max_d = int(max_d_scale * ps_rectangle.height)
    ars = ps_rectangle.create_subregions_from_surrounding_polygon(max_d=max_d, max_rect_size=max_rect_size)

    img_width, img_height = page.get_image_resolution()
    return ars, img_height, img_width
def get_data_from_pagexml(path_to_pagexml):
    """Collect the baselines of a PageXml file grouped by article id.

    :param path_to_pagexml: file path
    :return: dictionary with the article / block ID's as keys and a list of
        corresponding baselines (given by polygons) as values
    """
    art_polygons_dict = {}
    try:
        # load the page xml file
        page_file = Page(path_to_xml=path_to_pagexml)
        # get all text lines article wise
        art_txtlines_dict = page_file.get_article_dict()
    except Exception:
        # BUG FIX: the original `except ():` matches NO exception (empty tuple),
        # so this intended best-effort fallback was never reached.
        print("!! Can not load the lines of the Page XML {} !!\n".format(
            path_to_pagexml))
        return art_polygons_dict

    for article_id in art_txtlines_dict:
        for txtline in art_txtlines_dict[article_id]:
            try:
                # get the baseline of the text line as polygon
                polygon = txtline.baseline.to_polygon()
            except AttributeError:
                # BUG FIX (same `except ():` issue): txtline.baseline is None here.
                print(
                    "!! 'NoneType' object with id {} has no attribute 'to_polygon' !!\n"
                    .format(txtline.id))
                continue
            # skip baselines with less than two points
            if len(polygon.x_points) == len(polygon.y_points) > 1:
                art_polygons_dict.setdefault(article_id, []).append(polygon)

    return art_polygons_dict
def create_page_objects(self):
    """Parse every path in ``self.page_path_lst`` into a Page object."""
    return list(map(Page, self.page_path_lst))
def plot_pagexml(page, path_to_img, ax=None, plot_article=True, plot_legend=True, fill_regions=False,
                 use_page_image_resolution=False):
    """Plot the baselines, text-line/word polygons and regions of a PageXml file over its image.

    :param page: either the path to the PageXml file or a Page object
    :param path_to_img: path to the page image
    :param ax: matplotlib axes to draw into (forwarded to ``plot_ax``)
    :param plot_article: color the baselines per article id instead of a single default color
    :param plot_legend: whether ``plot_ax`` should draw a legend
    :param fill_regions: whether ``plot_ax`` should fill the region polygons
    :param use_page_image_resolution: use the resolution stored in the PageXml
        instead of the image file's own size
    """
    # Idiom fix: isinstance instead of `type(...) ==` (consistent with the other entry points).
    if isinstance(page, str):
        page = Page(page)
    assert isinstance(page, Page), f"Type must be Page, got {type(page)} instead."

    # get baselines based on the article id
    article_dict = page.get_article_dict()
    if not article_dict:
        bcolors = []
        blines_list = []
    else:
        unique_ids = sorted(set(article_dict.keys()), key=functools.cmp_to_key(compare_article_ids))
        # `None` collects the textlines without an article id; it always maps to the default color
        if None in unique_ids:
            article_colors = dict(
                zip(unique_ids, COLORS[:len(unique_ids) - 1] + [DEFAULT_COLOR]))
        else:
            article_colors = dict(zip(unique_ids, COLORS[:len(unique_ids)]))
        if plot_article:
            # renamed loop variable: the original shadowed the builtin `id`
            bcolors = [article_colors[a_id] for a_id in unique_ids]
        else:
            bcolors = [DEFAULT_COLOR] * len(article_dict)
        blines_list = [[
            textline.baseline.points_list for textline in article_dict[a_id]
            if textline.baseline
        ] for a_id in unique_ids]

    region_dict = page.get_regions()
    if not region_dict:
        rcolors = {}
        region_dict_polygons = {}
    else:
        # fixed color per region type
        rcolors = {
            page_constants.sTEXTREGION: "darkgreen",
            page_constants.sSEPARATORREGION: "darkviolet",
            page_constants.sGRAPHICREGION: "darkcyan",
            page_constants.sIMAGEREGION: "darkblue",
            page_constants.sTABLEREGION: "darkorange",
            page_constants.sADVERTREGION: "yellow",
            page_constants.sLINEDRAWINGREGION: "salmon",
            page_constants.sCHARTREGION: "brown",
            page_constants.sCHEMREGION: "navy",
            page_constants.sMATHSREGION: "crimson",
            page_constants.sNOISEREGION: "darkkhaki",
            page_constants.sMUSICREGION: "firebrick",
            page_constants.sUNKNOWNREGION: "darkorchid"
        }
        region_dict_polygons = {
            region_name:
            [region.points.points_list for region in regions]
            for region_name, regions in region_dict.items()
        }

    # get surrounding polygons of text lines and words
    textlines = page.get_textlines()
    surr_polys = [
        tl.surr_p.points_list for tl in textlines if (tl and tl.surr_p)
    ]
    words = page.get_words()
    word_polys = [
        word.surr_p.points_list for word in words if (word and word.surr_p)
    ]

    if use_page_image_resolution:
        # NOTE(review): get_image_resolution() is unpacked as (width, height) elsewhere in this
        # file -- confirm the (height, width) order used here is intended.
        page_height, page_width = page.get_image_resolution()
    else:
        page_height = page_width = None

    plot_ax(ax, path_to_img, blines_list, surr_polys, bcolors, region_dict_polygons, rcolors, word_polys,
            plot_legend, fill_regions=fill_regions, height=page_height, width=page_width)
def get_article_rectangles_from_baselines(page, image_path, stretch=False, use_surr_polygons=True):
    """Greedily group the text lines of each article of `page` into axis-aligned rectangles.

    Per article, text lines are processed top-to-bottom. A rectangle is seeded with one text
    line and grown by merging in further vertically aligned lines of the same article, as long
    as the grown rectangle neither intersects rectangles of other articles nor swallows
    baselines belonging to other articles.

    :param page: path to a PageXml file or a Page object
    :param image_path: path to the page image (only read when `stretch` is True)
    :param stretch: if True, stretch each finished rectangle until whitespace in the binarized image
    :param use_surr_polygons: seed rectangles from the text line's surrounding polygon bounding box
        (falling back to the baseline bounding box if no surrounding polygon exists)
    :return: defaultdict {article id: list of ArticleRectangle objects}
    """
    if type(page) == str:
        page = Page(page)
    assert type(page) == Page, f"Type must be Page, got {type(page)} instead."
    article_dict = page.get_article_dict()
    article_rectangles_dict = defaultdict(list)
    if stretch:
        # binarization is only needed for the whitespace-stretching post-step
        binarized_image = get_binarization(image_path)
    for article_id, textlines in article_dict.items():
        # ids of text lines already absorbed into some rectangle of this article
        used_textline_ids = []
        sorted_textlines = sort_textlines_by_y(textlines)
        for i, textline in enumerate(sorted_textlines):
            # used_textline_ids = [tl.id for article_rectangle in article_rectangles_dict[article_id] for tl in
            #                      article_rectangle.textlines]
            if textline.id in used_textline_ids:
                continue
            baseline = textline.baseline.points_list
            baseline_polygon = textline.baseline.to_polygon()
            # seed bounding box: surrounding polygon if available/requested, baseline otherwise
            if use_surr_polygons:
                baseline_bounding_box = textline.surr_p.to_polygon().get_bounding_box() if textline.surr_p else baseline_polygon.get_bounding_box()
            else:
                baseline_bounding_box = baseline_polygon.get_bounding_box()
            # [ar for aid, ar in article_rectangles_dict.items() if aid != article_id]
            # print(baseline_bounding_box.get_vertices())
            # print(article_id)
            # Shrink the seed box from the top (at most 20px) while it overlaps rectangles
            # that were already created for OTHER articles.
            for ars in [ar for aid, ar in article_rectangles_dict.items() if aid != article_id]:
                for ar in ars:
                    intersection = ar.intersection(baseline_bounding_box)
                    for _ in range(20):
                        if intersection.width > 0 and intersection.height > 0:
                            baseline_bounding_box.translate(0, 1)
                            baseline_bounding_box.height -= 1
                            intersection = ar.intersection(baseline_bounding_box)
                        else:
                            break
            article_rectangle = ArticleRectangle(baseline_bounding_box.x, baseline_bounding_box.y,
                                                 baseline_bounding_box.width, baseline_bounding_box.height,
                                                 [textline], None)
            used_textline_ids.append(textline.id)
            # NOTE(review): `i` never equals len(sorted_textlines) inside enumerate, so this
            # guard looks dead -- presumably `len(...) - 1` was meant; confirm before changing.
            if i == len(sorted_textlines):
                continue
            # Try to merge every later (lower) text line of the same article into the rectangle.
            for j, textline_compare in enumerate(sorted_textlines[i + 1:]):
                if textline_compare.id in used_textline_ids:
                    continue
                # for tl in article_rectangle.textlines:
                #     print(tl.baseline.points_list)
                baseline_compare = textline_compare.baseline.points_list
                skip = False
                # instead of checking whether the two baselines are aligned, we should check, if the current article
                # rectangle and the baseline_compare are aligned!
                article_rectangle_horizontal_poly = article_rectangle.get_vertices()[:2]
                # if not is_vertical_aligned(baseline, baseline_compare):
                if not is_vertical_aligned(article_rectangle_horizontal_poly, baseline_compare):
                    # Not aligned with the rectangle itself; still accept the candidate if some
                    # later line "bridges" both the seed baseline and the candidate baseline.
                    if i + j + 2 != len(sorted_textlines):
                        for tl in sorted_textlines[i + j + 2:]:
                            if tl.id not in used_textline_ids:
                                if is_vertical_aligned(baseline, tl.baseline.points_list) and is_vertical_aligned(
                                        baseline_compare, tl.baseline.points_list, margin=50):
                                    skip = False
                                    break
                                else:
                                    skip = True
                    else:
                        skip = True
                if skip:
                    continue
                baseline_compare_polygon = textline_compare.baseline.to_polygon()
                if use_surr_polygons:
                    baseline_compare_bounding_box = textline_compare.surr_p.to_polygon().get_bounding_box() if textline_compare.surr_p else baseline_compare_polygon.get_bounding_box()
                else:
                    baseline_compare_bounding_box = baseline_compare_polygon.get_bounding_box()
                merged_rectangle = merge_rectangles([article_rectangle, baseline_compare_bounding_box])
                # Reject the merge if it would overlap ANY already-created rectangle.
                skip = False
                for ars in article_rectangles_dict.values():
                    for ar in ars:
                        intersection = ar.intersection(merged_rectangle)
                        if intersection.width > 0 and intersection.height > 0:
                            skip = True
                            break
                    if skip:
                        break
                if skip:
                    continue
                merged_article_rectangle = ArticleRectangle(merged_rectangle.x, merged_rectangle.y,
                                                            merged_rectangle.width, merged_rectangle.height)
                # if merged_article_rectangle contains any other baseline, that is not yet in an article_rectangle, skip
                # textlines_to_check_intersection = [tl for tl in sorted_textlines if
                #                                    tl.id not in used_textline_ids and tl.id != textline_compare.id]
                textlines_to_check_intersection = []
                textlines_to_check_intersection += [tl for textlines in [article_dict[aid] for aid in article_dict
                                                                         if aid != article_id] for tl in textlines]
                # polygons_to_check_intersection = [tl.surr_p.to_polygon() if tl.surr_p is not None else
                #                                   tl.baseline.to_polygon() for tl in textlines_to_check_intersection]
                polygons_to_check_intersection = [tl.baseline.to_polygon() for tl in textlines_to_check_intersection]
                skip = False
                for polygon in polygons_to_check_intersection:
                    if merged_article_rectangle.contains_polygon(polygon, merged_article_rectangle.x,
                                                                 merged_article_rectangle.y,
                                                                 merged_article_rectangle.width,
                                                                 merged_article_rectangle.height):
                        skip = True
                        # Try to rescue the merge by shrinking a copy from the top (at most 50px)
                        # until the foreign baseline is no longer contained.
                        merged_article_rectangle_copy = copy.deepcopy(merged_article_rectangle)
                        for _ in range(50):
                            merged_article_rectangle_copy.translate(0, 1)
                            merged_article_rectangle_copy.height -= 1
                            if not merged_article_rectangle_copy.contains_polygon(polygon,
                                                                                  merged_article_rectangle_copy.x,
                                                                                  merged_article_rectangle_copy.y,
                                                                                  merged_article_rectangle_copy.width,
                                                                                  merged_article_rectangle_copy.height):
                                skip = False
                                merged_article_rectangle = merged_article_rectangle_copy
                                break
                    if skip:
                        break
                if skip:
                    continue
                # merge accepted: absorb the candidate line and grow the rectangle
                article_rectangle.textlines.append(textline_compare)
                article_rectangle.set_bounds(merged_article_rectangle.x, merged_article_rectangle.y,
                                             merged_article_rectangle.width, merged_article_rectangle.height)
                used_textline_ids.append(textline_compare.id)
            # Single-line rectangles without a surrounding polygon get a fixed 10px-high box
            # shifted 10px upwards (a bare baseline box would have ~zero height).
            if len(article_rectangle.textlines) == 1:
                if article_rectangle.textlines[0].surr_p:
                    # bb = article_rectangle.textlines[0].surr_p.to_polygon().get_bounding_box()
                    # article_rectangle.set_bounds(bb.x, bb.y, bb.width, bb.height)
                    pass
                else:
                    article_rectangle.translate(0, -10)
                    article_rectangle.height = 10
            if stretch:
                img_height = len(binarized_image)
                article_rectangle = stretch_rectangle_until_whitespace(binarized_image, article_rectangle,
                                                                       whitespace_height=max(1, img_height // 1000),
                                                                       stretch_limit=img_height // 10)
            article_rectangles_dict[article_id].append(article_rectangle)
    return article_rectangles_dict
# img_path = "/home/max/data/as/NewsEye_ONB_data_corrected/krz/ONB_krz_19110701_corrected/ONB_krz_19110701_016" \ # ".jpg" # # img_path = "/home/max/devel/projects/article_separation/data/newseye_onb/ibn/ONB_ibn_18640702_corrected/ONB_ibn_18640702_003.tif" xml_path = "/home/max/devel/projects/article_separation/data/newseye_onb/ibn/ONB_ibn_18640702_corrected/page/ONB_ibn_18640702_003.xml" # xml_path = "/home/max/data/as/NewsEye_ONB_data_corrected/ibn/ONB_ibn_19330701_corrected/page/ONB_ibn_19330701_001.xml" # img_path = "/home/max/data/as/NewsEye_ONB_data_corrected/ibn/ONB_ibn_19330701_corrected/ONB_ibn_19330701_001.jpg" # # # # xml_path = "/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18730705_corrected/page/ONB_nfp_18730705_016.xml" # img_path = "/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18730705_corrected/ONB_nfp_18730705_016.tif" # # xml_path = '/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18950706_corrected/page/ONB_nfp_18950706_015.xml' # img_path = '/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18950706_corrected/ONB_nfp_18950706_015.tif' article_rectangles_dict = get_article_rectangles_from_baselines(Page(xml_path), img_path, use_surr_polygons=True, stretch=False) surr_polys_dict = merge_article_rectangles_vertically(article_rectangles_dict) import matplotlib.pyplot as plt from citlab_python_util.parser.xml.page import plot as page_plot from matplotlib.collections import PolyCollection from citlab_python_util.plot import colors # page_plot.plot_pagexml(xml_path, img_path) fig, ax = plt.subplots() page_plot.add_image(ax, img_path) for i, a_id in enumerate(surr_polys_dict):
def get_data_from_pagexml(path_to_pagexml, des_dist=50, max_d=500, use_java_code=True):
    """Load a PageXml file and derive normed baseline polygons and interline distances.

    Text lines without a surrounding polygon get a synthetic one built from their normed
    baseline (shifted upwards by ~0.95 of the interline distance), written back into the
    text line objects of the returned article dictionary.

    :param path_to_pagexml: file path
    :param des_dist: desired distance (measured in pixels) of two adjacent pixels in the normed polygons
    :param max_d: maximum distance (measured in pixels) for the calculation of the interline distances
    :param use_java_code: usage of methods written in java (faster than python!) or not
    :return: two dictionaries: {article id: corresponding list of text lines}
                               {text line id: (normed polygon, interline distance)}
    """
    # load the page xml file
    page_file = Page(path_to_pagexml)
    # get all text lines article wise
    art_txtlines_dict = page_file.get_article_dict()
    # get all text lines of the loaded page file
    lst_of_txtlines = page_file.get_textlines()

    # kept index-parallel: lst_of_polygons[i] is the baseline of lst_of_txtlines_adjusted[i]
    lst_of_polygons = []
    lst_of_txtlines_adjusted = []

    for txtline in lst_of_txtlines:
        try:
            # get the baseline of the text line as polygon
            baseline = txtline.baseline.to_polygon()
            # baselines with less than two points will skipped
            if len(baseline.x_points) == len(baseline.y_points) > 1:
                lst_of_polygons.append(txtline.baseline.to_polygon())
                lst_of_txtlines_adjusted.append(txtline)
        except(AttributeError):
            # txtline.baseline is None -> no polygon available for this line
            # print("'NoneType' object in PAGEXML with id {} has no attribute 'to_polygon'!\n".format(txtline.id))
            continue

    # normed polygons (resampled to roughly `des_dist` pixel spacing)
    lst_of_normed_polygons = norm_poly_dists(poly_list=lst_of_polygons, des_dist=des_dist)
    # interline distances (index-parallel to lst_of_polygons)
    lst_of_intdists = get_list_of_interline_distances(lst_of_polygons=lst_of_polygons, max_d=max_d,
                                                      use_java_code=use_java_code)

    txtline_dict = {}
    for i, txtline in enumerate(lst_of_txtlines_adjusted):
        # check the surrounding polygon of the text line
        if txtline.surr_p is None:
            # build a synthetic surrounding polygon: normed baseline plus the same baseline
            # shifted up by ~0.95 * interline distance (and 1px to the right), traversed back
            normed_polygon = lst_of_normed_polygons[i]
            x_points_shifted = [x + 1 for x in normed_polygon.x_points]
            # y values are shifted upwards by at least one pixel
            y_shift = max(int(0.95 * lst_of_intdists[i]), 1)
            y_points_shifted = [y - y_shift for y in normed_polygon.y_points]
            sp_points = list(zip(normed_polygon.x_points + x_points_shifted[::-1],
                                 normed_polygon.y_points + y_points_shifted[::-1]))
            # write the synthetic polygon back into the matching text line of the article dict
            # NOTE(review): presumably these are the same objects as in lst_of_txtlines -- verify
            for article in art_txtlines_dict:
                for reference_txtline in art_txtlines_dict[article]:
                    if reference_txtline.id == txtline.id:
                        reference_txtline.surr_p = Points(sp_points)
        txtline_dict.update({txtline.id: (lst_of_normed_polygons[i], lst_of_intdists[i])})

    return art_txtlines_dict, txtline_dict
# Fragment of a per-page GT-generation loop (loop header and tail are outside this chunk).
files_exist = check_if_files_exist(
    article_gt_filename, article_boundary_gt_filename, other_gt_filename,
    downscaled_grey_img_filename, rotation_filename)
if files_exist:
    # all GT files already generated for this page -> nothing to do
    print(
        f"GT Files for PageXml {path_to_page_xml} already exist, skipping..."
    )
    continue
# TODO: only generates files with '0's in it -> fix this
with open(rotation_filename, "w") as rot:
    rot.write("0")
page = Page(path_to_page_xml)
img_width, img_height = page.get_image_resolution()
article_rectangle_dict = get_article_rectangles_from_baselines(
    page, path_to_img, use_surr_polygons=args.use_surr_polys,
    stretch=args.use_stretch)
# scale either to a fixed target height or by an explicit factor
if args.fixed_img_height:
    sc_factor = args.fixed_img_height / img_height
else:
    sc_factor = args.scaling_factor
surr_polys_dict = merge_article_rectangles_vertically(
    article_rectangle_dict, min_width_intersect=args.min_width_intersect,
f'Skipping image {curr_img} since the result files for query {_query} already exist.'
)
skip = True
if skip:
    continue
# Get full path
for image_path in image_paths:
    if curr_img in image_path:
        curr_img_path = image_path
        break
# print(curr_img_path)
# open corresponding PAGE file
page_path = get_corresponding_page_path(curr_img_path)
page = Page(page_path)
article_dict = page.get_article_dict()
used_article_ids = []
fig, ax = plt.subplots()
plot.add_image(ax, curr_img_path)
# best confidence seen so far for this image
highest_conf_img = 0.0
# collect all hits of this query on the current image in a per-image text file
with open(
        os.path.join(path_to_query_folder, curr_img + ".txt"),
        'w+') as text_file:
    text_file.write(f"QUERY: '{_query}'\n\n")
    for hit in query_results:
        if hit[0] == curr_img:
# "newseye_as_test_data/xml_files_gt/19000715_1-0003.xml" # path_to_img = "/home/johannes/devel/projects/as/ArticleSeparationMeasure/test/resources/" \ # "newseye_as_test_data/image_files/19000715_1-0003.jpg" path_to_page_xml = '/home/max/devel/projects/article_separation/data/newseye_onb/aze/ONB_aze_18950706_corrected/' \ 'page/ONB_aze_18950706_5.xml' path_to_img = '/home/max/devel/projects/article_separation/data/newseye_onb/aze/ONB_aze_18950706_corrected/' \ 'ONB_aze_18950706_5.jpg' # path_to_page_xml = "/home/johannes/devel/projects/as/ArticleSeparationMeasure/test/resources/" \ # "newseye_as_test_data/xml_files_gt/19420115_1-0002.xml" # path_to_img = "/home/johannes/devel/projects/as/ArticleSeparationMeasure/test/resources/" \ # "newseye_as_test_data/image_files/19420115_1-0002.jpg" path_to_page_xml = path_to_page_xml.strip() page = Page(path_to_page_xml) # Get the article rectangles as a list of ArticleRectangle objects ars, img_height, img_width = get_article_rectangles_from_surr_polygons(page) # resize the image to draw the border polygons (if available) img_height += 1 img_width += 1 print("img_height = {}, img_width = {}".format(img_height, img_width)) # Convert the list of article rectangles to a dictionary with the article ids as keys # and the corresponding list of rectangles as value ars_dict = filter_by_attribute(ars, "a_ids") print("Len(Blank) = ", len(ars_dict["blank"])) # Convert blank article rectangles (by rectangles)