def get_article_rectangles_from_surr_polygons(page, use_max_rect_size=True, max_d=0,
                                              max_rect_size_scale=1 / 50, max_d_scale=1 / 20):
    """Given the PageXml file `page` return the corresponding article subregions as a list of
    ArticleRectangle objects. Also returns the height and width of the image (NOT of the PrintSpace).

    :param page: Either the path to the PageXml file or a Page object.
    :type page: Union[str, Page]
    :param use_max_rect_size: whether to use a max rectangle size for the article rectangles or not;
        if False, `max_rect_size=0` is passed to the subregion creation.
    :type use_max_rect_size: bool
    :param max_d: forwarded as `max_d` to `create_subregions_from_surrounding_polygon`. A falsy value
        (the default 0) means "derive it": `int(max_d_scale * print space bounding box height)`.
    :type max_d: int
    :param max_rect_size_scale: fraction of the print space bounding box height used as the maximal
        rectangle size (only used if `use_max_rect_size` is True).
    :type max_rect_size_scale: float
    :param max_d_scale: fraction of the print space bounding box height used for `max_d` when no
        explicit `max_d` is given.
    :type max_d_scale: float
    :return: the article subregion list, the height and the width of the image
    """
    if isinstance(page, str):
        page = Page(page)
    assert isinstance(page, Page), f"Type must be Page, got {type(page)} instead."

    ps_coords = page.get_print_space_coords()
    ps_poly = Points(ps_coords).to_polygon()
    # Maybe check if the surrounding Rectangle of the polygon has corners given by ps_poly
    ps_bbox = ps_poly.get_bounding_box()

    # First ArticleRectangle to consider: the bounding box of the print space with all text lines
    ps_rectangle = ArticleRectangle(ps_bbox.x, ps_bbox.y, ps_bbox.width, ps_bbox.height,
                                    page.get_textlines())

    # Both limits are expressed relative to the height of the print space rectangle.
    max_rect_size = int(max_rect_size_scale * ps_rectangle.height) if use_max_rect_size else 0
    if not max_d:
        max_d = int(max_d_scale * ps_rectangle.height)

    ars = ps_rectangle.create_subregions_from_surrounding_polygon(max_d=max_d,
                                                                  max_rect_size=max_rect_size)

    # The resolution is unpacked as (width, height) but returned as (height, width).
    img_width, img_height = page.get_image_resolution()

    return ars, img_height, img_width
def plot_pagexml(page, path_to_img, ax=None, plot_article=True, plot_legend=True, fill_regions=False,
                 use_page_image_resolution=False):
    """Collect baselines, surrounding polygons, word polygons and regions of a PageXml file and
    hand them to `plot_ax` for plotting.

    :param page: Either the path to the PageXml file or a Page object.
    :param path_to_img: forwarded unchanged to `plot_ax`.
    :param ax: forwarded unchanged to `plot_ax`.
    :param plot_article: if True every article id gets its own baseline color (text lines without
        an article id, i.e. id `None`, get `DEFAULT_COLOR`); if False all baselines use `DEFAULT_COLOR`.
    :param plot_legend: forwarded unchanged to `plot_ax`.
    :param fill_regions: forwarded unchanged to `plot_ax`.
    :param use_page_image_resolution: if True the image resolution stored in the page is forwarded
        to `plot_ax` as `height`/`width`, otherwise both are passed as None.
    """
    if isinstance(page, str):
        page = Page(page)
    assert isinstance(page, Page), f"Type must be Page, got {type(page)} instead."

    # Get baselines based on the article id
    article_dict = page.get_article_dict()
    if not article_dict:
        bcolors = []
        blines_list = []
    else:
        article_ids = sorted(article_dict, key=functools.cmp_to_key(compare_article_ids))

        if plot_article:
            # Real article ids take the leading entries of COLORS; the "no article" id None is
            # mapped to DEFAULT_COLOR explicitly instead of relying on None being sorted last.
            article_colors = dict(zip((a_id for a_id in article_ids if a_id is not None), COLORS))
            if None in article_dict:
                article_colors[None] = DEFAULT_COLOR
            bcolors = [article_colors[a_id] for a_id in article_ids]
        else:
            bcolors = [DEFAULT_COLOR] * len(article_dict)

        # One list of baselines per article, in the same order as bcolors; text lines without a
        # baseline are skipped.
        blines_list = [
            [textline.baseline.points_list for textline in article_dict[a_id] if textline.baseline]
            for a_id in article_ids
        ]

    region_dict = page.get_regions()
    if not region_dict:
        rcolors = {}
        region_dict_polygons = {}
    else:
        rcolors = {
            page_constants.sTEXTREGION: "darkgreen",
            page_constants.sSEPARATORREGION: "darkviolet",
            page_constants.sGRAPHICREGION: "darkcyan",
            page_constants.sIMAGEREGION: "darkblue",
            page_constants.sTABLEREGION: "darkorange",
            page_constants.sADVERTREGION: "yellow",
            page_constants.sLINEDRAWINGREGION: "salmon",
            page_constants.sCHARTREGION: "brown",
            page_constants.sCHEMREGION: "navy",
            page_constants.sMATHSREGION: "crimson",
            page_constants.sNOISEREGION: "darkkhaki",
            page_constants.sMUSICREGION: "firebrick",
            page_constants.sUNKNOWNREGION: "darkorchid",
        }
        region_dict_polygons = {
            region_name: [region.points.points_list for region in regions]
            for region_name, regions in region_dict.items()
        }

    # Get surrounding polygons of the text lines and of the words
    textlines = page.get_textlines()
    surr_polys = [tl.surr_p.points_list for tl in textlines if (tl and tl.surr_p)]

    words = page.get_words()
    word_polys = [word.surr_p.points_list for word in words if (word and word.surr_p)]

    if use_page_image_resolution:
        # NOTE(review): get_article_rectangles_from_surr_polygons unpacks the same call as
        # (width, height); confirm which order Page.get_image_resolution() really returns.
        page_height, page_width = page.get_image_resolution()
    else:
        page_height = page_width = None

    plot_ax(ax, path_to_img, blines_list, surr_polys, bcolors, region_dict_polygons, rcolors,
            word_polys, plot_legend, fill_regions=fill_regions, height=page_height, width=page_width)
def _index_textlines_by_id(article_dict):
    """Map every text line id to the list of text line objects in `article_dict` carrying that id."""
    index = {}
    for txtlines in article_dict.values():
        for txtline in txtlines:
            index.setdefault(txtline.id, []).append(txtline)
    return index


def get_data_from_pagexml(path_to_pagexml, des_dist=50, max_d=500, use_java_code=True):
    """Load a PageXml file and compute normed baseline polygons and interline distances.

    Text lines of the article dictionary that have no surrounding polygon get a generated one
    assigned in place (the normed baseline plus a copy shifted upwards).

    :param path_to_pagexml: file path
    :param des_dist: passed to `norm_poly_dists` as the desired distance (measured in pixels) of two
        adjacent points in the normed polygons
    :param max_d: maximum distance (measured in pixels) for the calculation of the interline distances
    :param use_java_code: usage of methods written in java (faster than python!) or not
    :return: two dictionaries: {article id: corresponding list of text lines}
                               {text line id: (normed polygon, interline distance)}
    """
    # load the page xml file
    page_file = Page(path_to_pagexml)

    # get all text lines article wise
    art_txtlines_dict = page_file.get_article_dict()
    # get all text lines of the loaded page file
    lst_of_txtlines = page_file.get_textlines()

    lst_of_polygons = []
    lst_of_txtlines_adjusted = []

    for txtline in lst_of_txtlines:
        try:
            # get the baseline of the text line as polygon
            baseline = txtline.baseline.to_polygon()
        except AttributeError:
            # text lines without a baseline (baseline is None) are skipped
            continue

        # baselines with less than two points will be skipped
        if len(baseline.x_points) == len(baseline.y_points) > 1:
            lst_of_polygons.append(baseline)
            lst_of_txtlines_adjusted.append(txtline)

    # normed polygons
    lst_of_normed_polygons = norm_poly_dists(poly_list=lst_of_polygons, des_dist=des_dist)
    # interline distances
    lst_of_intdists = get_list_of_interline_distances(lst_of_polygons=lst_of_polygons, max_d=max_d,
                                                      use_java_code=use_java_code)

    txtline_dict = {}
    # id -> text lines of the article dict; built on first use so that the article dict is
    # scanned once instead of once per text line without surrounding polygon
    reference_txtlines = None

    for i, txtline in enumerate(lst_of_txtlines_adjusted):
        normed_polygon = lst_of_normed_polygons[i]
        interline_dist = lst_of_intdists[i]

        # check the surrounding polygon of the text line
        if txtline.surr_p is None:
            x_points_shifted = [x + 1 for x in normed_polygon.x_points]
            # y values are shifted upwards by at least one pixel
            y_shift = max(int(0.95 * interline_dist), 1)
            y_points_shifted = [y - y_shift for y in normed_polygon.y_points]

            # closed outline: along the normed baseline, then back along the shifted copy
            sp_points = list(zip(normed_polygon.x_points + x_points_shifted[::-1],
                                 normed_polygon.y_points + y_points_shifted[::-1]))

            if reference_txtlines is None:
                reference_txtlines = _index_textlines_by_id(art_txtlines_dict)

            # only the text line objects of the article dict are updated, matched by id
            for reference_txtline in reference_txtlines.get(txtline.id, ()):
                reference_txtline.surr_p = Points(sp_points)

        txtline_dict[txtline.id] = (normed_polygon, interline_dist)

    return art_txtlines_dict, txtline_dict