Пример #1
0
def get_article_rectangles_from_surr_polygons(page, use_max_rect_size=True, max_d=0, max_rect_size_scale=1 / 50,
                                              max_d_scale=1 / 20):
    """Split the print space of a PageXml page into article subregions (ArticleRectangle objects).

    The bounding box of the page's print space is wrapped in an ArticleRectangle together with all text lines of
    the page; that rectangle is then subdivided via `create_subregions_from_surrounding_polygon`.
    Besides the subregions, the height and width of the image (NOT of the PrintSpace) are returned.

    :param page: Either the path to the PageXml file or a Page object.
    :type page: Union[str, Page]
    :param use_max_rect_size: whether to use a max rectangle size for the article rectangles or not; if False,
        `max_rect_size=0` is passed to the subdivision
    :type use_max_rect_size: bool
    :param max_d: value forwarded as `max_d` to the subdivision; a falsy value (the default 0) means it is derived
        as `int(max_d_scale * print_space_height)`
    :type max_d: int
    :param max_rect_size_scale: factor applied to the print space height to get the max rectangle size
        (only used if `use_max_rect_size` is True)
    :type max_rect_size_scale: float
    :param max_d_scale: factor applied to the print space height to get `max_d` (only used if `max_d` is falsy)
    :type max_d_scale: float
    :return: the article subregion list, the height and the width of the image (in exactly this order)
    :raises TypeError: if `page` is neither a path nor a Page object
    """
    if isinstance(page, str):
        page = Page(page)

    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if not isinstance(page, Page):
        raise TypeError(f"Type must be Page, got {type(page)} instead.")

    ps_coords = page.get_print_space_coords()
    ps_poly = Points(ps_coords).to_polygon()
    # Maybe check if the surrounding Rectangle of the polygon has corners given by ps_poly
    ps_bounding_box = ps_poly.get_bounding_box()

    # First ArticleRectangle to consider: the whole print space with every text line of the page
    ps_rectangle = ArticleRectangle(ps_bounding_box.x, ps_bounding_box.y,
                                    ps_bounding_box.width, ps_bounding_box.height,
                                    page.get_textlines())

    max_rect_size = int(max_rect_size_scale * ps_rectangle.height) if use_max_rect_size else 0
    if not max_d:
        max_d = int(max_d_scale * ps_rectangle.height)

    ars = ps_rectangle.create_subregions_from_surrounding_polygon(max_d=max_d, max_rect_size=max_rect_size)

    # The resolution is unpacked as (width, height) but returned as (height, width) - see the docstring.
    img_width, img_height = page.get_image_resolution()

    return ars, img_height, img_width
Пример #2
0
def plot_pagexml(page,
                 path_to_img,
                 ax=None,
                 plot_article=True,
                 plot_legend=True,
                 fill_regions=False,
                 use_page_image_resolution=False):
    """Collect the drawable content of a PageXml page and hand it to `plot_ax`.

    Gathered are: the baselines grouped by article id (with one color per group), the polygons of all regions
    grouped by region name (with one color per known region type), the surrounding polygons of the text lines and
    the surrounding polygons of the words.

    :param page: Either the path to the PageXml file or a Page object.
    :param path_to_img: forwarded unchanged to `plot_ax`
    :param ax: forwarded unchanged to `plot_ax`
    :param plot_article: if True every article id gets its own color (the id None gets DEFAULT_COLOR),
        otherwise all baseline groups are colored with DEFAULT_COLOR
    :param plot_legend: forwarded unchanged to `plot_ax`
    :param fill_regions: forwarded to `plot_ax` as `fill_regions`
    :param use_page_image_resolution: if True the image resolution stored in the page is forwarded to `plot_ax`
        as `height`/`width`, otherwise None is passed for both
    :raises TypeError: if `page` is neither a path nor a Page object
    """
    # Function-scope import so this block does not depend on the module's import section.
    import itertools

    if isinstance(page, str):
        page = Page(page)
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if not isinstance(page, Page):
        raise TypeError(f"Type must be Page, got {type(page)} instead.")

    # get baselines based on the article id
    article_dict = page.get_article_dict()
    if not article_dict:
        bcolors = []
        blines_list = []
    else:
        unique_ids = sorted(set(article_dict.keys()),
                            key=functools.cmp_to_key(compare_article_ids))
        if plot_article:
            # Real article ids take the palette colors in sorted order; the palette is cycled so that pages
            # with more articles than colors still get a color for every id (colors repeat in that case).
            named_ids = [article_id for article_id in unique_ids if article_id is not None]
            article_colors = dict(zip(named_ids, itertools.cycle(COLORS)))
            # Text lines without an article id are always drawn in the default color.
            article_colors[None] = DEFAULT_COLOR
            bcolors = [article_colors[article_id] for article_id in unique_ids]
        else:
            bcolors = [DEFAULT_COLOR] * len(unique_ids)
        # One list of baselines per article id, in the same order as `bcolors`.
        blines_list = [[
            textline.baseline.points_list
            for textline in article_dict[article_id]
            if textline.baseline
        ] for article_id in unique_ids]

    region_dict = page.get_regions()
    if not region_dict:
        rcolors = {}
        region_dict_polygons = {}
    else:
        rcolors = {
            page_constants.sTEXTREGION: "darkgreen",
            page_constants.sSEPARATORREGION: "darkviolet",
            page_constants.sGRAPHICREGION: "darkcyan",
            page_constants.sIMAGEREGION: "darkblue",
            page_constants.sTABLEREGION: "darkorange",
            page_constants.sADVERTREGION: "yellow",
            page_constants.sLINEDRAWINGREGION: "salmon",
            page_constants.sCHARTREGION: "brown",
            page_constants.sCHEMREGION: "navy",
            page_constants.sMATHSREGION: "crimson",
            page_constants.sNOISEREGION: "darkkhaki",
            page_constants.sMUSICREGION: "firebrick",
            page_constants.sUNKNOWNREGION: "darkorchid"
        }
        region_dict_polygons = {
            region_name: [region.points.points_list for region in regions]
            for region_name, regions in region_dict.items()
        }

    # get surrounding polygons (entries that are falsy or lack a surrounding polygon are skipped)
    textlines = page.get_textlines()
    surr_polys = [
        tl.surr_p.points_list for tl in textlines if (tl and tl.surr_p)
    ]

    words = page.get_words()
    word_polys = [
        word.surr_p.points_list for word in words if (word and word.surr_p)
    ]

    if use_page_image_resolution:
        # NOTE(review): unpacked as (width, height) to agree with get_article_rectangles_from_surr_polygons
        # in this file; this function previously unpacked (height, width) - confirm against
        # Page.get_image_resolution.
        page_width, page_height = page.get_image_resolution()
    else:
        page_height = page_width = None

    plot_ax(ax,
            path_to_img,
            blines_list,
            surr_polys,
            bcolors,
            region_dict_polygons,
            rcolors,
            word_polys,
            plot_legend,
            fill_regions=fill_regions,
            height=page_height,
            width=page_width)
def get_data_from_pagexml(path_to_pagexml, des_dist=50, max_d=500, use_java_code=True):
    """Load a PageXml file and compute a normed baseline polygon and an interline distance for its text lines.

    Text lines without a usable baseline (no baseline at all or fewer than two points) are skipped. For every
    remaining text line that has no surrounding polygon, a thin polygon is built from its normed baseline and
    written to the text lines with the same id in the returned article dictionary.

    :param path_to_pagexml: file path
    :param des_dist: desired distance (measured in pixels) of two adjacent pixels in the normed polygons
    :param max_d: maximum distance (measured in pixels) for the calculation of the interline distances
    :param use_java_code: usage of methods written in java (faster than python!) or not

    :return: two dictionaries: {article id: corresponding list of text lines}
                               {text line id: (normed polygon, interline distance)}
    """
    # load the page xml file
    page_file = Page(path_to_pagexml)

    # get all text lines article wise
    art_txtlines_dict = page_file.get_article_dict()
    # get all text lines of the loaded page file
    lst_of_txtlines = page_file.get_textlines()

    lst_of_polygons = []
    lst_of_txtlines_adjusted = []

    for txtline in lst_of_txtlines:
        try:
            # get the baseline of the text line as polygon
            baseline = txtline.baseline.to_polygon()
        except AttributeError:
            # no baseline to convert (e.g. the baseline is None) -> skip this text line
            continue
        # baselines with less than two points are skipped
        if len(baseline.x_points) == len(baseline.y_points) > 1:
            lst_of_polygons.append(baseline)
            lst_of_txtlines_adjusted.append(txtline)

    # normed polygons
    lst_of_normed_polygons = norm_poly_dists(poly_list=lst_of_polygons, des_dist=des_dist)
    # interline distances
    lst_of_intdists = get_list_of_interline_distances(lst_of_polygons=lst_of_polygons, max_d=max_d,
                                                      use_java_code=use_java_code)

    # id -> text lines of the article dictionary; built on first use only
    references_by_id = None

    txtline_dict = {}
    for i, txtline in enumerate(lst_of_txtlines_adjusted):
        normed_polygon = lst_of_normed_polygons[i]
        interline_dist = lst_of_intdists[i]

        # check the surrounding polygon of the text line
        if txtline.surr_p is None:
            x_points_shifted = [x + 1 for x in normed_polygon.x_points]
            # y values are shifted upwards by at least one pixel
            y_shift = max(int(0.95 * interline_dist), 1)
            y_points_shifted = [y - y_shift for y in normed_polygon.y_points]

            # outline: along the normed baseline, then back along the shifted copy
            sp_points = list(zip(normed_polygon.x_points + x_points_shifted[::-1],
                                 normed_polygon.y_points + y_points_shifted[::-1]))

            if references_by_id is None:
                references_by_id = _group_textlines_by_id(art_txtlines_dict)
            # the article dictionary is part of the return value, so its text lines are the ones updated
            for reference_txtline in references_by_id.get(txtline.id, ()):
                reference_txtline.surr_p = Points(sp_points)

        txtline_dict[txtline.id] = (normed_polygon, interline_dist)

    return art_txtlines_dict, txtline_dict


def _group_textlines_by_id(art_txtlines_dict):
    """Map each text line id to the list of text lines in `art_txtlines_dict` carrying that id."""
    textlines_by_id = {}
    # assumes text line ids are hashable (presumably strings) - TODO confirm
    for article_txtlines in (art_txtlines_dict or {}).values():
        for reference_txtline in article_txtlines:
            textlines_by_id.setdefault(reference_txtline.id, []).append(reference_txtline)
    return textlines_by_id