예제 #1
0
def adjust_bbox_by_latex_val(bbox, latex_val):
    """
    Adjust the tight bbox of a char according to the glyph type of its
    LaTeX value.

    The prime symbols (``\\prime``, ``\\dprime``) are returned untouched.
    For a ``\\mathcal{...}`` value the glyph type is looked up for the text
    between the first ``{`` and the first ``}``.

    :param bbox: tight bbox
    :type bbox: BBox
    :param latex_val: string of the latex
    :return: ``bbox`` itself for the prime symbols, a ``BBox`` copy for
        non-stable and centered glyphs, otherwise the result of the
        vertical / horizontal adjustment helper
    :raises Exception: when the glyph type is none of the known ``GT_*``
        constants
    """
    if latex_val in ("\\prime", "\\dprime"):
        # No adjustment is attempted for primes.
        return bbox

    if latex_val.startswith("\\mathcal"):
        # Classify by the wrapped symbol, not by the \mathcal command.
        latex_val = latex_val[latex_val.index("{") + 1:latex_val.index("}")]

    # TODO, find the stat
    glyph_type = get_gt_type_by_latex_val(latex_val)
    if glyph_type in (GT_NON_STABLE, GT_CENTERED):
        return BBox(bbox)
    if glyph_type == GT_HEIGHT_STABLE:
        # adjust by the character
        return adjust_bbox_h_latex_vertically(bbox, latex_val)
    if glyph_type == GT_WIDTH_STABLE:
        return adjust_bbox_h_latex_horizontally(bbox, latex_val)

    # The diagnostic used to be a Python-2-only print statement; it now
    # travels with the exception and names what is actually unknown.
    raise Exception(
        "unknown type for bbox: glyph type {!r} of latex value {!r} "
        "(bbox is a {})".format(glyph_type, latex_val, type(bbox)))
예제 #2
0
def get_char_list_bbox(char_list, remove_accent=False):
    """
    Compute the bounding box enclosing the ``LTChar`` items of a list.

    Items that are not ``LTChar`` instances are ignored.

    :param char_list: iterable of layout items; each ``LTChar`` exposes a
        ``bbox`` indexed as (left, bottom, right, top)
    :param remove_accent: when true, chars whose glyph name (as returned by
        ``get_char_glyph(char, None)``) is in ``accent_name_list`` are
        ignored as well
    :return: ``BBox`` of (min left, min bottom, max right, max top); when no
        char remains a warning is printed and ``BBox([0, 0, 0, 0])`` is
        returned
    """
    if remove_accent:
        # Kept function-local as in the original (possibly to avoid an
        # import cycle -- TODO confirm), but resolved once per call instead
        # of once per character.
        from pdfxml.pdf_util.char_process import get_char_glyph
        from pdfxml.me_taxonomy.math_resources import accent_name_list

    kept_chars = []
    for char in char_list:
        if not isinstance(char, LTChar):
            continue
        if remove_accent and get_char_glyph(char, None) in accent_name_list:
            continue
        kept_chars.append(char)

    if not kept_chars:
        print("WARNING: no bbox for a empty char list")
        return BBox([0, 0, 0, 0])

    return BBox((
        np.min([char.bbox[0] for char in kept_chars]),
        np.min([char.bbox[1] for char in kept_chars]),
        np.max([char.bbox[2] for char in kept_chars]),
        np.max([char.bbox[3] for char in kept_chars]),
    ))
예제 #3
0
def is_horizontal_line(path, m_char_height):
    """
    Tell whether a path looks like a horizontal line.

    The bbox of the path must be less than a third of ``m_char_height``
    tall and strictly wider than it is tall.

    :param path: object whose ``bbox`` attribute is accepted by ``BBox``
    :param m_char_height: reference character height the thickness is
        compared against
    :return: truth value of "thin and flat"
    """
    box = BBox(path.bbox)
    thin = box.height() < m_char_height / 3.0
    if not thin:
        return thin
    return box.width() > box.height()
예제 #4
0
파일: me_group.py 프로젝트: senyalin/pdfxml
 def __init__(self, path_bbox):
     """
     Initialise the object from the bbox of a path.

     The same box is stored as both the adjusted and the tight bbox, and
     ``info`` starts out as an empty dict.

     :param path_bbox: a ``BBox``, or a tuple/list that is wrapped in one
     """
     MEObject.__init__(self)
     box = path_bbox
     if isinstance(box, (tuple, list)):
         box = BBox(box)
     self.set_adjusted_bbox(box)
     self.set_tight_bbox(box)
     self.info = dict()
예제 #5
0
def merge_bbox_list(bbox_list):
    """
    Merge a non-empty sequence of bboxes into one enclosing ``BBox``.

    The boxes are folded pairwise, left to right, with ``merge_bbox``.

    :param bbox_list: sequence of bboxes accepted by ``merge_bbox``
    :return: ``BBox`` built from the merged result
    :raises IndexError: if ``bbox_list`` is empty
    """
    if len(bbox_list) == 0:
        # The guard used to be a bare ``pass`` that fell through to an
        # anonymous IndexError on ``bbox_list[0]``; the exception type is
        # kept for existing callers, the message now states the cause.
        raise IndexError("merge_bbox_list() needs at least one bbox")

    remaining = iter(bbox_list)
    merged = next(remaining)
    for bbox in remaining:
        merged = merge_bbox(merged, bbox)
    return BBox(merged)
예제 #6
0
 def add_bbox(self, name, bbox):
     """
     Register a bbox under ``name``.

     A list or tuple is first wrapped in a ``BBox``.  A box whose
     ``isvalid()`` is false is silently ignored.  Otherwise the box is
     stored in ``name2bbox`` and its horizontal and vertical extents are
     added to ``hor_tree`` and ``ver_tree``, tagged with ``name``.

     :param name: key and tree payload for the box
     :param bbox: ``BBox``, list or tuple
     :return: None
     """
     box = BBox(bbox) if isinstance(bbox, (list, tuple)) else bbox
     if box.isvalid():
         self.name2bbox[name] = box
         self.hor_tree.addi(box.left(), box.right(), name)
         self.ver_tree.addi(box.bottom(), box.top(), name)
예제 #7
0
def bbox_half_overlap_list(b, blist, thres=0.5):
    """
    Tell whether some bbox in ``blist`` covers more than ``thres`` of ``b``.

    Only boxes for which ``b.overlap(...)`` is true are intersected with
    ``b``; the test is ``intersection area > b.area() * thres``.

    :param b: reference bbox, wrapped in ``BBox``
    :param blist: iterable of candidate bboxes
    :param thres: fraction of the area of ``b`` that must be exceeded
    :return: True as soon as one candidate qualifies, otherwise False
    """
    reference = BBox(b)
    required_area = reference.area() * thres
    return any(
        reference.intersect(candidate).area() > required_area
        for candidate in blist
        if reference.overlap(candidate)
    )
예제 #8
0
    def exist_overlap(self, the_bbox):
        """
        Tell whether at least one stored bbox overlaps ``the_bbox``.

        :param the_bbox: ``BBox``, or a list/tuple that is wrapped in one
        :return: True when ``get_overlap_by_bbox`` returns a non-empty
            result, otherwise False
        """
        query = the_bbox
        if isinstance(query, (list, tuple)):
            query = BBox(query)
        return len(self.get_overlap_by_bbox(query)) > 0
예제 #9
0
파일: me_group.py 프로젝트: senyalin/pdfxml
    def set_tight_bbox(self, bbox):
        """
        Store the tight bounding box of this object.

        :param bbox: a ``BBox``, or a tuple/list of exactly four items that
            is wrapped in a ``BBox``
        :return: None
        """
        given_as_sequence = isinstance(bbox, (tuple, list))
        if given_as_sequence:
            assert len(bbox) == 4
        tight = BBox(bbox) if given_as_sequence else bbox

        assert isinstance(tight, BBox)
        self.tight_bbox = tight
예제 #10
0
    def add_bbox_only(self, bbox):
        """
        Register a bbox, using its own string form as the name.

        A list, tuple or dict is first wrapped in a ``BBox``.  A box whose
        ``isvalid()`` is false is silently ignored.  Otherwise the box is
        stored in ``name2bbox`` under ``str(bbox)`` and its horizontal and
        vertical extents are added to ``hor_tree`` and ``ver_tree`` with
        that same string as payload.

        :param bbox: ``BBox``, list, tuple or dict
        :return: None
        """
        if isinstance(bbox, (list, tuple, dict)):
            bbox = BBox(bbox)
        if bbox.isvalid():
            key = str(bbox)
            self.name2bbox[key] = bbox
            self.hor_tree.addi(bbox.left(), bbox.right(), key)
            self.ver_tree.addi(bbox.bottom(), bbox.top(), key)
예제 #11
0
def merge_bbox(bbox1, bbox2):
    """
    Return the smallest box enclosing both ``bbox1`` and ``bbox2``.

    The type of ``bbox1`` selects the representation: for a list or tuple
    both arguments are indexed as (left, bottom, right, top) and a new list
    is returned; for a ``BBox`` both arguments are read through their
    accessor methods and a new ``BBox`` is returned.

    :param bbox1: list, tuple or ``BBox``
    :param bbox2: box in the same representation as ``bbox1``
    :return: merged box as a list or a ``BBox``
    :raises Exception: when ``bbox1`` is none of the supported types
    """
    if isinstance(bbox1, (list, tuple)):
        lower = [min(bbox1[i], bbox2[i]) for i in (0, 1)]
        upper = [max(bbox1[i], bbox2[i]) for i in (2, 3)]
        return lower + upper

    if not isinstance(bbox1, BBox):
        raise Exception('unknown type of bbox')

    left = min(bbox1.left(), bbox2.left())
    bottom = min(bbox1.bottom(), bbox2.bottom())
    right = max(bbox1.right(), bbox2.right())
    top = max(bbox1.top(), bbox2.top())
    return BBox([left, bottom, right, top])
예제 #12
0
def adjust_char_bbox_by_path(lines, paths):
    """
    Shrink, in place, the bbox of every ``LTChar`` that overlaps a path.

    For each overlapping (char, path) pair the char box is split with
    ``v_split`` around the vertical centre of the path, using a margin of
    ``(path height + 1) * 1.1 / 2``, and the char keeps whichever of the
    two returned boxes has the larger area.  The char bbox is re-read for
    every path, so earlier adjustments are visible to later paths.
    NOTE(review): the exact geometry of ``v_split`` is not visible here --
    confirm against ``BBox.v_split``.

    :param lines: sequence of lines, each a sequence of layout items;
        items that are not ``LTChar`` are skipped
    :param paths: iterable of objects exposing a ``bbox``
    :return: the same ``lines`` object
    """
    for line in lines:
        for char in line:
            if not isinstance(char, LTChar):
                continue
            for path in paths:
                path_box = BBox(path.bbox)
                char_box = BBox(char.bbox)
                if not char_box.overlap(path_box):
                    continue
                centre = path_box.v_center()
                margin = (path_box.height() + 1) * 1.1 / 2
                first, second = char_box.v_split(centre, margin)
                larger = first if first.area() > second.area() else second
                char.set_bbox(larger.to_list())
    return lines
예제 #13
0
    def get_UGPs(self):
        """
        Build one ``UnorganizedGroupPath`` per run of entries labelled 1.

        Every maximal run of consecutive entries whose label in
        ``nscs_label_list`` equals 1 yields one UGP.  Each char id listed
        for the run in ``id_list_list_for_nscs`` becomes an
        ``MESymbolGroup`` wrapping an ``MESymbol`` with the char's LaTeX
        value and its (unadjusted) bbox.  Paths are not supplied: every UGP
        receives an empty path list.  This mirrors the collection step of
        ``export_latex`` without the LaTeX generation.

        :return: list of ``UnorganizedGroupPath``, in sentence order
        """
        assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)
        total = len(self.id_list_list_for_nscs)
        ugps = []

        pos = 0
        while pos < total:
            if self.nscs_label_list[pos] != 1:
                pos += 1
                continue

            # Consume the whole run of label-1 entries starting at ``pos``.
            symbol_groups = []
            while pos < total and self.nscs_label_list[pos] == 1:
                for cid in self.id_list_list_for_nscs[pos]:
                    char = self.chars[cid]
                    latex_val = get_latex_val_of_lt_char(char, self.get_font())
                    # TODO, adjust the tight bounding box
                    symbol = MESymbol(latex_val, BBox(char.bbox))
                    symbol_groups.append(MESymbolGroup(symbol))
                pos += 1

            # TODO, the path is not presented here
            ugps.append(UnorganizedGroupPath(symbol_groups, []))
        return ugps
예제 #14
0
    def get_overlap_by_bbox(self, the_bbox):
        """
        Return the names of stored bboxes matching ``the_bbox`` on both axes.

        ``hor_tree`` is sliced with the left/right extent of the query and
        ``ver_tree`` with its bottom/top extent; a name is returned when it
        appears in the ``data`` of both slices.  (The trees are presumably
        ``intervaltree.IntervalTree`` objects, whose slicing yields the
        overlapping intervals -- confirm against the class constructor.)

        :param the_bbox: ``BBox``, or a list/tuple that is wrapped in one.
            Tuples are now accepted as well, matching the sibling methods;
            previously only lists were converted and a tuple failed with
            AttributeError.
        :return: list of names, in no particular order
        """
        if isinstance(the_bbox, (list, tuple)):
            the_bbox = BBox(the_bbox)

        hor_hits = self.hor_tree[the_bbox.left():the_bbox.right()]
        hor_names = set([interval.data for interval in hor_hits])

        ver_hits = self.ver_tree[the_bbox.bottom():the_bbox.top()]
        ver_names = set([interval.data for interval in ver_hits])

        return list(hor_names.intersection(ver_names))
예제 #15
0
def check_pdfbox_word_segmentation_fail(char_list_list, word_info_list):
    """
    Heuristically decide whether the word segmentation looks broken.

    A word is suspicious when its bbox is wider than half of the
    95th-percentile line width; the segmentation is reported as failed when
    more than half of the words are suspicious.  With no lines or no words
    there is nothing to judge and False is returned.

    :param char_list_list: lines, each a list of chars accepted by
        ``get_char_list_bbox``
    :param word_info_list: sequence of dicts with a ``'bbox'`` entry
    :return: True when the segmentation is considered failed
    """
    CHECK_PDFBOX_FAIL_THRES = 0.5

    if len(word_info_list) == 0:
        # Zero suspicious words can never exceed the threshold.
        return False

    line_width_list = [
        get_char_list_bbox(char_list_line).width()
        for char_list_line in char_list_list
    ]
    if not line_width_list:
        # Avoid asking numpy for a percentile of an empty sample.
        return False

    # Reference width: the 95th percentile of the line widths (the former
    # variable name called it a median, which it is not).
    reference_line_width = np.percentile(line_width_list, 95)
    max_word_width = CHECK_PDFBOX_FAIL_THRES * reference_line_width

    failed_count = 0
    for word_info in word_info_list:
        if BBox(word_info['bbox']).width() > max_word_width:
            failed_count += 1
    return failed_count > CHECK_PDFBOX_FAIL_THRES * len(word_info_list)
예제 #16
0
def pdf_extract_lines_raw(pdf_path, pid=0):
    """
    Group the chars of one PDF page into lines.

    Chars are visited in the order returned by ``pdf_extract_chars``.  A
    char joins the current line while it vertically overlaps the bbox of
    the char that opened the line; otherwise the current line is closed and
    the char opens a new one.  A closed line is sorted left to right and
    passed through ``re_group_char_list_seg``.  The last line is now closed
    the same way; it used to skip the sort.

    :param pdf_path: path of the PDF file
    :param pid: page id passed to the extraction helpers
    :return: list of lines as produced by ``re_group_char_list_seg``, after
        ``remove_empty_lines``; an empty list when the page has no chars
    """
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    char_list = pdf_extract_chars(pdf_path, pid)
    if len(char_list) == 0:
        return list()

    def _close_line(line_chars):
        # Order the chars by their left edge, then regroup them.
        line_chars.sort(key=lambda char_arg: char_arg.bbox[0])
        return re_group_char_list_seg(line_chars, fontname2space)

    char_list_list = list()
    # The reference box is the first char of the line; it is not widened as
    # more chars join the line.
    cur_bbox = BBox(char_list[0].bbox)
    tmp_char_list = list()
    for char in char_list:
        if cur_bbox.v_overlap(BBox(char.bbox)):
            tmp_char_list.append(char)
        else:
            char_list_list.append(_close_line(tmp_char_list))
            tmp_char_list = [char]
            cur_bbox = BBox(char.bbox)

    if len(tmp_char_list) > 0:
        char_list_list.append(_close_line(tmp_char_list))

    # clean the lines with only LTAnno
    return remove_empty_lines(char_list_list)
예제 #17
0
def iterative_expanding_idx(ugp, vpos, direction, hrange, options='default'):
    """
    Collect the groups stacked above or below a horizontal span.

    This is a general function serving several structures, such as the
    fraction line and the radical part analysis.

    Candidates are the groups whose tight bbox horizontally overlaps
    ``hrange`` and lies entirely on the requested side of ``vpos``.  The
    candidate closest to ``vpos`` seeds a bbox; the bbox then grows by
    repeatedly absorbing candidates that vertically overlap it, until a
    full pass over the candidates adds nothing.

    :param ugp: UnorganizedGroupPath, only its ``me_groups`` are used here
    :param vpos: vertical position of concern
    :param direction: "up" or "down"; any other value selects no candidate
    :param hrange: tuple of two, the horizontal range (left, right)
    :param options:
        "default", a vertical overlap is sufficient,
        "half_hor", additionally more than half of the group's width must
        fall inside ``hrange``

    :return: list of index for the MEGroup, seed first, then in the order
        the groups were absorbed; empty when there is no candidate
    :rtype: list[int]
    :raises Exception: for an unknown ``options`` value, raised only once
        a candidate vertically overlaps the growing bbox
    """
    # TODO, also consider the me_path, and give exception when matched.
    growing_bbox = BBox([hrange[0], vpos, hrange[1], vpos])
    groups = ugp.me_groups

    # First gather the candidates on the requested side of vpos.
    candidates = []
    for position, group in enumerate(groups):
        if not bbox_h_overlapping(group.get_tight_bbox(), growing_bbox):
            continue
        if direction == "up" and group.get_tight_bbox().bottom() >= vpos:
            candidates.append(position)
        if direction == "down" and group.get_tight_bbox().top() <= vpos:
            candidates.append(position)

    # Order the candidates so that the one nearest to vpos comes first.
    if direction == "up":
        candidates.sort(
            key=lambda position: groups[position].get_tight_bbox().bottom())
    if direction == "down":
        candidates.sort(
            key=lambda position: -1 * groups[position].get_tight_bbox().top())

    if len(candidates) == 0:
        return []

    # The nearest candidate is the seed: it fixes the initial vertical
    # extent of the growing bbox.
    seed_bbox = groups[candidates[0]].get_tight_bbox()
    growing_bbox.set_bottom(seed_bbox.bottom())
    growing_bbox.set_top(seed_bbox.top())

    confirmed = [candidates[0]]
    grew = True
    while grew:
        grew = False
        for position in candidates:
            if position in confirmed:
                continue
            group = groups[position]

            # Kept at its original lazy position inside the loop.
            from pdfxml.me_layout.char_adjust_bbox_core import could_estimate_height
            use_adjusted = isinstance(
                group, MESymbolGroup) and could_estimate_height(group)
            if use_adjusted:
                probe_bbox = group.get_adjusted_bbox()
            else:
                probe_bbox = group.get_tight_bbox()

            # Vertical overlap is required in all cases.
            if not bbox_v_overlapping(growing_bbox, probe_bbox):
                continue

            if options == 'default':
                accepted = True
            elif options == 'half_hor':
                # More than half of the group's width must lie in hrange.
                tight_bbox = group.get_tight_bbox()
                shared_left = max(tight_bbox.left(), hrange[0])
                shared_right = min(tight_bbox.right(), hrange[1])
                accepted = (
                    shared_right >= shared_left and
                    shared_right - shared_left >
                    (tight_bbox.right() - tight_bbox.left()) / 2)
            else:
                raise Exception("unknown options: {}".format(options))

            if accepted:
                confirmed.append(position)
                grew = True
                growing_bbox = merge_bbox(
                    growing_bbox, group.get_tight_bbox())
    return confirmed
예제 #18
0
def get_median_char_height_by_char_list(char_list):
    """
    Return the median bbox height of the ``LTChar`` items in ``char_list``.

    Items that are not ``LTChar`` instances are ignored.

    :param char_list: iterable of layout items
    :return: ``np.median`` of the collected heights
    """
    heights = []
    for char in char_list:
        if isinstance(char, LTChar):
            heights.append(BBox(char.bbox).height())
    return np.median(heights)
예제 #19
0
def merge_line_basic(char_list_list, fontname2space):
    """
    Merge lines that vertically overlap into single lines.

    Intended to run after the accents have been merged with their lines.
    NOTE (from the original author): known to have bugs that create
    duplicate characters.

    Steps:
      1. compute a bbox per line, ignoring accents to avoid over merging,
         and trim a sixth of its height from the top and from the bottom;
      2. visit the lines from top to bottom (by the top of the trimmed
         bbox);
      3. a line that vertically overlaps the running bbox is merged into
         the current group and widens that bbox; otherwise the current
         group is closed and the line opens a new group.

    A closed group keeps only its ``LTChar`` items, sorted left to right,
    and is passed through ``re_group_char_list_seg``.  The last group is
    now closed the same way; it used to skip the filter and the sort.

    :param char_list_list: list of lines, each a list of layout items
    :param fontname2space: passed through to ``re_group_char_list_seg``
    :return: list of merged lines; the input itself when it is empty
    """
    if len(char_list_list) == 0:
        return char_list_list

    line_bbox_list = []
    for char_list in char_list_list:
        # remove the accent is to avoid over merging.
        full_bbox = get_char_list_bbox(char_list, remove_accent=True)

        # Reduce the height by 1/3 in total: a sixth from each side.
        trim = full_bbox.height() / 6
        line_bbox_list.append(BBox([
            full_bbox.left(),
            full_bbox.bottom() + trim,
            full_bbox.right(),
            full_bbox.top() - trim
        ]))

    # Top-most line first.  sorted() is stable like list.sort() and also
    # works where range() is not a list.
    sorted_line_ids = sorted(
        range(len(char_list_list)),
        key=lambda lid: -line_bbox_list[lid].top())

    def _close_group(group_chars):
        # Keep the LTChar items only, order them left to right, regroup.
        kept = [c for c in group_chars if isinstance(c, LTChar)]
        kept.sort(key=lambda c: c.bbox[0])
        return re_group_char_list_seg(kept, fontname2space)

    return_char_list_list = []
    tmp_char_list = []
    tmp_bbox = line_bbox_list[sorted_line_ids[0]]
    for sort_lid in sorted_line_ids:
        if tmp_bbox.v_overlap(line_bbox_list[sort_lid]):
            tmp_char_list.extend(char_list_list[sort_lid])
            tmp_bbox = merge_bbox_list([tmp_bbox, line_bbox_list[sort_lid]])
        else:
            return_char_list_list.append(_close_group(tmp_char_list))

            # create a new line to merge
            tmp_char_list = list(char_list_list[sort_lid])
            tmp_bbox = line_bbox_list[sort_lid]

    if len(tmp_char_list) > 0:
        return_char_list_list.append(_close_group(tmp_char_list))
    return return_char_list_list
예제 #20
0
    def export_latex(self):
        """
        Export the sentence as a LaTeX string.

        Entries of ``nscs_label_list`` equal to 1 are treated as math: each
        maximal run of them is turned into symbol groups, parsed through
        ``ugp2hgroup`` and emitted as ``$...$``.  When the parsing raises,
        the raw text of the run is emitted inside ``$...$`` instead.
        Entries with any other label are emitted as their text.  Every
        emitted piece is followed by a single space.

        On Python 2 unicode text is encoded as UTF-8 before it is appended;
        on Python 3, where ``unicode`` does not exist, text is appended as
        is.

        :return: the assembled string with surrounding whitespace stripped
        """
        print("Start exporting LaTeX")
        assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

        try:
            py2_unicode = unicode  # only defined on Python 2
        except NameError:
            py2_unicode = None

        def _text_of(cid_list):
            # Concatenate the text of the chars (UTF-8 bytes on Python 2).
            text = "".join([self.text_list[cid] for cid in cid_list])
            if py2_unicode is not None and isinstance(text, py2_unicode):
                text = text.encode("utf-8")
            return text

        res = ""
        nscs_id = 0  # the id for nscs
        nscs_num = len(self.id_list_list_for_nscs)

        while nscs_id < nscs_num:
            if self.nscs_label_list[nscs_id] != 1:
                res += _text_of(self.id_list_list_for_nscs[nscs_id]) + " "
                nscs_id += 1
                continue

            # Consume the whole run of label-1 entries starting here.
            me_str = ""
            me_symbol_groups = []  # prepare the me_symbol group for parsing
            while nscs_id < nscs_num and self.nscs_label_list[nscs_id] == 1:
                cid_list = self.id_list_list_for_nscs[nscs_id]
                chunk = _text_of(cid_list)

                # convert from char to latex value.
                for cid in cid_list:
                    latex_val = get_latex_val_of_lt_char(
                        self.chars[cid], self.get_font())
                    # TODO, adjust the tight bounding box
                    bbox = BBox(self.chars[cid].bbox)
                    me_symbol_groups.append(
                        MESymbolGroup(MESymbol(latex_val, bbox)))

                me_str += chunk
                nscs_id += 1

            # TODO, NOTE, remove the try catch to get all the parsing here
            try:
                print("HOWDY!!!")
                # TODO, the path is not presented here
                ugp = UnorganizedGroupPath(me_symbol_groups, [])
                latex_str = ugp2hgroup(ugp).to_latex()
                res += "${}$ ".format(latex_str)
            except Exception:
                # Best effort: fall back to the raw text of the run.
                print("OH NO!!!")
                res += "${}$ ".format(me_str)

        return res.strip()
예제 #21
0
파일: me_group.py 프로젝트: senyalin/pdfxml
 def set_adjusted_bbox(self, bbox):
     """
     Store the adjusted bounding box of this object.

     :param bbox: a ``BBox``, or a tuple/list that is wrapped in a ``BBox``
     :return: None
     """
     adjusted = BBox(bbox) if isinstance(bbox, (tuple, list)) else bbox
     assert isinstance(adjusted, BBox)
     self.adjusted_bbox = adjusted
예제 #22
0
def get_adj_bbox(char):
    """
    Return the bounding box of ``char`` as a ``BBox``, without adjustment.

    In the past the box of italic chars was narrowed around its horizontal
    centre (width reduced by ``height * 36 / 150``).  The original author
    noted that this was not accurate, and the code sat unreachable behind
    the return statement; it has been removed.

    :param char: the char whose ``bbox`` is wrapped
    :type char: LTChar
    :return: ``BBox`` built from ``char.bbox``
    """
    assert isinstance(char, LTChar)
    return BBox(char.bbox)