def adjust_bbox_by_latex_val(bbox, latex_val):
    """
    Adjust the tight bbox of a char according to the glyph type of its
    LaTeX value.

    :param bbox: tight bbox
    :type bbox: BBox
    :param latex_val: string of the latex
    :return: ``bbox`` itself (not re-wrapped) for ``\\prime``/``\\dprime``;
        a fresh ``BBox(bbox)`` for non-stable and centered glyphs; the
        result of the vertical / horizontal adjust helper for
        height-stable / width-stable glyphs
    :raises Exception: when the glyph type is none of the known constants
    """
    # Primes are handed back untouched.
    if latex_val in ["\\prime", "\\dprime"]:
        return bbox

    # For "\mathcal{X}" only the text between the first "{" and the first
    # "}" is used to look up the glyph type.
    if latex_val.startswith("\\mathcal"):
        inner_start = latex_val.index("{") + 1
        inner_end = latex_val.index("}")
        latex_val = latex_val[inner_start:inner_end]

    # TODO, find the stat
    glyph_type = get_gt_type_by_latex_val(latex_val)

    if glyph_type == GT_NON_STABLE:
        return BBox(bbox)
    if glyph_type == GT_HEIGHT_STABLE:
        # adjust by the character
        return adjust_bbox_h_latex_vertically(bbox, latex_val)
    if glyph_type == GT_CENTERED:
        return BBox(bbox)
    if glyph_type == GT_WIDTH_STABLE:
        return adjust_bbox_h_latex_horizontally(bbox, latex_val)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(type(bbox))
    raise Exception("unknown type for bbox")
def get_char_list_bbox(char_list, remove_accent=False):
    """
    Compute the bounding box enclosing every ``LTChar`` in ``char_list``.

    Elements that are not ``LTChar`` instances are ignored.

    :param char_list: iterable of layout objects
    :param remove_accent: when True, chars whose glyph name (as returned by
        ``get_char_glyph(char, None)``) is in ``accent_name_list`` are
        skipped as well
    :return: ``BBox`` spanning the kept chars, or ``BBox([0, 0, 0, 0])``
        (after printing a warning) when no char is kept
    """
    if remove_accent:
        # Kept function-local as in the original, but resolved once per
        # call instead of once per character.
        from pdfxml.pdf_util.char_process import get_char_glyph
        from pdfxml.me_taxonomy.math_resources import accent_name_list

    kept_chars = []
    for char in char_list:
        if not isinstance(char, LTChar):
            continue
        if remove_accent and get_char_glyph(char, None) in accent_name_list:
            continue
        kept_chars.append(char)

    # One emptiness check is enough: the four coordinate lists below always
    # have the same length as kept_chars.
    if not kept_chars:
        print("WARNING: no bbox for a empty char list")
        return BBox([0, 0, 0, 0])

    new_bbox = (
        np.min([char.bbox[0] for char in kept_chars]),  # left
        np.min([char.bbox[1] for char in kept_chars]),  # bottom
        np.max([char.bbox[2] for char in kept_chars]),  # right
        np.max([char.bbox[3] for char in kept_chars]),  # top
    )
    return BBox(new_bbox)
def is_horizontal_line(path, m_char_height):
    """
    Tell whether a path looks like a horizontal line.

    The path qualifies when its bbox height is below one third of
    ``m_char_height`` and its bbox is strictly wider than it is tall.

    :param path: object whose ``bbox`` attribute is accepted by ``BBox``
    :param m_char_height: reference character height
    :return: truthy when the path is both thin and flat
    """
    box = BBox(path.bbox)
    thin_limit = m_char_height / 3.0
    return box.height() < thin_limit and box.width() > box.height()
def __init__(self, path_bbox):
    """
    Initialise the object from a single bounding box.

    The same box is stored as both the adjusted and the tight bbox, and
    ``info`` starts as an empty dict.

    :param path_bbox: a ``BBox``, or a tuple/list that is wrapped in one
    """
    MEObject.__init__(self)
    box = BBox(path_bbox) if isinstance(path_bbox, (tuple, list)) else path_bbox
    self.set_adjusted_bbox(box)
    self.set_tight_bbox(box)
    self.info = {}
def merge_bbox_list(bbox_list):
    """
    Merge a sequence of bboxes into one enclosing ``BBox``.

    The boxes are folded left to right with ``merge_bbox``.

    :param bbox_list: non-empty sequence of bboxes accepted by ``merge_bbox``
    :return: ``BBox`` wrapping the merged result
    :raises IndexError: if ``bbox_list`` is empty (the original guard was a
        no-op ``pass`` that fell through to the same exception type with an
        unhelpful message)
    """
    if len(bbox_list) == 0:
        raise IndexError("cannot merge an empty bbox list")

    merged = bbox_list[0]
    for bbox in bbox_list[1:]:
        merged = merge_bbox(merged, bbox)
    return BBox(merged)
def add_bbox(self, name, bbox):
    """
    Register a named bbox in the name map and in both interval indexes.

    Invalid boxes (``bbox.isvalid()`` is falsy) are silently ignored.

    :param name: key stored in ``name2bbox`` and as the payload in both trees
    :param bbox: a ``BBox``, or a list/tuple that is wrapped in one
    :return: None
    """
    if isinstance(bbox, (list, tuple)):
        bbox = BBox(bbox)

    if bbox.isvalid():
        self.name2bbox[name] = bbox
        # Horizontal extent goes to hor_tree, vertical extent to ver_tree.
        self.hor_tree.addi(bbox.left(), bbox.right(), name)
        self.ver_tree.addi(bbox.bottom(), bbox.top(), name)
def bbox_half_overlap_list(b, blist, thres=0.5):
    """
    Check whether any bbox in ``blist`` covers a large enough share of ``b``.

    :param b: reference bbox (wrapped in ``BBox``)
    :param blist: iterable of candidate bboxes
    :param thres: required fraction of ``b``'s area
    :return: True if some candidate overlaps ``b`` and the intersection
        area is strictly greater than ``thres * b.area()``; False otherwise
    """
    ref_box = BBox(b)
    min_area = ref_box.area() * thres
    # The intersection is only computed for candidates that overlap at all.
    return any(
        ref_box.overlap(cand) and ref_box.intersect(cand).area() > min_area
        for cand in blist
    )
def exist_overlap(self, the_bbox):
    """
    Report whether any registered bbox overlaps ``the_bbox``.

    :param the_bbox: a ``BBox``, or a list/tuple that is wrapped in one
    :return: True if ``get_overlap_by_bbox`` returns at least one name
    """
    if isinstance(the_bbox, (list, tuple)):
        the_bbox = BBox(the_bbox)
    return len(self.get_overlap_by_bbox(the_bbox)) > 0
def set_tight_bbox(self, bbox):
    """
    Store the tight bounding box.

    :param bbox: a ``BBox``, or a 4-element tuple/list that is wrapped in one
    :return: None
    """
    is_raw_sequence = isinstance(bbox, (tuple, list))
    if is_raw_sequence:
        assert len(bbox) == 4
    tight = BBox(bbox) if is_raw_sequence else bbox
    assert isinstance(tight, BBox)
    self.tight_bbox = tight
def add_bbox_only(self, bbox):
    """
    Register a bbox without an explicit name.

    ``str(bbox)`` is used as the name, both as the ``name2bbox`` key and as
    the payload in the two interval indexes. Invalid boxes are ignored.

    :param bbox: a ``BBox``, or a list/tuple/dict that is wrapped in one
    :return: None
    """
    if isinstance(bbox, (list, tuple, dict)):
        bbox = BBox(bbox)

    if bbox.isvalid():
        auto_name = str(bbox)
        self.name2bbox[auto_name] = bbox
        self.hor_tree.addi(bbox.left(), bbox.right(), auto_name)
        self.ver_tree.addi(bbox.bottom(), bbox.top(), auto_name)
def merge_bbox(bbox1, bbox2):
    """
    Return the smallest bbox enclosing both inputs.

    The representation is chosen by the type of ``bbox1``:

    * list/tuple ``(left, bottom, right, top)``: both arguments are indexed
      and a new list is returned;
    * ``BBox``: both arguments are read through their accessor methods and a
      new ``BBox`` is returned.

    :param bbox1: first bbox (list, tuple or ``BBox``)
    :param bbox2: second bbox, in the same representation as ``bbox1``
    :return: merged bbox (list or ``BBox``)
    :raises Exception: if ``bbox1`` is neither a list/tuple nor a ``BBox``
    """
    if isinstance(bbox1, (list, tuple)):
        return [
            min(bbox1[0], bbox2[0]),
            min(bbox1[1], bbox2[1]),
            max(bbox1[2], bbox2[2]),
            max(bbox1[3], bbox2[3]),
        ]

    if isinstance(bbox1, BBox):
        left = min(bbox1.left(), bbox2.left())
        bottom = min(bbox1.bottom(), bbox2.bottom())
        right = max(bbox1.right(), bbox2.right())
        top = max(bbox1.top(), bbox2.top())
        return BBox([left, bottom, right, top])

    raise Exception('unknown type of bbox')
def adjust_char_bbox_by_path(lines, paths):
    """
    Shrink char bboxes that overlap a path.

    For every ``LTChar`` and every path whose bbox overlaps it, the char
    bbox is split with ``v_split`` around the path's vertical center, and
    the char is given whichever part has the larger area (the second part
    on a tie). The char bbox is re-read for each path, so an earlier split
    affects later overlap tests.

    :param lines: list of lines, each a list of layout objects; the chars
        are modified in place through ``set_bbox``
    :param paths: iterable of objects with a ``bbox`` attribute
    :return: the same ``lines`` object
    """
    for line in lines:
        for char in line:
            if not isinstance(char, LTChar):
                continue
            for path in paths:
                path_bbox = BBox(path.bbox)
                char_bbox = BBox(char.bbox)
                if not char_bbox.overlap(path_bbox):
                    continue
                split_center = path_bbox.v_center()
                # Path height padded by 1, scaled by 1.1 and halved.
                split_margin = (path_bbox.height() + 1) * 1.1 / 2
                part_a, part_b = char_bbox.v_split(split_center, split_margin)
                keep = part_a if part_a.area() > part_b.area() else part_b
                char.set_bbox(keep.to_list())
    return lines
def get_UGPs(self):
    """
    Collect one UnorganizedGroupPath per maximal run of NSCSs labelled 1.

    Only the UGPs are built (no LaTeX export); the original docstring notes
    that this mirrors ``export_latex`` and is meant for evaluating the ME
    layout analysis.

    :return: list of ``UnorganizedGroupPath``, one per consecutive run of
        entries whose label in ``nscs_label_list`` equals 1
    """
    assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

    ugp_list = []
    total = len(self.id_list_list_for_nscs)
    cursor = 0
    while cursor < total:
        if self.nscs_label_list[cursor] != 1:
            cursor += 1
            continue

        # Consume the whole run of label-1 entries starting at cursor.
        me_symbol_groups = []
        while cursor < total and self.nscs_label_list[cursor] == 1:
            for cid in self.id_list_list_for_nscs[cursor]:
                latex_val = get_latex_val_of_lt_char(
                    self.chars[cid], self.get_font())
                # TODO, adjust of the tight bounding box
                bbox = BBox(self.chars[cid].bbox)
                me_symbol_groups.append(
                    MESymbolGroup(MESymbol(latex_val, bbox)))
            cursor += 1

        # TODO, the path is not presented here
        ugp_list.append(UnorganizedGroupPath(me_symbol_groups, []))
    return ugp_list
def get_overlap_by_bbox(self, the_bbox):
    """
    Return the names of registered bboxes that overlap ``the_bbox``.

    A name is reported when it is returned by both the horizontal-range
    query on ``hor_tree`` and the vertical-range query on ``ver_tree``.

    :param the_bbox: a ``BBox``, or a list/tuple that is wrapped in one
        (tuples are now accepted too, matching ``exist_overlap`` and
        ``add_bbox``; previously a tuple failed on ``.left()``)
    :return: list of names (order unspecified, as it comes from a set)
    """
    if isinstance(the_bbox, (list, tuple)):
        the_bbox = BBox(the_bbox)

    # NOTE(review): the trees are assumed to be intervaltree-style objects
    # whose slice query yields intervals carrying the name in `.data`.
    hor_names = {
        interval.data
        for interval in self.hor_tree[the_bbox.left():the_bbox.right()]
    }
    ver_names = {
        interval.data
        for interval in self.ver_tree[the_bbox.bottom():the_bbox.top()]
    }
    return list(hor_names.intersection(ver_names))
def check_pdfbox_word_segmentation_fail(char_list_list, word_info_list):
    """
    Heuristically detect a failed word segmentation.

    A word counts as failed when its bbox is wider than half of the 95th
    percentile of the line widths; the segmentation as a whole is reported
    as failed when more than half of the words are failed.

    :param char_list_list: list of lines, each a list of chars
    :param word_info_list: list of dicts with a ``'bbox'`` entry
    :return: True if the share of over-wide words exceeds the threshold
    """
    CHECK_PDFBOX_FAIL_THRES = 0.5

    line_widths = [
        get_char_list_bbox(line_chars).width() for line_chars in char_list_list
    ]
    # 95th percentile of the line widths (not the median).
    reference_width = np.percentile(line_widths, 95)
    width_limit = CHECK_PDFBOX_FAIL_THRES * reference_width

    failed_count = sum(
        1 for word_info in word_info_list
        if BBox(word_info['bbox']).width() > width_limit
    )
    return failed_count > CHECK_PDFBOX_FAIL_THRES * len(word_info_list)
def pdf_extract_lines_raw(pdf_path, pid=0):
    """
    Split the chars of one page into lines.

    Chars are consumed in extraction order. A char joins the current line
    while it vertically overlaps the line's first char; otherwise the
    current line is closed and a new one starts. Every closed line,
    including the last one, is sorted by left x-coordinate and passed
    through ``re_group_char_list_seg``. Previously the last line skipped
    the sort that all other lines received.

    :param pdf_path: path of the PDF file
    :param pid: page id
    :return: list of lines, after ``remove_empty_lines`` (which, per the
        original comment, cleans the lines with only LTAnno)
    """
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    char_list = pdf_extract_chars(pdf_path, pid)
    if not char_list:
        return []

    char_list_list = []

    def _close_line(line_chars):
        # Same normalisation for every line, the trailing one included.
        line_chars.sort(key=lambda char_arg: char_arg.bbox[0])
        char_list_list.append(
            re_group_char_list_seg(line_chars, fontname2space))

    # The first char of a line is the vertical reference for that line.
    line_anchor_bbox = BBox(char_list[0].bbox)
    pending_chars = []
    for char in char_list:
        if line_anchor_bbox.v_overlap(BBox(char.bbox)):
            pending_chars.append(char)
        else:
            _close_line(pending_chars)
            pending_chars = [char]
            line_anchor_bbox = BBox(char.bbox)

    if pending_chars:
        _close_line(pending_chars)

    # clean the lines with only LTAnno
    return remove_empty_lines(char_list_list)
def iterative_expanding_idx(ugp, vpos, direction, hrange, options='default'):
    """
    Grow a set of ME groups above or below a vertical position.

    This is a general routine used for several structures, such as
    fraction lines and the radical part analysis. Candidates are the groups
    that horizontally overlap ``hrange`` and lie entirely on the requested
    side of ``vpos``. The candidate nearest to ``vpos`` seeds a running
    bbox; remaining candidates are added repeatedly while they vertically
    overlap the running bbox, until a full pass adds nothing.

    :param ugp: UnorganizedGroupPath, only the me_groups are considered
    :param vpos: vertical position of concern
    :param direction: "up" or "down"
    :param hrange: tuple of two (left, right)
    :param options: "default" accepts every vertically overlapping
        candidate; "half_hor" additionally requires that more than half of
        the candidate's width lies inside ``hrange``
    :return: list of index for the MEGroup (empty if there is no candidate)
    :rtype: list[int]
    :raises Exception: on an unknown ``options`` value, raised only when a
        candidate vertically overlaps the running bbox
    """
    # TODO, also consider the me_path, and give exception when matched.
    grow_bbox = BBox([hrange[0], vpos, hrange[1], vpos])

    # first gather the candidates by the horizontal groups
    candidate_ids = []
    for group_id, group in enumerate(ugp.me_groups):
        if not bbox_h_overlapping(group.get_tight_bbox(), grow_bbox):
            continue
        if direction == "up" and group.get_tight_bbox().bottom() >= vpos:
            candidate_ids.append(group_id)
        if direction == "down" and group.get_tight_bbox().top() <= vpos:
            candidate_ids.append(group_id)

    # Nearest candidate first: lowest bottom when going up, highest top
    # when going down.
    if direction == "up":
        candidate_ids.sort(
            key=lambda idx: ugp.me_groups[idx].get_tight_bbox().bottom())
    if direction == "down":
        candidate_ids.sort(
            key=lambda idx: -1 * ugp.me_groups[idx].get_tight_bbox().top())

    if len(candidate_ids) == 0:
        return []

    # create the seed here
    seed_bbox = ugp.me_groups[candidate_ids[0]].get_tight_bbox()
    grow_bbox.set_bottom(seed_bbox.bottom())
    grow_bbox.set_top(seed_bbox.top())
    confirmed_ids = [candidate_ids[0]]

    while True:
        added_this_pass = False
        for cand_id in candidate_ids:
            if cand_id in confirmed_ids:
                continue
            cand_group = ugp.me_groups[cand_id]

            # Import deferred to this point, as in the original.
            from pdfxml.me_layout.char_adjust_bbox_core import could_estimate_height
            if isinstance(cand_group, MESymbolGroup) and \
                    could_estimate_height(cand_group):
                compare_bbox = cand_group.get_adjusted_bbox()
            else:
                compare_bbox = cand_group.get_tight_bbox()

            # The vertical overlapping is the first condition that must be
            # satisfied in all cases.
            if not bbox_v_overlapping(grow_bbox, compare_bbox):
                continue

            if options == 'default':
                accepted = True
            elif options == 'half_hor':
                # in the horizontal direction, at least overlapping half of
                # the horizontal range
                tight_bbox = cand_group.get_tight_bbox()
                max_left = max(tight_bbox.left(), hrange[0])
                min_right = min(tight_bbox.right(), hrange[1])
                accepted = (
                    min_right >= max_left and
                    min_right - max_left >
                    (tight_bbox.right() - tight_bbox.left()) / 2
                )
            else:
                raise Exception("unknown options: {}".format(options))

            if accepted:
                confirmed_ids.append(cand_id)
                added_this_pass = True
                # The running bbox grows immediately, so later candidates
                # in the same pass are tested against the enlarged box.
                grow_bbox = merge_bbox(grow_bbox, cand_group.get_tight_bbox())

        if not added_this_pass:
            break
    return confirmed_ids
def get_median_char_height_by_char_list(char_list):
    """
    Return the median bbox height of the ``LTChar`` elements of a list.

    :param char_list: iterable of layout objects; non-``LTChar`` elements
        are ignored
    :return: ``np.median`` of the collected heights
    """
    heights = []
    for char in char_list:
        if isinstance(char, LTChar):
            heights.append(BBox(char.bbox).height())
    return np.median(heights)
def merge_line_basic(char_list_list, fontname2space):
    """
    Merge raw lines that vertically overlap (e.g. an accent with its line).

    Known issue (from the original author): duplicate characters can be
    created after an accent is merged with the corresponding line.

    First, each line gets a bbox (accents excluded to avoid over-merging)
    whose height is reduced by 1/6 at the top and at the bottom. The lines
    are then visited from the highest top downwards; a line is merged into
    the current group while its reduced bbox vertically overlaps the
    group's running bbox. Every finished group, including the last one, is
    filtered to ``LTChar`` elements, sorted by left x-coordinate, and
    re-segmented with ``re_group_char_list_seg``. Previously the last group
    skipped the filter and the sort.

    :param char_list_list: list of lines, each a list of layout objects
    :param fontname2space: passed through to ``re_group_char_list_seg``
    :return: list of merged lines; the input object itself when it is empty
    """
    if len(char_list_list) == 0:
        return char_list_list

    line_bbox_list = []
    for char_list in char_list_list:
        # remove the accent is to avoid over merging.
        full_bbox = get_char_list_bbox(char_list, remove_accent=True)
        # adjust to reduce the height by 1/3 (1/6 on each side)
        margin = full_bbox.height() / 6
        line_bbox_list.append(BBox([
            full_bbox.left(),
            full_bbox.bottom() + margin,
            full_bbox.right(),
            full_bbox.top() - margin,
        ]))

    # sorted() instead of range(...).sort(): same stable order, and it
    # does not rely on range() returning a list.
    sorted_line_ids = sorted(
        range(len(char_list_list)),
        key=lambda lid: -line_bbox_list[lid].top())

    return_char_list_list = []

    def _close_group(group_chars):
        # Same normalisation for every merged line, the last one included.
        only_chars = [c for c in group_chars if isinstance(c, LTChar)]
        only_chars.sort(key=lambda c: c.bbox[0])
        return_char_list_list.append(
            re_group_char_list_seg(only_chars, fontname2space))

    # the accent merging here.
    group_chars = []
    group_bbox = line_bbox_list[sorted_line_ids[0]]
    for sort_lid in sorted_line_ids:
        if group_bbox.v_overlap(line_bbox_list[sort_lid]):
            group_chars.extend(char_list_list[sort_lid])
            group_bbox = merge_bbox_list(
                [group_bbox, line_bbox_list[sort_lid]])
        else:
            _close_group(group_chars)
            # create a new line to merge
            group_chars = list(char_list_list[sort_lid])
            group_bbox = line_bbox_list[sort_lid]

    if len(group_chars) > 0:
        _close_group(group_chars)
    return return_char_list_list
def export_latex(self):
    """
    Export the sentence as a LaTeX-flavoured string.

    Plain NSCSs are emitted as their text followed by a space. Each maximal
    run of NSCSs labelled 1 is parsed as one math expression and emitted as
    ``$...$``; if anything in the parsing raises, the raw text of the run
    is emitted between the dollar signs instead.

    NOTE(review): relies on the Python 2 ``unicode`` builtin.

    :return: the assembled string, stripped of surrounding whitespace
    """
    print("Start exporting LaTeX")
    assert len(self.id_list_list_for_nscs) == len(self.nscs_label_list)

    def _nscs_text(cid_list):
        # Text of one NSCS, encoded to UTF-8 when it is a unicode object.
        text = "".join([self.text_list[cid] for cid in cid_list])
        if isinstance(text, unicode):
            text = text.encode("utf-8")
        return text

    res = ""
    total = len(self.id_list_list_for_nscs)
    cursor = 0
    while cursor < total:
        if self.nscs_label_list[cursor] != 1:
            res += _nscs_text(self.id_list_list_for_nscs[cursor]) + " "
            cursor += 1
            continue

        # Consume the whole run of label-1 entries starting at cursor.
        me_str = ""
        me_symbol_groups = []  # prepare the me_symbol group for parsing
        while cursor < total and self.nscs_label_list[cursor] == 1:
            cid_list = self.id_list_list_for_nscs[cursor]
            run_text = _nscs_text(cid_list)
            # convert from char to latex value.
            for cid in cid_list:
                latex_val = get_latex_val_of_lt_char(
                    self.chars[cid], self.get_font())
                # TODO, adjust of the tight bounding box
                bbox = BBox(self.chars[cid].bbox)
                me_symbol_groups.append(
                    MESymbolGroup(MESymbol(latex_val, bbox)))
            me_str += run_text
            cursor += 1

        # TODO, NOTE, remove the try catch to get all the parsing here
        try:
            print("HOWDY!!!")
            # TODO, the path is not presented here
            ugp = UnorganizedGroupPath(me_symbol_groups, [])
            latex_str = ugp2hgroup(ugp).to_latex()
            res += "${}$ ".format(latex_str)
        except Exception:
            # Best-effort fallback: emit the raw text of the run.
            print("OH NO!!!")
            res += "${}$ ".format(me_str)

    return res.strip()
def set_adjusted_bbox(self, bbox):
    """
    Store the adjusted bounding box.

    :param bbox: a ``BBox``, or a tuple/list that is wrapped in one
    :return: None
    """
    adjusted = BBox(bbox) if isinstance(bbox, (tuple, list)) else bbox
    assert isinstance(adjusted, BBox)
    self.adjusted_bbox = adjusted
def get_adj_bbox(char):
    """
    Return the bbox of a char as a ``BBox``.

    Historically this narrowed the box of italic chars (fonts matching
    "italic", width reduced by ``height * 36.0 / 150``, centered on the
    horizontal center). The original docstring calls that adjustment
    inaccurate, and it was already disabled by an unconditional early
    return; the unreachable code has been removed and behavior is
    unchanged.

    :param char: the char to read the bbox from
    :type char: LTChar
    :return: ``BBox(char.bbox)``
    """
    assert isinstance(char, LTChar)
    return BBox(char.bbox)