def check_word_subscript_exist(word):
    """
    Tell whether any two neighbouring characters of ``word`` fall inside the
    module's subscript feature ranges.

    A pair is only examined when both characters are a digit or a letter and
    the first character's bbox has a non-zero height. Every matching pair is
    reported through ``me_extraction_logger`` at debug level; the scan does
    not stop at the first match.

    :param word: indexable, sized sequence of character objects exposing
        ``get_text()`` and a ``bbox`` attribute
    :return: True if at least one adjacent pair matched, False otherwise
    """
    has_subscript = False
    for pos in range(1, len(word)):
        left, right = word[pos - 1], word[pos]
        left_text = left.get_text()
        right_text = right.get_text()

        # Restrict the check to letter/digit pairs to avoid acting on
        # inaccurate boxes of other glyphs.
        if not left_text.isdigit() and not left_text.isalpha():
            continue
        if not right_text.isdigit() and not right_text.isalpha():
            continue

        from pdfxml.pdf_util.layout_util import get_height
        # The features below divide by the height of the first bbox.
        if get_height(left.bbox) == 0:
            continue

        height_ratio = height_ratio_AB(left.bbox, right.bbox)
        ycenter_shift = normalized_ycenter_diff_AB(left.bbox, right.bbox)

        ratio_in_range = hr_ratio_range[0] <= height_ratio <= hr_ratio_range[1]
        if ratio_in_range and nvcd_range[0] <= ycenter_shift <= nvcd_range[1]:
            has_subscript = True
            message = "{} Bbox1:{}, {} Bbox2:{}".format(
                left.get_text().encode('utf-8'),
                left.bbox,
                right.get_text().encode('utf-8'),
                right.bbox)
            me_extraction_logger.debug(message)
    return has_subscript
def adjust_bbox_h_latex_vertically(bbox, latex):
    """
    Stretch a bounding box along its vertical coordinates using the
    adjustment ratios looked up for ``latex``.

    :param bbox: a ``BBox`` instance or any iterable of coordinates; items 1
        and 3 are the ones that get shifted
    :param latex: key handed to ``get_upper_lower_ratio`` to pick the ratios
    :return: a new list in which item 1 is decreased by
        ``height * lower_ratio`` and item 3 is increased by
        ``height * upper_ratio``; the input bbox is not modified here
    """
    # Four tables come back; only the two vertical ones are used here.
    ver_latex2ur, ver_latex2lr, _hor_latex2ur, _hor_latex2lr = \
        get_latex2adjustment_ratio()
    upper_ratio, lower_ratio = get_upper_lower_ratio(
        latex, ver_latex2ur, ver_latex2lr)

    duplicate = copy.copy(bbox)
    if isinstance(duplicate, BBox):
        coords = duplicate.to_list()
    else:
        coords = list(duplicate)

    box_height = get_height(bbox)
    shift_low = box_height * lower_ratio
    coords[1] = coords[1] - shift_low
    shift_high = box_height * upper_ratio
    coords[3] = coords[3] + shift_high
    return coords
def adjust_bbox_h_gt(bbox, gt, debug=False):
    """
    Stretch a bounding box along its vertical coordinates using the
    adjustment ratios registered for a glyph type (width stable character).

    :param bbox: iterable of coordinates; items 1 and 3 are the ones shifted
    :param gt: String of glyph type of xyz, used as key into the ratio tables
        returned by ``get_gt2adjust``
    :param debug: when true, print the glyph type and both ratios to stdout
    :return: a new list in which item 1 is decreased by
        ``height * lower_ratio`` and item 3 is increased by
        ``height * upper_ratio``
    :raises KeyError: if ``gt`` is missing from a dict-like ratio table
        (presumably both tables are dicts -- confirm against get_gt2adjust)
    """
    gt2ur, gt2lr = get_gt2adjust()
    upper_ratio = gt2ur[gt]
    lower_ratio = gt2lr[gt]
    if debug:
        # A single pre-formatted argument keeps the output identical to the
        # old ``print a, b, c`` statement while also parsing on Python 3.
        print("%s %s %s" % (gt, upper_ratio, lower_ratio))

    new_bbox = list(copy.copy(bbox))
    height = get_height(bbox)
    new_bbox[1] = new_bbox[1] - height * lower_ratio
    new_bbox[3] = new_bbox[3] + height * upper_ratio
    return new_bbox
def normalized_At_Bb_diff_by_height_AB(b1, b2):
    """
    Difference ``b2[1] - b1[3]`` normalized by the height of the first bbox.

    :param b1: first bbox (A); its height is the normalizer, no zero guard
    :param b2: second bbox (B)
    :return: ``(b2[1] - b1[3]) / get_height(b1)``
    """
    return (b2[1] - b1[3]) / get_height(b1)
def normalized_Ab_Bt_diff_by_height_AB(b1, b2):
    """
    Difference ``b2[3] - b1[1]`` normalized by the height of the first bbox.

    :param b1: first bbox (A); its height is the normalizer, no zero guard
    :param b2: second bbox (B)
    :return: ``(b2[3] - b1[1]) / get_height(b1)``
    """
    return (b2[3] - b1[1]) / get_height(b1)
def normalized_Ar_Br_diff_by_height_AB(b1, b2):
    """
    Difference ``b2[2] - b1[2]`` normalized by the height of the first bbox.

    :param b1: first bbox (A); its height is the normalizer, no zero guard
    :param b2: second bbox (B)
    :return: ``(b2[2] - b1[2]) / get_height(b1)``
    """
    return (b2[2] - b1[2]) / get_height(b1)
def normalized_Al_Bl_diff_by_height_AB(b1, b2):
    """
    Difference ``b2[0] - b1[0]`` normalized by the height of the first bbox.

    :param b1: first bbox (A); its height is the normalizer, no zero guard
    :param b2: second bbox (B)
    :return: ``(b2[0] - b1[0]) / get_height(b1)``
    """
    return (b2[0] - b1[0]) / get_height(b1)
def normalized_xcenter_diff_by_height_AB(b1, b2):
    """
    Horizontal center difference normalized by the height of the first bbox.

    Note the sign: the difference is taken as b1 minus b2.

    :param b1: first bbox (A); its height is the normalizer, no zero guard
    :param b2: second bbox (B)
    :return: ``(x_center(b1) - x_center(b2)) / get_height(b1)``
    """
    center_a = get_x_center(b1)
    center_b = get_x_center(b2)
    return (center_a - center_b) / get_height(b1)
def normalized_ycenter_diff_AB(b1, b2):
    """
    Vertical center difference (b2 minus b1) normalized by the height of the
    first bbox.

    :param b1: first bbox (A); its height is the normalizer, no zero guard
    :param b2: second bbox (B)
    :return: ``(y_center(b2) - y_center(b1)) / get_height(b1)``
    """
    center_b = get_y_center(b2)
    center_a = get_y_center(b1)
    offset = center_b - center_a
    return offset / get_height(b1)
def height_ratio_AB(b1, b2):
    """
    Height of the second bbox relative to the height of the first one.

    :param b1: first bbox (A); its height is the denominator, no zero guard
    :param b2: second bbox (B); its height is the numerator
    :return: ``get_height(b2) / get_height(b1)``
    """
    numerator = get_height(b2)
    denominator = get_height(b1)
    return numerator / denominator
def normalized_ycenter_diff_by_merge_AB(b1, b2):
    """
    Symmetric vertical difference: absolute distance between the two vertical
    centers, normalized by the height of the bbox returned by ``merge_bbox``.

    :param b1: first bbox
    :param b2: second bbox
    :return: ``abs(y_center(b1) - y_center(b2)) / get_height(merge_bbox(b1, b2))``
    """
    merged = merge_bbox(b1, b2)
    center_gap = abs(get_y_center(b1) - get_y_center(b2))
    return center_gap / get_height(merged)