def word_info_filter(char_list_list, word_info_list):
    """Drop pdfbox words whose bbox spans more than one text line.

    Pass in all lines of the page.

    :param char_list_list: list of lines, each a list of chars
    :param word_info_list: list of word dicts with a 'bbox' key
    :return: the word dicts whose bbox overlaps at most one line
    """
    # Index every line bbox in a 2D interval tree so each word needs only
    # one overlap query instead of a scan over all lines.
    from pdfxml.intervaltree_2d import IntervalTree2D
    line_interval_tree_2d = IntervalTree2D()
    for i, char_list_line in enumerate(char_list_list):
        line_bbox = get_char_list_bbox(char_list_line)
        line_interval_tree_2d.add_bbox(i, line_bbox)

    filtered_word_info_list = []
    for word_info in word_info_list:
        line_name_list = line_interval_tree_2d.get_overlap_by_bbox(
            word_info['bbox'])
        if len(line_name_list) > 1:
            # A word crossing several lines is almost surely a pdfbox
            # segmentation artifact; discard it.
            print("one word overlap with multiple lines {}".format(word_info))
            continue
        filtered_word_info_list.append(word_info)
    return filtered_word_info_list
def check_pdfbox_word_segmentation_fail(char_list_list, word_info_list):
    """Heuristically detect whether pdfbox word segmentation failed.

    A word wider than half of a typical (95th-percentile) line width is
    counted as a failure; the page is reported as failed when more than
    half of the words fail.

    :param char_list_list: list of lines, each a list of chars
    :param word_info_list: list of word dicts with a 'bbox' key
    :return: True when the fraction of over-wide words exceeds the threshold
    """
    CHECK_PDFBOX_FAIL_THRES = 0.5

    # 95th percentile of line widths, used as the "typical full line"
    # width (the original code misleadingly called this a median).
    line_width_list = []
    for char_list_line in char_list_list:
        line_width_list.append(get_char_list_bbox(char_list_line).width())
    line_width_p95 = np.percentile(line_width_list, 95)

    failed_count = 0
    for word_info in word_info_list:
        if BBox(word_info['bbox']).width() > CHECK_PDFBOX_FAIL_THRES * line_width_p95:
            failed_count += 1
    return failed_count > CHECK_PDFBOX_FAIL_THRES * len(word_info_list)
def merging_merge_one_line(char_list_line, word_info_list):
    """Merge the words in one line according to their overlap with the
    word boxes extracted from pdfbox.

    :param char_list_line: chars of a single line
    :param word_info_list: pdfbox word dicts with a 'bbox' key
    :return: the re-grouped char list for the line
    """
    # Emit debugging information about this line.
    pdf_util_debug_log.debug(char_list2str(char_list_line))

    # Keep only the pdfbox word boxes that touch this line.
    line_bbox = get_char_list_bbox(char_list_line)
    pdfbox_word_bbox_list = [w['bbox'] for w in word_info_list
                             if line_bbox.overlap(w['bbox'])]

    # Split the line into our own candidate words and compute their boxes.
    char_word_list = get_char_list_list(char_list_line)
    char_word_bbox_list = []
    for wid, char_word in enumerate(char_word_list):
        char_word_bbox_list.append(get_char_list_bbox(char_word))
        pdf_util_debug_log.debug("{} {}".format(wid, char_list2str(char_word)))

    # Two candidate words should be merged when they overlap the same
    # pdfbox word; connected components are found with union-find.
    uf = UnionFind()
    for cid in range(len(char_word_bbox_list)):
        uf.add_node(cid)
    for pdfbox_word_bbox in pdfbox_word_bbox_list:
        overlap_cids = [cid for cid, bbox in enumerate(char_word_bbox_list)
                        if bbox.overlap(pdfbox_word_bbox)]
        for cid in overlap_cids:
            uf.merge(overlap_cids[0], cid)

    # Concatenate the chars of each component and remember its bbox.
    new_char_word_list = []
    new_char_word_bbox_list = []
    for group in uf.get_groups():
        merged_chars = []
        for cid in group:
            merged_chars.extend(char_word_list[cid])
        new_char_word_list.append(merged_chars)
        new_char_word_bbox_list.append(get_char_list_bbox(merged_chars))

    # Order the merged words by their left boundary.
    order = sorted(range(len(new_char_word_list)),
                   key=lambda idx: new_char_word_bbox_list[idx].left())
    sorted_new_char_word_list = [new_char_word_list[idx] for idx in order]

    # TODO, split the lines with too long word, very likely to be wrong
    max_len = get_longest_length(sorted_new_char_word_list)
    if max_len > WORD_LENGTH_95_QUARTILE:
        new_char_list_line = max_word_split(char_list_line)
    else:
        new_char_list_line = char_list_list2char_list(sorted_new_char_word_list)
    return new_char_list_line
def _export_formula_node(page_n, char_list, tag, font):
    """Append one <tag> element for char_list under page_n, with one
    <Char> child per LTChar (non-LTChar entries are skipped)."""
    bbox = get_char_list_bbox(char_list)
    i_n = ET.SubElement(page_n, tag, {
        'BBox': icst_bbox2str(bbox),
        'readable_bbox': readable_bbox2str(bbox)
    })
    for char in char_list:
        if not isinstance(char, LTChar):
            continue
        # Export the hex/latex value for consistency with the other
        # system; strip characters illegal in XML.
        clean_text = get_latex_val_of_lt_char(char, font)
        clean_text = invalid_xml_remove(clean_text)
        ET.SubElement(i_n, 'Char', {
            'BBox': icst_bbox2str(char.bbox),
            'readable_bbox': readable_bbox2str(char.bbox),
            'FSize': str(char.size),
            'Text': clean_text
        })


def export_xml(page_info, out_path, pdf_path=None, pid=None):
    """Export a page's isolated and embedded formulas as XML.

    TODO, also export the value human could understand, rather than the
    hex value; hex value is only for consistency with the other system.

    :param page_info: dict with 'pid', 'ilist' (isolated formula lines)
        and 'elist' (embedded formula char lists)
    :param out_path: file to write the XML to; printed to stdout if falsy
    :param pdf_path: source pdf, used to resolve the page font
    :param pid: page id within the pdf
    """
    page_n = ET.Element('Page', {'PageNum': str(page_info['pid'])})
    font = get_font_from_pdf(pdf_path, pid)
    # The isolated and embedded parts only differ in their tag name.
    for ime_line in page_info['ilist']:
        _export_formula_node(page_n, ime_line, 'IsolatedFormula', font)
    for eme in page_info['elist']:
        _export_formula_node(page_n, eme, 'EmbeddedFormula', font)
    try:
        res = ET.tostring(page_n, encoding='utf-8')
        if out_path:
            with open(out_path, 'w') as f:
                f.write(res + "\n")
        else:
            print(res)
    except Exception as e:
        # Best-effort export: report the failure instead of crashing.
        print(e)
def pdf_extract_lines(pdf_path, pid=0, force_single=False):
    """Extract merged text lines from one pdf page.

    Each returned line is a list of LTChar.

    :param pdf_path: path to the pdf
    :param pid: page id
    :param force_single: treat the page as single-column even when it
        looks double-column
    :return: list of lines (each a list of LTChar)
    """
    # Serve from the per-page cache when it exists.
    tmp_pdf_path = get_tmp_path(pdf_path)
    cache_path = "{}.pdfbox_merge_line.{}.pkl".format(tmp_pdf_path, pid)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)
    # TODO, do another round of line merging
    # still use our column line detection model to find the region.
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    word_info_list = pdf_extract_words(pdf_path, pid)

    if not force_single and is_double_column(pdf_path, pid):
        # Partition the lines into three groups: strictly inside the left
        # half, strictly inside the right half, and everything else
        # (lines crossing the center or outside the double-column region).
        page_size = get_pdf_page_size(pdf_path, pid)
        page_width = page_size['width']
        out_char_list_list = []
        left_char_list_list = []
        right_char_list_list = []
        from pdfxml.pdf_util.layout_util import get_char_list_bbox
        for char_list in char_list_list:
            bbox = get_char_list_bbox(char_list)
            if bbox.left() < bbox.right() < page_width / 2:
                left_char_list_list.append(char_list)
            elif bbox.right() > bbox.left() > page_width / 2:
                right_char_list_list.append(char_list)
            else:
                out_char_list_list.append(char_list)

        # Before merging, drop the words spanning multiple lines.
        word_info_list = word_info_filter(char_list_list, word_info_list)

        # Merge each group independently, then concatenate the results.
        res_char_list_list = []
        for group in (out_char_list_list,
                      left_char_list_list,
                      right_char_list_list):
            res_char_list_list.extend(
                merging_lines(group, fontname2space, word_info_list,
                              pdf_path, pid))
    else:
        # Single column: filter the words, then merge all lines at once.
        word_info_list = word_info_filter(char_list_list, word_info_list)
        res_char_list_list = merging_lines(
            char_list_list, fontname2space, word_info_list, pdf_path, pid)

    dump_serialization(res_char_list_list, cache_path)
    return res_char_list_list
def merge_line_basic(char_list_list, fontname2space):
    """Greedily merge lines that vertically overlap.

    Known bug (kept for behavior compatibility): duplicate characters can
    appear after an accent is merged with its corresponding line.

    First the lines are sorted by their top position, then consecutive
    lines whose height-reduced bboxes vertically overlap are merged.

    :param char_list_list: list of lines, each a list of chars
    :param fontname2space: font name to space-width mapping, forwarded to
        re_group_char_list_seg
    :return: the merged list of lines
    """
    if len(char_list_list) == 0:
        return char_list_list

    line_bbox_list = []
    for char_list in char_list_list:
        # Removing the accent avoids over-merging.
        line_bbox = get_char_list_bbox(char_list, remove_accent=True)
        # Shrink the height by 1/3 (1/6 on each side) so only substantial
        # overlaps trigger a merge.
        line_bbox = BBox([
            line_bbox.left(),
            line_bbox.bottom() + line_bbox.height() / 6,
            line_bbox.right(),
            line_bbox.top() - line_bbox.height() / 6
        ])
        line_bbox_list.append(line_bbox)

    # Process lines top-down.
    cur_line_idx_list = sorted(
        range(len(char_list_list)),
        key=lambda lid: -line_bbox_list[lid].top())

    tmp_char_list = []
    tmp_bbox = line_bbox_list[cur_line_idx_list[0]]

    # Debug information about the merging: the text of each line.
    line_str = []
    try:
        for sort_lid in cur_line_idx_list:
            tmp_str = ""
            for char in char_list_list[sort_lid]:
                tmp_str += char.get_text()
            line_str.append(tmp_str)
    except Exception:
        # Debug info is best-effort; never fail the merge because of it.
        print('create debug information error')

    return_char_list_list = []
    for i, sort_lid in enumerate(cur_line_idx_list):
        # Merge while the running bbox vertically overlaps the next line
        # (accents/hats are not part of the calculation).
        if tmp_bbox.v_overlap(line_bbox_list[sort_lid]):
            tmp_char_list.extend(char_list_list[sort_lid])
            tmp_bbox = merge_bbox_list([tmp_bbox, line_bbox_list[sort_lid]])
        else:
            # Close the current merged line: keep only real chars, sort
            # them left-to-right and re-group into words.
            tmp_char_list = [c for c in tmp_char_list if isinstance(c, LTChar)]
            tmp_char_list.sort(key=lambda c: c.bbox[0])
            return_char_list_list.append(
                re_group_char_list_seg(tmp_char_list, fontname2space))
            # Start a new merged line from the current one.
            tmp_char_list = []
            tmp_char_list.extend(char_list_list[sort_lid])
            tmp_bbox = line_bbox_list[sort_lid]
    if len(tmp_char_list) > 0:
        return_char_list_list.append(
            re_group_char_list_seg(tmp_char_list, fontname2space))
    return return_char_list_list
def merge_accent(char_list_list, fontname2space):
    """Attach accent-only lines to the text line they decorate.

    Pass 1 pairs each accent-only line with a line that horizontally
    overlaps it; pass 2 materialises the merged lines; finally the lines
    are re-sorted top-down and re-grouped into words.

    :param char_list_list: list of lines, each a list of chars
    :param fontname2space: font name to space-width mapping, forwarded to
        re_group_char_list_seg
    :return: lines with accent lines merged into their targets
    """
    line_bbox_list = []
    for char_list in char_list_list:
        # not removing the accent: the full bbox is needed to find the
        # horizontal overlap between accent and target
        line_bbox = get_char_list_bbox(char_list)
        line_bbox_list.append(line_bbox)
        #print char_list2str(char_list) # for debugging only
    # process the lines top-down
    cur_line_idx_list = range(len(char_list_list))
    cur_line_idx_list.sort(key=lambda lid: -line_bbox_list[lid].top())

    # for each accent line, pair it with the first line under it that
    # h-overlaps with it
    used_line_id_list = []
    # the target line might occur before the accent in top-down order
    accent_idx2target_idx = {}
    for i, line_idx in enumerate(cur_line_idx_list):
        if line_idx in used_line_id_list:
            continue
        if only_accent(char_list_list[cur_line_idx_list[i]]):
            cur_bbox = line_bbox_list[line_idx]
            found_next_line = False
            for j in range(0, len(cur_line_idx_list)):
                # check all lines
                cand_bbox = line_bbox_list[cur_line_idx_list[j]]
                #if cur_bbox.h_overlap(cand_bbox) and cand_bbox.top() < cur_bbox.bottom():
                if cur_bbox.h_overlap(cand_bbox):
                    # cond1: candidate lies below the accent line
                    cond1 = cand_bbox.top() < cur_bbox.top()
                    # cond2: candidate vertically contains the accent,
                    # but is not the accent line itself
                    cond2 = cand_bbox.bottom() <= cur_bbox.bottom() <= \
                        cur_bbox.top() <= cand_bbox.top() and i != j
                    if cond1 or cond2:
                        accent_idx2target_idx[line_idx] = cur_line_idx_list[j]
                        used_line_id_list.append(line_idx)
                        used_line_id_list.append(cur_line_idx_list[j])
                        #tmp_char_list = char_list_list[line_idx]
                        #tmp_char_list.extend(char_list_list[cur_line_idx_list[j]])
                        #return_char_list_list.append(tmp_char_list)
                        found_next_line = True
                        break
            if not found_next_line:
                pass
        else:
            #return_char_list_list.append(char_list_list[line_idx])
            pass

    # pass 2: emit each accent/target pair once; all other lines as-is
    return_char_list_list = []
    used_line_id_list = []
    target_idx2accent_idx = {v: k for k, v in accent_idx2target_idx.items()}
    for i, line_idx in enumerate(cur_line_idx_list):
        if line_idx in used_line_id_list:
            continue
        if line_idx in accent_idx2target_idx:
            # this line is the accent: accent chars first, then target
            tmp_char_list = []
            tmp_char_list.extend(char_list_list[line_idx])
            tmp_char_list.extend(
                char_list_list[accent_idx2target_idx[line_idx]])
            return_char_list_list.append(tmp_char_list)
            used_line_id_list.append(line_idx)
            used_line_id_list.append(accent_idx2target_idx[line_idx])
        elif line_idx in target_idx2accent_idx:
            # this line is the target: target chars first, then accent
            tmp_char_list = []
            tmp_char_list.extend(char_list_list[line_idx])
            tmp_char_list.extend(
                char_list_list[target_idx2accent_idx[line_idx]])
            return_char_list_list.append(tmp_char_list)
            used_line_id_list.append(line_idx)
            used_line_id_list.append(target_idx2accent_idx[line_idx])
        else:
            return_char_list_list.append(char_list_list[line_idx])
    for char_list in return_char_list_list:
        #print char_list2str(char_list)
        pass

    # sort the return_char_list_list based on the top position
    line_bbox_list = []
    for char_list in return_char_list_list:
        # not removing the accent
        line_bbox = get_char_list_bbox(char_list)
        line_bbox_list.append(line_bbox)
    cur_line_idx_list = range(len(return_char_list_list))
    cur_line_idx_list.sort(key=lambda lid: -line_bbox_list[lid].top())
    # re-order
    new_return_char_list_list = []
    for line_idx in cur_line_idx_list:
        new_return_char_list_list.append(return_char_list_list[line_idx])
    return_char_list_list = new_return_char_list_list

    # re-group the chars of every merged line into words
    res_char_list_list = []
    for char_list in return_char_list_list:
        res_char_list_list.append(
            re_group_char_list_seg(char_list, fontname2space))
    return res_char_list_list
def is_double_column(pdf_path, pid, debug=False):
    """Decide whether a pdf page uses a double-column layout.

    The idea: if there are two clusters of line begin positions, the page
    is double-column. Concretely, collect the left/right boundaries of
    sufficiently long lines; if the main text block sits entirely in one
    half of the page, or most long lines fit strictly inside one half
    around the block center, report double-column.

    :param pdf_path: path to the pdf
    :param pid: page id
    :param debug: print diagnostic information when True
    :return: True for a double-column page
    """
    from pdfxml.pdf_util.pdfbox_line_merging import pdf_extract_lines_raw
    from pdfxml.pdf_util.pdfbox_wrapper import get_pdf_page_size
    char_list_list = pdf_extract_lines_raw(pdf_path, pid)
    page_size = get_pdf_page_size(pdf_path, pid)
    page_width = page_size['width']

    # Collect start/end positions of long lines; quantiles make the
    # boundary estimate robust against a few outlier lines.
    MIN_LINE_CHARS = 30  # skip short lines (titles, page numbers, ...)
    quantile = 0.90
    start_pos_list = []
    end_pos_list = []
    for char_list in char_list_list:
        if len(char_list) < MIN_LINE_CHARS:
            continue
        bbox = get_char_list_bbox(char_list)
        start_pos_list.append(bbox.left())
        end_pos_list.append(bbox.right())
    if len(start_pos_list) == 0 or len(end_pos_list) == 0:
        # Empty page: no line long enough to judge the layout.
        return False

    start_pos = np.percentile(start_pos_list, int((1 - quantile) * 100))
    end_pos = np.percentile(end_pos_list, int(quantile * 100))
    if debug:
        print("The main column boundary {} {}".format(start_pos, end_pos))

    if end_pos < page_width / 2 or start_pos > page_width / 2:
        # Only one half of the page has enough long lines.
        return True

    # Count the long lines that fit strictly inside one of the two halves
    # around the center of the main text block.
    center_pos = (start_pos + end_pos) / 2
    good_line_count = 0
    total_count = 0.0  # float so the ratio below divides exactly
    for char_list in char_list_list:
        if len(char_list) < MIN_LINE_CHARS:
            continue
        bbox = get_char_list_bbox(char_list)
        if bbox.left() < bbox.right() < center_pos or \
                bbox.right() > bbox.left() > center_pos:
            good_line_count += 1
            if debug:
                print("Good Line {} {}".format(char_list2str(char_list), bbox))
        total_count += 1

    # total_count >= 1 here: the early-return above fires when no long
    # line exists, so the division is safe.
    threshold = 0.6
    return float(good_line_count) / total_count > threshold
def merge_line_ime(char_list_list):
    """Merge the lines above/below big operators into the operator line.

    Though it's called IME processing, it only merges based on the
    bind-var operator, no matter whether it's IME or EME; a better name
    might be "merge big op".

    :param char_list_list: list of lines, each a list of chars
    :return: lines with super/sub lines of big operators merged in
    """
    # Bounding box per line, then visit the lines top-down.
    line_bbox_list = [get_char_list_bbox(cl) for cl in char_list_list]
    top_down_idx_list = sorted(range(len(char_list_list)),
                               key=lambda lid: -line_bbox_list[lid].top())

    absorbed_idx_list = list()  # lines merged into a neighbor
    line_idx2group = {}         # line -> the lines merged into it

    for pos, line_idx in enumerate(top_down_idx_list):
        # Horizontal span covered by the big operators of this line,
        # widened by one operator width on each side.
        left_bound, right_bound = 1000000, -1
        for char in char_list_list[line_idx]:
            if not isinstance(char, LTChar):
                continue
            if get_latex_val_of_lt_char(char) in ['\\sum', '\\prod']:
                left_bound = min(left_bound,
                                 char.bbox[0] - get_width(char.bbox))
                right_bound = max(right_bound,
                                  char.bbox[2] + get_width(char.bbox))

        line_idx2group[line_idx] = [line_idx]
        if left_bound > right_bound:
            # No big operator on this line.
            continue

        # Absorb the neighbor above when it lies within the span.
        if pos != 0:
            above_idx = top_down_idx_list[pos - 1]
            above_bbox = line_bbox_list[above_idx]
            if above_bbox.left() > left_bound and above_bbox.right() < right_bound:
                line_idx2group[line_idx].append(above_idx)
                absorbed_idx_list.append(above_idx)
        # Absorb the neighbor below when it lies within the span.
        if pos != len(top_down_idx_list) - 1:
            below_idx = top_down_idx_list[pos + 1]
            below_bbox = line_bbox_list[below_idx]
            if below_bbox.left() > left_bound and below_bbox.right() < right_bound:
                line_idx2group[line_idx].append(below_idx)
                absorbed_idx_list.append(below_idx)

    # Emit every non-absorbed line together with the lines it absorbed.
    res_char_list_list = []
    for line_idx in top_down_idx_list:
        if line_idx in absorbed_idx_list:
            continue
        merged_chars = []
        for member_idx in line_idx2group[line_idx]:
            merged_chars.extend(char_list_list[member_idx])
        res_char_list_list.append(merged_chars)
    return res_char_list_list