def is_new_matching(self, nscs, pid):
    """
    "new" here means the sequence was not recognized earlier.
    The matching only concerns the char values and
    should not overlap with an existing ME.

    :param nscs: list of LTChar sorted by the left bound
    :return:
    """
    # skip the punctuation
    s = char_list2str(nscs)
    ignore_single_char_list = ['(', ')', '[', ']', '{', '}', 'a', 'A']
    if s.strip() in latex_punct_list or s in ignore_single_char_list:
        # 'a' and 'A' are the only single-letter sequences that could be matched out
        return False
    nscs_bbox = char_list2bbox(nscs)
    if self.pid2it2d[pid].exist_overlap(nscs_bbox):
        return False
    print("It's very inefficient now")
    for me_nscs in self.pid2me_char_list_list[pid]:
        if is_subsequence(nscs, me_nscs, char_same):
            return True
    return False
def group_line_into_words(line, check_nltk=False):
    """
    Group the chars of a line into words, splitting on LTAnno separators.

    :param line: list of LTChar or LTAnno
    :return: list of list of LTChar
    """
    word_list = []
    word_char_list = []
    for c in line:
        if isinstance(c, LTAnno):
            if len(word_char_list) > 0:
                tmp_str = char_list2str(word_char_list)
                if check_nltk:
                    if tmp_str in word_set:
                        word_list.append(word_char_list)
                    else:
                        # ignore the current one
                        word_list.append([])
                else:
                    word_list.append(word_char_list)
                word_char_list = []
        elif isinstance(c, LTChar):
            word_char_list.append(c)
        else:
            raise Exception('unknown char type')
    if len(word_char_list) > 0:
        word_list.append(word_char_list)
    # each word is a list of chars
    return word_list
def get_char_dist_est_line(line):
    """
    Get the distances between consecutive chars within and between words.

    :param line: list of LTChar or LTAnno
    :return: (within_dist_list, between_dist_list)
    """
    within_dist_list = []
    between_dist_list = []
    word_list = group_line_into_words(line)
    word_str_list = []
    for word in word_list:
        word_str_list.append(char_list2str(word))
    for i in range(len(word_list) - 1):
        if len(word_list[i + 1]) == 0 or len(word_list[i]) == 0:
            continue
        between_dist = word_list[i + 1][0].bbox[0] - word_list[i][-1].bbox[2]
        if between_dist < -10:
            continue
        between_dist_list.append(between_dist)
    for word in word_list:
        for i in range(len(word) - 1):
            within_dist = word[i + 1].bbox[0] - word[i].bbox[2]
            if within_dist < -10:
                continue
            within_dist_list.append(within_dist)
    return within_dist_list, between_dist_list
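
# A minimal sketch of one way the two distance lists above could be turned
# into a word-gap threshold, e.g. the midpoint between the mean within-word
# gap and the mean between-word gap. `estimate_gap_threshold` is a
# hypothetical helper, not part of the original pipeline.
def estimate_gap_threshold(within_dist_list, between_dist_list):
    if not within_dist_list or not between_dist_list:
        return None  # not enough evidence on this line
    within_mean = sum(within_dist_list) / float(len(within_dist_list))
    between_mean = sum(between_dist_list) / float(len(between_dist_list))
    return (within_mean + between_mean) / 2.0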
def re_group_char_list_merge_isolated_digit(char_list_in, font):
    char_list_list = get_char_list_list(char_list_in)
    word_list = []
    for word_char_list in char_list_list:
        word_list.append(char_list2str(word_char_list))

    if MERGE_ISOLATED_DIGIT:
        while True:
            merge_pos = None
            for i in range(len(word_list) - 1):
                next_first_latex_val = get_latex_val_of_lt_char(
                    char_list_list[i + 1][0], font)
                prev_last_latex_val = get_latex_val_of_lt_char(
                    char_list_list[i][-1], font)
                if word_list[i].isdigit() and \
                        (is_bin_op_latex_val(next_first_latex_val) or
                         is_rel_latex_val(next_first_latex_val)):
                    merge_pos = i
                    break
                elif word_list[i + 1].isdigit() and \
                        (is_bin_op_latex_val(prev_last_latex_val) or
                         is_rel_latex_val(prev_last_latex_val)):
                    merge_pos = i
                    break
            if merge_pos is None:
                break

            # merge the word at merge_pos with the following word
            new_char_list_list = []
            i = 0
            while i < len(char_list_list):
                if i == merge_pos:
                    new_char_list_list.append(char_list_list[i])
                    new_char_list_list[-1].extend(char_list_list[i + 1])
                    i += 2  # skip the word that was just merged in
                else:
                    new_char_list_list.append(char_list_list[i])
                    i += 1
            char_list_list = new_char_list_list
            word_list = []
            for word_char_list in char_list_list:
                word_list.append(char_list2str(word_char_list))
    return char_list_list2char_list(char_list_list)
def re_group_char_list_split(char_list_in):
    """
    Split words with an unmatched parenthesis.

    :param char_list_in:
    :return:
    """
    char_list_list = get_char_list_list(char_list_in)
    word_list = []
    for word_char_list in char_list_list:
        word_list.append(char_list2str(word_char_list))

    if SPLIT_UNPAIRED_PARENTHESIS:
        while True:
            do_split = False
            new_char_list_list = []
            for i in range(len(char_list_list)):
                if word_list[i].count('(') == 1 and \
                        word_list[i].count(')') == 0 and \
                        word_list[i][0] == '(' and \
                        len(word_list[i]) > 1:
                    # leading '(' with no matching ')': split it off
                    new_char_list_list.append([char_list_list[i][0]])
                    new_char_list_list.append(char_list_list[i][1:])
                    do_split = True
                elif word_list[i].count('(') == 0 and \
                        word_list[i].count(')') == 1 and \
                        word_list[i][-1] == ')' and \
                        len(word_list[i]) > 1:
                    # trailing ')' with no matching '(': split it off
                    new_char_list_list.append(char_list_list[i][0:-1])
                    new_char_list_list.append([char_list_list[i][-1]])
                    do_split = True
                else:
                    new_char_list_list.append(char_list_list[i])
            if do_split:
                char_list_list = new_char_list_list
                word_list = []
                for word_char_list in char_list_list:
                    word_list.append(char_list2str(word_char_list))
            else:
                break
    return char_list_list2char_list(char_list_list)
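
# A minimal string-level sketch (plain strings instead of LTChar lists, not
# part of the original code) of the unmatched-parenthesis split rule above:
# a leading '(' or a trailing ')' with no partner in the same word is
# separated into its own token.
def split_unpaired_parenthesis_str(words):
    out = []
    for w in words:
        if len(w) > 1 and w[0] == '(' and w.count('(') == 1 and w.count(')') == 0:
            out.extend(['(', w[1:]])
        elif len(w) > 1 and w[-1] == ')' and w.count(')') == 1 and w.count('(') == 0:
            out.extend([w[:-1], ')'])
        else:
            out.append(w)
    return out

# e.g. split_unpaired_parenthesis_str(['(Section', '3'])  ->  ['(', 'Section', '3']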
def re_group_char_list_merge_unmatched_parenthesis(char_list_in, font):
    """
    Merge a word that opens a parenthesis with the following word that
    closes it.

    :param char_list_in:
    :param font:
    :return:
    """
    char_list_list = get_char_list_list(char_list_in)
    word_list = []
    for word_char_list in char_list_list:
        word_list.append(char_list2str(word_char_list))

    while True:
        merge_unmatched = False
        new_char_list_list = []
        i = 0
        while i < len(char_list_list):
            if i + 1 < len(char_list_list) and \
                    word_list[i].count('(') == 1 and \
                    word_list[i].count(')') == 0 and \
                    word_list[i + 1].count(')') == 1:
                new_char_list_list.append(char_list_list[i])
                new_char_list_list[-1].extend(char_list_list[i + 1])
                i += 2
                merge_unmatched = True
            else:
                new_char_list_list.append(char_list_list[i])
                i += 1
        if not merge_unmatched:
            break
        else:
            char_list_list = new_char_list_list
            word_list = []
            for word_char_list in char_list_list:
                word_list.append(char_list2str(word_char_list))
    return char_list_list2char_list(char_list_list)
def re_group_char_list(char_list_in, font, debug=False):
    """
    Remove or insert LTAnno when necessary.

    :param char_list_in:
    :return:
    """
    if debug:
        print "Input ", char_list2str(char_list_in)

    # regroup the unmatched parenthesis
    # do this first, otherwise it might be split again by the split func
    char_list_in = re_group_char_list_merge_unmatched_parenthesis(
        char_list_in, font)
    if debug:
        print "After parenthesis", char_list2str(char_list_in)

    char_list_in = re_group_ending_punct(char_list_in, font)
    if debug:
        print "After Punct", char_list2str(char_list_in)

    # merge isolated digits based on the neighboring binary operator
    char_list_in = re_group_char_list_merge_isolated_digit(char_list_in, font)
    if debug:
        print "After isolated digit", char_list2str(char_list_in)

    char_list_in = re_group_char_list_split(char_list_in)
    if debug:
        print "After split", char_list2str(char_list_in)

    # merge over-split capitalized words
    char_list_in = re_group_merge_cap(char_list_in)
    if debug:
        print "After cap", char_list2str(char_list_in)

    # TODO, binary relation/operator merger
    return char_list_in
def merging_merge_one_line(char_list_line, word_info_list):
    """
    Merge the words in a line based on the overlap with the words
    extracted by pdfbox.

    :param char_list_line:
    :param word_info_list:
    :return:
    """
    # output some debugging information here
    pdf_util_debug_log.debug(char_list2str(char_list_line))

    # get the bboxes of the pdfbox words that overlap this line
    line_bbox = get_char_list_bbox(char_list_line)
    pdfbox_word_bbox_list = []
    for word_info in word_info_list:
        if line_bbox.overlap(word_info['bbox']):
            pdfbox_word_bbox_list.append(word_info['bbox'])

    # get the bbox of each word grouped from the char list
    char_word_list = get_char_list_list(char_list_line)
    char_word_bbox_list = []
    for wid, char_word in enumerate(char_word_list):
        char_word_bbox_list.append(get_char_list_bbox(char_word))
        pdf_util_debug_log.debug("{} {}".format(wid, char_list2str(char_word)))

    # two words should be merged if they overlap with the same word from
    # pdfbox; use union-find to get the connected components
    uf = UnionFind()
    for cid, char_word_bbox in enumerate(char_word_bbox_list):
        uf.add_node(cid)
    for pid, pdfbox_word_bbox in enumerate(pdfbox_word_bbox_list):
        cid_list = []
        for cid, char_word_bbox in enumerate(char_word_bbox_list):
            if char_word_bbox.overlap(pdfbox_word_bbox):
                cid_list.append(cid)
        for cid in cid_list:
            uf.merge(cid_list[0], cid)

    merged_cid_list_list = uf.get_groups()
    new_char_word_list = []
    new_char_word_bbox_list = []
    for merged_cid_list in merged_cid_list_list:
        tmp_char_word = []
        for cid in merged_cid_list:
            tmp_char_word.extend(char_word_list[cid])
        new_char_word_list.append(tmp_char_word)
        new_char_word_bbox_list.append(get_char_list_bbox(tmp_char_word))

    # sort the merged words based on the left boundary
    tmp_idx_list = range(len(new_char_word_list))
    tmp_idx_list.sort(key=lambda idx: new_char_word_bbox_list[idx].left())
    sorted_new_char_word_list = []
    for tmp_idx in tmp_idx_list:
        sorted_new_char_word_list.append(new_char_word_list[tmp_idx])

    # TODO, split the lines with a too-long word, which is very likely to be wrong
    max_len = get_longest_length(sorted_new_char_word_list)
    if max_len > WORD_LENGTH_95_QUARTILE:
        new_char_list_line = max_word_split(char_list_line)
    else:
        new_char_list_line = char_list_list2char_list(sorted_new_char_word_list)
    return new_char_list_line
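
# A minimal, self-contained sketch (not the project's UnionFind) of the
# grouping behavior the code above relies on, assuming the same
# add_node/merge/get_groups interface: indices merged through a shared
# pdfbox word end up in the same group returned by get_groups().
class SimpleUnionFind(object):
    def __init__(self):
        self.parent = {}

    def add_node(self, x):
        self.parent.setdefault(x, x)

    def find(self, x):
        # path halving
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def merge(self, a, b):
        self.parent[self.find(a)] = self.find(b)

    def get_groups(self):
        groups = {}
        for x in self.parent:
            groups.setdefault(self.find(x), []).append(x)
        return list(groups.values())

# e.g. words 0 and 1 overlap the same pdfbox word, word 2 stands alone:
# uf = SimpleUnionFind(); [uf.add_node(i) for i in range(3)]
# uf.merge(0, 1); uf.get_groups()  ->  [[0, 1], [2]] (group order may vary)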
def max_word_split(char_list):
    """
    Split a long chunk into words by dynamic programming.

    punctuation: comma, period, colon, semi-colon, parenthesis
    letters: if two chars overlap horizontally, they should not be split
    into two parts.

    :param char_list: chars sorted by the left bound
    :return: char_list with LTAnno inserted
    """
    if len(char_list) == 0:
        return []
    # only keep the LTChar
    char_list = [char for char in char_list if isinstance(char, LTChar)]

    # create the horizontal-overlapping constraints
    check_sorted_by_left(char_list)

    # get overlapping pairs as hard constraints
    pair_left_idx_list = []
    pair_right_idx_list = []
    for i in range(len(char_list) - 1):
        if char_list[i].bbox[2] > char_list[i + 1].bbox[0]:
            pair_left_idx_list.append(i)
            pair_right_idx_list.append(i + 1)

    def is_word(start_i, end_i):
        """
        Score the chars in the range [start_i, end_i] as a word.

        :param start_i:
        :param end_i:
        :return:
        """
        # a split here would break an overlapping pair
        if start_i in pair_right_idx_list or end_i in pair_left_idx_list:
            # very big penalty
            return -10000
        word = ""
        for char_idx in range(start_i, end_i + 1):
            if len(char_list[char_idx].get_text()) > 1:
                return -1
            word += char_list[char_idx].get_text()
        # remove leading or trailing punctuation
        word = word.strip(".,;:" + string.whitespace)
        word = word.lower()
        s_word = wnl.lemmatize(word, 'n')
        v_word = wnl.lemmatize(word, 'v')
        in_wordlist = word in wl or s_word in wl or v_word in wl

        # special rule for single letters
        if len(word) == 1:
            if word in ['A', 'a']:
                return 1
            else:
                return -1
        if in_wordlist:
            # give more weight to long words
            return (end_i - start_i + 1) * (end_i - start_i + 1)
        else:
            return -1

    # dp[i] = max over j of dp[j] + score of word[j, i-1]
    # keep track of the split position
    # test case: "http://localhost:8080/pdf_viewer?pdf_name=10.1.1.6.2280_14"
    # dp[i] means the best score with the first i chars considered
    dp = [-100000] * (len(char_list) + 1)
    prev_end = [-1] * (len(char_list) + 1)  # prev_end == -1 marks the first chunk
    dp[0] = 0
    for char_num in range(1, len(char_list) + 1):
        for prev_char_num in range(0, char_num):
            word_score = is_word(prev_char_num, char_num - 1)
            ms = 'word {}, score {}'.format(
                char_list2str([char_list[i]
                               for i in range(prev_char_num, char_num)]),
                word_score)  # debug message
            if dp[prev_char_num] + word_score > dp[char_num]:
                dp[char_num] = dp[prev_char_num] + word_score
                prev_end[char_num] = prev_char_num

    # recover the word boundaries by backtracking
    i = len(char_list)
    word_end_idx_list = []
    while i >= 0:
        word_end_idx_list.append(i - 1)
        i = prev_end[i]
    print word_end_idx_list

    i = 0
    return_char_list = []
    while i < len(char_list):
        return_char_list.append(char_list[i])
        if i in word_end_idx_list:
            return_char_list.append(LTAnno(" "))
        i += 1
    return return_char_list
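
# A minimal sketch (toy dictionary, plain strings instead of LTChar) of the
# same DP recurrence used above: dp[i] is the best score for the first i
# characters, and prev_end[i] records where the last word starts.
# `segment_string` is a hypothetical helper, not part of the pipeline.
def segment_string(s, dictionary):
    n = len(s)

    def score(lo, hi):
        # score of s[lo:hi] as one word; longer dictionary words score higher
        return (hi - lo) ** 2 if s[lo:hi] in dictionary else -1

    dp = [-10 ** 9] * (n + 1)
    prev_end = [-1] * (n + 1)
    dp[0] = 0
    for i in range(1, n + 1):
        for j in range(i):
            if dp[j] + score(j, i) > dp[i]:
                dp[i] = dp[j] + score(j, i)
                prev_end[i] = j
    # backtrack the word boundaries
    words, i = [], n
    while i > 0:
        words.append(s[prev_end[i]:i])
        i = prev_end[i]
    return list(reversed(words))

# e.g. segment_string("thecat", {"the", "cat", "th", "eca", "t"})  ->  ['the', 'cat']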
def merge_line_fraction(char_list_list, fraction_path_list):
    char_list = char_list_list2char_list(char_list_list)
    median_char_height = get_median_char_height_by_char_list(char_list)

    # sort the lines by the vertical center position
    line_v_c_list = []  # vertical centers
    org_line_bbox_list = []
    for char_list in char_list_list:
        bbox = char_list2bbox(char_list)
        line_v_c_list.append(bbox.v_center())
        org_line_bbox_list.append(bbox)
    line_idx_list = range(len(char_list_list))
    line_idx_list.sort(key=lambda line_idx: -line_v_c_list[line_idx])
    sorted_char_list_list = []
    sorted_line_bbox_list = []
    for line_idx in line_idx_list:
        sorted_char_list_list.append(char_list_list[line_idx])
        sorted_line_bbox_list.append(org_line_bbox_list[line_idx])
    sorted_v_c_list = [-bbox.v_center() for bbox in sorted_line_bbox_list]

    from pdfxml.pdf_util.unionfind import UnionFind
    uf = UnionFind()
    for idx in range(len(sorted_v_c_list)):
        uf.add_node(idx)
    for path in fraction_path_list:
        # get the nearest lines above and below the fraction bar.
        # another issue to consider is the order of the char_list_list:
        # change to nscs_list, merge the nscs_list, and then convert back
        # to the char_list for the line
        gt_idx = find_gt_idx(sorted_v_c_list, -path.bbox[1])
        lt_idx = find_lt_idx(sorted_v_c_list, -path.bbox[3])
        if gt_idx is None or lt_idx is None:
            continue
        if gt_idx <= lt_idx:
            raise Exception("TODO")
        assert gt_idx > lt_idx
        if abs(sorted_v_c_list[gt_idx] -
               sorted_v_c_list[lt_idx]) < 3 * median_char_height:
            # merge the two lines
            uf.merge(lt_idx, gt_idx)
        else:
            print abs(sorted_v_c_list[gt_idx] - sorted_v_c_list[lt_idx])
            prev_line_str = char_list2str(sorted_char_list_list[lt_idx])
            next_line_str = char_list2str(sorted_char_list_list[gt_idx])
            print prev_line_str
            print next_line_str
            print 'failed to merge lines?'

    idx_group_list = uf.get_groups()

    # build the merged line list
    merged_line_list = []
    for idx_group in idx_group_list:
        nscs_list = []
        for line_idx in idx_group:
            tmp_nscs_list = char_list2char_list_list(
                sorted_char_list_list[line_idx])
            nscs_list.extend(tmp_nscs_list)
        merged_line_list.append(char_list_list2char_list(nscs_list))

    # sort the lines again
    merged_line_bbox_list = [char_list2bbox(line) for line in merged_line_list]
    merged_line_idx_list = range(len(merged_line_list))
    merged_line_idx_list.sort(
        key=lambda merge_line_idx: -merged_line_bbox_list[merge_line_idx].top())
    # minus because the page coordinates start from the bottom
    sorted_merged_line_list = [
        merged_line_list[line_idx] for line_idx in merged_line_idx_list
    ]
    return sorted_merged_line_list
def is_double_column(pdf_path, pid, debug=False):
    """
    The idea: if the line start positions form two clusters, then the page
    is double column.

    :param pdf_path:
    :param pid:
    :return:
    """
    from pdfxml.pdf_util.pdfbox_line_merging import pdf_extract_lines_raw
    from pdfxml.pdf_util.pdfbox_wrapper import get_pdf_page_size

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)
    page_size = get_pdf_page_size(pdf_path, pid)
    page_width = page_size['width']

    # get the boundary of the column: collect the start and end points
    # and take the quantile values
    start_pos_list = []
    end_pos_list = []
    quantile = 0.90
    for char_list in char_list_list:
        # skip lines with fewer than 30 chars
        if len(char_list) < 30:
            continue
        bbox = get_char_list_bbox(char_list)
        start_pos_list.append(bbox.left())
        end_pos_list.append(bbox.right())
    if len(start_pos_list) == 0 or len(end_pos_list) == 0:
        # it's an empty page
        #raise Exception("could not get the left/right boundary")
        return False
    start_pos = np.percentile(start_pos_list, int((1 - quantile) * 100))
    end_pos = np.percentile(end_pos_list, int(quantile * 100))
    if debug:
        #plt.hist(start_pos_list)
        print("The main column boundary {} {}".format(start_pos, end_pos))

    if end_pos < page_width / 2 or start_pos > page_width / 2:
        # only one half of the page has enough long lines
        return True

    center_pos = (start_pos + end_pos) / 2
    good_line_count = 0
    total_count = 0.0
    for char_list in char_list_list:
        # skip lines with fewer than 30 chars
        if len(char_list) < 30:
            continue
        bbox = get_char_list_bbox(char_list)
        if bbox.left() < bbox.right() < center_pos or \
                bbox.right() > bbox.left() > center_pos:
            # the line lies entirely in the left or the right half
            good_line_count += 1
            if debug:
                tmp_str = char_list2str(char_list)
                print "Good Line", tmp_str, bbox
        total_count += 1
        #print "BadLine", total_count

    if debug:
        line_str_list = []
        for char_list in char_list_list:
            line_str_list.append(char_list2str(char_list))

    threshold = 0.6
    if float(good_line_count) / total_count > threshold:
        return True
    else:
        return False
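
# A minimal numeric sketch (plain floats, no PDF parsing) of the heuristic
# above: estimate the main column boundaries from quantiles of the line
# start/end positions, then count lines that stay entirely in one half of
# the page. `looks_double_column` is a hypothetical helper; it reuses the
# module-level `import numpy as np`.
def looks_double_column(line_spans, page_width, quantile=0.90, threshold=0.6):
    """line_spans: list of (left, right) for the sufficiently long lines."""
    if not line_spans:
        return False
    starts = [l for l, r in line_spans]
    ends = [r for l, r in line_spans]
    start_pos = np.percentile(starts, int((1 - quantile) * 100))
    end_pos = np.percentile(ends, int(quantile * 100))
    if end_pos < page_width / 2.0 or start_pos > page_width / 2.0:
        return True
    center = (start_pos + end_pos) / 2.0
    good = sum(1 for l, r in line_spans if r < center or l > center)
    return good / float(len(line_spans)) > threshold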
def get_ignore_region(pdf_path):
    """
    :param pdf_path:
    :return: pid2intervaltree, the regions to ignore on each page
    """
    from pdfxml.me_extraction.me_font_stat_stage4 import internal_get_llines

    pn = get_page_num(pdf_path)
    pid2it2d = {}
    all_lines = []
    for pid in range(pn):
        lines = internal_get_llines(None, pdf_path, pid)
        for line in lines:
            line_str = char_list2str(line)
            is_heading = is_heading_line_by_str(line_str)
            all_lines.append({
                'char_list': line,
                'line_str': line_str,
                'pid': pid,
                'is_heading': is_heading
            })

    # first, ignore everything before the abstract heading
    abstract_heading_line_idx = None
    for i, line_info in enumerate(all_lines):
        if line_info['is_heading'] and \
                is_abstract_head(line_info['line_str']):
            abstract_heading_line_idx = i
            break
        if line_info['pid'] > pn / 2:
            # the abstract should not be in the second half of the document
            break
    if abstract_heading_line_idx is not None:
        for i in range(abstract_heading_line_idx):
            pid = all_lines[i]['pid']
            if pid not in pid2it2d:
                pid2it2d[pid] = IntervalTree2D()
            pid2it2d[pid].add_bbox_only(
                char_list2bbox(all_lines[i]['char_list']))

    # then ignore the abstract and the reference sections
    ignore_begin = False
    for line_info in all_lines:
        if line_info['is_heading']:
            abs_head = is_abstract_head(line_info['line_str'])
            ref_head = is_reference_head(line_info['line_str'])
            if abs_head or ref_head:
                ignore_begin = True
            else:
                ignore_begin = False
        else:
            if ignore_begin:
                pid = line_info['pid']
                if pid not in pid2it2d:
                    pid2it2d[pid] = IntervalTree2D()
                pid2it2d[pid].add_bbox_only(
                    char_list2bbox(line_info['char_list']))
    return pid2it2d
def get_line(self):
    line_str = char_list2str(self._chars)
    return line_str
def assess_ime(pdf_path, pid=0, xml_out_path=None, ignore_exist=False):
    """
    IME: a line with math symbols and without non-math words.

    :param xml_out_path: output path for the boundary file
    :return: dict of timing information
    """
    tmp_path = get_tmp_path(pdf_path)
    ret_info_dict = {}
    if xml_out_path and os.path.isfile(xml_out_path) and (not ignore_exist):
        return {}

    from pdfxml.me_extraction.me_consts import math_words

    t = time.time()
    # common resource loader
    wl = set(words.words())
    wl.update(additional_words)
    wnl = WordNetLemmatizer()
    d = time.time() - t
    ret_info_dict['resource_time'] = d

    t = time.time()
    # layout analysis
    font = get_font_from_pdf(pdf_path, pid)
    #font = None
    prefix = pdf_path[pdf_path.rindex("/") + 1:-4]
    lines = internal_get_llines(prefix, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['layout_time'] = d

    # IME assessment core
    t = time.time()
    line_labels = [0] * len(lines)
    for li, line in enumerate(lines):
        line_label = 0
        beg_idx = 0
        with_math_symbol_or_word = False
        with_non_math_word = False
        for i, char in enumerate(line):
            if isinstance(char, LTChar):
                if check_is_math_LTChar(char, font):
                    me_extraction_logger.debug("Char {} as Math".format(char))
                    with_math_symbol_or_word = True
            if is_space_char(char):
                word = ""
                for j in range(beg_idx, i):
                    # use the glyph name to skip trailing punctuation
                    if j == i - 1 and line[j].get_text() in [
                            ',', '.', 'period', 'comma']:
                        continue
                    # for word checking, only keep single-char glyphs
                    tmp_text = line[j].get_text()
                    if len(tmp_text) != 1:
                        tmp_text = " "
                    word += tmp_text
                beg_idx = i + 1
                word = word.lower().strip()
                #if word.endswith(',') or word.endswith('.'):
                #    word = word[:-1]

                # check the word
                s_word, v_word = "", ""
                try:
                    s_word = wnl.lemmatize(word, 'n')
                    v_word = word  # fall back to the raw word on failure
                    v_word = wnl.lemmatize(word, 'v')
                except Exception as e:
                    me_extraction_error_logger.error(
                        "Error checking the word as noun or verb")
                if word in math_words:
                    me_extraction_logger.debug("Math Word {}".format(word))
                    with_math_symbol_or_word = True
                elif len(word) > 2 and (word in wl or s_word in wl or
                                        v_word in wl):
                    me_extraction_logger.debug("Plain Word {}".format(word))
                    with_non_math_word = True

        # debug for the line, with ME or not
        tmp_line_str = char_list2str(line, ', ')
        me_extraction_logger.debug(tmp_line_str)
        me_extraction_logger.debug("with math {}, with word {}".format(
            with_math_symbol_or_word, with_non_math_word))
        if with_math_symbol_or_word and (not with_non_math_word):
            me_extraction_logger.debug("MATHLINE")
            line_label = 1
        line_labels[li] = line_label
    d = time.time() - t
    ret_info_dict['core_time'] = d

    if not xml_out_path:
        for li, line in enumerate(lines):
            if line_labels[li]:
                tmp_str = ''.join([
                    char.get_text() for char in line if isinstance(char, LTChar)
                ])
                print tmp_str.encode("utf-8")

    # export for evaluation
    page_info = {}
    page_info['pid'] = pid
    page_info['ilist'] = []
    page_info['elist'] = []
    # collect the lines labeled as ME
    for li, line in enumerate(lines):
        if line_labels[li]:
            page_info['ilist'].append(line)

    t = time.time()
    if xml_out_path:
        export_xml(page_info, xml_out_path, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['io_time'] = d
    return ret_info_dict
def EME_font_stat_pipeline(pdf_path, pid, eme_export_path=None,
                           prev_page_info={}):
    """
    For each word, compare the ME/non-ME probability based on font statistics.

    :param pdf_path:
    :param pid:
    :param eme_export_path:
    :return:
    """
    #print "NOTE force EME extraction to be false"
    #ext_settings.LANG_MODEL = False
    #SEQ_ME_MERGER = False

    ret_info_dict, t = {}, time.time()

    # font statistics and the conditional probabilities
    font_stat_dict = stage4_font_stat(pdf_path)
    me_font_condprob = create_me_font_condprob(font_stat_dict)
    font_val_condprob = create_me_font_val_condprob(font_stat_dict)
    d = time.time() - t
    ret_info_dict['stat_time'], t = d, time.time()

    font = get_font_from_pdf(pdf_path, pid)
    prefix = pdf_path[pdf_path.rindex("/") + 1:-4]
    lines = internal_get_llines(prefix, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['layout_time'], t = d, time.time()

    # the loaded IME plays two roles here:
    # * if a word is part of an IME line, it is also treated as math
    # * a connected EME should not overlap with an IME
    xml_path = get_xml_path(pdf_path)
    ime_res_path = "{}.ime.{}.xml".format(xml_path, pid)
    if not os.path.isfile(ime_res_path):
        assess_ime(pdf_path, pid, ime_res_path)
    gt_flag, gt_me_list = get_info(ime_res_path)
    gt_ime_list = [gt_me for gt_me in gt_me_list if gt_me['type'] == 'I']
    ime_bbox_list = [
        [gt_ime['rect']['l'], gt_ime['rect']['b'],
         gt_ime['rect']['r'], gt_ime['rect']['t']]
        for gt_ime in gt_ime_list
    ]
    d = time.time() - t
    ret_info_dict['ime_time'] = d

    t = time.time()
    me_chars = set()
    references_met = False
    # TODO, the reference section might cover several pages
    if 'references_met' in prev_page_info:
        references_met = True
        #return

    eme_list = []
    word_class_list = []
    nscs_label_list_list = []
    for li, line in enumerate(lines):
        nscs_label_list = []
        beg_idx = 0

        # TODO, detect the reference section with a regex later,
        # at the level of the line
        tmp_str = char_list2str(line)
        from pdfxml.me_extraction.element_matching import is_reference_head
        if is_reference_head(tmp_str):
            references_met = True
            ret_info_dict['references_met'] = True
            break

        line_bbox = char_list2bbox(line)
        if bbox_half_overlap_list(line_bbox, ime_bbox_list):
            if ext_settings.debug:
                me_extraction_logger.debug("Line {} as IME".format(
                    char_list2str(line)))
            continue

        nscs_list = char_list2char_list_list(line)
        nscs_str_list = [char_list2str(nscs) for nscs in nscs_list]
        me_log_prob_list = []
        nme_log_prob_list = []
        for nscs in nscs_list:
            nscs_pred = nscs
            if nscs[-1].get_text() in [',', '.']:
                nscs_pred = nscs[:-1]

            # only use the font-val pair to make the inference now
            is_me = check_me_font_val(nscs_pred, font_val_condprob,
                                      me_font_condprob, font,
                                      debug=ext_settings.debug)
            me_prob, nme_prob = get_me_nme_log_prob_font_val(
                nscs_pred, font_val_condprob, me_font_condprob, font,
                debug=ext_settings.debug)
            if is_me and ext_settings.debug:
                me_extraction_logger.debug("check me font val ME")
            me_log_prob_list.append(me_prob)
            nme_log_prob_list.append(nme_prob)

            if is_me:
                tmp_str = char_list2str(nscs)
                if ext_settings.debug:
                    me_extraction_logger.debug("me chunk {}".format(
                        tmp_str.encode('utf-8')))
                me_chars.update([c for c in nscs if isinstance(c, LTChar)])
                word_class_list.append((nscs, 1))
                nscs_label_list.append((nscs, 1))
            else:
                word_class_list.append((nscs, 0))
                nscs_label_list.append((nscs, 0))
        nscs_label_list_list.append(nscs_label_list)
    d = time.time() - t
    ret_info_dict['core_time'], t = d, time.time()

    # post processing:
    # * an EME overlapping with an IME should be removed
    # * connected EMEs are merged
    eme_list = eme_merger(lines, me_chars, ime_bbox_list)

    if eme_export_path:
        # export the data
        page_info = {}
        page_info['pid'] = pid
        page_info['ilist'] = []
        page_info['elist'] = eme_list
        export_xml(page_info, eme_export_path, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['io_time'] = d
    return ret_info_dict