def get_me_idx_list():
    """
    Get all the me_idx values, whether runnable or not.

    :return: list[int]
    """
    cache_path = "{}/tmp/me_idx_list.pkl".format(infty_cdb_folder)
    test_folder_exist_for_file_path(cache_path)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    me_xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    wb = xlrd.open_workbook(me_xlsx_path)
    ws = wb.sheet_by_index(0)
    me_idx_list = []
    for r_idx in range(ws.nrows):
        row = ws.row(r_idx)
        # column 20 holds the me_idx of the char in this row
        me_idx = int(row[20].value)
        me_idx_list.append(me_idx)
    me_idx_list = list(set(me_idx_list))

    dump_serialization(me_idx_list, cache_path)
    return me_idx_list
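
# Every loader in this module follows the same compute-once, cache-to-pickle
# pattern. A minimal self-contained sketch of that pattern; the pickle-based
# helpers below are illustrative stand-ins for the repo's load_serialization /
# dump_serialization (assumed to be thin pickle wrappers):
def _cached_compute_sketch(cache_path, compute_func):
    import os
    import pickle
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    result = compute_func()
    with open(cache_path, 'wb') as f:
        pickle.dump(result, f)
    return result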
def batch_split_chars_by_me_idx():
    """
    Split the chars by me_idx and cache the chars of each ME in its own file.
    """
    from pdfxml.file_util import test_folder_exist_for_file_path
    me_idx2chars = get_me_idx2chars()  # load all first
    for me_idx, chars in me_idx2chars.items():
        cached_path = get_cached_chars_path_by_me_idx(me_idx)
        test_folder_exist_for_file_path(cached_path)
        if os.path.isfile(cached_path):
            continue
        dump_serialization(chars, cached_path)
def get_one_ltchar():
    """
    Return a single LTChar extracted from a sample PDF, cached after the
    first extraction.
    """
    from pdfxml.pdf_util.pdf_extract import process_pdf_internal
    from pdfxml.path_util import ME_RESOURCE_FOLDER

    cache_path = "{}/one_ltchar.pkl".format(ME_RESOURCE_FOLDER)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    pdf_file_name = "{}/NIPS_2016_6202".format(ME_RESOURCE_FOLDER)
    pdf_path = "{}.pdf".format(pdf_file_name)
    char_list = process_pdf_internal(pdf_path, 0)

    dump_serialization(char_list[0], cache_path)
    return char_list[0]
def get_all_me_idx_list():
    """
    The list of me_idx to run experiments with.

    :return: list[int]
    """
    cache_path = "{}/all_me_idx_list.pkl".format(infty_cdb_tmp_folder)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    elem_list = load_me_elems_xlsx(xlsx_path)
    me_idx_set = set()
    for elem in elem_list:
        me_idx_set.add(elem['me_idx'])
    me_idx_list = list(me_idx_set)

    dump_serialization(me_idx_list, cache_path)
    return me_idx_list
def get_l1_dist(rel_id, fea_name):
    """
    Level-1 distribution of a feature over a single relation.

    :param rel_id: relation id
    :param fea_name: feature name
    :return: SampleDist over the sampled feature values
    """
    cache_path = get_dist_cache_path([rel_id], fea_name)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    samples = get_l1_samples(rel_id, fea_name)
    dist = SampleDist(samples)

    dump_serialization(dist, cache_path)
    return dist
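
# SampleDist is the repo's empirical-distribution wrapper. A minimal stand-in
# consistent with how it is used in this module (it stores the raw samples and
# gen_random draws n of them with replacement); the real class may do more:
class _SampleDistSketch(object):
    def __init__(self, samples):
        self.samples = list(samples)

    def gen_random(self, n):
        import random
        return [random.choice(self.samples) for _ in range(n)]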
def get_cid2me_idx():
    """
    Map char id to me_idx.

    :return: dict from cid to me_idx
    """
    cache = "{}/cid2me_idx.pkl".format(infty_cdb_tmp_folder)
    test_folder_exist_for_file_path(cache)
    if os.path.isfile(cache):
        return load_serialization(cache)

    cid2me_idx = {}
    me_xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    elem_list = load_me_elems_xlsx(me_xlsx_path)
    for elem in elem_list:
        cid2me_idx[elem['cid']] = elem['me_idx']

    dump_serialization(cid2me_idx, cache)
    return cid2me_idx
def get_nycd_dist(rel_id_list, refresh=False,
                  sample_num=me_layout_config.default_sample_num):
    """
    Distribution of the nycd feature over a list of relations, built
    recursively by splitting the list in half. An empty rel_id_list is
    invalid; a single-relation list falls back to the level-1 distribution.

    :param rel_id_list: list of relation ids
    :param refresh: recompute even if a cached version exists
    :param sample_num: number of random samples drawn per sub-distribution
    :return: SampleDist
    """
    cache_path = get_dist_cache_path(
        rel_id_list, 'nycd_dist', sample_num=sample_num)
    if os.path.isfile(cache_path) and not refresh:
        return load_serialization(cache_path)
    print "try to calculate {}".format(cache_path)

    if len(rel_id_list) == 0:
        raise Exception("Should not be here")
    elif len(rel_id_list) == 1:
        dist = get_l1_nycd_dist(rel_id_list[0])
    else:
        n = len(rel_id_list)
        mid = n // 2
        rel_list1 = rel_id_list[0:mid]
        rel_list2 = rel_id_list[mid:]
        nycd_dist_1 = get_nycd_dist(rel_list1)
        nycd_dist_2 = get_nycd_dist(rel_list2)
        hr_dist_1 = get_hr_dist(rel_list1)

        nycd_samples_1 = nycd_dist_1.gen_random(sample_num)
        nycd_samples_2 = nycd_dist_2.gen_random(sample_num)
        hr_samples_1 = hr_dist_1.gen_random(sample_num)

        # the second half's nycd is scaled by the first half's hr,
        # then shifted by the first half's nycd
        samples = []
        for nycd2, hr1, nycd1 in itertools.product(
                nycd_samples_2, hr_samples_1, nycd_samples_1):
            samples.append(nycd2 * hr1 + nycd1)
        dist = SampleDist(samples)

    dump_serialization(dist, cache_path)
    return dist
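
# A self-contained sketch (illustrative values only) of the composition above.
# Note that the triple itertools.product yields sample_num**3 combined
# samples, so sample_num should stay small:
def _compose_nycd_sketch():
    import itertools
    import random
    nycd_1 = [random.gauss(0.3, 0.05) for _ in range(10)]  # first half
    nycd_2 = [random.gauss(0.3, 0.05) for _ in range(10)]  # second half
    hr_1 = [random.gauss(0.7, 0.05) for _ in range(10)]    # scale of first half
    # scale the second half's value by hr_1, then shift by the first half's
    samples = [n2 * h1 + n1
               for n2, h1, n1 in itertools.product(nycd_2, hr_1, nycd_1)]
    return samples  # 10**3 combined samples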
def get_hr_dist(rel_id_list, refresh=False,
                sample_num=me_layout_config.default_sample_num, debug=False):
    """
    Distribution of the hr feature over a list of relations, built recursively
    by splitting the list in half and multiplying the sampled values.

    :param rel_id_list: list of relation ids
    :param refresh: recompute even if a cached version exists
    :param sample_num: number of random samples drawn per sub-distribution
    :param debug: print the cache path on a cache hit
    :return: SampleDist
    """
    cache_path = get_dist_cache_path(
        rel_id_list, 'hr_dist', sample_num=sample_num)
    if os.path.isfile(cache_path) and not refresh:
        if debug:
            print cache_path
        return load_serialization(cache_path)
    print "try to calculate {}".format(cache_path)

    if len(rel_id_list) == 0:
        raise Exception("empty rel_id_list")
    elif len(rel_id_list) == 1:
        dist = get_l1_hr_dist(rel_id_list[0])
    else:
        n = len(rel_id_list)
        mid = n // 2
        # NOTE: the two halves may overlap at the symbol level,
        # but at the relation level there is no overlap
        dist1 = get_hr_dist(rel_id_list[0:mid])
        dist2 = get_hr_dist(rel_id_list[mid:])
        print "done dist1 and dist2"

        sample1 = dist1.gen_random(sample_num)
        sample2 = dist2.gen_random(sample_num)
        print "done gen samples"

        vp_list = itertools.product(sample1, sample2)
        samples = [vp[0] * vp[1] for vp in vp_list]
        dist = SampleDist(samples)
        print "done create new dist"

    dump_serialization(dist, cache_path)
    return dist
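
# A sketch of the multiplicative composition used above: the hr over a list of
# relations is the product of the hr values of its two halves, estimated by
# multiplying random samples pairwise. Illustrative values only:
def _compose_hr_sketch():
    import itertools
    import random
    hr_left = [random.gauss(0.7, 0.05) for _ in range(10)]
    hr_right = [random.gauss(0.7, 0.05) for _ in range(10)]
    return [a * b for a, b in itertools.product(hr_left, hr_right)]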
def get_fea_val_by_fea_name(fea_name):
    """
    :param fea_name:
    :type fea_name: basestring
    :return: dict from relative position type to list of feature values
    :rtype: dict[basestring, list[float]]
    """
    assert fea_name in fea_name2fea_func
    fea_func = fea_name2fea_func[fea_name]

    cache_path = "{}/fea_val/{}.pkl".format(infty_cdb_folder, fea_name)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    rel_list = prepare_parent_child_relation_list_batch()
    rel2fea_val_list = {
        "HORIZONTAL": [],
        "RSUP": [],
        "RSUB": [],
        "REV_RSUB": [],
        "REV_RSUP": []
    }
    for pr in rel_list:
        if pr['relation'] not in ["HORIZONTAL", "RSUP", "RSUB"]:
            continue
        # adjust the bboxes based on the glyph names before computing features
        p_bbox = adjust_bbox_h_gt_name(pr['pinfo']['bbox'], pr['pinfo']['name'])
        c_bbox = adjust_bbox_h_gt_name(pr['cinfo']['bbox'], pr['cinfo']['name'])
        fea_val = fea_func(p_bbox, c_bbox)
        rel2fea_val_list[pr['relation']].append(fea_val)
        if pr['relation'] in ["RSUP", "RSUB"]:
            # also record the feature value with parent and child swapped
            rev_fea_val = fea_func(c_bbox, p_bbox)
            rel2fea_val_list["REV_" + pr['relation']].append(rev_fea_val)

    dump_serialization(rel2fea_val_list, cache_path)
    return rel2fea_val_list
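
# The feature functions in fea_name2fea_func take a parent bbox and a child
# bbox and return a float. A hypothetical example of such a function (not from
# the repo), assuming a bbox is a (left, bottom, right, top) tuple; the repo's
# actual bbox type may differ:
def _vertical_center_offset_sketch(p_bbox, c_bbox):
    p_center = (p_bbox[1] + p_bbox[3]) / 2.0
    c_center = (c_bbox[1] + c_bbox[3]) / 2.0
    p_height = p_bbox[3] - p_bbox[1]
    # offset of the child's vertical center, normalized by the parent height
    return (c_center - p_center) / p_height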
def prepare_parent_child_relation_list_batch():
    """
    Merge the relation triples of all MEs.

    :return: list of dict[parent, children, relation]
    """
    cache_path = "{}/tmp/hor_sub_sup_alphanumeric.pkl".format(infty_cdb_folder)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    res_list = []
    # recover each me_idx from the cached char file names under crop_chars
    chars_folder = "{}/crop_chars".format(infty_cdb_folder)
    for fname in os.listdir(chars_folder):
        me_idx = int(fname[fname.rindex("_") + 1:-4])
        pair_list = prepare_parent_child_relation_list_one_me(me_idx)
        res_list.extend(pair_list)
        print me_idx, len(pair_list)

    dump_serialization(res_list, cache_path)
    return res_list
def fname2shape():
    """
    Pre-calculate and cache the shape of every image file, avoiding the
    repeated reading overhead. The shapes are needed because the vertical
    coordinate is reversed.
    """
    cache_path = "{}/tmp/im_shape.pkl".format(infty_cdb_folder)
    test_folder_exist_for_file_path(cache_path)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    im2shape = {}
    img_folder = "{}/InftyCDB-1/Images".format(SHARED_FOLDER)
    for fname in os.listdir(img_folder):
        if not fname.endswith("png"):
            continue
        fpath = "{}/{}".format(img_folder, fname)
        im = imread(fpath)
        im2shape[fname] = im.shape

    dump_serialization(im2shape, cache_path)
    return im2shape
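
# A sketch of why the shapes matter: to flip a y coordinate between image rows
# (growing downward) and an upward-growing coordinate system, the image height
# from im.shape is required. The flip direction here is an assumption:
def _flip_vertical_sketch(y, im_height):
    return im_height - y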
def get_pid2cid_list():
    """
    :return: map from parent char id to the list of children char ids
    """
    cache_path = 'pid2cid_list.pkl'
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    cid2info = load_char_map()
    pid2cid_list = {}
    for cid, info in cid2info.items():
        if info['pid'] == -1:
            # -1 marks a char without a parent
            continue
        if info['pid'] not in pid2cid_list:
            pid2cid_list[info['pid']] = []
        pid2cid_list[info['pid']].append(cid)

    dump_serialization(pid2cid_list, cache_path)
    return pid2cid_list
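
# The loop above builds the inverted index by hand; an equivalent idiom with
# collections.defaultdict, assuming the same cid2info layout (sketch only):
def _invert_parent_map_sketch(cid2info):
    from collections import defaultdict
    pid2cid_list = defaultdict(list)
    for cid, info in cid2info.items():
        if info['pid'] != -1:
            pid2cid_list[info['pid']].append(cid)
    return dict(pid2cid_list)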
def process_pdf_lines(fname, page_num='all', do_adjust=False):
    """
    :param fname: file path to the PDF file
    :param page_num: page index to extract, default 'all' extracts every page
    :param do_adjust: adjust char bboxes by glyph ratio and page crop box
    :return: the chars of each line
    :rtype: list(list(LTChar))
    """
    # TODO, cache the information here?
    from pdfxml.path_util import get_tmp_path
    tmp_pdf_path = get_tmp_path(fname)
    pdf_lines_cache = "{}.pdf_line.{}.pkl".format(tmp_pdf_path, page_num)
    if os.path.isfile(pdf_lines_cache):
        return load_serialization(pdf_lines_cache)

    line_list = []
    char_list = []

    def collect_layout(l):
        """
        recursively collect all the chars, flushing char_list into
        line_list at the end of each horizontal text line
        """
        for e in l:
            if isinstance(e, LTTextLineHorizontal):
                collect_layout(e)
                line_list.append(copy.copy(char_list))
                while len(char_list) > 0:
                    char_list.pop()
            if isinstance(e, LTTextBoxHorizontal):
                collect_layout(e)
            if isinstance(e, (LTChar, LTAnno)):
                char_list.append(e)

    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # create a PDF page aggregator with default layout parameters
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for i, page in enumerate(PDFPage.create_pages(document)):
        if page_num == 'all' or page_num == i:
            interpreter.process_page(page)
            layout = device.get_result()
            collect_layout(layout)
        if page_num == i:
            break

    if do_adjust:
        for line in line_list:
            adjust_basedon_glyph_ratio(line, fname, page_num)
        # adjust based on the page crop bbox
        crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
        for line in line_list:
            for char in line:
                if isinstance(char, LTChar):
                    adjust_element_bbox(char, crop_bbox)

    dump_serialization(line_list, pdf_lines_cache)
    return line_list
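
# Usage sketch for process_pdf_lines (pdf_path supplied by the caller):
# extract the first page only and report the number of chars per line.
def _process_pdf_lines_demo(pdf_path):
    lines = process_pdf_lines(pdf_path, page_num=0)
    for line in lines:
        n_chars = len([c for c in line if isinstance(c, LTChar)])
        print n_chars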
def pdf_extract_lines(pdf_path, pid=0, force_single=False):
    """
    Each line is a list of LTChar.

    :param pdf_path:
    :param pid: page index
    :param force_single: treat the page as single-column even if double-column
    :return:
    """
    tmp_pdf_path = get_tmp_path(pdf_path)
    cache_path = "{}.pdfbox_merge_line.{}.pkl".format(tmp_pdf_path, pid)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)

    # TODO, do another round of line merging
    # still use our column line detection model to find the region.
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    word_info_list = pdf_extract_words(pdf_path, pid)

    if not force_single and is_double_column(pdf_path, pid):
        # split the lines into three parts: outside the double column,
        # within the left column, and within the right column
        page_size = get_pdf_page_size(pdf_path, pid)
        page_width = page_size['width']

        out_char_list_list = []
        left_char_list_list = []
        right_char_list_list = []
        from pdfxml.pdf_util.layout_util import get_char_list_bbox
        for char_list in char_list_list:
            bbox = get_char_list_bbox(char_list)
            if bbox.left() < bbox.right() < page_width / 2:
                left_char_list_list.append(char_list)
            elif bbox.right() > bbox.left() > page_width / 2:
                right_char_list_list.append(char_list)
            else:
                out_char_list_list.append(char_list)

        # before merging, do the word_info filtering
        word_info_list = word_info_filter(char_list_list, word_info_list)
        # merge each of the three parts independently
        new_out_char_list_list = merging_lines(
            out_char_list_list, fontname2space, word_info_list, pdf_path, pid)
        new_left_char_list_list = merging_lines(
            left_char_list_list, fontname2space, word_info_list, pdf_path, pid)
        new_right_char_list_list = merging_lines(
            right_char_list_list, fontname2space, word_info_list, pdf_path, pid)

        res_char_list_list = []
        res_char_list_list.extend(new_out_char_list_list)
        res_char_list_list.extend(new_left_char_list_list)
        res_char_list_list.extend(new_right_char_list_list)
    else:
        # before merging, do the word_info filtering
        word_info_list = word_info_filter(char_list_list, word_info_list)
        # single column, just merge the lines directly
        res_char_list_list = merging_lines(
            char_list_list, fontname2space, word_info_list, pdf_path, pid)

    dump_serialization(res_char_list_list, cache_path)
    return res_char_list_list
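
# The column classification above assigns a line to the left column only when
# its whole bbox lies left of the page midline, to the right column only when
# it lies entirely right of it, and to "out" otherwise (e.g. a line spanning
# both columns). The same rule on plain coordinates (sketch only):
def _classify_column_sketch(left, right, page_width):
    mid = page_width / 2.0
    if right < mid:
        return 'left'
    if left > mid:
        return 'right'
    return 'out'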
def get_char2extend_for_one(me_idx, debug=False):
    """
    Based on the horizontal grouping from construct_hierarchy, assess the
    correctness of each bbox and collect per-code extension ratios.

    :param me_idx: the idx of the ME
    :type me_idx: int
    :param debug:
    :return: a dict from the ratio type to a dict from char code to ratio list:
        'code2upper_ratio': use the height to adjust the upper bound
        'code2upper_ratio_hor': use the width to adjust the upper bound,
            because the error for a flat sign such as minus is hard to
            estimate from its height
        'code2lower_ratio': use the height to adjust the lower bound
        'code2lower_ratio_hor': use the width to adjust the lower bound
    """
    code2name = get_code2name()
    cached_path = "{}/char2extend_ratio/{}.pkl".format(infty_cdb_folder, me_idx)
    if not debug and os.path.isfile(cached_path):
        return load_serialization(cached_path)

    code2upper_ratio, code2upper_ratio_hor, \
        code2lower_ratio, code2lower_ratio_hor = {}, {}, {}, {}
    res = {
        'code2upper_ratio': code2upper_ratio,
        'code2upper_ratio_hor': code2upper_ratio_hor,
        'code2lower_ratio': code2lower_ratio,
        'code2lower_ratio_hor': code2lower_ratio_hor
    }

    def update_dict(d, k, v):
        if k not in d:
            d[k] = []
        d[k].append(v)

    # change to inftyCDBME
    try:
        struct_info = construct_hierarchy_by_me_idx(me_idx)
    except Exception as e:
        print "failed for me_idx {} {}".format(me_idx, str(e))
        return res

    cid2info = struct_info.cid2chars
    for group in struct_info.hor_groups:
        # each group is a list of cid
        if upper_exist(group, cid2info):
            upper_line = get_upper_line(group, cid2info)
            if debug:
                print 'ascender_line: {}'.format(upper_line)
                print_line(group, cid2info)
            for cid in group:
                r = get_upper_ratio(cid2info[cid], upper_line)
                r_hor = get_upper_ratio_hor(cid2info[cid], upper_line)
                update_dict(code2upper_ratio, cid2info[cid]['code'], r)
                update_dict(code2upper_ratio_hor, cid2info[cid]['code'], r_hor)
                if debug:
                    print cid, code2name[cid2info[cid]['code']], \
                        "code2upper_ratio", r, \
                        "code2upper_ratio_hor", r_hor
        if lower_exist(group, cid2info):
            lower_line = get_lower_line(group, cid2info)
            if debug:
                print 'descender_line: {}'.format(lower_line)
                print_line(group, cid2info)
            for cid in group:
                r = get_lower_ratio(cid2info[cid], lower_line)
                r_hor = get_lower_ratio_hor(cid2info[cid], lower_line)
                update_dict(code2lower_ratio, cid2info[cid]['code'], r)
                update_dict(code2lower_ratio_hor, cid2info[cid]['code'], r_hor)
                if debug:
                    print cid, code2name[cid2info[cid]['code']], \
                        "code2lower_ratio", r, \
                        "code2lower_ratio_hor", r_hor

    dump_serialization(res, cached_path)
    return res