def get_name2code_set():
    """
    name is the glyph name, code is the unique id for the pair of glyph name & font.
    One glyph name could have multiple codes in different fonts.

    :return: dict from glyph name to the set of codes
    """
    from pdfxml.path_util import SHARED_FOLDER
    #name2code_cache_path = "{}/tmp/name2code.pkl".format(infty_cdb_folder)
    name2code_cache_path = "{}/InftyCDB-1/cache_data/name2code.json".format(
        SHARED_FOLDER)
    if os.path.isfile(name2code_cache_path):
        name2code_list = load_general(name2code_cache_path)
        name2code_set = {}
        for name, code_list in name2code_list.items():
            name2code_set[name] = set(code_list)
        return name2code_set

    cid2info = load_char_map()
    name2code_set = {}
    for c in cid2info.values():
        if c['name'] not in name2code_set:
            name2code_set[c['name']] = set()
        name2code_set[c['name']].add(c['code'])

    # sets are not JSON serializable, so cache the mapping as lists
    name2code_list = {}
    for name, code_set in name2code_set.items():
        name2code_list[name] = list(code_set)
    dump_general(name2code_list, name2code_cache_path)
    return name2code_set

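# Usage sketch (not part of the original module): look up how many font-specific
# codes a glyph name maps to. The glyph name 'alpha' is only a hypothetical
# example; actual names depend on the InftyCDB-1 ground truth.
def _example_name2code_lookup(name='alpha'):
    name2code_set = get_name2code_set()
    if name in name2code_set:
        print("{} is assigned {} codes across fonts".format(
            name, len(name2code_set[name])))
    else:
        print("{} not found in InftyCDB-1".format(name))
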
def get_latex2adjustment_ratio_list():
    """
    Group the adjustment ratios by the LaTeX form of each glyph.

    :return: dict from adjustment type to a dict from LaTeX value to list of ratios
    """
    from pdfxml.InftyCDB.name2latex import name2latex
    cache_path = "{}/InftyCDB-1/cache_data/all_adjustment_ratio_list_latex.json".format(
        SHARED_FOLDER)
    if os.path.isfile(cache_path):
        return load_general(cache_path)

    code2name = get_code2name()
    gt_dict = {
        'code2upper_ratio': {},
        'code2upper_ratio_hor': {},
        'code2lower_ratio': {},
        'code2lower_ratio_hor': {}
    }
    total = unify_all_adjustment_ratio()
    for adjustment_type in total.keys():
        c2rlist = total[adjustment_type]
        for c, rlist in c2rlist.items():
            latex_val = name2latex[code2name[c]]
            if latex_val not in gt_dict[adjustment_type]:
                gt_dict[adjustment_type][latex_val] = []
            gt_dict[adjustment_type][latex_val].extend(rlist)
    dump_general(gt_dict, cache_path)
    return gt_dict

def unify_glyph_type_adjustment_ratio():
    """
    In comparison with `create_all_adjustment_ratio`, this is a marginalized version
    that maps each glyph type (rather than each code) to its adjustment ratios.

    :return: dict from adjustment type to a dict from glyph type to list of adjustment ratios
    """
    from pdfxml.path_util import PROJECT_FOLDER
    #cache_path = "{}/tmp/all_adjustment_ratio_glyph_type_level.pkl".format(infty_cdb_folder)
    cache_path = "{}/InftyCDB-1/cache_data/all_adjustment_ratio_glyph_type_level.json".format(
        SHARED_FOLDER)
    if os.path.isfile(cache_path):
        return load_general(cache_path)

    code2name = get_code2name()
    total = unify_all_adjustment_ratio()

    # gt is short for glyph_type
    # here the key is the glyph type instead of the code
    gt_dict = {
        'code2upper_ratio': {},
        'code2upper_ratio_hor': {},
        'code2lower_ratio': {},
        'code2lower_ratio_hor': {}
    }
    # NOTE: this is the grouping used to adjust by glyph type
    for adjustment_type in total.keys():
        c2rlist = total[adjustment_type]
        for c, rlist in c2rlist.items():
            gt = get_glyph_type(code2name[c])
            if gt not in gt_dict[adjustment_type]:
                gt_dict[adjustment_type][gt] = []
            gt_dict[adjustment_type][gt].extend(rlist)
    dump_general(gt_dict, cache_path)
    return gt_dict

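# Usage sketch (not part of the original module): summarize the glyph-type level
# adjustment ratios, e.g. report the mean upper ratio per glyph type. Assumes the
# InftyCDB-1 cache/ground-truth files referenced above are available; the same
# pattern works for get_latex2adjustment_ratio_list() keyed by LaTeX value.
def _example_mean_ratio_per_glyph_type():
    gt_dict = unify_glyph_type_adjustment_ratio()
    for gt, rlist in gt_dict['code2upper_ratio'].items():
        if rlist:
            print("{}: mean upper ratio {:.3f} over {} samples".format(
                gt, sum(rlist) / float(len(rlist)), len(rlist)))
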
def load_char_map(refresh=False):
    """
    Load the mathematical chars from the CSV file.
    NOTE: Dec. 10, later use the XLSX file, which has less noise.

    :param refresh: if True, rebuild the cache even when it already exists
    :return: the dict from the char id to char info
    """
    #cached_path = "{}/InftyCDB/cache_data/chars.json".format(PROJECT_FOLDER)
    cached_path = "{}/tmp/chars.json".format(infty_cdb_folder)
    test_folder_exist_for_file_path(cached_path)
    if os.path.isfile(cached_path) and not refresh:
        return load_general(cached_path)

    print('rebuild cache from the xlsx file')
    me_xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    me_elems = load_me_elems_xlsx(me_xlsx_path)
    cid2info = {}
    for me_elem in me_elems:
        cid2info[me_elem['cid']] = me_elem
    dump_general(cid2info, cached_path)
    return cid2info

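# Usage sketch (not part of the original module): count how many distinct glyph
# names appear in the char map. Assumes me.xlsx (or the cached chars.json) is
# available under the configured folders.
def _example_count_glyph_names():
    cid2info = load_char_map()
    names = set(info['name'] for info in cid2info.values())
    print("{} chars with {} distinct glyph names".format(len(cid2info), len(names)))
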
def create_all_adjustment_ratio():
    """
    Collect the adjustment ratios at the code level, the most fine-grained grouping.

    :return: dict from adjustment type to a dict from code to list of ratio values
    """
    cache_path = "{}/tmp/all_adjustment_ratio_code_level.json".format(
        infty_cdb_folder)
    if os.path.isfile(cache_path):
        return load_general(cache_path)

    total = {
        'code2upper_ratio': {},
        'code2upper_ratio_hor': {},
        'code2lower_ratio': {},
        'code2lower_ratio_hor': {}
    }

    def update(info_dict):
        for k in total.keys():
            c2rlist = info_dict[k]
            for c, rlist in c2rlist.items():
                # code to list of ratio values
                if c not in total[k]:
                    total[k][c] = []
                total[k][c].extend(rlist)

    me_idx_list = get_me_idx_list()
    #me_idx_list = get_me_idx_list_with_adjustment()
    for n, me_idx in enumerate(me_idx_list):
        if n % 100 == 0:
            print("done loading {} files".format(n))
        adjust_dict = get_char2extend_for_one(me_idx)
        update(adjust_dict)
    dump_general(total, cache_path)
    return total

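# Usage sketch (not part of the original module): inspect the code-level ratios,
# e.g. print how many codes and ratio samples were collected per adjustment type.
def _example_adjustment_ratio_counts():
    total = create_all_adjustment_ratio()
    for adjustment_type, c2rlist in total.items():
        n_samples = sum(len(rlist) for rlist in c2rlist.values())
        print("{}: {} codes, {} ratio samples".format(
            adjustment_type, len(c2rlist), n_samples))
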
def get_page_num(fpath):
    """
    Get the number of pages of the given PDF file.
    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    :param fpath: path to the PDF file
    :return: the number of pages
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']

    # Open the PDF file.
    fp = open(fpath, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Resolve the 'Pages' entry of the catalog to read the page count.
    c = resolve1(document.catalog['Pages'])['Count']
    fp.close()

    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)
    return c

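# Usage sketch (not part of the original module): the path below is only a
# placeholder; point it at an existing PDF to get (and cache) its page count.
def _example_page_num(pdf_path='/path/to/some.pdf'):
    print("{} has {} pages".format(pdf_path, get_page_num(pdf_path)))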