예제 #1
0
def get_name2code_set():
    """
    Build the mapping from a glyph name to the set of its codes.

    The code is the unique id for the pair of glyph name & font, so a single
    glyph name may own several codes, one for each font it shows up in.

    The result is cached in a JSON file as name -> list of codes and is
    turned back into name -> set of codes when it is loaded.

    :return: dict from the glyph name to the set of codes
    """
    from pdfxml.path_util import SHARED_FOLDER

    cache_file = "{}/InftyCDB-1/cache_data/name2code.json".format(
        SHARED_FOLDER)

    # fast path: the cache exists, only rebuild the sets from the lists
    if os.path.isfile(cache_file):
        cached = load_general(cache_file)
        return {name: set(codes) for name, codes in cached.items()}

    # slow path: group the code of every char by its glyph name
    name2code_set = {}
    for char_info in load_char_map().values():
        name2code_set.setdefault(
            char_info['name'], set()).add(char_info['code'])

    # the cache stores lists instead of sets
    # (presumably because sets can not be serialized to JSON)
    dump_general(
        {name: list(codes) for name, codes in name2code_set.items()},
        cache_file)

    return name2code_set
예제 #2
0
def get_latex2adjustment_ratio_list():
    """
    Regroup the code level adjustment ratios by LaTeX value.

    For each adjustment type returned by `unify_all_adjustment_ratio`, every
    code is translated to its glyph name and then to the LaTeX value of that
    name; the ratio lists of codes sharing a LaTeX value are concatenated.
    The result is cached as a JSON file.

    :return: dict from adjustment type to
        (dict from LaTeX value to list of adjustment ratio)
    """
    from pdfxml.InftyCDB.name2latex import name2latex

    cache_path = "{}/InftyCDB-1/cache_data/all_adjustment_ratio_list_latex.json".format(
        SHARED_FOLDER)
    if os.path.isfile(cache_path):
        return load_general(cache_path)

    code2name = get_code2name()

    # one empty bucket for each supported adjustment type
    ratio_types = (
        'code2upper_ratio',
        'code2upper_ratio_hor',
        'code2lower_ratio',
        'code2lower_ratio_hor',
    )
    latex_dict = {ratio_type: {} for ratio_type in ratio_types}

    for ratio_type, code2ratios in unify_all_adjustment_ratio().items():
        for code, ratios in code2ratios.items():
            latex_val = name2latex[code2name[code]]
            # several codes may share one LaTeX value, merge their ratios
            latex_dict[ratio_type].setdefault(latex_val, []).extend(ratios)

    dump_general(latex_dict, cache_path)
    return latex_dict
예제 #3
0
def unify_glyph_type_adjustment_ratio():
    """
    In comparison with `create_all_adjustment_ratio`,
    this is a marginal version with each code mapping to the glyph type.

    For each adjustment type returned by `unify_all_adjustment_ratio`, the
    ratio lists of all codes whose glyph name has the same glyph type are
    concatenated. The result is cached as a JSON file.

    :return: dict from adjustment type to
        (dict from glyph_type to list of adjustment ratio)
    """
    # The cache lives under SHARED_FOLDER, so that is the name to import
    # (the block used to import the unused PROJECT_FOLDER instead).
    from pdfxml.path_util import SHARED_FOLDER
    cache_path = "{}/InftyCDB-1/cache_data/all_adjustment_ratio_glyph_type_level.json".format(
        SHARED_FOLDER)

    if os.path.isfile(cache_path):
        return load_general(cache_path)

    code2name = get_code2name()

    total = unify_all_adjustment_ratio()
    # gt is short for glyph_type
    # the inner keys keep the "code2..." naming, but here they are glyph types
    gt_dict = {
        'code2upper_ratio': {},
        'code2upper_ratio_hor': {},
        'code2lower_ratio': {},
        'code2lower_ratio_hor': {}
    }

    # NOTE: This is what I am looking for to adjust by group
    for adjustment_type, c2rlist in total.items():
        for c, rlist in c2rlist.items():
            gt = get_glyph_type(code2name[c])
            # several codes may share one glyph type, merge their ratios
            gt_dict[adjustment_type].setdefault(gt, []).extend(rlist)

    dump_general(gt_dict, cache_path)
    return gt_dict
예제 #4
0
def load_char_map(refresh=False):
    """
    Load the mathematical chars, indexed by their char id.

    The chars are read from the `me.xlsx` file and the resulting dict is
    cached as JSON; later calls are served from that cache unless a refresh
    is requested.

    :param refresh: when true, ignore an existing cache and rebuild it
        from the xlsx file
    :return: the dict from the char id to char info
    """
    cached_path = "{}/tmp/chars.json".format(infty_cdb_folder)
    # presumably makes sure the folder of the cache file exists -- confirm
    test_folder_exist_for_file_path(cached_path)

    cache_usable = os.path.isfile(cached_path) and not refresh
    if cache_usable:
        return load_general(cached_path)

    print('rebuild cache from the xlsx file')
    me_xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    # for a repeated cid the last element wins
    cid2info = {
        elem['cid']: elem for elem in load_me_elems_xlsx(me_xlsx_path)}

    dump_general(cid2info, cached_path)
    return cid2info
예제 #5
0
def create_all_adjustment_ratio():
    """
    This is code level adjustment, which is a very refined level.

    The per-expression result of `get_char2extend_for_one` is merged over
    every index returned by `get_me_idx_list`: for each adjustment type, the
    ratio lists of the same code are concatenated. The merged result is
    cached as a JSON file.

    :return: dict from adjustment type to
        (dict from code to list of adjustment ratio)
    """
    cache_path = "{}/tmp/all_adjustment_ratio_code_level.json".format(
        infty_cdb_folder)
    if os.path.isfile(cache_path):
        return load_general(cache_path)

    total = {
        'code2upper_ratio': {},
        'code2upper_ratio_hor': {},
        'code2lower_ratio': {},
        'code2lower_ratio_hor': {}
    }

    def update(info_dict):
        # merge the ratios of one expression into the running total
        for k in total.keys():
            c2rlist = info_dict[k]
            for c, rlist in c2rlist.items():  # code to list of ratio values
                total[k].setdefault(c, []).extend(rlist)

    me_idx_list = get_me_idx_list()

    for n, me_idx in enumerate(me_idx_list):
        if n % 100 == 0:
            # parenthesised form prints the same text on Python 2 and 3
            print("done loading {} files".format(n))
        adjust_dict = get_char2extend_for_one(me_idx)
        update(adjust_dict)

    dump_general(total, cache_path)
    return total
예제 #6
0
def get_page_num(fpath):
    """ Get the number of pages of the pdf file
    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    The count is read from the `Count` entry of the `Pages` node in the
    document catalog and cached in a small JSON file next to the tmp path
    of the pdf, so the pdf is only parsed on the first call.

    :param fpath: path of the pdf file
    :return: the value of the `Count` entry, i.e. the number of pages
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']

    # Open the PDF file; the `with` block closes it again even when the
    # parsing fails (the handle used to be leaked).
    with open(fpath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)

        # Resolve the catalog entry while the file is still open, since the
        # parser may need to read from it.
        c = resolve1(document.catalog['Pages'])['Count']

    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)

    return c