Пример #1
def get_me_idx_list():
    """
    Get all the me_idx values, no matter whether they are runnable or not.

    The result is cached under ``infty_cdb_folder``; when the cache file
    exists it is returned directly and the workbook is not opened.

    :return: de-duplicated me_idx values (the order is not defined)
    :rtype: list[int]
    """
    cache_path = "{}/tmp/me_idx_list.pkl".format(infty_cdb_folder)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    me_xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    wb = xlrd.open_workbook(me_xlsx_path)
    ws = wb.sheet_by_index(0)

    # The me_idx is read from column index 20 of every row of the first
    # sheet. The same me_idx occurs on many rows, so collect into a set.
    # NOTE(review): every row is converted with int(); if the sheet has a
    # header row this raises -- confirm against the actual workbook.
    me_idx_set = set()
    for r_idx in range(ws.nrows):
        me_idx_set.add(int(ws.row(r_idx)[20].value))

    me_idx_list = list(me_idx_set)
    dump_serialization(me_idx_list, cache_path)
    return me_idx_list
Пример #2
def batch_split_chars_by_me_idx():
    """
    Write the chars of every ME into its own cache file.

    All chars are loaded once through get_me_idx2chars(); each me_idx whose
    cache file (see get_cached_chars_path_by_me_idx) does not exist yet gets
    its chars dumped there. Existing cache files are left untouched.
    """
    # NOTE(review): imported but not called in the visible code; presumably
    # a call preparing the parent folder of cached_path was lost -- confirm.
    from pdfxml.file_util import test_folder_exist_for_file_path
    me_idx2chars = get_me_idx2chars()
    # load all first, then split per me_idx
    for me_idx, chars in me_idx2chars.items():
        cached_path = get_cached_chars_path_by_me_idx(me_idx)
        if os.path.isfile(cached_path):
            # already written by an earlier run
            continue
        dump_serialization(chars, cached_path)
Пример #3
def get_one_ltchar():
    """
    Return one sample char taken from the NIPS_2016_6202 PDF.

    The first element of the list returned by
    ``process_pdf_internal(pdf_path, 0)`` is cached as ``one_ltchar.pkl``
    under ME_RESOURCE_FOLDER and served from there on later calls.

    :return: the first element of the extracted char list
    """
    from pdfxml.pdf_util.pdf_extract import process_pdf_internal
    from pdfxml.path_util import ME_RESOURCE_FOLDER
    pickle_path = "{}/one_ltchar.pkl".format(ME_RESOURCE_FOLDER)
    if not os.path.isfile(pickle_path):
        # cache miss: extract from the PDF and remember the first char
        pdf_stem = "{}/NIPS_2016_6202".format(ME_RESOURCE_FOLDER)
        extracted = process_pdf_internal("{}.pdf".format(pdf_stem), 0)
        first_char = extracted[0]
        dump_serialization(first_char, pickle_path)
        return first_char
    return load_serialization(pickle_path)
Пример #4
def get_all_me_idx_list():
    """
    Return the list of me_idx to run experiments with.

    The list is cached under ``infty_cdb_tmp_folder``; on a cache miss it is
    rebuilt from the elements loaded out of me.xlsx.

    :return: de-duplicated me_idx values (the order is not defined)
    :rtype: list
    """
    cache_path = "{}/all_me_idx_list.pkl".format(infty_cdb_tmp_folder)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)
    xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    elem_list = load_me_elems_xlsx(xlsx_path)
    # NOTE(review): the loop body was missing here. Collecting
    # elem['me_idx'] matches how get_cid2me_idx reads the same element
    # list -- confirm that no extra filter was intended.
    me_idx_list = list({elem['me_idx'] for elem in elem_list})
    dump_serialization(me_idx_list, cache_path)
    return me_idx_list
Пример #5
def get_l1_dist(rel_id, fea_name):
    """
    Build (or load from cache) the distribution of one feature for a single
    relation.

    :param rel_id: relation id; passed to get_l1_samples and, wrapped in a
        one-element list, to get_dist_cache_path
    :param fea_name: name of the feature
    :return: the cached object if the cache file exists, otherwise a new
        SampleDist built from get_l1_samples(rel_id, fea_name)
    """
    cache_path = get_dist_cache_path([rel_id], fea_name)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    dist = SampleDist(get_l1_samples(rel_id, fea_name))
    dump_serialization(dist, cache_path)
    return dist
Пример #6
def get_cid2me_idx():
    """
    Map every char id to the me_idx it belongs to.

    The mapping is cached under ``infty_cdb_tmp_folder``; on a cache miss it
    is rebuilt from the elements loaded out of me.xlsx.

    :return: dict from ``elem['cid']`` to ``elem['me_idx']``
    :rtype: dict
    """
    cache = "{}/cid2me_idx.pkl".format(infty_cdb_tmp_folder)
    if os.path.isfile(cache):
        return load_serialization(cache)
    me_xlsx_path = "{}/InftyCDB-1/resources/me.xlsx".format(SHARED_FOLDER)
    # a cid that occurs more than once keeps the me_idx of its last element
    cid2me_idx = {
        elem['cid']: elem['me_idx']
        for elem in load_me_elems_xlsx(me_xlsx_path)
    }
    dump_serialization(cid2me_idx, cache)
    return cid2me_idx
Пример #7
def get_nycd_dist(rel_id_list, refresh=False, sample_num=me_layout_config.default_sample_num):
    """
    Build (or load from cache) the 'nycd_dist' distribution of a relation
    chain.

    A single relation is delegated to get_l1_nycd_dist. A longer chain is
    split in the middle; ``sample_num`` samples are drawn from the nycd
    distribution of each half and from the hr distribution of the first half,
    and every combination is folded as ``nycd2 * hr1 + nycd1``.

    :param rel_id_list: non-empty list of relation ids
    :param refresh: when True, ignore an existing cache file and recompute
    :param sample_num: number of samples drawn from each sub-distribution
    :return: the distribution for the whole chain
    :raises Exception: if rel_id_list is empty
    """
    cache_path = get_dist_cache_path(
        rel_id_list, 'nycd_dist', sample_num=sample_num)
    if os.path.isfile(cache_path) and not refresh:
        return load_serialization(cache_path)

    print("try to calculate {}".format(cache_path))
    if len(rel_id_list) == 0:
        raise Exception("Should not be here")
    elif len(rel_id_list) == 1:
        dist = get_l1_nycd_dist(rel_id_list[0])
    else:
        mid = len(rel_id_list) // 2
        rel_list1 = rel_id_list[0:mid]
        rel_list2 = rel_id_list[mid:]

        # NOTE(review): the sub-distributions are requested with the default
        # refresh / sample_num, exactly as in the original code -- confirm
        # whether the caller's values should be propagated.
        nycd_dist_1 = get_nycd_dist(rel_list1)
        nycd_dist_2 = get_nycd_dist(rel_list2)
        hr_dist_1 = get_hr_dist(rel_list1)

        nycd_samples_1 = nycd_dist_1.gen_random(sample_num)
        nycd_samples_2 = nycd_dist_2.gen_random(sample_num)
        hr_samples_1 = hr_dist_1.gen_random(sample_num)

        # first do nycd2 * hr1, then add nycd1.
        # The product iterates over every (nycd2, hr1, nycd1) combination,
        # i.e. the number of samples grows with the cube of sample_num.
        samples = [
            nycd2 * hr1 + nycd1
            for nycd2, hr1, nycd1 in itertools.product(
                nycd_samples_2, hr_samples_1, nycd_samples_1)
        ]
        dist = SampleDist(samples)

    dump_serialization(dist, cache_path)
    return dist
Пример #8
def get_hr_dist(rel_id_list, refresh=False, sample_num=me_layout_config.default_sample_num, debug=False):
    """
    Build (or load from cache) the 'hr_dist' distribution of a relation
    chain.

    A single relation is delegated to get_l1_hr_dist. A longer chain is
    split in the middle; ``sample_num`` samples are drawn from the
    distribution of each half and every pair of samples is multiplied.

    :param rel_id_list: non-empty list of relation ids
    :param refresh: when True, ignore an existing cache file and recompute
    :param sample_num: number of samples drawn from each half
    :param debug: when True, print the cache path on a cache hit
    :return: the distribution for the whole chain
    :raises Exception: if rel_id_list is empty
    """
    cache_path = get_dist_cache_path(
        rel_id_list, 'hr_dist', sample_num=sample_num)

    if os.path.isfile(cache_path) and not refresh:
        if debug:
            print(cache_path)
        return load_serialization(cache_path)

    print("try to calculate {}".format(cache_path))

    if len(rel_id_list) == 0:
        raise Exception("TODO")
    elif len(rel_id_list) == 1:
        dist = get_l1_hr_dist(rel_id_list[0])
    else:
        mid = len(rel_id_list) // 2
        # NOTE, there should be symbol overlapping of the two intervals
        # but at the relation level, there is no overlapping
        # NOTE(review): both halves are requested with the default
        # refresh / sample_num, exactly as in the original code.
        dist1 = get_hr_dist(rel_id_list[0:mid])
        dist2 = get_hr_dist(rel_id_list[mid:])

        print("done dist1 and dist2")

        sample1 = dist1.gen_random(sample_num)
        sample2 = dist2.gen_random(sample_num)

        print("done gen samples")

        # one product per (sample1, sample2) pair: sample_num ** 2 values
        samples = [v1 * v2 for v1, v2 in itertools.product(sample1, sample2)]
        dist = SampleDist(samples)

        print("done create new dist")

    dump_serialization(dist, cache_path)
    return dist
Пример #9
def get_fea_val_by_fea_name(fea_name):
    # Purpose: for the feature function registered under fea_name in
    # fea_name2fea_func, collect feature values per relation type over all
    # parent/child pairs from prepare_parent_child_relation_list_batch(),
    # caching the resulting dict under infty_cdb_folder/fea_val/.
    #
    # NOTE(review): this block is not valid Python as it stands; text was
    # lost when the file was extracted. Known gaps, to be restored from the
    # original source:
    #   * the docstring lines below have lost their triple-quote delimiters;
    #   * the rel2fea_val_list dict literal is never closed;
    #   * the `if not pr['relation'] in [...]` guard has no body
    #     (presumably `continue`);
    #   * both adjust_bbox_h_gt_name( calls are cut after the first argument;
    #   * fea_val is computed but never stored in the visible code.

    :param fea_name:
    :type fea_name: basestring
    :return: dict from relative position type to list of feature values
    :rtype: dict[string_wx, list[float]]

    # fail early on an unknown feature name
    assert (fea_name in fea_name2fea_func)
    fea_func = fea_name2fea_func[fea_name]

    # one cache file per feature name
    cache_path = "{}/fea_val/{}.pkl".format(infty_cdb_folder, fea_name)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    rel_list = prepare_parent_child_relation_list_batch()
    # REV_* keys hold the feature evaluated with parent and child swapped
    rel2fea_val_list = {
        "HORIZONTAL": [],
        "RSUP": [],
        "RSUB": [],
        "REV_RSUB": [],
        "REV_RSUP": []

    for pr in rel_list:
        # only these three relation types are of interest
        if not pr['relation'] in ["HORIZONTAL", "RSUP", "RSUB"]:

        # adjust bbox here
        p_bbox = adjust_bbox_h_gt_name(pr['pinfo']['bbox'],

        c_bbox = adjust_bbox_h_gt_name(pr['cinfo']['bbox'],

        fea_val = fea_func(p_bbox, c_bbox)

        if pr['relation'] in ["RSUP", "RSUB"]:
            # calculate the feature value here
            # (arguments swapped: child first, then parent)
            rev_fea_val = fea_func(c_bbox, p_bbox)
            rel2fea_val_list["REV_" + pr['relation']].append(rev_fea_val)

    dump_serialization(rel2fea_val_list, cache_path)
    return rel2fea_val_list
Пример #10
def prepare_parent_child_relation_list_batch():
    """
    Merge the parent/child relation triples of all MEs into one list.

    The me_idx values are taken from the file names found in the
    ``crop_chars`` folder; the merged list is cached under
    ``infty_cdb_folder``.

    :return: list of dict[parent, children, relation]
    :rtype: list
    """
    cache_path = "{}/tmp/hor_sub_sup_alphanumeric.pkl".format(infty_cdb_folder)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    res_list = []
    # get all me_idx here
    chars_folder = "{}/crop_chars".format(infty_cdb_folder)
    for fname in os.listdir(chars_folder):
        # me_idx is the text between the last "_" and the final four
        # characters of the file name (the code strips exactly four)
        me_idx = int(fname[fname.rindex("_") + 1:-4])
        pair_list = prepare_parent_child_relation_list_one_me(me_idx)
        print("{} {}".format(me_idx, len(pair_list)))
        # NOTE(review): the merge step was missing here; extending is what
        # "merge all the triples" and the returned list imply -- confirm.
        res_list.extend(pair_list)

    dump_serialization(res_list, cache_path)
    return res_list
Пример #11
def fname2shape():
    """
    Pre-calculate and store the shape of every image file, to avoid the
    overhead of reading the images again later.

    Need this because the vertical coordinate is reversed.

    :return: dict from image file name to the ``shape`` of the image as
        loaded by imread
    :rtype: dict
    """
    cache_path = "{}/tmp/im_shape.pkl".format(infty_cdb_folder)

    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    img_folder = "{}/InftyCDB-1/Images".format(SHARED_FOLDER)
    im2shape = {}
    for fname in os.listdir(img_folder):
        if not fname.endswith("png"):
            # skip everything that is not a PNG image
            continue
        fpath = "{}/{}".format(img_folder, fname)
        im2shape[fname] = imread(fpath).shape
    dump_serialization(im2shape, cache_path)
    return im2shape
Пример #12
def get_pid2cid_list():
    """
    Group the char ids by the id of their parent char.

    :return: map from parent char id to list of children
    :rtype: dict
    """
    # NOTE(review): the cache path is relative, so the cache file location
    # depends on the current working directory.
    cache_path = 'pid2cid_list.pkl'
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    cid2info = load_char_map()
    pid2cid_list = {}
    for cid, info in cid2info.items():
        pid = info['pid']
        if pid == -1:
            # pid -1 is skipped (presumably "no parent" -- confirm)
            continue
        # NOTE(review): the append was missing here; it is what the
        # documented return value implies.
        pid2cid_list.setdefault(pid, []).append(cid)

    dump_serialization(pid2cid_list, cache_path)

    return pid2cid_list
Пример #13
def process_pdf_lines(fname, page_num='all', do_adjust=False):
    # Purpose: extract the chars of a PDF grouped into lines, optionally
    # adjust them, shift them by the page crop bbox, and cache the result in
    # "<tmp path>.pdf_line.<page_num>.pkl".
    #
    # NOTE(review): this block is not valid Python as it stands; text was
    # lost when the file was extracted. Known gaps, to be restored from the
    # original source:
    #   * the docstring lines below have lost their triple-quote delimiters,
    #     and the docstring of print_layout is opened but never closed;
    #   * every branch of print_layout has lost its body, so nothing visible
    #     fills line_list / char_list;
    #   * in the page loop nothing visible feeds the page to the interpreter
    #     before device.get_result(), and `layout` is never used;
    #   * `if page_num == i:` has no body (presumably `break`).
    # NOTE(review): fp is opened but no close() is visible -- confirm.

    :param fname: file path to the PDF file
    :param page_num: default to extract all
    :rtype: list(list(LTChar))
    # TODO, cache the informatin here?
    from pdfxml.path_util import get_tmp_path
    tmp_pdf_path = get_tmp_path(fname)

    # one cache file per (pdf, page_num) combination
    pdf_lines_cache = "{}.pdf_line.{}.pkl".format(tmp_pdf_path, page_num)
    if os.path.isfile(pdf_lines_cache):
        return load_serialization(pdf_lines_cache)

    # accumulators shared with the nested print_layout helper
    line_list = []
    char_list = []
    def print_layout(l):
        """ get all the chars
        for e in l:
            if isinstance(e, LTTextLineHorizontal):
                #print "try recursively text line"
                while len(char_list) > 0:

            if isinstance(e, LTTextBoxHorizontal):
                #print "try recursively text box"

            if isinstance(e, LTChar) or isinstance(e, LTAnno):

    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    # this device/interpreter pair is replaced by the aggregator pair below
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for i, page in enumerate(PDFPage.create_pages(document)):
        # page_num is either the string 'all' or a 0-based page index
        process_mark = (page_num == 'all' or page_num == i)
        if process_mark:
            layout = device.get_result()

        if page_num == i:

    if do_adjust:
        for line in line_list:
            adjust_basedon_glyph_ratio(line, fname, page_num)

    # adjust based on crop bbox
    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    for line in line_list:
        for char in line:
            # only LTChar elements are adjusted; others are left as they are
            if isinstance(char, LTChar):
                adjust_element_bbox(char, crop_bbox)

    dump_serialization(line_list, pdf_lines_cache)
    return line_list
Пример #14
def pdf_extract_lines(pdf_path, pid=0, force_single=False):
    # Purpose: take the raw lines of page `pid` from pdf_extract_lines_raw,
    # run merging_lines over them (per column when the page is detected as
    # double-column and force_single is False) and cache the result in
    # "<tmp path>.pdfbox_merge_line.<pid>.pkl".
    #
    # NOTE(review): this block is not valid Python as it stands; text was
    # lost when the file was extracted. Known gaps, to be restored from the
    # original source:
    #   * the docstring lines below have lost their triple-quote delimiters;
    #   * the if / elif that sorts each line into the left / right / outside
    #     buckets have no bodies (and a trailing else is presumably missing);
    #   * the third merging_lines call has one argument fewer than the other
    #     calls (no fontname2space);
    #   * after `char_list_list = []` nothing visible concatenates the three
    #     merged lists;
    #   * the `else:` separating the double-column branch from the
    #     single-column code is missing.
    each line is a list of LTChar

    :param pdf_path:
    :param pid:
    tmp_pdf_path = get_tmp_path(pdf_path)
    cache_path = "{}.pdfbox_merge_line.{}.pkl".format(tmp_pdf_path, pid)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)

    # TODO, do another round of line merging
    # still use our column line detection model to find the region.
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    word_info_list = pdf_extract_words(pdf_path, pid)

    res_char_list_list = []
    if not force_single and is_double_column(pdf_path, pid):
        # split the current list into three parts
        # detect the center split, create two column
        # outside of the double column,
        # within the double column
        page_size = get_pdf_page_size(pdf_path, pid)
        page_width = page_size['width']

        out_char_list_list = []
        left_char_list_list = []
        right_char_list_list = []
        from pdfxml.pdf_util.layout_util import get_char_list_bbox
        for char_list in char_list_list:
            bbox = get_char_list_bbox(char_list)
            # first test: the whole bbox lies left of the page's vertical
            # center; second test: the whole bbox lies right of it
            if bbox.left() < bbox.right() < page_width / 2:
            elif bbox.right() > bbox.left() > page_width / 2:

        # before mering do the word_info_filter
        word_info_list = word_info_filter(char_list_list, word_info_list)

        new_out_char_list_list = merging_lines(out_char_list_list,
                                               fontname2space, word_info_list,
                                               pdf_path, pid)
        new_left_char_list_list = merging_lines(left_char_list_list,
                                                fontname2space, word_info_list,
                                                pdf_path, pid)
        new_right_char_list_list = merging_lines(right_char_list_list,
                                                 word_info_list, pdf_path, pid)

        # not in the vertical range of the double dolumn
        # center on the left part,
        # center on the right part,
        char_list_list = []

        res_char_list_list = char_list_list
        # before mering do the word_info_filter
        word_info_list = word_info_filter(char_list_list, word_info_list)

        # single column, then just go on merging the lines
        new_char_list_list = merging_lines(char_list_list, fontname2space,
                                           word_info_list, pdf_path, pid)
        res_char_list_list = new_char_list_list
    dump_serialization(res_char_list_list, cache_path)
    return res_char_list_list
Пример #15
def get_char2extend_for_one(me_idx, debug=False):
    """
    Based on the horizontal grouping from construct_hierarchy, try to make
    an assessment on the correction of the bbox.

    For every horizontal group that has an upper (resp. lower) line, the
    upper (resp. lower) ratios of each char in the group are collected per
    char code. The result is cached per me_idx; with ``debug`` set the cache
    is not read, but it is still rewritten at the end.

    :param me_idx: the idx of ME
    :type me_idx: int
    :param debug: when True, bypass the cache lookup and print the
        intermediate values
    :return: a dict of four dicts, each from char code to a list of ratios:
        'code2upper_ratio': use height to adjust upper
        'code2upper_ratio_hor': use the width to adjust upper, because the
            error rate for the flat sign such as minus is hard to estimate.
        'code2lower_ratio': use height to adjust lower
        'code2lower_ratio_hor': use the width to adjust lower
        All four are empty (and nothing is cached) when the hierarchy of the
        ME cannot be constructed.
    :rtype: dict
    """
    code2name = get_code2name()

    cached_path = "{}/char2extend_ratio/{}.pkl".format(infty_cdb_folder,
                                                       me_idx)
    if not debug and os.path.isfile(cached_path):
        # if not debug and the file exist
        return load_serialization(cached_path)

    code2upper_ratio, code2upper_ratio_hor, \
        code2lower_ratio, code2lower_ratio_hor = {}, {}, {}, {}
    res = {
        'code2upper_ratio': code2upper_ratio,
        'code2upper_ratio_hor': code2upper_ratio_hor,
        'code2lower_ratio': code2lower_ratio,
        'code2lower_ratio_hor': code2lower_ratio_hor
    }

    def update_dict(d, k, v):
        # append v to the list stored under k, creating the list on first use
        d.setdefault(k, []).append(v)

    # change to inftyCDBME
    try:
        struct_info = construct_hierarchy_by_me_idx(me_idx)
    except Exception as e:
        # best effort: report the failure and return the empty result
        print("failed for me_idx {} {}".format(me_idx, str(e)))
        return res

    cid2info = struct_info.cid2chars
    for group in struct_info.hor_groups:

        # each group is a list of cid
        if upper_exist(group, cid2info):
            upper_line = get_upper_line(group, cid2info)
            if debug:
                print('ascender_line: {}'.format(upper_line))
                print_line(group, cid2info)

            for cid in group:
                info = cid2info[cid]
                r = get_upper_ratio(info, upper_line)
                r_hor = get_upper_ratio_hor(info, upper_line)
                update_dict(code2upper_ratio, info['code'], r)
                update_dict(code2upper_ratio_hor, info['code'], r_hor)
                if debug:
                    print("{} {} code2upper_ratio {} "
                          "code2upper_ratio_hor {}".format(
                              cid, code2name[info['code']], r, r_hor))

        if lower_exist(group, cid2info):
            lower_line = get_lower_line(group, cid2info)
            if debug:
                print('descender_line: {}'.format(lower_line))
                print_line(group, cid2info)

            for cid in group:
                info = cid2info[cid]
                r = get_lower_ratio(info, lower_line)
                r_hor = get_lower_ratio_hor(info, lower_line)
                update_dict(code2lower_ratio, info['code'], r)
                update_dict(code2lower_ratio_hor, info['code'], r_hor)
                if debug:
                    print("{} {} code2lower_ratio {} "
                          "code2lower_ratio_hor {}".format(
                              cid, code2name[info['code']], r, r_hor))

    dump_serialization(res, cached_path)
    return res