def is_new_matching(self, nscs, pid):
    """
    "new" here means the sequence was not recognized earlier.
    The matching only concerns the char values and
    should not overlap with an existing ME.

    :param nscs: list of LTChar sorted by the left bound
    :return:
    """
    # skip the punctuation
    s = char_list2str(nscs)
    ignore_single_char_list = ['(', ')', '[', ']', '{', '}', 'a', 'A']
    if s.strip() in latex_punct_list or s in ignore_single_char_list:
        # 'a' and 'A' are the only single-letter sequences that could be matched out
        return False
    nscs_bbox = char_list2bbox(nscs)
    if self.pid2it2d[pid].exist_overlap(nscs_bbox):
        return False
    print("It's very inefficient now")
    for me_nscs in self.pid2me_char_list_list[pid]:
        if is_subsequence(nscs, me_nscs, char_same):
            return True
    return False
def group_line_into_words(line, check_nltk=False):
    """
    Group the chars of a line into words, splitting on LTAnno separators.

    :param line: list of LTChar or LTAnno
    :return: list of list of LTChar
    """
    word_list = []
    word_char_list = []
    for c in line:
        if isinstance(c, LTAnno):
            if len(word_char_list) > 0:
                tmp_str = char_list2str(word_char_list)
                if check_nltk:
                    if tmp_str in word_set:
                        word_list.append(word_char_list)
                    else:
                        # ignore the current one
                        word_list.append([])
                else:
                    word_list.append(word_char_list)
                word_char_list = []
        elif isinstance(c, LTChar):
            word_char_list.append(c)
        else:
            raise Exception('unknown char type')
    if len(word_char_list) > 0:
        word_list.append(word_char_list)
    # each word is a list of chars
    return word_list
def get_char_dist_est_line(line):
    """
    Get the distances between consecutive chars within and between words.

    :param line: list of LTChar or LTAnno
    :return: (within_dist_list, between_dist_list)
    """
    within_dist_list = []
    between_dist_list = []
    word_list = group_line_into_words(line)
    word_str_list = []
    for word in word_list:
        word_str_list.append(char_list2str(word))
    for i in range(len(word_list) - 1):
        if len(word_list[i + 1]) == 0 or len(word_list[i]) == 0:
            continue
        between_dist = word_list[i + 1][0].bbox[0] - word_list[i][-1].bbox[2]
        if between_dist < -10:
            continue
        between_dist_list.append(between_dist)
    for word in word_list:
        for i in range(len(word) - 1):
            within_dist = word[i + 1].bbox[0] - word[i].bbox[2]
            if within_dist < -10:
                continue
            within_dist_list.append(within_dist)
    return within_dist_list, between_dist_list
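
# A minimal sketch of one way the two distance lists above could be turned
# into a word-gap threshold, e.g. the midpoint between the mean within-word
# gap and the mean between-word gap. `estimate_gap_threshold` is a
# hypothetical helper, not part of the original pipeline.
def estimate_gap_threshold(within_dist_list, between_dist_list):
    if not within_dist_list or not between_dist_list:
        return None  # not enough evidence on this line
    within_mean = sum(within_dist_list) / float(len(within_dist_list))
    between_mean = sum(between_dist_list) / float(len(between_dist_list))
    return (within_mean + between_mean) / 2.0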
def re_group_char_list_merge_isolated_digit(char_list_in, font):
    char_list_list = get_char_list_list(char_list_in)
    word_list = []
    for word_char_list in char_list_list:
        word_list.append(char_list2str(word_char_list))

    if MERGE_ISOLATED_DIGIT:
        while True:
            merge_pos = None
            for i in range(len(word_list) - 1):
                next_first_latex_val = get_latex_val_of_lt_char(
                    char_list_list[i + 1][0], font)
                prev_last_latex_val = get_latex_val_of_lt_char(
                    char_list_list[i][-1], font)
                if word_list[i].isdigit() and \
                        (is_bin_op_latex_val(next_first_latex_val) or
                         is_rel_latex_val(next_first_latex_val)):
                    merge_pos = i
                    break
                elif word_list[i + 1].isdigit() and \
                        (is_bin_op_latex_val(prev_last_latex_val) or
                         is_rel_latex_val(prev_last_latex_val)):
                    merge_pos = i
                    break
            if merge_pos is None:
                break

            # merge the word at merge_pos with the following word
            new_char_list_list = []
            i = 0
            while i < len(char_list_list):
                if i == merge_pos:
                    new_char_list_list.append(char_list_list[i])
                    new_char_list_list[-1].extend(char_list_list[i + 1])
                    i += 2  # skip the word that was just merged in
                else:
                    new_char_list_list.append(char_list_list[i])
                    i += 1
            char_list_list = new_char_list_list
            word_list = []
            for word_char_list in char_list_list:
                word_list.append(char_list2str(word_char_list))
    return char_list_list2char_list(char_list_list)
def re_group_char_list_split(char_list_in):
    """
    Split words with an unmatched parenthesis.

    :param char_list_in:
    :return:
    """
    char_list_list = get_char_list_list(char_list_in)
    word_list = []
    for word_char_list in char_list_list:
        word_list.append(char_list2str(word_char_list))

    if SPLIT_UNPAIRED_PARENTHESIS:
        while True:
            do_split = False
            new_char_list_list = []
            for i in range(len(char_list_list)):
                if word_list[i].count('(') == 1 and \
                        word_list[i].count(')') == 0 and \
                        word_list[i][0] == '(' and \
                        len(word_list[i]) > 1:
                    # leading '(' with no matching ')': split it off
                    new_char_list_list.append([char_list_list[i][0]])
                    new_char_list_list.append(char_list_list[i][1:])
                    do_split = True
                elif word_list[i].count('(') == 0 and \
                        word_list[i].count(')') == 1 and \
                        word_list[i][-1] == ')' and \
                        len(word_list[i]) > 1:
                    # trailing ')' with no matching '(': split it off
                    new_char_list_list.append(char_list_list[i][0:-1])
                    new_char_list_list.append([char_list_list[i][-1]])
                    do_split = True
                else:
                    new_char_list_list.append(char_list_list[i])
            if do_split:
                char_list_list = new_char_list_list
                word_list = []
                for word_char_list in char_list_list:
                    word_list.append(char_list2str(word_char_list))
            else:
                break
    return char_list_list2char_list(char_list_list)
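
# A minimal string-level sketch (plain strings instead of LTChar lists, not
# part of the original code) of the unmatched-parenthesis split rule above:
# a leading '(' or a trailing ')' with no partner in the same word is
# separated into its own token.
def split_unpaired_parenthesis_str(words):
    out = []
    for w in words:
        if len(w) > 1 and w[0] == '(' and w.count('(') == 1 and w.count(')') == 0:
            out.extend(['(', w[1:]])
        elif len(w) > 1 and w[-1] == ')' and w.count(')') == 1 and w.count('(') == 0:
            out.extend([w[:-1], ')'])
        else:
            out.append(w)
    return out

# e.g. split_unpaired_parenthesis_str(['(Section', '3'])  ->  ['(', 'Section', '3']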
def re_group_char_list_merge_unmatched_parenthesis(char_list_in, font):
    """
    Merge a word that opens a parenthesis with the following word that
    closes it.

    :param char_list_in:
    :param font:
    :return:
    """
    char_list_list = get_char_list_list(char_list_in)
    word_list = []
    for word_char_list in char_list_list:
        word_list.append(char_list2str(word_char_list))

    while True:
        merge_unmatched = False
        new_char_list_list = []
        i = 0
        while i < len(char_list_list):
            if i + 1 < len(char_list_list) and \
                    word_list[i].count('(') == 1 and \
                    word_list[i].count(')') == 0 and \
                    word_list[i + 1].count(')') == 1:
                new_char_list_list.append(char_list_list[i])
                new_char_list_list[-1].extend(char_list_list[i + 1])
                i += 2
                merge_unmatched = True
            else:
                new_char_list_list.append(char_list_list[i])
                i += 1
        if not merge_unmatched:
            break
        else:
            char_list_list = new_char_list_list
            word_list = []
            for word_char_list in char_list_list:
                word_list.append(char_list2str(word_char_list))
    return char_list_list2char_list(char_list_list)
def re_group_char_list(char_list_in, font, debug=False):
    """
    Remove or insert LTAnno when necessary.

    :param char_list_in:
    :return:
    """
    if debug:
        print "Input ", char_list2str(char_list_in)

    # regroup the unmatched parenthesis
    # do this first, otherwise it might be split again by the split func
    char_list_in = re_group_char_list_merge_unmatched_parenthesis(
        char_list_in, font)
    if debug:
        print "After parenthesis", char_list2str(char_list_in)

    char_list_in = re_group_ending_punct(char_list_in, font)
    if debug:
        print "After Punct", char_list2str(char_list_in)

    # merge isolated digits based on the neighboring binary operator
    char_list_in = re_group_char_list_merge_isolated_digit(char_list_in, font)
    if debug:
        print "After isolated digit", char_list2str(char_list_in)

    char_list_in = re_group_char_list_split(char_list_in)
    if debug:
        print "After split", char_list2str(char_list_in)

    # merge over-split capitalized words
    char_list_in = re_group_merge_cap(char_list_in)
    if debug:
        print "After cap", char_list2str(char_list_in)

    # TODO, binary relation/operator merger
    return char_list_in
def merging_merge_one_line(char_list_line, word_info_list):
    """
    Merge the words in a line based on the overlap with the words
    extracted by pdfbox.

    :param char_list_line:
    :param word_info_list:
    :return:
    """
    # output some debugging information here
    pdf_util_debug_log.debug(char_list2str(char_list_line))

    # get the bboxes of the pdfbox words that overlap this line
    line_bbox = get_char_list_bbox(char_list_line)
    pdfbox_word_bbox_list = []
    for word_info in word_info_list:
        if line_bbox.overlap(word_info['bbox']):
            pdfbox_word_bbox_list.append(word_info['bbox'])

    # get the bbox of each word grouped from the char list
    char_word_list = get_char_list_list(char_list_line)
    char_word_bbox_list = []
    for wid, char_word in enumerate(char_word_list):
        char_word_bbox_list.append(get_char_list_bbox(char_word))
        pdf_util_debug_log.debug("{} {}".format(wid, char_list2str(char_word)))

    # two words should be merged if they overlap with the same word from
    # pdfbox; use union-find to get the connected components
    uf = UnionFind()
    for cid, char_word_bbox in enumerate(char_word_bbox_list):
        uf.add_node(cid)
    for pid, pdfbox_word_bbox in enumerate(pdfbox_word_bbox_list):
        cid_list = []
        for cid, char_word_bbox in enumerate(char_word_bbox_list):
            if char_word_bbox.overlap(pdfbox_word_bbox):
                cid_list.append(cid)
        for cid in cid_list:
            uf.merge(cid_list[0], cid)

    merged_cid_list_list = uf.get_groups()
    new_char_word_list = []
    new_char_word_bbox_list = []
    for merged_cid_list in merged_cid_list_list:
        tmp_char_word = []
        for cid in merged_cid_list:
            tmp_char_word.extend(char_word_list[cid])
        new_char_word_list.append(tmp_char_word)
        new_char_word_bbox_list.append(get_char_list_bbox(tmp_char_word))

    # sort the merged words based on the left boundary
    tmp_idx_list = range(len(new_char_word_list))
    tmp_idx_list.sort(key=lambda idx: new_char_word_bbox_list[idx].left())
    sorted_new_char_word_list = []
    for tmp_idx in tmp_idx_list:
        sorted_new_char_word_list.append(new_char_word_list[tmp_idx])

    # TODO, split the lines with a too-long word, which is very likely to be wrong
    max_len = get_longest_length(sorted_new_char_word_list)
    if max_len > WORD_LENGTH_95_QUARTILE:
        new_char_list_line = max_word_split(char_list_line)
    else:
        new_char_list_line = char_list_list2char_list(sorted_new_char_word_list)
    return new_char_list_line
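
# A minimal, self-contained sketch (not the project's UnionFind) of the
# grouping behavior the code above relies on, assuming the same
# add_node/merge/get_groups interface: indices merged through a shared
# pdfbox word end up in the same group returned by get_groups().
class SimpleUnionFind(object):
    def __init__(self):
        self.parent = {}

    def add_node(self, x):
        self.parent.setdefault(x, x)

    def find(self, x):
        # path halving
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]
            x = self.parent[x]
        return x

    def merge(self, a, b):
        self.parent[self.find(a)] = self.find(b)

    def get_groups(self):
        groups = {}
        for x in self.parent:
            groups.setdefault(self.find(x), []).append(x)
        return list(groups.values())

# e.g. words 0 and 1 overlap the same pdfbox word, word 2 stands alone:
# uf = SimpleUnionFind(); [uf.add_node(i) for i in range(3)]
# uf.merge(0, 1); uf.get_groups()  ->  [[0, 1], [2]] (group order may vary)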
def max_word_split(char_list):
    """
    Split a long chunk into words by dynamic programming.

    punctuation: comma, period, colon, semi-colon, parenthesis
    letters: if two chars overlap horizontally, they should not be split
    into two parts.

    :param char_list: chars sorted by the left bound
    :return: char_list with LTAnno inserted
    """
    if len(char_list) == 0:
        return []
    # only keep the LTChar
    char_list = [char for char in char_list if isinstance(char, LTChar)]

    # create the horizontal-overlapping constraints
    check_sorted_by_left(char_list)

    # get overlapping pairs as hard constraints
    pair_left_idx_list = []
    pair_right_idx_list = []
    for i in range(len(char_list) - 1):
        if char_list[i].bbox[2] > char_list[i + 1].bbox[0]:
            pair_left_idx_list.append(i)
            pair_right_idx_list.append(i + 1)

    def is_word(start_i, end_i):
        """
        Score the chars in the range [start_i, end_i] as a word.

        :param start_i:
        :param end_i:
        :return:
        """
        # a split here would break an overlapping pair
        if start_i in pair_right_idx_list or end_i in pair_left_idx_list:
            # very big penalty
            return -10000
        word = ""
        for char_idx in range(start_i, end_i + 1):
            if len(char_list[char_idx].get_text()) > 1:
                return -1
            word += char_list[char_idx].get_text()
        # remove leading or trailing punctuation
        word = word.strip(".,;:" + string.whitespace)
        word = word.lower()
        s_word = wnl.lemmatize(word, 'n')
        v_word = wnl.lemmatize(word, 'v')
        in_wordlist = word in wl or s_word in wl or v_word in wl

        # special rule for single letters
        if len(word) == 1:
            if word in ['A', 'a']:
                return 1
            else:
                return -1
        if in_wordlist:
            # give more weight to long words
            return (end_i - start_i + 1) * (end_i - start_i + 1)
        else:
            return -1

    # dp[i] = max over j of dp[j] + score of word[j, i-1]
    # keep track of the split position
    # test case: "http://localhost:8080/pdf_viewer?pdf_name=10.1.1.6.2280_14"
    # dp[i] means the best score with the first i chars considered
    dp = [-100000] * (len(char_list) + 1)
    prev_end = [-1] * (len(char_list) + 1)  # prev_end == -1 marks the first chunk
    dp[0] = 0
    for char_num in range(1, len(char_list) + 1):
        for prev_char_num in range(0, char_num):
            word_score = is_word(prev_char_num, char_num - 1)
            ms = 'word {}, score {}'.format(
                char_list2str([char_list[i]
                               for i in range(prev_char_num, char_num)]),
                word_score)  # debug message
            if dp[prev_char_num] + word_score > dp[char_num]:
                dp[char_num] = dp[prev_char_num] + word_score
                prev_end[char_num] = prev_char_num

    # recover the word boundaries by backtracking
    i = len(char_list)
    word_end_idx_list = []
    while i >= 0:
        word_end_idx_list.append(i - 1)
        i = prev_end[i]
    print word_end_idx_list

    i = 0
    return_char_list = []
    while i < len(char_list):
        return_char_list.append(char_list[i])
        if i in word_end_idx_list:
            return_char_list.append(LTAnno(" "))
        i += 1
    return return_char_list
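
# A minimal sketch (toy dictionary, plain strings instead of LTChar) of the
# same DP recurrence used above: dp[i] is the best score for the first i
# characters, and prev_end[i] records where the last word starts.
# `segment_string` is a hypothetical helper, not part of the pipeline.
def segment_string(s, dictionary):
    n = len(s)

    def score(lo, hi):
        # score of s[lo:hi] as one word; longer dictionary words score higher
        return (hi - lo) ** 2 if s[lo:hi] in dictionary else -1

    dp = [-10 ** 9] * (n + 1)
    prev_end = [-1] * (n + 1)
    dp[0] = 0
    for i in range(1, n + 1):
        for j in range(i):
            if dp[j] + score(j, i) > dp[i]:
                dp[i] = dp[j] + score(j, i)
                prev_end[i] = j
    # backtrack the word boundaries
    words, i = [], n
    while i > 0:
        words.append(s[prev_end[i]:i])
        i = prev_end[i]
    return list(reversed(words))

# e.g. segment_string("thecat", {"the", "cat", "th", "eca", "t"})  ->  ['the', 'cat']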
def merge_line_fraction(char_list_list, fraction_path_list):
    char_list = char_list_list2char_list(char_list_list)
    median_char_height = get_median_char_height_by_char_list(char_list)

    # sort the lines by the vertical center position
    line_v_c_list = []  # vertical centers
    org_line_bbox_list = []
    for char_list in char_list_list:
        bbox = char_list2bbox(char_list)
        line_v_c_list.append(bbox.v_center())
        org_line_bbox_list.append(bbox)
    line_idx_list = range(len(char_list_list))
    line_idx_list.sort(key=lambda line_idx: -line_v_c_list[line_idx])
    sorted_char_list_list = []
    sorted_line_bbox_list = []
    for line_idx in line_idx_list:
        sorted_char_list_list.append(char_list_list[line_idx])
        sorted_line_bbox_list.append(org_line_bbox_list[line_idx])
    sorted_v_c_list = [-bbox.v_center() for bbox in sorted_line_bbox_list]

    from pdfxml.pdf_util.unionfind import UnionFind
    uf = UnionFind()
    for idx in range(len(sorted_v_c_list)):
        uf.add_node(idx)
    for path in fraction_path_list:
        # get the nearest lines above and below the fraction bar.
        # another issue to consider is the order of the char_list_list:
        # change to nscs_list, merge the nscs_list, and then convert back
        # to the char_list for the line
        gt_idx = find_gt_idx(sorted_v_c_list, -path.bbox[1])
        lt_idx = find_lt_idx(sorted_v_c_list, -path.bbox[3])
        if gt_idx is None or lt_idx is None:
            continue
        if gt_idx <= lt_idx:
            raise Exception("TODO")
        assert gt_idx > lt_idx
        if abs(sorted_v_c_list[gt_idx] -
               sorted_v_c_list[lt_idx]) < 3 * median_char_height:
            # merge the two lines
            uf.merge(lt_idx, gt_idx)
        else:
            print abs(sorted_v_c_list[gt_idx] - sorted_v_c_list[lt_idx])
            prev_line_str = char_list2str(sorted_char_list_list[lt_idx])
            next_line_str = char_list2str(sorted_char_list_list[gt_idx])
            print prev_line_str
            print next_line_str
            print 'failed to merge lines?'

    idx_group_list = uf.get_groups()

    # build the merged line list
    merged_line_list = []
    for idx_group in idx_group_list:
        nscs_list = []
        for line_idx in idx_group:
            tmp_nscs_list = char_list2char_list_list(
                sorted_char_list_list[line_idx])
            nscs_list.extend(tmp_nscs_list)
        merged_line_list.append(char_list_list2char_list(nscs_list))

    # sort the lines again
    merged_line_bbox_list = [char_list2bbox(line) for line in merged_line_list]
    merged_line_idx_list = range(len(merged_line_list))
    merged_line_idx_list.sort(
        key=lambda merge_line_idx: -merged_line_bbox_list[merge_line_idx].top())
    # minus because the page coordinates start from the bottom
    sorted_merged_line_list = [
        merged_line_list[line_idx] for line_idx in merged_line_idx_list
    ]
    return sorted_merged_line_list
def is_double_column(pdf_path, pid, debug=False):
    """
    The idea: if the line start positions form two clusters, then the page
    is double column.

    :param pdf_path:
    :param pid:
    :return:
    """
    from pdfxml.pdf_util.pdfbox_line_merging import pdf_extract_lines_raw
    from pdfxml.pdf_util.pdfbox_wrapper import get_pdf_page_size

    char_list_list = pdf_extract_lines_raw(pdf_path, pid)
    page_size = get_pdf_page_size(pdf_path, pid)
    page_width = page_size['width']

    # get the boundary of the column: collect the start and end points
    # and take the quantile values
    start_pos_list = []
    end_pos_list = []
    quantile = 0.90
    for char_list in char_list_list:
        # skip lines with fewer than 30 chars
        if len(char_list) < 30:
            continue
        bbox = get_char_list_bbox(char_list)
        start_pos_list.append(bbox.left())
        end_pos_list.append(bbox.right())
    if len(start_pos_list) == 0 or len(end_pos_list) == 0:
        # it's an empty page
        #raise Exception("could not get the left/right boundary")
        return False
    start_pos = np.percentile(start_pos_list, int((1 - quantile) * 100))
    end_pos = np.percentile(end_pos_list, int(quantile * 100))
    if debug:
        #plt.hist(start_pos_list)
        print("The main column boundary {} {}".format(start_pos, end_pos))

    if end_pos < page_width / 2 or start_pos > page_width / 2:
        # only one half of the page has enough long lines
        return True

    center_pos = (start_pos + end_pos) / 2
    good_line_count = 0
    total_count = 0.0
    for char_list in char_list_list:
        # skip lines with fewer than 30 chars
        if len(char_list) < 30:
            continue
        bbox = get_char_list_bbox(char_list)
        if bbox.left() < bbox.right() < center_pos or \
                bbox.right() > bbox.left() > center_pos:
            # the line lies entirely in the left or the right half
            good_line_count += 1
            if debug:
                tmp_str = char_list2str(char_list)
                print "Good Line", tmp_str, bbox
        total_count += 1
        #print "BadLine", total_count

    if debug:
        line_str_list = []
        for char_list in char_list_list:
            line_str_list.append(char_list2str(char_list))

    threshold = 0.6
    if float(good_line_count) / total_count > threshold:
        return True
    else:
        return False
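
# A minimal numeric sketch (plain floats, no PDF parsing) of the heuristic
# above: estimate the main column boundaries from quantiles of the line
# start/end positions, then count lines that stay entirely in one half of
# the page. `looks_double_column` is a hypothetical helper; it reuses the
# module-level `import numpy as np`.
def looks_double_column(line_spans, page_width, quantile=0.90, threshold=0.6):
    """line_spans: list of (left, right) for the sufficiently long lines."""
    if not line_spans:
        return False
    starts = [l for l, r in line_spans]
    ends = [r for l, r in line_spans]
    start_pos = np.percentile(starts, int((1 - quantile) * 100))
    end_pos = np.percentile(ends, int(quantile * 100))
    if end_pos < page_width / 2.0 or start_pos > page_width / 2.0:
        return True
    center = (start_pos + end_pos) / 2.0
    good = sum(1 for l, r in line_spans if r < center or l > center)
    return good / float(len(line_spans)) > threshold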
def get_ignore_region(pdf_path):
    """
    :param pdf_path:
    :return: pid2intervaltree, the regions to ignore on each page
    """
    from pdfxml.me_extraction.me_font_stat_stage4 import internal_get_llines

    pn = get_page_num(pdf_path)
    pid2it2d = {}
    all_lines = []
    for pid in range(pn):
        lines = internal_get_llines(None, pdf_path, pid)
        for line in lines:
            line_str = char_list2str(line)
            is_heading = is_heading_line_by_str(line_str)
            all_lines.append({
                'char_list': line,
                'line_str': line_str,
                'pid': pid,
                'is_heading': is_heading
            })

    # first, ignore everything before the abstract heading
    abstract_heading_line_idx = None
    for i, line_info in enumerate(all_lines):
        if line_info['is_heading'] and \
                is_abstract_head(line_info['line_str']):
            abstract_heading_line_idx = i
            break
        if line_info['pid'] > pn / 2:
            # the abstract should not be in the second half of the document
            break
    if abstract_heading_line_idx is not None:
        for i in range(abstract_heading_line_idx):
            pid = all_lines[i]['pid']
            if pid not in pid2it2d:
                pid2it2d[pid] = IntervalTree2D()
            pid2it2d[pid].add_bbox_only(
                char_list2bbox(all_lines[i]['char_list']))

    # then ignore the abstract and the reference sections
    ignore_begin = False
    for line_info in all_lines:
        if line_info['is_heading']:
            abs_head = is_abstract_head(line_info['line_str'])
            ref_head = is_reference_head(line_info['line_str'])
            if abs_head or ref_head:
                ignore_begin = True
            else:
                ignore_begin = False
        else:
            if ignore_begin:
                pid = line_info['pid']
                if pid not in pid2it2d:
                    pid2it2d[pid] = IntervalTree2D()
                pid2it2d[pid].add_bbox_only(
                    char_list2bbox(line_info['char_list']))
    return pid2it2d
def get_line(self):
    line_str = char_list2str(self._chars)
    return line_str
def assess_ime(pdf_path, pid=0, xml_out_path=None, ignore_exist=False):
    """
    IME: a line with math symbols and without non-math words.

    :param xml_out_path: output path for the boundary file
    :return: dict of timing information
    """
    tmp_path = get_tmp_path(pdf_path)
    ret_info_dict = {}
    if xml_out_path and os.path.isfile(xml_out_path) and (not ignore_exist):
        return {}

    from pdfxml.me_extraction.me_consts import math_words

    t = time.time()
    # common resource loader
    wl = set(words.words())
    wl.update(additional_words)
    wnl = WordNetLemmatizer()
    d = time.time() - t
    ret_info_dict['resource_time'] = d

    t = time.time()
    # layout analysis
    font = get_font_from_pdf(pdf_path, pid)
    #font = None
    prefix = pdf_path[pdf_path.rindex("/") + 1:-4]
    lines = internal_get_llines(prefix, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['layout_time'] = d

    # IME assessment core
    t = time.time()
    line_labels = [0] * len(lines)
    for li, line in enumerate(lines):
        line_label = 0
        beg_idx = 0
        with_math_symbol_or_word = False
        with_non_math_word = False
        for i, char in enumerate(line):
            if isinstance(char, LTChar):
                if check_is_math_LTChar(char, font):
                    me_extraction_logger.debug("Char {} as Math".format(char))
                    with_math_symbol_or_word = True
            if is_space_char(char):
                word = ""
                for j in range(beg_idx, i):
                    # use the glyph name to skip trailing punctuation
                    if j == i - 1 and line[j].get_text() in [
                            ',', '.', 'period', 'comma']:
                        continue
                    # for word checking, only keep single-char glyphs
                    tmp_text = line[j].get_text()
                    if len(tmp_text) != 1:
                        tmp_text = " "
                    word += tmp_text
                beg_idx = i + 1
                word = word.lower().strip()
                #if word.endswith(',') or word.endswith('.'):
                #    word = word[:-1]

                # check the word
                s_word, v_word = "", ""
                try:
                    s_word = wnl.lemmatize(word, 'n')
                    v_word = word  # fall back to the raw word on failure
                    v_word = wnl.lemmatize(word, 'v')
                except Exception as e:
                    me_extraction_error_logger.error(
                        "Error checking the word as noun or verb")
                if word in math_words:
                    me_extraction_logger.debug("Math Word {}".format(word))
                    with_math_symbol_or_word = True
                elif len(word) > 2 and (word in wl or s_word in wl or
                                        v_word in wl):
                    me_extraction_logger.debug("Plain Word {}".format(word))
                    with_non_math_word = True

        # debug for the line, with ME or not
        tmp_line_str = char_list2str(line, ', ')
        me_extraction_logger.debug(tmp_line_str)
        me_extraction_logger.debug("with math {}, with word {}".format(
            with_math_symbol_or_word, with_non_math_word))
        if with_math_symbol_or_word and (not with_non_math_word):
            me_extraction_logger.debug("MATHLINE")
            line_label = 1
        line_labels[li] = line_label
    d = time.time() - t
    ret_info_dict['core_time'] = d

    if not xml_out_path:
        for li, line in enumerate(lines):
            if line_labels[li]:
                tmp_str = ''.join([
                    char.get_text() for char in line if isinstance(char, LTChar)
                ])
                print tmp_str.encode("utf-8")

    # export for evaluation
    page_info = {}
    page_info['pid'] = pid
    page_info['ilist'] = []
    page_info['elist'] = []
    # collect the lines labeled as ME
    for li, line in enumerate(lines):
        if line_labels[li]:
            page_info['ilist'].append(line)

    t = time.time()
    if xml_out_path:
        export_xml(page_info, xml_out_path, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['io_time'] = d
    return ret_info_dict
def EME_font_stat_pipeline(pdf_path, pid, eme_export_path=None,
                           prev_page_info={}):
    """
    For each word, compare the ME/non-ME probability based on font statistics.

    :param pdf_path:
    :param pid:
    :param eme_export_path:
    :return:
    """
    #print "NOTE force EME extraction to be false"
    #ext_settings.LANG_MODEL = False
    #SEQ_ME_MERGER = False

    ret_info_dict, t = {}, time.time()

    # font statistics and the conditional probabilities
    font_stat_dict = stage4_font_stat(pdf_path)
    me_font_condprob = create_me_font_condprob(font_stat_dict)
    font_val_condprob = create_me_font_val_condprob(font_stat_dict)
    d = time.time() - t
    ret_info_dict['stat_time'], t = d, time.time()

    font = get_font_from_pdf(pdf_path, pid)
    prefix = pdf_path[pdf_path.rindex("/") + 1:-4]
    lines = internal_get_llines(prefix, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['layout_time'], t = d, time.time()

    # the loaded IME plays two roles here:
    # * if a word is part of an IME line, it is also treated as math
    # * a connected EME should not overlap with an IME
    xml_path = get_xml_path(pdf_path)
    ime_res_path = "{}.ime.{}.xml".format(xml_path, pid)
    if not os.path.isfile(ime_res_path):
        assess_ime(pdf_path, pid, ime_res_path)
    gt_flag, gt_me_list = get_info(ime_res_path)
    gt_ime_list = [gt_me for gt_me in gt_me_list if gt_me['type'] == 'I']
    ime_bbox_list = [
        [gt_ime['rect']['l'], gt_ime['rect']['b'],
         gt_ime['rect']['r'], gt_ime['rect']['t']]
        for gt_ime in gt_ime_list
    ]
    d = time.time() - t
    ret_info_dict['ime_time'] = d

    t = time.time()
    me_chars = set()
    references_met = False
    # TODO, the reference section might cover several pages
    if 'references_met' in prev_page_info:
        references_met = True
        #return

    eme_list = []
    word_class_list = []
    nscs_label_list_list = []
    for li, line in enumerate(lines):
        nscs_label_list = []
        beg_idx = 0

        # TODO, detect the reference section with a regex later,
        # at the level of the line
        tmp_str = char_list2str(line)
        from pdfxml.me_extraction.element_matching import is_reference_head
        if is_reference_head(tmp_str):
            references_met = True
            ret_info_dict['references_met'] = True
            break

        line_bbox = char_list2bbox(line)
        if bbox_half_overlap_list(line_bbox, ime_bbox_list):
            if ext_settings.debug:
                me_extraction_logger.debug("Line {} as IME".format(
                    char_list2str(line)))
            continue

        nscs_list = char_list2char_list_list(line)
        nscs_str_list = [char_list2str(nscs) for nscs in nscs_list]
        me_log_prob_list = []
        nme_log_prob_list = []
        for nscs in nscs_list:
            nscs_pred = nscs
            if nscs[-1].get_text() in [',', '.']:
                nscs_pred = nscs[:-1]

            # only use the font-val pair to make the inference now
            is_me = check_me_font_val(nscs_pred, font_val_condprob,
                                      me_font_condprob, font,
                                      debug=ext_settings.debug)
            me_prob, nme_prob = get_me_nme_log_prob_font_val(
                nscs_pred, font_val_condprob, me_font_condprob, font,
                debug=ext_settings.debug)
            if is_me and ext_settings.debug:
                me_extraction_logger.debug("check me font val ME")
            me_log_prob_list.append(me_prob)
            nme_log_prob_list.append(nme_prob)

            if is_me:
                tmp_str = char_list2str(nscs)
                if ext_settings.debug:
                    me_extraction_logger.debug("me chunk {}".format(
                        tmp_str.encode('utf-8')))
                me_chars.update([c for c in nscs if isinstance(c, LTChar)])
                word_class_list.append((nscs, 1))
                nscs_label_list.append((nscs, 1))
            else:
                word_class_list.append((nscs, 0))
                nscs_label_list.append((nscs, 0))
        nscs_label_list_list.append(nscs_label_list)
    d = time.time() - t
    ret_info_dict['core_time'], t = d, time.time()

    # post processing:
    # * an EME overlapping with an IME should be removed
    # * connected EMEs are merged
    eme_list = eme_merger(lines, me_chars, ime_bbox_list)

    if eme_export_path:
        # export the data
        page_info = {}
        page_info['pid'] = pid
        page_info['ilist'] = []
        page_info['elist'] = eme_list
        export_xml(page_info, eme_export_path, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['io_time'] = d
    return ret_info_dict