Example #1
0
def merge_overlapping_sections_of_texts(texts_in_secs, direction,
                                        overlap_thresh):
    """
    Merge overlapping sections of texts in <direction> whose consecutive
    "distance" or overlap (when the distance is negative) is less than <overlap_thresh>.

    Parameters:
    - texts_in_secs: sequence of sections, each a list of text-box dicts with
      positional keys such as 'left'/'right' or 'top'/'bottom'
    - direction: DIRECTION_HORIZONTAL or DIRECTION_VERTICAL
      (see pdftabextract.common)
    - overlap_thresh: consecutive sections whose gap is below this value are
      merged into one section

    Return merged sections (a list of lists of text-box dicts); the total
    number of text boxes is preserved.
    """
    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError(
            "direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)"
        )

    # attribute pair delimiting a section along the chosen direction
    if direction == DIRECTION_HORIZONTAL:
        pos_attr = 'left'
        other_pos_attr = 'right'
    else:
        pos_attr = 'top'
        other_pos_attr = 'bottom'

    # start position of each section ('left' or 'top')
    # NOTE(review): reverse=True then [0] takes the *largest* start value per
    # section — confirm against sorted_by_attr that this is intended
    sec_positions1 = [
        sorted_by_attr(sec, pos_attr, reverse=True)[0][pos_attr]
        for sec in texts_in_secs
    ]
    # end position of each section ('right' or 'bottom')
    sec_positions2 = [
        sorted_by_attr(sec, other_pos_attr, reverse=True)[0][other_pos_attr]
        for sec in texts_in_secs
    ]

    # distance between consecutive sections: start of section i minus end of
    # section i-1 (negative means they overlap); the first section gets 0
    sec_positions = list(zip(sec_positions1, sec_positions2))
    sec_dists = [
        pos[0] - sec_positions[i - 1][1] if i > 0 else 0
        for i, pos in enumerate(sec_positions)
    ]

    # merge sections that overlap (whose distance is less than <overlap_thresh>)
    merged_secs = []
    prev_sec = []
    for i, dist in enumerate(sec_dists):
        cur_sec = texts_in_secs[i]
        if dist < overlap_thresh:
            # close enough: fold this section into the previously emitted one
            sec = cur_sec + prev_sec
            if len(merged_secs) > 0:
                merged_secs.pop()
        else:
            sec = cur_sec

        merged_secs.append(sec)
        prev_sec = sec

    # merging must neither lose nor duplicate any text box
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(merged_secs))

    return merged_secs
def merge_overlapping_sections_of_texts(texts_in_secs, direction, overlap_thresh):
    """
    Merge overlapping sections of texts in <direction> whose consecutive
    "distance" or overlap (when the distance is negative) is less than <overlap_thresh>.

    Each section in <texts_in_secs> is a list of text-box dicts. Sections whose
    gap to the previous section is below <overlap_thresh> are combined; the
    total number of text boxes is preserved.

    Return merged sections.
    """
    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    # pick the attribute pair that delimits a section along <direction>
    if direction == DIRECTION_HORIZONTAL:
        start_attr, end_attr = 'left', 'right'
    else:
        start_attr, end_attr = 'top', 'bottom'

    # per-section start positions (left/top) and end positions (right/bottom)
    starts = [sorted_by_attr(sec, start_attr, reverse=True)[0][start_attr]
              for sec in texts_in_secs]
    ends = [sorted_by_attr(sec, end_attr, reverse=True)[0][end_attr]
            for sec in texts_in_secs]

    # gap between consecutive sections (negative = overlap); first section is 0
    sec_dists = [0]
    for prev_end, cur_start in zip(ends[:-1], starts[1:]):
        sec_dists.append(cur_start - prev_end)

    merged_secs = []
    prev_sec = []
    for dist, cur_sec in zip(sec_dists, texts_in_secs):
        if dist < overlap_thresh:
            # fold this section into the previously emitted one
            sec = cur_sec + prev_sec
            if merged_secs:
                merged_secs.pop()
        else:
            sec = cur_sec

        merged_secs.append(sec)
        prev_sec = sec

    # sanity check: no text box was lost or duplicated
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(merged_secs))

    return merged_secs
Example #3
0
def join_texts(texts, sorted_by='left', glue=' ', strip=True):
    """
    Concatenate the 'value' strings of the text boxes in <texts> using <glue>.

    If <sorted_by> is truthy, order the boxes by that attribute first
    (via sorted_by_attr). If <strip> is true, strip surrounding whitespace
    from the result.
    """
    ordered = sorted_by_attr(texts, sorted_by) if sorted_by else texts
    joined = glue.join(t['value'] for t in ordered)
    return joined.strip() if strip else joined
Example #4
0
def merge_nearby_textboxes_in_page(page,
                                   direction,
                                   max_nearby_dist,
                                   max_same_axis_dist,
                                   axis_align='center',
                                   merge_on_overlap=False,
                                   join_str=None):
    """
    Merge text boxes in <page> that lie near each other along <direction>.

    Boxes are first clustered on the perpendicular axis (rows for horizontal
    merging, columns for vertical) using <max_same_axis_dist>; within each
    cluster, consecutive boxes whose gap is at most <max_nearby_dist> are
    merged by concatenating their 'value' strings with <join_str>
    (default ' ' horizontally, '\\n' vertically). If <merge_on_overlap> is
    true, boxes with a negative gap (overlapping) are merged too.

    Modifies <page> in place: rewrites page['texts'] and mutates/removes the
    underlying XML nodes. Returns None.
    """
    texts = page['texts']

    # nothing to do on an empty page
    if not texts:
        return

    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError(
            "direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)"
        )

    if max_same_axis_dist < 0:
        raise ValueError('`max_same_axis_dist` must be positive')

    # choose the attributes for the merge direction and the clustering axis,
    # plus the default join string for that direction
    if direction == DIRECTION_HORIZONTAL:
        nearby_attr = 'left'
        nearby_attr_other = 'right'
        axis_attr = 'top'
        axis_attr_dim = 'height'
        if join_str is None:
            join_str = ' '
    else:
        nearby_attr = 'top'
        nearby_attr_other = 'bottom'
        axis_attr = 'left'
        axis_attr_dim = 'width'
        if join_str is None:
            join_str = '\n'

    if not isinstance(join_str, str):
        raise ValueError('`join_str` must be a string')

    # cluster boxes on the perpendicular axis
    texts = sorted_by_attr(texts, axis_attr)
    if axis_align == 'center':
        # NOTE(review): this branch produces absolute center coordinates while
        # the else branch produces np.diff gaps — these look inconsistent;
        # confirm what find_clusters_1d_break_dist expects as input
        axes_gaps = np.array(
            [t[axis_attr] + t[axis_attr_dim] / 2 for t in texts])
    else:
        axes_gaps = np.diff([t[axis_align] for t in texts])
    axes = find_clusters_1d_break_dist(axes_gaps,
                                       dist_thresh=max_same_axis_dist)

    merged_texts = []
    for t_indices in axes:
        # (box, start position, end position) for each box in this cluster,
        # ordered by start position along the merge direction
        t_positions = [(texts[i], texts[i][nearby_attr],
                        texts[i][nearby_attr_other]) for i in t_indices]
        t_positions = sorted(t_positions, key=lambda x: x[1])

        # the first box in the cluster is always kept; later boxes either get
        # merged into merged_texts[-1] or appended as new entries
        merged_texts.append(t_positions[0][0])

        if len(t_positions) > 1:
            # gap between each box's start and the previous box's end
            # (negative = overlap)
            gaps = [
                t_positions[i + 1][1] - p[2]
                for i, p in enumerate(t_positions[:-1])
            ]
            for i, gap in enumerate(gaps):
                t = t_positions[i + 1][0]
                prev_t = merged_texts[-1]

                if gap <= max_nearby_dist and (merge_on_overlap
                                               or gap >= 0):  # merge
                    # merge texts (box dict and its XML node stay in sync)
                    prev_t['value'] += join_str + t['value']
                    prev_t['xmlnode'].text = prev_t['value']

                    # grow the kept box so it spans the absorbed one
                    if direction == DIRECTION_HORIZONTAL:
                        new_w = prev_t['width'] + (t['right'] -
                                                   prev_t['right'])
                        new_h = prev_t['height']
                    else:
                        new_w = prev_t['width']
                        new_h = prev_t['height'] + (t['bottom'] -
                                                    prev_t['bottom'])

                    update_text_dict_dim(prev_t, (new_w, new_h),
                                         update_node=True)

                    # remove the absorbed node from the page XML
                    page['xmlnode'].remove(t['xmlnode'])
                else:  # don't merge
                    merged_texts.append(t)

    # merging can only reduce the number of boxes
    assert len(merged_texts) <= len(texts)

    page['texts'] = merged_texts
Example #5
0
# load the PNG image representation of the PDF page
imgdata = cv2.imread(pngfile)
imgdata = imgdata[:, :,
                  0]  # binary image input -> keep a single channel; in a
# binary image all three channels are identical anyway

print(imgdata.shape)

# image height and width in pixels
imgh, imgw = imgdata.shape

#%% 8. Find out sections (dark blue headers) in the table that contain the scores

# filter for all text boxes with "Bewertung" and sort them from top to bottom
# ("Bewertung" is German for "assessment/score" — these are the section headers)
texts_sections = sorted_by_attr(
    [t for t in page['texts'] if t['value'].replace(' ', '') == 'Bewertung'],
    'top')

print(len(texts_sections))

# show an example text box
pprint(texts_sections[0])

#%% 9. Parse the data, section by section

# RE pattern to identify the "A B C D" header (a single grade letter)
pttrn_grade_header = re.compile(r'^[A-D]$')

# RE pattern to identify the item number in front of each row like "1.3" or "E.2"
# (1-2 alphanumeric chars, a dot, 1-2 digits, optionally another dot group)
pttrn_item = re.compile(r'^[1-9A-Z]{1,2}\.\d{1,2}(\.\d{1,2})?')
#%% 7. Load the PNG image representation of the PDF that was generated before

# load image
imgdata = cv2.imread(pngfile)
imgdata = imgdata[:, :, 0]  # binary image input -> keep a single channel;
                            # all three channels are identical anyway

print(imgdata.shape)

# image height and width in pixels
imgh, imgw = imgdata.shape

#%% 8. Find out sections (dark blue headers) in the table that contain the scores

# filter for all text boxes with "Bewertung" and sort them from top to bottom
# ("Bewertung" is German for "assessment/score" — these are the section headers)
texts_sections = sorted_by_attr([t for t in page['texts'] if t['value'].replace(' ', '') == 'Bewertung'], 'top')

print(len(texts_sections))

# show an example text box
pprint(texts_sections[0])

#%% 9. Parse the data, section by section

# RE pattern to identify the "A B C D" header (a single grade letter)
pttrn_grade_header = re.compile(r'^[A-D]$')

# RE pattern to identify the item number in front of each row like "1.3" or "E.2"
# (1-2 alphanumeric chars, a dot, 1-2 digits, optionally another dot group)
pttrn_item = re.compile(r'^[1-9A-Z]{1,2}\.\d{1,2}(\.\d{1,2})?')

i_subplot = 1     # we'll generate a plot of the image data inside the checkboxes -- this is the subplot index
def merge_nearby_textboxes_in_page(page, direction, max_nearby_dist, max_same_axis_dist, axis_align='center',
                                   merge_on_overlap=False, join_str=None):
    """
    Merge text boxes in <page> that lie near each other along <direction>.

    Boxes are first clustered on the perpendicular axis (rows for horizontal
    merging, columns for vertical) using <max_same_axis_dist>; within each
    cluster, consecutive boxes whose gap is at most <max_nearby_dist> are
    merged by concatenating their 'value' strings with <join_str>
    (default ' ' horizontally, '\\n' vertically). If <merge_on_overlap> is
    true, boxes with a negative gap (overlapping) are merged too.

    Modifies <page> in place: rewrites page['texts'] and mutates/removes the
    underlying XML nodes. Returns None.
    """
    texts = page['texts']

    # nothing to do on an empty page
    if not texts:
        return

    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    if max_same_axis_dist < 0:
        raise ValueError('`max_same_axis_dist` must be positive')

    # choose the attributes for the merge direction and the clustering axis,
    # plus the default join string for that direction
    if direction == DIRECTION_HORIZONTAL:
        nearby_attr = 'left'
        nearby_attr_other = 'right'
        axis_attr = 'top'
        axis_attr_dim = 'height'
        if join_str is None:
            join_str = ' '
    else:
        nearby_attr = 'top'
        nearby_attr_other = 'bottom'
        axis_attr = 'left'
        axis_attr_dim = 'width'
        if join_str is None:
            join_str = '\n'

    if not isinstance(join_str, str):
        raise ValueError('`join_str` must be a string')

    # cluster boxes on the perpendicular axis
    texts = sorted_by_attr(texts, axis_attr)
    if axis_align == 'center':
        # NOTE(review): this branch produces absolute center coordinates while
        # the else branch produces np.diff gaps — these look inconsistent;
        # confirm what find_clusters_1d_break_dist expects as input
        axes_gaps = np.array([t[axis_attr] + t[axis_attr_dim]/2 for t in texts])
    else:
        axes_gaps = np.diff([t[axis_align] for t in texts])
    axes = find_clusters_1d_break_dist(axes_gaps, dist_thresh=max_same_axis_dist)

    merged_texts = []
    for t_indices in axes:
        # (box, start position, end position) for each box in this cluster,
        # ordered by start position along the merge direction
        t_positions = [(texts[i], texts[i][nearby_attr], texts[i][nearby_attr_other]) for i in t_indices]
        t_positions = sorted(t_positions, key=lambda x: x[1])

        # the first box in the cluster is always kept; later boxes either get
        # merged into merged_texts[-1] or appended as new entries
        merged_texts.append(t_positions[0][0])

        if len(t_positions) > 1:
            # gap between each box's start and the previous box's end
            # (negative = overlap)
            gaps = [t_positions[i+1][1] - p[2] for i, p in enumerate(t_positions[:-1])]
            for i, gap in enumerate(gaps):
                t = t_positions[i+1][0]
                prev_t = merged_texts[-1]

                if gap <= max_nearby_dist and (merge_on_overlap or gap >= 0):   # merge
                    # merge texts (box dict and its XML node stay in sync)
                    prev_t['value'] += join_str + t['value']
                    prev_t['xmlnode'].text = prev_t['value']

                    # grow the kept box so it spans the absorbed one
                    if direction == DIRECTION_HORIZONTAL:
                        new_w = prev_t['width'] + (t['right'] - prev_t['right'])
                        new_h = prev_t['height']
                    else:
                        new_w = prev_t['width']
                        new_h = prev_t['height'] + (t['bottom'] - prev_t['bottom'])

                    update_text_dict_dim(prev_t, (new_w, new_h), update_node=True)

                    # remove the absorbed node from the page XML
                    page['xmlnode'].remove(t['xmlnode'])
                else:   # don't merge
                    merged_texts.append(t)

    # merging can only reduce the number of boxes
    assert len(merged_texts) <= len(texts)

    page['texts'] = merged_texts