def merge_overlapping_sections_of_texts(texts_in_secs, direction, overlap_thresh):
    """
    Merge sections of text boxes that overlap along <direction>.

    Two consecutive sections are merged when the gap between them (negative
    for an actual overlap) is below <overlap_thresh>. Returns the merged
    list of sections; the total number of text boxes stays unchanged.
    """
    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    if direction == DIRECTION_HORIZONTAL:
        attr_start, attr_end = 'left', 'right'
    else:
        attr_start, attr_end = 'top', 'bottom'

    # per section: largest start coordinate and largest end coordinate
    starts = [sorted_by_attr(sec, attr_start, reverse=True)[0][attr_start] for sec in texts_in_secs]
    ends = [sorted_by_attr(sec, attr_end, reverse=True)[0][attr_end] for sec in texts_in_secs]

    # gap between each section and its predecessor (first section gets 0;
    # zip truncation makes this harmless for empty input)
    gaps = [0]
    for prev_end, cur_start in zip(ends[:-1], starts[1:]):
        gaps.append(cur_start - prev_end)

    merged_secs = []
    prev_sec = []
    for cur_sec, gap in zip(texts_in_secs, gaps):
        if gap < overlap_thresh:
            # fold the previous (possibly already merged) section into this one
            sec = cur_sec + prev_sec
            if merged_secs:
                merged_secs.pop()
        else:
            sec = cur_sec
        merged_secs.append(sec)
        prev_sec = sec

    # sanity check: merging must neither lose nor duplicate any text box
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(merged_secs))

    return merged_secs
def merge_overlapping_sections_of_texts(texts_in_secs, direction, overlap_thresh):
    """
    Merge sections of texts along <direction> whose consecutive distance
    (or overlap, when the distance is negative) is less than <overlap_thresh>.
    Return the merged sections.

    NOTE(review): an identical definition of this function appears earlier in
    the file; at import time the later one wins — consider removing one copy.
    """
    if direction == DIRECTION_HORIZONTAL:
        side_a, side_b = 'left', 'right'
    elif direction == DIRECTION_VERTICAL:
        side_a, side_b = 'top', 'bottom'
    else:
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    # (max start, max end) bounds of each section along the chosen axis
    span_bounds = []
    for sec in texts_in_secs:
        a = sorted_by_attr(sec, side_a, reverse=True)[0][side_a]
        b = sorted_by_attr(sec, side_b, reverse=True)[0][side_b]
        span_bounds.append((a, b))

    out = []
    prev = []
    for i, (start, _) in enumerate(span_bounds):
        # distance to the previous section's end; first section counts as 0
        dist = start - span_bounds[i - 1][1] if i > 0 else 0
        cur = texts_in_secs[i]
        if dist < overlap_thresh:
            # absorb the previously emitted section into the current one
            merged = cur + prev
            if out:
                out.pop()
        else:
            merged = cur
        out.append(merged)
        prev = merged

    # no text box may be lost or duplicated by merging
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(out))

    return out
def join_texts(texts, sorted_by='left', glue=' ', strip=True):
    """
    Concatenate the 'value' strings of the text boxes in <texts>.

    If <sorted_by> is truthy, the boxes are first ordered by that attribute
    via sorted_by_attr. The pieces are joined with <glue>; surrounding
    whitespace is stripped from the result unless <strip> is False.
    """
    ordered = sorted_by_attr(texts, sorted_by) if sorted_by else texts
    joined = glue.join(t['value'] for t in ordered)
    return joined.strip() if strip else joined
def merge_nearby_textboxes_in_page(page, direction, max_nearby_dist, max_same_axis_dist,
                                   axis_align='center', merge_on_overlap=False, join_str=None):
    """
    Merge text boxes in <page> that are close to each other along <direction>.

    Boxes are first grouped onto a common axis (rows for horizontal merging,
    columns for vertical merging) by clustering their <axis_align> coordinate
    with threshold <max_same_axis_dist>. Inside each group, a box is merged
    into its left/upper neighbor when the gap between them is at most
    <max_nearby_dist>; a negative gap (overlap) is merged only when
    <merge_on_overlap> is True. Merged values are joined with <join_str>
    (default: ' ' for horizontal, newline for vertical merging).

    Modifies <page> in place: merged boxes become one text dict and the
    swallowed XML nodes are removed from the page's XML node.

    Raises ValueError for an invalid <direction>, a negative
    <max_same_axis_dist>, or a non-string <join_str>.
    """
    texts = page['texts']
    if not texts:
        return

    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    if max_same_axis_dist < 0:
        raise ValueError('`max_same_axis_dist` must be positive')

    if direction == DIRECTION_HORIZONTAL:
        nearby_attr = 'left'
        nearby_attr_other = 'right'
        axis_attr = 'top'
        axis_attr_dim = 'height'
        if join_str is None:
            join_str = ' '
    else:
        nearby_attr = 'top'
        nearby_attr_other = 'bottom'
        axis_attr = 'left'
        axis_attr_dim = 'width'
        if join_str is None:
            join_str = '\n'

    if not isinstance(join_str, str):
        raise ValueError('`join_str` must be a string')

    texts = sorted_by_attr(texts, axis_attr)

    # coordinate used to decide which boxes share an axis (row/column)
    if axis_align == 'center':
        axis_positions = np.array([t[axis_attr] + t[axis_attr_dim] / 2 for t in texts])
    else:
        # BUGFIX: this branch previously passed np.diff(...) — pairwise gaps of
        # length n-1 — while the 'center' branch passes raw positions of length
        # n to the same clustering call. The index clusters returned below are
        # used to index `texts`, so positions (not diffs) must be passed here.
        axis_positions = np.array([t[axis_align] for t in texts])

    axes = find_clusters_1d_break_dist(axis_positions, dist_thresh=max_same_axis_dist)

    merged_texts = []
    for t_indices in axes:
        # (text dict, start coord, end coord) for each box on this axis,
        # ordered by start coordinate
        t_positions = [(texts[i], texts[i][nearby_attr], texts[i][nearby_attr_other])
                       for i in t_indices]
        t_positions = sorted(t_positions, key=lambda x: x[1])

        merged_texts.append(t_positions[0][0])

        if len(t_positions) > 1:
            # gap between each box and its predecessor on the axis
            gaps = [t_positions[i + 1][1] - p[2] for i, p in enumerate(t_positions[:-1])]
            for i, gap in enumerate(gaps):
                t = t_positions[i + 1][0]
                prev_t = merged_texts[-1]
                if gap <= max_nearby_dist and (merge_on_overlap or gap >= 0):
                    # merge texts
                    prev_t['value'] += join_str + t['value']
                    prev_t['xmlnode'].text = prev_t['value']

                    # grow the surviving box to cover the swallowed one
                    if direction == DIRECTION_HORIZONTAL:
                        new_w = prev_t['width'] + (t['right'] - prev_t['right'])
                        new_h = prev_t['height']
                    else:
                        new_w = prev_t['width']
                        new_h = prev_t['height'] + (t['bottom'] - prev_t['bottom'])
                    update_text_dict_dim(prev_t, (new_w, new_h), update_node=True)

                    # remove this node from page
                    page['xmlnode'].remove(t['xmlnode'])
                else:
                    # don't merge
                    merged_texts.append(t)

    assert len(merged_texts) <= len(texts)
    page['texts'] = merged_texts
# load the page image
imgdata = cv2.imread(pngfile)
# binary image input -> all three channels are identical, so keep just one
imgdata = imgdata[:, :, 0]
print(imgdata.shape)
imgh, imgw = imgdata.shape

#%% 8. Find out sections (dark blue headers) in the table that contain the scores

# collect the "Bewertung" header boxes (ignoring spaces) and order them top to bottom
section_candidates = [t for t in page['texts'] if t['value'].replace(' ', '') == 'Bewertung']
texts_sections = sorted_by_attr(section_candidates, 'top')
print(len(texts_sections))

# show an example text box
pprint(texts_sections[0])

#%% 9. Parse the data, section by section

# RE pattern to identify the "A B C D" header
pttrn_grade_header = re.compile(r'^[A-D]$')

# RE pattern to identify the item number in front of each row like "1.3" or "E.2"
pttrn_item = re.compile(r'^[1-9A-Z]{1,2}\.\d{1,2}(\.\d{1,2})?')
#%% 7. Load the PNG image representation of the PDF that was generated before

# read the rendered page and reduce it to a single channel; the image is
# binary, so the three channels carry identical data
imgdata = cv2.imread(pngfile)
imgdata = imgdata[:, :, 0]
print(imgdata.shape)
imgh, imgw = imgdata.shape

#%% 8. Find out sections (dark blue headers) in the table that contain the scores

# every text box reading "Bewertung" (spaces ignored), sorted top to bottom
texts_sections = sorted_by_attr(
    [t for t in page['texts'] if t['value'].replace(' ', '') == 'Bewertung'],
    'top'
)
print(len(texts_sections))
pprint(texts_sections[0])  # show an example text box

#%% 9. Parse the data, section by section

# matches the "A B C D" column header cells
pttrn_grade_header = re.compile(r'^[A-D]$')
# matches the item number in front of each row, e.g. "1.3" or "E.2"
pttrn_item = re.compile(r'^[1-9A-Z]{1,2}\.\d{1,2}(\.\d{1,2})?')

i_subplot = 1  # we'll generate a plot of the image data inside the checkboxes -- this is the subplot index
def merge_nearby_textboxes_in_page(page, direction, max_nearby_dist, max_same_axis_dist,
                                   axis_align='center', merge_on_overlap=False, join_str=None):
    """
    Merge text boxes in <page> that lie close together along <direction>.

    The boxes are grouped onto common axes (rows when merging horizontally,
    columns when merging vertically) by clustering their <axis_align>
    coordinate with threshold <max_same_axis_dist>. Within a group, a box is
    merged into its preceding neighbor when the gap between them is at most
    <max_nearby_dist>; an overlap (negative gap) merges only if
    <merge_on_overlap> is True. Values are joined with <join_str>
    (default: ' ' horizontally, newline vertically).

    Modifies <page> in place and removes swallowed XML nodes from the page.

    Raises ValueError for an invalid <direction>, a negative
    <max_same_axis_dist>, or a non-string <join_str>.

    NOTE(review): an identical definition of this function appears earlier in
    the file; consider removing one copy.
    """
    texts = page['texts']
    if not texts:
        return

    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    if max_same_axis_dist < 0:
        raise ValueError('`max_same_axis_dist` must be positive')

    if direction == DIRECTION_HORIZONTAL:
        nearby_attr = 'left'
        nearby_attr_other = 'right'
        axis_attr = 'top'
        axis_attr_dim = 'height'
        if join_str is None:
            join_str = ' '
    else:
        nearby_attr = 'top'
        nearby_attr_other = 'bottom'
        axis_attr = 'left'
        axis_attr_dim = 'width'
        if join_str is None:
            join_str = '\n'

    if not isinstance(join_str, str):
        raise ValueError('`join_str` must be a string')

    texts = sorted_by_attr(texts, axis_attr)

    # per-box coordinate used for axis clustering
    if axis_align == 'center':
        axis_positions = np.array([t[axis_attr] + t[axis_attr_dim] / 2 for t in texts])
    else:
        # BUGFIX: previously np.diff(...) was passed here, i.e. n-1 pairwise
        # gaps, whereas the 'center' branch passes n raw positions to the same
        # clustering call whose index clusters are used to index `texts`.
        # Both branches must pass positions.
        axis_positions = np.array([t[axis_align] for t in texts])

    axes = find_clusters_1d_break_dist(axis_positions, dist_thresh=max_same_axis_dist)

    merged_texts = []
    for t_indices in axes:
        # boxes on this axis as (text dict, start, end), sorted by start
        t_positions = [(texts[i], texts[i][nearby_attr], texts[i][nearby_attr_other])
                       for i in t_indices]
        t_positions = sorted(t_positions, key=lambda x: x[1])

        merged_texts.append(t_positions[0][0])

        if len(t_positions) > 1:
            # gap between consecutive boxes along the axis
            gaps = [t_positions[i + 1][1] - p[2] for i, p in enumerate(t_positions[:-1])]
            for i, gap in enumerate(gaps):
                t = t_positions[i + 1][0]
                prev_t = merged_texts[-1]
                if gap <= max_nearby_dist and (merge_on_overlap or gap >= 0):
                    # merge texts
                    prev_t['value'] += join_str + t['value']
                    prev_t['xmlnode'].text = prev_t['value']

                    # extend the surviving box so it covers the merged one
                    if direction == DIRECTION_HORIZONTAL:
                        new_w = prev_t['width'] + (t['right'] - prev_t['right'])
                        new_h = prev_t['height']
                    else:
                        new_w = prev_t['width']
                        new_h = prev_t['height'] + (t['bottom'] - prev_t['bottom'])
                    update_text_dict_dim(prev_t, (new_w, new_h), update_node=True)

                    # remove this node from page
                    page['xmlnode'].remove(t['xmlnode'])
                else:
                    # don't merge
                    merged_texts.append(t)

    assert len(merged_texts) <= len(texts)
    page['texts'] = merged_texts