示例#1
0
def merge_small_sections_of_texts(texts_in_secs, min_num_texts):
    """
    Fold undersized sections into the section that precedes them.

    A section counts as too small when it holds at most <min_num_texts> texts. Such a section is
    concatenated with the section emitted before it (its own texts first, then the earlier ones)
    and the combined section takes the earlier section's place in the output. The first section,
    and any section whose predecessor is falsy (e.g. an empty list), is passed through unchanged.

    Return the list of (possibly merged) sections.
    """
    result = []
    last = None  # most recently emitted section; None until the first one was handled
    for section in texts_in_secs:
        # the size check is only evaluated when there is a non-empty predecessor
        if last and len(section) <= min_num_texts:
            combined = section + last
            if result:
                result.pop()  # drop the predecessor, it is re-added as part of `combined`
        else:
            combined = section

        result.append(combined)
        last = combined

    # sanity check: merging must not change the flattened length
    # (presumably `flatten_list` concatenates the sections -- confirm against its definition)
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(result))

    return result
def merge_small_sections_of_texts(texts_in_secs, min_num_texts):
    """
    Combine every section that has too few texts with the section in front of it.

    "Too few" means the number of texts in the section is lower than or equal <min_num_texts>.
    The texts of the small section are placed before the texts of the section it is merged into.
    A section is never merged when it is the first one or when the section in front of it is
    falsy (e.g. empty).

    Return merged sections.
    """
    out_secs = []
    for candidate in texts_in_secs:
        # the last emitted section is always the tail of the output list
        predecessor = out_secs[-1] if out_secs else None

        if predecessor and len(candidate) <= min_num_texts:
            # overwrite the predecessor with the merged section
            out_secs[-1] = candidate + predecessor
        else:
            out_secs.append(candidate)

    # the flattened length must be the same before and after merging
    # (NOTE(review): relies on `flatten_list` from outside this block)
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(out_secs))

    return out_secs
示例#3
0
def merge_overlapping_sections_of_texts(texts_in_secs, direction,
                                        overlap_thresh):
    """
    Join consecutive sections of texts that overlap (or nearly overlap) in <direction>.

    For each section a start position ('left' or 'top') and an end position ('right' or 'bottom')
    is determined. The distance of a section is its start position minus the end position of the
    section before it; the first section gets the distance 0. Every section whose distance is
    less than <overlap_thresh> is concatenated with the section emitted before it (its own texts
    first) and replaces that section in the output.

    NOTE(review): both positions are read from the first text returned by
    `sorted_by_attr(..., reverse=True)` -- presumably the largest value; confirm that this is
    intended for the start position, too.

    Raise ValueError if <direction> is neither `DIRECTION_HORIZONTAL` nor `DIRECTION_VERTICAL`.

    Return merged sections.
    """
    valid_directions = (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL)
    if direction not in valid_directions:
        raise ValueError(
            "direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)"
        )

    horizontal = direction == DIRECTION_HORIZONTAL
    start_attr = 'left' if horizontal else 'top'
    end_attr = 'right' if horizontal else 'bottom'

    def _first_sorted_value(attr):
        # per section: <attr> of the first text after sorting the section by <attr>, reversed
        return [
            sorted_by_attr(sec, attr, reverse=True)[0][attr]
            for sec in texts_in_secs
        ]

    starts = _first_sorted_value(start_attr)
    ends = _first_sorted_value(end_attr)

    # distance (negative: overlap) of each section to the one before it
    gaps = []
    prev_end = None  # only read from the second section on
    for idx, (start, end) in enumerate(zip(starts, ends)):
        gaps.append(start - prev_end if idx else 0)
        prev_end = end

    # walk sections and distances in lockstep, merging where the distance is too small
    merged = []
    previous = []
    for section, gap in zip(texts_in_secs, gaps):
        if gap < overlap_thresh:
            section = section + previous
            if merged:
                merged.pop()  # `previous` is re-added as part of the merged section

        merged.append(section)
        previous = section

    # merging must not change the flattened length
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(merged))

    return merged
def merge_overlapping_sections_of_texts(texts_in_secs, direction, overlap_thresh):
    """
    Merge neighbouring sections of texts along <direction> when they are closer to each other
    than <overlap_thresh> (a negative distance means that the sections overlap).

    The distance of a section is computed as its 'left' (horizontal) or 'top' (vertical) position
    minus the 'right' or 'bottom' position of the section in front of it. The first section always
    has a distance of 0. A section that is merged is put in front of the texts it is merged with.

    NOTE(review): each position is the value of the first text that `sorted_by_attr` returns with
    `reverse=True`; what that means exactly depends on `sorted_by_attr` -- verify.

    Raise ValueError for an unknown <direction>.

    Return merged sections.
    """
    if direction not in (DIRECTION_HORIZONTAL, DIRECTION_VERTICAL):
        raise ValueError("direction must be `DIRECTION_HORIZONTAL` or `DIRECTION_VERTICAL` (see pdftabextract.common)")

    # attribute names of the leading and the trailing edge for the chosen direction
    if direction == DIRECTION_HORIZONTAL:
        lead_attr, trail_attr = 'left', 'right'
    else:
        lead_attr, trail_attr = 'top', 'bottom'

    # one value per section: leading edges first, trailing edges afterwards
    lead_edges = [sorted_by_attr(texts, lead_attr, reverse=True)[0][lead_attr] for texts in texts_in_secs]
    trail_edges = [sorted_by_attr(texts, trail_attr, reverse=True)[0][trail_attr] for texts in texts_in_secs]

    # distance of each section to its predecessor (0 for the first section)
    edge_pairs = list(zip(lead_edges, trail_edges))
    dists = [0 if n == 0 else pair[0] - edge_pairs[n - 1][1] for n, pair in enumerate(edge_pairs)]

    # the tail of `merged` is always the section emitted last, so it is the merge partner
    merged = []
    for n, dist in enumerate(dists):
        texts = texts_in_secs[n]
        if dist < overlap_thresh:
            earlier = merged.pop() if merged else []
            merged.append(texts + earlier)
        else:
            merged.append(texts)

    # the flattened length has to stay the same
    assert len(flatten_list(texts_in_secs)) == len(flatten_list(merged))

    return merged