def _merge_groups(groups, merge_threshold):
    label = 0
    merged_groups = [(label, groups[0])]
    if len(groups) == 1: return merged_groups
    for i, g in enumerate(groups[1:]):
        real_index = i + 1
        distance_of_left = min(
            get_distance_of_two_segments(s1, s2)
            for s1, s2 in product(groups[real_index - 1], g))
        for s1, s2 in product(groups[real_index - 1], g):
            gg = []
            if distance_of_left == get_distance_of_two_segments(s1, s2):
                gg.append(s1)
                gg.append(s2)
        last_element_label = merged_groups[-1][0]
        if distance_of_left <= 0:
            merged_groups.append((last_element_label, g))
        else:
            merged_groups.append((last_element_label + 1, g))
    assert len(merged_groups) == len(groups)

    return merged_groups
def _merge_groups(groups, merge_threshold):
    """

    :param groups:[[line.start, line.end], [((start_x, start_y), (end_x, end_y))]]
    :param merge_threshold:
    :return:
    """

    label = 0
    merged_groups = [(label, groups[0])]
    if len(groups) == 1: return merged_groups
    for i, g in enumerate(groups[1:]):
        real_index = i + 1

        distance_of_left = min(get_distance_of_two_segments(s1, s2)
                               for s1, s2 in product(groups[real_index - 1], g)
                               )
        # s1, s2
        # [(((line1_sx, line1_sy), (line1_ex, line1_ey)), ((line2_sx, line2_sy), (line2_ex, line2_ey)))]

        tight_close_threshold = 15

        for s1, s2 in product(groups[real_index - 1], g):
            gg = []
            if distance_of_left == get_distance_of_two_segments(s1, s2):
                gg.append(s1)
                gg.append(s2)
                min_len = min(get_line_length(gg))

        last_element_label = merged_groups[-1][0]

        if distance_of_left <= 0:  # min_len / tight_close_threshold:
            merged_groups.append((last_element_label, g))
        else:
            merged_groups.append((last_element_label + 1, g))
    assert len(merged_groups) == len(groups)

    return merged_groups
# Example #3
0
def _merge_groups_according_to_position(groups):
    labels = [-1 for i in range(len(groups))]
    dst_group = groups
    count = 0
    for i in range(len(dst_group) - 30):
        if i != 0 and labels[i] != -1:
            continue
        labels[i] = count
        count += 1
        group1 = dst_group[i]
        remain_group_list = dst_group[i + 1:i + 30]
        for j, group2 in enumerate(remain_group_list):
            distance_of_left = min(
                get_distance_of_two_segments(s1, s2)
                for s1, s2 in product(group1, group2))
            if distance_of_left > 0.1:
                continue
            max_len = max(get_line_length(group1 + group2))

            def is_percent_sign():
                if len(group1) == 1:
                    len1 = max(get_line_length(group1))
                    len2 = max(get_line_length(group2))
                else:
                    len1 = max(get_line_length(group2))
                    len2 = max(get_line_length(group1))
                return distance_of_left < max_len / 4 and min(len(group1), len(group2)) == 1 \
                       and max(len(group1), len(group2)) == 10 and len1 > 5 * len2 and max_len < 0.2

            if is_percent_sign():

                if labels[i + j + 1] == -1:
                    labels[i + j + 1] = labels[i]
    for label_index in range(len(labels)):
        if labels[label_index] == -1:
            labels[label_index] = count
            count += 1
    merge_group = []
    for index in range(len(labels)):
        label = labels[index]
        if len(merge_group) < label + 1:
            merge_group.append(dst_group[index])
        else:
            merge_group[label] += dst_group[index]
    return merge_group, labels
def _merge_groups_according_to_position_char(groups, type_index):
    labels = [-1 for i in range(len(groups))]
    dst_group = groups
    count = 0
    merge_percent = 1
    merge_tight_and_i_j = 2
    merge_special_char = 3
    for i in range(len(dst_group) - 35):
        if i != 0 and labels[i] != -1:
            continue
        labels[i] = count
        count += 1
        group1 = dst_group[i]
        remain_group_list = dst_group[i + 1:i + 35]
        for j, group2 in enumerate(remain_group_list):
            distance_of_left = min(
                get_distance_of_two_segments(s1, s2)
                for s1, s2 in product(group1, group2))
            if distance_of_left > 0.1:
                continue
            min_len = min(get_line_length(group1 + group2))
            max_len = max(get_line_length(group1 + group2))
            bounding_box1 = get_bounding_box_list(group1)
            bounding_box2 = get_bounding_box_list(group2)
            bounding_box12 = get_bounding_box_list(group1 + group2)
            x1, y1 = perpendicular_of_box(bounding_box1)
            x2, y2 = perpendicular_of_box(bounding_box2)
            (min_x1, min_y1), (max_x1, max_y1) = get_bounding_box_list(group1)
            (min_x2, min_y2), (max_x2, max_y2) = get_bounding_box_list(group2)
            tight_close_threshold = 0.0001
            distance_and_max_len_ratio1 = 2
            distance_and_max_len_ratio2 = 3
            dot_line_nums = 4
            comma_line_nums = 7
            i_minus_dot_line_nums = 1
            j_minus_dot_line_nums = 4
            big_ration_of_line = 11
            vertical_margin = 0.001
            text_line_margin = 0.1

            def ration_between_distance_and_max_len_of_i_j():
                return max_len / distance_and_max_len_ratio2 < distance_of_left < max_len / distance_and_max_len_ratio1

            def big_num_of_lines(num):
                return max(len(group1), len(group2)) == num

            def small_num_of_lines(num):
                return min(len(group1), len(group2)) == num

            def is_tight_close():
                return distance_of_left < tight_close_threshold

            def not_form_line():
                return max_len < text_line_margin

            def is_vertical(margin):
                return abs(x1 - x2) < margin

            def big_ration_between_group_line(ratio):
                return max_len > ratio * min_len

            distance_margin = 0.1

            def is_distance_legal():
                distance_of_left < distance_margin

            def is_i():
                return ration_between_distance_and_max_len_of_i_j() and big_ration_between_group_line(big_ration_of_line)\
                       and is_vertical(vertical_margin)\
                       and small_num_of_lines(i_minus_dot_line_nums) \
                       and big_num_of_lines(dot_line_nums) and not_form_line()

            def is_j():
                return ration_between_distance_and_max_len_of_i_j() and big_ration_between_group_line(big_ration_of_line) \
                       and max_x1 == max_x2 \
                       and small_num_of_lines(dot_line_nums) \
                       and big_num_of_lines(j_minus_dot_line_nums) and not_form_line()

            def size_of_bounding_box(bounding_box):
                (min_x, min_y), (max_x, max_y) = bounding_box
                return abs(max_x - min_x) * abs(max_y - min_y)

            def is_colon():
                return small_num_of_lines(dot_line_nums) and big_num_of_lines(dot_line_nums) \
                       and is_vertical(0) \
                       and size_of_bounding_box(bounding_box12) < \
                       10 * min(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2))\
                       and is_distance_legal()\
                       and not_form_line()

            def is_semicolon():
                return small_num_of_lines(dot_line_nums) and big_num_of_lines(comma_line_nums) \
                       and is_vertical(0) \
                       and size_of_bounding_box(bounding_box12) < \
                       15 * min(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2)) \
                       and min_len > max_len / 2 and is_distance_legal()

            def is_percent_sign():
                if len(group1) == 1:
                    len1 = max(get_line_length(group1))
                    len2 = max(get_line_length(group2))
                else:
                    len1 = max(get_line_length(group2))
                    len2 = max(get_line_length(group1))
                return distance_of_left < max_len / 4 and min(len(group1), len(group2)) == 1 \
                    and max(len(group1), len(group2)) == 10 and len1 > 5 * len2 and max_len < 0.2

            def is_exclamation_mark():
                return min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 12 and \
                       distance_of_left < max_len / 2 and abs(x1 - x2) < 0.01

            def is_equals():
                return min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 1 \
                       and min_len > max_len / 1.2 and distance_of_left < min_len / 2 and abs(x1 - x2) < 0.01

            def is_minus_and_plus():
                return min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 2 \
                       and min_len > max_len / 1.2 and distance_of_left < min_len / 5

            def is_special_char():
                return any([
                    is_colon(),
                    is_semicolon(),
                    is_exclamation_mark(),
                    is_equals(),
                    is_minus_and_plus()
                ])

            def is_lower_i_or_j():
                return any([is_i(), is_j()])

            def is_merged_before():
                return labels[i + j + 1] == -1

            if all([
                    type_index == merge_percent,
                    is_percent_sign(),
                    is_merged_before()
            ]):
                labels[i + j + 1] = labels[i]
            elif all([
                    type_index == merge_tight_and_i_j,
                    is_tight_close() or is_lower_i_or_j(),
                    is_merged_before()
            ]):
                labels[i + j + 1] = labels[i]
            elif all([
                    type_index == merge_special_char,
                    is_special_char(),
                    is_merged_before()
            ]):
                labels[i + j + 1] = labels[i]
    for label_index in range(len(labels)):
        if labels[label_index] == -1:
            labels[label_index] = count
            count += 1
    merge_group = []
    for index in range(len(labels)):
        label = labels[index]
        if len(merge_group) < label + 1:
            merge_group.append(dst_group[index])
        else:
            merge_group[label] += dst_group[index]
    return merge_group, labels
# Example #5
0
def _merge_groups1(groups):
    label = 0
    merged_groups = [(label, groups[0])]
    if len(groups) == 1: return merged_groups
    print('len=', len(groups[1:]))
    for i, g in enumerate(groups[1:]):
        real_index = i + 1
        min_len = 0
        distance_of_left = min(
            get_distance_of_two_segments(s1, s2)
            for s1, s2 in product(groups[real_index - 1], g))
        max_len = 0
        for s1, s2 in product(groups[real_index - 1], g):
            gg = []
            if distance_of_left == get_distance_of_two_segments(s1, s2):
                gg.append(s1)
                gg.append(s2)
                min_len = min(get_line_length(gg))
                max_len = max(get_line_length(gg))

        last_element_label = merged_groups[-1][0]

        tight_close_threshold = 15
        big_ratio_threshold = 11
        small_ratio_threshold_of_colon = 1.1
        small_ratio_threshold = 1.2
        small_ratio_threshold_of_semicolon = 1.5
        close_threshold_of_i_and_j = 2.3
        close_threshold_of_percent = 4
        dot_lines_num = 6
        dot_lines_num_in_gdo = 4
        lower_i_without_dot = 1
        lower_j_without_dot = 6
        dot_of_percent = 12
        dot_of_oblique_line = 1
        lines_of_above_semicolon = 4
        lines_of_below_semicolon = 5
        lines_of_half_equals = 1
        lines_of_plus = 2
        lines_of_minus = 1

        def get_one_group_line_number_by_group(func, n):
            return func((len(merged_groups[-1][1]), len(g))) == n

        def get_smaller_group_line_number_by_group(n):
            return get_one_group_line_number_by_group(min, n)

        def get_larger_group_line_number_by_group(n):
            return get_one_group_line_number_by_group(max, n)

        def big_line_ration_with_long_and_short(ratio_threshold):
            return max_len > ratio_threshold * min_len

        def small_line_ration_with_long_and_short(ratio_threshold):
            return min_len > max_len / ratio_threshold

        def is_groups_are_close(threshold):
            return distance_of_left <= max_len / threshold

        def is_lower_i():
            return all([
                get_smaller_group_line_number_by_group(lower_i_without_dot),
                get_larger_group_line_number_by_group(dot_lines_num)
                or get_larger_group_line_number_by_group(dot_lines_num_in_gdo),
                big_line_ration_with_long_and_short(big_ratio_threshold),
                is_groups_are_close(close_threshold_of_i_and_j)
            ])

        def is_lower_j():
            return all([
                get_smaller_group_line_number_by_group(lower_j_without_dot),
                get_larger_group_line_number_by_group(dot_lines_num),
                big_line_ration_with_long_and_short(big_ratio_threshold),
                is_groups_are_close(close_threshold_of_i_and_j)
            ])

        def is_percent():
            return (
                (get_smaller_group_line_number_by_group(dot_of_oblique_line)
                 and get_larger_group_line_number_by_group(dot_of_percent)) or
                (get_smaller_group_line_number_by_group(dot_of_percent)
                 and get_larger_group_line_number_by_group(dot_of_percent +
                                                           dot_of_oblique_line)
                 )) and is_groups_are_close(close_threshold_of_percent)

        def is_colon():
            return all([
                get_smaller_group_line_number_by_group(dot_lines_num),
                get_larger_group_line_number_by_group(dot_lines_num),
                small_line_ration_with_long_and_short(
                    small_ratio_threshold_of_colon)
            ])

        def is_semicolon():
            return all([
                get_smaller_group_line_number_by_group(
                    lines_of_above_semicolon),
                get_larger_group_line_number_by_group(
                    lines_of_below_semicolon),
                small_line_ration_with_long_and_short(
                    small_ratio_threshold_of_semicolon)
            ])

        def is_equals():
            return all([
                get_smaller_group_line_number_by_group(lines_of_half_equals),
                get_larger_group_line_number_by_group(lines_of_half_equals),
                small_line_ration_with_long_and_short(small_ratio_threshold),
                distance_of_left <= max_len / 1.2
            ])

        def is_plus_minus():
            return all([
                get_smaller_group_line_number_by_group(lines_of_plus),
                get_larger_group_line_number_by_group(lines_of_minus),
                small_line_ration_with_long_and_short(small_ratio_threshold)
            ])

        def is_tight_close():
            return distance_of_left <= min_len / tight_close_threshold

        if any([
                is_tight_close(),
                is_lower_i(),
                is_lower_j(),
                is_percent(),
                is_colon(),
                is_semicolon(),
                is_equals(),
                is_plus_minus()
        ]) and max_len < 0.1:
            merged_groups.append((last_element_label, g))
        else:
            merged_groups.append((last_element_label + 1, g))

    assert len(merged_groups) == len(groups)

    return merged_groups
# Example #6
0
def _merge_groups_according_to_position2(groups):
    labels = [-1 for i in range(len(groups))]
    dst_group = groups
    count = 0
    for i in range(len(dst_group) - 30):
        if i != 0 and labels[i] != -1:
            continue
        labels[i] = count
        count += 1
        group1 = dst_group[i]
        remain_group_list = dst_group[i + 1:i + 30]
        for j, group2 in enumerate(remain_group_list):
            distance_of_left = min(
                get_distance_of_two_segments(s1, s2)
                for s1, s2 in product(group1, group2))
            if distance_of_left > 0.1:
                continue
            min_len = min(get_line_length(group1 + group2))
            max_len = max(get_line_length(group1 + group2))
            bounding_box1 = get_bounding_box_list(group1)
            bounding_box2 = get_bounding_box_list(group2)
            bounding_box12 = get_bounding_box_list(group1 + group2)
            x1, y1 = perpendicular_of_box(bounding_box1)
            x2, y2 = perpendicular_of_box(bounding_box2)
            (min_x1, min_y1), (max_x1, max_y1) = get_bounding_box_list(group1)
            (min_x2, min_y2), (max_x2, max_y2) = get_bounding_box_list(group2)

            def is_tight_close():
                return distance_of_left < 0.0001

            def is_i():
                return max_len / 3 < distance_of_left < max_len / 2 and max_len > 11 * min_len and abs(
                    x1 - x2) < 0.01 and \
                       min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 4 and max_len < 0.1

            def is_j():
                return max_len / 3 < distance_of_left < max_len / 2 and max_len > 11 * min_len and \
                       min(len(group1), len(group2)) == 4 and max(len(group1), len(group2)) == 4 and max_len < 0.1 \
                       and max_x1 == max_x2

            def size_of_bounding_box(bounding_box):
                (min_x, min_y), (max_x, max_y) = bounding_box
                return abs(max_x - min_x) * abs(max_y - min_y)

            def is_colon():
                return abs(x1 - x2) == 0 and distance_of_left < 0.1 and \
                       min(len(group1), len(group2)) == 4 and max(len(group1), len(group2)) == 4 and \
                       max_len < 0.1 and size_of_bounding_box(bounding_box12) < \
                       10 * min(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2))

            def is_semicolon():
                return abs(x1 - x2) == 0 and min_len > max_len / 2 and distance_of_left < 0.1 and \
                       min(len(group1), len(group2)) == 4 and max(len(group1), len(group2)) == 7 and \
                       size_of_bounding_box(bounding_box12) < 15 * min(size_of_bounding_box(bounding_box1),
                                                                       size_of_bounding_box(bounding_box2))

            def is_exclamation_mark():
                return min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 12 and \
                       distance_of_left < max_len / 2 and abs(x1 - x2) < 0.01

            def is_equals():
                return min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 1 \
                       and min_len > max_len / 1.2 and distance_of_left < min_len / 2 and abs(x1 - x2) < 0.01

            def is_minus_and_plus():
                return min(len(group1), len(group2)) == 1 and max(len(group1), len(group2)) == 2 \
                       and min_len > max_len / 1.2 and distance_of_left < min_len / 5

            def is_special_char():
                return any([
                    is_colon(),
                    is_semicolon(),
                    is_exclamation_mark(),
                    is_equals(),
                    is_minus_and_plus()
                ])

            def is_lower_i_or_j():
                return any([is_i(), is_j()])

            if is_special_char():
                if labels[i + j + 1] == -1:
                    labels[i + j + 1] = labels[i]
    for label_index in range(len(labels)):
        if labels[label_index] == -1:
            labels[label_index] = count
            count += 1
    merge_group = []
    for index in range(len(labels)):
        label = labels[index]
        if len(merge_group) < label + 1:
            merge_group.append(dst_group[index])
        else:
            merge_group[label] += dst_group[index]
    return merge_group, labels
def _merge_groups1(groups):
    label = 0
    merged_groups = [(label, groups[0])]
    if len(groups) == 1: return merged_groups
    print('len=', len(groups[1:]))
    for i, g in enumerate(groups[1:]):
        real_index = i + 1
        min_len = 0  # min(getlinelength(g))
        distance_of_left = min(get_distance_of_two_segments(s1, s2)
                               for s1, s2 in product(groups[real_index - 1], g)
                               )
        max_len = 0
        for s1, s2 in product(groups[real_index - 1], g):
            gg = []
            if distance_of_left == get_distance_of_two_segments(s1, s2):
                gg.append(s1)
                gg.append(s2)
                min_len = min(get_line_length(gg))
                max_len = max(get_line_length(gg))

        last_element_label = merged_groups[-1][0]

        tight_close_threshold = 15
        big_ratio_threshold = 11
        small_ratio_threshold_of_colon = 1.1
        small_ratio_threshold = 1.2
        small_ratio_threshold_of_semicolon = 1.5
        close_threshold_of_i_and_j = 2.3
        close_threshold_of_percent = 4
        dot_lines_num = 6
        dot_lines_num_in_gdo = 4
        lower_i_without_dot = 1
        lower_j_without_dot = 6
        dot_of_percent = 12
        dot_of_oblique_line = 1
        lines_of_above_semicolon = 4
        lines_of_below_semicolon = 5
        lines_of_half_equals = 1
        lines_of_plus = 2
        lines_of_minus = 1

        lines_of_index = merged_groups[-1][1]
        bounding_box1 = get_bounding_box_list(lines_of_index)
        bounding_box2 = get_bounding_box_list(g)
        bounding_box12 = get_bounding_box_list(lines_of_index + g)

        # print(bounding_box1, bounding_box2, bounding_box12)

        def size_of_bounding_box(bounding_box):
            (min_x, min_y), (max_x, max_y) = bounding_box
            return (max_x - min_x) * (max_y - min_y)

        def perpendicular_of_box(bounding_box):
            (min_x, min_y), (max_x, max_y) = bounding_box
            p_x = (min_x + max_x) / 2
            p_y = (min_y + max_y) / 2
            return p_x, p_y

        def is_low_i_or_colon_equals_minus_and_plus():
            (min_x, min_y), (max_x, max_y) = bounding_box1
            (min_x1, min_y1), (max_x1, max_y1) = bounding_box2
            min_size = min(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2))
            max_size = max(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2))
            merge_size = size_of_bounding_box(bounding_box12)
            x1, y1 = perpendicular_of_box(bounding_box1)
            x2, y2 = perpendicular_of_box(bounding_box2)
            span = max(max_x - min_x, max_x1 - min_x1)

            def is_perpendicular():
                return (abs(x1 - x2) < span) or (abs(y1 - y2) < span)

            return is_perpendicular() and distance_of_left < max_len / 2 \
                   and min((len(merged_groups[-1][1]), len(g))) == 1 \
                # and max((len(merged_groups[-1][1]), len(g))) == 4

        #     """
        #     ":"
        #     if is_perpendicular() \
        #             and min_len < max_len / 1.1 and merge_size > 10 * max_size:
        #         return True
        #
        #     "="
        #     if is_perpendicular() \
        #             and min_len < max_len / 1.2 and max((len(merged_groups[-1][1]), len(g))) == 1:
        #         return True
        #     ";"
        #     if is_perpendicular() \
        #             and min_len < max_len / 1.5 and merge_size > 5 < min_size:
        #         return True
        #     "+_"
        #     if is_perpendicular() \
        #             and min_len < max_len / 1.2 and min((len(merged_groups[-1][1]), len(g))) == 1\
        #             and max((len(merged_groups[-1][1]), len(g))) == 2:
        #         return True
        # """

        def is_colon_or_i():
            min_size = min(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2))
            max_size = max(size_of_bounding_box(bounding_box1), size_of_bounding_box(bounding_box2))
            merge_size = size_of_bounding_box(bounding_box12)
            if min_size < math.e ** -16:
                return False
            if merge_size < 10 * min_size and min_size < 0.0001:
                print(bounding_box1, bounding_box2)
                return True
            return False

        def get_one_group_line_number_by_group(func, n):
            return func((len(merged_groups[-1][1]), len(g))) == n

        def get_smaller_group_line_number_by_group(n):
            return get_one_group_line_number_by_group(min, n)

        def get_larger_group_line_number_by_group(n):
            return get_one_group_line_number_by_group(max, n)

        def big_line_ration_with_long_and_short(ratio_threshold):
            return max_len > ratio_threshold * min_len

        def small_line_ration_with_long_and_short(ratio_threshold):
            return min_len > max_len / ratio_threshold

        def is_groups_are_close(threshold):
            return distance_of_left <= max_len / threshold

        def is_lower_i():
            return all([get_smaller_group_line_number_by_group(lower_i_without_dot),
                        get_larger_group_line_number_by_group(dot_lines_num)
                        or get_larger_group_line_number_by_group(dot_lines_num_in_gdo),
                        big_line_ration_with_long_and_short(big_ratio_threshold),
                        is_groups_are_close(close_threshold_of_i_and_j)])

        def is_lower_i_gdo():
            return all([
                distance_of_left < max_len
            ])

        def is_colon_gdo():
            return all([get_smaller_group_line_number_by_group(dot_lines_num_in_gdo),
                        get_larger_group_line_number_by_group(dot_lines_num_in_gdo),
                        distance_of_left < 0.2,
                        small_line_ration_with_long_and_short(small_ratio_threshold_of_colon)])

        def is_lower_j():
            return all([get_smaller_group_line_number_by_group(lower_j_without_dot),
                        get_larger_group_line_number_by_group(dot_lines_num),
                        big_line_ration_with_long_and_short(big_ratio_threshold),
                        is_groups_are_close(close_threshold_of_i_and_j)])

        def is_percent():
            return ((get_smaller_group_line_number_by_group(dot_of_oblique_line) and
                     get_larger_group_line_number_by_group(dot_of_percent)) or (
                            get_smaller_group_line_number_by_group(dot_of_percent) and
                            get_larger_group_line_number_by_group(dot_of_percent + dot_of_oblique_line)
                    )) and is_groups_are_close(close_threshold_of_percent)

        def is_colon():
            return all([get_smaller_group_line_number_by_group(dot_lines_num),
                        get_larger_group_line_number_by_group(dot_lines_num),
                        small_line_ration_with_long_and_short(small_ratio_threshold_of_colon),
                        distance_of_left > 10 * max_len])

        def is_semicolon():
            return all([get_smaller_group_line_number_by_group(lines_of_above_semicolon),
                        get_larger_group_line_number_by_group(lines_of_below_semicolon),
                        small_line_ration_with_long_and_short(small_ratio_threshold_of_semicolon)])

        def is_equals():
            return all([get_smaller_group_line_number_by_group(lines_of_half_equals),
                        get_larger_group_line_number_by_group(lines_of_half_equals),
                        small_line_ration_with_long_and_short(small_ratio_threshold),
                        distance_of_left <= max_len / 1.2])

        def is_plus_minus():
            return all([get_smaller_group_line_number_by_group(lines_of_plus),
                        get_larger_group_line_number_by_group(lines_of_minus),
                        small_line_ration_with_long_and_short(small_ratio_threshold)])

        def is_tight_close():
            return distance_of_left <= min_len / tight_close_threshold

        if any([is_tight_close(), is_lower_i(), is_lower_j(), is_percent(),
                is_colon(), is_semicolon(), is_equals(), is_plus_minus()]) and max_len < 0.1:
            # if is_low_i_or_colon_equals_minus_and_plus():
            # print('hhhhh')
            merged_groups.append((last_element_label, g))
        else:
            merged_groups.append((last_element_label + 1, g))

    assert len(merged_groups) == len(groups)

    return merged_groups
def _merge_groups_according_to_position(groups):
    label = 0
    dst_group = copy.deepcopy(groups)
    merged_groups = [(label, dst_group[0])]
    if len(groups) == 1: return merged_groups
    "merge approach box lines,dynamic dst_group"
    labels = [0 for i in range(len(dst_group))]
    for i in range(len(dst_group) - 1):
        if labels[i] == 0:
            labels[i] = label[i - 1]
        if i >= len(dst_group) - 20:
            break
        g = dst_group[i]
        new_list = dst_group[i + 1:i + 20]
        minimum_distance = min(dist_of_box(line_box1, g)
                               for line_box1 in new_list)
        line_combination = []
        for item in new_list:
            for item1 in item:
                for item2 in g:
                    line_combination.append((item1, item2))
        distance_of_left = min(get_distance_of_two_segments(s1, s2)
                               for s1, s2 in line_combination)

        for j, line_box1 in enumerate(new_list):
            if True:
                bounding_box1 = get_bounding_box_list(line_box1)
                bounding_box2 = get_bounding_box_list(g)
                dist = dist_of_box(line_box1, g)

                def perpendicular_of_box(bounding_box):
                    (min_x, min_y), (max_x, max_y) = bounding_box
                    p_x = (min_x + max_x) / 2
                    p_y = (min_y + max_y) / 2
                    return p_x, p_y

                x1, y1 = perpendicular_of_box(bounding_box1)
                x2, y2 = perpendicular_of_box(bounding_box2)
                (min_x, min_y), (max_x, max_y) = bounding_box1
                (min_x1, min_y1), (max_x1, max_y1) = bounding_box2
                span = max(max_x - min_x, max_x1 - min_x1)

                def is_perpendicular():
                    return (abs(x1 - x2) < span) or (abs(y1 - y2) < span)

                min_len = min(get_line_length(line_box1 + g))
                max_len = max(get_line_length(line_box1 + g))

                # if is_merged(line_box1, g):is_perpendicular() and
            if \
                    min((len(line_box1)), len(g)) == 1 and \
                            max_len > 10 * min_len and dist < 0.1 and \
                            max((len(line_box1)), len(g)) == 6:
                print('hhhhh')
                print(len(line_box1), len(g))
                labels[i + j] = label[i]
            else:
                pass
    "get merged label"
    merged_groups = [(label, dst_group[0])]
    for i, g in enumerate(dst_group[1:]):
        index = i + 1
        merged_groups.append((labels[index], g))
    return merged_groups