def remove_lines(lines, filtered_lines, scores):
    """Run ``cut_lines`` on the horizontal and vertical line sets.

    Each argument is indexed per axis: entry 0 is used for the horizontal
    pass and entry 1 for the vertical pass. The number of lines produced
    for each axis is recorded with ``scorer`` under 'new_horiz_lines' and
    'new_vert_lines'.

    Returns:
        A ``(new_horiz_lines, new_vert_lines)`` tuple holding the two
        ``cut_lines`` results.
    """
    # Horizontal (axis 0) is processed before vertical (axis 1).
    trimmed_horiz, trimmed_vert = [
        cut_lines(lines[axis], filtered_lines[axis], scores[axis])
        for axis in (0, 1)
    ]

    scorer.add_score('new_horiz_lines', len(trimmed_horiz))
    scorer.add_score('new_vert_lines', len(trimmed_vert))

    return (trimmed_horiz, trimmed_vert)
def get_structure(boxes, lines):
    """Derive the sorted row and column structure for a set of boxes.

    The boxes are clustered by ``rate_combinations``, each set of clusters
    is converted with ``translate_clusters``, and the results are ordered:
    rows by elements 1 then 7, columns by elements 0 then 6. The counts are
    recorded with ``scorer`` as 'initial_rows' and 'initial_cols'.

    NOTE: earlier experiments (clustering with ``cluster_boxes`` and merging
    with ``combine_overlapping_neighbors``) are currently disabled here.

    Returns:
        A ``(sorted_rows, sorted_cols)`` tuple of lists.
    """
    row_clusters, col_clusters = rate_combinations(boxes, lines)

    rows = translate_clusters(row_clusters)
    cols = translate_clusters(col_clusters)

    # Presumably elements 1/7 and 0/6 are the leading and trailing
    # coordinates of a translated cluster -- TODO confirm against
    # translate_clusters.
    def row_order(row):
        return (row[1], row[7])

    def col_order(col):
        return (col[0], col[6])

    ordered_rows = list(rows)
    ordered_rows.sort(key=row_order)

    ordered_cols = list(cols)
    ordered_cols.sort(key=col_order)

    scorer.add_score('initial_rows', len(ordered_rows))
    scorer.add_score('initial_cols', len(ordered_cols))

    return (ordered_rows, ordered_cols)
def _box_edges(box):
    """Return the edge coordinates of a box given as (x, y, width, height)."""
    return {
        'left': box[0],
        'right': box[0] + box[2],
        'top': box[1],
        'bottom': box[1] + box[3],
    }


def rate_combinations(boxes, lines):
    """Score every pair of boxes for row/column affinity, then cluster them.

    For each unordered pair of boxes a set of row features and a set of
    column features is computed (centre alignment, edge alignment, nearby
    lines, overlap). The features are combined by ``calculate_row_score``
    and ``calculate_col_score`` and stored in two symmetric score matrices
    whose diagonal is 1.0. The matrices are clustered by
    ``clusterer.newer_cluster_scores`` and the number of clusters is
    recorded with ``scorer`` as 'cluster_rows' and 'cluster_cols'.

    Args:
        boxes: Sequence of boxes. Elements 0-3 of each box are read as
            left, top, width and height.
        lines: Indexable whose element 0 holds the horizontal lines and
            element 1 the vertical lines.

    Returns:
        A ``(row_cluster_boxes, col_cluster_boxes)`` tuple. Each entry is a
        list of clusters and each cluster is a list of the original box
        objects from ``boxes``.
    """
    box_count = len(boxes)
    row_score_matrix = [[1.0] * box_count for _ in range(box_count)]
    col_score_matrix = [[1.0] * box_count for _ in range(box_count)]

    horiz_lines = lines[0]
    vert_lines = lines[1]

    for (i, raw_1), (j, raw_2) in combinations(enumerate(boxes), 2):
        row_scores = {}
        col_scores = {}

        box_1 = _box_edges(raw_1)
        box_2 = _box_edges(raw_2)

        # 1.) Their vertical (horizontal) centers align
        # May want to cut the factor down to 1.0 to make it a max of 1.0
        row_scores['center_align'] = 2.0 / (1.0 + abs(box_1['top'] + box_1['bottom'] - box_2['top'] - box_2['bottom']))
        col_scores['center_align'] = 2.0 / (1.0 + abs(box_1['left'] + box_1['right'] - box_2['left'] - box_2['right']))

        # 2.) Their left (top) edges align
        row_scores['top_align'] = 1.0 / (1.0 + abs(box_1['top'] - box_2['top']))
        col_scores['left_align'] = 1.0 / (1.0 + abs(box_1['left'] - box_2['left']))

        # 3.) Their right (bottom) edges align
        row_scores['bottom_align'] = 1.0 / (1.0 + abs(box_1['bottom'] - box_2['bottom']))
        col_scores['right_align'] = 1.0 / (1.0 + abs(box_1['right'] - box_2['right']))

        # 4.) If there is a line close to their left (above them)
        row_scores['top_line'] = calculate_preceding_line_score(box_1['top'], box_2['top'], horiz_lines)
        col_scores['left_line'] = calculate_preceding_line_score(box_1['left'], box_2['left'], vert_lines)

        # 5.) If there is a line close to their right (below them)
        row_scores['bottom_line'] = calculate_succeeding_line_score(box_1['bottom'], box_2['bottom'], horiz_lines)
        col_scores['right_line'] = calculate_succeeding_line_score(box_1['right'], box_2['right'], vert_lines)

        # 6.) They overlap significantly in their horizontal (vertical) range
        row_scores['vert_overlap'] = calculate_vertical_overlap(box_1, box_2)
        col_scores['horiz_overlap'] = calculate_horizontal_overlap(box_1, box_2)

        # 7.) I would like to add in a term regarding a shared strong score
        # with a third object

        row_score = calculate_row_score(row_scores)
        col_score = calculate_col_score(col_scores)

        # The pair score is symmetric, so fill both halves of each matrix.
        row_score_matrix[i][j] = row_score
        row_score_matrix[j][i] = row_score
        col_score_matrix[i][j] = col_score
        col_score_matrix[j][i] = col_score

    # Might want to do 0.999 later
    row_clusters = clusterer.newer_cluster_scores(row_score_matrix, 1.0)
    col_clusters = clusterer.newer_cluster_scores(col_score_matrix, 1.0)

    scorer.add_score('cluster_rows', len(row_clusters))
    scorer.add_score('cluster_cols', len(col_clusters))

    # Now translate the clusters of indexes into clusters of boxes
    row_cluster_boxes = [
        [boxes[box_index] for box_index in row] for row in row_clusters
    ]
    col_cluster_boxes = [
        [boxes[box_index] for box_index in col] for col in col_clusters
    ]

    return (row_cluster_boxes, col_cluster_boxes)
def get_lines(img_name, base_path):
    """Detect horizontal and vertical line segments in an image.

    The image at ``base_path + '/' + img_name`` is read, converted to
    grayscale, passed through Canny edge detection and then through the
    probabilistic Hough transform. Segments whose two endpoints share a y
    coordinate are kept as horizontal lines, segments whose endpoints share
    an x coordinate as vertical lines; anything else is discarded (and
    reported when the module-level ``verbose`` flag is truthy).

    Several counts derived from the results are recorded with ``scorer``:
    'horiz_lines' / 'vert_lines' (the raw counts),
    'line_outside_rows' / 'line_outside_cols' (count - 1) and
    'line_inside_rows' / 'line_inside_cols' (count + 1).

    Args:
        img_name: File name of the image.
        base_path: Directory that contains the image.

    Returns:
        A ``(horiz_lines, vert_lines)`` tuple of lists. Every line is a
        dict with the keys 'border' (the shared coordinate: y for
        horizontal lines, x for vertical lines), 'start' and 'end' (the
        first and second endpoint along the line's direction).
    """
    # NOTE(review): per the OpenCV docs imread signals an unreadable file by
    # returning None instead of raising; cvtColor below would then fail.
    img = cv2.imread(base_path + '/' + img_name)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    # 120, 20, 10 is good. Also 80, 20, 1
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 120, minLineLength=40, maxLineGap=2)
    if lines is None:
        # Nothing detected: fall through with empty results.
        lines = []

    horiz_lines = []
    vert_lines = []

    for info in lines:
        x1, y1, x2, y2 = info[0]

        if abs(y1 - y2) < 0.1:
            # This is a horizontal line
            horiz_lines.append({
                'border': int((y1 + y2) / 2),
                'start': x1,
                'end': x2,
            })
        elif abs(x1 - x2) < 0.1:
            # This is a vertical line
            vert_lines.append({
                'border': int((x1 + x2) / 2),
                'start': y1,
                'end': y2,
            })
        elif verbose:
            # Report the endpoints; the segment has no rho/theta form here.
            print('Nonstandard line: ' + str((int(x1), int(y1), int(x2), int(y2))))

    scorer.add_score('horiz_lines', len(horiz_lines))
    scorer.add_score('vert_lines', len(vert_lines))

    scorer.add_score('line_outside_rows', len(horiz_lines) - 1)
    scorer.add_score('line_outside_cols', len(vert_lines) - 1)

    scorer.add_score('line_inside_rows', len(horiz_lines) + 1)
    scorer.add_score('line_inside_cols', len(vert_lines) + 1)

    return (horiz_lines, vert_lines)