def line_word_height(line):
    """Return the average pixel height of the OCR words in an hOCR line.

    Parameters
    ----------
    line : bs4.Tag
        An hOCR ``ocr_line`` span whose children are ``ocrx_word`` spans,
        each carrying a ``title`` attribute with a bounding box that
        ``helpers.extractbbox`` can parse.

    Returns
    -------
    float | int
        ``np.nanmean`` of the word heights (y2 - y1), or 0 if the line
        contains no words.
    """
    words = line.find_all('span', 'ocrx_word')
    if not words:
        return 0
    # Each word's height comes straight from its hOCR bounding box.
    word_heights = [
        bbox['y2'] - bbox['y1']
        for bbox in (helpers.extractbbox(word.get('title')) for word in words)
    ]
    return np.nanmean(word_heights)
def extract_tables(document_path=None):
    """Run the full table-extraction pipeline for one document.

    Parameters
    ----------
    document_path : str | TableExtractConfig
        Either a document path (a ``TableExtractConfig`` is constructed
        from it) or an already-built config object.

    Raises
    ------
    ValueError
        If ``document_path`` is neither a string nor a ``TableExtractConfig``.

    Side effects: reads hOCR files and an optional text layer from disk,
    writes annotated hOCR files, optionally writes detection plots and
    extracted table images, and prints progress to stdout.
    """
    if isinstance(document_path, str):
        config = TableExtractConfig(document_path)
    elif isinstance(document_path, TableExtractConfig):
        config = document_path
    else:
        raise ValueError(
            f"Illegal parameter. {document_path} is not a valid document path")

    config.make_subdirs()
    page_paths = config.hocr_files()

    # Check if a native text layer is available and load it
    text_layer = ''
    has_text_layer = False
    text_layer_path = config.text_layer_path()
    # A size of <= 1 byte is treated as "no usable text layer".
    if os.path.exists(
            text_layer_path) and os.path.getsize(text_layer_path) > 1:
        with open(text_layer_path) as t:
            text_layer = t.read()
        has_text_layer = True
    else:
        print('Does not have text layer')

    pages = []
    # TODO use a threadpool to do this in parallel.
    for page_no, page in enumerate(page_paths):
        # Read in each tesseract page with BeautifulSoup so we can look at
        # the document holistically
        with open(page) as hocr:
            text = hocr.read()
            soup = BeautifulSoup(text, 'html.parser')
            # TODO use a named tuple for this purpose.
            pages.append({
                # Page number is derived from the filename, e.g. "page_3.html"
                'page_no':
                page.split('/')[-1].replace(f'.{config.hocr_ext}',
                                            '').replace('page_', ''),
                'soup':
                soup,
                # Bounding box of the whole page
                'page':
                helpers.extractbbox(
                    soup.find_all('div', 'ocr_page')[0].get('title')),
                # Per-area summary statistics (see area_summary)
                'areas': [
                    area_summary(area)
                    for area in soup.find_all('div', 'ocr_carea')
                ],
                'lines': [line for line in soup.find_all('span', 'ocr_line')]
            })
            # Record the OCR-identified text if a native text layer was
            # unavailable
            if not has_text_layer:
                text_layer += soup.getText()

    # Attempt to identify all charts/tables/etc in the paper by looking at
    # the text layer. i.e. It is useful for us to know if the text mentions
    # "see table 4", because if the caption for table 4 is distorted in the
    # text layer ("teble 4", for example), we can still guess that it is
    # table 4 because of it's position in the document and our prior
    # knowledge that a table 4 exists.
    # NOTE(review): the second replace is a no-op as written — it likely
    # was meant to collapse doubled spaces ('  ' -> ' '); confirm against
    # the original source.
    text_layer = text_layer.strip().replace('\n', ' ').replace(' ',
                                                               ' ').lower()
    figures = []
    for result in re.findall(
            '(table|figure|fig|map|appendix|app|appx|tbl)(\.)? (\d+)(\.)?',
            text_layer,
            flags=re.IGNORECASE):
        # Normalize each match: drop periods, canonicalize "figure" -> "fig",
        # collapse whitespace, lowercase. e.g. ('Table', '.', '4', '') ->
        # 'table 4'.
        figures.append(' '.join(' '.join(result).replace('.', '').replace(
            'figure', 'fig').split()).lower())

    # Clean up the list of figures/tables/etc
    figures = sorted(set(figures))
    figure_idx = {}
    for fig in figures:
        parts = fig.split(' ')
        # Need to try/except because often times the "number" is actually a
        # string that cannot be parsed into an integer
        # NOTE(review): the bare except also hides any other error here —
        # narrowing to ValueError would be safer; left as-is.
        if parts[0] in figure_idx:
            try:
                figure_idx[parts[0]].append(int(parts[1]))
            except:
                continue
        else:
            try:
                figure_idx[parts[0]] = [int(parts[1])]
            except:
                continue

    # Clean up for reformat
    for key in figure_idx:
        figure_idx[key] = helpers.clean_range(sorted(set(figure_idx[key])))

    # map/reduce: flatten the per-page area summaries into one list
    page_areas = [page['areas'] for page in pages]
    area_stats = [area for areas in page_areas for area in areas]

    # Calculate summary stats for the document from all areas identified by
    # Tesseract
    doc_stats = summarize_document(area_stats)

    # Classify and assign a table score to each area in each page
    pages = [classify_areas(page, doc_stats) for page in pages]

    # Identify the areas that classified as 'text block's and record their
    # widths
    text_block_widths = []
    for page in pages:
        for area in page['areas']:
            if area['type'] == 'text block':
                text_block_widths.append(area['x2'] - area['x1'])

    # Calculate stats about the text blocks in the whole document.
    # First get rid of outliers: keep widths within +/- 2 std of the median.
    two_sigma = [
        val for val in text_block_widths
        if val > (np.nanmedian(text_block_widths) -
                  (np.nanstd(text_block_widths) * 2)) and val <
        (np.nanmedian(text_block_widths) + (np.nanstd(text_block_widths) * 2))
    ]

    # Update doc stats, then reclassify
    doc_stats['text_block_median'] = np.nanmedian(two_sigma)
    doc_stats['text_block_std'] = np.nanstd(two_sigma)

    # Reclassify all areas based on the stats of the whole document
    for page in pages:
        for area in page['areas']:
            width = area['x2'] - area['x1']
            # Not a text block if it's width is outside of 2 sigma
            if area['type'] == 'text block' and (
                    width < doc_stats['text_block_median'] -
                (2 * doc_stats['text_block_std'])
                    or width > doc_stats['text_block_median'] +
                (2 * doc_stats['text_block_std'])):
                area['type'] = 'other'

    # Most documents only contain one page height, but others mix landscape
    # and portrait pages. Figure out which is the most common.
    doc_stats['page_height'] = np.bincount(
        [page['page']['y2'] - page['page']['y1'] for page in pages]).argmax()
    doc_stats['page_width'] = np.bincount(
        [page['page']['x2'] - page['page']['x1'] for page in pages]).argmax()

    # Find out if a header or footer is present in the document - make sure
    # we don't include them in extracts
    doc_stats['header'], doc_stats['footer'] = helpers.get_header_footer(
        pages, doc_stats['page_height'], doc_stats['page_width'])

    new_page_areas = [{
        'page_no': page['page_no'],
        'areas': helpers.reclassify_areas(page['areas'],
                                          doc_stats['line_height_avg'] / 2)
    } for page in pages]

    # Re-key the reclassified areas by page number, then attach to each new
    # area the ocr_lines whose bounding boxes intersect its geometry.
    new_pages = {}
    for page in new_page_areas:
        new_pages[page['page_no']] = {'areas': page['areas']}
    for page in pages:
        for ai, area in enumerate(new_pages[page['page_no']]['areas']):
            new_pages[page['page_no']]['areas'][ai]['lines'] = [
                line for line in page['soup'].find_all('span', 'ocr_line')
                if helpers.rectangles_intersect(
                    area['geom'], helpers.extractbbox(line.get('title')))
            ]

    # NOTE(review): this loop's result is unused unless the plotting call
    # below is re-enabled — presumably kept for debugging.
    for page in pages:
        new_areas = helpers.reclassify_areas(page['areas'],
                                             doc_stats['line_height_avg'] / 2)
        # helpers.plot_new_areas(page['page_no'], new_areas)

    # Store table scores and type of the areas in the hocr files
    store_table_metadata_in_soup(pages)
    write_table_metadata_to_hocr_files(pages,
                                       config.document_path,
                                       subdir=config.subdir_hocr_ts)
    print("Completed writing hocr files")

    # Plot table detection
    # FIXME make this optional.
    if config.is_writing_table_extract_boxes:
        plot_table_detection(pages,
                             config.document_path,
                             sub_dir=config.subdir_table_extract_boxes)

    doc_stats['found_tables'] = figure_idx
    print('these tables were found --')
    for ttype in figure_idx:
        print(' ', ttype, figure_idx[ttype])

    for page in pages:
        page_extracts = process_page(doc_stats, page)
        # Disambiguate duplicate extract names by appending '*'.
        found = []
        for e in page_extracts:
            if e['name'] in found:
                e['name'] = e['name'] + '*'
            found.append(e['name'])

        # DEBUG
        # if page['page_no'] == '5':
        #     for idx, area in enumerate(page['areas']):
        #         print 'Area %s -- %s (%s)' % (idx, area['type'], area['table_score'])
        #         print '    Lines: %s' % (area['lines'], )
        #         print '    Words: %s' % (area['words'], )
        #         print '    Area: %s' % (area['area'], )
        #         print '    Word separation index: %s' % ('%.2f' % area['word_separation_index'], )
        #         print '    Word height index: %s' % ('%.2f' % area['word_height_index'], )
        #         print '    Word height avg: %s' % ('%.2f' % area['word_height_avg'], )
        #         print '    Area covered by words: %s%%' % (int(area['word_area_index'] * 100), )
        #         print '    Average word height: %s' % ('%.2f' % area['word_height_avg'])
        #         print '    Gaps: %s' % (area['gaps'])
        #         print '    Line height average: %s' % (np.nanmean(area['line_heights']))
        # plot(page['soup'], page_extracts)

        if config.is_extracting_tables:
            for table in page_extracts:
                # FIXME Make this optional.
                helpers.extract_table(config.document_path, page['page_no'],
                                      table)
def process_page(doc_stats, page):
    """Find table extracts (bounding boxes + names) on a single page.

    Parameters
    ----------
    doc_stats : dict
        Document-wide statistics (word height averages/stds, etc.) produced
        by summarize_document and augmented in extract_tables.
    page : dict
        A page record as built in extract_tables: keys include 'areas'
        (area_summary dicts), 'lines' (ocr_line tags), 'page' (page bbox),
        'soup', and 'page_no'.

    Returns
    -------
    list[dict]
        Extract bounding boxes with 'name', 'direction', 'indicator_line',
        and padded x1/y1/x2/y2 coordinates.
    """

    def find_above_and_below(extract):
        # For a candidate extract, bucket every page area by its position
        # relative to the extract: above/below (x-overlap >= 90%) or
        # left/right (y-overlap >= 90%).
        out = {'above': [], 'below': [], 'left': [], 'right': []}
        for area_idx, area in enumerate(page['areas']):
            # Check if they overlap in x space
            if area['x1'] <= extract['x2'] and extract['x1'] <= area['x2']:
                # Check how *much* they overlap in x space
                # Number of pixels area overlaps with current extract extent
                overlap = max([
                    0,
                    abs(
                        min([area['x2'], extract['x2']]) -
                        max([extract['x1'], area['x1']]))
                ])
                area_length = area['x2'] - area['x1']
                percent_overlap = float(
                    overlap) / area_length if area_length else 0
                # If the area overlaps more than 90% in x space with the
                # target area
                if percent_overlap >= 0.9:
                    # Check if this area is above or below the extract area
                    area_centroid = helpers.centroid(area)
                    extract_centroid = helpers.centroid(extract)
                    # If it is above
                    if area_centroid['y'] <= extract_centroid['y']:
                        # Work backwards so that when we iterate we start at
                        # the area closest to the extract
                        out['above'].insert(0, area_idx)
                    # If below
                    else:
                        out['below'].append(area_idx)
            # Check if they overlap in y space
            elif area['y1'] <= extract['y2'] and extract['y1'] <= area['y2']:
                overlap = max([
                    0,
                    abs(
                        min([area['y2'], extract['y2']]) -
                        max([extract['y1'], area['y1']]))
                ])
                area_length = area['y2'] - area['y1']
                # NOTE(review): unlike the x branch above, this division is
                # not guarded against area_length == 0.
                percent_overlap = float(overlap) / area_length
                if percent_overlap >= 0.9:
                    area_centroid = helpers.centroid(area)
                    extract_centroid = helpers.centroid(extract)
                    if area_centroid['x'] <= extract_centroid['x']:
                        out['left'].insert(0, area_idx)
                    else:
                        out['right'].append(area_idx)
        return out

    def expand_extraction(extract_idx, props):
        # Grow extracts[extract_idx] into neighboring areas (tables, captions,
        # lines, small-type text) as long as doing so would not collide with
        # any other extract. Mutates `extracts` in place; reads
        # `extract_relations` from the enclosing scope.
        # NOTE(review): the `props` parameter is unused — the relations are
        # re-read from extract_relations[extract_idx] instead.
        # Iterate on above and below areas for each extract
        for direction, areas in extract_relations[extract_idx].items():
            stopped = False
            for area_idx in extract_relations[extract_idx][direction]:
                # Iterate on all other extracts, making sure that extending
                # the current one won't run into any of the others
                for extract_idx2, props2 in extract_relations.items():
                    if extract_idx != extract_idx2:
                        will_intersect = helpers.rectangles_intersect(
                            extracts[extract_idx2],
                            helpers.enlarge_extract(extracts[extract_idx],
                                                    page['areas'][area_idx]))
                        if will_intersect:
                            stopped = True
                            continue
                if stopped:
                    continue
                if page['areas'][area_idx][
                        'type'] == 'possible table' and direction == extracts[
                            extract_idx]['direction']:
                    #print 'extend', extracts[extract_idx]['name'], 'into possible table'
                    extracts[extract_idx].update(
                        helpers.enlarge_extract(extracts[extract_idx],
                                                page['areas'][area_idx]))
                elif page['areas'][area_idx]['type'] == 'caption':
                    extracts[extract_idx].update(
                        helpers.enlarge_extract(extracts[extract_idx],
                                                page['areas'][area_idx]))
                elif page['areas'][area_idx]['type'] == 'table':
                    #print 'extend', extracts[extract_idx]['name'], 'into table'
                    extracts[extract_idx].update(
                        helpers.enlarge_extract(extracts[extract_idx],
                                                page['areas'][area_idx]))
                elif page['areas'][area_idx]['type'] == 'line':
                    #print 'extend', extracts[extract_idx]['name'], 'into line'
                    extracts[extract_idx].update(
                        helpers.enlarge_extract(extracts[extract_idx],
                                                page['areas'][area_idx]))
                elif ((page['areas'][area_idx]['type'] == 'text block'
                       or page['areas'][area_idx]['type'] == 'other')
                      and page['areas'][area_idx]['word_height_avg'] <
                      (doc_stats['word_height_avg'] -
                       (doc_stats['word_height_avg_std'] / 4))):
                    # Text with below-average word height (e.g. footnotes)
                    # is allowed into the extract.
                    #print 'extend', extracts[extract_idx]['name'], 'into text'
                    extracts[extract_idx].update(
                        helpers.enlarge_extract(extracts[extract_idx],
                                                page['areas'][area_idx]))
                else:
                    #print 'stop ', extracts[extract_idx]['name']
                    stopped = True

    # Find all areas that each area intersects
    areas = {}
    for idx_a, area_a in enumerate(page['areas']):
        areas[idx_a] = []
        for idx_b, area_b in enumerate(page['areas']):
            if idx_a != idx_b and helpers.rectangles_intersect(
                    helpers.extractbbox(area_a['soup'].get('title')),
                    helpers.extractbbox(area_b['soup'].get('title'))):
                areas[idx_a].append(idx_b)

    # If area intersects others, recursively get all intersections
    # new_areas = []
    # for area_idx in areas:
    #     if len(areas[area_idx]):
    #         new_area = { 'x1': 9999999, 'y1': 9999999, 'x2': -9999999, 'y2': -9999999 }
    #         new_area_consists_of = []
    #         all_intersections = [ areas[i] for i in areas if i in areas[area_idx] ]
    #         # Flatten and filter
    #         all_intersections = set([ item for sublist in all_intersections for item in sublist ])
    #         for area in all_intersections:
    #             new_area_consists_of.append(area)
    #             new_area = helpers.enlarge_extract(new_area, helpers.extractbbox(page['areas'][area]['soup'].get('title')))
    #
    #         if new_area['x1'] != 9999999:
    #             new_area['consists_of'] = new_area_consists_of
    #             new_areas.append(new_area)
    #
    # # Filter unique new areas and remove areas that this new area covers
    # unique_new_areas = []
    # for area in new_areas:
    #     # Does this area overlap with any areas already accounted for?
    #     found = False
    #     for uidx, each in enumerate(unique_new_areas):
    #         # If it does, add it to that existing area
    #         if len(set(each['consists_of']).intersection(area['consists_of'])) > 0:
    #             found = True
    #             unique_new_areas[uidx]['consists_of'] = list(set(each['consists_of'] + area['consists_of']))
    #             new_area = helpers.enlarge_extract(each, area)
    #             for key in new_area:
    #                 unique_new_areas[uidx][key] = new_area[key]
    #
    #     if not found:
    #         unique_new_areas.append(area)
    #
    # print 'UNIQUE NEW AREAS', unique_new_areas

    # Find the captions/titles for charts, figures, maps, tables
    indicator_lines = []
    for line in page['lines']:
        # Remove nonsense
        # NOTE(review): the second replace is a no-op as written — likely
        # intended to collapse doubled spaces; confirm against the original.
        clean_line = line.getText().strip().replace('\n', ' ').replace(
            ' ', ' ').lower()
        # Find all lines that contain only a target word plus a number
        dedicated_line_matches = re.match(
            '(table|figure|fig|map)(\.)? \d+(\.)?',
            clean_line,
            flags=re.IGNORECASE | re.MULTILINE)
        # Find all the lines that start with one of the target words and a
        # number
        caption_matches = re.match('(table|figure|fig|map)(\.)? \d+(\.)',
                                   clean_line,
                                   flags=re.IGNORECASE | re.MULTILINE)
        # Problematic tesseract matches
        bad_tesseract_matches = re.match(
            '^(table|figure|fig|map)(\.)? \w{1,5}(\S)?(\w{1,5})?(\.)?',
            clean_line,
            flags=re.IGNORECASE | re.MULTILINE)
        bbox = helpers.extractbbox(line.get('title'))
        # dedicated line (ex: Table 1)
        if dedicated_line_matches and dedicated_line_matches.group(
                0) == clean_line:
            bbox['name'] = dedicated_line_matches.group(0)
            print(' ', bbox['name'].replace('.', ''))
            indicator_lines.append(bbox)
        # Other
        elif caption_matches:
            bbox['name'] = caption_matches.group(0)
            print(' ', bbox['name'].replace('.', ''))
            indicator_lines.append(bbox)
        elif bad_tesseract_matches:
            bbox['name'] = bad_tesseract_matches.group(0)
            print(' ', bbox['name'].replace('.', ''))
            indicator_lines.append(bbox)

    # Assign a caption to each table, and keep track of which captions are
    # assigned to tables. caption_idx: [area_idx, area_idx, ...]
    caption_areas = {}
    for area_idx, area in enumerate(page['areas']):
        if area['type'] == 'table':
            # Get the distances between the given area and all captions
            distances = [{
                'idx': line_idx,
                'distance': helpers.min_distance(area, line)
            } for line_idx, line in enumerate(indicator_lines)]
            # bail if there aren't any indicator_lines
            if len(distances) == 0:
                break
            distances_sorted = sorted(distances, key=lambda k: k['distance'])
            # NOTE(review): this loop computes potential_area but never uses
            # it — it appears to be the start of an unfinished
            # "would expanding to this caption intersect text?" check.
            for line in distances_sorted:
                # Check if it intersects any text areas
                potential_area = helpers.enlarge_extract(
                    area, indicator_lines[line['idx']])
            distances = [
                helpers.min_distance(area, line) for line in indicator_lines
            ]
            # The index of the nearest caption
            if len(distances) == 0:
                break
            nearest_caption = distances.index(min(distances))
            # TODO: Need to check if expanding to this caption would intersect
            # any text areas that don't intersect the caption
            # Assign the nearest caption to the area
            area['caption'] = nearest_caption
            # Bookkeep
            try:
                caption_areas[nearest_caption].append(area_idx)
            except:
                caption_areas[nearest_caption] = [area_idx]

    '''
    If a page has tables unassigned to captions, those go in a different pile
    When it comes time to create extract areas from them, they play by
    different rules:
        + The starting extract area is simply the area(s) determined to be tables
        + Extract areas can eat each other / be combined
    '''
    # Need to go find the tables and create appropriate areas
    # Basically, treat them as extracts that can overlap, and then merge
    # intersecting extracts
    # alternative_captions = []
    #
    # for line in page['lines']:
    #     # First make sure this line doesn't exist any tables
    #     line_bbox = helpers.extractbbox(line.get('title'))
    #     table_intersections = []
    #     for table in all_tables:
    #         if helpers.rectangles_intersect(page['areas'][table], line_bbox):
    #             table_intersections.append(True)
    #         else:
    #             table_intersections.append(False)
    #
    #     # If it does, skip it
    #     if True in table_intersections:
    #         continue
    #
    #     # Remove nonsense
    #     clean_line = line.getText().strip().replace('\n', ' ').replace(' ', ' ').lower()
    #     # mediocre caption matches
    #     ok_matches = re.match('^(.*?) \d+(\.)?', clean_line, flags=re.IGNORECASE)
    #
    #     '''
    #     Caption is good enough if the following are satisfied:
    #         + the average word height is less than the document's average word height - 1/4 average word height std
    #         + The line it is on does not intersect and table
    #     '''
    #     if ok_matches and line_word_height(line) < (doc_stats['word_height_avg'] - (doc_stats['word_height_avg_std']/4)):
    #         line_bbox['name'] = ok_matches.group(0)
    #         print 'Alt caption - ', line_bbox['name']
    #         alternative_captions.append(line_bbox)

    # Sanity check the caption-area assignments
    for caption, areas in caption_areas.items():
        # Only check if the caption is assigned to more than one area
        if len(areas) > 1:
            # draw a line through the middle of the caption that spans the page
            '''
             x1,y1
               0
                 --------------
                 |            |
         - - - - | - - - - -  | - - - -   <-- Create this line
                 |            |
                 --------------
                                0
                                 x2,y2
            '''
            # NOTE(review): as written this evaluates to y1 + (y2 - y1) == y2
            # (the caption's bottom), not its middle — a "/ 2" on the second
            # term looks intended; confirm before changing.
            caption_line_y = indicator_lines[caption]['y1'] + (
                indicator_lines[caption]['y2'] -
                indicator_lines[caption]['y1'])
            caption_line = {
                'x1': page['page']['x1'],
                'y1': caption_line_y,
                'x2': page['page']['x2'],
                'y2': caption_line_y
            }
            # Get a list of unique combinations of areas for this caption
            # (example: [(0,1), (1,3)] )
            area_combinations = list(
                itertools.combinations(caption_areas[caption], 2))
            # Draw a line between them
            '''
              -----------
              |         |
              |    a    |
              |     \   |
              -------\---
                      \   <------ area_connection_line
               -----\-
              |      \| - - - -
              | - - -|\ - - - - - - -
              |      | \   ------
                       \
                        \
              --------\--------------
              |        \            |
              |         \           |
              |    b     |          |
              |          |          |
              -----------------------
            '''
            for pair in area_combinations:
                a = helpers.centroid(page['areas'][pair[0]])
                b = helpers.centroid(page['areas'][pair[1]])
                area_line = {
                    'x1': a['x'],
                    'y1': a['y'],
                    'x2': b['x'],
                    'y2': b['y']
                }
                # Check if the line intersects the caption line. If it does,
                # determine which of the 'tables' is more table-y
                if helpers.lines_intersect(caption_line, area_line):
                    if page['areas'][pair[0]]['table_score'] > page['areas'][
                            pair[1]]['table_score']:
                        caption_areas[caption] = [
                            area for area in caption_areas[caption]
                            if area != pair[1]
                        ]
                    else:
                        page['areas'][pair[0]]['type'] = 'possible table'
                        caption_areas[caption] = [
                            area for area in caption_areas[caption]
                            if area != pair[0]
                        ]

    # Extracts are bounding boxes that will be used to actually extract the
    # tables
    extracts = []
    for caption, areas in caption_areas.items():
        print(indicator_lines[caption])
        area_of_interest_centroid_y_mean = np.mean(
            [helpers.centroid(page['areas'][area])['y'] for area in areas])
        indicator_line_centroid_y = helpers.centroid(
            indicator_lines[caption])['y']
        areas_of_interest = [page['areas'][area] for area in areas]
        # Find the area that the indicator line intersects
        for area in page['areas']:
            if helpers.rectangles_intersect(area, indicator_lines[caption]):
                areas_of_interest.append(area)
        #areas_of_interest.append(indicator_lines[caption])
        # The extract is designated by the min/max coordinates of the caption
        # and cooresponding table(s)
        extracts.append({
            'name':
            indicator_lines[caption]['name'],
            # 'direction' records whether the table body sits below or above
            # its caption, used later when expanding the extract.
            'direction':
            'below' if area_of_interest_centroid_y_mean >
            indicator_line_centroid_y else 'above',
            'indicator_line':
            indicator_lines[caption],
            'x1':
            min([a['x1'] for a in areas_of_interest]) - padding,
            'y1':
            min([a['y1'] for a in areas_of_interest]) - padding,
            'x2':
            max([a['x2'] for a in areas_of_interest]) + padding,
            'y2':
            max([a['y2'] for a in areas_of_interest]) + padding
        })

    # Make sure each table was assigned a caption
    assigned_tables = []
    unassigned_tables = []
    for caption_idx, areas in caption_areas.items():
        assigned_tables = assigned_tables + areas
    all_tables = []
    for area_idx, area in enumerate(page['areas']):
        if area['type'] == 'table':
            all_tables.append(area_idx)
    if sorted(assigned_tables) == sorted(all_tables):
        print('all tables have a caption on page', page['page_no'])
    else:
        unassigned_tables = set(all_tables).difference(assigned_tables)
        print('Not all tables have a caption on page', page['page_no'])
        print('Not assigned - ', unassigned_tables)

    # Caption-less tables with a high table score become "orphan" extracts.
    orphan_extracts = []
    for table in unassigned_tables:
        if page['areas'][table]['table_score'] > 5:
            orphan_extracts.append(
                helpers.expand_area(page['areas'][table], page['areas']))
    orphan_extracts = helpers.union_extracts(orphan_extracts)
    for extract in orphan_extracts:
        extract['name'] = 'Unknown'
        extract['direction'] = 'None'
        # extracts.append(extract)

    # Find all areas that overlap in x space and are above and below the
    # extracts
    extract_relations = {}
    for extract_idx, extract in enumerate(extracts):
        extract_relations[extract_idx] = find_above_and_below(extract)

    for extract_idx, extract in enumerate(extracts):
        expand_extraction(extract_idx, find_above_and_below(extract))

    # for extract_idx, props in extract_relations.iteritems():
    #     expand_extraction(extract_idx, props)

    # Only keep orphan extracts that are not already (>= 90%) covered by a
    # caption-based extract.
    for extract in orphan_extracts:
        # Find out if a good extraction already covers this area
        extract_poly = helpers.make_polygon(extract)
        covers = False
        for each in extracts:
            intersection = extract_poly.intersection(
                helpers.make_polygon(each))
            if intersection.area >= (extract_poly.area * 0.9):
                covers = True
        if not covers:
            extracts.append(extract)
            extract_relations[len(extracts) - 1] = find_above_and_below(
                extract)
            expand_extraction(
                len(extracts) - 1, extract_relations[len(extracts) - 1])

    return extracts
def area_summary(area):
    """Summarize one hOCR ``ocr_carea`` (content area) for classification.

    Parameters
    ----------
    area : bs4.Tag
        An hOCR ``div.ocr_carea`` element whose ``title`` attribute holds a
        bounding box parseable by ``helpers.extractbbox``.

    Returns
    -------
    dict
        Summary with keys: 'soup' (the tag itself), bbox keys x1/y1/x2/y2,
        'lines', 'line_heights', 'words', 'area', 'x_gaps', 'words_in_line',
        'word_distances', 'word_heights', 'word_areas', 'words_per_line',
        'gaps', 'word_separation_index', 'word_height_index',
        'word_height_avg', 'word_area_index'.
    """
    summary = {}
    summary['soup'] = area
    # Bounding box (x1, y1, x2, y2)
    summary.update(helpers.extractbbox(area.get('title')))
    # Number of lines
    summary['lines'] = len(area.find_all('span', 'ocr_line'))
    summary['line_heights'] = []
    for line in area.find_all('span', 'ocr_line'):
        bbox = helpers.extractbbox(line.get('title'))
        height = bbox['y2'] - bbox['y1']
        summary['line_heights'].append(height)
    # Number of words (whitespace-delimited, empty tokens dropped)
    summary['words'] = len([
        _f for _f in area.getText().strip().replace('\n', ' ').replace(
            ' ', ' ').split(' ') if _f
    ])
    # Area in pixels
    summary['area'] = (summary['x2'] - summary['x1']) * (summary['y2'] -
                                                         summary['y1'])
    # Get spacing of words: one flag per x-pixel column, set to 1 where any
    # word covers that column. BUGFIX: dtype was np.int, an alias removed in
    # NumPy 1.24 — use the builtin int instead (identical dtype).
    summary['x_gaps'] = np.zeros(summary['x2'] - summary['x1'], dtype=int)
    # Words per line
    summary['words_in_line'] = []  # NOTE(review): never populated below
    summary['word_distances'] = []
    summary['word_heights'] = []
    summary['word_areas'] = []
    summary['words_per_line'] = []
    # Iterate on each line in the area
    for line in area.find_all('span', 'ocr_line'):
        # For each line, get words
        words = line.find_all('span', 'ocrx_word')
        # Record the number of words in this line
        summary['words_per_line'].append(len(words))
        for word_idx, word in enumerate(words):
            wordbbox = helpers.extractbbox(word.get('title'))
            summary['word_heights'].append(wordbbox['y2'] - wordbbox['y1'])
            summary['word_areas'].append((wordbbox['x2'] - wordbbox['x1']) *
                                         (wordbbox['y2'] - wordbbox['y1']))
            # Mark the x columns this word covers (relative to the area's x1)
            for x in range(wordbbox['x1'] - summary['x1'],
                           wordbbox['x2'] - summary['x1']):
                try:
                    summary['x_gaps'][x] = 1
                except IndexError:
                    # x lies outside of gaps. Skip.
                    pass
            # If word isn't the last word in a line, get distance between
            # word and word + 1
            if word_idx != (len(words) - 1):
                wordP1bbox = helpers.extractbbox(words[word_idx +
                                                       1].get('title'))
                # Pythagorean theorum FTW
                summary['word_distances'].append(
                    math.sqrt(
                        math.pow((wordP1bbox['x1'] - wordbbox['x2']), 2) +
                        math.pow((wordP1bbox['y1'] - wordbbox['y1']), 2)))
    # Count whitespace gaps
    summary['gaps'] = helpers.get_gaps(summary['x_gaps'])
    # Get the mean of the differences of the word distances (all the same ==
    # 0, difference increases away from 0)
    summary['word_separation_index'] = 0 if summary[
        'words'] == 0 else helpers.meanOfDifferences(summary['word_distances'])
    # Quantify the variation in the height of words in this area
    summary['word_height_index'] = 0 if summary[
        'words'] == 0 else helpers.meanOfDifferences(summary['word_heights'])
    # Get the average word height of this area
    summary['word_height_avg'] = 0 if summary['words'] == 0 else np.nanmean(
        summary['word_heights'])
    # Get word/area ratio
    summary['word_area_index'] = 0 if summary['words'] == 0 else np.sum(
        summary['word_areas']) / float(summary['area'])
    return summary
def tess():
    """Debug driver: run table detection on a fixed test hOCR file.

    Reads ``test_files/two_tables_equations.html.hocr``, classifies its
    areas, prints document statistics, and grows caption-anchored extract
    boxes — mirroring a simplified version of the process_page pipeline.
    Output goes to stdout; nothing is returned.
    """
    # Open the file with Tesseract output
    with open('test_files/two_tables_equations.html.hocr') as hocr:
        text = hocr.read()
        soup = BeautifulSoup(text, 'html.parser')

    # Extract the page
    page = helpers.extractbbox(
        soup.find_all('div', 'ocr_page')[0].get('title'))

    # Get all "areas"
    areas = soup.find_all('div', 'ocr_carea')

    # Find the captions/titles for charts, figures, maps, tables
    indicator_lines = []
    for line in soup.find_all('span', 'ocr_line'):
        # Remove nonsense
        clean_line = line.getText().strip().replace('\n', ' ').replace(
            ' ', ' ').lower()
        # Find all lines that contain only a target word plus a number
        dedicated_line_matches = re.match(
            '(table|figure|fig|map)(\.)? \d+(\.)?',
            clean_line,
            flags=re.IGNORECASE)
        # Find all the lines that start with one of the target words and a
        # number
        caption_matches = re.match('(table|figure|fig|map)(\.)? \d+(\.)',
                                   clean_line,
                                   flags=re.IGNORECASE)
        # dedicated line (ex: Table 1)
        if dedicated_line_matches and dedicated_line_matches.group(
                0) == clean_line:
            print(dedicated_line_matches.group(0))
            indicator_lines.append(helpers.extractbbox(line.get('title')))
        # Other
        elif caption_matches:
            print(caption_matches.group(0))
            bbox = helpers.extractbbox(line.get('title'))
            bbox['name'] = caption_matches.group(0)
            # BUGFIX: previously appended a fresh extractbbox() result,
            # silently discarding the 'name' just assigned to bbox.
            indicator_lines.append(bbox)

    # BUGFIX: these called areaSummary / summarizeDocument, which do not
    # exist (NameError) — the file defines area_summary, and extract_tables
    # calls summarize_document.
    area_stats = [area_summary(area) for area in areas]
    doc_stats = summarize_document(area_stats)

    print('Document Summary:')
    print('  Word separation avg (mean): %s' %
          ('%.2f' % doc_stats['word_separation_mean'], ))
    print('  Word separation avg (median): %s' %
          ('%.2f' % doc_stats['word_separation_median'], ))
    print('  Word separation avg (std): %s' %
          ('%.2f' % doc_stats['word_separation_std'], ))
    print('  Word separation index (mean): %s' %
          ('%.2f' % doc_stats['word_separation_index_mean'], ))
    # BUGFIX: label typo "separtion" -> "separation" (two lines below).
    print('  Word separation index (median): %s' %
          ('%.2f' % doc_stats['word_separation_index_median'], ))
    print('  Word separation index (std): %s' %
          ('%.2f' % doc_stats['word_separation_index_std'], ))
    print('  Word height index (mean): %s' %
          ('%.2f' % doc_stats['word_height_index_mean'], ))
    print('  Word height index (median): %s' %
          ('%.2f' % doc_stats['word_height_index_median'], ))
    print('  Word height index (std): %s' %
          ('%.2f' % doc_stats['word_height_index_std'], ))
    print('  Word area index (mean): %s%%' %
          (int(doc_stats['word_area_index_mean'] * 100), ))
    print('  Word area index (median): %s%%' %
          (int(doc_stats['word_area_index_median'] * 100), ))
    print('  Word area index (std): %s%%' %
          (int(doc_stats['word_area_index_std'] * 100), ))
    print('  Word height avg (mean): %s' %
          ('%.2f' % doc_stats['word_height_avg'], ))
    print('  Word height avg (median): %s' %
          ('%.2f' % doc_stats['word_height_avg_median'], ))
    print('  Word height avg (std): %s' %
          ('%.2f' % doc_stats['word_height_avg_std'], ))

    '''
    table definition:
        word separation index > document median + 1 std
        word area index < document median - 1 std
        never one line
    '''
    '''
    text block:
        word separation index < document median + 1 std
        word area index === document median +/- (1 std / 2)
        never one line
    '''
    for area in area_stats:
        # The table_score keeps track of how "table-y" an area is, i.e. how
        # many characteristics it has consistent with tables
        area['table_score'] = 0
        # Remove gaps smaller than the median gap between words
        area['gaps'] = [
            gap for gap in area['gaps']
            if gap > doc_stats['word_separation_median']
        ]
        # Add to the table score for each gap (each gap adds one point)
        for gap in area['gaps']:
            area['table_score'] += 1
        # Separator lines are only one line, have no words or other
        # attributes
        if area['lines'] == 1 and area['words'] == 0 and area[
                'word_separation_index'] == 0 and area[
                    'word_height_index'] == 0 and area['word_height_avg'] == 0:
            area['type'] = 'line'
        elif (area['word_separation_index'] >=
              (doc_stats['word_separation_index_median'] +
               doc_stats['word_separation_index_std'])) and (
                   area['word_area_index'] <=
                   (doc_stats['word_area_index_median'] -
                    doc_stats['word_area_index_std'])) and area['lines'] > 1:
            area['type'] = 'table'
        elif (area['word_separation_index'] <
              (doc_stats['word_separation_index_median'] +
               doc_stats['word_separation_index_std'])) and (
                   area['word_area_index'] >
                   (doc_stats['word_area_index_median'] -
                    (doc_stats['word_area_index_std'] / float(2)))
                   and area['word_area_index'] <
                   (doc_stats['word_area_index_median'] +
                    (doc_stats['word_area_index_std'] / float(2)))
               ) and area['lines'] > 1:
            area['type'] = 'text block'
        # Else, unclassified
        else:
            area['type'] = 'other'

        # Tally other attributes that are indicative of tables
        if area['word_separation_index'] >= (
                doc_stats['word_separation_index_median'] +
                doc_stats['word_separation_index_std']):
            area['table_score'] += 1
        if area['word_area_index'] <= (doc_stats['word_area_index_median'] -
                                       doc_stats['word_area_index_std']):
            area['table_score'] += 1
        if area['lines'] > 1:
            area['table_score'] += 1

    # Find lines - can be line breaks between paragraphs or divider lines in
    # tables
    lines = [area for area in area_stats if area['type'] == 'line']

    # If a line intersects an area, classify that area as a table
    for area in area_stats:
        if area['type'] != 'line':
            for line in lines:
                if helpers.rectangles_intersect(area, line):
                    area['type'] = 'table'
                    area['table_score'] += 1

    # Assign a caption to each table, and keep track of which captions are
    # assigned to tables. caption_idx: [area_idx, area_idx, ...]
    caption_areas = {}
    for area_idx, area in enumerate(area_stats):
        if area['type'] == 'table':
            distances = [
                helpers.distance(area, line) for line in indicator_lines
            ]
            nearest_caption = distances.index(min(distances))
            area['caption'] = nearest_caption
            # First table for this caption creates the list (BUGFIX: bare
            # except narrowed to KeyError).
            try:
                caption_areas[nearest_caption].append(area_idx)
            except KeyError:
                caption_areas[nearest_caption] = [area_idx]

    # Sanity check the caption-area assignments
    for caption, areas in caption_areas.items():
        # Only check if the caption is assigned to more than one area
        if len(areas) > 1:
            # draw a line through the middle of the caption that spans the page
            '''
             x1,y1
               0
                 --------------
                 |            |
         - - - - | - - - - -  | - - - -   <-- Create this line
                 |            |
                 --------------
                                0
                                 x2,y2
            '''
            # NOTE(review): as in process_page, this evaluates to y2, not
            # the caption's vertical middle; kept as-is for parity.
            caption_line_y = indicator_lines[caption]['y1'] + (
                indicator_lines[caption]['y2'] -
                indicator_lines[caption]['y1'])
            caption_line = {
                'x1': page['x1'],
                'y1': caption_line_y,
                'x2': page['x2'],
                'y2': caption_line_y
            }
            # Get a list of unique combinations of areas for this caption
            # (example: [(0,1), (1,3)] )
            area_combinations = list(
                itertools.combinations(caption_areas[caption], 2))
            # Draw a line between the centroids of each pair of areas
            for pair in area_combinations:
                a = helpers.centroid(area_stats[pair[0]])
                b = helpers.centroid(area_stats[pair[1]])
                area_line = {
                    'x1': a['x'],
                    'y1': a['y'],
                    'x2': b['x'],
                    'y2': b['y']
                }
                # Check if the line intersects the caption line. If it does,
                # determine which of the 'tables' is more table-y.
                # BUGFIX: was the bare name lines_intersect (NameError) —
                # process_page uses helpers.lines_intersect.
                if helpers.lines_intersect(caption_line, area_line):
                    if area_stats[pair[0]]['table_score'] > area_stats[
                            pair[1]]['table_score']:
                        area_stats[pair[1]]['type'] = 'not a table'
                        caption_areas[caption] = [
                            area for area in areas if area != pair[1]
                        ]
                    else:
                        area_stats[pair[0]]['type'] = 'not a table'
                        caption_areas[caption] = [
                            area for area in areas if area != pair[0]
                        ]

    extracts = []
    for caption, areas in caption_areas.items():
        areas_of_interest = [area_stats[area] for area in areas]
        areas_of_interest.append(indicator_lines[caption])
        extracts.append({
            'x1': min([a['x1'] for a in areas_of_interest]) - padding,
            'y1': min([a['y1'] for a in areas_of_interest]) - padding,
            'x2': max([a['x2'] for a in areas_of_interest]) + padding,
            'y2': max([a['y2'] for a in areas_of_interest]) + padding
        })

    # Find all areas that overlap in x space and are above and below the
    # extracts
    extract_relations = {}
    for extract_idx, extract in enumerate(extracts):
        extract_relations[extract_idx] = {'above': [], 'below': []}
        for area_idx, area in enumerate(area_stats):
            # Check if they overlap in x space
            if area['x1'] <= extract['x2'] and extract['x1'] <= area['x2']:
                # Check how * much * they overlap in x space.
                # NOTE(review): this is a cruder overlap formula than
                # find_above_and_below in process_page; left unchanged in
                # this debug driver.
                percent_overlap = (abs(area['x2'] - extract['x1'])
                                   ) / float(extract['x2'] - extract['x1'])
                if percent_overlap >= 0.9:
                    # Check if this area is above or below the extract area
                    area_centroid = helpers.centroid(area)
                    extract_centroid = helpers.centroid(extract)
                    if area_centroid['y'] <= extract_centroid['y']:
                        # Work backwards so that when we iterate we start at
                        # the area closest to the extract
                        extract_relations[extract_idx]['above'].insert(
                            0, area_idx)
                    else:
                        extract_relations[extract_idx]['below'].append(
                            area_idx)

    # Grow each extract through adjacent non-text areas, stopping at the
    # first text block / 'not a table' / 'other' area in each direction.
    for extract_idx, props in extract_relations.items():
        for area_idx in extract_relations[extract_idx]['above']:
            if area_stats[area_idx]['type'] != 'text block' and area_stats[
                    area_idx]['type'] != 'not a table' and area_stats[
                        area_idx]['type'] != 'other':
                # [Grow] the extract area
                extracts[extract_idx].update(
                    helpers.enlarge_extract(extracts[extract_idx],
                                            area_stats[area_idx]))
            else:
                break
        for area_idx in extract_relations[extract_idx]['below']:
            if area_stats[area_idx]['type'] != 'text block' and area_stats[
                    area_idx]['type'] != 'not a table' and area_stats[
                        area_idx]['type'] != 'other':
                # [Grow] the extract area
                print(extract_idx, area_stats[area_idx]['type'])
                extracts[extract_idx].update(
                    helpers.enlarge_extract(extracts[extract_idx],
                                            area_stats[area_idx]))
            else:
                break