def area_summary(area):
    """Collect layout statistics for a single OCR area element.

    Args:
        area: A parsed markup element for the area. The code calls
            ``.get('title')``, ``.getText()`` and
            ``.find_all('span', <class>)`` on it (presumably a BeautifulSoup
            tag from hOCR output -- confirm against the caller).

    Returns:
        dict: Summary with the keys ``soup``, the bounding-box keys produced
        by ``helpers.extractbbox`` (``x1``, ``y1``, ``x2``, ``y2`` are read
        back below), ``lines``, ``line_heights``, ``words``, ``area``,
        ``x_gaps``, ``words_in_line``, ``word_distances``, ``word_heights``,
        ``word_areas``, ``words_per_line``, ``gaps``,
        ``word_separation_index``, ``word_height_index``,
        ``word_height_avg`` and ``word_area_index``.
    """
    summary = {'soup': area}

    # Bounding box (x1, y1, x2, y2)
    summary.update(helpers.extractbbox(area.get('title')))

    # The tree is not modified here, so query the line spans once and reuse.
    lines = area.find_all('span', 'ocr_line')

    # Number of lines and the height of each one
    summary['lines'] = len(lines)
    summary['line_heights'] = []
    for line in lines:
        line_bbox = helpers.extractbbox(line.get('title'))
        summary['line_heights'].append(line_bbox['y2'] - line_bbox['y1'])

    # Number of words: non-empty tokens after splitting on spaces/newlines.
    # Counted with a generator because filter() is lazy on Python 3 and has
    # no len().
    text = area.getText().strip().replace('\n', ' ')
    summary['words'] = sum(1 for token in text.split(' ') if token)

    # Area
    width = summary['x2'] - summary['x1']
    summary['area'] = width * (summary['y2'] - summary['y1'])

    # Horizontal occupancy mask: 1 where any word covers that x offset.
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    summary['x_gaps'] = np.zeros(width, dtype=int)

    # 'words_in_line' is never filled in this function; the key is kept so
    # the shape of the returned dict is unchanged for callers.
    summary['words_in_line'] = []
    summary['word_distances'] = []
    summary['word_heights'] = []
    summary['word_areas'] = []
    summary['words_per_line'] = []

    # Iterate on each line in the area
    for line in lines:
        words = line.find_all('span', 'ocrx_word')

        # Record the number of words in this line
        summary['words_per_line'].append(len(words))

        # Parse every word box once so the "next word" lookup does not
        # re-parse the same title attribute.
        boxes = [helpers.extractbbox(word.get('title')) for word in words]
        last_idx = len(boxes) - 1

        for word_idx, wordbbox in enumerate(boxes):
            word_height = wordbbox['y2'] - wordbbox['y1']
            summary['word_heights'].append(word_height)
            summary['word_areas'].append(
                (wordbbox['x2'] - wordbbox['x1']) * word_height)

            # Mark the columns covered by this word. Offsets are clamped to
            # the area width so a word box that sticks out of the area does
            # not raise IndexError or wrap around via negative indices.
            start = min(max(wordbbox['x1'] - summary['x1'], 0), width)
            stop = min(max(wordbbox['x2'] - summary['x1'], 0), width)
            summary['x_gaps'][start:stop] = 1

            # If word isn't the last word in a line, get the distance between
            # the right edge of this word and the left edge of the next one.
            if word_idx != last_idx:
                next_bbox = boxes[word_idx + 1]
                dx = next_bbox['x1'] - wordbbox['x2']
                dy = next_bbox['y1'] - wordbbox['y1']
                summary['word_distances'].append(math.sqrt(dx * dx + dy * dy))

    # Count whitespace gaps (semantics defined by helpers.get_gaps)
    summary['gaps'] = helpers.get_gaps(summary['x_gaps'])

    # All word-level indices fall back to 0 when the area has no text.
    has_words = summary['words'] != 0

    # Mean of the differences of the word distances (per the original
    # comment: all the same == 0, difference increases away from 0)
    summary['word_separation_index'] = (
        helpers.meanOfDifferences(summary['word_distances']) if has_words else 0)

    # Quantify the variation in the height of words in this area
    summary['word_height_index'] = (
        helpers.meanOfDifferences(summary['word_heights']) if has_words else 0)

    # Average word height of this area
    summary['word_height_avg'] = (
        np.nanmean(summary['word_heights']) if has_words else 0)

    # Ratio of summed word areas to the area of the bounding box
    summary['word_area_index'] = (
        np.sum(summary['word_areas']) / float(summary['area']) if has_words else 0)

    return summary
def area_summary_offset(area):
    """Collect layout statistics for an area that carries a page offset.

    Args:
        area: A mapping that is copied into the summary. The code reads the
            keys ``soup`` (parsed markup element), ``x1``, ``y1``, ``x2``,
            ``y2`` and ``data-coordinates`` (a space-separated string whose
            first field is parsed as the integer x origin).

    Returns:
        dict: A copy of ``area`` extended with ``lines``, ``line_heights``,
        ``words``, ``area``, ``x_gaps``, ``words_in_line``,
        ``word_distances``, ``word_heights``, ``word_areas``,
        ``words_per_line``, ``first_word_x``, ``gaps``,
        ``word_separation_index``, ``word_height_index``,
        ``word_height_avg`` and ``word_area_index``.
    """
    summary = {}
    summary.update(area)

    # Only the x origin is used below (to offset the first word of a line).
    coords = area.get('data-coordinates')
    origin_x1 = int(coords.split(' ')[0])

    soup = summary['soup']

    # The tree is not modified here, so query the line spans once and reuse.
    lines = soup.find_all('span', 'ocr_line')

    # Number of lines and the height of each one
    summary['lines'] = len(lines)
    summary['line_heights'] = []
    for line in lines:
        line_bbox = helpers.extractbbox(line.get('title'))
        summary['line_heights'].append(line_bbox['y2'] - line_bbox['y1'])

    # Number of words: non-empty tokens after splitting on spaces/newlines.
    # Best effort: any failure while extracting the text counts as 0 words,
    # but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        text = soup.getText().strip().replace('\n', ' ')
        summary['words'] = sum(1 for token in text.split(' ') if token)
    except Exception:
        summary['words'] = 0

    # Area
    width = summary['x2'] - summary['x1']
    summary['area'] = width * (summary['y2'] - summary['y1'])

    # Horizontal occupancy mask: 1 where any word covers that x offset.
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    summary['x_gaps'] = np.zeros(width, dtype=int)

    # 'words_in_line' is never filled in this function; the key is kept so
    # the shape of the returned dict is unchanged for callers.
    summary['words_in_line'] = []
    summary['word_distances'] = []
    summary['word_heights'] = []
    summary['word_areas'] = []
    summary['words_per_line'] = []

    # x position (shifted by the x origin) of the first word in each line
    summary['first_word_x'] = []

    # Iterate on each line in the area
    for line in lines:
        words = line.find_all('span', 'ocrx_word')

        # Record the number of words in this line
        summary['words_per_line'].append(len(words))

        # Parse every word box once so the "next word" lookup does not
        # re-parse the same title attribute.
        boxes = [helpers.extractbbox(word.get('title')) for word in words]
        last_idx = len(boxes) - 1

        for word_idx, wordbbox in enumerate(boxes):
            word_area = ((wordbbox['x2'] - wordbbox['x1'])
                         * (wordbbox['y2'] - wordbbox['y1']))

            # Skip words whose box is larger than, or not contained in, the
            # enclosing area.
            outside = (
                word_area > summary['area']
                or wordbbox['x2'] > summary['x2']
                or wordbbox['x1'] < summary['x1']
                or wordbbox['y1'] < summary['y1']
                or wordbbox['y2'] > summary['y2']
            )
            if outside:
                print("Word outside of the enclosing area! Tesseract's black box strikes again!")
                continue

            # Record the x coordinate of the first word of each line. The
            # parsed integer origin is added (the raw coordinate string
            # cannot be added to an int) and the value is appended so one
            # entry per line is kept.
            if word_idx == 0:
                summary['first_word_x'].append(wordbbox['x1'] + origin_x1)

            summary['word_heights'].append(wordbbox['y2'] - wordbbox['y1'])
            summary['word_areas'].append(word_area)

            # Mark the columns covered by this word. The containment check
            # above guarantees both offsets lie within [0, width].
            start = wordbbox['x1'] - summary['x1']
            stop = wordbbox['x2'] - summary['x1']
            summary['x_gaps'][start:stop] = 1

            # If word isn't the last word in a line, get the distance between
            # the right edge of this word and the left edge of the next one.
            if word_idx != last_idx:
                next_bbox = boxes[word_idx + 1]
                dx = next_bbox['x1'] - wordbbox['x2']
                dy = next_bbox['y1'] - wordbbox['y1']
                summary['word_distances'].append(math.sqrt(dx * dx + dy * dy))

    # Count whitespace gaps (semantics defined by helpers.get_gaps)
    summary['gaps'] = helpers.get_gaps(summary['x_gaps'])

    # All word-level indices fall back to 0 when the area has no text.
    has_words = summary['words'] != 0

    # Mean of the differences of the word distances (per the original
    # comment: all the same == 0, difference increases away from 0)
    summary['word_separation_index'] = (
        helpers.meanOfDifferences(summary['word_distances']) if has_words else 0)

    # Quantify the variation in the height of words in this area
    summary['word_height_index'] = (
        helpers.meanOfDifferences(summary['word_heights']) if has_words else 0)

    # Average word height of this area
    summary['word_height_avg'] = (
        np.nanmean(summary['word_heights']) if has_words else 0)

    # Ratio of summed word areas to the area of the bounding box
    summary['word_area_index'] = (
        np.sum(summary['word_areas']) / float(summary['area']) if has_words else 0)

    return summary