def _resized_image(self, file):
    '''Return the path of a version of "file" that fits the size limits.

    The limits come from self._max_dimensions, a (width, height) pair.  If
    "file" is itself a "-reduced" file, it is both the candidate for reuse
    and the destination of any new reduction; otherwise the candidate is
    the sibling file with "-reduced" inserted before the extension.  An
    existing, readable candidate whose width and height are both below
    the limits is returned as-is.  In all other cases the image is
    re-dimensioned with reduced_image_dimensions(), and None is returned if
    that reports an error.
    '''
    (max_width, max_height) = self._max_dimensions
    extension = filename_extension(file)
    say = self._say

    already_reduced = file.find('-reduced') > 0
    target = file if already_reduced else (
        filename_basename(file) + '-reduced' + extension)

    if path.exists(target) and readable(target):
        (width, height) = image_dimensions(target)
        if width < max_width and height < max_height:
            say.info('Using reduced image found in {}'.format(
                relative(target)))
            return target
        # A "-reduced" file exists (perhaps left by a previous run), but
        # it is still too big for the current limits, so redo the work.
        if __debug__: log('existing resized file larger than {}x{}: {}',
                          max_width, max_height, target)

    say.info('Dimensions too large; reducing dimensions: {}'.format(
        relative(file)))
    (resized, error) = reduced_image_dimensions(file, target,
                                                max_width, max_height)
    if error:
        say.error('Failed to re-dimension {}: {}'.format(
            relative(file), error))
        return None
    return resized
def annotated_image(file, text_boxes, service):
    '''Return a PNG rendering of "file" with recognized text drawn on top.

    "text_boxes" is a sequence of items having "boundingBox" and "text"
    attributes; "service" supplies the title via service.name().  The
    result is an io.BytesIO object positioned at its start and holding the
    PNG data.

    All drawing goes through the figure's own axes rather than pyplot's
    implicit "current axes", so that a figure being built by another caller
    at the same time cannot receive this image's labels.  The figure is
    always closed, even when reading or saving the image fails.
    '''
    service_name = service.name()

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
    try:
        axes.get_xaxis().set_visible(False)
        axes.get_yaxis().set_visible(False)
        axes.set_title(service_name, color='r', fontweight='bold',
                       fontsize=22)

        if __debug__: log('reading image file for {}: {}',
                          service_name, relative(file))
        img = mpimg.imread(file)
        axes.imshow(img, cmap="gray")

        props = dict(facecolor='white', alpha=0.7)
        if text_boxes:
            if __debug__: log('adding {} annotations for {}',
                              len(text_boxes), service_name)
            for item in text_boxes:
                # boundingBox is a flat sequence of coordinates
                # (x0, y0, x1, y1, ...); only the first vertex is needed
                # to anchor the label, nudged up and to the left.
                box = item.boundingBox
                x = max(0, box[0] - 4)
                y = max(0, box[1] - 8)
                axes.text(x, y, item.text, color='r', fontsize=11,
                          va="top", bbox=props)

        if __debug__: log('generating png for {} for {}',
                          service_name, relative(file))
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=300, bbox_inches='tight',
                    pad_inches=0)
        # Rewind so that callers can read the image from the beginning.
        buf.seek(0)
        return buf
    finally:
        # Release the figure whether or not rendering succeeded.
        plt.close(fig)
def _smaller_file(self, file):
    '''Return the path of a version of "file" smaller than self._max_size.

    Returns None when "file" is empty/None.  If "file" is itself a
    "-reduced" file, it is both the candidate for reuse and the destination
    of any new reduction; otherwise the candidate is the sibling file with
    "-reduced" inserted before the extension.  An existing candidate whose
    size is below the limit is reused.  Otherwise reduced_image_size() is
    called, and None is returned if it reports an error.
    '''
    if not file:
        return None

    say = self._say
    extension = filename_extension(file)
    already_reduced = file.find('-reduced') > 0
    target = file if already_reduced else (
        filename_basename(file) + '-reduced' + extension)

    if path.exists(target):
        if image_size(target) < self._max_size:
            say.info('Reusing resized image found in {}'.format(
                relative(target)))
            return target
        # A "-reduced" file exists (perhaps left by a previous run), but
        # it exceeds what the current set of services allows.
        if __debug__: log('existing resized file larger than {}b: {}',
                          humanize.intcomma(self._max_size), target)

    say.info('Size too large; reducing size: {}'.format(relative(file)))
    (resized, error) = reduced_image_size(file, target, self._max_size)
    if error:
        say.error('Failed to resize {}: {}'.format(relative(file), error))
        return None
    return resized
def file_after_converting(file, to_format, tool, spinner):
    '''Return the path of "file" converted to "to_format", or None on failure.

    The converted file is expected next to the original, with the extension
    replaced by "to_format".  If it already exists, it is reused; otherwise
    converted_image() is called.  Progress and failures are reported via
    "spinner".  The "tool" argument is accepted but not used here.
    '''
    target = filename_basename(file) + '.' + to_format
    if path.exists(target):
        spinner.update('Using converted image found in {}'.format(
            relative(target)))
        return target

    spinner.update('Converting to {} format: {}'.format(
        to_format, relative(file)))
    (converted, error) = converted_image(file, to_format)
    if converted:
        return converted
    spinner.fail('Failed to convert {}: {}'.format(relative(file), error))
    return None
def file_after_resizing(file, tool, spinner):
    '''Return the path of a reduced version of "file", or None on failure.

    If a file named like "file" but with "-reduced" before the extension
    already exists, it is reused.  Otherwise reduced_image() is called with
    the limits given by tool.max_dimensions().  Progress and failures are
    reported via "spinner".
    '''
    # Drop any leading dot so the name below never ends up with "..",
    # whether or not filename_extension() includes the dot.
    # NOTE(review): other code in this file treats the returned extension
    # as dot-prefixed -- confirm which convention the helper follows.
    file_ext = filename_extension(file).lstrip('.')
    new_file = filename_basename(file) + '-reduced.' + file_ext
    if path.exists(new_file):
        spinner.update('Using reduced image found in {}'.format(
            relative(new_file)))
        return new_file

    spinner.update('Original image too large; reducing size')
    (resized, error) = reduced_image(file, tool.max_dimensions())
    if not resized:
        # The message has two placeholders: the relative path of the file
        # and the error.  (The error must not be passed to relative().)
        spinner.fail('Failed to resize {}: {}'.format(
            relative(file), error))
        return None
    return resized
def _save_output(self, result, file):
    '''Write "result" to the path "file".

    "result" may be:
      * None: a warning is issued and nothing is written;
      * a tuple (data, error): if "error" is set, it is reported and nothing
        is written; otherwise "data" is written as described next;
      * a str: written as UTF-8 text;
      * an io.BytesIO: copied to the file in binary mode, starting from the
        buffer's current position.

    Raises InternalError for any other type of data.
    '''
    say = self._say

    # First perform some sanity checks.
    if result is None:
        say.warn('No data for {}'.format(file))
        return
    if isinstance(result, tuple):
        # Assumes 2 elements: data, and error
        (data, error) = result
        if error:
            say.error('Error: {}'.format(error))
            say.warn('Unable to write {}'.format(file))
            return
        result = data

    if __debug__: log('writing output to file {}', relative(file))
    if isinstance(result, str):
        # Recognized text can contain any Unicode character, so don't rely
        # on the platform's default encoding, which may be unable to
        # represent it.
        with open(file, 'w', encoding='utf-8') as f:
            f.write(result)
    elif isinstance(result, io.BytesIO):
        with open(file, 'wb') as f:
            shutil.copyfileobj(result, f)
    else:
        # There's no other type in the code, so if we get here ...
        raise InternalError(
            'Unexpected data in save_output() -- please report this.')
def _converted_file(self, file, to_format, dest_dir):
    '''Return the path of "file" converted to "to_format" inside "dest_dir".

    The destination keeps the base name of "file" and takes "to_format" as
    its extension.  If that file already exists, it is reused; otherwise
    converted_image() is asked to create it.  Returns None if the
    conversion reports an error.
    '''
    say = self._say
    stem = path.basename(filename_basename(file))
    target = path.join(dest_dir, stem + '.' + to_format)

    if path.exists(target):
        say.info('Using already converted image in {}'.format(
            relative(target)))
        return target

    say.info('Converting to {} format: {}'.format(
        to_format, relative(file)))
    (converted, error) = converted_image(file, to_format, target)
    if not error:
        return converted
    say.error('Failed to convert {}: {}'.format(relative(file), error))
    return None
def _get(self, item, base_name, index):
    '''Resolve "item" to a local file and report its original format.

    If "item" is a URL, it is first checked to make sure that it points to
    an image, then downloaded to a file named "<base_name>-<index>.<fmt>",
    and the URL itself is recorded in a companion ".url" file.  Otherwise
    "item" is taken to be a path relative to the current directory.

    Returns a tuple (file, orig_fmt), where "orig_fmt" is the content
    subtype reported by the server (for URLs) or the file name extension
    without its first character (for files).  Returns (None, None) if the
    URL cannot be opened, is not an image, or cannot be downloaded.
    '''
    # Shortcuts to make the code more readable.
    output_dir = self._output_dir
    say = self._say

    # For URLs, we download the corresponding files and name them with
    # the base_name.
    if is_url(item):
        # First make sure the URL actually points to an image.
        if __debug__: log('testing if URL contains an image: {}', item)
        try:
            # Only the headers are needed here, so close the response as
            # soon as they have been read instead of leaking the connection.
            with urllib.request.urlopen(item) as response:
                headers = response.headers
                main_type = headers.get_content_maintype()
                orig_fmt = headers.get_content_subtype()
        except Exception as ex:
            say.warn('Skipping URL due to error: {}'.format(ex))
            return (None, None)
        if main_type != 'image':
            say.warn('Did not find an image at {}'.format(item))
            return (None, None)
        base = '{}-{}'.format(base_name, index)
        # If we weren't given an output dir, then for URLs, we have no
        # choice but to use the current dir to download the file.
        # Important: don't change self._output_dir because if other
        # inputs *are* files, then those files will need other output dirs.
        if not output_dir:
            output_dir = os.getcwd()
        file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
        if not download_file(item, file, say):
            say.warn('Unable to download {}'.format(item))
            return (None, None)
        url_file = path.realpath(path.join(output_dir, base + '.url'))
        with open(url_file, 'w') as f:
            f.write(url_file_content(item))
        say.info('Wrote URL to {}'.format(relative(url_file)))
    else:
        file = path.realpath(path.join(os.getcwd(), item))
        # Drop the first character of the extension to get the format name.
        orig_fmt = filename_extension(file)[1:]

    if __debug__: log('{} has original format {}', relative(file), orig_fmt)
    return (file, orig_fmt)
def _send(self, file, service, dest_dir):
    '''Send the "file" to the service named "service" and write output in
    directory "dest_dir".

    Returns the path of the annotated image written for the service, or
    None if the service reported an error or no result could be obtained
    because of repeated rate-limit errors.  An AuthFailure raised by
    service.result() is re-raised with the service identified in the
    message.
    '''
    say = self._say
    use_color = say.use_color()
    color = service.name_color()
    service_name = (styled(service.name(), color) if use_color
                    else service.name())

    say.info(
        'Sending to {} and waiting for response ...'.format(service_name))

    # A rate-limit error leaves us without any result, so pause and resend
    # instead of falling through.  The number of attempts is bounded so
    # that a service that keeps refusing cannot stall us forever.
    max_attempts = 3
    result = None
    for _ in range(max_attempts):
        last_time = timer()
        try:
            result = service.result(file)
            break
        except AuthFailure as ex:
            raise AuthFailure(
                'Unable to use {}: {}'.format(service, ex)) from ex
        except RateLimitExceeded:
            time_passed = timer() - last_time
            min_interval = 1 / service.max_rate()
            if time_passed < min_interval:
                say.warn('Pausing {} due to rate limits'.format(
                    service_name))
                time.sleep(min_interval - time_passed)

    if result is None:
        say.warn('No result from {} for {}'.format(service_name,
                                                   relative(file)))
        return None
    if result.error:
        say.error('{} failed: {}'.format(service_name, result.error))
        say.warn('No result from {} for {}'.format(service_name,
                                                   relative(file)))
        return None

    say.info('Got result from {}.'.format(service_name))
    file_name = path.basename(file)
    base_path = path.join(dest_dir, file_name)
    annot_path = alt_extension(base_path, str(service) + '.png')

    say.info('Creating annotated image for {}.'.format(service_name))
    self._save_output(annotated_image(file, result.boxes, service),
                      annot_path)
    if self._extended_results:
        txt_file = alt_extension(base_path, str(service) + '.txt')
        json_file = alt_extension(base_path, str(service) + '.json')
        say.info('Saving all data for {}.'.format(service_name))
        self._save_output(json.dumps(result.data), json_file)
        say.info('Saving extracted text for {}.'.format(service_name))
        self._save_output(result.text, txt_file)

    # Return the annotated image file b/c we use it for the summary grid.
    return annot_path
def run(classes, item, index, base_name, output_dir, creds_dir, annotate, say):
    '''Run every method in "classes" on the image indicated by "item".

    If "item" is a URL, it must point to an image; it is downloaded into
    "output_dir" as "<base_name>-<index>.<fmt>" and the URL is recorded in
    a companion ".url" file.  Otherwise "item" is a path relative to the
    current directory.  For each method class, an instance is created and
    given credentials from "creds_dir"; the image is resized and/or
    converted to JPEG if the method requires it; and the extracted text
    (".txt"), full data (".json") and, if "annotate" is true, an annotated
    image (".jpg") are written to "output_dir" or, if that is not set, to
    the directory of the file.

    Progress is reported with a ProgressIndicator.  Interruptions and
    unexpected exceptions are reported and re-raised; an
    AuthenticationFailure is reported and ends the run quietly.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    # Bound up front because the authentication failure handler refers to it.
    method = None
    try:
        spinner.start('Starting on {}'.format(relative(item)))
        if is_url(item):
            # Make sure the URLs point to images.
            if __debug__: log('Testing if URL contains an image: {}', item)
            try:
                # Only the headers are needed, so close the response right
                # after reading them instead of leaking the connection.
                with request.urlopen(item) as response:
                    main_type = response.headers.get_content_maintype()
                    fmt = response.headers.get_content_subtype()
            except Exception as err:
                if __debug__: log('Network access resulted in error: {}',
                                  str(err))
                spinner.fail('Skipping URL due to error: {}'.format(err))
                return
            if main_type != 'image':
                spinner.fail('Did not find an image at {}'.format(item))
                return
            base = '{}-{}'.format(base_name, index)
            file = path.realpath(path.join(output_dir, base + '.' + fmt))
            error = download(item, file)
            if error:
                spinner.fail('Failed to download {}: {}'.format(item, error))
                return
            spinner.update('Wrote contents to {}'.format(relative(file)))
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
            spinner.update('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            fmt = filename_extension(file)

        dest_dir = output_dir if output_dir else path.dirname(file)
        if not writable(dest_dir):
            say.fatal('Cannot write output in {}.'.format(dest_dir))
            return

        # A rate-limit error leaves us without a result, so we pause and
        # resend, but only a bounded number of times.
        max_attempts = 3

        # Iterate over the methods.
        for method_class in classes:
            method = method_class()
            method.init_credentials(creds_dir)

            # If need to convert format, best do it after resizing original fmt.
            need_convert = fmt not in method.accepted_formats()
            # Test the dimensions, not bytes, because of compression.  Each
            # dimension is checked separately: comparing the tuples directly
            # would be lexicographic and miss, e.g., an image that is too
            # tall but narrower than the maximum width.
            (width, height) = image_dimensions(file)
            (max_width, max_height) = method.max_dimensions()
            if width > max_width or height > max_height:
                file = file_after_resizing(file, method, spinner)
            if file and need_convert:
                file = file_after_converting(file, 'jpg', method, spinner)
            if not file:
                return

            spinner.update('Sending to {} {}'.format(
                color(method, 'white', say.use_color()),
                # The trailing text needs an explicit color too, or the
                # colorization goes wrong.
                color('and waiting for response', 'info', say.use_color())))
            result = None
            for _ in range(max_attempts):
                last_time = timer()
                try:
                    result = method.result(file)
                    break
                except RateLimitExceeded:
                    time_passed = timer() - last_time
                    min_interval = 1 / method.max_rate()
                    if time_passed < min_interval:
                        spinner.warn('Pausing due to rate limits')
                        time.sleep(min_interval - time_passed)
            if result is None:
                spinner.fail('No result from {} due to rate limits'.format(
                    method))
                return
            if result.error:
                spinner.fail(result.error)
                return

            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file = alt_extension(base_path, str(method) + '.txt')
            json_file = alt_extension(base_path, str(method) + '.json')
            annot_file = alt_extension(base_path, str(method) + '.jpg')
            spinner.update('Text -> {}'.format(relative(txt_file)))
            save_output(result.text, txt_file)
            spinner.update('All data -> {}'.format(relative(json_file)))
            save_output(json.dumps(result.data), json_file)
            if annotate:
                spinner.update('Annotated image -> {}'.format(
                    relative(annot_file)))
                save_output(annotated_image(file, result.boxes), annot_file)
        spinner.stop('Done with {}'.format(relative(item)))
    except (KeyboardInterrupt, UserCancelled):
        spinner.warn('Interrupted')
        raise
    except AuthenticationFailure as err:
        spinner.fail('Unable to continue using {}: {}'.format(method, err))
        return
    except Exception:
        spinner.fail(say.error_text('Stopping due to a problem'))
        raise
def run_services(self, item, index, base_name):
    '''Run all requested services on the image indicated by "item".

    "index" and "base_name" are used to name the local copy if "item" has
    to be downloaded from a URL first.  The file is normalized, sent to
    each service in self._services (in a thread pool unless the number of
    threads is 1), and, if requested, the annotated results are combined
    into a single grid image.  Problems are reported via self._say;
    interruptions and unexpected exceptions are reported and re-raised.
    '''
    # Shortcuts to make the code more readable.
    services = self._services
    output_dir = self._output_dir
    say = self._say

    try:
        if say.use_color():
            say.info('Starting on {}'.format(styled(item, 'white')))
        else:
            say.info('Starting on {}'.format(item))

        (file, orig_fmt) = self._get(item, base_name, index)
        if not file:
            return

        dest_dir = output_dir or path.dirname(file)
        if not writable(dest_dir):
            say.error('Cannot write output in {}.'.format(dest_dir))
            return

        # Sanity check: there is nothing to do for an empty file.
        if path.getsize(file) <= 0:
            say.warn('Skipping zero-length file {}'.format(relative(file)))
            return

        # The grid file is named after the original file, so work out the
        # name before "file" gets replaced by its normalized version.
        stem = path.basename(filename_basename(file))
        grid_file = path.realpath(
            path.join(dest_dir, stem + '.all-results.png'))

        # Files created along the way that are usually deleted at the end.
        to_delete = set()

        # Normalize to the lowest common denominator.
        (normalized, intermediate_files) = self._normalized(
            file, orig_fmt, dest_dir)
        if not normalized:
            say.warn('Skipping {}'.format(relative(file)))
            return
        file = normalized
        if intermediate_files:
            to_delete.update(intermediate_files)

        # Send the file to the services.  With a single thread, bypass the
        # thread pool entirely, which makes debugging easier.
        if self._num_threads == 1:
            outcomes = [self._send(file, service, dest_dir)
                        for service in services]
        else:
            with ThreadPoolExecutor(
                    max_workers=self._num_threads) as executor:
                outcomes = list(executor.map(
                    self._send, repeat(file), services, repeat(dest_dir)))

        # A service that failed for some reason (e.g., a network glitch)
        # yields None.  Drop those and carry on with the rest.
        results = [outcome for outcome in outcomes if outcome is not None]
        to_delete.update(results)

        if self._make_grid:
            # Create the grid file.
            say.info('Creating results grid image: {}'.format(
                relative(grid_file)))
            create_image_grid(results, grid_file, max_horizontal=2)
            # Clean up after ourselves, unless all results were requested.
            if not self._extended_results:
                for leftover in to_delete:
                    delete_existing(leftover)

        say.info('Done with {}'.format(relative(item)))
    except (KeyboardInterrupt, UserCancelled):
        say.warn('Interrupted')
        raise
    except Exception:
        say.error('Stopping due to a problem')
        raise