Exemplo n.º 1
0
    def __init__(self, base_name, extended, from_file, output_dir, threads):
        '''Check the environment and the given paths, then store the settings.

        Raises ServiceFailure if no network is available.  Raises RuntimeError
        if "from_file" is given but does not exist or is not readable, or if
        "output_dir" is an existing directory that is not writable.  If
        "output_dir" is given but is not an existing directory, it is created.
        '''

        if not network_available():
            raise ServiceFailure('No network.')

        # The input file is optional, but when named it has to be usable.
        if from_file and not path.exists(from_file):
            raise RuntimeError('File not found: {}'.format(from_file))
        if from_file and not readable(from_file):
            raise RuntimeError('File not readable: {}'.format(from_file))

        # The output directory is optional as well; create it when missing,
        # otherwise insist that it can be written to.
        if output_dir and not path.isdir(output_dir):
            os.mkdir(output_dir)
            if __debug__:
                log('created output_dir directory {}', output_dir)
        elif output_dir and not writable(output_dir):
            raise RuntimeError(
                'Directory not writable: {}'.format(output_dir))

        # Everything checked out; remember the configuration.
        self._base_name = base_name
        self._extended = extended
        self._from_file = from_file
        self._output_dir = output_dir
        self._threads = threads
Exemplo n.º 2
0
def main(base_name='B',
         creds_dir='C',
         from_file='F',
         list=False,
         method='M',
         output='O',
         given_urls=False,
         quiet=False,
         no_annot=False,
         no_color=False,
         debug=False,
         version=False,
         *images):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") can
run alternative text recognition methods on images of document pages.

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known methods and then exit.  The option -m (/m on Windows) can
be used to select a specific method.  (The default method is to run them all.)

When invoked, the command-line arguments should contain one of the following:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) if given the -u option (/u on Windows), one or more URLs, which will be
    interpreted as network locations of image files to be processed;

 c) if given the -f option (/f on Windows), a file containing either image
    paths or (if combined with the -u option), image URLs

If given URLs (via the -u option), Handprint will first download the images
found at the URLs to a local directory indicated by the option -o (/o on
Windows).  Handprint will send each image file to OCR/HTR services from
Google, Microsoft and others.  It will write the results to new files placed
either in the same directories as the original files, or (if given the -o
option) to the directory indicated by the -o option value (/o on Windows).
The results will be written in files named after the original files with the
addition of a string that indicates the method used.  For example, a file
named "somefile.jpg" will produce

  somefile.jpg
  somefile.google.txt
  somefile.google.json
  somefile.microsoft.txt
  somefile.microsoft.json
  somefile.amazon.txt
  somefile.amazon.json
  ...

and so on for each image and each service used.  The .txt files will contain
the text extracted (if any).  The .json files will contain the complete
response from the service, converted to JSON by Handprint.  In some cases,
such as Google's API, the service may offer multiple operations and will
return individual results for different API calls or options; in those cases,
Handprint combines the results of multiple API calls into a single JSON
object.

Unless given the do-not-annotate option, -A (/A on Windows), Handprint will
also generate a copy of the image with superimposed bounding boxes and text
to show the recognition results.  The annotated images will include the name
of the service; in other words, the list of files produced by Handprint will
include

  somefile.google.jpg
  somefile.microsoft.jpg
  ...

and so on.  (They are distinguished from the original unannotated image, which
will be left in somefile.jpg.)

Note that if -u (/u on Windows) is given, then an output directory MUST also
be specified using the option -o (/o on Windows) because it is not possible
to write the results in the network locations represented by the URLs.  Also,
when -u is used, the images and text results will be stored in files whose
root names have the form "document-N", where "N" is an integer.  The root
name can be changed using the -r option (/r on Windows).  The image will be
converted to ordinary JPEG format for maximum compatibility with the
different OCR services and written to "document-N.jpg", and the URL
corresponding to each document will be written in a file named
"document-N.url" so that it is possible to connect each "document-N.jpg" to
the URL it came from.

If images are too large for a method/service, then Handprint will resize them
prior to sending them.  It will write the reduced image to a file named
"FILENAME-reduced.EXT", where "FILENAME" is the original file name and "EXT"
is the file extension.  This means that if an image needs to be resized, the
results of applying the text recognition methods will be, e.g.,

  somefile-reduced.jpg
  somefile-reduced.google.txt
  somefile-reduced.google.jpg
  somefile-reduced.google.json
  somefile-reduced.microsoft.txt
  somefile-reduced.microsoft.jpg
  somefile-reduced.microsoft.json
  somefile-reduced.amazon.txt
  somefile-reduced.amazon.jpg
  somefile-reduced.amazon.json
  ...

Credentials for different services need to be provided to Handprint in the
form of JSON files.  Each service needs a separate JSON file named after the
service (e.g., "microsoft_credentials.json") and placed in a directory that
Handprint searches.  By default, Handprint searches for the files in a
subdirectory named "creds" where Handprint is installed, but an alternative
directory can be indicated at run-time using the -c command-line option (/c
on Windows).  The specific format of each credentials file is different for
each service; please consult the Handprint documentation for more details.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.

If given the -V option (/V on Windows), this program will print version
information and exit without doing anything else.
'''

    # NOTE: the single-letter string defaults in the signature ('B', 'C',
    # 'F', 'M', 'O') serve as "option not given" placeholders.  Each one is
    # replaced below by None or by the real default value.  The parameter
    # named "list" shadows the builtin inside this function; it is part of
    # the function's interface, so it is left unchanged.

    # Reverse some flags for easier code readability
    annotate = not no_annot

    # Prepare notification methods and hints.
    say = MessageHandlerCLI(not no_color, quiet)
    prefix = '/' if ON_WINDOWS else '-'
    hint = '(Hint: use {}h for help.)'.format(prefix)

    # Process arguments.
    if debug:
        set_debug(True)
    if version:
        print_version()
        exit()
    if list:
        say.info('Known methods:')
        for key in KNOWN_METHODS:
            say.info('   {}'.format(key))
        exit()
    if not network_available():
        exit(say.fatal_text('No network.'))

    if from_file == 'F':
        from_file = None
    else:
        if not path.isabs(from_file):
            from_file = path.realpath(path.join(os.getcwd(), from_file))
        if not path.exists(from_file):
            exit(say.error_text('File not found: {}'.format(from_file)))
        if not readable(from_file):
            exit(say.error_text('File not readable: {}'.format(from_file)))

    if not images and not from_file:
        exit(say.error_text('Need provide images or URLs. {}'.format(hint)))
    if any(item.startswith('-') for item in images):
        exit(
            say.error_text(
                'Unrecognized option in arguments. {}'.format(hint)))

    if creds_dir == 'C':
        creds_dir = path.join(handprint_path(), 'creds')
    if not readable(creds_dir):
        exit(say.error_text('Directory not readable: {}'.format(creds_dir)))
    else:
        if __debug__: log('Assuming credentials found in {}.', creds_dir)

    if method == 'M':
        method = 'all'
    method = method.lower()
    if method != 'all' and method not in KNOWN_METHODS:
        exit(
            say.error_text('"{}" is not a known method. {}'.format(
                method, hint)))

    if output == 'O':
        output = None
    else:
        if not path.isabs(output):
            output = path.realpath(path.join(os.getcwd(), output))
        if path.isdir(output):
            if not writable(output):
                exit(
                    say.error_text(
                        'Directory not writable: {}'.format(output)))
        else:
            # Creating the directory can fail (e.g., the parent directory
            # does not exist, or the path names an existing regular file).
            # Report that like every other argument problem instead of
            # letting a raw traceback escape.
            try:
                os.mkdir(output)
            except OSError as err:
                exit(
                    say.error_text(
                        'Unable to create directory {}: {}'.format(
                            output, err)))
            if __debug__: log('Created output directory {}', output)
    if given_urls and not output:
        exit(say.error_text('Must provide an output directory if using URLs.'))
    if base_name != 'B' and not given_urls:
        exit(
            say.error_text(
                'Option {}r can only be used with URLs.'.format(prefix)))
    if base_name == 'B':
        base_name = 'document'

    # Create a list of files to be processed.
    targets = targets_from_arguments(images, from_file, given_urls, say)
    if not targets:
        exit(say.warn_text('No images to process; quitting.'))

    # Let's do this thing.
    try:
        num_items = len(targets)
        print_separators = num_items > 1 and not say.be_quiet()
        if method == 'all':
            # Order doesn't really matter; just make it consistent run-to-run.
            methods = sorted(KNOWN_METHODS.values(), key=str)
            say.info(
                'Will apply all known methods to {} images.'.format(num_items))
        else:
            methods = [KNOWN_METHODS[method]]
            say.info('Will apply method "{}" to {} images.'.format(
                method, num_items))
        for index, item in enumerate(targets, start=1):
            if print_separators:
                say.msg('=' * 70, 'dark')
            run(methods, item, index, base_name, output, creds_dir, annotate,
                say)
        if print_separators:
            say.msg('=' * 70, 'dark')
    except (KeyboardInterrupt, UserCancelled):
        exit(say.info_text('Quitting.'))
    except ServiceFailure as err:
        exit(say.error_text(str(err)))
    except Exception as err:
        # In debug mode, drop into the debugger at the point of failure
        # before reporting the error and exiting.
        if debug:
            import pdb
            pdb.set_trace()
        exit(say.error_text('{}\n{}'.format(str(err), traceback.format_exc())))
    say.info('Done.')
Exemplo n.º 3
0
def run(classes, item, index, base_name, output_dir, creds_dir, annotate, say):
    '''Apply every text recognition method in "classes" to a single item.

    "item" is either a URL or a file path.  A URL is first checked to make
    sure it points to an image and is then downloaded into "output_dir" as
    "<base_name>-<index>.<subtype>", with the URL itself saved in a companion
    ".url" file.  A file path is resolved against the current directory.

    Each element of "classes" is called with no arguments to create a method
    object, which is given "creds_dir" via init_credentials().  The image is
    resized and/or converted to JPEG if the method requires it, sent to the
    method, and the results are written next to the image (or in
    "output_dir" when given) as ".txt" and ".json" files plus, if "annotate"
    is true, an annotated ".jpg" image.

    If a method raises RateLimitExceeded, this function pauses as dictated by
    the method's max_rate() and sends the request again, up to a fixed number
    of attempts; if the limit is still exceeded after that, the exception
    propagates.

    Problems with the item are reported through the progress indicator and
    end the processing of this item.  AuthenticationFailure is reported and
    swallowed; KeyboardInterrupt, UserCancelled and any other exception are
    reported and re-raised.  The function returns None.
    '''
    # Upper bound on how many times one request is sent when the service
    # keeps answering that the rate limit has been exceeded.
    max_attempts = 3

    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        spinner.start('Starting on {}'.format(relative(item)))
        if is_url(item):
            # Make sure the URLs point to images.
            if __debug__: log('Testing if URL contains an image: {}', item)
            try:
                response = request.urlopen(item)
            except Exception as err:
                if __debug__:
                    log('Network access resulted in error: {}', str(err))
                spinner.fail('Skipping URL due to error: {}'.format(err))
                return
            # Only the headers are needed here (the actual download is done
            # separately below), so close the connection as soon as they
            # have been read instead of leaking it.
            with response:
                maintype = response.headers.get_content_maintype()
                fmt = response.headers.get_content_subtype()
            if maintype != 'image':
                spinner.fail('Did not find an image at {}'.format(item))
                return
            base = '{}-{}'.format(base_name, index)
            file = path.realpath(path.join(output_dir, base + '.' + fmt))
            error = download(item, file)
            if not error:
                spinner.update('Wrote contents to {}'.format(relative(file)))
            else:
                spinner.fail('Failed to download {}: {}'.format(item, error))
                return
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                spinner.update('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            fmt = filename_extension(file)

        dest_dir = output_dir if output_dir else path.dirname(file)
        if not writable(dest_dir):
            say.fatal('Cannot write output in {}.'.format(dest_dir))
            return

        # Iterate over the methods.
        for method_class in classes:
            method = method_class()
            method.init_credentials(creds_dir)
            last_time = timer()

            # If need to convert format, best do it after resizing original fmt.
            need_convert = fmt not in method.accepted_formats()
            # Test the dimensions, not bytes, because of compression.
            # NOTE(review): if both sides are (width, height) tuples, this is
            # a lexicographic comparison -- confirm that is what is intended.
            if image_dimensions(file) > method.max_dimensions():
                file = file_after_resizing(file, method, spinner)
            if file and need_convert:
                file = file_after_converting(file, 'jpg', method, spinner)
            if not file:
                return

            spinner.update('Sending to {} {}'.format(
                color(method, 'white', say.use_color()),
                # Need explicit color research or colorization goes wrong.
                color('and waiting for response', 'info', say.use_color())))

            # Send the request, pausing and resending if the service says we
            # are going too fast.  Without the resend there would be no
            # result to examine after a rate-limit exception.
            for attempt in range(1, max_attempts + 1):
                try:
                    result = method.result(file)
                    break
                except RateLimitExceeded:
                    if attempt == max_attempts:
                        raise
                    min_interval = 1 / method.max_rate()
                    time_passed = timer() - last_time
                    if time_passed < min_interval:
                        spinner.warn('Pausing due to rate limits')
                        time.sleep(min_interval - time_passed)
                    last_time = timer()
            if result.error:
                spinner.fail(result.error)
                return

            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file = alt_extension(base_path, str(method) + '.txt')
            json_file = alt_extension(base_path, str(method) + '.json')
            annot_file = alt_extension(base_path, str(method) + '.jpg')
            spinner.update('Text -> {}'.format(relative(txt_file)))
            save_output(result.text, txt_file)
            spinner.update('All data -> {}'.format(relative(json_file)))
            save_output(json.dumps(result.data), json_file)
            if annotate:
                spinner.update('Annotated image -> {}'.format(
                    relative(annot_file)))
                save_output(annotated_image(file, result.boxes), annot_file)
        spinner.stop('Done with {}'.format(relative(item)))
    except (KeyboardInterrupt, UserCancelled):
        spinner.warn('Interrupted')
        raise
    except AuthenticationFailure as err:
        spinner.fail('Unable to continue using {}: {}'.format(method, err))
        return
    except Exception:
        spinner.fail(say.error_text('Stopping due to a problem'))
        raise
Exemplo n.º 4
0
    def run_services(self, item, index, base_name):
        '''Process one image, "item", with every configured service.

        The item is first obtained via self._get(), which is handed
        "base_name" and "index" (used to name a local copy when the item has
        to be downloaded).  The file is then normalized, sent to each
        service, optionally summarized in a results grid image, and finally
        the temporary files are cleaned up.  Returns None.  Exceptions are
        reported through the message handler and then re-raised.
        '''
        # Local aliases for frequently used settings.
        services = self._services
        output_dir = self._output_dir
        say = self._say

        try:
            if say.use_color():
                shown_item = styled(item, 'white')
            else:
                shown_item = item
            say.info('Starting on {}'.format(shown_item))

            image_path, orig_fmt = self._get(item, base_name, index)
            if not image_path:
                return

            dest_dir = output_dir or path.dirname(image_path)
            if not writable(dest_dir):
                say.error('Cannot write output in {}.'.format(dest_dir))
                return

            # Nothing useful can be done with an empty file.
            if path.getsize(image_path) <= 0:
                say.warn('Skipping zero-length file {}'.format(
                    relative(image_path)))
                return

            # The grid file is named after the original file, so work out
            # its name before the file variable is replaced below.
            root_name = path.basename(filename_basename(image_path))
            grid_name = root_name + '.all-results.png'
            grid_file = path.realpath(path.join(dest_dir, grid_name))

            # Files created along the way that are candidates for removal.
            to_delete = set()

            # Bring the image to a form all services can accept.
            normalized = self._normalized(image_path, orig_fmt, dest_dir)
            new_file, intermediate_files = normalized
            if not new_file:
                say.warn('Skipping {}'.format(relative(image_path)))
                return
            image_path = new_file
            if intermediate_files:
                to_delete.update(intermediate_files)

            # Send the file to the services.  With a single thread, skip the
            # thread pool entirely, which makes debugging easier.
            if self._num_threads == 1:
                results = [self._send(image_path, service, dest_dir)
                           for service in services]
            else:
                with ThreadPoolExecutor(
                        max_workers=self._num_threads) as executor:
                    results = list(
                        executor.map(
                            lambda service: self._send(image_path, service,
                                                       dest_dir),
                            services))

            # A service that failed (e.g., due to a network glitch) yields
            # no result.  Discard those and carry on with the others.
            results = [outcome for outcome in results if outcome is not None]
            to_delete.update(results)

            # Produce the grid image when asked to.
            if self._make_grid:
                say.info('Creating results grid image: {}'.format(
                    relative(grid_file)))
                create_image_grid(results, grid_file, max_horizontal=2)

            # Remove the individual files unless they were asked for.
            if self._make_grid and not self._extended_results:
                for image_file in to_delete:
                    delete_existing(image_file)

            say.info('Done with {}'.format(relative(item)))
        except (KeyboardInterrupt, UserCancelled):
            say.warn('Interrupted')
            raise
        except Exception:
            say.error('Stopping due to a problem')
            raise
Exemplo n.º 5
0
def run(method_class, targets, given_urls, output_dir, root_name, creds_dir, say):
    '''Apply one text recognition method to every item in "targets".

    "method_class" is called with no arguments to create the tool, which is
    given "creds_dir" via init_credentials().  When "given_urls" is true,
    each target is treated as a URL: it must point to an image in one of the
    ACCEPTED_FORMATS, and it is downloaded into "output_dir" under the name
    "<root_name>-<N>.<subtype>" (N being the 1-based position of the target),
    with the URL itself saved in a companion ".url" file.  Otherwise each
    target is a file path resolved against the current directory, and
    targets that look like URLs are skipped with a warning.

    Images whose format is in FORMATS_MUST_CONVERT are converted to JPEG
    first.  The extracted text and the full results are written as
    ".<tool name>.txt" and ".<tool name>.json" files in "output_dir" if
    given, or else in the image's own directory.

    Targets that cannot be fetched or converted are reported and skipped.
    The function returns None; it returns early if the destination directory
    is not writable.  Exceptions are re-raised after the progress indicator
    has been stopped or marked as failed.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        tool = method_class()
        tool_name = tool.name()
        say.info('Using method "{}".'.format(tool_name))
        tool.init_credentials(creds_dir)
        for index, item in enumerate(targets, 1):
            if not given_urls and item.startswith(('http', 'ftp')):
                say.warn('Skipping URL "{}"'.format(item))
                continue
            if say.use_color() and not say.be_quiet():
                action = 'Downloading' if given_urls else 'Reading'
                spinner.start('{} {}'.format(action, item))
            fmt = None
            if given_urls:
                # Make sure the URLs point to images.  Only the headers are
                # needed for that (the download is done separately below),
                # so close the connection once they have been read.
                with request.urlopen(item) as response:
                    maintype = response.headers.get_content_maintype()
                    fmt = response.headers.get_content_subtype()
                if maintype != 'image':
                    spinner.fail('Did not find an image at "{}"'.format(item))
                    continue
                if fmt not in ACCEPTED_FORMATS:
                    spinner.fail('Cannot use image format {} in "{}"'.format(fmt, item))
                    continue
                # If we're given URLs, we have to invent file names to store
                # the images and the OCR results.
                base = '{}-{}'.format(root_name, index)
                url_file = path.realpath(path.join(output_dir, base + '.url'))
                if __debug__: log('Writing URL to {}', url_file)
                with open(url_file, 'w') as f:
                    f.write(url_file_content(item))
                file = path.realpath(path.join(output_dir, base + '.' + fmt))
                if __debug__: log('Starting wget on {}', item)
                (success, error) = download_url(item, file)
                if not success:
                    spinner.fail('Failed to download {}: {}'.format(item, error))
                    continue
            else:
                file = path.realpath(path.join(os.getcwd(), item))
                fmt = filename_extension(file)
            if output_dir:
                dest_dir = output_dir
            else:
                dest_dir = path.dirname(file)
                if not writable(dest_dir):
                    say.fatal('Cannot write output in "{}".'.format(dest_dir))
                    return
            if fmt in FORMATS_MUST_CONVERT:
                spinner.update('Converting file format to JPEG: "{}"'.format(file))
                (success, converted_file, msg) = convert_image(file, fmt, 'jpeg')
                if not success:
                    spinner.fail('Failed to convert "{}": {}'.format(file, msg))
                    # There is no usable converted file, so move on to the
                    # next target rather than sending a bad path to the tool.
                    continue
                # Note: 'file' now points to the converted file, not the original
                file = converted_file
            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file  = replace_extension(base_path, '.' + tool_name + '.txt')
            json_file = replace_extension(base_path, '.' + tool_name + '.json')
            spinner.update('Sending to {} for text extraction'.format(tool_name))
            save_output(tool.document_text(file), txt_file)
            spinner.update('Text from {} saved in {}'.format(tool_name, txt_file))
            spinner.update('All data from {} saved in {}'.format(tool_name, json_file))
            save_output(json.dumps(tool.all_results(file)), json_file)
            if say.use_color() and not say.be_quiet():
                short_path = path.relpath(txt_file, os.getcwd())
                spinner.stop('{} -> {}'.format(item, short_path))
    except (KeyboardInterrupt, UserCancelled):
        if spinner:
            spinner.stop()
        raise
    except Exception:
        if spinner:
            spinner.fail(say.error_text('Stopping due to a problem'))
        raise