示例#1
0
    def _do_preflight(self):
        '''Check the option values given by the user, and do other prep.

        The checks are made in this order:

          1. a network connection is available;
          2. if self.from_file is set, the file exists and is readable;
          3. if self.output_dir is set, it is either an existing directory
             that is writable, or it can be created.

        Every failure is reported with alert_fatal() and then signalled by
        raising CannotProceed carrying the matching ExitCode value
        (no_network, bad_arg or file_error).
        '''

        # This import happens at call time rather than at module load time.
        from handprint.network import network_available
        if not network_available():
            alert_fatal('No network connection.')
            raise CannotProceed(ExitCode.no_network)

        if self.from_file:
            if not exists(self.from_file):
                alert_fatal(f'File not found: {self.from_file}')
                raise CannotProceed(ExitCode.bad_arg)
            if not readable(self.from_file):
                alert_fatal(f'File not readable: {self.from_file}')
                raise CannotProceed(ExitCode.file_error)

        if self.output_dir:
            if isdir(self.output_dir):
                if not writable(self.output_dir):
                    alert_fatal(f'Directory not writable: {self.output_dir}')
                    raise CannotProceed(ExitCode.file_error)
            else:
                # os.mkdir() fails if the path already exists as something
                # other than a directory, if the parent directory is missing,
                # or if the parent is not writable.  Report those cases the
                # same way as the other file problems above instead of
                # letting a raw OSError escape.
                try:
                    os.mkdir(self.output_dir)
                except OSError as ex:
                    alert_fatal(f'Unable to create directory'
                                f' {self.output_dir}: {ex}')
                    raise CannotProceed(ExitCode.file_error) from ex
                if __debug__:
                    log(f'created output_dir directory {self.output_dir}')
示例#2
0
    def __init__(self, base_name, extended, from_file, output_dir, threads):
        '''Validate the run-time settings, then remember them on the instance.

        Raises ServiceFailure if no network is available, and RuntimeError
        if "from_file" is given but is missing or unreadable, or if
        "output_dir" is an existing directory that is not writable.  If
        "output_dir" is given and is not an existing directory, it is created.
        The five arguments are stored unchanged in private attributes only
        after every check has passed.
        '''

        if not network_available():
            raise ServiceFailure('No network.')

        # Input file checks: existence first, then readability.
        if from_file and not path.exists(from_file):
            raise RuntimeError('File not found: {}'.format(from_file))
        if from_file and not readable(from_file):
            raise RuntimeError('File not readable: {}'.format(from_file))

        # Output directory: use it if it exists and is writable, else make it.
        if output_dir:
            already_there = path.isdir(output_dir)
            if already_there and not writable(output_dir):
                raise RuntimeError(
                    'Directory not writable: {}'.format(output_dir))
            if not already_there:
                os.mkdir(output_dir)
                if __debug__:
                    log('created output_dir directory {}', output_dir)

        self._base_name, self._extended = base_name, extended
        self._from_file, self._output_dir = from_file, output_dir
        self._threads = threads
示例#3
0
def main(base_name='B',
         creds_dir='C',
         from_file='F',
         list=False,
         method='M',
         output='O',
         given_urls=False,
         quiet=False,
         no_annot=False,
         no_color=False,
         debug=False,
         version=False,
         *images):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") can
run alternative text recognition methods on images of document pages.

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known methods and then exit.  The option -m (/m on Windows) can
be used to select a specific method.  (The default method is to run them all.)

When invoked, the command-line arguments should contain one of the following:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) if given the -u option (/u on Windows), one or more URLs, which will be
    interpreted as network locations of image files to be processed;

 c) if given the -f option (/f on Windows), a file containing either image
    paths or (if combined with the -u option), image URLs

If given URLs (via the -u option), Handprint will first download the images
found at the URLs to a local directory indicated by the option -o (/o on
Windows).  Handprint will send each image file to OCR/HTR services from
Google, Microsoft and others.  It will write the results to new files placed
either in the same directories as the original files, or (if given the -o
option) to the directory indicated by the -o option value (/o on Windows).
The results will be written in files named after the original files with the
addition of a string that indicates the method used.  For example, a file
named "somefile.jpg" will produce

  somefile.jpg
  somefile.google.txt
  somefile.google.json
  somefile.microsoft.txt
  somefile.microsoft.json
  somefile.amazon.txt
  somefile.amazon.json
  ...

and so on for each image and each service used.  The .txt files will contain
the text extracted (if any).  The .json files will contain the complete
response from the service, converted to JSON by Handprint.  In some cases,
such as Google's API, the service may offer multiple operations and will
return individual results for different API calls or options; in those cases,
Handprint combines the results of multiple API calls into a single JSON
object.

Unless given the do-not-annotate option, -A (/A on Windows), Handprint will
also generate a copy of the image with superimposed bounding boxes and text
to show the recognition results.  The annotated images will include the name
of the service; in other words, the list of files produced by Handprint will
include

  somefile.google.jpg
  somefile.microsoft.jpg
  ...

and so on.  (They are distinguished from the original unannotated image, which
will be left in somefile.jpg.)

Note that if -u (/u on Windows) is given, then an output directory MUST also
be specified using the option -o (/o on Windows) because it is not possible
to write the results in the network locations represented by the URLs.  Also,
when -u is used, the images and text results will be stored in files whose
root names have the form "document-N", where "N" is an integer.  The root
name can be changed using the -r option (/r on Windows).  The image will be
converted to ordinary JPEG format for maximum compatibility with the
different OCR services and written to "document-N.jpg", and the URL
corresponding to each document will be written in a file named
"document-N.url" so that it is possible to connect each "document-N.jpg" to
the URL it came from.

If images are too large for a method/service, then Handprint will resize them
prior to sending them.  It will write the reduced image to a file named
"FILENAME-reduced.EXT", where "FILENAME" is the original file name and "EXT"
is the file extension.  This means that if an image needs to be resized, the
results of applying the text recognition methods will be, e.g.,

  somefile-reduced.jpg
  somefile-reduced.google.txt
  somefile-reduced.google.jpg
  somefile-reduced.google.json
  somefile-reduced.microsoft.txt
  somefile-reduced.microsoft.jpg
  somefile-reduced.microsoft.json
  somefile-reduced.amazon.txt
  somefile-reduced.amazon.jpg
  somefile-reduced.amazon.json
  ...

Credentials for different services need to be provided to Handprint in the
form of JSON files.  Each service needs a separate JSON file named after the
service (e.g., "microsoft_credentials.json") and placed in a directory that
Handprint searches.  By default, Handprint searches for the files in a
subdirectory named "creds" where Handprint is installed, but an alternative
directory can be indicated at run-time using the -c command-line option (/c
on Windows).  The specific format of each credentials file is different for
each service; please consult the Handprint documentation for more details.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.

If given the -V option (/V on Windows), this program will print version
information and exit without doing anything else.
'''

    # NOTE(review): the docstring above reads as the command-line help text,
    # so its wording is left untouched here -- confirm how it is consumed.
    #
    # The single-letter string defaults in the signature ('B', 'C', 'F', 'M'
    # and 'O') are placeholders meaning "option not given"; each one is
    # replaced by its real default value further down.

    # Reverse some flags for easier code readability
    annotate = not no_annot

    # Prepare notification methods and hints.
    say = MessageHandlerCLI(not no_color, quiet)
    prefix = '/' if ON_WINDOWS else '-'
    hint = '(Hint: use {}h for help.)'.format(prefix)

    # Process arguments.  The version and list requests are answered before
    # the network check, so they work without a connection.
    if debug:
        set_debug(True)
    if version:
        print_version()
        exit()
    if list:
        say.info('Known methods:')
        for key in KNOWN_METHODS:
            say.info('   {}'.format(key))
        exit()
    if not network_available():
        exit(say.fatal_text('No network.'))

    if from_file == 'F':
        from_file = None
    else:
        # Relative paths are resolved against the current working directory.
        if not path.isabs(from_file):
            from_file = path.realpath(path.join(os.getcwd(), from_file))
        if not path.exists(from_file):
            exit(say.error_text('File not found: {}'.format(from_file)))
        if not readable(from_file):
            exit(say.error_text('File not readable: {}'.format(from_file)))

    if not images and not from_file:
        exit(say.error_text('Need provide images or URLs. {}'.format(hint)))
    if any(item.startswith('-') for item in images):
        exit(
            say.error_text(
                'Unrecognized option in arguments. {}'.format(hint)))

    if creds_dir == 'C':
        creds_dir = path.join(handprint_path(), 'creds')
    if not readable(creds_dir):
        exit(say.error_text('Directory not readable: {}'.format(creds_dir)))
    else:
        if __debug__: log('Assuming credentials found in {}.', creds_dir)

    if method == 'M':
        method = 'all'
    method = method.lower()
    if method != 'all' and method not in KNOWN_METHODS:
        exit(
            say.error_text('"{}" is not a known method. {}'.format(
                method, hint)))

    # Only decide here whether the output directory has to be created; the
    # creation itself happens after the remaining argument checks, so that a
    # rejected command line does not leave a stray directory behind.
    must_create_output = False
    if output == 'O':
        output = None
    else:
        if not path.isabs(output):
            output = path.realpath(path.join(os.getcwd(), output))
        if path.isdir(output):
            if not writable(output):
                exit(
                    say.error_text(
                        'Directory not writable: {}'.format(output)))
        else:
            must_create_output = True
    if given_urls and not output:
        exit(say.error_text('Must provide an output directory if using URLs.'))
    if base_name != 'B' and not given_urls:
        exit(
            say.error_text(
                'Option {}r can only be used with URLs.'.format(prefix)))
    if base_name == 'B':
        base_name = 'document'

    if must_create_output:
        # os.mkdir() fails if the path exists as a non-directory, or if the
        # parent directory is missing or not writable.  Turn that into a
        # normal error message rather than an unhandled traceback.
        try:
            os.mkdir(output)
        except OSError as err:
            exit(
                say.error_text(
                    'Unable to create directory {}: {}'.format(output, err)))
        if __debug__: log('Created output directory {}', output)

    # Create a list of files to be processed.
    targets = targets_from_arguments(images, from_file, given_urls, say)
    if not targets:
        exit(say.warn_text('No images to process; quitting.'))

    # Let's do this thing.
    try:
        num_items = len(targets)
        print_separators = num_items > 1 and not say.be_quiet()
        if method == 'all':
            # Order doesn't really matter; just make it consistent run-to-run.
            methods = sorted(KNOWN_METHODS.values(), key=str)
            say.info(
                'Will apply all known methods to {} images.'.format(num_items))
        else:
            methods = [KNOWN_METHODS[method]]
            say.info('Will apply method "{}" to {} images.'.format(
                method, num_items))
        for index, item in enumerate(targets, start=1):
            if print_separators:
                say.msg('=' * 70, 'dark')
            run(methods, item, index, base_name, output, creds_dir, annotate,
                say)
        if print_separators:
            say.msg('=' * 70, 'dark')
    except (KeyboardInterrupt, UserCancelled):
        exit(say.info_text('Quitting.'))
    except ServiceFailure as err:
        exit(say.error_text(str(err)))
    except Exception as err:
        # With the debug flag set, drop into the debugger before exiting.
        if debug:
            import pdb
            pdb.set_trace()
        exit(say.error_text('{}\n{}'.format(str(err), traceback.format_exc())))
    say.info('Done.')