Example #1
 def _resized_image(self, file):
     (max_width, max_height) = self._max_dimensions
     file_ext = filename_extension(file)
     name_tail = '.handprint' + file_ext
     new_file = file if name_tail in file else filename_basename(file) + name_tail
     if path.exists(new_file) and readable(new_file):
         from handprint.images import image_dimensions
         (image_width, image_height) = image_dimensions(new_file)
         if image_width < max_width and image_height < max_height:
             inform(f'Using reduced image found in {relative(new_file)}')
             return new_file
         else:
             # We found a "-reduced" file, perhaps from a previous run, but
             # for the current set of services, dimension are too large.
             if __debug__:
                 log('existing resized file larger than' +
                     f' {max_width}x{max_height}: {new_file}')
     inform(f'Dimensions too large; reducing dimensions: {relative(file)}')
     from handprint.images import reduced_image_dimensions
     (resized, error) = reduced_image_dimensions(file, new_file, max_width,
                                                 max_height)
     if error:
         alert(f'Failed to re-dimension {relative(file)}: {error}')
         return None
     return resized
Example #2
 def _smaller_file(self, file):
     if not file:
         return None
     file_ext = filename_extension(file)
     name_tail = '.handprint' + file_ext
     new_file = file if name_tail in file else filename_basename(file) + name_tail
     if path.exists(new_file):
         from handprint.images import image_size
         if image_size(new_file) < self._max_size:
             inform(f'Reusing resized image found in {relative(new_file)}')
             return new_file
         else:
             # We found a ".handprint.ext" file, perhaps from a previous run,
             # but for the current set of services, it's larger than allowed.
             if __debug__:
                 log('existing resized file larger than' +
                     f' {self._max_size}b: {new_file}')
     inform(f'Size too large; reducing size: {relative(file)}')
     from handprint.images import reduced_image_size
     (resized, error) = reduced_image_size(file, new_file, self._max_size)
     if error:
         alert(f'Failed to resize {relative(file)}: {error}')
         return None
     return resized
Example #3
    def articles_from(self, doi_file):
        '''Returns a list of `Article` tuples from a file of DOIs.'''
        if __debug__: log(f'reading {doi_file}')
        requested_dois = []
        with open(doi_file, 'r') as file:
            requested_dois = [line.strip() for line in file]

        num = len(requested_dois)
        # I'd use pluralized() here, but it matches case when it adds an 's',
        # and it is confused by DOI being an acronym.  We must add the 's' ourselves.
        inform(f'Found {num} DOI{"s" if num > 1 else ""} in {doi_file}.')
        if not requested_dois:
            if __debug__: log(f'could not read any lines from {doi_file}')
            return []

        all_articles = self.all_articles()
        all_dois = [article.doi for article in all_articles]
        skipped = 0
        for doi in requested_dois:
            if doi not in all_dois:
                warn(
                    f'Skipping "{doi}" because it is unknown for this journal.'
                )
                skipped += 1
        if skipped:
            kept = num - skipped
            inform(
                f'Using {kept} DOI{"s" if kept > 1 else ""} from {doi_file}.')
        return [
            article for article in all_articles
            if article.doi in requested_dois
        ]
Example #4
def download_file(url, output_file, user=None, pswd=None):
    inform(f'Downloading {url}')
    try:
        download(url, user, pswd, output_file)
        return True
    except (NoContent, ServiceFailure, AuthFailure) as ex:
        alert(str(ex))
    return False
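
The download() helper and the exception classes caught above come from elsewhere in the project and are not shown here. As a rough, self-contained illustration of the same idea (turn transport errors into a boolean so callers can write a plain "if not download_file(...)" test), the following sketch uses only the standard library; the function name and error handling are assumptions, not the project's actual code:

import shutil
import urllib.request

def download_file_sketch(url, output_file):
    # Hypothetical stand-in for download_file(): fetch a URL into a local
    # file and report success as a boolean instead of letting network
    # errors propagate (urllib's URLError is a subclass of OSError).
    print(f'Downloading {url}')
    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            with open(output_file, 'wb') as out:
                shutil.copyfileobj(response, out)
        return True
    except OSError as ex:
        print(f'Download failed: {ex}')
        return False
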
Example #5
 def _converted_file(self, file, to_format, dest_dir):
     basename = path.basename(filename_basename(file))
     new_file = path.join(dest_dir, basename + '.handprint.' + to_format)
     if path.exists(new_file):
         inform(f'Using existing converted image in {relative(new_file)}')
         return new_file
     else:
         inform(f'Converting to {to_format} format: {relative(file)}')
         from handprint.images import converted_image
         (converted, error) = converted_image(file, to_format, new_file)
         if error:
             alert(f'Failed to convert {relative(file)}: {error}')
             return None
         return converted
Example #6
 def _print_articles(self, article_list):
     inform('-' * 89)
     inform('{:3}  {:<32}  {:10}  {:20}'.format(
         '?', 'DOI', 'Date', f'URL ({self.journal.base_urls[0]})'))
     inform('-' * 89)
     count = 0
     for article in article_list:
         count += 1
         status = 'OK' if article.status != 'incomplete' else '[alert]err[/]'
         doi = article.doi if article.doi else '[alert]missing DOI[/]'
         date = article.date if article.date else '[alert]missing date[/]'
         url = article.pdf if article.pdf else '[alert]missing URL[/]'
         for base in self.journal.base_urls:
             url = url.replace(base, '')
         inform('{:3}  {:<32}  {:10}  {:20}'.format(status, doi, date, url))
     inform('-' * 89)
Example #7
    def _get(self, item, base_name, index):
        # Shortcuts to make the code more readable.
        output_dir = self._output_dir

        # For URLs, we download the corresponding files and name them with
        # the base_name.
        from validator_collection.checkers import is_url
        if is_url(item):
            # First make sure the URL actually points to an image.
            if __debug__: log(f'testing if URL contains an image: {item}')
            headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
            try:
                request = urllib.request.Request(item, None, headers)
                response = urllib.request.urlopen(request)
            except Exception as ex:
                warn(f'Skipping URL due to error: {ex}')
                return (None, None)
            if response.headers.get_content_maintype() != 'image':
                warn(f'Did not find an image at {item}')
                return (None, None)
            orig_fmt = response.headers.get_content_subtype()
            base = f'{base_name}-{index}'
            # If we weren't given an output dir, then for URLs, we have no
            # choice but to use the current dir to download the file.
            # Important: don't change self._output_dir because if other
            # inputs *are* files, then those files will need other output dirs.
            if not output_dir:
                output_dir = os.getcwd()
            file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
            if not download_file(item, file):
                warn(f'Unable to download {item}')
                return (None, None)
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                inform(
                    f'Wrote URL to [white on grey42]{relative(url_file)}[/]')
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            orig_fmt = filename_extension(file)[1:]

        if not path.getsize(file) > 0:
            warn(f'File has zero length: {relative(file)}')
            return (None, None)

        if __debug__: log(f'{relative(file)} has original format {orig_fmt}')
        return (file, orig_fmt)
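
The url_file_content() helper called above is not part of this excerpt. A .url file is conventionally a small INI-style "Internet shortcut", so a plausible stand-in (an assumption, not necessarily what Handprint actually writes) could be:

def url_file_content(url):
    # Hypothetical stand-in: produce the conventional "Internet shortcut"
    # format understood by Windows and most file managers.
    return f'[InternetShortcut]\nURL={url}\n'
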
Example #8
    def _save_articles(self, dest_dir, article_list, dest_service,
                       zip_articles):
        # This overwrites the article.status field of each article with an
        # error description if there is an error.
        saved_files = []
        for article in article_list:
            # Start by testing that we have all the data we will need.
            if not article.doi:
                warn('Skipping article with missing DOI: ' + article.title)
                article.status = 'missing-doi'
                continue
            if not article.pdf:
                warn('Skipping article with missing PDF URL: ' + article.doi)
                article.status = 'missing-pdf'
                continue
            if self.journal.uses_jats and not article.jats:
                # We need JATS for PMC.
                warn('Skipping article with missing JATS URL: ' + article.doi)
                article.status = 'missing-jats'
                continue
            xmldict = self.journal.article_metadata(article)
            if not xmldict:
                warn('Skipping article with missing metadata: ' + article.doi)
                article.status = 'missing-' + self.journal.metadata_source.lower()
                continue

            # Looks good. Carry on.
            if dest_service == 'pmc':
                self._save_article_pmc(dest_dir, article, xmldict,
                                       zip_articles)
            else:
                self._save_article_portico(dest_dir, article, xmldict)
            saved_files.append(article)

        # After we've downloaded everything, maybe zip it all up together.
        if zip_articles and dest_service != 'pmc':
            final_file = self.output_dir + '.zip'
            inform(f'Creating ZIP archive file "{final_file}"')
            comments = zip_comments(len(article_list), self.journal.name)
            archive_directory(final_file, self.output_dir, comments)
            if __debug__: log(f'verifying ZIP file {final_file}')
            verify_archive(final_file, 'zip')
            if __debug__: log(f'deleting directory {self.output_dir}')
            shutil.rmtree(self.output_dir)
Example #9
    def write_link(self, file_path, uri):
        '''Write the "uri" into the "Where From" metadata attribute of "file_path".'''

        # file pathname string may contain '{' and '}', so guard against it.
        fp = antiformat(file_path)
        file = antiformat(f'[steel_blue3]{file_path}[/]')
        if not self.overwrite:
            (wherefroms, malformed) = self._wherefroms(file_path)
            if wherefroms:
                if wherefroms[0] == uri:
                    inform(
                        f'Zotero link already present in "Where from" of {file}'
                    )
                    # The link is already present.  However, if the attribute
                    # value was malformed (maybe due to the use of a buggy
                    # previous version of Zowie or manual experiments by the
                    # user), we should correct it even if -o is not in effect.
                    if not malformed:
                        return
                elif type(wherefroms[0]) is str and wherefroms[0].startswith(
                        'zotero://'):
                    inform(
                        f'Updating existing Zotero link in "Where from" of {file}'
                    )
                    wherefroms[0] = uri
                else:
                    inform(
                        f'Prepending Zotero link to front of "Where from" of {file}'
                    )
                    wherefroms.insert(0, uri)
            else:
                if __debug__: log(f'no prior wherefroms found on {fp}')
                inform(
                    f'Writing Zotero link into "Where From" metadata of {file}'
                )
                wherefroms = [uri]
        else:
            inform(
                f'Overwriting "Where From" metadata with Zotero link in {file}'
            )
            wherefroms = [uri]

        self._write_wherefroms(file_path, wherefroms)
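
The _wherefroms() and _write_wherefroms() helpers are defined elsewhere. Since the "Where from" attribute is the com.apple.metadata:kMDItemWhereFroms extended attribute holding a binary property list of strings (as the Zowie docstring later notes), a simplified sketch of such helpers, assuming the xattr package and omitting the malformed-value handling used above, might look like this:

import plistlib
import xattr

_WHEREFROMS_ATTR = 'com.apple.metadata:kMDItemWhereFroms'

def read_wherefroms(file_path):
    # Hypothetical reader: the attribute value is a binary plist holding
    # a list of strings; return None if the attribute is absent.
    try:
        data = xattr.getxattr(file_path, _WHEREFROMS_ATTR)
    except OSError:
        return None
    return plistlib.loads(data)

def write_wherefroms(file_path, wherefroms):
    # Hypothetical writer: serialize the list back as a binary plist.
    data = plistlib.dumps(wherefroms, fmt=plistlib.FMT_BINARY)
    xattr.setxattr(file_path, _WHEREFROMS_ATTR, data)
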
Example #10
    def _do_main_work(self):
        if self.overwrite:
            warn('Overwrite mode in effect.')
        if self.dry_run:
            warn('Running in dry run mode – will not modify files.')
        inform(f'Will process {pluralized("file", self._targets, True)}' +
               f' using {pluralized("method", self.methods)}' +
               f' [cyan2]{", ".join(self.methods)}[/].')
        if len(self._targets) > 10000:
            inform(
                "(That's a huge number of files – this will take a long time.)"
            )
        elif len(self._targets) > 1000:
            inform("(That's a lot of files – this will take some time.)")

        for file in self._targets:
            (record, failure) = self._zotero.record_for_file(file)
            if failure:
                warn(failure)
                continue
            ext = filename_extension(file)
            for method in self._writers:
                if method.file_extension() and ext != method.file_extension():
                    f = antiformat(f'[steel_blue3]{file}[/]')
                    warn(
                        f"Method [cyan2]{method.name()}[/] can't be used on {f}"
                    )
                else:
                    method.write_link(file, record.link)
Example #11
    def write_link(self, file_path, uri):
        '''Writes the "uri" into the Finder comments of file "file_path".

        If there's an existing comment, read it.  If there's a Zotero select
        link as the first thing in the comment, replace that URI with this one,
        under the assumption that this was a link written by a prior run of
        this program; otherwise, prepend the Zotero select link to the finder
        comments.  In either case, write the results back.
        '''

        # file pathname string may contain '{' and '}', so guard against it.
        fp = antiformat(file_path)
        file = antiformat(f'[steel_blue3]{file_path}[/]')
        if not self.overwrite:
            if __debug__: log(f'reading Finder comments of {fp}')
            comments = _FINDER_SCRIPTS.call('get_comments', file_path)
            if comments and uri in comments:
                inform(f'Zotero link already present in Finder comments of {file}')
                return
            elif comments and 'zotero://select' in comments:
                inform(f'Replacing existing Zotero link in Finder comments of {file}')
                if __debug__: log(f'overwriting existing Zotero link with {uri}')
                comments = re.sub(r'(zotero://\S+)', uri, comments)
            elif comments:
                warn(f'Not overwriting existing Finder comments of {file}')
                return
            else:
                inform(f'Writing Zotero link into empty Finder comments of {file}')
                comments = uri
        else:
            inform(f'Overwriting Finder comments with Zotero link for {file}')
            comments = uri

        if not self.dry_run:
            if self.add_space and not comments.endswith(' '):
                if __debug__: log('adding trailing space to Finder comment')
                comments += ' '
            if __debug__: log(f'invoking AS function to clear comment on {fp}')
            _FINDER_SCRIPTS.call('clear_comments', file_path)
            if __debug__: log(f'invoking AS function to set comment on {fp}')
            _FINDER_SCRIPTS.call('set_comments', file_path, comments)
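
The _FINDER_SCRIPTS object is defined elsewhere in the module; judging from the .call('get_comments', ...) usage, it is presumably a compiled AppleScript exposing handlers for reading, clearing, and setting Finder comments. A rough sketch of such a definition, assuming the py-applescript package (applescript on PyPI), might look like the following; this is an illustration, not the project's actual script:

import applescript

# Hypothetical AppleScript handlers mirroring the calls made above.
_FINDER_SCRIPTS = applescript.AppleScript(source='''
    on get_comments(file_path)
        set the_file to POSIX file file_path as alias
        tell application "Finder" to return comment of the_file
    end get_comments

    on clear_comments(file_path)
        set the_file to POSIX file file_path as alias
        tell application "Finder" to set comment of the_file to ""
    end clear_comments

    on set_comments(file_path, new_comment)
        set the_file to POSIX file file_path as alias
        tell application "Finder" to set comment of the_file to new_comment
    end set_comments
''')
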
Example #12
    def write_link(self, file_path, uri):
        '''Write the "uri" into the Producer attribute of PDF file "file_path".'''

        fp = antiformat(file_path)
        if __debug__: log(f'reading PDF file {fp}')
        trailer = PdfReader(file_path)
        file = antiformat(f'[steel_blue3]{file_path}[/]')
        if not self.overwrite:
            producer = trailer.Info.Producer or ''
            if __debug__: log(f'found PDF Producer value {producer} on {fp}')
            if uri in producer:
                inform(
                    f'Zotero link already present in PDF "Producer" field of {file}'
                )
                return
            elif producer.startswith('zotero://select'):
                inform(
                    f'Replacing existing Zotero link in PDF "Producer" field of {file}'
                )
                producer = re.sub(r'(zotero://\S+)', uri, producer)
                trailer.Info.Producer = producer
            elif producer:
                warn(
                    f'Not overwriting existing PDF "Producer" value in {file}')
                return
            else:
                if __debug__: log(f'no prior PDF Producer field found on {fp}')
                inform(
                    f'Writing Zotero link into PDF "Producer" field of {file}')
                trailer.Info.Producer = uri
        else:
            inform(f'Overwriting PDF "Producer" field of {file}')
            trailer.Info.Producer = uri

        if not self.dry_run:
            if __debug__:
                log(f'writing PDF file with new "Producer" field: {fp}')
            PdfWriter(file_path, trailer=trailer).write()
Example #13
    def write_link(self, file_path, uri):
        '''Write the "uri" into the Subject attribute of PDF file "file_path".'''

        fp = antiformat(file_path)
        if __debug__: log(f'reading PDF file {fp}')
        trailer = PdfReader(file_path)
        file = antiformat(f'[steel_blue3]{file_path}[/]')
        if not self.overwrite:
            subject = trailer.Info.Subject or ''
            if __debug__: log(f'found PDF Subject value {subject} on {fp}')
            if uri in subject:
                inform(
                    f'Zotero link already present in PDF "Subject" field of {file}'
                )
                return
            elif subject.startswith('zotero://select'):
                inform(
                    f'Replacing existing Zotero link in PDF "Subject" field of {file}'
                )
                subject = re.sub(r'(zotero://\S+)', uri, subject)
                trailer.Info.Subject = subject
            elif subject:
                warn(f'Not overwriting existing PDF "Subject" value in {file}')
                return
            else:
                if __debug__: log(f'no prior PDF Subject field found on {fp}')
                inform(
                    f'Writing Zotero link into PDF "Subject" field of {file}')
                trailer.Info.Subject = uri
        else:
            inform(f'Overwriting PDF "Subject" field of {file}')
            trailer.Info.Subject = uri

        if not self.dry_run:
            if __debug__:
                log(f'writing PDF file with new "Subject" field: {fp}')
            PdfWriter(file_path, trailer=trailer).write()
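
The two PDF methods above (Examples #12 and #13) differ only in which Info field they modify; both rely on pdfrw's read-modify-write cycle, which in isolation boils down to a few lines (a condensed illustration of the pattern used above, not a drop-in replacement):

from pdfrw import PdfReader, PdfWriter

def set_pdf_subject(file_path, value):
    # Read the PDF trailer, change one Info field, and write the file back.
    trailer = PdfReader(file_path)
    trailer.Info.Subject = value
    PdfWriter(file_path, trailer=trailer).write()
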
Example #14
def main(api_key='A',
         no_color=False,
         after_date='D',
         file_ext='F',
         identifier='I',
         no_keyring=False,
         list=False,
         method='M',
         dry_run=False,
         overwrite=False,
         quiet=False,
         space=False,
         version=False,
         debug='OUT',
         *files):
    '''Zowie ("ZOtero link WrItEr") is a tool for Zotero users.

Zowie writes Zotero select links into the files and/or the macOS Finder
metadata attributes of files in the user's local Zotero database. This makes
it possible to jump to the Zotero bibliographic record corresponding to a
Zotero file attachment when viewing the file from outside of Zotero.

Credentials for Zotero access
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Zowie needs to know the user's personal library identifier (also known as the
"userID") and a Zotero API key. By default, it tries to get this information
from the user's keychain. If the values do not exist in the keychain from a
previous run, Zowie will ask the user, and (unless the -K option is given)
store the values in the user's keychain so that it does not have to ask again
in the future. It is also possible to supply the identifier and API key on
the command line using the -i and -a options, respectively; the given values
will then override the values stored in the keychain (unless the -K option is
also given). This is also how you can replace previously-stored values: use
-i and -a (without -K) and the new values will override the stored values.

To find out your Zotero userID and create an API key, log in to your Zotero
account at Zotero.org and visit https://www.zotero.org/settings/keys

Basic usage
~~~~~~~~~~~

Zowie can operate on a folder, or one or more individual files, or a mix of
both. Suppose your local Zotero database is located in ~/Zotero/. Perhaps
the simplest way to run Zowie is the following command:

  zowie ~/Zotero

If this is your first run of Zowie, it will ask you for your userID and API
key, then search for files recursively under ~/Zotero/storage/.  For each
file found, Zowie will contact the Zotero servers over the network and
determine the Zotero URI for the bibliographic entry containing that file.
Finally, it will use its default method of writing the Zotero select
link, which is to write it into the macOS Finder comments for the file.

If you are a user of DEVONthink, you will probably want to use the -s option
(see the explanation below in the section on special-case behavior):

  zowie -s ~/Zotero

Instead of a folder, you can invoke Zowie on one or more individual files (but
be careful to quote pathnames with spaces in them, such as in this example):

  zowie -s "~/Zotero/storage/26GS7CZL/Smith 2020 Paper.pdf"

Zowie supports multiple methods of writing the Zotero select link.  The
option -l will cause Zowie to print a list of all the methods available:

  zowie -l

(Note that some methods only work for some file types.) The default method is
to write it into Finder comments for the file. (These comments are visible in
the Finder's "Get Info" panel.) The option -m can be used to select one or
more alternative methods. Separate the names with commas without spaces. For
example, the following command will make Zowie write the Zotero link into both
the Finder comments and the "Where from" attribute:

  zowie -m findercomment,wherefrom ~/Zotero/storage

Where possible, Zowie tries to preserve the previous contents of metadata
attributes.  For example, in the case of Finder comments and "Where from", it
looks for existing Zotero links in the contents and updates those links only;
if it does not find an existing Zotero link, it prepends one instead of
replacing the value completely.  The general rule is that Zowie will try to
detect whether a Zotero select link is already present in the chosen metadata
attribute(s) and will only update the link text if a link is found;
otherwise, it will not write the Zotero select link at all unless given the
overwrite (-o) option.  The overwrite option (-o) makes Zowie replace values
completely.  Check the description of the methods for more details about what
they do by default and the impact of the -o option.

Filtering by file type
~~~~~~~~~~~~~~~~~~~~~~

By default, Zowie acts on all files it finds on the command line, except for
certain files that it always ignores: hidden files and files with extensions
.sqlite, .bak, .csl, .css, .js, .json, .pl, and a few others.  If the -m
option is used to select methods that only apply to specific file types,
Zowie will examine each file it finds in turn and only apply the methods that
match that particular file's type.

You can use the option -f to make Zowie filter the files it finds based on
file name extensions.  This is useful if you want it to concentrate only on
particular file types and ignore other files it might find while scanning
folders.  For example,

  zowie -f pdf,mp4,mov ~/Zotero

will cause it to only work on .pdf, .mp4, and .mov files.  You can provide
multiple file extensions separated by commas, without spaces and without the
leading periods.  Note that Zowie always ignores certain files, such as those
ending with .css, .js, .json, .bak, .csl, and a few others.

Filtering by date
~~~~~~~~~~~~~~~~~

If the -d option is given, the files will be filtered to use only those
whose last-modified date/time stamp is no older than the given date/time
description. Valid descriptors are those accepted by the Python dateparser
package. Make sure to enclose descriptions within single or double
quotes. Examples:

 zowie -d "2 weeks ago" ....
 zowie -d "2014-08-29" ....
 zowie -d "12 Dec 2014" ....
 zowie -d "July 4, 2013" ....

Special-case behavior
~~~~~~~~~~~~~~~~~~~~~

Although Zowie is not aimed solely at DEVONthink users, its development was
motivated by the author's desire to use Zotero with that software. A
complication arose due to an undocumented feature in DEVONthink: it ignores a
Finder comment if it is identical to the value of the "URL" attribute (which
is the name it gives to the "com.apple.metadata:kMDItemWhereFroms" extended
attribute on a file). In practical terms, if you do something like write the
Zotero select link into the Finder comment of a file and then have a
DEVONthink smart rule copy the value to the URL field, the Finder comment
will appear blank in DEVONthink (even though it exists on the actual
file). This can be unexpected and confusing, and has caught people (including
the author of Zowie) unaware. To compensate, Zowie 1.2 introduced a new
option: it can add a trailing space character to the end of the value it
writes into the Finder comment when using the "findercomment" method. Since
approaches to copy the Zotero link from the Finder comment to the URL field
in DEVONthink will typically strip whitespace around the URL value, the net
effect is to make the value in the Finder comment just different enough from
the URL field value to prevent DEVONthink from ignoring the Finder
comment. Use option -s to make Zowie add the trailing space character.

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the -n ("dry run") option, Zowie will only print what it would do
without actually doing it.

If given the -q option, Zowie will not print its usual informational messages
while it is working. It will only print messages for warnings or errors.
By default, messages printed by Zowie are also color-coded. If given the
option -C, Zowie will not color the text of messages it prints. (This latter
option is useful when running Zowie within subshells inside other environments
such as Emacs.)

If given the -V option, this program will print the version and other
information, and exit without doing anything else.

If given the -@ argument, this program will output a detailed trace of what it
is doing. The debug trace will be sent to the given destination, which can
be '-' to indicate console output, or a file path to send the output to a file.

When -@ has been given, Zowie also installs a signal handler on signal SIGUSR1
that will drop Zowie into the pdb debugger if the signal is sent to the
running process.

Return values
~~~~~~~~~~~~~

This program exits with a return code of 0 if no problems are encountered.
It returns a nonzero value otherwise. The following table lists the possible
return values:

  0 = success -- program completed normally
  1 = the user interrupted the program's execution
  2 = encountered a bad or missing value for an option
  3 = no network detected -- cannot proceed
  4 = file error -- encountered a problem with a file
  5 = server error -- encountered a problem with a server
  6 = an exception or fatal error occurred

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # Set up debug logging as soon as possible, if requested ------------------

    if debug != 'OUT':
        if __debug__: set_debug(True, debug)
        import faulthandler
        faulthandler.enable()
        if not sys.platform.startswith('win'):
            # Even with a different signal, I can't get this to work on Win.
            import signal
            from boltons.debugutils import pdb_on_signal
            pdb_on_signal(signal.SIGUSR1)

    # Preprocess arguments and handle early exits -----------------------------

    if version:
        from zowie import print_version
        print_version()
        exit(int(ExitCode.success))

    from bun import UI, inform, warn, alert, alert_fatal

    ui = UI('Zowie',
            'ZOtero link WrItEr',
            use_color=not no_color,
            be_quiet=quiet)
    ui.start()

    if list:
        import shutil
        from textwrap import wrap
        from zowie.methods import method_object
        inform('Known methods:\n')
        width = (shutil.get_terminal_size().columns - 2) or 78
        for name in method_names():
            text = f'[cyan2]{name}[/]: {method_object(name).description()}'
            inform('\n'.join(wrap(text, width=width, subsequent_indent='  ')))
            inform('')
        exit(int(ExitCode.success))

    methods_list = ['findercomment'] if method == 'M' else method.lower().split(',')
    bad_name = next((n for n in methods_list if n not in method_names()), None)
    if bad_name:
        alert(f'Unrecognized method name "{bad_name}".')
        alert('The available methods are: ' + ', '.join(method_names()) + '.')
        exit(int(ExitCode.bad_arg))

    # Do the real work --------------------------------------------------------

    from commonpy.data_utils import timestamp
    from commonpy.interrupt import config_interrupt
    from zowie.exceptions import UserCancelled, FileError, CannotProceed
    from zowie.main_body import MainBody

    if __debug__: log('=' * 8 + f' started {timestamp()} ' + '=' * 8)
    body = exception = None
    try:
        body = MainBody(files=files,
                        file_ext=None if file_ext == 'F' else file_ext,
                        api_key=None if api_key == 'A' else api_key,
                        user_id=None if identifier == 'I' else identifier,
                        use_keyring=not no_keyring,
                        after_date=None if after_date == 'D' else after_date,
                        methods=methods_list,
                        dry_run=dry_run,
                        overwrite=overwrite,
                        add_space=space)
        config_interrupt(body.stop, UserCancelled(ExitCode.user_interrupt))
        body.run()
        exception = body.exception
    except Exception as ex:
        exception = sys.exc_info()

    # Try to deal with exceptions gracefully ----------------------------------

    exit_code = ExitCode.success
    if exception:
        from commonpy.string_utils import antiformat
        if __debug__:
            log(f'main body raised exception: {antiformat(exception)}')
        if exception[0] == CannotProceed:
            exit_code = exception[1].args[0]
        elif exception[0] == FileError:
            alert_fatal(antiformat(exception[1]))
            exit_code = ExitCode.file_error
        elif exception[0] in [KeyboardInterrupt, UserCancelled]:
            warn('Interrupted.')
            exit_code = ExitCode.user_interrupt
        else:
            msg = antiformat(exception[1])
            alert_fatal(f'Encountered error {exception[0].__name__}: {msg}')
            exit_code = ExitCode.exception
            if __debug__:
                from traceback import format_exception
                details = ''.join(format_exception(*exception))
                log(f'Exception: {msg}\n{details}')
    else:
        inform('Done.')

    # And exit ----------------------------------------------------------------

    if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
    if __debug__: log(f'exiting with exit code {exit_code}')
    exit(int(exit_code))
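
The ExitCode enumeration used throughout main() is imported elsewhere and is not part of this excerpt. Based on the return-value table in the docstring and the members referenced above, it presumably resembles the following sketch (the names no_network and server_error are guesses, since those members are never referenced here):

from enum import IntEnum

class ExitCode(IntEnum):
    # Sketch reconstructed from the docstring's return-value table; the
    # names no_network and server_error are assumptions.
    success        = 0   # program completed normally
    user_interrupt = 1   # the user interrupted the program's execution
    bad_arg        = 2   # bad or missing value for an option
    no_network     = 3   # no network detected -- cannot proceed
    file_error     = 4   # encountered a problem with a file
    server_error   = 5   # encountered a problem with a server
    exception      = 6   # an exception or fatal error occurred
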
Example #15
    def _do_main_work(self):
        # Gather up some things and get prepared.
        targets = self.targets_from_arguments()
        if not targets:
            alert_fatal('No images to process; quitting.')
            raise CannotProceed(ExitCode.bad_arg)
        num_targets = len(targets)

        inform(f'Given {pluralized("image", num_targets, True)} to work on.')
        inform('Will apply results of {}: {}'.format(
            pluralized('service', len(self.services), True),
            ', '.join(self.services)))
        inform(
            f'Will use credentials stored in {Credentials.credentials_dir()}/.'
        )
        if self.extended:
            inform('Will save extended results.')
        num_threads = min(self.threads, len(self.services))
        inform(f'Will use up to {num_threads} process threads.')

        # Get to work.
        if __debug__: log('initializing manager and starting processes')
        import shutil
        print_separators = num_targets > 1
        rule = '─' * (shutil.get_terminal_size().columns or 80)
        for index, item in enumerate(targets, start=1):
            # Check whether we've been interrupted before doing another item.
            raise_for_interrupts()
            # Process next item.
            if print_separators:
                inform(rule)
            self._manager.run_services(item, index, self.base_name)
        if print_separators:
            inform(rule)
Example #16
def main(after_date='A',
         no_color=False,
         dest='D',
         doi_file='F',
         journal='J',
         list_dois=False,
         output_dir='O',
         preview=False,
         quiet=False,
         rep_file='R',
         rep_fmt='S',
         rep_title='T',
         version=False,
         no_check=False,
         no_zip=False,
         debug='OUT'):
    '''Create archives of journals suitable for sending to Portico or PMC.

The journal whose articles are to be archived must be indicated using the
required option -j (or /j on Windows).  To list the currently-supported
journals, you can use a value of "list" to the -j option:

  pubarchiver -j list

Without any additional options, PubArchiver will contact the journal website
and either DataCite or Crossref, and create an archive containing articles and
their metadata for all articles published to date by the journal.  The options
below can be used to select articles and influence other PubArchiver behaviors.

Selecting a subset of articles
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If the option -a (or /a on Windows) is given, PubArchiver will download only
articles whose publication dates are AFTER the given date.  Valid date
descriptors are those accepted by the Python dateparser library.  Make sure to
enclose descriptions within single or double quotes.  Examples:

  pubarchiver -a "2014-08-29"   ....
  pubarchiver -a "12 Dec 2014"  ....
  pubarchiver -a "July 4, 2013"  ....
  pubarchiver -a "2 weeks ago"  ....

The option -f (or /f on Windows) can be used with a value of a file path to
limit archiving to only the DOIs listed in the given file.  The file format
must be a simple list of one DOI per line.

The date selection performed by the -a option is applied after the list of
articles is read via the -f option (if present), so it can be used to filter
the listed DOIs by publication date.

Controlling the output
~~~~~~~~~~~~~~~~~~~~~~

The value supplied after the option -d (or /d on Windows) can be used to
select the destination where the publication archive is intended to be sent
after PubArchiver has done its work.  The possible alternatives are "portico"
and "pmc"; Portico is assumed to be the default destination.  This option
changes the structure and content of the archive created by PubArchiver.

PubArchiver will write its output to the directory indicated by the value of
the option -o (or /o on Windows). If no -o is given, the output will be written
to the current directory from which PubArchiver is being run.  Each article will
be written to a subdirectory named after the DOI of the article.  The output for
each article will consist of an XML metadata file describing the article, the
article itself in PDF format, and a subdirectory named "jats" containing the
article in JATS XML format along with any image that may appear in the article.
The image is always converted to uncompressed TIFF format (because it is
considered a good preservation format).

Unless the option -Z (or /Z on Windows) is given, the output will be archived
in ZIP format.  If the output structure (as determined by the -d option) is
being generated for PMC, each article will be put into its own individual
ZIP archive; otherwise, the default action is to put the collected output of
all articles into a single ZIP archive file.

Writing a report
~~~~~~~~~~~~~~~~

As it works, PubArchiver writes information to the terminal about the articles
it puts into the archive, including whether any problems are encountered.  To
save this information to a file, use the option -r (or /r on Windows), which
will make PubArchiver write a report file.  By default, the format of the
report file is CSV; the option -s (/s on Windows) can be used to select "csv"
or "html" (or both) as the format.  The title of the report will be based on
the current date, unless the option -t (or /t on Windows) is used to supply a
different title.

Previewing the list of articles
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the option -p (or /p on Windows), pubarchiver will ONLY display a
list of articles it will archive and stop short of creating the archive.  This
is useful to see what would be produced without actually doing it.  However,
note that because it does not attempt to download the articles and associated
files, it will not be able to report on errors that might occur when not
operating in preview mode.  Consequently, do not use the output of -p as a
prediction of eventual success or failure.

Return values
~~~~~~~~~~~~~

This program will exit with a return code of 0 if no problems are encountered
during execution.  If a problem is encountered, it will return a nonzero value.
If no network is detected, it returns a value of 1; if the program is
interrupted (e.g., using control-c) it returns a value of 2; if it encounters
a fatal error, it returns a value of 3. If it encounters any non-fatal
problems (such as a missing PDF file or JATS validation error), it returns a
nonzero value equal to 100 + the number of articles that had failures.
Summarizing the possible return codes:

        0 = no errors were encountered -- success
        1 = no network detected -- cannot proceed
        2 = the user interrupted program execution
        3 = an exception or fatal error occurred
  100 + n = encountered non-fatal problems on a total of n articles

Additional command-line options
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The option -l (or /l on Windows) can be used to obtain a list of all DOIs
for all articles published by the selected journal.

Pubarchiver will print general informational messages as it works. To
reduce messages to only warnings and errors, use the option -q (or /q on
Windows).  Output is color-coded by default unless the -C option (or /C on
Windows) is given; this option can be helpful if the color control signals
create problems for your terminal emulator.

If given the -@ option (/@ on Windows), this program will print a detailed
real-time log of what it is doing.  The output will be sent to the given
destination, which can be '-' to indicate console output, or a file path to
send the output to a file.  The output is mainly intended for debugging.

Pubarchiver always downloads the JATS XML version of articles from
micropublication.org (in addition to downloading the PDF version), and by
default, pubarchiver validates the XML content against the JATS DTD.  To
skip the XML validation step, use the option -X (/X on Windows).

If given the -V option (/V on Windows), this program will print version
information and exit without doing anything else.

Command-line options summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''
    if debug != 'OUT':
        if __debug__: set_debug(True, debug)
        import faulthandler
        faulthandler.enable()

    if version:
        print_version()
        exit(0)

    try:
        if __debug__: log('=' * 8 + f' started {timestamp()} ' + '=' * 8)
        ui = UI('PubArchiver', use_color=not no_color, be_quiet=quiet)
        ui.start()

        if journal == 'J':
            alert('Must specify a journal using the -j option.')
            print_supported_journals()
            exit(1)
        elif journal in ['list', 'help']:
            print_supported_journals()
            exit(0)
        elif journal not in journal_list():
            alert(f'Unrecognized journal "{journal}".')
            print_supported_journals()
            exit(1)

        if not network_available():
            alert('No network.')
            exit(1)

        handler = journal_handler(journal)
        if list_dois:
            inform(f'Asking {handler.name} server for a list of all DOIs ...')
            articles = handler.all_articles()
            if articles:
                inform(f'Got {len(articles)} DOIs from {handler.name}:')
                print('\n'.join(article.doi for article in articles))
            else:
                warn(f'Failed to get list of articles from {handler.name}')
            exit(0)

        body = MainBody(journal=handler,
                        dest='pmc' if dest.lower() == 'pmc' else 'portico',
                        doi_file=doi_file if doi_file != 'F' else None,
                        output_dir='.' if output_dir == 'O' else output_dir,
                        after=None if after_date == 'A' else after_date,
                        report_file=None if rep_file == 'R' else rep_file,
                        report_format='csv' if rep_fmt == 'S' else rep_fmt,
                        report_title=None if rep_title == 'T' else rep_title,
                        do_validate=handler.uses_jats and not no_check,
                        do_zip=not no_zip,
                        preview=preview)
        body.run()
        if __debug__: log(f'finished with {body.failures} failures')
        if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
        exit(100 + body.failures if body.failures > 0 else 0)
    except KeyboardInterrupt as ex:
        warn('Quitting')
        if __debug__: log(f'returning with exit code 2')
        exit(2)
    except Exception as ex:
        import traceback
        if __debug__: log(f'{str(ex)}\n{traceback.format_exc()}')
        alert_fatal(f'{str(ex)}')
        if __debug__: log(f'returning with exit code 3')
        exit(3)
Example #17
    def run(self):
        '''Execute the control logic.'''

        # Check and process argument values & fail early if there's a problem.
        self._process_arguments()

        # Read the article list from a file or the server.
        inform(f'Reading article list from {self.doi_file or "server"} ...')
        if self.doi_file:
            articles = self.journal.articles_from(self.doi_file)
        else:
            articles = self.journal.all_articles()

        # Do optional filtering based on the date.
        if self.after:
            date_str = self.after.strftime(_DATE_FORMAT)
            inform(f'Will only keep articles published after {date_str}.')
            articles = [
                x for x in articles if parsed_datetime(x.date) > self.after
            ]

        inform(
            f'Total articles left after filtering: {humanize.intcomma(len(articles))}.'
        )
        inform(
            f'Destination format: {"PMC" if self.dest == "pmc" else "Portico"}'
        )
        if self.preview:
            self._print_articles(articles)
            return
        elif len(articles) > 0:
            inform(f'Output will be written to directory "{self.output_dir}"')
            os.makedirs(self.output_dir, exist_ok=True)
            self._save_articles(self.output_dir, articles, self.dest,
                                self.do_zip)

        if self.report_file:
            inform(f'Writing report to {self.report_file}')
            self._write_report(self.report_file, self.report_format,
                               self.report_title, articles)

        # Count any failures by looking at the article statuses.
        inform('Done.')
        self.failures = sum(
            article.status.startswith('fail') for article in articles)
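
The parsed_datetime() helper is not shown in this excerpt. Since the command-line documentation says that date descriptors are those accepted by the dateparser package, a functionally similar standalone filter can be written directly on top of dateparser (a sketch under that assumption; the function name is hypothetical and the real helper may do more):

import dateparser

def articles_after(articles, cutoff_text):
    # Hypothetical standalone filter: keep articles whose publication date
    # is later than the cutoff description (e.g. "2 weeks ago", "2014-08-29").
    cutoff = dateparser.parse(cutoff_text)
    kept = []
    for article in articles:
        published = dateparser.parse(article.date) if article.date else None
        if published and published > cutoff:
            kept.append(article)
    return kept
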
Example #18
    def _save_article_portico(self, dest_dir, article, xmldict):
        article_dir = path.join(dest_dir, article.basename)
        jats_dir = path.join(article_dir, 'jats')
        try:
            os.makedirs(article_dir)
            if self.journal.uses_jats:
                os.makedirs(jats_dir)
        except FileExistsError:
            pass
        inform('Writing ' + article.doi)
        xml_file = xml_filename(article, article_dir)
        with open(xml_file, 'w', encoding='utf8') as f:
            if __debug__: log(f'writing XML to {xml_file}')
            f.write(xmltodict.unparse(xmldict, pretty=True))

        pdf_file = pdf_filename(article, article_dir)
        if __debug__: log(f'downloading PDF to {pdf_file}')
        if not download_file(article.pdf, pdf_file):
            warn(f'Could not download PDF file for {article.doi}')
            article.status = 'failed-pdf-download'

        if not self.journal.uses_jats:
            # Nothing more to do.
            return

        jats_file = jats_filename(article, jats_dir)
        if __debug__: log(f'downloading JATS XML to {jats_file}')
        if not download_file(article.jats, jats_file):
            warn(f'Could not download JATS file for {article.doi}')
            article.status = 'failed-jats-download'
        if self.do_validate:
            if not valid_xml(jats_file, self._dtd):
                warn(f'Failed to validate JATS for article {article.doi}')
                article.status = 'failed-jats-validation'
        else:
            if __debug__: log(f'skipping DTD validation of {jats_file}')

        # We need to store the image with the name that appears in the
        # JATS file. That requires a little extra work to extract.
        image_extension = filename_extension(article.image)
        image_file = image_filename(article, jats_dir, ext=image_extension)
        if article.image:
            if __debug__: log(f'downloading image file to {image_file}')
            if download_file(article.image, image_file):
                with Image.open(image_file) as img:
                    converted = image_without_alpha(img)
                    converted = converted.convert('RGB')
                    if __debug__: log(f'converting image to TIFF format')
                    tiff_name = filename_basename(image_file) + '.tif'
                    comments = tiff_comments(article, self.journal.name)
                    # Using save() means only the 1st frame of a multiframe
                    # image will be saved.
                    converted.save(tiff_name,
                                   compression=None,
                                   dpi=_TIFF_DPI,
                                   description=comments)
                # We keep only the uncompressed TIFF version.
                if __debug__: log(f'deleting original image file {image_file}')
                delete_existing(image_file)
            else:
                warn(f'Failed to download image for {article.doi}')
                article.status = 'failed-image-download'
        else:
            if __debug__: log(f'skipping empty image URL for {article.doi}')
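
The image_without_alpha() helper is another function not included in the excerpt. Because the result is subsequently converted to RGB and saved as uncompressed TIFF, the helper presumably just removes transparency; one common way to do that with Pillow is to composite the image onto a white background (a sketch, not necessarily the project's actual implementation):

from PIL import Image

def image_without_alpha(img):
    # Hypothetical stand-in: flatten any alpha channel onto a white
    # background so the image can be converted to RGB and saved as TIFF.
    if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
        rgba = img.convert('RGBA')
        background = Image.new('RGBA', rgba.size, (255, 255, 255, 255))
        return Image.alpha_composite(background, rgba)
    return img
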
Example #19
    def _save_article_pmc(self, dest_dir, article, xml, zip_articles):
        inform('Writing ' + article.doi)
        to_archive = []

        pdf_file = pmc_pdf_filename(article, dest_dir)
        if __debug__: log(f'downloading PDF to {pdf_file}')
        if not download_file(article.pdf, pdf_file):
            warn(f'Could not download PDF file for {article.doi}')
            article.status = 'failed-pdf-download'
        to_archive.append(pdf_file)

        jats_file = jats_filename(article, dest_dir)
        if __debug__: log(f'downloading JATS XML to {jats_file}')
        if not download_file(article.jats, jats_file):
            warn(f'Could not download JATS file for {article.doi}')
            article.status = 'failed-jats-download'
        if self.do_validate:
            if not valid_xml(jats_file, self._dtd):
                warn(f'Failed to validate JATS for article {article.doi}')
                article.status = 'failed-jats-validation'
        else:
            if __debug__: log(f'skipping DTD validation of {jats_file}')
        to_archive.append(jats_file)

        # We need to store the image with the name that appears in the
        # JATS file. That requires a little extra work to extract.
        image_extension = filename_extension(article.image)
        image_file = image_filename(article, dest_dir, ext=image_extension)
        if article.image:
            if __debug__: log(f'downloading image file to {image_file}')
            if download_file(article.image, image_file):
                with Image.open(image_file) as img:
                    converted_img = image_without_alpha(img)
                    converted_img = converted_img.convert('RGB')
                    if __debug__: log(f'converting image to TIFF format')
                    tiff_file = filename_basename(image_file) + '.tif'
                    # Using save() means that only the 1st frame of a
                    # multiframe image will be saved.
                    converted_img.save(tiff_file,
                                       dpi=_TIFF_DPI,
                                       compression=None,
                                       description=tiff_comments(article))
                    to_archive.append(tiff_file)
                # We keep only the uncompressed TIFF version.
                if __debug__: log(f'deleting original image file {image_file}')
                delete_existing(image_file)
            else:
                warn(f'Failed to download image for {article.doi}')
                article.status = 'failed-image-download'
        else:
            if __debug__:
                log(f'skipping empty image file URL for {article.doi}')

        # Finally, put the files into their own zip archive.
        if zip_articles:
            if not article.status.startswith('failed'):
                zip_file = pmc_zip_filename(article, dest_dir)
                inform(f'Creating ZIP archive file "{zip_file}"')
                archive_files(zip_file, to_archive)
                if __debug__: log(f'verifying ZIP file {zip_file}')
                verify_archive(zip_file, 'zip')
                for file in to_archive:
                    if __debug__: log(f'deleting file {file}')
                    delete_existing(file)
            else:
                warn(
                    f'ZIP archive for {article.doi} not created due to errors')
Example #20
    def _send(self, image, service):
        '''Get results from service named "service" for the "image".'''

        service_name = f'[{service.name_color()}]{service.name()}[/]'
        base_path = path.join(image.dest_dir, path.basename(image.file))
        json_file = self._renamed(base_path, str(service), 'json')

        saved_results = None
        if self._reuse_json and readable(json_file):
            inform(
                f'Reading saved results for {service_name} from {relative(json_file)}'
            )
            with open(json_file, 'r') as f:
                saved_results = json.load(f)
            output = service.result(image.file, saved_results)
        else:
            inform(f'Sending to {service_name} and waiting for response ...')
            last_time = timer()
            try:
                output = service.result(image.file, None)
            except AuthFailure as ex:
                raise AuthFailure(f'Service {service}: {str(ex)}')
            except RateLimitExceeded as ex:
                # Wait out the rate limit if necessary, then retry the request.
                time_passed = timer() - last_time
                if time_passed < 1 / service.max_rate():
                    warn(f'Pausing {service_name} due to rate limits')
                    wait(1 / service.max_rate() - time_passed)
                    warn(f'Continuing {service_name}')
                return self._send(image, service)
            if output.error:
                # Sanitize the error string in case it contains '{' characters.
                msg = output.error.replace('{', '{{{{').replace('}', '}}}}')
                alert(f'{service_name} failed: {msg}')
                warn(
                    f'No result from {service_name} for {relative(image.file)}'
                )
                return None
            inform(f'Got result from {service_name}.')

        raise_for_interrupts()
        inform(f'Creating annotated image for {service_name}.')
        annot_path = self._renamed(base_path, str(service), 'png')
        report_path = None
        from handprint.images import annotated_image
        with self._lock:
            img = annotated_image(image.file, output.boxes, service,
                                  self._text_size, self._text_color,
                                  self._text_shift, self._display,
                                  self._confidence)
            self._save(img, annot_path)

        if self._extended_results and (saved_results is None):
            inform(f'Saving all data for {service_name}.')
            raw_json = json.dumps(output.data, sort_keys=True, indent=2)
            self._save(raw_json, json_file)
            inform(f'Saving extracted text for {service_name}.')
            txt_file = self._renamed(base_path, str(service), 'txt')
            self._save(output.text, txt_file)
        if self._compare:
            gt_file = alt_extension(image.item_file, 'gt.txt')
            gt_path = relative(gt_file)
            report_path = self._renamed(image.item_file, str(service), 'tsv')
            relaxed = (self._compare == 'relaxed')
            if readable(gt_file) and nonempty(gt_file):
                if __debug__: log(f'reading ground truth from {gt_file}')
                gt_text = open(gt_file, 'r').read()
                inform(f'Saving {service_name} comparison to ground truth')
                from handprint.comparison import text_comparison
                self._save(text_comparison(output.text, gt_text, relaxed),
                           report_path)
            elif not nonempty(gt_file):
                warn(
                    f'Skipping {service_name} comparison because {gt_path} is empty'
                )
            else:
                warn(
                    f'Skipping {service_name} comparison because {gt_path} not available'
                )
        return Result(service, image, annot_path, report_path)
Example #21
def print_supported_journals():
    inform('Recognized journals: [white]' + ', '.join(journal_list()) + '[/]')
Example #22
def main(add_creds='A',
         base_name='B',
         no_color=False,
         compare=False,
         display='D',
         extended=False,
         from_file='F',
         no_grid=False,
         list=False,
         reuse_json=False,
         text_move='M',
         confidence='N',
         output_dir='O',
         quiet=False,
         relaxed=False,
         services='S',
         threads='T',
         version=False,
         text_color='X',
         text_size='Z',
         debug='OUT',
         *files):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") runs
alternative text recognition services on images of handwritten document pages.

Installing credentials for cloud-based services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known services, and exit.

Before a given service can be used, if it is a cloud-based commercial OCR/HTR
service, Handprint needs to be supplied with user credentials for accessing
that service.  The credentials must be stored in a JSON file with a certain
format; see the Handprint user documentation for details about the formats
for each service.  To add a new credentials file, use the -a option (/a on
Windows) in combination with the name of a service and a single file path on
the command line.  The name supplied right after the -a option must be the
name of a recognized service (such as "google", "amazon", "microsoft"), and
the file argument must be a JSON file containing the credentials data in the
required format for that service.  Here is an example of adding credentials
for Google (assuming you created the JSON file as described in the docs):

  handprint -a google mygooglecreds.json

Run Handprint with the -a option multiple times to install credentials for
each different service.  Handprint will copy the credential files to its own
configuration directory and exit without doing anything else.  The directory
is different on different operating systems; for example, on macOS it
is ~/Library/Application Support/Handprint/.

Basic usage
~~~~~~~~~~~

After credentials are installed, running Handprint without the -a option will
invoke one or more OCR/HTR services on files, directories of files, or URLs.
Here is an example of running Handprint on a directory containing images:

  handprint tests/data/caltech-archives/glaser/

Image paths or URLs can be supplied to Handprint in any of the following ways:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) one or more URLs, which will be interpreted as network locations of image
    files to be processed; or

 c) if given the -f option (/f on Windows), a file containing either image
    paths or image URLs.

Note that providing URLs on the command line can be problematic due to how
terminal shells interpret certain characters, and so when supplying URLs,
it's usually better to store the URLs in a file and use the -f option.
Regardless, when given URLs, Handprint will first download the images to a
local directory indicated by the option -o (/o on Windows), or the current
directory if option -o is not used.

Whether given as files or URLs, each input should be a single image of a
document page in which text should be recognized.  Handprint can accept input
images in JP2, JPEG, PDF, PNG, GIF, BMP, and TIFF formats. To make the
results from different services more easily comparable, Handprint will always
convert all input images to the same format (PNG), even if some services
accept other formats; it will also downsize input images to the smallest
size accepted by any of the services invoked if an image exceeds that size.
(For example, if service A accepts files up to 10 MB in size and service B
accepts files up to 4 MB, all input images will be resized to 4 MB before
sending them to both A and B, even if A could accept a higher-resolution
image.)  Finally, if the input contains more than one page (e.g., in a PDF
file), Handprint will only use the first page and ignore the rest.
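
To make the "smallest size wins" rule concrete, the selection can be pictured
as in the following sketch, which uses made-up per-service limits (in bytes)
and is not Handprint's actual code:

  SERVICE_LIMITS = {'service_a': 10_000_000, 'service_b': 4_000_000}

  def size_limit(selected_services):
      # Use the smallest maximum size accepted by any selected service.
      return min(SERVICE_LIMITS[name] for name in selected_services)

  size_limit(['service_a', 'service_b'])   # -> 4000000, i.e., 4 MB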

Be aware that resizing images to the lowest common size means that the text
recognition results returned by some services may be different than if the
original full-size input image had been sent to that service.  If your images
are larger (when converted to PNG) than the size threshold for some services
(which is currently 4 MB when Microsoft is one of the destinations), then you
may wish to compare the results of using multiple services at once versus
using the services one at a time.

Selecting destination services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The default action is to run all known services.  The option -s (/s on
Windows) can be used to select only one service or a list of services
instead.  Lists of services should be separated by commas; e.g.,
"google,microsoft".  To find out which services are supported by Handprint, run
it with the command-line flag -l (or /l on Windows), which will make Handprint
print a list of the known services and exit immediately.

Visual display of recognition results
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

After gathering the results of each service for a given input, Handprint will
create a single compound image consisting of the results for each service
arranged in a grid.  This is intended to make it easier to compare the results
of multiple services against each other.  To skip the creation of the results
grid, use the -G option (/G on Windows).  The grid image will have a name with
the following pattern:

  somefile.handprint-all.png

If given the -e option (/e on Windows), Handprint will produce extended
output that includes the complete response from the service (converted to a
JSON file by Handprint) and the text extracted (stored as a .txt file).  The
output of -e will be multiple files like this:

  somefile.handprint-amazon-rekognition.json
  somefile.handprint-amazon-rekognition.png
  somefile.handprint-amazon-rekognition.txt
  somefile.handprint-amazon-textract.json
  somefile.handprint-amazon-textract.png
  somefile.handprint-amazon-textract.txt
  somefile.handprint-google.json
  somefile.handprint-google.png
  somefile.handprint-google.txt
  somefile.handprint-microsoft.json
  somefile.handprint-microsoft.png
  somefile.handprint-microsoft.txt
  ...

The files will be written to the directory indicated by -o, or (if -o is not
used) the directory where "somefile" is located.  When -o is not used and
the input images are given as URLs, then the files are written to the current
working directory instead.
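
As an illustration of the naming scheme, the per-service output names follow
the pattern base name + ".handprint-" + service + extension.  The helper below
is a hypothetical sketch, not part of Handprint's API:

  from os import path

  def output_name(image_path, service, ext):
      # E.g., ('scan.jpg', 'google', 'txt') -> 'scan.handprint-google.txt'.
      root, _ = path.splitext(image_path)
      return f'{root}.handprint-{service}.{ext}'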

When the inputs are URLs, Handprint must download a copy of the image located
at the network address (because it is not possible to write the results to
the network locations represented by the URLs).  The images and other
results will be stored in files whose root names have the form "document-N",
where "N" is an integer.  The root name can be changed using the -b option
(/b on Windows).  Each downloaded image will be converted to
ordinary PNG format for maximum compatibility with the different OCR
services and written to "document-N.png", and the URL corresponding to each
document will be written in a file named "document-N.url" so that it is
possible to connect each "document-N.png" to the URL it came from.

Finally, note that the use of the -G option (/G on Windows) WITHOUT either
the -e or -c option is an error because it means no output would be produced.

Type of annotations
~~~~~~~~~~~~~~~~~~~

Handprint produces copies of the input images overlaid with the recognition
results received from the different services.  By default, it shows only the
recognized text.  The option -d (/d on Windows) can be used to tell Handprint
to display other results.  The recognized values are as follows:

  text    -- display the text recognized in the image (default)
  bb      -- display all bounding boxes returned by the service
  bb-word -- display only the bounding boxes for words (in red)
  bb-line -- display only the bounding boxes for lines (in blue)
  bb-para -- display only the bounding boxes for paragraphs (in green)

Separate multiple values with a comma.  The option "bb" is a shorthand for the
value "bb-word,bb-line,bb-para".  As an example, the following command will
show both the recognized text and the bounding boxes around words:

  handprint -d text,bb-word  somefile.png

Note that as of June 2021, the main services (Amazon, Google, Microsoft) do not
all provide the same bounding box information in their results.  The following
table summarizes what is available:

               Bounding boxes available
  Service      Word    Line   Paragraph
  ---------    ----    ----   ---------
  Amazon         Y       Y        -
  Google         Y       -        Y
  Microsoft      Y       Y        -

If a service does not provide a particular kind of bounding box, Handprint will
not display that kind of bounding box in the annotated output for that service.

Thresholding by confidence
~~~~~~~~~~~~~~~~~~~~~~~~~~

All of the services return confidence scores for items recognized in the input.
By default, Handprint will show all results in the annotated image, no matter
how low the score.  The option -n (/n on Windows) can be used to threshold the
results based on the confidence value for each item (text or bounding boxes).
The value provided as the argument to -n must be a floating point number
between 0 and 1.0.  For example, the following command will make Handprint only
show text that is rated with at least 99.5% confidence:

  handprint -n 0.995  somefile.png

Note that the confidence values returned by the different services are not
normalized against each other.  What one service considers to be 80% confidence
may not be what another service considers 80% confidence.  Handprint performs
the thresholding against the raw scores returned by each service individually.
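
The thresholding itself amounts to a simple filter.  The sketch below assumes
each recognized item is a (text, score) pair with a score between 0 and 1;
Handprint's internal data structures differ:

  def above_threshold(items, confidence):
      # Keep only items whose raw confidence score meets the threshold.
      return [(text, score) for (text, score) in items if score >= confidence]

  above_threshold([('Dear', 0.98), ('Sir', 0.72)], 0.95)   # -> [('Dear', 0.98)]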

Comparing results to expected output
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Handprint supports comparing the output of HTR services to expected output
(i.e., ground truth) using the option -c (or /c on Windows).  This facility
requires that the user provide text files that contain the expected text
for each input image.  The ground-truth text files must have the following
characteristics:

 a) The file containing the expected results should have the extension
    ".gt.txt" and a base name identical to that of the image file.  For
    example, an image file named "somefile.jpg" should have a corresponding
    text file "somefile.gt.txt".  (A sketch of this naming rule follows the
    list below.)

 b) The ground-truth text file should be located in the same directory as the
    input image file.

 c) The text should be line oriented, with each line representing a line of
    text in the image.

 d) The text should be plain text only.  No Unicode or binary encodings.
    (This limitation comes from the HTR services, which -- as of this
    writing -- return results in plain text format.)
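
The naming rule in (a) can be sketched with the standard library as follows;
the helper name is hypothetical and not part of Handprint:

  from os import path

  def ground_truth_file(image_path):
      # 'somefile.jpg' -> 'somefile.gt.txt' in the same directory.
      return path.splitext(image_path)[0] + '.gt.txt'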

Handprint will write the comparison results to a tab-delimited file named
after the input image and service but with the extension ".tsv".  For
example, for an input image "somefile.jpg" and results received from Google,
the comparison results will be written to "somefile.handprint-google.tsv".
(The use of a tab-delimited format rather than comma-delimited format avoids
the need to quote commas and other characters in the text.)

Handprint reports, for each text line, the number of errors (the Levenshtein
edit distance) and the character error rate (CER), and at the end it also
reports a sum total of errors.  The CER is computed as the Levenshtein edit
distance of each line divided by the number of characters in the expected
line text, multiplied by 100; this approach to normalizing the CER value is
conventional but note that it can lead to values greater than 100%.
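
The per-line computation can be sketched as follows; this is a generic
edit-distance implementation given for illustration, not Handprint's own code:

  def levenshtein(a, b):
      # Classic dynamic-programming edit distance between two strings.
      prev = list(range(len(b) + 1))
      for i, ca in enumerate(a, start=1):
          cur = [i]
          for j, cb in enumerate(b, start=1):
              cur.append(min(prev[j] + 1,                  # deletion
                             cur[j - 1] + 1,               # insertion
                             prev[j - 1] + (ca != cb)))    # substitution
          prev = cur
      return prev[-1]

  def cer(expected, received):
      # Edit distance divided by the length of the expected line, as a percent.
      return 100 * levenshtein(expected, received) / len(expected)

  cer('handwritten', 'handwriten')   # -> 9.09..., i.e., 1 error over 11 characters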

By default, comparisons are done on an exact basis; character case is not
changed, punctuation is not removed, and stop words are not removed.
However, multiple contiguous spaces are converted to one space, and leading
spaces are removed from text lines.  If given the option -r (/r on Windows),
Handprint will relax the comparison algorithm as follows:

 i) convert all text to lower case
 ii) ignore certain sentence punctuation characters, namely , . : ;
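
A rough sketch of that relaxation, combined with the default whitespace
handling described above (Handprint's own implementation may differ in
details):

  def relaxed(line):
      line = ' '.join(line.split())     # collapse and trim runs of whitespace
      line = line.lower()               # (i) convert to lower case
      return line.translate(str.maketrans('', '', ',.:;'))   # (ii) drop , . : ;

  relaxed('  Dear  Sir,  ')   # -> 'dear sir'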

Handprint attempts to cope with possibly-missing text in the HTR results by
matching up likely corresponding lines in the expected and received results.
It does this by comparing each line of ground-truth text to each line of the
HTR results using longest common subsequence similarity, as implemented by
the LCSSEQ function in the Python "textdistance" package.  If the lines do
not pass a threshold score, Handprint looks at subsequent lines of the HTR
results and tries to reestablish correspondence to ground truth.  If nothing
else in the HTR results appears close enough to the expected ground-truth
line, the line is assumed to be missing from the HTR results and scored
appropriately.
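
The matching idea can be sketched like this, assuming the "textdistance"
package is installed; the threshold value here is arbitrary, and Handprint's
actual algorithm is more involved:

  import textdistance

  def align(gt_lines, htr_lines, threshold=0.5):
      pairs, next_htr = [], 0
      for gt in gt_lines:
          for j in range(next_htr, len(htr_lines)):
              score = textdistance.lcsseq.normalized_similarity(gt, htr_lines[j])
              if score >= threshold:
                  pairs.append((gt, htr_lines[j]))
                  next_htr = j + 1
                  break
          else:
              pairs.append((gt, ''))   # treat the line as missing from the HTR output
      return pairs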

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The option -j (/j on Windows) tells Handprint to look for and reuse preexisting
results for each input instead of contacting the services.  This makes it look
for JSON files produced in a previous run with the -e option,

  somefile.handprint-amazon-rekognition.json
  somefile.handprint-amazon-textract.json
  somefile.handprint-google.json
  somefile.handprint-microsoft.json

and use those instead of getting results from the services.  This can be useful
to save repeated invocations of the services if all you want is to draw the
results differently or perform some testing/debugging on the same inputs.

To move the position of the text annotations overlaid on the input image,
you can use the option -m (or /m on Windows).  This takes two numbers separated
by a comma in the form x,y.  Positive numbers move the text rightward and
upward, respectively, relative to the default position.  The default position
of each text annotation in the annotated output is such that the left edge of
the word starts at the location of the upper left corner of the bounding box
returned by the service; this has the effect of putting the annotation near,
but above, the location of the (actual) word in the input image by default.
Using the text-move option allows you to move the annotation if desired.

To change the color of the text annotations overlaid on the input image,
you can use the option -x (or /x on Windows).  You can use hex color codes
such as "#ff0000" or X11/CSS4 color names with no spaces such as "purple"
or "darkgreen".  If you use a hex value, make sure to enclose the value with
quotes, or the shell will interpret the pound sign as a comment character.

To change the size of the text annotations overlaid on the input image,
you can use the option -z (or /z on Windows).  The value is in units of points.
The default size is 12 points.

Handprint will send files to the different services in parallel, using a
number of process threads at most equal to 1/2 of the number of cores on the
computer it is running on.  (E.g., if your computer has 4 cores, it will by
default use at most 2 threads.)  The option -t (/t on Windows) can be used to
change this number.
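
In other words, the default is roughly the following (a sketch using the
standard library; the program's own computation appears in the code below):

  import os
  default_threads = max(1, (os.cpu_count() or 1) // 2)   # e.g., 4 cores -> 2 threads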

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.  By default, messages printed by Handprint are
color-coded.  If given the option -Z (/Z on Windows), Handprint will not color
the text of messages it prints.  (This latter option is useful when running
Handprint within subshells inside other environments such as Emacs.)

If given the -@ argument (/@ on Windows), this program will output a detailed
trace of what it is doing.  The debug trace will be sent to the given
destination, which can be '-' to indicate console output, or a file path to
send the output to a file.

When -@ (or /@ on Windows) has been given, Handprint installs a signal handler
on signal SIGUSR1 that will drop Handprint into the pdb debugger if the signal
is sent to the running process.  It's best to use -t 1 when attempting to use
a debugger because the subthreads will not stop running if the signal is sent.

If given the -V option (/V on Windows), this program will print the version
and other information, and exit without doing anything else.

Return values
~~~~~~~~~~~~~

This program exits with a return code of 0 if no problems are encountered.
It returns a nonzero value otherwise. The following table lists the possible
return values:

    0 = success -- program completed normally
    1 = the user interrupted the program's execution
    2 = encountered a bad or missing value for an option
    3 = no network detected -- cannot proceed
    4 = file error -- encountered a problem with a file
    5 = server error -- encountered a problem with a server
    6 = an exception or fatal error occurred

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # Initial setup -----------------------------------------------------------

    pref = '/' if sys.platform.startswith('win') else '-'
    hint = f'(Hint: use {pref}h for help.)'
    ui = UI('Handprint',
            'HANDwritten Page RecognitIoN Test',
            use_color=not no_color,
            be_quiet=quiet,
            show_banner=not (version or list or add_creds != 'A'))
    ui.start()

    if debug != 'OUT':
        if __debug__: set_debug(True, debug, extra='%(threadName)s')
        import faulthandler
        faulthandler.enable()
        if not sys.platform.startswith('win'):
            # Even with a different signal, I can't get this to work on Win.
            pdb_on_signal(signal.SIGUSR1)

    # Preprocess arguments and handle early exits -----------------------------

    if version:
        print_version()
        exit(int(ExitCode.success))
    if list:
        inform('Known services: [bold]{}[/]', ', '.join(services_list()))
        exit(int(ExitCode.success))
    if add_creds != 'A':
        service = add_creds.lower()
        if service not in services_list():
            alert(f'Unknown service: "{service}". {hint}')
            exit(int(ExitCode.bad_arg))
        if not files or len(files) > 1:
            alert(f'Option {pref}a requires one file. {hint}')
            exit(int(ExitCode.bad_arg))
        creds_file = files[0]
        if not readable(creds_file):
            alert(f'File not readable: {creds_file}')
            exit(int(ExitCode.file_error))
        Credentials.save_credentials(service, creds_file)
        inform(f'Saved credentials for service "{service}".')
        exit(int(ExitCode.success))
    services = services_list() if services == 'S' else services.lower().split(',')
    if not all(s in services_list() for s in services):
        alert_fatal(f'"{services}" is/are not known services. {hint}')
        exit(int(ExitCode.bad_arg))
    display_given = display
    display = ['text'] if display == 'D' else display.lower().split(',')
    possible_displays = [
        'text', 'bb', 'bb-word', 'bb-words', 'bb-line', 'bb-lines', 'bb-para',
        'bb-paragraph', 'bb-paragraphs'
    ]
    if not all(d in possible_displays for d in display):
        alert_fatal(f'Unrecognized value for {pref}d: {display_given}. {hint}')
        exit(int(ExitCode.bad_arg))
    if no_grid and not extended and not compare:
        alert_fatal(
            f'{pref}G without {pref}e or {pref}c produces no output. {hint}')
        exit(int(ExitCode.bad_arg))
    if any(item.startswith('-') for item in files):
        bad = next(item for item in files if item.startswith('-'))
        alert_fatal(f'Unrecognized option "{bad}" in arguments. {hint}')
        exit(int(ExitCode.bad_arg))
    if not files and from_file == 'F':
        alert_fatal(f'Need images or URLs to have something to do. {hint}')
        exit(int(ExitCode.bad_arg))
    if relaxed and not compare:
        warn(f'Option {pref}r without {pref}c has no effect. {hint}')
    if text_move != 'M' and ',' not in text_move:
        alert_fatal(
            f'Option {pref}m requires an argument of the form x,y. {hint}')
        exit(int(ExitCode.bad_arg))
    if text_size != 'Z' and not isint(text_size):
        alert_fatal(
            f'Option {pref}z requires an integer as an argument. {hint}')
        exit(int(ExitCode.bad_arg))
    if confidence != 'N':
        if not isreal(confidence):
            alert_fatal(
                f'Option {pref}n requires a real number as an argument. {hint}'
            )
            exit(int(ExitCode.bad_arg))
        confidence = fast_real(confidence)
        if not (0 <= confidence <= 1.0):
            alert_fatal(
                f'Option {pref}n requires a real number between 0 and 1.0. {hint}'
            )
            exit(int(ExitCode.bad_arg))

    # Do the real work --------------------------------------------------------

    if __debug__: log('=' * 8 + f' started {timestamp()} ' + '=' * 8)
    body = exception = None
    try:
        body = MainBody(
            files=files,
            from_file=None if from_file == 'F' else from_file,
            output_dir=None if output_dir == 'O' else output_dir,
            add_creds=None if add_creds == 'A' else add_creds,
            base_name='document' if base_name == 'B' else base_name,
            confidence=0 if confidence == 'N' else confidence,
            text_color='red' if text_color == 'X' else text_color.lower(),
            text_shift='0,0' if text_move == 'M' else text_move,
            text_size=12 if text_size == 'Z' else int(text_size),
            display=display,
            make_grid=not no_grid,
            extended=extended,
            reuse_json=reuse_json,
            services=services,
            threads=max(1,
                        cpu_count() // 2 if threads == 'T' else int(threads)),
            compare='relaxed' if (compare and relaxed) else compare)
        config_interrupt(body.stop, UserCancelled(ExitCode.user_interrupt))
        body.run()
        exception = body.exception
    except Exception as ex:
        exception = sys.exc_info()

    # Try to deal with exceptions gracefully ----------------------------------

    exit_code = ExitCode.success
    if exception:
        if exception[0] == CannotProceed:
            exit_code = exception[1].args[0]
        elif exception[0] in [KeyboardInterrupt, UserCancelled]:
            if __debug__: log(f'received {exception[0].__name__}')
            warn('Interrupted.')
            exit_code = ExitCode.user_interrupt
        else:
            ex_class = exception[0]
            ex = exception[1]
            alert_fatal(f'An error occurred ({ex_class.__name__}): {str(ex)}')
            # Return a better error code for some common cases.
            if ex_class in [
                    FileNotFoundError, FileExistsError, PermissionError
            ]:
                exit_code = ExitCode.file_error
            else:
                exit_code = ExitCode.exception
            if __debug__:
                from traceback import format_exception
                details = ''.join(format_exception(*exception))
                logr(f'Exception: {str(ex)}\n{details}')
    else:
        inform('Done.')

    # And exit ----------------------------------------------------------------

    if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
    if exit_code == ExitCode.user_interrupt:
        # This is a sledgehammer, but it kills everything, including ongoing
        # network get/post. I have not found a more reliable way to interrupt.
        os._exit(int(exit_code))
    else:
        exit(int(exit_code))
Example #23
0
    def _do_preflight(self):
        '''Check the option values given by the user, and do other prep.'''

        if not network_available():
            alert_fatal('No network connection.')
            raise CannotProceed(ExitCode.no_network)

        # Sanity-check the arguments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        hint = '(Hint: use -h for help.)'

        if not self.files:
            alert_fatal(
                f'Need at least one folder path or file as argument. {hint}')
            raise CannotProceed(ExitCode.bad_arg)
        if any(item.startswith('-') for item in self.files):
            bad = next(item for item in self.files if item.startswith('-'))
            alert_fatal(f'Unrecognized option "{bad}" in arguments. {hint}')
            raise CannotProceed(ExitCode.bad_arg)
        if not self.use_keyring and not any([self.api_key, self.user_id]):
            alert_fatal(
                f"Need Zotero credentials if not using keyring. {hint}")
            raise CannotProceed(ExitCode.bad_arg)

        if self.after_date:
            try:
                # Convert user's input into a canonical format.
                self.after_date = parsed_datetime(self.after_date)
                self.after_date_str = self.after_date.strftime(DATE_FORMAT)
                if __debug__:
                    log(f'parsed after_date as {self.after_date_str}')
            except KeyboardInterrupt as ex:
                if __debug__: log(f'got exception {str(ex)}')
                raise
            except Exception as ex:
                alert_fatal(f'Unable to parse after_date: "{str(ex)}". {hint}')
                raise CannotProceed(ExitCode.bad_arg)

        if self.file_ext:
            self.file_ext = self.file_ext.lower().split(',')
            self.file_ext = [
                '.' + e for e in self.file_ext if not e.startswith('.')
            ]

        # Set up Zotero connection and gather files for work ~~~~~~~~~~~~~~~~~~

        inform('Connecting to Zotero network servers ...')
        self._zotero = Zotero(self.api_key, self.user_id, self.use_keyring)

        if len(self.files) > 1 or path.isdir(self.files[0]):
            inform('Examining folders and looking for files ...')
        # 2 passes: traverse subdirectories recursively, then filter results.
        candidates = []
        for item in self.files:
            if path.isfile(item):
                candidates.append(item)
            elif path.isdir(item):
                if __debug__: log(f'adding files in subdir {antiformat(item)}')
                candidates += files_in_directory(item)
            else:
                warn(f'Neither a file nor a folder of files: "{antiformat(item)}"')
        if __debug__: log('gathering list of files ...')
        self._targets = []
        for file in candidates:
            ext = filename_extension(file)
            if path.basename(file).startswith('.') or ext in _IGNORED_EXT:
                if __debug__:
                    log(f'ignoring ignorable file {antiformat(file)}')
                continue
            if self.file_ext and ext not in self.file_ext:
                warn(
                    f'Skipping file without desired extension: {antiformat(file)}'
                )
                continue
            if file_is_alias(file):
                if __debug__: log(f'ignoring macOS alias {antiformat(file)}')
                continue
            self._targets.append(file)
        if __debug__:
            log(f'gathered {pluralized("file", self._targets, True)}')

        if self.after_date:
            if __debug__: log(f'filtering files by date {self.after_date_str}')
            kept = []
            tzinfo = self.after_date.tzinfo
            for file in self._targets:
                mtime = datetime.fromtimestamp(Path(file).stat().st_mtime)
                if mtime.replace(tzinfo=tzinfo) >= self.after_date:
                    if __debug__: log(f'keeping {file}')
                    kept.append(file)
            self._targets = kept

        if not self._targets:
            alert_fatal('No files to process; quitting.')
            raise CannotProceed(ExitCode.bad_arg)
Example #24
0
    def run_services(self, item, index, base_name):
        '''Run all requested services on the image indicated by "item", using
        "index" and "base_name" to construct a download copy of the item if
        it has to be downloaded from a URL first.
        '''
        # Shortcuts to make the code more readable.
        services = self._services

        inform(f'Starting on [white]{item}[/]')
        (item_file, item_fmt) = self._get(item, base_name, index)
        if not item_file:
            return

        dest_dir = self._output_dir if self._output_dir else path.dirname(
            item_file)
        if not writable(dest_dir):
            alert(f'Cannot write output in {dest_dir}.')
            return

        # Normalize input image to the lowest common denominator.
        image = self._normalized(item, item_fmt, item_file, dest_dir)
        if not image.file:
            warn(f'Skipping {relative(item_file)}')
            return

        # Send the file to the services and get Result tuples back.
        self._senders = []
        if self._num_threads == 1:
            # For 1 thread, avoid thread pool to make debugging easier.
            results = [self._send(image, s) for s in services]
        else:
            executor = ThreadPoolExecutor(max_workers=self._num_threads,
                                          thread_name_prefix='ServiceThread')
            for service in services:
                future = executor.submit(self._send, image, service)
                self._senders.append(future)
            results = [future.result() for future in self._senders]

        # If a service failed for some reason (e.g., a network glitch), we
        # get no result back.  Remove empty results & go on with the rest.
        results = [x for x in results if x is not None]
        if not results:
            warn(f'Nothing to do for {item}')
            return

        # Create grid file if requested.
        if self._make_grid:
            base = path.basename(filename_basename(item_file))
            grid_file = path.realpath(
                path.join(dest_dir, base + '.handprint-all.png'))
            inform(f'Creating results grid image: {relative(grid_file)}')
            all_results = [r.annotated for r in results]
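            # Arrange the per-service images in a roughly square grid: the
            # number of columns is the ceiling of the square root of the count.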
            width = math.ceil(math.sqrt(len(all_results)))
            from handprint.images import create_image_grid
            create_image_grid(all_results, grid_file, max_horizontal=width)

        # Clean up after ourselves.
        if not self._extended_results:
            for file in set(image.temp_files | {r.annotated for r in results}):
                if file and path.exists(file):
                    delete_existing(file)
        elif image.file != image.item_file:
            # Delete the resized file.  While it would help efficiency to
            # reuse it on subsequent runs, the risk is that those runs might
            # target different services and would end up using a different-
            # sized image than if we sized it appropriately for _this_ run.
            delete_existing(image.file)

        inform(f'Done with {relative(item)}')