def _save(self, result, file):
    '''Write "result" to the path "file".

    "result" may be a string (written as UTF-8 text), an io.BytesIO object
    (copied out in binary mode), or a tuple of (data, error) wrapping one of
    those.  Nothing is written when "result" is None or when the tuple holds
    a truthy error; a warning is issued instead.  Any other kind of value
    raises InternalError.
    '''
    if result is None:
        warn(f'No data for {file}')
        return

    payload = result
    if isinstance(payload, tuple):
        # The tuple form is taken to hold exactly two items: data and error.
        payload, error = payload
        if error:
            alert(f'Error: {error}')
            warn(f'Unable to write {file}')
            return

    if __debug__: log(f'writing output to file {relative(file)}')
    if isinstance(payload, str):
        with open(file, 'w', encoding='utf-8') as out:
            out.write(payload)
        return
    if isinstance(payload, io.BytesIO):
        with open(file, 'wb') as out:
            shutil.copyfileobj(payload, out)
        return
    # No other type is expected here, so reaching this point indicates a bug.
    raise InternalError(
        'Unexpected data in save_output() -- please report this.')
def _smaller_file(self, file):
    '''Return the path of a version of "file" smaller than self._max_size.

    The candidate path is "file" itself when its name already contains
    ".handprint" + extension; otherwise it is the base name of "file" with
    that tail appended.  An existing candidate is reused if image_size()
    reports it as below the limit.  In every other case the image is
    regenerated with reduced_image_size().  Returns None when "file" is
    empty or when the size reduction reports an error.
    '''
    if not file:
        return None

    suffix = '.handprint' + filename_extension(file)
    if suffix in file:
        target = file
    else:
        target = filename_basename(file) + suffix

    if path.exists(target):
        from handprint.images import image_size
        if image_size(target) < self._max_size:
            inform(f'Reusing resized image found in {relative(target)}')
            return target
        # A ".handprint.ext" file is already there (maybe left over from an
        # earlier run), but it exceeds what the current services allow.
        if __debug__: log(f'existing resized file larger than {self._max_size}b: {target}')

    inform(f'Size too large; reducing size: {relative(file)}')
    from handprint.images import reduced_image_size
    (resized, error) = reduced_image_size(file, target, self._max_size)
    if not error:
        return resized
    alert(f'Failed to resize {relative(file)}: {error}')
    return None
def _resized_image(self, file):
    '''Return the path of a version of "file" within self._max_dimensions.

    The candidate path is "file" itself when its name already contains
    ".handprint" + extension; otherwise it is the base name of "file" with
    that tail appended.  An existing, readable candidate is reused when both
    its width and its height are below the maximums.  In every other case
    the image is regenerated with reduced_image_dimensions().  Returns None
    when that step reports an error.
    '''
    (width_limit, height_limit) = self._max_dimensions
    suffix = '.handprint' + filename_extension(file)
    if suffix in file:
        target = file
    else:
        target = filename_basename(file) + suffix

    if path.exists(target) and readable(target):
        from handprint.images import image_dimensions
        (width, height) = image_dimensions(target)
        if width < width_limit and height < height_limit:
            inform(f'Using reduced image found in {relative(target)}')
            return target
        # A ".handprint.ext" file is already there (maybe left over from an
        # earlier run), but its dimensions exceed the current limits.
        if __debug__: log(f'existing resized file larger than {width_limit}x{height_limit}: {target}')

    inform(f'Dimensions too large; reducing dimensions: {relative(file)}')
    from handprint.images import reduced_image_dimensions
    (resized, error) = reduced_image_dimensions(file, target,
                                                width_limit, height_limit)
    if not error:
        return resized
    alert(f'Failed to re-dimension {relative(file)}: {error}')
    return None
def _article_tuples(self, xml):
    '''Parse the XML input, assumed to be from micropublication.org, and
    create a list of `Article` records.

    "xml" may be a string or bytes; a string is encoded as ASCII before
    parsing.  One `Article` is produced per <article> element.  An article
    is marked 'complete' only if its PDF URL, JATS URL, DOI, title and date
    are all non-empty, and 'incomplete' otherwise.

    If anything goes wrong while parsing (including an <article> that lacks
    one of the expected child elements), the user is alerted and the
    articles gathered up to that point are returned.
    '''
    if __debug__: log('parsing XML data')
    articles = []
    if isinstance(xml, str):
        # The micropublication xml declaration explicit uses ascii encoding.
        xml = xml.encode('ascii')

    def text_of(node, tag):
        # A missing child makes find() return None, and the attribute access
        # then raises; that is deliberately left to the handler below.
        return (node.find(tag).text or '').strip()

    try:
        for element in etree.fromstring(xml).findall('article'):
            doi   = text_of(element, 'doi')
            pdf   = text_of(element, 'pdf-url')
            jats  = text_of(element, 'jats-url')
            image = text_of(element, 'image-url')
            title = text_of(element, 'article-title')
            date  = element.find('date-published')
            # Compare against None by identity rather than with "!=".
            if date is not None:
                year  = text_of(date, 'year')
                month = text_of(date, 'month')
                day   = text_of(date, 'day')
                date  = year + '-' + month + '-' + day
            else:
                date = ''
            basename = tail_of_doi(doi)
            required = (pdf, jats, doi, title, date)
            status = 'complete' if all(required) else 'incomplete'
            articles.append(Article(self.issn, doi, date, title, basename,
                                    pdf, jats, image, status))
    except Exception as ex:
        # Keep the reason in the debug log so that failures can be diagnosed.
        if __debug__: log(f'could not parse XML from server: {str(ex)}')
        alert('Unexpected or badly formed XML returned by server')
    return articles
def download_file(url, output_file, user=None, pswd=None):
    '''Download "url" into "output_file" and report whether it worked.

    "user" and "pswd" are handed to download() unchanged.  Returns True if
    download() finishes without raising.  If it raises NoContent,
    ServiceFailure or AuthFailure, the error text is shown to the user and
    False is returned.  Other exceptions propagate to the caller.
    '''
    inform(f'Downloading {url}')
    try:
        download(url, user, pswd, output_file)
    except (NoContent, ServiceFailure, AuthFailure) as ex:
        alert(str(ex))
        return False
    else:
        return True
def validated_input(msg, default_value, is_valid):
    '''Prompt the user until an acceptable value is entered, and return it.

    "msg" is the text of the prompt.  "default_value", if truthy, is a
    string shown in square brackets after the prompt and is returned when
    the user just presses return.  "is_valid" is a predicate called on the
    text typed by the user; input it rejects produces an alert and another
    prompt.  The function only returns once a value has been accepted.
    '''
    # The prompt does not change between attempts, so build it only once.
    default = (' [' + default_value + ']') if default_value else ''
    prompt = msg + default + ': '
    while True:
        if __debug__: log('asking user: "******"')
        value = input(prompt)
        if default_value and value == '':
            if __debug__: log(f'user chose default value "{default_value}"')
            return default_value
        if is_valid(value):
            if __debug__: log(f'got "{value}" from user')
            return value
        alert(f'"{value}" does not appear valid for {msg}')
def _converted_file(self, file, to_format, dest_dir):
    '''Return the path of a copy of "file" in format "to_format".

    The copy lives in "dest_dir" and is named after the base name of "file"
    followed by ".handprint." and "to_format".  If a file of that name is
    already there, it is reused as-is.  Otherwise converted_image() is
    called to create it.  Returns None when the conversion reports an error.
    '''
    stem = path.basename(filename_basename(file))
    target = path.join(dest_dir, stem + '.handprint.' + to_format)
    if path.exists(target):
        inform(f'Using existing converted image in {relative(target)}')
        return target

    inform(f'Converting to {to_format} format: {relative(file)}')
    from handprint.images import converted_image
    (converted, error) = converted_image(file, to_format, target)
    if not error:
        return converted
    alert(f'Failed to convert {relative(file)}: {error}')
    return None
def valid_xml(xml_file, dtd):
    '''Return True if "xml_file" parses and, when "dtd" is given, validates.

    The file is first parsed with etree.parse().  Any parsing failure is
    reported to the user and yields False.  If "dtd" is falsy, a file that
    parses is considered valid.  Otherwise dtd.validate() decides the
    result, and every entry of dtd.error_log is reported as a warning when
    validation fails.
    '''
    if __debug__: log(f'parsing XML file {xml_file}')
    try:
        root = etree.parse(xml_file)
    except etree.XMLSyntaxError as ex:
        alert(f'File contains XML syntax errors: {xml_file}')
        # The string form of XMLSyntaxError includes line/col & file name.
        alert(str(ex))
        return False
    except Exception as ex:
        alert(f'Failed to parse XML file: {xml_file}')
        alert(str(ex))
        return False

    if __debug__: log(f'validating {xml_file}')
    if not dtd:
        # Nothing to validate against, so parsing cleanly is good enough.
        return True
    if dtd.validate(root):
        if __debug__: log('validated without errors')
        return True

    warn(f'Failed to validate file {xml_file}')
    warn(f'{pluralized("validation error", dtd.error_log, True)} encountered:')
    for item in dtd.error_log:
        # Build the message here, the same way as every other message in
        # this function, instead of passing a template plus arguments.
        warn(f'Line {item.line}, col {item.column} ({item.type_name}): {item.message}')
    return False
def main(api_key='A', no_color=False, after_date='D', file_ext='F',
         identifier='I', no_keyring=False, list=False, method='M',
         dry_run=False, overwrite=False, quiet=False, space=False,
         version=False, debug='OUT', *files):
    '''Zowie ("ZOtero link WrItEr") is a tool for Zotero users.

Zowie writes Zotero select links into the files and/or the macOS Finder
metadata attributes of files in the user's local Zotero database. This makes
it possible to jump to the Zotero bibliographic record corresponding to a
Zotero file attachment when viewing the file from outside of Zotero.

Credentials for Zotero access
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Zowie needs to know the user's personal library identifier (also known as the
"userID") and a Zotero API key. By default, it tries to get this information
from the user's keychain. If the values do not exist in the keychain from a
previous run, Zowie will ask the user, and (unless the -K option is given)
store the values in the user's keychain so that it does not have to ask again
in the future. It is also possible to supply the identifier and API key on
the command line using the -i and -a options, respectively; the given values
will then override the values stored in the keychain (unless the -K option is
also given). This is also how you can replace previously-stored values: use
-i and -a (without -K) and the new values will override the stored values.

To find out your Zotero userID and create an API key, log in to your Zotero
account at Zotero.org and visit https://www.zotero.org/settings/keys

Basic usage
~~~~~~~~~~~

Zowie can operate on a folder, or one or more individual files, or a mix of
both. Suppose your local Zotero database is located in ~/Zotero/. Perhaps the
simplest way to run Zowie is the following command:

  zowie ~/Zotero

If this is your first run of Zowie, it will ask you for your userID and API
key, then search for files recursively under ~/Zotero/storage/. For each file
found, Zowie will contact the Zotero servers over the network and determine
the Zotero URI for the bibliographic entry containing that file. Finally, it
will use its default method of writing the Zotero select link, which is to
write it into the macOS Finder comments for the file.

If you are a user of DEVONthink, you will probably want to use the -s option
(see the explanation below in the section on special-case behavior):

  zowie -s ~/Zotero

Instead of a folder, you can invoke Zowie on one or more individual files (but
be careful to quote pathnames with spaces in them, such as in this example):

  zowie -s "~/Zotero/storage/26GS7CZL/Smith 2020 Paper.pdf"

Zowie supports multiple methods of writing the Zotero select link. The option
-l will cause Zowie to print a list of all the methods available:

  zowie -l

(Note that some methods only work for some file types.) The default method is
to write it into Finder comments for the file. (These comments are visible in
the Finder's "Get Info" panel.) The option -m can be used to select one or
more alternative methods. Separate the names with commas without spaces. For
example, the following command will make Zowie write the Zotero link into both
the Finder comments and the "Where from" attribute:

  zowie -m findercomment,wherefrom ~/Zotero/storage

Where possible, Zowie tries to preserve the previous contents of metadata
attributes. For example, In the case of Finder comments and "Where from", it
looks for existing Zotero links in the contents and updates those links only;
if it does not find an existing Zotero link, it prepends one instead of
replacing the value completely. The general rule is that Zowie will try to
detect whether a Zotero select link is already present in the chosen metadata
attribute(s) and will only update the link text if a link is found; otherwise,
it will not write the Zotero select link at all unless given the overwrite
(-o) option. The overwrite option (-o) makes Zowie replace values completely.
Check the description of the methods for more details about what they do by
default and the impact of the -o option.

Filtering by file type
~~~~~~~~~~~~~~~~~~~~~~

By default, Zowie acts on all files it finds on the command line, except for
certain files that it always ignores: hidden files and files with extensions
.sqlite, .bak, .csl, .css, .js, .json, .pl, and a few others. If the -m option
is used to select methods that only apply to specific file types, Zowie will
examine each file it finds in turn and only apply the methods that match that
particular file's type.

You can use the option -f to make Zowie filter the files it finds based on
file name extensions. This is useful if you want it to concentrate only on
particular file types and ignore other files it might find while scanning
folders. For example,

  zowie -f pdf,mp4,mov ~/Zotero

will cause it to only work on .pdf, .mp4, and .mov files. You can provide
multiple file extensions separated by commas, without spaces and without the
leading periods. Note that Zowie always ignores certain files, such as those
ending with .css, .js, .json, .bak, .csl, and a few others.

Filtering by date
~~~~~~~~~~~~~~~~~

If the -d option is given, the files will be filtered to use only those whose
last-modified date/time stamp is no older than the given date/time
description. Valid descriptors are those accepted by the Python dateparser
package. Make sure to enclose descriptions within single or double quotes.
Examples:

  zowie -d "2 weeks ago" ....
  zowie -d "2014-08-29" ....
  zowie -d "12 Dec 2014" ....
  zowie -d "July 4, 2013" ....

Special-case behavior
~~~~~~~~~~~~~~~~~~~~~

Although Zowie is not aimed solely at DEVONthink users, its development was
motivated by the author's desire to use Zotero with that software. A
complication arose due to an undocumented feature in DEVONthink: it ignores a
Finder comment if it is identical to the value of the "URL" attribute (which
is the name it gives to the "com.apple.metadata:kMDItemWhereFroms" extended
attribute on a file). In practical terms, if you do something like write the
Zotero select link into the Finder comment of a file and then have a
DEVONthink smart rule copy the value to the URL field, the Finder comment will
appear blank in DEVONthink (even though it exists on the actual file). This
can be unexpected and confusing, and has caught people (including the author
of Zowie) unaware. To compensate, Zowie 1.2 introduced a new option: it can
add a trailing space character to the end of the value it writes into the
Finder comment when using the "findercomment" method. Since approaches to copy
the Zotero link from the Finder comment to the URL field in DEVONthink will
typically strip whitespace around the URL value, the net effect is to make the
value in the Finder comment just different enough from the URL field value to
prevent DEVONthink from ignoring the Finder comment. Use option -s to make
Zowie to add the trailing space character.

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the -n ("dry run") option, Zowie will only print what it would do
without actually doing it.

If given the -q option, Zowie will not print its usual informational messages
while it is working. It will only print messages for warnings or errors. By
default messages printed by Zowie are also color-coded. If given the option
-C, Zowie will not color the text of messages it prints. (This latter option
is useful when running Zowie within subshells inside other environments such
as Emacs.)

If given the -V option, this program will print the version and other
information, and exit without doing anything else.

If given the -@ argument, this program will output a detailed trace of what it
is doing. The debug trace will be sent to the given destination, which can be
'-' to indicate console output, or a file path to send the output to a file.

When -@ has been given, Zowie also installs a signal handler on signal SIGUSR1
that will drop Zowie into the pdb debugger if the signal is sent to the
running process.

Return values
~~~~~~~~~~~~~

This program exits with a return code of 0 if no problems are encountered. It
returns a nonzero value otherwise. The following table lists the possible
return values:

  0 = success -- program completed normally
  1 = the user interrupted the program's execution
  2 = encountered a bad or missing value for an option
  3 = no network detected -- cannot proceed
  4 = file error -- encountered a problem with a file
  5 = server error -- encountered a problem with a server
  6 = an exception or fatal error occurred

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''
    # NOTE(review): the docstring above reads as the program's help text and
    # ends with a heading that has no body; presumably the command-line
    # framework appends the option summary -- confirm before editing it.
    #
    # The single-letter string defaults ('A', 'D', 'F', 'I', 'M') and 'OUT'
    # act as "option not given" markers: each one is compared against below
    # and replaced by None (or by a default method list) when unchanged.

    # Set up debug logging as soon as possible, if requested ------------------

    if debug != 'OUT':
        if __debug__: set_debug(True, debug)
        import faulthandler
        faulthandler.enable()
        if not sys.platform.startswith('win'):
            # Even with a different signal, I can't get this to work on Win.
            import signal
            from boltons.debugutils import pdb_on_signal
            pdb_on_signal(signal.SIGUSR1)

    # Preprocess arguments and handle early exits -----------------------------

    # The version request is handled before the UI object is created.
    if version:
        from zowie import print_version
        print_version()
        exit(int(ExitCode.success))

    from bun import UI, inform, warn, alert, alert_fatal
    ui = UI('Zowie', 'ZOtero link WrItEr', use_color=not no_color, be_quiet=quiet)
    ui.start()

    # Note: the parameter named "list" hides the builtin inside this function.
    if list:
        import shutil
        from textwrap import wrap
        from zowie.methods import method_object
        inform('Known methods:\n')
        # Falls back to 78 only when the computed width comes out as zero.
        width = (shutil.get_terminal_size().columns - 2) or 78
        for name in method_names():
            text = f'[cyan2]{name}[/]: {method_object(name).description()}'
            inform('\n'.join(wrap(text, width=width, subsequent_indent=' ')))
            inform('')
        exit(int(ExitCode.success))

    # Method names are lower-cased and split on commas; the first name that
    # method_names() does not list aborts the run with the bad_arg exit code.
    methods_list = ['findercomment'] if method == 'M' else method.lower().split(',')
    bad_name = next((n for n in methods_list if n not in method_names()), None)
    if bad_name:
        alert(f'Unrecognized method name "{bad_name}".')
        alert('The available methods are: ' + ', '.join(method_names()) + '.')
        exit(int(ExitCode.bad_arg))

    # Do the real work --------------------------------------------------------

    from commonpy.data_utils import timestamp
    from commonpy.interrupt import config_interrupt
    from zowie.exceptions import UserCancelled, FileError, CannotProceed
    from zowie.main_body import MainBody

    if __debug__: log('=' * 8 + f' started {timestamp()} ' + '=' * 8)
    # "exception" ends up holding either whatever body.exception contains
    # after run(), or the sys.exc_info() triple of an exception raised here.
    body = exception = None
    try:
        body = MainBody(files=files,
                        file_ext=None if file_ext == 'F' else file_ext,
                        api_key=None if api_key == 'A' else api_key,
                        user_id=None if identifier == 'I' else identifier,
                        use_keyring=not no_keyring,
                        after_date=None if after_date == 'D' else after_date,
                        methods=methods_list,
                        dry_run=dry_run,
                        overwrite=overwrite,
                        add_space=space)
        config_interrupt(body.stop, UserCancelled(ExitCode.user_interrupt))
        body.run()
        exception = body.exception
    except Exception as ex:
        exception = sys.exc_info()

    # Try to deal with exceptions gracefully ----------------------------------

    # The code below indexes "exception" as (type, value, traceback).
    exit_code = ExitCode.success
    if exception:
        from commonpy.string_utils import antiformat
        if __debug__: log(f'main body raised exception: {antiformat(exception)}')
        if exception[0] == CannotProceed:
            # The exit code travels as the first argument of CannotProceed.
            exit_code = exception[1].args[0]
        elif exception[0] == FileError:
            alert_fatal(antiformat(exception[1]))
            exit_code = ExitCode.file_error
        elif exception[0] in [KeyboardInterrupt, UserCancelled]:
            warn('Interrupted.')
            exit_code = ExitCode.user_interrupt
        else:
            msg = antiformat(exception[1])
            alert_fatal(f'Encountered error {exception[0].__name__}: {msg}')
            exit_code = ExitCode.exception
            if __debug__:
                from traceback import format_exception
                details = ''.join(format_exception(*exception))
                log(f'Exception: {msg}\n{details}')
    else:
        inform('Done.')

    # And exit ----------------------------------------------------------------

    if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
    if __debug__: log(f'exiting with exit code {exit_code}')
    exit(int(exit_code))
def __init__(self, key, user_id, use_keyring):
    '''Validate credentials, obtain missing ones, and connect to Zotero.

    "key" is the API key and "user_id" the Zotero user identifier; either
    may be None.  "use_keyring" says whether the keyring may be read from
    and written to.  On success, the object holds the credentials in
    self._key and self._user_id, and self._libraries lists a Zotero object
    for the user library followed by one per accessible group library.

    Raises CannotProceed(ExitCode.bad_arg) if a supplied value looks
    malformed or if the server rejects the credentials.  Other errors while
    connecting to the user library are re-raised after alerting the user.
    Failure to list group libraries is only reported, not raised.
    '''
    if key and (not key.isalnum() or len(key) < 20):
        alert_fatal(f'"{key}" does not appear to be a valid API key.')
        raise CannotProceed(ExitCode.bad_arg)
    if user_id and not user_id.isdigit():
        alert_fatal(f'"{user_id}" does not appear to be a Zotero user ID.')
        raise CannotProceed(ExitCode.bad_arg)

    # If the user supplied all the values on the command line, those are
    # the values used.  If none were supplied by the user on the command
    # line, all the values are retrieved from the user's keyring.  If
    # some were supplied and others missing, they're filled in from
    # either the keyring or by prompting the user.
    # NOTE(review): as written, the keyring is consulted only when *both*
    # values are missing; a single missing value always leads to a prompt.
    if __debug__: log('keyring ' + ('enabled' if use_keyring else 'disabled'))
    if key is None and user_id is None and use_keyring:
        # We weren't given a key and id, but we can look in the keyring.
        if __debug__: log('getting id & key from keyring')
        key, user_id = keyring_credentials()
    # NOTE(review): the prompt for the key accepts any alphanumeric text,
    # whereas the check at the top also demands at least 20 characters.
    if not key:
        key = validated_input('API key', key, lambda x: x.isalnum())
    if not user_id:
        user_id = validated_input('User ID', user_id, lambda x: x.isdigit())
    if use_keyring:
        if __debug__: log('saving credentials to keyring')
        save_keyring_credentials(key, user_id)
    self._key = key
    self._user_id = user_id

    # Get connected and store the Zotero conection object for the user
    # library first, then look up the group libraries that the user can
    # access and create Zotero objects for that too.  The way we use them,
    # we don't need to separate between them, so they all go into one list.
    self._libraries = []
    try:
        if __debug__: log(f'connecting to Zotero as user {user_id}')
        user = zotero.Zotero(user_id, 'user', key)
        # pyzotero will return an object but that doesn't mean the user
        # actually gave valid credentials.  Need to try an operation.
        user.count_items()
        self._libraries.append(user)
        raise_for_interrupts()
    except zotero_errors.UserNotAuthorised as ex:
        if __debug__: log(f'got exception {str(ex)}')
        alert_fatal('Unable to connect to Zotero: invalid ID and/or API key.',
                    'The Zotero servers rejected attempts to connect.')
        raise CannotProceed(ExitCode.bad_arg)
    except KeyboardInterrupt as ex:
        if __debug__: log(f'got exception {str(ex)}')
        raise
    except Exception as ex:
        # The exception text is now interpolated; it used to be logged as
        # the literal characters "str(ex)".
        if __debug__: log(f'failed to create Zotero user object: {str(ex)}')
        alert_fatal('Unable to connect to Zotero API.')
        raise

    try:
        for group in user.groups():
            if __debug__: log(f'user can access group id {group["id"]}')
            self._libraries.append(zotero.Zotero(group['id'], 'group', key))
            raise_for_interrupts()
    except KeyboardInterrupt as ex:
        if __debug__: log(f'got exception {str(ex)}')
        raise
    except Exception as ex:
        # Group libraries are optional, so report the problem and carry on.
        if __debug__: log(f'failed to create Zotero group object: {str(ex)}')
        alert('Unable to retrieve Zotero group library; proceeding anyway.')
def _send(self, image, service):
    '''Get results from service named "service" for the "image".

    The service output comes either from a previously saved JSON file (when
    self._reuse_json is set and the file is readable) or from a fresh call
    to service.result().  On success, an annotated image is written next to
    the input.  Depending on the object's settings, the raw JSON data, the
    extracted text, and a comparison against a ground-truth file are also
    saved.

    Returns a Result tuple, or None if the service reported an error.
    Raises AuthFailure (prefixed with the service name) if the service
    rejects the credentials.
    '''
    service_name = f'[{service.name_color()}]{service.name()}[/]'
    base_path = path.join(image.dest_dir, path.basename(image.file))
    json_file = self._renamed(base_path, str(service), 'json')

    saved_results = None
    if self._reuse_json and readable(json_file):
        inform(f'Reading saved results for {service_name} from {relative(json_file)}')
        with open(json_file, 'r') as f:
            saved_results = json.load(f)
        output = service.result(image.file, saved_results)
    else:
        inform(f'Sending to {service_name} and waiting for response ...')
        last_time = timer()
        try:
            output = service.result(image.file, None)
        except AuthFailure as ex:
            raise AuthFailure(f'Service {service}: {str(ex)}')
        except RateLimitExceeded:
            time_passed = timer() - last_time
            if time_passed < 1/service.max_rate():
                warn(f'Pausing {service_name} due to rate limits')
                wait(1/service.max_rate() - time_passed)
                warn(f'Continuing {service_name}')
            # Retry whether or not a pause was needed.  Falling through
            # would reach the code below with "output" never assigned.
            return self._send(image, service)

    if output.error:
        # Sanitize the error string in case it contains '{' characters.
        msg = output.error.replace('{', '{{{{').replace('}', '}}}}')
        alert(f'{service_name} failed: {msg}')
        warn(f'No result from {service_name} for {relative(image.file)}')
        return None
    inform(f'Got result from {service_name}.')

    raise_for_interrupts()
    inform(f'Creating annotated image for {service_name}.')
    annot_path = self._renamed(base_path, str(service), 'png')
    report_path = None
    from handprint.images import annotated_image
    # Both the annotation step and the write happen while holding the lock.
    with self._lock:
        img = annotated_image(image.file, output.boxes, service,
                              self._text_size, self._text_color,
                              self._text_shift, self._display,
                              self._confidence)
        self._save(img, annot_path)

    # Results that were read back from a saved file are not written again.
    if self._extended_results and (saved_results is None):
        inform(f'Saving all data for {service_name}.')
        raw_json = json.dumps(output.data, sort_keys=True, indent=2)
        self._save(raw_json, json_file)
        inform(f'Saving extracted text for {service_name}.')
        txt_file = self._renamed(base_path, str(service), 'txt')
        self._save(output.text, txt_file)

    if self._compare:
        gt_file = alt_extension(image.item_file, 'gt.txt')
        gt_path = relative(gt_file)
        report_path = self._renamed(image.item_file, str(service), 'tsv')
        relaxed = (self._compare == 'relaxed')
        # Test readability first, so that a file that cannot be read is
        # reported as unavailable and never probed for its size.
        if not readable(gt_file):
            warn(f'Skipping {service_name} comparison because {gt_path} not available')
        elif not nonempty(gt_file):
            warn(f'Skipping {service_name} comparison because {gt_path} is empty')
        else:
            if __debug__: log(f'reading ground truth from {gt_file}')
            # Use a context manager so that the file is closed promptly.
            with open(gt_file, 'r') as gt:
                gt_text = gt.read()
            inform(f'Saving {service_name} comparison to ground truth')
            from handprint.comparison import text_comparison
            self._save(text_comparison(output.text, gt_text, relaxed),
                       report_path)
    return Result(service, image, annot_path, report_path)
def run_services(self, item, index, base_name):
    '''Run every requested service on the image designated by "item".

    "index" and "base_name" are passed on to self._get(), which uses them to
    name a local copy when "item" must first be downloaded from a URL.  The
    method returns nothing.  It stops early, after telling the user, when
    the item cannot be obtained, the destination directory is not writable,
    the image cannot be normalized, or no service yields a result.
    '''
    inform(f'Starting on [white]{item}[/]')
    (item_file, item_fmt) = self._get(item, base_name, index)
    if not item_file:
        return

    dest_dir = self._output_dir or path.dirname(item_file)
    if not writable(dest_dir):
        alert(f'Cannot write output in {dest_dir}.')
        return

    # Bring the input image down to what all the services can accept.
    image = self._normalized(item, item_fmt, item_file, dest_dir)
    if not image.file:
        warn(f'Skipping {relative(item_file)}')
        return

    # Hand the file to each service and gather the Result tuples.
    self._senders = []
    if self._num_threads == 1:
        # With a single thread, skip the pool; this simplifies debugging.
        outcomes = [self._send(image, service) for service in self._services]
    else:
        pool = ThreadPoolExecutor(max_workers=self._num_threads,
                                  thread_name_prefix='ServiceThread')
        for service in self._services:
            self._senders.append(pool.submit(self._send, image, service))
        outcomes = [pending.result() for pending in self._senders]

    # A service that failed (e.g., because of a network glitch) gives back
    # nothing.  Drop those entries and continue with whatever is left.
    results = [outcome for outcome in outcomes if outcome is not None]
    if not results:
        warn(f'Nothing to do for {item}')
        return

    # Assemble the grid image, if one was asked for.
    if self._make_grid:
        stem = path.basename(filename_basename(item_file))
        grid_file = path.realpath(path.join(dest_dir, stem + '.handprint-all.png'))
        inform(f'Creating results grid image: {relative(grid_file)}')
        annotated = [r.annotated for r in results]
        columns = math.ceil(math.sqrt(len(annotated)))
        from handprint.images import create_image_grid
        create_image_grid(annotated, grid_file, max_horizontal=columns)

    # Remove the files that should not outlive this run.
    if not self._extended_results:
        leftovers = set(image.temp_files | {r.annotated for r in results})
        for file in leftovers:
            if file and path.exists(file):
                delete_existing(file)
    elif image.file != image.item_file:
        # Remove the resized file.  Keeping it would speed up later runs,
        # but those runs might target other services and would then work
        # from an image sized for _this_ run instead of their own needs.
        delete_existing(image.file)

    inform(f'Done with {relative(item)}')
def main(add_creds='A', base_name='B', no_color=False, compare=False,
         display='D', extended=False, from_file='F', no_grid=False,
         list=False, reuse_json=False, text_move='M', confidence='N',
         output_dir='O', quiet=False, relaxed=False, services='S',
         threads='T', version=False, text_color='X', text_size='Z',
         debug='OUT', *files):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") runs
alternative text recognition services on images of handwritten document pages.

Installing credentials for cloud-based services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known services, and exit.

Before a given service can be used, if it is a cloud-based commercial OCR/HTR
service, Handprint needs to be supplied with user credentials for accessing
that service. The credentials must be stored in a JSON file with a certain
format; see the Handprint user documentation for details about the formats
for each service. To add a new credentials file, use the -a option (/a on
Windows) in combination with the name of a service and a single file path on
the command line. The name supplied right after the -a option must be the
name of a recognized service (such as "google", "amazon", "microsoft"), and
the file argument must be a JSON file containing the credentials data in the
required format for that service. Here is an example of adding credentials
for Google (assuming you created the JSON file as described in the docs):

  handprint -a google mygooglecreds.json

Run Handprint with the -a option multiple times to install credentials for
each different service. Handprint will copy the credential files to its own
configuration directory and exit without doing anything else. The directory
is different on different operating systems; for example, on macOS it is
~/Library/Application Support/Handprint/.

Basic usage
~~~~~~~~~~~

After credentials are installed, running Handprint without the -a option will
invoke one or more OCR/HTR services on files, directories of files, or URLs.
Here is an example of running Handprint on a directory containing images:

  handprint tests/data/caltech-archives/glaser/

Image paths or URLs can be supplied to Handprint in any of the following ways:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) one or more URLs, which will be interpreted as network locations of image
    files to be processed; or

 c) if given the -f option (/f on Windows), a file containing either image
    paths or image URLs.

Note that providing URLs on the command line can be problematic due to how
terminal shells interpret certain characters, and so when supplying URLs,
it's usually better to store the URLs in a file and use the -f option.
Regardless, when given URLs, Handprint will first download the images to a
local directory indicated by the option -o (/o on Windows), or the current
directory if option -o is not used.

No matter whether files or URLs, each input should be a single image of a
document page in which text should be recognized. Handprint can accept input
images in JP2, JPEG, PDF, PNG, GIF, BMP, and TIFF formats. To make the
results from different services more easily comparable, Handprint will always
convert all input images to the same format (PNG) no matter if some services
may accept other formats; it will also downsize input images to the smallest
size accepted by any of the services invoked if an image exceeds that size.
(For example, if service A accepts files up to 10 MB in size and service B
accepts files up to 4 MB, all input images will be resized to 4 MB before
sending them to both A and B, even if A could accept a higher-resolution
image.) Finally, if the input contains more than one page (e.g., in a PDF
file), Handprint will only use the first page and ignore the rest.

Be aware that resizing images to the lowest common size means that the text
recognition results returned by some services may be different than if the
original full-size input image had been sent to that service. If your images
are larger (when converted to PNG) than the size threshold for some services
(which is currently 4 MB when Microsoft is one of the destinations), then you
may wish to compare the results of using multiple services at once versus
using the services one at a time.

Selecting destination services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The default action is to run all known services. The option -s (/s on
Windows) can be used to select only one service or a list of services
instead. Lists of services should be separated by commas; e.g.,
"google,microsoft". To find out which services are supported by Handprint,
run it with the command-line flag -l (or /l on Windows), which will make
Handprint print a list of the known services and exit immediately.

Visual display of recognition results
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

After gathering the results of each service for a given input, Handprint will
create a single compound image consisting of the results for each service
arranged in a grid. This is intended to make it easier to compare the results
of multiple services against each other. To skip the creation of the results
grid, use the -G option (/G on Windows). The grid image will have a name with
the following pattern:

  somefile.handprint-all.png

If given the -e option (/e on Windows), Handprint will produce extended
output that includes the complete response from the service (converted to a
JSON file by Handprint) and the text extracted (stored as a .txt file). The
output of -e will be multiple files like this:

  somefile.handprint-amazon-rekognition.json
  somefile.handprint-amazon-rekognition.png
  somefile.handprint-amazon-rekognition.txt
  somefile.handprint-amazon-textract.json
  somefile.handprint-amazon-textract.png
  somefile.handprint-amazon-textract.txt
  somefile.handprint-google.json
  somefile.handprint-google.png
  somefile.handprint-google.txt
  somefile.handprint-microsoft.json
  somefile.handprint-microsoft.png
  somefile.handprint-microsoft.txt
  ...

The files will be written to the directory indicated by -o, or (if -o is not
used) the directory where "somefile" is located. When -o is not used and the
input images are given as URLs, then the files are written to the current
working directory instead.

When the inputs are URLs, Handprint must download a copy of the image located
at the network address (because it is not possible to write the results in
the network locations represented by the URLs.). The images and other results
will be stored in files whose root names have the form "document-N", where
"N" is an integer. The root name can be changed using the -b option (/b on
Windows). The image at networked locations will be converted to ordinary PNG
format for maximum compatibility with the different OCR services and written
to "document-N.png", and the URL corresponding to each document will be
written in a file named "document-N.url" so that it is possible to connect
each "document-N.png" to the URL it came from.

Finally, note that the use of the -G option (/G on Windows) WITHOUT either
the -e or -c option is an error because it means no output would be produced.

Type of annotations
~~~~~~~~~~~~~~~~~~~

Handprint produces copies of the input images overlayed with the recognition
results received from the different services. By default, it shows only the
recognized text. The option -d (/d on Windows) can be used to tell Handprint
to display other results. The recognized values are as follows:

  text    -- display the text recognized in the image (default)
  bb      -- display all bounding boxes returned by the service
  bb-word -- display only the bounding boxes for words (in red)
  bb-line -- display only the bounding boxes for lines (in blue)
  bb-para -- display only the bounding boxes for paragraphs (in green)

Separate multiple values with a comma. The option "bb" is a shorthand for the
value "bb-word,bb-line,bb-para". As an example, the following command will
show both the recognized text and the bounding boxes around words:

  handprint -d text,bb-word somefile.png

Note that as of June 2021, the main services (Amazon, Google, Microsoft) do
not all provide the same bounding box information in their results. The
following table summarizes what is available:

               Bounding boxes available
  Service      Word    Line   Paragraph
  ---------    ----    ----   ---------
  Amazon         Y       Y        -
  Google         Y       -        Y
  Microsoft      Y       Y        -

If a service does not provide a particular kind of bounding box, Handprint
will not display that kind of bounding box in the annotated output for that
service.

Thresholding by confidence
~~~~~~~~~~~~~~~~~~~~~~~~~~

All of the services return confidence scores for items recognized in the
input. By default, Handprint will show all results in the annotated image, no
matter how low the score. The option -n (/n on Windows) can be used to
threshold the results based on the confidence value for each item (text or
bounding boxes). The value provided as the argument to -n must be a floating
point number between 0 and 1.0. For example, the following command will make
Handprint only show text that is rated with at least 99.5% confidence:

  handprint -n 0.995 somefile.png

Note that the confidence values returned by the different services are not
normalized against each other. What one service considers to be 80%
confidence may not be what another service considers 80% confidence.
Handprint performs the thresholding against the raw scores returned by each
service individually.

Comparing results to expected output
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Handprint supports comparing the output of HTR services to expected output
(i.e., ground truth) using the option -c (or /c on Windows). This facility
requires that the user provides text files that contain the expected text for
each input image. The ground-truth text files must have the following
characteristics:

 a) The file containing the expected results should be named ".gt.txt", with
    a base name identical to the image file. For example, an image file named
    "somefile.jpg" should have a corresponding text file "somefile.gt.txt".

 b) The ground-truth text file should be located in the same directory as the
    input image file.

 c) The text should be line oriented, with each line representing a line of
    text in the image.

 d) The text should be plain text only. No Unicode or binary encodings. (This
    limitation comes from the HTR services, which -- as of this writing --
    return results in plain text format.)

Handprint will write the comparison results to a tab-delimited file named
after the input image and service but with the extension ".tsv". For example,
for an input image "somefile.jpg" and results received from Google, the
comparison results will be written to "somefile.handprint-google.tsv". (The
use of a tab-delimited format rather than comma-delimited format avoids the
need to quote commas and other characters in the text.)

Handprint reports, for each text line, the number of errors (the Levenshtein
edit distance) and the character error rate (CER), and at the end it also
reports a sum total of errors. The CER is computed as the Levenshtein edit
distance of each line divided by the number of characters in the expected
line text, multiplied by 100; this approach to normalizing the CER value is
conventional but note that it can lead to values greater than 100%.

By default, comparisons are done on an exact basis; character case is not
changed, punctuation is not removed, and stop words are not removed. However,
multiple contiguous spaces are converted to one space, and leading spaces are
removed from text lines. If given the option -r (/r on Windows), Handprint
will relax the comparison algorithm as follows:

 i) convert all text to lower case
 ii) ignore certain sentence punctuation characters, namely , . : ;

Handprint attempts to cope with possibly-missing text in the HTR results by
matching up likely corresponding lines in the expected and received results.
It does this by comparing each line of ground-truth text to each line of the
HTR results using longest common subsequence similarity, as implemented by
the LCSSEQ function in the Python "textdistance" package. If the lines do not
pass a threshold score, Handprint looks at subsequent lines of the HTR
results and tries to reestablish correspondence to ground truth. If nothing
else in the HTR results appears close enough to the expected ground-truth
line, the line is assumed to be missing from the HTR results and scored
appropriately.

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The option -j (/j on Windows) tells Handprint to look for and reuse
preexisting results for each input instead of contacting the services. This
makes it look for JSON files produced in a previous run with the -e option,

  somefile.handprint-amazon-rekognition.json
  somefile.handprint-amazon-textract.json
  somefile.handprint-google.json
  somefile.handprint-microsoft.json

and use those instead of getting results from the services. This can be
useful to save repeated invocations of the services if all you want is to
draw the results differently or perform some testing/debugging on the same
inputs.

To move the position of the text annotations overlayed over the input image,
you can use the option -m (or /m on Windows). This takes two numbers
separated by a comma in the form x,y. Positive numbers move the text
rightward and upward, respectively, relative to the default position. The
default position of each text annotation in the annotated output is such that
the left edge of the word starts at the location of the upper left corner of
the bounding box returned by the service; this has the effect of putting the
annotation near, but above, the location of the (actual) word in the input
image by default. Using the text-move option allows you to move the
annotation if desired.

To change the color of the text annotations overlayed over the input image,
you can use the option -x (or /x on Windows). You can use hex color codes
such as "#ff0000" or X11/CSS4 color names with no spaces such as "purple" or
"darkgreen". If you use a hex value, make sure to enclose the value with
quotes, or the shell will interpret the pound sign as a comment character.

To change the size of the text annotations overlayed over the input image,
you can use the option -z (or /z on Windows). The value is in units of
points. The default size is 12 points.

Handprint will send files to the different services in parallel, using a
number of process threads at most equal to 1/2 of the number of cores on the
computer it is running on. (E.g., if your computer has 4 cores, it will by
default use at most 2 threads.) The option -t (/t on Windows) can be used to
change this number.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working. It will only print messages for
warnings or errors. By default messages printed by Handprint are also
color-coded. If given the option -Z (/Z on Windows), Handprint will not color
the text of messages it prints. (This latter option is useful when running
Handprint within subshells inside other environments such as Emacs.)

If given the -@ argument (/@ on Windows), this program will output a detailed
trace of what it is doing. The debug trace will be sent to the given
destination, which can be '-' to indicate console output, or a file path to
send the output to a file.

When -@ (or /@ on Windows) has been given, Handprint installs a signal
handler on signal SIGUSR1 that will drop Handprint into the pdb debugger if
the signal is sent to the running process. It's best to use -t 1 when
attempting to use a debugger because the subthreads will not stop running if
the signal is sent.

If given the -V option (/V on Windows), this program will print the version
and other information, and exit without doing anything else.

Return values
~~~~~~~~~~~~~

This program exits with a return code of 0 if no problems are encountered.
It returns a nonzero value otherwise. The following table lists the possible
return values:

  0 = success -- program completed normally
  1 = the user interrupted the program's execution
  2 = encountered a bad or missing value for an option
  3 = no network detected -- cannot proceed
  4 = file error -- encountered a problem with a file
  5 = server error -- encountered a problem with a server
  6 = an exception or fatal error occurred

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # NOTE(review): the docstring above ends with a heading for an arguments
    # summary, so it appears to double as the command-line help text; its
    # body is kept flush-left for that reason -- confirm against the CLI
    # framework in use before re-indenting it.
    #
    # Option parameters use placeholder defaults ('A', 'B', 'D', ..., 'OUT').
    # A value equal to its placeholder is treated below as "option not given".

    # Initial setup -----------------------------------------------------------

    pref = '/' if sys.platform.startswith('win') else '-'
    hint = f'(Hint: use {pref}h for help.)'
    ui = UI('Handprint', 'HANDwritten Page RecognitIoN Test',
            use_color=not no_color, be_quiet=quiet,
            show_banner=not (version or list or add_creds != 'A'))
    ui.start()

    if debug != 'OUT':
        if __debug__: set_debug(True, debug, extra='%(threadName)s')
        import faulthandler
        faulthandler.enable()
        if not sys.platform.startswith('win'):
            # Even with a different signal, I can't get this to work on Win.
            pdb_on_signal(signal.SIGUSR1)

    # Preprocess arguments and handle early exits -----------------------------

    if version:
        print_version()
        exit(int(ExitCode.success))
    if list:
        inform('Known services: [bold]{}[/]', ', '.join(services_list()))
        exit(int(ExitCode.success))
    if add_creds != 'A':
        service = add_creds.lower()
        if service not in services_list():
            alert(f'Unknown service: "{service}". {hint}')
            exit(int(ExitCode.bad_arg))
        if not files or len(files) > 1:
            alert(f'Option {pref}a requires one file. {hint}')
            exit(int(ExitCode.bad_arg))
        creds_file = files[0]
        if not readable(creds_file):
            alert(f'File not readable: {creds_file}')
            exit(int(ExitCode.file_error))
        Credentials.save_credentials(service, creds_file)
        inform(f'Saved credentials for service "{service}".')
        exit(int(ExitCode.success))

    # When no services are requested, all known services are used, and the
    # membership test below is then trivially satisfied.
    known_services = services_list()
    services = known_services if services == 'S' else services.lower().split(',')
    if not all(s in known_services for s in services):
        alert_fatal(f'"{services}" is/are not known services. {hint}')
        exit(int(ExitCode.bad_arg))

    display_given = display
    display = ['text'] if display == 'D' else display.lower().split(',')
    possible_displays = [
        'text', 'bb', 'bb-word', 'bb-words', 'bb-line', 'bb-lines',
        'bb-para', 'bb-paragraph', 'bb-paragraphs'
    ]
    if not all(d in possible_displays for d in display):
        alert_fatal(f'Unrecognized value for {pref}d: {display_given}. {hint}')
        exit(int(ExitCode.bad_arg))
    if no_grid and not extended and not compare:
        alert_fatal(
            f'{pref}G without {pref}e or {pref}c produces no output. {hint}')
        exit(int(ExitCode.bad_arg))

    # A leftover argument that looks like an option means it was not
    # recognized as one of the known options.
    bad = next((item for item in files if item.startswith('-')), None)
    if bad is not None:
        alert_fatal(f'Unrecognized option "{bad}" in arguments. {hint}')
        exit(int(ExitCode.bad_arg))
    if not files and from_file == 'F':
        alert_fatal(f'Need images or URLs to have something to do. {hint}')
        exit(int(ExitCode.bad_arg))
    if relaxed and not compare:
        warn(f'Option {pref}r without {pref}c has no effect. {hint}')
    if text_move != 'M' and ',' not in text_move:
        alert_fatal(
            f'Option {pref}m requires an argument of the form x,y. {hint}')
        exit(int(ExitCode.bad_arg))
    if text_size != 'Z' and not isint(text_size):
        alert_fatal(
            f'Option {pref}z requires an integer as an argument. {hint}')
        exit(int(ExitCode.bad_arg))
    if threads != 'T' and not isint(threads):
        # Reject a bad thread count here, as a bad option value, rather than
        # letting int(threads) raise inside the main body further below.
        alert_fatal(
            f'Option {pref}t requires an integer as an argument. {hint}')
        exit(int(ExitCode.bad_arg))
    if confidence != 'N':
        if not isreal(confidence):
            alert_fatal(
                f'Option {pref}n requires a real number as an argument. {hint}'
            )
            exit(int(ExitCode.bad_arg))
        confidence = fast_real(confidence)
        if not (0 <= confidence <= 1.0):
            alert_fatal(
                f'Option {pref}n requires a real number between 0 and 1.0. {hint}'
            )
            exit(int(ExitCode.bad_arg))

    # Do the real work --------------------------------------------------------

    if __debug__: log('=' * 8 + f' started {timestamp()} ' + '=' * 8)
    body = exception = None
    try:
        # NOTE(review): the default text_size is the string '12' while a
        # user-supplied value is converted to int -- confirm MainBody
        # accepts both forms.
        body = MainBody(
            files=files,
            from_file=None if from_file == 'F' else from_file,
            output_dir=None if output_dir == 'O' else output_dir,
            add_creds=None if add_creds == 'A' else add_creds,
            base_name='document' if base_name == 'B' else base_name,
            confidence=0 if confidence == 'N' else confidence,
            text_color='red' if text_color == 'X' else text_color.lower(),
            text_shift='0,0' if text_move == 'M' else text_move,
            text_size='12' if text_size == 'Z' else int(text_size),
            display=display,
            make_grid=not no_grid,
            extended=extended,
            reuse_json=reuse_json,
            services=services,
            threads=max(1, cpu_count() // 2 if threads == 'T' else int(threads)),
            compare='relaxed' if (compare and relaxed) else compare)
        config_interrupt(body.stop, UserCancelled(ExitCode.user_interrupt))
        body.run()
        exception = body.exception
    except Exception:
        # Keep the same (type, value, traceback) shape as body.exception is
        # treated as having below.
        exception = sys.exc_info()

    # Try to deal with exceptions gracefully ----------------------------------

    exit_code = ExitCode.success
    if exception:
        if exception[0] == CannotProceed:
            # The first argument of CannotProceed is used as the exit code.
            exit_code = exception[1].args[0]
        elif exception[0] in [KeyboardInterrupt, UserCancelled]:
            # "exception" is a tuple, so the class is its first element.
            if __debug__: log(f'received {exception[0].__name__}')
            warn('Interrupted.')
            exit_code = ExitCode.user_interrupt
        else:
            ex_class = exception[0]
            ex = exception[1]
            alert_fatal(f'An error occurred ({ex_class.__name__}): {str(ex)}')
            # Return a better error code for some common cases.
            if ex_class in [FileNotFoundError, FileExistsError, PermissionError]:
                exit_code = ExitCode.file_error
            else:
                exit_code = ExitCode.exception
            if __debug__:
                from traceback import format_exception
                details = ''.join(format_exception(*exception))
                logr(f'Exception: {str(ex)}\n{details}')
    else:
        inform('Done.')

    # And exit ----------------------------------------------------------------

    if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
    if exit_code == ExitCode.user_interrupt:
        # This is a sledgehammer, but it kills everything, including ongoing
        # network get/post. I have not found a more reliable way to interrupt.
        os._exit(int(exit_code))
    else:
        exit(int(exit_code))
def main(after_date='A', no_color=False, dest='D', doi_file='F',
         journal='J', list_dois=False, output_dir='O', preview=False,
         quiet=False, rep_file='R', rep_fmt='S', rep_title='T',
         version=False, no_check=False, no_zip=False, debug='OUT'):
    '''Create archives of journals suitable for sending to Portico or PMC.

The journal whose articles are to be archived must be indicated using the
required option -j (or /j on Windows). To list the currently-supported
journals, you can use a value of "list" to the -j option:

  pubarchiver -j list

Without any additional options, PubArchiver will contact the journal website
and either DataCite or Crossref, and create an archive containing articles
and their metadata for all articles published to date by the journal. The
options below can be used to select articles and influence other PubArchiver
behaviors.

Selecting a subset of articles
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If the option -a (or /a on Windows) is given, PubArchiver will download only
articles whose publication dates are AFTER the given date. Valid date
descriptors are those accepted by the Python dateparser library. Make sure to
enclose descriptions within single or double quotes. Examples:

  pubarchiver -a "2014-08-29" ....
  pubarchiver -a "12 Dec 2014" ....
  pubarchiver -a "July 4, 2013" ....
  pubarchiver -a "2 weeks ago" ....

The option -f (or /f on Windows) can be used with a value of a file path to
limit archiving to only the DOIs listed in the given file. The file format
must be a simple list of one DOI per line.

The selection by date performed by the -a option happens after reading the
list of articles using the -f option if present, and can be used to filter by
date the articles whose DOIs are provided.

Controlling the output
~~~~~~~~~~~~~~~~~~~~~~

The value supplied after the option -d (or /d on Windows) can be used to
select the destination where the publication archive is intended to be sent
after PubArchiver has done its work. The possible alternatives are "portico"
and "pmc"; Portico is assumed to be the default destination. This option
changes the structure and content of the archive created by PubArchiver.

PubArchiver will write its output to the directory indicated by the value of
the option -o (or /o on Windows). If no -o is given, the output will be
written to the current directory from which PubArchiver is being run. Each
article will be written to a subdirectory named after the DOI of the article.
The output for each article will consist of an XML metadata file describing
the article, the article itself in PDF format, and a subdirectory named
"jats" containing the article in JATS XML format along with any image that
may appear in the article. The image is always converted to uncompressed TIFF
format (because it is considered a good preservation format).

Unless the option -Z (or /Z on Windows) is given, the output will be archived
in ZIP format. If the output structure (as determined by the -d option) is
being generated for PMC, each article will be put into its own individual ZIP
archive; else, the default action is to put the collected output of all
articles into a single ZIP archive file.

Writing a report
~~~~~~~~~~~~~~~~

As it works, PubArchiver writes information to the terminal about the
articles it puts into the archive, including whether any problems are
encountered. To save this information to a file, use the option -r (or /r on
Windows), which will make PubArchiver write a report file. By default, the
format of the report file is CSV; the option -s (/s on Windows) can be used
to select "csv" or "html" (or both) as the format. The title of the report
will be based on the current date, unless the option -t (or /t on Windows) is
used to supply a different title.

Previewing the list of articles
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the option -p (or /p on Windows), pubarchiver will ONLY display a
list of articles it will archive and stop short of creating the archive. This
is useful to see what would be produced without actually doing it. However,
note that because it does not attempt to download the articles and associated
files, it will not be able to report on errors that might occur when not
operating in preview mode. Consequently, do not use the output of -p as a
prediction of eventual success or failure.

Return values
~~~~~~~~~~~~~

This program will exit with a return code of 0 if no problems are encountered
during execution. If a problem is encountered, it will return a nonzero
value. If no network is detected, it returns a value of 1; if the program is
interrupted (e.g., using control-c) it returns a value of 2; if it encounters
a fatal error, it returns a value of 3. If it encounters any non-fatal
problems (such as a missing PDF file or JATS validation error), it returns a
nonzero value equal to 100 + the number of articles that had failures.
Summarizing the possible return codes:

  0       = no errors were encountered -- success
  1       = no network detected -- cannot proceed
  2       = the user interrupted program execution
  3       = an exception or fatal error occurred
  100 + n = encountered non-fatal problems on a total of n articles

Additional command-line options
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The option -l (or /l on Windows) can be used to obtain a list of all DOIs for
all articles published by the selected journal.

Pubarchiver will print general informational messages as it works. To reduce
messages to only warnings and errors, use the option -q (or /q on Windows).
Output is color-coded by default unless the -C option (or /C on Windows) is
given; this option can be helpful if the color control signals create
problems for your terminal emulator.

If given the -@ option (/@ on Windows), this program will print a detailed
real-time log of what it is doing. The output will be sent to the given
destination, which can be '-' to indicate console output, or a file path to
send the output to a file. The output is mainly intended for debugging.

Pubarchiver always downloads the JATS XML version of articles from
micropublication.org (in addition to downloading the PDF version), and by
default, pubarchiver validates the XML content against the JATS DTD. To skip
the XML validation step, use the option -X (/X on Windows).

If given the -V option (/V on Windows), this program will print version
information and exit without doing anything else.

Command-line options summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # NOTE(review): the docstring above ends with a heading for an options
    # summary, so it appears to double as the command-line help text; its
    # body is kept flush-left for that reason -- confirm against the CLI
    # framework in use before re-indenting it.
    #
    # Option parameters use placeholder defaults ('A', 'D', 'F', ..., 'OUT').
    # A value equal to its placeholder is treated below as "option not given".

    if debug != 'OUT':
        if __debug__: set_debug(True, debug)
        import faulthandler
        faulthandler.enable()

    if version:
        print_version()
        exit(0)

    # The exit() calls inside the try block raise SystemExit, which is not a
    # subclass of Exception, so the handlers at the bottom do not intercept
    # them.
    try:
        if __debug__: log('=' * 8 + f' started {timestamp()}' + '=' * 8)
        ui = UI('PubArchiver', use_color=not no_color, be_quiet=quiet)
        ui.start()

        if journal == 'J':
            alert('Must specify a journal using the -j option.')
            print_supported_journals()
            exit(1)
        elif journal in ['list', 'help']:
            print_supported_journals()
            exit(0)
        elif journal not in journal_list():
            alert(f'Unrecognized journal "{journal}".')
            print_supported_journals()
            exit(1)

        if not network_available():
            alert('No network.')
            exit(1)

        handler = journal_handler(journal)
        if list_dois:
            inform(f'Asking {handler.name} server for a list of all DOIs ...')
            articles = handler.all_articles()
            if articles:
                inform(f'Got {len(articles)} DOIs from {handler.name}:')
                print('\n'.join(article.doi for article in articles))
            else:
                warn(f'Failed to get list of articles from {handler.name}')
            exit(0)

        # Any -d value other than "pmc" (in any letter case) selects Portico.
        body = MainBody(journal=handler,
                        dest='pmc' if dest.lower() == 'pmc' else 'portico',
                        doi_file=doi_file if doi_file != 'F' else None,
                        output_dir='.' if output_dir == 'O' else output_dir,
                        after=None if after_date == 'A' else after_date,
                        report_file=None if rep_file == 'R' else rep_file,
                        report_format='csv' if rep_fmt == 'S' else rep_fmt,
                        report_title=None if rep_title == 'T' else rep_title,
                        do_validate=handler.uses_jats and not no_check,
                        do_zip=not no_zip,
                        preview=preview)
        body.run()
        # Use log() here like every other debug message in this function.
        if __debug__: log(f'finished with {body.failures} failures')
        if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
        # NOTE(review): on POSIX systems the exit status is truncated to
        # 8 bits, so 100 + n is only reported faithfully while n <= 155.
        exit(100 + body.failures if body.failures > 0 else 0)
    except KeyboardInterrupt:
        warn('Quitting')
        if __debug__: log('returning with exit code 2')
        exit(2)
    except Exception as ex:
        import traceback
        if __debug__: log(f'{str(ex)}\n{traceback.format_exc()}')
        alert_fatal(str(ex))
        if __debug__: log('returning with exit code 3')
        exit(3)