Пример #1
0
 def _resized_image(self, file):
     (max_width, max_height) = self._max_dimensions
     file_ext = filename_extension(file)
     name_tail = '.handprint' + file_ext
     new_file = file if name_tail in file else filename_basename(
         file) + name_tail
     if path.exists(new_file) and readable(new_file):
         from handprint.images import image_dimensions
         (image_width, image_height) = image_dimensions(new_file)
         if image_width < max_width and image_height < max_height:
             inform(f'Using reduced image found in {relative(new_file)}')
             return new_file
         else:
             # We found a "-reduced" file, perhaps from a previous run, but
             # for the current set of services, dimension are too large.
             if __debug__:
                 log('existing resized file larger than' +
                     f' {max_width}x{max_height}: {new_file}')
     inform(f'Dimensions too large; reducing dimensions: {relative(file)}')
     from handprint.images import reduced_image_dimensions
     (resized, error) = reduced_image_dimensions(file, new_file, max_width,
                                                 max_height)
     if error:
         alert(f'Failed to re-dimension {relative(file)}: {error}')
         return None
     return resized
Пример #2
0
 def _write_report(self, report_file, report_format, title, article_list):
     '''Write a report about the articles in "article_list".

     "report_format" is a comma-separated list of formats; "csv" and "html"
     are supported, and whitespace around each name is ignored.  One file is
     written per format, named after "report_file" with the format name as
     its extension.  "title" is used as the heading of the HTML report; if
     it is empty, a title based on the current timestamp is used.

     Raises ValueError if an unsupported format is requested.
     '''
     from html import escape

     def cell(value):
         # Render a missing field as empty (as csv.writer does) and escape
         # the text, because DOIs and URLs may contain "<", ">", "&" or '"'.
         return escape('' if value is None else str(value), quote=True)

     for fmt in report_format.split(','):
         fmt = fmt.strip()
         dest_file = filename_basename(report_file) + '.' + fmt
         if fmt == "csv":
             with open(dest_file, 'w', newline='') as file:
                 csvwriter = csv.writer(file, delimiter=',')
                 # Write the header through the writer so that it gets the
                 # same line terminator as the data rows.
                 csvwriter.writerow(['Status', 'DOI', 'Date', 'URL'])
                 for article in article_list:
                     csvwriter.writerow([article.status, article.doi,
                                         article.date, article.pdf])
         elif fmt == "html":
             with open(dest_file, 'w', newline='') as file:
                 file.write(
                     _HTML_REPORT_TOP.format(
                         title or 'Report for ' + timestamp()))
                 for article in article_list:
                     url = cell(article.pdf)
                     file.write('<tr>')
                     file.write('<td>' + cell(article.status) + '</td>')
                     file.write('<td>' + cell(article.doi) + '</td>')
                     file.write('<td>' + cell(article.date) + '</td>')
                     file.write('<td><a href="{0}">{0}</a></td>'.format(url))
                     file.write('</tr>')
                 file.write(_HTML_REPORT_BOTTOM)
         else:
             raise ValueError('Unsupported report format "' + fmt + '"')
Пример #3
0
 def _smaller_file(self, file):
     if not file:
         return None
     file_ext = filename_extension(file)
     name_tail = '.handprint' + file_ext
     new_file = file if name_tail in file else filename_basename(
         file) + name_tail
     if path.exists(new_file):
         from handprint.images import image_size
         if image_size(new_file) < self._max_size:
             inform(f'Reusing resized image found in {relative(new_file)}')
             return new_file
         else:
             # We found a ".handprint.ext" file, perhaps from a previous run,
             # but for the current set of services, it's larger than allowed.
             if __debug__:
                 log('existing resized file larger than' +
                     f' {self._max_size}b: {new_file}')
     inform(f'Size too large; reducing size: {relative(file)}')
     from handprint.images import reduced_image_size
     (resized, error) = reduced_image_size(file, new_file, self._max_size)
     if error:
         alert(f'Failed to resize {relative(file)}: {error}')
         return None
     return resized
Пример #4
0
 def _converted_file(self, file, to_format, dest_dir):
     '''Return the path of a copy of "file" converted to "to_format".

     The copy is placed in "dest_dir" and named after the base name of
     "file" followed by ".handprint." and the format.  If that file already
     exists, it is returned as-is.  Returns None if the conversion fails.
     '''
     stem = path.basename(filename_basename(file))
     target = path.join(dest_dir, stem + '.handprint.' + to_format)
     if path.exists(target):
         inform(f'Using existing converted image in {relative(target)}')
         return target

     inform(f'Converting to {to_format} format: {relative(file)}')
     from handprint.images import converted_image
     (converted, error) = converted_image(file, to_format, target)
     if not error:
         return converted
     alert(f'Failed to convert {relative(file)}: {error}')
     return None
Пример #5
0
def converted_image(orig_file, to_format, dest_file=None):
    '''Convert image file "orig_file" to format "to_format".

    The result is written to "dest_file".  If "dest_file" is None, the
    destination is filename_basename(orig_file) followed by "." and the
    canonical name of the format.  For PDF input, only the first page is
    converted.

    Returns a tuple of (new_file, error).  The value of 'error' will be None
    if no error occurred; otherwise, the value will be a string summarizing the
    error that occurred and 'new_file' will be set to None.
    '''
    dest_format = canonical_format_name(to_format)
    if dest_file is None:
        dest_file = filename_basename(orig_file) + '.' + dest_format
    # PIL is unable to read PDF files, so in that particular case, we have to
    # convert it using another tool.
    if filename_extension(orig_file) == '.pdf':
        import fitz
        # Report failures through the returned tuple, as the PIL branch does,
        # instead of letting exceptions escape to the caller.
        try:
            doc = fitz.open(orig_file)
        except Exception as ex:
            return (None, str(ex))
        try:
            if len(doc) < 1:
                if __debug__:
                    log(f'fitz says there is no image image in {relative(orig_file)}'
                        )
                return (None,
                        f'Cannot find an image inside {relative(orig_file)}')
            if len(doc) >= 2:
                if __debug__:
                    log(f'{orig_file} has > 1 images; using only 1st')
            # FIXME: if there's more than 1 image, we could extract the rest.
            # Doing so will require some architectural changes first.
            if __debug__:
                log(f'extracting 1st image from {relative(orig_file)}')
            page = doc[0]
            pix = page.getPixmap(alpha=False)
            if __debug__: log(f'writing {relative(dest_file)}')
            pix.writeImage(dest_file, dest_format)
            return (dest_file, None)
        except Exception as ex:
            return (None, str(ex))
        finally:
            # Release the document whether or not the extraction succeeded.
            doc.close()
    else:
        # When converting images, PIL may issue a DecompressionBombWarning but
        # it's not a concern in our application.  Ignore it.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                im = Image.open(orig_file)
                if __debug__: log(f'converting {relative(orig_file)} to RGB')
                # convert() returns a new image and leaves "im" unchanged, so
                # the result must be kept and be the object that is saved.
                converted = im.convert('RGB')
                if __debug__:
                    log(f'saving converted image to {relative(dest_file)}')
                if orig_file == dest_file:
                    im.seek(0)
                converted.save(dest_file, dest_format)
                return (dest_file, None)
            except Exception as ex:
                return (None, str(ex))
Пример #6
0
    def targets_from_arguments(self):
        '''Return the list of files and URLs to be processed.

        If self.from_file is set, the targets are the non-empty lines of that
        file.  Otherwise, they are gathered from self.files: URLs and files
        with an accepted extension are taken as-is, and directories are
        searched for files with accepted extensions.  Anything else produces
        a warning and is skipped.

        Names containing ".handprint" are dropped, as are files for which a
        version with the extension _OUTPUT_EXT is also among the targets.
        '''
        if self.from_file:
            if __debug__: log(f'reading {self.from_file}')
            # Use a context manager so the file is closed promptly.
            with open(self.from_file) as source:
                targets = [line for line in source.read().splitlines() if line]
        else:
            # Validator_collection takes a long time to load.  Delay loading
            # it until needed, so that overall application startup time is
            # faster.
            from validator_collection.checkers import is_url

            targets = []
            for item in self.files:
                if is_url(item):
                    targets.append(item)
                elif isfile(item) and filename_extension(
                        item) in ACCEPTED_FORMATS:
                    targets.append(item)
                elif isdir(item):
                    # It's a directory, so look for files within.
                    targets.extend(
                        files_in_directory(item, extensions=ACCEPTED_FORMATS))
                else:
                    warn(f'"{item}" not a file or directory')

        # Filter files created in past runs.
        targets = [name for name in targets if '.handprint' not in name]

        # If there is both a file in the format we generate and another
        # format of that file, ignore the other formats and just use ours.
        # A set makes each membership test constant-time.
        known = set(targets)
        return [item for item in targets
                if filename_extension(item) == _OUTPUT_EXT
                or filename_basename(item) + _OUTPUT_EXT not in known]
Пример #7
0
    def run_services(self, item, index, base_name):
        '''Run all requested services on the image indicated by "item", using
        "index" and "base_name" to construct a download copy of the item if
        it has to be downloaded from a URL first.

        The steps are: obtain a local file for the item, normalize it, send
        it to every service in self._services (one after another when
        self._num_threads is 1, otherwise through a thread pool), optionally
        create a grid image from the annotated results, and finally delete
        intermediate files.  Problems are reported with alert() or warn() and
        cause an early return.  The method does not return a value.
        '''
        # Shortcuts to make the code more readable.
        services = self._services

        inform(f'Starting on [white]{item}[/]')
        # A false item_file means there is no local file to work on.
        (item_file, item_fmt) = self._get(item, base_name, index)
        if not item_file:
            return

        # Output goes to the configured output directory if there is one,
        # else next to the input file.
        dest_dir = self._output_dir if self._output_dir else path.dirname(
            item_file)
        if not writable(dest_dir):
            alert(f'Cannot write output in {dest_dir}.')
            return

        # Normalize input image to the lowest common denominator.
        image = self._normalized(item, item_fmt, item_file, dest_dir)
        if not image.file:
            warn(f'Skipping {relative(item_file)}')
            return

        # Send the file to the services and get Result tuples back.
        # The futures are stored on the instance; presumably other code uses
        # them (e.g., to cancel pending work) -- TODO confirm.
        self._senders = []
        if self._num_threads == 1:
            # For 1 thread, avoid thread pool to make debugging easier.
            results = [self._send(image, s) for s in services]
        else:
            executor = ThreadPoolExecutor(max_workers=self._num_threads,
                                          thread_name_prefix='ServiceThread')
            for service in services:
                future = executor.submit(self._send, image, service)
                self._senders.append(future)
            # result() blocks until the corresponding call has finished.
            results = [future.result() for future in self._senders]

        # If a service failed for some reason (e.g., a network glitch), we
        # get no result back.  Remove empty results & go on with the rest.
        results = [x for x in results if x is not None]
        if not results:
            warn(f'Nothing to do for {item}')
            return

        # Create grid file if requested.
        if self._make_grid:
            base = path.basename(filename_basename(item_file))
            grid_file = path.realpath(
                path.join(dest_dir, base + '.handprint-all.png'))
            inform(f'Creating results grid image: {relative(grid_file)}')
            all_results = [r.annotated for r in results]
            # Limit each row to ceil(sqrt(count)) images.
            width = math.ceil(math.sqrt(len(all_results)))
            from handprint.images import create_image_grid
            create_image_grid(all_results, grid_file, max_horizontal=width)

        # Clean up after ourselves.
        if not self._extended_results:
            # Remove the temporary files and the per-service annotated images.
            for file in set(image.temp_files | {r.annotated for r in results}):
                if file and path.exists(file):
                    delete_existing(file)
        elif image.file != image.item_file:
            # Delete the resized file.  While it would help efficiency to
            # reuse it on subsequent runs, the risk is that those runs might
            # target different services and would end up using a different-
            # sized image than if we sized it appropriately for _this_ run.
            delete_existing(image.file)

        inform(f'Done with {relative(item)}')
Пример #8
0
    def _save_article_pmc(self, dest_dir, article, xml, zip_articles):
        '''Download the files of "article" into "dest_dir".

        The PDF and the JATS XML file are downloaded, the JATS file is
        validated if self.do_validate is set, and the article image (if any)
        is downloaded and converted to an uncompressed TIFF file.  When a
        step fails, a warning is issued and article.status is set to a value
        starting with "failed".  If "zip_articles" is true and no step
        failed, the files are put into a ZIP archive and then deleted.

        The "xml" parameter is not used by this method.
        '''
        inform('Writing ' + article.doi)
        to_archive = []

        pdf_file = pmc_pdf_filename(article, dest_dir)
        if __debug__: log(f'downloading PDF to {pdf_file}')
        if not download_file(article.pdf, pdf_file):
            warn(f'Could not download PDF file for {article.doi}')
            article.status = 'failed-pdf-download'
        to_archive.append(pdf_file)

        jats_file = jats_filename(article, dest_dir)
        if __debug__: log(f'downloading JATS XML to {jats_file}')
        jats_downloaded = download_file(article.jats, jats_file)
        if not jats_downloaded:
            warn(f'Could not download JATS file for {article.doi}')
            article.status = 'failed-jats-download'
        if not self.do_validate:
            if __debug__: log(f'skipping DTD validation of {jats_file}')
        elif jats_downloaded and not valid_xml(jats_file, self._dtd):
            # Validation is only attempted on a file that was downloaded, so
            # that a download failure is not masked by a validation failure.
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
        to_archive.append(jats_file)

        if article.image:
            # We need to store the image with the name that appears in the
            # JATS file. That requires a little extra work to extract.
            image_extension = filename_extension(article.image)
            image_file = image_filename(article, dest_dir, ext=image_extension)
            if __debug__: log(f'downloading image file to {image_file}')
            if download_file(article.image, image_file):
                with Image.open(image_file) as img:
                    converted_img = image_without_alpha(img)
                    converted_img = converted_img.convert('RGB')
                    if __debug__: log('converting image to TIFF format')
                    tiff_file = filename_basename(image_file) + '.tif'
                    # Using save() means that only the 1st frame of a
                    # multiframe image will be saved.
                    converted_img.save(tiff_file,
                                       dpi=_TIFF_DPI,
                                       compression=None,
                                       description=tiff_comments(article))
                    to_archive.append(tiff_file)
                # We keep only the uncompressed TIFF version.  If the
                # downloaded file has the same name as the TIFF file, it has
                # been overwritten by it and must not be deleted.
                if tiff_file != image_file:
                    if __debug__:
                        log(f'deleting original image file {image_file}')
                    delete_existing(image_file)
            else:
                warn(f'Failed to download image for {article.doi}')
                article.status = 'failed-image-download'
        else:
            if __debug__:
                log(f'skipping empty image file URL for {article.doi}')

        # Finally, put the files into their own zip archive.
        if zip_articles:
            if not article.status.startswith('failed'):
                zip_file = pmc_zip_filename(article, dest_dir)
                inform(f'Creating ZIP archive file "{zip_file}"')
                archive_files(zip_file, to_archive)
                if __debug__: log(f'verifying ZIP file {zip_file}')
                verify_archive(zip_file, 'zip')
                for file in to_archive:
                    if __debug__: log(f'deleting file {file}')
                    delete_existing(file)
            else:
                warn(
                    f'ZIP archive for {article.doi} not created due to errors')
Пример #9
0
    def _save_article_portico(self, dest_dir, article, xmldict):
        '''Write the files of "article" into its own directory in "dest_dir".

        The directory is named after article.basename.  The content of
        "xmldict" is written there as an XML file and the PDF is downloaded
        into it.  If self.journal.uses_jats is true, a "jats" subdirectory
        additionally receives the JATS XML file (validated if
        self.do_validate is set) and the article image converted to an
        uncompressed TIFF file.  When a step fails, a warning is issued and
        article.status is set to a value starting with "failed".
        '''
        article_dir = path.join(dest_dir, article.basename)
        jats_dir = path.join(article_dir, 'jats')
        # Create each directory independently, so that an already-existing
        # article directory does not prevent the creation of the JATS one.
        os.makedirs(article_dir, exist_ok=True)
        if self.journal.uses_jats:
            os.makedirs(jats_dir, exist_ok=True)
        inform('Writing ' + article.doi)
        xml_file = xml_filename(article, article_dir)
        with open(xml_file, 'w', encoding='utf8') as f:
            if __debug__: log(f'writing XML to {xml_file}')
            f.write(xmltodict.unparse(xmldict, pretty=True))

        pdf_file = pdf_filename(article, article_dir)
        if __debug__: log(f'downloading PDF to {pdf_file}')
        if not download_file(article.pdf, pdf_file):
            warn(f'Could not download PDF file for {article.doi}')
            article.status = 'failed-pdf-download'

        if not self.journal.uses_jats:
            # Nothing more to do.
            return

        jats_file = jats_filename(article, jats_dir)
        if __debug__: log(f'downloading JATS XML to {jats_file}')
        jats_downloaded = download_file(article.jats, jats_file)
        if not jats_downloaded:
            warn(f'Could not download JATS file for {article.doi}')
            article.status = 'failed-jats-download'
        if not self.do_validate:
            if __debug__: log(f'skipping DTD validation of {jats_file}')
        elif jats_downloaded and not valid_xml(jats_file, self._dtd):
            # Validation is only attempted on a file that was downloaded, so
            # that a download failure is not masked by a validation failure.
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'

        if not article.image:
            if __debug__: log(f'skipping empty image URL for {article.doi}')
            return

        # We need to store the image with the name that appears in the
        # JATS file. That requires a little extra work to extract.
        image_extension = filename_extension(article.image)
        image_file = image_filename(article, jats_dir, ext=image_extension)
        if __debug__: log(f'downloading image file to {image_file}')
        if not download_file(article.image, image_file):
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
            return

        with Image.open(image_file) as img:
            converted = image_without_alpha(img)
            converted = converted.convert('RGB')
            if __debug__: log('converting image to TIFF format')
            tiff_name = filename_basename(image_file) + '.tif'
            comments = tiff_comments(article, self.journal.name)
            # Using save() means only the 1st frame of a multiframe
            # image will be saved.
            converted.save(tiff_name,
                           compression=None,
                           dpi=_TIFF_DPI,
                           description=comments)
        # We keep only the uncompressed TIFF version.  If the downloaded file
        # has the same name as the TIFF file, it has been overwritten by it
        # and must not be deleted.
        if tiff_name != image_file:
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)