Exemplo n.º 1
0
import xml.etree.cElementTree as ET

import psutil
from spreads.vendor.pathlib import Path

import spreads.util as util
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin

# Platform check and executable lookups are performed once, when the module
# is imported.
IS_WIN = util.is_os('windows')
CLI_BIN = util.find_in_path('scantailor-cli')
GUI_BIN = util.find_in_path('scantailor')

# Only the command-line binary is checked here; importing the module fails
# with a MissingDependencyException when it cannot be located.
if not CLI_BIN:
    raise util.MissingDependencyException(
        "Could not find executable `scantailor-cli`. "
        "Please install the appropriate package(s)!")

logger = logging.getLogger('spreadsplug.scantailor')


class ScanTailorPlugin(HookPlugin, ProcessHooksMixin):
    __name__ = 'scantailor'

    @classmethod
    def configuration_template(cls):
        conf = {
            'autopilot':
            OptionTemplate(value=True, docstring="Skip manual correction"),
            'rotate':
            OptionTemplate(value=False, docstring="Rotate pages"),
Exemplo n.º 2
0
    def process(self, pages, target_path):
        """ Run the most recent image of every page through ScanTailor.

        A temporary ScanTailor project file and a temporary output directory
        are created for the run. Unless the `autopilot` option is set, the
        ScanTailor GUI is launched on the project file first. Output images
        are then generated if the output directory does not already contain
        one `.tif` per page. Every generated image whose file stem matches
        the stem of a page's input image is copied into `target_path` and
        registered in that page's `processed_images` under this plugin's
        name. The temporary file and directory are removed afterwards, even
        if one of the steps raised.

        :param pages:       Pages to be processed
        :type pages:        list of :py:class:`spreads.workflow.Page`
        :param target_path: Base directory where processed images are to be
                            stored
        :type target_path:  :py:class:`pathlib.Path`
        :raises spreads.util.MissingDependencyException:
                            If manual correction was requested but the
                            `scantailor` executable could not be found
        """
        # Local import: needed to close the descriptor returned by mkstemp.
        import os

        autopilot = self.config['autopilot'].get(bool)
        if not autopilot and not util.find_in_path('scantailor'):
            raise util.MissingDependencyException(
                "Could not find executable `scantailor` in"
                " $PATH. Please install the appropriate"
                " package(s)!")

        # Create temporary files/directories.
        # mkstemp returns an *open* OS-level descriptor together with the
        # path. We only need the path, so the descriptor is closed right
        # away; leaving it open leaks it and keeps the file locked on
        # Windows, which prevents its removal later on.
        project_fd, project_name = tempfile.mkstemp(suffix='.ScanTailor')
        os.close(project_fd)
        projectfile = Path(project_name)
        out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

        try:
            # Map input paths to their pages so we can more easily associate
            # the generated output files with their pages later on
            in_paths = {}
            for page in pages:
                fpath = page.get_latest_processed(image_only=True)
                if fpath is None:
                    fpath = page.raw_image
                in_paths[unicode(fpath)] = page

            logger.info("Generating ScanTailor configuration")
            self._generate_configuration(sorted(in_paths.keys()), projectfile,
                                         out_dir)

            if not autopilot:
                logger.warn("If you are changing output settings (in the last "
                            "step, you *have* to run the last step from the GUI. "
                            "Due to a bug in ScanTailor, your settings would "
                            "otherwise be ignored.")
                # Give the user a moment to read the warning
                time.sleep(5)
                logger.info("Opening ScanTailor GUI for manual adjustment")
                # NOTE(review): the check below only makes sense if this call
                # blocks until the GUI is closed -- confirm against
                # util.get_subprocess.
                util.get_subprocess([GUI_BIN, unicode(projectfile)])

            # Check if the user already generated output files from the GUI
            num_generated = sum(1 for _ in out_dir.glob('*.tif'))
            if num_generated != len(pages):
                logger.info("Generating output images from ScanTailor "
                            "configuration.")
                self._generate_output(projectfile, out_dir, len(pages))

            # Index the pages by the stem of their input file once, instead
            # of re-parsing every input path for every output file. With
            # `setdefault` the first page seen for a stem wins, which is the
            # same page a linear search over `in_paths` would have found.
            pages_by_stem = {}
            for in_path, page in in_paths.iteritems():
                pages_by_stem.setdefault(Path(in_path).stem, page)

            # Associate generated output files with our pages
            for fname in out_dir.glob('*.tif'):
                matched_page = pages_by_stem.get(fname.stem)
                if matched_page is None:
                    logger.warn(
                        "Could not find page for output file {0}".format(
                            fname))
                    continue
                target_fname = target_path / fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                matched_page.processed_images[self.__name__] = target_fname
        finally:
            # Remove temporary files/directories
            shutil.rmtree(unicode(out_dir))
            # Removing the project file is best-effort: it has been observed
            # to fail on Windows while something still holds the file open.
            # OSError is caught instead of WindowsError, since the latter
            # name does not exist on other platforms (it is a subclass of
            # OSError on Windows), and the failure is logged rather than
            # silently dropped.
            try:
                projectfile.unlink()
            except OSError as e:
                logger.warn("Could not remove temporary project file %s: %s",
                            projectfile, e)
Exemplo n.º 3
0
import shutil
import subprocess
import tempfile
import time
import xml.etree.cElementTree as ET
from itertools import chain

import spreads.util as util
from spreads.config import OptionTemplate
from spreads.plugin import HookPlugin, ProcessHooksMixin
from spreads.vendor.pathlib import Path

# The Tesseract executable is looked up once at import time; importing the
# module fails with a MissingDependencyException when it cannot be located.
BIN = util.find_in_path('tesseract')
if not BIN:
    raise util.MissingDependencyException(
        "Could not find executable `tesseract`. "
        "Please install the appropriate package(s)!")

# Newer versions of Tesseract provide a flag to obtain a list of installed
# OCR languages. For older versions we have to read the directory
# containing the training data for languages instead.
try:
    AVAILABLE_LANGS = (util.get_subprocess(
        [BIN, "--list-langs"],
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE).communicate()[0].split("\n")[1:-1])
    # There should be at least a single language
    if not AVAILABLE_LANGS:
        raise ValueError()
except (subprocess.CalledProcessError, ValueError):
    AVAILABLE_LANGS = [