def create(cls, location, metadata=None, config=None):
    """ Create a new Workflow.

    :param location: Base directory that the workflow should be created in
    :type location: unicode or :py:class:`pathlib.Path`
    :param metadata: Initial metadata for workflow. Must at least contain
                     a `title` item.
    :type metadata: dict
    :param config: Initial configuration for workflow
    :type config: dict or :py:class:`spreads.config.Configuration`
    :return: The new instance
    :rtype: :py:class:`Workflow`
    """
    if not isinstance(location, Path):
        location = Path(location)
    # A title is mandatory, since the workflow directory is derived from it
    if metadata is None or 'title' not in metadata:
        raise ValidationError(
            metadata={'title': 'Please specify at least a title'})
    path = Path(location/util.slugify(metadata['title']))
    if path.exists():
        raise ValidationError(
            name="A workflow with that title already exists")
    return cls(path=path, config=config, metadata=metadata)
def process(self, pages, target_path):
    # TODO: This plugin should be 'output' only, since we ideally work
    # with fully binarized output images
    # Associate each input file with its page, so the generated hOCR
    # output can be matched back to the right page afterwards.
    in_paths = {}
    for page in pages:
        source = page.get_latest_processed(image_only=True)
        if source is None:
            source = page.raw_image
        in_paths[source] = page
    out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
    language = self.config["language"].get()
    logger.info("Performing OCR")
    logger.info("Language is \"{0}\"".format(language))
    self._perform_ocr(in_paths, out_dir, language)
    for ocr_file in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
        self._fix_hocr(ocr_file)
        stem = ocr_file.stem
        # Find the input page whose file stem matches this output file
        page = next((pg for pth, pg in in_paths.iteritems()
                     if pth.stem == stem), None)
        if page is None:
            logger.warn(
                "Could not find page for output file {0}".format(ocr_file))
            continue
        target_fname = target_path / ocr_file.name
        shutil.copyfile(unicode(ocr_file), unicode(target_fname))
        page.processed_images[self.__name__] = target_fname
def process(self, pages, target_path):
    # TODO: This plugin should be 'output' only, since we ideally work
    # with fully binarized output images
    # Build a mapping from input file to page so we can later associate
    # each generated output file with its page.
    inputs = {}
    for page in pages:
        img = page.get_latest_processed(image_only=True)
        if img is None:
            img = page.raw_image
        inputs[img] = page
    out_dir = Path(tempfile.mkdtemp(prefix='tess-out'))
    language = self.config["language"].get()
    logger.info("Performing OCR")
    logger.info("Language is \"{0}\"".format(language))
    self._perform_ocr(inputs, out_dir, language)
    for out_file in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')):
        self._fix_hocr(out_file)
        # Match the output file back to its page via the file stem
        matched = next((pg for pth, pg in inputs.iteritems()
                        if pth.stem == out_file.stem), None)
        if matched is None:
            logger.warn("Could not find page for output file {0}"
                        .format(out_file))
            continue
        destination = target_path/out_file.name
        shutil.copyfile(unicode(out_file), unicode(destination))
        matched.processed_images[self.__name__] = destination
def find_all(cls, location, key='slug', reload=False):
    """ List all workflows in the given location.

    :param location: Location where the workflows are located
    :type location: unicode or :py:class:`pathlib.Path`
    :param key: Attribute to use as key for returned dict
    :type key: str/unicode
    :param reload: Do not load workflows from cache
    :type reload: bool
    :return: All found workflows
    :rtype: dict
    """
    if not isinstance(location, Path):
        location = Path(location)
    if key not in ('slug', 'id'):
        raise ValueError("'key' must be one of ('id', 'slug')")
    if location in cls._cache and not reload:
        found = cls._cache[location]
    else:
        found = []
    for candidate in location.iterdir():
        # BUGFIX: `exists` was previously referenced without being
        # called (a bound method is always truthy) and the directory
        # check was done on `location` (the parent, always a directory)
        # instead of `candidate`, so *every* entry was treated as a
        # workflow.  A workflow directory is recognized by a BagIt
        # manifest or a 'raw' capture directory.
        is_workflow = (candidate.is_dir() and
                       ((candidate/'bagit.txt').exists() or
                        (candidate/'raw').exists()))
        if not is_workflow:
            continue
        # Only instantiate workflows that are not already cached
        if not next((wf for wf in found if wf.path == candidate), None):
            logging.debug(
                "Cache missed, instantiating workflow from {0}."
                .format(candidate))
            workflow = cls(candidate)
            found.append(workflow)
    cls._cache[location] = found
    return {getattr(wf, key): wf for wf in cls._cache[location]}
def find_all(cls, location, key='slug', reload=False):
    """ List all workflows in the given location.

    :param location: Location where the workflows are located
    :type location: unicode/pathlib.Path
    :param key: Attribute to use as key for returned dict
    :type key: str
    :param reload: Do not load workflows from cache
    :type reload: bool
    :return: All found workflows
    :rtype: dict
    """
    if not isinstance(location, Path):
        location = Path(location)
    if key not in ('slug', 'id'):
        raise ValueError("'key' must be one of ('id', 'slug')")
    if location in cls._cache and not reload:
        found = cls._cache[location]
    else:
        found = []
    for candidate in location.iterdir():
        # BUGFIX: `exists` was referenced without calling it (always
        # truthy) and `is_dir()` was checked on `location` rather than
        # `candidate`, so every directory entry passed as a workflow.
        is_workflow = (candidate.is_dir() and
                       ((candidate/'bagit.txt').exists() or
                        (candidate/'raw').exists()))
        if not is_workflow:
            continue
        # Skip candidates that are already in the cached list
        if not next((wf for wf in found if wf.path == candidate), None):
            logging.debug(
                "Cache missed, instantiating workflow from {0}."
                .format(candidate))
            workflow = cls(candidate)
            found.append(workflow)
    cls._cache[location] = found
    return {getattr(wf, key): wf for wf in cls._cache[location]}
def get_data_dir(create=False):
    """ Return the path of the directory where spreads stores its data.

    :param create: Create the directory if it does not exist yet
    :type create: bool
    :return: The path of the data directory
    :rtype: unicode
    """
    # NOTE(review): UNIX_DIR_VAR is XDG_DATA_DIRS while the fallback is
    # the XDG *config* directory; also XDG_DATA_DIRS may be a
    # colon-separated list, of which this takes the whole string --
    # preserved as-is, but worth confirming against the XDG spec.
    UNIX_DIR_VAR = 'XDG_DATA_DIRS'
    UNIX_DIR_FALLBACK = '~/.config'
    WINDOWS_DIR_VAR = 'APPDATA'
    WINDOWS_DIR_FALLBACK = '~\\AppData\\Roaming'
    MAC_DIR = '~/Library/Application Support'
    base_dir = None
    if platform.system() == 'Darwin':
        # BUGFIX: `exists` was referenced without being called, so the
        # check was always truthy and the Mac directory was never used.
        # The '~' also has to be expanded first, since a literal
        # '~/.config' path never exists.
        if Path(os.path.expanduser(UNIX_DIR_FALLBACK)).exists():
            base_dir = UNIX_DIR_FALLBACK
        else:
            base_dir = MAC_DIR
    elif platform.system() == 'Windows':
        if WINDOWS_DIR_VAR in os.environ:
            base_dir = os.environ[WINDOWS_DIR_VAR]
        else:
            base_dir = WINDOWS_DIR_FALLBACK
    else:
        if UNIX_DIR_VAR in os.environ:
            base_dir = os.environ[UNIX_DIR_VAR]
        else:
            base_dir = UNIX_DIR_FALLBACK
    # BUGFIX: expand '~' before building the path, otherwise mkdir()
    # would create a literal '~' directory in the working directory.
    # expanduser is a no-op for absolute paths from the environment.
    app_path = Path(os.path.expanduser(base_dir))/'spreads'
    if create and not app_path.exists():
        app_path.mkdir()
    return unicode(app_path)
def test_fix_hocr(plugin, tmpdir):
    # Work on a copy so the fixture file itself is untouched
    target = tmpdir.join('test.html')
    shutil.copyfile('./tests/data/000.hocr', unicode(target))
    fpath = Path(unicode(target))
    plugin._fix_hocr(fpath)
    with fpath.open('r') as fp:
        content = fp.read()
    # After fixing, no doubled empty word-spans should remain
    pattern = r'(<span[^>]*>(<strong>)? +(</strong>)?</span> *){2}'
    assert not re.findall(pattern, content)
def test_split_configuration(plugin, tmpdir):
    # Pretend we have four cores, so the configuration is split four ways
    with mock.patch('spreadsplug.scantailor.multiprocessing.cpu_count') as cnt:
        cnt.return_value = 4
        splitfiles = plugin._split_configuration(
            Path('./tests/data/test.scanTailor'), Path(unicode(tmpdir)))
    assert len(splitfiles) == 4
    tree = ET.parse(unicode(splitfiles[0]))
    # Every relevant section of the split file should hold 7 entries
    for tag in ('files', 'images', 'pages', 'file-name-disambiguation'):
        assert len(tree.find('./{0}'.format(tag))) == 7
def test_generate_configuration(popen, proc, plugin):
    proc.return_value.is_running.return_value = False
    in_paths = ['{0:03}.jpg'.format(idx) for idx in xrange(5)]
    plugin._generate_configuration(in_paths, Path('/tmp/foo.st'),
                                   Path('/tmp/out'))
    # Every input file must appear in the scantailor-cli invocation
    cli_args = popen.call_args[0][0]
    missing = [fp for fp in in_paths if fp not in cli_args]
    assert not missing
def test_generate_configuration(popen, proc, plugin):
    proc.return_value.is_running.return_value = False
    # TODO: Setup up some config variables
    raw_dir = mock.MagicMock(wraps=Path('/tmp/raw'))
    raw_dir.iterdir.return_value = [raw_dir/name
                                    for name in ("foo.jpg", "bar.jpg")]
    plugin._generate_configuration(Path('/tmp/foo.st'), raw_dir,
                                   Path('/tmp/out'))
def test_perform_replacements(plugin, tmpdir):
    # Copy the fixture into the temporary directory before mutating it
    copy_target = tmpdir.join('test.html')
    shutil.copyfile('./tests/data/000.hocr', unicode(copy_target))
    fpath = Path(unicode(copy_target))
    plugin._perform_replacements(fpath)
    with fpath.open('r') as fp:
        data = fp.read()
    # No consecutive empty (possibly bold) spans may survive
    assert not re.findall(
        r'(<span[^>]*>(<strong>)? +(</strong>)?</span> *){2}', data)
def create(cls, location, metadata=None, config=None):
    """ Create a new Workflow.

    :param location: Base directory that the workflow should be created in
    :type location: unicode or :py:class:`pathlib.Path`
    :param metadata: Initial metadata for workflow; must at least contain
                     a `title` item
    :type metadata: dict
    :param config: Initial configuration for workflow
    :type config: dict or :py:class:`spreads.config.Configuration`
    :return: The new instance
    :rtype: :py:class:`Workflow`
    :raises ValidationError: If no title was given or a workflow with the
                             same title already exists
    """
    if not isinstance(location, Path):
        location = Path(location)
    # Idiom fix: `x not in y` instead of `not x in y`
    if metadata is None or 'title' not in metadata:
        raise ValidationError(
            metadata={'title': 'Please specify at least a title'})
    path = Path(location/util.slugify(metadata['title']))
    if path.exists():
        raise ValidationError(
            name="A workflow with that title already exists")
    wf = cls(path=path, config=config, metadata=metadata)
    return wf
def build_msi(bitness=32):
    """ Build a Windows installer for spreads via pynsist.

    :param bitness: Target architecture, 32 or 64 bit
    :type bitness: int
    """
    # Remove stale egg-info so it does not leak into the installer
    egg_path = Path('spreads.egg-info')
    if egg_path.exists():
        shutil.rmtree(unicode(egg_path))
    build_path = Path('build')
    if not build_path.exists():
        build_path.mkdir()
    # Start from a clean package directory for pynsist
    pkg_dir = build_path / 'pynsist_pkgs'
    if pkg_dir.exists():
        shutil.rmtree(unicode(pkg_dir))
    pkg_dir.mkdir()
    # Unpack native (binary) dependencies for the target architecture
    for pkg in BINARY_PACKAGES.itervalues():
        arch = 'win32' if bitness == 32 else 'win-amd64'
        extract_native_pkg(pkg.format(arch=arch), pkg_dir)
    # Copy distribution metadata for all pure-python dependencies
    for pkg in (x.project_name for x in SOURCE_PACKAGES
                if x.project_name is not None):
        copy_info(pkg, pkg_dir)
    icon = os.path.abspath("spreads.ico")
    extra_files = [(unicode(
        (Path('win_deps') / 'extra' /
         x.format(arch='.amd64' if bitness == 64 else '')).absolute()),
        None) for x in EXTRA_FILES]
    nsi_template = os.path.abspath("template.nsi")

    # NOTE: We need to remove the working directory from sys.path to force
    # pynsist to copy all of our modules, including 'spreads' and 'spreadsplug'
    # from the site-packages. Additionally, we need to change into the
    # build directory.
    if os.getcwd() in sys.path:
        sys.path.remove(os.getcwd())
    os.chdir(unicode(build_path))
    builder = InstallerBuilder(
        appname="spreads",
        version=spreads.__version__,
        packages=[x.module_name for x in SOURCE_PACKAGES],
        extra_files=extra_files,
        py_version="2.7.6",
        py_bitness=bitness,
        build_dir='msi{0}'.format(bitness),
        installer_name=None,
        nsi_template=nsi_template,
        icon=icon,
        shortcuts={
            'Configure spreads': {
                'entry_point': 'spreads.main:run_config_windows',
                'icon': icon,
                'console': False
            },
            'Spreads Web Service': {
                'entry_point': 'spreads.main:run_service_windows',
                'icon': icon,
                'console': False
            }
        })
    builder.run()
    os.chdir('..')
def process(self, pages, target_path):
    """ Run the pages' most recent images through ScanTailor and store
    the processed output images in `target_path`.

    :param pages: Pages to be processed
    :type pages: list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where processed images are to be
                        stored
    :type target_path: :py:class:`pathlib.Path`
    """
    autopilot = self.config['autopilot'].get(bool)
    # The GUI binary is only needed for manual (non-autopilot) operation
    if not autopilot and not find_in_path('scantailor'):
        raise MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")
    # Create temporary files/directories
    projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[unicode(fpath)] = page

    logger.info("Generating ScanTailor configuration")
    self._generate_configuration(sorted(in_paths.keys()),
                                 projectfile, out_dir)

    if not autopilot:
        logger.warn("If you are changing output settings (in the last "
                    "step, you *have* to run the last step from the GUI. "
                    "Due to a bug in ScanTailor, your settings would "
                    "otherwise be ignored.")
        time.sleep(5)
        logger.info("Opening ScanTailor GUI for manual adjustment")
        subprocess.call([find_in_path('scantailor'), unicode(projectfile)])
    # Check if the user already generated output files from the GUI
    if not sum(1 for x in out_dir.glob('*.tif')) == len(pages):
        logger.info("Generating output images from ScanTailor "
                    "configuration.")
        self._generate_output(projectfile, out_dir, len(pages))

    # Associate generated output files with our pages (matched by stem)
    for fname in out_dir.glob('*.tif'):
        out_stem = fname.stem
        for in_path, page in in_paths.iteritems():
            if Path(in_path).stem == out_stem:
                target_fname = target_path/fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
                break
        else:
            # for/else: no input page matched this output file
            logger.warn("Could not find page for output file {0}"
                        .format(fname))
    # Remove temporary files/directories
    shutil.rmtree(unicode(out_dir))
    projectfile.unlink()
def test_output(plugin, tmpdir):
    # 20 pages, all sharing the same hOCR fixture
    dummy_pages = [
        Page(Path('000.jpg'), idx,
             processed_images={'tesseract': Path('./tests/data/000.hocr')})
        for idx in xrange(20)]
    plugin.output(dummy_pages, tmpdir, None, None)
    assert tmpdir.join('text.html').exists()
    tree = ET.parse(unicode(tmpdir.join('text.html')))
    # Element counts scale linearly with the number of pages
    assert len(tree.findall('.//span[@class="ocrx_word"]')) == 20 * 201
    assert len(tree.findall('.//span[@class="ocr_line"]')) == 20 * 26
    assert len(tree.findall('.//p[@class="ocr_par"]')) == 20 * 4
    assert len(tree.findall('.//div[@class="ocr_page"]')) == 20
def extract_native_pkg(fname, pkg_dir):
    """ Unpack a native wheel/installer zip and copy its PLATLIB/PURELIB
    contents into `pkg_dir`.
    """
    archive = zipfile.ZipFile(unicode(Path('win_deps') / 'python' / fname))
    tmpdir = Path(tempfile.mkdtemp())
    archive.extractall(unicode(tmpdir))
    entries = []
    for subdir in ('PLATLIB', 'PURELIB'):
        libdir = tmpdir / subdir
        if libdir.exists():
            entries.extend(libdir.iterdir())
    for entry in entries:
        dest = unicode(pkg_dir / entry.name)
        if entry.is_dir():
            shutil.copytree(unicode(entry), dest)
        else:
            shutil.copy2(unicode(entry), dest)
    shutil.rmtree(unicode(tmpdir))
def yield_devices(cls, config):
    """ Search for usable devices, yield one at a time

    :param config: spreads configuration
    :type config: spreads.confit.ConfigView
    """
    SPECIAL_CASES = {  # noqa
        # (idVendor, idProduct): SpecialClass
        (0x4a9, 0x31ef): QualityFix,  # not r47, but has the same bug
        (0x4a9, 0x3218): QualityFix,
        (0x4a9, 0x3223): A3300,
        (0x4a9, 0x3224): QualityFix,
        (0x4a9, 0x3225): QualityFix,
        (0x4a9, 0x3226): QualityFix,
        (0x4a9, 0x3227): QualityFix,
        (0x4a9, 0x3228): QualityFix,
        (0x4a9, 0x3229): QualityFix,
        (0x4a9, 0x322a): QualityFix,
        (0x4a9, 0x322b): QualityFix,
        (0x4a9, 0x322c): QualityFix,
    }
    # Check if we can find the chdkptp executable
    chdkptp_path = Path(config["chdkptp_path"].get(unicode))
    if not chdkptp_path.exists() or not (chdkptp_path / 'chdkptp').exists():
        raise MissingDependencyException(
            "Could not find executable `chdkptp`. Please make sure that "
            "the `chdkptp_path` setting in your `chdkcamera` "
            "configuration points to "
            "a directory containing chdkptp "
            "and its libraries. Current setting is `{0}`".format(
                chdkptp_path))

    # only match ptp devices in find_all
    def is_ptp(dev):
        # PTP still-image devices expose interface class 6, subclass 1
        for cfg in dev:
            if usb.util.find_descriptor(cfg, bInterfaceClass=6,
                                        bInterfaceSubClass=1):
                return True

    for dev in usb.core.find(find_all=True, custom_match=is_ptp):
        ids = (dev.idVendor, dev.idProduct)
        if ids in SPECIAL_CASES:
            # Devices with known quirks get their specialized driver class
            yield SPECIAL_CASES[ids](config, dev)
        else:
            yield cls(config, dev)
def find_by_slug(cls, location, slug):
    """ Look up a workflow by its slug.

    :param location: Base directory that contains the workflows
    :param slug: Slug of the workflow to find
    :return: The matching workflow, or None if there is no match
    """
    if not isinstance(location, Path):
        location = Path(location)
    # dict.get gives us the None-on-miss behaviour directly
    return cls.find_all(location, key='slug').get(slug)
def find_by_id(cls, location, id):
    """ Look up a workflow by its unique id.

    :param location: Base directory that contains the workflows
    :param id: Id of the workflow to find
    :return: The matching workflow, or None if there is no match
    """
    if not isinstance(location, Path):
        location = Path(location)
    # dict.get gives us the None-on-miss behaviour directly
    return cls.find_all(location, key='id').get(id)
def output(self, pages, target_path, metadata, table_of_contents):
    """ Bundle the most recent image of every page into a DJVU file
    (`book.djvu` inside `target_path`).

    :param pages: Pages to bundle
    :type pages: list of :py:class:`spreads.workflow.Page`
    :param target_path: Directory the DJVU file is written to
    :type target_path: :py:class:`pathlib.Path`
    :param metadata: Metadata to include in DJVU file (see TODO below)
    :type metadata: :py:class:`spreads.metadata.Metadata`
    :param table_of_contents: Table of contents to include in DJVU file
                              (see TODO below)
    :type table_of_contents: list of :py:class:`TocEntry`
    """
    logger.info("Assembling DJVU.")
    workdir = Path(tempfile.mkdtemp())
    # Symlink every page's newest image into the working directory
    for page in pages:
        image = page.get_latest_processed(image_only=True)
        if image is None:
            image = page.raw_image
        (workdir / image.name).symlink_to(image)
    # TODO: Add metadata
    # TODO: Add table of contents
    djvu_file = target_path / "book.djvu"
    cmd = ["djvubind", unicode(workdir), '--no-ocr']
    logger.debug("Running " + " ".join(cmd))
    subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    # djvubind writes 'book.djvu' into the current working directory
    os.rename("book.djvu", unicode(djvu_file))
    shutil.rmtree(unicode(workdir))
def cfg_path(self):
    """ Path to YAML file of the user-specific configuration.

    :returns: Path
    :rtype: :py:class:`pathlib.Path`
    """
    config_dir = Path(self._config.config_dir())
    return config_dir / confit.CONFIG_FILENAME
def __init__(self, path, config=None, step=None, step_done=None, id=None):
    """ Initialize a workflow rooted at `path`, creating the directory
    if it does not exist yet.

    :param path: Base directory of the workflow
    :type path: unicode or :py:class:`pathlib.Path`
    :param config: Initial configuration; a confit ConfigView or a
                   :py:class:`Configuration` are used directly, anything
                   else is passed to :py:meth:`_load_config`
    :param step: Current processing step of the workflow
    :param step_done: Whether the current step is finished
    :param id: Unique identifier for the workflow
    """
    self._logger = logging.getLogger('Workflow')
    self._logger.debug("Initializing workflow {0}".format(path))
    self.step = step
    self.step_done = step_done
    if not isinstance(path, Path):
        path = Path(path)
    self.path = path
    if not self.path.exists():
        self.path.mkdir()
    self.id = id
    # Number of pages shot so far, derived from images already on disk
    if self.images:
        self.pages_shot = len(self.images)
    else:
        self.pages_shot = 0
    # See if supplied `config` is already a valid ConfigView object
    if isinstance(config, confit.ConfigView):
        self.config = config
    elif isinstance(config, Configuration):
        self.config = config.as_view()
    else:
        self.config = self._load_config(config)
    # Serializes access to the capture devices
    self._capture_lock = threading.RLock()
    self.active = False
    # Lazily-populated internals
    self._devices = None
    self._pluginmanager = None
    # Instantiate plugins
    self.plugins = [
        cls(self.config) for cls in plugin.get_plugins(
            *self.config["plugins"].get()).values()
    ]
def get_workflow(workflow_id):
    """ Return the workflow with the given id, preferring the in-memory
    cache and falling back to the database.

    :param workflow_id: Id of the workflow to load
    :return: The workflow, or None if no workflow with that id exists
    """
    # See if the workflow is among our cached instances
    if workflow_id in WorkflowCache:
        return WorkflowCache[workflow_id]
    logger.debug("Loading workflow {0} from database".format(workflow_id))
    with open_connection() as con:
        db_data = con.execute("SELECT * FROM workflow WHERE workflow.id=?",
                              (workflow_id, )).fetchone()
    if db_data is None:
        logger.warn("Workflow {0} was not found.".format(workflow_id))
        return None
    db_workflow = DbWorkflow(*db_data)

    # Try to load configuration from database
    if db_workflow.config is not None:
        config = json.loads(db_workflow.config)
    else:
        config = None
    workflow = Workflow(
        path=Path(app.config['base_path']) / db_workflow.name,
        config=config,
        step=db_workflow.step,
        step_done=bool(db_workflow.step_done),
        id=workflow_id)
    # Cache the freshly loaded instance for subsequent lookups
    WorkflowCache[workflow_id] = workflow
    return workflow
def create_workflow():
    """ Create a new workflow.

    Payload should be a JSON object. The only required attribute is 'name'
    for the desired workflow name. Optionally, 'config' can be set to a
    configuration object in the form "plugin_name: { setting: value, ...}".

    Returns the newly created workflow as a JSON object.
    """
    data = json.loads(request.data)
    path = Path(app.config['base_path']) / unicode(data['name'])

    # Setup default configuration
    config = app.config['default_config']
    # Overlay user-supplied values, if existent
    user_config = data.get('config', None)
    if user_config is not None:
        config = config.with_overlay(user_config)

    workflow = Workflow(config=config, path=path,
                        step=data.get('step', None),
                        step_done=data.get('step_done', None))
    try:
        workflow.id = persistence.save_workflow(workflow)
    except persistence.ValidationError as e:
        # Report validation problems to the client as a 400 response
        return make_response(json.dumps(dict(errors=e.errors)), 400,
                             {'Content-Type': 'application/json'})
    return make_response(json.dumps(workflow), 200,
                         {'Content-Type': 'application/json'})
def _generate_output(self, projectfile, out_dir, num_pages):
    """ Run last step for the project file and keep track of the progress
        by emitting :py:attr:`on_progressed` signals.

    :param projectfile: Path ScanTailor configuration file
    :type projectfile: :py:class:`pathlib.Path`
    :param out_dir: Output directory for processed files
    :type out_dir: :py:class:`pathlib.Path`
    :param num_pages: Total number of pages to process
    :type num_pages: int
    """
    logger.debug("Generating output...")
    temp_dir = Path(tempfile.mkdtemp(prefix="spreads."))
    # One subprocess per split of the configuration file
    split_config = self._split_configuration(projectfile, temp_dir)
    logger.debug("Launching those subprocesses!")
    processes = [
        util.get_subprocess([
            CLI_BIN, '--start-filter=6', unicode(cfgfile), unicode(out_dir)
        ]) for cfgfile in split_config
    ]
    last_count = 0
    while processes:
        # Progress is tracked by counting output files; output
        # generation is reported as the second half of the overall
        # pipeline, hence the 0.5 offset and the halved fraction.
        recent_count = sum(1 for x in out_dir.glob('*.tif'))
        if recent_count > last_count:
            progress = 0.5 + (float(recent_count) / num_pages) / 2
            self.on_progressed.send(self, progress=progress)
            last_count = recent_count
        # Reap finished subprocesses; iterate over a copy since we
        # remove elements while iterating
        for p in processes[:]:
            if p.poll() is not None:
                processes.remove(p)
        time.sleep(.01)
    shutil.rmtree(unicode(temp_dir))
def test_generate_configuration_noenhanced(popen, proc, config, pluginclass):
    proc.return_value.is_running.return_value = False
    # TODO: Setup up some config variables
    usage_line = ("scantailor-cli [options] <image, image, ...>"
                  " <output_directory>")
    with mock.patch('subprocess.check_output') as mock_co:
        # Seven blank lines, then the plain (non-enhanced) usage string
        mock_co.return_value = "\n" * 7 + usage_line
        plugin = pluginclass(config)
    imgdir = mock.MagicMock(wraps=Path('/tmp/raw'))
    imgs = [imgdir/"foo.jpg", imgdir/"bar.jpg"]
    imgdir.iterdir.return_value = imgs
    plugin._generate_configuration(Path('/tmp/foo.st'), imgdir,
                                   Path('/tmp/out'))
    assert (unicode(imgs[0]) in popen.call_args[0][0])
def test_capture_noprepare(jpeg, camera):
    # First _run call fails because the device is not in record mode;
    # capture() is expected to prepare and retry once.
    camera._run.side_effect = (
        chdkcamera.CHDKPTPException('dev not in rec mode'), None)
    with mock.patch.object(camera, 'prepare_capture') as prepare:
        camera.capture(Path('/tmp/000.jpg'))
    assert prepare.call_count == 1
    assert camera._run.call_count == 2
def yield_devices(cls, config):
    """ Search for usable devices, yield one at a time

    :param config: spreads configuration
    :type config: spreads.confit.ConfigView
    """
    SPECIAL_CASES = {
        # (idVendor, idProduct): SpecialClass
        (0x4a9, 0x31ef): QualityFix,  # not r47, but has the same bug
        (0x4a9, 0x3218): QualityFix,
        (0x4a9, 0x3223): QualityFix,
        (0x4a9, 0x3224): QualityFix,
        (0x4a9, 0x3225): QualityFix,
        (0x4a9, 0x3226): QualityFix,
        (0x4a9, 0x3227): QualityFix,
        (0x4a9, 0x3228): QualityFix,
        (0x4a9, 0x3229): QualityFix,
        (0x4a9, 0x322a): A2200,
        (0x4a9, 0x322b): QualityFix,
        (0x4a9, 0x322c): QualityFix,
    }
    # Check if we can find the chdkptp executable
    chdkptp_path = Path(config["chdkptp_path"].get(unicode))
    if not chdkptp_path.exists() or not (chdkptp_path/'chdkptp').exists():
        raise MissingDependencyException(
            "Could not find executable `chdkptp`. Please make sure that "
            "the `chdkptp_path` setting in your `chdkcamera` "
            "configuration points to "
            "a directory containing chdkptp "
            "and its libraries. Current setting is `{0}`"
            .format(chdkptp_path)
        )

    # only match ptp devices in find_all
    def is_ptp(dev):
        # PTP still-image devices expose interface class 6, subclass 1
        for cfg in dev:
            if usb.util.find_descriptor(cfg, bInterfaceClass=6,
                                        bInterfaceSubClass=1):
                return True

    for dev in usb.core.find(find_all=True, custom_match=is_ptp):
        ids = (dev.idVendor, dev.idProduct)
        if ids in SPECIAL_CASES:
            # Devices with known quirks get their specialized driver class
            yield SPECIAL_CASES[ids](config, dev)
        else:
            yield cls(config, dev)
def last_modified(self):
    """ Return the time of the workflow's last structural modification.

    Any relevant change to the workflow's structure alters at least one
    file hash, so the newer mtime of the two BagIt checksum manifests
    serves as the workflow's modification time.

    :rtype: :py:class:`datetime.datetime`
    """
    manifests = ('manifest-md5.txt', 'tagmanifest-md5.txt')
    newest = max((self.path / fname).stat().st_mtime
                 for fname in manifests)
    return datetime.fromtimestamp(newest)
def test_capture_raw(jpeg, camera):
    jpeg.return_value = mock.Mock()
    camera.config['shoot_raw'] = True
    camera.capture(Path('/tmp/000.dng'))
    run_cmd = camera._run.call_args_list[0][0][0]
    assert camera._run.call_count == 1
    # DNG flag must be passed and the extension stripped from the target
    assert "-dng " in run_cmd
    assert run_cmd.endswith('"/tmp/000"')
    assert jpeg.called_once_with('/tmp/000.dng')
def transfer_to_stick(wf_id, base_path):
    """ Transfer a workflow's files to a removable storage device,
    emitting progress signals along the way.

    :param wf_id: Id of the workflow to transfer
    :param base_path: Base directory that contains the workflows
    """
    workflow = Workflow.find_by_id(base_path, wf_id)
    stick = find_stick()
    files = list(workflow.path.rglob('*'))
    num_files = len(files)
    # Filter out problematic characters
    clean_name = (workflow.path.name.replace(':', '_')
                  .replace('/', '_'))
    workflow.status['step'] = 'transfer'
    try:
        if IS_WIN:
            target_path = Path(stick)/clean_name
        else:
            mount = stick.get_dbus_method(
                "FilesystemMount",
                dbus_interface="org.freedesktop.UDisks.Device")
            mount_point = mount('', [])
            target_path = Path(mount_point)/clean_name
        if target_path.exists():
            shutil.rmtree(unicode(target_path))
        target_path.mkdir()
        signals['transfer:started'].send(workflow)
        for num, path in enumerate(files, 1):
            # BUGFIX: use float division -- under Python 2,
            # `num/num_files` is integer division and would report zero
            # progress until the very last file.
            progress = (num / float(num_files)) * 0.79
            signals['transfer:progressed'].send(
                workflow, progress=progress, status=path.name)
            workflow.status['step_done'] = progress
            target = target_path/path.relative_to(workflow.path)
            if path.is_dir():
                target.mkdir()
            else:
                shutil.copyfile(unicode(path), unicode(target))
    finally:
        if 'mount_point' in locals():
            signals['transfer:progressed'].send(workflow, progress=0.8,
                                                status="Syncing...")
            workflow.status['step_done'] = 0.8
            unmount = stick.get_dbus_method(
                "FilesystemUnmount",
                dbus_interface="org.freedesktop.UDisks.Device")
            unmount([], timeout=1e6)  # dbus-python doesn't know an infinite
                                      # timeout... unmounting sometimes
                                      # takes a long time, since the device
                                      # has to be synced.
        signals['transfer:completed'].send(workflow)
        workflow.status['step'] = None
def test_process():
    # The plugin has no configuration of its own, so a plain dict stands
    # in for confit.Configuration
    config = {'autorotate': None}
    pages = [Page(Path('{0:03}.jpg'.format(idx))) for idx in xrange(4)]
    target_path = Path('/tmp/dummy')
    with mock.patch('spreadsplug.autorotate.ProcessPoolExecutor') as mockctx:
        plugin = autorotate.AutoRotatePlugin(config)
        pool = mockctx.return_value.__enter__.return_value
        plugin.process(pages, target_path)
    # The text file should not have been passed
    assert pool.submit.call_count == 4
    # Only the second positional argument to submit matters; the first
    # is the function to call
    submitted = sorted(call[0][1] for call in pool.submit.call_args_list)
    assert sorted(unicode(p.raw_image) for p in pages) == submitted
def test_capture(jpeg, camera):
    jpeg.return_value = mock.Mock()
    camera.capture(Path('/tmp/000.jpg'))
    run_cmd = camera._run.call_args_list[0][0][0]
    assert camera._run.call_count == 1
    # The remoteshoot command is invoked with the extension-less target
    assert run_cmd.startswith('remoteshoot')
    assert run_cmd.endswith('"/tmp/000"')
    assert jpeg.called_once_with('/tmp/000.jpg')
    assert jpeg.return_value.exif_orientation == 6
    assert jpeg.return_value.save.called_once_with('/tmp/000.jpg')
def output(self, pages, target_path, metadata, table_of_contents):
    """ Bundle the most recent image of each page into a PDF via
    `pdfbeads`, using tesseract hOCR output for the text layer when
    available.  The result is written to `target_path`/book.pdf.

    :param pages: Pages to bundle
    :param target_path: Directory the PDF is written to
    :type target_path: :py:class:`pathlib.Path`
    :param metadata: Metadata for the PDF (currently unused, see TODO)
    :param table_of_contents: Table of contents (currently unused, see
                              TODO)
    """
    logger.info("Assembling PDF.")
    tmpdir = Path(tempfile.mkdtemp())
    # NOTE: pdfbeads only finds *html files for the text layer in the
    # working directory, so we have to chdir() into it
    old_path = os.path.abspath(os.path.curdir)
    os.chdir(unicode(tmpdir))
    images = []
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        # Symlink the image (and its hOCR file, if any) into the
        # temporary working directory
        link_path = (tmpdir / fpath.name)
        link_path.symlink_to(fpath)
        if 'tesseract' in page.processed_images:
            ocr_path = page.processed_images['tesseract']
            (tmpdir / ocr_path.name).symlink_to(ocr_path)
        images.append(link_path)
    # TODO: Use metadata to create a METAFILE for pdfbeads
    # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
    # TODO: Use page.page_label to create a LSPEC for pdfbeads
    pdf_file = target_path / "book.pdf"
    cmd = [find_in_path("pdfbeads"), "-d"]
    cmd.extend([f.name for f in images])
    cmd.extend(["-o", unicode(pdf_file)])
    logger.debug("Running " + " ".join(cmd))
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    last_count = 0
    # Track progress by counting the JBIG2 intermediates pdfbeads writes
    while proc.poll() is None:
        current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
        if current_count > last_count:
            last_count = current_count
            self.on_progressed.send(self,
                                    progress=float(current_count) /
                                    len(images))
        time.sleep(.01)
    logger.debug("Output:\n{0}".format(proc.stdout.read()))
    os.chdir(old_path)
def process(self, pages, target_path): """ For each page, rotate the most recent image according to its EXIF orientation tag. :param pages: Pages to be processed :type pages: list of :py:class:`spreads.workflow.Page` :param target_path: Base directory where processed images are to be stored :type target_path: :py:class:`pathlib.Path` """ # TODO: This plugin should be 'output' only, since we ideally work # with fully binarized output images # Map input paths to their pages so we can more easily associate # the generated output files with their pages later on in_paths = {} for page in pages: fpath = page.get_latest_processed(image_only=True) if fpath is None: fpath = page.raw_image in_paths[fpath] = page out_dir = Path(tempfile.mkdtemp(prefix='tess-out')) language = self.config["language"].get() logger.info("Performing OCR") logger.info("Language is \"{0}\"".format(language)) self._perform_ocr(in_paths, out_dir, language) for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')): self._perform_replacements(fname) # For each hOCR file, try to find a corresponding input image # and associate it to the image's page out_stem = fname.stem for in_path, page in in_paths.iteritems(): if in_path.stem == out_stem: target_fname = target_path/fname.name shutil.copyfile(unicode(fname), unicode(target_fname)) page.processed_images[self.__name__] = target_fname break else: logger.warn("Could not find page for output file {0}" .format(fname))
def process(self, pages, target_path): """ For each page, rotate the most recent image according to its EXIF orientation tag. :param pages: Pages to be processed :type pages: list of :py:class:`spreads.workflow.Page` :param target_path: Base directory where processed images are to be stored :type target_path: :py:class:`pathlib.Path` """ # TODO: This plugin should be 'output' only, since we ideally work # with fully binarized output images # Map input paths to their pages so we can more easily associate # the generated output files with their pages later on in_paths = {} for page in pages: fpath = page.get_latest_processed(image_only=True) if fpath is None: fpath = page.raw_image in_paths[fpath] = page out_dir = Path(tempfile.mkdtemp(prefix='tess-out')) language = self.config["language"].get() logger.info("Performing OCR") logger.info("Language is \"{0}\"".format(language)) self._perform_ocr(in_paths, out_dir, language) for fname in chain(out_dir.glob('*.hocr'), out_dir.glob('*.html')): self._perform_replacements(fname) # For each hOCR file, try to find a corresponding input image # and associate it to the image's page out_stem = fname.stem for in_path, page in in_paths.iteritems(): if in_path.stem == out_stem: target_fname = target_path / fname.name shutil.copyfile(unicode(fname), unicode(target_fname)) page.processed_images[self.__name__] = target_fname break else: logger.warn( "Could not find page for output file {0}".format(fname))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Bundle the pages' most recent images into a PDF via `pdfbeads`,
    with a text layer built from any available tesseract hOCR output.
    The result is written to `target_path`/book.pdf.

    :param pages: Pages to bundle
    :param target_path: Directory the PDF is written to
    :type target_path: :py:class:`pathlib.Path`
    :param metadata: Metadata for the PDF (currently unused, see TODO)
    :param table_of_contents: Table of contents (currently unused, see
                              TODO)
    """
    logger.info("Assembling PDF.")
    tmpdir = Path(tempfile.mkdtemp())
    # NOTE: pdfbeads only finds *html files for the text layer in the
    # working directory, so we have to chdir() into it
    old_path = os.path.abspath(os.path.curdir)
    os.chdir(unicode(tmpdir))
    images = []
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        # Symlink the image (and its hOCR file, if any) into the
        # temporary working directory
        link_path = (tmpdir/fpath.name)
        link_path.symlink_to(fpath)
        if 'tesseract' in page.processed_images:
            ocr_path = page.processed_images['tesseract']
            (tmpdir/ocr_path.name).symlink_to(ocr_path)
        images.append(link_path)
    # TODO: Use metadata to create a METAFILE for pdfbeads
    # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
    # TODO: Use page.page_label to create a LSPEC for pdfbeads
    pdf_file = target_path/"book.pdf"
    cmd = [find_in_path("pdfbeads"), "-d"]
    cmd.extend([f.name for f in images])
    cmd.extend(["-o", unicode(pdf_file)])
    logger.debug("Running " + " ".join(cmd))
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    last_count = 0
    # Track progress by counting the JBIG2 intermediates pdfbeads writes
    while proc.poll() is None:
        current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
        if current_count > last_count:
            last_count = current_count
            self.on_progressed.send(
                self, progress=float(current_count)/len(images))
        time.sleep(.01)
    logger.debug("Output:\n{0}".format(proc.stdout.read()))
    os.chdir(old_path)
def create(cls, location, name, config=None, metadata=None):
    """ Create a new workflow named `name` under `location` and register
    it in the class-level cache.

    :param location: Base directory that the workflow should be created in
    :type location: unicode or :py:class:`pathlib.Path`
    :param name: Name of the new workflow directory
    :param config: Initial configuration for workflow
    :param metadata: Initial metadata for workflow
    :return: The new instance
    :raises ValidationError: If a workflow with that name already exists
    """
    if not isinstance(location, Path):
        location = Path(location)
    if (location / name).exists():
        raise ValidationError(
            name="A workflow with that name already exists")
    wf = cls(path=location / name, config=config, metadata=metadata)
    # Idiom fix: `x not in y` instead of `not x in y`
    if location not in cls._cache:
        cls._cache[location] = []
    cls._cache[location].append(wf)
    return wf
def test_process(call, plugin, tmpdir):
    def fake_generate_output(pf, out_dir, num):
        # Simulate ScanTailor writing one TIFF per page
        for page in pages:
            (out_dir/(page.raw_image.stem + '.tif')).touch()

    plugin._generate_configuration = mock.Mock()
    plugin._generate_output = fake_generate_output
    plugin.config['autopilot'] = True
    pages = [Page(Path('{0:03}.jpg'.format(idx))) for idx in xrange(5)]
    target_dir = Path(unicode(tmpdir))
    plugin.process(pages, target_dir)
    # Autopilot mode must not open the GUI
    assert call.call_count == 0
    for page in pages:
        result = page.processed_images
        assert 'scantailor' in result
        assert result['scantailor'].parent == target_dir
        assert result['scantailor'].exists()
    plugin.config['autopilot'] = False
    plugin.process(pages, target_dir)
    # Manual mode opens the ScanTailor GUI exactly once
    assert call.call_count == 1
def copy_info(pkg, pkg_dir):
    """ Copy a distribution's packaging metadata into *pkg_dir*.

    Looks for either an ``.egg-info`` or a ``.dist-info`` directory/file
    belonging to *pkg* and copies it into *pkg_dir*.

    :param pkg:     Name of the distribution to look up
    :param pkg_dir: Target directory for the metadata
    :raises IOError: If the distribution or its metadata cannot be found
    """
    try:
        dist = pkg_resources.get_distribution(pkg)
    except pkg_resources.DistributionNotFound:
        raise IOError("No distribution could be found for {0}!".format(pkg))
    # A distribution rooted in the current working directory keeps its
    # plain project name; installed ones use the full egg name
    egg_name = (dist.project_name if dist.location == os.getcwd()
                else dist.egg_name())
    base = Path(dist.location)
    egg_path = base / (egg_name + ".egg-info")
    dist_path = base / (dist.project_name + "-" + dist.version
                        + ".dist-info")
    if egg_path.exists():
        src_path = egg_path
    elif dist_path.exists():
        src_path = dist_path
    else:
        raise IOError("No egg-info or dist-info could be found for {0}!"
                      .format(pkg))
    destination = pkg_dir / src_path.name
    if src_path.is_dir():
        shutil.copytree(unicode(src_path), unicode(destination))
    else:
        shutil.copy2(unicode(src_path), unicode(destination))
def build_msi(bitness=32):
    """ Build a Windows installer for spreads with pynsist.

    :param bitness: Target architecture width, 32 or 64
    """
    # Remove stale egg-info so pynsist does not pick up local metadata
    egg_path = Path('spreads.egg-info')
    if egg_path.exists():
        shutil.rmtree(unicode(egg_path))
    build_path = Path('build')
    if not build_path.exists():
        build_path.mkdir()
    # Directory pynsist pulls pre-built binary packages from
    pkg_dir = build_path/'pynsist_pkgs'
    if pkg_dir.exists():
        shutil.rmtree(unicode(pkg_dir))
    pkg_dir.mkdir()
    for pkg in BINARY_PACKAGES.itervalues():
        arch = 'win32' if bitness == 32 else 'win-amd64'
        extract_native_pkg(pkg.format(arch=arch), pkg_dir)
    for pkg in (x.project_name for x in SOURCE_PACKAGES
                if x.project_name is not None):
        copy_info(pkg, pkg_dir)

    icon = os.path.abspath("spreads.ico")
    extra_files = [
        (unicode((Path('win_deps') / 'extra' /
                  x.format(arch='.amd64' if bitness == 64 else ''))
                 .absolute()), None)
        for x in EXTRA_FILES]
    nsi_template = os.path.abspath("template.nsi")

    # NOTE: We need to remove the working directory from sys.path to force
    # pynsist to copy all of our modules, including 'spreads' and
    # 'spreadsplug' from the site-packages. Additionally, we need to
    # change into the build directory.
    if os.getcwd() in sys.path:
        sys.path.remove(os.getcwd())
    # Remember the absolute original directory so we can reliably restore
    # it afterwards (a relative `chdir('..')` would misbehave if the build
    # raised an exception).
    old_cwd = os.getcwd()
    os.chdir(unicode(build_path))
    try:
        builder = InstallerBuilder(
            appname="spreads",
            version=spreads.__version__,
            packages=[x.module_name for x in SOURCE_PACKAGES],
            extra_files=extra_files,
            py_version="2.7.6",
            py_bitness=bitness,
            build_dir='msi{0}'.format(bitness),
            installer_name=None,
            nsi_template=nsi_template,
            icon=icon,
            shortcuts={
                'Configure spreads': {
                    'entry_point': 'spreads.main:run_config_windows',
                    'icon': icon,
                    'console': False},
                'Spreads Web Service': {
                    'entry_point': 'spreads.main:run_service_windows',
                    'icon': icon,
                    'console': False}
            }
        )
        builder.run()
    finally:
        # Always restore the original working directory
        os.chdir(old_cwd)
def output(self, pages, target_path, metadata, table_of_contents):
    """ Assemble the pages' processed images into a PDF via pdfbeads.

    :param pages:       Pages to include in the PDF
    :param target_path: Directory the output ``book.pdf`` is written to
    :param metadata:    Workflow metadata; ``title`` and ``creator``
                        entries are written to a pdfbeads metadata file
    :param table_of_contents: TOC entries (currently unused, see TODOs)
    """
    logger.info("Assembling PDF.")
    tmpdir = Path(tempfile.mkdtemp())
    # Write the pdfbeads metadata file (passed via `-M` below)
    meta_file = tmpdir/'metadata.txt'
    with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
        for key, value in metadata.iteritems():
            if key == 'title':
                fp.write("Title: \"{0}\"\n".format(value))
            if key == 'creator':
                # 'creator' holds an iterable of author names
                for author in value:
                    fp.write("Author: \"{0}\"\n".format(author))
    # Collect the most recent processed image (or the raw capture) for
    # every page inside tmpdir. On Windows the files are copied; on other
    # platforms symlinks to the absolute source paths are created.
    images = []
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        link_path = (tmpdir/fpath.name)
        if IS_WIN:
            shutil.copy(unicode(fpath), unicode(link_path))
        else:
            link_path.symlink_to(fpath.absolute())
        if 'tesseract' in page.processed_images:
            # Expose the OCR output next to the image so pdfbeads can use
            # it for the text layer
            ocr_path = page.processed_images['tesseract']
            if IS_WIN:
                shutil.copy(unicode(ocr_path),
                            unicode(tmpdir/ocr_path.name))
            else:
                (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
        images.append(link_path.absolute())
    pdf_file = target_path.absolute()/"book.pdf"
    # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
    # TODO: Use page.page_label to create a LSPEC for pdfbeads
    # NOTE: pdfbeads only finds *html files for the text layer in the
    #       working directory, so we have to chdir() into it
    old_path = os.path.abspath(os.path.curdir)
    os.chdir(unicode(tmpdir))
    cmd = [BIN, "-d", "-M", unicode(meta_file)]
    if IS_WIN:
        # On Windows the images are passed as a single wildcard pattern
        cmd.append(util.wildcardify(tuple(f.name for f in images)))
    else:
        cmd.extend([unicode(f) for f in images])
    cmd.extend(["-o", unicode(pdf_file)])
    logger.debug("Running " + " ".join(cmd))
    proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, shell=IS_WIN)
    if IS_WIN:
        # NOTE: Due to a bug in the jbig2enc version for Windows, the
        #       error output gets huge, creating a deadlock. Hence, we go
        #       the safe way and use `communicate()`, though this means
        #       no progress notification for the user.
        output, errors = proc.communicate()
    else:
        # pdfbeads emits one *.jbig2 file per page; poll their number to
        # report progress while the process is running
        last_count = 0
        while proc.poll() is None:
            current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
            if current_count > last_count:
                last_count = current_count
                self.on_progressed.send(
                    self, progress=float(current_count)/len(images))
            time.sleep(.01)
        output = proc.stdout.read()
        errors = proc.stderr.read()
    logger.debug("pdfbeads stdout:\n{0}".format(output))
    logger.debug("pdfbeads stderr:\n{0}".format(errors))
    # Restore the working directory and clean up the temporary files
    os.chdir(old_path)
    shutil.rmtree(unicode(tmpdir))
def process(self, pages, target_path):
    """ Run the most recent image of every page through ScanTailor.

    :param pages: Pages to be processed
    :type pages: list of :py:class:`spreads.workflow.Page`
    :param target_path: Base directory where rotated images are to
                        be stored
    :type target_path: :py:class:`pathlib.Path`
    """
    autopilot = self.config['autopilot'].get(bool)
    if not autopilot and not util.find_in_path('scantailor'):
        raise util.MissingDependencyException(
            "Could not find executable `scantailor` in"
            " $PATH. Please install the appropriate"
            " package(s)!")
    # Create temporary files/directories
    projectfile = Path(tempfile.mkstemp(suffix='.ScanTailor')[1])
    out_dir = Path(tempfile.mkdtemp(prefix='st-out'))

    # Map input paths to their pages so we can more easily associate
    # the generated output files with their pages later on
    in_paths = {}
    for page in pages:
        fpath = page.get_latest_processed(image_only=True)
        if fpath is None:
            fpath = page.raw_image
        in_paths[unicode(fpath)] = page

    logger.info("Generating ScanTailor configuration")
    self._generate_configuration(sorted(in_paths.keys()),
                                 projectfile, out_dir)

    if not autopilot:
        logger.warn("If you are changing output settings (in the last "
                    "step, you *have* to run the last step from the GUI. "
                    "Due to a bug in ScanTailor, your settings would "
                    "otherwise be ignored.")
        time.sleep(5)
        logger.info("Opening ScanTailor GUI for manual adjustment")
        util.get_subprocess([GUI_BIN, unicode(projectfile)])
    # Check if the user already generated output files from the GUI
    if sum(1 for x in out_dir.glob('*.tif')) != len(pages):
        logger.info("Generating output images from ScanTailor "
                    "configuration.")
        self._generate_output(projectfile, out_dir, len(pages))

    # Associate generated output files with our pages
    for fname in out_dir.glob('*.tif'):
        out_stem = fname.stem
        for in_path, page in in_paths.iteritems():
            if Path(in_path).stem == out_stem:
                target_fname = target_path/fname.name
                shutil.copyfile(unicode(fname), unicode(target_fname))
                page.processed_images[self.__name__] = target_fname
                break
        else:
            logger.warn("Could not find page for output file {0}"
                        .format(fname))

    # Remove temporary files/directories
    shutil.rmtree(unicode(out_dir))
    # FIXME: Removing the project file fails on Windows since there seems
    #        to be some non-gcable reference to it around, so we treat
    #        cleanup as best-effort. We catch OSError rather than
    #        WindowsError, since the latter does not exist on non-Windows
    #        platforms and would raise a NameError there.
    try:
        projectfile.unlink()
    except OSError as e:
        logger.debug("Could not remove project file {0}: {1}"
                     .format(projectfile, e))
def open_connection():
    """ Open a connection to the application's SQLite database.

    The database is initialized on first use, i.e. when the configured
    database file does not exist yet.

    :return: An open :py:class:`sqlite3.Connection`
    """
    database = Path(app.config['database'])
    # Bootstrap the schema if the file has not been created yet
    if not database.exists():
        initialize_database()
    return sqlite3.connect(unicode(database))