def output(self, pages, target_path, metadata, table_of_contents):
    """ Go through pages and bundle their most recent images into a PDF file.

    The images (and, where present, the tesseract output) are staged in a
    temporary directory, pdfbeads is run from inside that directory and the
    result is written to ``book.pdf`` below ``target_path``. The previous
    working directory is restored and the temporary directory is removed
    even if an error occurs along the way.

    :param pages:               Pages to bundle
    :type pages:                list of :py:class:`spreads.workflow.Page`
    :param target_path:         Directory the ``book.pdf`` file is written
                                to (must support ``absolute()`` and ``/``)
    :param metadata:            Metadata to include in PDF file
    :type metadata:             :py:class:`spreads.metadata.Metadata`
    :param table_of_contents:   Table of contents to include in PDF file
                                (not used yet, see the TODO below)
    :type table_of_contents:    list of :py:class:`TocEntry`
    """
    logger.info("Assembling PDF.")

    tmpdir = Path(tempfile.mkdtemp())
    # Remember where we came from *before* anything can fail, so that the
    # cleanup in the `finally` clause always has a directory to return to.
    old_path = os.path.abspath(os.path.curdir)
    try:
        meta_file = tmpdir/'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir/fpath.name)
            # Files are copied on Windows and symlinked everywhere else
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir/ocr_path.name))
                else:
                    (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        # Resolved before the chdir() below, `absolute()` depends on the cwd
        pdf_file = target_path.absolute()/"book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        os.chdir(unicode(tmpdir))

        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            errors = ""
            is_jbig2 = False
            cur_jbig2_page = 0
            # float() keeps the progress ratios fractional even where `/`
            # performs integer division.
            total_steps = float(len(images)*2)
            while proc.poll() is None:
                cur_line = proc.stderr.readline()
                errors += "\n" + cur_line
                prep_match = re.match(r"^Prepared data for processing (.*)$",
                                      cur_line)
                proc_match = re.match(r"^Processed (.*)$", cur_line)
                jbig2_match = re.match(
                    r"^JBIG2 compression complete. pages:(\d+) symbols:\d+ "
                    r"log2:\d+$", cur_line)
                progress = None
                if prep_match:
                    # A path we do not know is skipped instead of aborting
                    # the whole export with a StopIteration.
                    file_idx = next(
                        (idx for idx, f in enumerate(images)
                         if unicode(f) == prep_match.group(1)), None)
                    if file_idx is not None:
                        progress = file_idx/total_steps
                elif jbig2_match:
                    cur_jbig2_page += int(jbig2_match.group(1))
                    progress = (len(images) + cur_jbig2_page)/total_steps
                    is_jbig2 = True
                elif proc_match and not is_jbig2:
                    file_idx = next(
                        (idx for idx, f in enumerate(images)
                         if unicode(f) == proc_match.group(1)), None)
                    if file_idx is not None:
                        progress = (len(images) + file_idx)/total_steps
                if progress is not None:
                    self.on_progressed.send(self, progress=progress)
                time.sleep(.01)
            output = proc.stdout.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # Leave the temporary directory before deleting it
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Go through pages and bundle their most recent images into a PDF file.

    The images (and, where present, the tesseract output) are staged in a
    temporary directory, pdfbeads is run from inside that directory and the
    result is written to ``book.pdf`` below ``target_path``. Progress is
    estimated from the number of ``*.jbig2`` files that show up in the
    temporary directory. The previous working directory is restored and the
    temporary directory is removed even if an error occurs along the way.

    :param pages:               Pages to bundle
    :param target_path:         Directory the ``book.pdf`` file is written
                                to (must support ``absolute()`` and ``/``)
    :param metadata:            Metadata to include in PDF file; only the
                                ``title`` and ``creator`` keys are used
    :param table_of_contents:   Table of contents to include in PDF file
                                (not used yet, see the TODO below)
    """
    logger.info("Assembling PDF.")

    tmpdir = Path(tempfile.mkdtemp())
    # Remember where we came from *before* anything can fail, so that the
    # cleanup in the `finally` clause always has a directory to return to.
    old_path = os.path.abspath(os.path.curdir)
    try:
        meta_file = tmpdir/'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir/fpath.name)
            # Files are copied on Windows and symlinked everywhere else
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir/ocr_path.name))
                else:
                    (tmpdir/ocr_path.name).symlink_to(ocr_path.absolute())
            images.append(link_path.absolute())

        # Resolved before the chdir() below, `absolute()` depends on the cwd
        pdf_file = target_path.absolute()/"book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        os.chdir(unicode(tmpdir))

        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            # NOTE(review): neither pipe is read while we poll, so a child
            #               that writes more than the pipe buffer holds
            #               would block here -- confirm that pdfbeads stays
            #               below that limit on this platform.
            last_count = 0
            while proc.poll() is None:
                current_count = sum(1 for x in tmpdir.glob('*.jbig2'))
                # Only notify listeners when the file count has grown
                if current_count > last_count:
                    last_count = current_count
                    self.on_progressed.send(
                        self, progress=float(current_count)/len(images))
                time.sleep(.01)
            output = proc.stdout.read()
            errors = proc.stderr.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # Leave the temporary directory before deleting it
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))
def _generate_configuration(self, in_paths, projectfile, out_dir):
    """ Run ``scantailor-cli`` to create a project file for the given images.

    The enabled filters from the plugin configuration determine the first
    and last filter that is passed to ScanTailor. While the process is
    running, its open files are inspected to work out which input image it
    is currently busy with, and ``on_progressed`` is sent with a value that
    is scaled by 0.5.

    :param in_paths:    Paths of the input images, a sequence of strings
    :param projectfile: Path the project file is written to (``-o``)
    :param out_dir:     Output directory passed on to ``scantailor-cli``
    :raises ValueError:         if none of the filter options is enabled
    :raises SpreadsException:   on Windows, if no wildcard matching the
                                input paths could be determined
    """
    filter_names = ('rotate', 'split_pages', 'deskew', 'content',
                    'auto_margins')
    enabled = [self.config[name].get(bool) for name in filter_names]
    # ScanTailor numbers its filters starting at 1
    first_filter = enabled.index(True) + 1
    last_filter = len(enabled) - enabled[::-1].index(True)
    margins = self.config['margins'].as_str_seq()

    command = [
        find_in_path('scantailor-cli'),
        '--start-filter={0}'.format(first_filter),
        '--end-filter={0}'.format(last_filter),
        '--layout=1.5',
        '-o={0}'.format(projectfile),
    ]

    detect_page = self.config['detection'].get() == 'page'
    if self._enhanced and detect_page:
        command += [
            '--enable-page-detection',
            '--disable-content-detection',
            '--enable-fine-tuning',
        ]
    else:
        command += [
            '--margins-top={0}'.format(margins[0]),
            '--margins-right={0}'.format(margins[1]),
            '--margins-bottom={0}'.format(margins[2]),
            '--margins-left={0}'.format(margins[3]),
        ]

    # NOTE: We cannot pass individual filenames on windows, since we have
    #       a limit of 32,768 characters for commands. Thus, we first try
    #       to find a wildcard for our paths that matches only them, and if
    #       that fails, throw an Exception and tell the user to use a
    #       proper OS...
    pattern = wildcardify(in_paths)
    if pattern:
        command.append(pattern)
    elif IS_WIN:
        raise SpreadsException("Please use a proper operating system.")
    else:
        command.extend(in_paths)
    command.append(unicode(out_dir))
    logger.debug(" ".join(command))

    child = subprocess.Popen(command)
    proc = psutil.Process(child.pid)

    image_count = len(in_paths)
    step_count = (last_filter - first_filter) + 1
    previous_idx = 0
    current_idx = 0
    completed_steps = 0
    while proc.is_running():
        try:
            # The first open file that is one of our inputs tells us which
            # image is being worked on; otherwise keep the last known index
            for handle in proc.open_files():
                if handle.path in in_paths:
                    current_idx = in_paths.index(handle.path)
                    break
        except psutil.AccessDenied:
            # This means the process is no longer running
            break
        if current_idx == previous_idx:
            time.sleep(.01)
            continue
        # A lower index than before is counted as one more finished step
        if current_idx < previous_idx:
            completed_steps += 1
        previous_idx = current_idx
        done = completed_steps*image_count + previous_idx
        total = float(step_count*image_count)
        self.on_progressed.send(self, progress=0.5*(done / total))
def output(self, pages, target_path, metadata, table_of_contents):
    """ Go through pages and bundle their most recent images into a PDF file.

    The images (and, where present, the tesseract output) are staged in a
    temporary directory, pdfbeads is run from inside that directory and the
    result is written to ``book.pdf`` below ``target_path``. The previous
    working directory is restored and the temporary directory is removed
    even if an error occurs along the way.

    :param pages:               Pages to bundle
    :type pages:                list of :py:class:`spreads.workflow.Page`
    :param target_path:         Directory the ``book.pdf`` file is written
                                to (must support ``absolute()`` and ``/``)
    :param metadata:            Metadata to include in PDF file
    :type metadata:             :py:class:`spreads.metadata.Metadata`
    :param table_of_contents:   Table of contents to include in PDF file
                                (not used yet, see the TODO below)
    :type table_of_contents:    list of :py:class:`TocEntry`
    """
    logger.info("Assembling PDF.")

    tmpdir = Path(tempfile.mkdtemp())
    # Remember where we came from *before* anything can fail, so that the
    # cleanup in the `finally` clause always has a directory to return to.
    old_path = os.path.abspath(os.path.curdir)
    try:
        meta_file = tmpdir / 'metadata.txt'
        with codecs.open(unicode(meta_file), "w", "utf-8") as fp:
            for key, value in metadata.iteritems():
                if key == 'title':
                    fp.write("Title: \"{0}\"\n".format(value))
                if key == 'creator':
                    for author in value:
                        fp.write("Author: \"{0}\"\n".format(author))

        images = []
        for page in pages:
            fpath = page.get_latest_processed(image_only=True)
            if fpath is None:
                fpath = page.raw_image
            link_path = (tmpdir / fpath.name)
            # Files are copied on Windows and symlinked everywhere else
            if IS_WIN:
                shutil.copy(unicode(fpath), unicode(link_path))
            else:
                link_path.symlink_to(fpath.absolute())
            if 'tesseract' in page.processed_images:
                ocr_path = page.processed_images['tesseract']
                if IS_WIN:
                    shutil.copy(unicode(ocr_path),
                                unicode(tmpdir / ocr_path.name))
                else:
                    (tmpdir / ocr_path.name).symlink_to(
                        ocr_path.absolute())
            images.append(link_path.absolute())

        # Resolved before the chdir() below, `absolute()` depends on the cwd
        pdf_file = target_path.absolute() / "book.pdf"

        # TODO: Use table_of_contents to create a TOCFILE for pdfbeads
        # TODO: Use page.page_label to create a LSPEC for pdfbeads

        # NOTE: pdfbeads only finds *html files for the text layer in the
        #       working directory, so we have to chdir() into it
        os.chdir(unicode(tmpdir))

        cmd = [BIN, "-d", "-M", unicode(meta_file)]
        if IS_WIN:
            cmd.append(util.wildcardify(tuple(f.name for f in images)))
        else:
            cmd.extend([unicode(f) for f in images])
        cmd.extend(["-o", unicode(pdf_file)])
        logger.debug("Running " + " ".join(cmd))
        proc = util.get_subprocess(cmd, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=IS_WIN)
        if IS_WIN:
            # NOTE: Due to a bug in the jbig2enc version for Windows, the
            #       error output gets huge, creating a deadlock. Hence, we
            #       go the safe way and use `communicate()`, though this
            #       means no progress notification for the user.
            output, errors = proc.communicate()
        else:
            errors = ""
            is_jbig2 = False
            cur_jbig2_page = 0
            # float() keeps the progress ratios fractional even where `/`
            # performs integer division.
            total_steps = float(len(images) * 2)
            while proc.poll() is None:
                cur_line = proc.stderr.readline()
                errors += "\n" + cur_line
                prep_match = re.match(r"^Prepared data for processing (.*)$",
                                      cur_line)
                proc_match = re.match(r"^Processed (.*)$", cur_line)
                jbig2_match = re.match(
                    r"^JBIG2 compression complete. pages:(\d+) symbols:\d+ "
                    r"log2:\d+$", cur_line)
                progress = None
                if prep_match:
                    # A path we do not know is skipped instead of aborting
                    # the whole export with a StopIteration.
                    file_idx = next(
                        (idx for idx, f in enumerate(images)
                         if unicode(f) == prep_match.group(1)), None)
                    if file_idx is not None:
                        progress = file_idx / total_steps
                elif jbig2_match:
                    cur_jbig2_page += int(jbig2_match.group(1))
                    progress = (len(images) + cur_jbig2_page) / total_steps
                    is_jbig2 = True
                elif proc_match and not is_jbig2:
                    file_idx = next(
                        (idx for idx, f in enumerate(images)
                         if unicode(f) == proc_match.group(1)), None)
                    if file_idx is not None:
                        progress = (len(images) + file_idx) / total_steps
                if progress is not None:
                    self.on_progressed.send(self, progress=progress)
                time.sleep(.01)
            output = proc.stdout.read()
        logger.debug("pdfbeads stdout:\n{0}".format(output))
        logger.debug("pdfbeads stderr:\n{0}".format(errors))
    finally:
        # Leave the temporary directory before deleting it
        os.chdir(old_path)
        shutil.rmtree(unicode(tmpdir))