def on_book(self):
    """Check whether a book needs manual approval, then archive it.

    Manual approval is the default; it is skipped only when the book's
    metadata explicitly sets "nlb:needs-manual-approval" to "false".
    The book is stored in the archive in both cases; only the report
    title (and an extra info message) differs.

    Returns:
        bool: always True (the book is archived either way).
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")
    metadata = Metadata.get_metadata_from_book(self.utils.report, self.book["source"])

    # Default to manual approval; only an explicit "false" disables it.
    needs_manual_approval = True
    if "nlb:needs-manual-approval" in metadata and metadata["nlb:needs-manual-approval"] == "false":
        needs_manual_approval = False

    # Optional " (title)" suffix for the report title.
    title_suffix = "" if "title" not in metadata else " (" + metadata["title"] + ")"

    if needs_manual_approval:
        self.utils.report.title = self.title + ": {} trenger manuell gjennomgang{}".format(
            metadata["identifier"], title_suffix)
    else:
        # FIX: the emoji here was mojibake ("�😄"); restored to the "👍😄"
        # used by every other success title in this file.
        self.utils.report.title = self.title + ": {} ble automatisk godkjent 👍😄{}".format(
            metadata["identifier"], title_suffix)
        self.utils.report.info("Boken ble automatisk godkjent.")

    # Archive the book regardless of the approval outcome.
    archived_path, stored = self.utils.filesystem.storeBook(self.book["source"], metadata["identifier"])
    self.utils.report.attachment(None, archived_path, "DEBUG")
    return True
def getDirectoryEdition(directory_id, edition_id, force_update, method):
    """Resolve an edition inside a watched directory and serve its metadata.

    Returns ("", 404) when the directory or edition cannot be found,
    ("", 200) for HEAD requests (existence check only), and
    (JSON metadata, 200) otherwise.
    """
    # Unknown directory id -> 404.
    if directory_id not in Directory.dirs_flat:
        return "", 404
    path = os.path.normpath(Directory.dirs_flat[directory_id])
    if not path:
        return "", 404

    # Find a file in the directory whose basename (minus extension)
    # matches the requested edition id.
    book_path = None
    if path in Directory.dirs:
        for name in list(Directory.dirs[path]._md5.keys()):
            if Path(name).stem == edition_id:
                book_path = os.path.join(path, name)
                break
    if not book_path:
        return "", 404

    # HEAD requests only need to know the edition exists.
    if method == "HEAD":
        return "", 200
    return jsonify(Metadata.get_metadata_from_book(logging, book_path, force_update=force_update)), 200
def on_book(self):
    """Convert an HTML book to PEF (braille) via DAISY Pipeline 2 and archive it.

    Copies the source file set, reads the identifier and formatting options
    from the HTML head, runs the html-to-pef script (with Statped-specific
    arguments when applicable), transfers metadata from the HTML into the
    resulting PEF, and stores the result in the PEF archive.

    Returns:
        bool: True on success, False on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    self.utils.report.info("Lager en kopi av filsettet")
    temp_htmldir_obj = tempfile.TemporaryDirectory()
    temp_htmldir = temp_htmldir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_htmldir)

    self.utils.report.info("Finner HTML-fila")
    # Note: no break here — if several *html files exist, the last one found wins.
    html_file = None
    for root, dirs, files in os.walk(temp_htmldir):
        for f in files:
            if f.endswith("html"):
                html_file = os.path.join(root, f)
    if not html_file or not os.path.isfile(html_file):
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne en HTML-fil.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet "
        return False

    html_xml = ElementTree.parse(html_file).getroot()
    identifier = html_xml.xpath("/*/*[local-name()='head']/*[@name='dc:identifier']")
    metadata = Metadata.get_metadata_from_book(self.utils.report, temp_htmldir)

    # Formatting options from the HTML head; defaults are single line
    # spacing and double-sided (duplex) printing.
    line_spacing = "single"
    duplex = "true"
    for e in html_xml.xpath("/*/*[local-name()='head']/*[@name='dc:format.linespacing']"):
        if "double" == e.attrib["content"]:
            line_spacing = "double"
    for e in html_xml.xpath("/*/*[local-name()='head']/*[@name='dc:format.printing']"):
        if "single-sided" == e.attrib["content"]:
            duplex = "false"
    self.utils.report.info("Linjeavstand: {}".format("åpen" if line_spacing == "double" else "enkel"))
    self.utils.report.info("Trykk: {}".format("enkeltsidig" if duplex == "false" else "dobbeltsidig"))

    bookTitle = ""
    bookTitle = " (" + html_xml.xpath("string(/*/*[local-name()='head']/*[local-name()='title']/text())") + ") "

    # Reduce the dc:identifier element list to its content attribute (or None).
    identifier = identifier[0].attrib["content"] if identifier and "content" in identifier[0].attrib else None
    if not identifier:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne boknummer i HTML-fil.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet "
        return False
    epub_identifier = html_xml.xpath("/*/*[local-name()='head']/*[@name='nlbprod:identifier.epub']")
    # NOTE(review): epub_identifier is not referenced again below — verify
    # whether it is still needed.
    epub_identifier = epub_identifier[0].attrib["content"] if epub_identifier and "content" in epub_identifier[0].attrib else None

    # ---------- convert to PEF ----------

    # Create the context (relative path -> absolute path mapping) for the
    # Pipeline 2 job, skipping media files.
    html_dir = os.path.dirname(html_file)
    html_context = {}
    for root, dirs, files in os.walk(html_dir):
        for file in files:
            kind = mimetypes.guess_type(file)[0]
            if kind is not None and kind.split("/")[0] in ["image", "video", "audio"]:
                continue  # ignore media files
            fullpath = os.path.join(root, file)
            relpath = os.path.relpath(fullpath, html_dir)
            html_context[relpath] = fullpath

    script_id = "nlb:html-to-pef"
    pipeline_and_script_version = [
        ("1.11.1-SNAPSHOT", "1.10.0-SNAPSHOT"),
    ]
    braille_arguments = {
        "source": os.path.basename(html_file),
        "braille-standard": "(dots:6)(grade:0)",
        "line-spacing": line_spacing,
        "duplex": duplex,
    }

    # Custom Statped options using NLBs PIP — intentionally disabled by
    # `and False` (remove `and False` or replace with `or True` to test).
    if metadata["library"].lower() == "statped" and False:
        # see: https://github.com/nlbdev/pipeline/blob/nlb/nlb/book-to-pef/src/main/resources/xml/html-to-pef.xpl#L146-L167
        #
        # (1) 'http://www.nlb.no/pipeline/modules/braille/pre-processing.xsl',
        # (2) 'http://www.daisy.org/pipeline/modules/braille/xml-to-pef/generate-toc.xsl',
        # (3) if ($default-table-class = '') then resolve-uri('add-table-classes.xsl') else (),
        # (4) if ($insert-boilerplate = 'true') then 'http://www.nlb.no/pipeline/modules/braille/insert-boilerplate.xsl' else (),
        # (5) if ($apply-default-stylesheet = 'true') then 'http://www.nlb.no/pipeline/modules/braille/default.scss' else (),
        # (6) if ($stylesheet) then tokenize($stylesheet,',') else ()),' ')"/>
        braille_arguments["insert-boilerplate"] = "false"  # disable (4)
        braille_arguments["apply-default-stylesheet"] = "false"  # disable (5)
        # (1-3) will still be included. Specifying (6) lets us include replacements for (4) and (5)
        braille_arguments["stylesheet"] = ",".join([
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/insert-boilerplate.xsl",
            "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/scss/braille.scss"
        ])

    # Custom Statped options using DAISYs PIP — currently enabled by
    # `and True` (replace with `and False` to disable).
    if metadata["library"].lower() == "statped" and True:
        # use DAISYs version of PIP instead
        script_id = "html-to-pef"
        pipeline_and_script_version = [
            ("1.14.6", None),
            ("1.14.5", None),
            ("1.14.4", "4.2.0"),
            ("1.14.4-SNAPSHOT", "4.1.1"),
            ("1.14.3", "4.1.1"),
            ("1.14.2", "4.1.0"),
            ("1.13.6", "1.4.6"),
            ("1.13.4", "1.4.5"),
            ("1.12.1", "1.4.2"),
            ("1.11.1-SNAPSHOT", "1.3.0"),
        ]
        braille_arguments = {
            "html": os.path.basename(html_file),
            "transform": "(formatter:dotify)(translator:liblouis)(dots:6)(grade:0)",
            "stylesheet": " ".join([
                # 1. better volume breaking, and also removes title page and print toc, moves the colophon and copyright page to the end of the book
                # "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/pre-processing.xsl",
                "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/pre-processing.xsl",
                # "https://raw.githubusercontent.com/daisy/pipeline/master/modules/braille/xml-to-pef/src/main/resources/xml/xslt/generate-toc.xsl",
                # 3. NLB: Add table classes based on the dimensions of the table, for better handling of tables
                "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/add-table-classes.xsl",
                # 4. NLB: Generate a new title page and about page in the frontmatter
                # "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/insert-boilerplate.xsl",
                "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/insert-boilerplate.xsl",
                # 5. Statped-specific SCSS
                "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/scss/braille.scss",
            ]),
            "page-width": '38',
            "page-height": '29',
            "toc-depth": '2',
            "maximum-number-of-sheets": '50',
            "include-production-notes": 'true',
            "hyphenation": 'false',
            "allow-volume-break-inside-leaf-section-factor": '10',
            "prefer-volume-break-before-higher-level-factor": '1',
            "stylesheet-parameters": "(skip-margin-top-of-page:true)",
        }

    pef_tempdir_object = tempfile.TemporaryDirectory()

    self.utils.report.info("Konverterer fra HTML til PEF...")
    found_pipeline_version = None
    found_script_version = None
    with DaisyPipelineJob(self, script_id, braille_arguments,
                          pipeline_and_script_version=pipeline_and_script_version,
                          context=html_context) as dp2_job:
        found_pipeline_version = dp2_job.found_pipeline_version
        found_script_version = dp2_job.found_script_version

        # Attach the conversion preview report, if one was produced.
        if os.path.isdir(os.path.join(dp2_job.dir_output, "preview-output-dir")):
            Filesystem.copy(self.utils.report,
                            os.path.join(dp2_job.dir_output, "preview-output-dir"),
                            os.path.join(self.utils.report.reportDir(), "preview"))
            self.utils.report.attachment(None,
                                         os.path.join(self.utils.report.reportDir(),
                                                      "preview" + "/" + identifier + ".pef.html"),
                                         "SUCCESS" if dp2_job.status == "SUCCESS" else "ERROR")

        if dp2_job.status != "SUCCESS":
            self.utils.report.info("Klarte ikke å konvertere boken")
            self.utils.report.title = self.title + ": " + identifier + " feilet 😭👎" + bookTitle
            return False

        # Newer script versions write to "output-dir" instead of "pef-output-dir".
        dp2_pef_dir = os.path.join(dp2_job.dir_output, "pef-output-dir")
        dp2_new_pef_dir = os.path.join(dp2_job.dir_output, "output-dir")
        if not os.path.exists(dp2_pef_dir) and os.path.exists(dp2_new_pef_dir):
            dp2_pef_dir = dp2_new_pef_dir
        if not os.path.isdir(dp2_pef_dir):
            self.utils.report.info("Finner ikke den konverterte boken.")
            self.utils.report.title = self.title + ": " + identifier + " feilet 😭👎" + bookTitle
            return False

        # Copy the PEF out of the job directory before it is cleaned up.
        Filesystem.copy(self.utils.report, dp2_pef_dir, pef_tempdir_object.name)

    self.utils.report.info("Boken ble konvertert.")

    self.utils.report.info("Kopierer metadata fra HTML til PEF...")
    try:
        # Note: no break here — if several .pef files exist, the last one found wins.
        pef_file = None
        for root, dirs, files in os.walk(pef_tempdir_object.name):
            for f in files:
                if f.endswith(".pef"):
                    pef_file = os.path.join(root, f)
        if not pef_file or not os.path.isfile(pef_file):
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne en PEF-fil.")
        else:
            # Record the pipeline/script versions and all job arguments as
            # metadata in the PEF for traceability.
            additional_metadata = []
            additional_metadata.append(("daisy-pipeline-engine-version", "nlbprod", "http://www.nlb.no/production", None, found_pipeline_version))
            additional_metadata.append(("daisy-pipeline-script-id", "nlbprod", "http://www.nlb.no/production", None, script_id))
            additional_metadata.append(("daisy-pipeline-script-version", "nlbprod", "http://www.nlb.no/production", None, found_script_version))
            for argument in braille_arguments:
                if argument in ["source", "html"]:
                    continue  # skip HTML file path
                values = braille_arguments[argument]
                values = values if isinstance(values, list) else [values]
                for value in values:
                    additional_metadata.append(("daisy-pipeline-argument", "nlbprod", "http://www.nlb.no/production", argument, value))
            transfer_metadata_from_html_to_pef(html_file, pef_file, additional_metadata)
    except Exception:
        # Best effort: failing to copy metadata must not fail the production.
        self.utils.report.warning(traceback.format_exc(), preformatted=True)
        self.utils.report.error("An error occured while trying to insert metadata about the conversion")

    self.utils.report.info("Kopierer til PEF-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(pef_tempdir_object.name, identifier)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + identifier + " ble konvertert 👍😄" + bookTitle
    return True
def on_book(self):
    """Validate that the incoming EPUB belongs to Statped and archive it.

    Copies the EPUB to a temporary directory, looks up the edition via
    api.nlb.no (retrying up to 5 times), and stores the book in the EPUB
    master archive when the owning library is Statped.

    Returns:
        bool: True on success, False on any failure or when the book
        should be retried later.
    """
    epub = Epub(self.utils.report, self.book["source"])
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass  # best effort: a missing dc:title just leaves the suffix empty

    # sjekk at dette er en EPUB
    if not epub.isepub():
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        # FIX: was a bare `return` (None); use False like every other failure path.
        return False

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
        # FIX: was a bare `return` (None); use False like every other failure path.
        return False

    temp_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

    self.utils.report.info("Henter metadata fra api.nlb.no")
    creative_work_metadata = None
    edition_metadata = None
    # Retry the API lookup up to 5 times until a creative work is found.
    timeout = 0
    while creative_work_metadata is None and timeout < 5:
        timeout += 1
        creative_work_metadata = Metadata.get_creative_work_from_api(
            self.book["name"],
            editions_metadata="all",
            use_cache_if_possible=True,
            creative_work_metadata="all")
        edition_metadata = Metadata.get_edition_from_api(self.book["name"])
        if creative_work_metadata is not None:
            break

    if creative_work_metadata is None:
        self.utils.report.warning("Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere.")
        return False
    # FIX (robustness): the edition lookup can fail even when the creative
    # work is found; retry later instead of crashing on the subscript below.
    if edition_metadata is None:
        self.utils.report.warning("Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere.")
        return False

    # Only Statped books belong in this archive. (The previous dead-store
    # normalization of `library` to NLB/Statped/KABB was removed: the value
    # was only ever compared case-insensitively.)
    if edition_metadata["library"].lower() != "statped":
        self.utils.report.error("Ikke en Statped bok. Avbryter")
        self.utils.report.should_email = False
        return False

    self.utils.report.info("Kopierer til EPUB master-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(temp_dir, epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
    self.utils.filesystem.deleteSource()
    return True
def _signatures_refresh_thread(self):
    """Background thread that keeps the Quickbase signatures cache fresh.

    Polls every 5 seconds while the pipeline should keep running, and
    triggers a refresh when the cached signatures are older than 3 hours.
    """
    # FIX: removed unused local `idle_start_time` (assigned but never read).
    while self.shouldRun():
        time.sleep(5)
        # Refresh when the cache is more than 3 hours old.
        if time.time() - Metadata.signatures_last_update > 3600 * 3:
            # Discard the result; we just want to trigger an update.
            Metadata.get_signatures_from_quickbase("0", refresh=True)
def on_book(self):
    """Convert a Nordic DTBook to a Nordic EPUB3 via DAISY Pipeline 2.

    Steps: clean up the DTBook with XSLT, validate it (nordic-dtbook-validate),
    convert DTBook -> HTML (nordic-dtbook-to-html), clean up the HTML, convert
    HTML -> EPUB3 (nordic-html-to-epub3), validate the EPUB3, and store the
    result in the EPUB3-from-DTBook archive.

    Returns:
        bool: True on success, False on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    metadata = Metadata.get_metadata_from_book(self.utils.report, self.book["source"])
    # The identifier must be purely numeric; strip out everything else.
    metadata["identifier"] = re.sub(r"[^\d]", "", metadata["identifier"])
    if not metadata["identifier"]:
        self.utils.report.error("Klarte ikke å bestemme boknummer for {}".format(self.book["name"]))
        return False
    if metadata["identifier"] != self.book["name"]:
        self.utils.report.info("Boknummer for {} er: {}".format(self.book["name"], metadata["identifier"]))

    self.utils.report.info("Lager en kopi av DTBoken")
    temp_dtbookdir_obj = tempfile.TemporaryDirectory()
    temp_dtbookdir = temp_dtbookdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dtbookdir)

    # Find the DTBook XML: the first .xml file whose root element is in the
    # DTBook namespace.
    dtbook = None
    for root, dirs, files in os.walk(temp_dtbookdir):
        for f in files:
            if f.endswith(".xml"):
                xml = ElementTree.parse(os.path.join(root, f)).getroot()
                if xml.xpath("namespace-uri()") == "http://www.daisy.org/z3986/2005/dtbook/":
                    dtbook = os.path.join(root, f)
                    break
        if dtbook is not None:
            break
    if not dtbook:
        self.utils.report.error(self.book["name"] + ": Klarte ikke å finne DTBook")
        return False

    # Rename all files to lower case.
    for root, dirs, files in os.walk(temp_dtbookdir):
        for f in files:
            if not f.lower() == f:
                self.utils.report.warn("renaming to lowercase: {}".format(f))
                shutil.move(os.path.join(root, f), os.path.join(root, f.lower()))

    temp_dtbook_file_obj = tempfile.NamedTemporaryFile()
    temp_dtbook_file = temp_dtbook_file_obj.name

    self.utils.report.info("Rydder opp i nordisk DTBook")
    xslt = Xslt(self,
                stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir, NordicDTBookToEpub.uid, "nordic-cleanup-dtbook.xsl"),
                source=dtbook,
                target=temp_dtbook_file)
    if not xslt.success:
        return False
    # Replace the original DTBook with the cleaned-up version.
    shutil.copy(temp_dtbook_file, dtbook)

    self.utils.report.info("Validerer Nordisk DTBook...")

    # Create the context (relative path -> absolute path mapping) for the
    # Pipeline 2 job.
    dtbook_dir = os.path.dirname(dtbook)
    dtbook_context = {}
    for root, dirs, files in os.walk(dtbook_dir):
        for file in files:
            fullpath = os.path.join(root, file)
            relpath = os.path.relpath(fullpath, dtbook_dir)
            dtbook_context[relpath] = fullpath

    with DaisyPipelineJob(self, "nordic-dtbook-validate",
                          {"dtbook": os.path.basename(dtbook), "no-legacy": "false"},
                          pipeline_and_script_version=[
                              ("1.13.6", "1.4.6"),
                              ("1.13.4", "1.4.5"),
                              ("1.12.1", "1.4.2"),
                              ("1.11.1-SNAPSHOT", "1.3.0"),
                          ],
                          context=dtbook_context) as dp2_job_dtbook_validate:
        # Map the job status to SUCCESS/WARN/ERROR; validation failures are
        # inspected message-by-message below before being treated as fatal.
        dtbook_validate_status = None
        if dp2_job_dtbook_validate.status == "SUCCESS":
            dtbook_validate_status = "SUCCESS"
        elif dp2_job_dtbook_validate.status in ["VALIDATION_FAIL", "FAIL"]:
            dtbook_validate_status = "WARN"
        else:
            dtbook_validate_status = "ERROR"

        report_file = os.path.join(dp2_job_dtbook_validate.dir_output, "html-report/report.xhtml")

        if dtbook_validate_status == "WARN":
            report_doc = ElementTree.parse(report_file)
            errors = report_doc.xpath('//*[@class="error" or @class="message-error"]')
            for error in errors:
                # Flatten the error element to one whitespace-normalized line.
                error_text = " ".join([e.strip() for e in error.xpath('.//text()')]).strip()
                error_text = " ".join(error_text.split()).strip() if bool(error_text) else error_text

                if (bool(error_text) and (error_text.startswith("[tpb124]") or
                                          error_text.startswith("[tpb43]") or
                                          error_text.startswith("[tpb10] Meta dc:Publisher") or
                                          error_text.startswith("[tpb10] Meta dc:Date") or
                                          error_text.startswith("[opf3g]") or
                                          'element "h1" not allowed here' in error_text or
                                          'element "h2" not allowed here' in error_text or
                                          'element "h3" not allowed here' in error_text or
                                          'element "h4" not allowed here' in error_text or
                                          'element "h5" not allowed here' in error_text or
                                          'element "h6" not allowed here' in error_text or
                                          'token "toc-brief" invalid' in error_text)):
                    continue  # ignore these messages

                if error_text.startswith("Incorrect file signature"):
                    # Inspect the magic number to allow some JPEG variants
                    # the validator rejects.
                    magic_number = error.xpath('*[@class="message-details"]/*[last()]/*[last()]/text()')[0]
                    magic_number = " ".join(magic_number.split()).strip() if bool(magic_number) else magic_number

                    # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46
                    if magic_number.startswith("0xFF 0xD8 0xFF 0xDB"):  # Also allow JPEG RAW
                        continue
                    elif magic_number[:19] == "0xFF 0xD8 0xFF 0xE1" and magic_number[30:] == ("0x45 0x78 0x69 0x66"):  # Also allow EXIF
                        continue
                    else:
                        dtbook_validate_status = "ERROR"
                        self.utils.report.error(error_text)
                else:
                    dtbook_validate_status = "ERROR"
                    self.utils.report.error(error_text)

        # Attach the validation report.
        if os.path.isfile(report_file):
            with open(report_file, 'r') as result_report:
                self.utils.report.attachment(result_report.readlines(),
                                             os.path.join(self.utils.report.reportDir(), "report-dtbook.html"),
                                             dtbook_validate_status)

        if dtbook_validate_status == "ERROR":
            self.utils.report.error("Klarte ikke å validere boken")
            return False
        if dtbook_validate_status == "WARN":
            self.utils.report.warn("DTBoken er ikke valid, men vi fortsetter alikevel.")

    self.utils.report.info("Konverterer fra Nordisk DTBook til Nordisk HTML...")
    temp_htmldir_obj = tempfile.TemporaryDirectory()
    temp_htmldir = temp_htmldir_obj.name
    temp_htmlfile = None
    with DaisyPipelineJob(self, "nordic-dtbook-to-html",
                          {"dtbook": os.path.basename(dtbook), "fail-on-error": "false", "no-legacy": "false"},
                          pipeline_and_script_version=[
                              ("1.13.6", "1.4.6"),
                              ("1.13.4", "1.4.5"),
                              ("1.12.1", "1.4.2"),
                              ("1.11.1-SNAPSHOT", "1.3.0"),
                          ],
                          context=dtbook_context) as dp2_job_dtbook_to_html:
        convert_status = "SUCCESS" if dp2_job_dtbook_to_html.status == "SUCCESS" else "ERROR"
        convert_report_file = os.path.join(dp2_job_dtbook_to_html.dir_output, "html-report/report.xhtml")
        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken fra DTBook til HTML")
            # Attach the conversion report before giving up.
            if os.path.isfile(convert_report_file):
                with open(convert_report_file, 'r') as result_report:
                    self.utils.report.attachment(result_report.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "report-dtbook-to-html.html"),
                                                 convert_status)
            return False

        dp2_html_dir = os.path.join(dp2_job_dtbook_to_html.dir_output, "output-dir")
        if not os.path.isdir(dp2_html_dir):
            self.utils.report.error("Finner ikke 'output-dir' for den konverterte boken: {}".format(dp2_html_dir))
            return False

        # Copy the HTML out of the job directory before it is cleaned up.
        Filesystem.copy(self.utils.report, dp2_html_dir, temp_htmldir)
        temp_htmlfile = os.path.join(temp_htmldir, metadata["identifier"] + ".xhtml")

    if not os.path.isfile(temp_htmlfile):
        self.utils.report.error("Finner ikke den konverterte boken: {}".format(temp_htmlfile))
        self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
        return False

    self.utils.report.info("Rydder opp i nordisk HTML")
    temp_html_xslt_output_obj = tempfile.NamedTemporaryFile()
    temp_html_xslt_output = temp_html_xslt_output_obj.name
    xslt = Xslt(self,
                stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir, NordicDTBookToEpub.uid, "nordic-cleanup-html.xsl"),
                source=temp_htmlfile,
                target=temp_html_xslt_output)
    if not xslt.success:
        return False
    # Replace the converted HTML with the cleaned-up version.
    shutil.copy(temp_html_xslt_output, temp_htmlfile)

    self.utils.report.info("Konverterer fra Nordisk HTML til Nordisk EPUB3...")

    # Create the context (relative path -> absolute path mapping) for the
    # Pipeline 2 job.
    html_dir = os.path.dirname(temp_htmlfile)
    html_context = {}
    for root, dirs, files in os.walk(html_dir):
        for file in files:
            fullpath = os.path.join(root, file)
            relpath = os.path.relpath(fullpath, html_dir)
            html_context[relpath] = fullpath

    temp_epub_file_obj = tempfile.NamedTemporaryFile()
    temp_epub_file = temp_epub_file_obj.name
    with DaisyPipelineJob(self, "nordic-html-to-epub3",
                          {"html": os.path.basename(temp_htmlfile), "fail-on-error": "false"},
                          pipeline_and_script_version=[
                              ("1.13.6", "1.4.6"),
                              ("1.13.4", "1.4.5"),
                              ("1.12.1", "1.4.2"),
                              ("1.11.1-SNAPSHOT", "1.3.0"),
                          ],
                          context=html_context) as dp2_job_html_to_epub:
        convert_status = "SUCCESS" if dp2_job_html_to_epub.status == "SUCCESS" else "ERROR"
        convert_report_file = os.path.join(dp2_job_html_to_epub.dir_output, "html-report/report.xhtml")
        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken")
            # Attach the conversion report before giving up.
            if os.path.isfile(convert_report_file):
                with open(convert_report_file, 'r') as result_report:
                    self.utils.report.attachment(result_report.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "report-html-to-epub3.html"),
                                                 convert_status)
            return False

        dp2_epub_file = os.path.join(dp2_job_html_to_epub.dir_output, "output-dir", metadata["identifier"] + ".epub")
        if not os.path.isfile(dp2_epub_file):
            self.utils.report.error("Finner ikke den konverterte boken: {}".format(dp2_epub_file))
            self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
            return False

        self.utils.report.info("Validerer Nordisk EPUB 3...")
        # NOTE(review): dp2_epub_file is a plain str from os.path.join, and
        # str has no .asFile() method — this looks like it would raise
        # AttributeError at runtime. Verify against the rest of the project.
        epub_file = dp2_epub_file.asFile()
        with DaisyPipelineJob(self, "nordic-epub3-validate",
                              {"epub": os.path.basename(epub_file)},
                              pipeline_and_script_version=[
                                  ("1.13.6", "1.4.6"),
                                  ("1.13.4", "1.4.5"),
                                  ("1.12.1", "1.4.2"),
                                  ("1.11.1-SNAPSHOT", "1.3.0"),
                              ],
                              context={os.path.basename(epub_file): epub_file}) as dp2_job_epub_validate:
            epub_validate_status = "SUCCESS" if dp2_job_epub_validate.status == "SUCCESS" else "ERROR"
            report_file = os.path.join(dp2_job_epub_validate.dir_output, "html-report/report.xhtml")

            if epub_validate_status == "ERROR":
                # Attach the intermediary HTML file for debugging.
                with open(temp_htmlfile, 'r') as intermediary_htmlfile:
                    self.utils.report.attachment(intermediary_htmlfile.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "intermediary-html.html"),
                                                 "DEBUG")

                # Downgrade to WARN, then inspect each reported error; any
                # unrecognized error escalates the status back to ERROR.
                epub_validate_status = "WARN"
                report_doc = ElementTree.parse(report_file)
                errors = report_doc.xpath('//*[@class="error" or @class="message-error"]')
                for error in errors:
                    # Flatten the error element to one whitespace-normalized line.
                    error_text = " ".join([e.strip() for e in error.xpath('.//text()')]).strip()
                    error_text = " ".join(error_text.split()).strip() if bool(error_text) else error_text

                    if (bool(error_text) and (error_text.startswith("[nordic280]") or
                                              "PKG-021: Corrupted image file encountered." in error_text)):
                        continue  # ignore these messages
                    else:
                        self.utils.report.warn("Not ignoring: {}".format(error_text))

                    if error_text.startswith("Incorrect file signature"):
                        # Inspect the magic number to allow some JPEG variants
                        # the validator rejects.
                        magic_number = error.xpath('*[@class="message-details"]/*[last()]/*[last()]/text()')[0]
                        magic_number = " ".join(magic_number.split()).strip() if bool(magic_number) else magic_number

                        # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46
                        if magic_number.startswith("0xFF 0xD8 0xFF 0xDB"):  # Also allow JPEG RAW
                            continue
                        elif magic_number[:19] == "0xFF 0xD8 0xFF 0xE1" and magic_number[30:] == ("0x45 0x78 0x69 0x66"):  # Also allow EXIF
                            continue
                        else:
                            epub_validate_status = "ERROR"
                            self.utils.report.error(error_text)
                    else:
                        epub_validate_status = "ERROR"
                        self.utils.report.error(error_text)

            # Attach the validation report.
            if os.path.isfile(report_file):
                with open(report_file, 'r') as result_report:
                    self.utils.report.attachment(result_report.readlines(),
                                                 os.path.join(self.utils.report.reportDir(), "report-epub3.html"),
                                                 epub_validate_status)

            if epub_validate_status == "ERROR":
                self.utils.report.error("Klarte ikke å validere EPUB 3-versjonen av boken")
                return False

            # Copy the EPUB out of the job directory before it is cleaned up.
            Filesystem.copy(self.utils.report, dp2_epub_file, temp_epub_file)

    epub = Epub(self.utils.report, temp_epub_file)
    if not epub.isepub():
        return False

    self.utils.report.info("Boken ble konvertert. Kopierer til EPUB3-fra-DTBook-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(epub.asDir(), metadata["identifier"], overwrite=self.overwrite)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = "{}: {} ble konvertert 👍😄 ({})".format(self.title, metadata["identifier"], metadata["title"])
    return True
def on_book(self):
    """Insert format-specific Bibliofil metadata into an EPUB and archive it.

    Validates the EPUB and its identifier, checks with Metadata.should_produce
    whether this publication format should be produced, inserts the
    format-specific metadata, and stores the result in the format archive.

    Returns:
        bool: True on success (or when the format should not be produced),
        False on any failure.

    FIX: several runtime strings in this method were cp855-mojibake
    (UTF-8 text mis-decoded as code page 855): "├Ц" was "å", "ЪўГЪЉј" was
    "😭👎", "Ъци" was "🤷", and "ЪЉЇЪўё" was "👍😄". They are restored below
    to match the equivalent messages elsewhere in this file.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    epub = Epub(self.utils.report, self.book["source"])
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass  # best effort: a missing dc:title just leaves the suffix empty

    # check that this is an EPUB (we only insert metadata into EPUBs)
    if not epub.isepub():
        return False

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        return False

    # The file name (minus extension) must match the dc:identifier.
    if epub.identifier() != self.book["name"].split(".")[0]:
        self.utils.report.error(self.book["name"] + ": Filnavn stemmer ikke overens med dc:identifier: {}".format(epub.identifier()))
        return False

    should_produce, metadata_valid = Metadata.should_produce(
        epub.identifier(), self.publication_format, report=self.utils.report)
    if not metadata_valid:
        self.utils.report.info("{} har feil i metadata for {}. Avbryter.".format(
            epub.identifier(), self.publication_format))
        self.utils.report.title = "{}: {} har feil i metadata for {} 😭👎 {}".format(
            self.title, epub.identifier(), self.publication_format, epubTitle)
        return False
    if not should_produce:
        self.utils.report.info("{} skal ikke produseres som {}. Avbryter.".format(
            epub.identifier(), self.publication_format))
        self.utils.report.title = "{}: {} Skal ikke produseres som {} 🤷 {}".format(
            self.title, epub.identifier(), self.publication_format, epubTitle)
        # Not producing this format is not an error.
        return True

    self.utils.report.info("Lager en kopi av EPUBen")
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
    temp_epub = Epub(self.utils.report, temp_epubdir)

    is_valid = Metadata.insert_metadata(
        self.utils.report, temp_epub,
        publication_format=self.publication_format,
        report_metadata_errors=False)
    if not is_valid:
        self.utils.report.error("Bibliofil-metadata var ikke valide. Avbryter.")
        return False

    self.utils.report.info("Boken ble oppdatert med format-spesifikk metadata. Kopierer til {}-arkiv.".format(self.publication_format))
    archived_path, stored = self.utils.filesystem.storeBook(temp_epub.asDir(), epub.identifier())
    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = "{}: {} har fått {}-spesifikk metadata og er klar til å produseres 👍😄 {}".format(
        self.title, epub.identifier(), self.publication_format, temp_epub.meta("dc:title"))
    return True
def on_book(self):
    """Validate a DAISY 2.02 audio book and move it to the output archive(s).

    Validates folder naming, ncc.html presence/encoding/metadata, file sets
    (including multi-volume siblings), runs the Pipeline1 validator, inserts
    the library-specific CSS, stores the book (optionally also to the samba
    share) and marks periodicals as available in Bibliofil.

    Returns True when the book is valid and stored, False otherwise.
    """
    self.utils.report.info("Validerer Daisy 2.02 lydbok")

    # Lazily locate Pipeline1 the first time we run.
    if self.dp1_home == "" or self.validator_script == "":
        if not self.init_environment():
            self.utils.report.error("Pipeline1 ble ikke funnet. Avbryter..")
            return False

    folder = self.book["name"]
    if self.book["name"].isnumeric() is False:
        # Non-numeric folders are typically "_2"-style multi-volume parts;
        # they are handled together with their main volume.
        self.utils.report.warn(f"{folder} er ikke et tall, prosesserer ikke denne boka. Mulig det er en multivolum bok.")
        self.utils.report.should_email = False
        return False

    if os.path.isdir(os.path.join(self.dir_out, folder)):
        self.utils.report.error(f"{folder} finnes allerede på share, avbryter.")
        return False

    if self.nlbsamba_out == "":
        self.nlbsamba_out = Config.get("nlbsamba.dir")
    if self.nlbsamba_out is None:
        self.nlbsamba_out = ""

    # Work on a copy so the source directory is never modified.
    temp_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

    if not os.path.isfile(os.path.join(temp_dir, "ncc.html")):
        self.utils.report.error("Finner ikke ncc fila")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎. Er dette en daisy 2.02 lydbok med en ncc.html fil?"
        return False

    try:
        # NOTE(review): .docinfo is an lxml API — ElementTree here is
        # presumably lxml.etree imported under that name; confirm at file top.
        ncc_tree = ElementTree.parse(os.path.join(temp_dir, "ncc.html"))
        ncc_encoding = ncc_tree.docinfo.encoding.lower()
        nccdoc = ncc_tree.getroot()
    except Exception:
        self.utils.report.info("Klarte ikke lese ncc fila. Sjekk loggen for detaljer.")
        self.utils.report.debug(traceback.format_exc(), preformatted=True)
        return False

    audio_title = nccdoc.xpath("string(//*[@name='dc:title']/@content)")
    edition_identifier = nccdoc.xpath("string(//*[@name='dc:identifier']/@content)")

    if ncc_encoding != 'utf-8':
        # BUGFIX: the original message was a plain string containing the
        # literal text "f{ncc_encoding}"; it was meant to be an f-string.
        self.utils.report.error(self.book["name"] + f": Encodingen til filen er ikke utf-8, ({ncc_encoding}) avbryter.")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    str_edition_identifier = str(edition_identifier)
    str_book_name = str(self.book["name"])
    if edition_identifier == "" or str_edition_identifier != str_book_name:
        self.utils.report.error(self.book["name"] + f": Klarte ikke å bestemme boknummer basert på dc:identifier. dc:identifier: {str_edition_identifier} mappenavn: {str_book_name}")
        self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
        return False

    self.utils.report.info("Henter metadata fra api.nlb.no")
    creative_work_metadata = None
    edition_metadata = None

    # The API is flaky; retry up to 5 times before giving up.
    timeout = 0
    while creative_work_metadata is None and timeout < 5:
        timeout = timeout + 1
        creative_work_metadata = Metadata.get_creative_work_from_api(edition_identifier, editions_metadata="all", use_cache_if_possible=True, creative_work_metadata="all")
        edition_metadata = Metadata.get_edition_from_api(edition_identifier)
        if creative_work_metadata is not None:
            break

    if creative_work_metadata is None:
        self.utils.report.warning("Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere.")
        return False

    library = edition_metadata["library"].lower()
    # in case of wrong upper lower cases
    if library == "nlb":
        library = "NLB"
    elif library == "statped":
        library = "Statped"
    elif library == "kabb":
        library = "KABB"

    # Periodicals (newspapers/magazines) use 12-digit edition numbers,
    # ordinary books 6-digit numbers.
    periodical = False
    if creative_work_metadata["newspaper"] is True or creative_work_metadata["magazine"] is True:
        periodical = True
        if len(edition_identifier) != 12:
            self.utils.report.error(f"Boka {edition_identifier} er en avis eller et magasin, men utgavenummeret har ikke 12 siffer")
            return False
    else:
        if len(edition_identifier) != 6:
            self.utils.report.error(f"Boka {edition_identifier} har ikke 6 siffer")
            return False

    # Size threshold (~650 MB minus a safety margin) above which we expect
    # the book to be split into multiple volumes.
    root_directory = Path(temp_dir)
    max_size = 702545920 - 20971520
    size = sum(f.stat().st_size for f in root_directory.glob('**/*') if f.is_file())
    multi_volume = False
    if size >= max_size:
        self.utils.report.info(f"{edition_identifier} er på størrelse {size}, sjekker om det er en multivolum bok.")
        multi_volume = True
    else:
        self.utils.report.info(f"{edition_identifier} er på størrelse {size} bytes")

    multi_volume_dirs = []
    if multi_volume:
        files_dir = os.listdir(self.dir_in)
        for file in files_dir:
            # Sibling volumes are named "<bookid>_<n>".
            if file.startswith(self.book["name"]) and file[-1].isdigit() and file[-2] == "_":
                self.utils.report.info(f"{file} er en del av multi volum boka {edition_identifier}")
                multi_volume_dirs.append(file)
                multi_volume_directory = Path(os.path.join(self.dir_in, file))
                # BUGFIX: was "multi_volume_size = size = sum(...)", which
                # accidentally clobbered `size` (unused afterwards, but wrong).
                multi_volume_size = sum(f.stat().st_size for f in multi_volume_directory.glob('**/*') if f.is_file())
                if multi_volume_size >= max_size:
                    self.utils.report.info(f" Multi volum mappen {file} er på størrelse {multi_volume_size}, dette er for stort")
                    self.utils.report.title = self.title + ": " + self.book["name"] + " Lydbok feilet 😭👎"
                    return False
                else:
                    multi_volume_files = os.listdir(multi_volume_directory)
                    self.utils.report.info(f"Validerer filer til multi volum {file}...")
                    if self.check_files(edition_identifier, multi_volume_files, library, multi_volume_directory, multi_volume) is False:
                        return False
        if len(multi_volume_dirs) <= 0:
            self.utils.report.error(f"{edition_identifier} bør være en multivolum bok, men har ikke flere multivolum mapper. Avbryter.")
            self.utils.report.title = self.title + ": " + self.book["name"] + "Lydbok feilet 😭👎"
            return False

    files_book = os.listdir(temp_dir)
    if "default.css" in files_book and library != "Statped":
        # Our own CSS is inserted later; blank out the book's default.css.
        self.utils.report.info("Erstatter default.css med en tom fil")
        open(os.path.join(temp_dir, "default.css"), 'w').close()

    self.utils.report.info("Validerer filer...")
    if self.check_files(edition_identifier, files_book, library, temp_dir, False) is False:
        return False

    dc_creator = nccdoc.xpath("string(//*[@name='dc:creator']/@content)")
    if not len(dc_creator) >= 1:
        self.utils.report.error(f"{edition_identifier} finner ikke dc:creator, dette må boka ha")
        return False

    dc_narrator = nccdoc.xpath("string(//*[@name='ncc:narrator']/@content)")
    if not len(dc_narrator) >= 1:
        self.utils.report.error(f"{edition_identifier} finner ikke ncc:narrator, dette må boka ha")
        return False

    multimedia_types = ["audioOnly", "audioNcc", "audioPartText", "audioFullText", "textPartAudio", "textNcc"]
    ncc_multimedia_type = nccdoc.xpath("string(//*[@name='ncc:multimediaType']/@content)")
    if ncc_multimedia_type not in multimedia_types:
        self.utils.report.error(f"{edition_identifier} har ikke en valid ncc:multimediaType, dette må boka ha. Multimediatype er {ncc_multimedia_type}")
        return False

    first_head_class = nccdoc.xpath("string(//*[local-name()='h1'][1]/@class)")
    second_head = nccdoc.xpath("string(//*[local-name()='h1'][2])").lower()
    accepted_second_head = ["lydbokavtalen", "audiobook agreement", "the audiobook agreement", "tigar announcement", "nlb"]

    if first_head_class != "title":
        self.utils.report.error(f"{edition_identifier} første heading {first_head_class} er ikke title")
        return False

    if second_head not in accepted_second_head and library == "NLB" and creative_work_metadata["newspaper"] is False and not (creative_work_metadata["magazine"] is True and library == "KABB"):
        self.utils.report.error(f"{edition_identifier} andre heading {second_head} er ikke Lydbokavtalen, Audiobook agreement, eller Tigar announcement")
        return False

    # Statped books skip the Pipeline1 validator.
    if library != "Statped":
        status = self.validate_book(os.path.join(temp_dir, "ncc.html"))
        if status == "ERROR" or status is False:
            self.utils.report.error("Pipeline validator: Boka er ikke valid. Se rapport.")
            return False
        self.utils.report.info("Pipeline validator: Boka er valid")

    if multi_volume:
        # Archive each sibling volume and remove it from the input directory.
        for folder in multi_volume_dirs:
            self.utils.report.debug(f"Flytter multivolum fil {folder}")
            archived_path_multi, stored = self.utils.filesystem.storeBook(os.path.join(self.dir_in, folder), folder)
            self.utils.report.attachment(None, archived_path_multi, "DEBUG")
            if self.nlbsamba_out != "":
                archived_path_samba_multi, stored_samba_multi = self.utils.filesystem.storeBook(os.path.join(self.dir_in, folder), folder, dir_out=self.nlbsamba_out)
                self.utils.report.attachment(None, archived_path_samba_multi, "DEBUG")
            shutil.rmtree(os.path.join(self.dir_in, folder))

    if library == "Statped":
        css_format = "Statped"
    elif edition_metadata["includesText"] is True:
        css_format = "daisy202"
    else:
        css_format = "daisy202-ncc"
    self.utils.report.info(f"Inserting CSS: {css_format}")
    if library != "Statped":
        self.utils.filesystem.insert_css(os.path.join(temp_dir, "default.css"), library, css_format)

    files_temp = os.listdir(temp_dir)

    archived_path, stored = self.utils.filesystem.storeBook(temp_dir, edition_identifier)
    if self.nlbsamba_out != "":
        archived_path_samba, stored_samba = self.utils.filesystem.storeBook(temp_dir, edition_identifier, dir_out=self.nlbsamba_out)
        self.utils.report.attachment(None, archived_path_samba, "DEBUG")

    files_out = os.listdir(os.path.join(self.dir_out, edition_identifier))

    # Verify the copies are complete (same file count), then drop a
    # ".donedaisy" marker file in each destination.
    if self.nlbsamba_out != "":
        if len(files_temp) == len(os.listdir(os.path.join(self.nlbsamba_out, edition_identifier))):
            with open(os.path.join(self.nlbsamba_out, edition_identifier, '.donedaisy'), 'w') as file:
                self.utils.report.debug(".donedaisy created")
        else:
            self.utils.report.error(f"MANGLER FILER i {self.nlbsamba_out}, sjekk utmappa")
            return False

    if len(files_temp) == len(files_out):
        with open(os.path.join(self.dir_out, edition_identifier, '.donedaisy'), 'w') as file:
            self.utils.report.debug(".donedaisy created")
    else:
        self.utils.report.error(f"MANGLER FILER i {self.dir_out}, sjekk utmappa")
        return False

    self.utils.report.info("Boka er godkjent og overført")

    if periodical:
        # Newspapers are announced without a title, magazines with one.
        available_title = ""
        if creative_work_metadata["newspaper"] is False:
            available_title = audio_title
        Bibliofil.book_available("DAISY 2.02", edition_identifier, title=available_title)

    self.utils.report.attachment(None, archived_path, "DEBUG")
    self.utils.report.title = self.title + ": " + edition_identifier + " er valid 👍😄" + audio_title
    self.utils.filesystem.deleteSource()
    return True
def plot(self, uids, name):
    """Render a Graphviz plot of the given pipelines and their directories.

    Builds a PNG graph (one node per pipeline, one per in/out directory,
    clustered by directory rank), renders it into self.report_dir, copies
    it into place with a retry loop (network-drive race condition), and
    makes sure the accompanying dashboard HTML file exists.

    uids -- pipeline uids to include in the plot
    name -- base name for the generated files (name.png, name.js, name.html)
    """
    dot = Digraph(name="Produksjonssystem", format="png")
    dot.graph_attr["bgcolor"] = "transparent"

    node_ranks = {}
    for rank in Directory.dirs_ranked:
        node_ranks[rank["id"]] = []

    # remember edges so that we don't plot them twice
    edges = {}

    for uid in uids:
        pipeline = None
        for p in self.pipelines:
            if p[0].uid == uid:
                pipeline = p
                break
        if not pipeline:
            continue

        group_pipeline = pipeline[0].get_current_group_pipeline()

        title = group_pipeline.get_group_title()
        pipeline_id = group_pipeline.get_group_id()  # re.sub(r"[^a-z\d]", "", title.lower())

        queue = group_pipeline.get_queue()

        queue_created = len([book for book in queue if Pipeline.get_main_event(book) == "created"]) if queue else 0
        queue_deleted = len([book for book in queue if Pipeline.get_main_event(book) == "deleted"]) if queue else 0
        queue_modified = len([book for book in queue if Pipeline.get_main_event(book) == "modified"]) if queue else 0
        queue_triggered = len([book for book in queue if Pipeline.get_main_event(book) == "triggered"]) if queue else 0
        queue_autotriggered = len([book for book in queue if Pipeline.get_main_event(book) == "autotriggered"]) if queue else 0

        queue_string = []
        if queue_created:
            queue_string.append("nye:" + str(queue_created))
        if queue_modified:
            queue_string.append("endret:" + str(queue_modified))
        if queue_deleted:
            queue_string.append("slettet:" + str(queue_deleted))
        if queue_triggered:
            queue_string.append("trigget:" + str(queue_triggered))
        if queue_autotriggered:
            queue_string.append("autotrigget:" + str(queue_autotriggered))
        queue_string = ", ".join(queue_string)

        queue_size = 0
        if queue:
            queue_size = len(queue)
            if not group_pipeline.should_handle_autotriggered_books():
                queue_size -= queue_autotriggered

        book = Metadata.pipeline_book_shortname(group_pipeline)

        # --- input directory: rank, relative path and network path label ---
        relpath_in = None
        netpath_in = ""
        rank_in = None
        if pipeline[0].dir_in:
            for rank in Directory.dirs_ranked:
                for dir in rank["dirs"]:
                    if os.path.normpath(pipeline[0].dir_in) == os.path.normpath(rank["dirs"][dir]):
                        rank_in = rank["id"]
                        break  # NOTE: only breaks the inner loop (as before)
        if pipeline[0].dir_in and not pipeline[0].dir_base:
            relpath_in = os.path.basename(os.path.dirname(pipeline[0].dir_in))
        elif pipeline[0].dir_in and pipeline[0].dir_base:
            base_path = Filesystem.get_base_path(pipeline[0].dir_in, pipeline[0].dir_base)
            relpath_in = os.path.relpath(pipeline[0].dir_in, base_path)
            if "master" in pipeline[0].dir_base and pipeline[0].dir_base["master"] == base_path:
                pass
            else:
                # Cache network paths/hosts; resolving them is slow.
                if pipeline[0].dir_in not in self.buffered_network_paths:
                    smb, file, unc = Filesystem.networkpath(pipeline[0].dir_in)
                    host = Filesystem.get_host_from_url(smb)
                    self.buffered_network_paths[pipeline[0].dir_in] = smb
                    self.buffered_network_hosts[pipeline[0].dir_in] = host
                netpath_in = self.buffered_network_hosts[pipeline[0].dir_in]
                if not netpath_in:
                    netpath_in = self.buffered_network_paths[pipeline[0].dir_in]
        book_count_in = self.get_book_count(pipeline[0].dir_in)
        label_in = "< <font point-size='24'>{}</font>{}{} >".format(
            relpath_in,
            "\n<br/><i><font point-size='20'>{} {}</font></i>".format(book_count_in, "bok" if book_count_in == 1 else "bøker"),
            "\n<br/><i><font point-size='20'>{}</font></i>".format(netpath_in.replace("\\", "\\\\")) if netpath_in else "")

        # --- output directory: rank, relative path and network path label ---
        relpath_out = None
        netpath_out = ""
        rank_out = None
        if pipeline[0].dir_out:
            for rank in Directory.dirs_ranked:
                for dir in rank["dirs"]:
                    if os.path.normpath(pipeline[0].dir_out) == os.path.normpath(rank["dirs"][dir]):
                        rank_out = rank["id"]
                        break
        if pipeline[0].dir_out and not pipeline[0].dir_base:
            relpath_out = os.path.basename(os.path.dirname(pipeline[0].dir_out))
        elif pipeline[0].dir_out and pipeline[0].dir_base:
            base_path = Filesystem.get_base_path(pipeline[0].dir_out, pipeline[0].dir_base)
            relpath_out = os.path.relpath(pipeline[0].dir_out, base_path)
            if "master" in pipeline[0].dir_base and pipeline[0].dir_base["master"] == base_path:
                pass
            else:
                if pipeline[0].dir_out not in self.buffered_network_paths:
                    smb, file, unc = Filesystem.networkpath(pipeline[0].dir_out)
                    host = Filesystem.get_host_from_url(smb)
                    # FIX: cache the SMB path, like the dir_in branch above
                    # (this previously cached `unc`, inconsistently).
                    self.buffered_network_paths[pipeline[0].dir_out] = smb
                    self.buffered_network_hosts[pipeline[0].dir_out] = host
                netpath_out = self.buffered_network_hosts[pipeline[0].dir_out]
                if not netpath_out:
                    netpath_out = self.buffered_network_paths[pipeline[0].dir_out]
        book_count_out = self.get_book_count(pipeline[0].dir_out, pipeline[0].parentdirs)
        label_out = "< <font point-size='24'>{}</font>{}{} >".format(
            relpath_out,
            "\n<br/><i><font point-size='20'>{} {}</font></i>".format(book_count_out, "bok" if book_count_out == 1 else "bøker"),
            "\n<br/><i><font point-size='20'>{}</font></i>".format(netpath_out.replace("\\", "\\\\")) if netpath_out else "")

        # Place the pipeline node in the output rank if known, otherwise
        # just after its input rank.
        if rank_out:
            node_ranks[rank_out].append(pipeline_id)
        elif rank_in:
            next_rank = self.next_rank(rank_in)
            if next_rank:
                node_ranks[next_rank].append(pipeline_id)
            else:
                node_ranks[rank_in].append(pipeline_id)

        state = group_pipeline.get_state()
        status = group_pipeline.get_status()
        progress_text = group_pipeline.get_progress()
        pipeline_label = "< <font point-size='26'>{}</font>{} >".format(
            title,
            "".join(["\n<br/><i><font point-size='22'>{}</font></i>".format(val) for val in [queue_string, progress_text, status] if val]))

        # Node color reflects pipeline state.
        fillcolor = "lightskyblue1"
        if book or queue_size:
            fillcolor = "lightslateblue"
        elif state == "considering":
            fillcolor = "lightskyblue3"
        elif not group_pipeline.running:
            fillcolor = "white"
        elif isinstance(group_pipeline, DummyPipeline):
            fillcolor = "snow"
        dot.attr("node", shape="box", style="filled", fillcolor=fillcolor)
        dot.node(pipeline_id, pipeline_label.replace("\\", "\\\\"))

        if relpath_in:
            fillcolor = "wheat"
            if not pipeline[0].dir_in_obj or not pipeline[0].dir_in_obj.is_available():
                fillcolor = "white"
            dot.attr("node", shape="folder", style="filled", fillcolor=fillcolor)
            dot.node(pipeline[1], label_in)
            if pipeline[1] not in edges:
                edges[pipeline[1]] = []
            if pipeline_id not in edges[pipeline[1]]:
                edges[pipeline[1]].append(pipeline_id)
                dot.edge(pipeline[1], pipeline_id)
            node_ranks[rank_in].append(pipeline[1])

        if relpath_out:
            fillcolor = "wheat"
            if not pipeline[0].dir_out_obj or not pipeline[0].dir_out_obj.is_available():
                fillcolor = "white"
            dot.attr("node", shape="folder", style="filled", fillcolor=fillcolor)
            dot.node(pipeline[2], label_out)
            if pipeline_id not in edges:
                edges[pipeline_id] = []
            if pipeline[2] not in edges[pipeline_id]:
                edges[pipeline_id].append(pipeline[2])
                dot.edge(pipeline_id, pipeline[2])
            node_ranks[rank_out].append(pipeline[2])

    # One dotted cluster per directory rank.
    for rank in node_ranks:
        subgraph = Digraph("cluster_" + rank, graph_attr={"style": "dotted"})
        subgraph.graph_attr["bgcolor"] = "#FFFFFFAA"
        if node_ranks[rank]:
            subgraph.attr("node", shape="none", style="filled", fillcolor="transparent")
            subgraph.node("_ranklabel_" + rank, "< <i><font point-size='28'>{}</font></i> >".format(" <br/>".join(str(self.rank_name(rank)).split(" "))))
        for dir in node_ranks[rank]:
            subgraph.node(dir)
        dot.subgraph(subgraph)

    dot.render(os.path.join(self.report_dir, name + "_"))

    # there seems to be some race condition when doing this across a mounted network drive,
    # so if we get an exception we retry a few times and hope that it works.
    # see: https://github.com/nlbdev/produksjonssystem/issues/81
    for t in reversed(range(10)):
        try:
            shutil.copyfile(os.path.join(self.report_dir, name + "_.png"), os.path.join(self.report_dir, name + ".png"))
            with open(os.path.join(self.report_dir, name + ".js"), "w") as javascript_file:
                javascript_file.write("setTime(\"{}\");".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            break
        except Exception as e:
            logging.debug(" Unable to copy plot image: {}".format(os.path.join(self.report_dir, name + "_.png")))
            time.sleep(0.5)
            if t == 0:
                raise e

    dashboard_file = os.path.join(self.report_dir, name + ".html")
    if not os.path.isfile(dashboard_file):
        dashboard_template = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../dashboard.html'))
        if not os.path.exists(self.report_dir):
            os.makedirs(self.report_dir)
        shutil.copyfile(dashboard_template, dashboard_file)
def on_book(self):
    """Convert a single-spine NLBPUB/EPUB into a validated DTBook for TTS.

    Copies the HTML out of the EPUB, strips EPUB-specific files, runs a
    chain of XSLT transformations (Webarch fixup, audiobook agreement,
    optional page-number removal for magazines, HTML→DTBook, cleanup,
    complex-content optimization), validates the DTBook with RelaxNG and
    Schematron, and stores the result in the DTBook archive.

    Returns True on success, False on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    self.utils.report.info("Locating HTML file")
    epub = Epub(self.utils.report, self.book["source"])
    # We only handle EPUBs (the redundant `assert epub.isepub()` that
    # followed this guard has been removed: it was unreachable, and
    # asserts are stripped under `python -O`).
    if not epub.isepub():
        return False
    spine = epub.spine()
    if not len(spine) == 1:
        self.utils.report.warn("There must only be one item in the EPUB spine")
        return False
    html_file = os.path.join(self.book["source"], os.path.dirname(epub.opf_path()), spine[0]["href"])

    identifier = epub.identifier()

    # Work on a copy so the source directory is never modified.
    self.utils.report.info("lag en kopi av boka")
    temp_resultdir_obj = tempfile.TemporaryDirectory()
    temp_resultdir = temp_resultdir_obj.name
    Filesystem.copy(self.utils.report, os.path.dirname(html_file), temp_resultdir)
    temp_result = os.path.join(temp_resultdir, identifier + ".xml")

    self.utils.report.info("sletter EPUB-spesifikke filer")
    for root, dirs, files in os.walk(temp_resultdir):
        for file in files:
            if Path(file).suffix.lower() in [".xhtml", ".html", ".smil", ".mp3", ".wav", ".opf"]:
                os.remove(os.path.join(root, file))
    shutil.copy(html_file, temp_result)

    # Scratch file reused as the target of each XSLT pass.
    temp_xslt_output_obj = tempfile.NamedTemporaryFile()
    temp_xslt_output = temp_xslt_output_obj.name

    # MATHML to stem
    self.utils.report.info("Erstatter evt. MathML i boka...")
    mathml_validation = Mathml_validator(self, source=temp_result)
    if not mathml_validation.success:
        return False

    mathML_result = Mathml_to_text(self, source=temp_result, target=temp_result)
    if not mathML_result.success:
        return False

    self.utils.report.info("Fikser Webarch-oppmerking")
    self.utils.report.debug("webarch-fixup.xsl")
    self.utils.report.debug("    source = " + temp_result)
    self.utils.report.debug("    target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToNarrationEpub.uid, "webarch-fixup.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    self.utils.report.info("Setter inn lydbokavtalen...")
    self.utils.report.debug("bokinfo-tts-dtbook.xsl")
    self.utils.report.debug("    source = " + temp_result)
    self.utils.report.debug("    target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "bokinfo-tts-dtbook.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    # The API is flaky; retry up to 5 times before giving up.
    creative_work_metadata = None
    timeout = 0
    while creative_work_metadata is None and timeout < 5:
        timeout = timeout + 1
        creative_work_metadata = Metadata.get_creative_work_from_api(identifier, editions_metadata="all", use_cache_if_possible=True, creative_work_metadata="all")
        if creative_work_metadata is not None:
            if creative_work_metadata["magazine"] is True:
                # Magazines should not keep print page numbers.
                self.utils.report.info("Fjerner sidetall fordi det er et tidsskrift...")
                self.utils.report.debug("remove-pagenum.xsl")
                self.utils.report.debug("    source = " + temp_result)
                self.utils.report.debug("    target = " + temp_xslt_output)
                xslt = Xslt(self,
                            stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "remove-pagenum.xsl"),
                            source=temp_result,
                            target=temp_xslt_output)
                if not xslt.success:
                    return False
                shutil.copy(temp_xslt_output, temp_result)
            break

    if creative_work_metadata is None:
        # Best effort: convert anyway even without creative-work metadata.
        self.utils.report.warning("Klarte ikke finne et åndsverk tilknyttet denne utgaven. Konverterer likevel.")

    # Copy the library logo into the book, if we have one for this library.
    library = epub.meta("schema:library")
    library = library.upper() if library else library
    logo = os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "{}_logo.png".format(library))
    if os.path.isfile(logo):
        # epub_dir = os.path.join(temp_resultdir, "EPUB")
        image_dir = os.path.join(temp_resultdir, "images")
        if not os.path.isdir(image_dir):
            os.mkdir(image_dir)
        shutil.copy(logo, image_dir)

    self.utils.report.info("Konverterer fra XHTML5 til DTBook...")
    self.utils.report.debug("html-to-dtbook.xsl")
    self.utils.report.debug("    source = " + temp_result)
    self.utils.report.debug("    target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "html-to-dtbook.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    self.utils.report.info("Gjør tilpasninger i DTBook")
    self.utils.report.debug("dtbook-cleanup.xsl")
    self.utils.report.debug("    source = " + temp_result)
    self.utils.report.debug("    target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "dtbook-cleanup.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    # Fjern denne transformasjonen hvis det oppstår kritiske proplemer med håndteringen av komplekst innhold
    self.utils.report.info("Legger inn ekstra informasjon om komplekst innhold")
    self.utils.report.debug("optimaliser-komplekst-innhold.xsl")
    self.utils.report.debug("    source = " + temp_result)
    self.utils.report.debug("    target = " + temp_xslt_output)
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "optimaliser-komplekst-innhold.xsl"),
                source=temp_result,
                target=temp_xslt_output)
    if not xslt.success:
        return False
    shutil.copy(temp_xslt_output, temp_result)

    self.utils.report.info("Validerer DTBook...")

    # NOTE: This RelaxNG schema assumes that we're using DTBook 2005-3 and MathML 3.0
    dtbook_relax = Relaxng(self,
                           relaxng=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "dtbook-schema/rng/dtbook-2005-3.mathml-3.integration.rng"),
                           source=temp_result)
    dtbook_sch = Schematron(self,
                            schematron=os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid, "dtbook-schema/sch/dtbook.mathml.sch"),
                            source=temp_result)
    if not dtbook_relax.success:
        self.utils.report.error("Validering av DTBook feilet (RelaxNG)")
    if not dtbook_sch.success:
        self.utils.report.error("Validering av DTBook feilet (Schematron)")
    if not dtbook_relax.success or not dtbook_sch.success:
        # Keep the failing intermediate file so it can be inspected.
        tempfile_stored = os.path.join(self.utils.report.reportDir(), os.path.basename(temp_result))
        shutil.copy(temp_result, tempfile_stored)
        self.utils.report.info(f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}")
        self.utils.report.attachment(None, tempfile_stored, "DEBUG")
        return False

    self.utils.report.info("Boken ble konvertert. Kopierer til DTBook-arkiv.")
    archived_path, stored = self.utils.filesystem.storeBook(temp_resultdir, identifier)
    self.utils.report.attachment(None, archived_path, "DEBUG")
    return True
def on_book(self):
    """Convert a Nordic EPUB 3 into an NLBPUB and store it in the archive.

    The book is copied twice: once with real images and once with the image
    files emptied (so Pipeline 2 works on a small payload). After nav/opf
    cleanup and the Pipeline 2 "nordic-epub3-to-html" conversion, the HTML is
    cleaned with a series of XSLTs, wrapped back into an EPUB structure, and
    the real images are restored before archiving.

    Returns True on success, False on any failure.
    """
    self.utils.report.attachment(None, self.book["source"], "DEBUG")

    epub = Epub(self.utils.report, self.book["source"])

    # Best-effort title for nicer report headings.
    epubTitle = ""
    try:
        epubTitle = " (" + epub.meta("dc:title") + ") "
    except Exception:
        pass

    # sjekk at dette er en EPUB
    if not epub.isepub():
        return False

    if not epub.identifier():
        self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
        return False

    if epub.identifier() != self.book["name"].split(".")[0]:
        self.utils.report.error(self.book["name"] + ": Filnavn stemmer ikke overens med dc:identifier: {}".format(epub.identifier()))
        return False

    # Scratch file reused as the target of each XSLT pass.
    temp_xml_file_obj = tempfile.NamedTemporaryFile()
    temp_xml_file = temp_xml_file_obj.name

    self.utils.report.info("Lager en kopi av EPUBen")
    temp_epubdir_withimages_obj = tempfile.TemporaryDirectory()
    temp_epubdir_withimages = temp_epubdir_withimages_obj.name
    Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir_withimages)

    # Second copy where every image is truncated to zero bytes; the real
    # images are copied back in at the end.
    self.utils.report.info("Lager en kopi av EPUBen med tomme bildefiler")
    temp_epubdir_obj = tempfile.TemporaryDirectory()
    temp_epubdir = temp_epubdir_obj.name
    Filesystem.copy(self.utils.report, temp_epubdir_withimages, temp_epubdir)
    for root, dirs, files in os.walk(os.path.join(temp_epubdir, "EPUB", "images")):
        for file in files:
            fullpath = os.path.join(root, file)
            os.remove(fullpath)
            Path(fullpath).touch()

    temp_epub = Epub(self.utils.report, temp_epubdir)

    self.utils.report.info("Rydder opp i nordisk EPUB nav.xhtml")
    nav_path = os.path.join(temp_epubdir, temp_epub.nav_path())
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "nordic-cleanup-nav.xsl"),
                source=nav_path,
                target=temp_xml_file,
                parameters={
                    "cover": " ".join([item["href"] for item in temp_epub.spine()]),
                    "base": os.path.dirname(os.path.join(temp_epubdir, temp_epub.opf_path())) + "/",
                })
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, nav_path)

    self.utils.report.info("Rydder opp i nordisk EPUB package.opf")
    opf_path = os.path.join(temp_epubdir, temp_epub.opf_path())
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "nordic-cleanup-opf.xsl"),
                source=opf_path,
                target=temp_xml_file)
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, opf_path)

    html_dir_obj = tempfile.TemporaryDirectory()
    html_dir = html_dir_obj.name
    html_file = os.path.join(html_dir, epub.identifier() + ".xhtml")

    # Determine the owning library, preferring the edition metadata from
    # the API and falling back to deriving it from the identifier.
    self.utils.report.info("Finner ut hvilket bibliotek boka tilhører…")
    edition_metadata = Metadata.get_edition_from_api(epub.identifier(), report=self.utils.report)
    library = None
    if edition_metadata is not None and edition_metadata["library"] is not None:
        library = edition_metadata["library"]
    else:
        library = Metadata.get_library_from_identifier(epub.identifier(), self.utils.report)
    self.utils.report.info(f"Boka tilhører '{library}'")

    self.utils.report.info("Zipper oppdatert versjon av EPUBen...")
    temp_epub.asFile(rebuild=True)

    self.utils.report.info("Konverterer fra Nordisk EPUB 3 til Nordisk HTML 5...")
    epub_file = temp_epub.asFile()
    with DaisyPipelineJob(self,
                          "nordic-epub3-to-html",
                          {"epub": os.path.basename(epub_file), "fail-on-error": "false"},
                          pipeline_and_script_version=[
                              ("1.13.6", "1.4.6"),
                              ("1.13.4", "1.4.5"),
                              ("1.12.1", "1.4.2"),
                              ("1.11.1-SNAPSHOT", "1.3.0"),
                          ],
                          context={os.path.basename(epub_file): epub_file}) as dp2_job_convert:
        convert_status = "SUCCESS" if dp2_job_convert.status == "SUCCESS" else "ERROR"

        if convert_status != "SUCCESS":
            self.utils.report.error("Klarte ikke å konvertere boken")
            return False

        dp2_html_dir = os.path.join(dp2_job_convert.dir_output, "output-dir", epub.identifier())
        dp2_html_file = os.path.join(dp2_job_convert.dir_output, "output-dir", epub.identifier(), epub.identifier() + ".xhtml")

        if not os.path.isdir(dp2_html_dir):
            self.utils.report.error("Finner ikke den konverterte boken: {}".format(dp2_html_dir))
            return False

        if not os.path.isfile(dp2_html_file):
            self.utils.report.error("Finner ikke den konverterte boken: {}".format(dp2_html_file))
            self.utils.report.info("Kanskje filnavnet er forskjellig fra IDen?")
            return False

        Filesystem.copy(self.utils.report, dp2_html_dir, html_dir)

    self.utils.report.info("Rydder opp i nordisk HTML")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "nordic-cleanup.xsl"),
                source=html_file,
                target=temp_xml_file)
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, html_file)

    self.utils.report.info("Rydder opp i ns0 i page-normal")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "ns0-cleanup.xsl"),
                source=html_file,
                target=temp_xml_file)
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, html_file)

    self.utils.report.info("Rydder opp i innholdsfortegnelsen")
    xslt = Xslt(self,
                stylesheet=os.path.join(Xslt.xslt_dir, NordicToNlbpub.uid, "fix-toc-span.xsl"),
                source=html_file,
                target=temp_xml_file)
    if not xslt.success:
        return False
    shutil.copy(temp_xml_file, html_file)

    self.utils.report.info("Legger til EPUB-filer (OPF, NAV, container.xml, mediatype)...")
    nlbpub_tempdir_obj = tempfile.TemporaryDirectory()
    nlbpub_tempdir = nlbpub_tempdir_obj.name

    nlbpub = Epub.from_html(self, html_dir, nlbpub_tempdir)
    if nlbpub is None:
        return False

    self.utils.report.info("Erstatter tomme bildefiler med faktiske bildefiler")
    for root, dirs, files in os.walk(os.path.join(nlbpub_tempdir, "EPUB", "images")):
        for file in files:
            fullpath = os.path.join(root, file)
            relpath = os.path.relpath(fullpath, nlbpub_tempdir)
            os.remove(fullpath)
            Filesystem.copy(self.utils.report, os.path.join(temp_epubdir_withimages, relpath), fullpath)

    # NOTE(review): temp_epub is re-instantiated here as in the original;
    # presumably to re-read metadata after the earlier XSLT edits — confirm.
    temp_epub = Epub(self.utils.report, temp_epubdir)

    nlbpub.update_prefixes()

    self.utils.report.info("Boken ble konvertert. Kopierer til NLBPUB-arkiv.")

    archived_path, stored = self.utils.filesystem.storeBook(nlbpub.asDir(), temp_epub.identifier(), overwrite=self.overwrite)
    self.utils.report.attachment(None, archived_path, "DEBUG")

    self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
    return True