예제 #1
0
    def _update_md5(self, name):
        """Refresh the cached shallow and deep MD5 checksums for `name`."""
        assert self.dir_path is not None, "Cannot get MD5 checksum for {} when there is no input directory".format(
            name)

        path = os.path.join(self.dir_path, name)

        # `name` must be a direct child of the input directory.
        assert "/" not in name

        with self._md5_lock:
            previous = self._md5[name] if name in self._md5 else None

            shallow_md5, _ = Filesystem.path_md5(
                path=path,
                shallow=True,
                expect=previous["shallow"] if previous is not None else None)
            deep_md5, modified = Filesystem.path_md5(
                path=path,
                shallow=False,
                expect=previous["deep"] if previous is not None else None)

            # Keep the most recent modification time we have seen.
            if not modified:
                modified = 0
            if previous is not None:
                modified = max(modified, previous["modified"])

            self._md5[name] = {
                "shallow": shallow_md5,
                "shallow_checked": int(time.time()),
                "deep": deep_md5,
                "deep_checked": int(time.time()),
                "modified": modified,
            }
예제 #2
0
    def get_results(self):
        """Download this job's result zip and unzip it into `self.dir_output`."""
        # The NamedTemporaryFile object keeps the zip alive for the
        # whole method; it is cleaned up when the object is collected.
        result_obj = tempfile.NamedTemporaryFile(
            prefix="daisy-pipeline-results-", suffix=".zip")
        zip_path = result_obj.name

        url = DaisyPipelineJob.encode_url(
            self.engine, "/jobs/{}/result".format(self.job_id), {})

        # Stream the response body straight to disk.
        with requests.get(url, stream=True) as response:
            with open(zip_path, 'wb') as target:
                shutil.copyfileobj(response.raw, target)

        # Only unzip when we actually received some data.
        if os.path.isfile(zip_path) and os.path.getsize(zip_path) > 0:
            Filesystem.unzip(self.pipeline.utils.report, zip_path,
                             self.dir_output)
예제 #3
0
    def __init__(self,
                 pipeline=None,
                 source=None,
                 stdout_level="INFO",
                 stderr_level="INFO",
                 cwd=None):
        """Run Epubcheck on `source` and store the outcome in `self.success`.

        Args:
            pipeline: pipeline whose `utils.report` receives log output (required).
            source: path to the EPUB, or to its package `.opf` file (required).
            stdout_level: report level for the subprocess stdout.
            stderr_level: report level for the subprocess stderr.
            cwd: working directory for the java process; defaults to the temp dir.
        """
        assert pipeline
        assert source

        if not cwd:
            cwd = tempfile.gettempdir()

        self.success = False

        Epubcheck.init_environment()

        # epubcheck works better when the input is zipped
        if source.lower().endswith(".opf"):
            pipeline.utils.report.debug("EPUB is not zipped, zipping…")
            # Walk upwards from the OPF until we find the EPUB root: the
            # directory that contains `mimetype` or `META-INF`.
            root_path = os.path.dirname(source)
            while True:
                # At the filesystem root, dirname(root) == root; reaching it
                # without finding the EPUB root is a hard error.
                assert root_path != os.path.dirname(
                    root_path
                ), "No mimetype file or META-INF directory found in the EPUB, unable to determine root directory"
                is_root = False
                for filename in os.listdir(root_path):
                    if filename == "mimetype" or filename == "META-INF":
                        is_root = True
                        break
                if is_root:
                    break
                else:
                    root_path = os.path.dirname(root_path)

            epub = Epub(pipeline.utils.report, root_path)
            source = epub.asFile()

        try:
            command = ["java", "-jar", Epubcheck.epubcheck_jar]
            command.append(source)

            pipeline.utils.report.debug("Running Epubcheck")
            process = Filesystem.run_static(command,
                                            cwd,
                                            pipeline.utils.report,
                                            stdout_level=stdout_level,
                                            stderr_level=stderr_level)
            self.success = process.returncode == 0

        except subprocess.TimeoutExpired:
            # Fixed grammar in the user-facing message ("was", not "were").
            pipeline.utils.report.error(
                "Epubcheck for {} took too long and was therefore stopped.".
                format(os.path.basename(source)))

        except Exception:
            pipeline.utils.report.debug(traceback.format_exc(),
                                        preformatted=True)
            # Fixed typo in the user-facing message ("occurred").
            pipeline.utils.report.error(
                "An error occurred while running Epubcheck (for " +
                str(source) + ")")
예제 #4
0
    def _update_book_count_thread(self):
        """Background loop that periodically refreshes `self.book_count`.

        Runs until `self.should_run` goes False. Each entry is recounted at
        most every 15 seconds; errors are logged and the loop keeps going.
        """
        while self.should_run:
            time.sleep(1)
            try:
                # Copy the keys so concurrent mutation of book_count is safe.
                for dir in list(self.book_count.keys()):
                    dirs = []
                    parentdirs = self.book_count[dir]["parentdirs"]
                    if parentdirs:
                        # Count books across every configured parent subdirectory.
                        for parentdir in parentdirs:
                            dirs.append(os.path.join(dir, parentdirs[parentdir]))
                    else:
                        dirs.append(dir)
                    # Throttle: recount at most every 15 seconds per directory.
                    if (self.book_count[dir]["modified"] + 15 < time.time()):
                        books = []
                        for d in dirs:
                            if os.path.isdir(d):
                                books += Filesystem.list_book_dir(d)
                        self.book_count[dir]["modified"] = time.time()
                        # set(): the same book may appear in several parentdirs.
                        self.book_count[dir]["count"] = len(set(books))

                    if not self.should_run:
                        break

            except Exception:
                logging.exception("An error occurred while updating book count")
예제 #5
0
    def format_email_report(content, dirs, dir_log, logfile, book_archive):
        """Format the daily report as an HTML email body.

        `content` is a list of pre-tagged text lines: "(li) " becomes a list
        item, "(href) " becomes a file:// link (only for paths under `dirs`
        or for `logfile`), "mail: " becomes a bold paragraph with a link, and
        lines containing "[" become bold headings. Links are decorated with
        the embedded penguin image below.

        NOTE(review): `dir_log` and `book_archive` appear unused here —
        confirm against callers before removing them.
        """
        # Formats the daily report message in html format for email.
        # img_string is a base64-encoded PNG (a penguin, for linux) embedded
        # directly in the HTML so the email needs no external image resources.

        img_string = ("<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAYCAYAAADzoH0MAAAABmJLR0QA/wD/AP+gvaeTAAAACXBIWXMAAA"
                      "sTAAALEwEAmpwYAAAAB3RJTUUH4goFCTApeBNtqgAAA2pJREFUOMt1lF1rI2UYhu/JfCST6bRp2kyCjmWzG0wllV1SULTSyoLpcfHU5jyLP6IUQX+"
                      "DLqw/wBbPWiUeaLHBlijpiZbNR9PdDUPKSjL5mszX48Ha6U6HveGBeeGd67nf+3lnGCIi3FKv10e9/hQMw+Du3XuYn4/hjaJbqtVqtL29Tfn8KuXz"
                      "q1QsFqlardKb5AFs26ZyuUzZbJZUVSVFUUgQBBIEgTKZDB0cHJDjOEGAaZrU6XTo6OiICoUCqapKxWKRdnd3aXZ2liRJIkmSaHNzkzqdThDw5Mn3t"
                      "La2Rul0mmKxGOXzq3R4eEiNRoMWFxdJlmWSZZkymQxVKpUAgFtaUvH5w3t43jLx429jXF62sb+/j6urK9i2DZZlAQCu68IwjECG3MbGp7h//wFedp"
                      "9Bc77BTz+Xsbe3BwDeywAgCALC4XAAEGJZFgsLC3j3vQcoPfoSiqKAZdlADYdDnJ2dBQDszs7OzvVCVVXE4/MwXv4NnmMxI8/AcUOwbRuu60LXdWx"
                      "tbYHn+RsHPjuhEBJxEV9/McK3JQsPV+dfnZPjwHEczs/PUS7/4j/C64tut4uZyA9Y+sRG8kMWf/zjwLZthEIhhEIhWJaFx4+/84XpAWzbRvvyL7z/"
                      "cQvMOzKO2wq07r9e9+tqNpuo1WpBQK/XgyQ/gyh8BGADv/+agOu6gTBN00SlUrkZ4/WDruuIzX4ABp9hqA/R6XzlC+t1XVxcYDweIxqN3jgwTRMC/"
                      "xZc+22MR3GY5qvuHMdBEASfi36/j8lk4ncwnU7Bshwsy4JlWV76kiSB4zj0+33Pgeu6cBzHDyAiOI6N6ZQBy7KQJAk8zyORSMAwDIxGIw8giiI4jv"
                      "eH6LouRqMRDGMChmGQTqcRDoeRyWQQDofB87xX8Xgc0ajodyAIAgaDgdelUChA0zTkciuo1+vgOG8rUqkUIpGIHxCPx9FqtbyNc3NzKJVK0DQNROS"
                      "biKIkg2NMJpPQdR2NRhOpVNL7Eh3HgSAIPoBhTEBEYBjmBsCyLJaXlyHLMk5PTyGKIkRRRCQSgaIoGI/HHuD4+Bi5XA4rKytgbv+VNU1Dtfon6vWn"
                      "4Hked+6k0ev1cHJyghcvnnsjlmUZ6+vrQYDjOLAsC5OJAdd1EI1G/78nJtrtCzSaTQz0AVKpJLLZLP4DF17fodMaIVYAAAAASUVORK5CYII")
    # + last part: "=\" alt=\"DATA\">")

        message = ""
        first_dir_log = True
        # Safety valve: give up after 10 minutes and return what we have.
        timeout = 600
        timeout_start = time.time()

        for line in content:
            if time.time() > timeout_start + timeout:
                return message
            if "(li) " in line:
                line = line.replace("(li) ", "")
                message = message + "\n<ul>\n<li>" + line + "</li>\n</ul>"
            elif "(href) " in line:
                line = line.replace("(href) ", "")
                # Only linkify paths that resolve into one of the known dirs.
                for dir in dirs:
                    dir_unc = Filesystem.networkpath(dir)[2]
                    if dir_unc in line:
                        # Expected format: "text, path, alt" (3 parts).
                        split_href = line.split(", ")
                        if len(split_href) == 3:
                            smb_img_string = img_string + "=\" alt=\"{}\">".format(split_href[-1])
                            message = message + "\n<ul>\n<li><a href=\"file:///{}\">{}</a> {}</li>\n</ul>".format(split_href[1], split_href[0], smb_img_string)
                if logfile in line:
                    # Only link the first log file per report section.
                    if first_dir_log:
                        split_href = line.split(", ")
                        smb_img_string = img_string + "=\" alt=\"{}\">".format(split_href[-1])
                        if len(split_href) == 3:
                            short_path = "log.txt"
                            message = message + "\n<ul>\n<li><a href=\"file:///{}\">{}</a> {}</li>\n</ul>".format(split_href[1], short_path, smb_img_string)
                            first_dir_log = False
            elif line != "":
                # Any non-link line re-arms the "first log" flag.
                first_dir_log = True
                if "mail:" in line:
                    splitline = line.split("mail: ")
                    splitmail = splitline[-1].split(", ")
                    smb_img_string = img_string + "=\" alt=\"{}\">".format(splitmail[-1])
                    message = message + "\n<p><b>{}<a href=\"file:///{}\">Link</a> {}</b></p>".format(splitline[0], splitmail[0], smb_img_string)
                    continue
                elif "[" in line:
                    message = message + "\n" + "<p><b>" + line + "</b></p>"
        return message
예제 #6
0
def update():
    """Pull the latest code for this project from git.

    Returns the git stdout as JSON with HTTP 200 on success, or the git
    stderr as JSON with HTTP 500 on failure.
    """
    # The project root is two levels above this file.
    project_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".."))

    process = Filesystem.run_static(["git", "pull"], cwd=project_dir)

    if process.returncode != 0:
        return jsonify(process.stderr.decode("utf-8")), 500
    return jsonify(process.stdout.decode("utf-8")), 200
예제 #7
0
    def setUp(self):
        """Prepare a clean unittest target directory and override os.unlink.

        NOTE(review): this monkey-patches the *global* os.unlink with
        FilesystemTest.unlink; tearDown is presumably expected to restore
        it from self.original_unlink — confirm.
        """
        print("TEST: setUp (override os.unlink)")
        # <repo>/target/unittest, resolved relative to this test file.
        self.target = os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                         'target', 'unittest'))
        self.dir_in = os.path.join(self.target, "in")
        self.dir_out = os.path.join(self.target, "out")
        self.pipeline = MockPipeline(self.dir_in, self.dir_in, self.dir_out)
        self.filesystem = Filesystem(self.pipeline)

        # Start every test from a clean slate.
        if os.path.exists(self.target):
            shutil.rmtree(self.target)

        # Keep a reference to the real os.unlink so it can be restored later.
        self.original_unlink = os.unlink
        os.unlink = FilesystemTest.unlink
예제 #8
0
def getDirectories(structure):
    """Return the configured directories as JSON.

    `structure` selects the representation: "ranked" returns the ranked
    list, "resolved" returns a mapping of directory id to its SMB network
    path (resolved lazily and cached in Config), and anything else returns
    the flat mapping.
    """
    if structure == "ranked":
        return jsonify(Directory.dirs_ranked)

    if structure == "resolved":
        smb_paths = Config.get("buffered_network_paths", {})
        smb_hosts = Config.get("buffered_network_hosts", {})

        resolved = {}
        for directory in Directory.dirs_flat:
            if isinstance(directory, str) and directory not in smb_paths:
                # Resolve once, then cache both the network path and host.
                smb, file, unc = Filesystem.networkpath(Directory.dirs_flat[directory])
                host = Filesystem.get_host_from_url(smb)
                smb_paths[directory] = smb
                Config.set("buffered_network_paths." + directory, smb)
                smb_hosts[directory] = host
                Config.set("buffered_network_hosts." + directory, host)
            resolved[directory] = smb_paths[directory]

        return jsonify(resolved)

    return jsonify(Directory.dirs_flat)
예제 #9
0
def triggerDirectoryEdition(directory_id, edition_id):
    """Trigger every pipeline watching `directory_id` for `edition_id`.

    Returns (None, 404) when the directory or edition is unknown, otherwise
    a JSON list of the triggered pipeline uids with HTTP 200.
    """
    if directory_id not in Directory.dirs_flat:
        return None, 404
    path = os.path.normpath(Directory.dirs_flat[directory_id])

    # The edition must exist as a book (file stem) in the directory.
    stems = {Path(candidate).stem for candidate in Filesystem.list_book_dir(path)}
    if edition_id not in stems:
        return None, 404

    triggered = []
    for pipeline in Pipeline.pipelines:
        if pipeline.dir_in and os.path.normpath(pipeline.dir_in) == path:
            # Manual trigger: auto=False.
            pipeline.trigger(edition_id, auto=False)
            triggered.append(pipeline.uid)

    return jsonify(triggered), 200
예제 #10
0
    def _trigger_epub_catalog_thread(self):
        """Background loop: periodically push the e-book list to Bibliofil.

        Runs until `self.shouldRun` goes False, barking the watchdog on
        every iteration.
        """
        last_check = 0

        self.watchdog_bark()
        while self.shouldRun:
            time.sleep(5)
            self.watchdog_bark()

            if not self.dirsAvailable():
                continue

            # Check for update every 3 days
            max_update_interval = 60 * 60 * 24 * 3
            if time.time() - last_check < max_update_interval:
                continue

            last_check = time.time()
            logging.info("Updating formatklar and filesize for ebooks")
            list_books = Filesystem.list_book_dir(self.dir_out)
            Bibliofil.update_list_of_books("XHTML", list_books)
예제 #11
0
    def is_available(self):
        """Return True if the directory is present and usable.

        The result is cached for 10 seconds to avoid hammering the
        filesystem. A directory with no `dir_path` configured is always
        considered available. A mounted but empty directory is treated as
        a broken mount and reported unavailable.
        """
        # Serve the cached result when the last check is fresh (< 10 s old).
        if self.last_availability_check_time >= time.time() - 10:
            # Bug fix: this log was gated on `last_availability_check_time`,
            # which is always truthy inside this branch, so the message never
            # fired. Gate on the cached *result* instead.
            if not self.last_availability_check_result:
                logging.debug("Directory is not available (cached result)" + (
                    ": {}".format(self.dir_path) if self.dir_path else ""))
            return self.last_availability_check_result

        self.last_availability_check_time = time.time()

        # No backing directory configured: nothing can be unavailable.
        if self.dir_path is None:
            self.last_availability_check_result = True
            return self.last_availability_check_result

        self.last_availability_check_result = False

        is_mount = Filesystem.ismount(self.dir_path)
        contains_books = False
        if is_mount:
            # Any entry at all counts; an empty mount point usually means
            # the share is not actually mounted.
            for entry in os.scandir(self.dir_path):
                contains_books = True
                break
        mount_is_mounted = not is_mount or contains_books

        self.last_availability_check_result = os.path.isdir(
            self.dir_path) and mount_is_mounted

        if not self.last_availability_check_result:
            logging.warning("Directory is not available: " +
                            str(self.dir_path))
            logging.debug(
                str(self.dir_path) + " is " +
                ("" if os.path.isdir(self.dir_path) else "not ") +
                " a directory.")
            logging.debug(
                str(self.dir_path) + " is " + ("" if is_mount else "not ") +
                " a mounted filesystem.")
            logging.debug(
                str(self.dir_path) + " does " +
                ("" if contains_books else "not ") + " contain books.")

        return self.last_availability_check_result
예제 #12
0
    def _trigger_newsletter_thread(self):
        """Background loop: trigger this month's newsletter if it is missing."""
        last_check = 0
        # If no newsletter this month, trigger newsletter
        self.watchdog_bark()
        while self.shouldRun:
            time.sleep(5)
            self.watchdog_bark()

            if not self.dirsAvailable():
                continue

            # Only look for a missing newsletter once per hour.
            max_update_interval = 60 * 60
            if time.time() - last_check < max_update_interval:
                continue

            last_check = time.time()
            # Newsletter identifier is the fixed prefix "120209" followed by
            # the current month and year, e.g. "120209052024".
            self.newsletter_identifier = "120209"
            self.newsletter_identifier += time.strftime("%m%Y")
            self.year_month = datetime.datetime.today().strftime('%Y-%m')
            # Trigger only when the newsletter is not already in dir_out.
            if self.newsletter_identifier not in Filesystem.list_book_dir(
                    self.dir_out):
                logging.info("Lager nyhetsbrev for: " + self.year_month)
                self.trigger(self.newsletter_identifier)
예제 #13
0
    def on_book(self):
        """Convert the source EPUB for e-book distribution and archive it.

        Copies the EPUB to a temp dir, replaces MathML, runs a series of
        XSLT transformations on the first spine document, injects a
        library-specific logo and stylesheet, ensures a cover image exists
        (falling back to the NLB API), validates with Epubcheck when
        available, and finally stores the result in the HTML archive.

        Returns True on success and a falsy value on failure (see NOTE
        below about the bare `return`).
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # make sure this is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        # ---------- make a copy of the EPUB ----------

        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adjust the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # The main HTML document is the first item in the spine.
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        html_dir = os.path.dirname(opf_path)
        html_file = os.path.join(html_dir, html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False

        # Scratch file reused as the target of every XSLT pass below.
        temp_xml_obj = tempfile.NamedTemporaryFile()
        temp_xml = temp_xml_obj.name

        # Replace MathML with its textual stem representation.
        self.utils.report.info("Erstatter evt. MathML i boka...")
        mathml_validation = Mathml_validator(self, source=html_file)
        if not mathml_validation.success:
            self.utils.report.error(
                "NLBPUB contains MathML errors, aborting...")
            return False

        mathML_result = Mathml_to_text(self,
                                       source=html_file,
                                       target=html_file)

        if not mathML_result.success:
            return False

        self.utils.report.info(
            "Lager skjulte overskrifter der det er nødvendig")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "create-hidden-headlines.xsl"),
                    source=html_file,
                    target=temp_xml,
                    parameters={
                        "cover-headlines": "from-type",
                        "frontmatter-headlines": "from-type",
                        "bodymatter-headlines": "from-text",
                        "backmatter-headlines": "from-type"
                    })
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, html_file)

        self.utils.report.info("Tilpasser innhold for e-bok...")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "prepare-for-ebook.xsl"),
                    source=html_file,
                    target=temp_xml)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, html_file)

        # Use library-specific logo and stylesheet if available

        library = temp_epub.meta("schema:library")
        library = library.upper() if library else library
        logo = os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                            "{}_logo.png".format(library))

        if os.path.isfile(logo):
            shutil.copy(logo, os.path.join(html_dir, os.path.basename(logo)))

        PrepareForEbook.update_css()

        # Statped has its own stylesheet; everyone else gets the default.
        stylesheet = PrepareForEbook.css_tempfile_obj.name
        if library is not None and library.lower() == "statped":
            stylesheet = PrepareForEbook.css_tempfile_statped_obj.name
        shutil.copy(stylesheet, os.path.join(html_dir, "ebok.css"))

        self.utils.report.info("Legger til logoen i OPF-manifestet")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "add-to-opf-manifest.xsl"),
                    source=opf_path,
                    target=temp_xml,
                    parameters={
                        "href": os.path.basename(logo),
                        "media-type": "image/png"
                    })
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, opf_path)

        self.utils.report.info("Legger til CSS-fila i OPF-manifestet")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForEbook.uid,
                                            "add-to-opf-manifest.xsl"),
                    source=opf_path,
                    target=temp_xml,
                    parameters={
                        "href": "ebok.css",
                        "media-type": "text/css"
                    })
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_xml, opf_path)

        # add cover if missing

        # Re-parse the OPF, then try three strategies to locate the cover id.
        opf_xml = ElementTree.parse(opf_path).getroot()
        cover_id = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[contains(concat(' ', @properties, ' '), ' cover-image ')]/@id"
        )  # from properties
        if not cover_id:
            cover_id = opf_xml.xpath(
                "/*/*[local-name()='manifest']/*[@name='cover']/@content"
            )  # from metadata
        if not cover_id:
            cover_id = opf_xml.xpath(
                "/*/*[local-name()='manifest']/*[starts-with(@media-type, 'image/') and contains(@href, 'cover')]/@id"
            )  # from filename
        cover_id = cover_id[0] if cover_id else None

        if not cover_id:
            # cover not found in the book, let's try NLBs API

            # NOTE: identifier at this point is the e-book identifier
            edition_url = "{}/editions/{}?creative-work-metadata=none&edition-metadata=all".format(
                Config.get("nlb_api_url"), epub.identifier())

            response = requests.get(edition_url)
            self.utils.report.debug(
                "looking for cover image in: {}".format(edition_url))
            if response.status_code == 200:
                response_json = response.json()
                if "data" not in response_json:
                    self.utils.report.debug("response as JSON:")
                    self.utils.report.debug(str(response_json))
                    raise Exception(
                        "No 'data' in response: {}".format(edition_url))
                data = response_json["data"]
                cover_url = data["coverUrlLarge"]
                if cover_url is not None and cover_url.startswith("http"):
                    response = requests.get(cover_url)
                    if response.status_code == 200:
                        # Save the downloaded cover next to the OPF.
                        _, extension = os.path.splitext(cover_url)
                        target_href = "cover" + extension
                        target_dir = os.path.dirname(opf_path)
                        with open(os.path.join(target_dir, target_href),
                                  "wb") as target_file:
                            target_file.write(response.content)

                        self.utils.report.info(
                            "Legger til bildet av bokomslaget i OPF-manifestet"
                        )
                        media_type = None
                        if extension.lower() in [
                                ".png"
                        ]:  # check for png, just in case. Should always be jpg though.
                            media_type = "image/png"
                        else:
                            media_type = "image/jpeg"
                        xslt = Xslt(self,
                                    stylesheet=os.path.join(
                                        Xslt.xslt_dir, PrepareForEbook.uid,
                                        "add-to-opf-manifest.xsl"),
                                    source=opf_path,
                                    target=temp_xml,
                                    parameters={
                                        "href": target_href,
                                        "media-type": media_type
                                    })
                        if not xslt.success:
                            self.utils.report.title = self.title + ": " + epub.identifier(
                            ) + " feilet 😭👎" + epubTitle
                            return False
                        shutil.copy(temp_xml, opf_path)

                        opf_xml = ElementTree.parse(opf_path).getroot()
                        cover_id = opf_xml.xpath(
                            "/*/*[local-name()='manifest']/*[@href = '{}']/@id"
                            .format(target_href))  # from filename
                        cover_id = cover_id[0] if cover_id else None

        # A missing cover is only a warning, not a failure.
        if cover_id is None or len(cover_id) == 0:
            self.utils.report.warn(
                "Klarte ikke å finne bilde av bokomslaget for {}".format(
                    epub.identifier()))

        self.utils.report.info("Legger til properties i OPF etter behov")
        temp_epub.update_opf_properties()

        # validate with epubcheck
        if Epubcheck.isavailable():
            epubcheck = Epubcheck(self, opf_path)
            if not epubcheck.success:
                # Keep copies of the OPF and HTML in the report dir for debugging.
                tempfile_stored_opf = os.path.join(
                    self.utils.report.reportDir(), os.path.basename(opf_path))
                shutil.copy(opf_path, tempfile_stored_opf)
                tempfile_stored = os.path.join(self.utils.report.reportDir(),
                                               os.path.basename(html_file))
                shutil.copy(html_file, tempfile_stored)
                self.utils.report.info(
                    f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
                )
                self.utils.report.attachment(None, tempfile_stored, "DEBUG")
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                # NOTE(review): bare `return` (None) here while other failure
                # paths return False — both falsy for callers, but inconsistent.
                return
        else:
            self.utils.report.warn(
                "Epubcheck er ikke tilgjengelig, EPUB blir ikke validert!")

        # ---------- store the file set ----------

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til HTML-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_epubdir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True
    def on_book(self):
        """Validate that the book is a Statped EPUB and archive it.

        Copies the source EPUB to a temp dir, fetches creative-work and
        edition metadata from the NLB API (with retries), rejects books that
        do not belong to the Statped library, and otherwise stores the copy
        in the EPUB master archive and deletes the source.

        Returns True on success, False on failure, and a bare None for the
        not-an-EPUB / no-identifier early exits (all falsy to callers).
        """
        epub = Epub(self.utils.report, self.book["source"])
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # make sure this is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return

        temp_obj = tempfile.TemporaryDirectory()
        temp_dir = temp_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

        self.utils.report.info("Henter metadata fra api.nlb.no")
        creative_work_metadata = None
        edition_metadata = None

        # Retry the metadata lookup up to 5 times.
        timeout = 0
        while creative_work_metadata is None and timeout < 5:

            timeout = timeout + 1
            creative_work_metadata = Metadata.get_creative_work_from_api(
                self.book["name"],
                editions_metadata="all",
                use_cache_if_possible=True,
                creative_work_metadata="all")
            edition_metadata = Metadata.get_edition_from_api(self.book["name"])
            if creative_work_metadata is not None:
                break

        if creative_work_metadata is None:
            self.utils.report.warning(
                "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere."
            )
            return False

        library = edition_metadata["library"].lower()

        # in case of wrong upper lower cases
        if library == "nlb":
            library = "NLB"
        elif library == "statped":
            library = "Statped"
        elif library == "kabb":
            library = "KABB"

        # Only Statped books are accepted by this pipeline.
        if library.lower() != "statped":
            self.utils.report.error("Ikke en Statped bok. Avbryter")
            self.utils.report.should_email = False
            return False


#        Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

        self.utils.report.info("Kopierer til EPUB master-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_dir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " er valid 👍😄" + epubTitle
        self.utils.filesystem.deleteSource()
        return True
예제 #15
0
    def on_book(self):
        """Convert a single-HTML NLBPUB into a multi-file EPUB.

        Works on a temporary copy of the source EPUB: flattens the HTML,
        splits it into several XHTML files, rewrites the OPF spine,
        regenerates the navigation document, optionally validates with
        Epubcheck, and finally stores the result in the e-book archive.

        Returns True on success; returns None (falsy) on any failure,
        after setting a failure title on the report.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # Best-effort title for report headlines; a missing dc:title is not fatal.
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # check that this is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return

        # ---------- make a copy of the EPUB ----------

        # TemporaryDirectory object is kept alive so the directory survives
        # until this method returns.
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adjust the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # Resolve the manifest item referenced by the first spine itemref:
        # the single content document of the NLBPUB.
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return
        html_file = os.path.join(os.path.dirname(opf_path), html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return

        # Scratch file reused as the target of each XSLT pass below.
        temp_xml_obj = tempfile.NamedTemporaryFile()
        temp_xml = temp_xml_obj.name

        # Flatten the NLBPUB (single-document normalization).
        self.utils.report.info("Flater ut NLBPUB")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "nlbpub-flatten.xsl"),
                    source=html_file,
                    target=temp_xml)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return
        shutil.copy(temp_xml, html_file)

        # Split the NLBPUB into several HTML files; the stylesheet writes
        # the split files into the content document's directory.
        self.utils.report.info("Deler opp NLBPUB i flere HTML-filer")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "nlbpub-split.xsl"),
                    source=html_file,
                    target=temp_xml,
                    parameters={"output-dir": os.path.dirname(html_file)})
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return
        # The original single content document is replaced by the split files.
        os.remove(html_file)

        # Collect the new spine in sorted filename order, excluding the nav
        # document and the (now deleted) original content document.
        spine_hrefs = []
        for href in sorted(os.listdir(os.path.dirname(html_file))):
            if href.endswith(".xhtml") and href not in [
                    "nav.xhtml", os.path.basename(html_file)
            ]:
                spine_hrefs.append(href)

        # Rewrite the OPF so manifest/spine reference the split files.
        self.utils.report.info("Oppdaterer OPF-fil")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "update-opf.xsl"),
                    source=opf_path,
                    target=temp_xml,
                    parameters={"spine-hrefs": ",".join(spine_hrefs)})
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return
        shutil.copy(temp_xml, opf_path)

        # NOTE(review): assumes temp_epub.nav_path() is non-None here;
        # os.path.join would raise otherwise — TODO confirm.
        nav_path = os.path.join(temp_epubdir, temp_epub.nav_path())

        # Regenerate the navigation document from the updated OPF.
        self.utils.report.info("Lager nytt navigasjonsdokument")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NlbpubToEpub.uid,
                                            "generate-nav.xsl"),
                    source=opf_path,
                    target=nav_path)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " feilet 😭👎" + epubTitle
            return

        self.utils.report.info("Legger til properties i OPF etter behov")
        temp_epub.update_opf_properties()

        # Validation is best-effort: if Epubcheck is unavailable we only warn.
        if Epubcheck.isavailable():
            epubcheck = Epubcheck(self, opf_path)
            if not epubcheck.success:
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                return
        else:
            self.utils.report.warn(
                "Epubcheck not available, EPUB will not be validated!")

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til e-bok-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_epubdir, temp_epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        # Notify the library system that this publication format is available.
        Bibliofil.book_available(NlbpubToEpub.publication_format,
                                 temp_epub.identifier())
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True
    def on_book(self):
        """Convert a single-spine NLBPUB into a validated DTBook (for TTS).

        Copies the book, strips EPUB-specific files, replaces MathML,
        runs a chain of XSLT passes (webarch fixup, audiobook agreement,
        optional page-number removal for magazines, HTML→DTBook,
        DTBook cleanup, complex-content optimization), then validates the
        DTBook with RelaxNG and Schematron before storing it in the
        DTBook archive.

        Returns True on success, False on any failure.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")

        self.utils.report.info("Locating HTML file")
        epub = Epub(self.utils.report, self.book["source"])
        if not epub.isepub():
            return False
        # NOTE(review): this assert is unreachable when isepub() is False
        # (the guard above already returned) — redundant; kept as-is.
        assert epub.isepub(), "The input must be an EPUB"
        spine = epub.spine()
        if not len(spine) == 1:
            self.utils.report.warn(
                "There must only be one item in the EPUB spine")
            return False
        html_file = os.path.join(self.book["source"],
                                 os.path.dirname(epub.opf_path()),
                                 spine[0]["href"])

        identifier = epub.identifier()

        # make a copy of the book
        self.utils.report.info("lag en kopi av boka")
        temp_resultdir_obj = tempfile.TemporaryDirectory()
        temp_resultdir = temp_resultdir_obj.name
        Filesystem.copy(self.utils.report, os.path.dirname(html_file),
                        temp_resultdir)
        temp_result = os.path.join(temp_resultdir, identifier + ".xml")

        # delete EPUB-specific files; only the content document (copied
        # below as <identifier>.xml) and auxiliary resources remain
        self.utils.report.info("sletter EPUB-spesifikke filer")
        for root, dirs, files in os.walk(temp_resultdir):
            for file in files:
                if Path(file).suffix.lower() in [
                        ".xhtml", ".html", ".smil", ".mp3", ".wav", ".opf"
                ]:
                    os.remove(os.path.join(root, file))
        shutil.copy(html_file, temp_result)

        # Scratch file reused as the target of each XSLT pass below.
        temp_xslt_output_obj = tempfile.NamedTemporaryFile()
        temp_xslt_output = temp_xslt_output_obj.name

        # MATHML to stem
        self.utils.report.info("Erstatter evt. MathML i boka...")
        mathml_validation = Mathml_validator(self, source=temp_result)
        if not mathml_validation.success:
            return False

        # In-place MathML-to-text conversion (source == target).
        mathML_result = Mathml_to_text(self,
                                       source=temp_result,
                                       target=temp_result)

        if not mathML_result.success:
            return False

        self.utils.report.info("Fikser Webarch-oppmerking")
        self.utils.report.debug("webarch-fixup.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToNarrationEpub.uid,
                                            "webarch-fixup.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        # Insert the audiobook agreement text.
        self.utils.report.info("Setter inn lydbokavtalen...")
        self.utils.report.debug("bokinfo-tts-dtbook.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToTtsDtbook.uid,
                                            "bokinfo-tts-dtbook.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        creative_work_metadata = None
        timeout = 0

        # Retry the metadata API up to 5 times; break as soon as we get
        # an answer (page numbers are removed only for magazines).
        while creative_work_metadata is None and timeout < 5:

            timeout = timeout + 1
            creative_work_metadata = Metadata.get_creative_work_from_api(
                identifier,
                editions_metadata="all",
                use_cache_if_possible=True,
                creative_work_metadata="all")
            if creative_work_metadata is not None:
                if creative_work_metadata["magazine"] is True:
                    self.utils.report.info(
                        "Fjerner sidetall fordi det er et tidsskrift...")
                    self.utils.report.debug("remove-pagenum.xsl")
                    self.utils.report.debug("    source = " + temp_result)
                    self.utils.report.debug("    target = " + temp_xslt_output)
                    xslt = Xslt(self,
                                stylesheet=os.path.join(
                                    Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                                    "remove-pagenum.xsl"),
                                source=temp_result,
                                target=temp_xslt_output)
                    if not xslt.success:
                        return False
                    shutil.copy(temp_xslt_output, temp_result)
                break

        # Missing creative-work metadata is non-fatal; conversion continues.
        if creative_work_metadata is None:
            # NOTE(review): other call sites use report.warn(...) — confirm
            # that warning() exists on the report object.
            self.utils.report.warning(
                "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Konverterer likevel."
            )

        # Copy the library's logo into the book, if we have one for it.
        library = epub.meta("schema:library")
        library = library.upper() if library else library
        logo = os.path.join(Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                            "{}_logo.png".format(library))

        if os.path.isfile(logo):
            # epub_dir = os.path.join(temp_resultdir, "EPUB")
            image_dir = os.path.join(temp_resultdir, "images")
            if not os.path.isdir(image_dir):
                os.mkdir(image_dir)
            shutil.copy(logo, image_dir)

        self.utils.report.info("Konverterer fra XHTML5 til DTBook...")
        self.utils.report.debug("html-to-dtbook.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToTtsDtbook.uid,
                                            "html-to-dtbook.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        self.utils.report.info("Gjør tilpasninger i DTBook")
        self.utils.report.debug("dtbook-cleanup.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir,
                                            NlbpubToTtsDtbook.uid,
                                            "dtbook-cleanup.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        # Remove this transformation if critical problems arise with the
        # handling of complex content.
        self.utils.report.info(
            "Legger inn ekstra informasjon om komplekst innhold")
        self.utils.report.debug("optimaliser-komplekst-innhold.xsl")
        self.utils.report.debug("    source = " + temp_result)
        self.utils.report.debug("    target = " + temp_xslt_output)
        xslt = Xslt(self,
                    stylesheet=os.path.join(
                        Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                        "optimaliser-komplekst-innhold.xsl"),
                    source=temp_result,
                    target=temp_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_xslt_output, temp_result)

        self.utils.report.info("Validerer DTBook...")
        # NOTE: This RelaxNG schema assumes that we're using DTBook 2005-3 and MathML 3.0
        dtbook_relax = Relaxng(
            self,
            relaxng=os.path.join(
                Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                "dtbook-schema/rng/dtbook-2005-3.mathml-3.integration.rng"),
            source=temp_result)
        dtbook_sch = Schematron(self,
                                schematron=os.path.join(
                                    Xslt.xslt_dir, NlbpubToTtsDtbook.uid,
                                    "dtbook-schema/sch/dtbook.mathml.sch"),
                                source=temp_result)
        if not dtbook_relax.success:
            self.utils.report.error("Validering av DTBook feilet (RelaxNG)")
        if not dtbook_sch.success:
            self.utils.report.error("Validering av DTBook feilet (Schematron)")
        if not dtbook_relax.success or not dtbook_sch.success:
            # Keep the failing DTBook in the report directory for debugging.
            tempfile_stored = os.path.join(self.utils.report.reportDir(),
                                           os.path.basename(temp_result))
            shutil.copy(temp_result, tempfile_stored)
            self.utils.report.info(
                f"Validering av DTBook feilet, lagrer temp fil for feilsøking: {tempfile_stored}"
            )
            self.utils.report.attachment(None, tempfile_stored, "DEBUG")
            return False

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til DTBook-arkiv.")
        archived_path, stored = self.utils.filesystem.storeBook(
            temp_resultdir, identifier)
        self.utils.report.attachment(None, archived_path, "DEBUG")
        return True
예제 #17
0
    def test_copy_locked_files(self):
        """Filesystem.copy handling of Windows-locked files.

        "Thumbs.db" must be skipped silently, while another locked file
        ("locked") is copied on the first pass but produces an [ERROR]
        message when the copy is repeated. Re-copying a book must be
        idempotent and must not emit errors or warnings otherwise.
        """

        def make_book(name, image_names):
            # Build a minimal book: an ncc.html plus an images/ directory.
            book_dir = os.path.join(self.dir_in, name)
            os.makedirs(os.path.join(book_dir, "images"))
            Path(os.path.join(book_dir, "ncc.html")).touch()
            for image_name in image_names:
                Path(os.path.join(book_dir, "images", image_name)).touch()
            return book_dir

        def messages_with_prefix(prefix):
            # Pipeline messages matching a given severity prefix.
            return [m for m in self.pipeline.messages if m.startswith(prefix)]

        def copy_and_check_listing(source, target, expected_images):
            # Copy the book and verify the resulting directory layout.
            Filesystem.copy(self.pipeline.utils.report, source, target)
            self.assertEqual(sorted(os.listdir(target)),
                             ["images", "ncc.html"])
            self.assertEqual(
                sorted(os.listdir(os.path.join(target, "images"))),
                expected_images)

        print("creating book without any locked files")
        book1 = make_book("book1", ["Image.png", "zmage.png"])

        print("creating book with a \"Thumbs.db\" file locked by Windows")
        book2 = make_book("book2", ["Image.png", "Thumbs.db", "zmage.png"])

        print("creating book with a \"locked\" file locked by Windows")
        book3 = make_book("book3", ["Image.png", "locked", "zmage.png"])

        target_book1 = os.path.join(self.dir_out, "book1")
        target_book2 = os.path.join(self.dir_out, "book2")
        target_book3 = os.path.join(self.dir_out, "book3")

        print("copy book1 to target_book1")
        copy_and_check_listing(book1, target_book1,
                               ["Image.png", "zmage.png"])
        self.assertTrue(len(messages_with_prefix("[ERROR]")) == 0)
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)

        print("copy book1 to target_book1 once more")
        copy_and_check_listing(book1, target_book1,
                               ["Image.png", "zmage.png"])
        self.assertTrue(len(messages_with_prefix("[ERROR]")) == 0)
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)

        print("copy book2 to target_book2")
        copy_and_check_listing(book2, target_book2,
                               ["Image.png", "zmage.png"])
        self.assertTrue(len(messages_with_prefix("[ERROR]")) == 0)
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)

        print("copy book2 to target_book2 once more")
        copy_and_check_listing(book2, target_book2,
                               ["Image.png", "zmage.png"])
        self.assertTrue(len(messages_with_prefix("[ERROR]")) == 0)
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)

        print("copy book3 to target_book3")
        copy_and_check_listing(book3, target_book3,
                               ["Image.png", "locked", "zmage.png"])
        self.assertTrue(len(messages_with_prefix("[ERROR]")) == 0)
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)

        print("copy book3 to target_book3 once more")
        copy_and_check_listing(book3, target_book3,
                               ["Image.png", "locked", "zmage.png"])
        self.assertTrue(len(messages_with_prefix("[WARN]")) == 0)
        # The second copy of the "locked" file must be reported as an error.
        self.assertTrue(
            len([m for m in messages_with_prefix("[ERROR]")
                 if "/locked" in m]) >= 1)
예제 #18
0
    def on_book(self):
        """Archive a versioned snapshot of the incoming EPUB.

        Compares the incoming copy against the newest stored version of each
        file under dir_out/<identifier>/<timestamp>/..., stores only new or
        changed files in a new timestamped subdirectory, and maintains three
        bookkeeping files: files.yml (newest location of each file),
        deleted.yml (files removed, with deletion time) and changelog.txt
        (human-readable history).

        Returns True on success; returns None (falsy) on failure.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # Best-effort title for report headlines; a missing dc:title is not fatal.
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # check that this is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        if not epub.identifier() or not epub.identifier().isnumeric():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        self.utils.report.info("Lager en kopi av EPUBen")
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)

        if not os.path.exists(os.path.join(self.dir_out, epub.identifier())):
            os.makedirs(os.path.join(self.dir_out, epub.identifier()))

        # ISO-like timestamp also used as the archive subdirectory name;
        # its lexicographic order equals chronological order.
        time_created = time.strftime("%Y-%m-%dT%H:%M:%S")
        # Maps each file's archive-relative path (timestamp stripped) to the
        # newest stored location of that file.
        dictfiles = {}
        changelog = "changelog.txt"
        deleted = "deleted.yml"
        files = "files.yml"
        # Bookkeeping files excluded from the change comparison.
        extra_files = [changelog, deleted, files, "restore_files.py"]
        changes_made = False
        new_epub = False

        # Overview of deleted files and changelog history
        deleted_path = os.path.join(self.dir_out, epub.identifier(), deleted)
        changelog_path = os.path.join(self.dir_out, epub.identifier(),
                                      changelog)

        deleted_doc = {}
        if os.path.isfile(deleted_path):
            with open(deleted_path, 'r') as f:
                deleted_doc = yaml.load(f, Loader=yaml.FullLoader) or {}

        # Dictfiles contains the most recent version of each file, saved to files.yml
        for (path, subdir_list,
             file_list) in walk(os.path.join(self.dir_out, epub.identifier())):
            for file_name in file_list:
                if file_name in extra_files:
                    continue
                file_path = os.path.join(path, file_name)
                relative_path = file_path.replace(
                    os.path.join(self.dir_out, epub.identifier()), "")
                relative_path = relative_path.strip("/")
                # NOTE(review): presumably strips the leading timestamp
                # component from the path — confirm against short_path_by_one.
                short_path = self.short_path_by_one(relative_path)
                new_dict = {short_path: relative_path}

                if short_path not in dictfiles:
                    dictfiles.update(new_dict)
                # String comparison: newer timestamped paths sort higher.
                elif dictfiles[short_path] < relative_path:
                    dictfiles.update(new_dict)

        new_file_list = []
        changelog_string = ""
        file_added_again = False
        # Changelog.txt contains the history of changes to this nlbpub with timestamps
        for temp_path, temp_subdir_list, temp_file_list in walk(temp_epubdir):
            for temp_file in temp_file_list:
                full_temp_file_path = os.path.join(temp_path, temp_file)
                # Rebind temp_file to the path relative to the EPUB root.
                temp_file = full_temp_file_path.replace(temp_epubdir, "")
                temp_file = temp_file.strip("/")
                new_file_list.append(temp_file)

                # Unchanged file: drop it from the snapshot (only changed
                # files are stored in the new timestamped directory).
                if temp_file in dictfiles and filecmp.cmp(
                        full_temp_file_path,
                        os.path.join(self.dir_out, epub.identifier(),
                                     dictfiles[temp_file])):
                    os.remove(full_temp_file_path)

                # Changed file: keep it and point dictfiles at the new copy.
                elif temp_file in dictfiles and not filecmp.cmp(
                        full_temp_file_path,
                        os.path.join(self.dir_out, epub.identifier(),
                                     dictfiles[temp_file])):
                    changes_made = True
                    new_location = {
                        temp_file: os.path.join(time_created, temp_file)
                    }
                    dictfiles.update(new_location)
                    self.utils.report.info("Fil endret: " + temp_file)
                    changelog_string += ("\n{}:     Fil endret: {}".format(
                        time_created, temp_file))

                # New file (or entirely new EPUB, which is not logged per file).
                elif temp_file not in dictfiles:
                    if dictfiles == {}:
                        new_epub = True
                    changes_made = True
                    new_file = {
                        temp_file: os.path.join(time_created, temp_file)
                    }
                    dictfiles.update(new_file)
                    if not new_epub:
                        self.utils.report.info("Fil lagt til: " + temp_file)
                        changelog_string += (
                            "\n{}:     Fil lagt til: {}".format(
                                time_created, temp_file))

                # A previously deleted file has reappeared.
                if temp_file in deleted_doc:
                    changes_made = True
                    file_added_again = True
                    deleted_doc.pop(temp_file, None)
                    self.utils.report.info("Fil lagt til på nytt: " +
                                           temp_file)
                    changelog_string += (
                        "\n{}:     Fil lagt til på nytt: {}".format(
                            time_created, temp_file))

        # Prune directories emptied by the os.remove() calls above.
        # NOTE(review): "dir" shadows the builtin of the same name.
        dirs = next(walk(temp_epubdir))[1]
        for dir in dirs:
            self.del_empty_dirs(temp_epubdir, dir)

        # Rewrite deleted.yml from scratch when a deleted file came back.
        if file_added_again:
            with open(deleted_path, 'w') as deleted_file:
                for key in deleted_doc:
                    deleted_file.write("\n'{}': '{}'".format(
                        key.replace("'", "''"),
                        time_created.replace("'", "''")))

        # Deleted file history saved to deleted files.yml
        with open(deleted_path,
                  self.append_write(deleted_path)) as deleted_file:
            for key in dictfiles:
                if key not in new_file_list and key not in deleted_doc and key not in extra_files:
                    changes_made = True
                    self.utils.report.info("Fil slettet: " + key)
                    changelog_string += ("\n{}:     Fil slettet: {}".format(
                        time_created, key))
                    deleted_file.write("\n'{}': '{}'".format(
                        key.replace("'", "''"),
                        time_created.replace("'", "''")))

        # Changelog saved to changelog.txt
        with open(changelog_path,
                  self.append_write(changelog_path)) as changelog_file:
            changelog_file.write(changelog_string)

        # Reload deleted.yml so dictfiles can be purged of deleted entries.
        deleted_doc = {}
        if os.path.isfile(deleted_path):
            with open(deleted_path, 'r') as f:
                deleted_doc = yaml.load(f, Loader=yaml.FullLoader) or {}

        for del_file in deleted_doc:
            try:
                del dictfiles[del_file]
            except Exception:
                self.utils.report.debug(traceback.format_exc(),
                                        preformatted=True)

        # files.yml lists the newest stored location of every live file.
        with open(os.path.join(temp_epubdir, files), 'w') as files_doc:
            for file in dictfiles:
                files_doc.write("\n'{}': '{}'".format(
                    file.replace("'", "''"),
                    dictfiles[file].replace("'", "''")))

        # Save copy of different files in NLBPUB master. Different versions of files under NLBPUB-tidligere/xxxxxxx/time
        # To restore a certain version copy files from the each folder up to the wanted version to a new folder

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_epubdir, epub.identifier(), subdir=time_created)
        self.utils.report.attachment(None, archived_path, "DEBUG")
        if changes_made:
            if new_epub:
                self.utils.report.info(
                    "Endringer oppdaget for: " + epub.identifier() +
                    ", ny epub ble kopiert til NLBpub tidligere versjoner.")
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " 👍😄" + epubTitle + " , ny epub ble kopiert"
            else:
                self.utils.report.info(
                    "Endringer oppdaget for: " + epub.identifier() +
                    ", endrede filer ble kopiert til NLBpub tidligere versjoner."
                )
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " 👍😄" + epubTitle + " , endring registrert"
        else:
            self.utils.report.info("Ingen endringer oppdaget for " +
                                   epub.identifier())
            self.utils.report.title = self.title + ": " + epub.identifier(
            ) + " 👍😄" + epubTitle + " ,  ingen endring registrert"
            # Don't spam email when nothing changed.
            self.utils.report.should_email = False
        return True
예제 #19
0
    def on_book(self):
        epub = Epub(self.utils.report, self.book["source"])
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass
        # sjekk at dette er en EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return

        self.utils.report.info("Lager en kopi av EPUBen med tomme bildefiler")
        temp_noimages_epubdir_obj = tempfile.TemporaryDirectory()
        temp_noimages_epubdir = temp_noimages_epubdir_obj.name
        Filesystem.copy(self.utils.report, epub.asDir(), temp_noimages_epubdir)
        if os.path.isdir(os.path.join(temp_noimages_epubdir, "EPUB",
                                      "images")):
            temp_xml_obj = tempfile.NamedTemporaryFile()
            temp_xml = temp_xml_obj.name
            opf_image_references = []
            html_image_references = {}
            for root, dirs, files in os.walk(
                    os.path.join(temp_noimages_epubdir, "EPUB")):
                for file in files:
                    if file.endswith(".opf"):
                        opf_file = os.path.join(root, file)
                        self.utils.report.info(
                            "Fjerner alle bildereferanser fra OPFen, og erstatter med en referanse til dummy.jpg..."
                        )
                        opf_xml_document = ElementTree.parse(opf_file)
                        opf_xml = opf_xml_document.getroot()
                        image_items = opf_xml.xpath(
                            "//*[local-name()='item' and starts-with(@media-type, 'image/')]"
                        )
                        replaced = False
                        for image_item in image_items:
                            if image_item.attrib[
                                    "href"] not in opf_image_references:
                                opf_image_references.append(
                                    image_item.attrib["href"])

                            if image_item.get("href") == "images/cover.jpg":
                                pass  # don't change the reference to cover.jpg

                            elif not replaced:
                                image_item.attrib["href"] = "images/dummy.jpg"
                                replaced = True

                            else:
                                image_item.getparent().remove(image_item)

                        opf_xml_document.write(opf_file,
                                               method='XML',
                                               xml_declaration=True,
                                               encoding='UTF-8',
                                               pretty_print=False)

                    if file.endswith(".xhtml"):
                        html_file = os.path.join(root, file)

                        html_xml_document = ElementTree.parse(html_file)
                        html_xml = html_xml_document.getroot()
                        image_references = html_xml.xpath(
                            "//@href | //@src | //@altimg")
                        for reference in image_references:
                            path = reference.split("#")[0]
                            if path.startswith("images/"):
                                if path not in html_image_references:
                                    html_image_references[path] = []
                                html_image_references[path].append(file)

                        self.utils.report.info(
                            "Erstatter alle bildereferanser med images/dummy.jpg..."
                        )
                        self.utils.report.debug("dummy-jpg.xsl")
                        self.utils.report.debug("    source = " + html_file)
                        self.utils.report.debug("    target = " + temp_xml)
                        xslt = Xslt(self,
                                    stylesheet=os.path.join(
                                        Xslt.xslt_dir, IncomingNordic.uid,
                                        "dummy-jpg.xsl"),
                                    source=html_file,
                                    target=temp_xml)
                        if not xslt.success:
                            self.utils.report.title = self.title + ": " + epub.identifier(
                            ) + " feilet 😭👎" + epubTitle
                            return False
                        shutil.copy(temp_xml, html_file)

            # validate for the presence of image files here, since epubcheck won't be able to do it anymore after we change the EPUB
            image_files_present = []
            for root, dirs, files in os.walk(
                    os.path.join(temp_noimages_epubdir, "EPUB", "images")):
                for file in files:
                    fullpath = os.path.join(root, file)
                    relpath = os.path.relpath(
                        fullpath, os.path.join(temp_noimages_epubdir, "EPUB"))
                    image_files_present.append(relpath)
            image_error = False
            for file in image_files_present:
                if file not in opf_image_references:
                    self.utils.report.error(
                        "Bildefilen er ikke deklarert i OPFen: " + file)
                    image_error = True
            for file in opf_image_references:
                if file not in image_files_present:
                    self.utils.report.error(
                        "Bildefilen er deklarert i OPFen, men finnes ikke: " +
                        file)
                    image_error = True
            for file in html_image_references:
                if file not in opf_image_references:
                    self.utils.report.error(
                        "Bildefilen er deklarert i HTMLen, men finnes ikke: " +
                        file + " (deklarert i: " +
                        ", ".join(html_image_references[file]) + ")")
                    image_error = True
            if image_error:
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                return False

            for root, dirs, files in os.walk(
                    os.path.join(temp_noimages_epubdir, "EPUB", "images")):
                for file in files:
                    if file == "cover.jpg":
                        continue  # don't delete the cover file
                    fullpath = os.path.join(root, file)
                    os.remove(fullpath)
            shutil.copy(
                os.path.join(Xslt.xslt_dir, IncomingNordic.uid,
                             "reference-files", "demobilde.jpg"),
                os.path.join(temp_noimages_epubdir, "EPUB", "images",
                             "dummy.jpg"))

        temp_noimages_epub = Epub(self.utils.report, temp_noimages_epubdir)

        self.utils.report.info(
            "Validerer EPUB med epubcheck og nordiske retningslinjer...")
        epub_noimages_file = temp_noimages_epub.asFile()
        with DaisyPipelineJob(self,
                              "nordic-epub3-validate",
                              {"epub": os.path.basename(epub_noimages_file)},
                              priority="high",
                              pipeline_and_script_version=[
                                  ("1.13.6", "1.4.6"),
                                  ("1.13.4", "1.4.5"),
                                  ("1.12.1", "1.4.2"),
                                  ("1.11.1-SNAPSHOT", "1.3.0"),
                              ],
                              context={
                                  os.path.basename(epub_noimages_file):
                                  epub_noimages_file
                              }) as dp2_job:

            # get validation report
            report_file = os.path.join(dp2_job.dir_output,
                                       "html-report/report.xhtml")
            if os.path.isfile(report_file):
                with open(report_file, 'r') as result_report:
                    self.utils.report.attachment(
                        result_report.readlines(),
                        os.path.join(self.utils.report.reportDir(),
                                     "report.html"),
                        "SUCCESS" if dp2_job.status == "SUCCESS" else "ERROR")

            if dp2_job.status != "SUCCESS":
                self.utils.report.error("Klarte ikke å validere boken")
                self.utils.report.title = self.title + ": " + epub.identifier(
                ) + " feilet 😭👎" + epubTitle
                return

        self.utils.report.debug("Making a copy of the EPUB to work on…")
        epub_fixed, epub_fixed_obj = epub.copy()
        epub_unzipped = epub_fixed.asDir()
        nav_path = os.path.join(epub_unzipped, epub_fixed.nav_path())
        mathML_validation_result = True
        mathml_error_count = 0
        mathml_errors_not_shown = 0
        mathml_report_errors_max = 10
        for root, dirs, files in os.walk(epub_unzipped):
            for f in files:
                file = os.path.join(root, f)
                if not file.endswith(".xhtml") or file is nav_path:
                    continue
                self.utils.report.info("Checking MathML in " + file)
                mathml_validation = Mathml_validator(
                    self,
                    source=file,
                    report_errors_max=mathml_report_errors_max)
                if not mathml_validation.success:
                    mathml_error_count += mathml_validation.error_count
                    mathml_errors_not_shown += max(
                        (mathml_validation.error_count -
                         mathml_report_errors_max), 0)
                    if mathml_error_count > mathml_report_errors_max:
                        mathml_report_errors_max = 0  # don't put any more errors for the other HTML documents in the main report
                    mathML_validation_result = False
        if mathml_errors_not_shown > 0:
            self.utils.report.error(
                "{} additional MathML errors not shown in the main report. Check the log for details."
                .format(mathml_errors_not_shown))
        if mathML_validation_result is False:
            return False

        self.utils.report.debug(
            "Making sure that the EPUB has the correct file and directory permissions…"
        )
        epub_fixed.fix_permissions()

        try:
            self.utils.report.info("Genererer ACE-rapport...")
            ace_dir = os.path.join(self.utils.report.reportDir(),
                                   "accessibility-report")
            process = self.utils.filesystem.run(
                [IncomingNordic.ace_cli, "-o", ace_dir,
                 epub_fixed.asFile()])
            if process.returncode == 0:
                self.utils.report.info("ACE-rapporten ble generert.")
            else:
                self.utils.report.warn(
                    "En feil oppstod ved produksjon av ACE-rapporten for " +
                    epub.identifier())
                self.utils.report.debug(traceback.format_stack())

            # attach report
            ace_status = None
            with open(os.path.join(ace_dir, "report.json")) as json_report:
                ace_status = json.load(
                    json_report)["earl:result"]["earl:outcome"]
            if ace_status == "pass":
                ace_status = "SUCCESS"
            else:
                ace_status = "WARN"
            self.utils.report.attachment(None,
                                         os.path.join(ace_dir, "report.html"),
                                         ace_status)

        except subprocess.TimeoutExpired:
            self.utils.report.warn(
                "Det tok for lang tid å lage ACE-rapporten for " +
                epub.identifier() + ", og prosessen ble derfor stoppet.")

        except Exception:
            self.utils.report.warn(
                "En feil oppstod ved produksjon av ACE-rapporten for " +
                epub.identifier())
            self.utils.report.debug(traceback.format_exc(), preformatted=True)

        self.utils.report.info(
            "Boken er valid. Kopierer til EPUB master-arkiv.")

        archived_path, stored = self.utils.filesystem.storeBook(
            epub_fixed.asDir(), epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " er valid 👍😄" + epubTitle
        self.utils.filesystem.deleteSource()
        return True
예제 #20
0
    def on_book(self):
        """Validate an incoming NLBPUB EPUB against the nordic validation rules.

        Copies the source EPUB into a temporary directory, checks that the three
        mandatory NLBPUB files (content HTML, navigation document, package OPF)
        exist, then validates with three schematrons, one RELAX NG schema, and a
        "needs manual intervention" warning schematron.

        Returns:
            True when the book is valid (and, depending on pipeline variant,
            archived); False when validation fails; None when the input is not a
            usable EPUB or has no identifier.
        """
        epub = Epub(self.utils.report, self.book["source"])

        # Book title used to decorate report titles; best-effort only.
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # Check that this actually is an EPUB.
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return

        if not epub.identifier():
            self.utils.report.error(self.book["name"] + ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return

        self.utils.report.should_email = self.should_email_default
        self.utils.report.should_message_slack = self.should_message_slack

        # Work on a temporary copy so the source book is left untouched.
        self.utils.report.info("Lager kopi av EPUB...")
        nordic_epubdir_obj = tempfile.TemporaryDirectory()
        nordic_epubdir = nordic_epubdir_obj.name
        # NOTE: was `Filesystem.copy(self.pipeline.utils.report, ...)`; every
        # other call in this method reports through self.utils.report, so use
        # the same report object here.
        Filesystem.copy(self.utils.report, epub.asDir(), nordic_epubdir)
        nordic_epub = Epub(self.utils.report, nordic_epubdir)

        # The three files that make up a valid NLBPUB.
        html_file = os.path.join(nordic_epubdir, "EPUB", nordic_epub.identifier() + ".xhtml")
        nav_file = os.path.join(nordic_epubdir, "EPUB", "nav.xhtml")
        package_file = os.path.join(nordic_epubdir, "EPUB", "package.opf")
        nlbpub_files = [html_file, nav_file, package_file]

        # Report (but do not abort on) missing mandatory files; the validators
        # below will fail on them and report the details.
        for file in nlbpub_files:
            if not os.path.isfile(file):
                self.utils.report.error(file + " Not found. This is not a valid NLBPUB")

        self.utils.report.info("Validerer NLBPUB")
        schematron_files = ["nordic2015-1.sch", "nordic2015-1.nav-references.sch", "nordic2015-1.opf.sch"]
        rng_files = "nordic-html5.rng"
        html_sch = Schematron(self, schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", schematron_files[0]), source=html_file)
        nav_sch = Schematron(self, schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", schematron_files[1]), source=nav_file)
        opf_sch = Schematron(self, schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", schematron_files[2]), source=package_file)
        # Extra schematron that only flags books needing manual follow-up; it
        # does not make the book invalid.
        warning_sch = Schematron(self,
                                 schematron=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", "nlbpub-check-need-for-manual-intervention.sch"),
                                 source=html_file)
        schematron_list = [html_sch, nav_sch, opf_sch]
        html_relax = Relaxng(self, relaxng=os.path.join(Xslt.xslt_dir, "incoming-NLBPUB", rng_files), source=html_file)

        for schematron, schematron_file in zip(schematron_list, schematron_files):
            if not schematron.success:
                self.utils.report.error("Validering av NLBPUB feilet etter schematron: " + schematron_file)
                return False
        if not html_relax.success:
            self.utils.report.error("Validering av NLBPUB feilet etter RELAXNG: " + rng_files)
            return False

        self.utils.report.info("Boken er valid.")

        if not self.skip_warning:
            # The same book is seen by two pipeline variants; which one archives
            # and notifies depends on whether the warning schematron fired:
            #  - warnings found:  only "NLBPUB-incoming-warning" archives/notifies
            #  - no warnings:     only "NLBPUB-incoming-validator" archives
            if warning_sch.success is False:
                if self.uid == "NLBPUB-incoming-warning":
                    archived_path, stored = self.utils.filesystem.storeBook(nordic_epubdir, epub.identifier())
                    self.utils.report.attachment(None, archived_path, "DEBUG")
                    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid, men må sjekkes manuelt 👍😄" + epubTitle
                    self.utils.report.should_email = True
                    self.utils.report.should_message_slack = True
                    return True
                else:
                    # Suppress notifications; the warning pipeline owns this case.
                    self.utils.report.should_email = False
                    self.utils.report.should_message_slack = False
                    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid, men må sjekkes manuelt 👍😄" + epubTitle
                    return True
            else:
                if self.uid == "NLBPUB-incoming-validator":
                    archived_path, stored = self.utils.filesystem.storeBook(nordic_epubdir, epub.identifier())
                    self.utils.report.attachment(None, archived_path, "DEBUG")
                    self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
                    self.utils.filesystem.deleteSource()
                    return True
                else:
                    self.utils.report.info(epub.identifier() + " er valid og har ingen advarsler.")
                    return True

        # skip_warning: archive unconditionally once validation has passed.
        archived_path, stored = self.utils.filesystem.storeBook(nordic_epubdir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier() + " er valid 👍😄" + epubTitle
        return True
예제 #21
0
    def on_book(self):
        """Extract audio excerpts from a DAISY 2.02 audiobook.

        Works on a temporary copy of the book and tries to produce up to three
        MP3 files, archiving each one that could be created:

          - ``back-cover``: narration of the back-cover text, located via a
            heading named "Bokomtale", "Baksidetekst" or "Omslagstekst"
          - ``abstracts``: an excerpt taken from around the middle of the book
            (target length roughly 20-75 seconds), with a fade-out
          - ``test-audio``: a copy of the back cover if available, otherwise of
            the abstract

        Returns True when at least one excerpt was produced and archived,
        otherwise False.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")

        # Work on a temporary copy so the source book is left untouched.
        temp_absdir_obj = tempfile.TemporaryDirectory()
        temp_absdir = temp_absdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_absdir)

        # Tracks which of the excerpt MP3s were successfully produced; keys
        # mirror self.parentdirs.
        file_exists = {
            "abstracts": False,
            "back-cover": False,
            "test-audio": False
        }

        # A DAISY 2.02 book must contain an ncc.html navigation file.
        if not os.path.isfile(os.path.join(temp_absdir, "ncc.html")):
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎. Er dette en daisy 2.02 lydbok med en ncc.html fil?"
            return False
        try:
            nccdoc = ElementTree.parse(os.path.join(temp_absdir,
                                                    "ncc.html")).getroot()

        except Exception:
            # NOTE(review): if parsing fails, nccdoc stays unbound and the
            # xpath calls below raise NameError — this probably should
            # `return False` here; confirm intended behavior.
            self.utils.report.info(
                "Klarte ikke lese ncc fila. Sjekk loggen for detaljer.")
            self.utils.report.debug(traceback.format_exc(), preformatted=True)

        # Title and identifiers from the ncc metadata. The edition identifier
        # is taken as the first six characters of dc:identifier — presumably a
        # cataloguing convention; TODO confirm.
        edition_identifier = ""
        audio_title = ""
        audio_title = " (" + nccdoc.xpath(
            "string(//*[@name='dc:title']/@content)") + ") "
        issue_identifier = nccdoc.xpath(
            "string(//*[@name='dc:identifier']/@content)")
        edition_identifier = issue_identifier[0:6]

        if edition_identifier == (""):
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + "Lydbok feilet 😭👎"
            return False

        # Locate the SMIL file and fragment id for the back-cover text; the
        # href has the form "file.smil#id", split on "#".
        try:
            smilFile = nccdoc.xpath(
                "substring-before(//*[text()='Bokomtale' or text()='Baksidetekst' or text()='Omslagstekst']/@href,'#')"
            )
            smilFile_Id = nccdoc.xpath(
                "substring-after(//*[text()='Bokomtale' or text()='Baksidetekst' or text()='Omslagstekst']/@href,'#')"
            )

        except Exception:
            self.utils.report.debug(traceback.format_exc(), preformatted=True)
            self.utils.report.error(
                "Det oppstod en feil for" + edition_identifier +
                " under lasting av smilfilene. Sjekk loggen for detaljer.")
            return False
        # Back-cover

        if (smilFile != ""):
            try:
                smildoc = ElementTree.parse(os.path.join(
                    temp_absdir, smilFile)).getroot()
                # First referenced MP3 plus clip-begin/clip-end for the
                # fragment; clip values look like "npt=12.3s", so take the text
                # between "=" and "s" (seconds).
                mp3File = smildoc.xpath("string((//audio/@src)[1])")
                mp3File_start = smildoc.xpath(
                    "substring-before(substring-after(((//par[@id='{0}' or text/@id='{0}']//audio)[1]/@clip-begin),'='),'s')"
                    .format(smilFile_Id))
                mp3File_end = smildoc.xpath(
                    "substring-before(substring-after(((//par[@id='{0}' or text/@id='{0}']//audio)[last()]/@clip-end),'='),'s')"
                    .format(smilFile_Id))
                if mp3File_start == mp3File_end:
                    self.utils.report.info(
                        "Klarte ikke å bestemme start-/slutt-tid for baksidetekst"
                    )

                # Creates audio segment in milliseconds from start to end of the abstract file
                mp3 = AudioSegment.from_mp3(os.path.join(temp_absdir, mp3File))
                new_mp3 = mp3[float(mp3File_start) * 1000:float(mp3File_end) *
                              1000]
                new_mp3.export(
                    os.path.join(temp_absdir,
                                 self.parentdirs["back-cover"] + ".mp3"))
                self.utils.report.info("Baksidetekst eksportert fra: " +
                                       mp3File)
                file_exists["back-cover"] = True

            except Exception:
                # Best-effort: a missing back-cover excerpt is not fatal.
                self.utils.report.debug(traceback.format_exc(),
                                        preformatted=True)
                self.utils.report.info(
                    "Klarte ikke hente ut baksidetekst for " +
                    edition_identifier + " sjekk loggen for detaljer.")
        else:
            self.utils.report.info("Baksidetekst ikke funnet for " +
                                   edition_identifier)

        # creates abstract from ncc --> smil --> mp3
        # Collect every (smil file, fragment id) pair referenced from the ncc,
        # as two parallel lists indexed the same way.
        several_smilFiles = []
        several_smilFiles_id = []
        try:
            number_of_smilfiles = int(nccdoc.xpath("count(//@href)"))
            for i in range(number_of_smilfiles):
                several_smilFiles.append(
                    nccdoc.xpath(
                        "substring-before((//@href)[{0}],'#')".format(i + 1)))
                several_smilFiles_id.append(
                    nccdoc.xpath(
                        "substring-after((//@href)[{0}],'#')".format(i + 1)))
        except Exception:
            self.utils.report.info(traceback.format_exc(), preformatted=True)
            self.utils.report.info("Klarte ikke hente ut .smil filene for " +
                                   edition_identifier + audio_title)

        # Scan entries starting at the middle of the book, moving forward one
        # entry per iteration, until a clip longer than 50 seconds is found.
        # Bounded by a 2-minute wall clock and by 90% of the entries so the
        # scan cannot run indefinitely.
        timeout = time.time() + 60 * 2
        duration = 0
        num = 0
        try:
            while (duration <= 50 and time.time() < timeout
                   and int(number_of_smilfiles / 2 + num) < int(
                       number_of_smilfiles * 0.9)):
                smilFile_abstract = several_smilFiles[int(number_of_smilfiles *
                                                          0.5 + num)]
                smilFile_abstract_id = several_smilFiles_id[int(
                    number_of_smilfiles * 0.5 + num)]
                smildoc_abstract = ElementTree.parse(
                    os.path.join(temp_absdir, smilFile_abstract)).getroot()

                mp3File_abstract_start = float(
                    smildoc_abstract.xpath(
                        "substring-before(substring-after(((//par[@id='{0}' or text/@id='{0}']//audio)[1]/@clip-begin),'='),'s')"
                        .format(smilFile_abstract_id)))

                # If the next ncc entry points into the same smil file, extend
                # the clip to the end of that next fragment instead.
                if (smilFile_abstract == several_smilFiles[
                        int(number_of_smilfiles * 0.5 + num) + 1]):
                    smilFile_abstract_id = several_smilFiles_id[
                        int(number_of_smilfiles * 0.5 + num) + 1]

                mp3File_abstract_end = float(
                    smildoc_abstract.xpath(
                        "substring-before(substring-after(((//par[@id='{0}' or text/@id='{0}']//audio)[last()]/@clip-end),'='),'s')"
                        .format(smilFile_abstract_id)))
                duration = mp3File_abstract_end - mp3File_abstract_start
                num = num + 1
            # MP3 file backing the chosen clip (first audio src in the smil).
            mp3File_abstract = smildoc_abstract.xpath(
                "string((//audio/@src)[1])")
        except Exception:
            self.utils.report.info(traceback.format_exc(), preformatted=True)
            self.utils.report.info("Lydutdrag fra smilfiler feilet.")

        # Cap the excerpt at 75 seconds.
        if (duration >= 75):
            mp3File_abstract_end = mp3File_abstract_start + 75

        # As a last resort, just use an mp3 of sufficient length

        if (duration < 20):
            try:

                for item in os.listdir(temp_absdir):
                    if (item.endswith(".mp3")):
                        try_mp3 = AudioSegment.from_mp3(
                            os.path.join(temp_absdir, item))

                        # pydub lengths are in milliseconds; keep the longest
                        # file seen so far, stopping once 75 s is reached.
                        if (len(try_mp3) / 1000 > duration):
                            mp3File_abstract = item
                            mp3File_abstract_start = 0
                            mp3File_abstract_end = len(try_mp3) / 1000
                            duration = mp3File_abstract_end

                            if (duration > 75):
                                mp3File_abstract_start = 0.0
                                mp3File_abstract_end = 75.0
                                break
            except Exception:
                self.utils.report.debug(traceback.format_exc(),
                                        preformatted=True)
                self.utils.report.info(
                    "Klarte ikke hente ut lydutdrag basert på mp3 filene i mappa. Sjekk loggen for detaljer."
                )

        # Export abstract
        # Slice the chosen clip (pydub slices by milliseconds) and apply a
        # 3-second fade-out. The broad except also covers the case where no
        # clip was found above (mp3File_abstract unbound → NameError).
        try:
            mp3_abstract = AudioSegment.from_mp3(
                os.path.join(temp_absdir, mp3File_abstract))
            new_mp3_abstract = mp3_abstract[mp3File_abstract_start *
                                            1000:mp3File_abstract_end * 1000]
            final_mp3 = new_mp3_abstract.fade_out(3000)
            final_mp3.export(
                os.path.join(temp_absdir,
                             self.parentdirs["abstracts"] + ".mp3"))
            self.utils.report.info("Lydutdrag eksportert fra: " +
                                   mp3File_abstract)
            file_exists["abstracts"] = True

        except Exception:
            self.utils.report.info(traceback.format_exc(), preformatted=True)
            self.utils.report.error(
                "Klarte ikke eksportere excerpt.mp3. Har du ffmpeg kodeken for .mp3 filer?"
            )

        # Copies abstract and back cover to dir_out
        if (os.path.isfile(
                os.path.join(temp_absdir,
                             self.parentdirs["back-cover"] + ".mp3"))
                or os.path.isfile(
                    os.path.join(temp_absdir,
                                 self.parentdirs["abstracts"] + ".mp3"))):

            # "test-audio" is a copy of the back cover when available,
            # otherwise of the abstract.
            if (file_exists["back-cover"]):
                shutil.copy(
                    os.path.join(temp_absdir,
                                 self.parentdirs["back-cover"] + ".mp3"),
                    os.path.join(temp_absdir,
                                 self.parentdirs["test-audio"] + ".mp3"))
                file_exists["test-audio"] = True
                if (self.parentdirs["abstracts"]):
                    self.utils.report.info(
                        "Baksidetekst og lydutdrag funnet. Kopierer til {}.mp3"
                        .format(self.parentdirs["test-audio"]))
                else:
                    self.utils.report.info(
                        "Baksidetekst funnet. Kopierer til {}.mp3".format(
                            self.parentdirs["test-audio"]))
            elif (self.parentdirs["abstracts"]):
                shutil.copy(
                    os.path.join(temp_absdir,
                                 self.parentdirs["abstracts"] + ".mp3"),
                    os.path.join(temp_absdir,
                                 self.parentdirs["test-audio"] + ".mp3"))
                file_exists["test-audio"] = True
                self.utils.report.info("Lydutdrag funnet. Kopierer til " +
                                       self.parentdirs["test-audio"])

            # Archive every excerpt that was produced, under the edition
            # identifier and (when it differs) the issue identifier as well.
            for key in self.parentdirs:
                if (file_exists[key]):
                    archived_path, stored = self.utils.filesystem.storeBook(
                        os.path.join(temp_absdir,
                                     self.parentdirs[key] + ".mp3"),
                        edition_identifier,
                        parentdir=self.parentdirs[key],
                        file_extension="mp3")
                    if edition_identifier != issue_identifier:
                        archived_path, stored = self.utils.filesystem.storeBook(
                            os.path.join(temp_absdir,
                                         self.parentdirs[key] + ".mp3"),
                            issue_identifier,
                            parentdir=self.parentdirs[key],
                            file_extension="mp3")
                    self.utils.report.attachment(None, archived_path, "DEBUG")

            self.utils.report.title = self.title + ": " + edition_identifier + " lydutdrag ble eksportert 👍😄" + audio_title
        else:
            self.utils.report.title = (
                "Klarte ikke hente ut hverken baksidetekst eller lydutdrag 😭👎. "
            ) + audio_title
            return False

        return True
예제 #22
0
    def plot(self, uids, name):
        dot = Digraph(name="Produksjonssystem", format="png")
        dot.graph_attr["bgcolor"] = "transparent"

        node_ranks = {}
        for rank in Directory.dirs_ranked:
            node_ranks[rank["id"]] = []

        # remember edges so that we don't plot them twice
        edges = {}

        for uid in uids:
            pipeline = None
            for p in self.pipelines:
                if p[0].uid == uid:
                    pipeline = p
                    break
            if not pipeline:
                continue

            group_pipeline = pipeline[0].get_current_group_pipeline()

            title = group_pipeline.get_group_title()
            pipeline_id = group_pipeline.get_group_id()  # re.sub(r"[^a-z\d]", "", title.lower())

            queue = group_pipeline.get_queue()

            queue_created = len([book for book in queue if Pipeline.get_main_event(book) == "created"]) if queue else 0
            queue_deleted = len([book for book in queue if Pipeline.get_main_event(book) == "deleted"]) if queue else 0
            queue_modified = len([book for book in queue if Pipeline.get_main_event(book) == "modified"]) if queue else 0
            queue_triggered = len([book for book in queue if Pipeline.get_main_event(book) == "triggered"]) if queue else 0
            queue_autotriggered = len([book for book in queue if Pipeline.get_main_event(book) == "autotriggered"]) if queue else 0
            queue_string = []
            if queue_created:
                queue_string.append("nye:"+str(queue_created))
            if queue_modified:
                queue_string.append("endret:"+str(queue_modified))
            if queue_deleted:
                queue_string.append("slettet:"+str(queue_deleted))
            if queue_triggered:
                queue_string.append("trigget:"+str(queue_triggered))
            if queue_autotriggered:
                queue_string.append("autotrigget:"+str(queue_autotriggered))
            queue_string = ", ".join(queue_string)

            queue_size = 0
            if queue:
                queue_size = len(queue)
                if not group_pipeline.should_handle_autotriggered_books():
                    queue_size -= queue_autotriggered
            book = Metadata.pipeline_book_shortname(group_pipeline)

            relpath_in = None
            netpath_in = ""
            rank_in = None
            if pipeline[0].dir_in:
                for rank in Directory.dirs_ranked:
                    for dir in rank["dirs"]:
                        if os.path.normpath(pipeline[0].dir_in) == os.path.normpath(rank["dirs"][dir]):
                            rank_in = rank["id"]
                            break
            if pipeline[0].dir_in and not pipeline[0].dir_base:
                relpath_in = os.path.basename(os.path.dirname(pipeline[0].dir_in))
            elif pipeline[0].dir_in and pipeline[0].dir_base:
                base_path = Filesystem.get_base_path(pipeline[0].dir_in, pipeline[0].dir_base)
                relpath_in = os.path.relpath(pipeline[0].dir_in, base_path)
                if "master" in pipeline[0].dir_base and pipeline[0].dir_base["master"] == base_path:
                    pass
                else:
                    if pipeline[0].dir_in not in self.buffered_network_paths:
                        smb, file, unc = Filesystem.networkpath(pipeline[0].dir_in)
                        host = Filesystem.get_host_from_url(smb)
                        self.buffered_network_paths[pipeline[0].dir_in] = smb
                        self.buffered_network_hosts[pipeline[0].dir_in] = host
                    netpath_in = self.buffered_network_hosts[pipeline[0].dir_in]
                    if not netpath_in:
                        netpath_in = self.buffered_network_paths[pipeline[0].dir_in]
            book_count_in = self.get_book_count(pipeline[0].dir_in)
            label_in = "< <font point-size='24'>{}</font>{}{} >".format(
                relpath_in,
                "\n<br/><i><font point-size='20'>{} {}</font></i>".format(book_count_in, "bok" if book_count_in == 1 else "bøker"),
                "\n<br/><i><font point-size='20'>{}</font></i>".format(netpath_in.replace("\\", "\\\\")) if netpath_in else "")

            relpath_out = None
            netpath_out = ""
            rank_out = None
            if pipeline[0].dir_out:
                for rank in Directory.dirs_ranked:
                    for dir in rank["dirs"]:
                        if os.path.normpath(pipeline[0].dir_out) == os.path.normpath(rank["dirs"][dir]):
                            rank_out = rank["id"]
                            break
            if pipeline[0].dir_out and not pipeline[0].dir_base:
                relpath_out = os.path.basename(os.path.dirname(pipeline[0].dir_out))
            elif pipeline[0].dir_out and pipeline[0].dir_base:
                base_path = Filesystem.get_base_path(pipeline[0].dir_out, pipeline[0].dir_base)
                relpath_out = os.path.relpath(pipeline[0].dir_out, base_path)
                if "master" in pipeline[0].dir_base and pipeline[0].dir_base["master"] == base_path:
                    pass
                else:
                    if pipeline[0].dir_out not in self.buffered_network_paths:
                        smb, file, unc = Filesystem.networkpath(pipeline[0].dir_out)
                        host = Filesystem.get_host_from_url(smb)
                        self.buffered_network_paths[pipeline[0].dir_out] = unc
                        self.buffered_network_hosts[pipeline[0].dir_out] = host
                    netpath_out = self.buffered_network_hosts[pipeline[0].dir_out]
                    if not netpath_out:
                        netpath_out = self.buffered_network_paths[pipeline[0].dir_out]
            book_count_out = self.get_book_count(pipeline[0].dir_out, pipeline[0].parentdirs)
            label_out = "< <font point-size='24'>{}</font>{}{} >".format(
                relpath_out,
                "\n<br/><i><font point-size='20'>{} {}</font></i>".format(book_count_out, "bok" if book_count_out == 1 else "bøker"),
                "\n<br/><i><font point-size='20'>{}</font></i>".format(netpath_out.replace("\\", "\\\\")) if netpath_out else "")

            if rank_out:
                node_ranks[rank_out].append(pipeline_id)
            elif rank_in:
                next_rank = self.next_rank(rank_in)
                if next_rank:
                    node_ranks[next_rank].append(pipeline_id)
                else:
                    node_ranks[rank_in].append(pipeline_id)

            state = group_pipeline.get_state()
            status = group_pipeline.get_status()
            progress_text = group_pipeline.get_progress()
            pipeline_label = "< <font point-size='26'>{}</font>{} >".format(
                title,
                "".join(["\n<br/><i><font point-size='22'>{}</font></i>".format(val) for val in [queue_string, progress_text, status] if val]))

            fillcolor = "lightskyblue1"
            if book or queue_size:
                fillcolor = "lightslateblue"
            elif state == "considering":
                fillcolor = "lightskyblue3"
            elif not group_pipeline.running:
                fillcolor = "white"
            elif isinstance(group_pipeline, DummyPipeline):
                fillcolor = "snow"
            dot.attr("node", shape="box", style="filled", fillcolor=fillcolor)
            dot.node(pipeline_id, pipeline_label.replace("\\", "\\\\"))

            if relpath_in:
                fillcolor = "wheat"
                if not pipeline[0].dir_in_obj or not pipeline[0].dir_in_obj.is_available():
                    fillcolor = "white"
                dot.attr("node", shape="folder", style="filled", fillcolor=fillcolor)
                dot.node(pipeline[1], label_in)
                if pipeline[1] not in edges:
                    edges[pipeline[1]] = []
                if pipeline_id not in edges[pipeline[1]]:
                    edges[pipeline[1]].append(pipeline_id)
                    dot.edge(pipeline[1], pipeline_id)
                node_ranks[rank_in].append(pipeline[1])

            if relpath_out:
                fillcolor = "wheat"
                if not pipeline[0].dir_out_obj or not pipeline[0].dir_out_obj.is_available():
                    fillcolor = "white"
                dot.attr("node", shape="folder", style="filled", fillcolor=fillcolor)
                dot.node(pipeline[2], label_out)
                if pipeline_id not in edges:
                    edges[pipeline_id] = []
                if pipeline[2] not in edges[pipeline_id]:
                    edges[pipeline_id].append(pipeline[2])
                    dot.edge(pipeline_id, pipeline[2])
                node_ranks[rank_out].append(pipeline[2])

        for rank in node_ranks:
            subgraph = Digraph("cluster_" + rank, graph_attr={"style": "dotted"})
            subgraph.graph_attr["bgcolor"] = "#FFFFFFAA"

            if node_ranks[rank]:
                subgraph.attr("node", shape="none", style="filled", fillcolor="transparent")
                subgraph.node("_ranklabel_" + rank, "< <i><font point-size='28'>{}</font></i> >".format(" <br/>".join(str(self.rank_name(rank)).split(" "))))

            for dir in node_ranks[rank]:
                subgraph.node(dir)

            dot.subgraph(subgraph)

        dot.render(os.path.join(self.report_dir, name + "_"))

        # there seems to be some race condition when doing this across a mounted network drive,
        # so if we get an exception we retry a few times and hope that it works.
        # see: https://github.com/nlbdev/produksjonssystem/issues/81
        for t in reversed(range(10)):
            try:
                shutil.copyfile(os.path.join(self.report_dir, name + "_.png"), os.path.join(self.report_dir, name + ".png"))
                with open(os.path.join(self.report_dir, name + ".js"), "w") as javascript_file:
                    javascript_file.write("setTime(\"{}\");".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                break
            except Exception as e:
                logging.debug(" Unable to copy plot image: {}".format(os.path.join(self.report_dir, name + "_.png")))
                time.sleep(0.5)
                if t == 0:
                    raise e

        dashboard_file = os.path.join(self.report_dir, name + ".html")
        if not os.path.isfile(dashboard_file):
            dashboard_template = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../dashboard.html'))
            if not os.path.exists(self.report_dir):
                os.makedirs(self.report_dir)
            shutil.copyfile(dashboard_template, dashboard_file)
예제 #23
0
    def on_book(self):
        """Convert an EPUB into a braille-ready HTML fileset.

        Copies the source EPUB to a temporary directory, transforms its main
        HTML document with the prepare-for-braille XSLT, renames the HTML file
        to the new book number found in the transformed document, removes
        EPUB-specific files (OPF, navigation document, audio, SMIL), and
        stores the resulting fileset in the archive for braille-ready HTML.

        Returns:
            True on success, False on any failure.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            # title is only used for nicer report headings; ignore if missing
            pass

        # check that this actually is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
            # was a bare `return` (None); every other failure path returns False
            return False

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
            return False

        # ---------- make a copy of the EPUB ----------

        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adjust the content of the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # the main HTML document is the manifest item referenced by the first spine item
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href")
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False
        html_file = os.path.join(os.path.dirname(opf_path), html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False

        temp_html_obj = tempfile.NamedTemporaryFile()
        temp_html = temp_html_obj.name

        self.utils.report.info("Tilpasser innhold for punktskrift...")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForBraille.uid, "prepare-for-braille.xsl"),
                    source=html_file,
                    target=temp_html)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_html, html_file)

        # ---------- get the new book number from /html/head/meta[@name='dc:identifier'] and use it as file name ----------

        html_xml = ElementTree.parse(temp_html).getroot()
        result_identifier = html_xml.xpath("/*/*[local-name()='head']/*[@name='dc:identifier']")
        result_identifier = (result_identifier[0].attrib["content"]
                             if result_identifier and "content" in result_identifier[0].attrib
                             else None)
        if not result_identifier:
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å finne boknummer i ny HTML-fil.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False

        # rename the HTML file to "<identifier>.html" via the temp copy
        shutil.copy(html_file, temp_html)
        os.remove(html_file)
        html_file = os.path.join(
            os.path.dirname(html_file),
            result_identifier + ".html")  # use html instead of xhtml when it is not an EPUB
        shutil.copy(temp_html, html_file)
        # TODO: insert HTML5 doctype: <!DOCTYPE html>

        # ---------- delete EPUB-specific files ----------

        items = opf_xml.xpath("/*/*[local-name()='manifest']/*")
        for item in items:
            delete = False

            # the navigation document is EPUB-specific
            if "properties" in item.attrib and "nav" in re.split(r'\s+', item.attrib["properties"]):
                delete = True

            # audio and SMIL media overlays are not needed in the braille fileset
            if "media-type" in item.attrib:
                if item.attrib["media-type"].startswith("audio/"):
                    delete = True
                elif item.attrib["media-type"] == "application/smil+xml":
                    delete = True

            if not delete or "href" not in item.attrib:
                continue

            fullpath = os.path.join(os.path.dirname(opf_path), item.attrib["href"])
            os.remove(fullpath)
        os.remove(opf_path)

        # ---------- store the HTML fileset ----------

        html_dir = os.path.dirname(opf_path)

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til arkiv for punkt-klare HTML-filer."
        )

        archived_path, stored = self.utils.filesystem.storeBook(html_dir, self.book["name"])
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + self.book["name"] + " ble konvertert 👍😄" + epubTitle
        return True
예제 #24
0
    def on_book(self):
        """Insert format-specific Bibliofil metadata into an EPUB.

        Validates that the book is an EPUB with a consistent identifier,
        checks whether it should be produced in self.publication_format,
        copies the EPUB to a temporary directory, inserts the metadata,
        and stores the updated EPUB in the archive.

        Returns:
            True if metadata was inserted (or the book should not be produced
            in this format), False on any error.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            # title is only used for nicer report headings; ignore if missing
            pass

        # check that this is an EPUB (we only insert metadata into EPUBs)
        if not epub.isepub():
            return False

        if not epub.identifier():
            # NOTE: string restored from mojibake (UTF-8 mis-decoded as CP855)
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier."
            )
            return False

        if epub.identifier() != self.book["name"].split(".")[0]:
            self.utils.report.error(
                self.book["name"] +
                ": Filnavn stemmer ikke overens med dc:identifier: {}".format(
                    epub.identifier()))
            return False

        should_produce, metadata_valid = Metadata.should_produce(
            epub.identifier(),
            self.publication_format,
            report=self.utils.report)
        if not metadata_valid:
            self.utils.report.info(
                "{} har feil i metadata for {}. Avbryter.".format(
                    epub.identifier(), self.publication_format))
            self.utils.report.title = "{}: {} har feil i metadata for {} 😭👎 {}".format(
                self.title, epub.identifier(), self.publication_format,
                epubTitle)
            return False
        if not should_produce:
            self.utils.report.info(
                "{} skal ikke produseres som {}. Avbryter.".format(
                    epub.identifier(), self.publication_format))
            # emoji reconstructed from the CP855-mangled bytes (F0 9F A4 B7) — TODO confirm against upstream
            self.utils.report.title = "{}: {} Skal ikke produseres som {} 🤷 {}".format(
                self.title, epub.identifier(), self.publication_format,
                epubTitle)
            # not producing this format is not an error
            return True

        self.utils.report.info("Lager en kopi av EPUBen")
        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        temp_epub = Epub(self.utils.report, temp_epubdir)

        is_valid = Metadata.insert_metadata(
            self.utils.report,
            temp_epub,
            publication_format=self.publication_format,
            report_metadata_errors=False)
        if not is_valid:
            self.utils.report.error(
                "Bibliofil-metadata var ikke valide. Avbryter.")
            return False

        self.utils.report.info(
            "Boken ble oppdatert med format-spesifikk metadata. Kopierer til {}-arkiv."
            .format(self.publication_format))

        archived_path, stored = self.utils.filesystem.storeBook(
            temp_epub.asDir(), epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")

        self.utils.report.title = "{}: {} har fått {}-spesifikk metadata og er klar til å produseres 👍😄 {}".format(
            self.title, epub.identifier(), self.publication_format,
            temp_epub.meta("dc:title"))

        return True
예제 #25
0
    def on_book(self):
        """Convert an EPUB into a DOCX-ready fileset.

        Copies the source EPUB to a temporary directory, transforms its main
        HTML document with the prepare-for-docx XSLT, and stores the resulting
        fileset in the archive.

        Returns:
            True on success, False on any failure.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            # title is only used for nicer report headings; ignore if missing
            pass

        # check that this actually is an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
            return False

        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎"
            return False

        # ---------- make a copy of the EPUB ----------

        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        # pass the report, not the pipeline, as first argument — consistent
        # with every other Epub(...) instantiation in this codebase
        temp_epub = Epub(self.utils.report, temp_epubdir)

        # ---------- adjust the content of the HTML file with XSLT ----------

        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # the main HTML document is the manifest item referenced by the first spine item
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href")
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False
        html_dir = os.path.dirname(opf_path)
        html_file = os.path.join(html_dir, html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] + ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book["name"] + " feilet 😭👎" + epubTitle
            return False

        temp_html_obj = tempfile.NamedTemporaryFile()
        temp_html = temp_html_obj.name

        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, PrepareForDocx.uid, "prepare-for-docx.xsl"),
                    source=html_file,
                    target=temp_html)
        if not xslt.success:
            self.utils.report.title = self.title + ": " + epub.identifier() + " feilet 😭👎" + epubTitle
            return False
        shutil.copy(temp_html, html_file)

        archived_path, stored = self.utils.filesystem.storeBook(temp_epubdir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier() + " ble konvertert 👍😄" + epubTitle
        return True
예제 #26
0
    def on_book(self):
        self.utils.report.attachment(None, self.book["source"], "DEBUG")

        metadata = Metadata.get_metadata_from_book(self.utils.report,
                                                   self.book["source"])
        metadata["identifier"] = re.sub(r"[^\d]", "", metadata["identifier"])
        if not metadata["identifier"]:
            self.utils.report.error(
                "Klarte ikke å bestemme boknummer for {}".format(
                    self.book["name"]))
            return False
        if metadata["identifier"] != self.book["name"]:
            self.utils.report.info("Boknummer for {} er: {}".format(
                self.book["name"], metadata["identifier"]))

        self.utils.report.info("Lager en kopi av DTBoken")
        temp_dtbookdir_obj = tempfile.TemporaryDirectory()
        temp_dtbookdir = temp_dtbookdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_dtbookdir)

        # find DTBook XML
        dtbook = None
        for root, dirs, files in os.walk(temp_dtbookdir):
            for f in files:
                if f.endswith(".xml"):
                    xml = ElementTree.parse(os.path.join(root, f)).getroot()
                    if xml.xpath(
                            "namespace-uri()"
                    ) == "http://www.daisy.org/z3986/2005/dtbook/":
                        dtbook = os.path.join(root, f)
                        break
                if dtbook is not None:
                    break
        if not dtbook:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne DTBook")
            return False

        # rename all files to lower case
        for root, dirs, files in os.walk(temp_dtbookdir):
            for f in files:
                if not f.lower() == f:
                    self.utils.report.warn(
                        "renaming to lowercase: {}".format(f))
                    shutil.move(os.path.join(root, f),
                                os.path.join(root, f.lower()))

        temp_dtbook_file_obj = tempfile.NamedTemporaryFile()
        temp_dtbook_file = temp_dtbook_file_obj.name

        self.utils.report.info("Rydder opp i nordisk DTBook")
        xslt = Xslt(self,
                    stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir,
                                            NordicDTBookToEpub.uid,
                                            "nordic-cleanup-dtbook.xsl"),
                    source=dtbook,
                    target=temp_dtbook_file)
        if not xslt.success:
            return False
        shutil.copy(temp_dtbook_file, dtbook)

        self.utils.report.info("Validerer Nordisk DTBook...")

        # create context for Pipeline 2 job
        dtbook_dir = os.path.dirname(dtbook)
        dtbook_context = {}
        for root, dirs, files in os.walk(dtbook_dir):
            for file in files:
                fullpath = os.path.join(root, file)
                relpath = os.path.relpath(fullpath, dtbook_dir)
                dtbook_context[relpath] = fullpath

        with DaisyPipelineJob(
                self,
                "nordic-dtbook-validate", {
                    "dtbook": os.path.basename(dtbook),
                    "no-legacy": "false"
                },
                pipeline_and_script_version=[
                    ("1.13.6", "1.4.6"),
                    ("1.13.4", "1.4.5"),
                    ("1.12.1", "1.4.2"),
                    ("1.11.1-SNAPSHOT", "1.3.0"),
                ],
                context=dtbook_context) as dp2_job_dtbook_validate:
            dtbook_validate_status = None
            if dp2_job_dtbook_validate.status == "SUCCESS":
                dtbook_validate_status = "SUCCESS"
            elif dp2_job_dtbook_validate.status in ["VALIDATION_FAIL", "FAIL"]:
                dtbook_validate_status = "WARN"
            else:
                dtbook_validate_status = "ERROR"

            report_file = os.path.join(dp2_job_dtbook_validate.dir_output,
                                       "html-report/report.xhtml")

            if dtbook_validate_status == "WARN":
                report_doc = ElementTree.parse(report_file)
                errors = report_doc.xpath(
                    '//*[@class="error" or @class="message-error"]')
                for error in errors:
                    error_text = " ".join(
                        [e.strip() for e in error.xpath('.//text()')]).strip()
                    error_text = " ".join(error_text.split()).strip() if bool(
                        error_text) else error_text
                    if (bool(error_text) and
                        (error_text.startswith("[tpb124]")
                         or error_text.startswith("[tpb43]")
                         or error_text.startswith("[tpb10] Meta dc:Publisher")
                         or error_text.startswith("[tpb10] Meta dc:Date")
                         or error_text.startswith("[opf3g]")
                         or 'element "h1" not allowed here' in error_text
                         or 'element "h2" not allowed here' in error_text
                         or 'element "h3" not allowed here' in error_text
                         or 'element "h4" not allowed here' in error_text
                         or 'element "h5" not allowed here' in error_text
                         or 'element "h6" not allowed here' in error_text
                         or 'token "toc-brief" invalid' in error_text)):
                        continue  # ignorer disse feilmeldingene

                    if error_text.startswith("Incorrect file signature"):
                        magic_number = error.xpath(
                            '*[@class="message-details"]/*[last()]/*[last()]/text()'
                        )[0]
                        magic_number = " ".join(magic_number.split()).strip(
                        ) if bool(magic_number) else magic_number

                        # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46

                        if magic_number.startswith(
                                "0xFF 0xD8 0xFF 0xDB"):  # Also allow JPEG RAW
                            continue
                        elif magic_number[:
                                          19] == "0xFF 0xD8 0xFF 0xE1" and magic_number[
                                              30:] == ("0x45 0x78 0x69 0x66"
                                                       ):  # Also allow EXIF
                            continue
                        else:
                            dtbook_validate_status = "ERROR"
                            self.utils.report.error(error_text)

                    else:
                        dtbook_validate_status = "ERROR"
                        self.utils.report.error(error_text)

            # get conversion report
            if os.path.isfile(report_file):
                with open(report_file, 'r') as result_report:
                    self.utils.report.attachment(
                        result_report.readlines(),
                        os.path.join(self.utils.report.reportDir(),
                                     "report-dtbook.html"),
                        dtbook_validate_status)

            if dtbook_validate_status == "ERROR":
                self.utils.report.error("Klarte ikke å validere boken")
                return False

            if dtbook_validate_status == "WARN":
                self.utils.report.warn(
                    "DTBoken er ikke valid, men vi fortsetter alikevel.")

        self.utils.report.info(
            "Konverterer fra Nordisk DTBook til Nordisk HTML...")
        temp_htmldir_obj = tempfile.TemporaryDirectory()
        temp_htmldir = temp_htmldir_obj.name
        temp_htmlfile = None
        with DaisyPipelineJob(
                self,
                "nordic-dtbook-to-html", {
                    "dtbook": os.path.basename(dtbook),
                    "fail-on-error": "false",
                    "no-legacy": "false"
                },
                pipeline_and_script_version=[
                    ("1.13.6", "1.4.6"),
                    ("1.13.4", "1.4.5"),
                    ("1.12.1", "1.4.2"),
                    ("1.11.1-SNAPSHOT", "1.3.0"),
                ],
                context=dtbook_context) as dp2_job_dtbook_to_html:
            convert_status = "SUCCESS" if dp2_job_dtbook_to_html.status == "SUCCESS" else "ERROR"

            convert_report_file = os.path.join(
                dp2_job_dtbook_to_html.dir_output, "html-report/report.xhtml")

            if convert_status != "SUCCESS":
                self.utils.report.error(
                    "Klarte ikke å konvertere boken fra DTBook til HTML")

                # get conversion report
                if os.path.isfile(convert_report_file):
                    with open(convert_report_file, 'r') as result_report:
                        self.utils.report.attachment(
                            result_report.readlines(),
                            os.path.join(self.utils.report.reportDir(),
                                         "report-dtbook-to-html.html"),
                            convert_status)

                return False

            dp2_html_dir = os.path.join(dp2_job_dtbook_to_html.dir_output,
                                        "output-dir")

            if not os.path.isdir(dp2_html_dir):
                self.utils.report.error(
                    "Finner ikke 'output-dir' for den konverterte boken: {}".
                    format(dp2_html_dir))
                return False

            Filesystem.copy(self.utils.report, dp2_html_dir, temp_htmldir)
            temp_htmlfile = os.path.join(temp_htmldir,
                                         metadata["identifier"] + ".xhtml")

        if not os.path.isfile(temp_htmlfile):
            self.utils.report.error(
                "Finner ikke den konverterte boken: {}".format(temp_htmlfile))
            self.utils.report.info(
                "Kanskje filnavnet er forskjellig fra IDen?")
            return False

        self.utils.report.info("Rydder opp i nordisk HTML")
        temp_html_xslt_output_obj = tempfile.NamedTemporaryFile()
        temp_html_xslt_output = temp_html_xslt_output_obj.name
        xslt = Xslt(self,
                    stylesheet=os.path.join(NordicDTBookToEpub.xslt_dir,
                                            NordicDTBookToEpub.uid,
                                            "nordic-cleanup-html.xsl"),
                    source=temp_htmlfile,
                    target=temp_html_xslt_output)
        if not xslt.success:
            return False
        shutil.copy(temp_html_xslt_output, temp_htmlfile)

        self.utils.report.info(
            "Konverterer fra Nordisk HTML til Nordisk EPUB3...")

        # create context for Pipeline 2 job
        html_dir = os.path.dirname(temp_htmlfile)
        html_context = {}
        for root, dirs, files in os.walk(html_dir):
            for file in files:
                fullpath = os.path.join(root, file)
                relpath = os.path.relpath(fullpath, html_dir)
                html_context[relpath] = fullpath

        temp_epub_file_obj = tempfile.NamedTemporaryFile()
        temp_epub_file = temp_epub_file_obj.name
        with DaisyPipelineJob(self,
                              "nordic-html-to-epub3", {
                                  "html": os.path.basename(temp_htmlfile),
                                  "fail-on-error": "false"
                              },
                              pipeline_and_script_version=[
                                  ("1.13.6", "1.4.6"),
                                  ("1.13.4", "1.4.5"),
                                  ("1.12.1", "1.4.2"),
                                  ("1.11.1-SNAPSHOT", "1.3.0"),
                              ],
                              context=html_context) as dp2_job_html_to_epub:
            convert_status = "SUCCESS" if dp2_job_html_to_epub.status == "SUCCESS" else "ERROR"

            convert_report_file = os.path.join(dp2_job_html_to_epub.dir_output,
                                               "html-report/report.xhtml")

            if convert_status != "SUCCESS":
                self.utils.report.error("Klarte ikke å konvertere boken")

                # get conversion report
                if os.path.isfile(convert_report_file):
                    with open(convert_report_file, 'r') as result_report:
                        self.utils.report.attachment(
                            result_report.readlines(),
                            os.path.join(self.utils.report.reportDir(),
                                         "report-html-to-epub3.html"),
                            convert_status)

                return False

            dp2_epub_file = os.path.join(dp2_job_html_to_epub.dir_output,
                                         "output-dir",
                                         metadata["identifier"] + ".epub")

            if not os.path.isfile(dp2_epub_file):
                self.utils.report.error(
                    "Finner ikke den konverterte boken: {}".format(
                        dp2_epub_file))
                self.utils.report.info(
                    "Kanskje filnavnet er forskjellig fra IDen?")
                return False

            self.utils.report.info("Validerer Nordisk EPUB 3...")
            epub_file = dp2_epub_file.asFile()
            with DaisyPipelineJob(self,
                                  "nordic-epub3-validate",
                                  {"epub": os.path.basename(epub_file)},
                                  pipeline_and_script_version=[
                                      ("1.13.6", "1.4.6"),
                                      ("1.13.4", "1.4.5"),
                                      ("1.12.1", "1.4.2"),
                                      ("1.11.1-SNAPSHOT", "1.3.0"),
                                  ],
                                  context={
                                      os.path.basename(epub_file): epub_file
                                  }) as dp2_job_epub_validate:
                epub_validate_status = "SUCCESS" if dp2_job_epub_validate.status == "SUCCESS" else "ERROR"

                report_file = os.path.join(dp2_job_epub_validate.dir_output,
                                           "html-report/report.xhtml")

                if epub_validate_status == "ERROR":

                    # attach intermediary file from conversion
                    with open(temp_htmlfile, 'r') as intermediary_htmlfile:
                        self.utils.report.attachment(
                            intermediary_htmlfile.readlines(),
                            os.path.join(self.utils.report.reportDir(),
                                         "intermediary-html.html"), "DEBUG")

                    epub_validate_status = "WARN"

                    report_doc = ElementTree.parse(report_file)
                    errors = report_doc.xpath(
                        '//*[@class="error" or @class="message-error"]')
                    for error in errors:
                        error_text = " ".join([
                            e.strip() for e in error.xpath('.//text()')
                        ]).strip()
                        error_text = " ".join(error_text.split()).strip(
                        ) if bool(error_text) else error_text

                        if (bool(error_text) and
                            (error_text.startswith("[nordic280]")
                             or "PKG-021: Corrupted image file encountered."
                             in error_text)):
                            continue  # ignorer disse feilmeldingene
                        else:
                            self.utils.report.warn(
                                "Not ignoring: {}".format(error_text))

                        if error_text.startswith("Incorrect file signature"):
                            magic_number = error.xpath(
                                '*[@class="message-details"]/*[last()]/*[last()]/text()'
                            )[0]
                            magic_number = " ".join(magic_number.split(
                            )).strip() if bool(magic_number) else magic_number

                            # JFIF already allowed: 0xFF 0xD8 0xFF 0xE0 0x?? 0x?? 0x4A 0x46 0x49 0x46

                            if magic_number.startswith(
                                    "0xFF 0xD8 0xFF 0xDB"
                            ):  # Also allow JPEG RAW
                                continue
                            elif magic_number[:
                                              19] == "0xFF 0xD8 0xFF 0xE1" and magic_number[
                                                  30:] == (
                                                      "0x45 0x78 0x69 0x66"
                                                  ):  # Also allow EXIF
                                continue
                            else:
                                epub_validate_status = "ERROR"
                                self.utils.report.error(error_text)

                        else:
                            epub_validate_status = "ERROR"
                            self.utils.report.error(error_text)

                # get conversion report
                if os.path.isfile(report_file):
                    with open(report_file, 'r') as result_report:
                        self.utils.report.attachment(
                            result_report.readlines(),
                            os.path.join(self.utils.report.reportDir(),
                                         "report-epub3.html"),
                            epub_validate_status)

                if epub_validate_status == "ERROR":
                    self.utils.report.error(
                        "Klarte ikke å validere EPUB 3-versjonen av boken")
                    return False

            Filesystem.copy(self.utils.report, dp2_epub_file, temp_epub_file)

        epub = Epub(self.utils.report, temp_epub_file)
        if not epub.isepub():
            return False

        self.utils.report.info(
            "Boken ble konvertert. Kopierer til EPUB3-fra-DTBook-arkiv.")
        archived_path, stored = self.utils.filesystem.storeBook(
            epub.asDir(), metadata["identifier"], overwrite=self.overwrite)
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = "{}: {} ble konvertert 👍😄 ({})".format(
            self.title, metadata["identifier"], metadata["title"])
        return True
예제 #27
0
    def on_book(self) -> bool:
        """Convert an HTML book to PEF (braille) and store it in the PEF archive.

        The book in self.book["source"] is copied to a temporary directory,
        the HTML file is located and inspected for formatting metadata
        (line spacing, duplex printing, identifier), converted to PEF with a
        Daisy Pipeline 2 job, enriched with metadata about the conversion,
        and finally archived with storeBook().

        Returns:
            bool: True when the book was converted and archived, False otherwise.
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")

        # Work on a temporary copy so the original file set is never modified.
        self.utils.report.info("Lager en kopi av filsettet")
        temp_htmldir_obj = tempfile.TemporaryDirectory()
        temp_htmldir = temp_htmldir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_htmldir)

        # Find the HTML file; if several files match, the last one found wins.
        self.utils.report.info("Finner HTML-fila")
        html_file = None
        for root, dirs, files in os.walk(temp_htmldir):
            for f in files:
                if f.endswith("html"):
                    html_file = os.path.join(root, f)
        if not html_file or not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne en HTML-fil.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet "
            return False

        html_xml = ElementTree.parse(html_file).getroot()
        # dc:identifier meta elements; reduced to a plain string further down.
        identifier = html_xml.xpath(
            "/*/*[local-name()='head']/*[@name='dc:identifier']")

        metadata = Metadata.get_metadata_from_book(self.utils.report,
                                                   temp_htmldir)

        # Determine braille formatting options from the HTML head metadata.
        line_spacing = "single"
        duplex = "true"
        for e in html_xml.xpath(
                "/*/*[local-name()='head']/*[@name='dc:format.linespacing']"):
            if "double" == e.attrib["content"]:
                line_spacing = "double"
        for e in html_xml.xpath(
                "/*/*[local-name()='head']/*[@name='dc:format.printing']"):
            if "single-sided" == e.attrib["content"]:
                duplex = "false"
        self.utils.report.info("Linjeavstand: {}".format(
            "åpen" if line_spacing == "double" else "enkel"))
        self.utils.report.info("Trykk: {}".format("enkeltsidig" if duplex ==
                                                  "false" else "dobbeltsidig"))

        # NOTE(review): this empty-string assignment is dead — it is
        # immediately overwritten by the expression below.
        bookTitle = ""
        bookTitle = " (" + html_xml.xpath(
            "string(/*/*[local-name()='head']/*[local-name()='title']/text())"
        ) + ") "

        # Reduce the dc:identifier element list to its content attribute, or
        # None when the element or the attribute is missing.
        identifier = identifier[0].attrib[
            "content"] if identifier and "content" in identifier[
                0].attrib else None
        if not identifier:
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å finne boknummer i HTML-fil.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet "
            return False
        # Optional EPUB identifier (same reduction as above); may stay None.
        epub_identifier = html_xml.xpath(
            "/*/*[local-name()='head']/*[@name='nlbprod:identifier.epub']")
        epub_identifier = epub_identifier[0].attrib[
            "content"] if epub_identifier and "content" in epub_identifier[
                0].attrib else None

        # ---------- konverter til PEF ----------

        # create context for Pipeline 2 job: every non-media file in the HTML
        # directory, keyed by its path relative to that directory.
        html_dir = os.path.dirname(html_file)
        html_context = {}
        for root, dirs, files in os.walk(html_dir):
            for file in files:
                kind = mimetypes.guess_type(file)[0]
                if kind is not None and kind.split("/")[0] in [
                        "image", "video", "audio"
                ]:
                    continue  # ignore media files
                fullpath = os.path.join(root, file)
                relpath = os.path.relpath(fullpath, html_dir)
                html_context[relpath] = fullpath

        # Default: NLB's own html-to-pef script.
        script_id = "nlb:html-to-pef"
        pipeline_and_script_version = [
            ("1.11.1-SNAPSHOT", "1.10.0-SNAPSHOT"),
        ]
        braille_arguments = {
            "source": os.path.basename(html_file),
            "braille-standard": "(dots:6)(grade:0)",
            "line-spacing": line_spacing,
            "duplex": duplex,
        }

        # for custom Statped options using NLBs PIP (remove `and False` or replace with `or True` to test)
        # NOTE(review): the `and False` keeps this branch permanently disabled.
        if metadata["library"].lower() == "statped" and False:
            # see: https://github.com/nlbdev/pipeline/blob/nlb/nlb/book-to-pef/src/main/resources/xml/html-to-pef.xpl#L146-L167
            #
            # (1) 'http://www.nlb.no/pipeline/modules/braille/pre-processing.xsl',
            # (2) 'http://www.daisy.org/pipeline/modules/braille/xml-to-pef/generate-toc.xsl',
            # (3) if ($default-table-class = '') then resolve-uri('add-table-classes.xsl') else (),
            # (4) if ($insert-boilerplate = 'true') then 'http://www.nlb.no/pipeline/modules/braille/insert-boilerplate.xsl' else (),
            # (5) if ($apply-default-stylesheet = 'true') then 'http://www.nlb.no/pipeline/modules/braille/default.scss' else (),
            # (6) if ($stylesheet) then tokenize($stylesheet,',') else ()),' ')"/>

            braille_arguments["insert-boilerplate"] = "false"  # disable (4)
            braille_arguments[
                "apply-default-stylesheet"] = "false"  # disable (5)

            # (1-3) will still be included. Specifying (6) let's us include replacements for (4) and (5)
            braille_arguments["stylesheet"] = ",".join([
                "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/insert-boilerplate.xsl",
                "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/scss/braille.scss"
            ])

        # for custom Statped options using DAISYs PIP (remove `and False` or replace with `or True` to test)
        # NOTE(review): the `and True` makes this branch always taken for
        # Statped books, replacing the script/arguments chosen above.
        if metadata["library"].lower() == "statped" and True:
            # use DAISYs version of PIP instead
            script_id = "html-to-pef"
            pipeline_and_script_version = [
                ("1.14.6", None),
                ("1.14.5", None),
                ("1.14.4", "4.2.0"),
                ("1.14.4-SNAPSHOT", "4.1.1"),
                ("1.14.3", "4.1.1"),
                ("1.14.2", "4.1.0"),
                ("1.13.6", "1.4.6"),
                ("1.13.4", "1.4.5"),
                ("1.12.1", "1.4.2"),
                ("1.11.1-SNAPSHOT", "1.3.0"),
            ]

            braille_arguments = {
                "html":
                os.path.basename(html_file),
                "transform":
                "(formatter:dotify)(translator:liblouis)(dots:6)(grade:0)",
                "stylesheet":
                " ".join([
                    # 1. better volume breaking, and also removes title page and print toc, moves the colophon and copyright page to the end of the book
                    # "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/pre-processing.xsl",
                    "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/pre-processing.xsl",

                    #"https://raw.githubusercontent.com/daisy/pipeline/master/modules/braille/xml-to-pef/src/main/resources/xml/xslt/generate-toc.xsl",

                    # 3. NLB: Add table classes based on the dimensions of the table, for better handling of tables
                    "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/add-table-classes.xsl",

                    # 4. NLB: Generate a new title page and about page in the frontmatter
                    # "https://raw.githubusercontent.com/nlbdev/pipeline/nlb/nlb/book-to-pef/src/main/resources/xml/insert-boilerplate.xsl",
                    "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/xslt/insert-boilerplate.xsl",
                    # 5. Statped-specific SCSS
                    "https://raw.githubusercontent.com/StatpedEPUB/nlb-scss/master/src/scss/braille.scss",
                ]),
                "page-width":
                '38',
                "page-height":
                '29',
                "toc-depth":
                '2',
                "maximum-number-of-sheets":
                '50',
                "include-production-notes":
                'true',
                "hyphenation":
                'false',
                "allow-volume-break-inside-leaf-section-factor":
                '10',
                "prefer-volume-break-before-higher-level-factor":
                '1',
                "stylesheet-parameters":
                "(skip-margin-top-of-page:true)",
            }

        # Temporary destination for the converted PEF file set.
        pef_tempdir_object = tempfile.TemporaryDirectory()

        self.utils.report.info("Konverterer fra HTML til PEF...")
        found_pipeline_version = None
        found_script_version = None
        with DaisyPipelineJob(
                self,
                script_id,
                braille_arguments,
                pipeline_and_script_version=pipeline_and_script_version,
                context=html_context) as dp2_job:
            # Remember which versions actually ran, for the PEF metadata below.
            found_pipeline_version = dp2_job.found_pipeline_version
            found_script_version = dp2_job.found_script_version

            # get conversion report (HTML preview), attach it to our report
            if os.path.isdir(
                    os.path.join(dp2_job.dir_output, "preview-output-dir")):
                Filesystem.copy(
                    self.utils.report,
                    os.path.join(dp2_job.dir_output, "preview-output-dir"),
                    os.path.join(self.utils.report.reportDir(), "preview"))
                self.utils.report.attachment(
                    None,
                    os.path.join(self.utils.report.reportDir(),
                                 "preview" + "/" + identifier + ".pef.html"),
                    "SUCCESS" if dp2_job.status == "SUCCESS" else "ERROR")

            if dp2_job.status != "SUCCESS":
                self.utils.report.info("Klarte ikke å konvertere boken")
                self.utils.report.title = self.title + ": " + identifier + " feilet 😭👎" + bookTitle
                return False

            # Newer pipeline versions write to "output-dir" instead of
            # "pef-output-dir"; fall back when the old directory is absent.
            dp2_pef_dir = os.path.join(dp2_job.dir_output, "pef-output-dir")
            dp2_new_pef_dir = os.path.join(dp2_job.dir_output, "output-dir")
            if not os.path.exists(dp2_pef_dir) and os.path.exists(
                    dp2_new_pef_dir):
                dp2_pef_dir = dp2_new_pef_dir

            if not os.path.isdir(dp2_pef_dir):
                self.utils.report.info("Finner ikke den konverterte boken.")
                self.utils.report.title = self.title + ": " + identifier + " feilet 😭👎" + bookTitle
                return False

            # Copy the result out before the job's output directory goes away.
            Filesystem.copy(self.utils.report, dp2_pef_dir,
                            pef_tempdir_object.name)

            self.utils.report.info("Boken ble konvertert.")

        self.utils.report.info("Kopierer metadata fra HTML til PEF...")
        try:
            # Find the generated PEF file (last match wins).
            pef_file = None
            for root, dirs, files in os.walk(pef_tempdir_object.name):
                for f in files:
                    if f.endswith(".pef"):
                        pef_file = os.path.join(root, f)
            if not pef_file or not os.path.isfile(pef_file):
                self.utils.report.error(self.book["name"] +
                                        ": Klarte ikke å finne en PEF-fil.")
            else:
                # Record the pipeline/script versions and every job argument
                # as metadata in the PEF, for traceability.
                additional_metadata = []
                additional_metadata.append(
                    ("daisy-pipeline-engine-version", "nlbprod",
                     "http://www.nlb.no/production", None,
                     found_pipeline_version))
                additional_metadata.append(
                    ("daisy-pipeline-script-id", "nlbprod",
                     "http://www.nlb.no/production", None, script_id))
                additional_metadata.append(
                    ("daisy-pipeline-script-version", "nlbprod",
                     "http://www.nlb.no/production", None,
                     found_script_version))
                for argument in braille_arguments:
                    if argument in ["source", "html"]:
                        continue  # skip HTML file path
                    values = braille_arguments[argument]
                    values = values if isinstance(values, list) else [values]
                    for value in values:
                        additional_metadata.append(
                            ("daisy-pipeline-argument", "nlbprod",
                             "http://www.nlb.no/production", argument, value))

                transfer_metadata_from_html_to_pef(html_file, pef_file,
                                                   additional_metadata)

        except Exception:
            # Metadata transfer is best-effort: failure is reported but does
            # not abort the conversion.
            self.utils.report.warning(traceback.format_exc(),
                                      preformatted=True)
            self.utils.report.error(
                "An error occured while trying to insert metadata about the conversion"
            )

        # Archive the converted book.
        self.utils.report.info("Kopierer til PEF-arkiv.")
        archived_path, stored = self.utils.filesystem.storeBook(
            pef_tempdir_object.name, identifier)
        self.utils.report.attachment(None, archived_path, "DEBUG")

        self.utils.report.title = self.title + ": " + identifier + " ble konvertert 👍😄" + bookTitle
        return True
    def on_book(self) -> bool:
        """Validate a DAISY 2.02 audio book and, if valid, store it in the archive.

        Checks, in order: Pipeline 1 availability, numeric folder name, that the
        book is not already in the output directory, presence and UTF-8 encoding
        of ncc.html, that dc:identifier matches the folder name, metadata from
        the NLB API, identifier length (12 digits for newspapers/magazines,
        otherwise 6), total size with multi-volume handling, required ncc
        metadata fields, heading structure, and finally a Pipeline 1 validation.
        Valid books are copied to the output share(s), a `.donedaisy` marker is
        written, and periodicals are announced to Bibliofil.

        Returns:
            bool: True when the book validated and was stored, False otherwise.
        """
        self.utils.report.info("Validerer Daisy 2.02 lydbok")

        # Make sure Pipeline 1 is available before doing anything else.
        if self.dp1_home == "" or self.validator_script == "":
            if not self.init_environment():
                self.utils.report.error(
                    "Pipeline1 ble ikke funnet. Avbryter..")
                return False

        # Non-numeric folder names are assumed to be multi-volume parts and
        # are skipped without emailing anyone.
        folder = self.book["name"]
        if self.book["name"].isnumeric() is False:
            self.utils.report.warn(
                f"{folder} er ikke et tall, prosesserer ikke denne boka. Mulig det er en multivolum bok."
            )
            self.utils.report.should_email = False
            return False

        if os.path.isdir(os.path.join(self.dir_out, folder)):
            self.utils.report.error(
                f"{folder} finnes allerede på share, avbryter.")
            return False

        # Resolve the optional samba output directory ("" disables it).
        if self.nlbsamba_out == "":
            self.nlbsamba_out = Config.get("nlbsamba.dir")
        if self.nlbsamba_out is None:
            self.nlbsamba_out = ""

        # Work on a temporary copy so the source is left untouched.
        temp_obj = tempfile.TemporaryDirectory()
        temp_dir = temp_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_dir)

        if not os.path.isfile(os.path.join(temp_dir, "ncc.html")):
            self.utils.report.error("Finner ikke ncc fila")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎. Er dette en daisy 2.02 lydbok med en ncc.html fil?"
            return False
        try:
            ncc_tree = ElementTree.parse(os.path.join(temp_dir, "ncc.html"))
            ncc_encoding = ncc_tree.docinfo.encoding.lower()
            nccdoc = ncc_tree.getroot()

        except Exception:
            self.utils.report.info(
                "Klarte ikke lese ncc fila. Sjekk loggen for detaljer.")
            self.utils.report.debug(traceback.format_exc(), preformatted=True)
            return False

        edition_identifier = ""
        audio_title = ""
        audio_title = nccdoc.xpath("string(//*[@name='dc:title']/@content)")
        edition_identifier = nccdoc.xpath(
            "string(//*[@name='dc:identifier']/@content)")

        if ncc_encoding != 'utf-8':
            # Bugfix: the f-string prefix was inside the string literal
            # ("(f{ncc_encoding})"), so the detected encoding was never shown.
            self.utils.report.error(
                self.book["name"] +
                f": Encodingen til filen er ikke utf-8, ({ncc_encoding}) avbryter."
            )
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        # dc:identifier must be present and equal to the folder name.
        str_edition_identifier = str(edition_identifier)
        str_book_name = str(self.book["name"])
        if edition_identifier == (
                "") or str_edition_identifier != str_book_name:
            self.utils.report.error(
                self.book["name"] +
                f": Klarte ikke å bestemme boknummer basert på dc:identifier. dc:identifier: {str_edition_identifier} mappenavn: {str_book_name}"
            )
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        self.utils.report.info("Henter metadata fra api.nlb.no")
        creative_work_metadata = None
        edition_metadata = None

        # Retry the metadata API up to 5 times before giving up.
        timeout = 0
        while creative_work_metadata is None and timeout < 5:

            timeout = timeout + 1
            creative_work_metadata = Metadata.get_creative_work_from_api(
                edition_identifier,
                editions_metadata="all",
                use_cache_if_possible=True,
                creative_work_metadata="all")
            edition_metadata = Metadata.get_edition_from_api(
                edition_identifier)
            if creative_work_metadata is not None:
                break

        if creative_work_metadata is None:
            self.utils.report.warning(
                "Klarte ikke finne et åndsverk tilknyttet denne utgaven. Prøver igjen senere."
            )
            return False

        library = edition_metadata["library"].lower()

        # in case of wrong upper lower cases
        if library == "nlb":
            library = "NLB"
        elif library == "statped":
            library = "Statped"
        elif library == "kabb":
            library = "KABB"

        # Newspapers/magazines use 12-digit edition numbers, books use 6.
        periodical = False
        if creative_work_metadata[
                "newspaper"] is True or creative_work_metadata[
                    "magazine"] is True:
            periodical = True
            if len(edition_identifier) != 12:
                self.utils.report.error(
                    f"Boka {edition_identifier} er en avis eller et magasin, men utgavenummeret har ikke 12 siffer"
                )
                return False
        else:
            if len(edition_identifier) != 6:
                self.utils.report.error(
                    f"Boka {edition_identifier} har ikke 6 siffer")
                return False

        # Books larger than max_size are expected to be split into volumes.
        root_directory = Path(temp_dir)
        max_size = 702545920 - 20971520
        size = sum(f.stat().st_size for f in root_directory.glob('**/*')
                   if f.is_file())
        multi_volume = False
        if size >= max_size:
            self.utils.report.info(
                f"{edition_identifier} er på størrelse {size}, sjekker om det er en multivolum bok."
            )
            multi_volume = True
        else:
            self.utils.report.info(
                f"{edition_identifier} er på størrelse {size} bytes")

        # Find and validate sibling volume folders named "<book>_<digit>".
        multi_volume_dirs = []
        if multi_volume:
            files_dir = os.listdir(self.dir_in)

            for file in files_dir:
                if file.startswith(self.book["name"]) and file[-1].isdigit(
                ) and file[-2] == "_":
                    self.utils.report.info(
                        f"{file} er en del av multi volum boka {edition_identifier}"
                    )
                    multi_volume_dirs.append(file)
                    multi_volume_directory = Path(
                        os.path.join(self.dir_in, file))
                    # Bugfix: was "multi_volume_size = size = sum(...)",
                    # which needlessly clobbered the total size computed above.
                    multi_volume_size = sum(
                        f.stat().st_size
                        for f in multi_volume_directory.glob('**/*')
                        if f.is_file())
                    if multi_volume_size >= max_size:
                        self.utils.report.info(
                            f" Multi volum mappen {file} er på størrelse {multi_volume_size}, dette er for stort"
                        )
                        self.utils.report.title = self.title + ": " + self.book[
                            "name"] + " Lydbok feilet 😭👎"
                        return False
                    else:
                        multi_volume_files = os.listdir(multi_volume_directory)
                        self.utils.report.info(
                            f"Validerer filer til multi volum {file}...")
                        if self.check_files(edition_identifier,
                                            multi_volume_files, library,
                                            multi_volume_directory,
                                            multi_volume) is False:
                            return False

            if len(multi_volume_dirs) <= 0:
                self.utils.report.error(
                    f"{edition_identifier} bør være en multivolum bok, men har ikke flere multivolum mapper. Avbryter."
                )
                self.utils.report.title = self.title + ": " + self.book[
                    "name"] + "Lydbok feilet 😭👎"
                return False

        files_book = os.listdir(temp_dir)

        # Non-Statped books get default.css truncated to an empty file here;
        # a proper CSS is inserted further down.
        if "default.css" in files_book and library != "Statped":
            self.utils.report.info("Erstatter default.css med en tom fil")
            open(os.path.join(temp_dir, "default.css"), 'w').close()

        self.utils.report.info("Validerer filer...")
        if self.check_files(edition_identifier, files_book, library, temp_dir,
                            False) is False:
            return False

        # Required ncc metadata: dc:creator, ncc:narrator and a valid
        # ncc:multimediaType.
        dc_creator = nccdoc.xpath("string(//*[@name='dc:creator']/@content)")
        if not len(dc_creator) >= 1:
            self.utils.report.error(
                f"{edition_identifier} finner ikke dc:creator, dette må boka ha"
            )
            return False

        dc_narrator = nccdoc.xpath(
            "string(//*[@name='ncc:narrator']/@content)")
        if not len(dc_narrator) >= 1:
            self.utils.report.error(
                f"{edition_identifier} finner ikke ncc:narrator, dette må boka ha"
            )
            return False

        multimedia_types = [
            "audioOnly", "audioNcc", "audioPartText", "audioFullText",
            "textPartAudio", "textNcc"
        ]
        ncc_multimedia_type = nccdoc.xpath(
            "string(//*[@name='ncc:multimediaType']/@content)")
        if ncc_multimedia_type not in multimedia_types:
            self.utils.report.error(
                f"{edition_identifier} har ikke en valid ncc:multimediaType, dette må boka ha. Multimediatype er {ncc_multimedia_type}"
            )
            return False

        # Heading structure: first h1 must have class "title"; the second h1
        # must be one of the accepted agreement headings (NLB books only).
        first_head_class = nccdoc.xpath(
            "string(//*[local-name()='h1'][1]/@class)")
        second_head = nccdoc.xpath("string(//*[local-name()='h1'][2])").lower()

        accepted_second_head = [
            "lydbokavtalen", "audiobook agreement", "the audiobook agreement",
            "tigar announcement", "nlb"
        ]

        if first_head_class != "title":
            self.utils.report.error(
                f"{edition_identifier} første heading {first_head_class} er ikke title"
            )
            return False

        if second_head not in accepted_second_head and library == "NLB" and creative_work_metadata[
                "newspaper"] is False and not (
                    creative_work_metadata["magazine"] is True
                    and library == "KABB"):
            self.utils.report.error(
                f"{edition_identifier} andre heading {second_head} er ikke Lydbokavtalen, Audiobook agreement, eller Tigar announcement"
            )
            return False

        # Run the Pipeline 1 validator (skipped for Statped books).
        if library != "Statped":
            status = self.validate_book(os.path.join(temp_dir, "ncc.html"))
            if status == "ERROR" or status is False:
                self.utils.report.error(
                    "Pipeline validator: Boka er ikke valid. Se rapport.")
                return False
            self.utils.report.info("Pipeline validator: Boka er valid")

        # Store the extra volumes and remove them from the input directory.
        if multi_volume:
            for folder in multi_volume_dirs:
                self.utils.report.debug(f"Flytter multivolum fil {folder}")
                archived_path_multi, stored = self.utils.filesystem.storeBook(
                    os.path.join(self.dir_in, folder), folder)
                self.utils.report.attachment(None, archived_path_multi,
                                             "DEBUG")
                if self.nlbsamba_out != "":
                    archived_path_samba_multi, stored_samba_multi = self.utils.filesystem.storeBook(
                        os.path.join(self.dir_in, folder),
                        folder,
                        dir_out=self.nlbsamba_out)
                    self.utils.report.attachment(None,
                                                 archived_path_samba_multi,
                                                 "DEBUG")
                shutil.rmtree(os.path.join(self.dir_in, folder))

        # Pick and insert the correct CSS for the library / book type.
        if library == "Statped":
            css_format = "Statped"
        elif edition_metadata["includesText"] is True:
            css_format = "daisy202"
        else:
            css_format = "daisy202-ncc"
        self.utils.report.info(f"Inserting CSS: {css_format}")
        if library != "Statped":
            self.utils.filesystem.insert_css(
                os.path.join(temp_dir, "default.css"), library, css_format)

        # Store the book in the output directory (and optionally on samba).
        files_temp = os.listdir(temp_dir)
        archived_path, stored = self.utils.filesystem.storeBook(
            temp_dir, edition_identifier)
        if self.nlbsamba_out != "":
            archived_path_samba, stored_samba = self.utils.filesystem.storeBook(
                temp_dir, edition_identifier, dir_out=self.nlbsamba_out)
            self.utils.report.attachment(None, archived_path_samba, "DEBUG")

        # Verify that all files arrived, then drop a .donedaisy marker.
        files_out = os.listdir(os.path.join(self.dir_out, edition_identifier))
        if self.nlbsamba_out != "":
            if len(files_temp) == len(
                    os.listdir(
                        os.path.join(self.nlbsamba_out, edition_identifier))):
                with open(
                        os.path.join(self.nlbsamba_out, edition_identifier,
                                     '.donedaisy'), 'w') as file:
                    self.utils.report.debug(".donedaisy created")
            else:
                self.utils.report.error(
                    f"MANGLER FILER i {self.nlbsamba_out}, sjekk utmappa")
                return False
        if len(files_temp) == len(files_out):
            with open(
                    os.path.join(self.dir_out, edition_identifier,
                                 '.donedaisy'), 'w') as file:
                self.utils.report.debug(".donedaisy created")
        else:
            self.utils.report.error(
                f"MANGLER FILER i {self.dir_out}, sjekk utmappa")
            return False

        self.utils.report.info("Boka er godkjent og overført")

        # Announce availability of periodicals to Bibliofil (newspapers are
        # announced without a title).
        if periodical:
            available_title = ""
            if creative_work_metadata["newspaper"] is False:
                available_title = audio_title
            Bibliofil.book_available("DAISY 2.02",
                                     edition_identifier,
                                     title=available_title)

        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + edition_identifier + " er valid 👍😄" + audio_title
        self.utils.filesystem.deleteSource()
        return True
예제 #29
0
    def email(self, recipients, subject=None, should_email=True, should_message_slack=True, should_attach_log=True, should_escape_chars=True):
        """Send this report as an e-mail and/or a Slack message.

        Builds a Markdown/HTML summary of the collected messages and
        attachments, optionally sends it via the configured SMTP server,
        stores a copy of the rendered HTML on disk, and posts the attachment
        links to Slack.

        Args:
            recipients: a str, list or tuple of recipient e-mail addresses.
                NOTE(review): when a list is passed it is appended to in
                place (administrators are added on ERROR status), mutating
                the caller's list — confirm this is intended.
            subject: e-mail subject; defaults to self.title, falling back to
                the pipeline title.
            should_email: when False, skip sending the e-mail (log only).
            should_message_slack: when False, skip the Slack notification.
            should_attach_log: when True, attach the full log and store the
                rendered HTML as email.html in the report directory;
                otherwise store it under the daily-report folder for
                yesterday's date.
            should_escape_chars: when True, HTML-escape message text and
                render the result as Markdown; when False, the text is used
                verbatim as HTML.
        """
        # Fall back to the report/pipeline title when no subject is given.
        if not subject:
            assert isinstance(self.title, str) or self.pipeline is not None, "either title or pipeline must be specified when subject is missing"
            subject = self.title if self.title else self.pipeline.title

        # SMTP connection settings; any of these may be None if unconfigured.
        smtp = {
            "host": Config.get("email.smtp.host", None),
            "port": Config.get("email.smtp.port", None),
            "user": Config.get("email.smtp.user", None),
            "pass": Config.get("email.smtp.pass", None)
        }
        sender = Address(Config.get("email.sender.name", "undefined"), addr_spec=Config.get("email.sender.address", "*****@*****.**"))

        # 0. Create attachment with complete log (including DEBUG statements)
        if should_attach_log is True:
            self.attachLog()

        # Resolve each attachment path to its network forms (smb/file/UNC),
        # de-duplicating by UNC path; titles are shown relative to the base
        # path (or the report directory, for files inside it).
        attachments = []
        for m in self._messages["attachment"]:
            smb, file, unc = Filesystem.networkpath(m["text"])
            base_path = Filesystem.get_base_path(m["text"], self.pipeline.dir_base)
            relpath = os.path.relpath(m["text"], base_path) if base_path else None
            if m["text"].startswith(self.reportDir()):
                relpath = os.path.relpath(m["text"], self.reportDir())
            if not [a for a in attachments if a["unc"] == unc]:
                attachments.append({
                    "title": "{}{}".format(relpath, ("/" if os.path.isdir(m["text"]) else "")),
                    "smb": smb,
                    "file": file,
                    "unc": unc,
                    "severity": m["severity"]
                })

        # Determine overall status
        # Severity precedence: ERROR > WARN > SUCCESS > INFO.
        status = "INFO"
        for message_type in self._messages:
            for m in self._messages[message_type]:

                if m["severity"] == "SUCCESS" and status in ["INFO"]:
                    status = "SUCCESS"
                elif m["severity"] == "WARN" and status in ["INFO", "SUCCESS"]:
                    status = "WARN"
                elif m["severity"] == "ERROR":
                    status = "ERROR"

        # An AssertionError anywhere below aborts only the e-mail part;
        # the Slack notification after the try/except still runs.
        try:
            assert isinstance(smtp, dict), "smtp must be a dict"
            assert isinstance(sender, Address), "sender must be a Address"
            assert isinstance(recipients, str) or isinstance(recipients, list) or isinstance(recipients, tuple), "recipients must be a str, list or tuple"
            assert isinstance(self.title, str) or self.pipeline and isinstance(self.pipeline.title, str), "title or pipeline.title must be a str"

            # Normalize recipients to a list.
            if isinstance(recipients, str):
                recipients = [recipients]
            elif isinstance(recipients, tuple):
                recipients = list(recipients)

            # On errors, always notify the configured administrators as well.
            if status == "ERROR":
                for key in Config.get("administrators", default=[]):
                    if key not in recipients:
                        recipients.append(key)

            # when testing, only allow e-mail addresses defined in the ALLOWED_EMAIL_ADDRESSES_IN_TEST env var
            if Config.get("test"):
                subject = "[test] " + subject
                filtered_recipients = []
                for recipient in recipients:
                    # NOTE(review): assumes "email.allowed_email_addresses_in_test"
                    # is configured (non-None container) whenever "test" is set;
                    # otherwise `in` raises TypeError — confirm.
                    if recipient in Config.get("email.allowed_email_addresses_in_test"):
                        filtered_recipients.append(recipient)
                recipients = filtered_recipients

            # 1. join lines with severity SUCCESS/INFO/WARN/ERROR
            markdown_text = []
            for m in self._messages["message"]:
                if should_escape_chars:
                    text = m['text'].replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
                else:
                    text = m['text']
                # Preformatted lines are wrapped in <pre>; DEBUG messages are
                # excluded from the body (they are only in the attached log).
                if m['preformatted'] is True:
                    markdown_text.append("<pre>{}</pre>".format(text))
                elif m['severity'] != 'DEBUG':
                    markdown_text.append(text)
            if attachments != [] or should_attach_log is True:
                markdown_text.append("\n----\n")
                markdown_text.append("\n# Lenker\n")
                markdown_text.append("\n<ul style=\"list-style: none;\">")

                # Pick icon and style for INFO-attachments
                attachment_styles = {
                    "DEBUG": {
                        "icon": "🗎",
                        "style": ""
                    },
                    "INFO": {
                        "icon": "🛈",
                        "style": ""
                    },
                    "SUCCESS": {
                        "icon": "😄",
                        "style": "background-color: #bfffbf;"
                    },
                    "WARN": {
                        "icon": "😟",
                        "style": "background-color: #ffffbf;"
                    },
                    "ERROR": {
                        "icon": "😭",
                        "style": "background-color: #ffbfbf;"
                    }
                }

                for attachment in attachments:
                    # UNC links seems to be preserved when viewed in Outlook.
                    # file: and smb: URIs are disallowed or removed.
                    # So these links will only work in Windows.
                    # If we need this to work cross-platform, we would have
                    # to map the network share paths to a web server so that
                    # the transfers go through http:. This could maybe be mapped
                    # using environment variables.
                    li = "<li>"
                    li += "<span style=\"vertical-align: middle; font-size: 200%;\">" + attachment_styles[attachment["severity"]]["icon"] + "</span> "
                    li += "<span style=\"vertical-align: middle; " + attachment_styles[attachment["severity"]]["style"] + "\">"
                    li += "<a href=\"file:///" + attachment["unc"] + "\">" + attachment["title"] + "</a> "
                    # NOTE(review): self.img_string presumably holds an opening
                    # '<img src="…' fragment that the '="' below completes —
                    # confirm against where img_string is defined.
                    li += "<a href=\"" + attachment["smb"] + "\">" + self.img_string + "=\" alt=\"" + attachment["smb"] + "\"/>" + "</a> "
                    li += "</span>"
                    li += "</li>"
                    markdown_text.append(li)
                markdown_text.append("</ul>\n")
                # Footer with pipeline uid, labels, publication format and status.
                label_string = ""
                for label in self.pipeline.labels:
                    label_string += "[{}] ".format(label)
                markdown_text.append("\n[{}] {} [{}] [status:{}]".format(self.pipeline.uid, label_string, self.pipeline.publication_format, status))
            markdown_text = "\n".join(markdown_text)

            # 2. parse string as Markdown and render as HTML
            if should_escape_chars:
                markdown_html = markdown.markdown(markdown_text, extensions=['markdown.extensions.fenced_code', 'markdown.extensions.codehilite'])
            else:
                markdown_html = markdown_text
            markdown_html = '''<!DOCTYPE html>
<html>
<head>
<meta charset=\"utf-8\"/>
<title>''' + subject.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") + '''</title>
</head>
<body>
''' + markdown_html + '''
</body>
</html>
'''

            if not should_email:
                logging.info("[e-mail] Not sending email")
            else:
                # 3. build e-mail
                # Collapse all whitespace in the subject to plain spaces
                # (header values must not contain newlines).
                msg = EmailMessage()
                msg['Subject'] = re.sub(r"\s", " ", subject).strip()
                msg['From'] = sender
                msg['To'] = Report.emailStringsToAddresses(recipients)
                msg.set_content(markdown_text)
                msg.add_alternative(markdown_html, subtype="html")
                logging.info("[e-mail] E-mail with subject '{}' will be sent to: {}".format(msg['Subject'], ", ".join(recipients)))

                # 4. send e-mail
                if smtp["host"] and smtp["port"]:
                    smtp_server = "{}:{}".format(smtp["host"], smtp["port"])
                    logging.info("[e-mail] SMTP server: {}".format(smtp_server))
                    with smtplib.SMTP(smtp_server) as s:
                        s.ehlo()
                        # s.starttls()
                        if smtp["user"] and smtp["pass"]:
                            s.login(smtp["user"], smtp["pass"])
                        else:
                            logging.debug("[e-mail] user/pass not configured")
                        logging.debug("[e-mail] sending…")
                        s.send_message(msg)
                        logging.debug("[e-mail] sending complete.")
                else:
                    logging.warning("[e-mail] host/port not configured")

                # Persist the rendered markdown/HTML. NOTE(review): reopening
                # a NamedTemporaryFile by name works on POSIX but not on
                # Windows — confirm the deployment target.
                temp_md_obj = tempfile.NamedTemporaryFile(suffix=".md")
                temp_html_obj = tempfile.NamedTemporaryFile(suffix=".html")
                with open(temp_md_obj.name, "w") as f:
                    f.write(markdown_text)
                    logging.debug("[e-mail] markdown: {}".format(temp_md_obj.name))
                with open(temp_html_obj.name, "w") as f:
                    f.write(markdown_html)
                    logging.debug("[e-mail] html: {}".format(temp_html_obj.name))
                # Store a permanent copy of the HTML: either alongside the
                # report, or in yesterday's daily-report folder.
                if should_attach_log is True:
                    path_mail = os.path.join(self.reportDir(), "email.html")
                    shutil.copy(temp_html_obj.name, path_mail)
                    self.mailpath = Filesystem.networkpath(path_mail)
                else:
                    yesterday = datetime.now() - timedelta(1)
                    yesterday = str(yesterday.strftime("%Y-%m-%d"))
                    path_mail = os.path.join(self.pipeline.dir_reports, "logs", "dagsrapporter", yesterday, self.pipeline.uid + ".html")
                    shutil.copy(temp_html_obj.name, path_mail)
                    self.mailpath = Filesystem.networkpath(path_mail)

        except AssertionError as e:
            logging.error("[e-mail] " + str(e))
        if not should_message_slack:
            logging.warning("Not sending message to slack")
        else:
            # 5. send message to Slack
            # Attachments are color-coded by severity (Slack's good/warning/danger).
            slack_attachments = []
            for attachment in attachments:
                color = None
                if attachment["severity"] == "SUCCESS":
                    color = "good"
                elif attachment["severity"] == "WARN":
                    color = "warning"
                elif attachment["severity"] == "ERROR":
                    color = "danger"
                slack_attachments.append({
                    "title_link": attachment["smb"],
                    "title": attachment["title"],
                    "fallback": attachment["title"],
                    "color": color
                })
            Slack.slack(text=subject, attachments=slack_attachments)
예제 #30
0
    def on_book(self):
        """Convert the current EPUB book to a DOCX file.

        Validates the source EPUB, copies it to a temp dir, rewrites
        ASCIIMath to Norwegian braille notation via XSLT, converts the HTML
        to DOCX with Calibre's ebook-convert, then post-processes the DOCX
        with python-docx (paragraph styles, indentation, heading cleanup)
        and patches word/numbering.xml inside the zip. Stores the result via
        self.utils.filesystem.storeBook.

        Returns:
            bool: True on success, False on any failure (the failure reason
            is reported via self.utils.report).
        """
        self.utils.report.attachment(None, self.book["source"], "DEBUG")
        epub = Epub(self.utils.report, self.book["source"])

        # Best-effort title for report messages; missing dc:title is fine.
        epubTitle = ""
        try:
            epubTitle = " (" + epub.meta("dc:title") + ") "
        except Exception:
            pass

        # check that this is actually an EPUB
        if not epub.isepub():
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        # The identifier (book number) is required to name the output files.
        if not epub.identifier():
            self.utils.report.error(
                self.book["name"] +
                ": Klarte ikke å bestemme boknummer basert på dc:identifier.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎"
            return False

        # language must be extracted from the EPUB, or else the DOCX default
        # language (nb) will be used in the converted file
        language = ""
        try:
            #language = " (" + epub.meta("dc:language") + ") "
            language = epub.meta("dc:language")

        except Exception:
            pass

        # ---------- make a copy of the EPUB ----------

        temp_epubdir_obj = tempfile.TemporaryDirectory()
        temp_epubdir = temp_epubdir_obj.name
        Filesystem.copy(self.utils.report, self.book["source"], temp_epubdir)
        # NOTE(review): other Epub() calls in this file pass a report object
        # as the first argument (see Epub(self.utils.report, ...) above);
        # here `self` is passed instead — confirm this is intended.
        temp_epub = Epub(self, temp_epubdir)

        # Locate the OPF (package document) inside the copied EPUB.
        opf_path = temp_epub.opf_path()
        if not opf_path:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne OPF-fila i EPUBen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        opf_path = os.path.join(temp_epubdir, opf_path)
        opf_xml = ElementTree.parse(opf_path).getroot()

        # The content document is the manifest item referenced by the first
        # spine itemref (single-HTML EPUBs are assumed here).
        html_file = opf_xml.xpath(
            "/*/*[local-name()='manifest']/*[@id = /*/*[local-name()='spine']/*[1]/@idref]/@href"
        )
        html_file = html_file[0] if html_file else None
        if not html_file:
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila i OPFen.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False
        html_file = os.path.join(os.path.dirname(opf_path), html_file)
        if not os.path.isfile(html_file):
            self.utils.report.error(self.book["name"] +
                                    ": Klarte ikke å finne HTML-fila.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False

        temp_xml_file_obj = tempfile.NamedTemporaryFile()
        temp_xml_file = temp_xml_file_obj.name

        # Rewrite ASCIIMath expressions to Norwegian braille notation,
        # replacing the HTML file in place afterwards.
        self.utils.report.info(
            "Konverterer fra ASCIIMath til norsk punktnotasjon…")
        xslt = Xslt(self,
                    stylesheet=os.path.join(Xslt.xslt_dir, NLBpubToDocx.uid,
                                            "nordic-asciimath-epub.xsl"),
                    source=html_file,
                    target=temp_xml_file)
        if not xslt.success:
            return False
        shutil.copy(temp_xml_file, html_file)

        # ---------- convert the HTML file to DOCX ----------

        temp_docxdir_obj = tempfile.TemporaryDirectory()
        temp_docxdir = temp_docxdir_obj.name

        try:
            self.utils.report.info("Konverterer fra XHTML til DOCX...")
            # Calibre conversion; the intermediate file is suffixed
            # "_calibre.docx" and post-processed below into the final DOCX.
            process = self.utils.filesystem.run([
                "/usr/bin/ebook-convert",
                html_file,
                os.path.join(temp_docxdir,
                             epub.identifier() + "_calibre.docx"),
                "--chapter=/",
                "--chapter-mark=none",
                "--page-breaks-before=/",
                "--no-chapters-in-toc",
                "--toc-threshold=0",
                "--docx-page-size=a4",
                # "--linearize-tables",
                "--extra-css=" +
                os.path.join(Xslt.xslt_dir, self.uid, 'extra.css'),

                # NOTE: microsoft fonts must be installed:
                # sudo apt-get install ttf-mscorefonts-installer
                "--embed-font-family=Verdana",
                "--docx-page-margin-top=42",
                "--docx-page-margin-bottom=42",
                "--docx-page-margin-left=70",
                "--docx-page-margin-right=56",
                #"--language="+epub.meta('dc:language'),
                ("--language=" + language) if language else "",
                "--base-font-size=13",
                #"--remove-paragraph-spacing",
                #"--remove-paragraph-spacing-indent-size=-1",
                "--font-size-mapping=13,13,13,13,13,13,13,13"
            ])

            if process.returncode == 0:
                self.utils.report.info("Boken ble konvertert.")

                # -------------  script from kvile ---------------
                document = Document(
                    os.path.join(temp_docxdir,
                                 epub.identifier() + "_calibre.docx"))
                emptyParagraph = False
                normalParagraph = "Normal"
                normalParagraphNoIndent = "NormalNoIndent"
                headingIndent = Cm(1.25)
                fontSize = Pt(13)
                # new code 2021-01-20
                #folder = os.path.join(temp_docxdir)

                folder = Path(temp_docxdir)

                # end new code

                #self.utils.report.info("Folder: "+folder)

                # Zip the contents of src into zip_name (paths stored
                # relative to src). NOTE(review): os.chdir changes the CWD
                # for the whole process and is never restored — confirm no
                # concurrent code depends on the working directory.
                def zipdir(src, dst, zip_name):
                    os.chdir(dst)
                    ziph = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
                    for root, dirs, files in os.walk(src):
                        for file in files:
                            ziph.write(os.path.join(root, file),
                                       arcname=os.path.join(
                                           root.replace(src, ""), file))
                    ziph.close()

                # Overwrite the file at folder/dst with txt.
                def writeFile(txt, dst):
                    tempFile = open(folder / dst, "w+")
                    tempFile.write(txt)
                    tempFile.close()

                # Detach a python-docx paragraph from its parent element.
                def delete_paragraph(paragraph):
                    # self.utils.report.info("Delete paragraph: ")
                    p = paragraph._element
                    p.getparent().remove(p)
                    p._p = p._element = None

                # Detach a raw lxml element from its parent.
                def delete_element(element):
                    element.getparent().remove(element)
                    element._element = None

                # Configure the Normal style and add a no-indent variant
                # based on it.
                indent = Cm(0.44)
                hangingIndentList = Cm(0.63)
                document.styles[normalParagraph].font.size = fontSize
                document.styles[
                    normalParagraph].paragraph_format.first_line_indent = indent
                styleNoIndent = document.styles.add_style(
                    'NormalNoIndent', WD_STYLE_TYPE.PARAGRAPH)
                styleNoIndent.base_style = document.styles[normalParagraph]
                document.styles[
                    normalParagraphNoIndent].paragraph_format.first_line_indent = Cm(
                        0)

                # set style to normal for regular paragraphs, set keep_with_next to false, remove multiple empty paragraphs, and remove empty p after page nr or heading
                for paragraph in document.paragraphs:
                    # deleting empty text-elements
                    emptyTextElementList = document.element.xpath(
                        "//w:t[. = '']")
                    for emptyTextElement in emptyTextElementList:
                        delete_element(emptyTextElement)
                    paragraph.paragraph_format.keep_with_next = None
                    if re.match("Para 0[1-9]|[0-9] Block|Para [0-9]",
                                paragraph.style.name
                                ) and paragraph.style.font.underline != True:
                        paragraph.style = normalParagraph
                    if len(paragraph.text) <= 1 or re.match(
                            r"^--- \d+ til ", paragraph.text
                    ) or paragraph.style.name[
                            0:
                            7] == "Heading":  # if empty p or page nr or heading
                        paragraph.text = re.sub(
                            r"^\s(.*)", r"\1",
                            paragraph.text)  # remove space at beginning of p
                        # self.utils.report.info("Paragraph.text <= 1 ")
                        if len(
                                paragraph.text
                        ) == 0 and emptyParagraph:  #if last p also was empty or page nr
                            #        self.utils.report.info("Paragraph.text == 0 ")
                            delete_paragraph(paragraph)
                        emptyParagraph = True
                    else:
                        emptyParagraph = False
                        if re.match(r"^\s*STATPED_DUMMYTEXT_LI_OL\s*$",
                                    paragraph.text):
                            paragraph.text = ""
                # no indent after Heading, page-nr, or paragraphs starting with "Bilde: ", paragraphs in only bold (text=^_[^_]*_$) and the paragraph after p in only bold, or on empty p.
                removeIndent = False
                for paragraph in document.paragraphs:
                    #remove space at beginning of line after <br/>
                    spaceAfterBreakList = paragraph._element.xpath(
                        r'w:r/w:br[@w:clear="none"]/following::w:t[@xml:space="preserve"][1]'
                    )
                    if len(spaceAfterBreakList) > 0:
                        for spaceAfterBreakElement in spaceAfterBreakList:
                            if re.match(
                                    '^ ', spaceAfterBreakElement.text
                            ) and not (spaceAfterBreakElement.xpath(
                                    r'preceding-sibling::*[1][self::w:t]')):
                                spaceAfterBreakElement.text = re.sub(
                                    r"^ ", r"", spaceAfterBreakElement.text)
                    #remove break before paragraph end
                    # NOTE(review): this block is indented under the
                    # `if len(spaceAfterBreakList) > 0` branch, so trailing
                    # breaks are only removed in paragraphs that also had a
                    # space-after-break — confirm whether that is intended.
                        breakBeforeParagraphEndList = paragraph._element.xpath(
                            r'w:r[last()]/w:br[@w:clear="none" and not(following-sibling::*)]'
                        )
                        if len(breakBeforeParagraphEndList) > 0:
                            delete_element(breakBeforeParagraphEndList[0])

                    t = paragraph.text.strip()
                    if re.match(
                            r"^Bilde: |^Forklaring: |^--- \d+ til |^_[^_]*_$|^STATPED_DUMMYTEXT_LIST_UNSTYLED|^STATPED_DUMMYTEXT_P_BEFORE_DL",
                            t) or ((removeIndent or len(t) == 0)
                                   and paragraph.style.name == "Normal"):
                        paragraph.style = normalParagraphNoIndent
                    # Remove dummy-text and set hanging indent
                    if re.match(
                            r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL)",
                            paragraph.text):
                        paragraph.paragraph_format.left_indent = hangingIndentList  #Pt(0)
                        paragraph.paragraph_format.first_line_indent = -hangingIndentList  #Pt(-20)
                    if re.match(r"^STATPED_DUMMYTEXT", paragraph.text):
                        paragraph.text = re.sub(
                            r"^(STATPED_DUMMYTEXT_LIST_UNSTYLED|STATPED_DUMMYTEXT_DL|STATPED_DUMMYTEXT_P_BEFORE_DL)",
                            "", paragraph.text)
                    if len(t) == 0 or paragraph.style.name[
                            0:7] == "Heading" or re.match(
                                r"^--- \d+ til |^_[^_]*_$", t):
                        removeIndent = True
                    else:
                        removeIndent = False

                # remove bold from Headings.
                paraStylesWithoutBoldOrUnderline = [
                ]  #list of all para-styles without underline or bold
                paraStylesWithoutUnderline = [
                ]  #list of all para-styles without underline
                for style in document.styles:
                    if style.name[0:7] == "Heading":
                        style.font.bold = None
                        style.paragraph_format.left_indent = headingIndent  #Pt(0)
                        style.paragraph_format.first_line_indent = -headingIndent  #Pt(-20)
                        style.paragraph_format.space_before = Pt(0)
                        style.paragraph_format.space_after = Pt(0)
                        style_element = style._element
                        spacing = style_element.xpath(r'w:pPr/w:spacing')[0]
                        spacing.set(qn('w:beforeLines'), "0")
                        spacing.set(qn('w:afterLines'), "0")
                    if style.name[0:5] == "Para ":
                        if style.font.underline != True:
                            paraStylesWithoutUnderline.append(style.name)
                            if style.font.bold != True:
                                paraStylesWithoutBoldOrUnderline.append(
                                    style.name)

                # find all para-styles with wanted properties in tables and change style
                paraStylesInTables = []
                #for paraStyleWithoutBoldOrUnderline in paraStylesWithoutBoldOrUnderline:
                for paraStyleWithoutUnderline in paraStylesWithoutUnderline:
                    for element in document.element.xpath(
                            "//w:tbl//w:p//w:pStyle[@w:val = '" +
                            paraStyleWithoutUnderline + "']"):
                        paraStylesInTables.append(element)
                for paraStyleInTables in paraStylesInTables:
                    paraStyleInTables.attrib[
                        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'] = normalParagraphNoIndent  # or normalParagraph

                # uncomment if you want to modify first p in a cell
                # firstParaStylesInTables = []
                # for paraStyleWithoutBoldOrUnderline in paraStylesWithoutBoldOrUnderline:
                #     for element in document.element.xpath("//w:tc//w:p[position()=1]//w:pStyle[@w:val = '" + normalParagraph + "']"):
                #         firstParaStylesInTables.append(element)
                # for paraStyleInTables in firstParaStylesInTables:
                #     paraStyleInTables.attrib['{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'] = normalParagraphNoIndent

                # tables missing required <w:tblGrid>, so throws: docx.oxml.exceptions.InvalidXmlError: required ``<w:tblGrid>`` child element not present
                #from docx.table import _Cell, Table
                #from docx.oxml.text.paragraph import CT_P
                # for row in table.columns:
                #     try:
                #         for cell in row.cells:
                #             firstP = True
                #             for p in cell.paragraphs:
                #                 if p.style.font.underline != True and re.match(r"^Para | Block",p.style.name):
                #                     if firstP:
                #                         p.style = "NormalNoIndent"
                #                         firstP = False
                #                     else:
                #                         p.style = "Normal"
                #     except Exception as e:
                #         pass

                document.save(
                    os.path.join(temp_docxdir,
                                 epub.identifier() + ".docx"))
                self.utils.report.info(
                    "Temp-fil ble lagret: " +
                    os.path.join(temp_docxdir,
                                 epub.identifier() + ".docx"))

                wordFile = os.path.join(temp_docxdir,
                                        epub.identifier() + ".docx")

                # Unpack the DOCX (a zip), patch word/numbering.xml list
                # indents/formats, and re-zip it over the final output path.
                # NOTE(review): wordFile is already absolute, so
                # `folder / wordFile` resolves to wordFile itself (pathlib
                # drops the left operand for absolute right operands).
                zipDocument = zipfile.ZipFile((folder / wordFile))
                tempFolder = "temp"
                zipDocument.extractall(folder / tempFolder)
                zipDocument.close()
                zippedFile = tempFolder + "/word/numbering.xml"
                # NOTE(review): this handle is opened 'r+' and never closed —
                # writeFile() below writes via a separate handle; confirm and
                # consider a `with` block.
                xmlFile = open((folder / zippedFile), 'r+')
                xmlText = xmlFile.read()
                xmlText = re.sub(r'w:left="1152"', r'w:left="360"', xmlText)
                xmlText = re.sub(r'w:left="1512"', r'w:left="720"', xmlText)
                xmlText = re.sub(r'w:left="1872"', r'w:left="1080"', xmlText)
                xmlText = re.sub(
                    r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%([1-9])\."/>',
                    r'<w:numFmt w:val="lowerLetter"/><w:lvlText w:val="%\1)"/>',
                    xmlText)  # a. as a) in lists
                #xmlText = re.sub(r'<w:lvlText w:val="%(1|2)\."/>', r'<w:lvlText w:val="%\1)"/>', xmlText) # a. as a), and 1. as 1) in lists

                writeFile(xmlText, zippedFile)
                zipdir(str(folder / tempFolder), str(folder),
                       os.path.join(temp_docxdir,
                                    epub.identifier() + ".docx"))


# ---------- end script from kvile -------

            else:
                self.utils.report.error(
                    "En feil oppstod ved konvertering til DOCX for " +
                    epub.identifier())
                self.utils.report.debug(traceback.format_stack())
                self.utils.report.title = self.title + ": " + self.book[
                    "name"] + " feilet 😭👎" + epubTitle
                return False

        except subprocess.TimeoutExpired:
            self.utils.report.error(
                "Det tok for lang tid å konvertere " + epub.identifier() +
                " til DOCX, og Calibre-prosessen ble derfor stoppet.")
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False

        except Exception:
            self.utils.report.error(
                "En feil oppstod ved konvertering til DOCX for " +
                epub.identifier())
            self.utils.report.info(traceback.format_exc(), preformatted=True)
            self.utils.report.title = self.title + ": " + self.book[
                "name"] + " feilet 😭👎" + epubTitle
            return False

        # Archive the finished DOCX and report success.
        archived_path, stored = self.utils.filesystem.storeBook(
            temp_docxdir, epub.identifier())
        self.utils.report.attachment(None, archived_path, "DEBUG")
        self.utils.report.title = self.title + ": " + epub.identifier(
        ) + " ble konvertert 👍😄" + epubTitle
        return True