Пример #1
0
 def render_title(self) -> None:
     """
     Render the title page of the book and write it out as title.xhtml

     The only value handed to the template is `title`, read from `self.title`.
     """
     logger.debug("Rendering title.xhtml...")
     title_path = join(OEBPS, "title.xhtml")
     self.render_and_write(title_path, title=self.title)
Пример #2
0
def is_rss(node: ElementTree.Element, title: str) -> bool:
    """
    Tell whether an XML element describes an RSS feed

    The element is accepted only when its `type` attribute equals "rss".
    Any other value (including a missing attribute) is reported at debug
    level, using `title` to identify the feed, and rejected.
    """
    rss_type = node.attrib.get("type")
    if rss_type == "rss":
        return True

    logger.debug(f"Unknown type for `{title}`: {rss_type}")
    return False
Пример #3
0
 def render_html_toc(self) -> None:
     """
     Build the HTML Table of Contents and write it out as toc.xhtml

     The template receives the generated `toc` markup and the book `title`.
     """
     logger.debug("Rendering toc.xhtml...")
     toc_html = self.generate_html_toc()
     toc_path = join(OEBPS, "toc.xhtml")
     self.render_and_write(toc_path, toc=toc_html, title=self.title)
Пример #4
0
 def render_ncx_toc(self) -> None:
     """
     Build the NCX Table of Contents and write it out as toc.ncx

     The template receives the book `title`, its `uuid` and the generated
     `navpoints` markup.
     """
     logger.debug("Rendering toc.ncx...")
     navpoints = self.generate_navpoints()
     self.render_and_write(
         join(OEBPS, "toc.ncx"),
         title=self.title,
         uuid=self.uuid,
         navpoints=navpoints,
     )
Пример #5
0
 def remove_container(self, container: Optional[Container] = None) -> None:
     """
     Stop and remove a docker container

     :param container: the container to get rid of; when it is omitted (or
         falsy) `self.container` is used instead. If that is falsy as well,
         nothing happens.
     """
     target = container or self.container
     if not target:
         return

     logger.debug("Stopping container...")
     target.stop()
     logger.debug("Removing container...")
     target.remove()
Пример #6
0
 def prepare_epub_dirs(self) -> None:
     """
     Create the basic folder skeleton of the EPUB under `self._dst_path`

     Three folders are created, in this order: META_INF, OEBPS/CONTENT and
     OEBPS/IMAGES.
     """
     logger.debug("Creating EPUB folders...")
     folders = (
         (META_INF,),
         (OEBPS, CONTENT),
         (OEBPS, IMAGES),
     )
     for parts in folders:
         makedirs(join(self._dst_path, *parts))
Пример #7
0
 def clean_existing_containers(self) -> None:
     """
     Remove any existing mercury parser API containers

     Every container reported by the docker client is inspected; the ones
     whose `Names` entry is exactly `["/<CONTAINER_NAME>"]` are handed to
     `remove_container`.
     """
     for container in self.client.containers.list(all=True, sparse=True):
         if container.attrs["Names"] != [f"/{CONTAINER_NAME}"]:
             continue
         logger.debug(
             "Found an existing container with the same name...")
         self.remove_container(container)
Пример #8
0
def get_title(node: ElementTree.Element) -> Optional[str]:
    """
    Extract the feed's title from the XML element

    The `title` attribute is used when present, otherwise `text` (the two are
    usually identical). A non-empty title is cleaned up with
    `strip_common_unicode_chars`; an empty or missing one is logged at debug
    level and returned untouched.
    """
    attributes = node.attrib
    title = attributes.get("title", attributes.get("text"))
    if not title:
        logger.debug("Could not find title for RSS feed")
        return title
    return strip_common_unicode_chars(title)
Пример #9
0
 def copy_fixed_files(self) -> None:
     """
     Copy the files that need no rendering into the destination folder as is

     Copied, in order: the mimetype file, META_INF/container.xml, the cover
     image and the stylesheet.
     """
     logger.debug("Copying fixed files...")
     fixed_files = (
         "mimetype",
         join(META_INF, "container.xml"),
         join(OEBPS, IMAGES, "cover.png"),
         join(OEBPS, "stylesheet.css"),
     )
     for fixed_file in fixed_files:
         self.copy_file(fixed_file)
Пример #10
0
 def generate_spine_articles(self) -> str:
     """
     Build the <itemref> elements of the <spine> section

     :returns one <itemref> per article in `self.articles`, keyed by the
         article id and joined with a newline followed by a tab
     """
     logger.debug("Generating spine articles...")
     itemref = Template('<itemref idref="${id}"/>')
     itemrefs = (
         itemref.substitute(id=article.id) for article in self.articles
     )
     return "\n\t".join(itemrefs)
Пример #11
0
def send_email_message(server: smtplib.SMTP, msg: EmailMessage) -> bool:
    """
    Send a single EmailMessage through an already connected SMTP server

    :returns True when the message was handed over without an SMTP error,
        False (after logging the error) otherwise
    """
    logger.debug("Sending the email...")
    try:
        server.send_message(msg)
    except smtplib.SMTPException as e:
        logger.error(
            f"Caught an exception while trying to send an email.\nError: {e}")
        return False
    else:
        return True
Пример #12
0
 def run_mercury_container(self) -> Container:
     """
     Launch a new mercury-parser docker container

     The container is started detached under the name CONTAINER_NAME, with
     MERCURY_PORT mapped to the same port number, and is remembered in
     `self.container`.

     :returns the container stored in `self.container`
     """
     logger.debug("Launching a new mercury-parser Docker container...")
     port_mapping = {f"{MERCURY_PORT}/tcp": MERCURY_PORT}
     self.container = self.client.containers.run(
         "wangqiru/mercury-parser-api:latest",
         detach=True,
         ports=port_mapping,
         name=CONTAINER_NAME,
     )
     return self.container
Пример #13
0
 def download_image(self, url: str) -> str:
     """
     Download an image from a URL into the images folder

     :returns the file name the image was saved under, capped at 150
         characters
     """
     logger.debug(f"Downloading image {url}...")
     # Slicing leaves names that are already short enough untouched
     image_name = images.get_image_filename(url)[:150]
     images.download_image(url, join(self.images_path, image_name))
     return image_name
Пример #14
0
 def generate_manifest_articles(self) -> str:
     """
     Build the article <item> elements of the <manifest> section

     :returns one <item> per article in `self.articles`, keyed by the
         article id and joined with a newline followed by a tab
     """
     logger.debug("Generating manifest articles...")
     item = Template(
         '<item id="${id}" href="content/${id}.xhtml" media-type="application/xhtml+xml"/>'
     )
     items = (item.substitute(id=article.id) for article in self.articles)
     return "\n\t".join(items)
Пример #15
0
 def generate_navpoints(self) -> str:
     """
     Build one navpoint per article for use in the toc.ncx file

     Each navpoint is rendered from the navpoint.xml template with the
     article's id and title, and an `order` equal to the article's position
     plus NAVPOINT_OFFSET.

     :returns the rendered navpoints joined with a newline and two tabs
     """
     logger.debug("Generating navpoints...")
     template = self.get_template(join(_R2K, "navpoint.xml"))
     rendered = [
         template.substitute(
             id=article.id,
             title=article.title,
             order=position + NAVPOINT_OFFSET,
         )
         for position, article in enumerate(self.articles)
     ]
     return "\n\t\t".join(rendered)
Пример #16
0
def create_epub(raw_articles: List[Article], title: str) -> str:
    """
    Build a single EPUB book out of several articles.

    A fresh temporary folder is created and shared by every wrapped article
    and by the book itself.

    :returns whatever `EPUB.build()` returns (the temp path to the created ebook)
    """
    epub_path = mkdtemp()
    logger.debug(f"Creating epub folder in {epub_path}")

    articles = [EPUBArticle(raw, epub_path) for raw in raw_articles]
    return EPUB(articles, title, epub_path).build()
Пример #17
0
 def validate_container_is_up() -> None:
     """
     Try to connect to the mercury parser service several times. Quit app if not successful

     Up to CONNECTION_ATTEMPTS requests are sent to BASE_MERCURY_URL, with a
     one second pause after every failed attempt. The function returns as
     soon as one request goes through. When all attempts fail, the collected
     errors are logged and the application exits with status 1.
     """
     import sys  # local import keeps this block self-contained

     errors = set()
     logger.debug(
         f"Launched container at {BASE_MERCURY_URL}. Validating it's up...")
     # Bounded retry: a `while retries := CONNECTION_ATTEMPTS` loop would
     # re-bind the counter to its starting value on every pass and never end.
     for _ in range(CONNECTION_ATTEMPTS):
         try:
             requests.get(BASE_MERCURY_URL)
         except ConnectionError as e:
             errors.add(e)
         else:
             logger.debug("Connected!")
             return
         sleep(1)

     logger.error(
         "Could not connect to the mercury parser service. Run with -v to get more details"
     )
     logger.debug(f"Error info:\n{errors}")
     sys.exit(1)
Пример #18
0
 def generate_manifest_images(self) -> str:
     """
     Build the image <item> elements of the <manifest> section

     Every file found in the EPUB images folder gets an <item>, except
     "cover.png". The media type suffix comes from
     `images.get_img_extension`.

     :returns the rendered items joined with a newline followed by a tab
     """
     logger.debug("Generating manifest images...")
     item = Template(
         '<item id="${id}" href="images/${id}" media-type="image/${ext}"/>')
     images_dir = join(self._dst_path, OEBPS, IMAGES)
     rendered = [
         item.substitute(id=image_name,
                         ext=images.get_img_extension(image_name))
         for image_name in listdir(images_dir)
         if image_name != "cover.png"
     ]
     return "\n\t".join(rendered)
Пример #19
0
def get_feeds_from_url(url: str) -> list:
    """
    Look for RSS feeds behind a URL

    If the URL itself passes `is_rss_feed` it is returned as the only result.
    Otherwise the page is fetched, candidate feeds are collected from its
    links and its anchor tags, duplicates are dropped and only candidates
    that pass `is_rss_feed` are kept.

    Adapted from: https://gist.github.com/alexmill/9bc634240531d81c3abe
    """
    logger.info(f"Attempting to find RSS feeds from {url}...")

    if is_rss_feed(url):
        logger.debug("URL is already a proper RSS feed")
        return [url]

    html = get_html(url)
    link_feeds = get_feeds_from_links(html)
    atag_feeds = get_feeds_from_atags(url, html)
    candidates = set(link_feeds + atag_feeds)

    return [candidate for candidate in candidates if is_rss_feed(candidate)]
Пример #20
0
def set_content(msg: EmailMessage, title: str, url: Optional[str],
                attachment_path: Optional[str]) -> None:
    """
    Fill in the body of an email message

    When `attachment_path` is given the file is attached under the name
    "<title>.html"; otherwise, when `url` is given, the URL becomes the text
    content. With neither, the message is left untouched.
    """
    if attachment_path:
        # The attachment is labelled as HTML even though it is an EPUB: kindle does not officially accept
        # EPUB files by email, but unofficially it converts the file with kindlegen and it works fine
        # Reference: https://www.amazon.com/gp/sendtokindle/email
        filename = f"{title}.html"
        logger.debug(f"Setting attachment for {title}")
        with open(attachment_path, "rb") as f:
            payload = f.read()
            msg.add_attachment(
                payload,
                maintype="text",
                subtype=f'html; charset=utf-8; name="{filename}"',
                filename=filename,
            )
        return

    if url:
        logger.debug(f"Setting email content to {url}")
        msg.set_content(url)
Пример #21
0
    def compress_epub(self) -> str:
        """
        Pack the rendered book into an EPUB ZIP archive

        The `mimetype` file goes in first and uncompressed, as the EPUB specs
        require. Everything else is then added by
        `recursively_add_files_to_epub_archive`.

        :returns path of the archive, created inside a new temporary folder
        """
        logger.debug("Creating an epub archive...")

        archive_name = f"{self.id}.epub"
        archive_dir = mkdtemp(prefix="epub")
        epub_path = join(archive_dir, archive_name)

        # The EPUB must contain the META-INF and mimetype files at the root,
        # so archive names are given explicitly instead of reusing disk paths
        with ZipFile(epub_path, "w") as epub:
            mimetype_src = join(self._dst_path, MIMETYPE)
            # mimetype first and stored as is (required by the EPUB format)
            epub.write(mimetype_src, arcname=MIMETYPE, compress_type=ZIP_STORED)
            self.recursively_add_files_to_epub_archive(epub)
        return epub_path
Пример #22
0
    def __exit__(self, exc_type: Optional[Type[BaseException]],
                 exc_val: Optional[BaseException],
                 exc_tb: Optional[TracebackType]) -> bool:
        """
        Context manager __exit__ for MercuryParser

            1. Remove the container
            2. Stop the program for any expected errors

        :param exc_type: class of the exception raised inside the `with` block, if any
        :param exc_val: the exception instance raised inside the `with` block, if any
        :param exc_tb: traceback of that exception, if any
        :returns True when the block finished without an exception, False when an unexpected
            exception has to keep propagating
        """
        self.remove_container()

        if exc_val is None:
            return True

        if isinstance(exc_val, (DockerAPIError, ConnectionError)):
            logger.error(
                "Could not connect to Docker. Run with -v to get more details"
            )
            logger.debug(f"Error info:\n{exc_val}")
            sys.exit(1)

        # __exit__ must not re-raise the exception it was given: returning False lets the
        # interpreter propagate it with its original traceback
        return False
Пример #23
0
    def render_opf(self) -> None:
        """
        Render the content.opf XML file and write it to disk

        Besides the book's title, date and uuid, the template is fed with:
            1. <item> elements for all the articles (<manifest> section)
            2. <item> elements for all the images (<manifest> section)
            3. <itemref> elements for all the articles (<spine> section)
        """
        logger.debug("Generating content.opf...")
        article_items = self.generate_manifest_articles()
        image_items = self.generate_manifest_images()
        article_itemrefs = self.generate_spine_articles()
        self.render_and_write(
            join(OEBPS, "content.opf"),
            title=self.title,
            date=self.date,
            uuid=self.uuid,
            manifest_articles=article_items,
            manifest_images=image_items,
            spine_articles=article_itemrefs,
        )
Пример #24
0
 def get_parsed_doc(url: str) -> dict:
     """
     Make an HTTP call to the mercury API and get the parsed document

     :param url: address of the article to parse; it is percent-encoded
         before being placed in the query string of the API request
     :returns the JSON-decoded body of the API response
     """
     from urllib.parse import urlencode

     # Encode the article URL so that its own `&`, `#` or `?` characters are
     # not mistaken for parts of the API request
     query = urlencode({"url": url})
     full_url = f"{BASE_MERCURY_URL}?{query}"
     logger.debug("Parsing article with Mercury Parser...")
     logger.debug(f"Sending request to {full_url}")
     result = requests.get(full_url).json()
     logger.debug("Finished parsing")
     return result
Пример #25
0
    def parse_images(self, raw_content: str) -> str:
        """
        Download the images of an article and point the HTML at the local copies

        For every `img` tag found in `raw_content`:
            1. Resolve the image URL (tags without a usable URL are skipped)
            2. Download the image through `download_image`
            3. Rewrite the tag's `src` to the relative path of the downloaded file

        :returns the HTML content with the rewritten `src` attributes
        """
        soup = BeautifulSoup(raw_content, "html.parser")
        logger.debug("Looking for images...")
        for img in soup.find_all("img"):
            img_url = images.get_img_url(self.url, img)
            if img_url:
                image_name = self.download_image(img_url)
                # As the articles live in the `content` folder, we need to go one level up
                img["src"] = join("..", IMAGES, image_name)

        return soup.decode()
Пример #26
0
    def render_articles(self) -> None:
        """
        Parse every article and write its formatted content to disk

        A single parser instance, used as a context manager, serves all the articles.
        Articles whose `parse` call reports failure are skipped; the others are rendered
        with the `article.xhtml` template and written to `<article id>.xhtml` in the
        content folder.
        """
        logger.debug("Rendering articles...")
        parser_cls = self._get_parser_class()
        with parser_cls() as parser:
            # Lazy filter: each article is parsed right before it is rendered
            parsed_articles = (
                article for article in self.articles if article.parse(parser)
            )
            for article in parsed_articles:
                context = {
                    "title": article.title,
                    "author": article.author,
                    "date": article.date,
                    "content": article.content,
                }
                article_path = join(OEBPS, CONTENT, f"{article.id}.xhtml")
                template_path = join(OEBPS, CONTENT, "article.xhtml")
                article_html = self.render_template(template_path, **context)
                self.write_file(article_html, article_path)
Пример #27
0
def send_email_messages(msgs: List[EmailMessage]) -> int:
    """
    Send a batch of email messages over one SMTP session

    Connects to smtp.gmail.com over SSL, logs in with the configured
    credentials and sends the messages one by one. An SMTP error raised by the
    connection or the login is logged and ends the batch.

    :returns the number of messages that were sent successfully
    """
    sent_count = 0
    try:
        logger.debug("Connecting to SMTP...")
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.ehlo()
            logger.debug("Logging into the SMTP server...")
            server.login(config.send_from, config.password)
            for msg in msgs:
                if not send_email_message(server, msg):
                    continue
                sent_count += 1
                logger.debug("Email sent successfully!")
    except smtplib.SMTPException as e:
        logger.error(
            f"Caught an exception while trying to send an email.\nError: {e}")
    return sent_count
Пример #28
0
def get_url(node: ElementTree.Element, title: str) -> Optional[str]:
    """
    Extract the feed's URL from the XML element

    The URL is read from the `xmlUrl` attribute. When it is missing or empty a
    debug message naming `title` is logged and the falsy value is returned.
    """
    url = node.attrib.get("xmlUrl")
    if url:
        return url

    logger.debug(f"Could not find URL for `{title}`")
    return url