/
readablepub.py
executable file
·110 lines (85 loc) · 3.79 KB
/
readablepub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
"""
Readablepub downloads a cleaned-up copy of any online article and
packages it for offline reading on many devices (as epub).
Images are included, but scripts or stylesheets aren't.
We use Readability's Parser API, so you must sign up at:
https://www.readability.com/developers/api
"""
from __future__ import print_function

import argparse
import logging
import os
import sys
from urllib.parse import urlsplit

import requests
from ebooklib import epub
from ebooklib.plugins.base import BasePlugin
from ebooklib.utils import parse_html_string
from lxml import etree
from readability import ParserClient
from slugify import slugify
class DownloadImagesPlugin(BasePlugin):
    """
    Ebooklib plugin that bundles remote images into the book.

    Ebooklib has nice hooks in the form of plugins. This one downloads
    all referenced images in the cleaned-up HTML and changes the src
    attributes so that they point at the local resources.

    Images that cannot be downloaded are skipped with a warning and keep
    their original src, so a single dead link does not abort the export.
    """
    NAME = "DownloadImagesPlugin"

    # Seconds to wait on an image server before giving up on that image.
    DOWNLOAD_TIMEOUT = 30

    def html_before_write(self, book, chapter):
        """
        Download the images referenced by *chapter* and localise their src.

        book -- the epub book the downloaded images are added to
        chapter -- the HTML item whose ``content`` is parsed and rewritten

        If the chapter content cannot be parsed it is left untouched.
        """
        try:
            html_tree = parse_html_string(chapter.content)
        except Exception:
            # Deliberate best effort: an unparsable chapter is written as-is.
            logging.warning("Could not parse chapter HTML; images were not downloaded")
            return

        # Maps original src -> local file name (None when the download
        # failed), so an image used several times is fetched and added once.
        local_names = {}
        for img_elem in html_tree.iterfind('.//img'):
            href = img_elem.get('src')
            if not href:
                # <img> without a src: nothing to download.
                continue
            if href not in local_names:
                local_names[href] = self._download_image(book, href)
            img_local_filename = local_names[href]
            if img_local_filename is None:
                continue
            # Alter the HTML element to point at the local resource
            img_elem.set('src', img_local_filename)

        chapter.content = etree.tostring(html_tree, pretty_print=True,
                                         encoding='utf-8')

    def _download_image(self, book, href):
        """
        Fetch *href*, add it to *book* and return its local file name.

        Returns None (after logging a warning) when the request fails,
        including URLs without a usable scheme and HTTP error statuses.
        """
        try:
            response = requests.get(href, timeout=self.DOWNLOAD_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            logging.warning("Skipping image %s: %s", href, exc)
            return None

        # Take the extension from the URL *path* only, so a query string
        # such as "?w=100" does not end up inside the file extension.
        url_parts = urlsplit(href)
        stem, extension = os.path.splitext(url_parts.path)
        # We can just slugify the original URL (minus the extension) to
        # determine the new, unique-per-URL file name.
        img_local_filename = slugify(
            url_parts._replace(path=stem).geturl()) + extension

        book.add_item(
            epub.EpubItem(
                uid=img_local_filename,
                file_name=img_local_filename,
                content=response.content))
        return img_local_filename
class ReadabilityToEpub:
    """
    Converts an online article into an EPUB file using the Readability
    Parser API for the cleaned-up content.
    """

    def __init__(self, parser_token=None):
        """
        Create the Parser API client.

        parser_token -- Readability Parser API token (required)

        Raises ValueError when no token is given.
        """
        if not parser_token:
            raise ValueError("Get a Readability parser token at: https://www.readability.com/developers/api")
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        """
        Fetch the article at *url* and write it as ``<slugified-title>.epub``
        in the current working directory.

        Returns the name of the file that was written.
        """
        parser_resp = self.parser_client.get_article(url).json()

        # Fall back to a placeholder so an empty title neither breaks
        # slugify() nor produces a file called just ".epub".
        title = parser_resp.get('title') or 'Untitled'
        author = parser_resp.get('author')

        epub_book = epub.EpubBook()
        epub_book.set_title(title)
        if author:
            # Only record an author when the parser actually found one.
            epub_book.add_author(author)

        content_html = epub.EpubHtml(
            title=title,
            file_name='content.xhtml',
            content="<h1>{}</h1>\n{}".format(title, parser_resp['content']))
        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())
        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]

        epub_file_name = "{}.epub".format(slugify(title))
        epub.write_epub(epub_file_name,
                        epub_book,
                        dict(plugins=[DownloadImagesPlugin()]))
        return epub_file_name
def main():
    """
    Command-line entry point: convert the article at the given URL to EPUB.

    The Readability parser token is taken from ``--token`` or, when that is
    not given, from the file ``~/.readability_parser_token``. Exits with an
    explanatory message when no token can be obtained from either place.
    """
    parser = argparse.ArgumentParser(description="Save online articles as EPUB using the Readability API")
    parser.add_argument('url', type=str, help="URL of the article")
    parser.add_argument('--token', type=str, help="Readability API Parser token")
    args = parser.parse_args()

    token = args.token
    if not token:
        token_file_name = os.path.join(os.path.expanduser('~'), '.readability_parser_token')
        try:
            # The context manager closes the file even if reading fails.
            with open(token_file_name) as token_file:
                token = token_file.read().strip()
        except (IOError, OSError, UnicodeError):
            # Missing, unreadable or undecodable file: handled below.
            token = None
        if not token:
            # Also covers an empty token file, which would otherwise only
            # fail later with a traceback from the converter.
            sys.exit("You did not pass a Readability parser token as argument and we couldn't read it from {}".format(
                token_file_name))

    downloader = ReadabilityToEpub(parser_token=token)
    downloader.convert_url(args.url)


if __name__ == '__main__':
    main()