def to_epub(parser):
    """Build an EPUB from ``parser`` and write it to the working directory.

    One XHTML chapter is created per entry yielded by
    ``parser.get_chapters_content()`` and one image item per value of
    ``parser.processed_images``. The output file is named after the title
    with spaces replaced by underscores, plus ``.epub``.

    Args:
        parser: object exposing ``author``, ``title``,
            ``get_chapters_content()`` (yielding dicts with ``order``,
            ``name`` and ``soup`` keys) and ``processed_images`` (a mapping
            whose values have ``name`` and ``path`` attributes).
    """
    author = parser.author
    title = parser.title

    book = epub.EpubBook()
    book.set_title(title)
    book.set_language('en')
    book.add_author(author)

    chapters_obj = []
    for chapter in parser.get_chapters_content():
        file_name = 'chapter_%d.xhtml' % chapter['order']
        c = epub.EpubHtml(title=chapter['name'], file_name=file_name)
        c.content = chapter['soup'].prettify()
        book.add_item(c)
        chapters_obj.append(c)

    for image in parser.processed_images.values():
        img = epub.EpubImage()
        img.file_name = 'images/%s' % image.name
        # Context manager so the handle is closed even if read() fails.
        with open(image.path, 'rb') as image_file:
            img.content = image_file.read()
        book.add_item(img)

    book.toc = chapters_obj
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # The navigation document comes first, then the chapters in order.
    book.spine = ['nav'] + chapters_obj
    epub.write_epub(title.replace(' ', '_') + '.epub', book, {})
def add_chapter(self, title, file_name, content, ptype='chapter'):
    """Create the EPUB item for one chapter or image.

    Args:
        title: section title.
        file_name: file name of the item inside the book.
        content: item content. A list or tuple is treated as lines of text
            and each line is wrapped in its own ``<p>`` element.
        ptype: kind of item to build, ``'chapter'`` (default) or ``'image'``.

    Returns:
        The new item, or ``None`` when ``title``, ``file_name`` or
        ``content`` is ``None``.

    Raises:
        ValueError: if ``ptype`` is neither ``'chapter'`` nor ``'image'``
            (previously this surfaced as an ``AttributeError`` on ``None``).
    """
    if title is None or file_name is None or content is None:
        return None

    if ptype == 'chapter':
        item = epub.EpubHtml(title=title, file_name=file_name, lang=self.lang)
    elif ptype == 'image':
        # Image items take no ``title``/``lang`` constructor arguments, so
        # only the file name is set here.
        item = epub.EpubImage()
        item.file_name = file_name
    else:
        raise ValueError("unsupported ptype: %r" % (ptype,))

    if isinstance(content, (list, tuple)):
        item.set_content("".join("<p>" + line + "</p>\n" for line in content))
    else:
        item.set_content(content)
    return item
def add_image(self, image_filename: str):
    """Copy one image from the source book into the EPUB being built.

    ``cover.jpeg`` and ``cover.jpg`` are skipped. The media type is derived
    from the file extension (case-insensitive); the bytes come from
    ``self.book.get_binary_content`` and the item is added to
    ``self.epub_book``.

    Raises:
        RuntimeError: if the extension is not jpeg/jpg/png/svg/gif.
    """
    if image_filename in ("cover.jpeg", "cover.jpg"):
        return

    media_type_by_suffix = {
        ".jpeg": "image/jpeg",
        ".jpg": "image/jpeg",
        ".png": "image/png",
        ".svg": "image/svg+xml",
        ".gif": "image/gif",
    }

    def get_image_type(_image_filename):
        # Map the lower-cased extension to its media type.
        found = media_type_by_suffix.get(Path(_image_filename).suffix.lower())
        if found is None:
            raise RuntimeError(
                f"Unsupported image file format: {_image_filename}")
        return found

    image_item = epub.EpubImage()
    image_item.id = "image_" + image_filename.replace("/", "_")
    image_item.file_name = image_filename
    image_item.media_type = get_image_type(image_filename)
    image_item.content = self.book.get_binary_content(image_filename)
    self.epub_book.add_item(image_item)
def add_colophon(self, document, documents=None):
    """Append a colophon page, and the static images it uses, to the book.

    The colophon is looked up for ``document`` or, when that is falsy, for
    the first entry of ``documents``. Nothing happens when none is found.
    """
    colophon = self.find_colophon(document or documents[0])
    if not colophon:
        return

    rendered = self.render_colophon(colophon, document, documents)
    html = self.clean_html(rendered)

    # pull in any static images used in the colophon
    tree = ET.HTML(html)
    static_nodes = []
    for node in tree.xpath('//img[@src]'):
        if node.get('src').startswith('/static/'):
            static_nodes.append(node)

    # rewrite paths to be relative by dropping the leading slash
    for node in static_nodes:
        node.set('src', node.get('src')[1:])
    html = ET.tostring(tree)

    page = epub.EpubHtml(uid='colophon', file_name='colophon.xhtml')
    page.content = html
    self.book.add_item(page)
    self.book.spine.append(page)

    # one image item per distinct path, even if it is used several times
    for fname in {node.get('src') for node in static_nodes}:
        # fname[7:] strips the 'static/' prefix before the lookup
        local_fname = find_static(fname[7:])
        if not local_fname:
            continue
        image_item = epub.EpubImage()
        image_item.file_name = fname
        with open(local_fname, 'rb') as f:
            image_item.content = f.read()
        self.book.add_item(image_item)
def find_uncommon_words_in_one_text(self, text):
    """Return the positions of characters missing from the Kindle charset.

    Whitespace is ignored. For every other character not in
    ``self.kindle_charset`` its index is recorded and, unless an image was
    already produced for it, the character is rendered with the first font
    in ``self.fonts_render`` whose charset contains it. The PNG is saved in
    ``self.temp_dirctory``, added to ``self.book`` and remembered in
    ``self.char_image_map``.

    Args:
        text: the string to scan.

    Returns:
        List of indexes into ``text``, in ascending order.
    """
    pos_list = []
    for idx, char in enumerate(text):
        if re.match(r'\s', char):
            continue
        if char in self.kindle_charset:
            continue
        pos_list.append(idx)

        # Render only when no image exists for this character yet.
        if char in self.char_image_map:
            continue

        is_in_big_font = False
        # Find a font that contains the character and render it with that.
        for font, render in self.fonts_render.items():
            if char not in self.font_charset_map[font]:
                continue
            surface, _ = render.render(char, (0, 0, 0))
            # File name is the hex part of the escape, e.g. '\u4e2d' gives
            # '4e2d'. ASCII characters have no escape, which used to give
            # every one of them the same empty name; fall back to the code
            # point so each character keeps its own image.
            escaped = char.encode("unicode_escape")
            name = escaped[2:].decode("ascii") or "%x" % ord(char)
            name += ".png"
            image_path = os.path.join(self.temp_dirctory, name)
            pygame.image.save(surface, image_path)
            with open(image_path, 'rb') as image_file:
                data = image_file.read()
            self.book.add_item(epub.EpubImage(
                file_name=os.path.join(self.font_image_dir, name),
                media_type='image/png',
                content=data))
            self.char_image_map[char] = name
            is_in_big_font = True
            break

        if not is_in_big_font:
            print("Very very uncommon: ", char)
    return pos_list
def _create_epub_images(self):
    """Add every embedded attachment to the EPUB as an image item.

    An attachment is used only when ``'static/<basename>'`` is listed in
    ``self.embeded_images``. Attachments whose file cannot be read are
    skipped silently.

    :Args:
      - self (:class:`ExportBook`): current class instance
    """
    for attachment in self.attachments:
        source_path = attachment.attachment.name
        # The name is kept as text: encoding it to bytes first would format
        # as "static/b'...'" on Python 3.
        in_book_name = 'static/' + os.path.basename(source_path)
        if in_book_name not in self.embeded_images:
            continue

        try:
            with open(source_path, "rb") as f:
                blob = f.read()
        except (IOError, OSError):
            continue

        itm = epub.EpubImage()
        itm.file_name = in_book_name
        itm.content = blob
        self.epub_book.add_item(itm)
def fetch_image(self, doc, book, lable_xpath=r"//img", attr='src'):
    """Download the images linked from ``doc`` and embed them in ``book``.

    Each element matched by ``lable_xpath`` whose ``attr`` value starts with
    ``http`` is downloaded once, added to ``book`` as an image item and
    rewritten to point at the embedded file. The embedded name is the
    running counter ``self.img_idx`` plus the last path segment of the URL,
    with any query string removed.

    Args:
        doc: parsed document exposing ``xpath``.
        book: book object exposing ``add_item``.
        lable_xpath: XPath selecting the elements that carry image links.
        attr: attribute holding the link.

    Returns:
        ``doc``, modified in place.
    """
    for _link in doc.xpath(lable_xpath):
        img_url = _link.get(attr)
        # A missing attribute yields None; treat it like any other
        # non-http value instead of failing on startswith().
        if not img_url or not img_url.startswith('http'):
            print(f"img_url:{img_url} invalid! not startswith http")
            continue
        print(f"xpath:{lable_xpath},attr:{attr}, img_url={img_url}")

        # URL -> embedded file name. Remembered so that a repeated URL is
        # pointed at the file that was stored for it, not at a name derived
        # from the current counter.
        known_names = getattr(self, '_img_file_names', None)
        if known_names is None:
            known_names = self._img_file_names = {}

        if img_url in self.url_doer:
            print("already downloaded url:", img_url)
            file_name = known_names.get(img_url)
            if file_name is not None:
                _link.set(attr, file_name)
            continue

        self.url_doer.add(img_url)
        resp = req_get_info(img_url, proxies=self.proxy)
        if resp is None:
            continue

        # Drop "?" and everything after it before taking the base name.
        bare_url = re.sub(r'\?.*', '', img_url)
        file_name = '{:03d}_{}'.format(
            self.img_idx, bare_url.rsplit('/', maxsplit=1)[1]
        )
        self.img_idx += 1

        img_item = epub.EpubImage()
        img_item.file_name = file_name
        img_item.set_content(resp.content)
        book.add_item(img_item)

        known_names[img_url] = file_name
        _link.set(attr, file_name)
    return doc
def create_image_objects(news, image_number):
    """Wrap every image downloaded for ``news`` in an ``EpubImage``.

    Images are named ``<n>.jpg`` with consecutive numbers starting at
    ``image_number`` and are all tagged ``image/jpeg``.

    Returns:
        Tuple ``(image_objects, next_number)`` where ``next_number`` is the
        first number that was not used.
    """
    image_objects = []
    next_number = image_number
    for payload in download_images(news):
        item = epub.EpubImage()
        item.file_name = f"{next_number}.jpg"
        next_number += 1
        item.media_type = "image/jpeg"
        item.set_content(payload)
        image_objects.append(item)
    return image_objects, next_number
def add_attachments(self, document, file_dir):
    """Embed the attachments of ``document`` that its images refer to.

    Only ``img`` elements whose ``src`` starts with ``media/`` count. Each
    matching attachment is stored as
    ``<file_dir>/media/<attachment.filename>``.
    """
    image_nodes = document.doc.root.xpath(
        '//a:img[@src]', namespaces={'a': document.doc.namespace})

    # File names referenced from the markup, without the 'media/' prefix.
    referenced = set()
    for node in image_nodes:
        if node.get('src', '').startswith('media/'):
            referenced.add(node.get('src')[6:])

    for attachment in document.attachments.all():
        if attachment.filename not in referenced:
            continue
        item = epub.EpubImage()
        item.file_name = f'{file_dir}/media/{attachment.filename}'
        item.content = attachment.file.read()
        self.book.add_item(item)
def addJpegImage(self, imageData):
    """Store JPEG data in the book and return the name to use in HTML.

    The image is named ``grf/image_<n>.jpg`` where ``<n>`` is the current
    value of ``self.imgCount``; the counter is then incremented.

    @param imageData Image data in format jpeg
    @return The name of the image to be used in html
    """
    jpeg_item = epub.EpubImage()
    reference = "grf/image_%i.jpg" % self.imgCount
    jpeg_item.file_name = reference
    self.imgCount = self.imgCount + 1
    jpeg_item.media_type = "image/jpeg"
    jpeg_item.set_content(imageData)
    self.ebook.add_item(jpeg_item)
    return jpeg_item.file_name
def make_cover_image(app):
    """Build the cover image item from ``app.book_cover``.

    Returns:
        An ``EpubImage`` named ``cover.jpg`` with media type ``image/jpeg``
        holding the file's bytes, or ``None`` when ``app.book_cover`` is
        empty or is not an existing file.
    """
    cover_path = app.book_cover
    if not cover_path or not os.path.isfile(cover_path):
        return None

    logger.info('Creating cover: %s', cover_path)
    with open(cover_path, 'rb') as image_file:
        payload = image_file.read()

    cover_image = epub.EpubImage()
    cover_image.file_name = 'cover.jpg'
    cover_image.media_type = 'image/jpeg'
    cover_image.content = payload
    return cover_image
def get_picture(index: int, src: str, timeout, img_list: list):
    """Download ``src`` and store it as an ``EpubImage`` in ``img_list[index]``.

    The in-book file name is the ``<digits>.jpg`` / ``<digits>.png`` part of
    the URL. On a failed request, or when the URL has no such part, a
    message is printed and ``img_list`` is left untouched.

    Args:
        index: slot of ``img_list`` to fill.
        src: image URL.
        timeout: timeout passed to ``requests.get``.
        img_list: pre-sized list receiving the result.
    """
    try:
        req = requests.get(src, timeout=timeout)
    except Exception as e:
        print(src, e)
        return

    # Raw string with an escaped dot: the extension must really follow a ".".
    match = re.search(r'[0-9]+\.(jpg|png)', src)
    if match is None:
        print(src, 'no numbered .jpg/.png file name in URL')
        return

    extension = match.group(1)
    img = epub.EpubImage()
    img.file_name = match.group()
    # The registered media type for .jpg files is image/jpeg.
    img.media_type = "image/jpeg" if extension == "jpg" else "image/" + extension
    img.content = req.content
    img_list[index] = img
def _inline_remote_image(self, src):
    """Return an ``EpubImage`` for a remote image, downloading it if needed.

    The image is cached in ``self.tmp_path`` under the SHA-256 of ``src``
    plus the URL's extension, and only downloaded when that file does not
    exist yet. The item's ``file_name`` is the cache path and its content
    is the cached file's bytes.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    epub_img = epub.EpubImage()
    digest = hashlib.sha256(src.encode("utf-8")).hexdigest()

    # Text after the last "." without any query string or fragment. A "/"
    # means the dot belonged to the host or a directory, not to an extension.
    extension = src.rsplit(".")[-1].split("?")[0].split("#")[0]
    if "/" in extension:
        extension = ""
    digest_name = "{}.{}".format(digest, extension) if extension else digest

    epub_img.file_name = os.path.join(self.tmp_path, digest_name)
    if not os.path.exists(epub_img.file_name):
        logger.info("Downloading remote image %s", src)
        resp = requests.get(src)
        # Do not cache an error page as if it were the image.
        resp.raise_for_status()
        with open(epub_img.file_name, "wb") as f:
            f.write(resp.content)

    # Load the bytes, as _inline_local_image does for local files.
    with open(epub_img.file_name, "rb") as f:
        epub_img.content = f.read()
    logger.info("Remote image %s added as %s", src, epub_img.file_name)
    return epub_img
def make_chapter_images(book, image_output_path):
    """Add every ``.jpg`` file in ``image_output_path`` to ``book``.

    Each file is stored as ``images/<filename>`` with media type
    ``image/jpeg``. Nothing happens when the directory does not exist.
    """
    if os.path.isdir(image_output_path):
        for entry in os.listdir(image_output_path):
            if entry.endswith('.jpg'):
                jpeg_item = epub.EpubImage()
                jpeg_item.media_type = 'image/jpeg'
                jpeg_item.file_name = 'images/' + entry
                source = os.path.join(image_output_path, entry)
                with open(source, 'rb') as fp:
                    jpeg_item.content = fp.read()
                book.add_item(jpeg_item)
def _inline_local_image(self, img, src):
    """Return an ``EpubImage`` holding a local image file.

    The file is read from ``self.html_root`` joined with ``src`` and kept
    under ``src`` inside the book. Any ``style`` entry is removed from
    ``img.params``.

    Raises:
        IOError: if the file does not exist.
    """
    local_item = epub.EpubImage()
    local_item.file_name = src

    image_path = os.path.join(self.html_root, src)
    exists = os.path.exists(image_path)
    if not exists:
        logger.error("File %s doesn't exists, skipping!", image_path)
        raise IOError("Can't open %s" % image_path, image_path)

    with open(image_path, "rb") as handle:
        payload = handle.read()
    local_item.content = payload

    if "style" in img.params:
        del img.params["style"]

    logger.info("Local image %s added", local_item.file_name)
    return local_item
def assemble(self):
    """Collect the cached posts and their images for a new EPUB book.

    URLs from ``self._get_urls()`` are visited in reverse order; URLs
    without a cache entry are skipped. Each post contributes a chapter to
    the spine and a link to the table of contents, and its images are added
    to the book as ``image/jpeg`` items. Images missing from the cache are
    skipped.

    NOTE(review): in the code visible here ``spine`` and ``toc`` are filled
    but never attached to ``book``, and nothing is returned or written;
    confirm against the complete function.
    """
    urls = self._get_urls()
    book = epub.EpubBook()
    spine = [epub.EpubNcx(), epub.EpubNav(), self._get_cover()]
    toc = []
    for url in reversed(urls):
        cache_entry = self._cache.get(url)
        if not cache_entry:
            continue
        post = Entry(cache_entry, filter_index.ENTRY_FILTERS)
        chapter = post.get_epub_chapter()
        spine.append(chapter)
        toc.append(epub.Link(chapter.file_name, chapter.title, chapter.id))

        for image_url, filename in post.get_image_urls():
            img = epub.EpubImage()
            img.file_name = filename
            try:
                img.content = self._cache.get(image_url, binary=True)
            except PageNotFoundError:
                # Ignored, just skip the image.
                continue
            img.media_type = 'image/jpeg'
            book.add_item(img)
def add_chapter(self, title, content):
    r"""Add a chapter and embed the images its HTML refers to.

    Every URL found by the pattern ``<img\ssrc="(\S+)"`` is downloaded,
    stored in the book under a fresh ``images/<uuid>.jpg`` name, and all of
    its occurrences in ``content`` are replaced by that name. A URL that
    appears several times is downloaded once.

    :param title: chapter title
    :param content: chapter HTML
    :return: None
    """
    from contextlib import closing

    # Process images.
    handled_urls = set()
    for img_url in re.findall(r'<img\ssrc="(\S+)"', content):
        if img_url in handled_urls:
            # replace() below already rewrote every occurrence of this URL;
            # fetching it again would only add an unreferenced image item.
            continue
        handled_urls.add(img_url)

        pic_path = "images/%s.jpg" % str(uuid.uuid1())
        content = content.replace(img_url, pic_path)

        # Download the image, making sure the connection is closed.
        with closing(urllib.urlopen(img_url)) as response:
            img_data = response.read()

        image_item = epub.EpubImage()
        image_item.set_content(img_data)
        image_item.file_name = pic_path
        self.book.add_item(image_item)

    chapter = epub.EpubHtml(title=title,
                            file_name='%s.xhtml' % str(uuid.uuid1()),
                            lang='hr')
    chapter.content = content
    self.chapters.append(chapter)
    self.book.add_item(chapter)
def _add_images(book, html, base_url):
    """Embed the images used by ``html`` and return the rewritten markup.

    Every ``img``/``video`` element with a ``src`` attribute is inspected.
    Videos, non-image files, too-small images and images that fail to
    download are removed from the markup. The remaining images are pointed
    at the downloaded file and that file is added to ``book``.

    Args:
        book: book object exposing ``add_item`` and ``get_item_with_href``.
        html: markup to process.
        base_url: base for resolving relative ``src`` values.

    Returns:
        The serialized tree after the rewrite.
    """
    tree = fromstring(html)
    for node in tree.xpath('//*[@src]'):
        if node.tag not in ('img', 'video'):
            continue
        url = node.get('src')

        if node.tag == 'video' or _not_image_file(url) or _image_too_small(node):
            node.getparent().remove(node)
            continue

        file_name = _download_image(urljoin(base_url, url))
        if file_name is None:
            node.getparent().remove(node)
            continue

        node.set('src', file_name)
        # The same file can be referenced more than once; add it only once.
        if book.get_item_with_href(file_name) is None:
            with open(join(OUTBOX, file_name), 'rb') as image_file:
                payload = image_file.read()
            book.add_item(epub.EpubImage(file_name=file_name, content=payload))
    return tostring(tree)
# Emit a <figure> for the current image and make sure the image file itself
# is part of the EPUB. Fragment: `img`, `tag`, `book` and `handle_img` are
# defined outside the visible code.
if img:
    with tag('figure'):
        # Full-size URL: add the scheme and drop the '/hrthumbs' path part.
        img_src = 'https:{}'.format(img['src'].replace('/hrthumbs', ''))
        # NOTE(review): handle_img presumably downloads the file and returns
        # its local path (it is opened below) - confirm against its definition.
        img_path = handle_img(img_src)
        img_name, img_ext = os.path.splitext(img_path.split('/')[-1])
        epub_img_path = 'images/{}{}'.format(img_name, img_ext)
        # Add the image to the EPUB, if it isn't already
        if book.get_item_with_href(epub_img_path):
            warnings.warn('{} has already been added'.format(img_path))
        else:
            epub_img = epub.EpubImage()
            epub_img.uid = img_name
            epub_img.file_name = epub_img_path
            # Only files ending in exactly '.jpg' are accepted.
            if img_ext == '.jpg':
                epub_img.media_type = 'image/jpeg'
            else:
                raise ValueError('You\'re adding something that isn\'t a JPEG')
            with open(img_path, 'rb') as img_bin:
                epub_img.content = img_bin.read()
            book.add_item(epub_img)
def make_image(self, block):
    """Crop one picture block out of its page scan and return its HTML.

    Given a dict object containing the block info for an image, the block's
    rectangle is cut from the page's JP2 file, saved as a PNG in the
    temporary directory, added to the book as ``images/img_<n>.png`` and
    wrapped in a ``<div>`` whose width is the block's share of the page
    width.

    Args:
        block: dict with at least ``page_no`` and ``style['pagewidth']``.

    Returns:
        The HTML snippet; ``None`` when the block is on page 1 or is skipped
        by the overlap check; ``''`` when the image cannot be opened,
        cropped or saved.
    """
    page_no = block['page_no']
    if page_no == 1:
        # The first page's image is made into the cover automatically
        return

    # pad out the filename to four digits
    origfile = '{dir}/{base}_jp2/{base}_{page:0>4}.jp2'.format(
        dir=self.tmpdir.name, base=self.base, page=block['page_no'])
    basefile = 'img_{:0>4}.png'.format(self.picnum)
    pngfile = '{}/{}'.format(self.tmpdir.name, basefile)
    in_epub_imagefile = 'images/{}'.format(basefile)

    # get image dimensions from ABBYY block attributes
    # (left, top, right, bottom)
    box = self.image_dim(block)
    width = box[2] - box[0]
    height = box[3] - box[1]

    # ignore if this image is entirely encapsulated in another image
    # NOTE(review): this returns as soon as ANY coordinate is <= the other
    # picture's, which is broader than "entirely encapsulated"; left as is
    # pending confirmation of the intended rule.
    for each_pic in self.metadata['pics_by_page']:
        # Ignore if this is just the block itself
        if each_pic == block:
            continue
        new_box = self.image_dim(each_pic)
        for (old, new) in zip(box, new_box):
            if old <= new:
                return

    # make the image; a failure is logged and the block is skipped instead
    # of going on to use an image that was never produced
    try:
        page_image = Image.open(origfile)
    except IOError as e:
        self.logger.warning("Can't open image {}: {}".format(origfile, e))
        return ''
    try:
        with page_image:
            page_image.crop(box).save(pngfile)
    except IOError as e:
        self.logger.warning(
            "Can't crop image {} and save to {}: {}".format(
                origfile, pngfile, e))
        return ''

    epubimage = epub.EpubImage()
    epubimage.file_name = in_epub_imagefile
    with open(pngfile, 'rb') as f:
        epubimage.content = f.read()
    self.book.add_item(epubimage)

    container_w = width / int(block['style']['pagewidth']) * 100
    content = u''' <div style="width: {c_w}%;"> <img src="{src}" alt="Picture #{picnum}"> </div> '''.format(
        c_w=container_w,
        src=in_epub_imagefile,
        picnum=self.picnum,
        w=width,
        h=height,
    )

    # increment the image number
    self.picnum += 1
    return content
def make_image(self, block):
    """Turn one picture block into an embedded image plus its HTML.

    Given a dict object containing the block info for an image, the block is
    cropped out of the page's JP2 scan, written as a PNG in the temporary
    directory, added to the book as ``images/img_<n>.png`` and wrapped in a
    ``<div>`` sized relative to the page width.

    Returns:
        The HTML snippet; ``None`` for page 0, for a missing page scan or
        when the overlap check skips the block; ``''`` when cropping fails.
    """
    page_no = block['page_no']
    # The first page's image is made into the cover automatically
    if page_no == 0:
        return

    # pad out the filename to four digits
    source_scan = '{dir}/{item_bookpath}_jp2/{item_bookpath}_{page:0>4}.jp2'.format(
        dir=self.tmpdir, item_bookpath=self.item_bookpath, page=page_no)
    if not os.path.isfile(source_scan):
        return

    png_name = 'img_{:0>4}.png'.format(self.picnum)
    png_path = '{}/{}'.format(self.tmpdir, png_name)
    href = 'images/{}'.format(png_name)

    # get image dimensions from ABBYY block attributes
    # (left, top, right, bottom)
    box = self.image_dim(block)
    width = box[2] - box[0]
    height = box[3] - box[1]

    # some image processors also need the original page dimensions
    style = block['style']
    pagewidth = float(style['pagewidth'])
    pageheight = float(style['pageheight'])
    pagedim = (pagewidth, pageheight)

    # ignore if this image is entirely encapsulated in another image
    for other in self.metadata['pics_by_page']:
        # Ignore if this is just the block itself
        if other == block:
            continue
        other_box = self.image_dim(other)
        if all(mine >= theirs for mine, theirs in zip(box, other_box)):
            return

    # make the image:
    cropper = ImageFactory(self.image_processor)
    try:
        cropper.crop_image(source_scan, png_path, dim=box, pagedim=pagedim)
    except RuntimeError as e:
        # for failed image creation, keep processing the epub
        self.logger.error(e)
        return ''

    image_item = epub.EpubImage()
    image_item.file_name = href
    with open(png_path, 'rb') as f:
        image_item.content = f.read()
    image_item = self.book.add_item(image_item)

    # to approximate original layout, set the image container width to
    # percentage of the page width
    container_w = (width / pagewidth) * 100
    template = u''' <div style="width: {c_w}%;"> <img src="{src}" alt="Picture #{picnum}"> </div> '''
    content = template.format(
        c_w=container_w,
        src=href,
        picnum=self.picnum,
        w=width,
        h=height,
    )

    # increment the image number
    self.picnum += 1
    return content
def export_book(input_file, filename):
    """Convert a book in Booki.zip format into an EPUB file.

    Reads the chapters and attachments of the Booki.zip archive, creates a
    new EPUB book and copies the content into it. Chapter files are renamed
    to ``<name>.xhtml`` and links between chapters are rewritten to match.
    Chapters that cannot be parsed stay in the table of contents but are not
    added to the book or the spine.

    Args:
        input_file: Booki.zip source, passed to ``BookiZip``.
        filename: path of the EPUB file to write.
    """
    epub_book = ExportEpubBook()
    # Creating new EPUB file
    epub_book.add_prefix('bkterms', 'http://booktype.org/')
    # Read old Booki.zip format
    bookizip = BookiZip(input_file)
    _toc, _section, _section_name = [], [], None
    spine = ['nav']
    # Get filesnames of all the chapters/sections
    # ([6:-5] drops the first 6 and last 5 characters of each archive name;
    # presumably a directory prefix and the '.html' suffix - TODO confirm)
    file_names = [file_name[6:-5] for _, file_name, _ in bookizip.get_toc()]
    x = 0
    for typ, file_name, title in bookizip.get_toc():
        # Ignore sections
        # (typ == 1 starts a new section: flush the chapters collected so
        # far into the TOC, then start collecting under the new title)
        if typ == 1:
            if _section_name is None and len(_section) > 0:
                _toc.append(_section)
            elif len(_section) > 0:
                _toc.append((epub.Section(_section_name), _section[:]))
            _section_name = title
            _section = []
            continue
        # Create new chapter with new filename
        c1 = epub.EpubHtml(title=title, file_name='{}.xhtml'.format(file_name[6:-5]))
        cont = unicode(bookizip.read(file_name), 'utf-8')
        _section.append(c1)
        try:
            tree = parse_html_string(cont.encode('utf-8'))
        except:
            # Just ignore everything if we can not parse the chapter
            continue
        # Change all the links in the document
        for elem in tree.iter():
            if elem.tag == 'a':
                href = elem.get('href')
                if href:
                    urlp = urlparse.urlparse(href)
                    url_title = urlp.path
                    # Only scheme-less (relative) links are touched.
                    if urlp.scheme == '':
                        if url_title and url_title in file_names:
                            fixed_href = url_title + '.xhtml'
                            if urlp.fragment:
                                fixed_href = "{}#{}".format(fixed_href, urlp.fragment)
                            elem.set('href', fixed_href)
                        else:
                            # This removes everything that is external;
                            # it should not normally happen.
                            # NOTE(review): the source text is flattened, so
                            # which `if` this `else` belongs to is inferred -
                            # confirm against the original file.
                            elem.drop_tag()
        c1.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
        epub_book.add_item(c1)
        spine.append(c1)
        x += 1
    # Flush the chapters of the last (or only) section into the TOC.
    if _section_name is None and len(_section) > 0:
        _toc.append(_section)
    elif len(_section) > 0:
        _toc.append((epub.Section(_section_name), _section[:]))
    # Add all of the attachments
    for att_name in bookizip.get_attachments():
        try:
            blob = bookizip.read(att_name)
        except (IOError, OSError):
            continue
        else:
            itm = epub.EpubImage()
            itm.file_name = att_name
            itm.content = blob
            epub_book.add_item(itm)
    # Title, language and author are fixed placeholder values.
    epub_book.set_title('Title', 'main')
    epub_book.set_language('en')
    epub_book.add_author('Author', role='aut', uid='author')
    epub_book.toc = _toc
    epub_book.spine = spine
    epub_book.add_item(epub.EpubNcx())
    epub_book.add_item(epub.EpubNav())
    opts = {'plugins': [TidyPlugin(), standard.SyntaxPlugin()]}
    epub.write_epub(filename, epub_book, opts)
def _ebookize_all_news(self, parsed_articles):
    """Add the previously processed news data to the ebook.

    For each article the top image (if any) is downloaded and embedded, the
    article text is converted to HTML, and the rendered article is added as
    a chapter with a matching table-of-contents link. Articles whose title
    contains a barred keyword are skipped. Articles with too few paragraphs
    are rebuilt with justext and skipped if they are still too short.

    :param parsed_articles: The previously processed news data; each item is
        a dict read via the keys ``count``, ``title``, ``top_image``,
        ``article_text``, ``source`` and ``url``.
    """
    print("* Ebook-izing downloaded headlines. *")
    # some initialization
    template = self.env.get_template('tmpl/article_template.html')
    self.article_toc_list = []
    # put each into ebook
    for a in parsed_articles:
        print("Loading #{} into ebook: {}".format(a["count"], a["title"]))
        if a["top_image"] is not None:
            img_file_name = "art_img/image_{:03d}".format(a["count"])
            epimg = epub.EpubImage()
            epimg.file_name = img_file_name
            epimg.media_type = "image/jpeg"
            img_resp = requests.get(a["top_image"])
            img = img_resp.content
            epimg.set_content(img)
            # NOTE(review): the image is added before the skip checks below,
            # so a skipped article still leaves its image in the book.
            self.book.add_item(epimg)
            # From here on the article refers to the embedded copy.
            a["top_image"] = img_file_name
        c = epub.EpubHtml(title=a["title"], file_name="article_{}.xhtml".format(a["count"]), lang='en')
        # Convert the article text to HTML and keep only the div with
        # class "document".
        tree = publish_doctree(a["article_text"])
        html = publish_from_doctree(tree, writer_name='html').decode()
        soup = BeautifulSoup(html, 'lxml')
        body_only = soup.find('body').find('div', {"class": "document"})
        # skip articles that have barred keywords
        if any(kw in a["title"].lower() for kw in settings.TITLE_EXCLUSIONS):
            print("\tArticle title contains a barred keyword. Skipping.")
            continue
        if len(body_only.findAll('p')) < settings.MIN_PARAGRAPHS_FOR_AN_ARTICLE:
            print("\tArticle from {} too short. It may be paywalled or a video. It may also have been parsed incorrectly." "\n\tURL: {}".format(a["source"], a["url"]))
            # fall back to justext to synthesize article
            a["article_text"] = ""
            count = 0
            paragraphs = justext.justext(requests.get(a["url"]).content, justext.get_stoplist("English"))
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    count += 1
                    a["article_text"] += "<p>{}</p>".format(paragraph.text)
            if count < settings.MIN_PARAGRAPHS_FOR_AN_ARTICLE:
                print("\t\tArticle parsed correctly but actually short. Skipping.")
                continue
            # if it's still short, then it's actually short and not parsed incorrectly...continue
            else:
                print("\t\tArticle was indeed parsed incorrectly. Fallback has parsed it correctly.")
        else:
            a["article_text"] = body_only
        c.set_content(template.render(article=a))
        self.chaps.append(c)
        self.book.add_item(c)
        self.article_toc_list.append(
            epub.Link("article_{}.xhtml".format(a["count"]), "{} - {}".format(a["title"], a["source"]), "art%d" % a["count"]))
def export_booktype(bookid):
    """Start building an EPUB from the Booktype book named ``bookid``.

    Looks the book up by ``url_title`` (case-insensitive), copies its
    metadata, cover, chapters and attachments into a new ``EpubBook`` and
    collects the table of contents and spine. The process exits with status
    -1 when the book does not exist.

    NOTE(review): the code visible here ends after the attachments; ``toc``
    and ``spine`` are not yet attached to the book and nothing is written.
    Confirm against the complete function.

    Args:
        bookid: ``url_title`` of the Booktype book.
    """
    # Get Booktype Book
    try:
        booktype_book = models.Book.objects.get(url_title__iexact=bookid)
    except models.Book.DoesNotExist:
        # Call form prints the same text on Python 2 and Python 3.
        print('NO SUCH BOOK')
        sys.exit(-1)

    book_version = booktype_book.getVersion(None)

    # START CREATING THE BOOK
    book = epub.EpubBook()

    # set basic info
    book.set_identifier('booktype:%s' % booktype_book.url_title)
    book.set_title(booktype_book.title)
    book.set_language('en')

    # set description
    if booktype_book.description != '':
        book.add_metadata('DC', 'description', booktype_book.description)

    # set license
    lic = booktype_book.license
    if lic:
        book.add_metadata('DC', 'rights', lic.name)

    # The Contributors for Booktype book
    book.add_author('Aleksandar Erkalovic', role='aut', uid='author')
    book.add_author('Aleksandar Erkalovic', file_as='Aleksandar Erkalovic',
                    role='ill', uid='illustrator')

    # set cover image; binary mode, and the handle is closed afterwards
    with open('cover.jpg', 'rb') as cover_file:
        img = cover_file.read()
    book.set_cover("image.jpg", img)

    toc = []
    section = []
    spine = ['cover', 'nav']

    for chapter in book_version.getTOC():
        if chapter.chapter:
            c1 = epub.EpubHtml(title=chapter.chapter.title,
                               file_name='%s.xhtml' % (chapter.chapter.url_title, ))
            c1.add_link(href="style/default.css", rel="stylesheet", type="text/css")

            if chapter.chapter.title == 'Arabic':
                c1.set_language('ar')
            if chapter.chapter.title == 'Japanase':
                # 'ja' is the language code for Japanese; 'jp' is not one.
                c1.set_language('ja')

            c1.content = chapter.chapter.content
            book.add_item(c1)
            spine.append(c1)

            # Chapters that precede the first section are not in the TOC.
            if len(section) > 1:
                section[1].append(c1)
        else:
            # this is section: flush the previous one, then start a new one
            if len(section) > 0:
                toc.append(section[:])
            section = [epub.Section(chapter.name), []]

    if len(section) > 0:
        toc.append(section[:])

    for attachment in models.Attachment.objects.filter(version=book_version):
        try:
            with open(attachment.attachment.name, "rb") as f:
                blob = f.read()
        except (IOError, OSError):
            continue

        # Keep the name as text; encoding it first would format as
        # "static/b'...'" on Python 3.
        fn = os.path.basename(attachment.attachment.name)
        itm = epub.EpubImage()
        itm.file_name = 'static/%s' % fn
        itm.content = blob
        book.add_item(itm)
def epub_write_coolshell(dt_last):
    """Write an EPUB of the coolshell.cn front-page articles newer than ``dt_last``.

    The front page is downloaded (and dumped to ``coolshell.html``), every
    linked article published after ``dt_last`` is fetched, trimmed after its
    end-of-article marker, its images are downloaded and embedded, and the
    result is written to ``coolshell-<year><month><day>.epub``.

    Args:
        dt_last: ``datetime``; articles published at or before it are skipped.
    """
    book = epub.EpubBook()
    today = date.today()
    article_title = 'coolshell-%d%d%d' % (today.year, today.month, today.day)

    # set metadata
    book.set_identifier('id123456')
    book.set_title(article_title)
    book.set_language('en')
    book.add_author('Chen hao')

    # read html; fetch the title; fetch the text content
    response = urlopen('https://coolshell.cn/')
    content = response.read().decode('utf-8', 'ignore')
    response.close()
    # Explicit encoding: the locale default may be unable to encode the page.
    with open('coolshell.html', 'w', encoding='utf-8') as f:
        f.write(content)
    tree = lxml.html.fromstring(content)

    chapter_tocs = []
    book.spine = ['nav']
    chapter_no = 1

    pubtime_xpath = "//h5/a/time/@datetime"
    pubtime_format = '%Y-%m-%dT%H:%M:%S'
    title_xpath = "//h1[@class='entry-title']"
    content_xpath = "//article/div[@class='entry-content']"
    end_xpath = "//p[re:match(., '全文完')]"

    match = CSSSelector('h2.entry-title a')
    for chapter in match(tree):
        href = chapter.get('href')
        print(href)
        response = urlopen(href)
        content = response.read().decode('utf-8', 'ignore')
        response.close()
        chapter_tree = lxml.html.fromstring(content)

        # Only the first 19 characters (date and time, no zone) are parsed.
        str_pubtime = chapter_tree.xpath(pubtime_xpath)[0][0:19]
        dt_pubtime = datetime.strptime(str_pubtime, pubtime_format)
        if dt_pubtime <= dt_last:
            continue

        title = chapter_tree.xpath(title_xpath)[0].text
        content_tree = chapter_tree.xpath(content_xpath)[0]
        if end_xpath.find('re:match') > -1:
            last_item = content_tree.xpath(
                end_xpath,
                namespaces={"re": "http://exslt.org/regular-expressions"})[0]
        else:
            last_item = content_tree.xpath(end_xpath)[0]

        # Drop every child that follows the end-of-article marker.
        b_del = False
        for item in content_tree.getchildren():
            if b_del:
                content_tree.remove(item)
            if item == last_item:
                b_del = True

        img_xpath = "//img"
        for img_node in content_tree.xpath(img_xpath):
            # "//img" matches the whole document; keep only the images that
            # sit inside this article's content.
            if not is_ancestor(content_tree, img_node):
                continue
            img_url = img_node.get('src')
            listtmp = re.split('/+', img_url)
            jpg_name = listtmp[-1]
            img_local = '%02d%s' % (chapter_no, jpg_name)
            print('img ' + img_url + ' local ' + img_local)
            get_image_from_url(img_url, img_local)
            img_node.set('src', img_local)

            # add the image to book
            epub_image = epub.EpubImage()
            epub_image.file_name = img_local
            try:
                with open(img_local, 'rb') as img_file:
                    epub_image.content = img_file.read()
            except OSError:
                print('Error open %s' % img_local)
            book.add_item(epub_image)

        chapter_content = tostring(content_tree, encoding='unicode')
        chapter_file = 'chap_%02d.xhtml' % chapter_no

        # create chapter
        c1 = epub.EpubHtml(title=title, file_name=chapter_file, lang='hr')
        c1.content = '<html><body><h1>'+title+'</h1>'+chapter_content+'</body></html>'
        book.add_item(c1)
        chapter_tocs.append(epub.Link(chapter_file, title, title))
        book.spine.append(c1)
        chapter_no = chapter_no + 1

    # define Table Of Contents
    book.toc = tuple(chapter_tocs)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)
    # add CSS file
    book.add_item(nav_css)

    # write to the file
    epub.write_epub(article_title + '.epub', book, {})
def embed_images(book, soup):
    """Embed the remote images used by an EPUB HTML chapter.

    Every ``img`` element of ``soup`` is handled in place:

    * images without ``src``, or whose ``src`` starts with ``denied:`` or
      ``data:``, are removed;
    * other images are downloaded, shrunk to ``IMAGE_MAX_SIZE`` (and turned
      greyscale when ``IMAGE_GREYSCALE`` is set), added to ``book`` once per
      distinct name, and the element is pointed at the embedded copy;
    * images that cannot be downloaded or decoded are left untouched.

    Thumbnails are kept in ``cache`` under the key ``src`` plus
    ``str(IMAGE_MAX_SIZE)``; names already embedded are tracked in
    ``image_names``.

    Args:
        book: book object exposing ``add_item``.
        soup: parsed chapter exposing ``find_all``.
    """
    for img in soup.find_all('img'):
        src = img.get('src')

        # Remove junk images
        if not src or src.startswith(('denied:', 'data:')):
            img.decompose()
            continue

        src_parts = urlparse(src)
        ext = os.path.splitext(src_parts.path)[1]
        name = str(hash(src)) + ext

        if name not in image_names:
            # Create `EpubImage` wrapper object
            image = epub.EpubImage()
            image.id = str(hash(src))
            image.file_name = name

            thumbnail_hash = src + str(IMAGE_MAX_SIZE)
            thumbnail_bytes = cache.get(thumbnail_hash)

            if thumbnail_bytes:
                thumbnail = io.BytesIO(thumbnail_bytes)
            else:
                # Download the image
                thumbnail = io.BytesIO()
                try:
                    logging.info('Downloading image %s', img['src'])
                    content = requests.get(img['src'], timeout=3.05).content
                except (requests.exceptions.ContentDecodingError,
                        requests.exceptions.ConnectionError,
                        requests.exceptions.ReadTimeout) as e:
                    logging.error('Skipping image %s (%s)', img['src'], e)
                    continue

                original = io.BytesIO(content)
                try:
                    # Create smaller, greyscale image from source image
                    # convert to `RGBA` before `L` or Pillow will complain
                    im = Image.open(original).convert('RGBA')
                    im.thumbnail(IMAGE_MAX_SIZE)
                    if IMAGE_GREYSCALE:
                        im = im.convert('L')
                    target_format = 'png' if ext == '.png' else 'jpeg'
                    if target_format == 'jpeg' and im.mode == 'RGBA':
                        # JPEG has no alpha channel and Pillow refuses to
                        # write RGBA as JPEG; without this every non-PNG
                        # image is skipped when greyscale is off.
                        im = im.convert('RGB')
                    im.save(thumbnail, target_format)
                except OSError as e:
                    logging.error('Skipping image %s (%s)', img['src'], e)
                    continue

                cache.set(thumbnail_hash, thumbnail.getvalue())

            thumbnail.seek(0)
            image.content = thumbnail.read()
            book.add_item(image)
            image_names.add(name)

        img['style'] = 'max-width: 100%'
        img['src'] = name
def generate_epub(url, path):
    '''
    Generate the epub document in ``path`` from the extracted HTML and images.

    Each name in the global ``listOfFiles`` is read from
    ``<path>/<pathTemp>/<name>.html`` and added as a chapter titled after the
    document's ``<title>``. Each file in the global ``listofImg`` except
    ``.svg`` files is re-encoded with Pillow and embedded. The cover is
    ``cover.jpg`` with the book title drawn on it. Both global lists are
    emptied afterwards, whether or not generation succeeded, and the
    progress popup is updated along the way.

    Parameters
    -----------
    url : Not used by this function.
    path : Path to which epub is saved, as given by the user in the GUI.

    Returns
    ---------
    Status of execution: "Okay" if successful ; "Exception" otherwise (the
    error is printed).
    '''
    global pb_label
    global progress
    global progress_var
    global popup
    global listOfFiles
    global listofImg

    # setting progress in progress bar
    popup.title("Generating Epub...")
    pb_label.set("Now generating epub from extracted contents....")
    popup.update()
    sleep(5 / 1000)  # lauch task
    progress = 85
    progress_var.set(progress)

    try:
        book = epub.EpubBook()

        # add metadata
        book.set_identifier('sample12345678')
        book.set_title(book_title)
        book.set_language('en')
        object_list = []
        spine_list = ['cover', 'nav']
        book.add_author('We_did_our_best')

        # Draw the title onto the cover.
        # NOTE(review): ImageDraw.textsize is unavailable in recent Pillow
        # releases - confirm the Pillow version this runs against.
        img = Image.open("cover.jpg")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype("arial.ttf", size=20)
        MAX_W, MAX_H = img.size
        w, h = draw.textsize(book_title, font=font)
        draw.text(((MAX_W - w) / 2, (MAX_H - h) / 2), book_title,
                  fill="white", font=font, anchor="ms", align="center")
        img.save('cover-out.jpg')
        with open('cover-out.jpg', 'rb') as cover_file:
            book.set_cover("cover-out.jpg", cover_file.read())

        for base_name in listOfFiles:
            htmlfile = base_name + ".html"
            pathForTemp = os.path.join(path, pathTemp)
            htmlfilepath = pathForTemp + "/" + htmlfile
            # Close the chapter file once it has been read.
            with codecs.open(htmlfilepath, "r", "utf-8") as file:
                content = file.read()
            soup1 = BeautifulSoup(content, "html.parser")
            chaptertitle = soup1.find('title').string

            # Adding chapters
            chaptertitle = chaptertitle.strip()
            c1 = epub.EpubHtml(title=chaptertitle,
                               file_name=chaptertitle + '.xhtml', lang='en')
            c1.content = content
            object_list.append(c1)
            spine_list.append(c1)
            book.add_item(c1)

        # adding images
        for image_path in listofImg:
            ext = image_path.split(".")[-1]
            if ext == "svg":
                continue
            elif ext == "jpg":
                # Pillow's name for the format is JPEG.
                ext1 = "JPEG"
            else:
                ext1 = ext
            picture = Image.open(image_path)
            buffer = io.BytesIO()
            picture.save(buffer, ext1)

            ei = epub.EpubImage()
            ei.file_name = image_path
            # The registered media type for .jpg files is image/jpeg.
            ei.media_type = 'image/jpeg' if ext == "jpg" else 'image/' + ext
            ei.content = buffer.getvalue()
            book.add_item(ei)

        # add table of contents
        book.toc = object_list

        # add navigation files
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # define css style
        style = ''' @namespace epub "http://www.idpf.org/2007/ops"; body { font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif; } h2 { text-align: left; text-transform: uppercase; font-weight: 200; } ol { list-style-type: none; } ol > li:first-child { margin-top: 0.3em; } nav[epub|type~='toc'] > ol > li > ol { list-style-type:square; } nav[epub|type~='toc'] > ol > li > ol > li { margin-top: 0.3em; } '''

        # add css file
        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                                media_type="text/css", content=style)
        book.add_item(nav_css)

        # spine
        book.spine = spine_list

        # create epub file
        epubname = os.path.join(path, book_title + '.epub')
        epub.write_epub(epubname, book, {})
    except Exception as e:
        listOfFiles = []
        listofImg = []
        print(e)
        return "Exception"

    listOfFiles = []
    listofImg = []

    # setting progress in progress bar
    popup.update()
    sleep(5 / 1000)  # lauch task
    progress = 100
    pb_label.set("saved epub doc at: \n{} ....".format(path))
    progress_var.set(progress)
    popup.update()
    sleep(3)
    popup.withdraw()
    print("saved epub doc at {}".format(path))
    return "Okay"
def _fetch_text(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Undecodable bytes are dropped. The response is closed even when
    reading fails.
    """
    response = urlopen(url)
    try:
        return response.read().decode('utf-8', 'ignore')
    finally:
        response.close()


def epub_write_rss_csdn(username, dt_last):
    """Build ``<username>-csdn-<date>.epub`` from a user's CSDN blog RSS feed.

    The feed is downloaded, saved to ``<username>csdn.feed`` and parsed.
    Every ``item`` whose ``pubDate`` is later than ``dt_last`` is fetched
    and becomes one chapter; images inside the article body are downloaded
    with ``get_image_from_url`` and embedded in the book.

    Parameters:
        username: CSDN user name; used in the feed URL, as the book author
            and in the output file names.
        dt_last: ``datetime``; items published at or before it are skipped.

    Returns:
        None. The epub is written to the current working directory.
    """
    # local import so the block does not depend on a file-level import
    from xml.sax.saxutils import escape

    book = epub.EpubBook()
    today = date.today()
    filename_feed = username + 'csdn.feed'
    article_title = '%s-csdn-%d%d%d' % (username, today.year, today.month, today.day)

    # set metadata
    book.set_identifier('id123456')
    book.set_title(article_title)
    book.set_language('en')
    book.add_author(username)

    # read the feed and keep a copy on disk; write it as UTF-8 explicitly so
    # the platform default encoding cannot corrupt non-ASCII titles
    content = _fetch_text('https://blog.csdn.net/%s/rss/list' % username)
    with open(filename_feed, 'w', encoding='utf-8') as f:
        f.write(content)
    tree = etree.parse(filename_feed)

    chapter_tocs = []
    book.spine = ['nav']
    chapter_no = 1

    pubtime_xpath = "pubDate"
    pubtime_format = '%Y/%m/%d %H:%M:%S'
    title_xpath = "title"
    link_xpath = "link"
    # <div id="content_views" class="markdown_views prism-github-gist">
    # (the previous expression contained "div[[@id=", which is not valid XPath)
    content_xpath = "//article/div[@id='article_content']/div[@id='content_views']"
    # <div class="postTime">: optional marker; what follows it is dropped
    end_xpath = "//div[@class='postTime']"
    img_xpath = "//img"

    for chapter in tree.xpath("//item"):
        str_pubtime = chapter.xpath(pubtime_xpath)[0].text
        dt_pubtime = datetime.strptime(str_pubtime, pubtime_format)
        if dt_pubtime <= dt_last:
            continue

        title = chapter.xpath(title_xpath)[0].text
        href = chapter.xpath(link_xpath)[0].text
        print(href)
        chapter_tree = lxml.html.fromstring(_fetch_text(href))
        content_tree = chapter_tree.xpath(content_xpath)[0]

        if 're:match' in end_xpath:
            end_matches = content_tree.xpath(
                end_xpath,
                namespaces={"re": "http://exslt.org/regular-expressions"})
        else:
            end_matches = content_tree.xpath(end_xpath)
        # the end marker may be absent; then nothing is trimmed
        last_item = end_matches[0] if end_matches else None

        # drop every child that comes after the end marker
        b_del = False
        for item in content_tree.getchildren():
            if b_del:
                content_tree.remove(item)
            if last_item is not None and item == last_item:
                b_del = True

        for img_node in content_tree.xpath(img_xpath):
            if not is_ancestor(content_tree, img_node):
                continue
            img_url = img_node.get('src')
            if not img_url:
                # <img> without a src: nothing to download
                continue
            jpg_name = re.split('/+', img_url)[-1]
            img_local = '%02d%s' % (chapter_no, jpg_name)
            print('img ' + img_url + ' local ' + img_local)
            get_image_from_url(img_url, img_local)
            img_node.set('src', img_local)

            # add the image to book
            epub_img = epub.EpubImage()
            epub_img.file_name = img_local
            try:
                with open(img_local, 'rb') as img_file:
                    epub_img.content = img_file.read()
            except (IOError, OSError):
                print('Error open %s' % img_local)
            else:
                # only embed images that were actually read from disk
                book.add_item(epub_img)

        chapter_content = tostring(content_tree, encoding='unicode')
        chapter_file = 'chap_%02d.xhtml' % chapter_no

        # create chapter; the title comes from the feed, so escape it before
        # splicing it into markup
        c1 = epub.EpubHtml(title=title, file_name=chapter_file, lang='hr')
        c1.content = ('<html><body><h1>' + escape(title) + '</h1>'
                      + chapter_content + '</body></html>')
        book.add_item(c1)
        chapter_tocs.append(epub.Link(chapter_file, title, title))
        book.spine.append(c1)
        chapter_no = chapter_no + 1

    # define Table Of Contents
    book.toc = tuple(chapter_tocs)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav",
                            file_name="style/nav.css",
                            media_type="text/css",
                            content=style)

    # add CSS file
    book.add_item(nav_css)

    # write to the file
    epub.write_epub(article_title + '.epub', book, {})
def html_before_write(self, book, chapter):
    """Clean up a chapter's HTML before it is written to the book.

    The chapter content is parsed, tags listed in ``DEPRECATED_TAGS`` are
    stripped, and on every element only the attributes in
    ``ATTRIBUTES_GLOBAL`` plus a per-tag allow-list are kept. A few tags
    get extra handling:

    * ``meta`` elements in ``head`` are removed;
    * a ``table`` ``summary`` attribute is turned into a ``caption`` child
      and ``border="0"`` is replaced by an empty ``border``;
    * ``svg`` adds the ``svg`` property to the chapter and drops the
      ``viewbox`` / ``preserveaspectratio`` attributes;
    * ``img`` with an http(s) ``src`` adds the ``remote-resources``
      property to the chapter;
    * ``dl`` is left untouched.

    Parameters:
        book: book object; receives an ``EpubImage`` item for the first
            remote image found in the chapter.
        chapter: chapter object; ``content`` is read and replaced,
            ``properties`` may be extended.

    Returns:
        The serialized content (UTF-8 bytes with XML declaration), or
        ``None`` when the content could not be parsed (chapter unchanged).
    """
    from lxml import etree

    try:
        tree = parse_html_string(chapter.content)
    except Exception:
        # unparsable content is passed through untouched
        return

    root = tree.getroottree()

    # delete deprecated tags
    # i should really have a list of allowed tags
    for tag in DEPRECATED_TAGS:
        etree.strip_tags(root, tag)

    # extra attributes (on top of ATTRIBUTES_GLOBAL) allowed per <head> child
    head_attrs = {
        'base': ['href', 'target'],
        'link': ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type',
                 'sizes'],
        'script': ['src', 'type', 'charset', 'async', 'defer',
                   'crossorigin'],
        'source': ['src', 'type', 'media'],
        'style': ['media', 'type', 'scoped'],
    }

    # extra attributes (on top of ATTRIBUTES_GLOBAL) allowed per <body> tag
    body_attrs = {
        'a': ['href', 'target', 'download', 'rel', 'hreflang', 'type'],
        'area': ['alt', 'coords', 'shape', 'href', 'target', 'download',
                 'rel', 'hreflang', 'type'],
        'audio': ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup',
                  'loop', 'muted', 'controls'],
        'blockquote': ['cite'],
        'button': ['autofocus', 'disabled', 'form', 'formaction',
                   'formenctype', 'formmethod', 'formnovalidate',
                   'formtarget', 'name', 'type', 'value', 'menu'],
        'canvas': ['width', 'height'],
        'del': ['cite', 'datetime'],
        'details': ['open'],
        'embed': ['src', 'type', 'width', 'height'],
        # was 'disable', which is not an attribute name
        'fieldset': ['disabled', 'form', 'name'],
        # this list used to sit behind a second, unreachable 'details' test
        'form': ['accept-charset', 'action', 'autocomplete', 'enctype',
                 'method', 'name', 'novalidate', 'target'],
        'iframe': ['src', 'srcdoc', 'name', 'sandbox', 'seamless',
                   'allowfullscreen', 'width', 'height'],
        'img': ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width',
                'height'],
        # 'step' and 'type' were fused into 'steptype' by a missing comma
        'input': ['accept', 'alt', 'autocomplete', 'autofocus', 'checked',
                  'dirname', 'disabled', 'form', 'formaction', 'formenctype',
                  'formmethod', 'formnovalidate', 'formtarget', 'height',
                  'inputmode', 'list', 'max', 'maxlength', 'min', 'multiple',
                  'name', 'pattern', 'placeholder', 'readonly', 'required',
                  'size', 'src', 'step', 'type', 'value', 'width'],
        'ins': ['cite', 'datetime'],
        'keygen': ['autofocus', 'challenge', 'disabled', 'form', 'keytype',
                   'name'],
        'label': ['form', 'for'],
        'map': ['name'],
        'menu': ['type', 'label'],
        'object': ['data', 'type', 'typemustmatch', 'name', 'usemap', 'form',
                   'width', 'height'],
        'ol': ['reversed', 'start', 'type'],
        'optgroup': ['disabled', 'label'],
        'option': ['disabled', 'label', 'selected', 'value'],
        'output': ['for', 'form', 'name'],
        'param': ['name', 'value'],
        'progress': ['value', 'max'],
        'q': ['cite'],
        'select': ['autofocus', 'disabled', 'form', 'multiple', 'name',
                   'required', 'size'],
        'table': ['border', 'sortable'],
        'td': ['colspan', 'rowspan', 'headers'],
        'textarea': ['autocomplete', 'autofocus', 'cols', 'dirname',
                     'disabled', 'form', 'inputmode', 'maxlength', 'name',
                     'placeholder', 'readonly', 'required', 'rows', 'wrap'],
        'col': ['span'],
        'colgroup': ['span'],
        'th': ['colspan', 'rowspan', 'headers', 'scope', 'abbr', 'sorted'],
        'time': ['datetime'],
        'track': ['kind', 'src', 'srclang', 'label', 'default'],
        'video': ['src', 'crossorigin', 'poster', 'preload', 'autoplay',
                  'mediagroup', 'loop', 'muted', 'controls', 'width',
                  'height'],
    }

    head = tree.find('head')
    if head is not None and len(head) != 0:
        # iterate over a snapshot: removing a child while walking the live
        # element would skip the sibling that follows it
        for _item in list(head):
            if _item.tag == 'title':
                if _item.text == '':
                    head.remove(_item)
            elif _item.tag == 'meta':
                # just remove for now, but really should not be like this
                head.remove(_item)
            else:
                leave_only(_item,
                           ATTRIBUTES_GLOBAL + head_attrs.get(_item.tag, []))

    body = tree.find('body')
    if body is not None and len(body) != 0:
        for _item in body.iter():
            _tag = _item.tag

            if _tag == 'dl':
                # http://html5doctor.com/the-dl-element/
                # should really be validated as dl > (dt, dd)*; for now the
                # element is left as it is
                continue

            if _tag == 'svg':
                # We need to add property "svg" in case we have embeded svg file
                if 'svg' not in chapter.properties:
                    chapter.properties.append('svg')
                if _item.get('viewbox', None):
                    del _item.attrib['viewbox']
                if _item.get('preserveaspectratio', None):
                    del _item.attrib['preserveaspectratio']
                continue

            if _tag == 'img':
                _src = _item.get('src', '').lower()
                if _src.startswith(('http://', 'https://')):
                    if 'remote-resources' not in chapter.properties:
                        chapter.properties.append('remote-resources')
                        # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
                        # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
                        from ebooklib import epub
                        _img = epub.EpubImage(file_name=_item.get('src'))
                        book.add_item(_img)
            elif _tag == 'table':
                if _item.get('border', None):
                    if _item.get('border') == '0':
                        _item.set('border', '')
                if _item.get('summary', None):
                    # add it as caption
                    _caption = etree.Element('caption', {})
                    _caption.text = _item.get('summary')
                    _item.insert(0, _caption)
                    del _item.attrib['summary']

            _extra = body_attrs.get(_tag)
            if _extra is not None:
                leave_only(_item, ATTRIBUTES_GLOBAL + _extra)
            else:
                # any other element keeps global attributes only; walk a
                # copy of the keys because entries are deleted on the way
                for _attr in list(_item.attrib.keys()):
                    if _attr not in ATTRIBUTES_GLOBAL:
                        del _item.attrib[_attr]

    chapter.content = etree.tostring(tree,
                                     pretty_print=True,
                                     encoding='utf-8',
                                     xml_declaration=True)
    return chapter.content
def build(self):
    '''Build the issue, downloading articles if needed, and write the ebook.

    Calls ``fetch_issue`` and ``info``, builds every section against
    ``self.db``, then assembles an epub: optional cover (``self.cover_img``),
    one chapter per article that has content, the images each article lists
    in ``article.images`` (read from disk), and a table of contents with one
    entry per section. The result is written to ``<self.id>.epub``.

    Articles without content are logged and skipped. A section in which no
    article could be used is left out of the table of contents and spine.
    '''
    self.fetch_issue()
    self.info()
    for s in self.sections:
        s.build(self.db)

    book = epub.EpubBook()

    # add metadata
    book.set_title(self.title)
    book.set_identifier(self.id)
    book.set_language(self.language)
    book.add_author(self.author)

    toc = []
    spine = []
    if self.cover_img:
        cover = fetch(self.cover_img).content
        book.set_cover("image.jpg", cover)
        spine.append('cover')
    spine.append('nav')

    # file names of images already in the book, so an image referenced by
    # several articles is embedded only once
    added_images = set()

    # Sections
    for section in self.sections:
        items = []
        for article in section.articles:
            if not article.content:
                logging.error('%s could not be downloaded. Skipping.',
                              article.url)
                continue
            item = epub.EpubHtml(title=article.title,
                                 file_name='{}.xhtml'.format(article.title),
                                 lang=self.language)
            item.content = article.content

            # images were downloaded by the article, and placed
            # on disk for reference. We now add them to the book.
            for filename in article.images:
                if filename in added_images:
                    continue
                added_images.add(filename)
                img = epub.EpubImage()
                img.file_name = filename
                with open(filename, 'rb') as f:
                    img.content = f.read()
                book.add_item(img)
            items.append(item)

        if not items:
            # the section link below points at the first chapter, so a
            # section without any chapter cannot be listed
            logging.warning('Section %s has no articles. Skipping.',
                            section.title)
            continue

        for item in items:
            book.add_item(item)
        toc.append((epub.Section(section.title, href=items[0].file_name),
                    items))
        spine.extend(items)

    book.toc = toc
    book.spine = spine

    # add navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # create epub file
    epub.write_epub('{}.epub'.format(self.id), book, {})