def get_book(initial_url):
    """Download a Wattpad story and save it as an EPUB in the current directory.

    The story page at ``initial_url`` is scraped for author, title,
    description and tags. Each entry of the ``/parts`` listing is fetched
    through ``get_chapter`` and added to the book. The output file is named
    ``"<title> - <author>.epub"``; when that file already exists nothing
    further is downloaded or written.

    Args:
        initial_url: URL of the story's main page.
    """
    base_url = 'http://www.wattpad.com'
    html = get_html(initial_url)

    # Get basic book information
    author = html.select('div.author-info strong a')[0].get_text()
    title = html.select('h1')[0].get_text().strip()
    description = html.select('h2.description')[0].get_text()
    # Not used yet: kept for the cover TODO below.
    coverurl = html.select('div.cover.cover-lg img')[0]['src']
    labels = ['Wattpad'] + [
        label.get_text()
        for label in html.select('div.tags a')
        if '/' in label['href']
    ]

    print("'{}' by {}".format(title, author))

    epubfile = "{} - {}.epub".format(title, author)
    if os.path.exists(epubfile):
        print("Epub file already exists, not updating")
        return

    # Get list of chapters (only needed when the book is actually built)
    chapterlist_url = "{}{}".format(initial_url, "/parts")
    chapterlist = get_html(chapterlist_url).select('ul.table-of-contents a')

    book = epub.EpubBook()
    book.set_title(title)
    book.add_author(author)
    book.set_language('en')
    for label in labels:
        book.add_metadata('DC', 'subject', label)

    # TODO: add a cover without breaking everything

    # Define CSS style; files are read inside ``with`` so handles get closed.
    with open("CSS/nav.css") as css_file:
        nav_css = epub.EpubItem(uid="style_nav",
                                file_name="Style/nav.css",
                                media_type="text/css",
                                content=css_file.read())
    with open("CSS/body.css") as css_file:
        body_css = epub.EpubItem(uid="style_body",
                                 file_name="Style/body.css",
                                 media_type="text/css",
                                 content=css_file.read())

    # Add CSS file
    book.add_item(nav_css)
    book.add_item(body_css)

    # Introduction
    intro_ch = epub.EpubHtml(title='Introduction', file_name='intro.xhtml')
    intro_ch.add_item(body_css)
    with open("HTML/intro.xhtml") as template_file:
        intro_template = Template(template_file.read())
    intro_ch.content = intro_template.substitute(title=title,
                                                 author=author,
                                                 url=initial_url,
                                                 synopsis=description)
    book.add_item(intro_ch)

    allchapters = []
    for item in chapterlist:
        chaptertitle = item.get_text().strip().replace("/", "-")
        # Parts titled "A/N" (normalised to "A-N" above) are left out.
        if chaptertitle.upper() != "A-N":
            print("Working on: {}".format(chaptertitle))
            chapter = get_chapter("{}{}".format(base_url, item['href']))
            book.add_item(chapter)
            allchapters.append(chapter)

    # Define Table of Contents
    book.toc = (epub.Link('intro.xhtml', 'Introduction', 'intro'),
                (epub.Section('Chapters'), allchapters))

    # Add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Basic spine
    book.spine = [intro_ch, 'nav'] + allchapters

    # Write the epub to file
    epub.write_epub(epubfile, book, {})
def make_epub_html(filename, title, contents):
    """Build an EPUB page made of an ``<h2>`` heading followed by the body.

    Args:
        filename: File name of the page inside the EPUB.
        title: Page title; also rendered as the ``<h2>`` heading.
        contents: Body text, passed through ``trans_quote`` before use.

    Returns:
        The populated ``epub.EpubHtml`` item (language ``zh``).
    """
    page = epub.EpubHtml(title=title, file_name=filename, lang='zh')
    heading = '<h2>{}</h2>'.format(title)
    page.content = heading + trans_quote(contents)
    return page
def write_epub(user_slug, doc_slug, file_path):
    """Render one user document as an EPUB and write it to ``file_path``.

    The book consists of a generated cover image, an "About this book"
    title page, and a single chapter holding the wiki-processed document.

    Args:
        user_slug: Slug of the owning user, looked up with ``data.user_get``.
        doc_slug: Slug of the document, looked up with
            ``data.userDocument_get``.
        file_path: Destination handed to ``epub.write_epub``.

    Raises:
        RuntimeError: If the user or the document is not found.
    """
    # Get all the data
    config = load_env_config()
    data = Data(config)

    user = data.user_get(user_slug)  # or None
    if not user:
        # Format the message; passing the slug as a second argument would
        # leave the "%s" placeholder unexpanded.
        raise RuntimeError("User not found: %s" % user_slug)

    document = data.userDocument_get(user_slug, doc_slug)  # or None
    if not document:
        raise RuntimeError("Document not found: %s" % doc_slug)

    # -------------------------
    # 0. Create book
    # 1. Create cover
    # 2. Create title page
    # 3. Create chapter (which basically is the book)
    #    ... This upgrades to multiple chapters when compiling books.

    # Pre-processing...
    settings = Settings({
        'config:user': user_slug,
        'config:document': doc_slug,
    })
    wiki = Wiki(settings)
    xhtml = wiki.process(user_slug, doc_slug, document)
    metadata = wiki.compile_metadata(config['TIME_ZONE'], user_slug, doc_slug)
    # A plain string; a trailing comma here would store a 1-tuple instead.
    metadata['url'] = '/read/{:s}/{:s}'.format(user_slug, doc_slug)

    title = metadata.get('title', 'Untitled')
    summary = metadata.get('summary', '')
    author = metadata.get('author', 'Anonymous')
    date = metadata.get('date', '')

    # -------------------------
    # 0. CREATE BOOK
    book = epub.EpubBook()

    # set metadata
    book.set_identifier(user_slug + '+' + doc_slug)
    book.set_title(title)
    book.set_language('en')
    book.add_author(author)

    # define CSS style
    with open('static/epub.css') as f:
        style = f.read()
    global_css = epub.EpubItem(uid="style_nav",
                               file_name="style/nav.css",
                               media_type="text/css",
                               content=style)
    book.add_item(global_css)

    # -------------------------
    # 1. Create Cover
    # NOTE: the temporary file carries a .png suffix but is saved as JPEG,
    # matching the "image.jpg" name it gets inside the book.
    tmp_cover_file = "/tmp/%s-%s-cover.png" % (user_slug, doc_slug)
    image = make_background((1600, 2200), (160, 184, 160))
    cover = make_cover(image, [title, summary, author, date],
                       [COLOR_TEXT, COLOR_SHADOW])
    cover.save(tmp_cover_file, "JPEG")
    assert os.path.exists(tmp_cover_file)
    with open(tmp_cover_file, 'rb') as cover_file:
        cover_image = cover_file.read()
    book.set_cover("image.jpg", cover_image)

    # -------------------------
    # 2. Create Title Page
    date_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    title_xhtml = """
    <html>
    <body>
    <div>Generated by <i>Article Wiki</i>:</div>
    <div>%s</div>
    <div> </div>
    <div>Permanent URL:</div>
    <div>http://chapman.wiki/read/%s/%s</div>
    </body>
    </html>
    """ % (date_string, user_slug, doc_slug)

    c1 = epub.EpubHtml(title="About this book",
                       file_name="title.xhtml",
                       lang='en')
    c1.content = title_xhtml
    c1.add_item(global_css)
    book.add_item(c1)

    # -------------------------
    # 3. Create Chapter
    chapter_file_name = doc_slug + '.xhtml'
    c2 = epub.EpubHtml(title=title, file_name=chapter_file_name, lang='en')
    c2.content = xhtml
    c2.add_item(global_css)
    book.add_item(c2)

    # Define Table Of Contents
    book.toc = (
        epub.Link(chapter_file_name, title, doc_slug),
        # (epub.Section(user_slug), (c2))
    )

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # basic spine
    book.spine = ['nav', c1, c2]

    # write to the file
    epub.write_epub(file_path, book, {})
def _add_zip_images(zip_path, image_dir, page_prefix, section_items, spine,
                    cover_done):
    """Add every file of one zip archive to the global ``book`` as image pages.

    For each archive member an image item and a one-image HTML page are
    created; both are appended to ``section_items`` and added to the book,
    and the page is appended to ``spine``. The first image seen while
    ``cover_done`` is false also becomes the book cover.

    Args:
        zip_path: Path of the zip archive to read.
        image_dir: Directory inside the EPUB under which images are stored.
        page_prefix: Prefix put in front of each generated page file name.
        section_items: TOC list that receives the pages and images.
        spine: Spine list that receives the pages.
        cover_done: Whether a cover has already been set.

    Returns:
        The updated ``cover_done`` flag.
    """
    import mimetypes  # local import so the file's import block is untouched

    with zipfile.ZipFile(zip_path, 'r') as zipped:
        # Directory entries carry no image data, so leave them out.
        entries = [e for e in zipped.filelist if not e.filename.endswith('/')]
        # Sort by name length first, then by name.
        entries.sort(key=lambda e: (len(e.filename), e.filename))
        for entry in entries:
            data = zipped.read(entry)
            logger.info("添加文件%s, 文件大小%sKB" % (entry.filename, len(data) // 1000))
            image_path = "%s/%s" % (image_dir, entry.filename)
            extension = os.path.splitext(entry.filename)[-1][1:]
            # Prefer the registered MIME type ("image/jpeg" for .jpg); fall
            # back to the extension-based guess for unknown suffixes.
            media_type = (mimetypes.guess_type(entry.filename)[0]
                          or "image/%s" % extension)
            img = epub.EpubItem(file_name=image_path,
                                media_type=media_type,
                                content=data)
            if not cover_done:
                cover_done = True
                book.set_cover('cover.jpg', data)

            page = epub.EpubHtml(title=entry.filename,
                                 file_name='%s%s.html' % (page_prefix, entry.filename))
            page.set_content(("<img src=\"%s\">" % image_path).encode())
            section_items.append(page)
            section_items.append(img)
            spine.append(page)
            book.add_item(page)
            book.add_item(img)
    return cover_done


def parse():
    """Convert the zip archive (or directory of zip archives) at ``source``
    into an EPUB written to ``out``, then exit the process.

    Uses the module globals ``source``, ``out`` and ``book``. When ``out`` is
    empty it defaults to the input's base name with an ``.epub`` suffix.
    Returns early (without exiting) when ``source`` does not exist; calls
    ``sys.exit()`` when the input is a file that is not a ``.zip`` and after
    the EPUB has been written.
    """
    global source, out, book
    if not os.path.exists(source):
        logger.error('%s 文件不存在!' % source)
        return
    logger.info('开始解析')

    # Display name only; all filesystem access below goes through ``source``
    # so the current working directory does not matter.
    filename = os.path.basename(os.path.normpath(source))
    if out == '':
        out = os.path.splitext(filename)[0] + '.epub'

    # Table of contents
    toc = [
        (epub.Section(filename), []),
    ]
    # Reading order
    spine = ['nav']
    set_cover = False

    if os.path.isdir(source):
        # Input is a directory of zip files
        logger.info('输入了文件夹 %s' % filename)
        # Build a filtered list instead of removing while iterating, which
        # would skip the entry following each removed one.
        filelist = [name for name in os.listdir(source)
                    if os.path.splitext(name.lower())[-1] == '.zip']
        # Sort by name length first, then by name.
        filelist.sort(key=lambda k: (len(k), k))
        for myzipfile in filelist:
            logger.info('输入了文件 %s' % myzipfile)
            myzipfilename = myzipfile.split('.')[0]
            toc.append((epub.Section(myzipfilename), []))
            set_cover = _add_zip_images(os.path.join(source, myzipfile),
                                        "images/%s" % myzipfilename,
                                        '%s_' % myzipfile,
                                        toc[-1][1], spine, set_cover)
    else:
        # Input is a single zip file
        logger.info('输入了文件 %s' % filename)
        if os.path.splitext(filename.lower())[-1] != '.zip':
            logger.error('不是zip文件')
            sys.exit()
        set_cover = _add_zip_images(source, "images", '',
                                    toc[-1][1], spine, set_cover)

    book.toc = toc

    # add navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # create spine
    book.spine = spine
    epub.write_epub(out, book)
    sys.exit()
def createEbook(grimoireData):
    """Write ``destinyGrimoire.epub`` from the grimoire data structure.

    One page is generated per card, grouped in the table of contents by
    theme and then by page.

    Args:
        grimoireData: Mapping with a ``"themes"`` list. Each theme has
            ``"themeName"`` and ``"pages"``; each page has ``"pageName"``
            and ``"cards"``; each card has ``"cardName"``, ``"cardIntro"``,
            ``"cardDescription"`` and an ``"image"`` mapping with
            ``"sourceImage"``, ``"regionXStart"``, ``"regionYStart"``,
            ``"regionWidth"`` and ``"regionHeight"``.
    """
    book = epub.EpubBook()
    book.set_identifier('destinyGrimoire')
    book.set_title('Destiny Grimoire')
    book.set_language('en')
    book.add_author('Bungie')
    with open('cover.jpg', 'rb') as cover_file:
        book.set_cover("cover.jpg", cover_file.read())

    style = '''
cardname {
    display: block;
    text-align: center;
    font-size:150%;
}
cardimage {
    float: left;
    margin-right: 5%;
    width: 40%;
    height: 40%;
}
cardintro {
    display: block;
    padding: 5%;
}
carddescription {}
container {
    width: 100%;
    clear: both;
}
'''
    default_css = epub.EpubItem(uid="style_default",
                                file_name="style/default.css",
                                media_type="text/css",
                                content=style)
    book.add_item(default_css)

    book.spine = ['nav']

    # Running card number, used to build unique file names.
    counter = 1
    tocSections = []

    for theme in grimoireData["themes"]:
        themePages = []
        for page in theme["pages"]:
            pageCards = []
            for card in page["cards"]:
                bookPage = epub.EpubHtml(
                    title=chapterTitle(card["cardName"]),
                    file_name=chapterPageFile(card["cardName"], counter),
                    lang='en',
                    content="")
                bookPage.add_item(default_css)

                imageBaseFileName = '%s_img' % (chapterBaseFileName(
                    card["cardName"], counter))
                imagePath = createCardImage(
                    imageBaseFileName,
                    'images/%s' % (os.path.basename(card["image"]["sourceImage"])),
                    card["image"]["regionXStart"],
                    card["image"]["regionYStart"],
                    card["image"]["regionWidth"],
                    card["image"]["regionHeight"])

                with open(imagePath, 'rb') as image_file:
                    book.add_item(
                        epub.EpubItem(uid=imageBaseFileName,
                                      file_name=imagePath,
                                      content=image_file.read()))

                # The opening tags must not contain a stray quote
                # (``<cardname">``), which made the markup malformed.
                bookPage.content = u'''
<cardname>%s</cardname>
<cardintro>%s</cardintro>
<container>
    <cardimage><img src="%s"/></cardimage>
    <carddescription>%s</carddescription>
</container>''' % (card["cardName"], safeValue(card["cardIntro"]),
                   imagePath, safeValue(card["cardDescription"]))

                book.add_item(bookPage)
                pageCards.append(bookPage)
                book.spine.append(bookPage)
                counter += 1

            themePages.append((epub.Section(page["pageName"]), tuple(pageCards)))

        tocSections.append((epub.Section(theme["themeName"]), tuple(themePages)))

    book.toc = tuple(tocSections)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    epub.write_epub('destinyGrimoire.epub', book)
async def createEpub(link, channel):
    """Download a story through ``reqJson`` and write it to ``Books/<title>.epub``.

    ``link + "1"`` is requested first to read the book metadata and the
    chapter count. Every chapter is then fetched by ``worker`` in its own
    thread, and finally ``link`` itself is requested for the introduction
    page.

    Args:
        link: Base URL; a chapter number is appended to request a chapter.
        channel: Not used by this function.
    """
    chapter = reqJson(link + "1")
    print(link + "1")

    book = epub.EpubBook()

    # set metadata
    book.set_identifier(chapter['info']['urlId'])
    book.set_title(chapter['info']['title'])
    book.set_language('en')
    book.add_author(chapter['info']['author'])

    # Fetch all chapters concurrently; each worker stores its result in
    # ``chapters`` under the chapter id.
    # NOTE(review): joining threads blocks the event loop while waiting.
    chapters = {}
    threads = []
    for i in range(1, int(chapter['info']['chapters']) + 1):
        t = threading.Thread(target=worker, args=(book, i, chapters, link))
        threads.append(t)
        t.start()
    for thread in threads:
        thread.join()

    # Threads finish in arbitrary order, so order the chapters by key once.
    chapters = collections.OrderedDict(sorted(chapters.items()))
    ordered_chapters = list(chapters.values())
    for c in ordered_chapters:
        print(c.title)
        book.add_item(c)

    print("requesting intro_page")
    intro_page = reqJson(link)
    intro = epub.EpubHtml(title='Introduction',
                          file_name='introduction' + '.xhtml',
                          lang='hr')
    intro.content = """
    <html>
    <head>
        <title>Introduction</title>
        <link rel="stylesheet" href="style/main.css" type="text/css" />
    </head>
    <body>
        <h1>%s</h1>
        <p><b>By: %s</b></p>
        <p>%s</p>
    </body>
    </html>
    """ % (intro_page['title'], intro_page['author'], intro_page['desc'])
    book.add_item(intro)

    # define Table Of Contents
    book.toc = (epub.Link('introduction.xhtml', 'Introduction', 'intro'),
                (epub.Section('rest of the beautiful owl'), ordered_chapters))

    # add default NCX file
    book.add_item(epub.EpubNcx())

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav",
                            file_name="style/nav.css",
                            media_type="text/css",
                            content=style)

    # add CSS file
    book.add_item(nav_css)

    # Stylesheet referenced by the intro page and the nav page. It has to be
    # added to the book, otherwise "style/main.css" is missing from the file.
    with open("style.css") as style_file:
        doc_style = epub.EpubItem(uid="doc_style",
                                  file_name="style/main.css",
                                  media_type="text/css",
                                  content=style_file.read())
    book.add_item(doc_style)

    # A book may only carry one navigation document, so this styled one is
    # the only EpubNav that gets added.
    nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
    nav_page.add_item(doc_style)
    book.add_item(nav_page)

    # basic spine
    book.spine = [intro, nav_page] + ordered_chapters

    file_title = intro_page['title'].replace('/', '_')
    print("creating book with name: " + file_title + '.epub')
    os.makedirs('Books', exist_ok=True)
    epub.write_epub("Books/" + file_title + '.epub', book, {})
book.add_author(author) header_name = os.path.join( savePath, list( filter( re.compile(r'.+header\..+').match, os.listdir(savePath)))[0]) book.set_cover("cover.jpg", open(header_name, 'rb').read()) book.toc = [] book.spine = ['cover'] # create about page chapter = epub.EpubHtml(title='about', file_name='about.xhtml') chapter.content = f'<h1>About</h1><p>Title: {name}</p><p>Author: {book.metadata["http://purl.org/dc/elements/1.1/"]["creator"][0][0]}</p><p>Source: <a href="{"https://tapas.io/series/" + urlName}">{"https://tapas.io/series/" + urlName}</a></p>' book.add_item(chapter) book.spine.append(chapter) # Append nav page book.spine.append('nav') # create chapters for pageCount, pageData in enumerate(data): printLine( 'Downloaded page {}/{}...'.format(pageCount + 1, len(data)), True) pagePq = pq(
from ebooklib import epub

# Minimal ebooklib example: a one-chapter book written to test.epub.
book = epub.EpubBook()
book.set_identifier('test123')
book.set_title('Test book')
book.set_language('zh-TW')

c1 = epub.EpubHtml(title='Chapter01', file_name='chap_01.xhtml', lang='zh-TW')
# The chapter body lives in ``content``; assigning to any other attribute
# name leaves the chapter empty.
c1.content = u'<h1>Chapter01</h1><p>This is chapter 01 for test</p>'
book.add_item(c1)

# Table of contents: a direct link plus a section holding the chapter.
book.toc = (epub.Link('chap_01.xhtml', 'Chapter01', 'ch01'),
            (epub.Section('Simple Book'), (c1, )))

book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav",
                        file_name="style/nav.css",
                        media_type="text/css",
                        content=style)
book.add_item(nav_css)

book.spine = ['nav', c1]
epub.write_epub('test.epub', book, {})
def get_book(initial_url):
    """Download a Wattpad story and save it as an EPUB under ``./books/``.

    The story page at ``initial_url`` is scraped for author, title,
    description, cover and tags; each listed part is fetched through
    ``get_chapter``. When the target file already exists nothing is written.

    Args:
        initial_url: URL of the story's main page.

    Returns:
        Path of the EPUB file, whether it was just written or already existed.
    """
    base_url = 'http://www.wattpad.com'
    html = get_html(initial_url)

    # Get basic book information
    author = html.select(
        'div.author-info:nth-child(1) > div:nth-child(2) > a')[0].get_text()
    title = html.select('.story-info__title')[0].get_text().strip()
    description = html.select('.description-text')[0].get_text()
    coverurl = html.select('.story-cover > img')[0]['src']
    labels = ['Wattpad'] + [
        label.get_text()
        for label in html.select('div.tags a')
        if '/' in label['href']
    ]

    if debug:
        print("Author: " + author)
        print("Title: " + title)
        print("Description: " + description)
        print("Cover: " + coverurl)
        print("Labels:" + " ".join(labels))

    print("'{}' by {}".format(title, author).encode("utf-8"))

    # Get list of chapters
    chapterlist = html.select('.story-parts')[0].select('ul:nth-child(1) li a')

    # Remove from the file name those characters that Microsoft does NOT allow.
    # This also affects the FAT filesystem used on most phone/tablet sdcards
    # and other devices used to read epub files.
    # Disallowed characters: \/:*?"<>|^
    filename = title
    for forbidden in ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '^']:
        filename = filename.replace(forbidden, '')
    # Apple products disallow files starting with dot
    filename = filename.lstrip('.')

    epubfile = "./books/{} - {}.epub".format(filename, author)
    if os.path.exists(epubfile):
        print("Epub file already exists, not updating")
        return epubfile

    book = epub.EpubBook()
    book.set_identifier("wattpad.com//%s/%s" % (initial_url.split('/')[-1],
                                                len(chapterlist)))
    book.set_title(title)
    book.add_author(author)
    book.set_language('en')
    for label in labels:
        book.add_metadata('DC', 'subject', label)

    # Add a cover if it's available. ``cover`` must be initialised here,
    # otherwise the spine code below fails when no cover was downloaded.
    cover = False
    if get_cover(coverurl):
        cover = True
        with open('cover.jpg', 'rb') as cover_file:
            book.set_cover(file_name='cover.jpg',
                           content=cover_file.read(),
                           create_page=True)
        os.remove('cover.jpg')

    # Define CSS style
    css_path = os.path.join("./utils/css", "nav.css")
    with open(css_path) as css_file:
        nav_css = epub.EpubItem(uid="style_nav",
                                file_name="Style/nav.css",
                                media_type="text/css",
                                content=css_file.read())

    css_path = os.path.join("./utils/css", "body.css")
    with open(css_path) as css_file:
        body_css = epub.EpubItem(uid="style_body",
                                 file_name="Style/body.css",
                                 media_type="text/css",
                                 content=css_file.read())

    # Add CSS file
    book.add_item(nav_css)
    book.add_item(body_css)

    # Introduction
    intro_ch = epub.EpubHtml(title='Introduction', file_name='intro.xhtml')
    intro_ch.add_item(body_css)
    template_path = os.path.join("./utils/html", "intro.xhtml")
    with open(template_path) as template_file:
        intro_template = Template(template_file.read())
    intro_ch.content = intro_template.substitute(title=title,
                                                 author=author,
                                                 url=initial_url,
                                                 synopsis=description)
    book.add_item(intro_ch)

    allchapters = []
    # ``i`` is the 1-based position in the part list, skipped parts included.
    for i, item in enumerate(chapterlist, start=1):
        chaptertitle = item.get_text().strip().replace("/", "-")
        if chaptertitle.upper() != "A-N":
            print("Working on: {}".format(chaptertitle).encode("utf-8"))
            chapter = get_chapter("{}{}".format(base_url, item['href']), i)
            book.add_item(chapter)
            allchapters.append(chapter)

    # Define Table of Contents
    book.toc = (epub.Link('intro.xhtml', 'Introduction', 'intro'),
                (epub.Section('Chapters'), allchapters))

    # Add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Basic spine
    myspine = []
    if cover:
        myspine.append('cover')
    myspine.extend([intro_ch, 'nav'])
    myspine.extend(allchapters)
    book.spine = myspine

    # Write the epub to file
    epub.write_epub(epubfile, book, {})
    return epubfile
# Turn every entry of chapter_d into an EPUB chapter, then write the book.
ctr = 0
for ctr, (_, entry) in enumerate(chapter_d.items(), start=1):
    ch_name = entry['chapter']
    book = entry['book']

    # name and roman numeral?
    words = ch_name.split(' ')
    name = words[0].lower().encode()
    rest = words[-1]
    if not (name in names_b and rest in roman_numerals):
        # Otherwise the whole chapter name is the lookup key, index 0.
        name = ch_name.lower().encode()
        chapter_ct = 0
    else:
        chapter_ct = rom_d[rest]

    new_ch_name = ch_name + ' ' + entry['book']
    this_link = new_ch_name.replace(' ', '_') + '.html'
    this_chapter = epub.EpubHtml(title=new_ch_name,
                                 file_name=this_link,
                                 lang='en',
                                 uid=str(ctr))
    chapter_record = book_d[book][name][chapter_ct]
    this_chapter.set_content(chapter_record['content'])

    toc.append(epub.Link(this_link, ch_name))
    spine.append(this_chapter)
    AFWD.add_item(this_chapter)

AFWD.toc = toc
AFWD.spine = spine
AFWD.add_item(AFFC_css)
AFWD.add_item(epub.EpubNav())
epub.write_epub('A Feast with Dragons.epub', AFWD)
SM = SheetMusic(infile) random_filename = str(uuid.uuid4()) sheet = {'tabs':SM.tabs,\ 'title':SM.title,\ 'file':random_filename} sheets.append(sheet) sheets.sort(key=lambda x: x['title']) # sort by title ascending # see https://github.com/aerkalov/ebooklib for details book = epub.EpubBook() book.set_identifier(str(uuid.uuid4())) book.set_title('My Favourite Sheets') book.set_language('en') book.add_author('Your Name Here') book.spine = ['nav'] for s in sheets: c = epub.EpubHtml(title=s['title'], file_name=s['file'], lang='en') c.content = s['tabs'] book.add_item(c) book.spine = book.spine + [c] # add default NCX and Nav file book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) # write to the file epub.write_epub('my_favourite_sheets.epub', book, {})
def _create_toc(self):
    """
    Build the table of contents and the spine from the book version.

    Fills ``self.toc`` (keyed by TOC item id), ``self.spine`` and
    ``self.hold_chapters_urls``, and adds one EPUB page per chapter to
    ``self.epub_book``. Chapters whose content cannot be parsed are logged
    and left out.

    :Args:
      - self (:class:`ExportBook`): current class instance
    """
    self.toc = OrderedDict()
    self.spine = ['nav']
    self.hold_chapters_urls = [
        held.url_title for held in self.book_version.get_hold_chapters()
    ]

    for toc_item in self.book_version.get_toc():
        if not toc_item.chapter:
            # Plain section heading: no page of its own.
            section = epub.Section(toc_item.name)
            if toc_item.parent:
                self.toc[toc_item.parent.id][1].append(section)
            else:
                self.toc[toc_item.id] = [section, []]
            continue

        page = epub.EpubHtml(
            title=toc_item.chapter.title,
            file_name='%s.xhtml' % (toc_item.chapter.url_title, ))

        # hook for some extra customizations
        raw_content = self._chapter_content_hook(toc_item.chapter.content)

        try:
            tree = parse_html_string(raw_content.encode('utf-8'))
        except Exception as err:
            logger.error('Error parsing chapter content %s' % err)
            continue

        # hook for some extra customizations
        self._chapter_tree_hook(tree)

        for element in tree.iter():
            self._handle_chapter_element(element)

        page.content = etree.tostring(tree,
                                      pretty_print=True,
                                      encoding='utf-8',
                                      xml_declaration=True)

        # hook for some extra customizations
        self._epub_chapter_hook(page)

        self.epub_book.add_item(page)
        self.spine.append(page)

        # Children go into their parent's list; top-level chapters get
        # either a [page, children] pair or the bare page.
        if toc_item.parent:
            self.toc[toc_item.parent.id][1].append(page)
        elif toc_item.has_children():
            self.toc[toc_item.id] = [page, []]
        else:
            self.toc[toc_item.id] = page
def get_book(initial_url):
    """Download a Wattpad story and save it as an EPUB in the current directory.

    The story page at ``initial_url`` is scraped for author, title,
    description, cover and tags; each entry of the ``/parts`` listing is
    fetched through ``get_chapter``. The output file is named
    ``"<title> - <author>.epub"``; when it already exists nothing is written.

    Args:
        initial_url: URL of the story's main page.
    """
    base_url = 'http://www.wattpad.com'
    html = get_html(initial_url)

    # Get basic book information
    author = html.select('div.author-info strong a')[0].get_text()
    title = html.select('h1')[0].get_text().strip()
    description = html.select('h2.description')[0].get_text()
    coverurl = html.select('div.cover.cover-lg img')[0]['src']
    labels = ['Wattpad'] + [
        label.get_text()
        for label in html.select('div.tags a')
        if '/' in label['href']
    ]

    if debug:
        print("Author: " + author)
        print("Title: " + title)
        print("Description: " + description)
        print("Cover: " + coverurl)
        print("Labels:" + " ".join(labels))

    print("'{}' by {}".format(title, author))

    # Get list of chapters
    chapterlist_url = "{}{}".format(initial_url, "/parts")
    chapterlist = get_html(chapterlist_url).select('ul.table-of-contents a')

    epubfile = "{} - {}.epub".format(title, author)
    if os.path.exists(epubfile):
        print("Epub file already exists, not updating")
        return

    book = epub.EpubBook()
    book.set_identifier("wattpad.com//%s/%s" % (initial_url.split('/')[-1],
                                                len(chapterlist)))
    book.set_title(title)
    book.add_author(author)
    book.set_language('en')
    for label in labels:
        book.add_metadata('DC', 'subject', label)

    # Add a cover if it's available. ``cover`` must be initialised here,
    # otherwise the spine code below fails when no cover was downloaded.
    cover = False
    if get_cover(coverurl):
        cover = True
        with open('cover.jpg', 'rb') as cover_file:
            book.set_cover(file_name='cover.jpg',
                           content=cover_file.read(),
                           create_page=True)
        os.remove('cover.jpg')

    # Define CSS style; files are read inside ``with`` so handles get closed.
    with open("CSS/nav.css") as css_file:
        nav_css = epub.EpubItem(uid="style_nav",
                                file_name="Style/nav.css",
                                media_type="text/css",
                                content=css_file.read())
    with open("CSS/body.css") as css_file:
        body_css = epub.EpubItem(uid="style_body",
                                 file_name="Style/body.css",
                                 media_type="text/css",
                                 content=css_file.read())

    # Add CSS file
    book.add_item(nav_css)
    book.add_item(body_css)

    # Introduction
    intro_ch = epub.EpubHtml(title='Introduction', file_name='intro.xhtml')
    intro_ch.add_item(body_css)
    with open("HTML/intro.xhtml") as template_file:
        intro_template = Template(template_file.read())
    intro_ch.content = intro_template.substitute(title=title,
                                                 author=author,
                                                 url=initial_url,
                                                 synopsis=description)
    book.add_item(intro_ch)

    allchapters = []
    for item in chapterlist:
        chaptertitle = item.get_text().strip().replace("/", "-")
        # Parts titled "A/N" (normalised to "A-N" above) are left out.
        if chaptertitle.upper() != "A-N":
            print("Working on: {}".format(chaptertitle))
            chapter = get_chapter("{}{}".format(base_url, item['href']))
            book.add_item(chapter)
            allchapters.append(chapter)

    # Define Table of Contents
    book.toc = (epub.Link('intro.xhtml', 'Introduction', 'intro'),
                (epub.Section('Chapters'), allchapters))

    # Add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Basic spine
    myspine = []
    if cover:
        myspine.append('cover')
    myspine.extend([intro_ch, 'nav'])
    myspine.extend(allchapters)
    book.spine = myspine

    # Write the epub to file
    epub.write_epub(epubfile, book, {})
def wxw():
    """Scrape a series index page and write one EPUB per selected book.

    Command-line options:
        -u / --url    index page slug appended to ``BASE_LINK``
        -b / --books  book numbers to export (default: all books found)

    Book headings are read from the ``<strong>`` elements of the index
    article body; chapter links are matched to books by the first number in
    the link. Each EPUB is named after the alphanumeric characters of its
    title.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('-u', '--url', default='desolate-era-index')
    ap.add_argument('-b', '--books', nargs='+', default=None)
    args = ap.parse_args()

    index_link = BASE_LINK + args.url
    index_req = Request(index_link, headers={'User-Agent': 'Mozilla/5.0'})
    index_soup = BeautifulSoup(urlopen(index_req).read(), 'html5lib')

    series_title = re.search(
        r'([^:–()])*\w',
        index_soup.find('h1', attrs={'class': 'entry-title'}).get_text()).group()
    raw_chapter_links = [
        a['href']
        for a in index_soup.select('div[itemprop=articleBody] a[href]')
    ]

    books = {}
    chapters = {}

    book_titles = index_soup.find(
        'div', attrs={'itemprop': 'articleBody'}).find_all('strong')
    for book in book_titles:
        book_number = re.search(r'^\w*\s\d+', book.get_text())
        if book_number is None:
            continue
        book_number = re.search(r'\d+', book_number.group()).group()
        if args.books is not None and book_number not in args.books:
            continue

        books[book_number] = epub.EpubBook()
        books[book_number].set_title('{} – {}'.format(series_title,
                                                     book.get_text()))
        books[book_number].set_identifier(uuid.uuid4().hex)
        books[book_number].set_language('en')
        chapters[book_number] = []

    for raw_chapter_link in raw_chapter_links:
        info = re.search(r'\w*-\d+', raw_chapter_link)
        if info is None:
            continue
        book_number = re.search(r'\d+', info.group()).group()
        if book_number not in books:
            continue

        chapter_req = Request(raw_chapter_link,
                              headers={'User-Agent': 'Mozilla/5.0'})
        chapter_soup = BeautifulSoup(urlopen(chapter_req).read(), 'html5lib')
        raw_chapter = chapter_soup.find('div', attrs={'itemprop': 'articleBody'})

        # Keep only the <p> elements that follow the first <hr> and precede
        # the second one.
        parsed_chapter = []
        hr = 0
        for line in raw_chapter if raw_chapter is not None else ():
            if line.name == 'hr':
                hr += 1
            elif hr == 1 and line.name == 'p':
                parsed_chapter.append(line.get_text())

        if not parsed_chapter:
            # Nothing usable on this page; skip it instead of crashing on
            # ``parsed_chapter[0]``. Still pause between requests.
            print('Skipping (no chapter text found)', raw_chapter_link)
            time.sleep(1)
            continue

        title_match = re.search(r'\w([^–:])*$', parsed_chapter[0])
        chapter_title = title_match.group() if title_match else parsed_chapter[0]
        chapter = epub.EpubHtml(
            title=chapter_title,
            file_name='{}.xhtml'.format(uuid.uuid4().hex),
            lang='en'
        )

        # Chapter Title
        parsed_chapter[0] = '<h1>{}</h1>'.format(chapter_title)
        chapter.content = '<br /><br />'.join(str(line) for line in parsed_chapter)

        books[book_number].add_item(chapter)
        books[book_number].toc += (epub.Link(chapter.file_name, chapter.title,
                                             uuid.uuid4().hex), )
        chapters[book_number].append(chapter)

        # Pause between requests.
        time.sleep(1)
        print('Finished parsing', raw_chapter_link)

    for book_number, book in books.items():
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        # The spine refers to the navigation document by its id, which is
        # the lowercase string 'nav'.
        book.spine = ['nav'] + chapters[book_number]

        # Stylesheet item shipped with the book; no page in this function
        # links to it.
        style = 'BODY {color: white;}'
        nav_css = epub.EpubItem(uid="style_nav",
                                file_name="style/nav.css",
                                media_type="text/css",
                                content=style)
        book.add_item(nav_css)

        epub.write_epub(
            '{}.epub'.format(''.join(c for c in book.title if c.isalnum())),
            book, {})


if __name__ == '__main__':
    wxw()
</head> <body>""" b = """</div></body> </html>""" #making nake for xhtml files title = BeautifulSoup(head, 'html.parser') title = str(title.get_text()) title = re.sub(r'[^A-Za-z0-9 ]+', '', title) content = a + head + para + b #writing in xhtml content = content.encode(encoding='UTF-8', errors='ignore') c = epub.EpubHtml(title=title, file_name=title + '.xhtml', lang='en') c.set_content(content) book.add_item(c) #for TOC table.append(c) #adding to spine important book.spine.append(c) # adding TOC book.toc = (epub.Link('nav.xhtml', titleOfBook, 'nav'), (epub.Section('book'), (table))) #Make Epub book.add_item(epub.EpubNcx())
book.add_author('Impey, Chris') book.spine = ['nav'] # Chapters will be appended later book.set_cover('cover.jpg', open('cover.jpg', 'rb').read()) # Collect all chapters and sections in this list book_toc = [] # Loop through all chapters for chapter_index, chapter in contents.iterrows(): chapter_title = '{}. {}'.format(chapter_index + 1, chapter['humanchapter']) chapter_file_name = 'chap_{0:02d}.xhtml'.format(chapter_index + 1) print('{}'.format(chapter_title)) # Set up en EPUB chapter, ... epub_chapter = epub.EpubHtml(title=chapter_title, file_name=chapter_file_name) # ... add it to the table of contents book_toc.append( epub.Link(href=chapter_file_name, title=chapter['humanchapter'], uid=chapter_file_name.split('.')[0])) # ... and compile the HTML contents for it doc, tag, text = Doc().tagtext() with tag('h1'): text(chapter_title) for section_index, section in chapter['sections'].iterrows(): section_title = '{}.{} {}'.format(chapter_index + 1, section_index + 1,
def worker(book, number, chapters, link):
    """Fetch one chapter and store it in ``chapters`` as an EPUB page.

    Args:
        book: Not used by this function.
        number: Chapter number; appended to ``link`` to build the request
            and used in the page file name.
        chapters: Mapping that receives the page under the ``chapterId``
            value returned for the chapter.
        link: Base URL of the story.
    """
    payload = reqJson(f"{link}{number}")
    page = epub.EpubHtml(
        title=payload['title'],
        file_name=f'chap_{number}.xhtml',
        lang='hr',
        content=payload['content'],
    )
    chapters[payload['chapterId']] = page
def write_epub(self,
               save_dir: Path = Path(SAVE_DIR),
               style: str = DEFAULT_CSS,
               use_cache: bool = USE_CACHE,
               language: str = LANGUAGE,
               cache_dir: Path = Path(CACHE_DIR),
               add_copyright_page=True,
               end_update=True,
               ):
    """Write the downloaded book to an EPUB file inside ``save_dir``.

    Books with fewer than ``MIN_CHAPTERS`` chapters, or with too many
    missing chapters, are appended to ``black_list_log`` and not written.

    Args:
        save_dir: Output directory; created when it does not exist.
        style: CSS text stored in the book as ``style/nav.css``.
        use_cache: Forwarded to ``_dump`` when the cache is updated.
        language: Book language; also selects the OpenCC conversion
            direction applied to titles and chapter text.
        cache_dir: Forwarded to ``_dump`` when the cache is updated.
        add_copyright_page: Append the copyright page after the chapters.
        end_update: Store ``self.book_data`` through ``_dump`` afterwards.
    """
    length = len(self.content)
    missing_number = len(self.missing_chapters)
    title = _default_cc.convert(self.book_data.title)

    # Reject books that are too short
    if length < MIN_CHAPTERS:
        black_list_log.append('"%s", # %s\n' % (self.book_data.url, title))
        logger.debug('《%s》过短。' % title if LANGUAGE in _simplified else '《%s》過短。' % title)
        return

    # Reject books with too many missing chapters
    # NOTE(review): the first clause is subsumed by ``missing_number >= 5``,
    # so the effective threshold is always 5 -- confirm the intent.
    if (length >= 200 and missing_number >= 10) or missing_number >= 5:
        black_list_log.append('"%s", # %s\n' % (self.book_data.url, title))
        logger.debug('《%s》一书缺失章节过多,达 %d 章。' % (title, len(self.missing_chapters))
                     if LANGUAGE in _simplified else
                     '《%s》一書缺失章節過多,達 %d 章。' % (title, len(self.missing_chapters)))
        logger.error('"%s", # %s\n' % (self.book_data.url, title))
        return

    if self.missing_chapters:
        missing_log.append('《%s》\n' % title)
        missing_log.extend([' - 第 %d 章《%s》- %s\n' % (i.number, i.title, i.url)
                            for i in self.missing_chapters])
        logger.warning('《%s》一书缺 %d 章。' % (title, missing_number)
                       if LANGUAGE in _simplified else
                       '《%s》一書缺 %d 章。' % (title, missing_number))

    book = epub.EpubBook()
    cc = OpenCC('t2s') if language in _simplified else OpenCC('s2t')

    # Book metadata
    book_data = self.book_data
    book.set_identifier(_gen_identifier_from_url(book_data.url))
    title = cc.convert(self.book_data.title)
    book.set_title(title)
    book.set_language(language)
    book.add_author(book_data.author)

    # "About this book" page
    detail = '\n'.join(['<p>%s</p>' % cc.convert(para) for para in self.book_data.detail])
    describe = '\n'.join(['<p>%s</p>' % cc.convert(para) for para in self.book_data.describe])
    about = epub.EpubHtml(title=cc.convert('关于本书'),
                          file_name='about.xhtml',
                          lang=language,
                          content='<p><h1>%s</h1></p>%s<p><h3>介绍</h3></p>%s' % (title, detail, describe))
    book.add_item(about)

    # Chapters, numbered from 0001
    for counter, chapter in enumerate(self.content, start=1):
        chapter_html = epub.EpubHtml(title=cc.convert(chapter.title),
                                     file_name='%04d' % counter + '.xhtml',
                                     lang=language,
                                     content=cc.convert(chapter.as_html()))
        book.add_item(chapter_html)

    if add_copyright_page:
        chapter_html = epub.EpubHtml(title=cc.convert('关于著作权'),
                                     file_name='copyright.xhtml',
                                     lang=language,
                                     content=cc.convert(_copyright_page.as_html()))
        book.add_item(chapter_html)

    # Content pages in insertion order. The exact type comparison is
    # deliberate: it leaves out subclasses such as the navigation document.
    pages = [i for i in book.items if type(i) == epub.EpubHtml]

    # Table of contents
    book.toc = pages

    # Ncx and Nav
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # CSS style
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)
    book.add_item(nav_css)

    # Spine. No cover is set on this book, so the spine must not reference
    # a 'cover' item that does not exist in the manifest.
    book.spine = ['nav', *pages]

    # Write the epub under a fixed temporary name, then move it into place
    save_dir.mkdir(parents=True, exist_ok=True)
    save_path = save_dir.cwd() / save_dir / ('%s - 至第 %d 章.epub' % (title, len(self.content)))
    epub.write_epub('writing.epub', book, {})
    shutil.move('./writing.epub', str(save_path))
    logger.debug('已生成《%s》一书。' % title if LANGUAGE in _simplified else '已生成《%s》一書。' % title)

    # Update the cached book information
    if end_update:
        _dump(identifier=_gen_identifier_from_url(book_data.url),
              some_obj=self.book_data,
              cache_dir=cache_dir,
              use_cache=use_cache)
encoding='utf-8') if __name__ == '__main__': book = epub.EpubBook() # add metadata book.set_identifier('sample123456') book.set_title('Sample book') book.set_language('en') book.add_author('Aleksandar Erkalovic') # intro chapter c1 = epub.EpubHtml(title='Introduction', file_name='intro.xhtml', lang='en') c1.content = u'<html><head></head><body><h1>Introduction</h1><p>Introduction paragraph <a class="test">with a link</a> where i explain what is happening.</p></body></html>' # about chapter c2 = epub.EpubHtml(title='About this book', file_name='about.xhtml') c2.content = '<h1>About this book</h1><p>Helou, this is my book! There are many books, but this one is mine.</p>' # add chapters to the book book.add_item(c1) book.add_item(c2) # create table of contents # - add section # - add auto created links to chapters
def create_epub(chapters: Mapping, title: str, basename: str, use_dnd_decorations: bool = False):
    """Bundle rendered HTML chapters into a single EPUB file.

    Parameters
    ==========
    chapters
      Mapping of chapter name to the HTML string used as that
      chapter's body. One document is created per entry, in the
      mapping's iteration order.
    title
      Title written into the book's metadata.
    basename
      Stem of the output file; the book is written to
      "{basename}.epub".
    use_dnd_decorations
      Passed through to the CSS template, which decides whether to
      emit the D&D-styled decorations.

    """
    ebook = epub.EpubBook()
    ebook.set_identifier("id123456")
    ebook.set_title(title)
    ebook.set_language("en")

    # Render the stylesheet from its template and register it
    stylesheet_template = jinja_env.get_template("dungeonsheets_epub.css")
    # Widths for dl lists, in 'em' units
    list_widths = {
        "character-details": 11,
        "combat-stats": 15,
        "proficiencies": 8.5,
        "faction": 6,
        "spellcasting": 12.5,
        "spell-slots": 8,
        "spell-details": 10,
        "beast-stats": 9,
        "feature-details": 5.5,
        "infusion-details": 8.5,
        "magic-item-details": 13.5,
        "monster-details": 15,
    }
    stylesheet = epub.EpubItem(
        uid="style_default",
        file_name="style/gm_sheet.css",
        media_type="text/css",
        content=stylesheet_template.render(
            use_dnd_decorations=use_dnd_decorations, dl_widths=list_widths
        ),
    )
    ebook.add_item(stylesheet)

    # Ship the paper background image that lives next to this module
    paper_path = Path(__file__).parent / "forms/paper-low-res.jpg"
    with open(paper_path, mode="rb") as paper_file:
        paper_bytes = paper_file.read()
    ebook.add_item(
        epub.EpubItem(
            file_name="images/paper.jpg",
            media_type="image/jpeg",
            content=paper_bytes,
        )
    )

    # One document per chapter; the TOC entry for each one is derived
    # from the chapter's HTML by toc_from_headings()
    contents = ["nav"]
    documents = []
    for heading, body in chapters.items():
        slug = heading.replace(" - ", "-").replace(" ", "_").lower()
        document_name = f"{slug}.html"
        document = epub.EpubHtml(
            title=heading,
            file_name=document_name,
            lang="en",
            media_type="application/xhtml+xml",
        )
        document.set_content(body)
        document.add_item(stylesheet)
        ebook.add_item(document)
        documents.append(document)
        contents.append(
            toc_from_headings(html=body, filename=document_name, chapter_title=heading)
        )

    ebook.toc = contents
    ebook.spine = ("nav", *documents)

    # Default NCX and Nav documents
    ebook.add_item(epub.EpubNcx())
    ebook.add_item(epub.EpubNav())

    # Write the result to disk
    epub.write_epub(f"{basename}.epub", ebook)
from ebooklib import epub
import time

# Introduction to the mobi format: https://www.cnblogs.com/buptzym/p/5249662.html

# Shared HTTP session; a browser-like User-Agent is sent with every request.
s = requests.Session()
s.headers = {
    "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# Table-of-contents entries collected while the book is assembled.
epubTOC = []
book = epub.EpubBook()

# Add the copyright notice as the first page of the book.
copyright = epub.EpubHtml(title="版权声明", file_name="copyright.html")
# The final paragraph is closed with </p>; it previously ended with a
# second opening <p>, leaving the markup unbalanced.
copyright.content = """<h1>版权声明</h1>
<p>本工具目的是将免费网络在线小说转换成方便kindle用户阅读的mobi电子书, 作品版权归原作者或网站所有, 请不要将该工具用于非法用途。</p>
<p>GitHub: https://github.com/fondoger/qidian2mobi</p>
"""
book.add_item(copyright)
epubTOC.append(epub.Link("copyright.html", "版权声明", "intro"))


def handle_url(url):
    """Return *url* with an explicit scheme.

    Protocol-relative URLs such as ``//example.com`` get ``http:``
    prepended; every other URL is returned unchanged.
    """
    if url.startswith('//'):
        return "http:" + url
    return url
def build(self):
    '''Build the issue, downloading articles if needed, and write the ebook.

    Every section is built first, then one XHTML document is created per
    article that has content. Articles without content are logged and
    skipped, and a section left with no articles is omitted from the
    table of contents. The book is written to "<self.id>.epub".
    '''
    self.fetch_issue()
    self.info()
    for s in self.sections:
        s.build(self.db)

    book = epub.EpubBook()

    # add metadata
    book.set_title(self.title)
    book.set_identifier(self.id)
    book.set_language(self.language)
    book.add_author(self.author)

    toc = []
    spine = []

    if self.cover_img:
        img = fetch(self.cover_img).content
        book.set_cover("image.jpg", img)
        spine.append('cover')

    spine.append('nav')

    # An image referenced by several articles must be packaged only once.
    added_images = set()

    # Sections
    for section in self.sections:
        items = []
        for article in section.articles:
            if not article.content:
                logging.error('%s could not be downloaded. Skipping.',
                              article.url)
                continue
            item = epub.EpubHtml(title=article.title,
                                 file_name='{}.xhtml'.format(article.title),
                                 lang=self.language)
            item.content = article.content
            # images were downloaded by the article and placed on disk
            # for reference. We now add them to the book.
            for filename in article.images:
                if filename in added_images:
                    continue
                added_images.add(filename)
                img = epub.EpubImage()
                img.file_name = filename
                with open(filename, 'rb') as f:
                    img.content = f.read()
                book.add_item(img)
            items.append(item)

        if not items:
            # Without this guard items[0] below raises IndexError when
            # none of the section's articles could be downloaded.
            logging.warning('Section %s has no downloadable articles. '
                            'Skipping.', section.title)
            continue

        for item in items:
            book.add_item(item)
        # The section heading links to the section's first article.
        toc.append((epub.Section(section.title, href=items[0].file_name),
                    items))
        spine.extend(items)

    book.toc = toc
    book.spine = spine

    # add navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # create epub file
    epub.write_epub('{}.epub'.format(self.id), book, {})
def getDataForEbook(url):
    """
    Build an EPUB from the index page of an O'Reilly online book.

    For now the url must point at the book's index page. The page is
    fetched and parsed for the title, the authors and the table of
    contents; the TOC is saved as TOC.html in the book's download folder.
    Every preface/chapter link found in the TOC is then read from that
    folder if a file with the same name is already there, or downloaded
    otherwise, and turned into one chapter. Finally everything is written
    to DOWNLOADS_PATH/<title>.epub.

    Local variables:
        chapters            list of epub.EpubHtml, one per chapter, in
                            TOC order; used as both TOC and spine
        authors             list of author names
        links               list of chapter hrefs taken from the TOC
        book                dict holding "Title", "Authors" and "TOC"
        book_slug           slugified url, used as folder name
        book_download_path  download folder for this book
        eBook               the epub.EpubBook being assembled

    Note: print calls below take a single pre-formatted argument so the
    emitted text is the same whether print is a statement or a function.
    """
    # Starts empty: a placeholder entry here would end up in the spine
    # as an itemref with an empty idref.
    chapters = []
    authors = []
    links = []
    book = {}

    # first drop "http[s]://" and "index.html", if present:
    simplified_url = url.split('://')[-1].split('index.html')[0]
    if VERBOSE:
        print('simplified url: %s' % simplified_url)

    # slugify needs unicode, so convert here
    book_slug = slugify(unicode(simplified_url, "utf-8"))
    book_download_path = os.path.join(DOWNLOADS_PATH, book_slug)

    # create the book folder (and any missing parent) if not present
    if not os.path.isdir(book_download_path):
        os.makedirs(book_download_path)
        if VERBOSE:
            print('CREATING book_download_path ({})'.format(book_download_path))

    # the ebook being assembled
    eBook = epub.EpubBook()

    # fetch the index page and parse it
    resp = get_page(url)
    soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8")

    # url_root is the root of the book, where the table of contents
    # (the links to all the chapters) lives
    url_root = url[:url.index("index")]

    # the book title is expected in an h1 with class "title"
    book["Title"] = soup.find('h1', class_="title").getText()

    # collect the authors for the metadata
    for author in soup.find_all("h3", class_="author"):
        authors.append(author.getText())
    book["Authors"] = authors

    # keep the whole "table of contents" section and save it as TOC.html
    toc_div = soup.find('div', class_="toc")
    book["TOC"] = str(toc_div)
    with open(os.path.join(book_download_path, "TOC.html"), "w") as text_file:
        text_file.write("<!-- " + book["Title"] + " -->\n")
        text_file.write(book["TOC"])

    # Chapter links: skip in-page anchors ('#') and keep only hrefs that
    # follow the O'Reilly naming ('pr' for prefaces, 'ch' for chapters).
    # A single test keeps an href matching both from being added twice.
    for link in toc_div.find_all('a', href=True):
        href = link['href']
        if "#" in href:
            continue
        if 'pr' in href or 'ch' in href:
            links.append(href)

    # set up the metadata
    eBook.set_identifier(book["Title"])
    eBook.set_title(book["Title"])
    eBook.set_language(LANGUAGE)
    for author in book["Authors"]:
        eBook.add_author(author)

    # files already present in the download folder are not fetched again
    f_ = os.listdir(book_download_path)
    for link in links:
        if link in f_:
            print("Local file found: %s" % link)
            with open(os.path.join(book_download_path, link), "r") as text_file:
                resp = text_file.read()
        else:
            print("Downloading file: %s" % link)
            resp = get_page(url_root + link)

        chapter_soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8")
        # the chapter title is an h1 with class "title", or an h2 when
        # the page has no such h1
        heading = chapter_soup.find('h1', class_="title")
        if heading is None:
            heading = chapter_soup.find('h2', class_="title")
        c = epub.EpubHtml(title=heading.getText(), file_name=link, lang='en')
        c.content = createChapter(url_root, link, book_download_path, resp)
        chapters.append(c)
        eBook.add_item(c)

    eBook.toc = chapters
    eBook.add_item(epub.EpubNcx())
    eBook.add_item(epub.EpubNav())

    # define css style
    style = ""
    with open(os.path.join(STYLE_PATH, STYLE), "r") as text_file:
        style = text_file.read()
    if VERBOSE:
        print("Applying style %s" % STYLE)

    # add css file
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)
    eBook.add_item(nav_css)

    # create spine
    eBook.spine = chapters

    started_at = time.time()
    if VERBOSE:
        print("Starting book creation...")

    # create epub file
    epub_path = os.path.join(DOWNLOADS_PATH, book["Title"] + '.epub')
    epub.write_epub(epub_path, eBook, {})
    print("Done, %s created!" % epub_path)
    print("Time elapsed %s" % (time.time() - started_at))
def process_message(self, message_json: str) -> bool:
    """Bind a user's saved articles into an EPUB and queue it for delivery.

    Args:
        message_json: JSON text parsed with ``BindEBookMessage.from_json``.

    Returns:
        True when the message is finished with: either the ebook was
        stored and a follow-up message was sent, or the user had no
        articles to bind. False when the user could not be fetched, the
        ebook could not be written or uploaded, or the ebook record
        could not be saved.
    """
    self.logger.debug(f"processing message {message_json}")

    # parse the message
    bind_ebook_msg = BindEBookMessage.from_json(message_json)

    # fetch the user record
    user = self.user_repository.get(bind_ebook_msg.user_id)
    if user is None:
        self.logger.error(
            f"couldn't fetch user with id {bind_ebook_msg.user_id}")
        return False

    # fetch the articles for the user
    articles = self.article_repository.get_all(user.user_id)
    if not articles:
        # we want to exit early, but this is not an exceptional case, so
        # we return True so that the consumer deletes the message
        self.logger.info(
            f"no articles ready for binding for user {user.user_id}")
        return True

    # create the ebook model
    ebook_model = EBook(user.user_id)

    # create an ebooklib ebook
    ebook = epub.EpubBook()
    chapters = []
    related_items = []

    for i, article in enumerate(articles):
        # fetch the article content from the file repository
        article_content = self.file_repository.get(article.content_key)
        if article_content is None:
            self.logger.error(
                f"couldn't fetch the saved content for article {article.article_id}"
            )
            continue

        # add the article ID to the ebook model
        ebook_model.article_ids.append(article.article_id)

        # create an ebooklib chapter holding the article content
        chapter = epub.EpubHtml(title=article.title,
                                file_name=f"chapter_{i}.xhtml",
                                lang="en")
        chapter.set_content(article_content.read())

        for j, related_content in enumerate(article.related_content):
            # fetch the related content from the file repository
            item_content = self.file_repository.get(
                related_content.content_key)
            if item_content is None:
                self.logger.error(
                    f"couldn't fetch the saved related content for article {article.article_id} and related content {related_content.content_key}"
                )
                continue

            # The uid carries both indexes: with the article index alone,
            # every related item of one article shared the same uid.
            related_item = epub.EpubItem(
                uid=f"related_item{i}_{j}",
                file_name=related_content.content_key,
                media_type=related_content.mime_type,
                content=item_content.read(),
            )
            related_items.append(related_item)

        chapters.append(chapter)

    # add the chapters to the ebook
    for chapter in chapters:
        ebook.add_item(chapter)

    # add the related items to the ebook as linked content
    for item in related_items:
        ebook.add_item(item)

    # add ebook metadata
    ebook.set_identifier("")  # TODO: Find a value for this
    ebook.set_title("")  # TODO: Find a value for this
    ebook.add_author("")  # TODO: Find a value for this
    ebook.set_language("en")

    # create the ebook nav structure
    ebook.spine = chapters
    ebook.toc = chapters

    epub_path = Path.cwd() / f"{ebook_model.ebook_id}.epub"
    content_key = f"{ebook_model.user_id}/books/{ebook_model.ebook_id}.epub"
    try:
        # render the ebook and write it to a local file
        epub.write_epub(str(epub_path), ebook)

        # upload the local file to the file repository
        with epub_path.open(mode="rb") as f:
            if not self.file_repository.put(content_key, f):
                # no exception is active here, so log a plain error
                self.logger.error("unable to push ebook content to S3")
                return False
    except Exception:
        self.logger.exception("unable to write ebook to local file store")
        return False
    finally:
        # remove the temporary ePub file; it may not exist when
        # write_epub failed before creating it
        try:
            epub_path.unlink()
        except FileNotFoundError:
            pass

    ebook_model.content_key = content_key

    # persist the ebook model
    if not self.ebook_repository.put(ebook_model):
        self.logger.error(
            f"unable to write ebook record to Dynamo for user {user.user_id}"
        )
        return False

    # hand the ebook to the converter or straight to delivery,
    # depending on the user's preference
    if user.prefer_kindle:
        self.converter_queue_producer.send_message(
            ConvertEBookMessage(ebook_model.user_id,
                                ebook_model.ebook_id).to_json())
    else:
        self.postmaster_queue_producer.send_message(
            DeliverEBookMessage(ebook_model.user_id,
                                ebook_model.ebook_id).to_json())

    return True
except: # exception is also thrown if directory already exists print('Something happened while creating pic cache folder. Not necessary a problem.') print('** {} **\n\n{} posts\n{} pages'.format(info['name'], info['posts'], info['pages'])) # start creating book book = epub.EpubBook() book.set_title(info['title']) book.add_author(info['title']) book.add_author('Tumblr2book') book.set_language('en') # general info chapter introchapter = epub.EpubHtml(file_name='intro.xhtml') introchapter.content = ''' <h1> {} </h1> <p> <a href="{}"> {} </a> </p> <p> {} </p> <p> {} posts </p> <p> Blog last updated {} </p> <p> Scraped {} </p> {} '''.format(info['title'], info['url'], info['url'], info['description'], info['posts'], info['updated'], time.ctime(), di_warning) book.add_item(introchapter) # introducing templates template_names = [ # parts of posts 'header', 'picture', 'chatphrase',
def posts_epub_link(posts):
    """Write one chapter per post to test.epub and return its link.

    Each post contributes a chapter titled with ``post.title`` and stored
    as "<post.slug>.xhtml"; the chapter body is currently a fixed
    placeholder. Returns the string "/test.epub".
    """
    book = epub.EpubBook()

    # add metadata
    book.set_title('Articles de Vincent Jousse')
    book.set_language('fr')
    book.add_author('Vincent Jousse')

    chapters = []
    for post in posts:
        print(post.title)
        chapter = epub.EpubHtml(title=post.title,
                                file_name='%s.xhtml' % post.slug,
                                lang='fr')
        chapter.content = u'<html><head></head><body><h1>Introduction</h1><p>Voici une belle introduction.</p></body></html>'
        book.add_item(chapter)
        chapters.append(chapter)

    # add navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define css style
    style = '''
@namespace epub "http://www.idpf.org/2007/ops";

body {
    font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
}

h2 {
    text-align: left;
    text-transform: uppercase;
    font-weight: 200;
}

ol {
    list-style-type: none;
}

ol > li:first-child {
    margin-top: 0.3em;
}

nav[epub|type~='toc'] > ol > li > ol {
    list-style-type:square;
}

nav[epub|type~='toc'] > ol > li > ol > li {
    margin-top: 0.3em;
}
'''

    # add css file
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)
    book.add_item(nav_css)

    # Spine: every chapter, in order. Using the loop variable here put
    # only the last post in the spine and failed when posts was empty.
    book.spine = ['nav'] + chapters

    # create epub file
    epub.write_epub('test.epub', book, {})

    return "/test.epub"
def create_epub(work):
    """Write *work* to "<work.title>.epub".

    The book gets a title page followed by one document per entry of
    ``work.chapters``. A chapter with an ``image_url`` has that image
    fetched (over HTTP when the URL contains "http", otherwise read from
    the local path) and placed at the top of the chapter; the image of
    chapter number 1 is also used as the book cover.
    """
    book = epub.EpubBook()

    # set metadata
    book.set_identifier(str(work.id))
    book.set_title(work.title)
    book.set_language('en')
    book.add_metadata('DC', 'description', work.work_summary)
    book.add_author(work.user.username)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    title_page = epub.EpubHtml(title=work.title,
                               file_name='title_page.xhtml',
                               lang='en')
    content_string = ('<center><h1>' + work.work_summary + '</h1><br/><h2>' +
                      work.user.username + '</h2>' + '<br/>Word Count: ' +
                      str(work.word_count) + '</center>')
    title_page.content = content_string.encode('utf8')
    book.add_item(title_page)
    book.toc.append(epub.Link('title_page.xhtml', 'Title Page', ''))

    # Reading order; previously no spine was set at all.
    spine = ['nav', title_page]

    for chapter in work.chapters:
        new_chapter = epub.EpubHtml(title=chapter.title,
                                    file_name=chapter.title + '.xhtml',
                                    lang='en')
        # The body is assembled as text and assigned once, rather than
        # appended onto the item's default content.
        content_parts = []

        if chapter.image_url:
            if 'http' in chapter.image_url:
                image = requests.get(chapter.image_url).content
            else:
                with open(chapter.image_url, 'rb') as image_file:
                    image = image_file.read()

            image_string = "chapter_" + str(chapter.number) + ".jpg"
            # uid derived from the chapter number so that images of
            # different chapters do not share one manifest id
            image_item = epub.EpubItem(uid="img_" + str(chapter.number),
                                       file_name=image_string,
                                       media_type="image/jpeg",
                                       content=image)
            if chapter.number == 1:
                # set_cover registers the image under image_string itself,
                # so adding image_item too would write the file twice
                book.set_cover(image_string, image)
            else:
                book.add_item(image_item)
            new_chapter.add_item(image_item)
            content_parts.append("<img src='" + image_string + "'/>")

        content_parts.append("<br/><br/><br/>")
        content_parts.append(chapter.text)
        new_chapter.content = "".join(content_parts)

        book.add_item(new_chapter)
        spine.append(new_chapter)
        book.toc.append(
            epub.Link(chapter.title + '.xhtml', chapter.title,
                      chapter.summary))

    # define CSS style
    style = 'BODY {color: white;}'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)

    # add CSS file
    book.add_item(nav_css)

    # basic spine
    book.spine = spine

    # write to the file
    epub.write_epub(work.title + '.epub', book, {})
def scrape_books(book_links):
    """Scrape every linked book and save it as an EPUB in its category folder.

    ``book_links`` maps a category name to a mapping of book title to
    link. For each category a folder of that name is created if needed;
    for each book the page is loaded in the browser, everything before
    the 'Home Page</a>' marker is kept as the single chapter, and the
    result is written to "<category>/<savename>.epub".
    """
    # Defined at this level because the book loop below needs it; it was
    # previously a local of create_epub() and out of reach there.
    style = '''
body, table {
    margin-left: 100px;
    margin-right: 100px;
    font-size: 28px !important;
    font: 28px Times New Roman !important;
    font-weight: 400 !important;
}
'''

    def create_epub(identifier, title, author):
        """Return a new EpubBook carrying the given metadata."""
        eb = epub.EpubBook()
        eb.set_identifier(f"{identifier}")
        eb.set_title(f"{title}")
        eb.set_language('en')
        eb.add_author(f"{author}")
        return eb

    for category, values in book_links.items():
        os.makedirs(category, exist_ok=True)
        print(f"Scraping {category} category. . .")

        for book, link in tqdm(values.items()):
            browser.get(link)
            soup = BeautifulSoup(browser.page_source, 'lxml')
            author = soup.find('center').text
            savename = re.sub(r'\W+', ' ', book)
            altsavename = re.sub(r'\W+', '', book)

            # Keep the page up to the 'Home Page</a>' marker. When the
            # marker is missing find() returns -1, which must not be
            # used as a slice bound (it would drop the last character).
            eb_content = browser.page_source
            cork = eb_content.find('Home Page</a>')
            if cork != -1:
                eb_content = eb_content[:cork]

            # creating EPUB
            eb = create_epub(savename, book, author)

            # Writing contents to EPUB
            # NOTE(review): the chapter language is 'hr' while the book
            # is 'en' — confirm which one is intended.
            chapter = epub.EpubHtml(title=f"{book}",
                                    file_name='chap_01.xhtml',
                                    lang='hr')
            chapter.content = eb_content
            eb.add_item(chapter)

            nav_css = epub.EpubItem(uid="style_nav",
                                    file_name="style/nav.css",
                                    media_type="text/css",
                                    content=style)
            eb.add_item(nav_css)

            # The spine references 'nav', so the navigation documents
            # have to be part of the book.
            eb.add_item(epub.EpubNcx())
            eb.add_item(epub.EpubNav())
            eb.spine = ['nav', chapter]

            epub_path = os.path.join(category, f"{savename}.epub")
            epub.write_epub(epub_path, eb, {})

            # Best-effort PDF export and renaming; any failure is ignored
            # so the remaining books are still scraped.
            # NOTE(review): `pdf` is not defined in this function, and the
            # EPUB was written into the category folder rather than the
            # working directory — confirm this step still does anything.
            try:
                pdf.output(name=f'{savename}.pdf', dest='F')
                new_pdf_path = os.path.join(category, f'{savename}.pdf')
                new_epub_path = os.path.join(category, f"{altsavename}.epub")
                shutil.move(src=f'{savename}.pdf', dst=new_pdf_path)
                shutil.move(src=f'{savename}.epub', dst=new_epub_path)
            except Exception:
                pass
def gen_epub(  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements, too-many-branches  # noqa: C901
    infile: str,
    outfile: Optional[str] = None,
    title: Optional[str] = None,
    start: int = 1,
    pairs: int = 1000,
    chapters: int = 10,
    debug: bool = False,
) -> Optional[str]:
    # fmt: on
    """Generate an epub from the translation units of a tmx file.

    Args:
        infile: path of the tmx file to read.
        outfile: path of the epub to write; defaults to *infile* with
            its extension replaced by ".epub".
        title: book title; defaults to the file name of *infile*.
        start: 1-based index of the first unit to use (values below 1
            are treated as 1).
        pairs: number of units per chapter (negative values fall back
            to 1000).
        chapters: maximum number of chapters (negative values fall back
            to 1000).
        debug: log at debug level (10) instead of info level (20).

    Returns:
        The path of the written epub.

    Raises:
        SystemExit: *infile* is not a file, or no first element can be
            read from it.
    """
    if debug:
        logzero.loglevel(10)
    else:
        logzero.loglevel(20)

    if not Path(infile).is_file():
        logger.error(" [%s] is not a file or does not exist, exiting...", infile)
        raise SystemExit(1)

    if outfile is None:
        _ = Path(infile).absolute().parent
        stem = Path(infile).absolute().stem
        outfile = str(_ / f"{stem}.epub")

    if title is None:
        title = Path(infile).name

    if start < 1:
        start = 1
    start = start - 1  # number of leading units to skip
    if pairs < 0:
        pairs = 1000
    if chapters < 0:
        chapters = 1000

    # make sure at least one element can be read before doing real work
    try:
        next(xml_iter(infile))
    except Exception as exc:
        logger.error(" file [%s] maybe not a valid tmx file: %s", infile, exc)
        raise SystemExit(1)

    xml_g = xml_iter(infile)

    # Skip the units before `start`; show a progress bar for long skips.
    # trange(start) is used because an int cannot be wrapped by tqdm().
    skip_iter = trange(start) if start > 5000 else range(start)
    for _ in skip_iter:
        next(xml_g)

    conn = " "  # separator placed between the pairs of one chapter
    chp_cont = []
    ct_ = []
    try:
        for _ch in range(chapters):
            ct_ = []
            pair_iter = trange(pairs) if pairs > 10000 else range(pairs)
            for _ in pair_iter:
                el_ = next(xml_g)
                tuv = [pq(elm).html() for elm in pq(el_)("tuv")]
                # indent the second tuv by 20px
                ct_.append(
                    tuv[0] + f"""<div style="margin-left: 20px">{tuv[1]}</div>"""
                )
            chp_cont.append(conn.join(ct_))
    except StopIteration:
        # input exhausted: normal, keep what the last chapter collected
        chp_cont.append(conn.join(ct_))
    except Exception as exc:
        logger.error("collecting sent pairs exc: %s", exc)

    # Count what was actually collected, so that an aborted collection
    # cannot make the loop below index past the end of chp_cont.
    final_ch = len(chp_cont)
    if final_ch < chapters:
        logger.info(" Only able to collect **%s** chapters", final_ch)

    # zero-padding width for chapter file names; max() keeps log()
    # defined when chapters is 0
    digits = math.ceil(math.log(max(chapters, 1)) / math.log(10)) + 1

    # refer to https://pypi.org/project/EbookLib/
    # create chapters
    ch_epub = []
    for elm, content in enumerate(chp_cont, 1):
        chapter = epub.EpubHtml(title=f"{elm}",
                                file_name=f"chap_{elm:0{digits}d}.xhtml",
                                lang="en")
        logger.debug("elm: %s", elm)
        chapter.content = content
        ch_epub.append(chapter)

    book = epub.EpubBook()

    # set metadata
    book.set_identifier(f"{title}-20200630")
    book.set_title(title)
    book.set_language('en')
    book.add_author('tmx2epub by mu@qq41947782')

    # add chapters
    for elm in ch_epub:
        book.add_item(elm)

    # define CSS style
    style = 'body { font-family: Times, Times New Roman, serif; }'
    nav_css = epub.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content=style,
    )
    # add CSS file
    book.add_item(nav_css)

    # table of contents: one section holding every chapter
    book.toc = ((epub.Section(title), ch_epub), )

    # basic spine
    book.spine = ["nav"]
    book.spine.extend(ch_epub)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    epub.write_epub(outfile, book)

    return outfile
def to_epub(self, filename: str, style: Type[Style] = AutumnStyle, font_size: int = 14) -> str:
    """
    Render the current Goosepaper to an epub file on disk.

    Stories are gathered from every provider in ``self.story_providers``.
    A story whose headline was already seen is dropped; stories without a
    headline are always kept and are combined into one final chapter
    titled "From Reddit".

    Arguments:
        filename: Path of the epub file to write.
        style: Style class to instantiate for the book's stylesheet.
        font_size: Font size passed to the style's ``get_css``.

    Returns:
        The filename that was written.

    """
    stories = []
    seen_headlines = set()
    for prov in self.story_providers:
        for story in prov.get_stories():
            if story.headline:
                if story.headline in seen_headlines:
                    continue
                seen_headlines.add(story.headline)
            stories.append(story)

    book = epub.EpubBook()
    title = f"{self.title} - {self.subtitle}"
    book.set_title(title)
    book.set_language("en")

    # Instantiate the requested style; the base Style used to be
    # instantiated here regardless of the argument.
    style_obj = style()
    css = epub.EpubItem(
        uid="style_default",
        file_name="style/default.css",
        media_type="text/css",
        content=style_obj.get_css(font_size),
    )
    book.add_item(css)

    no_headlines = [s for s in stories if not s.headline]
    with_headlines = [s for s in stories if s.headline]

    chapters = []
    # one chapter per story that has a headline
    for story in with_headlines:
        chapter = epub.EpubHtml(
            title=story.headline, file_name=f"{uuid4().hex}.xhtml", lang="en"
        )
        chapter.content = story.to_html()
        book.add_item(chapter)
        chapters.append(chapter)

    # all headline-less stories share a single chapter
    if no_headlines:
        chapter = epub.EpubHtml(
            title="From Reddit", file_name=f"{uuid4().hex}.xhtml", lang="en"
        )
        chapter.content = "<br>".join([s.to_html() for s in no_headlines])
        book.add_item(chapter)
        chapters.append(chapter)

    book.toc = chapters
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    epub.write_epub(filename, book)
    return filename