def print_ebook_metadata(path2file):
    """Print title/author metadata for the epub at *path2file* and suggest a
    normalized "Lastname, Firstnames - Title" filename when the current file
    name does not already contain the author's last name."""
    print('\n')
    print('PFAD: ' + path2file)
    buch = epub.read_epub(path2file)
    # get_metadata returns a LIST of TUPLES containing STRINGS
    titel = buch.get_metadata('DC', 'title')[0][0]
    print('TITEL: ' + titel)
    # check whether the 'creator' field is present; a missing field makes the
    # [0] index raise IndexError, so catch that alongside KeyError (the
    # original only caught KeyError, which never fires here)
    try:
        autor = buch.get_metadata('DC', 'creator')[0][0].split(' ')
        print(len(autor))
        # BUG FIX: autor[-2] crashed with IndexError for single-word author
        # names before the len() guard below was ever reached
        if len(autor) > 1:
            print('AUTOR: ' + autor[-1] + ',' + autor[-2])
        print('AUTOR: ' + ' '.join(autor))
        # if the file name already contains the author's last name, do nothing
        if autor[-1] not in path2file:
            if len(autor) > 1:
                print(path2file + ' --> ' + autor[-1] + ', ' + ' '.join(autor[:-1]) + ' - ' + titel)
            else:
                print(path2file + ' --> ' + autor[-1] + ' - ' + titel)
    except (KeyError, IndexError):
        print(path2file + ' --> ' + titel)
from ebooklib import epub from os import walk from bs4 import BeautifulSoup from gensim.utils import tokenize BOOKS_DIR = 'data/books/' files = [d+f for (d, _, files) in walk(BOOKS_DIR) for f in files] print "Found %d files" % (len(files)) paragraphs = [] for f in files: print "Parsing", f book = epub.read_epub(f) content = [BeautifulSoup(x.content, 'html.parser') for x in book.get_items_of_type(9)] for c in content: paragraph_parse_tags = ['p', 'ol', 'ul'] for tag in paragraph_parse_tags: for element in c.find_all(tag): text = element.get_text(' ', strip=True).encode('ascii', "ignore") # TODO: remove hyperlinks text = " ".join(tokenize(text, lowercase=True)) if text != '': paragraphs += [text] print "Done!" OUTPUT_FILE = 'ck_12_paragraphs_all.txt' with open(OUTPUT_FILE, 'w') as f: print "Writing all paragraphs to",
def get_ebook_from(path: Path) -> epub.EpubBook:
    """Load an :class:`epub.EpubBook` from *path*.

    *path* may be a regular .epub file or an unpacked epub directory; a
    directory is first zipped into a temporary location. Raises Exception
    when the zipped result is not a valid zip/epub.
    """
    try:
        return epub.read_epub(str(path))
    except IsADirectoryError:
        # Unpacked epub: zip the directory into a temp file, validate, read.
        # BUG FIX: in the original the zip validation sat after an
        # unconditional return and was unreachable dead code.
        pathhint: Path = Path(str(tempfile.mkdtemp())) / path.name
        path = zipdir(path, pathhint)
        if is_valid_zip_file(path):
            return epub.read_epub(str(path))
        raise Exception(
            f"Not valid epub format (neither zip file nor directory) at {path}"
        )
def epub2thtml(epub_path):
    """Return the raw HTML content of each document chapter in the epub at
    *epub_path*, skipping non-HTML items and ignored file names."""
    chapters = []
    for item in epub.read_epub(epub_path).get_items():
        fn = os.path.splitext(os.path.basename(
            item.file_name))[0].lower().replace(' ', '_')
        ext = os.path.splitext(item.file_name)[-1]
        if ext not in {'.htm', '.html', '.xhtml'}:
            continue
        # Strip a leading numeric prefix (e.g. an ISBN) such as "123_intro"
        # or "123-intro"; names like footnote1 keep their trailing digits.
        if re.match(r'^\d+_\w+', fn) is not None:
            fn = '_'.join(fn.split('_')[1:])
        if re.match(r'^\d+-\w+', fn) is not None:
            # BUG FIX: the original split on '_' here, so any hyphenated
            # name collapsed to an empty string instead of losing its prefix.
            fn = '-'.join(fn.split('-')[1:])
        if fn in ignore_files:
            continue
        if any(fn.startswith(pre) for pre in ignore_file_startswith):
            continue
        logger.debug('Including file: %s', fn)
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())
    return chapters
def get_isbn(filename):
    """Return the first Dublin Core identifier of the epub at *filename*,
    or None (after logging the error to stderr) on any failure."""
    try:
        book = epub.read_epub(filename)
        return book.metadata['http://purl.org/dc/elements/1.1/']['identifier'][
            0][0]
    except Exception as error:
        # BUG FIX: the original printed sys.stderr as a positional value
        # (print(error, sys.stderr)); route the message to stderr instead.
        print(error, file=sys.stderr)
def find_epub_files(f):
    """Get list of files needed to be converted.

    *f* epub file no extention
    """
    collected = []
    book = epub.read_epub(f + '.epub')
    for entry in book.toc:
        debug(entry.href)
        debug(f)
        href = entry.href
        dot = href.rfind('.')
        base, extension = href[:dot], href[dot:]
        # look for both the plain chapter file and its '-extracted' variant
        standard_name = base + extension
        extracted_name = base + '-extracted' + extension
        debug(standard_name)
        debug(extracted_name)
        standard_hit = find(standard_name, f)
        extracted_hit = find(extracted_name, f)
        debug(standard_hit)
        debug(extracted_hit)
        if standard_hit is not None:
            debug("Adding file")
            collected.append(standard_hit)
        if extracted_hit is not None:
            debug("Adding -extracted file")
            collected.append(extracted_hit)
    return collected
def parse_book(filename, chapter_title_marks, names):
    """Split the epub at *filename* into a dict of chapters keyed by title.

    chapter_title_marks: (pre, post) byte markers that bracket a chapter
    title inside the raw item content. names: titles that recur and are
    numbered by occurrence (e.g. POV-character chapters).
    Returns (chapters, css): chapters[title][count] -> {'name': item name,
    'content': '<html>'-wrapped body bytes}; css is the item with id 'css'.
    """
    # markers for finding chapter title
    pre, post = chapter_title_marks
    delta = len(pre)
    book = epub.read_epub(filename)
    # occurrence counter for name-numbered chapters
    name_count = {}
    for n in names:
        name_count[n] = 0
    chapters = {}
    for item in book.get_items():
        ss = item.content
        i = ss.find(pre)
        if i > 0:
            # a chapter with a regularly formatted chapter title
            j = ss[i:].find(post)
            # normalize the UTF-8 right single quote to an ASCII apostrophe
            title = ss[i + delta:i + j].lower().replace(b'\xe2\x80\x99', b"'")
            if title in names:
                # name + number chapter: count occurrences, 1-based
                if name_count[title] == 0:
                    chapters[title] = {}
                name_count[title] += 1
                this_count = name_count[title]
            else:
                # other chapter: single entry at count 0
                chapters[title] = {}
                this_count = 0
            chapters[title][this_count] = {}
            chapters[title][this_count]['name'] = item.get_name()
            content = b'<html>' + item.get_body_content() + b'</html>'
            chapters[title][this_count]['content'] = content
    css = book.get_item_with_id('css')
    return chapters, css
def GetEPubHTML(epub_path, processingFunction=(lambda book_text: book_text)):
    """ Outputs a list containing all test from the Book split by Chapters """
    book = epub.read_epub(epub_path)
    chapters = []
    # (the original enumerated the items but never used the index)
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_content(), features="lxml")
        # Iterate all Headers and add full-stop
        for heading in soup.find_all([f'h{i}' for i in range(1, 7)]):
            if isinstance(heading.string, str):
                heading.string = heading.string + "."
        # Processing for a Specific Book
        soup = processingFunction(soup)
        # Strip Empty Lines / Chapters
        chapter_text = soup.get_text().strip()
        if chapter_text == "":
            continue
        chapter_text = "\n".join(
            [line for line in chapter_text.split('\n') if line.strip() != ''])
        # Add Chapter to Book
        chapters.append(chapter_text)
    return chapters
def read_book(file):
    """Upload an epub's image/CSS/NCX/HTML items to the bucket, printing a
    short description of every item encountered along the way."""
    # item type code -> (print template, MIME type for the upload);
    # codes without an entry are logged only, never uploaded.
    uploads = {
        1: ("Image {}", 'image/jpeg'),
        2: ("Style {}", 'text/css'),
        4: ("TOC {}", 'text/ncx'),
        9: ("HTML {}", 'text/html'),
    }
    book = epub.read_epub(file)
    for items in book.get_items():
        item_type = items.get_type()
        if item_type in uploads:
            template, mime = uploads[item_type]
            print(template.format(items.get_name()))
            put_bucket(file, mime, items.get_name(), items.get_content())
        elif item_type in (3, 5, 6, 7, 8):
            print("Type={} {}".format(item_type, items.get_name()))
def parse_epub(filename: str, abbr: bool, code: bool):
    """ Parse an epub file """
    book = epub.read_epub(filename)
    title = book.get_metadata('DC', 'title')[0][0]
    # some titles are configured to have their hashtags stripped
    remove_hashtags = title in TITLES_REMOVE_HASHTAGS
    print('\nParsing book "{0}"'.format(title))
    plaintexts = []
    abbr_counts = Counter()
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        name = item.get_name()
        # only items whose name looks like a chapter are kept
        if not re.match(REGEX_CHAPITRE, name):
            print('...Ignoring {0}'.format(name))
            continue
        print('...Parsing {0}'.format(name))
        # parse and clean one chapter, tallying abbreviation hits
        plaintext, abbrs = clean_epub_item(item, abbr, code, remove_hashtags)
        plaintexts.append(plaintext)
        abbr_counts += Counter(abbrs)
    book_plaintext = '\n\n\n'.join(plaintexts)
    # replace numbers
    book_plaintext = filter_numbers(book_plaintext)
    # normalize
    book_plaintext = maybe_normalize(book_plaintext)
    if abbr:
        print('Abbreviation counts:\n{0}'.format(abbr_counts.items()))
    return book_plaintext
def process_file(cls, filename):
    """Read the epub at *filename* and hand its first document item (parsed
    with BeautifulSoup) to ``cls.process``.

    NOTE(review): the ``return`` sits inside the loop, so only the FIRST
    ITEM_DOCUMENT is ever processed — confirm this is intentional.
    """
    ebook = epub.read_epub(filename)
    for item in ebook.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            content = item.get_content()
            soup = BeautifulSoup(content, features="lxml")
            return cls.process(ebook, soup, item, filename)
def main(argv):
    """Set an epub's cover image; *argv* must hold [epub_path, jpg_path]."""
    # getopt: only -h (help) is recognised
    try:
        opts, args = getopt.getopt(argv, "h")
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    # handle options
    for opt, optarg in opts:
        if opt == '-h':
            usage()
            sys.exit()
    # exactly two positional arguments are required
    if len(args) != 2:
        usage()
        sys.exit()
    epub_fname, jpg_fname = args
    check_file(epub_fname, ".epub")
    check_file(jpg_fname, ".jpg")
    book = epub.read_epub(epub_fname)
    with open(jpg_fname, 'rb') as cover:
        content = cover.read()
    book.set_cover('cover.jpg', content)
    epub.write_epub(epub_fname, book, {})
def epub2thtml(epub_path):
    """Return the raw content of every document item in the epub at *epub_path*."""
    book = epub.read_epub(epub_path)
    return [
        item.get_content()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
def get_text(file_path: str):
    """Extract plain text from *file_path* (.epub/.docx/.pdf/.txt or raw
    text); newlines are stripped from the result. Returns None for the
    unsupported .mobi format."""
    extension = os.path.splitext(file_path)[1]
    text = ""
    if extension == ".epub":
        book = epub.read_epub(file_path)
        for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            text += doc.get_content().decode("utf-8")
    elif extension == ".docx":
        text += docx2txt.process(file_path)
    elif extension == ".mobi":
        print("[!] Unsupported file: " + file_path)
        return None
    elif extension == ".pdf":
        # BUG FIX: the original never closed the PDF file handle
        with open(file_path, "rb") as pdf_file:
            read_pdf = PyPDF2.PdfFileReader(pdf_file)
            number_of_pages = read_pdf.getNumPages()
            for x in range(number_of_pages):
                page = read_pdf.getPage(x)
                text += page.extractText()
    else:
        if not extension == ".txt":
            print("[!] Unkown file type: " + file_path + ", processing raw text...")
        # BUG FIX: the original leaked this file handle as well
        with open(file_path, "r", encoding="utf-8") as text_file:
            text += text_file.read()
    text = text.strip().rstrip().replace('\n', '')
    return text
def import_exist_corpus(request):
    """DRF view: append the sentences of an uploaded .txt/.epub file to an
    existing Corpus owned by the requesting user.

    Responses: 200 with serializer data on success (also on IntegrityError,
    treated as already-imported), 400 on invalid type/language/serializer,
    404 when the corpus does not exist.
    """
    try:
        serializer = ImportCorpusSerializer(data=request.data)
        corpus_id = request.data["corpus_id"]
        request_data = request.data
        if serializer.is_valid():
            corpus = Corpus.objects.get(pk=corpus_id, user=request.user.id)
            # Check invalid file type
            file_name = request.FILES['corpus_file'].name
            if not file_name.lower().endswith(('.txt', '.epub')):
                return Response({"detail": "Invalid file type"},
                                status=status.HTTP_400_BAD_REQUEST)
            file_obj = request.FILES['corpus_file']
            content = ''
            if file_name.lower().endswith('.epub'):
                # flatten all document items to plain text, CRLF line ends
                book = epub.read_epub(request_data["corpus_file"])
                for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                    soup = BeautifulSoup(item.content, 'html5lib')
                    content = content + soup.get_text()
                content = content.replace('\n', '\r\n')
            elif file_name.lower().endswith('.txt'):
                content = (request_data["corpus_file"].read()).decode("utf-8")
            else:
                return Response({"detail": "Invalid file type"},
                                status=status.HTTP_400_BAD_REQUEST)
            # Find src_lang: pick the preprocessor for the corpus language
            if corpus.language == settings.VIETNAMESE:
                p = Preprocessor(Language.vietnamese)
            elif corpus.language == settings.ENGLISH:
                p = Preprocessor(Language.english)
            else:
                return Response({"detail": "Invalid Language"},
                                status=status.HTTP_400_BAD_REQUEST)
            sents = p.segment_to_sentences(content)
            sents_cnt = len(sents)
            # persist each preprocessed sentence as a CorpusContent row
            for idx in range(0, sents_cnt):
                sentence_refactor = p.preprocess(sents[idx])
                sentence_serilizer = CorpusContentSerializer(
                    data={
                        "phrase": sentence_refactor,
                        "corpus": corpus_id
                    })
                if sentence_serilizer.is_valid():
                    sentence_serilizer.save()
            return Response(serializer.data, status=status.HTTP_200_OK)
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
    except Corpus.DoesNotExist:
        return Response({"detail": "corpus_id not found"},
                        status=status.HTTP_404_NOT_FOUND)
    except IntegrityError:
        return Response(serializer.data, status=status.HTTP_200_OK)
    except ValueError:
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def text_from_ebook(fin, *, skip_last=False):
    """ Get text from ebook. Optionally leave out the last document, which
    often is a note such as Thank you for purchasing this eBook'.
    :param fin: input file in eBook format (e.g. ePub)
    :param skip_last: skip the last document in the file
    :return: the extracted text
    """
    book = epub.read_epub(fin)
    docs = list(book.get_items_of_type(ITEM_DOCUMENT))
    last_idx = len(docs) - 1
    texts = []
    for doc_idx, doc in enumerate(docs):
        if skip_last and doc_idx == last_idx:
            break
        soup = bs(doc.content, 'lxml')
        # keep only non-empty, stripped lines of the document text
        stripped = (line.strip() for line in soup.get_text().split('\n'))
        lines = [line for line in stripped if line]
        if lines:
            texts.append('\n'.join(lines))
    return '\n'.join(texts) + '\n'
def epubRead(fname):
    """Print every HTML item of the epub at *fname*, converted to
    text/markdown with html2text, separated by '=' rules."""
    # BUG FIX: the original ignored its *fname* parameter and read
    # sys.argv[1] instead, breaking any caller that passed a path.
    book = epub.read_epub(fname)
    h = html2text.HTML2Text()
    for item in book.items:
        if isinstance(item, epub.EpubHtml):
            print("=" * 80)
            print(h.handle(item.content.decode("utf-8")))
def readEpub(eBook):
    """Return a 'title publisher year author' string assembled from the
    Dublin Core metadata of the epub at *eBook*."""
    book = epub.read_epub(eBook)
    returnData = ''
    # title (assignment, not +=, matches the original behavior)
    for item in (book.get_metadata('DC', 'title')):
        returnData = item[0] + " "
    # publisher
    for item in (book.get_metadata('DC', 'publisher')):
        returnData += item[0] + " "
    # publish date: keep only the year portion of 'YYYY-MM-DD'
    for item in (book.get_metadata('DC', 'date')):
        returnData += item[0].split("-")[0] + " "
    # author
    for item in (book.get_metadata('DC', 'creator')):
        returnData += item[0]
    # echo for sanity checking
    print(returnData)
    return returnData
def epubHandler(self): self.clearData() #copy and rename epub to zip for extraction shutil.copyfile(self.fileName, 'temp.zip') zip_ref = zipfile.ZipFile('temp.zip', 'r') zip_ref.extractall('tempDir') zip_ref.close() bookRead = epub.read_epub(self.fileName) for chapter in bookRead.get_items_of_type( ebooklib.ITEM_DOCUMENT): #Parse document files from epub file #Get usable file location for document file by parsing returned ebooklib object chapter = str(chapter) first, second, third = chapter.split(':') third = third[:-1] if (third.endswith('.htm') or third.endswith('.xml') or third.endswith('.xhtml')): #Convert to html name, extension = third.split('.') src = os.getcwd() + "/" + "tempDir" + "/" + third dest = os.getcwd() + "/" + "tempDir" + "/" + name + ".html" os.rename(src, dest) self.chapterList.append(dest) else: file = self.fileName file = file[:-5] href = file + '/' + third self.chapterList.append(href) self.web_widget.load(QUrl.fromLocalFile( self.chapterList[0])) #Load first page self.combo.addItems(self.chapterList) #Set navigation dropdown options self.currentTextIndex = 0
def __init__(self, input_book_name, font_charset_name):
    """Prepare font renderers, kindle charset data and working directories
    for filtering the epub at *input_book_name*."""
    # font renderers: key = font name, value = pygame renderer
    pygame.freetype.init()
    self.fonts_render = {}
    for font in EpubFilter.font_list[1:]:
        self.fonts_render[font] = pygame.freetype.Font(font, 50)
        if 'Medium' not in font:
            # fonts without a built-in bold weight get a slight synthetic
            # bold, which renders better on the kindle screen
            self.fonts_render[font].strong = True
            # default is 1/36 and the "bold" is 1/12
            self.fonts_render[font].strength = 1/36
    # map of all character sets currently supported by kindle
    # ('kindle built-in Chinese font/STHeitiMedium')
    # BUG FIX: the original leaked the pickle file handle via
    # pickle.load(open(...)); use a context manager instead
    with open(font_charset_name, 'rb') as charset_file:
        self.font_charset_map = pickle.load(charset_file)
    # the character set supported by the kindle itself
    self.kindle_charset = self.font_charset_map[EpubFilter.font_list[0]]
    # rare characters mapped to their generated image names
    self.char_image_map = {}
    # the epub currently being processed
    self.book = epub.read_epub(input_book_name)
    self.font_image_dir = 'font_image'
    self.new_css_filename = 'uncommon_word.css'
    self.temp_dirctory = os.path.join(
        os.path.dirname(input_book_name),
        os.path.splitext(os.path.basename(input_book_name))[0] + "_temp")
    if not os.path.exists(self.temp_dirctory):
        os.makedirs(self.temp_dirctory)
async def read_book(self, book_file):
    """Parse *book_file* into self._parsed_book: one list of cleaned text
    lines per document item, keyed by the item name."""
    # sentence end followed by whitespace -> newline (one sentence per line)
    p = re.compile(r'\.\s+')
    # unescape \' sequences left over from str() of byte content
    p2 = re.compile(r"\\'")
    self._book = epub.read_epub(book_file)
    for item in self._book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            name = str(item.get_name())
            self._parsed_book[name] = list()
            logger.debug('==================================')
            logger.debug('NAME : ' + name)
            logger.debug('----------------------------------')
            content = str(item.get_content())
            logger.debug(content)
            parser = MyHTMLParser()
            parser.feed(content)
            result = parser.get_result()
            for string in result:
                string = p.sub('.\n', string)
                string = p2.sub("'", string)
                lines = string.split("\n")
                for line in lines:
                    # drop literal '\n' prefixes and trailing whitespace
                    new_line = str(line.lstrip("\\n")).rstrip()
                    self._parsed_book[name].append(new_line)
                    logger.debug(new_line)
                    #translated_string = await self.translate_text(new_line, 'en')
                    #content = content.replace(new_line, translated_string)
                    #logger.debug(":" + str(translated_string) + ":")
            logger.debug('==================================')
    logger.debug("Book:")
    logger.debug(str(self._parsed_book))
    logger.debug('==================================')
def extract_docs(working_dir):
    """Extract the term list from the Oxford CS dictionary epub into
    <working_dir>/dictionaries/docs_terms.txt, one term per line."""
    input_path = working_dir + '/dictionaries/A Dictionary of Computer Science.epub'
    book = epub.read_epub(input_path)
    toc_content = None
    for doc in book.get_items():
        if 'Text/part0001.xhtml' == doc.get_name():
            toc_content = doc.get_content()
            break
    toc_tree = html.fromstring(toc_content)
    terms = []
    toc2s = toc_tree.find_class('toc2')
    current = 0
    total = len(toc2s)
    for el in toc2s:
        print_progress_bar(' Extracing Oxford CS dictionary...', current, total)
        current += 1
        # BUG FIX: the original used `terms += <str>`, which extends the
        # list with the string's individual CHARACTERS; append the term.
        terms.append(el.text_content().strip())
    docs_terms_filename = working_dir + '/dictionaries/docs_terms.txt'
    with open(docs_terms_filename, 'w') as f:
        f.write('\n'.join(terms))
    print_progress_bar(' Extracing Oxford CS dictionary...', current, total)
    print()
def is_epub(file):
    """Return True when *file* can be parsed as an epub, False otherwise."""
    try:
        epub.read_epub(file)
    except EpubException:
        return False
    return True
def generate_html_chunks(self):
    """Yield HTML chunks for the content behind self._istream; for ePubs
    this also fills in self._title / self._author from the metadata."""
    assert self._type in [CONTENT_HTML, CONTENT_EPUB]
    if self._type == CONTENT_HTML:
        log.debug('Reading raw HTML from %s', self._url)
        yield self._istream.read()
    elif self._type == CONTENT_EPUB:
        log.debug('Reading ePub from %s', self._url)
        ios = lazygen.BufferedRandomReader(self._istream)
        book = epub.read_epub(ios)
        self._title, self._author = book.title, ''
        authors = book.get_metadata('DC', 'creator')
        if authors:
            self._author = authors[0][0]
        for doc_item in book.get_items_of_type(ITEM_DOCUMENT):
            yield doc_item.content
    elif self._type == CONTENT_PDF:
        # NOTE(review): only reachable when asserts are stripped (-O),
        # since the assert above restricts the type to HTML/EPUB
        yield ''
def epub2html(epub_path):
    """Return the concatenated, UTF-8-decoded HTML of every document item
    in the epub at *epub_path*."""
    book = epub.read_epub(epub_path)
    # single join instead of quadratic string concatenation in a loop
    return ''.join(
        item.get_content().decode("utf-8")
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    )
def __init__(self, book_path, graph_path, distance):
    """Build the graph for the epub at *book_path*, partitioning character
    names by *distance*."""
    self.book_path = book_path
    self.book = epub.read_epub(self.book_path)
    # hand the extracted text straight to the base class
    super().__init__(book_path, graph_path, epub_utils.get_text(self.book))
    self.distance = distance
    self.distance_partition = self.__get_names_by_portion__()
def readEPUB(filename):
    """Function to read the attachment and return the contents"""
    book = epub.read_epub(filename)
    content = ''
    # BUG FIX: the original called get_items_of_type(epub.EpubHtml) — that
    # API compares against integer item-type codes, not classes — and then
    # concatenated the item OBJECTS onto a string (TypeError). Select the
    # HTML items by isinstance and concatenate their decoded content.
    for a in book.get_items():
        if isinstance(a, epub.EpubHtml):
            content += a.get_content().decode('utf-8')
    return content
def read_meta_epub(epub_name): doc = epub.read_epub(epub_name) # print('-------', doc) meta = {} metadata = doc.metadata # for vlaues, row in metadata.items(): # print(vlaues) # print(row) calibre_meta = 'calibre' if 'calibre' in metadata else CALIBRE_META if calibre_meta in metadata: calibre_metadata = metadata[calibre_meta] for key, item in calibre_metadata.items(): meta[key] = item[0][1]['content'] elements_meta = metadata[ELEMENTS_META] for key, val in elements_meta.items(): if 'identifier' == key: identifier = {} for iden in val: iden_key = DOC_KEY if DOC_KEY in iden[1] else 'id' identifier[iden[1][iden_key].lower()] = iden[0] meta[key] = identifier else: if len(val) == 1 and key not in ('subject', 'identifier'): meta[key] = val[0][0] else: meta[key] = [value[0] for value in val if len(value) > 0] meta['meta_type'] = 'opf' return meta
def updateSingleBookHelper(file):
    """Import one epub: unzip it under ./static/<bookid>, renumber its
    .xhtml chapters sequentially, insert a book record into the DB and
    create its chapter records.

    NOTE(review): *file* is assumed to look like '<8-char prefix><bookid>.epub'
    (the id is sliced with file[8:-5]) — confirm against the caller.
    """
    bookid = file[8:-5]
    unzip(file, './static/' + bookid)
    i = 0
    # rename every extracted chapter to 0.xhtml, 1.xhtml, ...
    for file2 in getFiles('./static/' + bookid, '.xhtml'):
        os.rename(file2, './static/' + bookid + '/' + str(i) + '.xhtml')
        i = i + 1
    book = epub.read_epub(file)
    booktodb = {
        "bkname": book.get_metadata('DC', 'title')[0][0],
        "bkauthor": book.get_metadata('DC', 'creator')[0][0],
        "bkclass": '类型',
        "bkstate": '完结',
        "bkstar": 5,
        # blurb taken from the LAST chapter file
        "bkinfo": getbkinfo("./static/" + bookid + "/" + str(i - 1) + ".xhtml"),
        "bkimg": '/static/' + bookid + '/cover.jpg',
        "bkid": int(bookid),
        "url": '/static/' + bookid + '/0.xhtml',
        'bkviewnum': 0,
        'bksize': i - 2
    }
    bkdb.insert_one(booktodb)
    getChapterForBkid(int(bookid))  # build this book's chapter names & URLs
    print("thread mession completed.")
def extract_metadata(self, epub_filename):
    ''' Extraction of metadata '''
    self.filepath = epub_filename
    epub_file = epub.read_epub(self.filepath)

    def copy_dc_field(attribute, field):
        # copy the first DC value for *field* onto self; absent fields
        # (IndexError) and odd values (AttributeError) are skipped
        try:
            setattr(self, attribute,
                    epub_file.get_metadata('DC', field)[0][0])
        except (IndexError, AttributeError):
            pass

    # fields whose attribute name matches the DC field name
    for field in ['creator', 'title', 'subject', 'source', 'rights',
                  'relation', 'publisher', 'identifier', 'description',
                  'coverage', 'contributor', 'date']:
        copy_dc_field(field, field)
    # fields stored under a different attribute name
    for attribute, field in [['original_language', 'language'],
                             ['epub_type', 'type'],
                             ['epub_format', 'format']]:
        copy_dc_field(attribute, field)
def eupub_to_chapters():
    """Parse a .epub file and return an array of strings corresponding to the
    chapters of the epub.

    Returns:
        list(str): A list of string objects, each of which corresponds to a
        chapter of the .epub file, i.e. ["chapter 1 text", "chapter 2 text", ...]
    """
    path = os.path.abspath(EPUB_FILEPATH)
    print(path)
    book = epub.read_epub(path)
    chapter_texts = []
    for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        html_content = text.get_content().decode("utf-8")
        chapter_texts.append(html_content)
    full_text = "".join(chapter_texts)
    full_text_chapters = convert_utils.get_all_chapters(full_text)
    print("{} Chapters found in ebook".format(len(full_text_chapters)))
    # NOTE: For debugging only - Write the text contents to file
    if OUTPUT_EPUB_TEXT_TO_TEST_FILE:
        # BUG FIX: use a context manager so the debug file is closed even
        # if a write fails (the original used bare open()/close())
        with open("chapter_test.txt", "w") as f:
            for i, chapter in enumerate(full_text_chapters):
                # For each chapter, prepend `!!Chapter{Chapter #}{newline}{chapter text}`
                # for readability
                f.write("!!Chapter {}:\n{}".format(i + 1, chapter))
    return full_text_chapters
def __init__(self, bk):
    """Load *bk* (.epub/.pdf/.docx/.doc) into self.text, a list of strings
    (lines for epub, pages for pdf, whole document for docx)."""
    b = bk.split("/")[-1]
    # title = file name up to the first dot
    self.title = b[:b.find(".")]
    if bk.endswith("epub"):
        print("EPUB file detected")
        self.author = ''
        self.pages = epub.read_epub(bk).pages
        out = epub2text(bk)
        final = []
        for text in out:
            gan = text.split("\n")
            for g in gan:
                final.append(g)
        self.text = final
    if bk.endswith("pdf"):
        print("PDF file detected")
        # BUG FIX: the original never closed the PDF file handle
        with open(bk, 'rb') as book:
            pdfReader = PyPDF2.PdfFileReader(book)
            self.pages = pdfReader.numPages
            final = []
            for num in range(0, self.pages):
                page = pdfReader.getPage(num)
                text = page.extractText()
                final.append(text)
            self.text = final
    if bk.endswith("docx") or bk.endswith("doc"):
        out = docx2txt.process(bk)
        self.text = [out]
def extract(self, filename, **kwargs):
    """Return the concatenated plain text of all document items in the epub
    at *filename*."""
    book = epub.read_epub(filename)
    parts = []
    for item in book.get_items():
        # avoid shadowing the builtin `type` and quadratic string +=
        if item.get_type() == ITEM_DOCUMENT:
            soup = BeautifulSoup(item.content, 'lxml')
            parts.append(soup.text)
    return "".join(parts)
def load_epub(filename: str) -> ebooklib.epub.EpubBook:
    """Read *filename* as an epub, exiting with a message on failure."""
    try:
        return epub.read_epub(filename)
    except FileNotFoundError:
        sys.exit('File not found: ' + filename)
    except IsADirectoryError:
        sys.exit('File is directory:: ' + filename)
    except ebooklib.epub.EpubException:
        sys.exit('File is not valid epub: ' + filename)
def _initialise(self):
    """Fetch the story's main page and, unless forced, the existing epub."""
    self.metadata = Metadata(self.url)
    main_page_request = self.session.get(self.url)
    if main_page_request.status_code != codes.ok:
        exit(1)
    self.main_page = BeautifulSoup(main_page_request.content, "html5lib")
    try:
        # when --force is set, skip loading the previously written epub
        self.book = None if self.force else epub.read_epub(self.filename)
    except AttributeError:
        pass
def extract(self, filename, **kwargs):
    """Return the text of the spine documents, one matched element per line."""
    book = epub.read_epub(filename)
    wanted_tags = ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']
    pieces = []
    # walk the spine in reading order
    for item_id, _ in book.spine:
        item = book.get_item_with_id(item_id)
        soup = BeautifulSoup(item.content, 'lxml')
        for child in soup.find_all(wanted_tags):
            pieces.append(child.text + '\n')
    return ''.join(pieces)
def epub2txt(src, dst):
    """Extract the text of the epub at *src* into *dst* (UTF-8, CRLF lines)."""
    book = epub.read_epub(src)
    chunks = []
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.content, 'html5lib')
        chunks.append(soup.get_text())
    # normalise line endings for the output file
    content = ''.join(chunks).replace('\n', '\r\n')
    with io.open(dst, 'w', encoding='utf-8') as f:
        f.write(content)
def extract(filename, **kwargs):
    """Extract text from epub using python epub library
    """
    book = epub.read_epub(filename)
    parts = []
    for item in book.get_items():
        if item.get_type() == ITEM_DOCUMENT:
            # FIX: pass an explicit parser (as the sibling extractors do);
            # the original relied on bs4's guess, which emits a warning and
            # can vary between platforms
            soup = BeautifulSoup(item.content, 'lxml')
            parts.append(soup.text)
    return "".join(parts)
def get_default_title(self, temp_file, ext):
    """Return the DC title of an imported epub, or a dated fallback title."""
    fallback = _('Imported Book %(date)s') % dict(date=datetime.date.today())
    if ext != 'epub':
        return fallback
    epub_book = epub.read_epub(temp_file)
    try:
        dc_key = epub.NAMESPACES['DC']
        return epub_book.metadata[dc_key]['title'][0][0]
    except Exception:
        # books without a DC title keep the dated fallback
        return fallback
def custom_epub_create(self, custom_epub, user): self.check_status() #準備epub文件 from ebooklib import epub from utils.epub import txt2epub, html2epub, add_bookinfo info = { 'ISBN': self.book_info.ISBN, 'bookname': self.book_info.bookname, 'author': self.book_info.author, 'date': str(self.book_info.date), 'house': self.book_info.house, 'language': 'zh', } if self.status == 5: final_epub = self.path +'/OCR/{0}.epub'.format(self.ISBN) try: book = epub.read_epub(final_epub) book = add_bookinfo( book, **info ) book.set_identifier(user.username) epub.write_epub(custom_epub, book, {}) except BaseException as e: raise SystemError('epub create fail:' +unicode(e)) else: final_epub = self.path +'/temp/{0}.temp'.format(self.ISBN) final_dir = os.path.dirname(final_epub) if not os.path.exists(final_dir): os.mkdir(final_dir) try: part_list = [ file.get_clean_file() for file in self.ebook_set.all().order_by('part') ] html2epub(part_list, final_epub, **info) book = epub.read_epub(final_epub) book.set_identifier(user.username) epub.write_epub(custom_epub, book, {}) except BaseException as e: raise SystemError('epub create fail (not final):' +unicode(e)) return custom_epub
def handle_path(self, path):
    """Return the text of the epub at *path*, walking the spine in order,
    one matched element per line."""
    book = epub.read_epub(path)
    buf = StringIO()
    for item_id, _ in book.spine:
        item = book.get_item_with_id(item_id)
        soup = BeautifulSoup(item.content, 'lxml')
        for node in soup.find_all(
            ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']
        ):
            buf.write(node.text)
            buf.write(u'\n')
    return buf.getvalue()
def get_url_from_file(file: Union[str, click.Path]) -> Union[str, None]:
    """Recover the original story URL recorded inside a downloaded epub."""
    book = epub.read_epub(file)
    # new-format books carry a "title" page; old-format ones only a "nav"
    title_page = book.get_item_with_id("title") or book.get_item_with_id("nav")
    try:
        parsed_text = BeautifulSoup(title_page.content, "html5lib")
        # prefer the explicitly tagged element; otherwise fall back to the
        # first <a> anywhere on the page
        url = parsed_text.find(id="story-url") or parsed_text
        return url("a")[0]["href"]
    except AttributeError:
        error = f"File {file} doesn't contain requested information."
        with open("pyffdl.log", "a") as fp:
            click.echo(error, file=fp)
        click.echo(error, err=True)
        return None
def read(filepath):
    """Return the processed paragraph text of the epub at *filepath*,
    one processed paragraph per line."""
    ebook = epub.read_epub(filepath)
    # We're interested in document items in the ebook
    all_paragraphs = []
    for item in ebook.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        text = item.get_content()
        text = preprocess_html(text)
        soup = BeautifulSoup(text, 'html.parser')
        all_paragraphs.extend(soup.find_all('p'))
    # FIX: the original used a Python-2-only xrange index loop; this
    # comprehension does the same per-paragraph processing and runs on
    # both Python 2 and 3
    all_paragraphs = [process_paragraph(p.text.encode('utf-8'))
                      for p in all_paragraphs]
    content = '\n'.join(str(p) for p in all_paragraphs)
    return content
def __init__(self, book_location):
    """Load the epub at *book_location*, keeping only chapter-like items."""
    book = epub.read_epub(book_location)
    # Filter out pictures and the like: keep items exposing is_chapter
    self.chapters = list(
        filter(lambda item: 'is_chapter' in dir(item), book.items))
    self.title = book.title
def _import_old_epub(self, lyrics_path):
    """Import songs from the old-format songbook epub, writing one
    <song number>.json per song (title + verse/chorus markers) into
    *lyrics_path*."""
    # "1. first line of a verse"
    new_verse_pattern = re.compile(r"^\s*(\d)\.\s+(.+)$")
    # "<song number> <song title>" in the page <title>
    no_and_title_pattern = re.compile(r"^\s*(\d+)\s+(.+)$")
    if not self.epubs_page.old_epub:
        return
    book = epub.read_epub(self.epubs_page.old_epub)
    for item in list(filter(lambda i: isinstance(i, epub.EpubHtml), book.items)):
        tree = parse_html_string(item.content).getroottree()
        titles = tree.xpath("//title/text()")
        if titles:
            title = titles[0]
            m = no_and_title_pattern.match(title)
            if m is None:
                continue
            no, title = m.groups()
            markers = []
            marker = None
            for line_element in tree.xpath("//div[@class='pGroup']/*"):
                if line_element.tag == 'p':
                    # flatten inline markup so .text holds the whole line
                    while line_element.getchildren():
                        line_element.getchildren()[0].drop_tag()
                    line_text = line_element.text
                    m = new_verse_pattern.match(line_text)
                    if m is not None:
                        # a numbered verse starts: flush the previous marker
                        verse_no, line_text = m.groups()
                        if marker is not None:
                            markers.append(marker)
                        marker = {
                            'name': str(verse_no),
                            'text': line_text,
                        }
                    else:
                        # continuation line of the current verse
                        marker['text'] += "\n{}".format(line_text)
                elif "chorus" in line_element.attrib['class']:
                    # chorus block: flush, then collect its lines
                    if marker is not None:
                        markers.append(marker)
                    marker = {
                        'name': line_element.getchildren()[0].text.strip().
                        replace('(', '').replace(')', '').lower().capitalize(),
                        'text': "",
                    }
                    for chorus_line_element in line_element.getchildren()[1:]:
                        marker['text'] += "{}\n".format(chorus_line_element.text)
                    # drop the trailing newline added by the loop above
                    marker['text'] = marker['text'][:-1]
            markers.append(marker)
            with open(os.path.join(lyrics_path, "{}.json".format(no)), "w") as f:
                json.dump({
                    'title': title,
                    'markers': markers,
                }, f, indent=2)
def _import_new_epub(self, lyrics_path):
    """Import songs from the new-format songbook epub, writing one
    <song number>.json per song (title + verse/chorus markers) into
    *lyrics_path*. Pages whose title does not start with a number are
    skipped via the ValueError handler."""
    if not self.epubs_page.new_epub:
        return
    book = epub.read_epub(self.epubs_page.new_epub)
    for item in filter(lambda i: isinstance(i, epub.EpubHtml), book.items):
        tree = parse_html_string(item.content).getroottree()
        title = tree.xpath("//h1/strong/text()")
        if title:
            title = title[0]
            try:
                # song number is the leading integer of the page <title>
                song_no = int(tree.xpath("//head/title/text()")[0].split(" ", 1)[0])
                markers = []
                marker = None
                for verse_no, verse_element in enumerate(tree.xpath("//div[@class='pGroup']/ol/li"), 1):
                    marker = {
                        'name': str(verse_no),
                        'text': '',
                    }
                    for line_element in verse_element.getchildren():
                        if line_element.tag == 'p' and not 'se' in line_element.attrib.get('class', ''):
                            # flatten inline markup so .text holds the line
                            while line_element.getchildren():
                                line_element.getchildren()[0].drop_tag()
                            line_text = line_element.text.strip()
                            marker['text'] += "{}\n".format(line_text)
                        elif "chorus" in line_element.attrib['class']:
                            # chorus block: flush the current marker first
                            if marker is not None:
                                marker['text'] = marker['text'][:-1]
                                markers.append(marker)
                            marker = {
                                'name': line_element.getchildren()[0].text.strip().
                                replace('(', '').replace(')', '').lower().capitalize(),
                                'text': "",
                            }
                            for chorus_line_element in line_element.getchildren()[1:]:
                                marker['text'] += "{}\n".format(chorus_line_element.text)
                        else:
                            # any other block starts a new named marker
                            if marker is not None:
                                marker['text'] = marker['text'][:-1]
                                markers.append(marker)
                            marker = {
                                'name': line_element.text.strip().replace('(', '').replace(')', '').lower().capitalize(),
                                'text': "",
                            }
                    # flush the last marker of this verse (trim trailing \n)
                    marker['text'] = marker['text'][:-1]
                    markers.append(marker)
                if markers:
                    with open(os.path.join(lyrics_path, "{}.json".format(song_no)), "w") as f:
                        json.dump({
                            'title': title,
                            'markers': markers,
                        }, f, indent=2)
            except ValueError:
                pass
from ebooklib import epub

# Load a sample ebook from the NAS share (hard-coded demo path).
buch = epub.read_epub('/media/nas/ebooks/A/Anderson, Peter - Survivor 1.06 - Der Baum des Lebens.epub')
def __init__(self, file_path):
    """Open the epub at *file_path* and prepare an empty aggregation body."""
    self.file_path = file_path
    self.book = epub.read_epub(self.file_path)
    # document (chapter) items of the book
    self.pages = self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    # empty <body> that aggregated content will be appended into
    self.aggregatd_body = BeautifulSoup('<body></body>', 'html.parser')
    self.is_aggregated = False
def import_book_from_file(epub_file, user, **kwargs):
    """Import an EPUB into the Booktype models and return the new book.

    Creates a Book for *user*, copies the EPUB's images as attachments,
    turns each chapter document into a Chapter, rewrites intra-book links
    to the new chapter URLs, and rebuilds the table of contents.

    Recognized kwargs: ``book_title`` (overrides the EPUB's DC title) and
    ``book_url``.  NOTE: this is Python 2 code (``unicode``, ``StringIO``,
    ``urllib.unquote``).
    """
    import uuid
    from django.utils.timezone import utc
    from lxml import etree
    from ebooklib.utils import parse_html_string
    from .book import create_book

    opts = {'plugins': [TidyPlugin(), ImportPlugin()]}
    epub_book = epub.read_epub(epub_file, opts)

    # Maps chapter file name -> TOC title, filled while walking the EPUB TOC.
    chapters = {}
    # Flat TOC as (kind, name, unique_id, parent_id): kind 1 = section, 0 = link.
    toc = []

    def _parse_toc(elements, parent=None):
        # Recursively flatten the nested ebooklib TOC structure into `toc`.
        for _elem in elements:
            # used later to get parent of an elem
            unique_id = uuid.uuid4().hex

            if isinstance(_elem, tuple):
                # (Section, children) tuple: record the section, then recurse.
                toc.append((1, _elem[0].title, unique_id, parent))
                _parse_toc(_elem[1], unique_id)
            elif isinstance(_elem, epub.Section):
                pass
            elif isinstance(_elem, epub.Link):
                _u = urlparse.urlparse(_elem.href)
                _name = urllib.unquote(os.path.basename(_u.path))
                if not _name:
                    _name = _elem.title
                if _name not in chapters:
                    chapters[_name] = _elem.title
                toc.append((0, _name, unique_id, parent))

    _parse_toc(epub_book.toc)

    epub_book_name = epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
    title = kwargs.get('book_title', epub_book_name)
    book_url = kwargs.get('book_url', None)

    # must check if title already exists
    book = create_book(user, title, book_url=book_url)
    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    # Copy every image in the EPUB over as a Book attachment.
    for attach in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        att = models.Attachment(
            book=book,
            version=book.version,
            status=stat
        )
        s = attach.get_content()
        f = StringIO.StringIO(s)
        f2 = File(f)
        f2.size = len(s)
        att.attachment.save(attach.file_name, f2, save=False)
        att.save()
        f.close()

    # Maps original chapter file name -> created Chapter model instance.
    _imported = {}

    # TODO: ask about importing empty sections
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not chap.is_chapter():
            continue

        # check if this chapter name already exists
        name = urllib.unquote(os.path.basename(chap.file_name))
        content = chap.get_body_content()

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if name in chapters:
            name = chapters[name]
        else:
            # No TOC title available — derive one from the file name.
            name = _convert_file_name(name)
            if name.rfind('.') != -1:
                name = name[:name.rfind('.')]
            name = name.replace('.', '')

        chapter = models.Chapter(
            book=book,
            version=book.version,
            url_title=booktype_slugify(unicode(name)),
            title=name,
            status=stat,
            content=content,
            created=now,
            modified=now
        )
        chapter.save()

        _imported[urllib.unquote(os.path.basename(chap.file_name))] = chapter

    # fix links
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if not chap.is_chapter():
            continue

        content = chap.get_content()

        try:
            tree = parse_html_string(content)
        except:
            # NOTE(review): if parsing fails, `tree` keeps its value from the
            # PREVIOUS loop iteration (or is unbound on the first one), so the
            # code below operates on the wrong document / raises NameError.
            # A `continue` here was probably intended.
            pass

        root = tree.getroottree()

        if len(root.find('body')) != 0:
            body = tree.find('body')
            to_save = False

            # Rewrite <a href> targets that point at imported chapter files
            # so they reference the new chapter URLs instead.
            for _item in body.iter():
                if _item.tag == 'a':
                    _href = _item.get('href')

                    if _href:
                        _u = urlparse.urlparse(_href)
                        pth = urllib.unquote(os.path.basename(_u.path))

                        if pth in _imported:
                            _name = _imported[pth].url_title
                            _u2 = urlparse.urljoin(_href, '../' + _name + '/')
                            _item.set('href', _u2)
                            to_save = True

            if to_save:
                chap.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
                _imported[urllib.unquote(os.path.basename(chap.file_name))].content = chap.content
                _imported[urllib.unquote(os.path.basename(chap.file_name))].save()

    # Weights count down so the first TOC entry gets the highest weight.
    n = len(toc) + 1
    parents = {}

    for _elem in toc:
        if _elem[0] == 1:  # section
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=_elem[1],
                chapter=None,
                weight=n,
                typeof=2
            )
        else:
            # Skip TOC links whose target document was never imported.
            if not _elem[1] in _imported:
                continue

            chap = _imported[_elem[1]]
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=chap.title,
                chapter=chap,
                weight=n,
                typeof=1
            )

        # check if elem has parent
        if _elem[3]:
            toc_item.parent = parents.get(_elem[3], None)
        toc_item.save()

        # decrease weight
        n -= 1
        # save temporarily the toc_item in parent
        parents[_elem[2]] = toc_item

    return book
def upload(self, request, pk=None):
    """DRF action: upload a book file (txt or epub) and register it.

    Flow: ensure a BookInfo exists for the posted ISBN, reject uploads
    that would replace a higher-priority source, save the uploaded file,
    convert/normalize it to an EPUB under ``<uploadPath>/OCR/``, then
    create/update the Book and EBook records.

    NOTE(review): if request.method is not POST, the method falls through
    and implicitly returns None — confirm that is intended.
    """
    res = {}
    if request.method == 'POST':
        # Set up the BookInfo record: reuse an existing one for this ISBN,
        # otherwise validate and create it from the posted data.
        try:
            newBookInfo = BookInfo.objects.get(ISBN=request.POST['ISBN'])
        except:
            serializer = BookInfoSerializer(data=request.data)
            if not serializer.is_valid():
                res['detail'] = u'序列化驗證失敗' + unicode(serializer.errors)
                return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)
            newBookInfo = serializer.save()
        # Decide whether this upload may proceed: a lower- or equal-priority
        # source must not replace an existing book's file.
        source_priority = {
            'self': 0,
            'txt': 1,
            'epub': 2,
        }
        try:
            book = Book.objects.get(ISBN=request.POST['ISBN'])
            if source_priority[request.POST['category']] <= source_priority[book.source]:
                res['detail'] = u'文件已存在'
                return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)
        except:
            # No existing Book for this ISBN — upload is allowed.
            pass
        # Save the uploaded file under the per-ISBN document directory.
        uploadPath = BASE_DIR + u'/file/ebookSystem/document/{0}'.format(request.POST['ISBN'])
        uploadFilePath = os.path.join(uploadPath, request.POST['ISBN'] +'.' +request.POST['category'])
        self.post_resource(uploadFilePath, request.FILES['fileObject'])
        # Process according to the chosen upload format; the normalized
        # result always lands at <uploadPath>/OCR/<ISBN>.epub.
        final_file = os.path.join(uploadPath, 'OCR') + '/{0}.epub'.format(request.POST['ISBN'], )
        #txt: convert the plain-text upload into an EPUB with metadata.
        if request.POST['category'] == 'txt':
            from ebooklib import epub
            from utils.epub import txt2epub
            try:
                os.makedirs(os.path.dirname(final_file))
                info = {
                    'ISBN': newBookInfo.ISBN,
                    'bookname': newBookInfo.bookname,
                    'author': newBookInfo.author,
                    'date': str(newBookInfo.date),
                    'house': newBookInfo.house,
                    'language': 'zh',
                }
                txt2epub(uploadFilePath, final_file, **info)
            except BaseException as e:
                # Conversion failed — remove the whole upload directory.
                shutil.rmtree(uploadPath)
                res['detail'] = u'建立文件失敗' +str(e)
                return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)
        #epub: round-trip the EPUB and stamp the BookInfo metadata onto it.
        if request.POST['category'] == 'epub':
            from ebooklib import epub
            from utils.epub import through, add_bookinfo
            try:
                os.makedirs(os.path.dirname(final_file))
                through(uploadFilePath, final_file)
                book = epub.read_epub(final_file)
                book = add_bookinfo(
                    book,
                    ISBN = newBookInfo.ISBN,
                    bookname = newBookInfo.bookname,
                    author = newBookInfo.author,
                    date = str(newBookInfo.date),
                    house = newBookInfo.house,
                    language = 'zh',
                )
                epub.write_epub(final_file, book, {})
            except BaseException as e:
                shutil.rmtree(uploadPath)
                # NOTE(review): raise(e) re-raises immediately, so the two
                # lines below are unreachable dead code; the error-response
                # path (as in the txt branch above) never runs.
                raise(e)
                res['detail'] = u'建立文件失敗' +str(e)
                return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)
        # Create (or fetch) the Book record, then the EBook record.
        # NOTE(review): constructing Book(...) does not normally raise for a
        # duplicate ISBN (no DB hit until save), so the except branch may
        # never trigger — verify the intended get-or-create semantics.
        try:
            newBook = Book(book_info=newBookInfo, ISBN=request.POST['ISBN'])
        except:
            newBook = Book.objects.get(ISBN=request.POST['ISBN'])
        newBook.scaner = request.user
        newBook.owner = request.user
        newBook.source = request.POST['category']
        newBook.finish_date = timezone.now()
        newBook.save()
        ebook = EBook.objects.create(book=newBook, part=1, ISBN_part=request.POST['ISBN'] + '-1', begin_page=-1, end_page=-1)
        ebook.change_status(5, 'final')
        res['detail'] = u'成功建立並上傳文件'
        return Response(data=res, status=status.HTTP_202_ACCEPTED)
## epub2gif import sys, os, glob, shutil import ebooklib from ebooklib import epub from PIL import Image, ImageFont, ImageDraw from images2gif import writeGif # args if len(sys.argv) == 2: bookname = os.path.splitext(sys.argv[1])[0] book = epub.read_epub(sys.argv[1]) else: print '\nNo EPUB file provided.\nUsage: python epub2gif.py filename.epub\n' quit() # parameters speed = 0.1 W = 500 H = 400 bgColor = (255,255,255) # save content of pic to file def savetofile(img): # get name and extension img_name = os.path.basename(img.file_name) img_type = img.media_type ext = img_type.split('/')[1] # write to file img_file = open(img_name, 'w')
def through(src, dst):
    """Round-trip an EPUB: parse *src* with ebooklib, write it out to *dst*.

    Returns the parsed EpubBook so callers can keep working with it.
    """
    parsed = epub.read_epub(src)
    epub.write_epub(dst, parsed, {})
    return parsed
from ebooklib import epub, ITEM_DOCUMENT

# Print the internal file name of every XHTML document in the sample EPUB.
book = epub.read_epub('../_external_ressources/epub1/perroquet.epub')
for document in book.get_items_of_type(ITEM_DOCUMENT):
    print(document.file_name)
children = lst.find_all(['li']) for child in children: summary_points.append(normalize_text(child.get_text().strip())) # some books use <p><strong>Key Concepts</strong></p> strongs = soup.find_all(['strong']) for element in strongs: if "Key Concepts" == element.get_text().strip(): lst = element.parent.find_next_sibling("ul") children = lst.find_all(['li']) for child in children: summary_points.append(normalize_text(child.get_text().strip())) return summary_points def get_lesson_summary(epub_item): soup = BeautifulSoup(item.content, 'html.parser') facts = find_fact_list(soup) for fact in facts: print(fact) for file in os.listdir("./books"): if file.endswith(".epub"): book = epub.read_epub("./books/" + file) for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT): get_lesson_summary(item)
def load_book(self, book_path):
    """Parse the EPUB file at *book_path* and return the EpubBook object."""
    parsed = epub.read_epub(book_path)
    return parsed
import sys from ebooklib import epub if len(sys.argv) != 4: print 'Usage: main.py [filter file] [input epub file] [output epub file]' sys.exit() filter_filename = sys.argv[1] input_filename = sys.argv[2] output_filename = sys.argv[3] with open(filter_filename) as f: filter_words = f.read().splitlines() book = epub.read_epub(input_filename) for item in book.items: if isinstance(item, epub.EpubHtml): for word in filter_words: stars = '*' * len(word) item.content = item.content.replace(word, stars) epub.write_epub(output_filename, book, {})
# NOTE(review): this `else:` belongs to an `if` outside this chunk
# (presumably an argument/resume check); the branch does a fresh import
# of the Flavor Bible EPUB into a new SQLite database.
else:
    global latest_id
    # In-memory working tables keyed later by ingredient/match identity.
    ingredients = {}
    matches = {}
    ingredient_ids = {}
    # Column orders for the ingredient and match outputs.
    ingredients_fieldnames = ['tmpId', 'name', 'season', 'taste', 'weight', 'volume', 'vegetarian', 'dairy', 'kosher', 'nuts']
    matches_fieldnames = ['firstIngredient', 'secondIngredient', 'level', 'upvotes', 'downvotes', 'affinity', 'quote']
    # Start from a clean slate: drop previous DB and temp JSON dumps.
    removeExistingFiles(['flavorbible.db', 'Ingredient_tmp.json', 'Match_tmp.json'])
    conn = sqlite3.connect('flavorbible.db')
    c = conn.cursor()
    createTables(c)
    book = epub.read_epub('flavorbible.epub')
    result = ''
    for item in book.get_items():
        # NOTE(review): `type` shadows the builtin; left unchanged here.
        type = item.get_type()
        if type == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.content, 'lxml')
            # Find ingredient listings.
            # Ingredient headings carry class "lh" or "lh1" in this EPUB.
            for ingredient in soup.find_all('p', {'class' : ['lh', 'lh1']}):
                print('HEADING: ', ingredient)
                i = fixName(ingredient.text)
                if containsBlacklistedString(i):
                    continue
                # NOTE(review): the body of this check continues past the
                # end of this chunk.
                if i in ingredient_ids:
def __init__(self, filename):
    """Open the EPUB at *filename* and print each of its image items.

    NOTE(review): the parsed book is not stored on the instance — this
    constructor appears to exist only for inspection; confirm intent.
    """
    parsed = epub.read_epub(filename)
    for img in parsed.get_items_of_type(ebooklib.ITEM_IMAGE):
        print(img)