def print_ebook_metadata(path2file):
    """Print path, title and author of an EPUB plus a suggested file name.

    The suggestion has the form ``<last name>, <other names> - <title>`` and
    is only printed when the author's last name is not already part of
    *path2file*.

    :param path2file: path of the EPUB file (a string; it is concatenated
        into the printed messages).
    """
    print('\n')
    print('PFAD:   ' + path2file)
    buch = epub.read_epub(path2file)

    # get_metadata returns a LIST of TUPLES that contain STRINGS; the list is
    # empty when the field is absent, so check before indexing.
    titles = buch.get_metadata('DC', 'title')
    if not titles:
        print(path2file + ' hat kein Titelfeld.')
        return
    titel = titles[0][0]
    print('TITEL:   ' + titel)

    # Check whether the 'creator' field is present.
    try:
        autor = buch.get_metadata('DC', 'creator')[0][0].split(' ')
    except (KeyError, IndexError):
        # No author information: the title is the only thing we can suggest.
        print(path2file + ' --> ' + titel)
        return

    print(len(autor))
    if len(autor) > 1:
        print('AUTOR:   ' + autor[-1] + ',' + autor[-2])
    else:
        # Single-word author: there is no second name part to show.
        print('AUTOR:   ' + autor[-1])
    print('AUTOR:   ' + ' '.join(autor))

    # If the file name already contains the author's last name, do nothing.
    if autor[-1] not in path2file:
        if len(autor) > 1:
            print(path2file + ' --> ' + autor[-1] + ', ' + ' '.join(autor[:-1]) + ' - ' + titel)
        else:
            print(path2file + ' --> ' + autor[-1] + ' - ' + titel)
示例#2
1
from ebooklib import epub
from os import walk
from bs4 import BeautifulSoup
from gensim.utils import tokenize


# Directory that is walked (recursively, via os.walk) to find the input books.
BOOKS_DIR = 'data/books/'


# NOTE: this script uses Python 2 print statements throughout.
# Every walked directory path is concatenated directly with each file name.
# NOTE(review): `d+f` inserts no path separator; that works for BOOKS_DIR
# itself because it ends in '/', but os.walk reports sub-directories without
# a trailing separator — confirm the tree is flat or use os.path.join.
files = [d+f for (d, _, files) in walk(BOOKS_DIR) for f in files]
print "Found %d files" % (len(files))

# Accumulates one normalised text string per extracted paragraph/list.
paragraphs = []
for f in files:
    print "Parsing", f
    book = epub.read_epub(f)
    # Parse the content of every item of type 9 with BeautifulSoup.
    # NOTE(review): 9 is presumably ebooklib.ITEM_DOCUMENT — confirm and
    # prefer the named constant.
    content = [BeautifulSoup(x.content, 'html.parser') for x in book.get_items_of_type(9)]
    for c in content:
        # Tags whose text is collected as a "paragraph".
        paragraph_parse_tags = ['p', 'ol', 'ul']
        for tag in paragraph_parse_tags:
            for element in c.find_all(tag):
                # Non-ASCII characters are silently dropped by the encode step.
                text = element.get_text(' ', strip=True).encode('ascii', "ignore")
                # TODO: remove hyperlinks
                # Lower-case and re-join the tokens with single spaces.
                text = " ".join(tokenize(text, lowercase=True))
                if text != '':
                    paragraphs += [text]
    print "Done!"

# Output file for the collected paragraphs.
OUTPUT_FILE = 'ck_12_paragraphs_all.txt'
with open(OUTPUT_FILE, 'w') as f:
    # NOTE(review): the snippet ends here in this view; the code that writes
    # `paragraphs` to `f` is not visible.
    print "Writing all paragraphs to",
示例#3
0
文件: epub1.py 项目: excursus/nlpypes
def get_ebook_from(path: Path) -> epub.EpubBook:
    """Load an EPUB from *path*, which may be an archive or a directory.

    The path is first handed to ``epub.read_epub`` as is. If that raises
    ``IsADirectoryError`` the directory is zipped into a freshly created
    temporary directory (via ``zipdir``) and the resulting archive is read.

    No separate zip-validity pre-check is performed: for anything that is
    neither a readable archive nor a directory, the exception raised by
    ``epub.read_epub`` propagates unchanged.

    :param path: location of the EPUB file or unpacked EPUB directory.
    :return: the parsed book.
    """
    try:
        return epub.read_epub(str(path))
    except IsADirectoryError:
        # Keep the original name so the archive is recognisable in the temp dir.
        pathhint: Path = Path(tempfile.mkdtemp()) / path.name
        return epub.read_epub(str(zipdir(path, pathhint)))
示例#4
0
def epub2thtml(epub_path):
    """Return the content of every wanted (X)HTML document in an EPUB.

    Items are filtered by file extension, by the module-level
    ``ignore_files`` / ``ignore_file_startswith`` collections (matched
    against the normalised base name) and finally by item type.

    :param epub_path: path of the EPUB file.
    :return: list with the content of each included document item.
    """
    chapters = []
    for item in epub.read_epub(epub_path).get_items():
        base, ext = os.path.splitext(os.path.basename(item.file_name))
        fn = base.lower().replace(' ', '_')

        # footnote can be footnote1 etc
        # remove the isbn from the fn

        if ext not in {'.htm', '.html', '.xhtml'}:
            continue

        # Drop a leading numeric prefix, whichever separator follows it.
        if re.match(r'^\d+_\w+', fn) is not None:
            fn = fn.split('_', 1)[1]
        if re.match(r'^\d+-\w+', fn) is not None:
            # Split on '-' here: splitting on '_' left nothing to join and
            # turned the name into an empty string.
            fn = fn.split('-', 1)[1]

        if fn in ignore_files:
            continue

        if any(fn.startswith(pre) for pre in ignore_file_startswith):
            continue

        logger.debug('Including file: %s', fn)

        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())
    return chapters
示例#5
0
def get_isbn(filename):
    """Return the first Dublin Core ``identifier`` value of an EPUB.

    This is a best-effort lookup: any failure (unreadable file, missing
    metadata, ...) is reported on standard error and ``None`` is returned.

    :param filename: path of the EPUB file.
    :return: the identifier value, or ``None`` when it cannot be determined.
    """
    try:
        book = epub.read_epub(filename)
        identifiers = book.metadata['http://purl.org/dc/elements/1.1/']['identifier']
        return identifiers[0][0]
    except Exception as error:
        # `file=` is required; a second positional argument would just be
        # printed to stdout alongside the error.
        print(error, file=sys.stderr)
        return None
示例#6
0
def find_epub_files(f):
    """
    Collect the files that need to be converted.

    For every table-of-contents entry both the referenced file and its
    ``-extracted`` sibling are looked up with ``find(name, f)``; every
    lookup that does not return ``None`` is added to the result.

    *f* epub file name without extension
    """
    book = epub.read_epub(f + '.epub')
    found = []
    for entry in book.toc:
        href = entry.href
        debug(href)
        debug(f)

        # Split at the last dot so '-extracted' can go before the extension.
        dot = href.rfind('.')
        stem, suffix = href[:dot], href[dot:]
        plain_name = stem + suffix
        extracted_name = stem + '-extracted' + suffix
        debug(plain_name)
        debug(extracted_name)

        plain_hit = find(plain_name, f)
        extracted_hit = find(extracted_name, f)
        debug(plain_hit)
        debug(extracted_hit)

        for hit, note in ((plain_hit, "Adding file"),
                          (extracted_hit, "Adding -extracted file")):
            if hit is not None:
                debug(note)
                found.append(hit)
    return found
示例#7
0
def parse_book(filename, chapter_title_marks, names):
    """Group the items of an EPUB by the chapter title found in their content.

    :param filename: path of the EPUB file.
    :param chapter_title_marks: ``(pre, post)`` markers that enclose a
        chapter title inside an item's raw content.
    :param names: titles that occur repeatedly; their chapters are numbered
        1, 2, ... while every other title is stored under number 0.
    :return: ``(chapters, css)`` where ``chapters[title][number]`` is a dict
        with the keys ``'name'`` and ``'content'`` and ``css`` is the item
        with id ``'css'``.
    """
    pre, post = chapter_title_marks
    delta = len(pre)

    book = epub.read_epub(filename)

    # Running counter for the name-numbered chapters.
    name_count = {n: 0 for n in names}

    chapters = {}
    for item in book.get_items():
        raw = item.content
        start = raw.find(pre)
        # Skips items without the marker (-1) and also a marker at offset 0.
        if start <= 0:
            continue

        end = raw[start:].find(post)
        title = raw[start + delta:start + end].lower().replace(
            b'\xe2\x80\x99', b"'")

        if title in names:
            # Name + number chapter: create the bucket on first sight only.
            if name_count[title] == 0:
                chapters[title] = {}
            name_count[title] += 1
            number = name_count[title]
        else:
            # Any other chapter always gets a fresh bucket and number 0.
            chapters[title] = {}
            number = 0

        chapters[title][number] = {
            'name': item.get_name(),
            'content': b'<html>' + item.get_body_content() + b'</html>',
        }

    css = book.get_item_with_id('css')

    return chapters, css
def GetEPubHTML(epub_path, processingFunction=(lambda book_text: book_text)):
    """Return the book's text as a list with one string per non-empty document.

    A full stop is appended to every plain-string heading (h1-h6) before
    *processingFunction* is applied to the parsed document; blank lines are
    removed from the extracted text.
    """
    book = epub.read_epub(epub_path)
    heading_tags = [f'h{level}' for level in range(1, 7)]
    chapters = []

    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_content(), features="lxml")

        # Terminate every heading with a full stop.
        for heading in soup.find_all(heading_tags):
            if isinstance(heading.string, str):
                heading.string = heading.string + "."

        # Book-specific processing hook.
        soup = processingFunction(soup)

        chapter_text = soup.get_text().strip()
        if not chapter_text:
            # Nothing left in this document.
            continue

        kept_lines = [
            line for line in chapter_text.split('\n') if line.strip() != ''
        ]
        chapters.append("\n".join(kept_lines))

    return chapters
示例#9
0
def read_book(file):
    """Walk the items of an EPUB, report each one and upload selected types.

    Items of type 1, 2, 4 and 9 are printed with a label and passed to
    ``put_bucket`` together with a fixed content type; items of type 3 and
    5-8 are only printed. Any other type is ignored.
    """
    # item type -> (message template, content type handed to put_bucket)
    uploads = {
        1: ("Image {}", 'image/jpeg'),
        2: ("Style {}", 'text/css'),
        4: ("TOC {}", 'text/ncx'),
        9: ("HTML {}", 'text/html'),
    }
    report_only = (3, 5, 6, 7, 8)

    book = epub.read_epub(file)
    for items in book.get_items():
        kind = items.get_type()
        if kind in uploads:
            label, content_type = uploads[kind]
            print(label.format(items.get_name()))
            put_bucket(file, content_type, items.get_name(),
                       items.get_content())
        elif kind in report_only:
            print("Type={} {}".format(kind, items.get_name()))
示例#10
0
def parse_epub(filename: str, abbr: bool, code: bool):
    """
    Parse an epub file into one plain-text string.

    Only documents whose name matches ``REGEX_CHAPITRE`` are cleaned (via
    ``clean_epub_item``); their texts are joined with three newlines, then
    passed through ``filter_numbers`` and ``maybe_normalize``. When *abbr* is
    true the accumulated abbreviation counts are printed.
    """
    book = epub.read_epub(filename)
    title = book.get_metadata('DC', 'title')[0][0]
    # Some titles require hashtags to be stripped while cleaning.
    remove_hashtags = title in TITLES_REMOVE_HASHTAGS
    print('\nParsing book "{0}"'.format(title))

    chapter_texts = []
    abbr_totals = Counter()
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        name = item.get_name()
        if re.match(REGEX_CHAPITRE, name):
            print('...Parsing {0}'.format(name))
            plaintext, abbrs = clean_epub_item(item, abbr, code,
                                               remove_hashtags)
            chapter_texts.append(plaintext)
            abbr_totals += Counter(abbrs)
        else:
            print('...Ignoring {0}'.format(name))

    # Join the chapters, replace numbers, then normalise.
    book_plaintext = maybe_normalize(
        filter_numbers('\n\n\n'.join(chapter_texts)))
    if abbr:
        print('Abbreviation counts:\n{0}'.format(abbr_totals.items()))
    return book_plaintext
示例#11
0
 def process_file(cls, filename):
     """Parse the first document item of an EPUB and hand it to ``cls.process``.

     Returns the result of ``cls.process(ebook, soup, item, filename)`` for
     the first item of type ``ITEM_DOCUMENT``, or ``None`` when the book
     contains no such item.
     """
     ebook = epub.read_epub(filename)
     documents = (
         entry for entry in ebook.get_items()
         if entry.get_type() == ebooklib.ITEM_DOCUMENT
     )
     first_document = next(documents, None)
     if first_document is None:
         return None
     soup = BeautifulSoup(first_document.get_content(), features="lxml")
     return cls.process(ebook, soup, first_document, filename)
示例#12
0
def main(argv):
    """Replace the cover of an EPUB with a JPEG image.

    Expects exactly two positional arguments, the EPUB file and the JPEG
    file; ``-h`` prints the usage text. The EPUB is rewritten in place.

    :param argv: command-line arguments without the program name.
    """
    # getopt
    try:
        opts, args = getopt.getopt(argv, "h")
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    # handle options
    for opt, _ in opts:
        if opt == '-h':
            usage()
            sys.exit()
    if len(args) != 2:
        usage()
        sys.exit()
    epub_fname, jpg_fname = args
    check_file(epub_fname, ".epub")
    check_file(jpg_fname, ".jpg")

    book = epub.read_epub(epub_fname)
    # The context manager closes the image file even if reading fails.
    with open(jpg_fname, 'rb') as f:
        content = f.read()
    book.set_cover('cover.jpg', content)
    epub.write_epub(epub_fname, book, {})
def epub2thtml(epub_path):
    """Return the content of every ``ITEM_DOCUMENT`` item of an EPUB as a list."""
    book = epub.read_epub(epub_path)
    return [
        item.get_content()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
示例#14
0
def get_text(file_path: str):
    """Extract the text of a document as one string without newlines.

    The handler is chosen by file extension: ``.epub``, ``.docx`` and
    ``.pdf`` use their respective libraries, ``.mobi`` is rejected, and
    everything else is read as UTF-8 text (with a warning unless the
    extension is ``.txt``).

    :param file_path: path of the file to read.
    :return: the stripped text with all newlines removed, or ``None`` for
        unsupported ``.mobi`` files.
    """
    extension = os.path.splitext(file_path)[1]
    text = ""

    if extension == ".epub":
        book = epub.read_epub(file_path)
        text += "".join(
            doc.get_content().decode("utf-8")
            for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

    elif extension == ".docx":
        text += docx2txt.process(file_path)

    elif extension == ".mobi":
        print("[!] Unsupported file: " + file_path)
        return None

    elif extension == ".pdf":
        # Keep the file open only while the pages are being read.
        with open(file_path, "rb") as pdf_file:
            read_pdf = PyPDF2.PdfFileReader(pdf_file)
            text += "".join(
                read_pdf.getPage(x).extractText()
                for x in range(read_pdf.getNumPages()))
    else:
        if not extension == ".txt":
            print("[!] Unkown file type: " + file_path +
                  ", processing raw text...")

        with open(file_path, "r", encoding="utf-8") as text_file:
            text += text_file.read()

    return text.strip().replace('\n', '')
示例#15
0
def import_exist_corpus(request):
    """Import the sentences of an uploaded .txt/.epub file into a corpus.

    The request data must carry ``corpus_id`` and ``corpus_file``. The file
    text is split into sentences with a language-specific ``Preprocessor``;
    each preprocessed sentence is saved through ``CorpusContentSerializer``
    when it validates.

    Responses: 200 with the serializer data on success (and on
    ``IntegrityError``), 400 for invalid data, file type, language or
    ``ValueError``, 404 when the corpus does not exist for the user.
    """
    try:
        payload = request.data
        serializer = ImportCorpusSerializer(data=payload)
        corpus_id = payload["corpus_id"]
        if not serializer.is_valid():
            return Response(serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)

        corpus = Corpus.objects.get(pk=corpus_id, user=request.user.id)

        # Only .txt and .epub uploads are accepted.
        lowered_name = request.FILES['corpus_file'].name.lower()
        is_epub = lowered_name.endswith('.epub')
        if not (is_epub or lowered_name.endswith('.txt')):
            return Response({"detail": "Invalid file type"},
                            status=status.HTTP_400_BAD_REQUEST)

        if is_epub:
            book = epub.read_epub(payload["corpus_file"])
            pieces = [
                BeautifulSoup(item.content, 'html5lib').get_text()
                for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
            ]
            content = ''.join(pieces).replace('\n', '\r\n')
        else:
            content = payload["corpus_file"].read().decode("utf-8")

        # Pick the preprocessor that matches the corpus language.
        if corpus.language == settings.VIETNAMESE:
            p = Preprocessor(Language.vietnamese)
        elif corpus.language == settings.ENGLISH:
            p = Preprocessor(Language.english)
        else:
            return Response({"detail": "Invalid Language"},
                            status=status.HTTP_400_BAD_REQUEST)

        for raw_sentence in p.segment_to_sentences(content):
            sentence_serializer = CorpusContentSerializer(
                data={
                    "phrase": p.preprocess(raw_sentence),
                    "corpus": corpus_id
                })
            # Sentences that fail validation are skipped silently.
            if sentence_serializer.is_valid():
                sentence_serializer.save()

        return Response(serializer.data, status=status.HTTP_200_OK)
    except Corpus.DoesNotExist:
        return Response({"detail": "corpus_id not found"},
                        status=status.HTTP_404_NOT_FOUND)
    except IntegrityError:
        return Response(serializer.data, status=status.HTTP_200_OK)
    except ValueError:
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def text_from_ebook(fin, *, skip_last=False):
    """ Extract the text of an ebook, one stripped non-empty line per row.
        The last document can be left out because it is often a note such
        as 'Thank you for purchasing this eBook'.

    :param fin: input file in eBook format (e.g. ePub)
    :param skip_last: skip the last document in the file
    :return: the extracted text, terminated by a newline
    """
    book = epub.read_epub(fin)
    docs = list(book.get_items_of_type(ITEM_DOCUMENT))
    if skip_last:
        docs = docs[:-1]

    texts = []
    for doc in docs:
        soup = bs(doc.content, 'lxml')
        stripped = (raw.strip() for raw in soup.get_text().split('\n'))
        lines = [line for line in stripped if line]
        if lines:
            texts.append('\n'.join(lines))

    return '\n'.join(texts) + '\n'
示例#17
0
文件: epub.py 项目: xeenypl/ebookCat
def epubRead(fname):
    """Print every HTML document of an EPUB converted to plain text.

    Each document is preceded by a line of 80 ``=`` characters.

    :param fname: path of the EPUB file to print.
    """
    # Read the file that was passed in rather than sys.argv[1].
    book = epub.read_epub(fname)
    h = html2text.HTML2Text()
    for item in book.items:
        if isinstance(item, epub.EpubHtml):
            print("=" * 80)
            print(h.handle(item.content.decode("utf-8")))
示例#18
0
def readEpub(eBook):
    """Build, print and return a one-line summary of an EPUB's metadata.

    The summary concatenates, in this order: every title and every
    publisher (each followed by a space), the part of every date before the
    first ``-`` (followed by a space), and every creator (no separator).

    :param eBook: path of the EPUB file.
    :return: the summary string.
    """
    book = epub.read_epub(eBook)

    parts = []
    # Titles are accumulated like the other fields, so a book with several
    # title entries keeps all of them rather than only the last one.
    for field in ('title', 'publisher'):
        parts.extend(item[0] + " " for item in book.get_metadata('DC', field))

    # Only the leading component of the date (before the first '-') is kept.
    parts.extend(item[0].split("-")[0] + " "
                 for item in book.get_metadata('DC', 'date'))

    parts.extend(item[0] for item in book.get_metadata('DC', 'creator'))

    returnData = "".join(parts)

    # Print our return data for sanity checking
    print(returnData)

    return returnData
示例#19
0
    def epubHandler(self):
        """Unpack the current EPUB and load its first chapter into the viewer.

        The file ``self.fileName`` is copied to ``temp.zip`` and extracted
        into ``tempDir``. For every document item a path is appended to
        ``self.chapterList``; the first one is loaded into
        ``self.web_widget`` and all of them are offered in ``self.combo``.
        """
        self.clearData()

        # Copy and rename the epub to zip for extraction; the context manager
        # closes the archive even if extraction fails.
        shutil.copyfile(self.fileName, 'temp.zip')
        with zipfile.ZipFile('temp.zip', 'r') as zip_ref:
            zip_ref.extractall('tempDir')

        bookRead = epub.read_epub(self.fileName)
        # Parse document files from the epub file.
        for chapter in bookRead.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            # The file location is taken from the item's string form, which
            # is expected to consist of exactly three ':'-separated fields;
            # the last character of the third field is dropped.
            _, _, third = str(chapter).split(':')
            third = third[:-1]
            if third.endswith(('.htm', '.xml', '.xhtml')):
                # Convert to .html. Split at the LAST dot only, so names
                # containing additional dots do not raise ValueError.
                name = third.rsplit('.', 1)[0]
                src = os.getcwd() + "/" + "tempDir" + "/" + third
                dest = os.getcwd() + "/" + "tempDir" + "/" + name + ".html"
                os.rename(src, dest)
                self.chapterList.append(dest)
            else:
                # Strip the 5-character extension from the book's file name.
                file = self.fileName[:-5]
                self.chapterList.append(file + '/' + third)

        # Load first page
        self.web_widget.load(QUrl.fromLocalFile(self.chapterList[0]))
        # Set navigation dropdown options
        self.combo.addItems(self.chapterList)
        self.currentTextIndex = 0
示例#20
0
    def __init__(self, input_book_name, font_charset_name):
        """Prepare font renderers, character sets and the working directory.

        :param input_book_name: path of the EPUB to process.
        :param font_charset_name: path of a pickle file holding a mapping
            from font name to the characters that font supports.
        """
        # Font renderers
        pygame.freetype.init()
        # key: font name, value: renderer
        self.fonts_render = {}
        for font in EpubFilter.font_list[1:]:
            self.fonts_render[font] = pygame.freetype.Font(font, 50)
            if 'Medium' not in font:
                # Fonts without a built-in bold weight are emboldened
                # slightly, which suits the Kindle screen better.
                self.fonts_render[font].strong = True
                self.fonts_render[font].strength = 1/36  # default is 1/36 and the "bold" is 1/12
        # Character sets of all fonts currently supported by the Kindle.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load charset files from a trusted source.
        with open(font_charset_name, 'rb') as charset_file:
            self.font_charset_map = pickle.load(charset_file)
        # Character set supported by the Kindle (first entry of font_list).
        self.kindle_charset = self.font_charset_map[EpubFilter.font_list[0]]

        # Rare characters and the image name that corresponds to each.
        self.char_image_map = {}
        # The epub currently being processed.
        self.book = epub.read_epub(input_book_name)
        self.font_image_dir = 'font_image'
        self.new_css_filename = 'uncommon_word.css'
        # Working directory "<book name>_temp" next to the input book.
        self.temp_dirctory = os.path.join(
            os.path.dirname(input_book_name),
            os.path.splitext(os.path.basename(input_book_name))[0] + "_temp")
        if not os.path.exists(self.temp_dirctory):
            os.makedirs(self.temp_dirctory)
示例#21
0
 async def read_book(self, book_file):
     """Read an EPUB and store the text lines of each document.

     For every document item, ``self._parsed_book[<item name>]`` receives
     the list of lines produced by feeding the item's content through
     ``MyHTMLParser`` and splitting after sentence-ending full stops.
     Everything is also written to the debug log.
     """
     sentence_break = re.compile(r'\.\s+')
     escaped_quote = re.compile(r"\\'")
     rule = '=================================='

     self._book = epub.read_epub(book_file)
     for item in self._book.get_items():
         if item.get_type() != ebooklib.ITEM_DOCUMENT:
             continue

         name = str(item.get_name())
         self._parsed_book[name] = list()
         logger.debug(rule)
         logger.debug('NAME : ' + name)
         logger.debug('----------------------------------')

         # NOTE(review): str() is applied to the raw content, not a decode;
         # the escape clean-up below appears to compensate for that.
         content = str(item.get_content())
         logger.debug(content)

         parser = MyHTMLParser()
         parser.feed(content)
         for fragment in parser.get_result():
             fragment = sentence_break.sub('.\n', fragment)
             fragment = escaped_quote.sub("'", fragment)
             for line in fragment.split("\n"):
                 # lstrip removes any leading backslash and 'n' characters.
                 new_line = line.lstrip("\\n").rstrip()
                 self._parsed_book[name].append(new_line)
                 logger.debug(new_line)
         logger.debug(rule)

     logger.debug("Book:")
     logger.debug(str(self._parsed_book))
     logger.debug(rule)
示例#22
0
def extract_docs(working_dir):
    """Extract the dictionary terms listed in the EPUB's table of contents.

    Reads ``<working_dir>/dictionaries/A Dictionary of Computer Science.epub``,
    takes the document named ``Text/part0001.xhtml``, collects the text of
    every element with class ``toc2`` and writes the terms, one per line, to
    ``<working_dir>/dictionaries/docs_terms.txt``.

    :param working_dir: base directory that contains ``dictionaries/``.
    """
    input_path = working_dir + '/dictionaries/A Dictionary of Computer Science.epub'
    book = epub.read_epub(input_path)
    toc_content = None

    for doc in book.get_items():
        if 'Text/part0001.xhtml' == doc.get_name():
            toc_content = doc.get_content()
            break

    toc_tree = html.fromstring(toc_content)

    terms = []
    toc2s = toc_tree.find_class('toc2')
    total = len(toc2s)

    for current, el in enumerate(toc2s):
        print_progress_bar('  Extracing Oxford CS dictionary...', current,
                           total)
        # append, not +=: extending a list with a string would add the term
        # one character at a time.
        terms.append(el.text_content().strip())

    docs_terms_filename = working_dir + '/dictionaries/docs_terms.txt'
    with open(docs_terms_filename, 'w') as f:
        f.write('\n'.join(terms))

    # Final update so the bar shows completion.
    print_progress_bar('  Extracing Oxford CS dictionary...', total, total)
    print()
示例#23
0
def is_epub(file):
    """Return True when *file* can be read as an EPUB, False on EpubException."""
    try:
        epub.read_epub(file)
        return True
    except EpubException:
        return False
示例#24
0
    def generate_html_chunks(self):
        """Yield the HTML content of the source, chunk by chunk.

        For raw HTML the whole input stream is yielded once. For an ePub the
        title and first creator are stored on the instance and the content
        of every document item is yielded in turn.
        """
        assert self._type in [CONTENT_HTML, CONTENT_EPUB]

        if self._type == CONTENT_HTML:
            log.debug('Reading raw HTML from %s', self._url)
            yield self._istream.read()
            return

        if self._type == CONTENT_EPUB:
            log.debug('Reading ePub from %s', self._url)

            reader = lazygen.BufferedRandomReader(self._istream)
            book = epub.read_epub(reader)

            self._title = book.title
            self._author = ''

            creators = book.get_metadata('DC', 'creator')
            if creators:
                self._author = creators[0][0]

            for doc_item in book.get_items_of_type(ITEM_DOCUMENT):
                yield doc_item.content
            return

        # NOTE(review): the assert above rejects every type except HTML and
        # ePub, so this branch cannot be reached while assertions are enabled.
        if self._type == CONTENT_PDF:
            yield ''
示例#25
0
def epub2html(epub_path):
    """Return the UTF-8 decoded content of all document items as one string."""
    book = epub.read_epub(epub_path)
    return ''.join(
        item.get_content().decode("utf-8")
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    )
示例#26
0
 def __init__(self, book_path, graph_path, distance):
     """Load the EPUB, initialise the base class with its text, store *distance*.

     :param book_path: path of the EPUB file; also passed to the base class.
     :param graph_path: passed through to the base class unchanged.
     :param distance: stored on the instance as ``self.distance``.
     """
     self.book_path = book_path
     self.book = epub.read_epub(self.book_path)
     # The text is extracted first because the base initialiser takes it.
     text = epub_utils.get_text(self.book)
     super().__init__(book_path, graph_path, text)
     # self.distance is assigned before the partition is computed.
     # NOTE(review): __get_names_by_portion__ presumably reads self.distance — confirm.
     self.distance = distance
     self.distance_partition = self.__get_names_by_portion__()
示例#27
0
def readEPUB(filename):
    """Read an EPUB attachment and return the markup of its HTML documents.

    :param filename: path of the EPUB file.
    :return: the UTF-8 decoded content of every ``EpubHtml`` item,
        concatenated in item order.
    """
    book = epub.read_epub(filename)
    # get_items_of_type() expects an item-type constant, not a class, so the
    # HTML items are selected by class here; their content is bytes and must
    # be decoded before it can be joined into a str.
    return ''.join(
        item.get_content().decode('utf-8')
        for item in book.get_items()
        if isinstance(item, epub.EpubHtml)
    )
示例#28
0
def read_meta_epub(epub_name):
    """Flatten the metadata of an EPUB into a plain dictionary.

    Calibre entries (namespace ``'calibre'`` or ``CALIBRE_META``) contribute
    their ``content`` attribute. Entries of the ``ELEMENTS_META`` namespace
    are stored as a single value when there is exactly one (except for
    ``subject``), otherwise as a list; ``identifier`` becomes a dict keyed
    by the lower-cased scheme/id attribute. ``meta_type`` is set to ``'opf'``.
    """
    metadata = epub.read_epub(epub_name).metadata
    meta = {}

    calibre_ns = 'calibre' if 'calibre' in metadata else CALIBRE_META
    if calibre_ns in metadata:
        for key, item in metadata[calibre_ns].items():
            meta[key] = item[0][1]['content']

    for key, val in metadata[ELEMENTS_META].items():
        if key == 'identifier':
            identifier = {}
            for iden in val:
                attrs = iden[1]
                iden_key = DOC_KEY if DOC_KEY in attrs else 'id'
                identifier[attrs[iden_key].lower()] = iden[0]
            meta[key] = identifier
        elif len(val) == 1 and key not in ('subject', 'identifier'):
            meta[key] = val[0][0]
        else:
            meta[key] = [value[0] for value in val if len(value) > 0]

    meta['meta_type'] = 'opf'
    return meta
示例#29
0
def updateSingleBookHelper(file):
    """Unpack one EPUB under ./static/<book id> and register it in the database.

    The book id is ``file[8:-5]``. Every ``.xhtml`` file reported by
    ``getFiles`` is renamed to ``<n>.xhtml`` (n counting from 0), a record
    built from the EPUB metadata is inserted into ``bkdb``, and the chapter
    data is generated with ``getChapterForBkid``.
    """
    bookid = file[8:-5]
    book_dir = './static/' + bookid
    unzip(file, book_dir)

    # Number the extracted pages 0.xhtml, 1.xhtml, ...
    i = 0
    for i, page in enumerate(getFiles(book_dir, '.xhtml'), start=1):
        os.rename(page, book_dir + '/' + str(i - 1) + '.xhtml')

    book = epub.read_epub(file)
    booktodb = {
        "bkname": book.get_metadata('DC', 'title')[0][0],
        "bkauthor": book.get_metadata('DC', 'creator')[0][0],
        "bkclass": '类型',
        "bkstate": '完结',
        "bkstar": 5,
        # The description is taken from the last renamed page.
        "bkinfo": getbkinfo(book_dir + '/' + str(i - 1) + '.xhtml'),
        "bkimg": '/static/' + bookid + '/cover.jpg',
        "bkid": int(bookid),
        "url": '/static/' + bookid + '/0.xhtml',
        'bkviewnum': 0,
        'bksize': i - 2
    }
    bkdb.insert_one(booktodb)

    # Attach the chapter names and chapter URLs that belong to this book.
    getChapterForBkid(int(bookid))

    print("thread mession completed.")
 def extract_metadata(self, epub_filename):
     '''
     Copy the Dublin Core metadata of an EPUB onto this object.

     The first value of each available field is stored as an attribute;
     most attributes carry the field's own name, three are renamed
     (language, type, format). Fields that are missing are skipped.
     '''
     self.filepath = epub_filename
     epub_file = epub.read_epub(self.filepath)

     same_name_fields = ('creator', 'title', 'subject', 'source', 'rights',
                         'relation', 'publisher', 'identifier',
                         'description', 'coverage', 'contributor', 'date')
     renamed_fields = (('original_language', 'language'),
                       ('epub_type', 'type'),
                       ('epub_format', 'format'))

     # (attribute name, DC field name) pairs, in assignment order.
     targets = [(field, field) for field in same_name_fields]
     targets.extend(renamed_fields)

     for attribute, metadata_field in targets:
         try:
             value = epub_file.get_metadata('DC', metadata_field)[0][0]
             setattr(self, attribute, value)
         except (IndexError, AttributeError):
             # Field absent (or not settable): leave the attribute alone.
             continue
示例#31
0
def eupub_to_chapters():
    """Parse a .epub file and return an array of strings
    corresponding to the chapters of the epub.

    The file is taken from the module-level ``EPUB_FILEPATH``. All document
    items are decoded as UTF-8, concatenated and split into chapters by
    ``convert_utils.get_all_chapters``.

    Returns:
        list(str): A list of string objects, each of which corresponds to
        a chapter of the .epub file, i.e. ["chapter 1 text", "chapter 2 text", ...]
    """
    path = os.path.abspath(EPUB_FILEPATH)
    print(path)
    book = epub.read_epub(path)

    full_text = "".join(
        text.get_content().decode("utf-8")
        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    )
    full_text_chapters = convert_utils.get_all_chapters(full_text)
    print("{} Chapters found in ebook".format(len(full_text_chapters)))

    # NOTE: For debugging only - Write the text contents to file
    if OUTPUT_EPUB_TEXT_TO_TEST_FILE:
        # Explicit UTF-8 so non-ASCII book text does not depend on the
        # platform default; `with` closes the file even if a write fails.
        with open("chapter_test.txt", "w", encoding="utf-8") as f:
            for i, chapter in enumerate(full_text_chapters):
                # For each chapter, prepend `!!Chapter{Chapter #}{newline}{chapter text}`
                # for readability
                f.write("!!Chapter {}:\n{}".format(i+1, chapter))

    return full_text_chapters
示例#32
0
    def __init__(self, bk):
        """Load the text of a book file, chosen by the file-name ending.

        Sets ``self.title`` (file name up to its first dot) for every input.
        For names ending in ``epub`` it sets ``author``, ``pages`` and
        ``text`` (list of lines); for ``pdf`` it sets ``pages`` and ``text``
        (one string per page); for ``docx``/``doc`` it sets ``text`` to a
        one-element list.

        :param bk: path of the book file, using ``/`` as separator.
        """
        b = bk.split("/")[-1]
        # Cut at the first dot; keep the whole name when there is no dot
        # (slicing with find()'s -1 would drop the last character).
        dot = b.find(".")
        self.title = b[:dot] if dot != -1 else b

        if bk.endswith("epub"):
            print("EPUB file detected")
            self.author = ''
            self.pages = epub.read_epub(bk).pages
            final = []
            for text in epub2text(bk):
                final.extend(text.split("\n"))
            self.text = final

        if bk.endswith("pdf"):
            print("PDF file detected")
            # The context manager closes the file once all pages are read.
            with open(bk, 'rb') as book:
                pdfReader = PyPDF2.PdfFileReader(book)
                self.pages = pdfReader.numPages
                self.text = [
                    pdfReader.getPage(num).extractText()
                    for num in range(0, self.pages)
                ]

        if bk.endswith("docx") or bk.endswith("doc"):
            out = docx2txt.process(bk)
            self.text = [out]
示例#33
0
文件: main.py 项目: TTWNO/epub-pinyin
def find_epub_files(f):
    """
    Collect the files that need to be converted.

    For every table-of-contents entry both the referenced file and its
    ``-extracted`` sibling are looked up with ``find(name, f)``; every
    lookup that does not return ``None`` is added to the result.

    *f* epub file name without extension
    """
    book = epub.read_epub(f + '.epub')
    found = []
    for entry in book.toc:
        href = entry.href
        debug(href)
        debug(f)

        # Split at the last dot so '-extracted' can go before the extension.
        dot = href.rfind('.')
        stem, suffix = href[:dot], href[dot:]
        plain_name = stem + suffix
        extracted_name = stem + '-extracted' + suffix
        debug(plain_name)
        debug(extracted_name)

        plain_hit = find(plain_name, f)
        extracted_hit = find(extracted_name, f)
        debug(plain_hit)
        debug(extracted_hit)

        for hit, note in ((plain_hit, "Adding file"),
                          (extracted_hit, "Adding -extracted file")):
            if hit is not None:
                debug(note)
                found.append(hit)
    return found
示例#34
0
 def extract(self, filename, **kwargs):
     """Return the concatenated text of every document item in an EPUB."""
     book = epub.read_epub(filename)
     texts = [
         BeautifulSoup(item.content, 'lxml').text
         for item in book.get_items()
         if item.get_type() == ITEM_DOCUMENT
     ]
     return "".join(texts)
示例#35
0
def load_epub(filename: str) -> ebooklib.epub.EpubBook:
	"""Read an EPUB, exiting the program with a message when that fails.

	A missing file, a directory or an invalid EPUB each terminate the
	process through ``sys.exit`` with a specific message.
	"""
	try:
		book = epub.read_epub(filename)
	except FileNotFoundError:
		sys.exit('File not found: ' + filename)
	except IsADirectoryError:
		sys.exit('File is directory:: ' + filename)
	except ebooklib.epub.EpubException:
		sys.exit('File is not valid epub: ' + filename)
	else:
		return book
示例#36
0
 def _initialise(self):
     """Fetch the main page and, unless forced, load the existing EPUB.

     Exits with status 1 when the main page request does not return an OK
     status code. ``self.book`` is set to ``None`` when ``self.force`` is
     true; an ``AttributeError`` while loading is ignored.
     """
     self.metadata = Metadata(self.url)
     response = self.session.get(self.url)
     if response.status_code != codes.ok:
         exit(1)
     self.main_page = BeautifulSoup(response.content, "html5lib")
     try:
         if self.force:
             self.book = None
         else:
             self.book = epub.read_epub(self.filename)
     except AttributeError:
         pass
示例#37
0
 def extract(self, filename, **kwargs):
     """Return the text of an EPUB in spine order, one selected element per line.

     Only title, p, div and h1-h4 elements contribute; each element's text
     is followed by a newline.
     """
     book = epub.read_epub(filename)
     wanted_tags = ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']
     chunks = []
     for item_id, _ in book.spine:
         soup = BeautifulSoup(book.get_item_with_id(item_id).content, 'lxml')
         chunks.extend(child.text + '\n' for child in soup.find_all(wanted_tags))
     return ''.join(chunks)
示例#38
0
def epub2txt(src, dst):
	"""Write the text of all document items of EPUB *src* to *dst*.

	Every newline in the extracted text is replaced by ``\\r\\n`` before the
	text is written as UTF-8.
	"""
	book = epub.read_epub(src)
	pieces = [
		BeautifulSoup(item.content, 'html5lib').get_text()
		for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
	]
	text = ''.join(pieces).replace('\n', '\r\n')
	with io.open(dst, 'w', encoding='utf-8') as out:
		out.write(text)
示例#39
0
def extract(filename, **kwargs):
    """Extract text from an EPUB using the Python epub library.

    Returns the concatenated text of all document items. ``kwargs`` is
    accepted but not used. No parser is named for BeautifulSoup, so it
    picks its own default.
    """
    chunks = [
        BeautifulSoup(entry.content).text
        for entry in epub.read_epub(filename).get_items()
        if entry.get_type() == ITEM_DOCUMENT
    ]
    return "".join(chunks)
示例#40
0
    def get_default_title(self, temp_file, ext):
        """Pick a title for an imported book.

        The fallback is a translated "Imported Book <today>" string. For
        ``ext == 'epub'`` the first Dublin Core title found in the file's
        metadata is returned instead; any error while looking it up falls
        back to the default. Errors from reading the file itself propagate.
        """
        fallback = _('Imported Book %(date)s') % dict(date=datetime.date.today())
        if ext != 'epub':
            return fallback

        epub_book = epub.read_epub(temp_file)
        try:
            return epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
        except Exception:
            return fallback
示例#41
0
	def custom_epub_create(self, custom_epub, user):
		"""Write a copy of this book's EPUB, stamped for ``user``, to ``custom_epub``.

		When ``self.status`` is 5 the finished ``OCR/<ISBN>.epub`` is read and
		its book info refreshed; otherwise a temporary EPUB is first assembled
		from the cleaned files of every part. In both cases the identifier is
		set to ``user.username`` before writing. Any failure is re-raised as
		``SystemError``. Returns ``custom_epub``.
		"""
		self.check_status()
		# Prepare the epub file.
		from ebooklib import epub
		from utils.epub import txt2epub, html2epub, add_bookinfo
		meta = self.book_info
		info = dict(
			ISBN=meta.ISBN,
			bookname=meta.bookname,
			author=meta.author,
			date=str(meta.date),
			house=meta.house,
			language='zh',
		)

		if self.status != 5:
			# Not final yet: build a temporary epub from the individual parts.
			final_epub = self.path +'/temp/{0}.temp'.format(self.ISBN)
			final_dir = os.path.dirname(final_epub)
			if not os.path.exists(final_dir):
				os.mkdir(final_dir)
			try:
				part_list = []
				for part in self.ebook_set.all().order_by('part'):
					part_list.append(part.get_clean_file())
				html2epub(part_list, final_epub, **info)
				book = epub.read_epub(final_epub)
				book.set_identifier(user.username)
				epub.write_epub(custom_epub, book, {})
			except BaseException as e:
				raise SystemError('epub create fail (not final):' +unicode(e))
			return custom_epub

		final_epub = self.path +'/OCR/{0}.epub'.format(self.ISBN)
		try:
			book = add_bookinfo(epub.read_epub(final_epub), **info)
			book.set_identifier(user.username)
			epub.write_epub(custom_epub, book, {})
		except BaseException as e:
			raise SystemError('epub create fail:' +unicode(e))
		return custom_epub
示例#42
0
文件: __epub.py 项目: btimby/fulltext
    def handle_path(self, path):
        """Return the text of the EPUB at ``path``, following its spine order.

        The text of each title, paragraph, div and h1-h4 element is written
        to an in-memory buffer, one element per line.
        """
        wanted_tags = ['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4']
        buf = StringIO()
        book = epub.read_epub(path)

        for item_id, _ in book.spine:
            markup = book.get_item_with_id(item_id).content
            for node in BeautifulSoup(markup, 'lxml').find_all(wanted_tags):
                buf.write(node.text)
                buf.write(u'\n')

        return buf.getvalue()
示例#43
0
def get_url_from_file(file: Union[str, click.Path]) -> Union[str, None]:
    """Return the story URL stored in an EPUB, or ``None`` if there is none.

    The item with id ``"title"`` is searched first, falling back to the item
    with id ``"nav"``. Inside it, the first ``<a>`` under the element with
    id ``story-url`` (or under the whole page when that element is absent)
    supplies the ``href``.

    When the information is missing, an error line is appended to
    ``pyffdl.log`` and echoed to stderr, and ``None`` is returned. This
    covers a missing page (``AttributeError``), a page without any anchor
    (``IndexError``) and an anchor without ``href`` (``KeyError``).
    """
    book = epub.read_epub(file)
    title_page = book.get_item_with_id("title")
    if not title_page:  # if we're checking old-format ebook
        title_page = book.get_item_with_id("nav")
    try:
        parsed_text = BeautifulSoup(title_page.content, "html5lib")
        url = parsed_text.find(id="story-url")
        if not url:
            url = parsed_text
        # Calling a tag is shorthand for find_all(); the first anchor wins.
        return url("a")[0]["href"]
    except (AttributeError, IndexError, KeyError):
        error = f"File {file} doesn't contain requested information."
        with open("pyffdl.log", "a") as fp:
            click.echo(error, file=fp)
        click.echo(error, err=True)
        return None
示例#44
0
def read(filepath):
    """Return the processed paragraph text of an EPUB as one string.

    Every ``<p>`` element of every document item is collected, its text
    is UTF-8 encoded and passed through ``process_paragraph``, and the
    results are joined with newlines.
    """
    ebook = epub.read_epub(filepath)

    # We're interested in document items in the ebook
    paragraph_tags = []
    for item in ebook.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        html = preprocess_html(item.get_content())
        paragraph_tags.extend(BeautifulSoup(html, 'html.parser').find_all('p'))

    processed = [
        process_paragraph(tag.text.encode('utf-8'))
        for tag in paragraph_tags
    ]
    return '\n'.join(str(p) for p in processed)
 def __init__(self, book_location):
     """Load the EPUB at ``book_location`` and keep its chapters and title."""
     book = epub.read_epub(book_location)
     # Filter out pictures and the like: keep only items exposing is_chapter.
     chapters = []
     for entry in book.items:
         if 'is_chapter' in dir(entry):
             chapters.append(entry)
     self.chapters = chapters
     self.title = book.title
示例#46
0
    def _import_old_epub(self, lyrics_path):
        """Import songs from the old-format EPUB into per-song JSON files.

        Does nothing when ``self.epubs_page.old_epub`` is falsy. Otherwise,
        for every ``EpubHtml`` item whose ``<title>`` looks like
        ``"<number> <title>"``, the children of ``div.pGroup`` are turned into
        a list of markers (``{'name': ..., 'text': ...}``) and written with
        the title to ``<lyrics_path>/<number>.json``.
        """
        # "<digit>. rest of line" starts a new verse. Only ONE digit is
        # matched, so a line starting with "10." would not match.
        new_verse_pattern = re.compile(r"^\s*(\d)\.\s+(.+)$")
        # "<number> <title>" as found in the document <title>.
        no_and_title_pattern = re.compile(r"^\s*(\d+)\s+(.+)$")

        if not self.epubs_page.old_epub:
            return

        book = epub.read_epub(self.epubs_page.old_epub)

        for item in list(filter(lambda i: isinstance(i, epub.EpubHtml), book.items)):
            tree = parse_html_string(item.content).getroottree()

            titles = tree.xpath("//title/text()")
            if titles:
                title = titles[0]

                # Items whose title does not start with a song number are skipped.
                m = no_and_title_pattern.match(title)
                if m is None:
                    continue
                no, title = m.groups()

                # `markers` collects finished markers; `marker` is the one being built.
                markers = []
                marker = None

                for line_element in tree.xpath("//div[@class='pGroup']/*"):
                    if line_element.tag == 'p':
                        # Drop child tags one by one until none remain
                        # (presumably lxml's drop_tag, which keeps the text - confirm).
                        while line_element.getchildren():
                            line_element.getchildren()[0].drop_tag()

                        # NOTE(review): .text may be None for an empty <p>,
                        # which would make the match below raise - confirm.
                        line_text = line_element.text

                        m = new_verse_pattern.match(line_text)
                        if m is not None:
                            # A numbered line closes the previous marker and opens a new verse.
                            verse_no, line_text = m.groups()
                            if marker is not None:
                                markers.append(marker)

                            marker = {
                                'name': str(verse_no),
                                'text': line_text,
                            }
                        else:
                            # Continuation line of the current marker.
                            # NOTE(review): raises TypeError when this line comes
                            # before any verse/chorus, because marker is still None.
                            marker['text'] += "\n{}".format(line_text)

                    # NOTE(review): attrib['class'] raises KeyError for a non-<p>
                    # element without a class attribute.
                    elif "chorus" in line_element.attrib['class']:
                        if marker is not None:
                            markers.append(marker)

                        # The first child supplies the marker name (parentheses
                        # removed, capitalised); the remaining children are its lines.
                        marker = {
                            'name': line_element.getchildren()[0].text.strip().
                                    replace('(', '').replace(')', '').lower().capitalize(),
                            'text': "",
                        }

                        for chorus_line_element in line_element.getchildren()[1:]:
                            marker['text'] += "{}\n".format(chorus_line_element.text)

                        # Strip the trailing newline added by the loop above.
                        marker['text'] = marker['text'][:-1]

                # Flush the last marker.
                # NOTE(review): appends None when no marker was ever created.
                markers.append(marker)

                with open(os.path.join(lyrics_path, "{}.json".format(no)), "w") as f:
                    json.dump({
                        'title': title,
                        'markers': markers,
                    }, f, indent=2)
示例#47
0
    def _import_new_epub(self, lyrics_path):
        """Import songs from the new-format EPUB into per-song JSON files.

        Does nothing when ``self.epubs_page.new_epub`` is falsy. Otherwise,
        for every ``EpubHtml`` item that has an ``<h1><strong>`` title, the
        song number is taken from the first word of ``<head><title>``, the
        ``<li>`` verses under ``div.pGroup`` are turned into markers
        (``{'name': ..., 'text': ...}``) and, when any were found, written
        with the title to ``<lyrics_path>/<song_no>.json``.
        """
        if not self.epubs_page.new_epub:
            return

        book = epub.read_epub(self.epubs_page.new_epub)

        for item in filter(lambda i: isinstance(i, epub.EpubHtml), book.items):
            tree = parse_html_string(item.content).getroottree()

            title = tree.xpath("//h1/strong/text()")

            if title:
                title = title[0]

                try:
                    # A non-numeric first word raises ValueError, which skips
                    # the item (see the except clause at the bottom).
                    # NOTE(review): a missing <head><title> raises IndexError,
                    # which is NOT caught.
                    song_no = int(tree.xpath("//head/title/text()")[0].split(" ", 1)[0])

                    markers = []
                    marker = None

                    # Each <li> is one verse, numbered from 1 in document order.
                    for verse_no, verse_element in enumerate(tree.xpath("//div[@class='pGroup']/ol/li"), 1):
                        marker = {
                            'name': str(verse_no),
                            'text': '',
                        }
                        for line_element in verse_element.getchildren():
                            # <p> lines whose class does not contain 'se' are verse text.
                            if line_element.tag == 'p' and not 'se' in line_element.attrib.get('class', ''):
                                # Drop child tags one by one until none remain
                                # (presumably lxml's drop_tag, which keeps the text - confirm).
                                while line_element.getchildren():
                                    line_element.getchildren()[0].drop_tag()

                                line_text = line_element.text.strip()

                                marker['text'] += "{}\n".format(line_text)

                            # NOTE(review): attrib['class'] raises KeyError for an
                            # element without a class attribute.
                            elif "chorus" in line_element.attrib['class']:
                                # Close the marker in progress (minus its trailing
                                # newline) before starting the chorus.
                                if marker is not None:
                                    marker['text'] = marker['text'][:-1]
                                    markers.append(marker)

                                # The first child supplies the marker name; the
                                # remaining children are its lines.
                                marker = {
                                    'name': line_element.getchildren()[0].text.strip().
                                        replace('(', '').replace(')', '').lower().capitalize(),
                                    'text': "",
                                }

                                for chorus_line_element in line_element.getchildren()[1:]:
                                    marker['text'] += "{}\n".format(chorus_line_element.text)

                            else:
                                # Any other element starts a new, initially empty
                                # marker named after its own text.
                                if marker is not None:
                                    marker['text'] = marker['text'][:-1]
                                    markers.append(marker)

                                marker = {
                                    'name': line_element.text.strip().replace('(', '').replace(')',
                                                                                               '').lower().capitalize(),
                                    'text': "",
                                }

                        # Flush whatever marker is open at the end of this verse.
                        marker['text'] = marker['text'][:-1]
                        markers.append(marker)

                    # Nothing is written for items without any verse.
                    if markers:
                        with open(os.path.join(lyrics_path, "{}.json".format(song_no)), "w") as f:
                            json.dump({
                                'title': title,
                                'markers': markers,
                            }, f, indent=2)

                except ValueError:
                    pass
from ebooklib import epub

# Load one EPUB from a hard-coded path into the module-level name ``buch``.
buch = epub.read_epub('/media/nas/ebooks/A/Anderson, Peter - Survivor 1.06 - Der Baum des Lebens.epub')








示例#49
0
文件: EpubMiner.py 项目: alishir/MPJ
 def __init__(self, file_path):
     """Open the EPUB at ``file_path`` and prepare an empty aggregate body.

     ``pages`` holds the book's document items; ``aggregatd_body`` starts
     as an empty ``<body>`` element and ``is_aggregated`` as ``False``.
     """
     self.file_path = file_path
     book = epub.read_epub(file_path)
     self.book = book
     self.pages = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
     empty_body = BeautifulSoup('<body></body>', 'html.parser')
     self.aggregatd_body = empty_body
     self.is_aggregated = False
示例#50
0
def import_book_from_file(epub_file, user, **kwargs):
    """Import an EPUB file as a new book created for ``user``.

    Steps, in order:
      1. read the EPUB with the Tidy and Import plugins;
      2. flatten its table of contents into ``toc`` and remember each
         linked chapter's title;
      3. create the book and save every image as an attachment;
      4. save every chapter document as a ``Chapter``;
      5. rewrite links between imported chapters to their new URLs;
      6. create the ``BookToc`` entries with decreasing weight.

    Keyword arguments:
      book_title -- title to use instead of the EPUB's first DC title.
      book_url   -- passed on to ``create_book`` (default ``None``).

    Returns the created book.
    """
    import uuid

    from django.utils.timezone import utc
    from lxml import etree
    from ebooklib.utils import parse_html_string
    from .book import create_book

    opts = {'plugins': [TidyPlugin(), ImportPlugin()]}
    epub_book = epub.read_epub(epub_file, opts)

    # file name -> chapter title, as named by the table of contents
    chapters = {}
    # entries are (is_section, name, unique_id, parent_unique_id)
    toc = []

    def _parse_toc(elements, parent=None):
        """Walk the EPUB toc recursively, filling ``toc`` and ``chapters``."""
        for _elem in elements:
            # used later to get parent of an elem
            unique_id = uuid.uuid4().hex

            if isinstance(_elem, tuple):
                # (section, children): record the section, then recurse
                toc.append((1, _elem[0].title, unique_id, parent))
                _parse_toc(_elem[1], unique_id)
            elif isinstance(_elem, epub.Section):
                pass
            elif isinstance(_elem, epub.Link):
                _u = urlparse.urlparse(_elem.href)
                _name = urllib.unquote(os.path.basename(_u.path))
                if not _name:
                    _name = _elem.title

                # only the first link to a given file becomes a toc entry
                if _name not in chapters:
                    chapters[_name] = _elem.title
                    toc.append((0, _name, unique_id, parent))

    _parse_toc(epub_book.toc)

    epub_book_name = epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
    title = kwargs.get('book_title', epub_book_name)
    book_url = kwargs.get('book_url', None)

    # must check if title already exists
    book = create_book(user, title, book_url=book_url)
    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    for attach in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        att = models.Attachment(
            book=book,
            version=book.version,
            status=stat
        )

        s = attach.get_content()
        f = StringIO.StringIO(s)
        f2 = File(f)
        f2.size = len(s)
        att.attachment.save(attach.file_name, f2, save=False)
        att.save()
        f.close()

    # unquoted file name -> saved Chapter
    _imported = {}
    # TODO: ask about importing empty sections

    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not chap.is_chapter():
            continue

        # check if this chapter name already exists
        file_key = urllib.unquote(os.path.basename(chap.file_name))
        name = file_key
        content = chap.get_body_content()

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if name in chapters:
            name = chapters[name]
        else:
            # no toc title: derive one from the file name without extension
            name = _convert_file_name(name)
            if name.rfind('.') != -1:
                name = name[:name.rfind('.')]
            name = name.replace('.', '')

        chapter = models.Chapter(
            book=book,
            version=book.version,
            url_title=booktype_slugify(unicode(name)),
            title=name,
            status=stat,
            content=content,
            created=now,
            modified=now
        )
        chapter.save()
        _imported[file_key] = chapter

    # fix links
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if not chap.is_chapter():
            continue

        content = chap.get_content()
        try:
            tree = parse_html_string(content)
        except Exception:
            # Unparseable markup: there is no tree to rewrite, so leave this
            # chapter as saved instead of reusing a stale or undefined tree.
            continue

        body = tree.find('body')
        # A missing or empty <body> has no links to fix.
        if body is None or len(body) == 0:
            continue

        to_save = False

        for _item in body.iter():
            if _item.tag == 'a':
                _href = _item.get('href')

                if _href:
                    _u = urlparse.urlparse(_href)
                    pth = urllib.unquote(os.path.basename(_u.path))

                    # only links that point at an imported chapter are rewritten
                    if pth in _imported:
                        _name = _imported[pth].url_title

                        _u2 = urlparse.urljoin(_href, '../' + _name + '/')
                        _item.set('href', _u2)
                        to_save = True

        if to_save:
            chap.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
            saved = _imported[urllib.unquote(os.path.basename(chap.file_name))]
            saved.content = chap.content
            saved.save()

    # weights count down so that earlier toc entries get the higher weight
    n = len(toc) + 1
    parents = {}

    for _elem in toc:
        if _elem[0] == 1:  # section
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=_elem[1],
                chapter=None,
                weight=n,
                typeof=2
            )
        else:
            # toc entry pointing at a file that was not imported as a chapter
            if not _elem[1] in _imported:
                continue

            chap = _imported[_elem[1]]
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=chap.title,
                chapter=chap,
                weight=n,
                typeof=1
            )

        # check if elem has parent
        if _elem[3]:
            toc_item.parent = parents.get(_elem[3], None)
        toc_item.save()

        # decrease weight
        n -= 1

        # save temporarily the toc_item in parent
        parents[_elem[2]] = toc_item

    return book
示例#51
0
	def upload(self, request, pk=None):
		"""Handle an uploaded book file (POST) and register it as a finished book.

		Reads ``ISBN`` and ``category`` from ``request.POST`` and the file
		from ``request.FILES['fileObject']``. For categories ``txt`` and
		``epub`` a final ``OCR/<ISBN>.epub`` is produced. On success a Book
		and an EBook are saved and a 202 response is returned; validation or
		conversion problems return 406.

		NOTE(review): for any method other than POST the function falls off
		the end and returns None.
		"""
		res = {}

		if request.method == 'POST':
			# book info setup: reuse an existing BookInfo or create one from the request
			try:
				newBookInfo = BookInfo.objects.get(ISBN=request.POST['ISBN'])
			# NOTE(review): bare except - any failure, not only "does not exist",
			# leads to creating a new BookInfo.
			except:
				serializer = BookInfoSerializer(data=request.data)
				if not serializer.is_valid():
					res['detail'] = u'序列化驗證失敗' + unicode(serializer.errors)
					return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)
				newBookInfo = serializer.save()

			# check whether the book was already uploaded
			# an upload is refused unless its category outranks the stored source
			source_priority = {
				'self': 0,
				'txt': 1,
				'epub': 2,
			}
			try:
				book = Book.objects.get(ISBN=request.POST['ISBN'])
				if source_priority[request.POST['category']] <= source_priority[book.source]:
					res['detail'] = u'文件已存在'
					return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)
			# NOTE(review): bare except also hides a KeyError from an unknown category.
			except:
				pass

			# upload file setup
			uploadPath = BASE_DIR + u'/file/ebookSystem/document/{0}'.format(request.POST['ISBN'])
			uploadFilePath = os.path.join(uploadPath, request.POST['ISBN'] +'.' +request.POST['category'])
			self.post_resource(uploadFilePath, request.FILES['fileObject'])

			# process according to the selected upload format
			final_file = os.path.join(uploadPath, 'OCR') + '/{0}.epub'.format(request.POST['ISBN'], )
			#txt
			if request.POST['category'] == 'txt':
				from ebooklib import epub
				from utils.epub import txt2epub
				try:
					os.makedirs(os.path.dirname(final_file))
					info = {
						'ISBN': newBookInfo.ISBN,
						'bookname': newBookInfo.bookname,
						'author': newBookInfo.author,
						'date': str(newBookInfo.date),
						'house': newBookInfo.house,
						'language': 'zh',
					}
					txt2epub(uploadFilePath, final_file, **info)
				except BaseException as e:
					# conversion failed: remove the whole upload directory again
					shutil.rmtree(uploadPath)
					res['detail'] = u'建立文件失敗' +str(e)
					return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)

			#epub
			if request.POST['category'] == 'epub':
				from ebooklib import epub
				from utils.epub import through, add_bookinfo
				try:
					os.makedirs(os.path.dirname(final_file))
					# copy the upload to final_file, then rewrite it with the book info
					through(uploadFilePath, final_file)
					book = epub.read_epub(final_file)
					book = add_bookinfo(
						book,
						ISBN = newBookInfo.ISBN,
						bookname = newBookInfo.bookname,
						author = newBookInfo.author,
						date = str(newBookInfo.date),
						house = newBookInfo.house,
						language = 'zh',
					)
					epub.write_epub(final_file, book, {})
				except BaseException as e:
					shutil.rmtree(uploadPath)
					# NOTE(review): the re-raise makes the two lines below
					# unreachable, unlike the txt branch which returns a 406.
					raise(e)
					res['detail'] = u'建立文件失敗' +str(e)
					return Response(data=res, status=status.HTTP_406_NOT_ACCEPTABLE)

			# create the book object and the ebook object
			try:
				newBook = Book(book_info=newBookInfo, ISBN=request.POST['ISBN'])
			except:
				newBook = Book.objects.get(ISBN=request.POST['ISBN'])

			newBook.scaner = request.user
			newBook.owner = request.user
			newBook.source = request.POST['category']
			newBook.finish_date = timezone.now()
			newBook.save()

			# a single part covering the whole book, moved to status 5 right away
			ebook = EBook.objects.create(book=newBook, part=1, ISBN_part=request.POST['ISBN'] + '-1', begin_page=-1, end_page=-1)
			ebook.change_status(5, 'final')

			res['detail'] = u'成功建立並上傳文件'
			return Response(data=res, status=status.HTTP_202_ACCEPTED)
示例#52
0
## epub2gif

import sys, os, glob, shutil

import ebooklib
from ebooklib import epub
from PIL import Image, ImageFont, ImageDraw
from images2gif import writeGif

# args
if len(sys.argv) == 2:
    bookname = os.path.splitext(sys.argv[1])[0]
    book = epub.read_epub(sys.argv[1])
else:
    print '\nNo EPUB file provided.\nUsage: python epub2gif.py filename.epub\n'
    quit()

# parameters
speed = 0.1
W = 500
H = 400
bgColor = (255,255,255)

# save content of pic to file
def savetofile(img):
    # get name and extension
    img_name = os.path.basename(img.file_name)
    img_type = img.media_type
    ext = img_type.split('/')[1]
    # write to file
    img_file = open(img_name, 'w')
示例#53
0
def through(src, dst):
	"""Read the EPUB at ``src``, write it back out to ``dst`` and return the book."""
	round_tripped = epub.read_epub(src)
	epub.write_epub(dst, round_tripped, {})
	return round_tripped
示例#54
0
from ebooklib import epub, ITEM_DOCUMENT

# Load the sample EPUB from a path relative to the working directory.
book = epub.read_epub('../_external_ressources/epub1/perroquet.epub')

# Print the file name of every document item in the book.
for bookitems in book.get_items_of_type(ITEM_DOCUMENT):
    print(bookitems.file_name)

            children = lst.find_all(['li'])
            for child in children:
                summary_points.append(normalize_text(child.get_text().strip()))

    # some books use <p><strong>Key Concepts</strong></p>
    strongs = soup.find_all(['strong'])
    for element in strongs:
        if "Key Concepts" == element.get_text().strip():
            lst = element.parent.find_next_sibling("ul")
            children = lst.find_all(['li'])
            for child in children:
                summary_points.append(normalize_text(child.get_text().strip()))


    return summary_points


def get_lesson_summary(epub_item):
    """Print every summary fact found in one EPUB document item.

    ``epub_item`` must expose the item's markup as ``.content``. The markup
    is parsed with ``html.parser`` and handed to ``find_fact_list``; each
    returned fact is printed on its own line. Nothing is returned.
    """
    # Parse the item that was passed in, not a module-level loop variable,
    # so the function also works when called outside the script loop.
    soup = BeautifulSoup(epub_item.content, 'html.parser')

    for fact in find_fact_list(soup):
        print(fact)

# Summarise every document item of every .epub file found in ./books.
for file in os.listdir("./books"):
    if not file.endswith(".epub"):
        continue

    book = epub.read_epub("./books/" + file)
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        get_lesson_summary(item)
示例#56
0
文件: base.py 项目: MarsWan/Booktype
 def load_book(self, book_path):
     """Read the EPUB at ``book_path`` and return the loaded book."""
     loaded = epub.read_epub(book_path)
     return loaded
示例#57
0
import sys
from ebooklib import epub

if len(sys.argv) != 4:
    print 'Usage: main.py [filter file] [input epub file] [output epub file]'
    sys.exit()

filter_filename = sys.argv[1]
input_filename = sys.argv[2]
output_filename = sys.argv[3]

with open(filter_filename) as f:
    filter_words = f.read().splitlines()

book = epub.read_epub(input_filename)

for item in book.items:
    if isinstance(item, epub.EpubHtml):
        for word in filter_words:
            stars = '*' * len(word)
            item.content = item.content.replace(word, stars)

epub.write_epub(output_filename, book, {})
示例#58
0
    else:
        global latest_id

        ingredients = {}
        matches = {}
        ingredient_ids = {}
        
        ingredients_fieldnames = ['tmpId', 'name', 'season', 'taste', 'weight', 'volume', 'vegetarian', 'dairy', 'kosher', 'nuts']
        matches_fieldnames = ['firstIngredient', 'secondIngredient', 'level', 'upvotes', 'downvotes', 'affinity', 'quote']

        removeExistingFiles(['flavorbible.db', 'Ingredient_tmp.json', 'Match_tmp.json'])
        conn = sqlite3.connect('flavorbible.db')
        c = conn.cursor()
        createTables(c)
        
        book = epub.read_epub('flavorbible.epub')
        result = ''
        for item in book.get_items():
            type = item.get_type()
            if type == ebooklib.ITEM_DOCUMENT:
                soup = BeautifulSoup(item.content, 'lxml')
        
                # Find ingredient listings.
                for ingredient in soup.find_all('p', {'class' : ['lh', 'lh1']}):
                    print('HEADING: ', ingredient)

                    i = fixName(ingredient.text)
                    if containsBlacklistedString(i):
                        continue
                    
                    if i in ingredient_ids:
 def __init__(self, filename):
     book = epub.read_epub(filename)
     for image in book.get_items_of_type(ebooklib.ITEM_IMAGE):
         print(image)