def get_pages_pagebreak_tag(self, mobi_file_path):
    '''
    Locate page boundaries from the page-break tags in the book's markup.

    The MOBI markup is lower-cased and converted to bytes; every tag whose
    text contains "pagebreak" (for example <mbp:pagebreak>) contributes one
    page, positioned at the byte offset just past the end of that tag.

    Books whose header reports a non-zero encryption type are handed to
    self.get_pages_fast() instead, and its result is returned unchanged.
    '''
    reader = MobiReader(mobi_file_path, default_log)
    if reader.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)

    reader.extract_text()
    markup = as_bytes(reader.mobi_html.lower())
    return [tag.end() for tag in re.finditer(b'<[^>]*pagebreak[^>]*>', markup)]
def get_pages_pagebreak_tag(self, mobi_file_path):
    '''
    Determine pages based on the presence of <mbp:pagebreak>.

    Every tag in the lower-cased MOBI markup whose text contains
    "pagebreak" marks a page; the recorded position is the offset just
    past the end of that tag.

    Returns a list of offsets into the markup. If the book header reports
    a non-zero encryption type, the result of self.get_pages_fast() is
    returned instead.
    '''
    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    mr.extract_text()

    html = mr.mobi_html.lower()
    # The re module refuses to apply a text pattern to a bytes subject (and
    # vice versa), so pick the pattern type that matches the markup we were
    # given. The offsets returned are the same either way.
    if isinstance(html, bytes):
        pattern = b'<[^>]*pagebreak[^>]*>'
    else:
        pattern = '<[^>]*pagebreak[^>]*>'

    return [m.end() for m in re.finditer(pattern, html)]
def parse_mobi(pathtoebook, book_fmt):
    '''
    Yield every run of text found between two HTML tags of a MOBI/AZW3 book.

    Each item is a tuple (offset, text): text is the bytes between a '>'
    and the following '<', and offset is the index of its first byte in
    the markup that was scanned.

    For book_fmt == 'AZW3' the markup is rebuilt from the KF8 parts; for a
    'joint' file the KF8 sections start at kf8_boundary + 2.
    '''
    reader = MobiReader(pathtoebook, default_log)
    is_azw3 = book_fmt == 'AZW3'

    # use code from calibre.ebooks.mobi.reader.mobi8:Mobi8Reader.__call__
    if is_azw3 and reader.kf8_type == 'joint':
        first_section = reader.kf8_boundary + 2
    else:
        first_section = 1

    reader.extract_text(offset=first_section)
    markup = reader.mobi_html

    if is_azw3:
        kf8_reader = Mobi8Reader(reader, default_log)
        kf8_reader.kf8_sections = reader.sections[first_section - 1:]
        kf8_reader.read_indices()
        kf8_reader.build_parts()
        markup = b''.join(kf8_reader.parts)

    # match text between HTML tags
    for found in re.finditer(b'>[^<>]+<', markup):
        # Strip the surrounding '>' and '<' and shift the offset past '>'.
        enclosed = found.group(0)
        yield (found.start() + 1, enclosed[1:-1])
def parse_mobi(book_path):
    '''
    Yield every run of text found between two HTML tags of a MOBI book.

    Each item is a tuple (text, offset): text is the UTF-8 decoded content
    between a '>' and the following '<', and offset is the index of its
    first byte in the markup that was scanned.

    Raises Exception('JointMOBI') when the reader reports a 'joint' file.
    '''
    # use code from calibre.ebooks.mobi.reader.mobi8:Mobi8Reader.__call__
    # and calibre.ebooks.conversion.plugins.mobi_input:MOBIInput.convert
    # https://github.com/kevinhendricks/KindleUnpack/blob/master/lib/mobi_k8proc.py#L216
    try:
        reader = MobiReader(book_path, default_log)
    except Exception:
        # Second attempt with the reader's extra-data workaround enabled.
        reader = MobiReader(book_path, default_log, try_extra_data_fix=True)

    if reader.kf8_type == 'joint':
        raise Exception('JointMOBI')

    reader.check_for_drm()
    reader.extract_text()
    markup = reader.mobi_html

    if reader.kf8_type == 'standalone':
        # Standalone KF8: rebuild the markup from the KF8 parts.
        kf8_reader = Mobi8Reader(reader, default_log)
        kf8_reader.kf8_sections = reader.sections
        kf8_reader.read_indices()
        kf8_reader.build_parts()
        markup = b''.join(kf8_reader.parts)

    # match text between HTML tags
    for found in re.finditer(b'>[^<>]+<', markup):
        enclosed = found.group(0)[1:-1]
        # NOTE(review): assumes the markup is UTF-8 encoded; confirm for
        # books that declare another codec.
        yield (enclosed.decode('utf-8'), found.start() + 1)
def get_pages_accurate(self, mobi_file_path):
    '''
    Estimate page positions by walking the uncompressed markup once.

    This is slower and more resource intensive than the fast method. The
    model is an average paperback: 32 lines per page and at most 70
    characters per line. A line starts whenever a tag whose name begins
    with "p" opens, and again after every 70 characters counted outside
    of tags while inside such a tag. The position of every 32nd line is
    reported as the start of a page.

    This could be made more accurate by treating
    <div class="mbp_pagebreak" /> as a page marker and <br> elements as
    empty lines.

    Returns a list of byte offsets into the lower-cased markup. If the
    book header reports a non-zero encryption type, the result of
    self.get_pages_fast() is returned instead.
    '''
    reader = MobiReader(mobi_file_path, default_log)
    if reader.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    reader.extract_text()

    # Lower-casing is safe: the text is never modified, only byte
    # positions are recorded, so case does not matter.
    markup = bytearray(as_bytes(reader.mobi_html.lower()))
    ord_slash, ord_p, ord_lt, ord_gt = (ord(ch) for ch in '/p<>')

    # Scanner state.
    inside_tag = False        # between '<' and '>'
    inside_para = False       # between an opening and a closing p tag
    at_tag_name = False       # the previous byte(s) were '<' or '</'
    is_closing_tag = False    # a '/' directly followed the '<'
    chars_on_line = 0

    # One pass over the bytes instead of repeated regex or string
    # searches, to keep this as fast as possible.
    line_starts = []
    for offset, byte in enumerate(markup):
        if at_tag_name:
            # Decide whether this tag opens or closes a paragraph.
            if byte == ord_slash:
                is_closing_tag = True
                continue
            if byte == ord_p:
                inside_para = not is_closing_tag
                if inside_para:
                    line_starts.append(offset - 2)
            at_tag_name = False
            is_closing_tag = False
        elif byte == ord_lt:
            inside_tag = True
            at_tag_name = True
        elif byte == ord_gt:
            inside_tag = False
            at_tag_name = False
        elif inside_para and not inside_tag:
            chars_on_line += 1
            if chars_on_line == 70:
                line_starts.append(offset)
                chars_on_line = 0

    # Every 32 lines is a new page.
    return line_starts[::32]
def get_pages_accurate(self, mobi_file_path):
    '''
    A more accurate but much more resource intensive and slower method to
    calculate the page length.

    Parses the uncompressed text. In an average paperback book there are
    32 lines per page and a maximum of 70 characters per line.

    Each paragraph starts a new line and every 70 characters (minus
    markup) in a paragraph starts a new line. The position of every 32nd
    line is marked as a new page.

    This can be made more accurate by accounting for
    <div class="mbp_pagebreak" /> as a new page marker, and <br> elements
    as an empty line.

    Returns a list of offsets into the lower-cased markup. If the book
    header reports a non-zero encryption type, the result of
    self.get_pages_fast() is returned instead.
    '''
    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    mr.extract_text()

    # We can use .lower() here because we are not modifying the text.
    # Case doesn't matter, just the absolute character and the position
    # within the stream.
    html = mr.mobi_html.lower()

    # Iterating bytes yields integers on Python 3, so comparing against
    # one-character strings would never match. Walk a bytearray (integers
    # on every Python version) when given bytes, and the text itself
    # otherwise, with sentinels of the matching kind.
    if isinstance(html, bytes):
        data = bytearray(html)
        slash, p, lt, gt = bytearray(b'/p<>')
    else:
        data = html
        slash, p, lt, gt = '/p<>'

    # States
    in_tag = False
    in_p = False
    check_p = False
    closing = False
    p_char_count = 0

    # Get positions of every line.
    # A line is either a paragraph starting
    # or every 70 characters in a paragraph.
    lines = []

    # We want this to be as fast as possible so we do a single pass over
    # the text; re and string functions would rescan it on every call.
    for pos, c in enumerate(data):
        # Check if we are starting or stopping a p tag.
        if check_p:
            if c == slash:
                closing = True
                continue
            elif c == p:
                if closing:
                    in_p = False
                else:
                    in_p = True
                    lines.append(pos - 2)
            check_p = False
            closing = False
            continue

        if c == lt:
            in_tag = True
            check_p = True
            continue
        elif c == gt:
            in_tag = False
            check_p = False
            continue

        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == 70:
                lines.append(pos)
                p_char_count = 0

    # Every 32 lines is a new page. Slicing avoids the Python 2 only
    # xrange builtin.
    return lines[::32]