示例#1
0
    def extract_text(self, offset=1):
        """Decompress the book's text records into ``self.mobi_html``.

        Text records are fetched with ``self.text_section`` for indices
        ``offset`` up to (but not including) ``offset + records``, clamped
        to ``len(self.sections)``.  Each record is passed through the
        unpacker selected by ``self.book_header.compression_type``, the
        results are concatenated, and a handful of control bytes are
        stripped from the result.

        Args:
            offset: Index of the first text record (defaults to 1).

        Returns:
            A list of record indices treated as processed: ``offset - 1``
            through ``offset + records - 1`` and, for ``'DH'`` compression,
            the Huffman records as well.

        Raises:
            MobiError: If the compression type is not one of ``'DH'``,
                ``'\\x00\\x02'`` or ``'\\x00\\x01'``.
        """
        self.log.debug('Extracting text...')
        header = self.book_header

        last_record = min(header.records + offset, len(self.sections))
        text_sections = [
            self.text_section(i) for i in range(offset, last_record)
        ]
        processed_records = list(range(offset - 1, header.records + offset))

        self.mobi_html = b''

        # NOTE(review): the compression type presumably comes straight from
        # the binary header, in which case it is ``bytes`` on Python 3 and
        # would never equal the text literals below.  Normalise it so both
        # spellings are accepted; the raw value is still reported on error.
        compression = header.compression_type
        if isinstance(compression, bytes) and not isinstance(compression, str):
            compression = compression.decode('latin-1')

        if compression == 'DH':
            huff_records = range(header.huff_offset,
                                 header.huff_offset + header.huff_number)
            huffs = [self.sections[i][0] for i in huff_records]
            processed_records += list(huff_records)
            unpack = HuffReader(huffs).unpack
        elif compression == '\x00\x02':
            unpack = decompress_doc
        elif compression == '\x00\x01':
            unpack = None  # records are used as-is
        else:
            raise MobiError('Unknown compression algorithm: %s' %
                            repr(header.compression_type))

        if unpack is None:
            html = b''.join(text_sections)
        else:
            html = b''.join(map(unpack, text_sections))

        # Everything below operates on bytes, so every needle/replacement
        # must be a bytes literal: mixing in ``str`` raises TypeError on
        # Python 3 (and is a no-op change on Python 2).
        if html.endswith(b'#'):
            html = html[:-1]

        if header.ancient and b'<html' not in html[:300].lower():
            html = html.replace(b'\r ', b'\n\n ')
        html = html.replace(b'\0', b'')
        if header.codec == 'cp1252':
            html = html.replace(b'\x1e', b'')  # record separator
            html = html.replace(b'\x02', b'')  # start of text

        self.mobi_html = html
        return processed_records
示例#2
0
 def huffit(off, cnt):
     """Build a Huffman unpacker from ``cnt`` records starting at ``off``.

     ``self`` and ``HuffReader`` are not parameters; they are resolved from
     the enclosing scope, which is outside this block.

     Args:
         off: Index of the first Huffman record in ``self.records``.
         cnt: Number of consecutive Huffman records.

     Returns:
         A ``(record_numbers, unpack)`` tuple: the list of record indices
         that were consumed and the ``unpack`` callable of the
         ``HuffReader`` built from those records' ``raw`` data.
     """
     # ``range`` instead of ``xrange``: the latter does not exist on
     # Python 3, and wrapped in ``list`` the result is identical on both.
     huffman_record_nums = list(range(off, off + cnt))
     huffrecs = [self.records[r].raw for r in huffman_record_nums]
     return huffman_record_nums, HuffReader(huffrecs).unpack