def getIDTagByPosFid(self, posfid, offset):
    """Translate a ``kindle:pos:fid:XXXX:off:YYYY`` link into a link target.

    ``posfid`` and ``offset`` are the two base32 strings taken from the
    link.  ``posfid`` selects a row of ``self.divtbl`` and ``offset`` is
    added to that row's insert position to obtain a position in the text.

    Returns a ``(file name, id text)`` pair: the file name reported by
    ``self.getFileInfo`` for that position and whatever ``self.getIDTag``
    returns for it.
    """
    table_row = fromBase32(posfid)
    delta = fromBase32(offset)
    # only the insert position of the row is needed; the full unpack is kept
    # so that a malformed row still fails loudly here
    (insert_at, _idtext, _filenum,
     _seqnum, _startpos, _length) = self.divtbl[table_row]
    target = insert_at + delta
    target_file, _partnum, _skelpos, _skelend = self.getFileInfo(target)
    # An "id=" attribute must already exist in the original xhtml, otherwise
    # the link could not have worked.  Amazon adds its own "aid=" attributes
    # whose values appear to encode position information in base32, so the
    # id to link to is looked up from the position itself (per the original
    # author's notes, getIDTag searches for the closest "id=" before it).
    return target_file, self.getIDTag(target)
def getIDTagByPosFid(self, posfid, offset):
    """Translate a ``kindle:pos:fid:XXXX:off:YYYY`` link into a link target.

    ``posfid`` (base32) selects a row of ``self.fragtbl``; ``offset``
    (base32) is added to that row's insert position.  When
    ``self.getFileInfo`` reports no file for the resulting position, a
    message is printed and the link is retargeted to the position stored in
    field 3 of the ``self.skeltbl`` entry indexed by the row's file number.

    Returns a ``(file name, id text)`` pair, the id text being whatever
    ``self.getIDTag`` returns for the final position.
    """
    table_row = fromBase32(posfid)
    delta = fromBase32(offset)
    (insert_at, _idtext, file_number,
     _seqnum, _startpos, _length) = self.fragtbl[table_row]
    target = insert_at + delta
    target_file, _partnum, _skelpos, _skelend = self.getFileInfo(target)
    if target_file is None:
        # the position does not exist: fall back to the skeleton position
        print("%s %s %s" % ("Link To Position", target,
                            "does not exist, retargeting to top of target"))
        target = self.skeltbl[file_number][3]
        target_file, _partnum, _skelpos, _skelend = self.getFileInfo(target)
    # An "id=" or "name=" attribute must already exist in the original xhtml,
    # otherwise the link could not have worked.  Amazon adds its own "aid="
    # attributes whose values appear to encode position information in
    # base32, so the id to link to is looked up from the position itself.
    return target_file, self.getIDTag(target)
def __init__(self, data, debug=False):
    """Set up the parser state and parse a RESC section.

    ``data`` is the raw section as bytes.  Everything before the first
    ``<`` is kept as ``self.resc_header``; the declared payload size is the
    base32 value found between the first ``=`` and the following ``&`` of
    that header.  The payload is converted with ``unicode_str`` and stored
    in ``self.resc`` before ``self.parseData()`` is called.
    """
    self._debug = debug
    self.resc = None
    self.opos = 0
    self.extrameta = []
    self.cover_name = None
    self.spine_idrefs = {}
    self.spine_order = []
    self.spine_pageattributes = {}
    self.spine_ppd = None
    # need3 indicates the book has fields which require epub3, but
    # estimating the source epub version from the fields is difficult.
    self.need3 = False
    self.package_ver = None
    self.extra_metadata = []
    self.refines_metadata = []
    self.extra_attributes = []

    # split off the header that precedes the xml payload
    xml_start = data.find(b'<')
    header = data[:xml_start]
    self.resc_header = header

    # read the declared payload size out of the header
    size_begin = header.find(b'=') + 1
    size_end = header.find(b'&', size_begin)
    declared_size = fromBase32(header[size_begin:size_end]) if size_end > 0 else 0

    available = len(data) - xml_start
    if available == declared_size:
        actual_length = declared_size
    else:
        # Most RESC sections end with a nul byte but some do not.
        nul_at = data.find(b'\x00', xml_start)
        actual_length = available if nul_at < 0 else nul_at - xml_start
    self.resc_length = actual_length

    if actual_length != declared_size:
        print(
            "Warning: RESC section length({:d}bytes) does not match its size({:d}bytes)."
            .format(actual_length, declared_size))

    # now parse RESC after converting it to unicode from utf-8
    self.resc = unicode_str(data[xml_start:xml_start + actual_length])
    self.parseData()
def __init__(self, data, debug=False):
    """Set up the parser state and parse a RESC section.

    ``data`` is the raw section.  The text before the first ``<`` is stored
    as ``self.resc_header`` and the declared payload size is decoded (base32)
    from what lies between the header's first ``=`` and the next ``&``.
    The payload slice is stored in ``self.resc`` and ``self.parseData()``
    is then invoked.
    """
    self._debug = debug
    self.resc = None
    self.opos = 0
    self.extrameta = []
    self.cover_name = None
    self.spine_idrefs = {}
    self.spine_order = []
    self.spine_pageattributes = {}
    self.spine_ppd = None
    # need3 indicates the book has fields which require epub3, but
    # estimating the source epub version from the fields is difficult.
    self.need3 = False
    self.package_ver = None
    self.extra_metadata = []
    self.refines_metadata = []
    self.extra_attributes = []

    # header: everything in front of the first tag
    payload_at = data.find('<')
    self.resc_header = data[:payload_at]

    # declared length of the payload, taken from the header
    declared = 0
    value_from = self.resc_header.find('=') + 1
    value_to = self.resc_header.find('&', value_from)
    if value_to > 0:
        declared = fromBase32(self.resc_header[value_from:value_to])

    remaining = len(data) - payload_at
    if remaining == declared:
        self.resc_length = declared
    else:
        # Most RESC sections end with a nul character but some do not.
        terminator = data.find('\x00', payload_at)
        if terminator >= 0:
            self.resc_length = terminator - payload_at
        else:
            self.resc_length = remaining

    if self.resc_length != declared:
        print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, declared))

    # now handle RESC
    payload_end = payload_at + self.resc_length
    self.resc = data[payload_at:payload_end]
    self.parseData()
def buildXHTML(self):
    """Rewrite every ``kindle:`` reference and tidy the XHTML parts.

    Works on the parts and flows held by ``self.k8proc`` in this order:

    1. internal ``kindle:pos:fid`` links in ``<a href>`` tags become
       ``"file"`` / ``"file#id"`` links (must happen before any text is
       cut or pasted because the links are position based);
    2. Kindlegen ``aid`` attributes are removed and ``data-AmznPageBreak``
       attributes become ``style="page-break-after:..."``;
    3. in every flow (index 1 and up) ``kindle:embed`` / ``kindle:flow``
       references in image tags and css ``url()`` are replaced by relative
       paths;
    4. in every part ``kindle:flow`` references are replaced by a path or
       by the inlined flow text, ``kindle:embed`` references in ``style``
       attributes and image tags are replaced by paths, and finally svg
       attribute case and illegal ``<li value>`` attributes are fixed.

    The results are handed back through ``self.k8proc.setFlows`` and
    ``self.k8proc.setParts``.  Every resource or flow file name that ends
    up referenced is recorded in ``self.used``, which is returned.

    Differences from the previous implementation:
    - only real tag matches are edited by the aid / page-break passes, so
      body text that merely looks like ``aid="..."`` is left alone;
    - each reference is replaced in place, so an unrecognised reference no
      longer shifts the following replacements onto the wrong reference;
    - resource numbers outside ``1..len(self.rscnames)`` are reported as
      unrecognised instead of raising or selecting the last resource;
    - upper-case ``<IMG>`` / ``<IMAGE>`` tags are processed too.
    """
    k8proc = self.k8proc
    used = self.used
    rscnames = self.rscnames

    image_error = "Error: Referenced image %s was not recognized as a valid image in %s"
    font_error = "Error: Referenced font %s was not recognized as a valid font in %s"
    style_error = "Error: Referenced image %s in style url was not recognized in %s"

    def rewrite_tags(text, splitter, fix_tag):
        # splitter has exactly one capturing group, so split() alternates
        # plain text (even index) and matched tag (odd index)
        pieces = splitter.split(text)
        for k in range(1, len(pieces), 2):
            pieces[k] = fix_tag(pieces[k])
        return "".join(pieces)

    def resource_name(number):
        # resource numbers are 1-based indexes into rscnames
        if 1 <= number <= len(rscnames):
            return rscnames[number - 1]
        return None

    def embed_fixer(pattern, folder, keep_delims, error_format):
        # build a tag fixer replacing each kindle:embed match of `pattern`
        # by a path below `folder`; with keep_delims the opening and closing
        # delimiter of the match are preserved, otherwise double quotes are
        # used
        def fix_tag(tag):
            def replace(m):
                number = fromBase32(m.group(1))
                name = resource_name(number)
                if name is None:
                    print(error_format % (number, tag))
                    return m.group(0)
                used[name] = 'used'
                found = m.group(0)
                if keep_delims:
                    return '%s%s%s' % (found[0], folder + name, found[-1])
                return '"' + folder + name + '"'
            return pattern.sub(replace, tag)
        return fix_tag

    # first need to update all links that are internal which
    # are based on positions within the xhtml files **BEFORE**
    # cutting and pasting any pieces into the xhtml text files

    #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
    #       XXXX is the offset in records into divtbl
    #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

    # pos:fid pattern
    posfid_pattern = re.compile(r'''(<a.*?href=.*?>)''', re.IGNORECASE)
    posfid_index_pattern = re.compile(r'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

    def link_target(m):
        target_file, idtag = k8proc.getIDTagByPosFid(m.group(1), m.group(2))
        if idtag == '':
            return '"' + target_file + '"'
        return '"' + target_file + '#' + idtag + '"'

    def fix_internal_links(tag):
        return posfid_index_pattern.sub(link_target, tag)

    print("Building proper xhtml for each file")
    parts = []
    for i in range(k8proc.getNumberOfParts()):
        parts.append(rewrite_tags(k8proc.getPart(i), posfid_pattern, fix_internal_links))

    # we are free to cut and paste as we see fit

    # we can safely remove all of the Kindlegen generated aid tags
    find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
    within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')

    def strip_aid(tag):
        return within_tag_aid_position_pattern.sub('', tag)

    # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
    # with page-break-after style patterns
    find_tag_with_AmznPageBreak_pattern = re.compile(r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
    within_tag_AmznPageBreak_position_pattern = re.compile(r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    def pagebreak_to_style(tag):
        return within_tag_AmznPageBreak_position_pattern.sub(
            lambda m: ' style="page-break-after:%s"' % m.group(1), tag)

    for splitter, fixer in ((find_tag_with_aid_pattern, strip_aid),
                            (find_tag_with_AmznPageBreak_pattern, pagebreak_to_style)):
        parts = [rewrite_tags(part, splitter, fixer) for part in parts]

    # we have to handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
    #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    #   kindle:embed:XXXX   (used for fonts)

    # regular expression search patterns
    img_pattern = re.compile(r'''(<(?:img|image)\b[^>]*>)''', re.IGNORECASE)
    img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
    url_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
    font_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
    url_css_index_pattern = re.compile(r'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

    fix_flow_image_tag = embed_fixer(img_index_pattern, '../Images/', False, image_error)
    fix_url_image = embed_fixer(url_img_index_pattern, '../Images/', True, image_error)
    fix_url_font = embed_fixer(font_index_pattern, '../Fonts/', True, font_error)

    def css_target(m):
        num = fromBase32(m.group(1))
        typ, fmt, pdir, fnm = k8proc.getFlowInfo(num)
        replacement = '"../' + pdir + '/' + fnm + '"'
        used[fnm] = 'used'
        return replacement

    def fix_css_url(tag):
        # order matters: raster images first, then fonts, then other css
        tag = fix_url_image(tag)
        tag = fix_url_font(tag)
        return url_css_index_pattern.sub(css_target, tag)

    flows = [None]
    for i in range(1, k8proc.getNumberOfFlows()):
        flowpart = k8proc.getFlow(i)
        # links to raster image files from image tags
        flowpart = rewrite_tags(flowpart, img_pattern, fix_flow_image_tag)
        # replacements inside css url():
        flowpart = rewrite_tags(flowpart, url_pattern, fix_css_url)
        # store away in our own copy
        flows.append(flowpart)
        # NOTE: kindle:flow references outside url() inside a flow are not
        # handled here.  The original author doubted the case exists and
        # noted that, if it does, it needs a separate pass so that no flow
        # is inlined into another before both are fully processed.

    # now handle the main text xhtml parts

    # Handle the flow items in the XHTML text pieces
    # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    def fix_flow_link(tag):
        inlined = []

        def replace(m):
            num = fromBase32(m.group(1))
            if 0 < num < len(k8proc.flowinfo):
                typ, fmt, pdir, fnm = k8proc.getFlowInfo(num)
                if fmt == 'inline':
                    inlined.append(flows[num])
                    return m.group(0)
                replacement = '"../' + pdir + '/' + fnm + '"'
                used[fnm] = 'used'
                return replacement
            print("%s %s %s" % ("warning: ignoring non-existent flow link",
                                tag, " value 0x%x" % num))
            return m.group(0)

        rewritten = flow_pattern.sub(replace, tag)
        # an inline flow replaces the whole tag that referenced it
        if inlined:
            return inlined[-1]
        return rewritten

    # Handle any embedded raster images links in style= attributes urls
    style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
    fix_style_image = embed_fixer(img_index_pattern, '../Images/', True, style_error)

    def fix_style_tag(tag):
        if 'kindle:embed' in tag:
            return fix_style_image(tag)
        return tag

    # Handle any embedded raster images links in the xhtml text
    # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    part_img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')
    fix_part_image_tag = embed_fixer(part_img_index_pattern, '../Images/', False, image_error)

    # finally perform any general cleanups needed to make valid XHTML
    # these include:
    #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
    #   in svg tags replace "viewbox" attributes with "viewBox"
    #   in <li> remove value="XX" attributes since these are illegal
    li_value_pattern = re.compile(r'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

    def cleanup_tag(tag):
        if tag.startswith('<svg') or tag.startswith('<SVG'):
            tag = tag.replace('preserveaspectratio', 'preserveAspectRatio')
            tag = tag.replace('viewbox', 'viewBox')
        elif tag.startswith('<li ') or tag.startswith('<LI '):
            tag = li_value_pattern.sub('', tag)
        return tag

    # each pass runs over all parts before the next one starts
    for splitter, fixer in ((tag_pattern, fix_flow_link),
                            (style_pattern, fix_style_tag),
                            (img_pattern, fix_part_image_tag),
                            (tag_pattern, cleanup_tag)):
        parts = [rewrite_tags(part, splitter, fixer) for part in parts]

    k8proc.setFlows(flows)
    k8proc.setParts(parts)
    return self.used
def buildParts(self, rawML):
    """Split ``rawML`` into flows and rebuild the xhtml parts.

    ``self.fdsttbl`` holds the flow boundaries; an end value of
    ``0xffffffff`` means "up to the end of ``rawML``".  Flow 0 is the xhtml
    text: for every ``self.skeltbl`` entry its skeleton is cut out of that
    text and the following ``self.divtbl`` fragments are inserted into it.

    Sets ``self.flows``, ``self.parts`` and ``self.partinfo`` (one
    ``[skelnum, 'Text', filename, skelpos, end position, aidtext]`` entry
    per part) and appends one ``[type, format, dir, filename]`` entry per
    flow to the existing ``self.flowinfo`` list.  Returns ``None``.
    """
    # now split the rawML into its flow pieces
    self.flows = []
    for j in range(len(self.fdsttbl) - 1):
        start = self.fdsttbl[j]
        end = self.fdsttbl[j + 1]
        if end == 0xffffffff:
            end = len(rawML)
        if self.DEBUG:
            print("splitting rawml starting at %d and ending at %d into flow piece %d" % (start, end, j))
        self.flows.append(rawML[start:end])

    # the first piece represents the xhtml text
    text = self.flows[0]
    self.flows[0] = ''

    # walk the <skeleton> and <div> tables to build original source xhtml files
    # *without* destroying any file position information needed for later href processing
    # and create final list of file separation start: stop points and etc in partinfo
    if self.DEBUG:
        print("\nRebuilding flow piece 0: the main body of the ebook")
    self.parts = []
    self.partinfo = []
    divptr = 0
    baseptr = 0
    for [skelnum, skelname, divcnt, skelpos, skellen] in self.skeltbl:
        baseptr = skelpos + skellen
        skeleton = text[skelpos:baseptr]
        # Defaults for a skeleton without fragments, so that it neither
        # fails on undefined names nor reuses the previous part's values.
        # NOTE(review): naming by skeleton number and the '0' aid
        # placeholder are assumptions - confirm against the consumers of
        # partinfo.
        filename = 'part%04d.xhtml' % skelnum
        aidtext = '0'
        for i in range(divcnt):
            [insertpos, idtext, filenum, seqnum, startpos, length] = self.divtbl[divptr]
            if self.DEBUG:
                print(" moving div/frag %d starting at %d of length %d" % (divptr, startpos, length))
                print(" inside of skeleton number %d at postion %d" % (skelnum, insertpos))
            if i == 0:
                # the part is described by its first fragment
                aidtext = idtext[12:-2]
                filename = 'part%04d.xhtml' % filenum
            fragment = text[baseptr:baseptr + length]
            # insertpos is relative to the whole text, make it skeleton relative
            insertpos = insertpos - skelpos
            skeleton = skeleton[0:insertpos] + fragment + skeleton[insertpos:]
            baseptr = baseptr + length
            divptr += 1
        self.parts.append(skeleton)
        self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

    # The primary css style sheet is typically stored next followed by any
    # snippets of code that were previously inlined in the
    # original xhtml but have been stripped out and placed here.
    # This can include local CDATA snippets and svg sections.

    # The problem is that for most browsers and ereaders, you can not
    # use <img src="imageXXXX.svg" /> to import any svg image that itself
    # properly uses an <image/> tag to import some raster image - it
    # should work according to the spec but does not for almost all browsers
    # and ereaders and causes epub validation issues because those raster
    # images are in manifest but not in xhtml text - since they are only
    # referenced from an svg image

    # So we need to check the remaining flow pieces to see if they are css
    # or svg images.  If svg images, we must check if they have an <image />
    # and if so inline them into the xhtml text pieces.

    # there may be other sorts of pieces stored here but until we see one
    # in the wild to reverse engineer we won't be able to tell
    self.flowinfo.append([None, None, None, None])
    svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
    image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
    for j in range(1, len(self.flows)):
        flowpart = self.flows[j]
        nstr = '%04d' % j
        svg_match = svg_tag_pattern.search(flowpart)
        if svg_match is not None:
            # svg
            flow_type = 'svg'
            if image_tag_pattern.search(flowpart) is not None:
                flow_format = 'inline'
                flow_dir = None
                fname = None
                # strip off anything before <svg if inlining
                flowpart = flowpart[svg_match.start():]
            else:
                flow_format = 'file'
                flow_dir = "Images"
                fname = 'svgimg' + nstr + '.svg'
        else:
            flow_type = 'css'
            # search for CDATA and if exists inline it
            if flowpart.find('[CDATA[') >= 0:
                flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                flow_format = 'inline'
                flow_dir = None
                fname = None
            else:
                # css - assume as standalone css file
                flow_format = 'file'
                flow_dir = "Styles"
                fname = 'style' + nstr + '.css'
        self.flows[j] = flowpart
        self.flowinfo.append([flow_type, flow_format, flow_dir, fname])

    if self.DEBUG:
        print("\nFlow Map:  %d entries" % len(self.flowinfo))
        for fi in self.flowinfo:
            print(fi)
        print("\n")

        print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
        for pi in self.partinfo:
            print(pi)

    if False:  # self.DEBUG:  (deliberately disabled debugging aid)
        # dump all of the locations of the aid tags used in TEXT
        # find id links only inside of tags
        #    inside any < > pair find all "aid=' and return whatever is inside the quotes
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        print("\npositions of all aid= pieces")
        id_pattern = re.compile(r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
        for m in id_pattern.finditer(rawML):
            print("%0x %s %0x" % (m.start(), m.group(1), fromBase32(m.group(1))))
            [dbg_filename, partnum, start, end] = self.getFileInfo(m.start())
            print(" in %d %0x %0x" % (partnum, start, end))

    return
def buildParts(self, rawML):
    """Split ``rawML`` into flows and rebuild the xhtml parts.

    Consecutive entries of ``self.fdsttbl`` delimit the flows (an end of
    ``0xffffffff`` stands for the end of ``rawML``).  Flow 0 is the xhtml
    text; each ``self.skeltbl`` entry names a skeleton inside it into which
    the next ``divcnt`` entries of ``self.divtbl`` are inserted.  The other
    flows are classified as svg or css and as inline or stand-alone file.

    Sets ``self.flows``, ``self.parts`` and ``self.partinfo`` and appends
    ``[type, format, dir, filename]`` entries to the existing
    ``self.flowinfo`` list.  Returns ``None``.
    """
    svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
    image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)

    def classify(piece, index):
        # return the (possibly rewritten) flow text and its flowinfo entry
        suffix = '%04d' % index
        svg_match = svg_tag_pattern.search(piece)
        if svg_match is not None:
            if image_tag_pattern.search(piece) is not None:
                # svg that pulls in a raster image must be inlined;
                # strip off anything before <svg when doing so
                return piece[svg_match.start():], ['svg', 'inline', None, None]
            return piece, ['svg', 'file', "Images", 'svgimg' + suffix + '.svg']
        # search for CDATA and if it exists inline it
        if piece.find('[CDATA[') >= 0:
            wrapped = '<style type="text/css">\n' + piece + '\n</style>\n'
            return wrapped, ['css', 'inline', None, None]
        # css - assume as standalone css file
        return piece, ['css', 'file', "Styles", 'style' + suffix + '.css']

    # now split the rawML into its flow pieces
    self.flows = []
    boundaries = zip(self.fdsttbl[:-1], self.fdsttbl[1:])
    for j, (start, end) in enumerate(boundaries):
        if end == 0xffffffff:
            end = len(rawML)
        if self.DEBUG:
            print("splitting rawml starting at %d and ending at %d into flow piece %d" % (
                start, end, j))
        self.flows.append(rawML[start:end])

    # the first piece represents the xhtml text
    text = self.flows[0]
    self.flows[0] = ''

    # walk the <skeleton> and <div> tables to build original source xhtml files
    # *without* destroying any file position information needed for later href processing
    # and create final list of file separation start: stop points and etc in partinfo
    if self.DEBUG:
        print("\nRebuilding flow piece 0: the main body of the ebook")
    self.parts = []
    self.partinfo = []
    divptr = 0
    for [skelnum, skelname, divcnt, skelpos, skellen] in self.skeltbl:
        baseptr = skelpos + skellen
        skeleton = text[skelpos:baseptr]
        # Defaults used when a skeleton has no fragments at all; without
        # them the names below would be undefined or left over from the
        # previous skeleton.
        # NOTE(review): the skeleton-number based name and the '0' aid
        # placeholder are assumptions - confirm against partinfo consumers.
        filename = 'part%04d.xhtml' % skelnum
        aidtext = '0'
        for i in range(divcnt):
            [insertpos, idtext, filenum, seqnum, startpos, length] = self.divtbl[divptr]
            if self.DEBUG:
                print(" moving div/frag %d starting at %d of length %d" % (
                    divptr, startpos, length))
                print(" inside of skeleton number %d at postion %d" % (
                    skelnum, insertpos))
            if i == 0:
                # the first fragment names the part and supplies its aid
                aidtext = idtext[12:-2]
                filename = 'part%04d.xhtml' % filenum
            fragment = text[baseptr:baseptr + length]
            # table positions are absolute; convert to skeleton relative
            local_pos = insertpos - skelpos
            skeleton = skeleton[0:local_pos] + fragment + skeleton[local_pos:]
            baseptr = baseptr + length
            divptr += 1
        self.parts.append(skeleton)
        self.partinfo.append(
            [skelnum, 'Text', filename, skelpos, baseptr, aidtext])

    # The primary css style sheet is typically stored next followed by any
    # snippets of code that were previously inlined in the
    # original xhtml but have been stripped out and placed here.
    # This can include local CDATA snippets and svg sections.

    # The problem is that for most browsers and ereaders, you can not
    # use <img src="imageXXXX.svg" /> to import any svg image that itself
    # properly uses an <image/> tag to import some raster image - it
    # should work according to the spec but does not for almost all browsers
    # and ereaders and causes epub validation issues because those raster
    # images are in manifest but not in xhtml text - since they are only
    # referenced from an svg image

    # So the remaining flow pieces are checked to see if they are css or
    # svg images; svg images containing an <image /> are inlined into the
    # xhtml text pieces.

    # there may be other sorts of pieces stored here but until we see one
    # in the wild to reverse engineer we won't be able to tell
    self.flowinfo.append([None, None, None, None])
    for j in range(1, len(self.flows)):
        self.flows[j], info = classify(self.flows[j], j)
        self.flowinfo.append(info)

    if self.DEBUG:
        print("\nFlow Map:  %d entries" % len(self.flowinfo))
        for fi in self.flowinfo:
            print(fi)
        print("\n")

        print("\nXHTML File Part Position Information: %d entries" % len(
            self.partinfo))
        for pi in self.partinfo:
            print(pi)

    if False:  # self.DEBUG:  (deliberately disabled debugging aid)
        # dump all of the locations of the aid tags used in TEXT
        # find id links only inside of tags
        #    inside any < > pair find all "aid=' and return whatever is inside the quotes
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        print("\npositions of all aid= pieces")
        id_pattern = re.compile(
            r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
        for m in id_pattern.finditer(rawML):
            print("%0x %s %0x" % (m.start(), m.group(1),
                                  fromBase32(m.group(1))))
            [dbg_filename, partnum, start, end] = self.getFileInfo(m.start())
            print(" in %d %0x %0x" % (partnum, start, end))

    return
def buildParts(self, rawML):
    """Split ``rawML`` into flows and rebuild the xhtml parts.

    Consecutive entries of ``self.fdsttbl`` delimit the flows.  Flow 0 is
    the xhtml text; each ``self.skeltbl`` entry names a skeleton inside it
    into which the next ``fragcnt`` entries of ``self.fragtbl`` are
    inserted.  When the insert position from the table would land inside a
    tag, the position is recomputed with ``locate_beg_end_of_tag`` and the
    corrected absolute position is written back into ``self.fragtbl``.

    Sets ``self.flows``, ``self.parts`` and ``self.partinfo`` and appends
    ``[type, format, dir, filename]`` entries to the existing
    ``self.flowinfo`` list.  With ``self.DEBUG`` set the assembled parts
    are also written to ``assembled_text.dat`` below ``self.files.k8dir``.
    Returns ``None``.
    """
    # now split the rawML into its flow pieces
    self.flows = []
    for j in range(len(self.fdsttbl) - 1):
        start = self.fdsttbl[j]
        end = self.fdsttbl[j + 1]
        self.flows.append(rawML[start:end])

    # the first piece represents the xhtml text
    text = self.flows[0]
    self.flows[0] = ''

    # walk the <skeleton> and fragment tables to build original source xhtml files
    # *without* destroying any file position information needed for later href processing
    # and create final list of file separation start: stop points and etc in partinfo
    if self.DEBUG:
        print("\nRebuilding flow piece 0: the main body of the ebook")
    self.parts = []
    self.partinfo = []
    fragptr = 0
    baseptr = 0
    for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
        baseptr = skelpos + skellen
        skeleton = text[skelpos:baseptr]
        # Defaults used when a skeleton has no fragments at all; without
        # them the names below would be undefined or left over from the
        # previous skeleton.
        # NOTE(review): the skeleton-number based name and the '0' aid
        # placeholder are assumptions - confirm against partinfo consumers.
        filename = 'part%04d.xhtml' % skelnum
        aidtext = '0'
        for i in range(fragcnt):
            [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[fragptr]
            aidtext = idtext[12:-2]
            if i == 0:
                filename = 'part%04d.xhtml' % filenum
            fragment = text[baseptr:baseptr + length]
            # table positions are absolute; convert to skeleton relative
            insertpos = insertpos - skelpos
            head = skeleton[:insertpos]
            tail = skeleton[insertpos:]
            actual_inspos = insertpos
            if (tail.find(b'>') < tail.find(b'<') or
                    head.rfind(b'>') < head.rfind(b'<')):
                # There is an incomplete tag in either the head or tail.
                # This can happen for some badly formed KF8 files
                print('The fragment table for %s has incorrect insert position. Calculating manually.'
                      % skelname)
                bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                if bp != ep:
                    actual_inspos = ep + 1 + startpos
            if insertpos != actual_inspos:
                print("%s %s %s" % ("fixed corrupt fragment table insert position",
                                    insertpos + skelpos, actual_inspos + skelpos))
                insertpos = actual_inspos
                # keep the table consistent with the position actually used
                self.fragtbl[fragptr][0] = actual_inspos + skelpos
            skeleton = skeleton[0:insertpos] + fragment + skeleton[insertpos:]
            baseptr = baseptr + length
            fragptr += 1
        self.parts.append(skeleton)
        self.partinfo.append(
            [skelnum, 'Text', filename, skelpos, baseptr, aidtext])

    if self.DEBUG:
        # the assembled text is only needed for this dump, so build it here
        outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
        with open(pathof(outassembled), 'wb') as dump:
            dump.write("".join(self.parts))

    # The primary css style sheet is typically stored next followed by any
    # snippets of code that were previously inlined in the
    # original xhtml but have been stripped out and placed here.
    # This can include local CDATA snippets and svg sections.

    # The problem is that for most browsers and ereaders, you can not
    # use <img src="imageXXXX.svg" /> to import any svg image that itself
    # properly uses an <image/> tag to import some raster image - it
    # should work according to the spec but does not for almost all browsers
    # and ereaders and causes epub validation issues because those raster
    # images are in manifest but not in xhtml text - since they are only
    # referenced from an svg image

    # So we need to check the remaining flow pieces to see if they are css
    # or svg images.  If svg images, we must check if they have an <image />
    # and if so inline them into the xhtml text pieces.

    # there may be other sorts of pieces stored here but until we see one
    # in the wild to reverse engineer we won't be able to tell
    self.flowinfo.append([None, None, None, None])
    svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
    image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
    for j in range(1, len(self.flows)):
        flowpart = self.flows[j]
        nstr = '%04d' % j
        svg_match = svg_tag_pattern.search(flowpart)
        if svg_match is not None:
            # svg
            flow_type = 'svg'
            if image_tag_pattern.search(flowpart) is not None:
                flow_format = 'inline'
                flow_dir = None
                fname = None
                # strip off anything before <svg if inlining
                flowpart = flowpart[svg_match.start():]
            else:
                flow_format = 'file'
                flow_dir = "Images"
                fname = 'svgimg' + nstr + '.svg'
        else:
            flow_type = 'css'
            # search for CDATA and if exists inline it
            if flowpart.find('[CDATA[') >= 0:
                flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                flow_format = 'inline'
                flow_dir = None
                fname = None
            else:
                # css - assume as standalone css file
                flow_format = 'file'
                flow_dir = "Styles"
                fname = 'style' + nstr + '.css'
        self.flows[j] = flowpart
        self.flowinfo.append([flow_type, flow_format, flow_dir, fname])

    if self.DEBUG:
        print("\nFlow Map:  %d entries" % len(self.flowinfo))
        for fi in self.flowinfo:
            print(fi)
        print("\n")

        print("\nXHTML File Part Position Information: %d entries" % len(
            self.partinfo))
        for pi in self.partinfo:
            print(pi)

    if False:  # self.Debug:  (deliberately disabled debugging aid)
        # dump all of the locations of the aid tags used in TEXT
        # find id links only inside of tags
        #    inside any < > pair find all "aid=' and return whatever is inside the quotes
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        print("\npositions of all aid= pieces")
        id_pattern = re.compile(
            r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
        for m in id_pattern.finditer(rawML):
            [dbg_filename, partnum, start, end] = self.getFileInfo(m.start())
            [dbg_seqnum, dbg_idtext] = self.getFragTblInfo(m.start())
            value = fromBase32(m.group(1))
            print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (
                m.group(1), value, m.start(), partnum, start, end))
            print(" %s fragtbl entry %d" % (dbg_idtext, dbg_seqnum))

    return
def buildXHTML(self):
    """Turn the raw KF8 xhtml parts and flow pieces into linkable files.

    Works on the parts and flows held by ``self.k8proc`` and, in this order:

      1. resolves ``kindle:pos:fid:XXXX:off:YYYY`` hrefs to ``"file#id"``
         targets - this has to happen **before** anything is cut or pasted
         because those links are positions in the untouched text
      2. drops the Kindlegen generated ``aid`` attributes
      3. turns ``data-AmznPageBreak`` attributes into ``page-break-after``
         styles
      4. resolves ``kindle:embed`` / ``kindle:flow`` links inside the flow
         pieces, since flows may be inlined into the xhtml afterwards
      5. resolves ``kindle:flow`` links in the xhtml, replacing the tag by
         the flow itself when the flow's format is ``'inline'``
      6. resolves ``kindle:embed`` image links in style attributes and in
         img / image tags
      7. repairs the capitalisation of svg attributes and removes ``value``
         attributes from li tags

    The results are handed back with ``self.k8proc.setFlows()`` and
    ``self.k8proc.setParts()``.

    Returns:
        ``self.used``, in which the name of every image, font and flow file
        that was linked to has been set to ``'used'``.
    """
    image_error = "Error: Referenced image %s was not recognized as a valid image in %s"
    font_error = "Error: Referenced font %s was not recognized as a valid font in %s"
    style_error = "Error: Referenced image %s in style url was not recognized in %s"

    def rewrite_matches(pattern, text, fix):
        # every pattern passed in wraps its whole match in one capturing
        # group, so split() yields [text, match, text, match, ...]; only the
        # matches (odd positions) are handed to fix(), the text in between
        # is never touched
        pieces = pattern.split(text)
        for j in range(1, len(pieces), 2):
            pieces[j] = fix(pieces[j])
        return "".join(pieces)

    def relink_embeds(pattern, text, subdir, errfmt, keep_delims):
        # Point every kindle:embed reference that pattern finds in text at
        # ../subdir/<name>.  A callback is used rather than
        # "finditer + sub(..., 1)" so that each replacement lands on the
        # match it was computed for even when an earlier reference had to
        # be left alone, and so that names are inserted literally instead
        # of being parsed as a replacement template.
        def target(m):
            number = fromBase32(m.group(1))
            # the number is used as index + 1: 0 must not wrap around to the
            # last name and a number past the end must not raise IndexError
            if 0 < number <= len(self.imgnames):
                name = self.imgnames[number - 1]
            else:
                name = None
            if name is None:
                print(errfmt % (number, text))
                return m.group(0)
            self.used[name] = 'used'
            path = '../' + subdir + '/' + name
            if keep_delims:
                # keep whatever opened and closed the reference: ( ) ' or "
                found = m.group(0)
                return found[0] + path + found[-1]
            return '"' + path + '"'
        return pattern.sub(target, text)

    # ---- 1. internal links ------------------------------------------------
    #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
    #       XXXX is the offset in records into divtbl
    #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
    posfid_pattern = re.compile(r'''(<a.*?href=.*?>)''', re.IGNORECASE)
    posfid_index_pattern = re.compile(r'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

    def link_target(m):
        fname, idtag = self.k8proc.getIDTagByPosFid(m.group(1), m.group(2))
        if idtag == '':
            return '"' + fname + '"'
        return '"' + fname + '#' + idtag + '"'

    def fix_links(tag):
        return posfid_index_pattern.sub(link_target, tag)

    print("Building proper xhtml for each file")
    parts = [rewrite_matches(posfid_pattern, self.k8proc.getPart(i), fix_links)
             for i in range(self.k8proc.getNumberOfParts())]

    # ---- 2. aid attributes ------------------------------------------------
    # from here on we are free to cut and paste as we see fit, so the
    # Kindlegen generated aid attributes can be removed.  Only the tags that
    # carry one are edited; text that merely looks like aid="..." survives.
    find_tag_with_aid_pattern = re.compile(r'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
    within_tag_aid_position_pattern = re.compile(r'''\said\s*=['"][^'"]*['"]''')

    def strip_aid(tag):
        return within_tag_aid_position_pattern.sub('', tag)

    parts = [rewrite_matches(find_tag_with_aid_pattern, part, strip_aid) for part in parts]

    # ---- 3. data-AmznPageBreak attributes ----------------------------------
    # replaced by the equivalent page-break-after style
    find_tag_with_AmznPageBreak_pattern = re.compile(r'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
    within_tag_AmznPageBreak_position_pattern = re.compile(r'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')

    def pagebreak_style(m):
        return ' style="page-break-after:%s"' % m.group(1)

    def restyle_pagebreak(tag):
        return within_tag_AmznPageBreak_position_pattern.sub(pagebreak_style, tag)

    parts = [rewrite_matches(find_tag_with_AmznPageBreak_pattern, part, restyle_pagebreak)
             for part in parts]

    # ---- 4. links inside the flow pieces -----------------------------------
    # the flows have to be handled before the xhtml text because they may be
    # inlined into it
    #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    #   kindle:embed:XXXX   (used for fonts)

    # "<img " or "<image "; the former "[img\s|image\s]" was a character
    # class that matched any tag starting with one of those letters
    img_pattern = re.compile(r'''(<(?:img|image)\s[^>]*>)''', re.IGNORECASE)
    flow_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)
    url_pattern = re.compile(r'''(url\(.*?\))''', re.IGNORECASE)
    # the mime type stops at the first quote or ")" so that a quoted
    # url("...") keeps its closing quote
    url_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^'")]*["')]''', re.IGNORECASE)
    font_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
    # an opening quote, if any, is consumed together with the link (the
    # closing one already was) because the replacement brings its own quotes
    url_css_index_pattern = re.compile(r'''['"]?kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)

    def css_target(m):
        typ, fmt, pdir, fnm = self.k8proc.getFlowInfo(fromBase32(m.group(1)))
        link = '"../' + pdir + '/' + fnm + '"'
        self.used[fnm] = 'used'
        return link

    def fix_flow_image_tag(tag):
        return relink_embeds(flow_img_index_pattern, tag, 'Images', image_error, False)

    def fix_css_url(url):
        url = relink_embeds(url_img_index_pattern, url, 'Images', image_error, True)
        url = relink_embeds(font_index_pattern, url, 'Fonts', font_error, True)
        return url_css_index_pattern.sub(css_target, url)

    # flow 0 is the xhtml text itself, it only gets a placeholder here
    flows = [None]
    for i in range(1, self.k8proc.getNumberOfFlows()):
        flowpart = self.k8proc.getFlow(i)
        flowpart = rewrite_matches(img_pattern, flowpart, fix_flow_image_tag)
        flowpart = rewrite_matches(url_pattern, flowpart, fix_css_url)
        # store away in our own copy
        flows.append(flowpart)

    # NOTE: kindle:flow links outside of url() are deliberately not resolved
    # inside the flows themselves.  That case is not known to exist and
    # would need a separate pass so that no flow gets inlined into another
    # one before both of them have been fully processed.

    # ---- 5. flow links in the xhtml text -----------------------------------
    #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    tag_pattern = re.compile(r'''(<[^>]*>)''')
    flow_pattern = re.compile(r'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

    def relink_flows(tag):
        inlined = []

        def target(m):
            num = fromBase32(m.group(1))
            if not 0 < num < len(self.k8proc.flowinfo):
                print("warning: ignoring non-existent flow link %s  value 0x%x" % (tag, num))
                return m.group(0)
            typ, fmt, pdir, fnm = self.k8proc.getFlowInfo(num)
            if fmt == 'inline':
                # the whole tag gets replaced by the processed flow below
                inlined.append(flows[num])
                return m.group(0)
            link = '"../' + pdir + '/' + fnm + '"'
            self.used[fnm] = 'used'
            return link

        relinked = flow_pattern.sub(target, tag)
        return inlined[-1] if inlined else relinked

    parts = [rewrite_matches(tag_pattern, part, relink_flows) for part in parts]

    # ---- 6. embedded raster images -----------------------------------------
    # a) inside the urls of style= attributes
    style_pattern = re.compile(r'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
    # stops at the first quote or ")" so that an unquoted url(...) inside a
    # quoted style attribute keeps its closing parenthesis
    style_img_index_pattern = re.compile(r'''[('"]kindle:embed:([0-9|A-V]+)[^'")]*['")]''', re.IGNORECASE)

    def fix_style_urls(tag):
        if 'kindle:embed' not in tag:
            return tag
        return relink_embeds(style_img_index_pattern, tag, 'Images', style_error, True)

    parts = [rewrite_matches(style_pattern, part, fix_style_urls) for part in parts]

    # b) in img / image tags
    #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    part_img_index_pattern = re.compile(r'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')

    def fix_part_image_tag(tag):
        return relink_embeds(part_img_index_pattern, tag, 'Images', image_error, False)

    parts = [rewrite_matches(img_pattern, part, fix_part_image_tag) for part in parts]

    # ---- 7. general cleanups needed to make valid XHTML --------------------
    #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
    #   in svg tags replace "viewbox" attributes with "viewBox"
    #   in <li> remove value="XX" attributes since these are illegal
    li_value_pattern = re.compile(r'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

    def cleanup_tag(tag):
        if tag.startswith(('<svg', '<SVG')):
            tag = tag.replace('preserveaspectratio', 'preserveAspectRatio')
            tag = tag.replace('viewbox', 'viewBox')
        elif tag.startswith(('<li ', '<LI ')):
            tag = li_value_pattern.sub('', tag)
        return tag

    parts = [rewrite_matches(tag_pattern, part, cleanup_tag) for part in parts]

    self.k8proc.setFlows(flows)
    self.k8proc.setParts(parts)

    return self.used
def buildXHTML(self):
    """Turn the raw KF8 xhtml parts and flow pieces into linkable files.

    Works on the parts and flows held by ``self.k8proc`` and, in this order:

      1. resolves ``kindle:pos:fid:XXXX:off:YYYY`` hrefs to ``"file#id"``
         targets - this has to happen **before** anything is cut or pasted
         because those links are positions in the untouched text
      2. removes the Kindlegen generated ``aid`` and ``data-AmznPageBreak``
         attributes
      3. resolves ``kindle:embed`` / ``kindle:flow`` links inside the flow
         pieces, since flows may be inlined into the xhtml afterwards
      4. resolves ``kindle:flow`` links in the xhtml, replacing the tag by
         the flow itself when the flow's format is ``"inline"``
      5. resolves ``kindle:embed`` image links in img / image tags
      6. repairs the capitalisation of svg attributes and removes ``value``
         attributes from li tags

    The results are handed back with ``self.k8proc.setFlows()`` and
    ``self.k8proc.setParts()``.

    Returns:
        ``self.used``, in which the name of every image, font and flow file
        that was linked to has been set to ``"used"``.
    """
    image_error = "Error: Referenced image %s was not recognized as a valid image in %s"
    font_error = "Error: Referenced font %s was not recognized as a valid font in %s"

    def rewrite_matches(pattern, text, fix):
        # every pattern passed in wraps its whole match in one capturing
        # group, so split() yields [text, match, text, match, ...]; only the
        # matches (odd positions) are handed to fix(), the text in between
        # is never touched
        pieces = pattern.split(text)
        for j in range(1, len(pieces), 2):
            pieces[j] = fix(pieces[j])
        return "".join(pieces)

    def relink_embeds(pattern, text, subdir, errfmt):
        # Point every kindle:embed reference that pattern finds in text at
        # "../subdir/<name>".  A callback is used rather than
        # "finditer + sub(..., 1)" so that each replacement lands on the
        # match it was computed for even when an earlier reference had to
        # be left alone, and so that names are inserted literally instead
        # of being parsed as a replacement template.
        def target(m):
            number = fromBase32(m.group(1))
            # the number is used as index + 1: 0 must not wrap around to the
            # last name and a number past the end must not raise IndexError
            if 0 < number <= len(self.imgnames):
                name = self.imgnames[number - 1]
            else:
                name = None
            if name is None:
                print(errfmt % (number, text))
                return m.group(0)
            self.used[name] = "used"
            return '"../' + subdir + "/" + name + '"'
        return pattern.sub(target, text)

    # ---- 1. internal links ------------------------------------------------
    #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
    #       XXXX is the offset in records into divtbl
    #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
    posfid_pattern = re.compile(r"""(<a.*?href=.*?>)""", re.IGNORECASE)
    posfid_index_pattern = re.compile(r"""['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']""")

    def link_target(m):
        fname, idtag = self.k8proc.getIDTagByPosFid(m.group(1), m.group(2))
        if idtag == "":
            return '"' + fname + '"'
        return '"' + fname + "#" + idtag + '"'

    def fix_links(tag):
        return posfid_index_pattern.sub(link_target, tag)

    print("Building proper xhtml for each file")
    parts = [rewrite_matches(posfid_pattern, self.k8proc.getPart(i), fix_links)
             for i in range(self.k8proc.getNumberOfParts())]

    # ---- 2. Kindlegen generated attributes ---------------------------------
    # from here on we are free to cut and paste as we see fit, so the aid
    # and data-AmznPageBreak attributes can be removed.  Only the tags that
    # carry one are edited; text that merely looks like aid="..." survives.
    find_tag_with_aid_pattern = re.compile(r"""(<[^>]*\said\s*=[^>]*>)""", re.IGNORECASE)
    within_tag_aid_position_pattern = re.compile(r"""\said\s*=['"][^'"]*['"]""")
    find_tag_with_AmznPageBreak_pattern = re.compile(r"""(<[^>]*\sdata-AmznPageBreak=[^>]*>)""", re.IGNORECASE)
    within_tag_AmznPageBreak_position_pattern = re.compile(r"""\sdata-AmznPageBreak=['"][^'"]*['"]""")

    def strip_aid(tag):
        return within_tag_aid_position_pattern.sub("", tag)

    def strip_pagebreak(tag):
        return within_tag_AmznPageBreak_position_pattern.sub("", tag)

    parts = [rewrite_matches(find_tag_with_aid_pattern, part, strip_aid) for part in parts]
    parts = [rewrite_matches(find_tag_with_AmznPageBreak_pattern, part, strip_pagebreak)
             for part in parts]

    # ---- 3. links inside the flow pieces -----------------------------------
    # the flows have to be handled before the xhtml text because they may be
    # inlined into it
    #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    #   kindle:embed:XXXX   (used for fonts)

    # "<img " or "<image "; the former "[img\s|image\s]" was a character
    # class that matched any tag starting with one of those letters
    img_pattern = re.compile(r"""(<(?:img|image)\s[^>]*>)""", re.IGNORECASE)
    flow_img_index_pattern = re.compile(r"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""", re.IGNORECASE)
    tag_pattern = re.compile(r"""(<[^>]*>)""")
    flow_pattern = re.compile(r"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE)
    url_pattern = re.compile(r"""(url\(.*?\))""", re.IGNORECASE)
    # the replacements inside url() bring their own double quotes, so quotes
    # that already surround a link are consumed together with it; otherwise
    # url("kindle:...") would end up with doubled quotes
    url_img_index_pattern = re.compile(r"""['"]?kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*""", re.IGNORECASE)
    font_index_pattern = re.compile(r"""['"]?kindle:embed:([0-9|A-V]+)['"]?""", re.IGNORECASE)
    url_css_index_pattern = re.compile(r"""['"]?kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*""", re.IGNORECASE)

    flow_count = self.k8proc.getNumberOfFlows()
    # flow 0 is the xhtml text itself, it only gets a placeholder here
    flows = [None]

    def css_target(m):
        typ, fmt, pdir, fnm = self.k8proc.getFlowInfo(fromBase32(m.group(1)))
        link = '"../' + pdir + "/" + fnm + '"'
        self.used[fnm] = "used"
        return link

    def fix_flow_image_tag(tag):
        return relink_embeds(flow_img_index_pattern, tag, "Images", image_error)

    def fix_css_url(url):
        url = relink_embeds(url_img_index_pattern, url, "Images", image_error)
        url = relink_embeds(font_index_pattern, url, "Fonts", font_error)
        return url_css_index_pattern.sub(css_target, url)

    def relink_flows(tag, flow_text):
        # Resolve the kindle:flow links of one tag; flow_text(num) supplies
        # the text that replaces the whole tag when flow num is inline.
        inlined = []

        def target(m):
            num = fromBase32(m.group(1))
            if not 0 < num < flow_count:
                print("warning: ignoring non-existent flow link %s  value 0x%x" % (tag, num))
                return m.group(0)
            typ, fmt, pdir, fnm = self.k8proc.getFlowInfo(num)
            if fmt == "inline":
                inlined.append(flow_text(num))
                return m.group(0)
            link = '"../' + pdir + "/" + fnm + '"'
            self.used[fnm] = "used"
            return link

        relinked = flow_pattern.sub(target, tag)
        return inlined[-1] if inlined else relinked

    def flow_text_so_far(num):
        # flows in front of the current one are already processed; a later
        # one (or the current one itself) is only available in its raw form
        if num < len(flows):
            return flows[num]
        return self.k8proc.getFlow(num)

    def fix_flow_tag(tag):
        return relink_flows(tag, flow_text_so_far)

    for i in range(1, flow_count):
        flowpart = self.k8proc.getFlow(i)
        flowpart = rewrite_matches(img_pattern, flowpart, fix_flow_image_tag)
        flowpart = rewrite_matches(url_pattern, flowpart, fix_css_url)
        # flow pattern not inside url()
        flowpart = rewrite_matches(tag_pattern, flowpart, fix_flow_tag)
        # store away in our own copy
        flows.append(flowpart)

    # ---- 4. flow links in the xhtml text -----------------------------------
    #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
    def fix_part_tag(tag):
        # inline the processed copy, not the raw flow: nothing later on
        # rewrites url() links in the xhtml parts, so the ones inside an
        # inlined raw style block would stay unresolved
        return relink_flows(tag, lambda num: flows[num])

    parts = [rewrite_matches(tag_pattern, part, fix_part_tag) for part in parts]

    # ---- 5. embedded raster images in the xhtml text -----------------------
    #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
    part_img_index_pattern = re.compile(r"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""")

    def fix_part_image_tag(tag):
        return relink_embeds(part_img_index_pattern, tag, "Images", image_error)

    parts = [rewrite_matches(img_pattern, part, fix_part_image_tag) for part in parts]

    # ---- 6. general cleanups needed to make valid XHTML --------------------
    #   in svg tags replace "preserveaspectratio" attributes with "preserveAspectRatio"
    #   in svg tags replace "viewbox" attributes with "viewBox"
    #   in <li> remove value="XX" attributes since these are illegal
    li_value_pattern = re.compile(r"""\svalue\s*=\s*['"][^'"]*['"]""", re.IGNORECASE)

    def cleanup_tag(tag):
        if tag.startswith(("<svg", "<SVG")):
            tag = tag.replace("preserveaspectratio", "preserveAspectRatio")
            tag = tag.replace("viewbox", "viewBox")
        elif tag.startswith(("<li ", "<LI ")):
            tag = li_value_pattern.sub("", tag)
        return tag

    parts = [rewrite_matches(tag_pattern, part, cleanup_tag) for part in parts]

    self.k8proc.setFlows(flows)
    self.k8proc.setParts(parts)

    return self.used
def buildParts(self, rawML):
    """Cut rawML into its flows and rebuild the original xhtml files.

    ``self.fdsttbl`` holds the flow boundaries.  Flow 0 is the xhtml text;
    it is reassembled, one file per ``self.skeltbl`` entry, by inserting
    that skeleton's ``self.fragtbl`` fragments into it.  The remaining
    flows are classified as css or svg and as stand-alone file or inline
    piece.

    Sets ``self.flows``, ``self.parts`` and ``self.partinfo``, appends one
    ``[type, format, dir, fname]`` entry per flow to ``self.flowinfo`` and
    corrects the insert position of ``self.fragtbl`` entries that turn out
    to be corrupt.

    Args:
        rawML: the complete raw markup the table positions refer to.
    """
    # now split the rawML into its flow pieces
    self.flows = [rawML[beg:end] for beg, end in zip(self.fdsttbl, self.fdsttbl[1:])]

    # the first piece represents the xhtml text
    text = self.flows[0]
    self.flows[0] = ''

    # walk the <skeleton> and fragment tables to build original source xhtml files
    # *without* destroying any file position information needed for later href processing
    # and create final list of file separation start: stop points and etc in partinfo
    if self.DEBUG:
        print("\nRebuilding flow piece 0: the main body of the ebook")
    self.parts = []
    self.partinfo = []
    fragptr = 0
    for skelnum, skelname, fragcnt, skelpos, skellen in self.skeltbl:
        # the fragments of a skeleton are stored right behind it
        baseptr = skelpos + skellen
        skeleton = text[skelpos:baseptr]
        # NOTE(review): filename and aidtext are only assigned inside the
        # fragment loop, so a skeleton without fragments either raises
        # UnboundLocalError (first skeleton) or reuses the values of the
        # previous file - confirm that fragcnt is never 0
        for i in range(fragcnt):
            insertpos, idtext, filenum, seqnum, startpos, length = self.fragtbl[fragptr]
            aidtext = idtext[12:-2]
            if i == 0:
                filename = 'part%04d.xhtml' % filenum
            fragment = text[baseptr:baseptr + length]
            # insert positions are absolute, make this one skeleton relative
            insertpos = insertpos - skelpos
            head = skeleton[:insertpos]
            tail = skeleton[insertpos:]
            actual_inspos = insertpos
            if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') < head.rfind(b'<')):
                # There is an incomplete tag in either the head or tail.
                # This can happen for some badly formed KF8 files
                print('The fragment table for %s has incorrect insert position. Calculating manually.' % skelname)
                bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
                if bp != ep:
                    actual_inspos = ep + 1 + startpos
            if insertpos != actual_inspos:
                print("fixed corrupt fragment table insert position %s %s" % (insertpos + skelpos, actual_inspos + skelpos))
                insertpos = actual_inspos
                # remember the corrected (absolute) position for later users
                self.fragtbl[fragptr][0] = actual_inspos + skelpos
            skeleton = skeleton[0:insertpos] + fragment + skeleton[insertpos:]
            baseptr = baseptr + length
            fragptr += 1
        self.parts.append(skeleton)
        self.partinfo.append([skelnum, 'Text', filename, skelpos, baseptr, aidtext])

    if self.DEBUG:
        # the assembled text is only needed for this dump, so it is only
        # built here; the file is closed again as soon as it is written
        outassembled = os.path.join(self.files.k8dir, 'assembled_text.dat')
        with open(pathof(outassembled), 'wb') as dump:
            dump.write("".join(self.parts))

    # The primary css style sheet is typically stored next followed by any
    # snippets of code that were previously inlined in the
    # original xhtml but have been stripped out and placed here.
    # This can include local CDATA snippets and svg sections.

    # The problem is that for most browsers and ereaders, you can not
    # use <img src="imageXXXX.svg" /> to import any svg image that itself
    # properly uses an <image/> tag to import some raster image - it
    # should work according to the spec but does not for almost all browsers
    # and ereaders and causes epub validation issues because those raster
    # images are in manifest but not in xhtml text - since they are only
    # referenced from an svg image

    # So we need to check the remaining flow pieces to see if they are css
    # or svg images.  if svg images, we must check if they have an <image />
    # and if so inline them into the xhtml text pieces.

    # there may be other sorts of pieces stored here but until we see one
    # in the wild to reverse engineer we won't be able to tell
    self.flowinfo.append([None, None, None, None])
    svg_tag_pattern = re.compile(r'''(<svg[^>]*>)''', re.IGNORECASE)
    image_tag_pattern = re.compile(r'''(<image[^>]*>)''', re.IGNORECASE)
    for j in range(1, len(self.flows)):
        flowpart = self.flows[j]
        nstr = '%04d' % j
        m = svg_tag_pattern.search(flowpart)
        if m is not None:
            # svg
            kind = 'svg'
            if image_tag_pattern.search(flowpart) is not None:
                fmt, folder, fname = 'inline', None, None
                # strip off anything before <svg if inlining
                flowpart = flowpart[m.start():]
            else:
                fmt, folder, fname = 'file', 'Images', 'svgimg' + nstr + '.svg'
        else:
            kind = 'css'
            # search for CDATA and if exists inline it
            if flowpart.find('[CDATA[') >= 0:
                flowpart = '<style type="text/css">\n' + flowpart + '\n</style>\n'
                fmt, folder, fname = 'inline', None, None
            else:
                # css - assume as standalone css file
                fmt, folder, fname = 'file', 'Styles', 'style' + nstr + '.css'
        self.flows[j] = flowpart
        self.flowinfo.append([kind, fmt, folder, fname])

    if self.DEBUG:
        print("\nFlow Map:  %d entries" % len(self.flowinfo))
        for fi in self.flowinfo:
            print(fi)
        print("\n")

        print("\nXHTML File Part Position Information: %d entries" % len(self.partinfo))
        for pi in self.partinfo:
            print(pi)

    if False:  # self.Debug:
        # dump all of the locations of the aid tags used in TEXT
        # find id links only inside of tags
        #    inside any < > pair find all "aid=' and return whatever is inside the quotes
        #    [^>]* means match any amount of chars except for  '>' char
        #    [^'"] match any amount of chars except for the quote character
        #    \s* means match any amount of whitespace
        print("\npositions of all aid= pieces")
        id_pattern = re.compile(r'''<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>''', re.IGNORECASE)
        for m in re.finditer(id_pattern, rawML):
            [filename, partnum, start, end] = self.getFileInfo(m.start())
            [seqnum, idtext] = self.getFragTblInfo(m.start())
            value = fromBase32(m.group(1))
            print(" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d" % (m.group(1), value, m.start(), partnum, start, end))
            print(" %s fragtbl entry %d" % (idtext, seqnum))