def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): ''' Apply inflection rule. @param mainEntry: The word to inflect. @param inflectionRuleData: The inflection rules. @param start: The start position of the inflection rule to use. @param end: The end position of the inflection rule to use. @return: The string with the inflected word or None if an error occurs. ''' mode = -1 byteArray = array.array("c", mainEntry) position = len(byteArray) for charOffset in range(start, end): char = inflectionRuleData[charOffset] byte = ord(char) if byte >= 0x0a and byte <= 0x13: # Move cursor backwards offset = byte - 0x0a if mode not in [0x02, 0x03]: mode = 0x02 position = len(byteArray) position -= offset elif byte > 0x13: if mode == -1: print "Error: Unexpected first byte %i of inflection rule" % byte return None elif position == -1: print "Error: Unexpected first byte %i of inflection rule" % byte return None else: if mode == 0x01: # Insert at word start byteArray.insert(position, char) position += 1 elif mode == 0x02: # Insert at word end byteArray.insert(position, char) elif mode == 0x03: # Delete at word end position -= 1 deleted = byteArray.pop(position) if deleted != char: if DEBUG_DICT: print "0x03: %s %s %s %s" % ( mainEntry, toHex(inflectionRuleData[start:end]), char, deleted) print "Error: Delete operation of inflection rule failed" return None elif mode == 0x04: # Delete at word start deleted = byteArray.pop(position) if deleted != char: if DEBUG_DICT: print "0x03: %s %s %s %s" % ( mainEntry, toHex(inflectionRuleData[start:end]), char, deleted) print "Error: Delete operation of inflection rule failed" return None else: print "Error: Inflection rule mode %x is not implemented" % mode return None elif byte == 0x01: # Insert at word start if mode not in [0x01, 0x04]: position = 0 mode = byte elif byte == 0x02: # Insert at word end if mode not in [0x02, 0x03]: position = len(byteArray) mode = byte elif byte == 0x03: # Delete at word end if mode not 
in [0x02, 0x03]: position = len(byteArray) mode = byte elif byte == 0x04: # Delete at word start if mode not in [0x01, 0x04]: position = 0 # Delete at word start mode = byte else: print "Error: Inflection rule mode %x is not implemented" % byte return None return byteArray.tostring()
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    '''
    Unpack every mobi header in mhlst into its on-disk representation.

    For each header: dumps metadata, saves the raw markup, extracts the
    additional sections (images, fonts, resources, kindlegen sources), then
    dispatches to PrintReplica, K8 (epub) or classic Mobi output handling.

    @param files: Output-path helper object (outdir, imgdir, k8dir, ...).
    @param sect: Section reader providing loadSection() and num_sections.
    @param mhlst: List of mobi header objects to process.
    @param K8Boundary: Section index marking the start of K8 data —
        presumably separates mobi7 from K8 sections; verify against caller.
    @param k8only: If True, skip the PrintReplica and classic-Mobi outputs.
    '''
    imgnames = []
    for mh in mhlst:
        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."
        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, 'wb').write(mh.header)
        # process each mobi header
        if mh.isEncrypted():
            raise unpackException('file is encrypted')
        # build up the metadata
        metadata = mh.getMetaData()
        metadata['Language'] = mh.Language()
        metadata['Title'] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata['Codec'] = [mh.codec]
        metadata['UniqueID'] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata
        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = '.rawml'
            if mh.isK8():
                outraw = os.path.join(files.k8dir,
                                      files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = '.rawpr'
                    outraw = os.path.join(files.outdir,
                                          files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir,
                                          files.getInputFileBasename() + ext)
            file(outraw, 'wb').write(rawML)
        # process additional sections that represent images, resources, fonts, and etc
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        # NOTE: imgnames gets a None placeholder for every non-image section
        # so that image indices stay aligned with section numbers.
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                fname = "%05d" % (1 + i - beg)
                fname = type + fname
                if mh.isK8():
                    fname += "_K8"
                fname += '.dat'
                outname = os.path.join(files.outdir, fname)
                file(outname, 'wb').write(data)
                print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print " Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, 'wb').write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes 0 - 3: 'FONT'
                # bytes 4 - 7: uncompressed size
                # bytes 8 - 11: flags
                #     bit 0x0001 - zlib compression
                #     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15: offset to start of compressed font data
                # bytes 16 - 19: length of xor string stored before the start of the compressed font data
                # bytes 19 - 23: start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(
                    '>LLLLL', data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start:xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from('>I', font_data,
                                                      len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits,
                                                    usize)
                        if len(font_data) != usize:
                            print 'Font Decompression Error: Uncompressed font size mismatch'
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = (zlib.adler32(font_data) & 0xffffffff)
                            if sig != adler32:
                                print 'Font Decompression Error'
                                print 'Adler checksum did not match. Stored: %d Calculated: %d' % (
                                    adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                # Sniff the font container type from its first four bytes.
                hdr = font_data[0:4]
                if hdr == '\0\1\0\0' or hdr == 'true' or hdr == 'ttcf':
                    ext = '.ttf'
                elif hdr == 'OTTO':
                    ext = '.otf'
                else:
                    print "Warning: unknown font header %s" % hdr.encode('hex')
                    ext = '.dat'
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                if (ext == '.ttf' or ext == '.otf') and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print " extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, 'wb').write(font_data)
                imgnames.append(fontname)
                continue
            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print " extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, 'wb').write(data)
                imgnames.append(None)
                continue
            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue
            # if reach here should be an image but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print 'First 4 bytes: %s' % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, 'wb').write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print " extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, 'wb').write(data)
                imgnames.append(imgname)
        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print 'Error processing Print Replica: ' + str(e)
            filenames.append(['', files.getInputFileBasename() + '.pdf'])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = 'used'
            opf = OPFProcessor(files, metadata, filenames, imgnames, False,
                               mh, usedmap)
            opf.writeOPF()
            continue
        if mh.isK8():
            # K8 mobi
            # require other indexes which contain parsing information and the FDST info
            # to process the rawml back into the xhtml files, css files, svg image files, etc
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)
            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if 'StartOffset' in metadata.keys():
                starts = metadata['StartOffset']
                last_start = starts.pop()
                # 0xffffffff is the sentinel for "no start offset".
                if int(last_start) == 0xffffffff:
                    last_start = '0'
                filename, partnum, beg, end = k8proc.getFileInfo(
                    int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != '':
                    linktgt += '#' + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4, off] = ncxmap['pos_fid'].split(':')
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap['filename'] = filename
                ncxmap['idtag'] = idtag
                ncx_data[i] = ncxmap
            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)
            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()
            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, 'wb').write(part)
            # flow 0 is the main text, already handled above, so start at 1
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == 'file':
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, 'wb').write(flowpart)
            opf = OPFProcessor(files, metadata, filenames, imgnames,
                               ncx.isNCX, mh, usedmap, guidetext)
            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()
            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)
        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)
            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata['DictInLanguage'] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata['DictOutLanguage'] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()
            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []
            # write the proper mobi html
            fname = files.getInputFileBasename() + '.html'
            filenames.append(['', fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, 'wb').write(srctext)
            # create an OPF
            # extract guidetext from srctext
            guidetext = ''
            guidematch = re.search(r'''<guide>(.*)</guide>''', srctext,
                                   re.IGNORECASE + re.DOTALL)
            if guidematch:
                # rewrite filepos references as fragment links into the html
                replacetext = r'''href="''' + filenames[0][1] + r'''#filepos\1"'''
                guidetext = re.sub(r'''filepos=['"]{0,1}0*(\d+)['"]{0,1}''',
                                   replacetext, guidematch.group(1))
                guidetext += '\n'
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames,
                               ncx.isNCX, mh, usedmap, guidetext)
            opf.writeOPF()
def process_all_mobi_headers(files, sect, mhlst, K8Boundary, k8only=False):
    '''
    Unpack every mobi header in mhlst into its on-disk representation.

    For each header: dumps metadata, saves the raw markup, extracts the
    additional sections (images, fonts, resources, kindlegen sources), then
    dispatches to PrintReplica, K8 (epub) or classic Mobi output handling.

    @param files: Output-path helper object (outdir, imgdir, k8dir, ...).
    @param sect: Section reader providing loadSection() and num_sections.
    @param mhlst: List of mobi header objects to process.
    @param K8Boundary: Section index marking the start of K8 data —
        presumably separates mobi7 from K8 sections; verify against caller.
    @param k8only: If True, skip the PrintReplica and classic-Mobi outputs.
    '''
    imgnames = []
    for mh in mhlst:
        if mh.isK8():
            print "\n\nProcessing K8 format Ebook ..."
        elif mh.isPrintReplica():
            print "\nProcessing PrintReplica (.azw4) format Ebook ..."
        else:
            print "\nProcessing Mobi format Ebook ..."
        if DEBUG:
            # write out raw mobi header data
            mhname = os.path.join(files.outdir, "header.dat")
            if mh.isK8():
                mhname = os.path.join(files.outdir, "header_K8.dat")
            file(mhname, "wb").write(mh.header)
        # process each mobi header
        if mh.isEncrypted():
            raise unpackException("file is encrypted")
        # build up the metadata
        metadata = mh.getMetaData()
        metadata["Language"] = mh.Language()
        metadata["Title"] = [unicode(mh.title, mh.codec).encode("utf-8")]
        metadata["Codec"] = [mh.codec]
        metadata["UniqueID"] = [str(mh.unique_id)]
        if DEBUG:
            print "MetaData from EXTH"
            print metadata
        # save the raw markup language
        rawML = mh.getRawML()
        if DEBUG or WRITE_RAW_DATA:
            ext = ".rawml"
            if mh.isK8():
                outraw = os.path.join(files.k8dir,
                                      files.getInputFileBasename() + ext)
            else:
                if mh.isPrintReplica():
                    ext = ".rawpr"
                    outraw = os.path.join(files.outdir,
                                          files.getInputFileBasename() + ext)
                else:
                    outraw = os.path.join(files.mobi7dir,
                                          files.getInputFileBasename() + ext)
            file(outraw, "wb").write(rawML)
        # process additional sections that represent images, resources, fonts, and etc
        # build up a list of image names to use to postprocess the rawml
        print "Unpacking images, resources, fonts, etc"
        firstaddl = mh.getfirstAddl()
        if DEBUG:
            print "firstaddl is ", firstaddl
            print "num_sections is ", sect.num_sections
            print "K8Boundary is ", K8Boundary
        beg = firstaddl
        end = sect.num_sections
        if firstaddl < K8Boundary:
            end = K8Boundary
        obfuscate_data = []
        # NOTE: imgnames gets a None placeholder for every non-image section
        # so that image indices stay aligned with section numbers.
        for i in xrange(beg, end):
            if DEBUG:
                print "Section is ", i
            data = sect.loadSection(i)
            type = data[0:4]
            if type in ["FLIS", "FCIS", "FDST", "DATP"]:
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                fname = "%05d" % (1 + i - beg)
                fname = type + fname
                if mh.isK8():
                    fname += "_K8"
                fname += ".dat"
                outname = os.path.join(files.outdir, fname)
                file(outname, "wb").write(data)
                print "Skipping ", type, " section"
                imgnames.append(None)
                continue
            elif type == "SRCS":
                # The mobi file was created by kindlegen and contains a zip archive with all source files.
                # Extract the archive and save it.
                print " Info: File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME
                srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME)
                file(srcname, "wb").write(data[16:])
                imgnames.append(None)
                continue
            elif type == "FONT":
                # fonts only exist in K8 ebooks
                # Format:
                # bytes 0 - 3: 'FONT'
                # bytes 4 - 7: uncompressed size
                # bytes 8 - 11: flags
                #     bit 0x0001 - zlib compression
                #     bit 0x0002 - obfuscated with xor string
                # bytes 12 - 15: offset to start of compressed font data
                # bytes 16 - 19: length of xor string stored before the start of the compressed font data
                # bytes 19 - 23: start of xor string
                usize, fflags, dstart, xor_len, xor_start = struct.unpack_from(">LLLLL", data, 4)
                font_data = data[dstart:]
                extent = len(font_data)
                extent = min(extent, 1040)
                if fflags & 0x0002:
                    # obfuscated so need to de-obfuscate the first 1040 bytes
                    key = bytearray(data[xor_start : xor_start + xor_len])
                    buf = bytearray(font_data)
                    for n in xrange(extent):
                        buf[n] ^= key[n % xor_len]
                    font_data = bytes(buf)
                if fflags & 0x0001:
                    # ZLIB compressed data
                    wbits, err = read_zlib_header(font_data[:2])
                    if err is None:
                        adler32, = struct.unpack_from(">I", font_data,
                                                      len(font_data) - 4)
                        font_data = zlib.decompress(font_data[2:-4], -wbits,
                                                    usize)
                        if len(font_data) != usize:
                            print "Font Decompression Error: Uncompressed font size mismatch"
                        if False:
                            # For some reason these almost never match, probably Amazon has a
                            # buggy Adler32 implementation
                            sig = zlib.adler32(font_data) & 0xFFFFFFFF
                            if sig != adler32:
                                print "Font Decompression Error"
                                print "Adler checksum did not match. Stored: %d Calculated: %d" % (adler32, sig)
                    else:
                        print "Error Decoding Font", str(err)
                # Sniff the font container type from its first four bytes.
                hdr = font_data[0:4]
                if hdr == "\0\1\0\0" or hdr == "true" or hdr == "ttcf":
                    ext = ".ttf"
                elif hdr == "OTTO":
                    ext = ".otf"
                else:
                    print "Warning: unknown font header %s" % hdr.encode("hex")
                    ext = ".dat"
                fontname = "font%05d" % (1 + i - beg)
                fontname += ext
                if (ext == ".ttf" or ext == ".otf") and (fflags & 0x0002):
                    obfuscate_data.append(fontname)
                print " extracting font: ", fontname
                outfnt = os.path.join(files.imgdir, fontname)
                file(outfnt, "wb").write(font_data)
                imgnames.append(fontname)
                continue
            elif type == "RESC":
                # resources only exist in K8 ebooks
                # not sure what they are, looks like
                # a piece of the top of the original content.opf
                # file, so only write them out
                # if DEBUG is True
                if DEBUG:
                    data = data[4:]
                    rescname = "resc%05d.dat" % (1 + i - beg)
                    print " extracting resource: ", rescname
                    outrsc = os.path.join(files.imgdir, rescname)
                    file(outrsc, "wb").write(data)
                imgnames.append(None)
                continue
            if data == EOF_RECORD:
                if DEBUG:
                    print "Skip section %i as it contains the EOF record." % i
                imgnames.append(None)
                continue
            # if reach here should be an image but double check to make sure
            # Get the proper file extension
            imgtype = imghdr.what(None, data)
            if imgtype is None:
                print "Warning: Section %s contains no image or an unknown image format" % i
                imgnames.append(None)
                if DEBUG:
                    print "First 4 bytes: %s" % toHex(data[0:4])
                    fname = "unknown%05d.dat" % (1 + i - beg)
                    outname = os.path.join(files.outdir, fname)
                    file(outname, "wb").write(data)
            else:
                imgname = "image%05d.%s" % (1 + i - beg, imgtype)
                print " extracting image: ", imgname
                outimg = os.path.join(files.imgdir, imgname)
                file(outimg, "wb").write(data)
                imgnames.append(imgname)
        # FIXME all of this PrintReplica code is untested!
        # Process print replica book.
        if mh.isPrintReplica() and not k8only:
            filenames = []
            print "Print Replica ebook detected"
            try:
                mh.processPrintReplica(files)
            except Exception, e:
                print "Error processing Print Replica: " + str(e)
            filenames.append(["", files.getInputFileBasename() + ".pdf"])
            usedmap = {}
            for name in imgnames:
                if name != None:
                    usedmap[name] = "used"
            opf = OPFProcessor(files, metadata, filenames, imgnames, False,
                               mh, usedmap)
            opf.writeOPF()
            continue
        if mh.isK8():
            # K8 mobi
            # require other indexes which contain parsing information and the FDST info
            # to process the rawml back into the xhtml files, css files, svg image files, etc
            k8proc = K8Processor(mh, sect, DEBUG)
            k8proc.buildParts(rawML)
            # collect information for the guide first
            guidetext = k8proc.getGuideText()
            # add in any guide info from metadata, such as StartOffset
            if "StartOffset" in metadata.keys():
                starts = metadata["StartOffset"]
                last_start = starts.pop()
                # 0xFFFFFFFF is the sentinel for "no start offset".
                if int(last_start) == 0xFFFFFFFF:
                    last_start = "0"
                filename, partnum, beg, end = k8proc.getFileInfo(int(last_start))
                idtext = k8proc.getIDTag(int(last_start))
                linktgt = filename
                if idtext != "":
                    linktgt += "#" + idtext
                guidetext += '<reference type="text" href="Text/%s" />\n' % linktgt
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            # extend the ncx data with
            # info about filenames and proper internal idtags
            for i in range(len(ncx_data)):
                ncxmap = ncx_data[i]
                [junk1, junk2, junk3, fid, junk4, off] = ncxmap["pos_fid"].split(":")
                filename, idtag = k8proc.getIDTagByPosFid(fid, off)
                ncxmap["filename"] = filename
                ncxmap["idtag"] = idtag
                ncx_data[i] = ncxmap
            # write out the toc.ncx
            ncx.writeK8NCX(ncx_data, metadata)
            # convert the rawML to a set of xhtml files
            htmlproc = XHTMLK8Processor(imgnames, k8proc)
            usedmap = htmlproc.buildXHTML()
            # write out the files
            filenames = []
            n = k8proc.getNumberOfParts()
            for i in range(n):
                part = k8proc.getPart(i)
                [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i)
                filenames.append([dir, filename])
                fname = os.path.join(files.k8oebps, dir, filename)
                file(fname, "wb").write(part)
            # flow 0 is the main text, already handled above, so start at 1
            n = k8proc.getNumberOfFlows()
            for i in range(1, n):
                [type, format, dir, filename] = k8proc.getFlowInfo(i)
                flowpart = k8proc.getFlow(i)
                if format == "file":
                    filenames.append([dir, filename])
                    fname = os.path.join(files.k8oebps, dir, filename)
                    file(fname, "wb").write(flowpart)
            opf = OPFProcessor(files, metadata, filenames, imgnames,
                               ncx.isNCX, mh, usedmap, guidetext)
            if obfuscate_data:
                uuid = opf.writeOPF(True)
            else:
                uuid = opf.writeOPF()
            # make an epub of it all
            files.makeEPUB(usedmap, obfuscate_data, uuid)
        elif not k8only:
            # An original Mobi
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            ncx = ncxExtract(mh, files)
            ncx_data = ncx.parseNCX()
            ncx.writeNCX(metadata)
            positionMap = {}
            # If Dictionary build up the positionMap
            if mh.isDictionary():
                if mh.DictInLanguage():
                    metadata["DictInLanguage"] = mh.DictInLanguage()
                if mh.DictOutLanguage():
                    metadata["DictOutLanguage"] = mh.DictOutLanguage()
                positionMap = dictSupport(mh, sect).getPositionMap()
            # convert the rawml back to Mobi ml
            proc = HTMLProcessor(files, metadata, imgnames)
            srctext = proc.findAnchors(rawML, ncx_data, positionMap)
            srctext, usedmap = proc.insertHREFS()
            filenames = []
            # write the proper mobi html
            fname = files.getInputFileBasename() + ".html"
            filenames.append(["", fname])
            outhtml = os.path.join(files.mobi7dir, fname)
            file(outhtml, "wb").write(srctext)
            # create an OPF
            # extract guidetext from srctext
            guidetext = ""
            guidematch = re.search(r"""<guide>(.*)</guide>""", srctext,
                                   re.IGNORECASE + re.DOTALL)
            if guidematch:
                # rewrite filepos references as fragment links into the html
                replacetext = r'''href="''' + filenames[0][1] + r'''#filepos\1"'''
                guidetext = re.sub(r"""filepos=['"]{0,1}0*(\d+)['"]{0,1}""",
                                   replacetext, guidematch.group(1))
                guidetext += "\n"
                guidetext = unicode(guidetext, mh.codec).encode("utf-8")
            opf = OPFProcessor(files, metadata, filenames, imgnames,
                               ncx.isNCX, mh, usedmap, guidetext)
            opf.writeOPF()
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos): ''' Create a map of tags and values from the given byte section. @param controlByteCount: The number of control bytes. @param tagTable: The tag table. @param entryData: The data to process. @param startPos: The starting position in entryData. @param endPos: The end position in entryData or None if it is unknown. @return: Hashmap of tag and list of values. ''' tags = [] tagHashMap = {} controlByteIndex = 0 dataStart = startPos + controlByteCount for tag, valuesPerEntry, mask, endFlag in tagTable: if endFlag == 0x01: controlByteIndex += 1 continue cbyte = ord(entryData[startPos + controlByteIndex]) if 0: print "Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte) value = ord(entryData[startPos + controlByteIndex]) & mask if value != 0: if value == mask: if countSetBits(mask) > 1: # If all bits of masked value are set and the mask has more than one bit, a variable width value # will follow after the control bytes which defines the length of bytes (NOT the value count!) # which will contain the corresponding variable width values. consumed, value = getVariableWidthValue(entryData, dataStart) dataStart += consumed tags.append((tag, None, value, valuesPerEntry)) else: tags.append((tag, 1, None, valuesPerEntry)) else: # Shift bits to get the masked value. while mask & 0x01 == 0: mask = mask >> 1 value = value >> 1 tags.append((tag, value, None, valuesPerEntry)) for tag, valueCount, valueBytes, valuesPerEntry in tags: values = [] if valueCount != None: # Read valueCount * valuesPerEntry variable width values. for _ in range(valueCount): for _ in range(valuesPerEntry): consumed, data = getVariableWidthValue(entryData, dataStart) dataStart += consumed values.append(data) else: # Convert valueBytes to variable width values. totalConsumed = 0 while totalConsumed < valueBytes: # Does this work for valuesPerEntry != 1? 
consumed, data = getVariableWidthValue(entryData, dataStart) dataStart += consumed totalConsumed += consumed values.append(data) if totalConsumed != valueBytes: print "Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed) tagHashMap[tag] = values # Test that all bytes have been processed if endPos is given. if endPos is not None and dataStart != endPos: # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. for char in entryData[dataStart:endPos]: if char != chr(0x00): print "Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]) if 0: print "controlByteCount: %s" % controlByteCount print "tagTable: %s" % tagTable print "data: %s" % toHex(entryData[startPos:endPos]) print "tagHashMap: %s" % tagHashMap break return tagHashMap
def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
    '''
    Apply inflection rule.

    Interprets the inflection rule byte codes between start and end:
    bytes 0x01-0x04 select an edit mode (insert/delete at word start/end),
    bytes 0x0a-0x13 move the cursor backwards, and bytes above 0x13 are
    literal characters applied using the current mode.

    @param mainEntry: The word to inflect.
    @param inflectionRuleData: The inflection rules.
    @param start: The start position of the inflection rule to use.
    @param end: The end position of the inflection rule to use.
    @return: The string with the inflected word or None if an error occurs.
    '''
    mode = -1
    byteArray = array.array("c", mainEntry)
    position = len(byteArray)
    for charOffset in range(start, end):
        char = inflectionRuleData[charOffset]
        byte = ord(char)
        if byte >= 0x0a and byte <= 0x13:
            # Move cursor backwards
            offset = byte - 0x0a
            if mode not in [0x02, 0x03]:
                mode = 0x02
                position = len(byteArray)
            position -= offset
        elif byte > 0x13:
            # Literal character: apply it with the currently selected mode.
            if mode == -1:
                print "Error: Unexpected first byte %i of inflection rule" % byte
                return None
            elif position == -1:
                print "Error: Unexpected first byte %i of inflection rule" % byte
                return None
            else:
                if mode == 0x01:
                    # Insert at word start
                    byteArray.insert(position, char)
                    position += 1
                elif mode == 0x02:
                    # Insert at word end
                    byteArray.insert(position, char)
                elif mode == 0x03:
                    # Delete at word end; the rule character must match the
                    # character being removed, otherwise the rule is invalid.
                    position -= 1
                    deleted = byteArray.pop(position)
                    if deleted != char:
                        if DEBUG_DICT:
                            print "0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, deleted)
                        print "Error: Delete operation of inflection rule failed"
                        return None
                elif mode == 0x04:
                    # Delete at word start
                    deleted = byteArray.pop(position)
                    if deleted != char:
                        if DEBUG_DICT:
                            # NOTE(review): label "0x03" below looks like a
                            # copy/paste from the 0x03 branch — confirm.
                            print "0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, deleted)
                        print "Error: Delete operation of inflection rule failed"
                        return None
                else:
                    print "Error: Inflection rule mode %x is not implemented" % mode
                    return None
        elif byte == 0x01:
            # Insert at word start
            if mode not in [0x01, 0x04]:
                position = 0
            mode = byte
        elif byte == 0x02:
            # Insert at word end
            if mode not in [0x02, 0x03]:
                position = len(byteArray)
            mode = byte
        elif byte == 0x03:
            # Delete at word end
            if mode not in [0x02, 0x03]:
                position = len(byteArray)
            mode = byte
        elif byte == 0x04:
            # Delete at word start
            if mode not in [0x01, 0x04]:
                position = 0
                # Delete at word start
            mode = byte
        else:
            print "Error: Inflection rule mode %x is not implemented" % byte
            return None
    return byteArray.tostring()
def getTagMap(self, controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    tags = []
    tagHashMap = {}
    controlByteIndex = 0
    dataStart = startPos + controlByteCount
    # First pass: decode the control bytes into (tag, valueCount, valueBytes,
    # valuesPerEntry) tuples describing where the values live.
    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # End-of-control-byte marker: advance to the next control byte.
            controlByteIndex += 1
            continue
        value = ord(entryData[startPos + controlByteIndex]) & mask
        if value != 0:
            if value == mask:
                if self.countSetBits(mask) > 1:
                    # If all bits of masked value are set and the mask has more than one bit, a variable width value
                    # will follow after the control bytes which defines the length of bytes (NOT the value count!)
                    # which will contain the corresponding variable width values.
                    consumed, value = getVariableWidthValue(
                        entryData, dataStart)
                    dataStart += consumed
                    tags.append((tag, None, value, valuesPerEntry))
                else:
                    tags.append((tag, 1, None, valuesPerEntry))
            else:
                # Shift bits to get the masked value.
                while mask & 0x01 == 0:
                    mask = mask >> 1
                    value = value >> 1
                tags.append((tag, value, None, valuesPerEntry))
    # Second pass: read the variable width values for each decoded tag.
    for tag, valueCount, valueBytes, valuesPerEntry in tags:
        values = []
        if valueCount != None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    consumed, data = getVariableWidthValue(
                        entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(
                    entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                print "Error: Should consume %s bytes, but consumed %s" % (
                    valueBytes, totalConsumed)
        tagHashMap[tag] = values
    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        for char in entryData[dataStart:endPos]:
            if char != chr(0x00):
                print "Warning: There are unprocessed index bytes left: %s" % toHex(
                    entryData[dataStart:endPos])
                if DEBUG_DICT:
                    print "controlByteCount: %s" % controlByteCount
                    print "tagTable: %s" % tagTable
                    print "data: %s" % toHex(entryData[startPos:endPos])
                    print "tagHashMap: %s" % tagHashMap
                break
    return tagHashMap