def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
    '''
    Create string which contains the inflection groups with inflection rules as mobipocket tags.

    @param mainEntry: The word to inflect.
    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param dinfl: The inflection data object used to select the right inflection data section.
    @param inflectionNames: The inflection rule name data.
    @param groupList: The list of inflection groups to process.
    @return: String with inflection groups and rules or empty string if required tags are not available.
    '''
    parts = []
    for group in groupList:
        groupStart, groupEnd, groupData = dinfl.offsets(group)

        # First byte seems to be always 0x00 and must be skipped.
        assert ord(groupData[groupStart]) == 0x00
        tagMap = getTagMap(controlByteCount, tagTable, groupData, groupStart + 1, groupEnd)

        # Make sure that the required tags are available. A missing tag discards
        # everything collected so far (see @return).
        if 0x05 not in tagMap:
            print("Error: Required tag 0x05 not found in tagMap")
            return ""
        if 0x1a not in tagMap:
            print("Error: Required tag 0x1a not found in tagMap")
            return ""

        parts.append("<idx:infl>")
        ruleRefs = tagMap[0x1a]
        for i, nameOffset in enumerate(tagMap[0x05]):
            # Get name of inflection rule.
            consumed, nameLength = getVariableWidthValue(inflectionNames, nameOffset)
            nameStart = nameOffset + consumed
            inflectionName = inflectionNames[nameStart:nameStart + nameLength]

            # Get and apply inflection rule across possibly multiple inflection data sections
            rvalue, start, _count, ruleData = dinfl.lookup(ruleRefs[i])
            ruleOffset, = struct.unpack_from('>H', ruleData, start + 4 + (2 * rvalue))
            ruleLength = ord(ruleData[ruleOffset])
            inflection = self.applyInflectionRule(mainEntry, ruleData, ruleOffset + 1, ruleOffset + 1 + ruleLength)
            if inflection is not None:
                parts.append(' <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))

        parts.append("</idx:infl>")
    return "".join(parts)
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, data, inflectionNames, groupList):
    '''
    Create string which contains the inflection groups with inflection rules as mobipocket tags.

    @param mainEntry: The word to inflect.
    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param data: The inflection index data.
    @param inflectionNames: The inflection rule name data.
    @param groupList: The list of inflection groups to process.
    @return: String with inflection groups and rules or empty string if required tags are not available.
    '''
    # Header fields of the inflection index data, read at fixed offsets.
    idxtPos, = struct.unpack_from('>L', data, 0x14)
    entryCount, = struct.unpack_from('>L', data, 0x18)

    def entryOffset(index):
        # Entry offsets are big-endian 16-bit values stored 4 bytes past idxtPos.
        return struct.unpack_from('>H', data, idxtPos + 4 + (2 * index))[0]

    parts = []
    for group in groupList:
        groupStart = entryOffset(group)
        # The group ends where the next entry starts; the last entry has no successor.
        groupEnd = entryOffset(group + 1) if group + 1 < entryCount else None

        # First byte seems to be always 0x00 and must be skipped.
        assert ord(data[groupStart]) == 0x00
        tagMap = getTagMap(controlByteCount, tagTable, data, groupStart + 1, groupEnd)

        # Make sure that the required tags are available. A missing tag discards
        # everything collected so far (see @return).
        if 0x05 not in tagMap:
            print("Error: Required tag 0x05 not found in tagMap")
            return ""
        if 0x1a not in tagMap:
            print("Error: Required tag 0x1a not found in tagMap")
            return ""

        parts.append("<idx:infl>")
        ruleRefs = tagMap[0x1a]
        for i, nameOffset in enumerate(tagMap[0x05]):
            # Get name of inflection rule.
            consumed, nameLength = getVariableWidthValue(inflectionNames, nameOffset)
            nameStart = nameOffset + consumed
            inflectionName = inflectionNames[nameStart:nameStart + nameLength]

            # Get and apply inflection rule.
            ruleOffset = entryOffset(ruleRefs[i])
            ruleLength = ord(data[ruleOffset])
            inflection = self.applyInflectionRule(mainEntry, data, ruleOffset + 1, ruleOffset + 1 + ruleLength)
            if inflection is not None:
                parts.append(' <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))

        parts.append("</idx:infl>")
    return "".join(parts)
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
    '''
    Create string which contains the inflection groups with inflection rules as mobipocket tags.

    @param mainEntry: The word to inflect.
    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param dinfl: The inflection data object used to select the right inflection data section.
    @param inflectionNames: The inflection rule name data.
    @param groupList: The list of inflection groups to process.
    @return: String with inflection groups and rules or empty string if required tags are not available.
    '''
    parts = []
    for group in groupList:
        groupStart, groupEnd, groupData = dinfl.offsets(group)

        # First byte seems to be always 0x00 and must be skipped.
        assert ord(groupData[groupStart]) == 0x00
        tagMap = getTagMap(controlByteCount, tagTable, groupData, groupStart + 1, groupEnd)

        # Make sure that the required tags are available. A missing tag discards
        # everything collected so far (see @return).
        if 0x05 not in tagMap:
            print("Error: Required tag 0x05 not found in tagMap")
            return ""
        if 0x1a not in tagMap:
            print("Error: Required tag 0x1a not found in tagMap")
            return ""

        parts.append("<idx:infl>")
        ruleRefs = tagMap[0x1a]
        for i, nameOffset in enumerate(tagMap[0x05]):
            # Get name of inflection rule.
            consumed, nameLength = getVariableWidthValue(inflectionNames, nameOffset)
            nameStart = nameOffset + consumed
            inflectionName = inflectionNames[nameStart:nameStart + nameLength]

            # Get and apply inflection rule across possibly multiple inflection data sections
            rvalue, start, _count, ruleData = dinfl.lookup(ruleRefs[i])
            ruleOffset, = struct.unpack_from('>H', ruleData, start + 4 + (2 * rvalue))
            ruleLength = ord(ruleData[ruleOffset])
            inflection = self.applyInflectionRule(mainEntry, ruleData, ruleOffset + 1, ruleOffset + 1 + ruleLength)
            if inflection is not None:
                parts.append(' <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))

        parts.append("</idx:infl>")
    return "".join(parts)
def getPositionMap(self):
    '''
    Create the map of text positions to the mobipocket dictionary tags to insert there.

    Reads the orthographic index sections (and, when usable, the inflection index
    sections) through self.sect and builds idx:entry / idx:orth / idx:infl markup
    for every index entry that carries an entry start position (tag 0x01).

    @return: Dictionary mapping a position to the markup string for that position.
             Empty if the document has no orthographic index.
    '''
    sect = self.sect
    metaOrthIndex = self.metaOrthIndex
    metaInflIndex = self.metaInflIndex
    positionMap = {}

    # 0xFFFFFFFF means there is no orthographic index, so there is nothing to map.
    if metaOrthIndex == 0xFFFFFFFF:
        return positionMap
    print("Info: Document contains orthographic index, handle as dictionary")

    # Inflections can only be decoded when an inflection index exists.
    decodeInflection = metaInflIndex != 0xFFFFFFFF
    if decodeInflection:
        metaInflIndexData = sect.loadSection(metaInflIndex)

        print("\nParsing metaInflIndexData")
        midxhdr, _mhordt1, _mhordt2 = self.parseHeader(metaInflIndexData)

        # The inflection data sections follow the meta section directly; the
        # section after them holds the inflection rule names.
        metaIndexCount = midxhdr['count']
        idatas = [sect.loadSection(metaInflIndex + 1 + j) for j in range(metaIndexCount)]
        dinfl = InflectionData(idatas)
        inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)

        tagSectionStart = midxhdr['len']
        inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
        if DEBUG_DICT:
            print("inflectionTagTable: %s" % inflectionTagTable)
        if self.hasTag(inflectionTagTable, 0x07):
            print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
            decodeInflection = False

    data = sect.loadSection(metaOrthIndex)

    print("\nParsing metaOrthIndex")
    idxhdr, _hordt1, hordt2 = self.parseHeader(data)

    tagSectionStart = idxhdr['len']
    controlByteCount, tagTable = readTagSection(tagSectionStart, data)
    orthIndexCount = idxhdr['count']
    # Joined by hand so the output matches the Python 2 "print a, b" form.
    print(" ".join(("orthIndexCount is", str(orthIndexCount))))
    if DEBUG_DICT:
        print("orthTagTable: %s" % tagTable)
    if hordt2 is not None:
        print(" ".join(("orth entry uses ordt2 lookup table of type ", str(idxhdr['otype']))))
        # The width of one encoded unit is fixed for the whole index, so choose it once.
        if idxhdr['otype'] == 0:
            unitFormat, unitSize = '>H', 2
        else:
            unitFormat, unitSize = '>B', 1
    hasEntryLength = self.hasTag(tagTable, 0x02)
    if not hasEntryLength:
        print("Info: Index doesn't contain entry length tags")

    print("Read dictionary index data")
    for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
        data = sect.loadSection(i)
        hdrinfo, _ordt1, _ordt2 = self.parseHeader(data)
        idxtPos = hdrinfo['start']
        entryCount = hdrinfo['count']
        idxPositions = [struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))[0] for j in range(entryCount)]
        # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
        idxPositions.append(idxtPos)

        for j in range(entryCount):
            startPos = idxPositions[j]
            endPos = idxPositions[j + 1]
            # Each entry starts with a one-byte text length followed by the text itself.
            textLength = ord(data[startPos])
            textEnd = startPos + 1 + textLength
            text = data[startPos + 1:textEnd]
            if hordt2 is not None:
                # Translate every unit through the ORDT2 table; units beyond the
                # table are taken as code points themselves.
                chars = []
                for pos in range(0, textLength, unitSize):
                    off, = struct.unpack_from(unitFormat, text, pos)
                    chars.append(unichr(hordt2[off] if off < len(hordt2) else off))
                text = u"".join(chars).encode('utf-8')

            tagMap = getTagMap(controlByteCount, tagTable, data, textEnd, endPos)
            if 0x01 not in tagMap:
                # No entry start position: nothing to anchor the markup to.
                continue

            if decodeInflection and 0x2a in tagMap:
                inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, dinfl, inflNameData, tagMap[0x2a])
            else:
                inflectionGroups = ""
            assert len(tagMap[0x01]) == 1
            entryStartPosition = tagMap[0x01][0]
            if hasEntryLength:
                # The idx:entry attribute "scriptable" must be present to create entry length tags.
                ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + ml
                assert len(tagMap[0x02]) == 1
                entryEndPosition = entryStartPosition + tagMap[0x02][0]
                # The closing tag is prepended to whatever is already stored at the end position.
                positionMap[entryEndPosition] = "</idx:entry>" + positionMap.get(entryEndPosition, "")
            else:
                indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + indexTags
    return positionMap
def getPositionMap(self):
    '''
    Create the map of text positions to the mobipocket dictionary tags to insert there.

    Reads the orthographic index sections (and, when usable, the inflection index
    sections) through self.sect and builds idx:entry / idx:orth / idx:infl markup
    for every index entry that carries an entry start position (tag 0x01).

    @return: Dictionary mapping a position to the markup string for that position.
             Empty if the document has no orthographic index.
    '''
    sect = self.sect
    metaOrthIndex = self.metaOrthIndex
    metaInflIndex = self.metaInflIndex
    positionMap = {}

    # 0xFFFFFFFF means there is no orthographic index, so there is nothing to map.
    if metaOrthIndex == 0xFFFFFFFF:
        return positionMap
    print("Info: Document contains orthographic index, handle as dictionary")

    # Inflections can only be decoded when an inflection index exists.
    decodeInflection = metaInflIndex != 0xFFFFFFFF
    if decodeInflection:
        metaInflIndexData = sect.loadSection(metaInflIndex)

        print("\nParsing metaInflIndexData")
        midxhdr, _mhordt1, _mhordt2 = self.parseHeader(metaInflIndexData)

        # The inflection data sections follow the meta section directly; the
        # section after them holds the inflection rule names.
        metaIndexCount = midxhdr['count']
        idatas = [sect.loadSection(metaInflIndex + 1 + j) for j in range(metaIndexCount)]
        dinfl = InflectionData(idatas)
        inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)

        tagSectionStart = midxhdr['len']
        inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
        if DEBUG_DICT:
            print("inflectionTagTable: %s" % inflectionTagTable)
        if self.hasTag(inflectionTagTable, 0x07):
            print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
            decodeInflection = False

    data = sect.loadSection(metaOrthIndex)

    print("\nParsing metaOrthIndex")
    idxhdr, _hordt1, hordt2 = self.parseHeader(data)

    tagSectionStart = idxhdr['len']
    controlByteCount, tagTable = readTagSection(tagSectionStart, data)
    orthIndexCount = idxhdr['count']
    # Joined by hand so the output matches the Python 2 "print a, b" form.
    print(" ".join(("orthIndexCount is", str(orthIndexCount))))
    if DEBUG_DICT:
        print("orthTagTable: %s" % tagTable)
    if hordt2 is not None:
        print(" ".join(("orth entry uses ordt2 lookup table of type ", str(idxhdr['otype']))))
        # The width of one encoded unit is fixed for the whole index, so choose it once.
        if idxhdr['otype'] == 0:
            unitFormat, unitSize = '>H', 2
        else:
            unitFormat, unitSize = '>B', 1
    hasEntryLength = self.hasTag(tagTable, 0x02)
    if not hasEntryLength:
        print("Info: Index doesn't contain entry length tags")

    print("Read dictionary index data")
    for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
        data = sect.loadSection(i)
        hdrinfo, _ordt1, _ordt2 = self.parseHeader(data)
        idxtPos = hdrinfo['start']
        entryCount = hdrinfo['count']
        idxPositions = [struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))[0] for j in range(entryCount)]
        # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
        idxPositions.append(idxtPos)

        for j in range(entryCount):
            startPos = idxPositions[j]
            endPos = idxPositions[j + 1]
            # Each entry starts with a one-byte text length followed by the text itself.
            textLength = ord(data[startPos])
            textEnd = startPos + 1 + textLength
            text = data[startPos + 1:textEnd]
            if hordt2 is not None:
                # Translate every unit through the ORDT2 table; units beyond the
                # table are taken as code points themselves.
                chars = []
                for pos in range(0, textLength, unitSize):
                    off, = struct.unpack_from(unitFormat, text, pos)
                    chars.append(unichr(hordt2[off] if off < len(hordt2) else off))
                text = u"".join(chars).encode('utf-8')

            tagMap = getTagMap(controlByteCount, tagTable, data, textEnd, endPos)
            if 0x01 not in tagMap:
                # No entry start position: nothing to anchor the markup to.
                continue

            if decodeInflection and 0x2a in tagMap:
                inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, dinfl, inflNameData, tagMap[0x2a])
            else:
                inflectionGroups = ""
            assert len(tagMap[0x01]) == 1
            entryStartPosition = tagMap[0x01][0]
            if hasEntryLength:
                # The idx:entry attribute "scriptable" must be present to create entry length tags.
                ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + ml
                assert len(tagMap[0x02]) == 1
                entryEndPosition = entryStartPosition + tagMap[0x02][0]
                # The closing tag is prepended to whatever is already stored at the end position.
                positionMap[entryEndPosition] = "</idx:entry>" + positionMap.get(entryEndPosition, "")
            else:
                indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + indexTags
    return positionMap
def getPositionMap(self):
    '''
    Create the map of text positions to the mobipocket dictionary tags to insert there.

    Reads the orthographic index sections (and, when usable, the single inflection
    index section) through self.sect and builds idx:entry / idx:orth / idx:infl
    markup for every index entry that carries an entry start position (tag 0x01).

    @return: Dictionary mapping a position to the markup string for that position.
             Empty if the document has no orthographic index.
    '''
    sect = self.sect
    metaOrthIndex = self.metaOrthIndex
    metaInflIndex = self.metaInflIndex
    positionMap = {}

    # 0xFFFFFFFF means there is no orthographic index, so there is nothing to map.
    if metaOrthIndex == 0xFFFFFFFF:
        return positionMap
    print("Info: Document contains orthographic index, handle as dictionary")

    # Inflections can only be decoded when an inflection index exists.
    decodeInflection = metaInflIndex != 0xFFFFFFFF
    if decodeInflection:
        metaInflIndexData = sect.loadSection(metaInflIndex)
        metaIndexCount, = struct.unpack_from('>L', metaInflIndexData, 0x18)
        if metaIndexCount != 1:
            print("Error: Dictionary contains multiple inflection index sections, which is not yet supported")
            decodeInflection = False
        # Only the first inflection index section is used; the rule names live
        # in the section following all inflection index sections.
        inflIndexData = sect.loadSection(metaInflIndex + 1)
        inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
        tagSectionStart, = struct.unpack_from('>L', metaInflIndexData, 0x04)
        inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
        if DEBUG_DICT:
            print("inflectionTagTable: %s" % inflectionTagTable)
        if self.hasTag(inflectionTagTable, 0x07):
            print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
            decodeInflection = False

    data = sect.loadSection(metaOrthIndex)
    tagSectionStart, = struct.unpack_from('>L', data, 0x04)
    controlByteCount, tagTable = readTagSection(tagSectionStart, data)
    orthIndexCount, = struct.unpack_from('>L', data, 0x18)
    # Joined by hand so the output matches the Python 2 "print a, b" form.
    print(" ".join(("orthIndexCount is", str(orthIndexCount))))
    if DEBUG_DICT:
        print("orthTagTable: %s" % tagTable)
    hasEntryLength = self.hasTag(tagTable, 0x02)
    if not hasEntryLength:
        print("Info: Index doesn't contain entry length tags")

    print("Read dictionary index data")
    for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
        data = sect.loadSection(i)
        idxtPos, = struct.unpack_from('>L', data, 0x14)
        entryCount, = struct.unpack_from('>L', data, 0x18)
        idxPositions = [struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))[0] for j in range(entryCount)]
        # The last entry ends before the IDXT tag (but there might be zero fill bytes
        # we need to ignore!)
        idxPositions.append(idxtPos)

        for j in range(entryCount):
            startPos = idxPositions[j]
            endPos = idxPositions[j + 1]
            # Each entry starts with a one-byte text length followed by the text itself.
            textLength = ord(data[startPos])
            textEnd = startPos + 1 + textLength
            text = data[startPos + 1:textEnd]

            tagMap = getTagMap(controlByteCount, tagTable, data, textEnd, endPos)
            if 0x01 not in tagMap:
                # No entry start position: nothing to anchor the markup to.
                continue

            if decodeInflection and 0x2a in tagMap:
                inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, inflIndexData, inflNameData, tagMap[0x2a])
            else:
                inflectionGroups = ""
            assert len(tagMap[0x01]) == 1
            entryStartPosition = tagMap[0x01][0]
            if hasEntryLength:
                # The idx:entry attribute "scriptable" must be present to create entry length tags.
                ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + ml
                assert len(tagMap[0x02]) == 1
                entryEndPosition = entryStartPosition + tagMap[0x02][0]
                # The closing tag is prepended to whatever is already stored at the end position.
                positionMap[entryEndPosition] = "</idx:entry>" + positionMap.get(entryEndPosition, "")
            else:
                indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + indexTags
    return positionMap
def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, data, inflectionNames, groupList):
    '''
    Create string which contains the inflection groups with inflection rules as mobipocket tags.

    @param mainEntry: The word to inflect.
    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param data: The inflection index data.
    @param inflectionNames: The inflection rule name data.
    @param groupList: The list of inflection groups to process.
    @return: String with inflection groups and rules or empty string if required tags are not available.
    '''
    # Header fields of the inflection index data, read at fixed offsets.
    idxtPos, = struct.unpack_from('>L', data, 0x14)
    entryCount, = struct.unpack_from('>L', data, 0x18)

    def entryOffset(index):
        # Entry offsets are big-endian 16-bit values stored 4 bytes past idxtPos.
        return struct.unpack_from('>H', data, idxtPos + 4 + (2 * index))[0]

    parts = []
    for group in groupList:
        groupStart = entryOffset(group)
        # The group ends where the next entry starts; the last entry has no successor.
        groupEnd = entryOffset(group + 1) if group + 1 < entryCount else None

        # First byte seems to be always 0x00 and must be skipped.
        assert ord(data[groupStart]) == 0x00
        tagMap = getTagMap(controlByteCount, tagTable, data, groupStart + 1, groupEnd)

        # Make sure that the required tags are available. A missing tag discards
        # everything collected so far (see @return).
        if 0x05 not in tagMap:
            print("Error: Required tag 0x05 not found in tagMap")
            return ""
        if 0x1a not in tagMap:
            print("Error: Required tag 0x1a not found in tagMap")
            return ""

        parts.append("<idx:infl>")
        ruleRefs = tagMap[0x1a]
        for i, nameOffset in enumerate(tagMap[0x05]):
            # Get name of inflection rule.
            consumed, nameLength = getVariableWidthValue(inflectionNames, nameOffset)
            nameStart = nameOffset + consumed
            inflectionName = inflectionNames[nameStart:nameStart + nameLength]

            # Get and apply inflection rule.
            ruleOffset = entryOffset(ruleRefs[i])
            ruleLength = ord(data[ruleOffset])
            inflection = self.applyInflectionRule(mainEntry, data, ruleOffset + 1, ruleOffset + 1 + ruleLength)
            if inflection is not None:
                parts.append(' <idx:iform name="%s" value="%s"/>' % (inflectionName, inflection))

        parts.append("</idx:infl>")
    return "".join(parts)
def getPositionMap(self):
    '''
    Create the map of text positions to the mobipocket dictionary tags to insert there.

    Reads the orthographic index sections (and, when usable, the single inflection
    index section) through self.sect and builds idx:entry / idx:orth / idx:infl
    markup for every index entry that carries an entry start position (tag 0x01).

    @return: Dictionary mapping a position to the markup string for that position.
             Empty if the document has no orthographic index.
    '''
    sect = self.sect
    metaOrthIndex = self.metaOrthIndex
    metaInflIndex = self.metaInflIndex
    positionMap = {}

    # 0xFFFFFFFF means there is no orthographic index, so there is nothing to map.
    if metaOrthIndex == 0xFFFFFFFF:
        return positionMap
    print("Info: Document contains orthographic index, handle as dictionary")

    # Inflections can only be decoded when an inflection index exists.
    decodeInflection = metaInflIndex != 0xFFFFFFFF
    if decodeInflection:
        metaInflIndexData = sect.loadSection(metaInflIndex)
        metaIndexCount, = struct.unpack_from('>L', metaInflIndexData, 0x18)
        if metaIndexCount != 1:
            print("Error: Dictionary contains multiple inflection index sections, which is not yet supported")
            decodeInflection = False
        # Only the first inflection index section is used; the rule names live
        # in the section following all inflection index sections.
        inflIndexData = sect.loadSection(metaInflIndex + 1)
        inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
        tagSectionStart, = struct.unpack_from('>L', metaInflIndexData, 0x04)
        inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
        if DEBUG_DICT:
            print("inflectionTagTable: %s" % inflectionTagTable)
        if self.hasTag(inflectionTagTable, 0x07):
            print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
            decodeInflection = False

    data = sect.loadSection(metaOrthIndex)
    tagSectionStart, = struct.unpack_from('>L', data, 0x04)
    controlByteCount, tagTable = readTagSection(tagSectionStart, data)
    orthIndexCount, = struct.unpack_from('>L', data, 0x18)
    # Joined by hand so the output matches the Python 2 "print a, b" form.
    print(" ".join(("orthIndexCount is", str(orthIndexCount))))
    if DEBUG_DICT:
        print("orthTagTable: %s" % tagTable)
    hasEntryLength = self.hasTag(tagTable, 0x02)
    if not hasEntryLength:
        print("Info: Index doesn't contain entry length tags")

    print("Read dictionary index data")
    for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
        data = sect.loadSection(i)
        idxtPos, = struct.unpack_from('>L', data, 0x14)
        entryCount, = struct.unpack_from('>L', data, 0x18)
        idxPositions = [struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))[0] for j in range(entryCount)]
        # The last entry ends before the IDXT tag (but there might be zero fill bytes
        # we need to ignore!)
        idxPositions.append(idxtPos)

        for j in range(entryCount):
            startPos = idxPositions[j]
            endPos = idxPositions[j + 1]
            # Each entry starts with a one-byte text length followed by the text itself.
            textLength = ord(data[startPos])
            textEnd = startPos + 1 + textLength
            text = data[startPos + 1:textEnd]

            tagMap = getTagMap(controlByteCount, tagTable, data, textEnd, endPos)
            if 0x01 not in tagMap:
                # No entry start position: nothing to anchor the markup to.
                continue

            if decodeInflection and 0x2a in tagMap:
                inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, inflIndexData, inflNameData, tagMap[0x2a])
            else:
                inflectionGroups = ""
            assert len(tagMap[0x01]) == 1
            entryStartPosition = tagMap[0x01][0]
            if hasEntryLength:
                # The idx:entry attribute "scriptable" must be present to create entry length tags.
                ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + ml
                assert len(tagMap[0x02]) == 1
                entryEndPosition = entryStartPosition + tagMap[0x02][0]
                # The closing tag is prepended to whatever is already stored at the end position.
                positionMap[entryEndPosition] = "</idx:entry>" + positionMap.get(entryEndPosition, "")
            else:
                indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                positionMap[entryStartPosition] = positionMap.get(entryStartPosition, "") + indexTags
    return positionMap