Пример #1
0
    def getPositionMap(self):
        header = self.header
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print "Info: Document contains orthographic index, handle as dictionary"
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                print "\nParsing metaInflIndexData"
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr['count']
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 +
                                                metaIndexCount)
                tagSectionStart = midxhdr['len']
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print "inflectionTagTable: %s" % inflectionTagTable
                if self.hasTag(inflectionTagTable, 0x07):
                    print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            print "\nParsing metaOrthIndex"
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr['count']
            print "orthIndexCount is", orthIndexCount
            if DEBUG_DICT:
                print "orthTagTable: %s" % tagTable
            if hordt2 is not None:
                print "orth entry uses ordt2 lookup table of type ", idxhdr[
                    'otype']
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print "Info: Index doesn't contain entry length tags"

            print "Read dictionary index data"
            for i in range(metaOrthIndex + 1,
                           metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from('>H', data,
                                              idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j + 1]
                    textLength = ord(data[startPos])
                    text = data[startPos + 1:startPos + 1 + textLength]
                    if hordt2 is not None:
                        utext = u""
                        if idxhdr['otype'] == 0:
                            pattern = '>H'
                            inc = 2
                        else:
                            pattern = '>B'
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            off, = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode('utf-8')

                    tagMap = getTagMap(controlByteCount, tagTable, data,
                                       startPos + 1 + textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(
                                text, inflectionControlByteCount,
                                inflectionTagTable, dinfl, inflNameData,
                                tagMap[0x2a])
                        else:
                            inflectionGroups = ""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (
                                text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[
                                    entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[
                                0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[
                                    entryEndPosition] = "</idx:entry>" + positionMap[
                                        entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = "</idx:entry>"

                        else:
                            indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (
                                text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[
                                    entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap
Пример #2
0
    def getPositionMap(self):
        header = self.header
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print "Info: Document contains orthographic index, handle as dictionary"
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)
                metaIndexCount, = struct.unpack_from('>L', metaInflIndexData,
                                                     0x18)
                if metaIndexCount != 1:
                    print "Error: Dictionary contains multiple inflection index sections, which is not yet supported"
                    decodeInflection = False
                inflIndexData = sect.loadSection(metaInflIndex + 1)
                inflNameData = sect.loadSection(metaInflIndex + 1 +
                                                metaIndexCount)
                tagSectionStart, = struct.unpack_from('>L', metaInflIndexData,
                                                      0x04)
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print "inflectionTagTable: %s" % inflectionTagTable
                if self.hasTag(inflectionTagTable, 0x07):
                    print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)
            tagSectionStart, = struct.unpack_from('>L', data, 0x04)
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount, = struct.unpack_from('>L', data, 0x18)
            print "orthIndexCount is", orthIndexCount
            if DEBUG_DICT:
                print "orthTagTable: %s" % tagTable
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print "Info: Index doesn't contain entry length tags"

            print "Read dictionary index data"
            for i in range(metaOrthIndex + 1,
                           metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                idxtPos, = struct.unpack_from('>L', data, 0x14)
                entryCount, = struct.unpack_from('>L', data, 0x18)
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from('>H', data,
                                              idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)

                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j + 1]
                    textLength = ord(data[startPos])
                    text = data[startPos + 1:startPos + 1 + textLength]
                    tagMap = getTagMap(controlByteCount, tagTable, data,
                                       startPos + 1 + textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(
                                text, inflectionControlByteCount,
                                inflectionTagTable, inflIndexData,
                                inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = ""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (
                                text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[
                                    entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[
                                0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[
                                    entryEndPosition] = "</idx:entry>" + positionMap[
                                        entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = "</idx:entry>"

                        else:
                            indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (
                                text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[
                                    entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap
Пример #3
0
    def getPositionMap (self):
        header = self.header
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print "Info: Document contains orthographic index, handle as dictionary"
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                print "\nParsing metaInflIndexData"
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr['count']
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)
                
                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr['len']
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print "inflectionTagTable: %s" % inflectionTagTable
                if self.hasTag(inflectionTagTable, 0x07):
                    print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            print "\nParsing metaOrthIndex"
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr['count']
            print "orthIndexCount is", orthIndexCount
            if DEBUG_DICT:
                print "orthTagTable: %s" % tagTable
            if hordt2 is not None:
                print "orth entry uses ordt2 lookup table of type ", idxhdr['otype']
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print "Info: Index doesn't contain entry length tags"

            print "Read dictionary index data"
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    textLength = ord(data[startPos])
                    text = data[startPos+1:startPos+1+textLength]
                    if hordt2 is not None:
                        utext = u""
                        if idxhdr['otype'] == 0:
                            pattern = '>H'
                            inc = 2
                        else:
                            pattern = '>B'
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            off, = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode('utf-8')

                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, 
                                                                        dinfl, inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = ""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[entryEndPosition] = "</idx:entry>" + positionMap[entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = "</idx:entry>"

                        else:
                            indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap
Пример #4
0
    def getPositionMap (self):
        header = self.header
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print "Info: Document contains orthographic index, handle as dictionary"
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)
                metaIndexCount, = struct.unpack_from('>L', metaInflIndexData, 0x18)
                if metaIndexCount != 1:
                    print "Error: Dictionary contains multiple inflection index sections, which is not yet supported"
                    decodeInflection = False
                inflIndexData = sect.loadSection(metaInflIndex + 1)
                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart, = struct.unpack_from('>L', metaInflIndexData, 0x04)
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print "inflectionTagTable: %s" % inflectionTagTable
                if self.hasTag(inflectionTagTable, 0x07):
                    print "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)
            tagSectionStart, = struct.unpack_from('>L', data, 0x04)
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount, = struct.unpack_from('>L', data, 0x18)
            print "orthIndexCount is", orthIndexCount
            if DEBUG_DICT:
                print "orthTagTable: %s" % tagTable
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print "Info: Index doesn't contain entry length tags"

            print "Read dictionary index data"
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                idxtPos, = struct.unpack_from('>L', data, 0x14)
                entryCount, = struct.unpack_from('>L', data, 0x18)
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from('>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)

                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    textLength = ord(data[startPos])
                    text = data[startPos+1:startPos+1+textLength]
                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, inflIndexData, inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = ""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = '<idx:entry scriptable="yes"><idx:orth value="%s">%s</idx:orth>' % (text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[entryEndPosition] = "</idx:entry>" + positionMap[entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = "</idx:entry>"

                        else:
                            indexTags = '<idx:entry>\n<idx:orth value="%s">\n%s</idx:entry>\n' % (text, inflectionGroups)
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap