예제 #1
0
def generate_match_table3(ranges):
    """Encode codepoint ranges into the current bit-packed match table.

    This is the current encoding format used in duk_lexer.c.  It is yet
    another attempt, similar to generate_match_table2 except in packing
    format.  Total match size at time of writing: 1194 bytes.

    For every ``(start, end)`` pair two values are emitted: the gap from
    the previous range end to ``start`` (the first gap is measured from
    0) and the span ``end - start``.  A single zero value terminates the
    table.  Each value uses the shortest of these encodings:

        4 bits     value 0...14       the value itself
        12 bits    value 15...268     0xf, then 8 bits of (value - 15)
        24 bits    value 269...4364   0xf, 0xfe, then 12 bits of (value - 269)
        36 bits    value 4365...      0xf, 0xff, then 24 bits of (value - 4365)

    Args:
        ranges: iterable of ``(start, end)`` integer pairs.  Every gap
            must be at least 1 and every span at least 0.

    Returns:
        A ``(data, freq)`` tuple: ``data`` is the result of the bit
        encoder's ``getBytes()`` and ``freq`` is an informative list
        where ``freq[v]`` counts how many times value ``v`` was encoded
        (the end marker included).

    Raises:
        Exception: if a gap is below 1 (a zero gap would be read back as
            the end marker), a span is negative, or a negative value is
            passed to the encoder.
        IndexError: if a value exceeds 0x10ffff (size of ``freq``).
    """
    be = dukutil.BitEncoder()

    freq = [0] * (0x10ffff + 1)  # informative

    def enc(x):
        # A negative index would silently count from the end of freq.
        if x < 0:
            raise Exception('cannot encode negative value: %d' % x)
        freq[x] += 1  # also bounds x to 0x10ffff (IndexError above that)

        if x <= 0x0e:
            # 4-bit encoding
            be.bits(x, 4)
            return
        x -= 0x0e + 1
        if x <= 0xfd:
            # 12-bit encoding
            be.bits(0x0f, 4)
            be.bits(x, 8)
            return
        x -= 0xfd + 1
        if x <= 0xfff:
            # 24-bit encoding
            be.bits(0x0f, 4)
            be.bits(0xfe, 8)
            be.bits(x, 12)
            return
        x -= 0xfff + 1
        # 36-bit encoding; the freq bound above guarantees that the
        # remainder fits into 24 bits.
        be.bits(0x0f, 4)
        be.bits(0xff, 8)
        be.bits(x, 24)

    prev_end = 0

    for range_start, range_end in ranges:
        gap = range_start - prev_end    # 1 or above (no unjoined ranges)
        span = range_end - range_start  # 0 or above
        if gap < 1:
            # A zero gap is indistinguishable from the end marker below.
            raise Exception('cannot encode range %r: gap from previous '
                            'range end is %d, must be at least 1' %
                            ((range_start, range_end), gap))
        if span < 0:
            raise Exception('cannot encode range %r: end is before start' %
                            ((range_start, range_end),))
        enc(gap)
        enc(span)
        prev_end = range_end

    enc(0)  # end marker

    return be.getBytes(), freq
예제 #2
0
def gen_strings_data_bitpacked(strlist):
    """Pack the strings of ``strlist`` into a stream of 5-bit symbols.

    Each string is written as a 5-bit length followed by its characters.
    A string begins in lowercase mode, where the 5-bit symbols mean:

        0-25    'a' ... 'z'
        26      '_'
        27      0x00 (actually decoded to 0xff, internal marker)
        28      reserved
        29      switch to uppercase for one character
                (next 5-bit symbol must be in range 0-25)
        30      switch to uppercase
        31      read a 7-bit character verbatim

    Uppercase mode is the same except that symbols 0-25 stand for
    'A' ... 'Z' and codes 29 and 30 switch to lowercase.

    No end marker is written; the C code knows the length from a define.

    Args:
        strlist: sized sequence of 2-item entries; only the first item
            (the string to encode) is used here.

    Returns:
        A ``(res, maxlen)`` tuple: the encoder's ``getByteString()``
        result and the length of the longest input string.

    A one-line summary of the encoding statistics is printed to stdout.
    """
    UNDERSCORE = 26
    ZERO = 27
    SWITCH1 = 29
    SWITCH = 30
    SEVENBIT = 31

    lower_first, lower_last = ord('a'), ord('z')
    upper_first, upper_last = ord('A'), ord('Z')

    be = dukutil.BitEncoder()

    longest = 0
    n_optimal = n_switch1 = n_switch = n_sevenbit = 0

    for s, _unused in strlist:
        length = len(s)
        be.bits(length, 5)
        longest = max(longest, length)

        # Every string starts out in lowercase mode.
        mode = 'lowercase'

        for idx, c in enumerate(s):
            # This encoder is not that optimal, but good enough for now.
            code = ord(c)
            next_code = ord(s[idx + 1]) if idx + 1 < length else None

            if c == '_':
                be.bits(UNDERSCORE, 5)
                n_optimal += 1
                continue
            if c == '\x00':
                be.bits(ZERO, 5)
                n_optimal += 1
                continue

            if lower_first <= code <= lower_last:
                char_mode = 'lowercase'
                symbol = code - lower_first
                case_first, case_last = lower_first, lower_last
            elif upper_first <= code <= upper_last:
                char_mode = 'uppercase'
                symbol = code - upper_first
                case_first, case_last = upper_first, upper_last
            else:
                # Not expressible as a 5-bit symbol: emit verbatim.
                assert 0 <= code <= 127
                be.bits(SEVENBIT, 5)
                be.bits(code, 7)
                n_sevenbit += 1
                continue

            if char_mode == mode:
                # Letter already matches the active mode.
                be.bits(symbol, 5)
                n_optimal += 1
            elif next_code is not None and case_first <= next_code <= case_last:
                # The following letter has the same case, so a permanent
                # mode switch is used.
                be.bits(SWITCH, 5)
                be.bits(symbol, 5)
                mode = char_mode
                n_switch += 1
            else:
                # Isolated letter of the other case: one-character switch.
                be.bits(SWITCH1, 5)
                be.bits(symbol, 5)
                n_switch1 += 1

    res = be.getByteString()

    summary = ('%d strings, %d bytes of string init data, %d maximum string length, '
               'encoding: optimal=%d,switch1=%d,switch=%d,sevenbit=%d') % (
        len(strlist), len(res), longest,
        n_optimal, n_switch1, n_switch, n_sevenbit)
    print(summary)

    return res, longest
예제 #3
0
def generate_tables(convmap):
    """Generate a bit-packed case conversion table for a conversion map.

    The bitstream encoding is based on manual inspection for whatever
    regularity the Unicode case conversion rules have.

    Start with a full description of case conversions which does not
    cover all codepoints; unmapped codepoints convert to themselves.
    Scan for range-to-range mappings with a range of skips starting from
    1.  Whenever a valid range is found, it is removed from the map.
    Finally, the remaining case conversions (1:1 and 1:n) are output on a
    per codepoint basis.

    This is very slow because we always scan from scratch, but it's the
    most reliable and simple way to scan.

    Bitstream layout, in order:

        for each skip 1...6:
            6 bits   number of ranges with this skip
            per range: 16 bits start_i, 16 bits start_o, 7 bits count
        6 bits   0x3f, end of skips
        7 bits   number of 1:1 mappings
            per mapping: 16 bits input codepoint, 16 bits output codepoint
        7 bits   number of 1:n mappings
            per mapping: 16 bits input codepoint, 2 bits output length,
            16 bits per output character

    Args:
        convmap: mapping from input codepoint to its converted string.
            NOTE: the map is consumed; the entries handled here are
            deleted, and find_first_range_with_skip() is presumably
            expected to remove the ranges it reports (the scan loop
            would not terminate otherwise) -- confirm against that
            helper.

    Returns:
        A ``(data, nbits)`` tuple taken from the bit encoder's
        ``getBytes()`` and ``getNumBits()``.

    Raises:
        Exception: if some skip value has 0x3f or more ranges, because
            that count cannot be told apart from the end-of-skips marker.

    Progress and statistics are printed to stdout.
    """
    # range mappings (2 or more consecutive mappings with a certain skip),
    # each stored as [start_i, start_o, count, skip]
    ranges = []
    singles = []  # 1:1 character mappings: [codepoint, codepoint]
    multis = []  # 1:n character mappings: [codepoint, string]

    # Ranges with skips

    for skip in range(1, 6 + 1):  # skips 1...6 are useful
        while True:
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
            if start_i is None:
                break
            print('skip %d: %d %d %d' % (skip, start_i, start_o, count))
            ranges.append([start_i, start_o, count, skip])

    # 1:1 conversions

    for cp in sorted(convmap.keys()):
        if len(convmap[cp]) > 1:
            continue
        singles.append([cp, ord(convmap[cp])])  # codepoint, codepoint
        del convmap[cp]

    # There are many mappings to 2-char sequences with latter char being
    # U+0399, e.g.:
    #
    # [8064L, u'\u1f08\u0399']
    # [8065L, u'\u1f09\u0399']
    # [8066L, u'\u1f0a\u0399']
    # ...
    #
    # These could be handled as a special case (collect the first output
    # characters into a temporary map and run the skip=1 range scan over
    # it), but we don't do that right now.
    # XXX: such a scheme would need to put the 12 remaining mappings back
    # to convmap.

    # 1:n conversions

    for cp in sorted(convmap.keys()):
        multis.append([cp, convmap[cp]])  # codepoint, string
        del convmap[cp]

    for t in singles:
        print(repr(t))

    for t in multis:
        print(repr(t))

    print('range mappings: %d' % len(ranges))
    print('single character mappings: %d' % len(singles))
    print('complex mappings (1:n): %d' % len(multis))
    print('remaining (should be zero): %d' % len(convmap.keys()))

    # XXX: opportunities for diff encoding skip=3 ranges?
    prev = None
    for t in ranges:
        # range: [start_i, start_o, count, skip]
        if t[3] != 3:
            continue
        if prev is not None:
            print('%d %d' % (t[0] - prev[0], t[1] - prev[1]))
        else:
            print('start: %d %d' % (t[0], t[1]))
        prev = t

    # bit packed encoding

    be = dukutil.BitEncoder()

    for curr_skip in range(1, 7):  # 1...6
        group = [r for r in ranges if r[3] == curr_skip]
        count = len(group)
        if count >= 0x3f:
            # 0x3f is written below as the end-of-skips marker, so a
            # group this large would make the stream ambiguous.
            raise Exception('too many ranges for skip %d: %d '
                            '(count must be below 0x3f)' % (curr_skip, count))
        be.bits(count, 6)
        print('encode: skip=%d, count=%d' % (curr_skip, count))

        for start_i, start_o, r_count, _skip in group:
            be.bits(start_i, 16)
            be.bits(start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)  # maximum count value = end of skips

    be.bits(len(singles), 7)
    for cp_i, cp_o in singles:
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    be.bits(len(multis), 7)
    for cp_i, str_o in multis:
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for ch in str_o:
            be.bits(ord(ch), 16)

    return be.getBytes(), be.getNumBits()