Exemplo n.º 1
0
    def cparse(self, row):
        """Turn one row of cp932.txt (two-column format) into a table entry.

        The first column is decoded with unhex(); when the result is above
        32000 the pair (tojis0208(code), unhex(row[1])) is returned.

        Raises ValueError for every other row, which tells the caller to
        ignore this line.
        """
        code = unhex(row[0])
        if not code > 32000:
            raise ValueError  # i.e., ignore this line

        return tojis0208(code), unhex(row[1])
Exemplo n.º 2
0
    def cparse(self, row):
        """Parse a two-column row taken from cp932.txt.

        Returns (tojis0208(first), unhex(row[1])) when the decoded first
        column is greater than 32000; otherwise raises ValueError so that
        the line is ignored.
        """
        first = unhex(row[0])
        if first > 32000:
            jis = tojis0208(first)
            uni = unhex(row[1])
            return jis, uni
        else:
            # Anything at or below the threshold is not wanted: ignore it.
            raise ValueError
Exemplo n.º 3
0
    def digest(self, src, fwd, recomp=re.compile):
        """Load the four-byte code offsets from the GB 18030 table.

        src is an iterable of text lines.  A line is recognised either as a
        single mapping (u="...." b=".. .. .. ..") or as a range
        (<range uFirst="...." uLast="...." bFirst=".. .. .. .."); lines that
        match neither pattern are skipped.

        For every codepoint covered by a recognised line, fwd[codepoint] is
        set to the "seqpoint" computed from the four-byte sequence, with
        consecutive codepoints of a range getting consecutive seqpoints.
        fwd is modified in place; nothing is returned.

        recomp is the pattern compiler to use (re.compile by default).
        """
        one = recomp('u="(....)" b="(..) (..) (..) (..)"')
        # Raw string: the pattern contains \s, which is not a valid string
        # escape and must reach the regex engine unchanged.
        many = recomp(
            r'<range\s+uFirst="(....)"\s+uLast="(....)"\s+bFirst="(..) (..) (..) (..)"'
        )

        for line in src:
            bits = one.search(line)
            if bits is None:
                # Range of codepoints
                bits = many.search(line)
                if bits is None: continue
                # A real list (not the result of map) so it can be indexed
                # and sliced regardless of what map() returns.
                data = [unhex(bits.group(i)) for i in range(1, 7)]
                lo, hi, data = data[0], data[1], data[2:]

            else:
                # Single codepoint: treat as a range of length 1
                data = [unhex(bits.group(i)) for i in range(1, 6)]
                lo = hi = data[0]
                data = data[1:]

            # Calculate the "seqpoint" from the GB four-byte encoding:
            b1, b2, b3, b4 = data
            point = (((b1 - 0x81) * 10 + b2 - 0x30) * 126 + b3 -
                     0x81) * 10 + b4 - 0x30

            # Consecutive codepoints map to consecutive seqpoints.
            for cp in range(lo, hi + 1):
                fwd[cp] = point + (cp - lo)
Exemplo n.º 4
0
    def jparse(self, row):
        """Turn one row of jis0208.txt (three-column format) into an entry.

        The first column is decoded with unhex().  When it is above 32000,
        the method asserts that tojis0208() of it equals the decoded second
        column and returns (tojis0208(code), unhex(row[2])).

        Raises ValueError for every other row, which tells the caller to
        ignore this line.
        """
        # Open question from the original author: should this be row[1] ?
        code = unhex(row[0])
        # Open question from the original author: do we want this test, or
        # what test in its place ?
        if not code > 32000:
            raise ValueError  # i.e., ignore this line

        assert tojis0208(code) == unhex(row[1])
        return tojis0208(code), unhex(row[2])
Exemplo n.º 5
0
    def jparse(self, row):
        """Parse a three-column row taken from jis0208.txt.

        For a decoded first column above 32000 this checks (by assertion)
        that tojis0208() of it agrees with the decoded second column, then
        returns (tojis0208(first), unhex(row[2])).  Any other row raises
        ValueError so that the line is ignored.
        """
        first = unhex(row[0])  # TODO: or row[1] ?
        # TODO: do we want this threshold test ?  or what test in its place ?
        if first > 32000:
            assert tojis0208(first) == unhex(row[1])
            jis = tojis0208(first)
            uni = unhex(row[2])
            return jis, uni
        else:
            # Not a row we want: ignore it.
            raise ValueError
Exemplo n.º 6
0
 def parse(self, row):
     """Turn one three-cell row into a (unicode, byte) entry.

     row[1] is decoded as the Unicode value and row[2] as the byte value.
     The result is returned as (uni, byte) because the table being built
     is a reversed one.

     Raises ValueError (meaning "skip this line") when:
       * the row does not have exactly three cells;
       * the byte value is 0 or the Unicode value is in [0xe000, 0xf900);
       * the Unicode value is in [0x10000, 0x20000) or is >= 0x30000, in
         which case a diagnostic is printed first.
     """
     if len(row) != 3: raise ValueError  # not a data line
     byte, uni = unhex(row[2]), unhex(row[1])
     if byte == 0 or 0xe000 <= uni < 0xf900:
         raise ValueError  # skip this line
     if 0x10000 <= uni < 0x20000 or 0x30000 <= uni:
         # The row has exactly three cells here, so indexing row[3] could
         # only raise IndexError; report the codepoint column instead.
         print('Unrecognised codepoint: %s (for %s)' % (row[1], row[0]))
         raise ValueError  # skip these, too
     return uni, byte  # we're actually building a reversed table !
Exemplo n.º 7
0
    def digest(self, src, fwd, recomp=re.compile):
        """Fill fwd from lines of the form <a u="XXXX" b="XX XX"/>.

        src is an iterable of text lines; lines without such an element
        are skipped.  For each match the u attribute is decoded as the
        Unicode value and the two b bytes, concatenated, as the key, and
        fwd[key] is set to the Unicode value.  Unicode values in
        [0xe000, 0xf900) are skipped.  fwd is modified in place.

        recomp is the pattern compiler to use (re.compile by default).
        """
        pattern = recomp(
            '<a u="([0-9A-F]{4})" b="([0-9A-F]{2}) ([0-9A-F]{2})"/>')
        for line in src:
            found = pattern.search(line)
            if found is None:
                continue

            u_hex, first_hex, second_hex = found.group(1, 2, 3)
            uni = unhex(u_hex)  # 0x7250
            if 0xe000 <= uni < 0xf900:
                continue
            assert uni < 0x9fa6 or uni > 0xa000, 'added something to reverse table'
            key = unhex(first_hex + second_hex)  # a0 a3
            fwd[key] = uni
Exemplo n.º 8
0
 def digest(self, src, row):
     """Append one (num, lo, hi) tuple to row per well-formed line of src.

     Each line must consist of exactly four ';'-separated fields:
     num, lo, hi and a name (the name is not used).  Surrounding
     whitespace is stripped from every field.  An empty num field
     becomes 128; otherwise it is converted with int().  lo and hi are
     decoded with unhex() and must both be below 0xffffff.

     Lines that do not satisfy this are reported on stdout and skipped.
     row is modified in place; nothing is returned.
     """
     for line in src:
         try:
             # str methods replace the deprecated string.strip/string.split
             # module functions; unpacking enforces the four-field layout.
             num, lo, hi, _name = [field.strip() for field in line.split(';')]
             if num: num = int(num)
             else: num = 128
             lo, hi = unhex(lo), unhex(hi)
             # Unicode values go up to 0x10ffff, we can encode up to 0xffffff
             # Explicit check rather than assert, so it survives python -O.
             if lo >= 0xffffff or hi >= 0xffffff:
                 raise ValueError('value too large to encode')
         except (ValueError, IndexError, AssertionError):
             print('skipping bogus line: %s' % (line,))
         else:
             row.append((num, lo, hi))
Exemplo n.º 9
0
 def digest(self, src, row):
     """Collect (num, lo, hi) tuples from ';'-separated lines into row.

     A usable line has exactly four fields -- num, lo, hi, name -- each
     stripped of surrounding whitespace.  num is converted with int(),
     or set to 128 when the field is empty; lo and hi are decoded with
     unhex() and both have to be below 0xffffff.  The name is ignored.

     Any line that fails these checks is printed as bogus and skipped.
     row is extended in place; nothing is returned.
     """
     for line in src:
         try:
             # Use str methods instead of the deprecated string-module
             # functions; the unpacking rejects lines without four fields.
             fields = [part.strip() for part in line.split(';')]
             num, lo, hi, _name = fields
             if num: num = int(num)
             else: num = 128
             lo, hi = unhex(lo), unhex(hi)
             # Unicode values go up to 0x10ffff, we can encode up to 0xffffff
             # Raised explicitly so the check is not stripped by python -O.
             if not (lo < 0xffffff and hi < 0xffffff):
                 raise ValueError('value too large to encode')
         except (ValueError, IndexError, AssertionError):
             print('skipping bogus line: %s' % (line,))
         else:
             row.append((num, lo, hi))
Exemplo n.º 10
0
        def get_kddi(src, fwd):
            """Parser for sources/kddi-emojis.html.

            Walks the rows produced by parseHTMLtable(src).  Only rows with
            exactly 4 cells whose third cell passes the Shift-JIS checker
            are considered.  For those, the first four characters of the
            third cell are the Shift-JIS code and the first four characters
            of the fourth cell the Unicode PUA code.  When the decoded
            Shift-JIS value is above 32000, the mapping is printed and
            stored as fwd[tojis0208(cp)] = unhex(pua).

            fwd is modified in place; nothing is returned.

            Raises AssertionError when a considered row has no valid PUA
            value in its fourth cell.
            """

            issjis = createChecker('Ff', '3467', hexdigits, hexdigits)
            ispua = createChecker('Ee', '45AaBb', hexdigits, hexdigits)

            for row in parseHTMLtable(src):
                # Consider only rows with 4 cells where cell 3 is a valid sjis
                if len(row) != 4 or not issjis(row[2]):
                    continue
                sjis = row[2][:4]

                # Raised explicitly: an "assert None" would disappear under
                # python -O and let a stale or undefined pua be used below.
                if not ispua(row[3]):
                    raise AssertionError(
                        'Missing Unicode PUA data for %s' % (sjis))
                pua = row[3][:4]

                cp = unhex(sjis)
                if cp > 32000:
                    jis, uni = tojis0208(cp), unhex(pua)
                    print("fwd[%s] = %s" % (jis, uni))
                    fwd[jis] = uni
Exemplo n.º 11
0
        def get_kddi(src, fwd):
            """Parser for sources/kddi-emojis.html.

            Iterates over parseHTMLtable(src) and looks only at rows that
            have exactly 4 cells and a third cell accepted by the Shift-JIS
            checker.  The Shift-JIS code is the first four characters of
            the third cell, the Unicode PUA code the first four characters
            of the fourth cell.  If the decoded Shift-JIS value exceeds
            32000, the pair is printed and recorded as
            fwd[tojis0208(cp)] = unhex(pua).

            fwd is updated in place; nothing is returned.

            Raises AssertionError if the fourth cell of such a row is not
            accepted by the PUA checker.
            """

            issjis = createChecker('Ff', '3467', hexdigits, hexdigits)
            ispua = createChecker('Ee', '45AaBb', hexdigits, hexdigits)

            for row in parseHTMLtable(src):
                # Consider only rows with 4 cells where cell 3 is a valid sjis
                if len(row) != 4 or not issjis(row[2]):
                    continue
                sjis = row[2][:4]

                if ispua(row[3]):
                    pua = row[3][:4]
                else:
                    # An explicit raise survives python -O, unlike the
                    # "assert None" idiom, so pua can never be left stale.
                    raise AssertionError(
                        'Missing Unicode PUA data for %s' % (sjis))

                cp = unhex(sjis)
                if cp > 32000:
                    key, value = tojis0208(cp), unhex(pua)
                    print("fwd[%s] = %s" % (key, value))
                    fwd[key] = value
Exemplo n.º 12
0
        def get_kddi_spec_chars(src, fwd):
            """Parser for sources/kddi-spec_chars.html.

            Walks the rows produced by parseHTMLtable(src).  Only rows with
            at least 4 cells whose third cell passes the Shift-JIS checker
            are considered.  For those, the first four characters of the
            third cell are the Shift-JIS code and the first (up to) four
            characters of the second cell the Unicode code.  When the
            decoded Shift-JIS value is above 32000, the mapping is printed
            and stored as fwd[tojis0208(cp)] = unhex(pua).

            fwd is modified in place; nothing is returned.

            Raises AssertionError when a considered row has no valid
            Unicode value in its second cell.
            """

            issjis = createChecker('58Ff', '17Cc', hexdigits, hexdigits)
            ispua = createChecker('2AaBb', hexdigits)

            for row in parseHTMLtable(src):
                # Strip leading '0x' from all cells
                # NOTE(review): str.lstrip('0x') removes every leading '0'
                # and 'x' character, not just a literal "0x" prefix, so
                # "0x00A9" becomes "A9".  Kept as-is: the two-position ispua
                # checker appears to rely on it -- confirm before changing.
                # Built as a real list so len() and indexing always work.
                row = [cell.lstrip('0x') for cell in row]
                # Consider only rows with at least 4 cells where cell 3 is sjis
                if len(row) < 4 or not issjis(row[2]):
                    continue
                sjis = row[2][:4]

                # Raised explicitly: an "assert None" would disappear under
                # python -O and let a stale or undefined pua be used below.
                if not ispua(row[1]):
                    raise AssertionError(
                        'Missing Unicode PUA data for %s' % (sjis))
                pua = row[1][:4]

                cp = unhex(sjis)
                if cp > 32000:
                    jis, uni = tojis0208(cp), unhex(pua)
                    print("fwd[%s] = %s" % (jis, uni))
                    fwd[jis] = uni
Exemplo n.º 13
0
        def get_kddi_spec_chars(src, fwd):
            """Parser for sources/kddi-spec_chars.html.

            Iterates over parseHTMLtable(src) and looks only at rows that
            have at least 4 cells and a third cell accepted by the
            Shift-JIS checker.  The Shift-JIS code is the first four
            characters of the third cell; the Unicode code is the first
            (up to) four characters of the second cell.  If the decoded
            Shift-JIS value exceeds 32000, the pair is printed and recorded
            as fwd[tojis0208(cp)] = unhex(pua).

            fwd is updated in place; nothing is returned.

            Raises AssertionError if the second cell of such a row is not
            accepted by the PUA checker.
            """

            issjis = createChecker('58Ff', '17Cc', hexdigits, hexdigits)
            ispua = createChecker('2AaBb', hexdigits)

            for cells in parseHTMLtable(src):
                # Strip leading '0x' from all cells
                # NOTE(review): lstrip('0x') strips any run of leading '0'
                # and 'x' characters (e.g. "0x00A9" -> "A9"), not only the
                # "0x" prefix.  Left unchanged because the ispua checker
                # only tests two positions and seems to expect this --
                # verify before tightening.
                # A list comprehension keeps len() and indexing valid even
                # where map() does not return a list.
                row = [cell.lstrip('0x') for cell in cells]
                # Consider only rows with at least 4 cells where cell 3 is sjis
                if len(row) < 4 or not issjis(row[2]):
                    continue
                sjis = row[2][:4]

                if ispua(row[1]):
                    pua = row[1][:4]
                else:
                    # An explicit raise survives python -O, unlike the
                    # "assert None" idiom, so pua can never be left stale.
                    raise AssertionError(
                        'Missing Unicode PUA data for %s' % (sjis))

                cp = unhex(sjis)
                if cp > 32000:
                    key, value = tojis0208(cp), unhex(pua)
                    print("fwd[%s] = %s" % (key, value))
                    fwd[key] = value
Exemplo n.º 14
0
        def get_imode(src, fwd):
            """Parser for sources/imode-emoji.html

            Derived from old sources/imode-emoji-makelist.pl,
            itself derived from sources by NTT DoCoMo.

            Walks the rows produced by parseHTMLtable(src).  Only rows with
            exactly 7 cells whose third cell passes the Shift-JIS checker
            are considered.  The Shift-JIS code is the first four characters
            of the third cell; the Unicode PUA code is taken from the fourth
            cell, or from the fifth when the fourth is not a valid PUA
            value.  When the decoded Shift-JIS value is above 32000, the
            mapping is printed and stored as
            fwd[tojis0208(cp)] = unhex(pua).

            fwd is modified in place; nothing is returned.

            Raises AssertionError when a considered row has no valid PUA
            value in either cell.
            """

            issjis = createChecker('Ff', '89', hexdigits, hexdigits)
            ispua = createChecker('Ee', '67', hexdigits, hexdigits)

            for row in parseHTMLtable(src):
                # Consider only rows with 7 cells where cell 3 is a valid sjis
                if len(row) != 7 or not issjis(row[2]):
                    continue
                sjis = row[2][:4]

                if ispua(row[3]): pua = row[3][:4]
                elif ispua(row[4]): pua = row[4][:4]
                else:
                    # Raised explicitly: an "assert None" would disappear
                    # under python -O and let a stale or undefined pua be
                    # used below.
                    raise AssertionError(
                        'Missing Unicode PUA data for %s' % (sjis))

                cp = unhex(sjis)
                if cp > 32000:
                    jis, uni = tojis0208(cp), unhex(pua)
                    print("fwd[%s] = %s" % (jis, uni))
                    fwd[jis] = uni
Exemplo n.º 15
0
        def get_imode(src, fwd):
            """Parser for sources/imode-emoji.html

            Derived from old sources/imode-emoji-makelist.pl,
            itself derived from sources by NTT DoCoMo.

            Iterates over parseHTMLtable(src) and looks only at rows that
            have exactly 7 cells and a third cell accepted by the Shift-JIS
            checker.  The Shift-JIS code is the first four characters of
            the third cell.  The Unicode PUA code comes from the fourth
            cell if the PUA checker accepts it, otherwise from the fifth.
            If the decoded Shift-JIS value exceeds 32000, the pair is
            printed and recorded as fwd[tojis0208(cp)] = unhex(pua).

            fwd is updated in place; nothing is returned.

            Raises AssertionError if neither the fourth nor the fifth cell
            of such a row is accepted by the PUA checker.
            """

            issjis = createChecker('Ff', '89', hexdigits, hexdigits)
            ispua = createChecker('Ee', '67', hexdigits, hexdigits)

            for row in parseHTMLtable(src):
                # Consider only rows with 7 cells where cell 3 is a valid sjis
                if len(row) != 7 or not issjis(row[2]):
                    continue
                sjis = row[2][:4]

                if ispua(row[3]):
                    pua = row[3][:4]
                elif ispua(row[4]):
                    pua = row[4][:4]
                else:
                    # An explicit raise survives python -O, unlike the
                    # "assert None" idiom, so pua can never be left stale.
                    raise AssertionError(
                        'Missing Unicode PUA data for %s' % (sjis))

                cp = unhex(sjis)
                if cp > 32000:
                    key, value = tojis0208(cp), unhex(pua)
                    print("fwd[%s] = %s" % (key, value))
                    fwd[key] = value
Exemplo n.º 16
0
 def parse(self, row):
     """Decode the first two cells of row into a (key, value) pair.

     Both cells are decoded with unhex().  Keys below 0xa1 are rejected
     with ValueError.  For key 0x2237 the value is replaced by 0xFF5E
     (see CORE-45158).
     """
     key = unhex(row[0])
     value = unhex(row[1])
     if key < 0xa1:
         raise ValueError
     if key == 0x2237:
         value = 0xFF5E  # CORE-45158
     return key, value
Exemplo n.º 17
0
 def parse(self, row):
     """Return (int(row[0]), unhex(row[2])) for one table row.

     The first cell is read as a decimal integer, the third is decoded
     with unhex(); the second cell is not used.
     """
     index = int(row[0])
     value = unhex(row[2])
     return index, value
Exemplo n.º 18
0
 def parse(self, row):
     """Return the first two cells of row, each decoded with unhex()."""
     key = unhex(row[0])
     value = unhex(row[1])
     return key, value
Exemplo n.º 19
0
 def parse(self, row):
     """Decode row[0] and row[1] with unhex() and return them as a pair."""
     first = unhex(row[0])
     second = unhex(row[1])
     return (first, second)
Exemplo n.º 20
0
 def parse(self, row):
     """Build a (number, value) pair from the first and third cells.

     row[0] is converted with int() and row[2] is decoded with unhex();
     row[1] is ignored.
     """
     number = int(row[0])
     decoded = unhex(row[2])
     return (number, decoded)
Exemplo n.º 21
0
 def parse(self, row):
     """Decode the first two cells of row into a (big, uni) pair.

     Both cells are decoded with unhex().  Raises ValueError, meaning
     the line is to be ignored, when the second value equals NON_UNICODE
     or the first value is below 0xa1.
     """
     big = unhex(row[0])
     uni = unhex(row[1])
     if uni == NON_UNICODE or big < 0xa1:
         raise ValueError  # ignore this line
     return big, uni
Exemplo n.º 22
0
    def parse(self, row):
        """Decode a row of two four-character cells into (byte, uni).

        Both cells are decoded with unhex().  Raises ValueError, meaning
        the line is to be ignored, when the row does not consist of
        exactly two cells of length 4 or when the first decoded value is
        outside [0, 0x10000).
        """
        # Shape check first: exactly two cells, four characters each.
        if len(row) != 2 or len(row[0]) != 4 or len(row[1]) != 4:
            raise ValueError  # ignore this line

        byte, uni = unhex(row[0]), unhex(row[1])
        if not 0 <= byte < 0x10000:
            raise ValueError  # ignore this line

        return byte, uni
Exemplo n.º 23
0
 def parse(self, row):
     """Return (big, uni) decoded from the first two cells of row.

     Each cell is decoded with unhex().  A row whose second value is
     NON_UNICODE, or whose first value is below 0xa1, raises ValueError
     so that the line is ignored.
     """
     big, uni = unhex(row[0]), unhex(row[1])
     if uni != NON_UNICODE and not big < 0xa1:
         return big, uni
     raise ValueError  # ignore this line
Exemplo n.º 24
0
 def parse(self, row):
     """Return (code, value) decoded from the first two cells of row.

     Each cell is decoded with unhex().  Codes below 0xa1 raise
     ValueError.  Code 0x2237 is always paired with 0xFF5E instead of
     the decoded value (see CORE-45158).
     """
     code, value = unhex(row[0]), unhex(row[1])
     if code < 0xa1:
         raise ValueError
     if code != 0x2237:
         return code, value
     return code, 0xFF5E  # CORE-45158