def cparse(self, row):
    # Parsing cp932.txt, in two-column format:
    cp = unhex(row[0])
    if cp > 32000:
        return tojis0208(cp), unhex(row[1])
    raise ValueError # i.e., ignore this line
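# The parsers in this file rely on two helpers that are not shown here,
# unhex() and tojis0208().  Below is a minimal sketch of what they are
# assumed to do (hypothetical reconstruction for illustration, kept under
# separate names; the real helpers may differ): unhex() turns a hex string
# into an integer, and tojis0208() converts a packed two-byte Shift-JIS
# code point into the corresponding JIS X 0208 code.

def _unhex_sketch(text):
    # Assumed behaviour of unhex(): hex string to integer, e.g. '889F' -> 0x889f.
    return int(text, 16)

def _tojis0208_sketch(cp):
    # Assumed behaviour of tojis0208(): standard Shift-JIS -> JIS X 0208
    # arithmetic on a packed code point, e.g. 0x889f -> 0x3021.
    s1, s2 = cp >> 8, cp & 0xff
    j1 = (s1 - (0x71 if s1 <= 0x9f else 0xb1)) * 2 + 1
    if s2 > 0x7f:
        s2 -= 1
    if s2 >= 0x9e:
        j1, j2 = j1 + 1, s2 - 0x7d
    else:
        j2 = s2 - 0x1f
    return (j1 << 8) | j2

# Spot check: Shift-JIS 0x889F is JIS X 0208 0x3021.
assert _tojis0208_sketch(_unhex_sketch('889F')) == 0x3021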
def digest(self, src, fwd, recomp=re.compile):
    """Load the four-byte code offsets from the GB 18030 table."""
    one = recomp('u="(....)" b="(..) (..) (..) (..)"')
    many = recomp(
        '<range\s+uFirst="(....)"\s+uLast="(....)"\s+bFirst="(..) (..) (..) (..)"')
    for line in src:
        bits = one.search(line)
        if bits is None: # Range of codepoints
            bits = many.search(line)
            if bits is None:
                continue
            data = map(lambda i, g=bits.group: unhex(g(i)), range(1, 7))
            lo, hi, data = data[0], data[1], data[2:]
        else: # Single codepoint: treat as a range of length 1
            data = map(lambda i, g=bits.group: unhex(g(i)), range(1, 6))
            lo = hi = data[0]
            data = data[1:]

        # Calculate the "seqpoint" from the GB four-byte encoding:
        point = (((data[0] - 0x81) * 10 + data[1] - 0x30) * 126
                 + data[2] - 0x81) * 10 + data[3] - 0x30
        while lo <= hi:
            fwd[lo] = point
            lo, point = lo + 1, point + 1
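# Sanity check for the "seqpoint" linearisation above (illustration only,
# not part of the original tool): invert the arithmetic and verify it
# against the endpoints of the BMP range of four-byte GB 18030 sequences.

def _seqpoint_to_bytes(point):
    # Inverse of the seqpoint formula in digest(): map a linear offset back
    # to its four GB 18030 byte values.
    point, b4 = divmod(point, 10)
    point, b3 = divmod(point, 126)
    b1, b2 = divmod(point, 10)
    return (b1 + 0x81, b2 + 0x30, b3 + 0x81, b4 + 0x30)

# 81 30 81 30 is seqpoint 0; the formula in digest() gives 39419 for
# 84 31 A4 39, the last four-byte sequence used for the BMP.
assert _seqpoint_to_bytes(0) == (0x81, 0x30, 0x81, 0x30)
assert _seqpoint_to_bytes(39419) == (0x84, 0x31, 0xA4, 0x39)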
def jparse(self, row):
    # Parsing jis0208.txt, in three-column format:
    cp = unhex(row[0]) # or row[1] ?
    if cp > 32000: # do we want this test ? or what test in its place ?
        assert tojis0208(cp) == unhex(row[1])
        return tojis0208(cp), unhex(row[2])
    raise ValueError # i.e., ignore this line
def parse(self, row):
    if len(row) != 3:
        raise ValueError # not a data line
    byte, uni = unhex(row[2]), unhex(row[1])
    if byte == 0 or 0xe000 <= uni < 0xf900:
        raise ValueError # skip this line
    if 0x10000 <= uni < 0x20000 or 0x30000 <= uni:
        print 'Unrecognised codepoint:', row[1], '(for %s)' % row[0]
        raise ValueError # skip these, too
    return uni, byte # we're actually building a reversed table !
def digest(self, src, fwd, recomp=re.compile):
    filter = recomp(
        '<a u="([0-9A-F]{4})" b="([0-9A-F]{2}) ([0-9A-F]{2})"/>')
    for line in src:
        bits = filter.search(line)
        if bits is None:
            continue
        uni = unhex(bits.group(1)) # 0x7250
        if 0xe000 <= uni < 0xf900:
            continue
        assert not (0x9fa6 <= uni <= 0xa000), 'added something to reverse table'
        gbk = unhex(bits.group(2) + bits.group(3)) # a0 a3
        fwd[gbk] = uni
def digest(self, src, row):
    for line in src:
        try:
            num, lo, hi, name = tuple(
                map(string.strip, string.split(line, ';')))
            if num:
                num = int(num)
            else:
                num = 128
            lo, hi = unhex(lo), unhex(hi)
            # Unicode values go up to 0x10ffff, we can encode up to 0xffffff
            assert lo < 0xffffff > hi
        except (ValueError, IndexError, AssertionError):
            print 'skipping bogus line:', line
        else:
            row.append((num, lo, hi))
def get_kddi(src, fwd):
    """Parser for sources/kddi-emojis.html."""
    issjis = createChecker('Ff', '3467', hexdigits, hexdigits)
    ispua = createChecker('Ee', '45AaBb', hexdigits, hexdigits)
    for row in parseHTMLtable(src):
        # Consider only rows with 4 cells where cell 3 is a valid sjis
        if len(row) != 4 or not issjis(row[2]):
            continue
        sjis = row[2][:4]
        if ispua(row[3]):
            pua = row[3][:4]
        else:
            assert None, 'Missing Unicode PUA data for %s' % (sjis)
        cp = unhex(sjis)
        if cp > 32000:
            print "fwd[%s] = %s" % (tojis0208(cp), unhex(pua))
            fwd[tojis0208(cp)] = unhex(pua)
def get_kddi_spec_chars(src, fwd):
    """Parser for sources/kddi-spec_chars.html."""
    issjis = createChecker('58Ff', '17Cc', hexdigits, hexdigits)
    ispua = createChecker('2AaBb', hexdigits)
    for row in parseHTMLtable(src):
        # Strip leading '0x' from all cells
        row = map(lambda cell: cell.lstrip('0x'), row)
        # Consider only rows with at least 4 cells where cell 3 is a valid sjis
        if len(row) < 4 or not issjis(row[2]):
            continue
        sjis = row[2][:4]
        if ispua(row[1]):
            pua = row[1][:4]
        else:
            assert None, 'Missing Unicode PUA data for %s' % (sjis)
        cp = unhex(sjis)
        if cp > 32000:
            print "fwd[%s] = %s" % (tojis0208(cp), unhex(pua))
            fwd[tojis0208(cp)] = unhex(pua)
def get_imode(src, fwd):
    """Parser for sources/imode-emoji.html

    Derived from old sources/imode-emoji-makelist.pl, itself derived from
    sources by NTT DoCoMo."""
    issjis = createChecker('Ff', '89', hexdigits, hexdigits)
    ispua = createChecker('Ee', '67', hexdigits, hexdigits)
    for row in parseHTMLtable(src):
        # Consider only rows with 7 cells where cell 3 is a valid sjis
        if len(row) != 7 or not issjis(row[2]):
            continue
        sjis = row[2][:4]
        if ispua(row[3]):
            pua = row[3][:4]
        elif ispua(row[4]):
            pua = row[4][:4]
        else:
            assert None, 'Missing Unicode PUA data for %s' % (sjis)
        cp = unhex(sjis)
        if cp > 32000:
            print "fwd[%s] = %s" % (tojis0208(cp), unhex(pua))
            fwd[tojis0208(cp)] = unhex(pua)
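# The three emoji-table parsers above also depend on createChecker() and
# parseHTMLtable(), which are assumed to be defined elsewhere.  A minimal
# sketch of the assumed behaviour of createChecker() follows (hypothetical
# reconstruction under a separate name; the real helper may differ): it
# builds a predicate that tests whether the i-th character of a string
# belongs to the i-th allowed character set.

def _createChecker_sketch(*charsets):
    def check(text):
        # Only the leading len(charsets) characters are examined, matching
        # how the parsers above slice cells with row[i][:4].
        if len(text) < len(charsets):
            return False
        for allowed, char in zip(charsets, text):
            if char not in allowed:
                return False
        return True
    return check

# Usage mirroring get_imode() above: accept 'F8xx'/'F9xx' Shift-JIS codes.
_issjis = _createChecker_sketch('Ff', '89', hexdigits, hexdigits)
assert _issjis('F89F') and not _issjis('0041')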
def parse(self, row):
    k, v = unhex(row[0]), unhex(row[1])
    if k < 0xa1:
        raise ValueError
    if k == 0x2237:
        v = 0xFF5E # CORE-45158
    return k, v
def parse(self, row):
    return int(row[0]), unhex(row[2])
def parse(self, row):
    return unhex(row[0]), unhex(row[1])
def parse(self, row):
    big, uni = unhex(row[0]), unhex(row[1])
    if uni == NON_UNICODE or big < 0xa1:
        raise ValueError # ignore this line
    return big, uni
def parse(self, row):
    if len(row) == 2 and len(row[0]) == 4 == len(row[1]):
        byte, uni = unhex(row[0]), unhex(row[1])
        if 0 <= byte < 0x10000:
            return byte, uni
    raise ValueError # ignore this line