def main():
    """Generate case conversion tables and write them as C source and header.

    Command line options:
      --unicode-data     passed to UnicodeData()
      --special-casing   passed to SpecialCasing()
      --out-source       path of the C source file to write
      --out-header       path of the C header file to write
      --table-name-lc    C identifier for the lowercase table
                         (default 'caseconv_lc')
      --table-name-uc    C identifier for the uppercase table
                         (default 'caseconv_uc')
    """
    parser = optparse.OptionParser()
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    opts, _ = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

    # ascii is handled with 'fast path' so not needed here
    t = clonedict(uc)
    remove_ascii_part(t)
    uc_bytes, _ = generate_tables(t)  # second return value (nbits) is unused here

    t = clonedict(lc)
    remove_ascii_part(t)
    lc_bytes, _ = generate_tables(t)

    # Generate C source and header files
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_caseconv.py')
    genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
    genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
    # 'with' closes the file even if getString() or write() raises.
    with open(opts.out_source, 'wb') as f:
        f.write(genc.getString())

    genc = dukutil.GenerateC()
    genc.emitHeader('extract_caseconv.py')
    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
    with open(opts.out_header, 'wb') as f:
        f.write(genc.getString())
def main():
    """Extract a set of characters from Unicode data and emit a match table.

    Command line options:
      --unicode-data        UnicodeData.txt; passed to read_unicode_data()
      --special-casing      SpecialCasing.txt; parsed but not used here
      --include-categories  comma separated categories to include; an
                            omitted or empty value gives an empty list
      --exclude-categories  comma separated categories to exclude, or
                            'NONE' (default) for no exclusions; the
                            pseudocategories 'ASCII' and 'NONBMP' exclude
                            codepoints <= 0x7f and >= 0x10000 respectively
      --out-source          optional path for the C source file
      --out-header          optional path for the C header file
      --out-png             optional path passed to generate_png()
      --table-name          C identifier of the table (default 'match_table')

    Intermediate results (raw output, ranges, match table and encoding
    frequencies) are printed to stdout.
    """
    parser = optparse.OptionParser()
    parser.add_option('--unicode-data', dest='unicode_data')      # UnicodeData.txt
    parser.add_option('--special-casing', dest='special_casing')  # SpecialCasing.txt
    parser.add_option('--include-categories', dest='include_categories')
    parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--out-png', dest='out_png')
    parser.add_option('--table-name', dest='table_name', default='match_table')
    opts, _ = parser.parse_args()

    unidata = opts.unicode_data

    # --include-categories has no default, so it is None when omitted.
    # A truthiness test covers both None and '' (the original compared
    # against '' only and crashed on None.split()).
    catsinc = []
    if opts.include_categories:
        catsinc = opts.include_categories.split(',')

    catsexc = []
    if opts.exclude_categories != 'NONE':
        catsexc = opts.exclude_categories.split(',')

    print('CATSEXC: %s' % repr(catsexc))
    print('CATSINC: %s' % repr(catsinc))

    # pseudocategories
    filter_ascii = ('ASCII' in catsexc)
    filter_nonbmp = ('NONBMP' in catsexc)

    # Read raw result
    def filter1(x):
        if filter_ascii and x <= 0x7f:
            # exclude ascii
            return False
        if filter_nonbmp and x >= 0x10000:
            # exclude non-bmp
            return False
        return True

    res = read_unicode_data(unidata, catsinc, catsexc, filter1)

    # Raw output
    print('RAW OUTPUT:')
    print('===========')
    print('\n'.join(res))

    # Scan ranges
    print('')
    print('RANGES:')
    print('=======')
    ranges = scan_ranges(res)
    for i in ranges:
        if i[0] == i[1]:
            print('0x%04x' % i[0])
        else:
            print('0x%04x ... 0x%04x' % (i[0], i[1]))
    print('')
    print('%d ranges total' % len(ranges))

    # Generate match table
    print('')
    print('MATCH TABLE:')
    print('============')
    #matchtable1 = generate_match_table1(ranges)
    #matchtable2 = generate_match_table2(ranges)
    matchtable3, freq = generate_match_table3(ranges)
    print('match table: %s' % repr(matchtable3))
    print('match table length: %d bytes' % len(matchtable3))
    print('encoding freq:')
    for i in xrange(len(freq)):
        if freq[i] == 0:
            continue
        print(' %6d: %d' % (i, freq[i]))

    print('')
    print('MATCH C TABLE -> file %s' % repr(opts.out_header))

    # Create C source and header files
    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitArray(matchtable3, opts.table_name, bytesize=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)
    if opts.out_source is not None:
        # 'with' closes the file even if getString() or write() raises.
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

    genc = dukutil.GenerateC()
    genc.emitHeader('extract_chars.py')
    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))
    if opts.out_header is not None:
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())

    # Image (for illustrative purposes only)
    if opts.out_png is not None:
        generate_png(res, opts.out_png)
def main():
    """Generate case conversion data as C source and header files.

    Command line options:
      --command          'caseconv_bitpacked' (default) or 're_canon_lookup'
      --unicode-data     passed to UnicodeData()
      --special-casing   passed to SpecialCasing()
      --out-source       path of the C source file to write
      --out-header       path of the C header file to write
      --table-name-lc    C identifier for the lowercase table
                         (default 'caseconv_lc')
      --table-name-uc    C identifier for the uppercase table
                         (default 'caseconv_uc')
      --table-name-re-canon-lookup
                         C identifier for the regexp canonicalization
                         lookup table (default 'caseconv_re_canon_lookup')

    Raises:
      Exception: if --command is not one of the supported commands.  The
        check happens after the input data has been loaded.
    """
    parser = optparse.OptionParser()
    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
    opts, _ = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

        # ascii is handled with 'fast path' so not needed here
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, _ = generate_tables(t)  # second return value (nbits) is unused here

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, _ = generate_tables(t)

        # Generate C source and header files
        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        # 'with' closes the file even if getString() or write() raises.
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())
    elif opts.command == 're_canon_lookup':
        # direct canonicalization lookup for case insensitive regexps, includes ascii part
        t = clonedict(uc)
        re_canon_lookup = generate_regexp_canonicalize_lookup(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
        with open(opts.out_source, 'wb') as f:
            f.write(genc.getString())

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        with open(opts.out_header, 'wb') as f:
            f.write(genc.getString())
    else:
        raise Exception('invalid command: %r' % opts.command)