def compressed_cp_lines(cps):
    """Encode hex code point strings as bytes, compress them, and format lines.

    Each element of ``cps`` is parsed with ``int(cp, 16)`` and appended to a
    byte list through ``lzw.add_cp``.  The byte list is then run through
    ``lzw.compress`` and two size-report lines are printed.

    Returns whatever ``lzw.compressed_bytes_to_lines`` returns for the
    compressed values at 12 values per line.
    """
    raw = []
    for hex_cp in cps:
        lzw.add_cp(raw, int(hex_cp, 16))
    packed = lzw.compress(raw)

    # Size report: input entries counted as 32 bits each, packed bytes as 8
    # bits each, compressed values as 16 bits each.
    cp_count = len(cps)
    print('rewrote {} * 32 = {} bits as {} * 8 = {} bits'.format(
        cp_count, cp_count * 32, len(raw), len(raw) * 8))
    print('compressed to {} * 16 = {} bits'.format(
        len(packed), len(packed) * 16))

    return lzw.compressed_bytes_to_lines(packed, 12)
def compressed_prop_lines(cp_prop_pairs):
    """Serialize code point properties, compress them, and format lines.

    The bytes come from ``uncompressed_prop_bytes(cp_prop_pairs)`` and are
    passed through ``lzw.compress``.  No size report is printed here (the
    debugging prints for this function are disabled).

    Returns whatever ``lzw.compressed_bytes_to_lines`` returns for the
    compressed values at 12 values per line.
    """
    packed = lzw.compress(uncompressed_prop_bytes(cp_prop_pairs))
    return lzw.compressed_bytes_to_lines(packed, 12)
def compressed_case_mapping_lines(mappings):
    """Encode case mappings as bytes, compress them, and format lines.

    For every entry ``t`` of ``mappings`` this appends, in order:
    ``int(t[0], 16)`` via ``lzw.add_cp``, then ``t[1][0]`` and ``t[1][1]``
    via ``lzw.add_short``.  The byte list is compressed with
    ``lzw.compress`` and two size-report lines are printed.

    Returns whatever ``lzw.compressed_bytes_to_lines`` returns for the
    compressed values at 12 values per line.
    """
    raw = []
    for entry in mappings:
        lzw.add_cp(raw, int(entry[0], 16))
        # The second element supplies two values, each written as a short.
        for idx in (0, 1):
            lzw.add_short(raw, entry[1][idx])
    packed = lzw.compress(raw)

    # Size report: each mapping counted as 64 bits, packed bytes as 8 bits
    # each, compressed values as 16 bits each.
    mapping_count = len(mappings)
    print('rewrote {} * 64 = {} bits as {} * 8 = {} bits'.format(
        mapping_count, mapping_count * 64, len(raw), len(raw) * 8))
    print('compressed to {} * 16 = {} bits'.format(
        len(packed), len(packed) * 16))

    return lzw.compressed_bytes_to_lines(packed, 12)
def compressed_case_mapping_to_lines(mappings):
    """Encode case-mapping targets as bytes, compress them, and format lines.

    For every entry ``t`` of ``mappings`` three shorts are appended via
    ``lzw.add_short``: ``t[0][0]``, ``t[0][1]`` and a value looked up as
    ``case_conditions[t[1]]``.  If that lookup fails for any ordinary
    reason, 0 is written instead.  The byte list is compressed with
    ``lzw.compress`` and two size-report lines are printed.

    Returns whatever ``lzw.compressed_bytes_to_lines`` returns for the
    compressed values at 12 values per line.
    """
    values_per_line = 12
    bytes_ = []
    for t in mappings:
        lzw.add_short(bytes_, t[0][0])
        lzw.add_short(bytes_, t[0][1])
        try:
            # TODO: Totally wrong! Just here for size eval.
            x = case_conditions[t[1]]
        except Exception:
            # Best-effort fallback kept from the original, but no longer a
            # bare ``except:`` -- KeyboardInterrupt and SystemExit must
            # propagate instead of being turned into a 0 entry.
            x = 0
        lzw.add_short(bytes_, x)
    compressed_bytes = lzw.compress(bytes_)

    # Single-argument parenthesised print is valid on Python 2 and Python 3
    # and produces the same output on both.
    print('rewrote {} * 48 = {} bits as {} * 8 = {} bits'.format(
        len(mappings), len(mappings) * 48, len(bytes_), len(bytes_) * 8))
    print('compressed to {} * 16 = {} bits'.format(
        len(compressed_bytes), len(compressed_bytes) * 16))
    return lzw.compressed_bytes_to_lines(compressed_bytes, values_per_line)
def cus_lines(cus):
    """Format the ordinals of the characters in ``cus`` as lines.

    Each element of ``cus`` is converted with ``ord`` and the resulting list
    is handed directly to ``lzw.compressed_bytes_to_lines`` at 12 values per
    line; ``lzw.compress`` is not called here.

    Returns element ``[0]`` of that helper's result.
    """
    values_per_line = 12
    # A list comprehension replaces ``map(lambda x: ord(x), cus)``: it drops
    # the redundant lambda and always yields a real list, whereas ``map``
    # returns a lazy iterator on Python 3.
    as_ints = [ord(cu) for cu in cus]
    return lzw.compressed_bytes_to_lines(as_ints, values_per_line)[0]
def uncompressed_prop_lines(cp_prop_pairs):
    """Serialize code point properties and format them as lines, uncompressed.

    The bytes come from ``uncompressed_prop_bytes(cp_prop_pairs)`` and go
    straight to ``lzw.compressed_bytes_to_lines`` without a call to
    ``lzw.compress``.

    Returns whatever ``lzw.compressed_bytes_to_lines`` returns at 18 values
    per line.
    """
    return lzw.compressed_bytes_to_lines(
        uncompressed_prop_bytes(cp_prop_pairs), 18)
if cp in quick_check_maps['NFKC']: nfkc_quick_check = quick_check_maps['NFKC'][cp] lzw.add_cp(prop_bytes_, cp) lzw.add_short(prop_bytes_, canonical_decomp[0]) lzw.add_short(prop_bytes_, canonical_decomp[1]) lzw.add_short(prop_bytes_, compatible_decomp[0]) lzw.add_short(prop_bytes_, compatible_decomp[1]) lzw.add_byte(prop_bytes_, int(ccc)) lzw.add_byte(prop_bytes_, \ quick_checks_to_byte(nfd_quick_check, nfkd_quick_check)) lzw.add_byte(prop_bytes_, \ quick_checks_to_byte(nfc_quick_check, nfkc_quick_check)) value_per_line = 12 compressed_bytes = lzw.compress(prop_bytes_) props_lines, num_shorts = lzw.compressed_bytes_to_lines( compressed_bytes, value_per_line) #print 'rewrote {} * 144 = {} bits as {} * 8 = {} bits'.format(len(all_cps), len(all_cps)*144, len(prop_bytes_), len(prop_bytes_)*8) #print 'compressed to {} * 16 = {} bits'.format(len(compressed_bytes), len(compressed_bytes) * 16) cpp_file = open('normalization_data_cp_props.cpp', 'w') cpp_file.write( cp_props_file_form.format(canon_all_cps_string, len(canon_all_cps), compat_all_cps_string, len(compat_all_cps), props_lines, num_shorts, len(all_cps))) def cps_string(cps): cps = map(lambda x: hex(x)[2:], cps) return ''.join(map(lambda x: r'\U' + '0' * (8 - len(x)) + x, cps))