nfd_quick_check = quick_check_maps['NFD'][cp] nfkd_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFKD']: nfkd_quick_check = quick_check_maps['NFKD'][cp] nfc_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFC']: nfc_quick_check = quick_check_maps['NFC'][cp] nfkc_quick_check = 'quick_check::yes' if cp in quick_check_maps['NFKC']: nfkc_quick_check = quick_check_maps['NFKC'][cp] lzw.add_cp(prop_bytes_, cp) lzw.add_short(prop_bytes_, canonical_decomp[0]) lzw.add_short(prop_bytes_, canonical_decomp[1]) lzw.add_short(prop_bytes_, compatible_decomp[0]) lzw.add_short(prop_bytes_, compatible_decomp[1]) lzw.add_byte(prop_bytes_, int(ccc)) lzw.add_byte(prop_bytes_, \ quick_checks_to_byte(nfd_quick_check, nfkd_quick_check)) lzw.add_byte(prop_bytes_, \ quick_checks_to_byte(nfc_quick_check, nfkc_quick_check)) value_per_line = 12 compressed_bytes = lzw.compress(prop_bytes_) props_lines, num_shorts = lzw.compressed_bytes_to_lines( compressed_bytes, value_per_line) #print 'rewrote {} * 144 = {} bits as {} * 8 = {} bits'.format(len(all_cps), len(all_cps)*144, len(prop_bytes_), len(prop_bytes_)*8) #print 'compressed to {} * 16 = {} bits'.format(len(compressed_bytes), len(compressed_bytes) * 16) cpp_file = open('normalization_data_cp_props.cpp', 'w') cpp_file.write( cp_props_file_form.format(canon_all_cps_string,
def uncompressed_prop_bytes(cp_prop_pairs):
    """Flatten (code point, property) entries into one list for compression.

    Each entry of ``cp_prop_pairs`` is indexed at ``[0]`` and ``[1]``; the
    first value is appended through ``lzw.add_cp`` and the second through
    ``lzw.add_byte``, in that order, entry by entry.

    Returns the freshly built list (empty when there are no entries).
    """
    encoded = []
    for entry in cp_prop_pairs:
        # Code point first, then its property value; the reader of this
        # stream presumably relies on that ordering -- verify against lzw.
        code_point = entry[0]
        lzw.add_cp(encoded, code_point)
        prop_value = entry[1]
        lzw.add_byte(encoded, prop_value)
    return encoded
'uint16_t', 2500) cpp_file = open('collation_data_0.cpp', 'w') cpp_file.write( collation_data_0_file_form.format(implicit_weights_segments_str, len(implicit_weights_segments), reorder_group_str, len(reorder_group_strings), ce_lines, len(compressed_ces), len(collation_elements))) key_bytes = [] #value_bytes = [] value_strings = [] for k, v in sorted(fcc_cet.items(), key=lambda x: original_order[x[0]]): lzw.add_byte(key_bytes, len(k)) for x in k: lzw.add_cp(key_bytes, x) value_strings.append('{{{}, {}}}'.format(v[0], v[1])) #lzw.add_short(value_bytes, v[0]) #lzw.add_short(value_bytes, v[1]) compressed_keys = lzw.compress(key_bytes) # The other data sets are optimized by LZW compression. This one is # heavily pessimized. # compressed_values = lzw.compress(value_bytes) #print 'rewrote {} * 128 = {} bits as {} * 8 = {} bits'.format(len(fcc_cet), len(fcc_cet)*128, len(key_bytes), len(key_bytes)*8) #print 'compressed to {} * 16 = {} bits'.format(len(compressed_keys), len(compressed_keys) * 16) key_lines = values_to_lines(map(lambda x: hex(x), compressed_keys), 'uint16_t', 2500)