def generate(self, propertyName, indent=4):
    """Render this object's runs and quads as C++ source for a constant UnicodeSet.

    The emitted text contains an anonymous namespace holding a
    ``__<propertyName>_runs`` array and either a ``__<propertyName>_quads``
    array or, when there are no quads, a null ``__<propertyName>_quads``
    pointer. It is followed by a ``const static UnicodeSet <propertyName>``
    definition that refers to both.

    Args:
        propertyName: Identifier inserted into the generated C++ names.
        indent: Number of spaces placed before each emitted declaration; it
            is also passed to ``cformat.multiline_fill``.

    Returns:
        The generated C++ text as a single string.
    """
    # Build a format such as "%#010x": '#' adds the "0x" prefix, and the
    # extra 2 in the width leaves room for that prefix.
    quad_format = "%%#0%ix" % (int(quad_bits / 4) + 2)
    run_kind_names = {-1: "Full", 0: "Empty", 1: "Mixed"}
    pad = " " * indent

    # Each run is indexed as (kind key, integer); presumably the integer is
    # the run length -- confirm against the class that owns self.runs.
    run_items = ['{%s, %i}' % (run_kind_names[run[0]], run[1]) for run in self.runs]
    pieces = [
        "\n" + pad + "namespace {\n",
        pad + "const static UnicodeSet::run_t __%s_runs[] = {\n" % propertyName,
        pad + cformat.multiline_fill(run_items, ',', indent) + "};\n",
    ]

    if len(self.quads) > 0:
        quad_items = [quad_format % quad for quad in self.quads]
        pieces.append(pad + "const static UnicodeSet::bitquad_t __%s_quads[] = {\n" % propertyName)
        pieces.append(pad + cformat.multiline_fill(quad_items, ',', indent) + "};\n")
    else:
        # An empty array initializer is avoided; a null pointer is emitted.
        pieces.append(
            pad + "const static UnicodeSet::bitquad_t * const __%s_quads = nullptr;\n" % propertyName)

    # The generated code const_casts the arrays, but per the original author's
    # note the UnicodeSet never writes to them: any modification first checks
    # the run/quad capacity, sees that it is 0 (the literal 0 arguments
    # below), and allocates heap memory before changing anything.
    set_definition = (
        "const static UnicodeSet %s{const_cast<UnicodeSet::run_t *>(__%s_runs), %i, 0, "
        "const_cast<UnicodeSet::bitquad_t *>(__%s_quads), %i, 0};\n\n"
    ) % (propertyName, propertyName, len(self.runs), propertyName, len(self.quads))
    pieces.append(pad + "}\n\n")
    pieces.append(pad + set_definition)
    return "".join(pieces)
def genSingletonEquivalenceMap(reverse_map, mapping_name):
    """Emit C++ text for an ``EquivalenceMap`` built from single-character keys.

    Keys of ``reverse_map`` are visited in sorted order. For every key of
    length one that has not already been absorbed into a class, the mapping
    is followed from value to value (each value is turned back into a key
    with ``chr``) until it reaches a value that is not a key or has already
    been visited. The collected code points are written out as
    ``{0xA, 0xB}`` pairs linking each member to the next, with the last
    member linked back to the first.

    Args:
        reverse_map: Mapping from string keys to integer code points (the
            values are passed to ``chr``).
        mapping_name: Prefix for the generated ``<mapping_name>_equivalents``
            variable.

    Returns:
        The generated C++ text as a single string.
    """
    header = " EquivalenceMap %s_equivalents = {\n" % mapping_name
    seen = set()
    formatted_classes = []
    for key in sorted(reverse_map.keys()):
        # Only keys consisting of exactly one character are relevant here.
        if len(key) != 1:
            continue
        start = ord(key[0])
        if start in seen:
            continue
        target = reverse_map[key]
        seen.add(start)
        chain = [start, target]
        target_key = chr(target)
        # Keep following the mapping while the current value is itself a key
        # that has not been visited yet.
        while target_key in reverse_map and target not in seen:
            seen.add(target)
            target = reverse_map[target_key]
            chain.append(target)
            target_key = chr(target)
        if len(chain) > 1:
            # Pair every member with its successor, wrapping around at the end.
            successors = chain[1:] + chain[:1]
            pairs = ["{0x%X, 0x%X}" % (member, nxt) for member, nxt in zip(chain, successors)]
            formatted_classes.append(cformat.multiline_fill(pairs, ',', 8))
    return header + ",\n".join(formatted_classes) + "};\n"
def GB_range_table():
    """Return C++ text defining ``GB_RangeTable`` from the 'gb18030-ranges' index.

    The index is loaded with ``WHATWG_parser.parse_WHATWG_index_file``. Each
    entry becomes a ``{key, 0xvalue}`` pair, in ascending key order, with the
    key printed in decimal and the value as at least four hex digits.

    Returns:
        The generated C++ declaration as a single string.
    """
    ranges = WHATWG_parser.parse_WHATWG_index_file('gb18030-ranges')
    ordered_keys = sorted(ranges.keys())
    pair_items = ['{%i, 0x%04x}' % (key, ranges[key]) for key in ordered_keys]
    opening = "std::vector<std::pair<unsigned, unsigned>> GB_RangeTable = {\n "
    return opening + cformat.multiline_fill(pair_items, ',', 4) + "};\n"
def GB_double_byte_table():
    """Return C++ text defining a flat ``GB_DoubleByteTable`` from the 'gb18030' index.

    The index is loaded with ``WHATWG_parser.parse_WHATWG_index_file`` and its
    values are listed in ascending key order, each as at least four hex
    digits.

    NOTE(review): another function with this same name appears later in this
    file; if both live in one module, the later definition replaces this one.
    Confirm which variant callers expect.

    Returns:
        The generated C++ declaration as a single string.
    """
    index = WHATWG_parser.parse_WHATWG_index_file('gb18030')
    ordered_keys = sorted(index.keys())
    hex_items = ['0x%04x' % index[key] for key in ordered_keys]
    opening = "std::vector<unsigned> GB_DoubleByteTable = {\n "
    return opening + cformat.multiline_fill(hex_items, ',', 4) + "};\n"
def make_extended_ASCII_encoder(enc_name):
    """Fill ``Alphabet_Template`` with the 128 index entries of ``enc_name``.

    The index is loaded with ``parse_WHATWG_index_file`` and checked with
    ``validate_full_extended_ASCII``. If the check fails, a message is
    printed and nothing is generated.

    Args:
        enc_name: Name of the encoding index to load. Hyphens are replaced
            with underscores to form the alphabet name in the output.

    Returns:
        The substituted template text, or ``None`` when validation fails.
    """
    index = parse_WHATWG_index_file(enc_name)
    if not validate_full_extended_ASCII(index):
        print(enc_name + " is not a full extended ASCII single-byte-encoding")
        return None

    # Entries 0..127 of the index, in order, each as at least four hex digits.
    hex_items = ['0x%04x' % index[position] for position in range(128)]
    codepoints_text = cformat.multiline_fill(hex_items, ',', 8)
    identifier = enc_name.replace('-', '_')
    template = Template(Alphabet_Template)
    return template.substitute(alphabet_name=identifier, codepoint_list=codepoints_text)
def GB_double_byte_table():
    """Return C++ text defining a nested ``GB_DoubleByteTable`` from the 'gb18030' index.

    The index is loaded with ``WHATWG_parser.parse_WHATWG_index_file``. One
    inner brace-enclosed list is emitted for every first-byte value from 0x81
    through 0xFE. Each list holds the 190 consecutive index entries starting
    at ``(byte1 - 0x81) * 190``, each printed as at least four hex digits.

    NOTE(review): an earlier function in this file has the same name; if both
    live in one module, this later definition is the one that takes effect.

    Returns:
        The generated C++ declaration as a single string.
    """
    row_width = 190
    index = WHATWG_parser.parse_WHATWG_index_file('gb18030')
    rows = []
    for lead_byte in range(0x81, 0xFF):
        first = (lead_byte - 0x81) * row_width
        hex_items = ['0x%04x' % index[pos] for pos in range(first, first + row_width)]
        rows.append(" {" + cformat.multiline_fill(hex_items, ',', 5) + "}")
    opening = "std::vector<std::vector<codepoint_t>> GB_DoubleByteTable = {\n"
    # Rows are separated by ",\n"; the final row is followed by the closer.
    return opening + ",\n".join(rows) + "};\n"