Exemplo n.º 1
0
    def generate(self, propertyName, indent=4):
        """Render this set as C++ source text defining a constant ``UnicodeSet``.

        The text contains an anonymous namespace with a
        ``__<propertyName>_runs`` array (each run ``r`` in ``self.runs`` is
        written as ``{<Full|Empty|Mixed>, r[1]}``) and either a
        ``__<propertyName>_quads`` array of hex literals or, when
        ``self.quads`` is empty, a null ``__<propertyName>_quads`` pointer.
        After the namespace comes the definition of a ``const static
        UnicodeSet`` named ``propertyName`` that refers to both.

        Args:
            propertyName: Name of the generated C++ set; also embedded in the
                names of the helper arrays.
            indent: Number of spaces written before each generated
                declaration; also passed through to ``cformat.multiline_fill``.

        Returns:
            The generated C++ source as one string.
        """
        # Zero-padded hex format ("%#0Nx"): N is quad_bits / 4 hex digits plus
        # 2 characters for the "0x" prefix that the '#' flag adds.
        hex_specifier = "%%#0%ix" % (int(quad_bits / 4) + 2)
        # r[0] of every run selects one of these C++ run-type names.
        runtype = {-1: "Full", 0: "Empty", 1: "Mixed"}
        pad = " " * indent

        # Collect the fragments in output order and join once at the end.
        pieces = [
            "\n", pad, "namespace {\n",
            pad, "const static UnicodeSet::run_t __%s_runs[] = {\n" % propertyName,
            pad, cformat.multiline_fill(
                ['{%s, %i}' % (runtype[r[0]], r[1]) for r in self.runs], ',', indent),
            "};\n",
        ]

        if len(self.quads) == 0:
            # No quads: emit a null pointer instead of an empty array.
            pieces += [
                pad,
                "const static UnicodeSet::bitquad_t * const __%s_quads = nullptr;\n" % propertyName,
            ]
        else:
            pieces += [
                pad, "const static UnicodeSet::bitquad_t  __%s_quads[] = {\n" % propertyName,
                pad, cformat.multiline_fill(
                    [hex_specifier % q for q in self.quads], ',', indent),
                "};\n",
            ]

        # Despite being const_cast below, neither runs nor quads will be
        # modified by the UnicodeSet. If any modifications are made, they first
        # test the run/quad capacity, observe that it is 0, and allocate heap
        # memory to make any changes.
        pieces += [
            pad, "}\n\n",
            pad,
            "const static UnicodeSet %s{const_cast<UnicodeSet::run_t *>(__%s_runs), %i, 0, "
            "const_cast<UnicodeSet::bitquad_t *>(__%s_quads), %i, 0};\n\n"
            % (propertyName, propertyName, len(self.runs), propertyName, len(self.quads)),
        ]

        return "".join(pieces)
def genSingletonEquivalenceMap(reverse_map, mapping_name):
    """Render a C++ ``EquivalenceMap`` initializer from single-character keys.

    Every key of ``reverse_map`` whose length is 1 and whose code point has
    not been visited yet starts a chain: the key's code point, the code point
    it maps to, the code point that one maps to, and so on, for as long as the
    current code point is itself a key and has not been visited.  Each chain
    is written as ``{from, to}`` pairs linking every member to its successor,
    with the last member linked back to the first.

    Args:
        reverse_map: Mapping from strings to integer code points.  Keys that
            are not exactly one element long are ignored.
        mapping_name: Prefix of the generated ``<mapping_name>_equivalents``
            variable.

    Returns:
        The C++ source text of the map definition.
    """
    header = "    EquivalenceMap %s_equivalents = {\n" % mapping_name
    visited = set()
    rendered_classes = []
    for key in sorted(reverse_map.keys()):
        # Only singleton entries are relevant.
        if len(key) != 1:
            continue
        start = ord(key[0])
        if start in visited:
            continue
        target = reverse_map[key]
        visited.add(start)
        chain = [start, target]
        target_key = chr(target)
        # Follow the mapping until it leaves the map or reaches a visited
        # code point.
        # NOTE(review): the code point that ends the walk is still appended,
        # so a cycle a -> b -> a produces the pairs {a,b}, {b,a}, {a,a};
        # confirm that the trailing self-pair is intended.
        while target_key in reverse_map and target not in visited:
            visited.add(target)
            target = reverse_map[target_key]
            chain.append(target)
            target_key = chr(target)
        # The chain always holds at least two members, so it is always emitted.
        size = len(chain)
        pairs = [
            "{0x%X, 0x%X}" % (chain[i], chain[(i + 1) % size])
            for i in range(size)
        ]
        rendered_classes.append(cformat.multiline_fill(pairs, ',', 8))

    return header + ",\n".join(rendered_classes) + "};\n"
def GB_range_table():
    """Build the C++ definition of ``GB_RangeTable`` from the 'gb18030-ranges' index.

    Returns:
        C++ source text for a ``std::vector<std::pair<unsigned, unsigned>>``
        initialised with one ``{key, 0xvalue}`` pair per index entry, in
        ascending key order.
    """
    ranges = WHATWG_parser.parse_WHATWG_index_file('gb18030-ranges')
    ordered_keys = sorted(ranges.keys())
    pairs = ['{%i, 0x%04x}' % (key, ranges[key]) for key in ordered_keys]
    body = cformat.multiline_fill(pairs, ',', 4)
    return (
        "std::vector<std::pair<unsigned, unsigned>> GB_RangeTable = {\n    "
        + body
        + "};\n"
    )
# NOTE(review): this file defines GB_double_byte_table a second time further
# down; when the module is loaded, the later definition replaces this one.
def GB_double_byte_table():
    """Build a flat C++ ``GB_DoubleByteTable`` from the 'gb18030' index.

    Returns:
        C++ source text for a ``std::vector<unsigned>`` whose elements are the
        index values, formatted as hex literals, in ascending key order.
    """
    index = WHATWG_parser.parse_WHATWG_index_file('gb18030')
    ordered_keys = sorted(index.keys())
    hex_values = ['0x%04x' % index[key] for key in ordered_keys]
    body = cformat.multiline_fill(hex_values, ',', 4)
    return "std::vector<unsigned> GB_DoubleByteTable = {\n    " + body + "};\n"
def make_extended_ASCII_encoder(enc_name):
    """Fill ``Alphabet_Template`` with the code points of encoding *enc_name*.

    The index for *enc_name* is parsed and checked with
    ``validate_full_extended_ASCII``.  If the check passes, the values at index
    positions 0..127 are formatted as hex literals and substituted into the
    template together with *enc_name* (hyphens replaced by underscores).

    Returns:
        The substituted template text, or ``None`` after printing a message
        when the index fails validation.
    """
    index = parse_WHATWG_index_file(enc_name)
    if validate_full_extended_ASCII(index):
        codepoints = [index[position] for position in range(128)]
        hex_literals = ['0x%04x' % codepoint for codepoint in codepoints]
        filled = cformat.multiline_fill(hex_literals, ',', 8)
        template = Template(Alphabet_Template)
        return template.substitute(
            alphabet_name=enc_name.replace('-', '_'),
            codepoint_list=filled,
        )

    print(enc_name + " is not a full extended ASCII single-byte-encoding")
    return None
def GB_double_byte_table():
    """Build a two-level C++ ``GB_DoubleByteTable`` from the 'gb18030' index.

    One inner vector is produced for every first-byte value from 0x81 to 0xFE
    inclusive.  Each holds the 190 index values found at consecutive pointers
    starting at ``(byte1 - 0x81) * 190``, formatted as hex literals.

    Returns:
        C++ source text for a ``std::vector<std::vector<codepoint_t>>``.
    """
    index = WHATWG_parser.parse_WHATWG_index_file('gb18030')
    row_width = 190
    rows = []
    for first_byte in range(0x81, 0xFF):
        start = (first_byte - 0x81) * row_width
        row_values = [index[pointer] for pointer in range(start, start + row_width)]
        hex_values = ['0x%04x' % value for value in row_values]
        rows.append("    {" + cformat.multiline_fill(hex_values, ',', 5) + "}")
    # Rows are separated by ",\n"; the closing "};\n" follows the last row.
    return (
        "std::vector<std::vector<codepoint_t>> GB_DoubleByteTable = {\n"
        + ",\n".join(rows)
        + "};\n"
    )