예제 #1
0
def generate(target):
    """Generate a C++ source file containing a diacritic removal function.

    The emitted ``codepointRemoveDiacritics`` function is a switch statement
    with one ``case`` per key of the module-level ``diacritic_mappings``
    table, returning the mapped codepoint. Any other codepoint is returned
    unchanged by the ``default`` branch.

    Args:
        target: Path of the C++ source file to write. The file is opened in
            "w" mode, so an existing file is overwritten.
    """
    # The context manager flushes and closes the output even if one of the
    # helpers raises part-way through generation.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongol/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        # Map diacritics from 0 to the maximum Unicode codepoint.
        # NOTE(review): presumably this call is what fills
        # diacritic_mappings; both names are defined outside this
        # function -- confirm.
        add_diacritic_range(0x0000, 0x10FFFF)

        out.write("char32_t codepointRemoveDiacritics(char32_t codepoint) {\n"
                  "    switch (codepoint) {\n")

        # Emit the cases in ascending codepoint order so the generated file
        # is deterministic.
        for original in sorted(diacritic_mappings):
            out.write("    case " + hex(original) + ": return " +
                      hex(diacritic_mappings[original]) + ";\n")

        out.write("    default: return codepoint;\n    }\n}")

        out.write(closeNamespaces())
예제 #2
0
def generate(target):
    """Generate a C++ source file containing a diacritic removal function.

    The emitted ``codepointRemoveDiacritics`` function consists of a switch
    statement: each key of the module-level ``diacritic_mappings`` table
    becomes a ``case`` that returns the mapped codepoint, and the ``default``
    branch returns the input codepoint unchanged.

    Args:
        target: Path of the C++ source file to write. The file is opened in
            "w" mode, so an existing file is overwritten.
    """
    # Use a context manager so the generated file is always flushed and
    # closed, including when a helper raises.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongol/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        # Map diacritics from 0 to the maximum Unicode codepoint.
        # NOTE(review): add_diacritic_range and diacritic_mappings live
        # outside this function; presumably the former populates the
        # latter -- confirm.
        add_diacritic_range(0x0000, 0x10FFFF)

        out.write("char32_t codepointRemoveDiacritics(char32_t codepoint) {\n"
                  "    switch (codepoint) {\n")

        # Ascending key order keeps the generated source stable between runs.
        for source_codepoint in sorted(diacritic_mappings):
            stripped_codepoint = diacritic_mappings[source_codepoint]
            out.write("    case " + hex(source_codepoint) + ": return " +
                      hex(stripped_codepoint) + ";\n")

        out.write("    default: return codepoint;\n    }\n}")

        out.write(closeNamespaces())
예제 #3
0
def generate(unicode_proplist_file, target):
    """Generate a C++ source file containing a diacritic checking function.

    The emitted ``codepointIsDiacritic`` function is a switch statement that
    returns true for every codepoint listed with the ``Diacritic`` property
    in the property list file and false for everything else.

    Args:
        unicode_proplist_file: Path of the property list to read. Each data
            line holds two "; "-separated fields: a hexadecimal codepoint or
            an inclusive ``start..end`` range, then a property name. Text
            after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly two
            fields.
    """
    # Context managers close both files even if parsing or a helper fails.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("monger/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        diacritics = set()

        with open(unicode_proplist_file, 'r') as proplist_file:
            for line in proplist_file:
                # Drop the trailing comment, then skip blank and
                # comment-only lines. partition() leaves a line without '#'
                # intact instead of chopping its last character.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 2, "unexpected line: %r" % line

                # Compare for equality: a substring test against
                # "Diacritic" would also accept empty or truncated
                # property names.
                if values[1].strip() != "Diacritic":
                    continue

                bounds = values[0].split('..')
                if len(bounds) == 2:
                    # "start..end" denotes an inclusive codepoint range.
                    diacritics.update(
                        range(int(bounds[0], 16), int(bounds[1], 16) + 1))
                else:
                    diacritics.add(int(values[0], 16))

        out.write("bool codepointIsDiacritic(char32_t codepoint) {\n"
                  "    switch (codepoint) {\n")

        for diacritic in sorted(diacritics):
            out.write("    case " + hex(diacritic) + ": return true;\n")

        out.write("    default: return false;\n    }\n}")

        out.write(closeNamespaces())
예제 #4
0
def generate(unicode_proplist_file, target):
    """Generate a C++ source file containing a diacritic checking function.

    The emitted ``codepointIsDiacritic`` function is a switch statement with
    a ``return true`` case for every codepoint that the property list file
    tags with the ``Diacritic`` property; the ``default`` branch returns
    false.

    Args:
        unicode_proplist_file: Path of the property list to read. Each data
            line holds two "; "-separated fields: a hexadecimal codepoint or
            an inclusive ``start..end`` range, then a property name. Text
            after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly two
            fields.
    """
    # Both files are managed by "with" so they are closed on every path.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        diacritics = set()

        with open(unicode_proplist_file, 'r') as proplist_file:
            for line in proplist_file:
                # Keep only the text before any '#'; blank and comment-only
                # lines carry no data. Unlike slicing with find(), this does
                # not truncate a line that has no comment.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 2, "unexpected line: %r" % line

                # The property must match exactly; "x in 'Diacritic'" is a
                # substring test and is also true for "" or partial names.
                uproperty = values[1].strip()
                if uproperty != "Diacritic":
                    continue

                codepoint_range = values[0].split('..')
                if len(codepoint_range) == 2:
                    start = int(codepoint_range[0], 16)
                    end = int(codepoint_range[1], 16) + 1  # inclusive end
                    diacritics.update(range(start, end))
                else:
                    diacritics.add(int(values[0], 16))

        out.write("bool codepointIsDiacritic(char32_t codepoint) {\n"
                  "    switch (codepoint) {\n")

        for diacritic in sorted(diacritics):
            out.write("    case " + hex(diacritic) + ": return true;\n")

        out.write("    default: return false;\n    }\n}")

        out.write(closeNamespaces())
예제 #5
0
def generate(unicode_casefold_file, target):
    """Generate a C++ source file containing a Unicode case folding function.

    The emitted ``codepointToLower`` function first handles the two Turkish
    special cases (0x049 and 0x130) when called in Turkish mode, then falls
    through to a switch statement with one case per case folding mapping
    read from the input file. Unmapped codepoints are returned unchanged.

    Args:
        unicode_casefold_file: Path of the case folding data file. Each data
            line holds four "; "-separated fields, of which the first three
            are used: hexadecimal codepoint, status, hexadecimal mapping.
            Text after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly four
            fields.
    """
    # UTF-8 is requested explicitly because the emitted C++ comments contain
    # non-ASCII letters that the platform default encoding may be unable to
    # encode.
    with open(target, "w", encoding="utf-8") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        case_mappings = {}

        # Plain "r" is used: text mode already translates newlines, and the
        # "U" flag is rejected by Python 3.11 and later.
        with open(unicode_casefold_file, "r") as cf_file:
            for line in cf_file:
                # Drop the trailing comment, then skip blank and
                # comment-only lines.
                data = line.partition("#")[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 4, "unexpected line: %r" % line

                # We only include the "Common" and "Simple" mappings. "Full"
                # case folding mappings expand certain letters to multiple
                # codepoints, which we currently do not support.
                if values[1] in ("C", "S"):
                    case_mappings[int(values[0], 16)] = int(values[2], 16)

        out.write(
            "char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) { \n"
            "    if (mode == CaseFoldMode::kTurkish) {\n"
            "        if (codepoint == 0x049) {  // I -> ı\n"
            "            return 0x131;\n"
            "        } else if (codepoint == 0x130) {  // İ -> i\n"
            "            return 0x069;\n"
            "        }\n"
            "    }\n"
            "\n"
            "    switch (codepoint) {\n"
        )

        # Ascending codepoint order keeps the generated source deterministic.
        for original in sorted(case_mappings):
            out.write(
                "    case "
                + hex(original)
                + ": return "
                + hex(case_mappings[original])
                + ";\n"
            )

        out.write("    default: return codepoint;\n    }\n}")

        out.write(closeNamespaces())
예제 #6
0
def generate(unicode_casefold_file, target):
    """Generate a C++ source file containing a Unicode case folding function.

    The emitted ``codepointToLower`` function handles ASCII inline (A-Z get
    the 0x20 bit set, except that 'I' maps to 0x131 in Turkish mode) and
    uses a switch statement for every non-ASCII case folding mapping read
    from the input file. Codepoints listed in ``turkishMapping`` get a case
    that picks the Turkish result when the mode is Turkish.

    Args:
        unicode_casefold_file: Path of the case folding data file. Each data
            line holds four "; "-separated fields, of which the first three
            are used: hexadecimal codepoint, status, hexadecimal mapping.
            Text after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly four
            fields.
    """
    # Context managers close both files even if parsing or a helper fails.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        case_mappings = {}

        # Plain 'r' is used: text mode already translates newlines, and the
        # 'U' flag is rejected by Python 3.11 and later.
        with open(unicode_casefold_file, 'r') as cf_file:
            for line in cf_file:
                # Drop the trailing comment, then skip blank and
                # comment-only lines.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 4, "unexpected line: %r" % line

                # We only include the "Common" and "Simple" mappings. "Full"
                # case folding mappings expand certain letters to multiple
                # codepoints, which we currently do not support.
                if values[1] in ('C', 'S'):
                    case_mappings[int(values[0], 16)] = int(values[2], 16)

        turkishMapping = {
            0x49: 0x131,  # I -> ı
            0x130: 0x069,  # İ -> i
        }

        out.write(
            """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
               if (codepoint <= 0x7f) {
                    if (codepoint >= 'A' && codepoint <= 'Z') {
                       return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
                              ? 0x131
                              : (codepoint | 0x20); // Set the ascii lowercase bit on the character.
                   }
                   return codepoint;
               }

               switch (codepoint) {\n""")

        # Make sure we include each mapping in turkishMapping in the cases
        # below. This ensures we handle them even if we'd skip the letter in
        # non-turkish mode (such a codepoint maps to itself outside Turkish
        # mode).
        emitted = dict(case_mappings)
        for codepoint in turkishMapping:
            emitted.setdefault(codepoint, codepoint)

        for codepoint in sorted(emitted):
            if codepoint <= 0x7f:
                continue  # ascii is special cased above.

            if codepoint in turkishMapping:
                out.write(
                    "case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
                    % (codepoint, turkishMapping[codepoint], emitted[codepoint]))
            else:
                out.write("case 0x%x: return 0x%x;\n" % (codepoint, emitted[codepoint]))

        out.write("    default: return codepoint;\n    }\n}")

        out.write(closeNamespaces())
예제 #7
0
def generate(unicode_proplist_file, target):
    """Generate a C++ source file containing a delimiter checking function.

    The emitted ``codepointIsDelimiter`` function returns false for an
    apostrophe when the language is English; otherwise it uses a switch
    statement that returns true for every codepoint the property list file
    tags with one of the properties in ``delim_properties``.

    Args:
        unicode_proplist_file: Path of the property list to read. Each data
            line holds two "; "-separated fields: a hexadecimal codepoint or
            an inclusive ``start..end`` range, then a property name. Text
            after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly two
            fields.
    """
    delim_properties = (
        "White_Space", "Dash", "Hyphen", "Quotation_Mark",
        "Terminal_Punctuation", "Pattern_Syntax", "STerm",
    )

    # Context managers close both files even if parsing or a helper fails.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        delim_codepoints = set()

        with open(unicode_proplist_file, 'r') as proplist_file:
            for line in proplist_file:
                # Drop the trailing comment, then skip blank and
                # comment-only lines. partition() leaves a line without '#'
                # intact instead of chopping its last character.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 2, "unexpected line: %r" % line

                if values[1].strip() not in delim_properties:
                    continue

                bounds = values[0].split('..')
                if len(bounds) == 2:
                    # "start..end" denotes an inclusive codepoint range.
                    delim_codepoints.update(
                        range(int(bounds[0], 16), int(bounds[1], 16) + 1))
                else:
                    delim_codepoints.add(int(values[0], 16))

        # As of Unicode 8.0.0, all of the delimiters we used for text index
        # version 2 are also in the list.

        out.write(
            "bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {\n"
            "    if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') {\n"
            "        return false;\n"
            "    }\n"
            "\n"
            "    switch (codepoint) {\n")

        for delim in sorted(delim_codepoints):
            out.write("    case " + hex(delim) + ": return true;\n")

        out.write("    default: return false;\n    }\n}")

        out.write(closeNamespaces())
def generate(unicode_proplist_file, target):
    """Generate a C++ source file containing a delimiter checking function.

    The emitted ``codepointIsDelimiter`` function returns false for an
    apostrophe when the language is English and for the ASCII letters A-Z
    and a-z; otherwise it uses a switch statement that returns true for
    every codepoint the property list file tags with one of the properties
    in ``delim_properties``.

    Args:
        unicode_proplist_file: Path of the property list to read. Each data
            line holds two "; "-separated fields: a hexadecimal codepoint or
            an inclusive ``start..end`` range, then a property name. Text
            after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly two
            fields.
    """
    delim_properties = ("White_Space",
                        "Dash",
                        "Hyphen",
                        "Quotation_Mark",
                        "Terminal_Punctuation",
                        "Pattern_Syntax",
                        "STerm")

    # Both files are managed by "with" so they are closed on every path.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        delim_codepoints = set()

        with open(unicode_proplist_file, 'r') as proplist_file:
            for line in proplist_file:
                # Keep only the text before any '#'; blank and comment-only
                # lines carry no data. Unlike slicing with find(), this does
                # not truncate a line that has no comment.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 2, "unexpected line: %r" % line

                uproperty = values[1].strip()
                if uproperty not in delim_properties:
                    continue

                codepoint_range = values[0].split('..')
                if len(codepoint_range) == 2:
                    start = int(codepoint_range[0], 16)
                    end = int(codepoint_range[1], 16) + 1  # inclusive end
                    delim_codepoints.update(range(start, end))
                else:
                    delim_codepoints.add(int(values[0], 16))

        # As of Unicode 8.0.0, all of the delimiters we used for text index
        # version 2 are also in the list.

        out.write(
            "bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {\n"
            "    if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') {\n"
            "        return false;\n"
            "    }\n"
            "\n"
            "    // Most characters are latin letters, so filter those out first.\n"
            "    if (codepoint >= 'A' && codepoint <= 'Z') {\n"
            "        return false;\n"
            "    } else if (codepoint >= 'a' && codepoint <= 'z') {\n"
            "        return false;\n"
            "    }\n"
            "\n"
            "    switch (codepoint) {\n")

        for delim in sorted(delim_codepoints):
            out.write("    case " + hex(delim) + ": return true;\n")

        out.write("    default: return false;\n    }\n}")

        out.write(closeNamespaces())
예제 #9
0
def generate(unicode_casefold_file, target):
    """Generate a C++ source file containing a Unicode case folding function.

    The emitted ``codepointToLower`` function first handles the two Turkish
    special cases (0x049 and 0x130) when called in Turkish mode, then uses a
    switch statement with one case for every case folding mapping read from
    the input file. The ``default`` branch returns the codepoint unchanged.

    Args:
        unicode_casefold_file: Path of the case folding data file. Each data
            line holds four "; "-separated fields, of which the first three
            are used: hexadecimal codepoint, status, hexadecimal mapping.
            Text after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly four
            fields.
    """
    # The generated C++ comments contain non-ASCII letters, so the output
    # encoding is pinned to UTF-8 instead of depending on the platform
    # default, which may not be able to encode them.
    with open(target, "w", encoding="utf-8") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        case_mappings = {}

        # Plain 'r' is used: text mode already translates newlines, and the
        # 'U' flag is rejected by Python 3.11 and later.
        with open(unicode_casefold_file, 'r') as cf_file:
            for line in cf_file:
                # Keep only the text before any '#'; blank and comment-only
                # lines carry no data.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 4, "unexpected line: %r" % line

                status = values[1]
                if status in ('C', 'S'):
                    # We only include the "Common" and "Simple" mappings.
                    # "Full" case folding mappings expand certain letters to
                    # multiple codepoints, which we currently do not support.
                    original_codepoint = int(values[0], 16)
                    case_mappings[original_codepoint] = int(values[2], 16)

        out.write(
            "char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) { \n"
            "    if (mode == CaseFoldMode::kTurkish) {\n"
            "        if (codepoint == 0x049) {  // I -> ı\n"
            "            return 0x131;\n"
            "        } else if (codepoint == 0x130) {  // İ -> i\n"
            "            return 0x069;\n"
            "        }\n"
            "    }\n"
            "\n"
            "    switch (codepoint) {\n")

        # Ascending codepoint order keeps the generated source deterministic.
        for original_codepoint in sorted(case_mappings):
            out.write("    case " + hex(original_codepoint) + ": return " +
                      hex(case_mappings[original_codepoint]) + ";\n")

        out.write("    default: return codepoint;\n    }\n}")

        out.write(closeNamespaces())
예제 #10
0
def generate(unicode_casefold_file, target):
    """Generate a C++ source file containing a Unicode case folding function.

    The emitted ``codepointToLower`` function handles ASCII inline (A-Z get
    the 0x20 bit set, except that 'I' maps to 0x131 in Turkish mode) and
    uses a switch statement for every non-ASCII case folding mapping read
    from the input file. Codepoints listed in ``turkishMapping`` get a case
    that selects the Turkish result when the mode is Turkish.

    Args:
        unicode_casefold_file: Path of the case folding data file. Each data
            line holds four "; "-separated fields, of which the first three
            are used: hexadecimal codepoint, status, hexadecimal mapping.
            Text after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly four
            fields.
    """
    # Both files are managed by "with" so they are closed on every path.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        case_mappings = {}

        # Plain 'r' is used: text mode already translates newlines, and the
        # 'U' flag is rejected by Python 3.11 and later.
        with open(unicode_casefold_file, 'r') as cf_file:
            for line in cf_file:
                # Keep only the text before any '#'; blank and comment-only
                # lines carry no data.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 4, "unexpected line: %r" % line

                status = values[1]
                if status in ('C', 'S'):
                    # We only include the "Common" and "Simple" mappings.
                    # "Full" case folding mappings expand certain letters to
                    # multiple codepoints, which we currently do not support.
                    original_codepoint = int(values[0], 16)
                    case_mappings[original_codepoint] = int(values[2], 16)

        turkishMapping = {
            0x49: 0x131,  # I -> ı
            0x130: 0x069,   # İ -> i
        }

        out.write(
            """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
               if (codepoint <= 0x7f) {
                    if (codepoint >= 'A' && codepoint <= 'Z') {
                       return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
                              ? 0x131
                              : (codepoint | 0x20); // Set the ascii lowercase bit on the character.
                   }
                   return codepoint;
               }

               switch (codepoint) {\n""")

        mappings_list = list(case_mappings.items())

        # Make sure we include each mapping in turkishMapping in the cases
        # below. This ensures we handle them even if we'd skip the letter in
        # non-turkish mode (such a codepoint maps to itself outside Turkish
        # mode).
        mappings_list.extend(
            (codepoint, codepoint)
            for codepoint in turkishMapping
            if codepoint not in case_mappings)

        mappings_list.sort(key=lambda mapping: mapping[0])

        for mapping in mappings_list:
            if mapping[0] <= 0x7f:
                continue  # ascii is special cased above.

            if mapping[0] in turkishMapping:
                out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
                          % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
            else:
                out.write("case 0x%x: return 0x%x;\n" % mapping)

        out.write("    default: return codepoint;\n    }\n}")

        out.write(closeNamespaces())
예제 #11
0
def generate(unicode_proplist_file, target):
    """Generate a C++ source file containing a delimiter checking function.

    The generated file holds two 128-entry lookup tables for ASCII (the
    English table forces the apostrophe entry to 0) and a
    ``codepointIsDelimiter`` function. That function indexes the table for
    the requested language when the codepoint is at most 0x7f, and otherwise
    uses a switch statement that returns true for every non-ASCII codepoint
    the property list file tags with one of the properties in
    ``delim_properties``.

    Args:
        unicode_proplist_file: Path of the property list to read. Each data
            line holds two "; "-separated fields: a hexadecimal codepoint or
            an inclusive ``start..end`` range, then a property name. Text
            after a ``#`` is ignored.
        target: Path of the C++ source file to write (opened in "w" mode).

    Raises:
        AssertionError: If a data line does not split into exactly two
            fields.
    """
    delim_properties = ("White_Space",
                        "Dash",
                        "Hyphen",
                        "Quotation_Mark",
                        "Terminal_Punctuation",
                        "Pattern_Syntax",
                        "STerm")

    # Context managers close both files even if parsing or a helper fails.
    with open(target, "w") as out:
        out.write(getCopyrightNotice())
        out.write(include("mongo/db/fts/unicode/codepoints.h"))
        out.write("\n")
        out.write(openNamespaces())

        delim_codepoints = set()

        # Plain 'r' is used: text mode already translates newlines, and the
        # 'U' flag is rejected by Python 3.11 and later.
        with open(unicode_proplist_file, 'r') as proplist_file:
            for line in proplist_file:
                # Drop the trailing comment, then skip blank and
                # comment-only lines. partition() leaves a line without '#'
                # intact instead of chopping its last character.
                data = line.partition('#')[0]
                if not data.strip():
                    continue

                # Parse the data on the line
                values = data.split("; ")
                assert len(values) == 2, "unexpected line: %r" % line

                if values[1].strip() not in delim_properties:
                    continue

                bounds = values[0].split('..')
                if len(bounds) == 2:
                    # "start..end" denotes an inclusive codepoint range.
                    delim_codepoints.update(
                        range(int(bounds[0], 16), int(bounds[1], 16) + 1))
                else:
                    delim_codepoints.add(int(values[0], 16))

        # As of Unicode 8.0.0, all of the delimiters we used for text index
        # version 2 are also in the list.
        out.write("static const bool englishAsciiDelimiters[128] = {\n")
        for cp in range(0x80):
            if cp == ord("'"):
                out.write("    0, // ' special case\n")
            else:
                # %d renders the membership bool as 1 or 0.
                out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
        out.write("};\n")

        out.write("static const bool nonEnglishAsciiDelimiters[128] = {\n")
        for cp in range(0x80):
            out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
        out.write("};\n")

        out.write(
            "bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {\n"
            "    if (codepoint <= 0x7f) {\n"
            "        if (lang == DelimiterListLanguage::kEnglish) {\n"
            "            return englishAsciiDelimiters[codepoint];\n"
            "        }\n"
            "        return nonEnglishAsciiDelimiters[codepoint];\n"
            "    }\n"
            "\n"
            "    switch (codepoint) {\n")

        for delim in sorted(delim_codepoints):
            if delim <= 0x7f:  # ascii codepoints handled in lists above.
                continue
            out.write("    case " + hex(delim) + ": return true;\n")

        out.write("    default: return false;\n    }\n}")

        out.write(closeNamespaces())