def generate(target):
    """Generates a C++ source file that contains a diacritic removal mapping
    function.

    The diacritic removal mapping function contains a switch statement with
    cases for every character in Unicode that has a removable combining
    diacritical mark.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    # Map diacritics from 0 to the maximum Unicode codepoint
    add_diacritic_range(0x0000, 0x10FFFF)

    out.write("""char32_t codepointRemoveDiacritics(char32_t codepoint) {
    switch (codepoint) {\n""")

    mappings_list = []

    for mapping in diacritic_mappings:
        mappings_list.append((mapping, diacritic_mappings[mapping]))

    sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])

    for mapping in sorted_mappings:
        out.write("    case " + str(hex(mapping[0])) + ": return " +
                  str(hex(mapping[1])) + ";\n")

    out.write("    default: return codepoint;\n    }\n}")

    out.write(closeNamespaces())

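
# The generator above relies on diacritic_mappings and add_diacritic_range(),
# which are defined elsewhere in this module. The sketch below is an
# illustrative reconstruction (an assumption, not the original implementation)
# of how such helpers can be built with the standard unicodedata module:
# decompose each character, drop combining marks, and recompose.
import unicodedata

diacritic_mappings = {}


def add_diacritic_mapping(codepoint):
    # Skip surrogate code points, which are not characters.
    if 0xD800 <= codepoint <= 0xDFFF:
        return
    original = chr(codepoint)
    decomposed = unicodedata.normalize('NFD', original)
    stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
    recomposed = unicodedata.normalize('NFC', stripped)
    # Only keep mappings where removing the diacritics yields a single,
    # different codepoint.
    if recomposed != original and len(recomposed) == 1:
        diacritic_mappings[codepoint] = ord(recomposed)


def add_diacritic_range(start, end):
    # Register a removable-diacritic mapping for every codepoint in [start, end].
    for codepoint in range(start, end + 1):
        add_diacritic_mapping(codepoint)
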
def generate(unicode_proplist_file, target):
    """Generates a C++ source file that contains a diacritic checking function.

    The diacritic checking function contains a switch statement with cases for
    every diacritic in the Unicode Character Database.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    diacritics = set()

    proplist_file = open(unicode_proplist_file, 'r')

    for line in proplist_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 2

        uproperty = values[1].strip()
        if uproperty == "Diacritic":
            if len(values[0].split('..')) == 2:
                codepoint_range = values[0].split('..')
                start = int(codepoint_range[0], 16)
                end = int(codepoint_range[1], 16) + 1
                for i in range(start, end):
                    diacritics.add(i)
            else:
                diacritics.add(int(values[0], 16))

    out.write("""bool codepointIsDiacritic(char32_t codepoint) {
    switch (codepoint) {\n""")

    for diacritic in sorted(diacritics):
        out.write("    case " + str(hex(diacritic)) + ": return true;\n")

    out.write("    default: return false;\n    }\n}")

    out.write(closeNamespaces())

def generate(unicode_casefold_file, target):
    """Generates a C++ source file that contains a Unicode case folding function.

    The case folding function contains a switch statement with cases for every
    Unicode codepoint that has a case folding mapping.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    case_mappings = {}

    cf_file = open(unicode_casefold_file, "rU")

    for line in cf_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find("#")]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 4

        status = values[1]
        if status == "C" or status == "S":
            # We only include the "Common" and "Simple" mappings. "Full" case
            # folding mappings expand certain letters to multiple codepoints,
            # which we currently do not support.
            original_codepoint = int(values[0], 16)
            codepoint_mapping = int(values[2], 16)
            case_mappings[original_codepoint] = codepoint_mapping

    out.write("""char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
    if (mode == CaseFoldMode::kTurkish) {
        if (codepoint == 0x049) { // I -> ı
            return 0x131;
        } else if (codepoint == 0x130) { // İ -> i
            return 0x069;
        }
    }
    switch (codepoint) {\n""")

    mappings_list = []

    for mapping in case_mappings:
        mappings_list.append((mapping, case_mappings[mapping]))

    sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])

    for mapping in sorted_mappings:
        out.write("    case " + str(hex(mapping[0])) + ": return " +
                  str(hex(mapping[1])) + ";\n")

    out.write("    default: return codepoint;\n    }\n}")

    out.write(closeNamespaces())

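
# For reference, a CaseFolding.txt entry has the form
# "0041; C; 0061; # LATIN CAPITAL LETTER A" (code; status; mapping; comment).
# The snippet below is illustrative only (not part of the generator) and shows
# how the parsing above turns one such line into a case_mappings entry.
sample_line = "0041; C; 0061; # LATIN CAPITAL LETTER A\n"
sample_data = sample_line[:sample_line.find('#')]  # "0041; C; 0061; "
sample_values = sample_data.split("; ")            # ["0041", "C", "0061", ""]
assert len(sample_values) == 4
assert sample_values[1] in ('C', 'S')
assert (int(sample_values[0], 16), int(sample_values[2], 16)) == (0x41, 0x61)
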
def generate(unicode_casefold_file, target):
    """Generates a C++ source file that contains a Unicode case folding function.

    The case folding function contains a switch statement with cases for every
    Unicode codepoint that has a case folding mapping.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    case_mappings = {}

    cf_file = open(unicode_casefold_file, 'rU')

    for line in cf_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 4

        status = values[1]
        if status == 'C' or status == 'S':
            # We only include the "Common" and "Simple" mappings. "Full" case
            # folding mappings expand certain letters to multiple codepoints,
            # which we currently do not support.
            original_codepoint = int(values[0], 16)
            codepoint_mapping = int(values[2], 16)
            case_mappings[original_codepoint] = codepoint_mapping

    turkishMapping = {
        0x49: 0x131,   # I -> ı
        0x130: 0x069,  # İ -> i
    }

    out.write("""char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
    if (codepoint <= 0x7f) {
        if (codepoint >= 'A' && codepoint <= 'Z') {
            return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
                ? 0x131
                : (codepoint | 0x20); // Set the ascii lowercase bit on the character.
        }
        return codepoint;
    }

    switch (codepoint) {\n""")

    mappings_list = []

    for mapping in case_mappings:
        mappings_list.append((mapping, case_mappings[mapping]))

    # Make sure we include each mapping in turkishMapping in the cases below.
    # This ensures we handle them even if we'd skip the letter in non-turkish
    # mode.
    for mapping in turkishMapping:
        if mapping not in case_mappings:
            mappings_list.append((mapping, mapping))

    sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])

    for mapping in sorted_mappings:
        if mapping[0] <= 0x7f:
            continue  # ascii is special cased above.

        if mapping[0] in turkishMapping:
            out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
                      % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
        else:
            out.write("case 0x%x: return 0x%x;\n" % mapping)

    out.write("    default: return codepoint;\n    }\n}")

    out.write(closeNamespaces())

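
# The ASCII fast path emitted above relies on the fact that setting bit 0x20
# turns an ASCII uppercase letter into its lowercase counterpart (Turkish 'I'
# being the one exception, handled separately in the generated code). A quick
# illustrative self-check, not part of the generator:
for _upper in range(ord('A'), ord('Z') + 1):
    assert chr(_upper | 0x20) == chr(_upper).lower()
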
def generate(unicode_proplist_file, target):
    """Generates a C++ source file that contains a delimiter checking function.

    The delimiter checking function contains a switch statement with cases for
    every delimiter in the Unicode Character Database with the properties
    specified in delim_properties.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    delim_codepoints = set()

    proplist_file = open(unicode_proplist_file, 'r')

    delim_properties = [
        "White_Space",
        "Dash",
        "Hyphen",
        "Quotation_Mark",
        "Terminal_Punctuation",
        "Pattern_Syntax",
        "STerm",
    ]

    for line in proplist_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 2

        uproperty = values[1].strip()
        if uproperty in delim_properties:
            if len(values[0].split('..')) == 2:
                codepoint_range = values[0].split('..')
                start = int(codepoint_range[0], 16)
                end = int(codepoint_range[1], 16) + 1
                for i in range(start, end):
                    delim_codepoints.add(i)
            else:
                delim_codepoints.add(int(values[0], 16))

    # As of Unicode 8.0.0, all of the delimiters we used for text index
    # version 2 are also in the list.
    out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {
    if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') {
        return false;
    }
    switch (codepoint) {\n""")

    for delim in sorted(delim_codepoints):
        out.write("    case " + str(hex(delim)) + ": return true;\n")

    out.write("    default: return false;\n    }\n}")

    out.write(closeNamespaces())

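
# The first field of a PropList.txt entry is either a single codepoint or a
# range such as "2010..2015" (the Dash property). The snippet below is
# illustrative only (not part of the generator) and mirrors the range handling
# above: both endpoints are hexadecimal and the range is inclusive.
sample_field = "2010..2015"
sample_range = sample_field.split('..')
assert len(sample_range) == 2
assert list(range(int(sample_range[0], 16), int(sample_range[1], 16) + 1)) == [
    0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015]
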
def generate(unicode_proplist_file, target):
    """Generates a C++ source file that contains a delimiter checking function.

    The delimiter checking function contains a switch statement with cases for
    every delimiter in the Unicode Character Database with the properties
    specified in delim_properties.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    delim_codepoints = set()

    proplist_file = open(unicode_proplist_file, 'r')

    delim_properties = [
        "White_Space",
        "Dash",
        "Hyphen",
        "Quotation_Mark",
        "Terminal_Punctuation",
        "Pattern_Syntax",
        "STerm",
    ]

    for line in proplist_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 2

        uproperty = values[1].strip()
        if uproperty in delim_properties:
            if len(values[0].split('..')) == 2:
                codepoint_range = values[0].split('..')
                start = int(codepoint_range[0], 16)
                end = int(codepoint_range[1], 16) + 1
                for i in range(start, end):
                    delim_codepoints.add(i)
            else:
                delim_codepoints.add(int(values[0], 16))

    # As of Unicode 8.0.0, all of the delimiters we used for text index
    # version 2 are also in the list.
    out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {
    if (lang == DelimiterListLanguage::kEnglish && codepoint == '\\'') {
        return false;
    }

    // Most characters are latin letters, so filter those out first.
    if (codepoint >= 'A' && codepoint <= 'Z') {
        return false;
    } else if (codepoint >= 'a' && codepoint <= 'z') {
        return false;
    }

    switch (codepoint) {\n""")

    for delim in sorted(delim_codepoints):
        out.write("    case " + str(hex(delim)) + ": return true;\n")

    out.write("    default: return false;\n    }\n}")

    out.write(closeNamespaces())

def generate(unicode_proplist_file, target):
    """Generates a C++ source file that contains a delimiter checking function.

    The delimiter checking function contains a switch statement with cases for
    every delimiter in the Unicode Character Database with the properties
    specified in delim_properties.
    """
    out = open(target, "w")

    out.write(getCopyrightNotice())
    out.write(include("mongo/db/fts/unicode/codepoints.h"))
    out.write("\n")
    out.write(openNamespaces())

    delim_codepoints = set()

    proplist_file = open(unicode_proplist_file, 'rU')

    delim_properties = [
        "White_Space",
        "Dash",
        "Hyphen",
        "Quotation_Mark",
        "Terminal_Punctuation",
        "Pattern_Syntax",
        "STerm",
    ]

    for line in proplist_file:
        # Filter out blank lines and lines that start with #
        data = line[:line.find('#')]
        if data == "":
            continue

        # Parse the data on the line
        values = data.split("; ")
        assert len(values) == 2

        uproperty = values[1].strip()
        if uproperty in delim_properties:
            if len(values[0].split('..')) == 2:
                codepoint_range = values[0].split('..')
                start = int(codepoint_range[0], 16)
                end = int(codepoint_range[1], 16) + 1
                for i in range(start, end):
                    delim_codepoints.add(i)
            else:
                delim_codepoints.add(int(values[0], 16))

    # As of Unicode 8.0.0, all of the delimiters we used for text index
    # version 2 are also in the list.

    out.write("static const bool englishAsciiDelimiters[128] = {\n")
    for cp in range(0x80):
        if cp == ord("'"):
            out.write("    0, // ' special case\n")
        else:
            out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
    out.write("};\n")

    out.write("static const bool nonEnglishAsciiDelimiters[128] = {\n")
    for cp in range(0x80):
        out.write("    %d, // 0x%x\n" % (cp in delim_codepoints, cp))
    out.write("};\n")

    out.write("""bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang) {
    if (codepoint <= 0x7f) {
        if (lang == DelimiterListLanguage::kEnglish) {
            return englishAsciiDelimiters[codepoint];
        }
        return nonEnglishAsciiDelimiters[codepoint];
    }

    switch (codepoint) {\n""")

    for delim in sorted(delim_codepoints):
        if delim <= 0x7f:
            continue  # ascii codepoints handled in lists above.
        out.write("    case " + str(hex(delim)) + ": return true;\n")

    out.write("    default: return false;\n    }\n}")

    out.write(closeNamespaces())
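
# Hypothetical command-line driver (an assumption; the invocation is not shown
# in the code above): each generator takes the relevant Unicode data file,
# e.g. PropList.txt or CaseFolding.txt, and the path of the C++ source file to
# write.
if __name__ == "__main__":
    import sys

    generate(sys.argv[1], sys.argv[2])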