def ldml_write(self, exemplars, sort=True):
    """Write exemplars to a string that can be written to a LDML formatted file.

    Args:
        exemplars: Iterable of exemplar strings (single characters or
            multigraphs).
        sort: When True (default), sort the exemplars, as UTS #35 requires
            for exemplar sets; when False, keep the caller's order (the
            graphemes are assumed to already be sorted by frequency).

    Returns:
        A space-joined string of the kept exemplars when self.unittest is
        set, otherwise the UnicodeSet text produced by
        palaso.sldr.UnicodeSets.list2us().
    """
    if sort:
        # Exemplars mentioned in UTS #35 need to be sorted.
        # sorted() already builds the list; no manual append loop needed.
        list_exemplars = sorted(exemplars)
    else:
        # Graphemes should be sorted by frequency, and since they already
        # are, do nothing further here with the order.
        list_exemplars = exemplars

    list_nfc_exemplars = map(self.ucd.normalize_nfc, list_exemplars)

    # Ignore exemplars not of the most common script found in the data.
    # The main script is loop-invariant, so look it up once instead of
    # calling self._get_script() for every exemplar.
    main_script = self._get_script()
    list_nfc_exemplars_main_script = []
    for exemplar in list_nfc_exemplars:
        char = exemplar[0]  # only look at the first character in an exemplar.
        script = Script.getScript(char)
        script_name = Script.getShortName(script)
        # Keep exemplars of the main script, plus script-neutral characters
        # (Common/Inherited), which belong to every script.
        if script_name == main_script or not self.ucd.is_specific_script(char):
            list_nfc_exemplars_main_script.append(exemplar)

    if self.unittest:
        return ' '.join(list_nfc_exemplars_main_script)
    return palaso.sldr.UnicodeSets.list2us(list_nfc_exemplars_main_script, self.ucd)
def ldml_write(self, exemplars, sort=True):
    """Write exemplars to a string that can be written to a LDML formatted file.

    Args:
        exemplars: Iterable of exemplar strings (characters or multigraphs).
        sort: When True (default), sort the exemplars as UTS #35 requires;
            when False, preserve the caller's frequency-based order.

    Returns:
        A space-joined string when self.unittest is set, otherwise the
        UnicodeSet text from palaso.sldr.UnicodeSets.list2us().
    """
    if sort:
        # Exemplars mentioned in UTS #35 need to be sorted; sorted()
        # replaces the former manual append loop.
        list_exemplars = sorted(exemplars)
    else:
        # Graphemes should be sorted by frequency, and since they already
        # are, do nothing further here with the order.
        list_exemplars = exemplars

    list_nfc_exemplars = map(self.ucd.normalize_nfc, list_exemplars)

    # Ignore exemplars not of the most common script found in the data.
    # Hoist the loop-invariant main-script lookup out of the loop.
    main_script = self._get_script()
    list_nfc_exemplars_main_script = []
    for exemplar in list_nfc_exemplars:
        char = exemplar[0]  # only look at the first character in an exemplar.
        script = Script.getScript(char)
        script_name = Script.getShortName(script)
        # Script-neutral (Common/Inherited) characters are kept as well.
        if script_name == main_script or not self.ucd.is_specific_script(char):
            list_nfc_exemplars_main_script.append(exemplar)

    if self.unittest:
        return ' '.join(list_nfc_exemplars_main_script)
    return palaso.sldr.UnicodeSets.list2us(list_nfc_exemplars_main_script, self.ucd)
def is_specific_script(char):
    """True if the character has a specific Script property,
    that is, not the values Common or Inherited.

    Args:
        char: Single character to classify.

    Returns:
        bool: True for a specific script, False for Common or Inherited.
    """
    script = Script.getScript(char)
    script_code = Script.getScriptCode(script)
    # Return the boolean expression directly instead of the
    # if/return False/return True ladder.
    return script_code not in (UScriptCode.COMMON, UScriptCode.INHERITED)
def make_json(puz_uid, hint_uid, size=(30, 30), limit=1000, sample=100, numtrans=3):
    """Generate a crossword puzzle, write it to puzzle.json, and return the word list.

    Args:
        puz_uid: Identifier passed to get_script()/gen_puzzle2() for the puzzle text.
        hint_uid: Identifier passed to translate_clues() for the hints.
        size: (rows, cols) grid size forwarded to gen_puzzle2().
        limit: Candidate limit forwarded to gen_puzzle2().
        sample: NOTE(review): unused in this body — kept for interface
            compatibility; TODO confirm whether callers rely on it.
        numtrans: Number of translations forwarded to translate_clues().

    Returns:
        The list of clue records written to the JSON file.
    """
    script = get_script(puz_uid)
    # Right-to-left scripts need the grid columns mirrored.
    rtl = Script(Script.getCode(script)[0]).isRightToLeft()
    c = translate_clues(gen_puzzle2(puz_uid, size, limit, script), puz_uid, hint_uid, numtrans)

    grid = []
    for row in c.best_grid:
        # Blank cells are represented as "" in the JSON output.
        new_row = [cell if cell != " " else "" for cell in row]
        if rtl:
            new_row = new_row[::-1]
        grid.append(new_row)

    wl = []
    for clue in sorted(c.best_wordlist, key=lambda x: (x[4], x[2], x[3])):
        new_clue = clue[:]  # copy so the original record is not mutated
        if rtl:
            # Mirror the column coordinate to match the reversed grid rows.
            new_clue[3] = len(grid[0]) - 1 - clue[3]
        wl.append(new_clue)

    # True when every answer letter is uppercase (generator expression
    # avoids building an intermediate list of joined strings).
    allcaps = "".join("".join(clue[0]) for clue in wl).isupper()

    # Bug fix: the file handle returned by open() was never closed;
    # a with-block guarantees the file is flushed and closed.
    with open("puzzle.json", "w") as f:
        json.dump({"grid": grid, "clues": wl, "rtl": rtl, "allcaps": allcaps}, f)
    return wl
def _get_script(self):
    """Return the short name of the most frequently occurring script,
    or the empty string when no script counts have been recorded.

    Reads:
        self.scripts: Counter of script codes seen in the data.
        self.codes_for_scripts: mapping from script code to Script object.
    """
    most_common = self.scripts.most_common(1)
    # Truthiness instead of len(...) == 0; early return for the empty case.
    if not most_common:
        return ''
    # most_common(1) yields at most one (script_code, count) pair.
    script_code, _count = most_common[0]
    script = self.codes_for_scripts[script_code]
    return Script.getShortName(script)
def normalize_string(in_str, allowed_scripts):
    """
    Normalizes in_str by replacing letters and digits in other scripts with
    exemplar values.

    Args:
        in_str: String to process
        allowed_scripts: List of script short names (like "Mymr") to preserve

    Returns:
        The normalized string.
    """
    # TODO: Consider checking ScriptExtensions here as well
    # Collect pieces in a list and join once: repeated `output += ch`
    # is quadratic on interpreters without the CPython += optimization.
    pieces = []
    for ch in in_str:
        ch_script = Script.getScript(ch)
        ch_type = Char.charType(ch)
        ch_bucket = CHAR_TYPE_TO_BUCKET[ch_type]
        ch_digit = Char.digit(ch)
        if ch_script.getShortName() in allowed_scripts:
            # ch is in an allowed script:
            # copy directly to the output
            pieces.append(ch)
        elif ch_bucket == 1:
            # ch is a letter in a disallowed script:
            # normalize to the sample char for that script
            pieces.append(Script.getSampleString(ch_script))
        elif ch_bucket == 3 and ch_digit != -1:
            # ch is a decimal digit in a disallowed script:
            # normalize to the zero digit in that numbering system
            pieces.append(chr(ord(ch) - ch_digit))
        elif ch_type == UCharCategory.CURRENCY_SYMBOL:
            # ch is a currency symbol in a disallowed script:
            # normalize to $
            pieces.append("$")
        else:
            # all other characters:
            # copy directly to the output
            pieces.append(ch)
    return "".join(pieces)
def process(self, text):
    """Analyze a string.

    NFD-normalizes the text, then tallies per-character script counts into
    self.scripts/self.codes_for_scripts and grapheme-cluster counts into
    self.clusters.
    """
    i = 0
    text = self.ucd.normalize('NFD', text)

    # Record script of each character.
    for char in text:
        script = Script.getScript(char)
        script_code = Script.getScriptCode(script)
        self.scripts[script_code] += 1
        self.codes_for_scripts[script_code] = script

    # Record clusters
    while i < len(text):
        # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
        # of multigraphs already specified in a LDML file.
        # Longest possible matches are looked at first.
        for multigraph_length in range(self.max_multigraph_length, 0, -1):
            multigraph = text[i:i + multigraph_length]
            if (multigraph in self._main or multigraph in self._auxiliary or multigraph in self._index or multigraph in self._punctuation):
                exemplar = Exemplar(multigraph)
                self.clusters[exemplar] += 1
                i += multigraph_length
                # NOTE(review): after a multigraph match, control falls
                # through to the single-character handling below at the NEW
                # position, without re-checking for a second multigraph
                # there — possibly intended, but worth confirming.
                break

        # No multigraphs were found at this position,
        # so continue processing a single character
        # if we have not gone beyond the end of the text.
        if not i < len(text):
            break
        char = text[i]

        # Test for punctuation.
        if self.ucd.ispunct(char):
            exemplar = Exemplar(char)
            self.clusters[exemplar] += 1
            i += 1
            continue

        # Find grapheme clusters.

        # Ensure exemplar base has needed properties.
        if not self.allowable(char):
            i += 1
            continue

        # The current character is a base character.
        base = char

        # Then find the end of the cluster
        # (which may consist of only base characters).
        length = base_length = 1
        while i + length < len(text):
            trailer = text[i + length]
            if Char.hasBinaryProperty(trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                # A Default_Ignorable_Code_Point was found, so the cluster continues.
                length += 1
                continue
            if self.ucd.ismark(trailer):
                # A Mark was found, so the cluster continues.
                length += 1

                # Marks such as nuktas are considered part of the base.
                if self.ucd.is_always_combine(trailer):
                    # A Mark such as a nukta was found, so the base continues,
                    # as well as the cluster.
                    base_length += 1
                    base = text[i:i + base_length]
                continue
            else:
                # No more marks, so the end of the cluster has been reached.
                break

        # Extract cluster

        # If no nuktas have been found,
        # then the base will be the single character already called base (or char).
        # If no non-nukta marks have been found,
        # then the trailers variable will be an empty string.
        trailers = text[i + base_length:i + length]
        exemplar = Exemplar(base, trailers)
        self.clusters[exemplar] += 1

        i += length
def process(self, text):
    """Analyze a string.

    NFD-normalizes the text, then tallies per-character script counts into
    self.scripts/self.codes_for_scripts and grapheme-cluster counts into
    self.clusters. This variant also treats ZWJ/ZWNJ as cluster extenders.
    """
    i = 0
    text = self.ucd.normalize('NFD', text)

    # Record script of each character.
    for char in text:
        script = Script.getScript(char)
        script_code = Script.getScriptCode(script)
        self.scripts[script_code] += 1
        self.codes_for_scripts[script_code] = script

    # Record clusters
    while i < len(text):
        # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
        # of multigraphs already specified in a LDML file.
        # Longest possible matches are looked at first.
        for multigraph_length in range(self.max_multigraph_length, 0, -1):
            multigraph = text[i:i + multigraph_length]
            if (multigraph in self._main or multigraph in self._auxiliary or multigraph in self._index or multigraph in self._punctuation):
                exemplar = Exemplar(multigraph)
                self.clusters[exemplar] += 1
                i += multigraph_length
                # NOTE(review): after a multigraph match, control falls
                # through to the single-character handling below at the NEW
                # position, without re-checking for a second multigraph
                # there — possibly intended, but worth confirming.
                break

        # No multigraphs were found at this position,
        # so continue processing a single character
        # if we have not gone beyond the end of the text.
        if not i < len(text):
            break
        char = text[i]

        # Test for punctuation.
        if self.ucd.ispunct(char):
            exemplar = Exemplar(char)
            self.clusters[exemplar] += 1
            i += 1
            continue

        # Find grapheme clusters.

        # Ensure exemplar base has needed properties.
        if not self.allowable(char):
            i += 1
            continue

        # The current character is a base character.
        base = char

        # Then find the end of the cluster
        # (which may consist of only base characters).
        length = base_length = 1
        while i + length < len(text):
            trailer = text[i + length]
            if self.ucd.is_zwj(trailer):
                # ZWJ found, so the cluster continues.
                length += 1
                continue
            if self.ucd.is_zwnj(trailer):
                # ZWNJ found, so the end of the cluster has been reached,
                # but the ZWNJ itself is included in the cluster.
                length += 1
                break
            if self.ucd.ismark(trailer):
                # A Mark was found, so the cluster continues.
                length += 1

                # Marks such as nuktas are considered part of the base.
                if self.ucd.is_always_combine(trailer):
                    # A Mark such as a nukta was found, so the base continues,
                    # as well as the cluster.
                    base_length += 1
                    base = text[i:i + base_length]
                continue
            else:
                # No more marks, so the end of the cluster has been reached.
                break

        # Extract cluster

        # If no nuktas have been found,
        # then the base will be the single character already called base (or char).
        # If no non-nukta marks have been found,
        # then the trailers variable will be an empty string.
        trailers = text[i + base_length:i + length]
        exemplar = Exemplar(base, trailers)
        self.clusters[exemplar] += 1

        i += length