def check_is_valid_glyph_string(glyphs, iso=None):
    """
    Validate a glyph sequence like "a b c d e f": it should be a non-empty,
    single-space separated string of single unicode characters.

    @param glyphs str: The glyph sequence to validate.
    @param iso str: Language iso code, used only in log messages.
    @return bool: True if the string is valid, False otherwise.
    """
    # Use isinstance instead of a type() identity check
    if not isinstance(glyphs, str) or len(glyphs) < 1:
        log.error("Do not use empty glyph sequences")
        return False

    if re.findall(r"\n", glyphs):
        log.error("Glyph sequences should not contain line breaks")
        return False

    if re.findall(r" {2,}", glyphs):
        log.error("More than single space in '%s'" % glyphs)
        # Log (instead of print) the offending whitespace runs for debugging
        log.debug("Multiple-space runs: %s" % re.findall(r" {2,}", glyphs))
        return False

    # Standalone marks that are already implicitly present via a
    # decomposable character must not be listed as well
    pruned, removed = prune_superflous_marks(glyphs)

    if len(removed) > 0:
        # Fixed typo in message: "Superflous" -> "Superfluous"
        log.error("Superfluous marks that are implicitly extracted via "
                  "decomposition: '%s'" % "','".join(removed))
        return False

    # Modifier symbols (category Sk) are most likely a mistake for combining
    # marks; warn but do not fail validation
    for c in glyphs:
        if unicodedata2.category(c) == "Sk":
            log.warning("'%s' contains modifier symbol '%s' in characters. It "
                        "is very likely this should be a combining mark "
                        "instead." % (iso, c))

    return True
def is_separator(character: str) -> bool:
    """Return True for whitespace, a small set of separator punctuation
    characters, or any character in a Unicode separator (Z*) category."""
    if character.isspace():
        return True
    if character in ("|", "+", ",", ";", "<", ">"):
        return True
    return unicodedata.category(character).startswith("Z")
def prune_superflous_marks(string):
    """
    From a given string return a set of unique characters with all those
    standalone Mark charaters removed that are already implicitly present
    in a decomposable character

    @param string str
    @return set pruned, set removed
    """
    # NOTE(review): presumably character_list_from_string returns the unique
    # glyph clusters of the input — confirm against its definition
    unique_strings = character_list_from_string(string)
    removed = []

    for c in unique_strings:
        # No need to bother about glyph clusters with more than one character,
        # since that inherently will not be a mistakenly listed mark
        if len(c) > 1:
            continue

        # A single-character combining mark (category M*) is superfluous if
        # any *other* listed glyph decomposes to it
        if unicodedata2.category(c).startswith("M"):
            for s in unique_strings:
                if s != c and c in parse_chars(s):
                    removed.append(c)

    # Nothing pruned: return the original list and an empty tuple — callers
    # only check the length of the second return value
    if removed == []:
        return unique_strings, ()

    pruned = list_unique([c for c in unique_strings if c not in removed])
    removed = list_unique(removed)

    return pruned, removed
def is_separator(character: str) -> bool:
    """True if *character* is whitespace, one of a fixed set of separator
    punctuation marks, or belongs to a Unicode separator (Z*) category."""
    known_separators = {"|", "+", ",", ";", "<", ">"}
    if character.isspace() or character in known_separators:
        return True
    return "Z" in unicodedata.category(character)
def detectScript(txt):
    """
    Assign a script to every character of *txt*.

    Characters with an unknown script inherit the script of the preceding
    character; closing mirrored punctuation (category "Pe") is resolved from
    its neighbours. Remaining unknowns are filled from the *next* resolved
    script, then from the preceding one, with "Zxxx" as the last resort.
    """
    scripts = [script(ch) for ch in txt]

    for idx, ch in enumerate(txt):
        current = scripts[idx]
        if current in UNKNOWN_SCRIPT:
            # Inherit from the (already resolved) previous entry; the very
            # first character stays unresolved for now
            current = scripts[idx - 1] if idx else None
        if ch in MIRRORED and category(ch) == "Pe":
            # Closing mirrored punctuation: resolve from neighbours below
            current = None
        scripts[idx] = current

    # Backward pass: any unknowns take the script of the next resolved one
    following = None
    for idx in reversed(range(len(txt))):
        if scripts[idx] is None:
            scripts[idx] = following
        else:
            following = scripts[idx]

    # Forward pass: unknowns at the end of the string fall back to the
    # preceding script, or "Zxxx" as a last resort
    preceding = "Zxxx"
    for idx in range(len(txt)):
        if scripts[idx] is None:
            scripts[idx] = preceding
        else:
            preceding = scripts[idx]

    assert None not in scripts
    return scripts
def prune_chars(self, retainDecomposed=False):
    """
    A helper to parse all orthographies' charsets in all languages. This
    decomposes glyphs and prunes any glyphs that are redundant. Also
    transforms the dict attributes from strings to lists.

    @param retainDecomposed bool: Passed through to parse_chars; keep
        decomposed components in the parsed lists when True.
    """
    # Hoisted out of the loop; renamed loop variable from `type`, which
    # shadowed the builtin
    charset_keys = ["base", "auxiliary", "numerals", "punctuation", "marks"]

    for lang in self.values():
        if "orthographies" not in lang:
            continue

        for o in lang["orthographies"]:
            for key in charset_keys:
                if key not in o:
                    continue
                o[key] = parse_chars(o[key], True, retainDecomposed)

                # Strip combining marks (category M*) from the parsed
                # "base" list
                if key == "base":
                    o[key] = [
                        c for c in o[key]
                        if not unicodedata2.category(c).startswith("M")
                    ]

            # Remove any components in auxiliary after decomposition
            # that are already in base
            if "base" in o and "auxiliary" in o:
                base = set(o["base"])
                o["auxiliary"] = [a for a in o["auxiliary"] if a not in base]
def getCategoryStr(self):
    """Build a display string: the category code followed by its localized
    category names, separated by ' / '."""
    category_code = unicodedata.category(self.text)
    # Unassigned code points ('Cn') may still carry a category in the
    # bundled unicode data table; fall back to it when present
    if category_code == 'Cn':
        try:
            category_code = unicodeInfo.unicodeData['en'][self.num][1]
        except KeyError:
            pass
    names = []
    for lang in unicodeInfo.langs:
        names.append(self.getCategoryValue(category_code, lang))
    return category_code + ' - ' + ' / '.join(names)
def is_punctuation(character: str) -> bool:
    """True if *character* is Unicode punctuation (category P*) or falls in
    a unicode range whose name contains "Punctuation"."""
    if "P" in unicodedata.category(character):
        return True
    block = unicode_range(character)  # type: Optional[str]
    return block is not None and "Punctuation" in block
def is_symbol(character: str) -> bool:
    """True for Unicode symbols (category S*) and numbers (N*), or for
    characters whose unicode range name contains "Forms"."""
    cat = unicodedata.category(character)  # type: str
    if "S" in cat or "N" in cat:
        return True
    block = unicode_range(character)  # type: Optional[str]
    return block is not None and "Forms" in block
def check_types(Langs):
    """
    Validate the structure of every language entry in *Langs*: list-typed
    attributes, 'base' glyph strings, 'combinations', name attributes and
    'status'. Problems are reported via logging.error; nothing is returned.
    """
    for iso, lang in Langs.items():
        # List-typed top-level attributes
        if "includes" in lang:
            if not check_is_valid_list(lang["includes"]):
                logging.error("'%s' has invalid list 'includes'" % iso)

        if "source" in lang:
            if not check_is_valid_list(lang["source"]):
                logging.error("'%s' has invalid list 'source'" % iso)

        if "orthographies" in lang:
            if not check_is_valid_list(lang["orthographies"]):
                logging.error("'%s' has invalid list 'orthographies'" % iso)

            for o in lang["orthographies"]:
                if "base" in o:
                    # Special-cased extra whitespace check for 'arg'
                    if iso == "arg":
                        for i, c in enumerate(
                                list(o["base"].replace(" ", ""))):
                            if unicodedata2.category(c).startswith("Z"):
                                logging.error("'%s' has invalid whitespace "
                                              "characters '%s' at %d" %
                                              (iso, unicodedata2.name(c), i))

                    if not check_is_valid_glyph_string(o["base"]):
                        logging.error("'%s' has invalid 'base' glyph list"
                                      % iso)

                if "combinations" in o:
                    if not check_is_valid_combation_string(o["combinations"]):
                        logging.error("'%s' has invalid 'combination' string"
                                      % iso)

        # A language needs at least one of 'name'/'preferred_name', and
        # having both identical is redundant
        if "name" not in lang and "preferred_name" not in lang:
            logging.error("'%s' has neither 'name' nor 'preferred_name'"
                          % iso)

        if "name" in lang and "preferred_name" in lang and \
                lang["name"] == lang["preferred_name"]:
            logging.error("'%s' has 'name' and 'preferred_name', but they are "
                          "identical" % iso)

        # if "todo_status" in lang and lang["todo_status"] not in VALID_TODOS:
        #     logging.error("'%s' has an invalid 'todo_status'" % iso)

        if "status" in lang and lang["status"] not in VALID_STATUS:
            logging.error("'%s' has an invalid 'status'" % iso)
def sort_key_character_category(c):
    """
    Sorting comparator to sort unicode characters by their unicode type,
    first Letters (Uppercase, then lowercase, if applicable), then Marks,
    then anything else, secondary sort by unicode ASC
    """
    category_ranks = ["Lu", "Lt", "Ll", "LC", "L", "Lo",
                      "Mn", "Me", "M", "Mc"]

    # First two letters of the character's general category
    cat = unicodedata2.category(c)[:2]

    try:
        rank = category_ranks.index(cat)
    except ValueError:
        # Categories not in the list sort after all listed ones
        rank = len(category_ranks)

    # Zero-padded "rank-codepoint" string yields a two-level sort:
    # primary by category rank, secondary by codepoint ascending
    return "%s-%s" % (str(rank).zfill(2), str(ord(c)).zfill(8))
def parse_marks(input):
    """
    From a space separated string, return all combining-mark characters
    (Unicode category M*).
    """
    marks = []
    for char in parse_chars(input):
        if unicodedata2.category(char).startswith("M"):
            marks.append(char)
    return marks
def is_private_use_only(character: str) -> bool:
    """True if *character*'s Unicode general category is "Co"
    (private use)."""
    return unicodedata.category(character) == "Co"
# return file names: if len(args) < 1: print "Please specify an inputfont." sys.exit(2) elif len(args) < 2: inPath = args[0] outPath = os.path.splitext(inPath)[0] + DEFAULT_OUTPATH_ADDITION.lower().strip() + os.path.splitext(inPath)[1] else: inPath = args[0] outPath = args[1] return inPath, outPath ######################################################################################################### MARK_GLYPH_CODEPOINT_RANGE = [ int(m) for m in range(65000) if unicodedata.category(unichr(m)) == "Mn" ]; m=None # also allow for "M"? MARK_GLYPH_CODEPOINT_RANGE.remove(int("034F", 16)) PPF2_SUPPORTED = 0 # not tested yet! and deactivated ... ######################################################################################################### def saveFile(data,file): modus = "wb" file = os.path.abspath(file) if os.path.exists(file): os.remove(file) directory = os.path.dirname(file) if not os.path.exists(directory): os.makedirs(directory) theFile = open(file,modus) theFile.write(data) theFile.close()
def is_private_use_only(character: str) -> bool:
    """Return True when *character* is assigned to a Private Use Area,
    i.e. its Unicode general category equals "Co"."""
    return "Co" == unicodedata.category(character)
def check_types(Langs):
    """
    Validate the structure of every language entry in *Langs*: list-typed
    attributes, 'base'/'auxiliary' glyph strings, orthography keys and
    statuses, name attributes, 'status', 'validity' and 'speakers'.
    Problems are reported via the module logger; nothing is returned.
    """
    for iso, lang in Langs.items():
        # List-typed top-level attributes
        if "includes" in lang:
            if not check_is_valid_list(lang["includes"]):
                log.error("'%s' has invalid list 'includes'" % iso)

        if "source" in lang:
            if not check_is_valid_list(lang["source"]):
                log.error("'%s' has invalid list 'source'" % iso)

        if "orthographies" in lang:
            if not check_is_valid_list(lang["orthographies"]):
                log.error("'%s' has invalid list 'orthographies'" % iso)

            for o in lang["orthographies"]:
                if "base" in o:
                    # Special-cased extra whitespace check for 'arg'
                    if iso == "arg":
                        chars = list(o["base"].replace(" ", ""))
                        for i, c in enumerate(chars):
                            if unicodedata2.category(c).startswith("Z"):
                                log.error("'%s' has invalid whitespace "
                                          "characters '%s' at %d" %
                                          (iso, unicodedata2.name(c), i))

                    if not check_is_valid_glyph_string(o["base"], iso):
                        log.error("'%s' has invalid 'base' glyph list" % iso)

                if "auxiliary" in o:
                    if not check_is_valid_glyph_string(o["auxiliary"], iso):
                        log.error("'%s' has invalid 'auxiliary' glyph list"
                                  % iso)

                # Only these orthography keys are recognized; anything else
                # is flagged as a warning
                allowed = [
                    "autonym", "inherit", "script", "base", "marks",
                    "auxiliary", "numerals", "status", "note",
                    "punctuation",  # tolerated for now, but unused
                    "preferred_as_group", "design_note"
                ]
                invalid = [k for k in o.keys() if k not in allowed]
                if len(invalid):
                    log.warn("'%s' has invalid orthography keys: '%s'" %
                             (iso, "', '".join(invalid)))

                if "status" not in o:
                    log.error("'%s' has an orthography (script '%s') that is "
                              "missing 'status'" % (iso, o["script"]))
                else:
                    if o["status"] not in ORTHOGRAPHY_STATUSES:
                        log.error("'%s' has an orthography status '%s' which "
                                  "is invalid, should be one of %s" %
                                  (iso, o["status"],
                                   ", ".join(ORTHOGRAPHY_STATUSES)))

            # Exactly one orthography should carry status "primary";
            # none at all is an error
            primary_orthography = [
                o for o in lang["orthographies"]
                if "status" in o and o["status"] == "primary"
            ]
            if len(primary_orthography) == 0:
                log.error("'%s' has no primary orthography" % iso)

        # A language needs at least one of 'name'/'preferred_name', and
        # having both identical is redundant
        if "name" not in lang and "preferred_name" not in lang:
            log.error("'%s' has neither 'name' nor 'preferred_name'" % iso)

        if "name" in lang and "preferred_name" in lang and \
                lang["name"] == lang["preferred_name"]:
            log.error("'%s' has 'name' and 'preferred_name', but they are "
                      "identical" % iso)

        if "status" in lang and lang["status"] not in STATUSES:
            log.error("'%s' has an invalid 'status'" % iso)

        if "validity" not in lang:
            log.warn("'%s' is missing 'validity'" % iso)

        if "validity" in lang and lang["validity"] not in VALIDITYLEVELS:
            log.error("'%s' has invalid 'validity'" % iso)

        # 'speakers' must be purely numeric (digits only)
        if "speakers" in lang:
            if (re.search(r"[^\d]", str(lang["speakers"]))):
                log.error("'%s' has invalid 'speakers' '%s' - only numbers "
                          "are allowed" % (iso, lang["speakers"]))
def save_sorted(Langs=None):
    """
    Helper script to re-save the hyperglot.yaml sorted alphabetically,
    alternatively from the passed in Langs object (which can have been
    modified)
    """
    log.setLevel(logging.WARNING)
    if Langs is None:
        Langs = Languages(inherit=False, prune=False)

    print("Running pre-save validation, please fix any issues flagged.")
    # validate()

    # Save with removed superflous marks
    for iso, lang in Langs.items():
        if "orthographies" in lang:
            for i, o in enumerate(lang["orthographies"]):
                for type in ["base", "auxiliary", "numerals"]:
                    if type in o:
                        chars = o[type]
                        pruned, removed = prune_superflous_marks(
                            " ".join(o[type]))

                        if len(removed) > 0:
                            log.info("Saving '%s' with '%s' pruned of "
                                     "superfluous marks (implicitly "
                                     "included in combining glyphs): "
                                     "%s" % (iso, type, "','".join(removed)))
                            chars = pruned

                        # Do not include anything (after decomposition)
                        # that is already listed in base
                        if "base" in o and type != "base":
                            chars = [c for c in chars if c not in o["base"]]

                        joined = " ".join(chars)
                        Langs[iso]["orthographies"][i][type] = joined

                # Automate extracting and writing marks (in addition to any
                # that might have been defined manually). Note that we only
                # extract marks from 'base' since 'marks' are part of the
                # base level checking. Marks in 'auxiliary' will simply be
                # saved (if necessary) in 'auxiliary'.
                marks = []
                if "marks" in o:
                    marks = parse_chars(o["marks"], decompose=True,
                                        retainDecomposed=False)
                if "base" in o:
                    marks = set(marks + parse_marks(o["base"]))

                if len(marks) > 0:
                    # Note: Let's store marks with two spaces between to
                    # make them more legible; when parsing the attribute
                    # back in all whitespaces are removed
                    o["marks"] = " ".join(sorted(marks))

                if "base" in o:
                    base, removed = prune_superflous_marks(
                        " ".join(o["base"]))

                    # Save base without marks
                    _base = [
                        c for c in base
                        if not uni.category(c).startswith("M")
                    ]
                    o["base"] = " ".join(_base)

    # Sort by keys
    alphabetic = dict(OrderedDict(sorted(Langs.items())))

    # NOTE(review): `file` shadows the builtin name; the handle is also not
    # explicitly closed — relies on interpreter cleanup
    file = open(DB, "w")
    yaml.dump(alphabetic, file, **DUMP_ARGS)
    print("Saved lib/hyperglot/hyperglot.yaml")
def get_stats_from_chars(text_chars, db=None):
    """
    Categorize the characters found in a text and, if a database entry is
    given, compare them against its orthographies.

    @param text_chars list: Single characters (as strings) found in the text.
    @param db dict: Optional language database entry with "orthographies"
        (each possibly containing "base"/"auxiliary" character lists) to
        compare against. When None, only the categorization is reported.
    @return dict: Report with categorized characters and, when *db* is
        given, the comparison results.
    """
    report = {}

    uppercase = []
    numerals = []
    punctuation = []
    controlchars = []
    spaces = []
    other = []

    # Include decomposed forms (note: iterates the original list while
    # rebinding text_chars, so decompositions-of-decompositions of the
    # appended components are not re-expanded — behavior preserved)
    for c in text_chars:
        decomposed = ud.normalize("NFKD", c)
        if len(decomposed) > 1:
            text_chars = text_chars + [d for d in decomposed]

    text_chars = set(text_chars)

    for c in text_chars:
        cat = ud.category(c)
        if cat == "Lu":
            uppercase.append(c)
        elif cat.startswith("N"):
            numerals.append(c)
        elif cat.startswith("P"):
            punctuation.append(c)
        elif cat.startswith("C") and len(c) > 1:
            # NOTE(review): single characters always have len(c) == 1, so
            # this branch never fires for plain control characters (they
            # fall into `other`) — confirm whether that is intended
            controlchars.append(c)
        elif cat.startswith("Z"):
            spaces.append(c)
        else:
            other.append(c)

    # Remove all but "other" from chars, we don't care about them for diffing
    for remove in [
            uppercase, numerals, punctuation, controlchars, spaces,
            ["\n", "\t"]
    ]:
        text_chars = text_chars.difference(set(remove))

    report["iso_in_db"] = db is not None
    report["found_in_text"] = {
        "uppercase": sorted(uppercase),
        "numerals": sorted(numerals),
        "punctuation": sorted(punctuation),
        "chars": sorted(text_chars)
    }

    # Compare to orthographies
    if db is not None:
        db_chars = []
        if "orthographies" in db:
            for o in db["orthographies"]:
                if "base" in o:
                    db_chars = db_chars + o["base"]
                if "auxiliary" in o:
                    db_chars = db_chars + o["auxiliary"]

        # Fixed: sorting before set() was a pointless no-op
        db_chars = set(db_chars)

        not_in_db = text_chars.difference(db_chars)
        missing_from_text = db_chars.difference(text_chars)
        decomposed = set(parse_chars("".join(text_chars), decompose=True))

        # Collect decomposed components of unmatched text characters that
        # the database does not cover either
        missing_from_db = ""
        for c in not_in_db:
            missing = ud.normalize("NFKD", c)
            missing_parts = ""
            for part in missing:
                if part not in db_chars:
                    missing_parts = missing_parts + part
            # BUGFIX: previously compared the *string* to [] (always True);
            # test truthiness instead so empty results are skipped
            if missing_parts:
                missing_from_db = missing_from_db + missing_parts

        missing_from_db = sorted(list(set(missing_from_db)))

        report["not_in_text"] = sorted(missing_from_text)
        report["not_in_db"] = sorted(not_in_db)
        if missing_from_db:
            report["missing_from_db"] = missing_from_db
        report["db_chars_valid"] = decomposed.issubset(db_chars)

    return report
def infoline(cp):
    """Return a row of display fields for codepoint *cp*: octal, decimal,
    hex, html field, printable character, block name and character name."""
    info = get_info(cp)
    if category(info.char).startswith('C'):
        # C* (control/other) characters would garble output; show the
        # escaped representation instead
        shown = info.char.encode('unicode_escape').decode()
    else:
        shown = info.char
    return [oct(info.cp), info.cp, hex(info.cp), info.html, shown,
            info.block.name, info.name]
def remove_accents(s):
    """Strip combining marks (category Mn) from *s* after NFD
    decomposition."""
    decomposed = unicodedata2.normalize('NFD', s)
    kept = [c for c in decomposed if unicodedata2.category(c) != 'Mn']
    return ''.join(kept)
def unicodeToAscii(s):
    """NFD-decompose *s* and drop every non-spacing mark (category Mn),
    leaving the base characters."""
    result = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            result.append(ch)
    return ''.join(result)