def read_fontconfig_orth(path):
    """Parse a fontconfig *.orth file into its character set and references.

    Each line is scanned for URLs (collected as references) before its
    '#' comment and anything after the first tab are stripped.  What is
    left is either empty, an 'include <file>' directive resolved relative
    to the directory of `path`, or a hexadecimal code point / inclusive
    'start-end' code point range.

    Returns a tuple (icu.UnicodeSet, [references]).  Only the character
    set of an included file is merged; its references are not.

    Raises ValueError(path) when a line holds more than two '-'-separated
    hexadecimal values.
    """
    base_dir = os.path.dirname(path)
    references = [
        'https://cgit.freedesktop.org/fontconfig/tree/fc-lang/' +
        os.path.basename(path)
    ]
    charset = icu.UnicodeSet()
    with codecs.open(path, 'r', 'utf-8') as orth:
        for raw in orth:
            # URLs are harvested from the raw line, comments included.
            references.extend(extract_urls(raw))
            entry = raw.split('#')[0].strip().split('\t')[0].strip()
            if not entry:
                continue
            if entry.startswith('include '):
                included = os.path.join(base_dir, entry.split()[1])
                charset.addAll(read_fontconfig_orth(included)[0])
                continue
            bounds = [int(part, 16) for part in entry.split('-')
                      if part.strip()]
            if len(bounds) not in (1, 2):
                raise ValueError(path)
            # A lone code point is treated as the range [cp, cp].
            for codepoint in range(bounds[0], bounds[-1] + 1):
                charset.add(normalize_fontconfig_char(unichr(codepoint)))
    charset = charset.compact()
    return (charset, references)
def ScriptSymbols(script, include_script_code=False):
    """Generate (symbol_name, codepoint) pairs for a script's characters.

    The character inventory is every character with the script property
    named by `script.getName()`, plus U+200C and U+200D.  Code points
    listed in EXCEPTIONS use the symbol given there; all others get a
    symbol derived from their Unicode character name.  Characters without
    a Unicode name are reported on stderr and skipped.

    When `include_script_code` is true, each symbol is prefixed with the
    script's short name and a colon.
    """
    long_name = script.getName()
    script_chars = icu.UnicodeSet(r'[\p{%s}\u200C\u200D]' % long_name)
    script_name = long_name.replace('_', ' ')
    utf8.stderr.write('Found %d characters specific to %s (%s)\n' %
                      (len(script_chars), script_name,
                       script.getShortName()))
    prefix = script_name.upper()

    for char in script_chars:
        codepoint = CharToCodepoint(char)
        if codepoint in EXCEPTIONS:
            symbol = EXCEPTIONS[codepoint]
        else:
            uname = CharName(char)
            if not uname:
                utf8.stderr.write(
                    'Warning: No Unicode name for %04X\n' % codepoint)
                continue
            # Strip the script name first, then a 'ZERO WIDTH' prefix.
            uname = RemovePrefix(RemovePrefix(uname, prefix), 'ZERO WIDTH')
            assert uname
            for old, new in DIGITS.items():
                uname = uname.replace(old, new)
            kept_words = [w for w in uname.split() if w not in STOPWORDS]
            symbol = '_'.join(kept_words).lower()
            assert symbol, ('Empty symbol name for %04X (%s)' %
                            (codepoint, uname))
            # A leading '-' is reserved for the DEPENDENT marker below, so
            # a symbol that already starts with one gets "'" instead.
            if symbol.startswith('-'):
                symbol = "'%s" % symbol[1:]
            if any(marker in uname for marker in DEPENDENT):
                symbol = '-%s' % symbol
        if include_script_code:
            symbol = '%s:%s' % (script.getShortName(), symbol)
        yield symbol, codepoint
def make_phoneme_set(s):
    """Build an icu.UnicodeSet from a whitespace-separated phoneme list.

    The set always contains U+0020, the stress mark "ˈ" and '.'.  Each
    phoneme in `s` is added as-is when it is a single character, and as
    a '{...}' string element of the pattern otherwise.
    """
    always_present = [u'\\u0020', "ˈ", '.']
    from_input = [
        phoneme if len(phoneme) == 1 else '{%s}' % phoneme
        for phoneme in s.split()
    ]
    result = icu.UnicodeSet()
    result.applyPattern('[%s]' % ' '.join(always_present + from_input))
    return result
def makePhonemeSet(s):
    """Build an icu.UnicodeSet from a whitespace-separated phoneme list.

    Single-character phonemes go into the set pattern as-is; longer
    phonemes are wrapped in '{...}' so they become string elements.
    """
    pattern_items = [
        phoneme if len(phoneme) == 1 else '{%s}' % phoneme
        for phoneme in s.split()
    ]
    result = icu.UnicodeSet()
    result.applyPattern('[%s]' % ' '.join(pattern_items))
    return result
def get_cldr_exemplars(lang, exemplars):
    """Combine the main, auxiliary and index CLDR exemplars for `lang`.

    Args:
        lang: language key passed through to get_cldr_exemplars_by_type.
        exemplars: the exemplar data to search; passed through to
            get_cldr_exemplars_by_type for every lookup.

    Returns:
        (None, set()) when there are no 'main' exemplars for `lang`.
        Otherwise (icu.UnicodeSet, sources), where the set is the union of
        the main exemplars with any auxiliary and index exemplars found,
        and `sources` is the set of source values that
        get_cldr_exemplars_by_type returned for the contributing types.
    """
    # Use the `exemplars` argument for every lookup rather than a
    # module-level global, so the function works with whatever data the
    # caller supplies.
    main, src = get_cldr_exemplars_by_type(lang, 'main', exemplars)
    if not main:
        return None, set()
    result = icu.UnicodeSet(main)
    sources = {src}
    for extype in ('auxiliary', 'index'):
        extra, src = get_cldr_exemplars_by_type(lang, extype, exemplars)
        if extra:
            result.addAll(extra)
            sources.add(src)
    return result, sources
def format_unicodeset(uset):
    """Return a readable pattern string for `uset`.

    Ranges spanning fewer than four code points are listed character by
    character; longer ranges are written as 'start-end'.  Items are
    separated by single spaces inside '[...]'.

    Falls back to `uset.toPattern()` when a range endpoint is not a
    single-character string (ord() could not take it), or when the
    hand-built pattern does not parse back to the same set.
    """
    pieces = []
    for index in range(uset.getRangeCount()):
        first = uset.getRangeStart(index)
        last = uset.getRangeEnd(index)
        if not (len(first) == 1 and len(last) == 1):
            return uset.toPattern()
        lo, hi = ord(first), ord(last)
        if hi - lo >= 3:
            pieces.append('%s-%s' % (escape_for_unicodeset(unichr(lo)),
                                     escape_for_unicodeset(unichr(hi))))
        else:
            for codepoint in range(lo, hi + 1):
                pieces.append(escape_for_unicodeset(unichr(codepoint)))
    pretty = '[%s]' % ' '.join(pieces)
    # Make sure we don't change semantics with our pretty-printing.
    if icu.UnicodeSet(pretty).toPattern() == uset.toPattern():
        return pretty
    return uset.toPattern()
def read_cldr_file(filepath):
    """Read the locale key and exemplar character sets from a CLDR file.

    Args:
        filepath: path to an LDML file; must end in '.xml'.

    Returns:
        (lang, exemplars).  `lang` is the identity's language type,
        followed by the script type, the territory type and the sorted
        variant types where present, all joined with '_'.  `exemplars`
        maps each exemplarCharacters 'type' attribute ('main' when the
        attribute is absent) to an icu.UnicodeSet built from the
        element's text.

    Raises:
        ValueError: if <identity> contains a child element other than
            version, language, script, territory or variant.
    """
    assert filepath.endswith('.xml'), filepath
    exemplars = {}
    ldml = etree.parse(filepath).getroot()

    lang = ldml.find('./identity/language').attrib['type']
    for optional_tag in ('script', 'territory'):
        element = ldml.find('./identity/' + optional_tag)
        if element is not None:
            lang = lang + '_' + element.attrib['type']
    variants = sorted(
        t.attrib['type'] for t in ldml.iterfind('./identity/variant'))
    # sorted() always returns a list, so test for emptiness, not None.
    if variants:
        lang = '_'.join([lang] + variants)

    # Reject identity elements this function does not know how to fold
    # into the locale key.
    tags = set(t.tag for t in ldml.iterfind('./identity/*'))
    if not tags.issubset(
            {'version', 'language', 'script', 'territory', 'variant'}):
        raise ValueError('unexpected identity elements in %s' % filepath)

    for ex in ldml.iterfind('./characters/exemplarCharacters'):
        extype = ex.attrib.get('type', 'main')
        exemplars[extype] = icu.UnicodeSet(''.join(ex.itertext()))
    return lang, exemplars
def makePhonemeSet(s):
    """Build an icu.UnicodeSet from a whitespace-separated phoneme list.

    Single-character phonemes go into the set pattern as-is; longer
    phonemes are wrapped in '{...}' so they become string elements.
    """
    def pattern_item(phoneme):
        if len(phoneme) == 1:
            return phoneme
        return '{%s}' % phoneme

    body = ' '.join(pattern_item(phoneme) for phoneme in s.split())
    result = icu.UnicodeSet()
    result.applyPattern('[%s]' % body)
    return result


# All characters with the Armenian ('Armn') script property.
ARMENIAN_GRAPHEMES = icu.UnicodeSet()
ARMENIAN_GRAPHEMES.applyPattern('[:Armn:]')

# Phoneme inventory; multi-character phonemes become string elements.
ARMENIAN_PHONEMES = makePhonemeSet(""" m n p pʰ t tʰ k kʰ b d g t͡s t͡sʰ t͡ʃ t͡ʃʰ d͡z d͡ʒ f v s z ʃ ʒ x ɣ h l j r ɾ i u ɛ ə o a """)
# -*- coding: utf-8 -*- from __future__ import unicode_literals import codecs import icu from cldr_util import makePhonemeSet, match, check, regtest GRAPHEMES = icu.UnicodeSet() GRAPHEMES.applyPattern('[[:Sinh:] [:Cf:]]') PHONEMES = makePhonemeSet(""" m n ɲ ŋ p b ᵐb ⁿd ʈ ɖ ⁿɖ k g ᵑg s ʃ t͡ʃ d͡ʒ f h r ʋ l j w i iː u uː e eː ə o oː æː æ a aː . """) check('si-si_FONIPA.txt', GRAPHEMES, PHONEMES) regtest('si-si_FONIPA', GRAPHEMES, PHONEMES)
def write_deltas(deltas, out):
    """Write one Markdown section per language in `deltas` to `out`.

    `deltas` maps a language code to a (chars, refs, cldr_sources) tuple;
    sections are emitted in sorted order of the language code.  Each
    section has a '### <lang>: <display name>' heading, a fenced code
    block holding format_unicodeset(chars), an optional '* CLDR:' bullet
    linking every CLDR source (sorted), and one bullet per entry in refs.
    `out` only needs a write() method.
    """
    for lang, (chars, refs, cldr_sources) in sorted(deltas.items()):
        locale = icu.Locale(lang)
        out.write('\n\n### %s: %s\n\n' % (lang, locale.getDisplayName()))
        out.write('```\n%s\n```\n\n' % (format_unicodeset(chars)))
        if cldr_sources:
            # Each source is used both as the link text and as the path
            # below the CLDR trunk URL.
            markdown = '[%s](http://www.unicode.org/repos/cldr/trunk/%s)'
            s = [markdown % (src, src) for src in sorted(cldr_sources)]
            out.write('* CLDR: %s\n' % ' '.join(s))
        for ref in refs:
            out.write('* %s\n' % ref)


if __name__ == '__main__':
    empty_uset = icu.UnicodeSet()
    # Result buckets, keyed as the code further down decides.
    fully_missing, fully_missing_manual_cleanup_needed = {}, {}
    chars_missing, ok, bogus = {}, {}, {}
    likely_subtags = read_likely_subtags()
    language_aliases = read_language_aliases()
    cldr_exemplars = read_cldr_exemplars()
    fontconfig_exemplars = read_fontconfig_exemplars()
    for fclang, (fcset, fcrefs) in sorted(fontconfig_exemplars.items()):
        # Apply the alias table; codes without an alias are kept as-is.
        lang = language_aliases.get(fclang, fclang)
        # Hard-coded remappings applied after alias resolution.
        if lang == 'ps_PK':
            lang = 'ps'
        if lang == 'pap_AN':
            lang = 'pap_Latn'
        # pap_AW is skipped entirely.
        if lang in {'pap_AW'}:
            continue
        likely = likely_subtags.get(lang, 'und')
        # No CLDR exemplars under this key (and not a zh_* code): re-key
        # as '<base language>_<script>', with the script guessed from the
        # fontconfig set's pattern.
        if lang not in cldr_exemplars and not lang.startswith('zh_'):
            pattern = fcset.toPattern()
            lang = '_'.join((lang.split('_')[0], guess_script(pattern)))