def _check_scripts(scripts): """Return True if all scripts are known (pseudo) codes.""" have_unknown = False if scripts: all_scripts = unicode_data.all_scripts() all_scripts = all_scripts | set(['LGC', 'CJK', 'MONO', 'SYM2', 'MUSIC']) for s in scripts: if s not in all_scripts: print >> sys.stderr, 'unknown script:', s have_unknown = True return not have_unknown
def _check_scripts(scripts): """Return True if all scripts are known (pseudo) codes.""" have_unknown = False if scripts: all_scripts = unicode_data.all_scripts() all_scripts = all_scripts | {'CJK', 'EXCL', 'LGC', 'MONO', 'MUSIC', 'SYM2', 'Zsye'} for s in scripts: if s not in all_scripts: sys.stderr.write('unknown script:\n', s) have_unknown = True return not have_unknown
def _check_scripts(scripts): """Return True if all scripts are known (pseudo) codes.""" have_unknown = False if scripts: all_scripts = unicode_data.all_scripts() all_scripts = all_scripts | set( ['CJK', 'EXCL', 'LGC', 'MONO', 'MUSIC', 'SYM2', 'Zsye']) for s in scripts: if s not in all_scripts: print >> sys.stderr, 'unknown script:', s have_unknown = True return not have_unknown
def _check_scripts(scripts): """Return True if all scripts are known (pseudo) codes.""" have_unknown = False if scripts: all_scripts = unicode_data.all_scripts() all_scripts = all_scripts | { "CJK", "EXCL", "LGC", "MONO", "MUSIC", "SYM2", "Zsye", } for s in scripts: if s not in all_scripts: sys.stderr.write("unknown script:\n", s) have_unknown = True return not have_unknown
def _create_lang_data(): """Generates language data from CLDR plus extensions. Returns a mapping from lang to a tuple of: - a set of scripts used in some region - a set of scripts not used in any region.""" all_lang_scripts = collections.defaultdict(set) used_lang_scripts = collections.defaultdict(set) known_scripts = set() all_langs = set() for region in cldr_data.known_regions(): lang_scripts = cldr_data.region_to_lang_scripts(region) for lang_script in lang_scripts: lang, script = lang_script.split('-') known_scripts.add(script) if lang == 'und': _log('used lang is und for script %s in region %s' % (script, region)) continue used_lang_scripts[lang].add(script) all_lang_scripts[lang].add(script) all_langs.add(lang) for lang in cldr_data.known_langs(): lang_scripts = cldr_data.lang_to_scripts(lang) all_lang_scripts[lang] |= lang_scripts known_scripts |= lang_scripts all_langs.add(lang) for lang in all_langs: script = cldr_data.get_likely_script(lang) if not is_excluded_script(script): all_lang_scripts[lang].add(script) for script in unicode_data.all_scripts(): if is_excluded_script(script): continue lang = cldr_data.get_likely_subtags('und-' + script)[0] if lang != 'und': if script not in all_lang_scripts[lang]: _log('adding likely lang %s for script %s' % (lang, script)) all_lang_scripts[lang].add(script) elif script not in known_scripts: _log('adding script with unknown language %s' % script) all_lang_scripts[lang].add(script) else: _log('script %s with unknown language already seen' % script) # Patch: ensure ryu-Jpan exists # - Okinawan can be written in either Kana or a combination of Hira # and Kanji. Rather than take a strong position on this, add a # mapping to Jpan. all_lang_scripts['ryu'].add('Jpan') # Patch: see noto-fonts#133 comment on June 8th. all_lang_scripts['tlh'] |= {'Latn', 'Piqd'} all_langs = used_lang_scripts.keys() + all_lang_scripts.keys() lang_data = {} for lang in all_langs: if lang in used_lang_scripts: if lang in all_lang_scripts: unused_set = all_lang_scripts[lang] - used_lang_scripts[lang] lang_data[lang] = (used_lang_scripts[lang].copy(), unused_set if unused_set else set()) else: lang_data[lang] = (used_lang_scripts[lang].copy(), set()) else: lang_data[lang] = (set(), all_lang_scripts[lang].copy()) return lang_data
def test_all_scripts(self): """Tests the all_scripts() method.""" self.assertIn('Latn', unicode_data.all_scripts()) self.assertNotIn('Japn', unicode_data.all_scripts())
def _create_lang_data(): """Generates language data from CLDR plus extensions. Returns a mapping from lang to a tuple of: - a set of scripts used in some region - a set of scripts not used in any region.""" all_lang_scripts = collections.defaultdict(set) used_lang_scripts = collections.defaultdict(set) known_scripts = set() all_langs = set() for region in cldr_data.known_regions(): lang_scripts = cldr_data.region_to_lang_scripts(region) for lang_script in lang_scripts: lang, script = lang_script.split('-') known_scripts.add(script) if lang == 'und': if _DEBUG: print 'used lang is und for script %s in region %s' % (script, region) continue used_lang_scripts[lang].add(script) all_lang_scripts[lang].add(script) all_langs.add(lang) for lang in cldr_data.known_langs(): lang_scripts = cldr_data.lang_to_scripts(lang) all_lang_scripts[lang] |= lang_scripts known_scripts |= lang_scripts all_langs.add(lang) for lang in all_langs: script = cldr_data.get_likely_script(lang) if not is_excluded_script(script): all_lang_scripts[lang].add(script) for script in unicode_data.all_scripts(): if is_excluded_script(script): continue lang = cldr_data.get_likely_subtags('und-' + script)[0] if lang != 'und': if _DEBUG and script not in all_lang_scripts[lang]: print '# adding likely lang %s for script %s' % (lang, script) all_lang_scripts[lang].add(script) elif script not in known_scripts: if _DEBUG: print '# adding script with unknown language %s' % script all_lang_scripts[lang].add(script) elif _DEBUG: print '### script %s with unknown language already seen' % script # Patch: ensure ryu-Jpan exists # - Okinawan can be written in either Kana or a combination of Hira # and Kanji. Rather than take a strong position on this, add a # mapping to Jpan. all_lang_scripts['ryu'].add('Jpan') # Patch: see noto-fonts#133 comment on June 8th. all_lang_scripts['tlh'] |= {'Latn', 'Piqd'} all_langs = used_lang_scripts.keys() + all_lang_scripts.keys() lang_data = {} for lang in all_langs: if lang in used_lang_scripts: if lang in all_lang_scripts: unused_set = all_lang_scripts[lang] - used_lang_scripts[lang] lang_data[lang] = (used_lang_scripts[lang].copy(), unused_set if unused_set else set()) else: lang_data[lang] = (used_lang_scripts[lang].copy(), set()) else: lang_data[lang] = (set(), all_lang_scripts[lang].copy()) return lang_data