예제 #1
0
def _check_scripts(scripts):
  """Return True if all scripts are known (pseudo) codes."""
  have_unknown = False
  if scripts:
    all_scripts = unicode_data.all_scripts()
    all_scripts = all_scripts | set(['LGC', 'CJK', 'MONO', 'SYM2', 'MUSIC'])
    for s in scripts:
      if s not in all_scripts:
        print >> sys.stderr, 'unknown script:', s
        have_unknown = True
  return not have_unknown
예제 #2
0
def _check_scripts(scripts):
  """Return True if all scripts are known (pseudo) codes."""
  have_unknown = False
  if scripts:
    all_scripts = unicode_data.all_scripts()
    all_scripts = all_scripts | {'CJK', 'EXCL', 'LGC', 'MONO', 'MUSIC', 'SYM2', 'Zsye'}
    for s in scripts:
      if s not in all_scripts:
        sys.stderr.write('unknown script:\n', s)
        have_unknown = True
  return not have_unknown
예제 #3
0
def _check_scripts(scripts):
    """Return True if all scripts are known (pseudo) codes."""
    have_unknown = False
    if scripts:
        all_scripts = unicode_data.all_scripts()
        all_scripts = all_scripts | set(
            ['CJK', 'EXCL', 'LGC', 'MONO', 'MUSIC', 'SYM2', 'Zsye'])
        for s in scripts:
            if s not in all_scripts:
                print >> sys.stderr, 'unknown script:', s
                have_unknown = True
    return not have_unknown
예제 #4
0
def _check_scripts(scripts):
    """Return True if all scripts are known (pseudo) codes."""
    have_unknown = False
    if scripts:
        all_scripts = unicode_data.all_scripts()
        all_scripts = all_scripts | {
            "CJK",
            "EXCL",
            "LGC",
            "MONO",
            "MUSIC",
            "SYM2",
            "Zsye",
        }
        for s in scripts:
            if s not in all_scripts:
                sys.stderr.write("unknown script:\n", s)
                have_unknown = True
    return not have_unknown
예제 #5
0
def _create_lang_data():
    """Generates language data from CLDR plus extensions.

    Returns a mapping from lang to a tuple of:
    - a set of scripts used in some region
    - a set of scripts not used in any region.
    """

    all_lang_scripts = collections.defaultdict(set)
    used_lang_scripts = collections.defaultdict(set)
    known_scripts = set()
    all_langs = set()
    # Collect lang-script pairs actually used in some region.
    for region in cldr_data.known_regions():
        lang_scripts = cldr_data.region_to_lang_scripts(region)
        for lang_script in lang_scripts:
            lang, script = lang_script.split('-')
            known_scripts.add(script)
            if lang == 'und':
                _log('used lang is und for script %s in region %s' %
                     (script, region))
                continue
            used_lang_scripts[lang].add(script)
            all_lang_scripts[lang].add(script)
            all_langs.add(lang)

    # Add scripts associated with a language regardless of region.
    for lang in cldr_data.known_langs():
        lang_scripts = cldr_data.lang_to_scripts(lang)
        all_lang_scripts[lang] |= lang_scripts
        known_scripts |= lang_scripts
        all_langs.add(lang)

    for lang in all_langs:
        script = cldr_data.get_likely_script(lang)
        if not is_excluded_script(script):
            all_lang_scripts[lang].add(script)

    # Ensure every non-excluded script is reachable from some language,
    # falling back to 'und' when no likely language exists.
    for script in unicode_data.all_scripts():
        if is_excluded_script(script):
            continue
        lang = cldr_data.get_likely_subtags('und-' + script)[0]
        if lang != 'und':
            if script not in all_lang_scripts[lang]:
                _log('adding likely lang %s for script %s' % (lang, script))
            all_lang_scripts[lang].add(script)
        elif script not in known_scripts:
            _log('adding script with unknown language %s' % script)
            all_lang_scripts[lang].add(script)
        else:
            _log('script %s with unknown language already seen' % script)

    # Patch: ensure ryu-Jpan exists
    # - Okinawan can be written in either Kana or a combination of Hira
    #   and Kanji. Rather than take a strong position on this, add a
    #   mapping to Jpan.
    all_lang_scripts['ryu'].add('Jpan')

    # Patch: see noto-fonts#133 comment on June 8th.
    all_lang_scripts['tlh'] |= {'Latn', 'Piqd'}

    # Bug fix: dict views cannot be concatenated with `+` in Python 3;
    # a set union is valid in both and also removes duplicate langs.
    all_langs = set(used_lang_scripts) | set(all_lang_scripts)
    lang_data = {}
    for lang in all_langs:
        if lang in used_lang_scripts:
            # Every lang inserted into used_lang_scripts was also inserted
            # into all_lang_scripts above, so the difference is safe; an
            # empty difference is already the empty set.
            unused_set = all_lang_scripts[lang] - used_lang_scripts[lang]
            lang_data[lang] = (used_lang_scripts[lang].copy(), unused_set)
        else:
            lang_data[lang] = (set(), all_lang_scripts[lang].copy())

    return lang_data
예제 #6
0
 def test_all_scripts(self):
     """Check that all_scripts() contains real codes and omits bogus ones."""
     self.assertTrue('Latn' in unicode_data.all_scripts())
     self.assertFalse('Japn' in unicode_data.all_scripts())
예제 #7
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns a mapping from lang to a tuple of:
  - a set of scripts used in some region
  - a set of scripts not used in any region.
  """

  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()
  # Collect lang-script pairs actually used in some region.
  for region in cldr_data.known_regions():
    lang_scripts = cldr_data.region_to_lang_scripts(region)
    for lang_script in lang_scripts:
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang == 'und':
        if _DEBUG:
          # print() function form: the Python 2-only `print ...`
          # statement is a syntax error under Python 3. A single
          # %-formatted argument behaves the same on both versions.
          print('used lang is und for script %s in region %s' % (script, region))
        continue
      used_lang_scripts[lang].add(script)
      all_lang_scripts[lang].add(script)
      all_langs.add(lang)

  # Add scripts associated with a language regardless of region.
  for lang in cldr_data.known_langs():
    lang_scripts = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= lang_scripts
    known_scripts |= lang_scripts
    all_langs.add(lang)

  for lang in all_langs:
    script = cldr_data.get_likely_script(lang)
    if not is_excluded_script(script):
      all_lang_scripts[lang].add(script)

  # Ensure every non-excluded script is reachable from some language,
  # falling back to 'und' when no likely language exists.
  for script in unicode_data.all_scripts():
    if is_excluded_script(script):
      continue
    lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if lang != 'und':
      if _DEBUG and script not in all_lang_scripts[lang]:
        print('# adding likely lang %s for script %s' % (lang, script))
      all_lang_scripts[lang].add(script)
    elif script not in known_scripts:
      if _DEBUG:
        print('# adding script with unknown language %s' % script)
      all_lang_scripts[lang].add(script)
    elif _DEBUG:
      print('### script %s with unknown language already seen' % script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Patch: see noto-fonts#133 comment on June 8th.
  all_lang_scripts['tlh'] |= {'Latn', 'Piqd'}

  # Bug fix: dict views cannot be concatenated with `+` in Python 3;
  # a set union is valid in both and also removes duplicate langs.
  all_langs = set(used_lang_scripts) | set(all_lang_scripts)
  lang_data = {}
  for lang in all_langs:
    if lang in used_lang_scripts:
      if lang in all_lang_scripts:
        unused_set = all_lang_scripts[lang] - used_lang_scripts[lang]
        lang_data[lang] = (used_lang_scripts[lang].copy(),
                           unused_set if unused_set else set())
      else:
        lang_data[lang] = (used_lang_scripts[lang].copy(), set())
    else:
      lang_data[lang] = (set(), all_lang_scripts[lang].copy())

  return lang_data
예제 #8
0
 def test_all_scripts(self):
     """Check that all_scripts() contains real codes and omits bogus ones."""
     self.assertTrue('Latn' in unicode_data.all_scripts())
     self.assertFalse('Japn' in unicode_data.all_scripts())