def get_family_id_to_regions(family_id_to_lang_scr_to_sample_key):
  """Returns a mapping from family id to the set of regions it is used in.

  A family is associated with a region when at least one of the family's
  lang_scr entries is listed by cldr_data.region_to_lang_scripts for that
  region.

  Args:
    family_id_to_lang_scr_to_sample_key: mapping from family id to an
      iterable of lang_scr strings (presumably a dict keyed by lang_scr,
      judging by the name; only the keys are read here).

  Returns:
    A collections.defaultdict(set) from family id to region codes.  A family
    none of whose lang_scrs has a region gets no entry.

  Side effects:
    Prints a line for every region code longer than two characters, and
    afterwards one line (in sorted order) for every lang_scr without a
    region, except those starting with 'und'.
  """
  # Invert the region -> lang_scr data.  Regions are visited in sorted order,
  # which also fixes the order of the 'skipping region' messages.
  regions_by_lang_scr = collections.defaultdict(set)
  for region_code in sorted(cldr_data.known_regions()):
    if region_code == 'ZZ':
      continue
    if len(region_code) > 2:  # e.g. world
      print('skipping region %s' % region_code)
      continue
    for lang_scr in cldr_data.region_to_lang_scripts(region_code):
      regions_by_lang_scr[lang_scr].add(region_code)

  family_id_to_regions = collections.defaultdict(set)
  unmapped = set()
  for family_id in family_id_to_lang_scr_to_sample_key:
    for lang_scr in family_id_to_lang_scr_to_sample_key[family_id]:
      # .get() keeps the defaultdict from growing on a miss.
      regions = regions_by_lang_scr.get(lang_scr)
      if regions is not None:
        family_id_to_regions[family_id].update(regions)
      elif not lang_scr.startswith('und'):
        # don't warn about undefined languages
        unmapped.add(lang_scr)

  for lang_scr in sorted(unmapped):
    print('no mapping from %s to any region' % lang_scr)

  return family_id_to_regions
Пример #2
0
def get_family_id_to_regions(family_id_to_lang_scr_to_sample_key):
  """Computes, for each family id, the regions whose lang_scrs it covers.

  Args:
    family_id_to_lang_scr_to_sample_key: dict from family id to an iterable
      of lang_scr strings (presumably a dict from lang_scr to sample key,
      going by the name; the values are not used here).

  Returns:
    A collections.defaultdict(set) mapping family id to the region codes
    whose lang_scr list, per cldr_data.region_to_lang_scripts, contains one
    of the family's lang_scrs.

  Side effects:
    Prints 'skipping region ...' for region codes longer than two
    characters, then one 'no mapping ...' line per lang_scr that has no
    region and does not start with 'und', in sorted order.
  """
  # Build the reverse index lang_scr -> regions from the per-region data.
  lang_scr_to_regions = collections.defaultdict(set)
  for region in sorted(cldr_data.known_regions()):
    if region == 'ZZ':
      continue
    if len(region) <= 2:
      for lang_scr in cldr_data.region_to_lang_scripts(region):
        lang_scr_to_regions[lang_scr].add(region)
    else:  # e.g. world
      print('skipping region %s' % region)

  family_id_to_regions = collections.defaultdict(set)
  missing = set()
  for family_id, lang_scrs in family_id_to_lang_scr_to_sample_key.iteritems():
    for lang_scr in lang_scrs:
      if lang_scr not in lang_scr_to_regions:
        # don't warn about undefined languages
        if not lang_scr.startswith('und'):
          missing.add(lang_scr)
        continue
      # Every set in the reverse index holds at least one region, so merging
      # it always leaves the family with an entry.
      family_id_to_regions[family_id] |= lang_scr_to_regions[lang_scr]

  for lang_scr in sorted(missing):
    print('no mapping from %s to any region' % lang_scr)

  return family_id_to_regions
Пример #3
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns:
    A dict from lang to a tuple of two sets:
    - the scripts used for the lang in some region
    - the other scripts collected for the lang, not used in any region.

  Side effects:
    Prints diagnostics about 'und' entries and scripts with no known
    language.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Pass 1: scripts in use per region.  Entries whose lang is 'und' only
  # contribute their script to known_scripts.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang != 'und':
        used_lang_scripts[lang].add(script)
        all_lang_scripts[lang].add(script)
        all_langs.add(lang)
      else:
        print('used lang is und for script %s in region %s' %
              (script, region))

  # Pass 2: every script listed for every known lang.
  for lang in cldr_data.known_langs():
    scripts_of_lang = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= scripts_of_lang
    known_scripts |= scripts_of_lang
    all_langs.add(lang)

  # Pass 3: the likely script of each lang seen so far, unless excluded.
  for lang in all_langs:
    likely_script = cldr_data.get_likely_script(lang)
    if is_excluded_script(likely_script):
      continue
    all_lang_scripts[lang].add(likely_script)

  # Pass 4: attach each non-excluded Unicode script code to its likely lang.
  # A script whose likely lang is 'und' is filed under 'und', but only if no
  # earlier pass recorded the script.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    likely_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if likely_lang == 'und':
      if script in known_scripts:
        print('### script %s with unknown language already seen' % script)
      else:
        print('adding script with unknown language %s' % script)
        all_lang_scripts[likely_lang].add(script)
      continue
    all_lang_scripts[likely_lang].add(script)

  # Split the collected scripts of every lang into used and unused.  A lang
  # present in both dicts is visited twice and yields the same tuple each
  # time.
  lang_data = {}
  for lang in list(used_lang_scripts) + list(all_lang_scripts):
    if lang not in used_lang_scripts:
      lang_data[lang] = (set(), all_lang_scripts[lang].copy())
      continue
    used_copy = used_lang_scripts[lang].copy()
    if lang in all_lang_scripts:
      # Set difference always yields a fresh set, empty or not.
      unused = all_lang_scripts[lang] - used_lang_scripts[lang]
    else:
      unused = set()
    lang_data[lang] = (used_copy, unused)

  return lang_data
Пример #4
0
def get_region_to_family_ids(script_to_family_ids):
  """Returns a mapping from region to the family ids usable in that region.

  For every lang-script pair that cldr_data lists for a region, the families
  registered for the script are added to the region's set.  The script
  'Kana' is looked up as 'Jpan'.

  Args:
    script_to_family_ids: mapping from script code to an iterable of family
      ids.

  Returns:
    A collections.defaultdict(set) from region code to family ids.  The
    region 'ZZ' and region codes longer than two characters are skipped.

  Side effects:
    Prints a line for each skipped long region code, each Kana remap, and
    each lang-script whose script has no families.
  """
  result = collections.defaultdict(set)
  for region in cldr_data.known_regions():
    if region == 'ZZ':
      continue
    if len(region) > 2:
      print('skipping region %s' % region)
      continue
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan script' % lang_script)
        script = 'Jpan'
      if script in script_to_family_ids:
        result[region].update(script_to_family_ids[script])
      else:
        print('unsupported script %s for lang %s in region %s' %
              (script, lang, region))
  return result
Пример #5
0
def get_region_to_family_ids(script_to_family_ids):
    """Collects, per region, the family ids of the scripts used there.

    Args:
        script_to_family_ids: mapping from script code to an iterable of
            family ids.

    Returns:
        A collections.defaultdict(set) keyed by region code.  A region gets
        an entry once one of its lang-script pairs has a script found in
        script_to_family_ids ('Kana' is treated as 'Jpan').  Region 'ZZ'
        and region codes longer than two characters are never included.

    Side effects:
        Prints a notice for each skipped long region code, each Kana remap
        and each script that is missing from script_to_family_ids.
    """
    families_by_region = collections.defaultdict(set)
    for region in cldr_data.known_regions():
        if region == 'ZZ':
            continue
        if len(region) > 2:
            print('skipping region %s' % region)
            continue

        for lang_script in cldr_data.region_to_lang_scripts(region):
            lang, script = lang_script.split('-')
            if script == 'Kana':
                print('remap %s to use Jpan script' % lang_script)
                script = 'Jpan'

            if script not in script_to_family_ids:
                message = 'unsupported script %s for lang %s in region %s'
                print(message % (script, lang, region))
                continue

            families_by_region[region].update(script_to_family_ids[script])
    return families_by_region
Пример #6
0
def _create_lang_data():
    """Generates language data from CLDR plus extensions.

    Returns:
        A dict from lang to a tuple of two sets:
        - the scripts used for the lang in some region
        - the other scripts collected for the lang, not used in any region.

    Diagnostics are reported through _log.
    """
    all_lang_scripts = collections.defaultdict(set)
    used_lang_scripts = collections.defaultdict(set)
    known_scripts = set()
    all_langs = set()

    # Pass 1: scripts in use per region.  Entries whose lang is 'und' only
    # contribute their script to known_scripts.
    for region in cldr_data.known_regions():
        for lang_script in cldr_data.region_to_lang_scripts(region):
            lang, script = lang_script.split('-')
            known_scripts.add(script)
            if lang != 'und':
                used_lang_scripts[lang].add(script)
                all_lang_scripts[lang].add(script)
                all_langs.add(lang)
            else:
                _log('used lang is und for script %s in region %s' %
                     (script, region))

    # Pass 2: every script listed for every known lang.
    for lang in cldr_data.known_langs():
        scripts_of_lang = cldr_data.lang_to_scripts(lang)
        all_lang_scripts[lang] |= scripts_of_lang
        known_scripts |= scripts_of_lang
        all_langs.add(lang)

    # Pass 3: the likely script of each lang seen so far, unless excluded.
    for lang in all_langs:
        likely_script = cldr_data.get_likely_script(lang)
        if is_excluded_script(likely_script):
            continue
        all_lang_scripts[lang].add(likely_script)

    # Pass 4: attach each non-excluded Unicode script to its likely lang.
    # A script whose likely lang is 'und' is filed under 'und', but only if
    # no earlier pass recorded the script.
    for script in unicode_data.all_scripts():
        if is_excluded_script(script):
            continue
        likely_lang = cldr_data.get_likely_subtags('und-' + script)[0]
        if likely_lang == 'und':
            if script in known_scripts:
                _log('script %s with unknown language already seen' % script)
            else:
                _log('adding script with unknown language %s' % script)
                all_lang_scripts[likely_lang].add(script)
            continue
        scripts_so_far = all_lang_scripts[likely_lang]
        if script not in scripts_so_far:
            _log('adding likely lang %s for script %s' % (likely_lang, script))
        scripts_so_far.add(script)

    # Patch: ensure ryu-Jpan exists
    # - Okinawan can be written in either Kana or a combination of Hira
    #   and Kanji. Rather than take a strong position on this, add a
    #   mapping to Jpan.
    all_lang_scripts['ryu'].add('Jpan')

    # Patch: see noto-fonts#133 comment on June 8th.
    all_lang_scripts['tlh'].update(['Latn', 'Piqd'])

    # Split the collected scripts of every lang into used and unused.  A
    # lang present in both dicts is visited twice and yields the same tuple
    # each time.
    lang_data = {}
    for lang in list(used_lang_scripts) + list(all_lang_scripts):
        if lang not in used_lang_scripts:
            lang_data[lang] = (set(), all_lang_scripts[lang].copy())
            continue
        used_copy = used_lang_scripts[lang].copy()
        if lang in all_lang_scripts:
            # Set difference always yields a fresh set, empty or not.
            unused = all_lang_scripts[lang] - used_lang_scripts[lang]
        else:
            unused = set()
        lang_data[lang] = (used_copy, unused)

    return lang_data
Пример #7
0
def _create_lang_data():
  """Generates language data from CLDR plus extensions.

  Returns:
    A dict from lang to a tuple of two sets:
    - the scripts used for the lang in some region
    - the other scripts collected for the lang, not used in any region.

  Side effects:
    When _DEBUG is true, prints diagnostics about 'und' entries and about
    scripts added through likely-subtag lookup.
  """
  all_lang_scripts = collections.defaultdict(set)
  used_lang_scripts = collections.defaultdict(set)
  known_scripts = set()
  all_langs = set()

  # Pass 1: scripts in use per region.  Entries whose lang is 'und' only
  # contribute their script to known_scripts.
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      known_scripts.add(script)
      if lang != 'und':
        used_lang_scripts[lang].add(script)
        all_lang_scripts[lang].add(script)
        all_langs.add(lang)
      elif _DEBUG:
        print('used lang is und for script %s in region %s' %
              (script, region))

  # Pass 2: every script listed for every known lang.
  for lang in cldr_data.known_langs():
    scripts_of_lang = cldr_data.lang_to_scripts(lang)
    all_lang_scripts[lang] |= scripts_of_lang
    known_scripts |= scripts_of_lang
    all_langs.add(lang)

  # Pass 3: the likely script of each lang seen so far, unless excluded.
  for lang in all_langs:
    likely_script = cldr_data.get_likely_script(lang)
    if is_excluded_script(likely_script):
      continue
    all_lang_scripts[lang].add(likely_script)

  # Pass 4: attach each non-excluded Unicode script code to its likely lang.
  # A script whose likely lang is 'und' is filed under 'und', but only if no
  # earlier pass recorded the script.
  for script in unicode_data.all_script_codes():
    if is_excluded_script(script):
      continue
    likely_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if likely_lang == 'und':
      if script not in known_scripts:
        if _DEBUG:
          print('# adding script with unknown language %s' % script)
        all_lang_scripts[likely_lang].add(script)
      elif _DEBUG:
        print('### script %s with unknown language already seen' % script)
      continue
    if _DEBUG and script not in all_lang_scripts[likely_lang]:
      print('# adding likely lang %s for script %s' % (likely_lang, script))
    all_lang_scripts[likely_lang].add(script)

  # Patch: ensure ryu-Jpan exists
  # - Okinawan can be written in either Kana or a combination of Hira
  #   and Kanji. Rather than take a strong position on this, add a
  #   mapping to Jpan.
  all_lang_scripts['ryu'].add('Jpan')

  # Split the collected scripts of every lang into used and unused.  A lang
  # present in both dicts is visited twice and yields the same tuple each
  # time.
  lang_data = {}
  for lang in list(used_lang_scripts) + list(all_lang_scripts):
    if lang not in used_lang_scripts:
      lang_data[lang] = (set(), all_lang_scripts[lang].copy())
      continue
    used_copy = used_lang_scripts[lang].copy()
    if lang in all_lang_scripts:
      # Set difference always yields a fresh set, empty or not.
      unused = all_lang_scripts[lang] - used_lang_scripts[lang]
    else:
      unused = set()
    lang_data[lang] = (used_copy, unused)

  return lang_data
Пример #8
0
def get_used_lang_data(supported_scripts):
  """Returns a mapping from lang to a (used_scripts, unused_scripts) tuple.

  - used_scripts: the scripts listed for the lang in some region, with
    'Kana' recorded as 'Jpan'.  If there are none, the lang's likely script
    is used instead, unless that script is 'Zzzz'.
  - unused_scripts: the other scripts known for the lang.

  Langs none of whose scripts is in supported_scripts are omitted, and a
  hard-coded list of langs is removed at the end.

  Args:
    supported_scripts: collection of script codes.  It is combined with a
      set using '&', so it needs to be a set or frozenset.

  Side effects:
    Prints progress and anomaly messages.
  """
  # Get additional scripts for a lang by using get_likely_subtags from script
  # to lang.  This might not be the same as the likely script for a lang, but
  # it does indicate the language can be written in the script, or so we
  # assume.
  # NOTE(review): only one script is kept per lang; if several supported
  # scripts resolve to the same lang, the last one visited wins.
  lang_to_additional_script = {}
  for script in supported_scripts:
    likely_lang = cldr_data.get_likely_subtags('und-' + script)[0]
    if likely_lang == 'und':
      continue
    lang_to_additional_script[likely_lang] = script

  # Record which scripts each lang uses in some region, and which of those
  # scripts are not supported.
  used_lang_scripts = collections.defaultdict(set)
  unsupported_scripts = set()
  for region in cldr_data.known_regions():
    for lang_script in cldr_data.region_to_lang_scripts(region):
      lang, script = lang_script.split('-')
      if script == 'Kana':
        print('remap %s to use Jpan' % lang_script)
        script = 'Jpan'
      if script not in supported_scripts:
        unsupported_scripts.add(script)
      used_lang_scripts[lang].add(script)

  if unsupported_scripts:
    unsupported_text = ', '.join(sorted(unsupported_scripts))
    print('used scripts that are not supported: %s' % unsupported_text)

  # Langs found only through the script -> lang lookup are processed too.
  known_langs = set(cldr_data.known_langs())
  for lang in lang_to_additional_script:
    if lang in known_langs:
      continue
    print('lang %s not in known langs' % lang)
    known_langs.add(lang)

  lang_data = {}
  for lang in known_langs:
    if lang in ('ryu', 'ain'):
      all_scripts = set(['Jpan'])
    else:
      all_scripts = set(cldr_data.lang_to_scripts(lang))

    # add additional scripts for lang
    extra_script = lang_to_additional_script.get(lang)
    if extra_script is not None and extra_script not in all_scripts:
      print('cldr data does not have script %s for lang %s' %
            (extra_script, lang))
      all_scripts.add(extra_script)

    if not all_scripts & supported_scripts:
      print('no supported scripts among %s for lang %s' % (all_scripts, lang))
      continue

    # The set stored in used_lang_scripts is used as is (not copied); it is
    # replaced by the likely script only when it is empty.
    used_scripts = used_lang_scripts[lang]
    if not used_scripts:
      likely_script = cldr_data.get_likely_script(lang)
      if likely_script != 'Zzzz':
        used_scripts = set([likely_script])

    lang_data[lang] = (used_scripts, all_scripts - used_scripts)

  # Patch out langs whose sample data Noto doesn't support
  # A bunch of these resolve to the same sample.  Would be easier to check if
  # I just had sample names independent of language names, but then harder to
  # remove the languages.
  for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
    if lang in lang_data:
      print('patch out lang %s' % lang)
      del lang_data[lang]
    else:
      print('patched out lang %s not present' % lang)

  return lang_data
Пример #9
0
def get_used_lang_data(supported_scripts):
    """Builds a dict from lang to a (used_scripts, unused_scripts) tuple.

    - used_scripts: the scripts listed for the lang in some region, with
      'Kana' recorded as 'Jpan'.  If there are none, the lang's likely
      script is used instead, unless that script is 'Zzzz'.
    - unused_scripts: the other scripts known for the lang.

    Langs none of whose scripts is in supported_scripts are omitted, and a
    hard-coded list of langs is removed at the end.

    Args:
        supported_scripts: collection of script codes.  It is combined with
            a set using '&', so it needs to be a set or frozenset.

    Side effects:
        Prints progress and anomaly messages.
    """
    # Get additional scripts for a lang by using get_likely_subtags from
    # script to lang.  This might not be the same as the likely script for a
    # lang, but it does indicate the language can be written in the script,
    # or so we assume.
    # NOTE(review): only one script is kept per lang; if several supported
    # scripts resolve to the same lang, the last one visited wins.
    lang_to_additional_script = {}
    for supported in supported_scripts:
        likely_lang = cldr_data.get_likely_subtags('und-' + supported)[0]
        if likely_lang != 'und':
            lang_to_additional_script[likely_lang] = supported

    # Record which scripts each lang uses in some region, and which of those
    # scripts are not supported.
    used_lang_scripts = collections.defaultdict(set)
    unsupported_scripts = set()
    for region in cldr_data.known_regions():
        for lang_script in cldr_data.region_to_lang_scripts(region):
            lang, script = lang_script.split('-')
            if script == 'Kana':
                print('remap %s to use Jpan' % lang_script)
                script = 'Jpan'
            if script not in supported_scripts:
                unsupported_scripts.add(script)
            used_lang_scripts[lang].add(script)

    if unsupported_scripts:
        joined = ', '.join(sorted(unsupported_scripts))
        print('used scripts that are not supported: %s' % joined)

    # Langs found only through the script -> lang lookup are processed too.
    known_langs = set(cldr_data.known_langs())
    for lang in lang_to_additional_script:
        if lang not in known_langs:
            print('lang %s not in known langs, adding' % lang)
            known_langs.add(lang)

    lang_data = {}
    for lang in known_langs:
        all_scripts = (set(['Jpan']) if lang in ('ryu', 'ain')
                       else set(cldr_data.lang_to_scripts(lang)))

        # add additional scripts for lang
        if lang in lang_to_additional_script:
            extra_script = lang_to_additional_script[lang]
            if extra_script not in all_scripts:
                print('cldr data does not have script %s for lang %s' %
                      (extra_script, lang))
                all_scripts.add(extra_script)

        if not all_scripts & supported_scripts:
            print('no supported scripts among %s for lang %s' %
                  (all_scripts, lang))
            continue

        # The set stored in used_lang_scripts is used as is (not copied); it
        # is replaced by the likely script only when it is empty.
        used_scripts = used_lang_scripts[lang]
        if not used_scripts:
            likely_script = cldr_data.get_likely_script(lang)
            if likely_script != 'Zzzz':
                used_scripts = set([likely_script])

        lang_data[lang] = (used_scripts, all_scripts - used_scripts)

    # Patch out langs whose sample data Noto doesn't support
    # A bunch of these resolve to the same sample.  Would be easier to check
    # if I just had sample names independent of language names, but then
    # harder to remove the languages.
    for lang in ['abq', 'ady', 'aii-Cyrl', 'av', 'bua', 'chm']:
        if lang in lang_data:
            print('patch out lang %s' % lang)
            del lang_data[lang]
        else:
            print('patched out lang %s not present' % lang)

    return lang_data