def read_cldr_name_file(path, langcode, category):
    """
    Turn the CLDR names read for one locale into a list of name quads.

    The names come from `read_cldr_names(path, langcode, category)` and are
    visited in sorted order of their subtag. Each surviving entry becomes a
    `(langcode, subtag, name, priority)` tuple, where `priority` is 1 for
    entries whose code carried an '-alt-*' suffix (the suffix is stripped
    from the subtag) and 3 otherwise.

    An entry is dropped when:

    - `OVERRIDES` maps `(langcode, subtag)` to None;
    - the name is the subtag itself, or normalizes to the same string;
    - the code ends in '-alt-menu' and the name is exactly 'mandarin'.

    NOTE(review): a second function with this same name is defined later in
    this file and lacks the 'mandarin' filter; confirm which one is meant to
    be in effect.
    """
    raw_names = read_cldr_names(path, langcode, category)
    quads = []
    for code, label in sorted(raw_names.items()):
        override_key = (langcode, code)
        if override_key in OVERRIDES:
            label = OVERRIDES[override_key]
            if label is None:
                # An override of None removes the entry altogether.
                continue

        # Skip default entries that map a code to itself, which an
        # inattentive annotator just left there. Giving the name "zh (Hans)"
        # to "zh-Hans" is no better, so names are also compared after
        # normalization.
        if code == label or normalize_name(label) == normalize_name(code):
            continue

        # The -alt-menu entries are supposed to do things like alphabetize
        # "Mandarin Chinese" under "Chinese, Mandarin". A few languages just
        # put the string "mandarin" there, which seems wrong and messes up
        # our name lookups.
        if label == 'mandarin' and code.endswith('-alt-menu'):
            continue

        # CLDR assigns multiple names to one code by adding -alt-* to the end
        # of the code. For example, the English name of 'az' is Azerbaijani,
        # but the English name of 'az-alt-short' is Azeri. Keep only the part
        # before the first '-alt-' and give such entries priority 1.
        base_code, alt_marker, _ = code.partition('-alt-')
        priority = 1 if alt_marker else 3
        quads.append((langcode, base_code, label, priority))
    return quads
def read_cldr_name_file(path, langcode, category):
    """
    Turn the CLDR names read for one locale into a list of name quads.

    The names come from `read_cldr_names(path, langcode, category)` and are
    visited in sorted order of their subtag. Each surviving entry becomes a
    `(langcode, subtag, name, priority)` tuple, where `priority` is 1 for
    entries whose code carried an '-alt-*' suffix (the suffix is stripped
    from the subtag) and 3 otherwise.

    An entry is dropped when `OVERRIDES` maps `(langcode, subtag)` to None,
    or when the name is the subtag itself or normalizes to the same string.

    NOTE(review): an earlier function with this same name in this file also
    skips '-alt-menu' entries named 'mandarin'. Being defined later, this
    version is the one that takes effect; confirm that is intended.
    """
    raw_names = read_cldr_names(path, langcode, category)
    quads = []
    for code, label in sorted(raw_names.items()):
        override_key = (langcode, code)
        if override_key in OVERRIDES:
            label = OVERRIDES[override_key]
            if label is None:
                # An override of None removes the entry altogether.
                continue

        # Skip default entries that map a code to itself, which a lazy
        # annotator just left there. Giving the name "zh (Hans)" to "zh-Hans"
        # is still lazy, so names are also compared after normalization.
        if code == label or normalize_name(label) == normalize_name(code):
            continue

        # CLDR assigns multiple names to one code by adding -alt-* to the end
        # of the code. For example, the English name of 'az' is Azerbaijani,
        # but the English name of 'az-alt-short' is Azeri. Keep only the part
        # before the first '-alt-' and give such entries priority 1.
        base_code, alt_marker, _ = code.partition('-alt-')
        priority = 1 if alt_marker else 3
        quads.append((langcode, base_code, label, priority))
    return quads
def update_names(names_fwd, names_rev, name_quads):
    """
    Merge name quads into the forward and reverse name tables, in place.

    Each quad is `(name_language, referent, name, priority)`.

    `names_rev` is a two-level mapping: language -> normalized name -> list
    of `(name_language, referent, priority)` tuples. Every quad is recorded
    under the catch-all 'und' language as well as under the bare language of
    `name_language`.

    `names_fwd` maps '<lowercased referent>@<name_language>' to a name. The
    first name seen for a key is kept; later ones do not replace it.

    Nothing is returned.
    """
    for name_language, referent, name, priority in name_quads:
        # Get just the language from name_language, not the territory or
        # script.
        base_language = langcodes.get(name_language).language

        # Both buckets are created up front. If base_language is itself
        # 'und', they are the same dict and the entry is appended twice.
        buckets = (
            names_rev.setdefault('und', {}),
            names_rev.setdefault(base_language, {}),
        )
        entry = (name_language, referent, priority)
        for bucket in buckets:
            bucket.setdefault(normalize_name(name), []).append(entry)

        # setdefault leaves an existing forward entry untouched.
        names_fwd.setdefault(
            '{}@{}'.format(referent.lower(), name_language), name
        )