def _read_primary_languages():
    """Load the hand-edited ``languages`` data file.

    The file is read from ``paths.datadir`` as UTF-8 with configparser
    (no interpolation, empty default-section name).

    Returns:
        A ``(primary_languages, name_to_code)`` tuple.
        ``primary_languages`` maps each non-empty section name to its
        configparser section.  ``name_to_code`` maps every munched entry of
        a section's ``names`` value (one entry per line) to that section's
        name; entries whose munched form is empty are skipped.

    Raises:
        misc.DataIntegrityError: if a section contains a key other than the
            known ones (or one starting with ``characters@``), or if the
            same munched language name appears under two sections.
            ``misc.check_sorted`` presumably raises it as well when the
            sections are out of order -- confirm against ``misc``.
        KeyError: if a section has no ``names`` key.
    """
    # Hand-edited linguistic data:
    path = os.path.join(paths.datadir, 'languages')
    cp = configparser.ConfigParser(interpolation=None, default_section='')
    cp.read(path, encoding='UTF-8')
    # The default section is named '' and carries no language; leave it out.
    primary_languages = {name: sect for name, sect in cp.items() if sect.name}
    name_to_code = {}
    misc.check_sorted(cp)
    known_keys = frozenset({
        'names',
        'characters',
        'macrolanguage',
        'plural-forms',
        'principal-territory',
    })
    for language, section in primary_languages.items():
        for key in section:
            if key in known_keys or key.startswith('characters@'):
                continue
            raise misc.DataIntegrityError('unknown key: {}'.format(key))
        for name in section['names'].splitlines():
            name = _munch_language_name(name)
            if not name:
                continue
            if name in name_to_code:
                # Say which name clashes and where, so the data file can be
                # fixed without bisecting it.
                raise misc.DataIntegrityError(
                    'duplicate language name {!r}: {} and {}'.format(
                        name, name_to_code[name], language
                    )
                )
            name_to_code[name] = language
    return primary_languages, name_to_code
def _read_header_fields():
    """Return the entries of the ``header-fields`` data file as a frozenset.

    The file is read from ``paths.datadir`` as ASCII text.  Lines starting
    with ``#`` and lines that are empty after stripping trailing whitespace
    are ignored; the remaining lines (right-stripped) are handed to
    ``misc.check_sorted`` before being frozen.
    """
    path = os.path.join(paths.datadir, 'header-fields')
    fields = []
    with open(path, 'rt', encoding='ASCII') as file:
        for line in file:
            if line.startswith('#'):
                continue
            field = line.rstrip()
            if field:
                fields.append(field)
    misc.check_sorted(fields)
    return frozenset(fields)
def _read_string_formats():
    """Load the ``formats`` section of the ``string-formats`` data file.

    The file is read from ``paths.datadir`` as ASCII with configparser.

    Returns:
        A dict mapping each key of the ``formats`` section to a frozenset
        of the whitespace-separated words in its value.

    Raises:
        KeyError: if the parsed file has no ``formats`` section.
    """
    path = os.path.join(paths.datadir, 'string-formats')
    parser = configparser.ConfigParser(interpolation=None, default_section='')
    parser.read(path, encoding='ASCII')
    formats = parser['formats']
    misc.check_sorted(formats)
    result = {}
    for name, examples in formats.items():
        result[name] = frozenset(examples.split())
    return result
def _read_tags():
    """Build a ``Tag`` object for every named section of the ``tags`` file.

    The file is read from ``paths.datadir`` as UTF-8 with configparser.
    Each section's key/value pairs are passed to ``Tag`` as keyword
    arguments, with ``name`` set to the section name (overriding any
    ``name`` key in the section itself).

    Returns:
        A dict mapping section name to the constructed ``Tag``.
    """
    path = os.path.join(paths.datadir, 'tags')
    parser = configparser.ConfigParser(interpolation=None, default_section='')
    parser.read(path, encoding='UTF-8')
    misc.check_sorted(parser)
    return {
        tagname: Tag(**dict(section.items(), name=tagname))
        for tagname, section in parser.items()
        if tagname  # the unnamed default section is not a tag
    }
def _read_control_characters():
    """Yield ``(character, name)`` pairs from the ``control-characters`` file.

    The file is read from ``paths.datadir`` as UTF-8 with configparser.
    Within every named section, each key is a two-character hexadecimal
    code point and each value is the character's name.

    This is a generator: validation happens lazily while iterating.

    Raises:
        misc.DataIntegrityError: if a key is not exactly two characters
            long, if the decoded character is not in Unicode category
            ``Cc``, or if a name is not equal to its upper-cased form.
    """
    path = os.path.join(paths.datadir, 'control-characters')
    parser = configparser.ConfigParser(interpolation=None, default_section='')
    parser.read(path, encoding='UTF-8')
    for section in parser.values():
        if section.name:
            misc.check_sorted(section)
            for hex_code, name in section.items():
                if len(hex_code) != 2:
                    raise misc.DataIntegrityError
                char = chr(int(hex_code, 16))
                not_control = unicodedata.category(char) != 'Cc'
                if not_control or name != name.upper():
                    raise misc.DataIntegrityError
                yield (char, name)
def _read_iso_codes():
    """Load the language and territory code tables from ``iso-codes``.

    The file is read from ``paths.datadir`` as UTF-8 with configparser.

    Returns:
        A ``(iso_639, iso_3166)`` tuple.  ``iso_639`` is built from the
        ``language-codes`` section: when an entry has a non-empty value,
        both the key and the value map to that value; otherwise the key
        maps to itself.  ``iso_3166`` is a frozenset of the upper-cased
        keys of the ``territory-codes`` section.

    Raises:
        KeyError: if either section is missing from the parsed file.
    """
    # ISO language/territory codes:
    path = os.path.join(paths.datadir, 'iso-codes')
    parser = configparser.ConfigParser(interpolation=None, default_section='')
    parser.read(path, encoding='UTF-8')

    languages = parser['language-codes']
    misc.check_sorted(languages)
    iso_639 = {}
    for long_code, short_code in languages.items():
        if not short_code:
            # No short form: the long code stands for itself.
            iso_639[long_code] = long_code
            continue
        iso_639[short_code] = short_code
        iso_639[long_code] = short_code

    territories = parser['territory-codes']
    misc.check_sorted(territories)
    iso_3166 = frozenset(code.upper() for code in territories)
    return (iso_639, iso_3166)
def test_unsorted(self):
    """check_sorted() rejects an out-of-order list and names the bad pair."""
    unsorted = [23, 37, 17]
    with assert_raises(M.DataIntegrityError) as cm:
        M.check_sorted(unsorted)
    message = str(cm.exception)
    assert_equal(message, '37 > 17')
def test_sorted(self):
    """check_sorted() accepts an ascending list without raising."""
    ascending = [17, 23, 37]
    M.check_sorted(ascending)