def _LoadLanguages(languages_dir):
  """Parse every ``*.textproto`` file found directly in ``languages_dir``.

  Each file is read as UTF-8 text and parsed into a ``LanguageProto``.

  Args:
    languages_dir: Directory that is globbed for ``*.textproto`` files.

  Returns:
    A dict mapping each parsed proto's ``id`` to the proto itself. When two
    files yield the same ``id``, the one visited last wins.
  """
  pattern = os.path.join(languages_dir, '*.textproto')
  by_id = {}
  for path in glob.iglob(pattern):
    with open(path, 'r', encoding='utf-8') as handle:
      contents = handle.read()
    parsed = text_format.Parse(contents, fonts_public_pb2.LanguageProto())
    by_id[parsed.id] = parsed
  return by_id
示例#2
0
def _GetLanguageForUdhr(languages, udhr):
    """Return the language matching ``udhr``, or a new proto built from it.

    A known language matches when its ``language`` and ``script`` equal
    ``udhr.iso639_3`` and ``udhr.iso15924``, or when its ``id`` equals
    ``udhr.bcp47``. The first match in iteration order is returned.

    Args:
        languages: Mapping whose values expose ``id``, ``language`` and
            ``script``.
        udhr: Object exposing ``iso639_3``, ``iso15924``, ``bcp47`` and
            ``name``.

    Returns:
        The matching value from ``languages``; otherwise a fresh
        ``LanguageProto`` filled in from ``udhr``. The fresh proto is not
        added to ``languages``.
    """
    known = next(
        (candidate for candidate in languages.values()
         if (candidate.language == udhr.iso639_3
             and candidate.script == udhr.iso15924)
         or candidate.id == udhr.bcp47),
        None)
    if known is not None:
        return known

    created = fonts_public_pb2.LanguageProto()
    created.id = udhr.bcp47
    # The language code is whatever precedes the first underscore of the tag.
    created.language = udhr.bcp47.split('_')[0]
    created.script = udhr.iso15924
    # Turn "Name (Qualifier)" into "Name, Qualifier".
    created.name = udhr.name.replace(' (', ', ').replace(')', '')
    return created
示例#3
0
def _WriteLanguageMetadata(cldr, out_dir):
    """Write a ``LanguageProto`` textproto for each CLDR language lacking one.

    For every code in ``cldr.langs`` the target file is
    ``<out_dir>/<code>.textproto``. Codes whose file already exists are
    skipped, so existing files are never overwritten here.

    Args:
        cldr: Object exposing ``langs``, indexable by language code; also
            passed through to the autonym, exemplar and sample-text helpers.
        out_dir: Directory the ``.textproto`` files are written into.
    """
    hyperglot_db = languages.Languages()
    with UdhrTranslations() as udhrs:
        for code in cldr.langs:
            target = os.path.join(out_dir, code + '.textproto')
            if os.path.exists(target):
                continue

            source = cldr.langs[code]
            hyperglot_entry = _GetHyperglotLanguage(source, hyperglot_db)

            # Fields that are always copied from the CLDR entry.
            proto = fonts_public_pb2.LanguageProto()
            proto.id = source.id
            proto.language = source.lang_code
            proto.script = source.script_code
            proto.population = source.population

            # Everything below is optional and only set when a value exists.
            display_name = source.name
            if display_name is not None:
                proto.name = display_name

            preferred = _GetPreferredName(source, hyperglot_entry)
            if preferred is not None:
                proto.preferred_name = preferred

            native_name = _GetAutonym(cldr, source, hyperglot_entry)
            if native_name is not None:
                proto.autonym = native_name

            if source.regions is not None:
                proto.region.extend(sorted(source.regions))

            exemplars = _GetExemplarCharacters(cldr, source, hyperglot_entry)
            if exemplars is not None:
                proto.exemplar_chars.MergeFrom(exemplars)

            samples = _GetSampleText(code, cldr, udhrs)
            if samples is not None:
                proto.sample_text.MergeFrom(samples)

            _WriteProto(proto, target)
示例#4
0
def main(argv):
    """Update family sample text and/or import UDHR-derived language data.

    Language protos are loaded from the ``languages`` subdirectory of
    ``FLAGS.lang``. Two independent steps follow:

    * If ``FLAGS.samples`` is set, every METADATA.pb path in ``argv[1:]`` has
      its ``sample_text`` merged from the YAML file in that directory whose
      base name (without extension) equals the family name with spaces
      removed, and is written back in place.
    * If ``FLAGS.udhrs`` is set, language textprotos under
      ``<FLAGS.lang>/languages`` are written from a ``.yaml`` file, a ``.csv``
      file, or a directory of UDHR XML files.

    Args:
        argv: Command-line arguments; ``argv[1:]`` are METADATA.pb paths and
            are only consulted when ``FLAGS.samples`` is set.

    Raises:
        AssertionError: If ``FLAGS.samples`` is set but no METADATA.pb path
            was given.
        Exception: If ``FLAGS.udhrs`` is neither a ``.yaml`` file, a ``.csv``
            file, nor an existing directory.
    """
    languages = _LoadLanguages(os.path.join(FLAGS.lang, 'languages'))
    # NOTE(review): `regions` is not read anywhere below; the load is kept
    # unchanged in case it is relied on to fail early on bad region files.
    regions = _LoadRegions(os.path.join(FLAGS.lang, 'regions'))

    if FLAGS.samples:
        assert len(argv) > 1, 'No METADATA.pb files specified'
        # Maps each `languages: "<id>"` line to the language's name; handed
        # to _WriteProto as its `comments` argument.
        line_to_lang_name = {
            'languages: "{code}"'.format(code=lang.id): lang.name
            for lang in languages.values()
        }
        # Index sample files by base name without extension.
        samples = {}
        for sample_filename in os.listdir(FLAGS.samples):
            key = os.path.splitext(os.path.basename(sample_filename))[0]
            samples[key] = os.path.join(FLAGS.samples, sample_filename)

        sample_fields = ('masthead_full', 'masthead_partial', 'styles',
                         'tester', 'poster_sm', 'poster_md', 'poster_lg')
        for path in argv[1:]:
            family = _ReadProto(fonts_public_pb2.FamilyProto(), path)
            key = family.name.replace(' ', '')
            if key not in samples:
                print('Family not found in samples: ' + family.name)
                continue
            # Sample text is multilingual; read it as UTF-8 rather than the
            # platform default encoding.
            with open(samples[key], 'r', encoding='utf-8') as f:
                sample_data = yaml.safe_load(f)
            sample_text = fonts_public_pb2.SampleTextProto()
            for field in sample_fields:
                setattr(sample_text, field, sample_data[field])
            family.sample_text.MergeFrom(sample_text)
            _WriteProto(family, path, comments=line_to_lang_name)

    if not FLAGS.udhrs:
        return

    languages_dir = os.path.join(FLAGS.lang, 'languages')

    if FLAGS.udhrs.endswith('.yaml'):
        with open(FLAGS.udhrs, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        for translation, meta in data.items():
            # Only these two translations are imported from the YAML source.
            if 'lang_full' not in meta or meta['lang_full'] not in [
                    'ccp-Beng-IN', 'lad-Hebr-IL'
            ]:
                continue
            lang_code = meta['lang']
            if lang_code.startswith('und-'):
                continue
            if 'script' in meta:
                script = meta['script']
            else:
                # Fall back to the middle subtag of e.g. "ccp-Beng-IN".
                script = re.search(r'.*-(.*)-.*',
                                   meta['lang_full']).group(1)
            key = lang_code + '_' + script
            iso639_3 = meta['lang_639_3']
            if 'name_udhr' in meta:
                name = meta['name_udhr']
            else:
                name = meta['name_lang']
            udhr = Udhr(key=key,
                        iso639_3=iso639_3,
                        iso15924=script,
                        bcp47=key,
                        direction=None,
                        ohchr=None,
                        stage=4,
                        loc=None,
                        name=name)
            udhr.LoadArticleOne(translation)

            language = _GetLanguageForUdhr(languages, udhr)
            # Never overwrite sample text or an autonym that already exists.
            if not language.HasField('sample_text'):
                language.sample_text.MergeFrom(udhr.GetSampleTexts())
            if 'name_autonym' in meta and not language.HasField('autonym'):
                language.autonym = meta['name_autonym'].strip()
            _WriteProto(
                language,
                os.path.join(languages_dir, language.id + '.textproto'))

    elif FLAGS.udhrs.endswith('.csv'):
        with open(FLAGS.udhrs, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            head = next(reader)
            index_id = head.index('id')
            index_name = head.index('language')
            index_historical = head.index('historical')
            index_sample = head.index('SAMPLE')
            for row in reader:
                lang_id = row[index_id]
                if lang_id in languages:
                    language = languages[lang_id]
                else:
                    language = fonts_public_pb2.LanguageProto()
                    language.id = lang_id
                    # Requires ids of the exact form "<language>_<script>".
                    language.language, language.script = lang_id.split('_')
                    language.name = row[index_name]
                # An 'X' in the historical column marks the language as
                # historical; anything else clears the field.
                historical = row[index_historical] == 'X'
                if language.historical != historical:
                    if historical:
                        language.historical = True
                    else:
                        language.ClearField('historical')
                sample = row[index_sample]
                # Cells starting with "http" are links, not sample text.
                if sample and not sample.startswith('http'):
                    udhr = Udhr(key=lang_id,
                                iso639_3=language.language,
                                iso15924=language.script,
                                bcp47=lang_id,
                                direction=None,
                                ohchr=None,
                                stage=4,
                                loc=None,
                                name=None)
                    udhr.LoadArticleOne(sample)
                    if not language.HasField('sample_text'):
                        language.sample_text.MergeFrom(udhr.GetSampleTexts())
                _WriteProto(
                    language,
                    os.path.join(languages_dir, language.id + '.textproto'))

    elif os.path.isdir(FLAGS.udhrs):
        for udhr_path in glob.glob(os.path.join(FLAGS.udhrs, '*')):
            if udhr_path.endswith('index.xml') or os.path.basename(
                    udhr_path).startswith('status'):
                continue
            udhr_data = etree.parse(udhr_path)
            head = udhr_data.getroot()
            # Reset per file so a document without a namespaced `lang`
            # attribute cannot silently reuse the previous file's tag.
            bcp47 = None
            for name, value in head.attrib.items():
                if re.search(r'\{.*\}lang', name):
                    bcp47 = value.replace('-', '_')
            if bcp47 is None:
                print('No lang attribute found, skipping: ' + udhr_path)
                continue
            udhr = Udhr(key=head.get('key'),
                        iso639_3=head.get('iso639-3'),
                        iso15924=head.get('iso15924'),
                        bcp47=bcp47,
                        direction=head.get('dir'),
                        ohchr=None,
                        stage=4,
                        loc=None,
                        name=head.get('n'))
            udhr.Parse(udhr_data)

            language = _GetLanguageForUdhr(languages, udhr)
            # Only languages that are not already known get written here.
            if language.id in languages or language.HasField('sample_text'):
                continue
            language.sample_text.MergeFrom(udhr.GetSampleTexts())
            _WriteProto(
                language,
                os.path.join(languages_dir, language.id + '.textproto'))

    else:
        raise Exception('Unsupported input type for --udhrs: ' + FLAGS.udhrs)