def generate_table(filename):
    """Write an HTML table of sample texts to `filename`, grouped by script.

    Each data row holds the BCP-47 tag (script subtag stripped), its English
    language name, the sample type, and the sample text.

    Args:
      filename: path of the HTML file to (over)write, encoded as UTF-8.
    """
    # Ported to Python 3: print statements -> print(), iteritems -> items.
    with codecs.open(filename, 'w', 'utf-8') as f:
        script_to_samples = _get_script_to_samples()

        print(_HTML_HEADER, file=f)
        print('<table>', file=f)
        print('<tr><th>Script<br/>BCP<th>name<th>type<th>text', file=f)
        for script, samples in sorted(script_to_samples.items()):
            script_en = cldr_data.get_english_script_name(script)
            print('<tr><th colspan=4>%s' % script_en, file=f)
            for bcp, sample_type, sample_text in samples:
                try:
                    lsrv = cldr_data.loc_tag_to_lsrv(bcp)
                    # Drop the script subtag; it is implied by the group header.
                    lsrv = (lsrv[0], None, lsrv[2], lsrv[3])
                    bcp_no_script = cldr_data.lsrv_to_loc_tag(lsrv)
                    bcp_en = cldr_data.get_english_language_name(bcp_no_script)
                    if not bcp_en:
                        bcp_en = 'No name'
                    if bcp_en == 'Unknown Language' and sample_type == 'chars':
                        bcp_en = '(characters)'
                except Exception:
                    print('could not get english name for %s' % bcp)
                    # BUGFIX: bcp_no_script could be unbound here if
                    # loc_tag_to_lsrv raised; fall back to the raw tag so the
                    # row is still emitted instead of raising NameError.
                    bcp_no_script = bcp
                    bcp_en = bcp
                cols = ['<tr>']
                cols.append(bcp_no_script)
                cols.append(bcp_en)
                cols.append(sample_type)
                cols.append(sample_text)
                print('<td>'.join(cols), file=f)
            print('<tr><td colspan=4> ', file=f)

        print('</table>', file=f)
        print(_HTML_FOOTER, file=f)
def generate_table(filename):
    """Write an HTML table of sample texts to `filename`, grouped by script.

    Each data row holds the BCP-47 tag (script subtag stripped), its English
    language name, the sample type, and the sample text.

    Args:
      filename: path of the HTML file to (over)write, encoded as UTF-8.
    """
    with codecs.open(filename, "w", "utf-8") as f:
        script_to_samples = _get_script_to_samples()

        f.write(_HTML_HEADER)
        f.write("<table>\n")
        f.write("<tr><th>Script<br/>BCP<th>name<th>type<th>text\n")
        for script, samples in sorted(script_to_samples.items()):
            script_en = cldr_data.get_english_script_name(script)
            f.write("<tr><th colspan=4>%s\n" % script_en)
            for bcp, sample_type, sample_text in samples:
                try:
                    lsrv = cldr_data.loc_tag_to_lsrv(bcp)
                    # Drop the script subtag; it is implied by the group header.
                    lsrv = (lsrv[0], None, lsrv[2], lsrv[3])
                    bcp_no_script = cldr_data.lsrv_to_loc_tag(lsrv)
                    bcp_en = cldr_data.get_english_language_name(bcp_no_script)
                    if not bcp_en:
                        bcp_en = "No name"
                    if bcp_en == "Unknown Language" and sample_type == "chars":
                        bcp_en = "(characters)"
                except Exception:
                    print("could not get english name for %s" % bcp)
                    # BUGFIX: bcp_no_script could be unbound here if
                    # loc_tag_to_lsrv raised; fall back to the raw tag so the
                    # row is still emitted instead of raising NameError.
                    bcp_no_script = bcp
                    bcp_en = bcp
                cols = ["<tr>"]
                cols.append(bcp_no_script)
                cols.append(bcp_en)
                cols.append(sample_type)
                cols.append(sample_text)
                f.write("<td>".join(cols) + "\n")
            f.write("<tr><td colspan=4> \n")

        f.write("</table>\n")
        f.write(_HTML_FOOTER + "\n")
def _init_lang_for_script_map():
    """Populate _lang_for_script_map with a representative language per script.

    Tags arrive ordered by decreasing global literate population, so the first
    language seen for each script wins.
    """
    for _, loc_tag in cldr_data.get_lang_scrs_by_decreasing_global_lit_pop():
        lang, script = cldr_data.loc_tag_to_lsrv(loc_tag)[:2]
        _lang_for_script_map.setdefault(script, lang)
def _init_lang_for_script_map():
    """Fill _lang_for_script_map, choosing one language for each script.

    The source list is ordered by decreasing global literate population, so
    the highest-population language claims each script first.
    """
    ranked_tags = [tag for _, tag in
                   cldr_data.get_lang_scrs_by_decreasing_global_lit_pop()]
    for tag in ranked_tags:
        lsrv = cldr_data.loc_tag_to_lsrv(tag)
        script, lang = lsrv[1], lsrv[0]
        if script not in _lang_for_script_map:
            _lang_for_script_map[script] = lang
def _get_script_to_samples():
    """Return a map from script code to a list of (bcp, type, text) tuples.

    Scans [tools]/sample_texts for files named '<bcp>_<type>.txt', skipping
    Latin-script samples.  When a tag has several sample files, 'udhr'
    samples are preferred over others.

    Returns:
      dict mapping script code -> list of (bcp_tag, sample_type, sample_text).
    """
    # Ported to Python 3: print statements -> print(), iteritems -> items.
    script_to_samples = collections.defaultdict(list)
    sample_dir = tool_utils.resolve_path('[tools]/sample_texts')
    for f in sorted(os.listdir(sample_dir)):
        base, ext = path.splitext(f)
        if ext != '.txt' or '_' not in base:
            print('skipping', f)
            continue
        bcp, sample_type = base.split('_')
        try:
            lang, script, region, variant = cldr_data.loc_tag_to_lsrv(bcp)
        except Exception:
            print('bcp %s did not parse as lsrv' % bcp)
            continue
        if script == 'Latn':
            continue
        script_to_samples[script].append((bcp, sample_type))

    # sorted() materializes the items, so mutating values below is safe.
    for script, samples in sorted(script_to_samples.items()):
        # Keep one sample type per tag, preferring 'udhr'.
        pref = {}
        for bcp, sample_type in samples:
            if bcp not in pref or sample_type == 'udhr':
                pref[bcp] = sample_type

        full_samples = []
        for bcp, sample_type in sorted(pref.items()):
            filename = '%s_%s.txt' % (bcp, sample_type)
            filepath = path.join(sample_dir, filename)
            with codecs.open(filepath, 'r', 'utf-8') as f:
                sample_text = f.read()
            full_samples.append((bcp, sample_type, sample_text))
        script_to_samples[script] = full_samples

    return script_to_samples
def _get_script_to_samples():
    """Return a map from script code to a list of (bcp, type, text) tuples.

    Scans [tools]/sample_texts for files named '<bcp>_<type>.txt', skipping
    Latin-script samples.  When a tag has several sample files, 'udhr'
    samples are preferred over others.

    Returns:
      dict mapping script code -> list of (bcp_tag, sample_type, sample_text).
    """
    script_to_samples = collections.defaultdict(list)
    sample_dir = tool_utils.resolve_path("[tools]/sample_texts")
    for f in sorted(os.listdir(sample_dir)):
        base, ext = path.splitext(f)
        if ext != ".txt" or "_" not in base:
            print("skipping", f)
            continue
        bcp, sample_type = base.split("_")
        # BUGFIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # narrowed to Exception.
        try:
            lang, script, region, variant = cldr_data.loc_tag_to_lsrv(bcp)
        except Exception:
            print("bcp %s did not parse as lsrv" % bcp)
            continue
        if script == "Latn":
            continue
        script_to_samples[script].append((bcp, sample_type))

    # sorted() materializes the items, so mutating values below is safe.
    for script, samples in sorted(script_to_samples.items()):
        # Keep one sample type per tag, preferring 'udhr'.
        pref = {}
        for bcp, sample_type in samples:
            if bcp not in pref or sample_type == "udhr":
                pref[bcp] = sample_type

        full_samples = []
        for bcp, sample_type in sorted(pref.items()):
            filename = "%s_%s.txt" % (bcp, sample_type)
            filepath = path.join(sample_dir, filename)
            with codecs.open(filepath, "r", "utf-8") as f:
                sample_text = f.read()
            full_samples.append((bcp, sample_type, sample_text))
        script_to_samples[script] = full_samples

    return script_to_samples
def get_script_to_exemplar_data_map():
    """Return a map from script to per-locale exemplar data.

    Returns:
      dict mapping script code -> dict mapping loc_tag -> 3-tuple of:
        - locale tuple (lang, script, region, variant)
        - cldr-relative path to src of exemplar data
        - tuple of the exemplar chars
    """
    # Ported to Python 3: print statements -> print(); filter() now returns
    # an iterator, so results are wrapped in list() where length/reuse matters.
    script_map = collections.defaultdict(dict)
    for directory in ['common', 'seed', 'exemplars']:
        data_dir = path.join(directory, 'main')
        for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
            if not filename.endswith('.xml'):
                continue
            exemplar_list = cldr_data.get_exemplar_from_file(
                path.join(data_dir, filename))
            if not exemplar_list:
                if _VERBOSE:
                    print(' no exemplar list for %s' % path.join(data_dir, filename))
                continue
            lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
            if not lsrv:
                if _VERBOSE:
                    print(' no lsrv for %s' % path.join(data_dir, filename))
                continue
            src = path.join(directory, filename)
            script = lsrv[1]
            if not script:
                if _VERBOSE:
                    print(' no script for %s' % path.join(data_dir, filename))
                continue
            loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
            loc_to_exemplar_info = script_map[script]
            if loc_tag in loc_to_exemplar_info:
                if _VERBOSE:
                    print('skipping %s, already have exemplars for %s from %s' % (
                        src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
                continue

            # fix exemplars that look incorrect
            if script == 'Arab' and 'd' in exemplar_list:
                if _VERBOSE:
                    print('found \'d\' in %s for %s' % (src, lsrv))
                no_latin = True
            else:
                no_latin = False

            # exclude exemplar strings, and restrict to letters and digits
            def accept_cp(cp):
                if len(cp) != 1:
                    return False
                cat = unicode_data.category(cp)
                if cat[0] != 'L' and cat != 'Nd':
                    return False
                if no_latin and cp in 'df':
                    return False
                return True

            filtered_exemplar_list = list(filter(accept_cp, exemplar_list))

            # some exemplar lists don't surround strings with curly braces, and
            # end up with duplicate characters.  Flag these
            exemplar_chars = set()
            dup_chars = set()
            fixed_exemplar_list = []
            for cp in filtered_exemplar_list:
                if cp in exemplar_chars:
                    dup_chars.add(cp)
                else:
                    exemplar_chars.add(cp)
                    fixed_exemplar_list.append(cp)
            if len(dup_chars) > 0 and _VERBOSE:
                # \u200e (LRM) keeps RTL exemplars from scrambling the log line.
                print('duplicate exemplars in %s: %s' % (
                    src, ', '.join(['\u200e%s\u200e (%x)' % (cp, ord(cp))
                                    for cp in dup_chars])))
            loc_to_exemplar_info[loc_tag] = (lsrv, src, tuple(fixed_exemplar_list))

    # supplement with extra locale data
    for loc_tag in extra_locale_data.EXEMPLARS:
        exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
        lang, script = loc_tag.split('-')
        lsrv = (lang, script, None, None)
        loc_to_exemplar_info = script_map[script]
        src = '[extra locale data]/%s' % loc_tag
        if loc_tag in loc_to_exemplar_info:
            if _VERBOSE:
                print('skipping %s, already have exemplars for %s from %s' % (
                    src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
            continue

        # restrict to letters, except for zsym
        def accept_cp(cp):
            cat = unicode_data.category(cp)
            return cat[0] == 'L' or cat == 'Nd'

        if 'Zsym' not in loc_tag:
            # list() is required: len() below would fail on a filter iterator.
            filtered_exemplar_list = list(filter(accept_cp, exemplar_list))
            if len(filtered_exemplar_list) != len(exemplar_list) and _VERBOSE:
                print('filtered some characters from %s' % src)
        else:
            filtered_exemplar_list = exemplar_list
        loc_to_exemplar_info[loc_tag] = (lsrv, src, tuple(filtered_exemplar_list))

    return script_map
def get_script_to_exemplar_data_map():
    """Return a map from script to per-locale exemplar data.

    Returns:
      dict mapping script code -> dict mapping loc_tag -> 3-tuple of:
        - locale tuple (lang, script, region, variant)
        - cldr-relative path to src of exemplar data
        - tuple of the exemplar chars
    """
    # Ported to Python 3: print statements -> print(); filter() now returns
    # an iterator, so results are wrapped in list() where length/reuse matters.
    script_map = collections.defaultdict(dict)
    for directory in ['common', 'seed', 'exemplars']:
        data_dir = path.join(directory, 'main')
        for filename in os.listdir(path.join(CLDR_DIR, data_dir)):
            if not filename.endswith('.xml'):
                continue
            exemplar_list = cldr_data.get_exemplar_from_file(
                path.join(data_dir, filename))
            if not exemplar_list:
                if _VERBOSE:
                    print(' no exemplar list for %s' % path.join(
                        data_dir, filename))
                continue
            lsrv = cldr_data.loc_tag_to_lsrv(filename[:-4])
            if not lsrv:
                if _VERBOSE:
                    print(' no lsrv for %s' % path.join(data_dir, filename))
                continue
            src = path.join(directory, filename)
            script = lsrv[1]
            if not script:
                if _VERBOSE:
                    print(' no script for %s' % path.join(data_dir, filename))
                continue
            loc_tag = cldr_data.lsrv_to_loc_tag(lsrv)
            loc_to_exemplar_info = script_map[script]
            if loc_tag in loc_to_exemplar_info:
                if _VERBOSE:
                    print('skipping %s, already have exemplars for %s from %s' % (
                        src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
                continue

            # fix exemplars that look incorrect
            if script == 'Arab' and 'd' in exemplar_list:
                if _VERBOSE:
                    print('found \'d\' in %s for %s' % (src, lsrv))
                no_latin = True
            else:
                no_latin = False

            # exclude exemplar strings, and restrict to letters and digits
            def accept_cp(cp):
                if len(cp) != 1:
                    return False
                cat = unicode_data.category(cp)
                if cat[0] != 'L' and cat != 'Nd':
                    return False
                if no_latin and cp in 'df':
                    return False
                return True

            filtered_exemplar_list = list(filter(accept_cp, exemplar_list))

            # some exemplar lists don't surround strings with curly braces, and
            # end up with duplicate characters.  Flag these
            exemplar_chars = set()
            dup_chars = set()
            fixed_exemplar_list = []
            for cp in filtered_exemplar_list:
                if cp in exemplar_chars:
                    dup_chars.add(cp)
                else:
                    exemplar_chars.add(cp)
                    fixed_exemplar_list.append(cp)
            if len(dup_chars) > 0 and _VERBOSE:
                # \u200e (LRM) keeps RTL exemplars from scrambling the log line.
                print('duplicate exemplars in %s: %s' % (src, ', '.join([
                    '\u200e%s\u200e (%x)' % (cp, ord(cp)) for cp in dup_chars
                ])))
            loc_to_exemplar_info[loc_tag] = (lsrv, src,
                                             tuple(fixed_exemplar_list))

    # supplement with extra locale data
    for loc_tag in extra_locale_data.EXEMPLARS:
        exemplar_list = cldr_data.get_exemplar_from_extra_data(loc_tag)
        lang, script = loc_tag.split('-')
        lsrv = (lang, script, None, None)
        loc_to_exemplar_info = script_map[script]
        src = '[extra locale data]/%s' % loc_tag
        if loc_tag in loc_to_exemplar_info:
            if _VERBOSE:
                print('skipping %s, already have exemplars for %s from %s' % (
                    src, loc_tag, loc_to_exemplar_info[loc_tag][1]))
            continue

        # restrict to letters, except for zsym
        def accept_cp(cp):
            cat = unicode_data.category(cp)
            return cat[0] == 'L' or cat == 'Nd'

        if 'Zsym' not in loc_tag:
            # list() is required: len() below would fail on a filter iterator.
            filtered_exemplar_list = list(filter(accept_cp, exemplar_list))
            if len(filtered_exemplar_list) != len(exemplar_list) and _VERBOSE:
                print('filtered some characters from %s' % src)
        else:
            filtered_exemplar_list = exemplar_list
        loc_to_exemplar_info[loc_tag] = (lsrv, src,
                                         tuple(filtered_exemplar_list))

    return script_map