def _add_font_file(fonts, path, url):
    """Read one font file and fold its data into the *fonts* table.

    *fonts* maps a font name to a ``(name, chars, vendor, url)`` tuple and is
    modified in place. Read errors are reported on stdout and the file is
    skipped, so one bad font does not abort the whole scan.
    """
    print('Reading font data: %s' % path)
    try:
        font_data = get_font_file_data(path)
    except Exception as e:
        # Deliberate best-effort: report and move on to the next font.
        print('Error: %s' % e)
        return
    if font_data is None:
        print('Skipping this font because reasons.')
        return
    name = font_data[0]
    if name is None:
        print('Error: Font has no name.')
        return
    if name not in fonts:
        fonts[name] = (name, font_data[1], font_data[2], url)
        return
    known = fonts[name]
    # update() mutates in place and returns None, so keep a reference to the
    # container itself instead of storing the call's return value.
    merged_chars = known[1]
    merged_chars.update(font_data[1])
    # A vendor that is already recorded wins; otherwise adopt the new one.
    merged_vendor = font_data[2] if known[2] is None else known[2]
    # The URL of the most recently read file replaces any earlier one.
    fonts[name] = (name, merged_chars, merged_vendor, url)


def get_font_data():
    """Collect font data from local metadata files and acquisition plugins.

    Files under ``charset_path('font-metadata')`` are read first (with no
    URL), then every font listed by each plugin under
    ``charset_path('acquisition', 'fonts')``. Fonts that share a name are
    merged into a single entry.

    Returns:
        A list of ``(name, chars, vendor, url)`` tuples sorted
        case-insensitively by name. ``chars`` is whatever
        ``get_font_file_data`` returns at index 1; it must support
        ``update()`` (presumably a set or dict -- confirm against that
        helper).
    """
    fonts = {}
    for font_path in ls(charset_path('font-metadata')):
        _add_font_file(fonts, font_path, None)
    for modfile in ls(charset_path('acquisition', 'fonts')):
        mod = load_plugin(modfile)
        if mod is None:
            continue
        for _name, font_path, url in mod.list_fonts():
            _add_font_file(fonts, font_path, url)
    return sorted(fonts.values(), key=lambda font: font[0].lower())
def main():
    """Generate the MAPPINGS and PUADATA output trees and index each one.

    For each source directory, every path listed there is passed to
    ``generate`` together with the output directory; successful results are
    reported on stdout, then ``create_indices`` runs on the output directory.
    """
    jobs = (
        ('mappings', 'MAPPINGS', 'mappings/%s -> %s'),
        ('puadata', 'PUADATA', 'puadata/%s -> %s'),
    )
    for source_name, output_name, report in jobs:
        destination = charset_path('out', output_name)
        with cd(charset_path(source_name)):
            for path in ls('.'):
                produced = generate(path, destination)
                if produced is None:
                    continue
                # path[2:] drops the first two characters of the listed path
                # (presumably a leading './' -- confirm against ls()).
                print(report % (path[2:], produced))
        create_indices(destination)
def main():
    """Write the shared JavaScript data files: ucd.js, pua.js, entitydb.js.

    Each file holds a single assignment of compact JSON to a global
    (``UCD``, ``PUA``, ``ENTITYDB``). The output directory is created if it
    does not exist yet.
    """
    out_dir = charset_path('out', 'shared')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    compact = (',', ':')

    # Unicode character data.
    ranges, chars = get_unidata()
    ucd = {'ranges': ranges, 'chars': chars}
    path = os.path.join(out_dir, 'ucd.js')
    print('Writing Unicode data: %s' % path)
    with open(path, 'w') as f:
        f.write('UCD=%s;' % json.dumps(ucd, separators=compact))

    # Private Use Area agreements, keyed by agreement name.
    pua = {}
    for meta, chars in get_puadata():
        # Agreements explicitly marked Please-Ignore are left out.
        if meta.get('Agreement-Type') == 'Please-Ignore':
            continue
        meta['chars'] = chars
        pua[meta['Agreement-Name']] = meta
    path = os.path.join(out_dir, 'pua.js')
    print('Writing Private Use Area data: %s' % path)
    with open(path, 'w') as f:
        f.write('PUA=%s;' % json.dumps(pua, separators=compact))

    # Named character entities.
    entities = get_entities()
    path = os.path.join(out_dir, 'entitydb.js')
    print('Writing named character entity data: %s' % path)
    with open(path, 'w') as f:
        f.write('ENTITYDB=%s;' % json.dumps(entities, separators=compact))
def main():
    """Print ``name -> path`` for every file listed by the unidata plugins."""
    for modfile in ls(charset_path('acquisition', 'unidata')):
        mod = load_plugin(modfile)
        if mod is None:
            # Not a loadable plugin; nothing to list.
            continue
        for name, file_path in mod.list_files():
            print('%s -> %s' % (name, file_path))
def get_puadata():
    """Yield ``(meta, chars, blocks)`` for each sources.txt under puadata.

    * ``meta``   -- dict of ``key: value`` header fields read from
      sources.txt, up to the first empty line.
    * ``chars``  -- dict mapping a code point (int) to the semicolon-split
      fields of its line in the sibling unicodedata.txt.
    * ``blocks`` -- sorted list of ``(start, stop, name)`` tuples read from
      the sibling blocks.txt.

    The generator yields while the working directory is still switched to
    the puadata directory.
    """
    with cd(charset_path('puadata')):
        for source in ls('.'):
            if os.path.basename(source) != 'sources.txt':
                continue
            folder = os.path.dirname(source)

            meta = {}
            for line in expand(source):
                if not line:
                    # The header section ends at the first empty line.
                    break
                pair = strip_comment(line).split(':', 2)
                # NOTE(review): with maxsplit=2 a value that itself contains
                # ':' yields three parts and is ignored -- confirm intent.
                if len(pair) == 2:
                    meta[pair[0].strip()] = pair[1].strip()

            chars = {}
            for line in expand(os.path.join(folder, 'unicodedata.txt')):
                record = line.split(';')
                try:
                    chars[int(record[0], 16)] = record
                except ValueError:
                    # First field is not a hexadecimal code point.
                    continue

            blocks = []
            for line in expand(os.path.join(folder, 'blocks.txt')):
                parts = line.split(';')
                if len(parts) != 2:
                    continue
                block_name = parts[1].strip()
                bounds = parts[0].split('..')
                if len(bounds) != 2:
                    continue
                try:
                    blocks.append(
                        (int(bounds[0], 16), int(bounds[1], 16), block_name))
                except ValueError:
                    continue
            blocks.sort()

            yield meta, chars, blocks
def main():
    """Print ``name (url) -> path`` for every font listed by the plugins."""
    for modfile in ls(charset_path('acquisition', 'fonts')):
        mod = load_plugin(modfile)
        if mod is None:
            # Not a loadable plugin; nothing to list.
            continue
        for name, font_path, url in mod.list_fonts():
            print('%s (%s) -> %s' % (name, url, font_path))
def main():
    """Write one mapping file per row of the ISO 646 table.

    Each row supplies the arguments for ``iso646_encoding`` plus the output
    file name; the lines it produces are written under
    ``mappings/public/iso646``.
    """
    for row in __iso646_table:
        cc, ir, cs, cp, mib, m, filename = row
        target = charset_path('mappings', 'public', 'iso646', filename)
        with open(target, 'w') as f:
            lines = iso646_encoding(
                sorted(cc), sorted(ir), cs, sorted(cp), sorted(mib), m)
            for line in lines:
                print(line, file=f)
def main():
    """Print ``cp: entity`` for every entity listed by the plugins."""
    for modfile in ls(charset_path('acquisition', 'entities')):
        mod = load_plugin(modfile)
        if mod is None:
            # Not a loadable plugin; nothing to list.
            continue
        for cp, entity in mod.list_entities():
            print('%s: %s' % (cp, entity))
def get_assertions():
    """Build the assertion table from the files under identifiers.

    Lines recognised by ``is_atline`` declare column headers; a ``...`` token
    ends the header list and marks the last header as repeating for any
    further columns. Every other line is a data row: each value that is not
    ``--`` becomes a lowercase ``(header, value)`` pair.

    Returns:
        A dict mapping each ``(header, value)`` pair to a list that
        accumulates all pairs from every row in which that pair appears
        (including the pair itself).
    """
    assertions = {}
    with cd(charset_path('identifiers')):
        for path in ls('.'):
            headers = []
            open_ended = False
            for line in expand(path):
                if is_atline(line):
                    # A new header line replaces the previous column layout.
                    headers = []
                    open_ended = False
                    for token in strip_comment(line).split():
                        if token == '...':
                            open_ended = True
                            break
                        bare = token[1:] if token[0] == '@' else token
                        headers.append(bare.lower())
                    continue

                row = []
                for i, value in enumerate(strip_comment(line).split()):
                    in_declared_column = i < len(headers)
                    if not (open_ended or in_declared_column):
                        continue
                    if value == '--':
                        # '--' marks a column with no value in this row.
                        continue
                    header = headers[i] if in_declared_column else headers[-1]
                    row.append((header, value.lower()))
                for pair in row:
                    assertions.setdefault(pair, []).extend(row)
    return assertions
def get_unidata():
    """Read UnicodeData.txt from every unidata acquisition plugin.

    Returns:
        ``(ranges, chars)`` where ``chars`` maps a code point (int) to the
        semicolon-split fields of its line, and ``ranges`` maps a range name
        to ``[first_cp, last_cp, first_fields, last_fields]``, built from the
        paired ``<Name, First>`` / ``<Name, Last>`` lines.
    """
    ranges = {}
    chars = {}
    for modfile in ls(charset_path('acquisition', 'unidata')):
        mod = load_plugin(modfile)
        if mod is None:
            continue
        print('Reading Unicode data: %s' % modfile)
        for name, file_path in mod.list_files():
            if name != 'UnicodeData.txt':
                continue
            with open(file_path, 'r') as ucd:
                for line in ucd:
                    fields = line.strip().split(';')
                    try:
                        cp = int(fields[0], 16)
                    except ValueError:
                        # First field is not a hexadecimal code point.
                        continue
                    label = fields[1]
                    if not (label.startswith('<') and label.endswith('st>')):
                        chars[cp] = fields
                        continue
                    # Range marker: '<Name, First>' or '<Name, Last>'.
                    parts = label[1:-1].split(', ')
                    key = parts[0]
                    if key not in ranges:
                        # Seed both ends from whichever marker is seen first.
                        ranges[key] = [cp, cp, fields, fields]
                    elif parts[1] == 'First':
                        ranges[key][0] = cp
                        ranges[key][2] = fields
                    elif parts[1] == 'Last':
                        ranges[key][1] = cp
                        ranges[key][3] = fields
    return ranges, chars
def main():
    """Print every vendor record from the vendor plugins.

    Each record is printed as ``key: value`` lines in sorted key order,
    followed by an empty line.
    """
    for modfile in ls(charset_path('acquisition', 'vendors')):
        mod = load_plugin(modfile)
        if mod is None:
            # Not a loadable plugin; nothing to list.
            continue
        for vendor in mod.list_vendors():
            for key in sorted(vendor.keys()):
                print('%s: %s' % (key, vendor[key]))
            # Blank line between vendor records.
            print()
def main():
    """Run ``verify`` on every mapping and print PASSED or FAILED.

    Paths for which ``verify`` returns None are not reported at all; a
    non-empty result is printed as the failure detail.
    """
    with cd(charset_path('mappings')):
        for path in ls('.'):
            result = verify(path)
            if result is None:
                continue
            if len(result) > 0:
                print('mappings/%s: FAILED:\n%r' % (path[2:], result))
                continue
            print('mappings/%s: PASSED' % path[2:])
def get_entities():
    """Collect named character entities from the entity plugins.

    Returns:
        A dict keyed by ``cp`` as yielded by each plugin's
        ``list_entities()``. When several entities share a key, the first one
        encountered is kept.
    """
    entities = {}
    for modfile in ls(charset_path('acquisition', 'entities')):
        mod = load_plugin(modfile)
        if mod is None:
            continue
        for cp, entity in mod.list_entities():
            # setdefault leaves an existing entry alone: first one wins.
            entities.setdefault(cp, entity)
    return entities
def main():
    """Print the ``ls`` listing of each command-line argument.

    An argument starting with ``@`` is resolved through ``charset_path``
    (with the ``@`` removed); any other argument is listed as given. With no
    arguments, the current directory is listed.
    """
    for arg in sys.argv[1:] or ['.']:
        # startswith() is safe on an empty-string argument, where indexing
        # arg[0] would raise IndexError.
        target = charset_path(arg[1:]) if arg.startswith('@') else arg
        for f in ls(target):
            print(f)
def main():
    """Check every mapping against the identifier assertions and report.

    Paths for which ``verify`` returns None are not reported. Each error is
    a 4-item tuple that fills the 'Encoding with ... must have ...' message.
    """
    assertions = get_assertions()
    with cd(charset_path('mappings')):
        for path in ls('.'):
            errors = verify(path, assertions)
            if errors is None:
                continue
            if len(errors) > 0:
                print('mappings/%s: FAILED:' % path[2:])
                for problem in errors:
                    print('Encoding with %s %s must have %s %s.' % problem)
                continue
            print('mappings/%s: PASSED' % path[2:])
def get_puadata():
    """Yield ``(meta, chars)`` for each sources.txt under puadata.

    * ``meta``  -- dict of ``key: value`` header fields read from
      sources.txt, up to the first empty line.
    * ``chars`` -- dict mapping a code point (int) to the semicolon-split
      fields of its line in the sibling unicodedata.txt.

    The generator yields while the working directory is still switched to
    the puadata directory.
    """
    with cd(charset_path('puadata')):
        for source in ls('.'):
            if os.path.basename(source) != 'sources.txt':
                continue
            print('Reading Private Use Area data: %s' % source)

            meta = {}
            for line in expand(source):
                if not line:
                    # The header section ends at the first empty line.
                    break
                pair = strip_comment(line).split(':', 2)
                # NOTE(review): with maxsplit=2 a value that itself contains
                # ':' yields three parts and is ignored -- confirm intent.
                if len(pair) == 2:
                    meta[pair[0].strip()] = pair[1].strip()

            chars = {}
            data_file = os.path.join(os.path.dirname(source),
                                     'unicodedata.txt')
            for line in expand(data_file):
                record = line.split(';')
                try:
                    chars[int(record[0], 16)] = record
                except ValueError:
                    # First field is not a hexadecimal code point.
                    continue

            yield meta, chars
def main():
    """Build every encoding, then write the index page and PHP redirector.

    Steps:
      1. Read each mapping, skip those without a display name or name, call
         ``build_encoding`` for the rest and record them in lookup tables
         (by category, charset, MIBenum, code page, CFStringEncoding,
         NSStringEncoding, name and KTE file name).
      2. Write ``out/encoding/index.shtml`` listing the encodings by each of
         those tables.
      3. Write ``out/encoding.php``, which redirects ``?file=`` and
         ``?name=`` requests to the matching encoding page.
    """
    categories = {}
    by_charset = {}
    by_mibenum = {}
    by_codepage = {}
    by_cfstrenc = {}
    by_nsstrenc = {}
    by_name = {}
    by_kte = {}
    # Metadata keys whose values are indexed as integers.
    numeric_indices = (
        ('mibenum', by_mibenum),
        ('codepage', by_codepage),
        ('cfstringencoding', by_cfstrenc),
        ('nsstringencoding', by_nsstrenc),
    )

    ranges, chars = get_unidata()
    fonts = get_font_data()
    with cd(charset_path('mappings')):
        for path in ls('.'):
            meta, root = read_encoding(path)
            if meta['display'] is None or meta['name'] is None:
                print('Skipping: Is a fragment or encoding has no name.')
                continue
            basedir = charset_path('out', 'encoding', meta['name'])
            build_encoding(ranges, chars, fonts, meta, root, basedir)

            categories.setdefault(meta['category'], []).append(meta)
            if 'charset' in meta:
                for cs in meta['charset']:
                    by_charset[cs] = meta
            for key, index in numeric_indices:
                if key not in meta:
                    continue
                for raw in meta[key]:
                    try:
                        index[int(raw)] = meta
                    except ValueError:
                        # Non-numeric identifiers are simply not indexed.
                        pass
            by_name[meta['name']] = meta
            for alias in meta['name_other']:
                by_name[alias] = meta
            if 'filename-kte' in meta:
                for kte_name in meta['filename-kte']:
                    by_kte[kte_name] = meta

    basedir = charset_path('out', 'encoding')
    if not os.path.exists(basedir):
        os.makedirs(basedir)
    path = os.path.join(basedir, 'index.shtml')
    print('Writing encoding index: %s' % path)
    with open(path, 'w') as f:
        def emit(text):
            print(text, file=f)

        def category_key(category):
            # Categories without ' - ' are keyed as if prefixed with it.
            if ' - ' in category:
                return nat_key(category)
            return nat_key(' - ' + category)

        table_open = '<div class="enclist-wrapper"><table class="enclist">'
        table_close = '</table></div>'

        emit('<!--#include virtual="/static/head.html"-->')
        emit('<title>Character Encodings - Legacy Encodings</title>')
        emit('<link rel="stylesheet" href="/charset/shared/enclist.css">')
        emit('<!--#include virtual="/static/body.html"-->')
        emit('<p class="breadcrumb"><a href="/charset/">Character Encodings</a> »</p>')
        emit('<h1>Legacy Encodings</h1>')

        for category in sorted(categories, key=category_key):
            emit('<h2>%s</h2>' % html_encode(category))
            emit(table_open)
            members = sorted(categories[category],
                             key=lambda m: nat_key(m['display']))
            for member in members:
                emit('<tr><td>%s</td></tr>' % encoding_link(member))
            emit(table_close)

        emit('<h2>By IANA Charset</h2>')
        emit(table_open)
        for cs in sorted(by_charset, key=nat_key):
            emit('<tr><td class="charset">%s</td><td>%s</td></tr>'
                 % (cs, encoding_link(by_charset[cs])))
        emit(table_close)

        # The integer-keyed sections differ only in heading, table and
        # number format.
        numeric_sections = (
            ('<h2>By IANA MIBenum</h2>', by_mibenum,
             '<tr><td>%d</td><td>%s</td></tr>'),
            ('<h2>By Code Page</h2>', by_codepage,
             '<tr><td>%03d</td><td>%s</td></tr>'),
            ('<h2>By CFStringEncoding</h2>', by_cfstrenc,
             '<tr><td>%d</td><td>%s</td></tr>'),
            ('<h2>By NSStringEncoding</h2>', by_nsstrenc,
             '<tr><td>%d</td><td>%s</td></tr>'),
        )
        for heading, index, row in numeric_sections:
            emit(heading)
            emit(table_open)
            for number in sorted(index):
                emit(row % (number, encoding_link(index[number])))
            emit(table_close)

        emit('<!--#include virtual="/static/tail.html"-->')

    path = charset_path('out', 'encoding.php')
    print('Writing encoding redirect: %s' % path)
    with open(path, 'w') as f:
        def emit(text):
            print(text, file=f)

        case_line = ("\t\tcase '%s': "
                     "header('Location: /charset/encoding/%s'); exit(0);")

        emit('<?php')
        emit("if (isset($_GET['file'])) {")
        emit("\t$file = $_GET['file'];")
        emit('\tswitch ($file) {')
        for k in sorted(by_kte):
            emit(case_line % (k, by_kte[k]['name']))
        emit('\t}')
        emit('}')
        emit("if (isset($_GET['name'])) {")
        emit("\t$name = preg_replace('/[^A-Za-z0-9]+/', '', $_GET['name']);")
        emit('\tswitch ($name) {')
        for k in sorted(by_name):
            emit(case_line % (k, by_name[k]['name']))
        emit('\t}')
        emit('}')
        # Fallback when neither parameter matched: go to the index.
        emit("header('Location: /charset/encoding/');")