Пример #1
0
    props += (2**53) * data.get('Punctuation_In_Word', 0)
    props += (2**54) * data.get('Optional_Space_After', 0)
    props += (2**55) * data.get('Extended_Dash', 0)
    props += (2**56) * data.get('Paragraph_Separator', 0)
    props += (2**57) * data.get('Ellipsis', 0)
    props += (2**58) * data.get('Semi_Colon', 0)
    props += (2**59) * data.get('Colon', 0)
    props += (2**60) * data.get('Comma', 0)
    props += (2**61) * data.get('Exclamation_Mark', 0)
    props += (2**62) * data.get('Question_Mark', 0)
    props += (2**63) * data.get('Full_Stop', 0)
    return props


if __name__ == '__main__':
    for codepoint in ucd.CodeRange('000000..10FFFF'):
        try:
            data = unicode_chars[codepoint]
        except KeyError:
            data = {'CodePoint': codepoint}
        script = data.get('Script', 'Zzzz')
        title = data.get('TitleCase', codepoint)
        upper = data.get('UpperCase', codepoint)
        lower = data.get('LowerCase', codepoint)
        if title == null: title = codepoint
        if upper == null: upper = codepoint
        if lower == null: lower = codepoint
        print(
            '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %016x' %
            (codepoint, script, data.get('GeneralCategory', 'Cn')[0],
             data.get(
Пример #2
0
# Lookup from code point to the 'GeneralCategory' value of the UnicodeData
# record that lists it.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    unicode_chars.update(
        (codepoint, data['GeneralCategory'])
        for codepoint in data['CodePoint'])

# With --with-csur on the command line, the data sets under data/csur are
# applied afterwards, so their entries overwrite any existing ones.
# NOTE(review): "csur" presumably refers to the ConScript Unicode Registry.
if '--with-csur' in sys.argv:
    for csur in ('Klingon',):
        for data in ucd.parse_ucd_data('data/csur', csur):
            unicode_chars.update(
                (codepoint, data['GeneralCategory'])
                for codepoint in data['CodePoint'])

# Coarse layout of the code space, combining information from the UnicodeData
# and Blocks data files so that fewer character tables have to be generated.
#
# Each entry is (code range, general category, description), listed in
# ascending order from 000000 to 10FFFF. A category of None is used for the
# 'Multiple Blocks' ranges, which have no single category for the whole range.
category_sets = [
    (ucd.CodeRange(code_range), general_category, description)
    for code_range, general_category, description in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00DFFF', 'Cs', 'Surrogates'),
        ('00E000..00F7FF', 'Co', 'Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Cn', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..0EFFFF', 'Cn', 'Unassigned'),
        ('0F0000..0FFFFD', 'Co', 'Plane 15 Private Use'),
        ('0FFFFE..0FFFFF', 'Cn', 'Plane 15 Private Use'),
        ('100000..10FFFD', 'Co', 'Plane 16 Private Use'),
        ('10FFFE..10FFFF', 'Cn', 'Plane 16 Private Use'),
    )
]

# These categories have many pages consisting of just this category:
#     Cn -- Unassigned
#     Lo -- CJK Ideographs
Пример #3
0
# Lookup from code point to the 'Script' value of the Scripts record whose
# 'Range' contains it.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    unicode_chars.update(
        (codepoint, data['Script']) for codepoint in data['Range'])

# With --with-csur on the command line, the data sets under data/csur are
# applied afterwards, so their entries overwrite any existing ones. These
# records are iterated through 'CodePoint' rather than 'Range'.
# NOTE(review): "csur" presumably refers to the ConScript Unicode Registry.
if '--with-csur' in sys.argv:
    for csur in ('Klingon',):
        for data in ucd.parse_ucd_data('data/csur', csur):
            unicode_chars.update(
                (codepoint, data['Script'])
                for codepoint in data['CodePoint'])

# Coarse layout of the code space, intended to reduce the number of character
# tables that need to be generated.
#
# Each entry is (code range, script, description), listed in ascending order
# from 000000 to 10FFFF. A script of None is used for the 'Multiple Blocks'
# ranges, which have no single script for the whole range; every other range
# is given the script 'Zzzz'.
script_sets = [
    (ucd.CodeRange(code_range), script_code, description)
    for code_range, script_code, description in (
        ('000000..00D7FF', None, 'Multiple Blocks'),
        ('00D800..00F7FF', 'Zzzz', 'Surrogates / Private Use Area'),
        ('00F800..02FAFF', None, 'Multiple Blocks'),
        ('02FB00..0DFFFF', 'Zzzz', 'Unassigned'),
        ('0E0000..0E01FF', None, 'Multiple Blocks'),
        ('0E0200..10FFFF', 'Zzzz', 'Unassigned'),
    )
]

# Scripts that have many pages consisting of just that script; currently
# none are listed.
special_scripts = list()

script_tables = {}
for codepoints, script, comment in script_sets:
    if not script:
        table = {}
        table_entry = None