Exemplo n.º 1
0
def get_unicode_properties():
    """
    Build dict of unicode properties keyed by lower-cased property name.

    Parses LOCAL_PROPS_FILE and then LOCAL_DERIVED_CORE_PROPS_FILE. Each
    parsed line yields a character range and a property name; every code
    point in the (inclusive) range is converted with wide_unichr and
    appended to that property's list.

    Each file is opened in a ``with`` block so its handle is closed as
    soon as parsing finishes, even if parsing raises.

    Returns a plain dict mapping property name -> list of characters.
    """
    props = defaultdict(list)

    # Both files share the same line format, so one loop handles them in
    # the original order: base properties first, derived core second.
    for path in (LOCAL_PROPS_FILE, LOCAL_DERIVED_CORE_PROPS_FILE):
        with open(path) as props_file:
            for char_range, prop in parse_file(props_file):
                char_range = parse_char_range(char_range)

                if len(char_range) == 2:
                    # Two entries mean start/end; the end is inclusive.
                    for i in xrange(char_range[0], char_range[1] + 1):
                        props[prop.lower()].append(wide_unichr(i))
                elif char_range:
                    # Any other non-empty result: use the first entry only.
                    props[prop.lower()].append(wide_unichr(char_range[0]))

    return dict(props)
Exemplo n.º 2
0
def get_unicode_properties():
    '''
    Build dict of unicode properties keyed by lower-cased property name.

    Parses LOCAL_PROPS_FILE and then LOCAL_DERIVED_CORE_PROPS_FILE. Each
    parsed line yields a character range and a property name; every code
    point in the (inclusive) range is converted with wide_unichr and
    appended to that property's list.

    Each file is opened in a ``with`` block so its handle is closed as
    soon as parsing finishes, even if parsing raises.

    Returns a plain dict mapping property name -> list of characters.
    '''
    props = defaultdict(list)

    # Both files share the same line format, so one loop handles them in
    # the original order: base properties first, derived core second.
    for path in (LOCAL_PROPS_FILE, LOCAL_DERIVED_CORE_PROPS_FILE):
        with open(path) as props_file:
            for char_range, prop in parse_file(props_file):
                char_range = parse_char_range(char_range)

                if len(char_range) == 2:
                    # Two entries mean start/end; the end is inclusive.
                    for i in xrange(char_range[0], char_range[1] + 1):
                        props[prop.lower()].append(wide_unichr(i))
                elif char_range:
                    # Any other non-empty result: use the first entry only.
                    props[prop.lower()].append(wide_unichr(char_range[0]))

    return dict(props)
Exemplo n.º 3
0
def get_unicode_blocks():
    """
    Build dict of unicode blocks keyed by lower-cased block name.

    Parses LOCAL_BLOCKS_FILE. Each parsed line yields a character range
    and a block name; every code point in the (inclusive) range is
    converted with wide_unichr and appended to that block's list.

    The file is opened in a ``with`` block so its handle is closed as
    soon as parsing finishes, even if parsing raises.

    Returns a plain dict mapping block name -> list of characters.
    """
    blocks = defaultdict(list)

    with open(LOCAL_BLOCKS_FILE) as blocks_file:
        for char_range, block in parse_file(blocks_file):
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                # Two entries mean start/end; the end is inclusive.
                for i in xrange(char_range[0], char_range[1] + 1):
                    blocks[block.lower()].append(wide_unichr(i))
            elif char_range:
                # Any other non-empty result: use the first entry only.
                blocks[block.lower()].append(wide_unichr(char_range[0]))

    return dict(blocks)
Exemplo n.º 4
0
def get_unicode_blocks():
    '''
    Build dict of unicode blocks keyed by lower-cased block name.

    Parses LOCAL_BLOCKS_FILE. Each parsed line yields a character range
    and a block name; every code point in the (inclusive) range is
    converted with wide_unichr and appended to that block's list.

    The file is opened in a ``with`` block so its handle is closed as
    soon as parsing finishes, even if parsing raises.

    Returns a plain dict mapping block name -> list of characters.
    '''
    blocks = defaultdict(list)

    with open(LOCAL_BLOCKS_FILE) as blocks_file:
        for char_range, block in parse_file(blocks_file):
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                # Two entries mean start/end; the end is inclusive.
                for i in xrange(char_range[0], char_range[1] + 1):
                    blocks[block.lower()].append(wide_unichr(i))
            elif char_range:
                # Any other non-empty result: use the first entry only.
                blocks[block.lower()].append(wide_unichr(char_range[0]))

    return dict(blocks)
Exemplo n.º 5
0
def get_word_break_properties():
    """
    Build dict of word break properties keyed by property name.

    Parses LOCAL_WORD_BREAKS_FILE. Each parsed line yields a character
    range and a property name; every code point in the (inclusive) range
    is converted with wide_unichr and appended to that property's list.
    The property name is used as the key exactly as parsed (it is not
    lower-cased).

    The file is opened in a ``with`` block so its handle is closed as
    soon as parsing finishes, even if parsing raises.

    Returns a plain dict mapping property name -> list of characters.
    """
    props = defaultdict(list)

    with open(LOCAL_WORD_BREAKS_FILE) as props_file:
        for char_range, prop in parse_file(props_file):
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                # Two entries mean start/end; the end is inclusive.
                for i in xrange(char_range[0], char_range[1] + 1):
                    props[prop].append(wide_unichr(i))
            elif char_range:
                # Any other non-empty result: use the first entry only.
                props[prop].append(wide_unichr(char_range[0]))

    return dict(props)
Exemplo n.º 6
0
def get_word_break_properties():
    '''
    Build dict of word break properties keyed by property name.

    Parses LOCAL_WORD_BREAKS_FILE. Each parsed line yields a character
    range and a property name; every code point in the (inclusive) range
    is converted with wide_unichr and appended to that property's list.
    The property name is used as the key exactly as parsed (it is not
    lower-cased).

    The file is opened in a ``with`` block so its handle is closed as
    soon as parsing finishes, even if parsing raises.

    Returns a plain dict mapping property name -> list of characters.
    '''
    props = defaultdict(list)

    with open(LOCAL_WORD_BREAKS_FILE) as props_file:
        for char_range, prop in parse_file(props_file):
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                # Two entries mean start/end; the end is inclusive.
                for i in xrange(char_range[0], char_range[1] + 1):
                    props[prop].append(wide_unichr(i))
            elif char_range:
                # Any other non-empty result: use the first entry only.
                props[prop].append(wide_unichr(char_range[0]))

    return dict(props)
Exemplo n.º 7
0
def get_unicode_combining_classes():
    """
    Group the characters from the parsed unicode data by combining class.

    Each row's ``combining`` field is converted to an int and used as the
    key; the value is the list of characters obtained by passing the
    row's ``code`` through unicode_to_integer and wide_unichr, e.g.

    {
        0: [<char>, <char>, <char>, ...]
    }

    Returns a plain dict (not a defaultdict).
    """
    chars_by_class = defaultdict(list)
    for record in parse_unicode_data():
        bucket = chars_by_class[int(record.combining)]
        bucket.append(wide_unichr(unicode_to_integer(record.code)))
    return dict(chars_by_class)
Exemplo n.º 8
0
def get_unicode_categories():
    """
    Group the characters from the parsed unicode data by category.

    Each row's ``category`` field is the key; the value is the list of
    characters obtained by passing the row's ``code`` through
    unicode_to_integer and wide_unichr, e.g.

    {
        'Lu': ['A', 'B', 'C', ...],
        'Ll': ['a', 'b', 'c', ...]
    }

    Returns a plain dict (not a defaultdict).
    """
    chars_by_category = defaultdict(list)
    for record in parse_unicode_data():
        bucket = chars_by_category[record.category]
        bucket.append(wide_unichr(unicode_to_integer(record.code)))
    return dict(chars_by_category)
Exemplo n.º 9
0
def get_unicode_combining_classes():
    '''
    Group the characters from the parsed unicode data by combining class.

    Each row's ``combining`` field is converted to an int and used as the
    key; the value is the list of characters obtained by passing the
    row's ``code`` through unicode_to_integer and wide_unichr, e.g.

    {
        0: [<char>, <char>, <char>, ...]
    }

    Returns a plain dict (not a defaultdict).
    '''
    chars_by_class = defaultdict(list)
    for record in parse_unicode_data():
        bucket = chars_by_class[int(record.combining)]
        bucket.append(wide_unichr(unicode_to_integer(record.code)))
    return dict(chars_by_class)
Exemplo n.º 10
0
def get_unicode_categories():
    '''
    Group the characters from the parsed unicode data by category.

    Each row's ``category`` field is the key; the value is the list of
    characters obtained by passing the row's ``code`` through
    unicode_to_integer and wide_unichr, e.g.

    {
        'Lu': ['A', 'B', 'C', ...],
        'Ll': ['a', 'b', 'c', ...]
    }

    Returns a plain dict (not a defaultdict).
    '''
    chars_by_category = defaultdict(list)
    for record in parse_unicode_data():
        bucket = chars_by_category[record.category]
        bucket.append(wide_unichr(unicode_to_integer(record.code)))
    return dict(chars_by_category)
Exemplo n.º 11
0
def init_unicode_categories():
    """
    Populate the module-level unicode lookup tables.

    Fills, in order: categories, combining classes, general categories,
    scripts, script ids, blocks, properties, property aliases, word
    breaks, and finally the property value aliases (plus category
    aliases for the general category property). All tables are updated
    in place except ``unicode_scripts``, which is rebound to a plain dict.
    """
    global unicode_categories, unicode_general_categories, unicode_category_aliases
    global unicode_blocks, unicode_combining_classes, unicode_properties
    global unicode_property_aliases, unicode_property_value_aliases
    global unicode_scripts, unicode_script_ids, unicode_word_breaks

    unicode_categories.update(get_unicode_categories())
    unicode_combining_classes.update(get_unicode_combining_classes())

    # General categories are keyed by the first character of each
    # category key and accumulate that category's characters.
    for category, chars in unicode_categories.items():
        unicode_general_categories[category[0]].extend(chars)

    # script_chars is indexed by position; entries with a falsy script
    # are skipped.
    script_chars = get_chars_by_script()
    for codepoint, script in enumerate(script_chars):
        if not script:
            continue
        unicode_scripts[script.lower()].append(wide_unichr(codepoint))

    # Rebind the module-level name to a plain dict copy.
    unicode_scripts = dict(unicode_scripts)

    unicode_script_ids.update(build_master_scripts_list(script_chars))

    unicode_blocks.update(get_unicode_blocks())
    unicode_properties.update(get_unicode_properties())
    # Must be populated before the alias loop below, which reads it.
    unicode_property_aliases.update(get_property_aliases())

    unicode_word_breaks.update(get_word_break_properties())

    for alias, values in get_property_value_aliases().iteritems():
        # Resolve the property alias, falling back to the key itself.
        prop_name = unicode_property_aliases.get(alias, alias)
        if prop_name == GENERAL_CATEGORY_PROP:
            for value_alias, target in values.iteritems():
                lowered = value_alias.lower()
                unicode_category_aliases[lowered] = target
                # Also register the spelling without underscores.
                if "_" in lowered:
                    unicode_category_aliases[lowered.replace("_", "")] = target

        unicode_property_value_aliases[prop_name] = values
Exemplo n.º 12
0
def init_unicode_categories():
    '''
    Populate the module-level unicode lookup tables.

    Fills, in order: categories, combining classes, general categories,
    scripts, script ids, blocks, properties, property aliases, word
    breaks, and finally the property value aliases (plus category
    aliases for the general category property). All tables are updated
    in place except ``unicode_scripts``, which is rebound to a plain dict.
    '''
    global unicode_categories, unicode_general_categories, unicode_category_aliases
    global unicode_blocks, unicode_combining_classes, unicode_properties
    global unicode_property_aliases, unicode_property_value_aliases
    global unicode_scripts, unicode_script_ids, unicode_word_breaks

    unicode_categories.update(get_unicode_categories())
    unicode_combining_classes.update(get_unicode_combining_classes())

    # General categories are keyed by the first character of each
    # category key and accumulate that category's characters.
    for category, chars in unicode_categories.items():
        unicode_general_categories[category[0]].extend(chars)

    # script_chars is indexed by position; entries with a falsy script
    # are skipped.
    script_chars = get_chars_by_script()
    for codepoint, script in enumerate(script_chars):
        if not script:
            continue
        unicode_scripts[script.lower()].append(wide_unichr(codepoint))

    # Rebind the module-level name to a plain dict copy.
    unicode_scripts = dict(unicode_scripts)

    unicode_script_ids.update(build_master_scripts_list(script_chars))

    unicode_blocks.update(get_unicode_blocks())
    unicode_properties.update(get_unicode_properties())
    # Must be populated before the alias loop below, which reads it.
    unicode_property_aliases.update(get_property_aliases())

    unicode_word_breaks.update(get_word_break_properties())

    for alias, values in get_property_value_aliases().iteritems():
        # Resolve the property alias, falling back to the key itself.
        prop_name = unicode_property_aliases.get(alias, alias)
        if prop_name == GENERAL_CATEGORY_PROP:
            for value_alias, target in values.iteritems():
                lowered = value_alias.lower()
                unicode_category_aliases[lowered] = target
                # Also register the spelling without underscores.
                if '_' in lowered:
                    unicode_category_aliases[lowered.replace('_', '')] = target

        unicode_property_value_aliases[prop_name] = values
Exemplo n.º 13
0
def format_regex_char(i):
    """
    Return the escaped form of code point ``i`` for use in a regex.

    Converts ``i`` with wide_unichr, encodes the result with the
    ``unicode-escape`` codec, and passes that through replace_regex_chars.
    """
    escaped = wide_unichr(i).encode("unicode-escape")
    return replace_regex_chars(escaped)
Exemplo n.º 14
0
def format_regex_char(i):
    '''
    Return the escaped form of code point ``i`` for use in a regex.

    Converts ``i`` with wide_unichr, encodes the result with the
    ``unicode-escape`` codec, and passes that through replace_regex_chars.
    '''
    escaped = wide_unichr(i).encode('unicode-escape')
    return replace_regex_chars(escaped)