Python digit示例，unicodedata.digit Python示例

示例#1

0

显示文件

文件： ArabicNumbers.py 项目： davejagoda/proggy

def print_unicode_entry(n):
    u = get_unicode_using_unicode_escape(n)
    try:
        print unicodedata.digit(u),
    except:
        return False
    print '{:4d} 0x{:3x}'.format(n, n), u.encode('utf8'), unicodedata.category(u), unicodedata.name(u)
    return True

示例#2

0

显示文件

文件： unicode_util.py 项目： kipanshi/vikiticket

def digit(unichr, default_value=None):
    """Return the digit value of the Unicode character ``unichr`` as an int.

    The argument is first coerced with ``unicode()``. When ``default_value``
    is not None it is returned for characters without a digit value;
    otherwise ``unicodedata.digit`` raises ValueError for such characters.
    """
    unichr = unicode(unichr)
    if default_value is None:
        # No fallback supplied: let the lookup raise for non-digits.
        return unicodedata.digit(unichr)
    return unicodedata.digit(unichr, default_value)

示例#3

0

显示文件

文件： unicode_util.py 项目： timgates42/python-phonenumbers

def digit(uni_char, default_value=None):
    """Return the digit value of the Unicode character ``uni_char`` as an int.

    The argument is first passed through ``unicod()`` to force it to Unicode.
    When ``default_value`` is not None it is returned for characters without
    a digit value; otherwise ValueError is raised for such characters.
    """
    uni_char = unicod(uni_char)  # Force to Unicode.
    # Only forward the fallback when the caller actually provided one.
    fallback = () if default_value is None else (default_value,)
    return unicodedata.digit(uni_char, *fallback)

示例#4

0

显示文件

文件：一.py 项目： davejagoda/proggy

def print_unicode_entry(n):
    u = get_unicode_using_unicode_escape(n)
    print '{:8d} {:8x}'.format(n, n),
    print u.encode('utf8'), unicodedata.category(u),
    try:
        print unicodedata.name(u),
    except:
        print 'unicodedata has no name defined',
    try:
        print unicodedata.digit(u)
    except:
        print 'unicodedata has no numeric value'

示例#5

0

显示文件

def is_number(s):
    """Return True if ``s`` can be interpreted as a number.

    Two checks are tried in order:

    1. ``float(s)`` succeeds (e.g. ``"1.5"``, ``"12"``, ``3``).
    2. ``s`` is a single character with a Unicode digit value
       (e.g. ``"²"``), as reported by ``unicodedata.digit``.

    Any input that fails both checks -- including non-string, non-numeric
    objects such as None -- yields False instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # TypeError covers inputs float() cannot accept at all (None, lists);
        # the digit check below already tolerated them, so stay consistent.
        pass
    else:
        return True

    try:
        # digit() maps a single digit character to its integer value.
        unicodedata.digit(s)
    except (TypeError, ValueError):
        return False
    return True

示例#6

0

显示文件

def is_int(x):
    """Return True if ``x`` converts with ``long()`` or is a digit character.

    ``long(x)`` is tried first; only when it raises ValueError is
    ``unicodedata.digit(x)`` consulted as a fallback.
    """
    try:
        long(x)
    except ValueError:
        pass
    else:
        return True

    # long() rejected the value: accept it only if it has a digit value.
    try:
        unicodedata.digit(x)
    except (ValueError, TypeError):
        return False
    return True

示例#7

0

显示文件

文件： test_fake_fastnumbers.py 项目： InSertCod3/natsort

def is_int(x):
    """Return True if ``x`` converts with ``int()`` or is a digit character.

    ``int(x)`` is tried first; only when it raises ValueError is
    ``unicodedata.digit(x)`` consulted as a fallback.
    """
    try:
        int(x)
    except ValueError:
        pass
    else:
        return True

    # int() rejected the value: accept it only if it has a digit value.
    try:
        unicodedata.digit(x)
    except (ValueError, TypeError):
        return False
    return True

示例#8

0

显示文件

文件： text_dataset.py 项目： NLCP/AlgoLabel

    def _parse_number(token):
        """Parse ``token`` into an int, accepting Unicode digit characters.

        ``int(token)`` is tried first. If that raises ValueError, every
        character is mapped through ``unicodedata.digit`` and the resulting
        decimal digits are parsed instead (so ``"²³"`` gives 23).

        Returns:
            The parsed integer, or None when the token is empty or contains
            a character without a digit value.
        """
        try:
            return int(token)
        except ValueError:
            pass

        if not token:
            # unicodedata.digit("") raises TypeError, which previously escaped
            # the ValueError handler; an empty token simply is not a number.
            return None

        try:
            return int("".join(str(unicodedata.digit(ch)) for ch in token))
        except ValueError:
            return None

示例#9

0

显示文件

文件： test_fake_fastnumbers.py 项目： vreuter/natsort

def is_int(x):
    """Return True if ``x`` represents an integer.

    Objects exposing ``is_integer()`` (floats) answer for themselves.
    Anything else is accepted when ``long(x)`` succeeds or, failing that with
    ValueError, when ``unicodedata.digit(x)`` finds a digit value.
    """
    try:
        return x.is_integer()
    except AttributeError:
        pass

    try:
        long(x)
    except ValueError:
        pass
    else:
        return True

    # Last resort: a single character carrying a Unicode digit value.
    try:
        unicodedata.digit(x)
    except (ValueError, TypeError):
        return False
    return True

示例#10

0

显示文件

文件： text_tool.py 项目： Hrissimir/hed_utils

def normalize(text,
              *,
              map_cmb=True,
              map_digits=True,
              map_whitespace=True,
              form="NFKD") -> str:  # pragma: no-cov
    """Normalize ``text`` and optionally simplify it.

    Args:
        text: The string to clean up.
        map_cmb: Remove combining characters left after normalization.
        map_digits: Replace every decimal digit (category ``Nd``) with its
            ASCII counterpart.
        map_whitespace: Turn tab and form feed into a space and delete
            carriage returns.
        form: Unicode normalization form handed to ``unicodedata.normalize``.

    Returns:
        The normalized and translated string.
    """
    text = unicodedata.normalize(form, text)

    # The translation tables are built only from the characters that occur in
    # the text. Scanning the whole Unicode range (over a million code points,
    # twice) on every call produced the same result at a much higher cost.
    if map_cmb:
        cmb_map = dict.fromkeys(
            ord(ch) for ch in set(text) if unicodedata.combining(ch))
        text = text.translate(cmb_map)

    if map_digits:
        digits_map = {
            ord(ch): ord("0") + unicodedata.digit(ch)
            for ch in set(text)
            if unicodedata.category(ch) == "Nd"
        }
        text = text.translate(digits_map)

    if map_whitespace:
        whitespace_map = {ord("\t"): " ", ord("\f"): " ", ord("\r"): None}
        text = text.translate(whitespace_map)

    return text

示例#11

0

显示文件

文件： clean_str.py 项目： Chiva-Zhao/pproject

def more():
    """Demonstrate mapping every Unicode decimal digit to ASCII.

    Prints the size of the translation table, then the Arabic-Indic digits
    one to three translated through it.
    """
    ascii_zero = ord('0')
    # Code points whose general category is "Nd" (decimal digit).
    decimal_points = (c for c in range(sys.maxunicode)
                      if unicodedata.category(chr(c)) == 'Nd')
    table = {c: ascii_zero + unicodedata.digit(chr(c)) for c in decimal_points}
    print(len(table))
    arabic_indic = '\u0661\u0662\u0663'
    print(arabic_indic.translate(table))

示例#12

0

显示文件

def try_to_read_signed_integer(iterable, val):
    """
    Yield the pieces of ``val`` with a trailing sign attached to the number
    that follows it in ``iterable``.

    If ``val`` does not end with +/-, it is yielded unchanged. Otherwise the
    next ``(isnum, value, isuni)`` element is consumed from ``iterable`` and
    the sign and number are yielded together as a ``[sign, number]`` list.
    """
    if not val.endswith(('+', '-')):
        yield val
        return

    upcoming = next(iterable, None)
    if upcoming is None:
        # Nothing follows the sign, so the text stays as it is.
        yield val
        return

    # The element after a sign is known to be numeric; only whether it is a
    # unicode digit still matters.
    _, number_text, is_unicode = upcoming

    if is_unicode:
        # Unicode digits never take a sign: emit the text untouched and the
        # converted digit after it.
        yield val
        yield unicodedata.digit(number_text)
    elif val in ('-', '+'):
        # The text is nothing but the sign.
        yield [val, number_text]
    else:
        # Split the sign off the text and pair it with the number.
        text, sign = val[:-1], val[-1]
        yield text
        yield [sign, number_text]

示例#13

0

显示文件

def translate_fun():
    """Demonstrate several ways of sanitizing text with ``str.translate``.

    Prints, in order: the sample string, the string with whitespace remapped,
    the same with combining characters stripped, the size of the digit table,
    translated Arabic-Indic digits, and the sample after ASCII and UTF-8
    encode/decode round trips.
    """
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)

    # Keys are the code points of plain ASCII control characters.
    whitespace_table = {ord('\t'): ' ', ord('\f'): ' ', ord('\r'): None}
    a = s.translate(whitespace_table)
    print(a)

    # Collect every combining character so translate() can delete them.
    combining_points = (c for c in range(sys.maxunicode)
                        if unicodedata.combining(chr(c)))
    cmb_chrs = dict.fromkeys(combining_points)
    # Decompose first so accents become separate combining characters.
    decomposed = unicodedata.normalize('NFD', a)
    print(decomposed.translate(cmb_chrs))

    # unicodedata.digit() gives the integer value of a digit character; only
    # characters of category "Nd" (decimal digits) are mapped.
    ascii_zero = ord('0')
    digitmap = {}
    for c in range(sys.maxunicode):
        ch = chr(c)
        if unicodedata.category(ch) == 'Nd':
            digitmap[c] = ascii_zero + unicodedata.digit(ch)
    print(len(digitmap))

    arabic_indic = '\u0661\u0662\u0663'
    print(arabic_indic.translate(digitmap))

    # Cleaning up through encoding and decoding instead.
    decomposed = unicodedata.normalize('NFD', s)
    for codec in ('ascii', 'utf-8'):
        print(decomposed.encode(codec, 'ignore').decode(codec))

示例#14

0

显示文件

def try_to_read_signed_integer(iterable, val):
    """
    Attach a trailing +/- in ``val`` to the number that follows it.

    When ``val`` ends with a sign, the next ``(isnum, value, isuni)`` element
    is pulled from ``iterable`` and yielded as a ``[sign, number]`` list,
    preceded by any text that came before the sign. In every other case
    ``val`` is yielded as-is.
    """
    ends_with_sign = val.endswith(('+', '-'))
    next_element = next(iterable, None) if ends_with_sign else None

    if next_element is None:
        # Either there is no trailing sign or the sign is the last element.
        yield val
        return

    # The element after a sign must be numeric; it may still be a unicode
    # digit, which is handled separately.
    next_val, next_isuni = next_element[1], next_element[2]

    if next_isuni:
        # Unicode digits are not signed: keep the text and convert the digit.
        yield val
        yield unicodedata.digit(next_val)
        return

    if val not in ('-', '+'):
        # Text precedes the sign: emit it without the sign first.
        yield val[:-1]
    yield [val[-1], next_val]

示例#15

0

显示文件

文件： p12_translate.py 项目： Barathrum-Liu/python3-cookbook

def translate_str():
    """Demonstrate cleaning up text with translation tables.

    Prints the sample string, the whitespace-remapped version, its NFD form
    with and without combining characters, the digit-table size, translated
    Arabic-Indic digits, the type of the normalized string and finally its
    ASCII-only rendering.
    """
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)

    remap = dict.fromkeys((ord('\t'), ord('\f')), ' ')
    remap[ord('\r')] = None  # Mapped to None, i.e. deleted
    a = s.translate(remap)
    print(a)

    # Strip combining characters (accents) after decomposing the text.
    cmb_chrs = {}
    for c in range(sys.maxunicode):
        if unicodedata.combining(chr(c)):
            cmb_chrs[c] = None
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(cmb_chrs))

    # Map every Unicode decimal digit onto its ASCII equivalent.
    zero = ord('0')
    nd_points = [c for c in range(sys.maxunicode)
                 if unicodedata.category(chr(c)) == 'Nd']
    digitmap = {c: zero + unicodedata.digit(chr(c)) for c in nd_points}
    print(len(digitmap))
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))

    # Alternative: normalize, then round-trip through an ASCII encoding.
    b = unicodedata.normalize('NFD', a)
    print(type(b))
    ascii_bytes = b.encode('ascii', 'ignore')
    print(ascii_bytes.decode('ascii'))

示例#16

0

显示文件

文件： core.py 项目： copyninja/chardetails

    def getdetails(self, text):
        """Collect Unicode properties for every character of ``text``.

        Returns:
            A dict mapping each character to a dict of its properties
            ('Name', 'HTML Entity', 'Code point', 'Alphabet', 'Digit',
            'AlphaNumeric', 'Canonical Decomposition' and, when defined for
            the character, 'Numeric Value', 'Decimal Value' and
            'Digit Value'). The extra key 'Characters' holds the characters
            of ``text`` as a list.

        Raises:
            ValueError: from ``unicodedata.name`` if a character has no name.
        """
        # Optional numeric properties: each lookup raises ValueError when the
        # character has no such value, in which case the key is left out.
        optional_lookups = (
            ('Numeric Value', unicodedata.numeric),
            ('Decimal Value', unicodedata.decimal),
            # Previously stored under 'Digit' via an undefined variable, so it
            # always failed silently -- and 'Digit' is overwritten below with
            # the isdigit() flag anyway. A separate key keeps both values.
            ('Digit Value', unicodedata.digit),
        )

        chardetails = {}
        for character in text:
            details = {
                'Name': unicodedata.name(character),
                'HTML Entity': str(ord(character)),
                'Code point': repr(character),
            }
            for key, lookup in optional_lookups:
                try:
                    details[key] = lookup(character)
                except ValueError:
                    pass
            details['Alphabet'] = str(character.isalpha())
            details['Digit'] = str(character.isdigit())
            details['AlphaNumeric'] = str(character.isalnum())
            details['Canonical Decomposition'] = \
                    unicodedata.decomposition(character)
            chardetails[character] = details

        chardetails['Characters'] = list(text)
        return chardetails

示例#17

0

显示文件

def translate_str():
    """Demonstrate text clean-up with ``str.translate`` and normalization.

    Prints the sample string, the whitespace-remapped version, its NFD form
    with and without combining characters, the digit-table size, translated
    Arabic-Indic digits and the ASCII-only rendering of the text.
    """
    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)

    # Build a translation table, then apply it with translate().
    remap = {ord(ws): ' ' for ws in '\t\f'}
    remap[ord('\r')] = None
    a = s.translate(remap)
    print(a)

    # Remove combining characters (accents).
    combining = [c for c in range(sys.maxunicode)
                 if unicodedata.combining(chr(c))]
    cmb_chrs = dict.fromkeys(combining)
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(cmb_chrs))

    # Map Unicode digit characters to ASCII digits.
    # unicodedata.category(chr) returns the general category of chr;
    # unicodedata.digit(chr) returns its digit value as an integer and raises
    # ValueError when none is defined and no default is given.
    base = ord('0')
    digitmap = {}
    for c in range(sys.maxunicode):
        ch = chr(c)
        if unicodedata.category(ch) != 'Nd':
            continue
        digitmap[c] = base + unicodedata.digit(ch)
    print(len(digitmap))
    # Arabic digits
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))

    # Normalize first, then round-trip through encode()/decode(). Encoding
    # to ASCII with errors ignored drops the combining characters, which only
    # helps when an ASCII rendering of the text is the goal.
    b = unicodedata.normalize('NFD', a)
    ascii_only = b.encode('ascii', 'ignore').decode('ascii')
    print(ascii_only)

示例#18

0

显示文件

文件： chap2-strings-text.py 项目： gaopinghuang0/learn-python

def test_translate():
    """Demonstrate text clean-up with ``str.translate``.

    Prints the sample string, the whitespace-remapped version, its NFD form
    with and without combining characters, the digit-table size and the
    Arabic-Indic digits one to three translated to ASCII.

    The body relies on Python 3 semantics (``chr`` over the full Unicode
    range, ``str.translate`` with a dict), so output uses the ``print()``
    function rather than the Python 2 print statement.
    """
    import sys
    import unicodedata

    s = 'pýtĥöñ\fis\tawesome\r\n'
    print(s)
    remap = {
        ord('\t'): ' ',
        ord('\f'): ' ',
        ord('\r'): None
    }
    a = s.translate(remap)
    print(a)

    # remove all combining characters
    cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                             if unicodedata.combining(chr(c)))
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(cmb_chrs))

    # maps all Unicode decimal digit to ASCII
    digitmap = {c: ord('0') + unicodedata.digit(chr(c))
                for c in range(sys.maxunicode)
                if unicodedata.category(chr(c)) == 'Nd'}
    print(len(digitmap))
    # Arabic digits
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))

示例#19

0

显示文件

def translate_str():
    """Demonstrate sanitizing text through translation tables.

    Prints the sample string, the whitespace-remapped version, its NFD form
    with and without combining characters, the digit-table size, translated
    Arabic-Indic digits, the type of the normalized string and its ASCII-only
    rendering.
    """
    sample = 'pýtĥöñ\fis\tawesome\r\n'
    print(sample)

    remap = {}
    for control, replacement in (('\t', ' '), ('\f', ' '), ('\r', None)):
        remap[ord(control)] = replacement  # None means "delete"

    a = sample.translate(remap)
    print(a)

    # Delete combining characters once the text is decomposed.
    def is_combining(code):
        return unicodedata.combining(chr(code))

    cmb_chrs = dict.fromkeys(filter(is_combining, range(sys.maxunicode)))
    b = unicodedata.normalize('NFD', a)
    print(b)
    print(b.translate(cmb_chrs))

    # Map Unicode decimal digits onto ASCII digits.
    def is_decimal_digit(code):
        return unicodedata.category(chr(code)) == 'Nd'

    digitmap = {
        code: ord('0') + unicodedata.digit(chr(code))
        for code in filter(is_decimal_digit, range(sys.maxunicode))
    }
    print(len(digitmap))
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))

    # Normalize first, then rely on encode()/decode() to drop non-ASCII.
    b = unicodedata.normalize('NFD', a)
    print(type(b))
    print(b.encode('ascii', 'ignore').decode('ascii'))

示例#20

0

显示文件

def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters(
):
    """Check the numeric/digit/decimal tables against ``unicodedata``.

    Every non-ASCII-digit character for which ``unicodedata`` reports a
    numeric, digit or decimal value must appear in ``numeric_hex`` and in the
    matching character table. Afterwards the ``*_no_decimals`` tables are
    checked for being disjoint from the decimals and contained in their
    parent tables.
    """
    known_hex = set(numeric_hex)
    known_numeric = set(numeric_chars)
    known_digit = set(digit_chars)
    known_decimal = set(decimal_chars)
    ascii_digits = set('0123456789')
    # (lookup function, table that must contain the character)
    checks = (
        (unicodedata.numeric, known_numeric),
        (unicodedata.digit, known_digit),
        (unicodedata.decimal, known_decimal),
    )
    for i in py23_range(0X110000):
        try:
            a = py23_unichr(i)
        except ValueError:
            # Code point not representable here; stop scanning.
            break
        if a in ascii_digits:
            continue
        for lookup, expected_chars in checks:
            if lookup(a, None) is not None:
                assert i in known_hex
                assert a in expected_chars

    assert known_decimal.isdisjoint(digits_no_decimals)
    assert known_digit.issuperset(digits_no_decimals)

    assert known_decimal.isdisjoint(numeric_no_decimals)
    assert known_numeric.issuperset(numeric_no_decimals)

示例#21

0

显示文件

文件： String_Tool.py 项目： JohnGoods/Python

def CleanRubbishStr(needCleanStr='pýtĥöñ\fis\tawesome\r\n'):
    """Print ``needCleanStr`` cleaned through one merged translation table.

    Three tables are built (whitespace, combining characters, decimal
    digits), their sizes are printed, they are merged with
    ``tool.MergeTwoDicts`` and the merged table is applied to the NFD form of
    the input, which is then printed.
    """
    # Tab and form feed become spaces; newline and carriage return vanish.
    remap = {ord('\t'): ' ', ord('\f'): ' '}
    for deleted in ('\n', '\r'):
        remap[ord(deleted)] = None
    print("remap Count is " + str(len(remap)))

    # Every combining character maps to None, i.e. gets deleted.
    combining_points = [c for c in range(sys.maxunicode)
                        if unicodedata.combining(chr(c))]
    remap2 = dict.fromkeys(combining_points)
    print("remap2 Count is " + str(len(remap2)))

    # Every decimal digit (category "Nd") maps to its ASCII counterpart.
    zero = ord('0')
    digitmap = {}
    for c in range(sys.maxunicode):
        ch = chr(c)
        if unicodedata.category(ch) == 'Nd':
            digitmap[c] = zero + unicodedata.digit(ch)
    print("digitmap Count is " + str(len(digitmap)))

    allList = tool.MergeTwoDicts(remap, remap2)
    allList = tool.MergeTwoDicts(allList, digitmap)
    print("allList Count is " + str(len(allList)))

    decomposed = unicodedata.normalize('NFD', needCleanStr)
    print(decomposed.translate(allList))

示例#22

0

显示文件

 def tr(c):
     """Translate the single character ``c``.

     ``table`` takes precedence; a character missing from it becomes the
     string form of its Unicode digit value, or is returned unchanged when it
     has none.
     """
     try:
         return table[c]
     except KeyError:
         pass
     # Not in the table: fall back to the character's digit value.
     try:
         digit_value = unicodedata.digit(c)
     except ValueError:
         return c
     return str(digit_value)

示例#23

0

显示文件

文件： clean_str.py 项目： csu-xiao-an/pproject

def more():
    """Demonstrate translating Unicode decimal digits to ASCII.

    Prints how many decimal digits Unicode defines, followed by the
    Arabic-Indic digits one to three rendered as ASCII.
    """
    digitmap = {}
    for code in range(sys.maxunicode):
        char = chr(code)
        # Only category "Nd" (decimal digit) characters are mapped.
        if unicodedata.category(char) == 'Nd':
            digitmap[code] = ord('0') + unicodedata.digit(char)
    print(len(digitmap))
    sample = '\u0661\u0662\u0663'
    print(sample.translate(digitmap))

示例#24

0

显示文件

文件： task_1.py 项目： PKoshkin/DAS_NLP

def decode_digits(string):
    """Return the ASCII digits corresponding to the digit characters of ``string``.

    Each character with a Unicode digit value contributes that value as an
    ASCII digit. Characters without one are skipped and a short notice is
    printed for each of them.

    Args:
        string: Text to scan.

    Returns:
        A string made only of ASCII digits (possibly empty).
    """
    digits = []
    for character in string:
        try:
            digits.append(str(unicodedata.digit(character)))
        except ValueError:
            # The original spelled this call "pritn", so any non-digit
            # character crashed with NameError instead of being skipped.
            print('have some problem')
    return ''.join(digits)

示例#25

0

显示文件

def print_unicode_entry(n):
    """Print one table row for code point ``n`` if it has a digit value.

    The row holds the digit value, ``n`` in decimal and hexadecimal, the
    character itself, its general category and its Unicode name.

    Args:
        n: Integer code point.

    Returns:
        True if a row was printed, False if the character has no digit value.

    Raises:
        ValueError: from ``chr`` if ``n`` is outside the Unicode range.
    """
    u = chr(n)
    try:
        print(unicodedata.digit(u), end=' ')
    except ValueError:
        # ValueError is how unicodedata.digit() reports "no digit value";
        # anything else (e.g. KeyboardInterrupt) must not be swallowed.
        return False
    print('{:4d} 0x{:3x}'.format(n, n), u, unicodedata.category(u),
          unicodedata.name(u))
    return True

示例#26

0

显示文件

文件： shift_jis_converter.py 项目： develin/cloudbudget

 def convert_to_int(self, string):
     """Build an integer from the digit characters found in ``string``.

     Characters without a Unicode digit value are ignored; a string without
     any digit yields 0.
     """
     # NOTE(review): the result of convert() is never used -- the loop reads
     # ``string`` itself. The call is kept in case it has side effects;
     # confirm whether the converted text was meant to be scanned instead.
     self.convert(string)
     total = 0
     for letter in string:
         value = unicodedata.digit(letter, -1)
         if value < 0:
             continue
         total = total * 10 + value
     return total

示例#27

0

显示文件

文件： test_unicode_numbers.py 项目： agustinhenze/natsort.debian

def test_digit_chars_contains_all_valid_unicode_digit_characters():
    """Every non-ASCII character with a digit value must be in ``digit_chars``."""
    ascii_digits = set('0123456789')
    for code in py23_range(0X10FFFF):
        try:
            char = py23_unichr(code)
        except ValueError:
            # Code point not representable here; stop scanning.
            break
        if char in ascii_digits:
            continue
        has_digit_value = unicodedata.digit(char, None) is not None
        if has_digit_value:
            assert char in digit_chars

示例#28

0

显示文件

def example_3():
    """Print the size of a Unicode-digit-to-ASCII table and a sample use.

    The second line printed is the Arabic-Indic digits one to three after
    translation to ASCII.
    """
    import unicodedata
    import sys

    ascii_zero = ord('0')
    # Code points in general category "Nd" (decimal digits).
    decimal_digits = [c for c in range(sys.maxunicode)
                      if unicodedata.category(chr(c)) == 'Nd']
    digitmap = dict(
        (c, ascii_zero + unicodedata.digit(chr(c))) for c in decimal_digits)
    print(len(digitmap))
    x = '\u0661\u0662\u0663'
    print(x.translate(digitmap))

示例#29

0

显示文件

文件： commons.py 项目： wpoa/yadkard

def uninum2en(string):
    """Replace every Unicode digit in ``string`` with its ASCII digit (0-9).

    Example:
    >>> uninum2en('٤۴৪౪')
    '4444'
    """
    def to_ascii(match):
        # ``\d`` only matches characters that have a digit value.
        return str(unicodedata.digit(match.group()))

    return re.sub(r'\d', to_ascii, string)

示例#30

0

显示文件

def int_splitter_iter(iterable, signed):
    """Yield ints and strings from ``(isnum, val, isuni)`` triples.

    Unicode digits are converted with ``unicodedata.digit`` and numeric text
    with ``int``. Other text is yielded unchanged unless ``signed`` is set, in
    which case a trailing sign is merged with the number that follows.
    """
    for isnum, val, isuni in iterable:
        if isuni:
            yield unicodedata.digit(val)
            continue
        if isnum:
            yield int(val)
            continue
        if not signed:
            yield val
            continue
        # Text with signs enabled: a [sign, number] list becomes one int.
        for piece in try_to_read_signed_integer(iterable, val):
            if isinstance(piece, list):
                yield int(''.join(piece))
            else:
                yield piece

示例#31

0

显示文件

文件： slow_splitters.py 项目： InSertCod3/natsort

def int_splitter(x, signed, safe, sep):
    """Split the string ``x`` into a list of text runs and integers (slow path).

    Runs of ASCII digits become one int (with a leading +/- when ``signed``
    is set and a digit follows the sign); any other digit character becomes
    its own int via ``unicodedata.digit``. With ``safe`` the list is passed
    through ``sep_inserter``. If the result starts with a number, ``sep`` is
    placed in front of it.
    """
    if not x:
        return []

    ascii_digits = set('0123456789')
    pieces = []
    text_buf = []
    num_buf = []

    def flush_text():
        # Emit the pending text run, if any.
        if text_buf:
            pieces.append(''.join(text_buf))
            del text_buf[:]

    def flush_number():
        # Emit the pending ASCII number, if any.
        if num_buf:
            pieces.append(int(''.join(num_buf)))
            del num_buf[:]

    last_index = len(x) - 1
    for pos, ch in enumerate(x):
        if (pos < last_index and signed and (ch in '-+')
                and (x[pos + 1] in ascii_digits)):
            # A sign directly before a digit opens a new number.
            flush_text()
            flush_number()
            num_buf.append(ch)
        elif ch in ascii_digits:
            num_buf.append(ch)
            flush_text()
        elif ch.isdigit():
            # Non-ASCII digit: goes straight into the result on its own.
            flush_text()
            flush_number()
            pieces.append(unicodedata.digit(ch))
        else:
            text_buf.append(ch)
            flush_number()

    # At most one of the two buffers is non-empty here.
    if num_buf:
        pieces.append(int(''.join(num_buf)))
    elif text_buf:
        pieces.append(''.join(text_buf))

    if safe:
        pieces = sep_inserter(pieces, (int, long), sep)
    if type(pieces[0]) in (int, long):
        return [sep] + pieces
    return pieces

示例#32

0

显示文件

def int_splitter_iter(iterable, signed):
    """Convert ``(isnum, val, isuni)`` triples into a stream of ints and text.

    A unicode digit yields its digit value and numeric text yields ``int``.
    Remaining text is yielded as-is, except that with ``signed`` a trailing
    sign is combined with the following number into a single int.
    """
    def signed_pieces(text):
        # [sign, number] lists coming back from the helper are fused into ints.
        for part in try_to_read_signed_integer(iterable, text):
            yield int(''.join(part)) if isinstance(part, list) else part

    for isnum, val, isuni in iterable:
        if isuni:
            yield unicodedata.digit(val)
        elif isnum:
            yield int(val)
        elif signed:
            for piece in signed_pieces(val):
                yield piece
        else:
            yield val

示例#33

0

显示文件

文件： slow_splitters.py 项目： InSertCod3/natsort

def int_splitter(x, signed, safe, sep):
    """Slow fallback that splits ``x`` into strings and integers.

    Consecutive ASCII digits form one int, optionally with a leading sign
    when ``signed`` is true and a digit follows it. Every other digit
    character is converted on its own with ``unicodedata.digit``. ``safe``
    routes the result through ``sep_inserter``, and ``sep`` is prepended when
    the first element is a number.
    """
    if not x:
        return []

    plain_digits = set('0123456789')
    out = []
    pending_text = []
    pending_num = []

    def flush(buf, as_int):
        # Move a non-empty buffer into the output and clear it.
        if buf:
            joined = ''.join(buf)
            out.append(int(joined) if as_int else joined)
            buf[:] = []

    total = len(x)
    for i, char in enumerate(x):
        sign_before_digit = (i + 1 < total and signed and
                             (char in '-+') and (x[i + 1] in plain_digits))
        if sign_before_digit:
            # Close whatever was being collected and start a signed number.
            flush(pending_text, False)
            flush(pending_num, True)
            pending_num.append(char)
        elif char in plain_digits:
            pending_num.append(char)
            flush(pending_text, False)
        elif char.isdigit():
            # Unicode digit: appended directly as its own integer.
            flush(pending_text, False)
            flush(pending_num, True)
            out.append(unicodedata.digit(char))
        else:
            pending_text.append(char)
            flush(pending_num, True)

    # Only one buffer can still hold data; the number is checked first.
    if pending_num:
        out.append(int(''.join(pending_num)))
    elif pending_text:
        out.append(''.join(pending_text))

    if safe:
        out = sep_inserter(out, (int, long), sep)
    if type(out[0]) in (int, long):
        return [sep] + out
    else:
        return out

示例#34

0

显示文件

文件：一.py 项目： davejagoda/proggy

def print_unicode_entry(n):
    """Print one table row describing code point ``n``.

    The row holds ``n`` in decimal and hexadecimal, the character, its
    general category, its Unicode name and its digit value. A placeholder
    text is printed for a missing name or digit value.

    Args:
        n: Integer code point.

    Raises:
        ValueError: from ``chr`` if ``n`` is outside the Unicode range.
    """
    u = chr(n)
    print('{:8d} {:8x}'.format(n, n), end=' ')
    print(u, unicodedata.category(u), end=' ')
    try:
        print(unicodedata.name(u), end=' ')
    except ValueError:
        # unicodedata.name() raises ValueError for unnamed characters; other
        # exceptions are real errors and are no longer swallowed.
        print('unicodedata has no name defined', end=' ')
    try:
        print(unicodedata.digit(u))
    except ValueError:
        # Same contract for unicodedata.digit(): ValueError means "no value".
        print('unicodedata has no numeric value')

示例#35

0

显示文件

def test_digit_chars_contains_all_valid_unicode_digit_characters():
    """Every non-ASCII character with a digit value must be a known numeric.

    Both its code point (in ``numeric_hex``) and the character itself (in
    ``numeric_chars``) are checked.
    """
    known_hex = set(numeric_hex)
    known_chars = set(numeric_chars)
    ascii_digits = set('0123456789')
    for code in py23_range(0X110000):
        try:
            char = py23_unichr(code)
        except ValueError:
            # Code point not representable here; stop scanning.
            break
        if char in ascii_digits:
            continue
        if unicodedata.digit(char, None) is None:
            continue
        assert code in known_hex
        assert char in known_chars

示例#36

0

显示文件

文件： unicode_browser.py 项目： stephensmitchell-forks/objbrowser

def overview(tree_item):
    """ Returns an overview of the character held by ``tree_item.obj``,
        rendered through ``TEMPLATE``.
    """
    char = tree_item.obj
    render = TEMPLATE.format
    fields = [unicodedata.name(char, '<NO NAME AVAILABLE>'), char]
    # Numeric properties fall back to an empty string when undefined.
    for numeric_lookup in (unicodedata.decimal, unicodedata.digit,
                           unicodedata.numeric):
        fields.append(numeric_lookup(char, ''))
    # The remaining properties are defined for every character.
    for lookup in (unicodedata.category, unicodedata.bidirectional,
                   unicodedata.combining, unicodedata.east_asian_width,
                   unicodedata.mirrored, unicodedata.decomposition):
        fields.append(lookup(char))
    return render(*fields)

示例#37

0

显示文件

    def _explain_char(self, ch, further):
        """Describe the character ``ch`` in one line of text.

        With ``further`` false only ``NAME(ch)`` is returned. Otherwise the
        result also lists the category, direction, East Asian width and, when
        applicable, the decomposition, digit/decimal/numeric values,
        combining class, mirroring and the normalization forms ``ch`` is
        already in. Unnamed characters are shown as ``[U+<hex>]``.
        """
        name = unicodedata.name(ch, None)
        if name is None:
            name = f'[U+{ord(ch):x}]'
        if not further:
            return f'{name}({ch})'

        infos = {}
        infos['category'] = unicodedata.category(ch)
        infos['direction'] = unicodedata.bidirectional(ch)
        infos['east asian width'] = unicodedata.east_asian_width(ch)

        decomposition = unicodedata.decomposition(ch)
        if decomposition:
            infos['decomposition'] = decomposition

        # Each of these raises ValueError when the character has no value.
        for label, lookup in (('digit value', unicodedata.digit),
                              ('decimal value', unicodedata.decimal),
                              ('numeric value', unicodedata.numeric)):
            try:
                infos[label] = lookup(ch)
            except ValueError:
                continue

        combining_class = unicodedata.combining(ch)
        if combining_class != 0:
            infos['combining class'] = str(combining_class)

        if unicodedata.mirrored(ch):
            infos['mirrored'] = 'yes'

        # is_normalized() may be missing from unicodedata, hence the probe.
        if not hasattr(unicodedata, 'is_normalized'):
            infos['normalized'] = 'unavailable'
        else:
            forms = [form for form in ('NFC', 'NFD', 'NFKC', 'NFKD')
                     if unicodedata.is_normalized(form, ch)]
            infos['normalized'] = f'yes: {", ".join(forms)}' if forms else 'no'

        info = ', '.join(f'{key}: {value}' for key, value in infos.items())
        return f'{name}: {ch!r} ({info})'

示例#38

0

显示文件

文件： literate.py 项目： LimeHunter7/ipython-latex-literate

def latex_char(char: Character, prefix: str) -> str:
    """Return the LaTeX text for ``char``.

    Characters whose category ends in ``d`` (decimal digits) are rendered as
    their digit value. For everything else the part of ``char.ipython_cmd``
    after ``prefix`` is used; suffixes longer than one character are turned
    into a command (leading backslash), with an ``up``/``var`` prefix added
    depending on ``prefix`` and ``upgreek_blacklist``.

    Args:
        char: Character record providing ``category``, ``character`` and
            ``ipython_cmd``.
        prefix: Leading part of the IPython command to strip.

    Returns:
        The LaTeX representation as a string.
    """
    if char.category[1] == 'd':
        # ud.digit() returns an int; convert it so the declared ``str``
        # return type also holds for digits.
        return str(ud.digit(char.character))
    ipc = char.ipython_cmd
    # TODO: special-case upgreek in bf
    suffix = ipc[len(prefix):]
    if len(suffix) > 1:
        if prefix in ('\\', r'\bf') and suffix not in upgreek_blacklist:
            suffix = 'up' + suffix
        elif (prefix in (r'\it', r'\bi') and suffix in upgreek_blacklist
              and 'var' not in suffix):
            suffix = 'var' + suffix
        suffix = '\\' + suffix
    return suffix

示例#39

0

显示文件

文件： unicode_browser.py 项目： lebedov/objbrowser

def overview(tree_item):
    """ Returns an overview of the character held by ``tree_item.obj``,
        rendered through ``TEMPLATE``.
    """
    char = tree_item.obj
    render = TEMPLATE.format

    name = unicodedata.name(char, '<NO NAME AVAILABLE>')
    # Numeric properties fall back to an empty string when undefined.
    decimal = unicodedata.decimal(char, '')
    digit = unicodedata.digit(char, '')
    numeric = unicodedata.numeric(char, '')
    category = unicodedata.category(char)
    bidirectional = unicodedata.bidirectional(char)
    combining = unicodedata.combining(char)
    width = unicodedata.east_asian_width(char)
    mirrored = unicodedata.mirrored(char)
    decomposition = unicodedata.decomposition(char)

    return render(name, char, decimal, digit, numeric, category,
                  bidirectional, combining, width, mirrored, decomposition)

示例#40

0

显示文件

文件： theses-brussels.py 项目： fschwenn/ejlmod

def convert_string(string):
    """Return ``string`` with every digit character replaced by its ASCII digit.

    Characters that have a Unicode digit value (for example Arabic-Indic or
    superscript digits) are replaced by the matching character ``0``-``9``;
    all other characters are kept unchanged.

    Args:
        string: Text to convert.

    Returns:
        The converted text.
    """
    converted = []
    for char in string:
        try:
            # Keep the replacement as text: encoding it to bytes made the
            # final concatenation fail with TypeError on Python 3.
            converted.append(str(unicodedata.digit(char)))
        except ValueError:
            # No digit value: keep the character as it is.
            converted.append(char)
    return "".join(converted)

示例#41

0

显示文件

文件： char2name.py 项目： edt-yxz-zzd/python3_src

def char2info(ch):
    """Return a dict of the Unicode properties of the character ``ch``.

    Properties that may be undefined (name, decimal, digit, numeric) are None
    in that case. The dict also echoes ``ch`` and holds its code point as an
    int (``unicode``) and as a hex string (``unicode_hex``).
    """
    code_point = ord(ch)
    return {
        'ch': ch,
        'name': U.name(ch, None),
        'decimal': U.decimal(ch, None),
        'digit': U.digit(ch, None),
        'numeric': U.numeric(ch, None),
        'category': U.category(ch),
        'bidirectional': U.bidirectional(ch),
        'combining': U.combining(ch),
        'east_asian_width': U.east_asian_width(ch),
        'mirrored': U.mirrored(ch),
        'decomposition': U.decomposition(ch),
        'unicode': code_point,
        'unicode_hex': hex(code_point),
    }

示例#42

0

显示文件

文件： periodpicker.py 项目： johannesmik/edojidaipicker

def checkEntry(*event):
    """Read the year entry and show the matching period.

    A western year is converted with ``edoFromWestern``. Any other entry is
    split into its digit characters (ASCII or full-width), which form the
    year, and the remaining characters, which form the period name.
    """
    entry = yearentry.get()

    if isWesternYear(entry):
        period, year = edoFromWestern(int(entry))
        showPeriod(period, year)
        return

    accepted_digits = "0123456789０１２３４５６７８９"
    # The leading "0" keeps int() valid when the entry holds no digit at all.
    year_digits = ["0"]
    period_chars = []
    for c in entry:
        if c in accepted_digits:
            year_digits.append(str(unicodedata.digit(c)))
        else:
            period_chars.append(c)

    showPeriod("".join(period_chars), int("".join(year_digits)))

示例#43

0

显示文件

    def test_compare_functions(self):
        """Compare ``unicodedata`` with ``unicodedb_5_2_0`` for U+0000..U+FFFF."""
        def db_value_or_minus_one(prop, code):
            # The database signals "no value" with KeyError; map that to the
            # -1 default used for the unicodedata side.
            try:
                return getattr(unicodedb_5_2_0, prop)(code)
            except KeyError:
                return -1

        with_default = ('digit', 'numeric', 'decimal')
        always_defined = ('category', 'bidirectional', 'decomposition',
                          'mirrored', 'combining')
        for code in range(0x10000):
            char = unichr(code)
            for prop in with_default:
                assert getattr(unicodedata, prop)(char, -1) == db_value_or_minus_one(prop, code)
            for prop in always_defined:
                assert getattr(unicodedata, prop)(char) == getattr(unicodedb_5_2_0, prop)(code)

示例#44

0

显示文件

文件： test_unicodedata.py 项目： mozillazg/pypy

    def test_compare_functions(self):
        """Check that ``unicodedata`` agrees with ``unicodedb_5_2_0`` on the BMP."""
        db = unicodedb_5_2_0

        def from_db(fun, code):
            # KeyError from the database means "undefined"; report it as -1,
            # the same default passed to the unicodedata functions below.
            lookup = getattr(db, fun)
            try:
                return lookup(code)
            except KeyError:
                return -1

        code = 0
        while code < 0x10000:
            char = unichr(code)
            # Properties that may be undefined for a character.
            assert unicodedata.digit(char, -1) == from_db('digit', code)
            assert unicodedata.numeric(char, -1) == from_db('numeric', code)
            assert unicodedata.decimal(char, -1) == from_db('decimal', code)
            # Properties defined for every character.
            assert unicodedata.category(char) == db.category(code)
            assert unicodedata.bidirectional(char) == db.bidirectional(code)
            assert unicodedata.decomposition(char) == db.decomposition(code)
            assert unicodedata.mirrored(char) == db.mirrored(code)
            assert unicodedata.combining(char) == db.combining(code)
            code += 1

示例#45

0

显示文件

文件： dateFld.py 项目： OCLC-Developer-Network/viaf-dates

   def startPattern(self, input):
      """Normalize a date subfield and derive its preliminary pattern.

         Performs the normalization that is possible, and needed, before the
         string is split.

         Args:
            input: date subfield; its first character is dropped before
               processing (presumably a subfield code -- confirm with caller)

         Returns:
            (norminput, pattern)
            norminput is the normalized date subfield. pattern is the
            preliminary date pattern, with month names masked and every digit
            replaced by 'N'. The pattern will continue to be refined after
            the date is split.

         Sets:
            self.hijri: result of the isHijri search (truthy if the date
               subfield is hijri)
            self.datetype: set to 'flourished' if the date was flourished
      """
      # Drop the first character, lowercase, and apply NFKD normalization.
      pattern = unicodedata.normalize('NFKD', unicode(input[1:]).lower())
      # Replace every character that has a digit value by that value; other
      # characters are kept (the character itself is the default).
      pattern = ''.join([unicode(unicodedata.digit(d, d)) for d in pattern])
      pattern = re.sub(',', ' ', pattern)
      # convert various dashes to dash
      pattern = re.sub(u'\u2212|\u2013|\u2014|\u05be|\u2010|\u2015|\u30fb', '-', pattern)
      pattern = pattern.replace('bzw.', '-') ## from DNB records
      pattern = re.sub(u'\u061f', '?', pattern)  # arabic question mark
      # Remove filler and qualifiers: '----', '-t.', bracketed text ending in
      # 'h', and trailing ' reg.' / ' age ' annotations.
      pattern = re.sub('----|-t\.|\[.*h\]| reg\..*$| age .*$', '', pattern)
      # Strip parentheses, semicolons, angle brackets and square brackets.
      pattern = re.sub('\(|\)|;|<|>|\]|\[', '', pattern)
      ## moved these to overrides
      ##pattern = re.sub('av\. ?j\.?-\.?c', 'av jc', pattern)
      ##pattern = re.sub('-talet', ' talet', pattern)
      # NOTE(review): square brackets were already removed two statements
      # above, so this substitution looks redundant.
      pattern = re.sub('\[|\]', '', pattern)
      pattern = pattern.replace('xxxx', '').replace('gegenwart', '')
      # Remove runs of 4 to 10 dots.
      pattern = re.sub('\.{4,10}', '', pattern)
      # Trim surrounding spaces and collapse repeated spaces.
      pattern = pattern.strip(' ')
      pattern = re.sub(' +', ' ', pattern)
      # Use the alternative "flourished" matcher only when the
      # 'fIsFlourished' flag is present in self.flags.
      flourishedpattern = isFlourished if self.flags.find('fIsFlourished') == -1 else altisFlourished
      if flourishedpattern.search(pattern):
         pattern = flourishedpattern.sub('', pattern)
         self.datetype = 'flourished'
      # From here on norminput keeps the readable text while pattern is
      # abstracted further.
      norminput = pattern
      pattern = monthMasker7.sub(r'\1month\4', pattern)
      self.hijri = isHijri.search(pattern)
      if self.hijri:
         # Remove the hijri marker from both the pattern and the normalized text.
         pattern = isHijri.sub('', pattern).strip()
         norminput = isHijri.sub('', norminput).strip()
      # Abstract every remaining digit to 'N'.
      pattern = re.sub('\d', 'N', pattern)
      return norminput, pattern

示例#46

0

显示文件

文件： show_utf8_char.py 项目： odashi/nlptools

def main():
  """Print the Unicode properties of the character given as hex bytes.

  Every command-line argument is parsed as one hexadecimal byte value; the
  bytes are decoded as UTF-8 and the properties of the result are printed
  one per line. Any failure (bad hex, invalid UTF-8, more or fewer than
  one character) is reported as an ``ERROR:`` line; lines already printed
  before the failure stay on the output.
  """
  # (output template, property getter) pairs, in output order.
  rows = (
    ('gryph:            %s', lambda ch: ch),
    ('codepoint:        U+%x', ord),
    ('name:             %s', lambda ch: unicodedata.name(ch, 'Unknown')),
    ('decimal:          %s', lambda ch: unicodedata.decimal(ch, 'Unknown')),
    ('digit:            %s', lambda ch: unicodedata.digit(ch, 'Unknown')),
    ('numeric:          %s', lambda ch: unicodedata.numeric(ch, 'Unknown')),
    ('category:         %s', unicodedata.category),
    ('bidirectional:    %s', unicodedata.bidirectional),
    ('combining:        %s', unicodedata.combining),
    ('east_asian_width: %s', unicodedata.east_asian_width),
    ('mirrored:         %s', unicodedata.mirrored),
    ('decomposition:    %s', unicodedata.decomposition),
  )
  try:
    raw = bytes(int(token, 16) for token in sys.argv[1:])
    char = raw.decode('utf8')
    # Evaluate and print row by row so a failing lookup stops the output
    # exactly where it happens.
    for template, getter in rows:
      print(template % getter(char))
  except Exception as err:
    print('ERROR: %s' % err)

示例#47

0

显示文件

def main():
    """Describe the UTF-8 character whose bytes are given in hex on argv.

    Each argument after the program name is read as one hexadecimal byte.
    The decoded character's Unicode properties are printed line by line;
    on any exception an ``ERROR:`` line is printed instead of the
    remaining properties.
    """
    def show(template, value):
        # One formatted output line per property.
        print(template % value)

    try:
        encoded = bytes(int(arg, 16) for arg in sys.argv[1:])
        ch = encoded.decode('utf8')
        show('gryph:            %s', ch)
        show('codepoint:        U+%x', ord(ch))
        show('name:             %s', unicodedata.name(ch, 'Unknown'))
        show('decimal:          %s', unicodedata.decimal(ch, 'Unknown'))
        show('digit:            %s', unicodedata.digit(ch, 'Unknown'))
        show('numeric:          %s', unicodedata.numeric(ch, 'Unknown'))
        show('category:         %s', unicodedata.category(ch))
        show('bidirectional:    %s', unicodedata.bidirectional(ch))
        show('combining:        %s', unicodedata.combining(ch))
        show('east_asian_width: %s', unicodedata.east_asian_width(ch))
        show('mirrored:         %s', unicodedata.mirrored(ch))
        show('decomposition:    %s', unicodedata.decomposition(ch))
    except Exception as failure:
        print('ERROR: %s' % failure)

示例#48

0

显示文件

文件： test_unicodedata.py 项目： AishwaryaKM/python-tutorial

    def test_compare_functions(self):
        """Compare ``unicodedb_4_1_0`` with the host ``unicodedata`` module.

        For every code point below 0x10000 the property lookups of both
        databases must agree. A missing digit/numeric/decimal value is
        represented as -1 on both sides, and code points listed in
        ``self.diff_numeric`` are treated as having no numeric value.
        """
        import unicodedata # CPython implementation

        def reference_value(prop, codepoint):
            # Value from unicodedb_4_1_0, or -1 when absent or excluded.
            excluded = prop == 'numeric' and codepoint in self.diff_numeric
            if not excluded:
                try:
                    return getattr(unicodedb_4_1_0, prop)(codepoint)
                except KeyError:
                    pass
            return -1

        valued_props = ('digit', 'numeric', 'decimal')
        plain_props = ('category', 'bidirectional', 'decomposition',
                       'mirrored', 'combining')

        for code in range(0x10000):
            char = unichr(code)
            for prop in valued_props:
                assert getattr(unicodedata, prop)(char, -1) == reference_value(prop, code)
            for prop in plain_props:
                assert getattr(unicodedata, prop)(char) == getattr(unicodedb_4_1_0, prop)(code)

示例#49

0

显示文件

文件： test_regressions.py 项目： IronLanguages/ironpython2

    def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357

        Exercises every ``unicodedata`` lookup on U+4E2D: the value lookups
        must raise ValueError without a default and return the default
        when one is given; the remaining properties have fixed values.
        """

        import unicodedata

        han = u'\u4e2d'

        # The reported name depends on the implementation under test.
        if is_cli:
            expected_name = '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>'
        else:
            expected_name = 'CJK UNIFIED IDEOGRAPH-4E2D'
        self.assertEqual(unicodedata.name(han), expected_name)

        for lookup in (unicodedata.decimal, unicodedata.digit, unicodedata.numeric):
            self.assertRaises(ValueError, lookup, han)
            self.assertEqual(lookup(han, 0), 0)

        expected_properties = (
            (unicodedata.category, 'Lo'),
            (unicodedata.bidirectional, 'L'),
            (unicodedata.combining, 0),
            (unicodedata.east_asian_width, 'W'),
            (unicodedata.mirrored, 0),
            (unicodedata.decomposition, ''),
        )
        for lookup, expected in expected_properties:
            self.assertEqual(lookup(han), expected)

示例#50

0

显示文件

文件： test_unicode_numbers.py 项目： agustinhenze/natsort.debian

def test_digit_chars_contains_only_valid_unicode_digit_characters():
    """Every entry of ``digit_chars`` must have a digit value in unicodedata."""
    assert all(unicodedata.digit(char, None) is not None
               for char in digit_chars)

示例#51

0

显示文件

文件： py3.5_example.py 项目： xuyan0/pycookbook

}

a = s.translate(remap)
print(a)

import unicodedata
import sys
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))

b = unicodedata.normalize('NFD', a)
print(b)
result = b.translate(cmb_chrs)
print(result)

digitmap = { c : ord('0') + unicodedata.digit(chr(c))
             for c in range(sys.maxunicode)
             if unicodedata.category(chr(c)) == 'Nd' }

print(len(digitmap))

# Arabic digits
x = '\u0661\u0662\u0663'
x.translate(digitmap)

print(a)
b = unicodedata.normalize('NFD', a)
b.encode('ascii', 'ignore').decode('ascii')


"""

示例#52

0

显示文件

文件： unicode_numbers.py 项目： heylenz/TACTIC-Handler

    0X1D7FB, 0X1D7FC, 0X1D7FD, 0X1D7FE, 0X1D7FF, 0X1E8C7, 0X1E8C8,
    0X1E8C9, 0X1E8CA, 0X1E8CB, 0X1E8CC, 0X1E8CD, 0X1E8CE, 0X1E8CF,
    0X1F100, 0X1F101, 0X1F102, 0X1F103, 0X1F104, 0X1F105, 0X1F106,
    0X1F107, 0X1F108, 0X1F109, 0X1F10A, 0X1F10B, 0X1F10C, 0X20001,
    0X20064, 0X200E2, 0X20121, 0X2092A, 0X20983, 0X2098C, 0X2099C,
    0X20AEA, 0X20AFD, 0X20B19, 0X22390, 0X22998, 0X23B1B, 0X2626D,
    0X2F890
]

# Turn each code point of the table into its one-character string.
# A ValueError from py23_unichr (narrow Unicode build) ends the scan;
# code points this interpreter's unicodedata does not know as numeric
# are left out.
numeric_chars = []
for a in numeric_hex:
    try:
        l = py23_unichr(a)
    except ValueError:  # pragma: no cover
        break
    if unicodedata.numeric(l, None) is not None:
        numeric_chars.append(l)

# Digits are the numerals that additionally carry a digit value.
digit_chars = [
    a for a in numeric_chars if unicodedata.digit(a, None) is not None
]

# Both collections again, flattened into single strings.
digits = ''.join(digit_chars)
numeric = ''.join(numeric_chars)

示例#53

0

显示文件

文件： test_unicodedata.py 项目： zhoupan/OpenModelSphereMirror

# Script driver (Python 2 print statements): run each group of checks and
# echo its result after a label; the trailing comma keeps label and result
# on one output line.
print "Testing Unicode Database..."
print "Methods:",
print test_methods()

# In case unicodedata is not available, this will raise an ImportError,
# but still test the above cases...
import unicodedata

print "Functions:",
print test_unicodedata()

# Some additional checks of the API:
print "API:",

# digit(): with None as default a character without a digit value yields
# None; U+215B (value 0.125, see numeric() below) has none, U+2468 has 9.
verify(unicodedata.digit(u"A", None) is None)
verify(unicodedata.digit(u"9") == 9)
verify(unicodedata.digit(u"\u215b", None) is None)
verify(unicodedata.digit(u"\u2468") == 9)

# numeric(): the same four characters; here U+215B and U+2468 both
# have values.
verify(unicodedata.numeric(u"A", None) is None)
verify(unicodedata.numeric(u"9") == 9)
verify(unicodedata.numeric(u"\u215b") == 0.125)
verify(unicodedata.numeric(u"\u2468") == 9.0)

# decimal(): of the four characters only u"9" has a decimal value.
verify(unicodedata.decimal(u"A", None) is None)
verify(unicodedata.decimal(u"9") == 9)
verify(unicodedata.decimal(u"\u215b", None) is None)
verify(unicodedata.decimal(u"\u2468", None) is None)

# category(): U+FFFE must be reported as "Cn".
verify(unicodedata.category(u"\uFFFE") == "Cn")

示例#54

0

显示文件

文件： Chapter2_12.py 项目： DwyaneTalk/basicSkillsLearning

审查清理字符串：
str.upper()、str.lower()、str.replace()、re.sub()、unicodedata.normalize()等实现不同功能的基础处理
str.translate()通过构造替换字典进行处理
'''

if __name__ == '__main__':
	# Demo: sanitising a string with the basic str methods, with
	# str.translate() tables and with Unicode normalisation.
	text = 'pýtĥöñ\fis\taWesome\r\n'
	for variant in (text, text.upper(), text.lower(), text.replace('W', 'w')):
		print(variant)

	# Tab and form feed become spaces; carriage return is dropped.
	whitespace_map = {ord('\t'): ' ', ord('\f'): ' ', ord('\r'): None}
	cleaned = text.translate(whitespace_map)
	print(cleaned)

	import unicodedata
	import sys

	# Map every combining character to None so translate() deletes it
	# from the NFD-decomposed text.
	combining_map = {
		cp: None
		for cp in range(sys.maxunicode)
		if unicodedata.combining(chr(cp))
	}
	decomposed = unicodedata.normalize('NFD', cleaned)
	print(decomposed)
	print(decomposed.translate(combining_map))

	# Map every character of category 'Nd' to the ASCII digit with the
	# same digit value.
	arabic_digits = '\u0661\u0662\u0663'
	digit_map = {}
	for cp in range(sys.maxunicode):
		ch = chr(cp)
		if unicodedata.category(ch) == 'Nd':
			digit_map[cp] = ord('0') + unicodedata.digit(ch)
	print(len(digit_map))
	print(arabic_digits.translate(digit_map))

	# Alternative clean-up: drop whatever does not encode to ASCII.
	print(decomposed.encode('ascii', 'ignore').decode('ascii'))

示例#55

0

显示文件

文件： tokenize-anything.py 项目： akashb-cmu/sp2016.11-731

def quote_norm(line):
    """Normalize quotes, dashes, spaces, entities and mojibake in a line.

    The line is padded with one space on each side so the space-delimited
    patterns below (e.g. ' gon na ') can also match at its ends; the
    padding disappears in the final strip. The rewrites are applied
    strictly in sequence and later steps rely on the results of earlier
    ones, so the order of the statements matters.

    Args:
        line: the text to normalize. NOTE(review): the u'' patterns suggest
            a unicode string is expected -- confirm against the callers.

    Returns:
        The normalized text, with every whitespace run collapsed to one
        space and without leading or trailing whitespace.
    """
    line = ' %s ' % line
    # Delete control characters:
    line = re.sub(r'[\x00-\x1F]+', ' ', line)

    # PTB --> normal
    line = line.replace(r'-LRB-', '(')
    line = line.replace(r'-RRB-', ')')
    line = line.replace(r'-LSB-', '[')
    line = line.replace(r'-RSB-', ']')
    line = line.replace(r'-LCB-', '{')
    line = line.replace(r'-RCB-', '}')
    line = line.replace(r' gon na ', ' gonna ')

    # Regularize named HTML/XML escapes:
    line = re.sub(r'&\s*lt\s*;', '<', line, flags=re.IGNORECASE)     # HTML opening angle bracket
    line = re.sub(r'&\s*gt\s*;', '>', line, flags=re.IGNORECASE)     # HTML closing angle bracket
    line = re.sub(r'&\s*squot\s*;', '\'', line, flags=re.IGNORECASE) # HTML single quote
    line = re.sub(r'&\s*quot\s*;', '"', line, flags=re.IGNORECASE)   # HTML double quote
    line = re.sub(r'&\s*nbsp\s*;', ' ', line, flags=re.IGNORECASE)   # HTML non-breaking space
    line = re.sub(r'&\s*apos\s*;', '\'', line, flags=re.IGNORECASE)  # HTML apostrophe
    line = re.sub(r'&\s*amp\s*;', '&', line, flags=re.IGNORECASE)    # HTML ampersand (last)

    # Regularize known HTML numeric codes:
    line = re.sub(r'&\s*#\s*160\s*;', ' ', line)
    line = re.sub(r'&\s*#45\s*;\s*&\s*#45\s*;', '--', line)
    line = re.sub(r'&\s*#45\s*;', '--', line)

    # Convert arbitrary hex or decimal HTML entities to actual characters:
    line = re.sub(r'&\#x([0-9A-Fa-f]+);', html_hex_entity, line)
    line = re.sub(r'&\#([0-9]+);', html_entity, line)

    # Regularize spaces:
    zero_width_spaces = [u'\u00ad', # soft hyphen
                         u'\u200C'] # zero-width non-joiner
    line = re.sub('|'.join(zero_width_spaces), '', line)

    spaces = [u'\u00a0', # non-breaking space
              u'\u2009', # thin space
              u'\u2028', # "line separator"
              u'\u2029', # "paragraph separator"
              u'\u202a', # "left-to-right embedding"
              u'\u202b', # "right-to-left embedding"
              u'\u202c', # "pop directional formatting"
              u'\u202d', # "left-to-right override"
              u'\u202e', # "right-to-left override"
              u'\u0085', # "next line"
              u'\ufffd', # "replacement character"
              u'\ufeff', # byte-order mark
              u'\ufdd3'] # "unicode non-character"
    line = re.sub('|'.join(spaces), ' ', line)

    # Ridiculous double conversions, UTF8 -> Windows 1252 -> UTF8.
    # This section must run BEFORE the Windows-1252 and currency rewrites
    # below: the quote and dash patterns contain U+0080 or U+20AC, and can
    # no longer match once U+0080 has become the euro sign and the euro
    # sign has become ' EUR '.
    line = line.replace(u'ï¿½c', '--')                 # long dash
    line = line.replace(u'\u00e2\u20acoe', '"')        # opening double quote
    line = line.replace(u'\u00e2\u20ac\u009c', '"')    # opening double quote
    line = line.replace(u'\u00e2\u20ac\u009d', '"')    # closing double quote
    line = line.replace(u'\u00e2\u20ac\u2122', '\'')   # apostrophe
    line = line.replace(u'\u00e2\u20ac\u201c', ' -- ') # en dash?
    line = line.replace(u'\u00e2\u20ac\u201d', ' -- ') # em dash?

    # The replacement is a plain apostrophe; a raw r'\'' would be the two
    # characters backslash + apostrophe.
    line = line.replace(u'\u00e2\u0080\u0098', '\'')  # single quote?
    line = line.replace(u'\u00e2\u0080\u0099', '\'')  # single quote?
    line = line.replace(u'\u00e2\u0080\u009c', '"')   # double quote?
    line = line.replace(u'\u00e2\u0080\u009d', '"')   # double quote?
    line = line.replace(u'\u00c3\u009f', u'\u00df')    # esset
    line = line.replace(u'\u00c3\u0178', u'\u00df')    # esset
    line = line.replace(u'\u00c3\u00a4', u'\u00e4')    # a umlaut
    line = line.replace(u'\u00c3\u00b6', u'\u00f6')    # o umlaut
    line = line.replace(u'\u00c3\u00bc', u'\u00fc')    # u umlaut
    line = line.replace(u'\u00c3\u0084', u'\u00c4')    # A umlaut: create no C4s after this
    line = line.replace(u'\u00c3\u201e', u'\u00c4')    # A umlaut: create no C4s after this
    line = line.replace(u'\u00c3\u0096', u'\u00d6')    # O umlaut
    line = line.replace(u'\u00c3\u2013', u'\u00d6')    # O umlaut
    # U umlaut is C3 9C in UTF-8; byte 9C read as Latin-1 is U+009C and
    # read as Windows-1252 is U+0153.
    line = line.replace(u'\u00c3\u009c', u'\u00dc')    # U umlaut
    line = line.replace(u'\u00c3\u0153', u'\u00dc')    # U umlaut

    # Convert other Windows 1252 characters to UTF-8
    line = line.replace(u'\u0080', u'\u20ac') # euro sign
    line = line.replace(u'\u0095', u'\u2022') # bullet
    line = line.replace(u'\u0099', u'\u2122') # trademark sign

    # Currency and measure conversions:
    line = re.sub(r' (\d\d): (\d\d)', r' \1:\2', line)
    line = line.replace(u'\u20a0', ' EUR ')
    line = line.replace(u'\u20ac', ' EUR ')
    line = line.replace(u'\u00a3', ' GBP ')
    line = re.sub(r'(\W)([A-Z]+\$?)(\d*\.\d+|\d+)', r'\1\2 \3', line) # AU$12.34
    line = re.sub(r'(\W)(euro?)(\d*\.\d+|\d+)', r'\1EUR \3', line, flags=re.IGNORECASE) # EUR12.34

    # Regularize quotes:
    line = line.replace(u'ˇ', '\'')      # caron
    line = line.replace(u'´', '\'')      # acute accent
    line = line.replace(u'`', '\'')      # grave accent
    line = line.replace(u'ˉ', '\'')      # modified letter macron
    line = line.replace(u' ,,', '"')     # ghetto low-99 quote
    line = line.replace(u'``', '"')      # latex-style left quote
    line = line.replace(u'\'\'', '"')    # latex-style right quote
    line = line.replace(u'\u300c', '"')  # left corner bracket
    line = line.replace(u'\u300d', '"')  # right corner bracket
    line = line.replace(u'\u3003', '"')  # ditto mark
    line = line.replace(u'\u00a8', '"')  # diaeresis
    line = line.replace(u'\u0092', '\'') # curly apostrophe
    line = line.replace(u'\u2019', '\'') # curly apostrophe
    line = line.replace(u'\uf03d', '\'') # curly apostrophe
    line = line.replace(u'\u00b4', '\'') # curly apostrophe
    line = line.replace(u'\u2018', '\'') # curly single open quote
    line = line.replace(u'\u201a', '\'') # low-9 quote
    line = line.replace(u'\u0093', '"')  # curly left quote
    line = line.replace(u'\u201c', '"')  # curly left quote
    line = line.replace(u'\u0094', '"')  # curly right quote
    line = line.replace(u'\u201d', '"')  # curly right quote
    line = line.replace(u'\u2033', '"')  # curly right quote
    line = line.replace(u'\u201e', '"')  # low-99 quote
    line = line.replace(u'\u0084', '"')  # low-99 quote (bad enc)
    line = line.replace(u'\u201f', '"')  # high-rev-99 quote
    line = line.replace(u'\u00ab', '"')  # opening guillemet
    line = line.replace(u'\u00bb', '"')  # closing guillemet
    line = line.replace(u'\u0301', '\'') # combining acute accent
    line = line.replace(u'\u203a', '"')  # angle quotation mark
    line = line.replace(u'\u2039', '"')  # angle quotation mark

    # Space inverted punctuation:
    line = line.replace(u'¡', u' ¡ ')
    line = line.replace(u'¿', u' ¿ ')

    # Russian abbreviations:
    line = line.replace(u' п. п. ', u' п.п. ')
    line = line.replace(u' ст. л. ', u' ст.л. ')
    line = line.replace(u' т. е. ', u' т.е. ')
    line = line.replace(u' т. к. ', u' т.к. ')
    line = line.replace(u' т. ч. ', u' т.ч. ')
    line = line.replace(u' т. д. ', u' т.д. ')
    line = line.replace(u' т. п. ', u' т.п. ')
    line = line.replace(u' и. о. ', u' и.о. ')
    line = line.replace(u' с. г. ', u' с.г. ')
    line = line.replace(u' г. р. ', u' г.р. ')
    line = line.replace(u' т. н. ', u' т.н. ')
    line = line.replace(u' т. ч. ', u' т.ч. ')
    line = line.replace(u' н. э. ', u' н.э. ')

    # Convert foreign numerals into Arabic numerals
    line = ''.join([str(unicodedata.digit(c)) if c.isdigit() else c for c in line])

    # Random punctuation:
    line = line.replace(u'！', '!')
    # NOTE(review): as written this replaces '-' with itself; presumably a
    # non-ASCII hyphen was intended here -- verify against the upstream file.
    line = line.replace(u'-', '-')
    line = line.replace(u'～', '~')
    line = line.replace(u'、', ',')
    #line = line.replace(u'。', '.')
    line = line.replace(u'\u0085', '...')
    line = line.replace(u'…', '...')
    line = line.replace(u'―', '--')
    line = line.replace(u'–', '--')
    line = line.replace(u'─', '--')
    line = line.replace(u'—', '--')
    line = line.replace(u'\u0097', '--')
    line = line.replace(u'•', ' * ')
    # The pattern is the two characters backslash + asterisk, spelled with
    # an explicit escape instead of the invalid '\*' sequence.
    line = line.replace(u'\\*', ' * ')
    line = line.replace(u'،', ',')
    line = line.replace(u'؟', '?')
    line = line.replace(u'ـ', ' ')
    line = line.replace(u'Ã ̄', 'i')
    line = line.replace(u'â€™', '\'')
    line = line.replace(u'â€"', '"')
    line = line.replace(u'؛', ';')

    # Regularize ligatures:
    line = line.replace(u'\u009c', 'oe')  # "oe" ligature
    line = line.replace(u'\u0153', 'oe')  # "oe" ligature
    line = line.replace(u'\u008c', 'Oe')  # "OE" ligature
    line = line.replace(u'\u0152', 'Oe')  # "OE" ligature
    line = line.replace(u'\ufb00', 'ff')  # "ff" ligature
    line = line.replace(u'\ufb01', 'fi')  # "fi" ligature
    line = line.replace(u'\ufb02', 'fl')  # "fl" ligature
    line = line.replace(u'\ufb03', 'ffi') # "ffi" ligature
    line = line.replace(u'\ufb04', 'ffl') # "ffl" ligature
    line = line.replace(u'\u0132', 'Ij')  # "Ij" ligature
    line = line.replace(u'\u0133', 'ij')  # "ij" ligature
    line = line.replace(u'\ufb06', 'st')  # "st" ligature
    line = line.replace(u'\u00c6', 'Ae')  # "Ae" ligature
    line = line.replace(u'\u00e6', 'ae')  # "ae" ligature
    line = line.replace(u'\ufb05', 'st')  # "st" ligature

    line = line.replace(u'β', u'ß') # WMT 2010 error

    # Strip extra spaces:
    line = re.sub(r'\s+', ' ', line)
    line = line.strip()
    return line

示例#56

0

显示文件

文件： 161_test_unicodedata.py 项目： asottile/ancient-pythons

""" Test script for the unicodedata module.

Written by Marc-Andre Lemburg ([email protected]).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
from test_support import verbose
import sys

# Test Unicode database APIs
import unicodedata

# Python 2 print statement; the trailing comma suppresses the newline.
print 'Testing unicodedata module...',

# digit(): with None as default a character without a digit value yields
# None; U+215B (value 0.125, see numeric() below) has none, U+2468 has 9.
assert unicodedata.digit(u'A',None) is None
assert unicodedata.digit(u'9') == 9
assert unicodedata.digit(u'\u215b',None) is None
assert unicodedata.digit(u'\u2468') == 9

# numeric(): the same four characters; here U+215B and U+2468 both
# have values.
assert unicodedata.numeric(u'A',None) is None
assert unicodedata.numeric(u'9') == 9
assert unicodedata.numeric(u'\u215b') == 0.125
assert unicodedata.numeric(u'\u2468') == 9.0

# decimal(): of the four characters only u'9' has a decimal value.
assert unicodedata.decimal(u'A',None) is None
assert unicodedata.decimal(u'9') == 9
assert unicodedata.decimal(u'\u215b',None) is None
assert unicodedata.decimal(u'\u2468',None) is None

# category(): U+FFFE must be reported as 'Cn'.
assert unicodedata.category(u'\uFFFE') == 'Cn'

示例#57

0

显示文件

文件： codepoint.py 项目： Codepoints/unicodeinfo

 def digit(self, default=None):
     """Look up the digit value of ``self.char`` through ``ud.digit``.

     ``default`` is always forwarded, so it is what ``ud.digit`` falls
     back to for a character without a digit value.
     """
     char = self.char
     return ud.digit(char, default)

示例#58

0

显示文件

文件： test_unicodedata.py 项目： BackupTheBerlios/etpe-svn

### Run tests

# Python 2 print statements: each label ends with a comma so the result of
# the following call is printed on the same output line.
print 'Testing Unicode Database...'
print 'Methods:',
print test_methods()

# In case unicodedata is not available, this will raise an ImportError,
# but still test the above cases...
import unicodedata
print 'Functions:',
print test_unicodedata()

# Some additional checks of the API:
print 'API:',

# digit(): with None as default a character without a digit value yields
# None; U+215B (value 0.125, see numeric() below) has none, U+2468 has 9.
verify(unicodedata.digit(u'A',None) is None)
verify(unicodedata.digit(u'9') == 9)
verify(unicodedata.digit(u'\u215b',None) is None)
verify(unicodedata.digit(u'\u2468') == 9)

# numeric(): the same four characters; here U+215B and U+2468 both
# have values.
verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)

# decimal(): of the four characters only u'9' has a decimal value.
verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)

# category(): U+FFFE must be reported as 'Cn'.
verify(unicodedata.category(u'\uFFFE') == 'Cn')

示例#59

0

显示文件

文件： test_unicodedata.py 项目： mcyril/ravel-ftn

""" Test script for the unicodedata module.
    Written by Marc-Andre Lemburg ([email protected]).
    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#"
from test_support import verify, verbose
import sha
encoding = 'utf-8'
def test_methods():
    h = sha.sha()
    for i in range(65536):
        char = unichr(i)
        data = [
            # Predicates (single char)
            char.isalnum() and u'1' or u'0',
            char.isalpha() and u'1' or u'0',
            char.isdecimal() and u'1' or u'0',
            char.isdigit() and u'1' or u'0',
            char.islower() and u'1' or u'0',
            char.isnumeric() and u'1' or u'0',
            char.isspace() and u'1' or u'0',
            char.istitle() and u'1' or u'0',
            char.isupper() and u'1' or u'0',
            # Predicates (multiple chars)
            (char + u'abc').isalnum() and u'1' or u'0',
            (char + u'abc').isalpha() and u'1' or u'0',
            (char + u'123').isdecimal() and u'1' or u'0',
            (char + u'123').isdigit() and u'1' or u'0',
            (char + u'abc').islower() and u'1' or u'0',
            (char + u'123').isnumeric() and u'1' or u'0',
            (char + u' \t').isspace() and u'1' or u'0',