예제 #1
0
def uni2tex(text):
    """Translate accented unicode characters in *text* into LaTeX macros.

    Relies on a module-level ``accents`` dict (not visible in this chunk)
    mapping combining-accent code points to LaTeX macro names.
    NOTE(review): Python 2 code — uses ``unichr``.
    """
    out = ""
    txt = tuple(text)
    i = 0
    while i < len(txt):
        char = text[i]
        code = ord(char)

        # Elsevier bibtex dumps sometimes have a fancy dash
        if code == 8211:
            out += "-"
        # combining marks
        elif unicodedata.category(char) in ("Mn", "Mc") and code in accents:
            # The macro wraps the *next* character, which is then consumed
            # by the extra i += 1 below.
            # NOTE(review): txt[i+1] raises IndexError if a combining mark
            # is the last character — confirm inputs never end with one.
            out += "{\\%s%s}" %(accents[code], txt[i+1])
            i += 1
        # precomposed characters
        elif unicodedata.decomposition(char):
            base, acc = unicodedata.decomposition(char).split()
            acc = int(acc, 16)
            base = int(base, 16)
            if acc in accents:
                out += "\\%s{%s}" %(accents[acc], unichr(base))
            else:
                # Unknown accent: keep the precomposed character as-is.
                out += char
        else:
            out += char

        i += 1

    return out
예제 #2
0
파일: latex.py 프로젝트: jdumas/autobib
def uni2tex(text):
    """Convert accented unicode characters in *text* into LaTeX macros.

    Based on
    http://tex.stackexchange.com/questions/23410/how-to-convert-characters-to-latex-code
    """
    chars = tuple(text)
    pieces = []
    pos = 0
    total = len(chars)
    while pos < total:
        ch = text[pos]
        cp = ord(ch)

        if unicodedata.category(ch) in ("Mn", "Mc") and cp in accents:
            # Combining mark: wrap the following character in the accent
            # macro and consume it.
            pieces.append("{\\%s{%s}}" % (accents[cp], chars[pos + 1]))
            pos += 1
        else:
            decomp = unicodedata.decomposition(ch)
            if decomp:
                # Precomposed character: split into base + accent code points.
                base_hex, acc_hex = decomp.split()
                acc_cp = int(acc_hex, 16)
                if acc_cp in accents:
                    pieces.append("{\\%s{%s}}" % (accents[acc_cp],
                                                  chr(int(base_hex, 16))))
                else:
                    pieces.append(ch)
            elif ch in specials:
                # Other special-cased characters (external table).
                pieces.append("{%s}" % specials[ch])
            else:
                pieces.append(ch)

        pos += 1

    return "".join(pieces)
예제 #3
0
def uni2tex(text):
    """Convert accented characters in *text* to LaTeX accent macros.

    Combining marks (e.g. U+0301) are applied to the character that
    follows them; precomposed characters (e.g. ``é``) are decomposed
    into base + accent.  Characters whose accent is unknown are passed
    through unchanged.

    Courtesy of https://tex.stackexchange.com/q/23410
    """
    # Combining-accent code point -> LaTeX accent macro name.
    accents = {
        0x0300: '`', 0x0301: "'", 0x0302: '^', 0x0308: '"',
        0x030B: 'H', 0x0303: '~', 0x0327: 'c', 0x0328: 'k',
        0x0304: '=', 0x0331: 'b', 0x0307: '.', 0x0323: 'd',
        0x030A: 'r', 0x0306: 'u', 0x030C: 'v',
        }
    out = ""
    txt = tuple(text)
    i = 0
    while i < len(txt):
        char = text[i]
        code = ord(char)

        # combining marks: apply the macro to the *next* character.
        # BUG FIX: guard i+1 so a trailing combining mark no longer raises
        # IndexError; it falls through and is emitted unchanged instead.
        if (unicodedata.category(char) in ("Mn", "Mc") and code in accents
                and i + 1 < len(txt)):
            out += "\\%s{%s}" % (accents[code], txt[i+1])
            i += 1
        # precomposed characters
        elif unicodedata.decomposition(char):
            base, acc = unicodedata.decomposition(char).split()
            acc = int(acc, 16)
            base = int(base, 16)
            if acc in accents:
                out += "\\%s{%s}" % (accents[acc], chr(base))
            else:
                out += char
        else:
            out += char
        i += 1
    return out
예제 #4
0
 def mapchar(self, key):
     """Map code point *key* to a replacement code point, caching in self.

     Returns None for rejected characters.  NOTE(review): Python 2 code
     (uses ``unichr``); ``CHAR_REPLACEMENT`` is defined elsewhere.
     """
     if key in self:
         return self[key]
     de = unicodedata.decomposition(unichr(key))
     if de:
         try:
             # First code point of the canonical decomposition.
             ch = int(de.split(None, 1)[0], 16)
         except (IndexError, ValueError):
             ch = key
     else:
         ch = CHAR_REPLACEMENT.get(unichr(key), key)
     # Whitelist: space, digits, ASCII letters and upper latin1 pass through.
     if ch  == 32: # space
         pass
     elif 47 < ch < 58: # digits
         pass
     elif 64 < ch < 91: # uppercase
         pass
     elif 96 < ch < 123: # lowercase
         pass
     elif 127 < ch < 165: # upper ascii latin1
         pass
     elif ch == 9: # map tab to space
         ch = 32
     elif ch < 128: # reject invalid lower ascii
         ch = None
     elif ch in (152, 158) or ch < 256:
         # reject remaining latin1 code points not whitelisted above
         ch = None
     self[key] = ch
     return ch
예제 #5
0
    def getdetails(self, text):
        """Return Unicode metadata for every character in *text*.

        Maps each character to a dict of properties (name, code point,
        numeric/decimal values where defined, category flags, canonical
        decomposition).  A 'Characters' key holds list(text).
        """
        chardetails = {}
        for character in text:
            details = {}
            details['Name'] = unicodedata.name(character)
            details['HTML Entity'] = str(ord(character))
            details['Code point'] = repr(character)
            try:
                details['Numeric Value'] = unicodedata.numeric(character)
            except (TypeError, ValueError):
                # Not a numeric character; key is simply omitted.
                pass
            try:
                details['Decimal Value'] = unicodedata.decimal(character)
            except (TypeError, ValueError):
                pass
            try:
                # BUG FIX: originally called unicodedata.digit(mychar) with
                # an undefined name; the NameError was silently swallowed
                # by a bare except, so the lookup never ran.
                details['Digit'] = unicodedata.digit(character)
            except (TypeError, ValueError):
                pass
            details['Alphabet'] = str(character.isalpha())
            # Intentionally overwrites the numeric 'Digit' above with the
            # string flag, matching the original output contract.
            details['Digit'] = str(character.isdigit())
            details['AlphaNumeric'] = str(character.isalnum())
            details['Canonical Decomposition'] = \
                    unicodedata.decomposition(character)
            chardetails[character] = details

        chardetails['Characters'] = list(text)
        return chardetails
예제 #6
0
파일: utils.py 프로젝트: mrtopf/jmstvcamp
def string2filename(s):
    """Convert a string to a valid filename.

    Lowercases, strips any leading path, transliterates mapped or
    decomposable characters, keeps only filename-safe characters, and
    replaces spaces with dashes.  Relies on a module-level ``mapping``
    table (code point -> replacement text) defined elsewhere.
    """
    s = s.strip()
    s = s.lower()

    # remove an eventual path
    s = s.replace("\\","/")
    _, s = os.path.split(s)

    res = u''
    for c in s:
        o = ord(c)
        # FIX: membership test on the dict itself instead of building a
        # keys() list per character; also removed the unused `mkeys` local.
        if o in mapping:
            res = res+mapping[o]
            continue
        if decomposition(c):
            # Decomposable: keep the NFKD form (base char + marks; the
            # marks are filtered out by valid_chars below).
            res = res + normalize('NFKD', c)
        else:
            res = res + c

    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    res = ''.join(c for c in res if c in valid_chars)
    res = res.replace(" ","-")
    return res
예제 #7
0
파일: ttfdiet.py 프로젝트: davelab6/ttfdiet
	def getDecompositionData(u,missingMarks):
	# inside so we can use umap, nmap ...
			"""Return (glyph, component glyphs, first code point) for code
			point *u* when it canonically decomposes and every component
			already has a glyph in umap; otherwise return 0.

			Hex strings of absent marks are appended to *missingMarks*.
			NOTE(review): Python 2 (``unichr``); umap, SKIP_MARKS_FINAL,
			MARK_GLYPH_CODEPOINT_RANGE, unicodeIntToHexstr come from the
			enclosing scope.
			"""
			udec = None
			try: 
				dec = unicodedata.decomposition(unichr(u))
				if len(dec) > 1:
					# Skip compatibility decompositions such as "<isolated> ...".
					if not dec[:1] == "<":
						udec = [int(s, 16) for s in dec.split()]
						decall = 0
						for ud in udec:
							if ud in SKIP_MARKS_FINAL: # if mark is in SKIP_MARKS_FINAL we don't want to do any decomposition
								return 0
							if ud in umap:
								decall += 1
							else:
								if  ud not in SKIP_MARKS_FINAL \
								and ud     in MARK_GLYPH_CODEPOINT_RANGE:
									missingMarks += [unicodeIntToHexstr(ud)]
	#					if decall == len(udec) and decall == 1:
	#						print "SAME:",umap[u],[umap[ud] for ud in udec]
						# Only decompose when *all* components have glyphs.
						if decall == len(udec) and decall > 1: # the last condition may go for the sake of allowing reference to same-shape glyphs
							return umap[u],[umap[ud] for ud in udec],udec[0] # last one is the one to check next
			except ValueError: 
				return 0
			return 0
예제 #8
0
파일: title_to_id.py 프로젝트: dvorberg/t4
def asciify(string):
    '''
    "ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.

    This very cool function originates at
    http://www.physic.ut.ee/~kkannike/english/prog/python/index.html
    '''
    # Work on composed (NFC) characters: the decomposition() lookups
    # below expect precomposed code points.
    string = normalize("NFC", string)

    pieces = []
    for char in string:
        decomp = decomposition(char)
        if not decomp:
            pieces.append(char)
            continue
        fields = decomp.split()
        try:
            pieces.append(unichr(int(fields[0], 16)))
        except ValueError:
            # First field was a compatibility tag such as "<super>".
            if fields[0] == "<super>":
                pieces.append(unichr(int(fields[1], 16)))
            # Any other tagged decomposition is silently dropped.
    return u''.join(pieces)
예제 #9
0
파일: models.py 프로젝트: hsoft/dtf
 def deaccent_char(c):
     """Return the base character of *c*, stripping any accent.

     If *c* has a canonical decomposition, the first code point of that
     decomposition is returned; otherwise *c* itself.
     """
     decomp = unicodedata.decomposition(c)
     if not decomp:
         return c
     first_field = decomp.split(' ')[0]
     return chr(int(first_field, 16))
예제 #10
0
def extended_unicode_model(list):
    """
    Takes as input a list of QLC-formatted words and outputs a unigram model.

    Prints one tab-separated row per distinct segment, most frequent first.
    """
    counts = collections.defaultdict(int)
    total = 0

    for word in list:
        for seg in word.strip().split():
            total += 1
            counts[seg] += 1

    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

    # print("Phone"+"\t"+"Int"+"\t"+"Count"+"\t"+"Frequency") # +"\t"+"plog")
    print("Char"+"\t"+"int"+"\t"+"Unicode name"+"\t"+"category"+"\t"+"comb class"+"\t"+"decomposition"+"\t"+"count"+"\t"+"frequency")

    for seg, count in ranked:
        freq = counts[seg]/total

        # decimal = unicodedata.decimal(seg)
        name = unicodedata.name(seg)
        category = unicodedata.category(seg)
        comb_class = unicodedata.combining(seg)
        decomp = unicodedata.decomposition(seg)

        print(seg+"\t"+str(ord(seg))+"\t"+name+"\t"+category+"\t"+str(comb_class)+"\t"+decomp+"\t"+str(count)+"\t"+str(freq))
예제 #11
0
def normalizeUnicode(text, encoding='humanascii'):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.
    """
    # NOTE(review): Python 2 code (unicode/has_key); `allowed`, `allowedid`
    # and `mapping` are module-level tables defined elsewhere.
    if text == "":
	return ""

    unicodeinput = True
    if not isinstance(text, unicode):
        text = unicode(text, 'utf-8')
        unicodeinput = False

    res = ''
    global allowed, allowedid
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        else:
            try:
                # Characters that survive a strict encode are kept (or,
                # in identifier mode, collapsed to '_').
                ch.encode(enc,'strict')
                if encoding == 'identifier':
                    res += '_'
                else:
                    res += ch
            except UnicodeEncodeError:
                ordinal = ord(ch)
                if mapping.has_key(ordinal):
                    # try to apply custom mappings
                    res += mapping.get(ordinal)
                elif decomposition(ch) or len(normalize('NFKD',ch)) > 1:
                    # Strip combining marks from the NFKD decomposition.
                    normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)).strip()
                    # normalized string may contain non-letter chars too. Remove them
                    # normalized string may result to  more than one char
                    if encoding == 'identifier':
                        res += ''.join([c for c in normalized if c in allowedid])
                    else:
                        res += ''.join([c for c in normalized if c in allowed])
                else:
                    # hex string instead of unknown char
                    res += "%x" % ordinal
    if encoding == 'identifier':
        # Collapse runs of underscores and force a letter-leading name.
        res = res.strip('_').replace('_____','_').replace('____','_').replace('___','_').replace('__','_')
        if not res.strip('_')[0] in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
예제 #12
0
def isvalidaccelerator(accelerator, acceptlist=None):
    """returns whether the given accelerator character is valid

    @type accelerator: character
    @param accelerator: A character to be checked for accelerator validity
    @type acceptlist: String
    @param acceptlist: A list of characters that are permissible as accelerators
    @rtype: Boolean
    @return: True if the supplied character is an acceptable accelerator
    """
    assert isinstance(accelerator, unicode)
    assert isinstance(acceptlist, unicode) or acceptlist is None
    if not accelerator:
        return False
    if acceptlist is not None:
        # Explicit whitelist wins over all other checks.
        acceptlist = data.normalize(acceptlist)
        return accelerator in acceptlist
    # Old code path - ensures that we don't get a large number of regressions
    stripped = accelerator.replace("_", "")
    if stripped in u"-?":
        return True
    if not stripped.isalnum():
        return False

    # We don't want accelerators on characters with diacritics, so let's
    # see if the character can decompose.  Strip out any extra
    # information like <this> first.
    decomposed = unicodedata.decomposition(stripped)
    decomposed = re.sub("<[^>]+>", "", decomposed).strip()
    return decomposed.count(" ") == 0
def remove_accents(chars):
    """Divides a given string into decomposable and undecomposable characters.

    Returns (decomposable, undecomposable), each a list of
    (character, replacement) pairs.  NOTE(review): Python 2 code — relies
    on map/filter returning lists and on ``unichr``; ``CHAR_REPLACEMENT``
    is defined elsewhere.
    """
    decomposable = []
    undecomposable = []
    for c in chars:
        de = unicodedata.decomposition(c)
        if de:
            dechars = de.split(None)
            try:
                # Only keep characters with a decimal value < 300
                dechars = map(lambda i: int(i, 16), dechars)
                dechars = filter(lambda i: i < 300, dechars)                
                dechars = map(unichr, dechars)
                de = "".join(dechars)
            except (IndexError, ValueError):
                # Compatibility decompositions begin with a "<tag>" field,
                # so int(..., 16) fails; fall back to the replacement table
                # or strip the tag and convert the remaining fields.
                if ord(c) in CHAR_REPLACEMENT:
                    de = CHAR_REPLACEMENT[ord(c)]
                else:
                    dechars = filter(lambda s: s[0] != "<", dechars)
                    dechars = map(lambda i: int(i, 16), dechars)
                    dechars = map(unichr, dechars)
                    de = "".join(dechars)
                undecomposable.append((c, de))
            else:
                decomposable.append((c, de))
        else:
            # No decomposition at all: only table-mapped chars are recorded.
            if ord(c) in CHAR_REPLACEMENT:
                de = CHAR_REPLACEMENT[ord(c)]
                undecomposable.append((c, de))
    return decomposable, undecomposable
예제 #14
0
def buildCompatChars(sfd, ttf):
    """Build Arabic presentation-form glyphs in *sfd* as references.

    Shapes each compatibility character's decomposition with HarfBuzz
    (loaded from the compiled *ttf*) and adds the shaped components as
    glyph references.  NOTE(review): Python 2 (``unichr``); ``ucd`` and
    ``shape`` are defined elsewhere in this file.
    """
    zwj = u'\u200D'
    # Arabic presentation forms A and B.
    ranges = (
            (0xfb50, 0xfbb1),
            (0xfbd3, 0xfd3d),
            (0xfd50, 0xfdf9),
            (0xfdfc, 0xfdfc),
            (0xfe70, 0xfefc),
            )

    with open(ttf, "rb") as f:
        data = f.read()
        blob = HarfBuzz.glib_blob_create(GLib.Bytes.new(data))
        face = HarfBuzz.face_create(blob, 0)
        hbfont = HarfBuzz.font_create(face)
        upem = HarfBuzz.face_get_upem(face)
        HarfBuzz.font_set_scale(hbfont, upem, upem)
        HarfBuzz.ot_font_set_funcs(hbfont)

    ttfont = TTFont(ttf)

    for r in ranges:
        for c in range(r[0], r[1]+1):
            dec = ucd.decomposition(unichr(c)).split()
            if dec:
                # First field is the form tag, the rest are code points.
                keyword = dec[0]
                text = u''

                for i in dec[1:]:
                    text += unichr(int(str(i),16))

                # Surround with ZWJ so shaping picks the right contextual form.
                if keyword == '<initial>':
                    text = text + zwj
                elif keyword == '<final>':
                    text = zwj + text
                elif keyword == '<medial>':
                    text = zwj + text + zwj

                components = shape(text, hbfont)
                if components:
                    glyph = sfd.createChar(c)
                    glyph.clear()
                    glyph.color = 0xff0000 # red color
                    x = 0
                    for component in components:
                        gid = component[0]
                        name = ttfont.getGlyphName(gid)
                        x_advance = component[1]
                        x_offset = component[2]
                        y_offset = component[3]

                        # Place the component at the current pen position.
                        matrix = psMat.translate(x + x_offset, y_offset)

                        # ignore blank glyphs, e.g. space or ZWJ
                        if sfd[name].foreground or sfd[name].references:
                            glyph.addReference(name, matrix)

                        x += x_advance

                    glyph.width = x
예제 #15
0
파일: hooks.py 프로젝트: comlounge/userbase
def string2filename(s, path = None, default=u"anonymous"):
    """Convert a string to a valid filename.

    Lowercases *s*, strips any path component, transliterates mapped or
    decomposable characters, keeps only filename-safe characters and
    replaces spaces with underscores.  When *path* is given, appends a
    numeric (then UUID) suffix until the name does not collide with an
    existing file; returns None if no free name is found.
    NOTE(review): Python 2 code (types.UnicodeType/unicode); ``mapping``
    is a module-level table defined elsewhere.
    """
    from unicodedata import decomposition, normalize

    # TODO: make it a better conversion?
    if type(s) != types.UnicodeType:
        s = unicode(s)

    s = s.strip()
    s = s.lower()

    if s=="":
        s = default

    # remove an eventual path
    s = s.replace("\\","/")
    _, s = os.path.split(s)

    res = u''
    for c in s:
        o = ord(c)
        # FIX: membership test on the dict itself instead of building a
        # keys() list per character; removed the unused `mkeys` local.
        if o in mapping:
            res = res+mapping[o]
            continue
        if decomposition(c):
            res = res + normalize('NFKD', c)
        else:
            res = res + c

    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    filename = ''.join(c for c in res if c in valid_chars)
    filename = filename.replace(" ","_")

    # if path is not None we can check if there already is a file with that name
    if path is None:
        return filename

    fullpath=os.path.join(path, filename)
    if not os.path.exists(fullpath):
        return filename

    # remove the extension
    root, ext = os.path.splitext(filename)

    # try numeric suffixes first ...
    for idx in range(1,100):
        filename = "%s-%d%s" %(root, idx, ext)
        if not os.path.exists(os.path.join(path,filename)):
            return filename

    # ... then fall back to random UUID suffixes
    for idx in range(1,100):
        u = unicode(uuid.uuid4())
        filename = "%s-%s%s" %(root, u, ext)
        if not os.path.exists(os.path.join(path,filename)):
            return filename

    return None # we did not get a result, TODO: further checking
예제 #16
0
 def mapchar(self, key):
     """Return (and cache) the replacement code point for *key*.

     Decomposable characters map to the first code point of their
     canonical decomposition; anything else falls back to the
     CHAR_REPLACEMENT table, defaulting to *key* itself.
     """
     cached = self.get(key)
     if cached is not None:
         return cached
     # chr/unichr chosen by interpreter version (Python 2 vs 3).
     if sys.version_info >= (3, 0):
         decomp = unicodedata.decomposition(chr(key))
     else:
         decomp = unicodedata.decomposition(unichr(key))
     if not decomp:
         result = CHAR_REPLACEMENT.get(key, key)
     else:
         try:
             result = int(decomp.split(None, 1)[0], 16)
         except (IndexError, ValueError):
             result = key
     self[key] = result
     return result
예제 #17
0
def buildCompatChars(font, hbfont):
    """Build Arabic presentation-form glyphs in *font* as references.

    Batches every compatibility decomposition into one newline-separated
    text run for runHB, then adds each shaped component as a glyph
    reference.  NOTE(review): Python 2 (``unichr``); ``ucd`` and
    ``runHB`` are defined elsewhere in this file.
    """
    zwj = u'\u200D'
    # Arabic presentation forms A and B.
    ranges = (
            (0xfb50, 0xfbb1),
            (0xfbd3, 0xfd3d),
            (0xfd50, 0xfdf9),
            (0xfdfc, 0xfdfc),
            (0xfe70, 0xfefc),
            )
    text = u''
    codes = []
    for r in ranges:
        for c in range(r[0], r[1]+1):
            dec = ucd.decomposition(unichr(c)).split()
            if dec:
                codes.append(c)
                # First field is the form tag, the rest are code points.
                keyword = dec[0]
                new_text = u''

                for i in dec[1:]:
                    new_text += unichr(int(str(i),16))

                # Surround with ZWJ so shaping picks the right contextual form.
                if keyword == '<initial>':
                    new_text = new_text + zwj
                elif keyword == '<final>':
                    new_text = zwj + new_text
                elif keyword == '<medial>':
                    new_text = zwj + new_text + zwj

                text += new_text + '\n'

    # One shaped line per entry of `codes`, in the same order.
    lines = runHB(text, hbfont)
    i = 0
    for c in codes:
        components = lines[i]
        i += 1
        if components:
            glyph = font.createChar(c)
            glyph.clear()
            glyph.color = 0xff0000 # red color
            x = 0
            for component in components:
                name = component[0]
                x_advance = component[1]
                y_advance = component[2]
                x_offset = component[3]
                y_offset = component[4]

                # Place the component at the current pen position.
                matrix = psMat.translate(x + x_offset, y_offset)

                # ignore blank glyphs, e.g. space or ZWJ
                if font[name].foreground or font[name].references:
                    glyph.addReference(name, matrix)

                x += x_advance

            glyph.width = x
예제 #18
0
파일: compile.py 프로젝트: bpeel/catverbs
def make_index_value(display_name):
    """Reduce *display_name* to a lowercase a-z index key.

    Accented characters are replaced by the base character of their
    Unicode decomposition; everything outside a-z is dropped.
    """
    letters = bytearray()

    for ch in display_name:
        decomp = unicodedata.decomposition(ch)
        if decomp:
            # Keep only the base (first) code point of the decomposition.
            ch = chr(int(decomp.split()[0], 16))
        if 'a' <= ch <= 'z':
            letters.append(ord(ch))

    return letters.decode("ASCII")
예제 #19
0
def normalizeRtlString(s):
	"""Decompose Arabic presentation-form B characters in *s*.

	Windows commonly yields presentation forms (U+FE70..U+FEFF) when
	converting from glyphs; map each back to its basic Arabic character.
	"""
	result = []
	for ch in s:
		if 0xfe70 <= ord(ch) <= 0xfeff:
			decomp = unicodedata.decomposition(ch)
			fields = decomp.split(' ') if decomp else None
			# Only simple "<form> XXXX" decompositions are rewritten.
			if fields and len(fields) == 2 and fields[0] in ('<initial>', '<medial>', '<final>', '<isolated>'):
				ch = unichr(int(fields[1], 16))
		result.append(ch)
	return u"".join(result)
예제 #20
0
파일: hu_lastfm.py 프로젝트: shajith/hu
def asciify(string):
	"""
	gets rid of pesky things like umlauts and tildes and other accents. ascii all the way, baby.
	"""
	pieces = []
	for ch in string:
		decomp = decomposition(ch)
		if decomp: # Not an empty string
			# Replace with the base (first) code point of the decomposition.
			pieces.append(unichr(int(decomp.split()[0], 16)))
		else:
			pieces.append(ch)
	return u''.join(pieces)
예제 #21
0
def store_contextual_form():
	"""Render one contextual-form cell pair into current_line.

	Uses enclosing-scope variables (equiv, char, contextual_form_formats,
	current_line, ord_mul, StringIO) — TODO confirm against caller.
	"""
	# print('store_contextual_form', equiv, file=sys.stderr)
	compat_disp = equiv.compat
	# A leading plain space would collapse in HTML; show NBSP instead.
	if equiv.compat[0] == ' ': compat_disp = '\u00A0' + compat_disp[1:]
	#nonlocal current_line
	form_cells = StringIO()
	# Slice of the decomposition tag, e.g. "<isolated> ..." -> "isolat";
	# presumably matches the keys of contextual_form_formats — verify.
	form = decomposition(char)[1:7]
	print('<td class="ch">{}{}</td>'.format(contextual_form_formats.get(form, '{}').format(compat_disp),
		'<small><br/>{}</small>'.format(ord_mul(compat_disp)) if len(compat_disp) >=2 else ''), file=form_cells)
	print('<td class="ch">{}<small><br />{:04X}</small></td>'.format(char, equiv.code_point), file=form_cells)
	#if current_line.get(form, 'not found') != 'not found': print('collision', current_line[form].rstrip(), equiv, file=stderr)
	current_line[form] = form_cells.getvalue()
	form_cells.close()
예제 #22
0
 def mapchar(self,key):
     """Return the cached replacement for code point *key*, computing it
     on first use.

     Decomposable characters reduce to the first code point of their
     decomposition; others fall back to the ``charmap`` table.
     """
     hit = self.get(key)
     if hit is not None:
         return hit
     decomp = unicodedata.decomposition(unichr(key))
     if decomp:
         try:
             value = int(decomp.split(None,1)[0],16)
         except (IndexError, ValueError):
             value = key
     else:
         value = charmap.get(key,key)
     self[key] = value
     return value
예제 #23
0
파일: gordon_db.py 프로젝트: bmcfee/gordon
 def mapchar(self, key):
     """Return (and cache) a deaccented replacement character for *key*.

     NOTE(review): Python 2 (``unichr``); falls back to
     ``self.CHAR_REPLACEMENT`` when no decomposition applies.
     """
     ch = self.get(key)
     if ch is not None:
         return ch
     ch = unichr(key)
     try:
         # First code point of the canonical decomposition.
         ch = unichr(int(decomposition(ch).split()[0], 16))
     except (IndexError, ValueError):
         ch = self.CHAR_REPLACEMENT.get(key, ch)
     # uncomment the following line if you want to remove remaining
     # non-ascii characters
     # if ch >= u"\x80": return None
     self[key] = ch
     return ch
예제 #24
0
def return_an_analysis_of_a_string(src,
                                   raw_number_only=False):
    """
        This function is debug-oriented and describes what lies in the
        string <src>.

        * default mode: each character becomes "0x.. ( =<decomposition> )"
          and the pieces are joined with " + ".
        * if <raw_number_only> == True: output = "xxxx yyyy zzzz..."
          (four-digit uppercase hex code points, space separated)
    """

    res = []

    for char in src:

        # normal case :
        try:

            if not raw_number_only:

                decompos = ""
                if unicodedata.decomposition(char) != "":
                    decompos = " ( ="+unicodedata.decomposition(char)+" )"

                res.append( str(hex(ord(char)))+decompos )

            else:
                # BUG FIX: the original did res = "".join((res, ...)) while
                # res was still a list, raising TypeError on the first
                # character and breaking raw_number_only mode entirely.
                res.append("{0:04X}".format(ord(char)))

        # special case :
        except ValueError:
            res.append("!UNKNOWN CHARACTER! : "+str(hex(ord(char))))

    # Join per the documented output formats.
    if raw_number_only:
        return " ".join(res)
    return " + ".join(res)
예제 #25
0
파일: kana.py 프로젝트: davejagoda/proggy
def make_list(lo, hi):
    """Build a list of (romaji, kana) pairs for code points in [lo, hi).

    risuto is Romaji for list, since lower case l is easily confused.
    Only undecomposed LETTER characters that are not SMALL are kept.
    NOTE: Python 2 code (xrange/unichr).
    """
    risuto = []
    for codepoint in xrange(lo, hi):
        kana = unichr(codepoint)
        try:
            name = unicodedata.name(kana)
        except:
            name = 'NONE'
        if unicodedata.decomposition(kana) == '' and 'LETTER' in name and 'SMALL' not in name:
            (syllabary, letter, romaji) = name.split(' ')
            if len(romaji) == 1:
                romaji += ' ' # hack, do this with format
            risuto.append((romaji, kana))
    return(risuto)
예제 #26
0
 def __missing__(self, key):
     """Compute and cache a mapping for code point *key* on first lookup.

     NOTE(review): Python 2 (``unichr``).  When the decomposition's last
     field is U+0308 (diaeresis), falls back to self.get(key) — which is
     None for a missing key, and that None gets cached; other
     decomposable characters map to the base code point.
     """
     ch = self.get(key)
     if ch is not None:
         return ch
     try:
         de = unicodedata.decomposition(unichr(key))
         # rpartition: p1 = fields before the last, p2 = last field.
         p1, _, p2 = de.rpartition(' ')
         if int(p2, 16) == 0x308:
             ch = self.get(key)
         else:
             ch = int(p1, 16)
     except (IndexError, ValueError):
         ch = self.get(key, key)
     self[key] = ch
     return ch
예제 #27
0
파일: utils.py 프로젝트: ibz/travelist
 def __missing__(self, k):
     """Return a mapping for code point *k*, caching the result.

     STATIC_MAP overrides win; otherwise decomposable characters map to
     the first code point of their decomposition, and everything else
     maps to itself.  NOTE(review): Python 2 (``unichr``).
     """
     if k in self:
         return self[k]
     v = k
     if k in self.STATIC_MAP:
         v = self.STATIC_MAP[k]
     else:
         de = unicodedata.decomposition(unichr(k))
         if de:
             try:
                 # Base (first) code point of the decomposition.
                 v = int(de.split(None, 1)[0], 16)
             except (IndexError, ValueError):
                 pass
     self[k] = v
     return v
예제 #28
0
def overview(tree_item):
    """ Returns an overview of the character

    Formats the Unicode properties of ``tree_item.obj`` into the
    module-level TEMPLATE string.
    """
    char = tree_item.obj
    properties = (
        unicodedata.name(char, '<NO NAME AVAILABLE>'),
        char,
        unicodedata.decimal(char, ''),
        unicodedata.digit(char, ''),
        unicodedata.numeric(char, ''),
        unicodedata.category(char),
        unicodedata.bidirectional(char),
        unicodedata.combining(char),
        unicodedata.east_asian_width(char),
        unicodedata.mirrored(char),
        unicodedata.decomposition(char),
    )
    return TEMPLATE.format(*properties)
예제 #29
0
def asciified(text):
    """
    Similar to ``text`` but with none ASCII letters replaced by their decomposed ASCII
    equivalent.

    NOTE: Python 2 code — *text* must be ``unicode``.
    """
    assert text is not None
    if not isinstance(text, unicode):
        raise ValueError(u"text must be unicode instead of %s" % type(text))
    pieces = []
    for ch in text:
        decomp = unicodedata.decomposition(ch)
        if decomp:
            # Base (first) code point of the decomposition.
            pieces.append(unichr(int(decomp.split()[0], 16)))
        else:
            pieces.append(ch)
    return u"".join(pieces)
예제 #30
0
파일: text.py 프로젝트: prymatex/prymatex
    def __missing__(self, key):
        """Compute and cache a replacement code point for *key*.

        Works on both Python 2 and 3 via ``six``.  A character whose
        decomposition's second field is U+0308 (diaeresis) is looked up
        in CHAR_REPLACEMENT; other decomposable characters map to their
        base code point; failures fall back to CHAR_REPLACEMENT or *key*.
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        try:
            de = unicodedata.decomposition(six.PY3 and chr(key) or unichr(key))
            # Expect exactly two hex fields (base + combining mark);
            # anything else makes int() raise and lands in the except.
            p1, p2 = [int(x, 16) for x in de.split(None, 1)]
            if p2 == 0x308:
                ch = self.CHAR_REPLACEMENT.get(key)
            else:
                ch = int(p1)

        except (IndexError, ValueError):
            ch = self.CHAR_REPLACEMENT.get(key, key)
        self[key] = ch
        return ch
예제 #31
0
# Smoke tests for the unicodedata module (Python 2: print statement below).
assert unicodedata.numeric(u'A', None) is None
assert unicodedata.numeric(u'9') == 9
assert unicodedata.numeric(u'\u215b') == 0.125
assert unicodedata.numeric(u'\u2468') == 9.0

assert unicodedata.decimal(u'A', None) is None
assert unicodedata.decimal(u'9') == 9
assert unicodedata.decimal(u'\u215b', None) is None
assert unicodedata.decimal(u'\u2468', None) is None

assert unicodedata.category(u'\uFFFE') == 'Cn'
assert unicodedata.category(u'a') == 'Ll'
assert unicodedata.category(u'A') == 'Lu'

assert unicodedata.bidirectional(u'\uFFFE') == ''
assert unicodedata.bidirectional(u' ') == 'WS'
assert unicodedata.bidirectional(u'A') == 'L'

assert unicodedata.decomposition(u'\uFFFE') == ''
assert unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034'

assert unicodedata.mirrored(u'\uFFFE') == 0
assert unicodedata.mirrored(u'a') == 0
assert unicodedata.mirrored(u'\u2201') == 1

assert unicodedata.combining(u'\uFFFE') == 0
assert unicodedata.combining(u'a') == 0
assert unicodedata.combining(u'\u20e1') == 230

print 'done.'
예제 #32
0
import unicodedata

# Print basic Unicode properties for a few sample characters.
# (Python 2: the trailing comma keeps the output on one line.)
for char in [u"A", u"-", u"1", u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}"]:
    print repr(char),
    print unicodedata.category(char),
    print repr(unicodedata.decomposition(char)),
    print unicodedata.decimal(char, None),
    print unicodedata.numeric(char, None)

## u'A' Lu '' None None
## u'-' Pd '' None None
## u'1' Nd '' 1 1.0
## u'Ö' Lu '004F 0308' None None

예제 #33
0
 def decomposition(self):
     """Return the canonical decomposition string for this character.

     Thin wrapper around ``unicodedata.decomposition`` applied to
     ``self.c``; returns '' when the character has no decomposition.
     """
     char = self.c
     return unicodedata.decomposition(char)
# Same unicodedata smoke tests written with the old test_support-style
# `verify` helper (Python 2; `verify` is defined/imported elsewhere).
verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)

verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)

verify(unicodedata.category(u'\uFFFE') == 'Cn')
verify(unicodedata.category(u'a') == 'Ll')
verify(unicodedata.category(u'A') == 'Lu')

verify(unicodedata.bidirectional(u'\uFFFE') == '')
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')

verify(unicodedata.decomposition(u'\uFFFE') == '')
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034')

verify(unicodedata.mirrored(u'\uFFFE') == 0)
verify(unicodedata.mirrored(u'a') == 0)
verify(unicodedata.mirrored(u'\u2201') == 1)

verify(unicodedata.combining(u'\uFFFE') == 0)
verify(unicodedata.combining(u'a') == 0)
verify(unicodedata.combining(u'\u20e1') == 230)

print 'ok'
예제 #35
0
def baseNormalize(text):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

      >>> baseNormalize(123)
      '123'

      >>> baseNormalize(u'a\u0fff')
      'afff'

      >>> baseNormalize(u"foo\N{LATIN CAPITAL LETTER I WITH CARON}")
      'fooI'

      >>> baseNormalize(u"\u5317\u4EB0")
      '53174eb0'
    """
    # NOTE(review): Python 2 code (basestring); `allowed`, `CHAR`, `NULLMAP`
    # and `UNIDECODE_LIMIT` are module-level names defined elsewhere.
    if not isinstance(text, basestring):
        # This most surely ends up in something the user does not expect
        # to see. But at least it does not break.
        return repr(text)

    text = text.strip()

    res = []
    for ch in text:
        if ch in allowed:
            # ASCII chars, digits etc. stay untouched
            res.append(ch)
        else:
            ordinal = ord(ch)
            if ordinal < UNIDECODE_LIMIT:
                # Transliterate via unidecode's per-page data tables,
                # imported lazily and cached in CHAR by page number.
                h = ordinal >> 8
                l = ordinal & 0xff

                c = CHAR.get(h, None)

                if c == None:
                    try:
                        mod = __import__('unidecode.x%02x' % (h), [], [],
                                         ['data'])
                    except ImportError:
                        # No table for this page: mark it with NULLMAP and
                        # drop the character.
                        CHAR[h] = NULLMAP
                        res.append('')
                        continue

                    CHAR[h] = mod.data

                    try:
                        res.append(mod.data[l])
                    except IndexError:
                        res.append('')
                else:
                    try:
                        res.append(c[l])
                    except IndexError:
                        res.append('')

            elif decomposition(ch):
                normalized = normalize('NFKD', ch).strip()
                # string may contain non-letter chars too. Remove them
                # string may result to more than one char
                res.append(''.join([c for c in normalized if c in allowed]))

            else:
                # hex string instead of unknown char
                res.append("%x" % ordinal)

    return ''.join(res).encode('ascii')
예제 #36
0
def setUpModule():
    """Dump the Unicode character database to a CSV file and load it into
    an EXASOL test schema (``utest.unicodedata``) via the EXAplus client.

    Python 2 code (``xrange``/``unichr``).  Relies on the module-level
    test harness: ``utf8encoder`` to encode CSV rows and ``udf.opts`` for
    the server connection string.  Logs EXAplus output; does not raise on
    loader failure.
    """
    log = logging.getLogger('unicodedata')

    log.info('generating unicodedata CSV')
    with tempfile.NamedTemporaryFile(prefix='unicode-',
                                     suffix='.csv') as csvfile:
        c = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        for i in xrange(sys.maxunicode + 1):
            if i >= 5024 and i <= 5119:
                continue  # the Unicode Cherokee-Block is broken in Python 2.7 and Python 3.4 (maybe also 3.5)
            u = unichr(i)
            if unicodedata.category(u).startswith('C'):
                # Skip the "Other" categories:
                # [Cc]Other, Control
                # [Cf]Other, Format
                # [Cn]Other, Not Assigned
                # [Co]Other, Private Use
                # [Cs]Other, Surrogate
                continue
            row = (
                i,  # INT 0-1114111
                unicodedata.name(u,
                                 'UNICODE U+%08X' % i),  # VARCHAR(100) ASCII
                u,  # VARCHAR(1) UNICODE
                u.upper(),  # VARCHAR(1) UNICODE
                u.lower(),  # VARCHAR(1) UNICODE
                unicodedata.decimal(u, None),  # INT
                unicodedata.numeric(u, None),  # DOUBLE
                unicodedata.category(u),  # VARCHAR(3) ASCII
                unicodedata.bidirectional(u),  # VARCHAR(3) ASCII
                unicodedata.combining(u),  # INT (canonical combining class)
                unicodedata.east_asian_width(u),  # VARCHAR(2) ASCII
                # BUG FIX: was bool(unicodedata.mirrored) -- the truth value
                # of the *function object*, i.e. True for every character.
                bool(unicodedata.mirrored(u)),  # BOOLEAN
                unicodedata.decomposition(u),  # VARCHAR(100) ASCII
                unicodedata.normalize('NFC', u),  # VARCHAR(10) UNICODE
                unicodedata.normalize('NFD', u),  # VARCHAR(10) UNICODE
                unicodedata.normalize('NFKC', u),  # VARCHAR(20) UNICODE
                unicodedata.normalize('NFKD', u),  # VARCHAR(20) UNICODE
            )
            c.writerow(utf8encoder(row))
        csvfile.flush()

        log.info('loading CSV')
        sql = '''
            DROP SCHEMA utest CASCADE;
            CREATE SCHEMA utest;
            CREATE TABLE unicodedata (
                codepoint INT NOT NULL,
                name VARCHAR(100) ASCII,
                uchar VARCHAR(1) UTF8,
                to_upper VARCHAR(1) UTF8,
                to_lower VARCHAR(1) UTF8,
                decimal_value INT,
                numeric_value INT,
                category VARCHAR(3) ASCII,
                bidirectional VARCHAR(3) ASCII,
                combining VARCHAR(10) ASCII,
                east_asian_width VARCHAR(2) ASCII,                mirrored BOOLEAN,
                decomposition VARCHAR(100) ASCII,
                NFC VARCHAR(10) UTF8,
                NFD VARCHAR(10) UTF8,
                NFKC VARCHAR(20) UTF8,
                NFKD VARCHAR(20) UTF8
                );
            IMPORT INTO unicodedata
            FROM LOCAL CSV FILE '%s'
            ROW SEPARATOR = 'CRLF';
            ''' % os.path.join(os.getcwd(), csvfile.name)
        cmd = '''%(exaplus)s -c %(conn)s -u sys -P exasol
		        -no-config -autocommit ON -L -pipe''' % {
            'exaplus':
            os.environ.get(
                'EXAPLUS',
                '/usr/opt/EXASuite-4/EXASolution-4.2.9/bin/Console/exaplus'),
            'conn':
            udf.opts.server
        }
        env = os.environ.copy()
        env['PATH'] = '/usr/opt/jdk1.8.0_latest/bin:' + env['PATH']
        exaplus = subprocess.Popen(cmd.split(),
                                   env=env,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        out, _err = exaplus.communicate(sql)
    if exaplus.returncode != 0:
        log.critical('EXAplus error: %d', exaplus.returncode)
        log.error(out)
    else:
        log.debug(out)
예제 #37
0
def get_ascii_char(c):
    """Return the base (first decomposed) character of *c*.

    A character with no decomposition is returned unchanged.
    Compatibility tags such as '<compat>' or '<noBreak>' are skipped, so
    e.g. 'ﬁ' maps to 'f'; the original fed the tag itself to int() and
    raised ValueError for every compatibility decomposition.
    """
    s = ud.decomposition(c)
    if s == '':  # for an indecomposable character, return it as-is
        return c
    # drop formatting tags like <compat>, <wide>, <noBreak>, ...
    codes = [tok for tok in s.split() if not tok.startswith('<')]
    if not codes:  # tag-only decomposition: nothing to map to
        return c
    return chr(int(codes[0], 16))
예제 #38
0
def filter_accents(text):
    """Return the set of decomposable (accented) characters that occur
    in the lowercased form of *text*.
    """
    lowered = text.lower()
    # decomposition() returns '' (falsy) for plain characters, so filter
    # keeps exactly the accented ones.
    return set(filter(ud.decomposition, lowered))
예제 #39
0
 def decompose(c):
     d = unicodedata.decomposition(c)
     if d and d.split(None, 1)[0] in ['<compat>', '<wide>', '<narrow>', '<noBreak>']:
         return unicodedata.normalize('NFKD', c)
     else:
         return c
예제 #40
0
def simplechar(ch):
    """Map an accented/decomposable character to its base character.

    Returns *ch* unchanged when it has no decomposition.  Compatibility
    tags ('<compat>', '<noBreak>', ...) are skipped; the original passed
    the tag to int(..., 16) and raised ValueError for characters such as
    'ﬁ'.
    """
    dec = unicodedata.decomposition(ch)
    if not dec:
        return ch
    for tok in dec.split(' '):
        if not tok.startswith('<'):
            return chr(int(tok, 16))
    return ch  # tag-only decomposition: keep the original character
예제 #41
0
print("categories ok")

# Each fixture maps an expected value to a hex code point string; eval()
# is used because the fixture stores Python literals (trusted test data).
for comb, cp in tests["combinings"].items():
    ch = chr(int(cp, 16))
    assert unicodedata.combining(ch) == int(comb)

print("combining ok")

for decimal, cp in tests["decimals"].items():
    if not decimal:
        continue
    assert unicodedata.decimal(chr(int(cp, 16))) == eval(decimal)

print("decimals ok")

for decomp, cp in tests["decompositions"].items():
    assert unicodedata.decomposition(chr(int(cp, 16))) == decomp

print("decomposition ok")

for digit, cp in tests["digits"].items():
    if not digit:
        continue
    assert unicodedata.digit(chr(int(cp, 16))) == eval(digit)

print("digits ok")

for name, cp in tests["names"].items():
    assert unicodedata.name(chr(int(cp, 16))) == name

print("names ok")

for numeric, cp in tests["numerics"].items():
예제 #42
0
"""

import sys
from unicodedata import decomposition
from string import ascii_uppercase

# NOTE(review): Python 2 script (uses file() and str.decode).  Counts the
# occurrences of each ASCII letter in the file named on the command line,
# folding accented characters onto their base letter via decomposition().
# The local name `ascii` shadows the builtin of the same name.
ocorrencias = {}  # letter -> occurrence count

for linha in file(sys.argv[1]):
    for car_uni in linha.decode('utf-8'):  # decode the line to unicode
        if not car_uni.strip():
            continue  # skip whitespace
        try:  # first try converting straight to ASCII
            car = car_uni.encode('ascii')
        except UnicodeEncodeError:  # if that fails, fall back on decomposition
            partes = decomposition(car_uni)
            if partes:  # if the character can be decomposed...
                ascii = partes.split()[
                    0]  # ...the first part is the base (ASCII) code...
                car = chr(int(ascii, 16))  # convert the hexadecimal code
            else:  # if the character cannot be decomposed...
                continue  # ...it has no ASCII counterpart

        car = car.upper()  # fold to upper case
        if car in ascii_uppercase:
            # finally, we can count the occurrence
            if car in ocorrencias:
                ocorrencias[car] += 1
            else:
                ocorrencias[car] = 1
def decompose(s):
    """Return the full decomposition of character *s* as a string.

    An indecomposable character yields '' (unchanged behavior).
    Compatibility tags like '<compat>' are skipped; the original passed
    them to int(..., 16) and raised ValueError for characters such as 'ﬁ'.
    """
    return "".join(
        chr(int(tok, 16)) for tok in unicodedata.decomposition(s).split()
        if not tok.startswith('<'))
예제 #44
0
파일: textt.py 프로젝트: yukideee/mine
    async def unicode(self, ctx, *, arg):
        """Returns the information on a Unicode character or named character.

        *arg* may be a single character, a hex code point (prefixed with
        "U+", "\\u", "\\x" or "0x"), a Unicode character name, or any
        other string (every character of it is then described in turn).
        Builds one Discord embed per character; multiple embeds are shown
        through a paging menu.
        """

        if len(arg) == 1:
            # A single character: describe it directly.
            chars = [arg]
        else:
            #if " " in arg[1:-1] or "," in arg[1:-1] or ";" in arg[1:-1]:
            #    arg = arg[:0] + arg[1:-1].replace(",", " ").replace(";", " ") + arg[-1:]

            # try to find what character is meant
            # if starts with "U+", "\x", "\u", it"s hex

            if arg.upper().startswith("U+") or arg.upper().startswith(
                    "\\U") or arg.upper().startswith("\\X"):
                arg = "0x" + arg[2:].strip()
            try:
                if arg.lower().startswith("0x"):
                    arg = arg[2:]
                chars = [chr(int(arg, 16))]
            except ValueError:
                # otherwise, use name lookup
                try:
                    chars = [unicodedata.lookup(arg)]
                except KeyError:
                    # not a known name either: describe each character of
                    # the raw string
                    chars = arg
                    #await ctx.send(error("Character not found: `{}`".format(arg)))
                    #return

        embeds = []
        n = 0  # 1-based character counter for the embed footer
        for char in chars:
            n += 1
            value = ord(char)
            name = unicodedata.name(char, None)
            #name_url = name.lower().replace(" ", "-")
            # Collect the unicodedata properties to display per character.
            dt = {}
            dt["Character"] = char
            dt["Name"] = name  # str or None
            dt["Decimal"] = unicodedata.decimal(char, None)  # int or None
            dt["Digit"] = unicodedata.digit(char, None)  # int or None
            dt["Numeric"] = unicodedata.numeric(char, None)  # float or None
            dt["Category"] = unicodedata.category(char)  # str
            dt["Bidirectional"] = unicodedata.bidirectional(char)  # str
            dt["Combining class"] = unicodedata.combining(char)  # int
            dt["East Asian width"] = unicodedata.east_asian_width(char)  # str
            dt["Mirrored"] = unicodedata.mirrored(char)  # int
            dt["Decomposition"] = unicodedata.decomposition(char)  # str

            embed = discord.Embed(
                title="Unicode codepoints of: {input}".format(input=arg),
                #url="https://emojipedia.org/{}/".format(name_url),
                description="About Unicode U+{codepoint:04X}.".format(
                    codepoint=value))

            for k, v in dt.items():
                # Skip None / empty values; wrap whitespace-only values in
                # quotes so they stay visible in the embed.
                if not v is None and len(str(v)):
                    if len(
                            str(v).strip(
                                " \t\r\n\v\f\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
                            )) == 0:
                        v = '"{}"'.format(v)
                    embed.add_field(name=k, value=str(v), inline=False)
            embed.set_footer(text="Character {index} of {count}".format(
                index=n, count=len(arg)))
            embeds.append(embed)

        if len(embeds) > 1:
            await menu(ctx, embeds, DEFAULT_CONTROLS)
        else:
            await ctx.send(embed=embeds[0])
예제 #45
0
    def _info_on_char(self, reply, c):
        """Send a one-line description of character *c* -- code point,
        glyph (for printable categories), name, category, numeric value
        and decomposition -- through the *reply* callable.
        """
        try:
            name = unicodedata.name(c)
        except ValueError:
            name = "(no name in database)"

        category = unicodedata.category(c)

        parts = [u"U+%04X" % (ord(c), )]
        # Control/format/unassigned ("C*") characters are not echoed.
        if not category.startswith("C"):
            parts.append(" (%s)" % c)
        parts.append(": %s" % name)

        category_names = {
            "Cc": "Other, Control",
            "Cf": "Other, Format",
            "Cn": "Other, Not Assigned",
            "Co": "Other, Private Use",
            "Cs": "Other, Surrogate",
            "LC": "Letter, Cased",
            "Ll": "Letter, Lowercase",
            "Lm": "Letter, Modifier",
            "Lo": "Letter, Other",
            "Lt": "Letter, Titlecase",
            "Lu": "Letter, Uppercase",
            "Mc": "Mark, Spacing Combining",
            "Me": "Mark, Enclosing",
            "Mn": "Mark, Nonspacing",
            "Nd": "Number, Decimal Digit",
            "Nl": "Number, Letter",
            "No": "Number, Other",
            "Pc": "Punctuation, Connector",
            "Pd": "Punctuation, Dash",
            "Pe": "Punctuation, Close",
            "Pf": "Punctuation, Final quote",
            "Pi": "Punctuation, Initial quote",
            "Po": "Punctuation, Other",
            "Ps": "Punctuation, Open",
            "Sc": "Symbol, Currency",
            "Sk": "Symbol, Modifier",
            "Sm": "Symbol, Math",
            "So": "Symbol, Other",
            "Zl": "Separator, Line",
            "Zp": "Separator, Paragraph",
            "Zs": "Separator, Space",
        }
        if category in category_names:
            parts.append(", category: %s" % category_names[category])
        else:
            log.err("No category found for %s" % category)

        try:
            parts.append(", numeric value %s" % unicodedata.numeric(c))
        except ValueError:
            pass  # character has no numeric value

        decomp = unicodedata.decomposition(c)
        if decomp:
            parts.append(", decomposition: " + decomp)

        reply("".join(parts))
예제 #46
0
""" Test script for the unicodedata module.
    Written by Marc-Andre Lemburg ([email protected]).
    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#"
from test_support import verify, verbose
import sha
encoding = 'utf-8'

def test_methods():
    h = sha.sha()
    for i in range(65536):
        char = unichr(i)
        data = [
            # Predicates (single char)
            char.isalnum() and u'1' or u'0',
            char.isalpha() and u'1' or u'0',
            char.isdecimal() and u'1' or u'0',
            char.isdigit() and u'1' or u'0',
            char.islower() and u'1' or u'0',
            char.isnumeric() and u'1' or u'0',
            char.isspace() and u'1' or u'0',
            char.istitle() and u'1' or u'0',
            char.isupper() and u'1' or u'0',
            # Predicates (multiple chars)
            (char + u'abc').isalnum() and u'1' or u'0',
            (char + u'abc').isalpha() and u'1' or u'0',
            (char + u'123').isdecimal() and u'1' or u'0',
            (char + u'123').isdigit() and u'1' or u'0',
            (char + u'abc').islower() and u'1' or u'0',
            (char + u'123').isnumeric() and u'1' or u'0',
예제 #47
0
def normalizeUnicode(text, encoding='humanascii'):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

    encoding: 'humanascii' (default) or 'identifier' reduce to ASCII using
    the module-level `allowed` / `allowedid` character sets and the custom
    `mapping` table; 'identifier' additionally squeezes underscore runs and
    guarantees a letter/underscore start.  Any other value is used directly
    as the target codec name.

    Bytes input is decoded as UTF-8 and the result is returned re-encoded
    as UTF-8 bytes; str input returns str.
    """
    if text == "":
        return ""

    unicodeinput = True
    if not isinstance(text, str):
        text = str(text, 'utf-8')
        unicodeinput = False

    res = ''
    global allowed, allowedid
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        else:
            try:
                ch.encode(enc, 'strict')
                if encoding == 'identifier':
                    res += '_'
                else:
                    res += ch
            except UnicodeEncodeError:
                ordinal = ord(ch)
                if ordinal in mapping:
                    # try to apply custom mappings
                    res += mapping.get(ordinal)
                elif decomposition(ch) or len(normalize('NFKD', ch)) > 1:
                    # BUG FIX: the original used str(filter(...)), which in
                    # Python 3 produces the repr "<filter object at 0x...>"
                    # rather than the filtered characters.  Join the
                    # non-combining characters of the NFKD form instead.
                    normalized = ''.join(
                        c for c in normalize('NFKD', ch)
                        if not combining(c)).strip()
                    # normalized string may contain non-letter chars too. Remove them
                    # normalized string may result to more than one char
                    if encoding == 'identifier':
                        res += ''.join(
                            [c for c in normalized if c in allowedid])
                    else:
                        res += ''.join([c for c in normalized if c in allowed])
                else:
                    # hex string instead of unknown char
                    res += "%x" % ordinal
    if encoding == 'identifier':
        res = res.strip('_').replace('_____',
                                     '_').replace('____', '_').replace(
                                         '___', '_').replace('__', '_')
        # BUG FIX: guard against an empty result (all-underscore input);
        # the original indexed [0] unconditionally and raised IndexError.
        stripped = res.strip('_')
        if not stripped or stripped[0] not in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
예제 #48
0
def handle_unicode_characters(s):
    """Handle unicode characters appearing in string. Some do actually contain
    valuable information for NLP applications, but there is also a lot of
    "unnecessary" unicode in scientific texts (at least from a Software-NER
    perspective). Each character is either kept, mapped to a canonical
    replacement, or dropped with its position recorded.

    Args:
        s (string): string to transform

    Returns:
        tuple: (normalized string,
                list of [position, char] pairs for dropped characters)
    """
    dropped_char_indices = []
    running_count = 0  # position counter advanced for kept AND dropped chars
    out_s = ''
    for char in s:
        if re.match(r"[A-Za-z0-9\s]",
                    char) is not None or char in string.punctuation:
            # keep "normal" chars
            out_s += char
            running_count += 1
        else:
            # here we will deal with unicode
            if char in ['©', '™', '®']:
                # 'TradeMarks' are tricky but often used to indicate external equipment in studies
                out_s += '™'
                running_count += 1
                continue

            if char == '°':
                # Temperatures are almost always indicated by °
                out_s += char
                running_count += 1
                continue

            # some unicodes are combined and based on 'normal' characters -> we want to keep the base characters, e.g. á -> a
            unicode_matched = False
            u_map = unicodedata.decomposition(char)
            if u_map and len(u_map) > 1:
                # drop formatting tags such as '<compat>' before parsing hex
                split_codes = [
                    code for code in u_map.split()
                    if not re.match(r'<.*>', code)
                ]
                for code in split_codes:
                    code_char = chr(int(code, 16))
                    if re.match(r'[a-zA-Z]', code_char):
                        out_s += code_char  # TODO
                        unicode_matched = True
                        running_count += 1
                        break
            if unicode_matched:
                continue

            # normalized unicode for everything else just to be safe..
            char = unicodedata.normalize('NFC', char)

            if len(char) > 1:
                print(
                    RuntimeWarning(
                        "Unkown unicode character with length > 1: {} -- ignored"
                        .format(char)))
                continue

            # we want to keep basic greek letters no matter what
            if char == 'µ':  # yes, they are actually different: this is the 'micro sign'
                char = 'μ'  # this the greek letter..
            if (ord(char) >= 945
                    and ord(char) <= 970) or (ord(char) >= 913
                                              and ord(char) <= 938):
                out_s += char
                running_count += 1
                continue

            # the rest is based on unicode categories some of which are considered important and others are not
            category = unicodedata.category(char)
            if category == 'Pi':
                # initial quotes: keep the left single quotes, map others to “
                if ord(char) == 8216 or ord(char) == 8219:
                    out_s += char
                else:
                    out_s += '“'
                running_count += 1
            elif category == 'Pf':
                # final quotes
                if ord(char) == 8217:
                    out_s += '’'
                else:
                    out_s += '”'
                running_count += 1
            elif category == 'Pd':
                # all dash variants become a plain hyphen
                char = '-'
                out_s += char
                running_count += 1
            elif category == 'Sc':
                # currency symbols are kept
                out_s += char
                running_count += 1
            elif category in ['Pe', 'Cf', 'Ps', 'So', 'Sk', 'No']:
                dropped_char_indices.append([running_count, char])
                running_count += 1
            elif category == 'Lm':
                # modifier letters in the apostrophe-like range become "'".
                # NOTE(review): other Lm characters are discarded without
                # being recorded in dropped_char_indices and without
                # advancing running_count -- confirm this is intended.
                if ord(char) >= 697 and ord(char) <= 719:
                    char = "'"
                    running_count += 1
                    out_s += char
            elif category in ['Lu', 'Ll', 'Po']:
                # keep
                out_s += char
                running_count += 1
            elif category == 'Sm':
                # Mathsymbols, TODO: handle them better?
                # BUG FIX: removed the dead store `unicode_in_sent = True`
                # (a local that was never read anywhere in the function).
                out_s += char
                running_count += 1
            else:
                #print("Encountered an unhandled unicode character: {} - DROPPED".format(char))
                dropped_char_indices.append([running_count, char])
                running_count += 1

    return out_s, dropped_char_indices
with codecs.open(filename_in, encoding='utf-8') as fin:
    with open(filename_out, 'w') as fout:
        with codecs.open(filename_err, encoding='utf-8', mode='w') as ferr:

            ascii = ''
            err = False
            ln = 1
            badnames = 0
            goodnames = 0

            line = fin.readline()

            while line:

                for c in line:
                    d = unicodedata.decomposition(c)
                    if d:

                        # Character is a unicode composition.

                        ascii = ascii + decode(unicodedata.name(c))

                    else:

                        # Character is not a unicode decomposition.

                        (replacement, handled, reason) = repl(c)

                        if handled:

                            # We want to keep this char.
예제 #50
0

def _escape(s):
    return s.encode('unicode-escape').decode('ascii')


def _idempotent_ignoring_space(profile, value):
    result1 = profile.enforce(value)
    result2 = profile.enforce(result1)
    return result1.strip() == result2.strip()


# Scan every Unicode code point and report characters for which the PRECIS
# NicknameCaseMapped:ToLower profile is not idempotent (ignoring surrounding
# whitespace).  Compatibility-tagged decompositions are only tallied by tag;
# all other offenders are printed individually.
results = Counter()
profile = precis.get_profile('NicknameCaseMapped:ToLower')

for codepoint in range(0x0110000):
    ch = chr(codepoint)
    try:
        if _idempotent_ignoring_space(profile, ch):
            continue
        tag = unicodedata.decomposition(ch).split()[0]
        if tag.startswith('<'):
            results[tag] += 1
        else:
            print(_escape(ch), unicodedata.name(ch))
    except UnicodeEncodeError:
        pass  # surrogates etc. cannot be enforced; skip them

print(results)

예제 #51
0
 def _char_translate(c):
     base = unicodedata.decomposition(c).split(" ")[0].strip('0')
     return bytes.fromhex(base).decode("utf-8")
예제 #52
0
 def getDecomposition(cls, char):
     u"""
     <doc>
     Return the decomposition of *char* as a list of characters, or None
     when the character does not decompose.

     A per-character override table replaces individual combining marks in
     the raw Unicode decomposition (mostly COMBINING COMMA BELOW vs.
     CEDILLA confusions for Latvian/Romanian letters).  Python 2 code
     (``unichr``); hex parsing is delegated to ``TX.hex2dec``.
     </doc>
     """
     charDec = ord(char)
     decompString = unicodedata.decomposition(char)
     if decompString:
         decompHex = decompString.split(' ')
         decomp = [TX.hex2dec(i) for i in decompHex]
         # Override table: {code point: {mark to replace: replacement}}.
         # BUG FIX: the original dict literal listed key 291 twice
         # ({807: 806} early on and {807: 786} later); Python keeps only
         # the last entry, so the dead first one has been removed.
         overrides = {
             290: {807: 806},  # u'Ģ': {u'̦': u'̧'}
             291: {807: 786},  # u'ģ' (gcommaaccent): comma above
             325: {807: 806},  # u'Ņ': {u'̦': u'̧'}
             311: {807: 806},  # u'ķ': {u'̦': u'̧'}
             310: {807: 806},  # u'Ķ': {u'̦': u'̧'}
             342: {807: 806},  # u'Ŗ': {u'̦': u'̧'}
             343: {807: 806},  # u'ŗ': {u'̦': u'̧'}
             536: {807: 806},  # u'Ș': {u'̦': u'̧'}
             537: {807: 806},  # u'ș': {u'̦': u'̧'}
             538: {807: 806},  # u'Ț': {u'̦': u'̧'}
             539: {807: 806},  # u'ț': {u'̦': u'̧'}
             316: {807: 806},  # u'ļ': {u'̦': u'̧'}
             315: {807: 806},  # u'Ļ': {u'̦': u'̧'}
             319: {183: 775},  # u'Ŀ': middle dot -> dot above
             320: {183: 775},  # u'ŀ': middle dot -> dot above
         }
         for x, u in enumerate(decomp):
             if charDec in overrides and u in overrides[charDec]:
                 decomp[x] = overrides[charDec][u]
         charList = []
         for d in decomp:
             # non-int entries (unparsed tags) are silently skipped
             if isinstance(d, int):
                 charList.append(unichr(d))
         return charList
     return None
예제 #53
0
파일: utils.py 프로젝트: llebleis/legi.py
def add_accentless_fallbacks(pattern):
    r"""Modifies a regexp pattern to also match accentless text.

    >>> add_accentless_fallbacks(r'Arrêté')
    'Arr[êe]t[ée]'
    >>> add_accentless_fallbacks(r'foo|bar')
    'foo|bar'
    >>> add_accentless_fallbacks(r'm[êè]me')
    'm[êèe]me'
    >>> add_accentless_fallbacks(r'm[êèe]me')
    'm[êèe]me'
    >>> add_accentless_fallbacks(r'\[Décret')
    '\\[D[ée]cret'
    >>> add_accentless_fallbacks(r'\[(?P<blé>Décret[ée])?(?(blé) à | a )(?P=blé)')
    '\\[(?P<blé>D[ée]cret[ée])?(?(blé) [àa] | a )(?P=blé)'
    >>> add_accentless_fallbacks(r'(?# commenté )')
    '(?# commenté )'
    >>> add_accentless_fallbacks(r'[\]é]')
    '[\\]ée]'
    """
    def remove_accent(c):
        # First code in the canonical decomposition is the base character.
        return chr(int(decomposition(c).split(' ', 1)[0], 16))

    r = []
    # Walk the pattern with sre_parse's tokenizer so escapes, character
    # classes and (?...) constructs are split exactly the way `re` does.
    # NOTE(review): sre_parse is a private (deprecated) stdlib module.
    source = sre_parse.Tokenizer(pattern)
    sourceget = source.get
    while True:
        this = source.next  # peek at the next token
        if this is None:
            break  # end of pattern
        sourceget()

        if this[0] == '\\':
            # escape sequence: copied through untouched
            r.append(this)
        elif this == '[':
            # character class: append the accentless variant of every
            # accented literal (unless already present) before the ']'
            elements = []
            accented = set()
            while True:
                this = sourceget()
                if this in (None, ']'):
                    break
                elements.append(this)
                if this[0] == '\\':
                    continue
                if decomposition(this):
                    accented.add(this)
            if accented:
                elements_set = set(elements)
                for c in sorted(accented):
                    accentless = remove_accent(c)
                    if accentless not in elements_set:
                        elements.append(accentless)
                        elements_set.add(accentless)
            r.append('[')
            r.extend(elements)
            if this:
                # `this` is ']' unless the class was left unterminated
                r.append(']')
        elif this == '(' and source.match('?'):
            # (?...) extension: copy group names, backreferences and
            # comments verbatim so their contents are never rewritten
            this = sourceget()
            if this is None:
                this = ''
            elif this == 'P':
                if source.next == '<':
                    # named group
                    this += source.getuntil('>') + '>'
                elif source.next == '=':
                    # named backreference
                    this += source.getuntil(')') + ')'
            elif this == '#':
                # comment
                this += source.getuntil(')') + ')'
            elif this == '(':
                # conditional backreference group
                this += source.getuntil(')') + ')'
            r.append('(?' + this)
        else:
            # bare literal: replace an accented character with a class
            # matching both the accented and the accentless form
            if decomposition(this):
                this = '[%s%s]' % (this, remove_accent(this))
            r.append(this)

    return ''.join(r)
예제 #54
0
import unicodedata
# Demo of the unicodedata query functions; expected output in comments.
print(unicodedata.bidirectional('$'))  # 'ET' (European Number Terminator)
print(unicodedata.category('$'))  # 'Sc' (Symbol, currency)
print(unicodedata.combining('7'))  # 0 (not a combining character)
print(unicodedata.decimal('1'))  # 1
print(unicodedata.decomposition('\u00fc'))  # '0075 0308' (u + diaeresis)
print(unicodedata.digit('7'))  # 7
print(unicodedata.lookup('COPYRIGHT SIGN'))  # '©'
print(unicodedata.mirrored('('))  # 1 (mirrored in bidirectional text)
print(unicodedata.name('\u00fc'))  # 'LATIN SMALL LETTER U WITH DIAERESIS'
print(len(unicodedata.normalize('NFC','resume\u0301')))  # 6: accent composed into 'é'
print(len(unicodedata.normalize('NFD','resume\u0301')))  # 7: stays decomposed
print(unicodedata.normalize('NFKD','\u2165'))  # 'VI' (ROMAN NUMERAL SIX)
print(unicodedata.numeric('\u2157'))  # 0.6 (VULGAR FRACTION THREE FIFTHS)
print(unicodedata.unidata_version)  # Unicode database version string
예제 #55
0
파일: utils.py 프로젝트: llebleis/legi.py
 def remove_accent(c):
     return chr(int(decomposition(c).split(' ', 1)[0], 16))
예제 #56
0
# NOTE(review): s1/s2 are defined earlier in the script (presumably "café"
# with a precomposed 'é' vs. 'e' + U+0301) -- not visible in this excerpt.
print(unicodedata.normalize(
    'NFD', s1))  # NFD splits composed characters: 'é' becomes 'e' + combining accent, so this prints 'cafe' + accent
print(
    unicodedata.normalize('NFD', s2)
)  # s2 already ends with 'e' and '\u0301'; NFD leaves that pair as-is, so the output looks the same: 'cafe' + accent

print("-------------------")
print(unicodedata.east_asian_width('我'))
print(unicodedata.east_asian_width('1'))
print(unicodedata.east_asian_width('a'))
print(unicodedata.east_asian_width('ﷺ'))
# F:fullwidth, H:halfwidth, W:wide, Na:narrow, A:ambiguous, N:neutral

print(unicodedata.mirrored('薛'))  # 0 -- this CJK ideograph is not a bidi-mirrored character

print(unicodedata.decomposition('ﷺ'))  # decomposable (compatibility decomposition)
print(unicodedata.decomposition('é'))  # decomposable (base letter + combining accent codes)
print(unicodedata.decomposition('e'))  # not decomposable, returns '' (prints a blank line)

# unicodedata.is_normalized(form, unistr) tests whether unistr is in normal
# form `form`; valid forms are 'NFC', 'NFKC', 'NFD' and 'NFKD'.
# Requires Python 3.8+
# print(unicodedata.is_normalized('NFC','a')) # true
# print(unicodedata.is_normalized('NFC','ﷺ')) # true
# print(unicodedata.is_normalized('NFKD','ﷺ')) # false

print(unicodedata.unidata_version)
print(unicodedata.ucd_3_2_0)  # the Unicode 3.2.0 snapshot object (prints its repr)

#print('const CATEGORY_e CHAR_CATEGORIES[] = {%s};' % ', '.join(unicodedata.category(chr(codepoint)) for codepoint in range(0x110000)))

print(u'\ua62c')  # '\ua62c' prints, but it has no defined name: unicodedata.name(u"\ua62c") would raise ValueError
예제 #57
0
    async def charinfo(self, ctx, *, data: str):
        """Shows information about one or several characters.

        'data' can either be a character, a unicode escape sequence, a unicode character name or a string.
        If 'data' is a string only a summary of each character's info will be displayed.
        """
        # Case-fold first: escape parsing and name lookup below are
        # case-insensitive anyway.
        data = data.lower()

        if data.startswith('\\u'):
            # Let's interpret the unicode escape sequence
            hex_values = data.split('\\u')[1:]
            try:
                code_points = [int(val, 16) for val in hex_values]
            except ValueError:
                raise commands.BadArgument('Invalid unicode escape sequence.')
            else:
                data = ''.join(chr(cp) for cp in code_points)
        elif len(data) > 1:
            # Maybe we've been given the character's name ?
            try:
                data = unicodedata.lookup(data)
            except KeyError:
                pass  # not a known name: treat as a plain string

        # Normalise the input
        data = unicodedata.normalize('NFC', data)
        url_fmt = '<http://unicode-table.com/en/{:X}>'

        if len(data) == 1:
            # Detailed info on the character
            entries = [('Character', data),
                       ('Name', unicodedata.name(data, 'None')),
                       ('Code point', f'{ord(data):04x}')]
            # Optional properties are only listed when meaningful.
            decomposition = unicodedata.decomposition(data)
            if decomposition != '':
                entries.append(('Decomposition', decomposition))

            combining = unicodedata.combining(data)
            if combining:
                entries.append(('Combining class', combining))

            entries.append(('Category', unicodedata.category(data)))
            bidirectional = unicodedata.bidirectional(data)
            entries.append(('Bidirectional',
                            bidirectional if bidirectional != '' else 'None'))
            entries.append(
                ('Mirrored',
                 'True' if unicodedata.mirrored(data) == 1 else 'False'))
            entries.append(
                ('East asian width', unicodedata.east_asian_width(data)))
            entries.append(('Url', url_fmt.format(ord(data))))

            # Create the message's content and send it
            content = utils.indented_entry_to_str(entries)
            await ctx.send(utils.format_block(content))
        else:
            # Minimal info for each character
            # (zero-width spaces keep backticks rendering inside code spans)
            entries = [
                f'`\N{ZERO WIDTH SPACE}{c}\N{ZERO WIDTH SPACE}` | `\\u{ord(c):04x}` | `{unicodedata.name(c, "None")}` | {url_fmt.format(ord(c))}'
                for c in data
            ]
            content = '\n'.join(entries)
            await ctx.send(content)