Python ucnorm 예제들, unicodedata.normalize(별칭 ucnorm) Python 예제들

예제 #1

0

파일 보기

def normalize(text, PY3):
    """Reduce *text* to a lowercased, accent-free, single-spaced form.

    The text is lowercased and NFKD-decomposed. Combining marks (category
    ``M``) are dropped, control/format characters (``C``) and separators
    (``Z``) become a space, and every other character -- symbols
    included -- is kept. Runs of spaces are then collapsed, the ends are
    stripped and the result is recomposed with NFKC.

    :param text: the value to normalize. When ``PY3`` is true, anything
        that is not ``str`` is decoded as UTF-8; otherwise anything that
        is not ``unicode`` is passed through ``unicode()``.
    :param PY3: true when running on Python 3.
    :returns: the normalized text.
    """
    if PY3:
        if not isinstance(text, str):
            # The decoded value has to be assigned back; otherwise the
            # bytes object reaches ucnorm() and raises TypeError.
            text = str(text, 'utf-8')
    else:
        if not isinstance(text, unicode):
            text = unicode(text)
    text = text.lower()
    filtered = []
    for char in ucnorm('NFKD', text):
        cat = category(char)
        if cat.startswith('M'):
            # marks, such as the combining diaeresis of an umlaut
            continue
        if cat.startswith(('C', 'Z')):
            # control characters, newlines, non-breaking spaces etc.
            filtered.append(' ')
        else:
            # letters, digits, punctuation and symbols are all retained
            filtered.append(char)
    text = u''.join(filtered)
    while '  ' in text:
        text = text.replace('  ', ' ')
    text = text.strip()
    return ucnorm('NFKC', text)

예제 #2

0

파일 보기

파일: normalize.py 프로젝트: pudo-attic/spon-scraper

def normalize(text):
    """Return a lowercased key made only of letters, digits and spaces.

    The text is lowercased and NFKD-decomposed. Apostrophes, combining
    marks (category ``M``) and symbols (``S``) are removed outright;
    letters (``L``) and numbers (``N``) are kept; every other character
    (control, separator, punctuation, ...) is turned into a space. Runs
    of spaces are collapsed, the ends are stripped, and the result is
    recomposed with NFKC.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    pieces = []
    for ch in ucnorm('NFKD', text.lower()):
        major = category(ch)[0]
        if ch == "'" or major in ('M', 'S'):
            # dropped without leaving a gap
            continue
        # anything that is not a letter or a number acts as a word break
        pieces.append(ch if major in ('L', 'N') else ' ')
    collapsed = u''.join(pieces)
    while '  ' in collapsed:
        collapsed = collapsed.replace('  ', ' ')
    return ucnorm('NFKC', collapsed.strip())

예제 #3

0

파일 보기

파일: text.py 프로젝트: csenger/offenesparlament

def normalize(text):
    """Produce a more canonical, simplified form of a piece of text.

    The text is lowercased and NFKD-decomposed. Combining marks (such as
    the dots of an umlaut) and symbols (such as currency signs) are
    removed, while control characters and separators (newlines,
    non-breaking spaces, ...) become plain spaces. Runs of spaces are
    collapsed to one, surrounding whitespace is stripped, and the result
    is recomposed with NFKC. Punctuation is left in place.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    kept = []
    for ch in ucnorm('NFKD', text.lower()):
        kind = category(ch)
        if kind.startswith(('M', 'S')):
            # marks and symbols vanish entirely
            continue
        if kind.startswith(('C', 'Z')):
            # control characters and separators turn into a space
            kept.append(' ')
        else:
            kept.append(ch)
    squeezed = u''.join(kept)
    while '  ' in squeezed:
        squeezed = squeezed.replace('  ', ' ')
    return ucnorm('NFKC', squeezed.strip())

예제 #4

0

파일 보기

파일: normalize.py 프로젝트: tmarthal/nomenklatura

def full_normalize(text):
    """Simplify a piece of text into a more canonical representation.

    The text is NFKD-decomposed; combining marks (umlaut dots etc.),
    symbols and punctuation are removed, and control characters and
    separators (newlines, non-breaking spaces, ...) are converted to
    spaces. Runs of spaces are collapsed to one, surrounding whitespace
    is stripped and the result is recomposed with NFKC.

    Letter case is left untouched.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    out = []
    for ch in ucnorm('NFKD', text):
        kind = category(ch)
        if kind.startswith(('M', 'S', 'P')):
            # marks, symbols and punctuation are discarded
            continue
        # control characters and separators become a single space
        out.append(' ' if kind.startswith(('C', 'Z')) else ch)
    squeezed = u''.join(out)
    while '  ' in squeezed:
        squeezed = squeezed.replace('  ', ' ')
    return ucnorm('NFKC', squeezed.strip())

예제 #5

0

파일 보기

def normalize(text, PY3):
    """Simplify a piece of text to generate a more canonical
    representation.

    The text is lowercased and NFKD-decomposed. Combining marks (such as
    umlaut dots) and symbols (such as currency signs) are removed.
    Control characters, separators (newlines, non-breaking spaces, ...)
    and the punctuation characters ``- : ( ) . ,`` are converted to
    spaces. Runs of spaces are then collapsed to a single space, the
    ends are stripped and the result is recomposed with NFKC.

    :param text: the value to normalize. When ``PY3`` is true, anything
        that is not ``str`` is decoded as UTF-8; otherwise anything that
        is not ``unicode`` is passed through ``unicode()``.
    :param PY3: true when running on Python 3.
    :returns: the normalized text.
    """
    if PY3:
        if not isinstance(text, str):
            # The decoded value has to be assigned back; otherwise the
            # bytes object reaches ucnorm() and raises TypeError.
            text = str(text, 'utf-8')
    else:
        if not isinstance(text, unicode):
            text = unicode(text)
    text = text.lower()
    filtered = []
    for char in ucnorm('NFKD', text):
        major = category(char)[0]
        if major in ('M', 'S'):
            # marks and symbols are dropped without leaving a gap
            continue
        if major in ('C', 'Z') or char in u'-:().,':
            # Punctuation is blanked here, before the collapsing step,
            # so that it cannot reintroduce runs of spaces afterwards.
            filtered.append(' ')
        else:
            filtered.append(char)
    text = u''.join(filtered)
    while '  ' in text:
        text = text.replace('  ', ' ')
    text = text.strip()
    return ucnorm('NFKC', text)

예제 #6

0

파일 보기

파일: common.py 프로젝트: KarinaBunyik/twindle

def normalize(text):
    """Return a lowercased key made only of letters, digits and spaces.

    After NFKD decomposition, apostrophes, combining marks (category
    ``M``) and symbols (``S``) are removed; letters (``L``) and numbers
    (``N``) are kept; everything else becomes a space. Runs of spaces
    are collapsed, then the text is recomposed with NFKC, stripped and
    lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    kept = []
    for ch in ucnorm('NFKD', text):
        major = category(ch)[0]
        if ch == "'" or major in ('M', 'S'):
            continue
        kept.append(ch if major in ('L', 'N') else ' ')
    # Splitting on single spaces and dropping the empty pieces collapses
    # every run of spaces; the ends are stripped again further down.
    words = [word for word in u''.join(kept).split(' ') if word]
    return ucnorm('NFKC', u' '.join(words)).strip().lower()

예제 #7

0

파일 보기

파일: common.py 프로젝트: pombredanne/twindle

def normalize(text):
    """Build a lowercased comparison key of letters, digits and spaces.

    The input is NFKD-decomposed and each character is mapped
    individually: apostrophes, combining marks and symbols map to
    nothing, letters and numbers map to themselves, and any other
    character maps to a space. Consecutive spaces are merged, and the
    NFKC-recomposed result is stripped and lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    def replacement(ch):
        # Decide what a single decomposed character contributes.
        kind = category(ch)
        if ch == "'" or kind.startswith(('M', 'S')):
            return u''
        if kind.startswith(('L', 'N')):
            return ch
        return u' '

    squashed = u''.join(replacement(ch) for ch in ucnorm('NFKD', text))
    while '  ' in squashed:
        squashed = squashed.replace('  ', ' ')
    recomposed = ucnorm('NFKC', squashed)
    return recomposed.strip().lower()

예제 #8

0

파일 보기

파일: egofaktor.py 프로젝트: sidrg/offenesparlament.de

def normalize(text):
    """Turn text into a lowercased string of letters, digits and spaces.

    Works on the NFKD decomposition of the input: apostrophes, combining
    marks ("M" categories) and symbols ("S") are skipped, letters ("L")
    and numbers ("N") are copied through, and every remaining character
    is replaced by a space. Repeated spaces are squeezed to one before
    the text is recomposed with NFKC, stripped and lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    pieces = []
    for glyph in ucnorm("NFKD", text):
        group = category(glyph)[0]
        if glyph == "'" or group in "MS":
            continue
        if group in "LN":
            pieces.append(glyph)
            continue
        pieces.append(" ")
    merged = u"".join(pieces)
    while "  " in merged:
        merged = merged.replace("  ", " ")
    merged = ucnorm("NFKC", merged)
    merged = merged.strip()
    return merged.lower()

예제 #9

0

파일 보기

def slugify(text):
    """Convert *text* into a lowercase, hyphen-separated ASCII slug.

    The text is lowercased and NFKD-decomposed. Apostrophes, combining
    marks (category ``M``) and symbols (``S``) are removed; letters
    (``L``) and numbers (``N``) are kept; every other character becomes
    a hyphen. The result is recomposed with NFKC and encoded to ASCII,
    silently dropping characters that cannot be encoded.

    Hyphen runs are collapsed and leading/trailing hyphens are removed
    on the encoded value, so that neither surrounding whitespace or
    punctuation nor dropped non-ASCII letters leave stray separators.

    :param text: the value to slugify; anything that is not ``unicode``
        is passed through ``unicode()`` first.
    :returns: the slug as an ASCII-encoded byte string.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    text = text.lower()
    filtered = []
    for char in ucnorm('NFKD', text):
        cat = category(char)
        if char == "'" or cat.startswith(('M', 'S')):
            continue
        elif cat.startswith(('L', 'N')):
            filtered.append(char)
        else:
            filtered.append('-')
    slug = ucnorm('NFKC', u''.join(filtered)).encode('ascii', 'ignore')
    # Clean up after encoding: dropping non-ASCII letters can place two
    # hyphens next to each other or expose one at either end.
    while b'--' in slug:
        slug = slug.replace(b'--', b'-')
    # Whitespace has already been turned into hyphens, so it is hyphens
    # (not spaces) that have to be trimmed from the ends.
    return slug.strip(b'-')

예제 #10

0

파일 보기

파일: normalize.py 프로젝트: pudo-attic/lobbytransparency

def reverse_normalize(text):
    """Clean up *text* and return its space-separated words in reverse
    order.

    The text is NFKD-decomposed; separators (category ``Z``: spaces,
    non-breaking spaces, line separators, ...) become plain spaces,
    symbols (``S``) and punctuation (``P``) are removed, and everything
    else is kept. Runs of spaces are collapsed, the text is recomposed
    with NFKC and stripped, and the words obtained by splitting on
    single spaces are joined back together last-to-first.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    kept = []
    for ch in ucnorm('NFKD', text):
        major = category(ch)[0]
        if major in ('S', 'P'):
            # symbols and punctuation are discarded
            continue
        kept.append(' ' if major == 'Z' else ch)
    joined = u''.join(kept)
    while '  ' in joined:
        joined = joined.replace('  ', ' ')
    words = ucnorm('NFKC', joined).strip().split(' ')
    words.reverse()
    return ' '.join(words)

예제 #11

0

파일 보기

파일: text.py 프로젝트: pudo-attic/journoid

def reverse_normalize(text):
    """Strip symbols and punctuation from *text* and reverse its word
    order.

    Each character of the NFKD decomposition is either dropped (symbol
    and punctuation categories), replaced by a space (separator
    categories) or kept as is. After squeezing repeated spaces, the text
    is recomposed with NFKC, stripped, split on single spaces, and the
    resulting tokens are re-joined in reverse order.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    def substitute(ch):
        # Map one decomposed character to its contribution.
        kind = category(ch)
        if kind.startswith('Z'):
            return u' '
        if kind.startswith(('S', 'P')):
            return u''
        return ch

    cleaned = u''.join(substitute(ch) for ch in ucnorm('NFKD', text))
    while '  ' in cleaned:
        cleaned = cleaned.replace('  ', ' ')
    tokens = ucnorm('NFKC', cleaned).strip().split(' ')
    return ' '.join(tokens[::-1])

예제 #12

0

파일 보기

파일: watershed.py 프로젝트: OpenOil-UG/edgar-oil-contracts

def normalize_text(text):
    """Return a lowercased, simplified version of *text*.

    The text is NFKD-decomposed; characters in the control ("C"),
    separator ("Z") and symbol ("S") categories are replaced by a space,
    marks ("M") and punctuation ("P") are dropped, and everything else
    is kept. The module-level ``REMOVE_SPACES`` pattern is then used to
    substitute a single space for each of its matches (presumably runs
    of whitespace -- confirm against its definition), and the result is
    stripped and lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    # http://www.fileformat.info/info/unicode/category/index.htm
    pieces = []
    for ch in ucnorm("NFKD", text):
        major = category(ch)[0]
        if major in ("M", "P"):
            continue
        pieces.append(u" " if major in ("C", "Z", "S") else ch)
    squeezed = REMOVE_SPACES.sub(" ", u"".join(pieces))
    return squeezed.strip().lower()

예제 #13

0

파일 보기

파일: watershed.py 프로젝트: wizardshowing/edgar-oil-contracts

def normalize_text(text):
    """Simplify *text* into a lowercased form without marks or
    punctuation.

    Every character of the NFKD decomposition is classified by the first
    letter of its Unicode general category: 'C', 'Z' and 'S' characters
    turn into a space, 'M' and 'P' characters are removed, and all
    others pass through. Matches of the module-level ``REMOVE_SPACES``
    pattern are replaced by a single space (presumably whitespace runs
    -- confirm against its definition) before stripping and lowercasing.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    def translate(ch):
        # http://www.fileformat.info/info/unicode/category/index.htm
        group = category(ch)[0]
        if group in 'CZS':
            return u' '
        if group in 'MP':
            return u''
        return ch

    mapped = u''.join(translate(ch) for ch in ucnorm('NFKD', text))
    mapped = REMOVE_SPACES.sub(' ', mapped)
    return mapped.strip().lower()

예제 #14

0

파일 보기

파일: text.py 프로젝트: 01-/storyweb

def normalize(text):
    """Return *text* lowercased with marks and punctuation removed.

    The NFKD decomposition of the text is filtered by Unicode general
    category: control ('C'), separator ('Z') and symbol ('S') characters
    become a space, marks ('M') and punctuation ('P') are dropped, and
    the rest is kept. The module-level ``REMOVE_SPACES`` pattern then
    replaces each of its matches with one space (presumably whitespace
    runs -- confirm against its definition). The stripped, lowercased
    text is returned in its decomposed form; no NFKC recomposition pass
    is applied.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    as_space = ('C', 'Z', 'S')
    dropped = ('M', 'P')
    out = []
    for ch in ucnorm('NFKD', text):
        major = category(ch)[0]
        if major in as_space:
            out.append(u' ')
        elif major not in dropped:
            out.append(ch)
    result = REMOVE_SPACES.sub(' ', u''.join(out))
    result = result.strip()
    return result.lower()