def normalize(text, PY3):
    """Reduce *text* to a lowercase, accent-free, single-spaced form.

    The text is lowercased, decomposed with NFKD, filtered by Unicode
    general category, whitespace-collapsed, and recomposed with NFKC.
    Symbols and punctuation are kept by this variant.

    Args:
        text: The value to normalise. Byte strings are decoded as UTF-8
            when ``PY3`` is true; any other non-text object is converted
            with ``str()`` / ``unicode()``.
        PY3: Truthy when running under Python 3 (selects ``str`` instead
            of ``unicode`` as the text type).

    Returns:
        The normalised text, with no leading or trailing whitespace.
    """
    if PY3:
        if not isinstance(text, str):
            # The converted value has to be assigned back; previously the
            # result of str(text, 'utf-8') was thrown away and the raw
            # bytes reached the Unicode routines below.
            if isinstance(text, (bytes, bytearray)):
                text = str(text, 'utf-8')
            else:
                text = str(text)
    elif not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm('NFKD', text.lower()):
        major = category(char)[0]
        if major == 'M':
            # Combining marks (e.g. the dots of an umlaut) are dropped.
            continue
        # Control (C) and separator (Z) characters become a plain space;
        # letters, digits, punctuation and symbols pass through unchanged.
        filtered.append(u' ' if major in ('C', 'Z') else char)

    # Every whitespace character has been mapped to ' ' above, so
    # split()/join collapses runs to one space and trims both ends.
    text = u' '.join(u''.join(filtered).split())
    return ucnorm('NFKC', text)
def normalize(text):
    """Reduce *text* to lowercase letters and digits separated by spaces.

    The text is lowercased and NFKD-decomposed, then each character is
    handled by its Unicode general category:

    * apostrophes, combining marks (M) and symbols (S) are removed;
    * letters (L) and numbers (N) are kept;
    * everything else (control, separator, punctuation, ...) becomes a
      space.

    Runs of spaces are collapsed, the ends are trimmed, and the result is
    recomposed with NFKC.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The normalised text.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm('NFKD', text.lower()):
        major = category(char)[0]
        if char == "'" or major in ('M', 'S'):
            # Removed outright so "don't" becomes "dont", not "don t".
            continue
        filtered.append(char if major in ('L', 'N') else u' ')

    # Only plain spaces remain as whitespace here, so split()/join
    # collapses runs and trims both ends in one pass. The previous loop
    # replaced a single space with a single space and never terminated.
    text = u' '.join(u''.join(filtered).split())
    return ucnorm('NFKC', text)
def normalize(text):
    """Simplify a piece of text to generate a more canonical representation.

    This involves lowercasing, removing symbols and diacritical marks
    (umlauts), converting control and separator characters (newlines,
    tabs, non-breaking spaces, ...) to single spaces, and stripping
    whitespace from both ends. Punctuation is kept.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The normalised text, recomposed with NFKC.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm('NFKD', text.lower()):
        major = category(char)[0]
        if major in ('M', 'S'):
            # Marks (such as umlauts) and symbols (such as currency).
            continue
        # Control and separator characters turn into a plain space.
        filtered.append(u' ' if major in ('C', 'Z') else char)

    # All whitespace is a plain space by now; split()/join collapses runs
    # and trims the ends. The earlier single-space replace loop could
    # not terminate on text containing a space.
    text = u' '.join(u''.join(filtered).split())
    return ucnorm('NFKC', text)
def full_normalize(text):
    """Strip marks, symbols and punctuation from text, keeping its case.

    The text is NFKD-decomposed; diacritical marks (umlauts), symbols and
    punctuation are removed; control and separator characters (newlines,
    tabs, non-breaking spaces, ...) become single spaces; whitespace is
    trimmed from both ends. Unlike the lowercasing normalisers, this
    function does not change letter case.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The normalised text, recomposed with NFKC.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm('NFKD', text):
        major = category(char)[0]
        if major in ('M', 'S', 'P'):
            # Marks, symbols and punctuation vanish without leaving a gap.
            continue
        # Control and separator characters turn into a plain space.
        filtered.append(u' ' if major in ('C', 'Z') else char)

    # All whitespace is a plain space by now; split()/join collapses runs
    # and trims the ends. The earlier single-space replace loop could
    # not terminate on text containing a space.
    text = u' '.join(u''.join(filtered).split())
    return ucnorm('NFKC', text)
def normalize(text, PY3):
    """Simplify a piece of text to generate a more canonical representation.

    This involves lowercasing, removing symbols and diacritical marks
    (umlauts), converting control and separator characters (newlines,
    tabs, non-breaking spaces, ...) as well as hyphens, colons,
    parentheses, periods and commas to single spaces, and stripping
    whitespace from both ends.

    Args:
        text: The value to normalise. Byte strings are decoded as UTF-8
            when ``PY3`` is true; any other non-text object is converted
            with ``str()`` / ``unicode()``.
        PY3: Truthy when running under Python 3 (selects ``str`` instead
            of ``unicode`` as the text type).

    Returns:
        The normalised text, recomposed with NFKC.
    """
    if PY3:
        if not isinstance(text, str):
            # The converted value has to be assigned back; previously the
            # result of str(text, 'utf-8') was thrown away and the raw
            # bytes reached the Unicode routines below.
            if isinstance(text, (bytes, bytearray)):
                text = str(text, 'utf-8')
            else:
                text = str(text)
    elif not isinstance(text, unicode):
        text = unicode(text)

    # Punctuation that is treated as a word separator.
    punctuation = u'-:().,'

    filtered = []
    for char in ucnorm('NFKD', text.lower()):
        major = category(char)[0]
        if major in ('M', 'S'):
            # Marks (such as umlauts) and symbols (such as currency).
            continue
        if major in ('C', 'Z') or char in punctuation:
            filtered.append(u' ')
        else:
            filtered.append(char)

    # Punctuation is blanked before collapsing so that "a - b" and "a. b"
    # end up with a single space rather than a run of them. split()/join
    # collapses runs and trims both ends in one pass.
    text = u' '.join(u''.join(filtered).split())
    return ucnorm('NFKC', text)
def normalize(text):
    """Reduce *text* to lowercase letters and digits separated by spaces.

    After NFKD decomposition, apostrophes, combining marks (M) and
    symbols (S) are removed, letters (L) and numbers (N) are kept, and
    every other character is replaced by a space. Runs of spaces are
    collapsed; the result is NFKC-recomposed, trimmed and lowercased.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The normalised, lowercased text.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm('NFKD', text):
        major = category(char)[0]
        if char == "'" or major in ('M', 'S'):
            continue
        filtered.append(char if major in ('L', 'N') else u' ')

    # Only plain spaces remain as whitespace, so split()/join collapses
    # runs in one pass. The earlier loop replaced a single space with a
    # single space and could not terminate.
    text = u' '.join(u''.join(filtered).split())
    return ucnorm('NFKC', text).strip().lower()
def normalize(text):
    """Return a lowercase, space-separated letters-and-digits form of *text*.

    The input is NFKD-decomposed and filtered by Unicode general
    category: apostrophes, combining marks ("M") and symbols ("S") are
    dropped; letters ("L") and numbers ("N") are kept; anything else is
    turned into a space. Consecutive spaces are then merged and the text
    is NFKC-recomposed, stripped and lowercased.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The normalised, lowercased text.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm("NFKD", text):
        major = category(char)[0]
        if char == "'" or major in ("M", "S"):
            continue
        filtered.append(char if major in ("L", "N") else u" ")

    # Merge runs of spaces without relying on a two-space string literal;
    # the earlier single-space replace loop never finished on text that
    # contained a space.
    text = u" ".join(u"".join(filtered).split())
    return ucnorm("NFKC", text).strip().lower()
def slugify(text):
    """Build a lowercase, hyphen-separated ASCII slug from *text*.

    The lowercased text is NFKD-decomposed. Apostrophes, combining marks
    and symbols are removed, letters and numbers are kept, and every
    other character (spaces and punctuation included) becomes a hyphen.
    Repeated hyphens are merged into one; hyphens at either end are left
    in place.

    Args:
        text: Value to slugify; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The NFKC-recomposed slug encoded as ASCII, with characters that
        cannot be encoded silently dropped.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    pieces = []
    for symbol in ucnorm('NFKD', text.lower()):
        major = category(symbol)[0]
        if symbol == "'" or major in ('M', 'S'):
            continue
        pieces.append(symbol if major in ('L', 'N') else '-')

    slug = u''.join(pieces)
    while '--' in slug:
        slug = slug.replace('--', '-')

    # strip() only removes whitespace, which has already been mapped to
    # hyphens above.
    slug = slug.strip()
    recomposed = ucnorm('NFKC', slug)
    return recomposed.encode('ascii', 'ignore')
def reverse_normalize(text):
    """Strip symbols/punctuation from *text* and reverse its word order.

    The text is NFKD-decomposed; separator characters (category Z, e.g.
    non-breaking spaces) become plain spaces, symbols (S) and punctuation
    (P) are removed, and everything else is kept, including letter case.
    The result is NFKC-recomposed, stripped, split on single spaces and
    joined back in reverse order, so ``"Smith, John"`` becomes
    ``"John Smith"``.

    Args:
        text: Value to process; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The space-separated words of the cleaned text in reverse order.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    filtered = []
    for char in ucnorm('NFKD', text):
        major = category(char)[0]
        if major in ('S', 'P'):
            # Symbols (such as currency) and punctuation are removed.
            continue
        # Separators become a plain space; everything else is kept.
        filtered.append(u' ' if major == 'Z' else char)

    text = ucnorm('NFKC', u''.join(filtered)).strip()
    # Dropping empty tokens is what collapses runs of spaces. The earlier
    # loop replaced a single space with a single space and could not
    # terminate. Only ' ' is a delimiter: control characters such as
    # tabs are kept by the filter above and stay inside their word.
    words = [word for word in text.split(u' ') if word]
    return u' '.join(reversed(words))
def normalize_text(text):
    """Return a lowercase, mark- and punctuation-free version of *text*.

    Each character of the NFKD-decomposed text is handled by the first
    letter of its Unicode general category
    (http://www.fileformat.info/info/unicode/category/index.htm):
    control, separator and symbol characters become a space; marks and
    punctuation are removed; anything else is kept.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The filtered text after ``REMOVE_SPACES`` substitution, stripped
        and lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    pieces = []
    for symbol in ucnorm("NFKD", text):
        major = category(symbol)[0]
        if major in ("M", "P"):
            continue
        pieces.append(u" " if major in ("C", "Z", "S") else symbol)

    joined = u"".join(pieces)
    # REMOVE_SPACES is defined elsewhere in the module; presumably a
    # compiled pattern matching whitespace runs -- confirm at definition.
    squeezed = REMOVE_SPACES.sub(" ", joined)
    return squeezed.strip().lower()
def normalize_text(text):
    """Lowercase *text* and strip it of marks and punctuation.

    The text is NFKD-decomposed and each character is classified by the
    major class of its Unicode general category (see
    http://www.fileformat.info/info/unicode/category/index.htm):
    ``C``/``Z``/``S`` characters are replaced by a space, ``M``/``P``
    characters are dropped, and all others pass through.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The filtered text after ``REMOVE_SPACES`` substitution, stripped
        and lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    blank_classes = ('C', 'Z', 'S')
    drop_classes = ('M', 'P')

    classified = ((category(ch)[0], ch) for ch in ucnorm('NFKD', text))
    text = u''.join(
        u' ' if major in blank_classes else ch
        for major, ch in classified
        if major not in drop_classes
    )

    # REMOVE_SPACES lives at module level outside this function;
    # presumably it matches whitespace runs -- verify at its definition.
    text = REMOVE_SPACES.sub(' ', text)
    return text.strip().lower()
def normalize(text):
    """Return *text* lowercased with marks and punctuation removed.

    Works on the NFKD decomposition of the input: control, separator and
    symbol characters are replaced by a space, combining marks and
    punctuation are deleted, and every other character is kept. The
    result is not recomposed with NFKC.

    Args:
        text: Value to normalise; non-``unicode`` input is converted with
            ``unicode()``.

    Returns:
        The filtered text after ``REMOVE_SPACES`` substitution, stripped
        and lowercased.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    def _replacement(char):
        # Map one decomposed character to what should appear in the output.
        major = category(char)[0]
        if major in ('C', 'Z', 'S'):
            return u' '
        if major in ('M', 'P'):
            return u''
        return char

    mapped = u''.join(_replacement(char) for char in ucnorm('NFKD', text))
    # REMOVE_SPACES comes from module scope; presumably a whitespace-run
    # pattern -- TODO confirm against its definition.
    mapped = REMOVE_SPACES.sub(' ', mapped)
    return mapped.strip().lower()