def test_find_something_rare():
    """A 'Zs'-only strategy starting at code point 12288 yields 'Zs' and nothing else."""
    rare_spaces = characters(whitelist_categories=['Zs'], min_codepoint=12288)

    def is_space_separator(char):
        return unicodedata.category(char) == 'Zs'

    def is_other_category(char):
        return unicodedata.category(char) != 'Zs'

    # A matching example must exist ...
    find(rare_spaces, is_space_separator)

    # ... and no example outside the whitelisted category may be found.
    with pytest.raises(NoSuchExample):
        find(rare_spaces, is_other_category)
def TokenOffsets(string: str):
    """
    Yield the offsets of all Unicode category borders in the *string*,
    including the initial 0 and the final offset value of ``len(string)``.

    Nothing is yielded for ``None`` or for an empty string.

    Capitalized words special case: a single upper case letter ('Lu')
    followed by lower case letters ('Ll') is treated as a single token.
    """
    if not string:
        return

    yield 0
    previous = category(string[0])

    for offset, char in enumerate(string[1:], start=1):
        kind = category(char)

        if kind != previous:
            # An 'Lu' -> 'Ll' change is not a border when the upper case
            # letter stands alone, i.e. is not itself preceded by 'Lu'.
            joins_capitalized_word = (
                previous == 'Lu'
                and kind == 'Ll'
                and (offset == 1 or category(string[offset - 2]) != 'Lu')
            )
            if not joins_capitalized_word:
                yield offset

        previous = kind

    yield len(string)
def crear_nombre_usuario(nombre, apellidos):
    """Build an unused username from the initials of a person's name.

    Both arguments are stripped of accents (combining marks are dropped after
    NFD normalization), lowercased and split on whitespace. The username stem
    is the first letter of every given-name word, followed by the whole first
    surname and the first letter of each further surname. When there is no
    surname at all the stem uses ``'sin'`` instead.

    A numeric suffix, starting at 1, is appended and incremented until no
    ``Gauser`` with that username exists.

    :param nombre: given name(s).
    :param apellidos: surname(s); may be empty.
    :return: the first free username of the form ``<stem><n>``.
    """
    def palabras(texto):
        # Remove accents, lowercase, and split into words.
        sin_tildes = ''.join(
            c for c in unicodedata.normalize('NFD', smart_unicode(texto))
            if unicodedata.category(c) != 'Mn')
        return sin_tildes.lower().split()

    nombre = palabras(nombre)
    apellidos = palabras(apellidos)

    iniciales_nombre = ''.join(parte[0] for parte in nombre)

    if apellidos:
        # Whole first surname plus the initial of every additional surname.
        iniciales_apellidos = apellidos[0] + ''.join(
            parte[0] for parte in apellidos[1:])
    else:
        # Users without any surname still get a usable stem.
        iniciales_apellidos = 'sin'

    usuario = iniciales_nombre + iniciales_apellidos

    # Ask the database whether the candidate is taken instead of treating
    # *any* exception from ``get()`` (connection errors,
    # MultipleObjectsReturned, ...) as "this username is free".
    n = 1
    while Gauser.objects.filter(username=usuario + str(n)).exists():
        n += 1
    return usuario + str(n)
def test_exclude_characters_of_specific_groups():
    """Blacklisting 'Lu' and 'Nd' leaves other characters but none from those two."""
    strategy = characters(blacklist_categories=('Lu', 'Nd'))

    def not_uppercase(char):
        return unicodedata.category(char) != 'Lu'

    def not_decimal_digit(char):
        return unicodedata.category(char) != 'Nd'

    def is_blacklisted(char):
        return unicodedata.category(char) in ('Lu', 'Nd')

    find(strategy, not_uppercase)
    find(strategy, not_decimal_digit)
    assert_no_examples(strategy, is_blacklisted)
def __new__(cls,s,on_fail='die',msg=None):
    """Create a validated string instance of ``cls`` from ``s``.

    ``s`` is returned untouched when it is already exactly of type ``cls``.
    Otherwise it is stripped, decoded from UTF-8 if it is not a ``str``, and
    checked against the class limits (``max_screen_width`` or ``max_len``,
    ``min_len``, ``allowed``, ``forbidden``). Any exception raised during
    validation is handed to ``cls.init_fail(e, s)`` and that call's result is
    returned. ``msg`` is not used by this method.
    """
    if type(s) == cls:
        return s

    cls.arg_chk(on_fail)

    # Both symbol lists must be real lists of single-character strings.
    for symbols in (cls.forbidden, cls.allowed):
        assert type(symbols) == list
        for symbol in symbols:
            assert type(symbol) == str and len(symbol) == 1

    try:
        s = s.strip()
        if type(s) != str:
            s = s.decode('utf8')

        # Allow: (L)etter,(N)umber,(P)unctuation,(S)ymbol,(Z)space
        # Disallow: (C)ontrol,(M)combining
        # Combining characters create width formatting issues, so disallow them for now
        rejected = { 'C':'control', 'M':'combining' }
        for ch in s:
            major = unicodedata.category(ch)[0]
            if major in rejected:
                raise ValueError('{}: {} characters not allowed'.format(ascii(ch),rejected[major]))

        me = str.__new__(cls,s)

        if cls.max_screen_width:
            # Full-width and wide East Asian characters occupy two screen columns.
            double_width = sum(1 for ch in s if unicodedata.east_asian_width(ch) in ('F','W'))
            me.screen_width = len(s) + double_width
            assert me.screen_width <= cls.max_screen_width,(
                'too wide (>{} screen width)'.format(cls.max_screen_width))
        else:
            assert len(s) <= cls.max_len, 'too long (>{} symbols)'.format(cls.max_len)

        assert len(s) >= cls.min_len, 'too short (<{} symbols)'.format(cls.min_len)
        assert not cls.allowed or set(list(s)).issubset(set(cls.allowed)),\
            'contains non-allowed symbols: {}'.format(' '.join(set(list(s)) - set(cls.allowed)))
        assert not cls.forbidden or not any(ch in s for ch in cls.forbidden),\
            "contains one of these forbidden symbols: '{}'".format("', '".join(cls.forbidden))
    except Exception as e:
        return cls.init_fail(e,s)

    return me
def tokens(source):
    """Yield the tokens of *source* from left to right.

    * a character found in ``NEWLINE_CHARS`` becomes a ``NewlineToken``;
    * a run of characters in the Unicode "C" or "Z" categories becomes a
      ``WhitespaceToken``;
    * a run of characters in the "L" or "N" categories becomes a ``WordToken``;
    * any other character, together with immediate repetitions of that same
      character, becomes a ``SymbolToken``.
    """
    def run_end(start, belongs):
        # Index just past the run that starts at *start*; the first
        # character is taken unconditionally.
        end = start + 1
        while end < len(source) and belongs(source[end]):
            end += 1
        return end

    def is_blank(char):
        return category(char)[0] in "CZ"

    def is_wordy(char):
        return category(char)[0] in "LN"

    pos = 0
    while pos < len(source):
        head = source[pos]
        major = category(head)[0]

        if head in NEWLINE_CHARS:
            yield NewlineToken(head)
            pos += 1
            continue

        if major in "CZ":
            stop = run_end(pos, is_blank)
            token = WhitespaceToken(source[pos:stop])
        elif major in "LN":
            stop = run_end(pos, is_wordy)
            token = WordToken(source[pos:stop])
        else:
            stop = run_end(pos, lambda char: char == head)
            token = SymbolToken(source[pos:stop])

        yield token
        pos = stop
def get_match_list(data, match_list, order_list=None, only_ascii=False, ignorecase=False):
    """
    Look for known search strings inside a text.

    ``match_list`` is a dictionary of "ID" / "list of search strings":
    {
        "ID1" : ["String 1", "String 2", "String 3"],
        "ID2" : ["String 4", "String 5", "String 6"]
    }

    The same search string must not appear under several IDs; an Exception is
    raised if it does. An Exception is also raised when ``order_list`` is given
    and a key of ``match_list`` is missing from it.

    Search strings are tried from longest to shortest. Once a string has
    matched, its occurrences are removed from the text used for the following
    (shorter) strings, so a string that is only part of an already matched
    longer one does not match again: with "Idioma Español" and "Español", the
    text "Pablo sabe hablar el Idioma Español" matches "Idioma Español" but
    not "Español".

    ``data`` and the search strings are byte strings decoded as UTF-8.
    ``ignorecase`` upper-cases both sides before comparing; ``only_ascii``
    removes combining marks (accents, the tilde of Ñ) from both sides.

    Returns a class object named "Mtch_list" with two attributes:
    ``key`` - the ID of the last string that matched (None without a match);
    ``index`` - position of that ID in ``order_list``; ``len(order_list)``
    when nothing matched; None when ``order_list`` was not given.
    """
    import unicodedata

    def strip_marks(text):
        # Drop combining marks left over after canonical decomposition.
        return ''.join(c for c in unicodedata.normalize('NFD', text)
                       if unicodedata.category(c) != 'Mn')

    def result(key, index):
        return type("Mtch_list", (), {"key": key, "index": index})

    # Decode the text to unicode
    data = unicode(data, "utf8")

    # Invert the dictionary into {u"String 1": "ID1", u"String 4": "ID2", ...}
    match_dict = dict()
    for key in match_list:
        if order_list and key not in order_list:
            raise Exception("key '%s' not in order_list" % key)
        for value in match_list[key]:
            # Decode *before* the duplicate check: the stored keys are
            # unicode, so looking up the raw byte string misses duplicates
            # that contain non-ASCII characters.
            decoded = unicode(value, "utf8")
            if decoded in match_dict:
                raise Exception("Duplicate word in list: '%s'" % value)
            match_dict[decoded] = key

    # With ignorecase everything is compared in upper case
    if ignorecase:
        data = data.upper()
        match_dict = dict((key.upper(), match_dict[key]) for key in match_dict)

    # With only_ascii all accents (and the tilde of Ñ) are removed
    if only_ascii:
        data = strip_marks(data)
        match_dict = dict((strip_marks(key), match_dict[key]) for key in match_dict)

    # Try the search strings from longest to shortest, removing each matched
    # string from the text seen by the later ones.
    matches = []
    remaining = data
    for match in sorted(match_dict, key=len, reverse=True):
        if match in remaining:
            matches.append(match)
            remaining = remaining.replace(match, "")

    if matches:
        key = match_dict[matches[-1]]
        return result(key, order_list.index(key) if order_list else None)
    return result(None, len(order_list) if order_list else None)
def ranking(self): """ For each result, removes stopwords, ranks the word, augments the query and returns True if successful else False """ print "Indexing results ...." for i in range(len(self.results)): result = self.results[i] title = result[0] summary = result[1] # Remove punctuation and create lists of words titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split() summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split() for tw in titleWords: if tw.lower() in self.stopWords: continue if self.user_feedback[i] == 'y': self.applyRanking(i, tw, True, True) else: self.applyRanking(i, tw, True, False) for sw in summaryWords: if sw.lower() in self.stopWords: continue if self.user_feedback[i] == 'y': self.applyRanking(i, sw, False, True) else: self.applyRanking(i, sw, False, False) print "Indexing results ...." return self.augmentQuery()
def normalize_roman(string, additional=None):
    """Remove diacritics from *string* and convert it to lowercase.

    Characters of category 'Lo' are kept as they are; every other character
    is decomposed (NFD) and its combining marks ('Mn') are dropped.

    :param string: the text to normalize.
    :param additional: optional mapping of single characters to replacement
        text. A character that is a key is replaced by its value; a character
        that only appears among the values is kept unchanged; every other
        character is normalized as described above.
    :return: the normalized text.

    >>> normalize_roman(u'Eèé') == u'eee'
    True
    """
    if additional:
        # list(...) keeps this working on Python 3, where keys() and values()
        # are views that cannot be concatenated with ``+``.
        safe = list(additional) + list(additional.values())

        def gen():
            for c in string:
                if c not in safe:
                    yield normalize_roman(c)
                elif c in additional:
                    yield additional[c]
                else:
                    yield c

        return ''.join(gen())

    chars = []
    for c in string:
        if unicodedata.category(c) == 'Lo':
            chars.append(c)
        else:
            decomposed = unicodedata.normalize('NFD', c)
            chars.extend(x for x in decomposed if unicodedata.category(x) != 'Mn')
    return ''.join(chars).lower()
def characters(self, content):
    """Handle a chunk of character data and forward it to ``self._writeHtml``.

    Nothing is written until ``self._headerProcessed`` is set. Inside a title
    the raw content is written unless ``self._ignoreTitle`` is set. Outside a
    title (and unless ``self._ignoreText`` is set) non-blank content first
    triggers the default title and a generated paragraph when none is open;
    the content is then written, with special dash handling for the first
    chunk after a definition keyword (``self._endDfn``).

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - confirm it against the original file.
    """
    # Content without surrounding whitespace; used to detect blank chunks
    # and to inspect the first visible character.
    text = content.strip()
    if self._inTitle:
        if self._headerProcessed:
            if not self._ignoreTitle:
                self._writeHtml(content)
    else :
        if self._headerProcessed:
            if not self._ignoreText:
                if len(text) > 0:
                    # Real text arrived: make sure a title and a paragraph exist.
                    if not self._glossTitleWritten and not self._inTitle:
                        self._writeDefaultTitle()
                    if not self._inParagraph and not self._inGeneratedPara and not self._inArticle and not self._lineGroupPara and not self._inTable:
                        self._startGeneratedPara()
                if self._endDfn:
                    # First chunk after a keyword: separate keyword and text
                    # with an em dash (U+2014) unless the text already brings
                    # its own dash ('Pd' = dash punctuation).
                    if self._keywordTag == 'dfn':
                        if unicodedata.category(content[0]) == 'Pd':
                            self._writeHtml(' ')
                        elif content[0] == ' ':
                            if unicodedata.category(text[0]) != 'Pd':
                                self._writeHtml(u' \u2014')
                        else:
                            self._writeHtml(u' \u2014 ')
                        self._writeHtml(content)
                    else: # 'h4' for fb2
                        # Drop a leading dash instead of adding one.
                        if unicodedata.category(text[0]) == 'Pd':
                            text = text[1:]
                        self._writeHtml(text.strip())
                    self._endDfn = False
                else:
                    self._writeHtml(content)
def splitText(text):
    """
    Split text into sub segments of size not bigger than MAX_SEGMENT_SIZE.

    Each segment is cut after the last character matching the first of
    these that finds one: punctuation, whitespace, anything that is not a
    letter or a number. Without any such character the cut is made at the
    size limit.
    """
    limit = __class__.MAX_SEGMENT_SIZE
    find_last = __class__.findLastCharIndexMatching

    # Split point candidates, most preferred first.
    preferences = (
        # punctuation
        # https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
        lambda x: unicodedata.category(x) in ("Ps", "Pe", "Pi", "Pf", "Po"),
        # whitespace
        lambda x: unicodedata.category(x).startswith("Z"),
        # anything not a letter or number
        lambda x: not (unicodedata.category(x)[0] in ("L", "N")),
    )

    segments = []
    remaining_text = __class__.cleanSpaces(text)

    while len(remaining_text) > limit:
        cur_text = remaining_text[:limit]

        for wanted in preferences:
            split_idx = find_last(cur_text, wanted)
            if split_idx is not None:
                break
        else:
            # no candidate at all: split at the last char
            split_idx = limit - 1

        cut = split_idx + 1
        segments.append(cur_text[:cut].rstrip())
        remaining_text = remaining_text[cut:].lstrip(string.whitespace + string.punctuation)

    if remaining_text:
        segments.append(remaining_text)
    return segments
# Translation table for consolidate_ampers(); built lazily on first use
# because it requires a scan of the whole Unicode range.
_AMPER_TRANSLATION = None


def _amper_translation():
    """Return the ``str.translate`` table mapping variant ampersands to "&"."""
    global _AMPER_TRANSLATION
    if _AMPER_TRANSLATION is None:
        chosen_amper_value = "\u0026"
        table = {}
        for codepoint in range(sys.maxunicode + 1):
            char = chr(codepoint)
            # Only punctuation and symbols can be ampersands. Checking the
            # category first also skips unnamed characters.
            if unicodedata.category(char)[0] not in "PS":
                continue
            name = unicodedata.name(char, "")
            if char != chosen_amper_value and re.search(
                    r" ampersand|ampersand ", name, re.IGNORECASE) is not None:
                table[codepoint] = chosen_amper_value
        _AMPER_TRANSLATION = table
    return _AMPER_TRANSLATION


def consolidate_ampers(text: str) -> str:
    """Converts all ampersands in a text to a single one (&).

    A character counts as an ampersand when it is a Unicode punctuation or
    symbol character whose name contains the word "ampersand" next to a
    space (e.g. FULLWIDTH AMPERSAND, SMALL AMPERSAND, TURNED AMPERSAND).
    The lookup table is computed once and reused by later calls.

    :param text: A string which should have ampersands converted.
    :return: The text string after all ampersands have been replaced.
    """
    return text.translate(_amper_translation())
def test_characters_of_specific_groups():
    """Whitelisting "Lu" and "Nd" yields both categories and nothing else."""
    strategy = characters(whitelist_categories=("Lu", "Nd"))

    def is_uppercase(char):
        return unicodedata.category(char) == "Lu"

    def is_decimal_digit(char):
        return unicodedata.category(char) == "Nd"

    def is_outside_whitelist(char):
        return unicodedata.category(char) not in ("Lu", "Nd")

    find_any(strategy, is_uppercase)
    find_any(strategy, is_decimal_digit)
    assert_no_examples(strategy, is_outside_whitelist)
def test_exclude_characters_of_specific_groups():
    """Blacklisting "Lu" and "Nd" still yields other characters, never those two."""
    strategy = characters(blacklist_categories=("Lu", "Nd"))

    def not_uppercase(char):
        return unicodedata.category(char) != "Lu"

    def not_decimal_digit(char):
        return unicodedata.category(char) != "Nd"

    def is_blacklisted(char):
        return unicodedata.category(char) in ("Lu", "Nd")

    find_any(strategy, not_uppercase)
    find_any(strategy, not_decimal_digit)
    assert_no_examples(strategy, is_blacklisted)
def parse(cls, string):
    """Parse a version-like *string* and return ``cls(*parts)``.

    "-" starts a new part and "." starts a new item inside the current part.
    A change of Unicode major category between two neighbouring characters
    (e.g. letter to digit) also starts a new part. An empty item followed by
    another separator becomes "0". Items that look like integers are converted
    to ``int``, and empty or zero items at the end of a part are dropped
    (always leaving at least one item). Each part is passed to ``cls`` as a
    tuple.
    """
    from unicodedata import category

    separators = ".-"
    groups = []
    previous = None

    for char in string:
        if previous is None:
            groups.append([char])
        elif char == "." or char == "-":
            if previous in separators:
                # Two separators in a row: the item between them reads as "0".
                groups[-1][-1] += "0"
            if char == ".":
                groups[-1].append("")
            else:
                groups.append([""])
        elif previous not in separators and category(char)[0] != category(previous)[0]:
            groups.append([char])
        else:
            groups[-1][-1] += char
        previous = char

    def coerce(item):
        try:
            return int(item)
        except (ValueError, TypeError):
            return item

    for group in groups:
        group[:] = [coerce(item) for item in group]
        # Trim empty / zero items from the end, keeping at least one.
        while len(group) > 1 and not group[-1]:
            del group[-1]

    return cls(*(tuple(group) for group in groups))
def is_yelling(stuff):
    """
    Tell whether *stuff* is written entirely in upper case.

    :param stuff: the text to inspect; it is converted to Unicode text first
    :return boolean True if stuff contains at least one letter and all of its
        letters are uppercased, False otherwise
    """
    try:
        to_text = unicode  # Python 2
    except NameError:
        to_text = str  # Python 3: str already is Unicode text

    # 'L*' categories are letters. A list is used because ``filter`` only
    # returns a (comparable) string on Python 2; on Python 3 it returns an
    # iterator and the emptiness check would never fire.
    letters = [c for c in to_text(stuff) if unicodedata.category(c).startswith('L')]
    if not letters:
        return False
    # 'Lu' is the uppercase letter category
    return all(unicodedata.category(c) == 'Lu' for c in letters)
def test_characters_of_specific_groups():
    """Whitelisting 'Lu' and 'Nd' yields both categories and nothing else."""
    strategy = characters(whitelist_categories=('Lu', 'Nd'))

    def is_uppercase(char):
        return unicodedata.category(char) == 'Lu'

    def is_decimal_digit(char):
        return unicodedata.category(char) == 'Nd'

    def is_outside_whitelist(char):
        return unicodedata.category(char) not in ('Lu', 'Nd')

    find(strategy, is_uppercase)
    find(strategy, is_decimal_digit)
    assert_no_examples(strategy, is_outside_whitelist)
def test_exclude_characters_of_specific_groups():
    """Blacklisting 'Lu' and 'Nd' still yields other characters, never those two."""
    strategy = characters(blacklist_categories=('Lu', 'Nd'))

    def not_uppercase(char):
        return unicodedata.category(char) != 'Lu'

    def not_decimal_digit(char):
        return unicodedata.category(char) != 'Nd'

    def is_blacklisted(char):
        return unicodedata.category(char) in ('Lu', 'Nd')

    find(strategy, not_uppercase)
    find(strategy, not_decimal_digit)

    # Searching for a blacklisted character must come up empty.
    with pytest.raises(NoSuchExample):
        find(strategy, is_blacklisted)
def test_characters_of_specific_groups():
    """Whitelisting 'Lu' and 'Nd' yields both categories and nothing else."""
    strategy = characters(whitelist_categories=('Lu', 'Nd'))

    def is_uppercase(char):
        return unicodedata.category(char) == 'Lu'

    def is_decimal_digit(char):
        return unicodedata.category(char) == 'Nd'

    def is_outside_whitelist(char):
        return unicodedata.category(char) not in ('Lu', 'Nd')

    find(strategy, is_uppercase)
    find(strategy, is_decimal_digit)

    # Searching outside the whitelist must come up empty.
    with pytest.raises(NoSuchExample):
        find(strategy, is_outside_whitelist)
def test_characters_of_specific_groups():
    """Whitelisting "Lu" and "Nd" yields both categories and nothing else."""
    strategy = characters(whitelist_categories=("Lu", "Nd"))

    def is_uppercase(char):
        return unicodedata.category(char) == "Lu"

    def is_decimal_digit(char):
        return unicodedata.category(char) == "Nd"

    def is_outside_whitelist(char):
        return unicodedata.category(char) not in ("Lu", "Nd")

    find(strategy, is_uppercase)
    find(strategy, is_decimal_digit)

    # Searching outside the whitelist must come up empty.
    with pytest.raises(NoSuchExample):
        find(strategy, is_outside_whitelist)
def combine_modifiers(self, string):
    """
    Given a string that is space-delimited on Unicode grapheme clusters,
    group Unicode modifier letters with their preceding base characters,
    deal with tie bars, etc.

    The graphemes are processed from right to left:

    * a modifier letter ('Lm', other than a stress mark) is attached to the
      grapheme on its left; when it is the very first grapheme it is
      attached to the grapheme on its right instead;
    * a stress mark (U+02C8, U+02CC) is attached to the grapheme on its right;
    * consecutive modifier symbols ('Sk', e.g. tone letters) are merged;
    * finally a segment ending in a tie bar (U+0361, U+035C) is merged with
      the segment that follows it.

    A modifier, stress mark or tie bar that has no neighbour to attach to is
    kept as a segment of its own rather than raising ``IndexError``.

    Parameters
    ----------
    string : str
        A Unicode string tokenized into grapheme clusters to be tokenized into simple IPA.

    Returns
    -------
    str
        The regrouped segments, joined by single spaces.
    """
    stress_marks = (712, 716)  # U+02C8, U+02CC
    tie_bars = (865, 860)      # U+0361, U+035C

    result = []
    graphemes = string.split()
    temp = ""  # modifier letters waiting for the base grapheme on their left

    count = len(graphemes)
    for grapheme in reversed(graphemes):
        count -= 1
        is_single = len(grapheme) == 1

        if is_single and unicodedata.category(grapheme) == "Lm" and ord(grapheme) not in stress_marks:
            temp = grapheme + temp
            # hack for the cases where a space modifier is the first character in the string
            if count == 0:
                if result:
                    result[-1] = temp + result[-1]
                else:
                    # the string consists of modifier letters only
                    result.append(temp)
            continue

        # catch and repair stress marks
        if is_single and ord(grapheme) in stress_marks:
            if result:
                result[-1] = grapheme + result[-1]
            else:
                # nothing follows the stress mark: keep it on its own
                result.append(grapheme)
            temp = ""
            continue

        # combine contour tone marks (non-accents)
        if is_single and unicodedata.category(grapheme) == "Sk":
            if not result:
                result.append(grapheme)
                temp = ""
                continue
            if unicodedata.category(result[-1][0]) == "Sk":
                result[-1] = grapheme + result[-1]
                temp = ""
                continue

        result.append(grapheme + temp)
        temp = ""

    # last check for tie bars
    segments = result[::-1]
    merged = []
    i = 0
    while i < len(segments):
        # a trailing tie bar has no following segment to join with
        if ord(segments[i][-1]) in tie_bars and i + 1 < len(segments):
            merged.append(segments[i] + segments[i + 1])
            i += 2
        else:
            merged.append(segments[i])
            i += 1

    return " ".join(merged)
def filterCharacters(s):
    """
    Strip non printable characters

    Containers are processed recursively (dictionary values only, not keys).
    Byte strings are decoded as UTF-8, filtered and encoded again.

    @type s: dict|list|tuple|bytes|string
    @param s: Object to remove non-printable characters from

    @rtype: dict|list|tuple|bytes|string
    @return: An object that corresponds with the original object, nonprintable characters removed.
             None for any other kind of object.
    """
    validCategories = frozenset((
        'Lu', 'Ll', 'Lt', 'LC', 'Lm', 'Lo', 'L',
        'Mn', 'Mc', 'Me', 'M',
        'Nd', 'Nl', 'No', 'N',
        'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'P',
        'Sm', 'Sc', 'Sk', 'So', 'S',
        'Zs', 'Zl', 'Zp', 'Z',
    ))

    # Containers: rebuild them around their filtered members.
    if isinstance(s, dict):
        return {key: filterCharacters(value) for key, value in s.items()}
    if isinstance(s, list):
        return [filterCharacters(item) for item in s]
    if isinstance(s, tuple):
        return tuple(filterCharacters(item) for item in s)

    # The names of the binary and text string types depend on the interpreter.
    if (3, 0) <= sys.version_info:
        binaryType, textType = bytes, str
    else:
        binaryType, textType = str, unicode

    wasBinary = isinstance(s, binaryType)
    if wasBinary:
        s = s.decode('utf-8')

    if not isinstance(s, textType):
        return None

    cleaned = ''.join(c for c in s if unicodedata.category(c) in validCategories)
    return cleaned.encode('utf-8') if wasBinary else cleaned
def hey(talk_str):
    """Return the reply to *talk_str*.

    Upper-case text gets 'Woah, chill out!'; text without any letter or
    number gets 'Fine. Be that way!'; other text ending in '?' gets 'Sure.';
    everything else gets 'Whatever.'.
    """
    # all letters uppercase
    if talk_str.isupper():
        return u'Woah, chill out!'

    # no letters and no numbers
    says_something = any(category(c)[0] in ('L', 'N') for c in talk_str)
    if not says_something:
        return u'Fine. Be that way!'

    if talk_str[-1] == '?':
        return u'Sure.'

    return u'Whatever.'
def strip_accents(s):
    """
    Return *s* without accents.

    A value that is not ``unicode`` is first decoded as UTF-8 (undecodable
    bytes are replaced). The text is then decomposed (NFD) and all combining
    marks ('Mn') are removed.
    """
    if not isinstance(s, unicode):
        s = s.decode("utf8", "replace")

    decomposed = unicodedata.normalize("NFD", s)
    return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
def hey(self, message): message = unicode(message.strip()) print message if message == '': return 'Fine. Be that way!' elif all([ud.category(x) == "Lu" for x in message if ud.category(x)[0] == "L"]) and any([ud.category(x)[0] == "L" for x in message]): return 'Woah, chill out!' elif message[-1] == '?': return 'Sure.' else: return 'Whatever.'
def 數字調英文中央加分字符號(self, 語句):
    """Insert 分字符號 between a digit-class character and a following
    roman-letter-class character, then collapse repeated blanks.

    A character counts as a digit when its Unicode category is in 統一碼數字類
    and as a roman letter when its category is in 統一碼羅馬字類. The joined
    text is passed through ``self.除掉重覆的空白`` before being returned.
    """
    pieces = []
    # '0' marks "no previous character yet"; as a consequence no separator
    # is inserted after a literal '0' either.
    previous = '0'

    for current in 語句:
        needs_separator = (
            previous != '0'
            and unicodedata.category(previous) in 統一碼數字類
            and unicodedata.category(current) in 統一碼羅馬字類
        )
        if needs_separator:
            pieces.append(分字符號)
        pieces.append(current)
        previous = current

    return self.除掉重覆的空白(''.join(pieces))
def subsplit(pos,len,unich,nextch):
    """Decide whether the character *unich* at index *pos* is a split point.

    *len* is the length of the enclosing text and *nextch* the following
    character (or a false value). Letters other than category 'Lo' never
    split. Punctuation splits at either end of the text and right before a
    final punctuation character; elsewhere it splits unless it belongs to
    ``_apo_set`` or ``_hyp_set``. Every other character splits.
    """
    major, minor = unicodedata.category(unich)

    if major == 'L' and minor != 'o':
        return False
    if major != 'P':
        return True

    # Punctuation at the very start or end of the text.
    if pos == 0 or pos >= len-1:
        return True
    # Punctuation directly followed by the closing punctuation character.
    if nextch and pos == len-2 and unicodedata.category(nextch)[0]=='P':
        return True

    return not (unich in _apo_set or unich in _hyp_set)
def charname(s, verbose=False):
    """Print one line per character of *s*: index, character, name, category.

    With ``verbose`` the category is shown through the 'Long' and
    'Description' entries of ``general_category_values``; otherwise the
    two-letter category code is printed. Characters that have no Unicode
    name (e.g. control characters) are listed as ``<unnamed>``.

    :param s: the string to describe; for anything that is not a ``str`` an
        error message is printed and nothing else happens.
    :param verbose: print the long category description instead of the code.
    """
    if not isinstance(s, str):
        print('Error: argument must be a str.')
        return

    for i, c in enumerate(s):
        # Without a default, unicodedata.name() raises ValueError for
        # characters that have no name.
        name = unicodedata.name(c, '<unnamed>')
        category = unicodedata.category(c)
        if verbose:
            info = general_category_values[category]
            print('%d "%s" %s (%s: %s)' % (i, c, name, info['Long'], info['Description']))
        else:
            print('%d "%s" %s (%s)' % (i, c, name, category))
def _read_number(tokens, i):
    """Return the Chinese reading of the numeric token ``tokens[i]``.

    The reading is chosen from the token itself and from its neighbours.
    NOTE(review): ``digit_to_single_chinese`` presumably reads digit by digit
    and ``digit_to_chinese`` as a quantity - confirm against their definitions.
    """
    token = tokens[i]

    # Length rule: a leading zero or more digits than there are units.
    if token[0] == "0" or len(token) > len(UNITS):
        return digit_to_single_chinese(token)

    # Pre-fix rules
    if i >= 1:
        previous = tokens[i - 1]
        if RE_LAST_END_SINGLE.match(previous):
            return digit_to_single_chinese(token)
        if previous.endswith("第") or previous.endswith("比"):
            return digit_to_chinese(token, liang=False)
        if len(previous) > 1 and previous[1] in "零一二三四五六七八九":
            return digit_to_single_chinese(token)

    # Post-fix rules
    if i + 1 < len(tokens):
        following = tokens[i + 1]
        if RE_NEXT_START_SINGLE.match(following):
            return digit_to_single_chinese(token)
        if following.startswith("年"):
            if len(token) > 4:
                # Only the last four digits are read one by one.
                return "%s %s" % (
                    digit_to_chinese(token[:-4], liang=False),
                    digit_to_single_chinese(token[-4:]))
            elif len(token) > 2:
                return digit_to_single_chinese(token)
        if following.startswith("比"):
            return digit_to_chinese(token, liang=False)
        if unicodedata.category(following[0]) == "Nd":
            return digit_to_single_chinese(token)

    # General fall-back rule
    return digit_to_chinese(token)


def main():
    """Rewrite the numbers of every "<uid> <text>" input line in Chinese.

    Lines come from ``fileinput.input()``. Each text is split with
    ``RE_NUM``; tokens starting with a decimal digit ('Nd') are replaced by
    their Chinese reading, the tokens are joined again, runs matched by
    ``RE_SPACES`` become a single space and "<uid> <text>" is written to
    standard output. Lines without a space are skipped.
    """
    for line in fileinput.input():
        try:
            uid, text = line.strip().split(" ", 1)
        except ValueError:
            # No space on the line, so there is no text to convert.
            continue

        tokens = [token.strip() for token in RE_NUM.split(text) if len(token.strip()) > 0]

        for i, token in enumerate(tokens):
            if token and unicodedata.category(token[0]) != "Nd":
                continue
            # Earlier tokens are already converted at this point; the
            # pre-fix rules look at their converted form.
            tokens[i] = _read_number(tokens, i)

        sys.stdout.write("%s %s\n" % (uid, RE_SPACES.sub(" ", "".join(tokens).strip())))
def get_all_punctuation_map() -> Dict[int, type(None)]:
    """Creates a dictionary containing all unicode punctuation and symbols.

    The result can be passed to ``str.translate`` to delete those characters.

    :return: The dictionary, with the ord() of each char mapped to None.
    """
    return {
        codepoint: None
        for codepoint in range(sys.maxunicode)
        # Major categories "P" (punctuation) and "S" (symbol).
        if unicodedata.category(chr(codepoint))[0] in ('P', 'S')
    }
import re
import sys
import unicodedata

# Pick the code-point-to-character function once, instead of raising and
# swallowing an exception for every single code point on Python 3.
try:
    _to_char = unichr  # Python 2
except NameError:
    _to_char = chr  # Python 3

# Every character whose Unicode general category is a punctuation one ("P*").
punctuation = "".join(
    char for char in map(_to_char, range(sys.maxunicode))
    if unicodedata.category(char).startswith("P")
)


class Matcher:
    """Callable that extracts the tokens of a text with a regular expression.

    The instance state used for pickling is just the pattern.
    """

    def __init__(self, word_re=r"\w+"):
        """Remember *word_re*, the pattern handed to ``re.findall``."""
        self.word_re = word_re

    def __getstate__(self):
        return self.word_re

    def __setstate__(self, word_re):
        self.word_re = word_re

    def __call__(self, text):
        """Return the non-empty matches of the pattern in *text* as a tuple."""
        tokens = re.findall(self.word_re, text)
        # Remove empty tokens
        return tuple(token for token in tokens if token)
def eliminar_acentos(frase):
    """Return *frase* without accents, keeping every lowercase 'ñ'.

    The text is decomposed (NFD) and its combining marks ('Mn') are dropped.
    The pieces between 'ñ' characters are processed separately, so no
    placeholder character is needed and a literal '#' in the text stays
    untouched.
    """
    def sin_marcas(texto):
        return ''.join(x for x in unicodedata.normalize('NFD', texto)
                       if unicodedata.category(x) != 'Mn')

    return 'ñ'.join(sin_marcas(trozo) for trozo in frase.split('ñ'))
def remove_diacritics(word: str):
    """Return *word* in NFKD form with all combining marks ('Mn') removed."""
    decomposed = unicodedata.normalize('NFKD', word)
    base_characters = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    return ''.join(base_characters)
def strip_unicode(string):
    """Apply the ``STRIP_UNICODE`` replacements, then remove combining marks.

    Every key of ``STRIP_UNICODE`` found in *string* is replaced by its value;
    the result is decomposed (NFD) and its 'Mn' characters are dropped.
    """
    for original, replacement in STRIP_UNICODE.items():
        string = string.replace(original, replacement)

    decomposed = unicodedata.normalize('NFD', string)
    kept = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    return ''.join(kept)
def removeDiacritics(string):
    """Return a compact lowercase code for *string*.

    Combining marks are removed after NFD decomposition, the text is
    lowercased, and spaces, apostrophes and hyphens are deleted.
    """
    decomposed = unicodedata.normalize('NFD', string)
    unaccented = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    lowered = unaccented.lower()
    return ''.join(c for c in lowered if c not in (' ', '\'', '-'))
def property_chars(self, prefix):
    """Return a string of every character whose category starts with *prefix*.

    All code points below ``sys.maxunicode`` are examined in ascending order.
    """
    matching = []
    for codepoint in range(sys.maxunicode):
        char = six.unichr(codepoint)
        if unicodedata.category(char).startswith(prefix):
            matching.append(char)
    return "".join(matching)
class Factory:
    """Source of arbitrary (random) values: strings, addresses, URLs, files.

    The class-level attributes are endless iterators shared by every
    instance; the ``make_*`` / ``pick_*`` methods draw from them or from the
    ``random`` module.
    """

    random_letters = map(random.choice,
                         repeat(string.ascii_letters + string.digits))

    random_letters_with_spaces = map(
        random.choice, repeat(string.ascii_letters + string.digits + " "))

    # See django.contrib.auth.forms.UserCreationForm.username.
    random_letters_for_usernames = map(random.choice,
                                       repeat(string.ascii_letters + ".@+-"))

    random_http_responses = map(random.choice,
                                repeat(tuple(http.client.responses)))

    random_octet = partial(random.randint, 0, 255)

    # iter(callable, sentinel): randint never returns None, so this is endless.
    random_octets = iter(random_octet, None)

    random_unicode_codepoint = partial(random.randint, 0, 0x10FFFF)

    random_unicode_codepoints = iter(random_unicode_codepoint, None)

    # Letters, marks, numbers, punctuation and symbols only.
    random_unicode_characters = (
        char for char in map(chr, random_unicode_codepoints)
        if unicodedata.category(char)[0] in "LMNPS")

    random_unicode_non_ascii_characters = (
        char for char in random_unicode_characters if ord(char) >= 128)

    # As above, plus the separator ("Z") categories.
    random_unicode_characters_with_spaces = (
        char for char in map(chr, random_unicode_codepoints)
        if unicodedata.category(char)[0] in "LMNPSZ")

    random_unicode_non_ascii_characters_with_spaces = (
        char for char in random_unicode_characters_with_spaces
        if char == " " or ord(char) >= 128)

    def make_string(self, size=10, spaces=False, prefix=""):
        """Return a `str` filled with random ASCII letters or digits."""
        source = (self.random_letters_with_spaces
                  if spaces else self.random_letters)
        return prefix + "".join(islice(source, size))

    def make_unicode_string(self, size=10, spaces=False, prefix=""):
        """Return a `str` filled with random Unicode characters."""
        source = (self.random_unicode_characters_with_spaces
                  if spaces else self.random_unicode_characters)
        return prefix + "".join(islice(source, size))

    def make_unicode_non_ascii_string(self, size=10, spaces=False, prefix=""):
        """Return a `str` filled with random non-ASCII Unicode characters."""
        source = (self.random_unicode_non_ascii_characters_with_spaces
                  if spaces else self.random_unicode_non_ascii_characters)
        return prefix + "".join(islice(source, size))

    def make_bytes(self, size=10):
        """Return a `bytes` filled with random data."""
        return os.urandom(size)

    def make_username(self, size=10):
        """Create an arbitrary user name (but not the actual user)."""
        return "".join(islice(self.random_letters_for_usernames, size))

    def make_email_address(self, login_size=10):
        """Generate an arbitrary email address."""
        # The format string needs a placeholder for the login part; without
        # one the "%" operator raises TypeError.
        return "%s@example.com" % self.make_string(size=login_size)

    def make_status_code(self):
        """Return an arbitrary HTTP status code."""
        return next(self.random_http_responses)

    exception_type_names = ("TestException#%d" % i for i in count(1))

    def make_exception_type(self, bases=(Exception, ), **namespace):
        """Create a new, uniquely named exception class."""
        return type(next(self.exception_type_names), bases, namespace)

    def make_exception(self, message=None, bases=(Exception, ), **namespace):
        """Instantiate a freshly created exception type, optionally with a message."""
        exc_type = self.make_exception_type(bases, **namespace)
        return exc_type() if message is None else exc_type(message)

    def make_absolute_path(self, directories=3, directory_length=10,
                           path_seperator="/"):
        """Return a random absolute path made of `directories` components."""
        return path_seperator + path_seperator.join(
            self.make_string(size=directory_length)
            for _ in range(directories))

    def pick_bool(self):
        """Return an arbitrary Boolean value (`True` or `False`)."""
        return random.choice((True, False))

    def pick_enum(self, enum, *, but_not=EMPTY_SET):
        """Pick a random item from an enumeration class.

        :param enum: An enumeration class such as `NODE_STATUS`. Can also be
            an `enum.Enum` subclass.
        :param but_not: A list of choices' IDs to exclude.
        :type but_not: Sequence.
        :return: The value of one of its items.
        """
        if issubclass(enum, Enum):
            return random.choice(
                [value for value in enum if value not in but_not])
        else:
            return random.choice([
                value for key, value in vars(enum).items()
                if not key.startswith("_") and value not in but_not
            ])

    def pick_port(self, port_min=1024, port_max=65535):
        """Return a random port number in `[port_min, port_max]`."""
        assert port_min >= 0 and port_max <= 65535
        return random.randint(port_min, port_max)

    def pick_choice(self, choices, but_not=None):
        """Pick a random item from `choices`.

        :param choices: A sequence of choices in Django form choices format:
            [
                ('choice_id_1', "Choice name 1"),
                ('choice_id_2', "Choice name 2"),
            ]
        :param but_not: A list of choices' IDs to exclude.
        :type but_not: Sequence.
        :return: The "id" portion of a random choice out of `choices`.
        """
        if but_not is None:
            but_not = ()
        return random.choice(
            [choice for choice in choices if choice[0] not in but_not])[0]

    def make_vlan_tag(self, allow_none=False, *, but_not=EMPTY_SET):
        """Create a random VLAN tag.

        :param allow_none: Whether `None` ("no VLAN") can be allowed as an
            outcome.  If `True`, `None` will be included in the possible
            results with a deliberately over-represented probability, in
            order to help trip up bugs that might only show up once in about
            4094 calls otherwise.
        :param but_not: A set of tags that should not be returned.  Any zero
            or `None` entries will be ignored.
        :raises TooManyRandomRetries: If no acceptable tag turns up in 100
            attempts.
        """
        if allow_none and self.pick_bool():
            return None
        else:
            for _ in range(100):
                vlan_tag = random.randint(1, 0xFFE)
                if vlan_tag not in but_not:
                    return vlan_tag
            raise TooManyRandomRetries("Could not find an available VLAN tag.")

    def ip_to_url_format(self, ip):
        """Return `ip` as it appears in a URL: IPv6 in brackets, IPv4 bare."""
        # We return either '[ip:v6:address]' or 'a.b.c.d' depending on the
        # family of the IP Address.
        ip_addr = IPAddress(ip)
        if ip_addr.version == 6:
            return "[%s]" % str(ip_addr)
        else:
            return "%s" % str(ip_addr)

    def make_ipv4_address(self):
        """Return a random dotted-quad IPv4 address whose first octet is not 0."""
        octets = list(islice(self.random_octets, 4))
        if octets[0] == 0:
            octets[0] = 1
        return "%d.%d.%d.%d" % tuple(octets)

    def make_ipv6_address(self):
        """Return a random IPv6 address from fc00::/7, as a string."""
        # We return from the fc00::/7 space because that's a private
        # space and shouldn't cause problems of addressing the outside
        # world.
        network = IPNetwork("fc00::/7")
        # We can't use random.choice() because there are too many
        # elements in network.
        random_address_index = random.randint(0, network.size - 1)
        return str(IPAddress(network[random_address_index]))

    def make_ip_address(self, ipv6=None):
        """Create a random ip address.

        :param ipv6: True for ipv6, False for ipv4, None for random.
        :return: an IP Address
        :rtype: string
        """
        if ipv6 is None:
            ipv6 = random.randint(0, 1)
        # intentionally allowing all "true" values, including "1".
        if ipv6:
            return self.make_ipv6_address()
        else:
            return self.make_ipv4_address()

    def make_UUID(self):
        """Return a new version-1 UUID as a string."""
        return str(uuid1())

    def make_UUID_with_timestamp(self, timestamp, clock_seq=None, node=None):
        """Return a version-1 UUID string built from the given `timestamp`.

        :param timestamp: Time in seconds; it is converted to 100 ns units
            and offset by 0x01B21DD213814000 before being split into the
            UUID time fields.
        :param clock_seq: Optional 14-bit clock sequence; random if omitted.
        :param node: Optional 48-bit node value; if omitted, a random value
            with bit 0x010000000000 set is used.
        """
        if node is None:
            node = random.getrandbits(48) | 0x010000000000
        if clock_seq is None:
            clock_seq = random.getrandbits(14)
        timestamp = int(timestamp * 1e9 / 100) + 0x01B21DD213814000
        time_low = timestamp & 0xFFFFFFFF
        time_mid = (timestamp >> 32) & 0xFFFF
        time_hi_version = (timestamp >> 48) & 0x0FFF
        clock_seq_low = clock_seq & 0xFF
        clock_seq_hi_variant = (clock_seq >> 8) & 0x3F
        fields = (
            time_low,
            time_mid,
            time_hi_version,
            clock_seq_hi_variant,
            clock_seq_low,
            node,
        )
        return str(UUID(fields=fields, version=1))

    def _make_random_network(
            self,
            slash=None,
            but_not=EMPTY_SET,
            disjoint_from=None,
            random_address_factory=None,
    ):
        """Generate a random IP network.

        :param slash: Netmask or bit width of the network, e.g. 24 or
            '255.255.255.0' for what used to be known as a class-C network.
        :param but_not: Optional iterable of `IPNetwork` objects whose values
            should not be returned.  Use this when you need a different
            network from any returned previously.  The new network may
            overlap any of these, but it won't be identical.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :param random_address_factory: A callable that returns a random IP
            address. If not provided, will default to
            Factory.make_ipv4_address().
        :return: A network spanning at least 8 IP addresses (at most 29 bits).
        :rtype: :class:`IPNetwork`
        :raises TooManyRandomRetries: If no acceptable network turns up in
            100 attempts.
        """
        but_not = frozenset(but_not)
        if disjoint_from is None:
            disjoint_from = []
        if slash is None:
            slash = random.randint(16, 29)
        if random_address_factory is None:
            random_address_factory = self.make_ipv4_address
        # Look randomly for a network that matches our criteria.
        for _ in range(100):
            network = IPNetwork("%s/%s" %
                                (random_address_factory(), slash)).cidr
            forbidden = network in but_not
            clashes = network_clashes(network, disjoint_from)
            if not forbidden and not clashes:
                return network
        raise TooManyRandomRetries("Could not find available network")

    def make_ipv4_network(self, slash=None, *, but_not=EMPTY_SET,
                          disjoint_from=None):
        """Generate a random IPv4 network.

        :param slash: Netmask or bit width of the network, e.g. 24 or
            '255.255.255.0' for what used to be known as a class-C network.
        :param but_not: Optional iterable of `IPNetwork` objects whose values
            should not be returned.  Use this when you need a different
            network from any returned previously.  The new network may
            overlap any of these, but it won't be identical.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :return: A network spanning at least 16 IP addresses (at most 28
            bits).
        :rtype: :class:`IPNetwork`
        """
        if slash is None:
            slash = random.randint(16, 28)
        return self._make_random_network(
            slash=slash,
            but_not=but_not,
            disjoint_from=disjoint_from,
            random_address_factory=self.make_ipv4_address,
        )

    def make_ipv6_network(self, slash=None, *, but_not=EMPTY_SET,
                          disjoint_from=None):
        """Generate a random IPv6 network.

        :param slash: Netmask or bit width of the network. If not specified,
            will default to a bit width of between 112 (65536 addresses) and
            125 (8 addresses);
        :param but_not: Optional iterable of `IPNetwork` objects whose values
            should not be returned.  Use this when you need a different
            network from any returned previously.  The new network may
            overlap any of these, but it won't be identical.
        :param disjoint_from: Optional iterable of `IPNetwork` objects whose
            IP ranges the new network must not overlap.
        :return: A network spanning at least 8 IP addresses.
        :rtype: :class:`IPNetwork`
        """
        if slash is None:
            slash = random.randint(112, 125)
        return self._make_random_network(
            slash=slash,
            but_not=but_not,
            disjoint_from=disjoint_from,
            random_address_factory=self.make_ipv6_address,
        )

    def make_ip4_or_6_network(self, version=None, host_bits=None):
        """Generate a random IPv4 or IPv6 network.

        :param version: 4 or 6; chosen at random when None.
        :param host_bits: Optional number of host bits; the prefix length is
            then 32 (IPv4) or 128 (IPv6) minus this value.
        """
        slash = None
        if version is None:
            version = random.choice([4, 6])
        if version == 4:
            if host_bits is not None:
                slash = 32 - host_bits
            return self.make_ipv4_network(slash=slash)
        else:
            if host_bits is not None:
                slash = 128 - host_bits
            return self.make_ipv6_network(slash=slash)

    def pick_ip_in_dynamic_range(self, ngi, *, but_not=EMPTY_SET):
        """Return a random address (as a string) from `ngi`'s dynamic range.

        :param but_not: Addresses that must not be returned; `None` entries
            are ignored.
        :raises TooManyRandomRetries: If no acceptable address turns up in
            100 attempts.
        """
        first = ngi.get_dynamic_ip_range().first
        last = ngi.get_dynamic_ip_range().last
        but_not = {IPAddress(but) for but in but_not if but is not None}
        for _ in range(100):
            address = IPAddress(random.randint(first, last))
            if address not in but_not:
                return str(address)
        # The message used to say "static range" (copied from the sibling
        # method below).
        raise TooManyRandomRetries(
            "Could not find available IP in dynamic range")

    def pick_ip_in_static_range(self, ngi, *, but_not=EMPTY_SET):
        """Return a random address (as a string) from `ngi`'s static range.

        :param but_not: Addresses that must not be returned; `None` entries
            are ignored.
        :raises TooManyRandomRetries: If no acceptable address turns up in
            100 attempts.
        """
        first = ngi.get_static_ip_range().first
        last = ngi.get_static_ip_range().last
        but_not = {IPAddress(but) for but in but_not if but is not None}
        for _ in range(100):
            address = IPAddress(random.randint(first, last))
            if address not in but_not:
                return str(address)
        raise TooManyRandomRetries(
            "Could not find available IP in static range")

    def pick_ip_in_network(self, network, *, but_not=EMPTY_SET):
        """Return a random usable address (as a string) from `network`.

        :param but_not: Addresses that must not be returned; `None` entries
            and addresses outside `network` are ignored.
        :raises ValueError: If `but_not` covers every candidate address.
        :raises TooManyRandomRetries: If no acceptable address turns up in
            100 attempts.
        """
        but_not = {
            IPAddress(but)
            for but in but_not
            if but is not None and IPAddress(but) in network
        }
        # Unless the prefix length is very small, make sure we don't select
        # a normally-unusable IP address.
        if network.version == 6 and network.prefixlen < 127:
            # Don't pick the all-zeroes address, since it has special meaning
            # in IPv6 as the subnet-router anycast address. IPv6 does not have
            # a broadcast address, though.
            first, last = network.first + 1, network.last
            network_size = network.size - 1
        elif network.prefixlen < 31:
            # Don't pick broadcast or network addresses.
            first, last = network.first + 1, network.last - 1
            network_size = network.size - 2
        else:
            first, last = network.first, network.last
            network_size = network.size
        if len(but_not) == network_size:
            raise ValueError(
                "No IP addresses available in network: %s (but_not=%r)" %
                (network, but_not))
        for _ in range(100):
            address = IPAddress(random.randint(first, last))
            if address not in but_not:
                return str(address)
        raise TooManyRandomRetries(
            "Could not find available IP in network: %s (but_not=%r)" %
            (network, but_not))

    def make_ip_range(self, network):
        """Return a pair of IP addresses from the given network.

        :param network: Return IP addresses within this network.
        :return: A pair of `IPAddress`, the first strictly lower than the
            second.
        :raises TooManyRandomRetries: If two distinct addresses are not
            found in 100 attempts.
        """
        for _ in range(100):
            # Use this instance rather than a module-level singleton so the
            # method also works on subclasses and other instances.
            ip_range = tuple(
                sorted(
                    IPAddress(self.pick_ip_in_network(network))
                    for _ in range(2)))
            if ip_range[0] < ip_range[1]:
                return ip_range
        raise TooManyRandomRetries(
            "Could not find available IP range in network: %s" % network)

    def make_ipv4_range(self, network=None):
        """Return a pair of IPv4 addresses.

        :param network: Return IP addresses within this network; a random
            IPv4 network is generated when omitted.
        :return: A pair of `IPAddress`.
        """
        if network is None:
            network = self.make_ipv4_network()
        return self.make_ip_range(network=network)

    def make_ipv6_range(self, network=None):
        """Return a pair of IPv6 addresses.

        :param network: Return IP addresses within this network; a random
            IPv6 network is generated when omitted.
        :return: A pair of `IPAddress`.
        """
        if network is None:
            network = self.make_ipv6_network()
        return self.make_ip_range(network=network)

    def make_mac_address(self, delimiter=":"):
        """Return six random octets as two-digit hex, joined by `delimiter`."""
        assert isinstance(delimiter, str)
        octets = islice(self.random_octets, 6)
        return delimiter.join(format(octet, "02x") for octet in octets)

    def make_random_leases(self, num_leases=1):
        """Create a dict of arbitrary ip-to-mac address mappings."""
        # This could be a dict comprehension, but the current loop
        # guards against shortfalls as random IP addresses collide.
        leases = {}
        while len(leases) < num_leases:
            leases[self.make_ipv4_address()] = self.make_mac_address()
        return leases

    def make_date(self, year=2017):
        """Return a random `datetime` within the given `year` (local time)."""
        # time.mktime() returns floats, which random.randrange() rejects on
        # recent Python versions, so convert the bounds to whole seconds.
        start = int(time.mktime(datetime.datetime(year, 1, 1).timetuple()))
        end = int(time.mktime(datetime.datetime(year + 1, 1, 1).timetuple()))
        stamp = random.randrange(start, end)
        return datetime.datetime.fromtimestamp(stamp)

    def make_timedelta(self):
        """Return a random `timedelta` of up to roughly three years."""
        return datetime.timedelta(
            days=random.randint(0, 3 * 365),
            seconds=random.randint(0, 24 * 60 * 60 - 1),
            microseconds=random.randint(0, 999999),
        )

    def make_file(self, location, name=None, contents=None):
        """Create a file, and write data to it.

        Prefer the eponymous convenience wrapper in
        :class:`maastesting.testcase.MAASTestCase`.  It creates a temporary
        directory and arranges for its eventual cleanup.

        :param location: Directory.  Use a temporary directory for this, and
            make sure it gets cleaned up after the test!
        :param name: Optional name for the file.  If none is given, one will
            be made up.
        :param contents: Optional contents for the file.  If omitted, some
            arbitrary ASCII text will be written. If Unicode content is
            provided, it will be encoded with UTF-8.
        :type contents: unicode, but containing only ASCII characters.
        :return: Path to the file.
        """
        if name is None:
            name = self.make_string()
        if contents is None:
            contents = self.make_string().encode("ascii")
        if isinstance(contents, str):
            contents = contents.encode("utf-8")
        path = os.path.join(location, name)
        with open(path, "wb") as f:
            f.write(contents)
        return path

    def make_name(self, prefix=None, sep="-", size=6):
        """Generate a random name.

        :param prefix: Optional prefix.  Pass one to help make test failures
            and tracebacks easier to read!  If you don't, you might as well
            use `make_string`.
        :param sep: Separator that will go between the prefix and the random
            portion of the name.  Defaults to a dash.
        :param size: Length of the random portion of the name.  Don't get
            hung up on this; you may need more if uniqueness is really
            important or less if it doesn't but legibility does, but
            generally, use the default.
        :return: A randomized unicode string.
        """
        if prefix is None:
            return self.make_string(size=size)
        else:
            return prefix + sep + self.make_string(size=size)

    def make_hostname(self, prefix="host", *args, **kwargs):
        """Generate a random hostname.

        The returned hostname is lowercase because python's urlparse
        implicitly lowercases the hostnames.
        """
        # Pass the prefix positionally: combining prefix=... with *args made
        # any extra positional argument collide with `prefix`.
        return self.make_name(prefix, *args, **kwargs).lower()

    # Always select from a scheme that allows parameters in the URL so
    # that we can round-trip a URL with params successfully (otherwise
    # the params don't get parsed out of the path).
    _make_parsed_url_schemes = tuple(scheme
                                     for scheme in urllib.parse.uses_params
                                     if scheme != "")

    def make_parsed_url(
            self,
            scheme=None,
            netloc=None,
            path=None,
            port=None,
            params=None,
            query=None,
            fragment=None,
    ):
        """Generate a random parsed URL object.

        Contains randomly generated values for all parts of a URL: scheme,
        location, path, parameters, query, and fragment. However, each part
        can be overridden individually.

        If port=None or port=True, pick_port() will be used to select a
        random port, while port=False will create a netloc for the URL that
        does not specify a port. To specify a port in netloc, port parameter
        must be False.

        :return: Instance of :py:class:`urlparse.ParseResult`.
        :raises AssertionError: If a port is requested but `netloc` already
            contains one.
        """
        if port is not False and netloc is not None and netloc.count(
                ":") == 1:
            raise AssertionError(
                "A port number has been requested, however the given netloc "
                "spec %r already contains a port number." % (netloc, ))
        if scheme is None:
            # Select a scheme that allows parameters; see above.
            scheme = random.choice(self._make_parsed_url_schemes)
        if port is None or port is True:
            port = self.pick_port()
        if netloc is None:
            netloc = "%s.example.com" % self.make_name("netloc").lower()
            # bool is a subclass of int, so exclude port=False explicitly.
            if isinstance(port, int) and not isinstance(port, bool):
                netloc += ":%d" % port
        if path is None:
            # A leading forward-slash will be added in geturl() if we
            # don't, so ensure it's here now so tests can compare URLs
            # without worrying about it.
            path = self.make_name("/path")
        else:
            # Same here with the forward-slash prefix.
            if not path.startswith("/"):
                path = "/" + path
        if params is None:
            params = self.make_name("params")
        if query is None:
            query = self.make_name("query")
        if fragment is None:
            fragment = self.make_name("fragment")
        return urllib.parse.ParseResult(scheme, netloc, path, params, query,
                                        fragment)

    def make_url(
            self,
            scheme=None,
            netloc=None,
            path=None,
            params=None,
            query=None,
            fragment=None,
    ):
        """Generate a random URL.

        Contains randomly generated values for all parts of a URL: scheme,
        location, path, parameters, query, and fragment. However, each part
        can be overridden individually.

        :return: string
        """
        # Keyword arguments are required here: make_parsed_url() has a
        # `port` parameter between `path` and `params`, so a positional call
        # shifted params/query/fragment into port/params/query.
        return self.make_parsed_url(
            scheme=scheme,
            netloc=netloc,
            path=path,
            params=params,
            query=query,
            fragment=fragment,
        ).geturl()

    def make_simple_http_url(self, netloc=None, path=None, port=None):
        """Create an arbitrary HTTP URL with only a location and path."""
        return self.make_parsed_url(
            scheme="http",
            netloc=netloc,
            path=path,
            port=port,
            params="",
            query="",
            fragment="",
        ).geturl()

    def make_names(self, *prefixes):
        """Generate random names.

        Yields a name for each prefix specified.

        :param prefixes: Zero or more prefixes.  See `make_name`.
        """
        for prefix in prefixes:
            yield self.make_name(prefix)

    def make_tarball(self, location, contents):
        """Create a tarball containing the given files.

        :param location: Path to a directory where the tarball can be stored.
        :param contents: A dict mapping file names to file contents.  Where
            the value is `None`, the file will contain arbitrary data.
        :return: Path to a gzip-compressed tarball.
        """
        tarball = os.path.join(location, "%s.tar.gz" % self.make_name())
        with TempDirectory() as working_dir:
            source = working_dir.path
            for name, content in contents.items():
                self.make_file(source, name, content)

            subprocess.check_call(["tar", "-C", source, "-czf", tarball, "."])

        return tarball

    def make_response(self, status_code, content, content_type=None):
        """Return a similar response to that which `urllib` returns."""
        headers = http.client.HTTPMessage()
        if content_type is not None:
            headers.set_type(content_type)
        return urllib.request.addinfourl(fp=io.BytesIO(content),
                                         headers=headers,
                                         url=None,
                                         code=status_code)

    def make_streams(self, stdin=None, stdout=None, stderr=None):
        """Make a fake return value for a SSHClient.exec_command."""
        # stdout.read() is called so stdout can't be None.
        if stdout is None:
            stdout = mock.Mock()

        return (stdin, stdout, stderr)

    def make_CalledProcessError(self):
        """Make a fake :py:class:`subprocess.CalledProcessError`."""
        return subprocess.CalledProcessError(
            returncode=random.randint(1, 10),
            cmd=[self.make_name("command")],
            output=self.make_bytes(),
        )

    def make_kernel_string(self, can_be_release_or_version=False,
                           generic_only=False):
        """Return a random ``hwe-*`` kernel string for an Ubuntu release.

        :param can_be_release_or_version: Also allow the bare series name or
            version string as a result.
        :param generic_only: Leave out the ``lowlatency`` variants.
        """
        ubuntu = UbuntuDistroInfo()
        # Only select from MAAS supported releases so we don't have to deal
        # with versions name overlap(e.g Warty and Wily).
        try:
            ubuntu_rows = ubuntu._rows
        except AttributeError:
            # No `_rows` attribute on this object: build the row dicts from
            # `_releases` instead.
            ubuntu_rows = [row.__dict__ for row in ubuntu._releases]
        supported_releases = [
            release for release in ubuntu_rows
            if int(release["version"].split(".")[0]) >= 12
        ]
        release = random.choice(supported_releases)
        # Remove 'LTS' from version if it exists
        version_str = release["version"].split(" ")[0]
        strings = [
            "hwe-%s" % release["series"][0],
            "hwe-%s" % version_str,
            "hwe-%s-edge" % version_str,
        ]
        if not generic_only:
            strings += [
                "hwe-%s-lowlatency" % version_str,
                "hwe-%s-lowlatency-edge" % version_str,
            ]
        if can_be_release_or_version:
            strings += [release["series"], version_str]
        return random.choice(strings)

    def make_dhcp_packet(
            self,
            transaction_id: bytes = None,
            truncated: bool = False,
            truncated_option_value: bool = False,
            bad_cookie: bool = False,
            truncated_option_length: bool = False,
            include_server_identifier: bool = False,
            server_ip: str = "127.1.1.1",
            include_end_option: bool = True,
    ) -> bytes:
        """Returns a [possibly invalid] DHCP packet.

        :param transaction_id: Four bytes to use as the transaction ID;
            random when omitted.
        :param truncated: Cut the finished packet down to 200 bytes.
        :param truncated_option_value: Append a Server Identifier option
            whose value is too short, and omit the end option.
        :param bad_cookie: Use ``b"xxxx"`` instead of the real cookie.
        :param truncated_option_length: Append an option code with no length
            byte, and omit the end option.
        :param include_server_identifier: Add a well-formed Server Identifier
            option carrying `server_ip`.
        :param server_ip: Address used for the Server Identifier option.
        :param include_end_option: Terminate the options with ``0xff``.
        """
        if transaction_id is None:
            transaction_id = self.make_bytes(size=4)
        options = b""
        if include_server_identifier:
            # 0x36 == 54 (Server Identifier option)
            ip_bytes = int(IPAddress(server_ip).value).to_bytes(4, "big")
            options += b"\x36\x04" + ip_bytes
        if truncated_option_value:
            options += b"\x36\x04\x7f\x01"
            include_end_option = False
        if truncated_option_length:
            options += b"\x36"
            include_end_option = False
        # Currently, we only validate the transaction ID, and the fact that
        # the reply packet has a "Server Identifier" option. This might be
        # considered a bug, but in practice it works out.
        packet = (
            # Message type: 0x02 (BOOTP operation: reply).
            b"\x02"
            # Hardware type: Ethernet
            b"\x01"
            # Hardware address length: 6
            b"\x06"
            # Hops: 0
            b"\x00" +
            # Transaction ID
            transaction_id +
            # Seconds
            b"\x00\x00"
            # Flags
            b"\x00\x00"
            # Client IP address: 0.0.0.0
            b"\x00\x00\x00\x00"
            # Your (client) IP address: 0.0.0.0
            b"\x00\x00\x00\x00"
            # Next server IP address: 0.0.0.0
            b"\x00\x00\x00\x00"
            # Relay agent IP address: 0.0.0.0
            b"\x00\x00\x00\x00" +
            # Client hardware address
            b"\x01\x02\x03\x04\x05\x06"
            # Hardware address padding
            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" +
            # Server host name
            (b"\x00" * 67) +
            # Boot filename
            (b"\x00" * 125) +
            # Cookie
            (b"\x63\x82\x53\x63" if not bad_cookie else b"xxxx") +
            # "DHCP Offer" option
            b"\x35\x01\x02" + options +
            # End options.
            (b"\xff" if include_end_option else b""))
        if truncated:
            packet = packet[:200]
        return packet
def unicode_to_ascii(s):
    """Return *s* NFD-decomposed with every nonspacing mark (``Mn``) dropped."""
    pieces = []
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.category(char) == 'Mn':
            continue
        pieces.append(char)
    return ''.join(pieces)
"\U000E01E5", "\U000E01E6", "\U000E01E7", "\U000E01E8", "\U000E01E9", "\U000E01EA", "\U000E01EB", "\U000E01EC", "\U000E01ED", "\U000E01EE", "\U000E01EF", ] MARK_SET = { chr(c) for c in range(sys.maxunicode + 1) if unicodedata.category(chr(c))[0] == "M" } print("len(UNICODE_NSM) = {}".format(len(UNICODE_NSM))) print("len(MARK_SET) = {}".format(len(MARK_SET))) filepath = "UnicodeData.txt" with open(filepath) as f: text = f.read() text = text[:10000] def main(): ground_truth = loop_count(text) functions = [ # (loop_count, 'loop_count'), # (generator_count, 'generator_count'),
def category_count(text):
    """Count the characters of *text* that are not nonspacing marks (``Mn``)."""
    total = 0
    for char in text:
        if unicodedata.category(char) != "Mn":
            total += 1
    return total
import cloudscraper
from bs4 import BeautifulSoup
from bs4.element import Comment, Tag
from requests import Response, Session

from ..assets.user_agents import user_agents
from ..utils.ssl_no_verify import no_ssl_verification
from .exeptions import LNException

logger = logging.getLogger(__name__)

LINE_SEP = '<br>'

# Code points whose Unicode general category is Cf or Cc.
INVISIBLE_CHARS = [c for c in range(sys.maxunicode)
                   if unicodedata.category(chr(c)) in {'Cf', 'Cc'}]
# One-shot iterator: it is consumed by the comprehension on the next line,
# so NONPRINTABLE yields nothing afterwards.
NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7f, 0xa0), INVISIBLE_CHARS)
# Maps each of those code points to None (presumably meant for
# str.translate - confirm at the call sites).
NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}

MAX_CONCURRENT_REQUEST_PER_DOMAIN = 15
# Semaphores created so far, keyed by host.
REQUEST_SEMAPHORES: Dict[str, Semaphore] = {}


def get_domain_semaphore(url):
    '''Return the semaphore for the host of *url*, creating it on first use.

    The key is the hostname parsed from *url*; when that is empty or None
    the whole *url* is used instead. A new semaphore is created with
    MAX_CONCURRENT_REQUEST_PER_DOMAIN as its initial value.
    '''
    host = urlparse(url).hostname or url
    if host not in REQUEST_SEMAPHORES:
        REQUEST_SEMAPHORES[host] = Semaphore(MAX_CONCURRENT_REQUEST_PER_DOMAIN)
    return REQUEST_SEMAPHORES[host]


class Crawler(ABC):
    '''Blueprint for creating new crawlers'''
def remove_accents(self, data):
    """Return only the letters of *data* after NFKD decomposition.

    Every character whose category does not start with ``L`` is dropped;
    besides combining accents that also removes digits, spaces and
    punctuation.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    letters = [char for char in decomposed
               if unicodedata.category(char).startswith('L')]
    return ''.join(letters)
def strip_accents(s):
    """Return *s* in NFD form without nonspacing marks (category ``Mn``)."""
    decomposed = unicodedata.normalize('NFD', s)

    def is_kept(char):
        return unicodedata.category(char) != 'Mn'

    return ''.join(filter(is_kept, decomposed))
def normalize_text(text,nlp):
    """Return the lowercased content words of *text*, joined by single spaces.

    *text* is lowercased and tokenized with ``nlp.tokenizer``.  A token is
    kept when it is not a stop word, is alphabetic, is not a digit and is
    longer than one character.  Non-ASCII tokens additionally lose their
    combining accents (NFD decomposition, ``Mn`` removed).  When nothing is
    kept the literal string "emptystring" is returned.
    """
    kept = []
    for tok in nlp.tokenizer(text.lower()):
        if tok.is_stop:
            continue
        if not tok.is_alpha or tok.is_digit or len(tok.text) == 1:
            continue
        if tok.is_ascii:
            kept.append(tok.text)
            continue
        decomposed = unicodedata.normalize('NFD', tok.text.lower())
        kept.append(''.join(char for char in decomposed
                            if unicodedata.category(char) != 'Mn'))
    if kept:
        return ' '.join(kept)
    return "emptystring"
def remove_control_characters(s):
    """Return *s* without characters whose category starts with ``C``.

    A falsy *s* (empty string or None) yields the empty string.
    """
    if s:
        return "".join(
            filter(lambda ch: not unicodedata.category(ch).startswith("C"), s))
    return ""
import unicodedata
import sys

# NOTE(review): `s` and `remap` are used below but not defined in this
# fragment - presumably they are set up earlier in the script; confirm.

# Translation table that deletes every combining character: each code point
# with a non-zero canonical combining class maps to None.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))

b = unicodedata.normalize('NFD', s)
print(b)
print(b.translate(cmb_chrs))

# Maps all unicode decimal digit characters to their equivalent in ASCII.
# The general category of decimal digits is spelled 'Nd'; comparing with
# 'ND' never matched and left this table empty.
digitmap = {
    c: ord('0') + unicodedata.digit(chr(c))
    for c in range(sys.maxunicode)
    if unicodedata.category(chr(c)) == 'Nd'
}
print(len(digitmap))

# Arabic digits
x = '\u0661\u0662\u0663'
print(x.translate(digitmap))

print(s)
b = unicodedata.normalize('NFD', s)
# NOTE(review): the result of this expression is discarded, so the following
# print still shows the decomposed text - confirm whether it should be
# assigned back to `b`.
b.encode('ascii', 'ignore').decode('ascii')
print(b)
print(b.translate(remap))
# coding: utf-8 from __future__ import print_function from __future__ import unicode_literals from unidecode import unidecode import unicodedata hun_characters = {bytearray([i]).decode('iso-8859-2') for i in range(256)} hun_punctuation = { c for c in hun_characters if unicodedata.category(c).startswith('P') } translate_remove_hun_punctuation = {ord(c): None for c in hun_punctuation} translate_space_for_hun_punctuation = {ord(c): ' ' for c in hun_punctuation} def lower(words): return tuple(w.lower() for w in words) def remove_accents(words): return tuple(''.__class__(unidecode(w)) for w in words) def lower_without_accents(words): return lower(remove_accents(words)) def remove_punctuations(words):