from hypothesis.internal.charmap import as_general_categories, categories
from hypothesis.internal.compat import int_to_byte

# True when the running interpreter is Python 3.6 or newer.
# NOTE(review): the name suggests this gates handling of flags attached to
# parsed subpatterns -- confirm against the code that reads it.
HAS_SUBPATTERN_FLAGS = sys.version_info[:2] >= (3, 6)

# Whatever `categories()` yields, collected into a set.
UNICODE_CATEGORIES = set(categories())

# The six ASCII whitespace characters, and the same set extended with five
# further control characters.
# NOTE(review): presumably the extras are the additional characters `\s`
# accepts for text patterns -- confirm.
SPACE_CHARS = set(" \t\n\r\f\v")
UNICODE_SPACE_CHARS = SPACE_CHARS.union("\x1c\x1d\x1e\x1f\x85")

# Category sets for digits, spaces, letters and word characters.  All but
# the digit set are built from the result of `as_general_categories`.
UNICODE_DIGIT_CATEGORIES = {"Nd"}
UNICODE_SPACE_CATEGORIES = set(as_general_categories("Z"))
UNICODE_LETTER_CATEGORIES = set(as_general_categories("L"))
UNICODE_WORD_CATEGORIES = set(as_general_categories(["L", "N"]))

# Ask `re` itself which of the 256 `int_to_byte` values each class matches
# instead of hard-coding the answer, so the sets are correct on every
# version of Python.
BYTES_ALL = set(map(int_to_byte, range(256)))
BYTES_DIGIT = set(filter(re.compile(b"\\d").match, BYTES_ALL))
BYTES_SPACE = set(filter(re.compile(b"\\s").match, BYTES_ALL))
BYTES_WORD = set(filter(re.compile(b"\\w").match, BYTES_ALL))

# Byte sets keyed by sre category constant.  Each NOT_* entry is the
# complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL.difference(BYTES_DIGIT),
    sre.CATEGORY_NOT_SPACE: BYTES_ALL.difference(BYTES_SPACE),
    sre.CATEGORY_NOT_WORD: BYTES_ALL.difference(BYTES_WORD),
}

# A `shared` strategy wrapping `builds(dict)` under a fixed key.
# NOTE(review): presumably this lets separate parts of one generated example
# draw the same dict to use as a group cache -- confirm.
GROUP_CACHE_STRATEGY = st.shared(
    st.builds(dict),
    key="hypothesis.regex.group_cache",
)
# True when the running interpreter is Python 3.6 or newer.
# NOTE(review): the name suggests this gates handling of flags attached to
# parsed subpatterns -- confirm against the code that reads it.
HAS_SUBPATTERN_FLAGS = sys.version_info[:2] >= (3, 6)

# Whatever `categories()` yields, collected into a set.
UNICODE_CATEGORIES = set(categories())

# The six ASCII whitespace characters, and the same set extended with five
# further control characters.
# NOTE(review): presumably the extras are the additional characters `\s`
# accepts for unicode patterns -- confirm.
SPACE_CHARS = set(u" \t\n\r\f\v")
UNICODE_SPACE_CHARS = SPACE_CHARS.union(u"\x1c\x1d\x1e\x1f\x85")

# Category sets for digits, spaces, letters and word characters.  All but
# the digit set are built from the result of `as_general_categories`.
UNICODE_DIGIT_CATEGORIES = {"Nd"}
UNICODE_SPACE_CATEGORIES = set(as_general_categories("Z"))
UNICODE_LETTER_CATEGORIES = set(as_general_categories("L"))
UNICODE_WORD_CATEGORIES = set(as_general_categories(["L", "N"]))

# Ask `re` itself which of the 256 `int_to_byte` values each class matches
# instead of hard-coding the answer, so the sets are correct on every
# version of Python.
BYTES_ALL = set(map(int_to_byte, range(256)))
BYTES_DIGIT = set(filter(re.compile(b"\\d").match, BYTES_ALL))
BYTES_SPACE = set(filter(re.compile(b"\\s").match, BYTES_ALL))
BYTES_WORD = set(filter(re.compile(b"\\w").match, BYTES_ALL))

# Byte sets keyed by sre category constant.  Each NOT_* entry is the
# complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL.difference(BYTES_DIGIT),
    sre.CATEGORY_NOT_SPACE: BYTES_ALL.difference(BYTES_SPACE),
    sre.CATEGORY_NOT_WORD: BYTES_ALL.difference(BYTES_WORD),
}

# On Python 2 the characters below are matched by \W ('not word'), even
# though unicodedata.category(c) places them in one of the word categories
# above.
UNICODE_WEIRD_NONWORD_CHARS = set(u"\U00012432\U00012433\U00012456\U00012457")
# True when the running interpreter is Python 3.6 or newer.
# NOTE(review): the name suggests this gates handling of flags attached to
# parsed subpatterns -- confirm against the code that reads it.
HAS_SUBPATTERN_FLAGS = sys.version_info[:2] >= (3, 6)

# Whatever `categories()` yields, collected into a set.
UNICODE_CATEGORIES = set(categories())

# The six ASCII whitespace characters, and the same set extended with five
# further control characters.
# NOTE(review): presumably the extras are the additional characters `\s`
# accepts for unicode patterns -- confirm.
SPACE_CHARS = set(u' \t\n\r\f\v')
UNICODE_SPACE_CHARS = SPACE_CHARS | set(u'\x1c\x1d\x1e\x1f\x85')

# Category sets for digits, spaces, letters and word characters.  All but
# the digit set are built from the result of `as_general_categories`.
UNICODE_DIGIT_CATEGORIES = {'Nd'}
UNICODE_SPACE_CATEGORIES = set(as_general_categories('Z'))
UNICODE_LETTER_CATEGORIES = set(as_general_categories('L'))
UNICODE_WORD_CATEGORIES = set(as_general_categories(['L', 'N']))

# Probe `re` with each of the 256 `int_to_byte` values rather than
# hard-coding the byte classes; wordy, but correct on every version of
# Python.
BYTES_ALL = {int_to_byte(i) for i in range(256)}
BYTES_DIGIT = {b for b in BYTES_ALL if re.match(b'\\d', b)}
BYTES_SPACE = {b for b in BYTES_ALL if re.match(b'\\s', b)}
BYTES_WORD = {b for b in BYTES_ALL if re.match(b'\\w', b)}

# Byte sets keyed by sre category constant.  Each NOT_* entry is the
# complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL - BYTES_DIGIT,
    sre.CATEGORY_NOT_SPACE: BYTES_ALL - BYTES_SPACE,
    sre.CATEGORY_NOT_WORD: BYTES_ALL - BYTES_WORD,
}

# On Python < 3.4 (including 2.7), the following unicode chars are weird:
# they are matched by \W ('not word'), yet unicodedata.category(c) returns
# one of the word categories above.  There's special handling below.
'Cf', 'Cn', 'Co', 'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs', ]) SPACE_CHARS = set(u' \t\n\r\f\v') UNICODE_SPACE_CHARS = SPACE_CHARS | set(u'\x1c\x1d\x1e\x1f\x85') UNICODE_DIGIT_CATEGORIES = set(['Nd']) UNICODE_SPACE_CATEGORIES = set(['Zs', 'Zl', 'Zp']) UNICODE_LETTER_CATEGORIES = set(['LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu']) UNICODE_WORD_CATEGORIES = UNICODE_LETTER_CATEGORIES | set(['Nd', 'Nl', 'No']) # This is verbose, but correct on all versions of Python BYTES_ALL = set(int_to_byte(i) for i in range(256)) BYTES_DIGIT = set(b for b in BYTES_ALL if re.match(b'\\d', b)) BYTES_SPACE = set(b for b in BYTES_ALL if re.match(b'\\s', b)) BYTES_WORD = set(b for b in BYTES_ALL if re.match(b'\\w', b)) BYTES_LOOKUP = { sre.CATEGORY_DIGIT: BYTES_DIGIT, sre.CATEGORY_SPACE: BYTES_SPACE, sre.CATEGORY_WORD: BYTES_WORD, sre.CATEGORY_NOT_DIGIT: BYTES_ALL - BYTES_DIGIT, sre.CATEGORY_NOT_SPACE: BYTES_ALL - BYTES_SPACE, sre.CATEGORY_NOT_WORD: BYTES_ALL - BYTES_WORD, } # On Python < 3.4 (including 2.7), the following unicode chars are weird. # They are matched by the \W, meaning 'not word', but unicodedata.category(c) # returns one of the word categories above. There's special handling below.