def test_pack_bitvector(self):
    """Round-trip every integer in 0..1023 through unpack_bitvector/bitvector."""
    for value in range(2 ** 10):
        unpacked = unpack_bitvector(value)
        repacked = bitvector(unpacked)
        # Print the value next to its unpacked form.
        print(value, unpacked)
        self.assertEqual(value, repacked)
def test_unpack_bitvector(self):
    """A value with one bit set must unpack to exactly that bit's index."""
    for bit_index in range(32):
        unpacked = unpack_bitvector(1 << bit_index)
        self.assertEqual(len(unpacked), 1)
        self.assertEqual(unpacked[0], bit_index)
def simplify_charclass(matching_codes, ignorecase=False):
    """Given a sequence of ordinals, return a (seq, negated) tuple.

    Every subset of the categories in CATS is tried, both for the class as
    written and for its negation; whatever the categories do not cover is
    filled in with explicit ranges, each candidate is scored with
    charclass_score, and the lowest-scoring candidate wins.

    Args:
        matching_codes: non-empty sequence of character ordinals the class
            matches (max() and repeated `in` tests are applied to it).
        ignorecase: whether the regex flags include re.IGNORECASE.

    Returns:
        (seq, negated): `seq` is the list of chosen category keys followed by
        the output of build_ranges; `negated` is 0 or 1.

    Raises:
        WontOptimize: with a basic reason string, if the class shouldn't be
            optimized.
    """
    if max(matching_codes) > 255:
        raise WontOptimize('Unicode')

    # HACK: Don't simplify something that looks fairly like a hex digit pattern.
    # They look arguably prettier as '0-9a-f' than '\da-f'
    bv = bitvector(matching_codes)
    if (bv & HEX) == HEX and ord('g') not in matching_codes:
        raise WontOptimize('Hex digit')
    if (bv & ALNUM) == ALNUM and ord('_') not in matching_codes:
        raise WontOptimize('Alphanumeric without _')

    if ignorecase:
        bv = bitvector(map(lowercase_code, matching_codes))
        base = INSENSITIVE_ASCII
    else:
        base = ASCII

    # Tries all possibilities of categories first.
    keys = sorted(CATS.keys(), reverse=True)

    # Strategy: since we have a small number of categories, try each of them to
    # see if it's legal; add in remaining ranges; score.
    # when negated=0, there are 64 (=2**6) combinations to check.
    # when negated=1, there are only 8 (=2**3) combinations.
    possibilities = []
    for negated in (0, 1):
        # The set we must cover depends only on `negated`, so compute it once
        # per pass rather than once per category subset.
        if negated:
            target = base ^ (base & bv)
        else:
            target = bv

        for i in range(2 ** len(keys)):
            chosen_keys = [keys[b] for b in range(len(keys)) if i & (1 << b)]

            # Humans are terrible at double-negatives.  If this involves a
            # negation of the charclass as well as the category, tough cookies.
            # This will cause suggested _expansion_ of any such uses already in
            # the codebase, which should be ignored by the caller.
            if negated and any(k[1].isupper() for k in chosen_keys):
                continue

            chosen = 0
            for k in chosen_keys:
                chosen |= CATS[k]
            chosen &= base

            # True iff. the chosen categories fit entirely in the target.
            if chosen & target == chosen:
                # Whatever the categories leave uncovered becomes ranges.
                remainder = target ^ chosen
                r = build_ranges(unpack_bitvector(remainder))
                r[:0] = chosen_keys
                discount = 1 if chosen_keys == ['\\w', '\\W'] else 0
                if r:
                    possibilities.append(
                        (charclass_score(r, negated) - discount, r, negated))

    # There will always be one, since we include no-categories above, and it's
    # not on the WontOptimize list.
    # Order by score alone: a plain tuple sort would fall through to comparing
    # the `r` lists on tied scores, whose elements may not be mutually
    # orderable.  The sort is stable, so ties keep generation order.
    possibilities.sort(key=lambda candidate: candidate[0])
    return (possibilities[0][1], possibilities[0][2])
def test_pack_bitvector(self):
    """bitvector() must invert unpack_bitvector() for every value below 1 << 10."""
    upper_bound = 1 << 10
    for number in range(upper_bound):
        bit_list = unpack_bitvector(number)
        packed = bitvector(bit_list)
        # Print the number next to its unpacked form.
        print(number, bit_list)
        self.assertEqual(number, packed)
def test_unpack_bitvector(self):
    """Unpacking 1 << k must give a one-element result holding k, for k in 0..31."""
    for position in range(32):
        single_bit = 1 << position
        indices = unpack_bitvector(single_bit)
        self.assertEqual(len(indices), 1)
        self.assertEqual(indices[0], position)
def simplify_charclass(matching_codes, ignorecase=False):
    """Given a sequence of ordinals, return a (seq, negated) tuple.

    Every subset of the categories in CATS is tried, both for the class as
    written and for its negation; whatever the categories do not cover is
    filled in with explicit ranges, each candidate is scored with
    charclass_score, and the lowest-scoring candidate wins (ties keep
    generation order).

    Args:
        matching_codes: non-empty sequence of character ordinals the class
            matches (max() and repeated `in` tests are applied to it).
        ignorecase: whether the regex flags include re.IGNORECASE.

    Returns:
        (seq, negated): `seq` is the list of chosen category keys followed by
        the output of build_ranges; `negated` is 0 or 1.

    Raises:
        WontOptimize: with a basic reason string, if the class shouldn't be
            optimized.
    """
    if max(matching_codes) > 255:
        raise WontOptimize('Unicode')

    # HACK: Don't simplify something that looks fairly like a hex digit pattern.
    # They look arguably prettier as '0-9a-f' than '\da-f'
    bv = bitvector(matching_codes)
    if (bv & HEX) == HEX and ord('g') not in matching_codes:
        raise WontOptimize('Hex digit')
    if (bv & ALNUM) == ALNUM and ord('_') not in matching_codes:
        raise WontOptimize('Alphanumeric without _')

    if ignorecase:
        bv = bitvector(map(lowercase_code, matching_codes))
        base = INSENSITIVE_ASCII
    else:
        base = ASCII

    # Tries all possibilities of categories first.
    keys = sorted(CATS.keys(), reverse=True)

    # Strategy: since we have a small number of categories, try each of them to
    # see if it's legal; add in remaining ranges; score.
    # when negated=0, there are 64 (=2**6) combinations to check.
    # when negated=1, there are only 8 (=2**3) combinations.
    possibilities = []
    for negated in (0, 1):
        # target is the set of all characters we want to match, and none of the
        # ones we don't (note: for case-insensitive, we mask `chosen' before
        # comparing later).
        if not negated:
            target = bv
        elif ignorecase:
            # Unpack once into a set instead of re-unpacking `bv` for each of
            # the 256 membership probes.
            matched = set(unpack_bitvector(bv))
            # NOTE(review): `bv` was rebuilt from lowercase_code() output
            # above, so any ordinal that mapping never produces is always part
            # of this complement before being mapped again -- confirm that is
            # the intended negation.
            target = bitvector(
                map(lowercase_code,
                    [code for code in range(256) if code not in matched]))
        else:
            target = base ^ (base & bv)

        for i in range(2 ** len(keys)):
            chosen_keys = [keys[b] for b in range(len(keys)) if i & (1 << b)]

            # Humans are terrible at double-negatives.  If this involves a
            # negation of the charclass as well as the category, tough cookies.
            # This will cause suggested _expansion_ of any such uses already in
            # the codebase, which should be ignored by the caller.
            if negated and any(k[1].isupper() for k in chosen_keys):
                continue

            chosen = 0
            for k in chosen_keys:
                chosen |= CATS[k]
            # N.b. don't need to conditionally lowercase_code here because all
            # our categories contain lower if they contain upper.
            chosen &= base

            # True iff. the chosen categories fit entirely in the target.
            if chosen & target == chosen:
                # Whatever the categories leave uncovered becomes ranges.
                remainder = target ^ chosen
                r = build_ranges(unpack_bitvector(remainder))
                r[:0] = chosen_keys
                discount = 1 if chosen_keys == ['\\w', '\\W'] else 0
                if r:
                    possibilities.append(
                        (charclass_score(r, negated) - discount, r, negated))

    # There will always be one, since we include no-categories above, and it's
    # not on the WontOptimize list.
    # Sort on score only so tied candidates are never compared element-wise.
    possibilities.sort(key=lambda candidate: candidate[0])
    return (possibilities[0][1], possibilities[0][2])