def sort_regions_with_gaps(self):
    """Guarantees that for each i we have tried to swap index i with
    index i + 2.

    For every interior index whose element is not already ordered with
    respect to both of its neighbours, a window around that index is
    grown with ``find_integer`` - first to the right, then to the left.
    Each candidate keeps the element at ``i`` where it is and sorts all
    the other elements of the window around it.
    """
    for i in range(1, len(self.current) - 1):
        if self.current[i - 1] <= self.current[i] <= self.current[i + 1]:
            # The `continue` line is optimised out of the bytecode on
            # CPython >= 3.7 (https://bugs.python.org/issue2506) and on
            # PyPy, and so coverage cannot tell that it has been taken.
            continue  # pragma: no cover

        def can_sort(a, b):
            """Offer ``self.consider`` the sequence with ``[a, b)`` sorted
            around the fixed element at ``i``; windows that fall outside
            the sequence are rejected."""
            if a < 0 or b > len(self.current):
                return False
            assert a <= i < b
            seq = self.current
            pivot_offset = i - a
            neighbours = sorted(seq[a:i] + seq[i + 1:b])
            candidate = list(seq[:a])
            candidate += neighbours[:pivot_offset]
            candidate.append(seq[i])
            candidate += neighbours[pivot_offset:]
            candidate += list(seq[b:])
            return self.consider(candidate)

        left = i
        right = i + 1
        # Grow the window to the right first, then (with the new right
        # edge fixed) to the left.
        grown = find_integer(lambda k: can_sort(left, right + k))
        right = right + grown
        find_integer(lambda k: can_sort(left - k, right))
def sort_regions_with_gaps(self):
    """Guarantees that for each i we have tried to swap index i with
    index i + 2.

    Each interior element that is out of order relative to a neighbour is
    treated as a fixed pivot: the elements in a window around it are
    sorted while the pivot stays in place. ``find_integer`` is used to
    extend that window to the right and then to the left.
    """
    for i in range(1, len(self.current) - 1):
        locally_ordered = (
            self.current[i - 1] <= self.current[i] <= self.current[i + 1]
        )
        if locally_ordered:
            continue

        def can_sort(a, b):
            """Try the sequence with ``[a, b)`` sorted around index ``i``."""
            out_of_range = a < 0 or b > len(self.current)
            if out_of_range:
                return False
            assert a <= i < b
            split = i - a
            values = sorted(self.current[a:i] + self.current[i + 1:b])
            pieces = (
                list(self.current[:a]),
                values[:split],
                [self.current[i]],
                values[split:],
                list(self.current[b:]),
            )
            return self.consider([x for piece in pieces for x in piece])

        left = i
        right = i + 1

        def extend_right(k):
            return can_sort(left, right + k)

        right += find_integer(extend_right)

        def extend_left(k):
            return can_sort(left - k, right)

        find_integer(extend_left)
def distinguish(self, value, test):
    """Make ``test`` agree on ``value`` and ``self.normalize(value)``.

    If ``test`` already gives the same answer for both, nothing is
    changed. Otherwise a new canonical value is added to the list of
    canonical values so that it does.

    Returns True if and only if this makes a change to the underlying
    canonical values.
    """
    canonical = self.normalize(value)
    if canonical == value:
        return False

    expected = test(value)
    if test(canonical) == expected:
        return False

    def can_lower(k):
        # Only values strictly above the current canonical value are
        # eligible, and they must test the same way as ``value`` does.
        candidate = value - k
        return candidate > canonical and test(candidate) == expected

    offset = find_integer(can_lower)
    new_canonical = value - offset
    assert new_canonical not in self.__values
    insort(self.__values, new_canonical)
    assert self.normalize(value) == new_canonical
    return True
def sort_regions(self):
    """Guarantees that for each i we have tried to swap index i with
    index i + 1.

    Starting from each position in turn, ``find_integer`` is used to
    pick the length of a contiguous run to sort (by ``self.key``), and
    the scan then resumes just past that run.
    """
    start = 0
    while start + 1 < len(self.current):
        head = list(self.current[:start])

        def sorts_run(width):
            """Try sorting the ``width`` elements beginning at ``start``."""
            if start + width > len(self.current):
                return False
            stop = start + width
            ordered = sorted(self.current[start:stop], key=self.key)
            tail = list(self.current[stop:])
            return self.consider(head + ordered + tail)

        advance = find_integer(sorts_run)
        start += advance
def hill_climb(self):
    """The main hill climbing loop where we actually do the work: take
    ``self.current_data`` and attempt to improve it one block at a time.

    Blocks are visited by index, starting from the last one and moving
    towards the first. Whenever ``self.current_data`` is a different
    object from the one seen on the previous iteration, the scan restarts
    from the last block of the new data; indices that have already been
    examined are skipped. The loop stops once the index drops below zero
    or ``self.improvements`` exceeds ``self.max_improvements``.

    For each block, replacement values above the existing one are tried
    first, and then values below it.

    NOTE(review): an earlier version of this docstring described a
    ``select_example`` callable; nothing by that name is used here.
    """
    # Indices of the blocks we have already worked on. This set is kept
    # even when ``self.current_data`` changes.
    blocks_examined = set()

    prev = None
    i = len(self.current_data.blocks) - 1
    while i >= 0 and self.improvements <= self.max_improvements:
        # The current data is a different object from last time round,
        # so start again from its final block.
        if prev is not self.current_data:
            i = len(self.current_data.blocks) - 1
            prev = self.current_data

        if i in blocks_examined:
            i -= 1
            continue

        blocks_examined.add(i)
        data = self.current_data
        block = data.blocks[i]
        prefix = data.buffer[:block.start]

        existing = data.buffer[block.start:block.end]
        existing_as_int = int_from_bytes(existing)
        max_int_value = (256**len(existing)) - 1

        # The block already holds the largest value that fits in this
        # many bytes. Its index is recorded in ``blocks_examined``, so the
        # next iteration moves on to another block.
        if existing_as_int == max_int_value:
            continue

        def attempt_replace(v):
            """Try replacing the current block in the current best test
            case with an integer of value ``v``.

            Note that we use the *current* best and not the one we started
            with. This helps ensure that if we luck into a good draw when
            making random choices we get to keep the good bits.

            Returns True as soon as ``self.consider_new_test_data``
            accepts a candidate, and False otherwise (including when ``v``
            lies outside ``[0, max_int_value]``).
            """
            if v < 0 or v > max_int_value:
                return False
            v_as_bytes = int_to_bytes(v, len(existing))

            # We make a couple attempts at replacement. This only matters
            # if we end up growing the buffer - otherwise we exit the loop
            # early - but in the event that there *is* some randomized
            # component we want to give it a couple of tries to succeed.
            for _ in range(3):
                # The candidate is padded with ``bytes(BUFFER_SIZE)``
                # (presumably zero bytes, if BUFFER_SIZE is an int).
                attempt = self.engine.cached_test_function(
                    prefix
                    + v_as_bytes
                    + self.current_data.buffer[block.end:]
                    + bytes(BUFFER_SIZE),
                )

                if self.consider_new_test_data(attempt):
                    return True

                # Stop retrying when the status is below INVALID or the
                # attempt's buffer has the same length as the current one.
                if attempt.status < Status.INVALID or len(
                    attempt.buffer
                ) == len(self.current_data.buffer):
                    return False

                # ``i`` below is local to ``attempt_replace``; it does not
                # touch the block index used by the enclosing loop.
                #
                # Only examples that overlap the block are considered. For
                # each one whose length differs in the attempt, try the
                # attempt's bytes for that example followed by the
                # current data after the example's end.
                for i, ex in enumerate(self.current_data.examples):
                    if ex.start >= block.end:
                        break
                    if ex.end <= block.start:
                        continue
                    ex_attempt = attempt.examples[i]
                    if ex.length == ex_attempt.length:
                        continue
                    replacement = attempt.buffer[ex_attempt.start:ex_attempt.end]
                    if self.consider_new_test_data(
                        self.engine.cached_test_function(
                            prefix
                            + replacement
                            + self.current_data.buffer[ex.end:]
                        )
                    ):
                        return True
            return False

        # We unconditionally scan both upwards and downwards. The reason
        # for this is that we allow "lateral" moves that don't increase the
        # score but instead leave it constant. All else being equal we'd
        # like to leave the test case closer to shrunk, so afterwards we
        # try lowering the value towards zero even if we've just raised it.

        if not attempt_replace(max_int_value):
            find_integer(lambda k: attempt_replace(k + existing_as_int))

        # Re-read the block from the current data, which the upward scan
        # may have replaced, before scanning downwards.
        existing = self.current_data.buffer[block.start:block.end]
        existing_as_int = int_from_bytes(existing)
        if not attempt_replace(0):
            find_integer(lambda k: attempt_replace(existing_as_int - k))
def learn(self, s):
    """Learn to give the correct answer on this string.
    That is, after this method completes we will have
    ``self.dfa.matches(s) == self.member(s)``.

    Note that we do not guarantee that this will remain
    true in the event that learn is called again with
    a different string. It is in principle possible that
    future learning will cause us to make a mistake on
    this string. However, repeatedly calling learn on
    each of a set of strings until the generation stops
    changing is guaranteed to terminate.

    ``s`` is converted with ``bytes(s)`` before use. Nothing is returned.
    """
    s = bytes(s)
    correct_outcome = self.member(s)

    # We don't want to check this inside the loop because it potentially
    # causes us to evaluate more of the states than we actually need to,
    # but if our model is mostly correct then this will be faster because
    # we only need to evaluate strings that are of the form
    # ``state + experiment``, which will generally be cached and/or needed
    # later.
    if self.dfa.matches(s) == correct_outcome:
        return

    # In the papers they assume that we only run this process
    # once, but this is silly - often when you've got a messy
    # string it will be wrong for many different reasons.
    #
    # Thus we iterate this to a fixed point where we repair
    # the DFA by repeatedly adding experiments until the DFA
    # agrees with the membership function on this string.
    while True:
        dfa = self.dfa

        # states[k] is the state reached after reading the first k
        # characters of s; it is filled in lazily by ``seems_right``.
        states = [dfa.start]

        def seems_right(n):
            """After reading n characters from s, do we seem to be
            in the right state?

            We determine this by replacing the first n characters
            of s with the label of the state we expect to be in.
            If we are in the right state, that will replace a substring
            with an equivalent one so must produce the same answer.
            """
            if n > len(s):
                return False

            # Populate enough of the states list to know where we are.
            while n >= len(states):
                states.append(dfa.transition(states[-1], s[len(states) - 1]))

            return self.member(dfa.label(states[n]) + s[n:]) == correct_outcome

        n = find_integer(seems_right)

        # We got to the end without ever finding ourself in a bad
        # state, so we must correctly match this string.
        if n == len(s):
            assert dfa.matches(s) == correct_outcome
            break

        # Reading n characters does not put us in a bad state but
        # reading n + 1 does. This means that the remainder of
        # the string that we have not read yet is an experiment
        # that allows us to distinguish the state that we ended
        # up in from the state that we should have ended up in.
        #
        # There are two possibilities here: Either we have badly
        # normalised the byte that led to this transition, or
        # we ended up in the wrong state because we could not
        # distinguish the state we ended up in from the correct
        # one.
        prefix = s[:n]
        suffix = s[n + 1 :]
        if self.__normalizer.distinguish(
            s[n], lambda x: self.member(prefix + bytes([x]) + suffix)
        ):
            # The normalizer changed, so discard the stored DFA and go
            # round again (presumably ``self.dfa`` rebuilds it on the
            # next access - TODO confirm).
            self.__dfa = None
            continue
        self.__add_experiment(suffix)
def learn(self, string):
    """Learn to give the correct answer on this string.
    That is, after this method completes we will have
    ``self.dfa.matches(string) == self.member(string)``.

    Note that we do not guarantee that this will remain
    true in the event that learn is called again with
    a different string. It is in principle possible that
    future learning will cause us to make a mistake on
    this string. However, repeatedly calling learn on
    each of a set of strings until the generation stops
    changing is guaranteed to terminate.

    ``string`` is converted with ``bytes(string)`` before use. Nothing is
    returned.
    """
    string = bytes(string)
    correct_outcome = self.member(string)

    # We don't want to check this inside the loop because it potentially
    # causes us to evaluate more of the states than we actually need to,
    # but if our model is mostly correct then this will be faster because
    # we only need to evaluate strings that are of the form
    # ``state + experiment``, which will generally be cached and/or needed
    # later.
    if self.dfa.matches(string) == correct_outcome:
        return

    # In the papers they assume that we only run this process
    # once, but this is silly - often when you've got a messy
    # string it will be wrong for many different reasons.
    #
    # Thus we iterate this to a fixed point where we repair
    # the DFA by repeatedly adding experiments until the DFA
    # agrees with the membership function on this string.

    # First we make sure that normalization is not the source of the
    # failure to match.
    while True:
        normalized = bytes(self.normalizer.normalize(c) for c in string)
        # We can correctly replace the string with its normalized version
        # so normalization is not the problem here.
        if self.member(normalized) == correct_outcome:
            string = normalized
            break

        # Work through the distinct bytes of the string from largest to
        # smallest, normalizing one byte value at a time in ``target``.
        alphabet = sorted(set(string), reverse=True)
        target = string
        for a in alphabet:

            def replace(b):
                # Swap every occurrence of ``a`` in the current ``target``
                # for ``b``. It is only called within this iteration, so
                # it sees this iteration's ``a`` and ``target``.
                if a == b:
                    return target
                return bytes(b if c == a else c for c in target)

            self.normalizer.distinguish(a, lambda x: self.member(replace(x)))
            target = replace(self.normalizer.normalize(a))
            assert self.member(target) == correct_outcome
        assert target != normalized
        self.__dfa_changed()

    if self.dfa.matches(string) == correct_outcome:
        return

    # Now we know normalization is correct we can attempt to determine if
    # any of our transitions are wrong.
    while True:
        dfa = self.dfa

        # states[k] is the state reached after reading the first k
        # characters of string; it is filled in lazily by ``seems_right``.
        states = [dfa.start]

        def seems_right(n):
            """After reading n characters from string, do we seem to be
            in the right state?

            We determine this by replacing the first n characters
            of string with the label of the state we expect to be in.
            If we are in the right state, that will replace a substring
            with an equivalent one so must produce the same answer.
            """
            if n > len(string):
                return False

            # Populate enough of the states list to know where we are.
            while n >= len(states):
                states.append(
                    dfa.transition(states[-1], string[len(states) - 1]))

            return self.member(dfa.label(states[n]) + string[n:]) == correct_outcome

        assert seems_right(0)

        n = find_integer(seems_right)

        # We got to the end without ever finding ourself in a bad
        # state, so we must correctly match this string.
        if n == len(string):
            assert dfa.matches(string) == correct_outcome
            break

        # Reading n characters does not put us in a bad state but
        # reading n + 1 does. This means that the remainder of
        # the string that we have not read yet is an experiment
        # that allows us to distinguish the state that we ended
        # up in from the state that we should have ended up in.

        source = states[n]
        character = string[n]
        wrong_destination = states[n + 1]

        # We've made an error in transitioning from ``source`` to
        # ``wrong_destination`` via ``character``. We now need to update
        # the DFA so that this transition no longer occurs. Note that we
        # do not guarantee that the transition is *correct* after this,
        # only that we don't make this particular error.
        assert self.transition(source, character) == wrong_destination

        labels_wrong_destination = self.dfa.label(wrong_destination)
        labels_correct_destination = self.dfa.label(source) + bytes(
            [character])

        ex = string[n + 1:]

        assert self.member(labels_wrong_destination + ex) != self.member(labels_correct_destination + ex)

        # Adding this experiment causes us to distinguish the wrong
        # destination from the correct one.
        self.__states[wrong_destination].experiments[ex] = self.member(
            labels_wrong_destination + ex)

        # We now clear the cached details that caused us to make this error
        # so that when we recalculate this transition we get to a
        # (hopefully now correct) different state.
        del self.__states[source].transitions[character]
        self.__dfa_changed()

        # We immediately recalculate the transition so that we can check
        # that it has changed as we expect it to have.
        new_destination = self.transition(source, string[n])
        assert new_destination != wrong_destination
def shift_right(self):
    """Try replacing the current value with itself shifted right.

    ``find_integer`` chooses the shift amount; amounts greater than
    ``self.size`` are rejected without calling ``self.consider``. Every
    candidate is a shift of the value held when this method was entered.
    """
    original = self.current

    def can_shift(bits):
        if bits > self.size:
            return False
        return self.consider(original >> bits)

    find_integer(can_shift)