def test_or_flaky(*args, **kwargs):
    """Unconditionally raise ``Flaky`` for a call that failed to reproduce.

    The arguments are only used to render the call in the error message;
    ``test`` is taken from the enclosing scope.
    """
    template = (
        'Hypothesis %s(%s) produces unreliable results: Falsified'
        ' on the first call but did not on a subsequent one'
    )
    rendered_call = arg_string(test, args, kwargs)
    raise Flaky(template % (test.__name__, rendered_call))
def test_or_flaky(*args, **kwargs):
    """Unconditionally raise ``Flaky`` for a call that failed to reproduce.

    The rendered arguments of this call are compared against
    ``expected_repr`` (taken, like ``test``, from the enclosing scope).
    When they match, the short message is used; when they differ, both
    renderings are included so the user can see that the generated values
    themselves changed between calls.

    Raises:
        Flaky: always.
    """
    text_repr = arg_string(test, args, kwargs)
    if text_repr == expected_repr:
        raise Flaky(
            (
                'Hypothesis %s(%s) produces unreliable results: Falsified'
                ' on the first call but did not on a subsequent one'
            ) % (test.__name__, text_repr,))
    # The values differ between the two calls. The adjacent literals used to
    # produce "This  is" (two spaces); the message now reads correctly.
    raise Flaky(
        (
            'Hypothesis %s produces unreliable results: Falsified'
            ' on the first call but did not on a subsequent one. This'
            ' is possibly due to unreliable values, which may be a bug'
            ' in the strategy.\nCall 1: %s\nCall 2: %s\n'
        ) % (test.__name__, expected_repr, text_repr,))
def conclude_test(self, status, interesting_origin):
    """Record that the current test case finished with ``status``.

    Overrun results are ignored entirely. Otherwise the conclusion is
    stored on the current node if it has none yet, or checked for
    consistency against the one already recorded there.
    """
    if status == Status.OVERRUN:
        return

    position = self.__index_in_current_node
    current = self.__current_node
    # Concluding before all recorded values were consumed, or at a node that
    # previously branched, means generation diverged from an earlier run.
    if position < len(current.values) or isinstance(current.transition, Branch):
        inconsistent_generation()

    outcome = Conclusion(status, interesting_origin)
    recorded = current.transition
    disagrees = recorded is not None and recorded != outcome

    if not disagrees:
        current.transition = outcome
    elif isinstance(recorded, Conclusion):
        # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
        # where tests go from interesting to valid, because it's much easier
        # to produce good error messages for these further up the stack.
        if (
            recorded.status != Status.INTERESTING
            or outcome.status != Status.VALID
        ):
            raise Flaky(
                "Inconsistent test results! Test case was %r on first run but %r on second"
                % (recorded, outcome)
            )

    assert current is self.__trail[-1]
    current.check_exhausted()
    assert len(current.values) > 0 or current.check_exhausted()

    if not self.killed:
        self.__update_exhausted()
def run_state_machine_as_test(state_machine_factory, settings=None):
    """Run a state machine definition as a test, either silently doing
    nothing or printing a minimal breaking program and raising an exception.

    state_machine_factory is anything which returns an instance of
    GenericStateMachine when called with no arguments - it can be a class or
    a function. settings will be used to control the execution of the test.

    Returns None when no breaking runner is found. Otherwise the breaking
    runner is replayed against a fresh machine with step printing enabled;
    that replay is expected to raise the original failure.

    Raises:
        Flaky: if the replay of the breaking run completes without raising.
    """
    try:
        breaker = find_breaking_runner(state_machine_factory, settings)
    except NoSuchExample:
        return
    breaker.run(state_machine_factory(), print_steps=True)
    # Reaching this line means the replay did not fail. The message used to
    # say "by succeeded"; the typo is corrected to "but succeeded".
    raise Flaky('Run failed initially but succeeded on a second try')
def conclude_test(self, status, interesting_origin):
    """Record that the current test case finished with ``status``.

    Overrun results are ignored entirely. Otherwise the conclusion is
    stored on the current node if it has none yet, or checked for
    consistency against the one already recorded there. Afterwards the
    exhaustion state of the traversed nodes is refreshed.
    """
    if status == Status.OVERRUN:
        return

    position = self.__index_in_current_node
    current = self.__current_node
    # Concluding before all recorded values were consumed, or at a node that
    # previously branched, means generation diverged from an earlier run.
    if position < len(current.values) or isinstance(current.transition, Branch):
        inconsistent_generation()

    outcome = conclusion(status, interesting_origin)
    recorded = current.transition
    disagrees = recorded is not None and recorded != outcome

    if not disagrees:
        current.transition = outcome
    elif (
        # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
        # where tests go from interesting to valid, because it's much easier
        # to produce good error messages for these further up the stack.
        recorded.status != Status.INTERESTING
        or outcome.status != Status.VALID
    ):
        raise Flaky(
            "Inconsistent test results! Test case was %r on first run but %r on second"
            % (recorded, outcome)
        )

    assert current is self.__trail[-1]
    current.check_exhausted()
    assert len(current.values) > 0 or current.check_exhausted()

    # Any node we've traversed might have now become exhausted. Walk the
    # trail from the right; the first node that is not exhausted implies
    # that none of its parents are either, so the walk ends there.
    for visited in reversed(self.__trail):
        if visited.check_exhausted():
            continue
        break
def run(self):
    """Run the Conjecture engine for this test and report any failures.

    Returns None when the engine made no calls, or when it finished without
    recording a falsifying example. When there are no interesting examples
    and no valid example was produced, raises ``Timeout`` (if the engine
    stopped because of the timeout) or ``Unsatisfiable``.

    Each falsifying example is then re-executed with printing enabled. With
    a single falsifying example that replay is expected to raise (either the
    test's own error or ``Flaky``); with several, their tracebacks are
    reported and ``Flaky`` or ``MultipleFailures`` is raised at the end.
    """
    # Tell pytest to omit the body of this function from tracebacks
    __tracebackhide__ = True
    # No database key is used when a global seed is being forced.
    if global_force_seed is None:
        database_key = str_to_bytes(fully_qualified_name(self.test))
    else:
        database_key = None
    self.start_time = time.time()
    global in_given
    runner = ConjectureRunner(
        self.evaluate_test_data, settings=self.settings,
        random=self.random, database_key=database_key,
    )

    if in_given or self.collector is None:
        runner.run()
    else:  # pragma: no cover
        # Outermost run with a collector: clear the active trace function
        # for the duration of the engine run and restore it afterwards.
        in_given = True
        original_trace = sys.gettrace()
        try:
            sys.settrace(None)
            runner.run()
        finally:
            in_given = False
            sys.settrace(original_trace)
            self.used_examples_from_database = \
                runner.used_examples_from_database
    note_engine_for_statistics(runner)
    run_time = time.time() - self.start_time

    self.used_examples_from_database = runner.used_examples_from_database

    # Warn about settings combinations whose behaviour is due to change.
    if runner.used_examples_from_database:
        if self.settings.derandomize:
            note_deprecation(
                'In future derandomize will imply database=None, but your '
                'test is currently using examples from the database. To '
                'get the future behaviour, update your settings to '
                'include database=None.')
        if self.__had_seed:
            note_deprecation(
                'In future use of @seed will imply database=None in your '
                'settings, but your test is currently using examples from '
                'the database. To get the future behaviour, update your '
                'settings for this test to include database=None.')

    timed_out = runner.exit_reason == ExitReason.timeout
    # The engine never called the test function: nothing to report.
    if runner.call_count == 0:
        return
    if runner.interesting_examples:
        # Ordered by descending sort_key of the buffer.
        self.falsifying_examples = sorted(
            [d for d in runner.interesting_examples.values()],
            key=lambda d: sort_key(d.buffer), reverse=True)
    else:
        if timed_out:
            note_deprecation((
                'Your tests are hitting the settings timeout (%.2fs). '
                'This functionality will go away in a future release '
                'and you should not rely on it. Instead, try setting '
                'max_examples to be some value lower than %d (the number '
                'of examples your test successfully ran here). Or, if you '
                'would prefer your tests to run to completion, regardless '
                'of how long they take, you can set the timeout value to '
                'hypothesis.unlimited.') %
                (self.settings.timeout, runner.valid_examples),
                self.settings)
        if runner.valid_examples == 0:
            if timed_out:
                raise Timeout(
                    ('Ran out of time before finding a satisfying '
                     'example for %s. Only found %d examples in %.2fs.') %
                    (get_pretty_function_description(
                        self.test), runner.valid_examples, run_time))
            else:
                raise Unsatisfiable(
                    'Unable to satisfy assumptions of hypothesis %s.' %
                    (get_pretty_function_description(self.test), ))

    if not self.falsifying_examples:
        return

    self.failed_normally = True

    # Number of falsifying examples whose replay was marked flaky.
    flaky = 0

    for falsifying_example in self.falsifying_examples:
        ran_example = ConjectureData.for_buffer(falsifying_example.buffer)
        self.__was_flaky = False
        assert falsifying_example.__expected_exception is not None
        try:
            self.execute(ran_example, print_example=True, is_final=True,
                         expected_failure=(
                             falsifying_example.__expected_exception,
                             falsifying_example.__expected_traceback, ))
        except (UnsatisfiedAssumption, StopTest):
            report(traceback.format_exc())
            self.__flaky(
                'Unreliable assumption: An example which satisfied '
                'assumptions on the first run now fails it.')
        except BaseException:
            # A lone failure propagates unchanged; with several failures the
            # traceback is reported and the loop moves on to the next one.
            if len(self.falsifying_examples) <= 1:
                raise
            report(traceback.format_exc())
        finally:  # pragma: no cover
            # This section is in fact entirely covered by the tests in
            # test_reproduce_failure, but it seems to trigger a lovely set
            # of coverage bugs: The branches show up as uncovered (despite
            # definitely being covered - you can add an assert False else
            # branch to verify this and see it fail - and additionally the
            # second branch still complains about lack of coverage even if
            # you add a pragma: no cover to it!
            # See https://bitbucket.org/ned/coveragepy/issues/623/
            if self.settings.print_blob is not PrintSettings.NEVER:
                failure_blob = encode_failure(falsifying_example.buffer)
                # Have to use the example we actually ran, not the original
                # falsifying example! Otherwise we won't catch problems
                # where the repr of the generated example doesn't parse.
                can_use_repr = ran_example.can_reproduce_example_from_repr
                if (self.settings.print_blob is PrintSettings.ALWAYS or
                        (self.settings.print_blob is PrintSettings.INFER
                         and not can_use_repr and len(failure_blob) < 200)):
                    report((
                        '\n'
                        'You can reproduce this example by temporarily '
                        'adding @reproduce_failure(%r, %r) as a decorator '
                        'on your test case') % (
                            __version__, failure_blob, ))
        if self.__was_flaky:
            flaky += 1

    # If we only have one example then we should have raised an error or
    # flaky prior to this point.
    assert len(self.falsifying_examples) > 1

    if flaky > 0:
        raise Flaky(
            ('Hypothesis found %d distinct failures, but %d of them '
             'exhibited some sort of flaky behaviour.') %
            (len(self.falsifying_examples), flaky))
    else:
        raise MultipleFailures(('Hypothesis found %d distinct failures.') %
                               (len(self.falsifying_examples, )))
def run(self):
    """Run the Conjecture engine for this test and replay any failure.

    Returns None when the engine produced no data, or when it finished
    without an interesting result and enough valid examples were seen.
    Raises ``Timeout`` or ``Unsatisfiable`` when too few valid examples were
    produced. When a falsifying buffer exists it is re-executed with
    printing enabled; that replay is expected to raise the test's own
    error. If it does not, the failure is reported as ``Flaky``.
    """
    database_key = str_to_bytes(fully_qualified_name(self.test))
    start_time = time.time()
    runner = ConjectureRunner(
        self.evaluate_test_data, settings=self.settings,
        random=self.random, database_key=database_key,
    )
    runner.run()
    note_engine_for_statistics(runner)
    run_time = time.time() - start_time
    # A timeout setting that is not positive never counts as timed out.
    timed_out = (self.settings.timeout > 0 and
                 run_time >= self.settings.timeout)
    if runner.last_data is None:
        return
    if runner.last_data.status == Status.INTERESTING:
        self.falsifying_example = runner.last_data.buffer
        if self.settings.database is not None:
            self.settings.database.save(database_key,
                                        self.falsifying_example)
    else:
        # Too few valid examples is an error unless the engine ran to
        # completion and at least one success was recorded.
        if runner.valid_examples < min(
                self.settings.min_satisfying_examples,
                self.settings.max_examples,
        ) and not (runner.exit_reason == ExitReason.finished
                   and self.at_least_one_success):
            if timed_out:
                raise Timeout(
                    ('Ran out of time before finding a satisfying '
                     'example for '
                     '%s. Only found %d examples in ' + '%.2fs.') %
                    (get_pretty_function_description(
                        self.test), runner.valid_examples, run_time))
            else:
                raise Unsatisfiable(
                    ('Unable to satisfy assumptions of hypothesis '
                     '%s. Only %d examples considered '
                     'satisfied assumptions') % (
                         get_pretty_function_description(self.test),
                         runner.valid_examples, ))

    if self.falsifying_example is None:
        return

    assert self.last_exception is not None

    # First replay: run the real test on the falsifying buffer. Normally
    # this raises the test's own error, which propagates to the caller.
    try:
        with self.settings:
            self.test_runner(
                ConjectureData.for_buffer(self.falsifying_example),
                reify_and_execute(self.search_strategy, self.test,
                                  print_example=True, is_final=True))
    except (UnsatisfiedAssumption, StopTest):
        report(traceback.format_exc())
        raise Flaky('Unreliable assumption: An example which satisfied '
                    'assumptions on the first run now fails it.')

    # Only reached if the first replay did not raise.
    report(
        'Failed to reproduce exception. Expected: \n' +
        self.last_exception, )

    filter_message = (
        'Unreliable test data: Failed to reproduce a failure '
        'and then when it came to recreating the example in '
        'order to print the test data with a flaky result '
        'the example was filtered out (by e.g. a '
        'call to filter in your strategy) when we didn\'t '
        'expect it to be.')

    # Second replay: same buffer, but the test body is replaced by the
    # callable built by test_is_flaky.
    # NOTE(review): presumably that callable always raises Flaky - confirm.
    try:
        self.test_runner(
            ConjectureData.for_buffer(self.falsifying_example),
            reify_and_execute(self.search_strategy,
                              test_is_flaky(self.test,
                                            self.repr_for_last_exception),
                              print_example=True,
                              is_final=True))
    except (UnsatisfiedAssumption, StopTest):
        raise Flaky(filter_message)
def run(self):
    """Run the Conjecture engine for this test and replay every failure.

    Returns None when the engine produced no data, or when it finished
    without recording a falsifying example. Raises ``Timeout`` or
    ``Unsatisfiable`` when there are no interesting examples and too few
    valid examples were produced.

    Each falsifying example is re-executed with printing enabled. With a
    single falsifying example the replay is expected to raise (the test's
    own error or ``Flaky``). With several, their tracebacks are reported and
    ``Flaky`` or ``MultipleFailures`` is raised once all were replayed.
    """
    # Tell pytest to omit the body of this function from tracebacks
    __tracebackhide__ = True
    database_key = str_to_bytes(fully_qualified_name(self.test))
    self.start_time = time.time()
    runner = ConjectureRunner(
        self.evaluate_test_data, settings=self.settings,
        random=self.random, database_key=database_key,
    )
    runner.run()
    note_engine_for_statistics(runner)
    run_time = time.time() - self.start_time
    timed_out = runner.exit_reason == ExitReason.timeout
    if runner.last_data is None:
        return
    if runner.interesting_examples:
        # Ordered by descending sort_key of the buffer.
        self.falsifying_examples = sorted(
            [d for d in runner.interesting_examples.values()],
            key=lambda d: sort_key(d.buffer), reverse=True)
    else:
        if timed_out:
            note_deprecation((
                'Your tests are hitting the settings timeout (%.2fs). '
                'This functionality will go away in a future release '
                'and you should not rely on it. Instead, try setting '
                'max_examples to be some value lower than %d (the number '
                'of examples your test successfully ran here). Or, if you '
                'would prefer your tests to run to completion, regardless '
                'of how long they take, you can set the timeout value to '
                'hypothesis.unlimited.') %
                (self.settings.timeout, runner.valid_examples),
                self.settings)
        # Too few valid examples is an error unless the engine ran to
        # completion and at least one success was recorded.
        if runner.valid_examples < min(
                self.settings.min_satisfying_examples,
                self.settings.max_examples,
        ) and not (runner.exit_reason == ExitReason.finished
                   and self.at_least_one_success):
            if timed_out:
                raise Timeout(
                    ('Ran out of time before finding a satisfying '
                     'example for '
                     '%s. Only found %d examples in ' + '%.2fs.') %
                    (get_pretty_function_description(
                        self.test), runner.valid_examples, run_time))
            else:
                raise Unsatisfiable(
                    ('Unable to satisfy assumptions of hypothesis '
                     '%s. Only %d examples considered '
                     'satisfied assumptions') % (
                         get_pretty_function_description(self.test),
                         runner.valid_examples, ))

    if not self.falsifying_examples:
        return

    # Number of falsifying examples whose replay was marked flaky.
    flaky = 0

    for falsifying_example in self.falsifying_examples:
        self.__was_flaky = False
        raised_exception = False
        try:
            with self.settings:
                self.test_runner(
                    ConjectureData.for_buffer(falsifying_example.buffer),
                    reify_and_execute(self.search_strategy, self.test,
                                      print_example=True, is_final=True))
        except (UnsatisfiedAssumption, StopTest):
            report(traceback.format_exc())
            self.__flaky(
                'Unreliable assumption: An example which satisfied '
                'assumptions on the first run now fails it.')
        except BaseException:
            # Spelled out instead of a bare ``except:``; it catches exactly
            # the same set of exceptions. A lone failure propagates
            # unchanged, otherwise the traceback is reported and the loop
            # continues with the next example.
            if len(self.falsifying_examples) <= 1:
                raise
            raised_exception = True
            report(traceback.format_exc())

        if not raised_exception:
            # The replay did not reproduce the failure.
            report(
                'Failed to reproduce exception. Expected: \n' +
                falsifying_example.__expected_exception, )

            filter_message = (
                'Unreliable test data: Failed to reproduce a failure '
                'and then when it came to recreating the example in '
                'order to print the test data with a flaky result '
                'the example was filtered out (by e.g. a '
                'call to filter in your strategy) when we didn\'t '
                'expect it to be.')

            # Replay once more with the test body replaced by the callable
            # built by test_is_flaky, so the example gets printed.
            try:
                self.test_runner(
                    ConjectureData.for_buffer(falsifying_example.buffer),
                    reify_and_execute(self.search_strategy,
                                      test_is_flaky(
                                          self.test,
                                          self.repr_for_last_exception),
                                      print_example=True,
                                      is_final=True))
            except (UnsatisfiedAssumption, StopTest):
                self.__flaky(filter_message)
            except Flaky as e:
                if len(self.falsifying_examples) > 1:
                    self.__flaky(e.args[0])
                else:
                    raise

        if self.__was_flaky:
            flaky += 1

    # If we only have one example then we should have raised an error or
    # flaky prior to this point.
    assert len(self.falsifying_examples) > 1

    if flaky > 0:
        raise Flaky(
            ('Hypothesis found %d distinct failures, but %d of them '
             'exhibited some sort of flaky behaviour.') %
            (len(self.falsifying_examples), flaky))
    else:
        raise MultipleFailures(('Hypothesis found %d distinct failures.') %
                               (len(self.falsifying_examples, )))
def __flaky(self, message):
    """Signal flaky behaviour for the falsifying example being replayed.

    When several falsifying examples exist, the current one is flagged and
    the message is reported so the remaining ones can still be replayed.
    Otherwise ``Flaky`` is raised immediately.
    """
    if len(self.falsifying_examples) > 1:
        self.__was_flaky = True
        report("Flaky example! " + message)
        return
    raise Flaky(message)
def falsify(self, hypothesis, *argument_types, **kwargs):  # pylint: disable=too-many-locals,too-many-branches
    """
    Attempt to construct an example tuple x matching argument_types such
    that hypothesis(*x) returns a falsey value

    Optional keyword arguments ``setup_example`` (no arguments) and
    ``teardown_example`` (one argument, the reified example or None) are
    called around every evaluation of the hypothesis.

    Returns the reified form of the simplest falsifying example found.

    Raises:
        Exhausted: no falsifying example, but some satisfying examples were
            seen and at least the strategy's size_lower_bound of distinct
            templates were tried.
        Timeout: too few satisfying examples and the timeout elapsed.
        Unsatisfiable: too few satisfying examples without timing out.
        Unfalsifiable: enough satisfying examples, none falsifying.
        Flaky: an example that falsified the hypothesis no longer does.
    """
    teardown_example = kwargs.get('teardown_example') or (lambda x: None)
    setup_example = kwargs.get('setup_example') or (lambda: None)
    random = self.random
    if random is None:
        random = Random(function_digest(hypothesis))

    build_context = BuildContext(random)

    search_strategy = strategy(argument_types, self.settings)
    storage = None
    if self.database is not None:
        storage = self.database.storage_for(argument_types)

    def falsifies(args):  # pylint: disable=missing-docstring
        example = None
        try:
            try:
                setup_example()
                example = search_strategy.reify(args)
                return not hypothesis(*example)
            except UnsatisfiedAssumption:
                return False
        finally:
            # Runs even when setup or reify failed; example is None then.
            teardown_example(example)

    track_seen = Tracker()
    falsifying_examples = []
    # Previously saved examples are tried first; the first one that still
    # falsifies the hypothesis is kept and the search below is skipped.
    if storage:
        for example in storage.fetch():
            track_seen.track(example)
            if falsifies(example):
                falsifying_examples = [example]
                break

    satisfying_examples = 0
    max_examples = self.max_examples
    min_satisfying_examples = self.min_satisfying_examples

    parameter_source = ParameterSource(
        context=build_context, strategy=search_strategy,
        min_parameters=max(2, int(float(max_examples) / 10))
    )
    start_time = time.time()

    def time_to_call_it_a_day():
        """Have we exceeded our timeout?"""
        if self.timeout <= 0:
            return False
        return time.time() >= start_time + self.timeout

    # islice rejects a negative stop value, which would happen when more
    # stored examples were tracked than max_examples allows; clamp to zero.
    remaining_budget = max(0, max_examples - len(track_seen))

    for parameter in islice(parameter_source, remaining_budget):
        if len(track_seen) >= search_strategy.size_upper_bound:
            break

        if falsifying_examples:
            break
        if time_to_call_it_a_day():
            break

        args = search_strategy.produce_template(build_context, parameter)

        # A template seen before is not evaluated again.
        if track_seen.track(args) > 1:
            parameter_source.mark_bad()
            continue
        try:
            setup_example()
            a = None
            try:
                a = search_strategy.reify(args)
                is_falsifying_example = not hypothesis(*a)
            finally:
                teardown_example(a)
        except UnsatisfiedAssumption:
            parameter_source.mark_bad()
            continue
        satisfying_examples += 1
        if is_falsifying_example:
            falsifying_examples.append(args)
    run_time = time.time() - start_time
    # A timeout that is not positive means "no timeout", matching
    # time_to_call_it_a_day above. The previous ``>= 0`` test made a timeout
    # of 0 always count as timed out.
    timed_out = self.timeout > 0 and run_time >= self.timeout
    if not falsifying_examples:
        if (
            satisfying_examples and
            len(track_seen) >= search_strategy.size_lower_bound
        ):
            raise Exhausted(hypothesis, satisfying_examples)
        elif satisfying_examples < min_satisfying_examples:
            if timed_out:
                raise Timeout(hypothesis, satisfying_examples, run_time)
            else:
                raise Unsatisfiable(
                    hypothesis, satisfying_examples, run_time)
        else:
            raise Unfalsifiable(hypothesis)

    # Every recorded example must still falsify the hypothesis when re-run.
    for example in falsifying_examples:
        if not falsifies(example):
            raise Flaky(hypothesis, example)

    best_example = falsifying_examples[0]

    for simpler in search_strategy.simplify_such_that(
        random,
        best_example, falsifies,
        tracker=track_seen,
    ):
        best_example = simpler
        if time_to_call_it_a_day():
            # We no cover in here because it's a bit sensitive to timing
            # and tends to make tests flaky. There are tests that mean
            # this is definitely covered most of the time.
            break  # pragma: no cover

    if storage is not None:
        storage.save(best_example)

    setup_example()
    return search_strategy.reify(best_example)
def run(self):
    """Run the Conjecture engine for this test and replay every failure.

    Returns None when the engine produced no data, or when it finished
    without recording a falsifying example. Raises ``Timeout`` or
    ``Unsatisfiable`` when there are no interesting examples and too few
    valid examples were produced.

    Each falsifying example is re-executed with printing enabled. With a
    single falsifying example the replay is expected to raise (the test's
    own error or ``Flaky``). With several, their tracebacks are reported and
    ``Flaky`` or ``MultipleFailures`` is raised once all were replayed.
    """
    # Tell pytest to omit the body of this function from tracebacks
    __tracebackhide__ = True
    # No database key is used when a global seed is being forced.
    if global_force_seed is None:
        database_key = str_to_bytes(fully_qualified_name(self.test))
    else:
        database_key = None
    self.start_time = time.time()
    global in_given
    runner = ConjectureRunner(
        self.evaluate_test_data,
        settings=self.settings, random=self.random,
        database_key=database_key,
    )

    if in_given or self.collector is None:
        runner.run()
    else:  # pragma: no cover
        # Outermost run with a collector: clear the active trace function
        # for the duration of the engine run and restore it afterwards.
        in_given = True
        original_trace = sys.gettrace()
        try:
            sys.settrace(None)
            runner.run()
        finally:
            in_given = False
            sys.settrace(original_trace)
    note_engine_for_statistics(runner)
    run_time = time.time() - self.start_time

    self.used_examples_from_database = runner.used_examples_from_database

    # Warn about settings combinations whose behaviour is due to change.
    if runner.used_examples_from_database:
        if self.settings.derandomize:
            note_deprecation(
                'In future derandomize will imply database=None, but your '
                'test is currently using examples from the database. To '
                'get the future behaviour, update your settings to '
                'include database=None.'
            )
        if self.__had_seed:
            note_deprecation(
                'In future use of @seed will imply database=None in your '
                'settings, but your test is currently using examples from '
                'the database. To get the future behaviour, update your '
                'settings for this test to include database=None.'
            )

    timed_out = runner.exit_reason == ExitReason.timeout
    if runner.last_data is None:
        return
    if runner.interesting_examples:
        # Ordered by descending sort_key of the buffer.
        self.falsifying_examples = sorted(
            [d for d in runner.interesting_examples.values()],
            key=lambda d: sort_key(d.buffer), reverse=True
        )
    else:
        if timed_out:
            note_deprecation((
                'Your tests are hitting the settings timeout (%.2fs). '
                'This functionality will go away in a future release '
                'and you should not rely on it. Instead, try setting '
                'max_examples to be some value lower than %d (the number '
                'of examples your test successfully ran here). Or, if you '
                'would prefer your tests to run to completion, regardless '
                'of how long they take, you can set the timeout value to '
                'hypothesis.unlimited.'
            ) % (
                self.settings.timeout, runner.valid_examples),
                self.settings)
        # Too few valid examples is an error unless the engine ran to
        # completion and at least one success was recorded.
        if runner.valid_examples < min(
            self.settings.min_satisfying_examples,
            self.settings.max_examples,
        ) and not (
            runner.exit_reason == ExitReason.finished and
            self.at_least_one_success
        ):
            if timed_out:
                raise Timeout((
                    'Ran out of time before finding a satisfying '
                    'example for '
                    '%s. Only found %d examples in ' +
                    '%.2fs.'
                ) % (
                    get_pretty_function_description(self.test),
                    runner.valid_examples, run_time
                ))
            else:
                raise Unsatisfiable((
                    'Unable to satisfy assumptions of hypothesis '
                    '%s. Only %d examples considered '
                    'satisfied assumptions'
                ) % (
                    get_pretty_function_description(self.test),
                    runner.valid_examples,))

    if not self.falsifying_examples:
        return

    # Number of falsifying examples whose replay was marked flaky.
    flaky = 0

    for falsifying_example in self.falsifying_examples:
        self.__was_flaky = False
        assert falsifying_example.__expected_exception is not None
        try:
            self.execute(
                ConjectureData.for_buffer(falsifying_example.buffer),
                print_example=True, is_final=True,
                expected_failure=(
                    falsifying_example.__expected_exception,
                    falsifying_example.__expected_traceback,
                )
            )
        except (UnsatisfiedAssumption, StopTest):
            report(traceback.format_exc())
            self.__flaky(
                'Unreliable assumption: An example which satisfied '
                'assumptions on the first run now fails it.'
            )
        except BaseException:
            # A lone failure propagates unchanged; with several failures the
            # traceback is reported and the loop moves on to the next one.
            if len(self.falsifying_examples) <= 1:
                raise
            report(traceback.format_exc())
        if self.__was_flaky:
            flaky += 1

    # If we only have one example then we should have raised an error or
    # flaky prior to this point.
    assert len(self.falsifying_examples) > 1

    if flaky > 0:
        raise Flaky((
            'Hypothesis found %d distinct failures, but %d of them '
            'exhibited some sort of flaky behaviour.') % (
                len(self.falsifying_examples), flaky))
    else:
        raise MultipleFailures((
            'Hypothesis found %d distinct failures.') % (
                len(self.falsifying_examples,)))
def wrapped_test(*arguments, **kwargs):
    """Execute the wrapped test: explicit examples, health check, then search.

    The steps, in order:

    1. Pick the source of randomness (explicit seed, derandomized digest
       of the test, or a fresh random).
    2. Run every explicit example, if the ``explicit`` phase is enabled.
       A failing example is reported and its exception re-raised.
    3. Optionally run a health check on data generation, which may raise
       ``FailedHealthCheck``.
    4. Run the Conjecture engine. Raises ``Timeout`` or ``Unsatisfiable``
       when too few valid examples were produced. When a falsifying
       example is found it is replayed with printing enabled, which is
       expected to re-raise the test's own error; otherwise the failure is
       reported as ``Flaky``.

    Returns None when no failure is found or generation is disabled.
    """
    settings = wrapped_test._hypothesis_internal_use_settings

    if wrapped_test._hypothesis_internal_use_seed is not None:
        random = Random(
            wrapped_test._hypothesis_internal_use_seed)
    elif settings.derandomize:
        random = Random(function_digest(test))
    else:
        random = new_random()

    import hypothesis.strategies as sd

    selfy = None
    arguments, kwargs = convert_positional_arguments(
        wrapped_test, arguments, kwargs)

    # If the test function is a method of some kind, the bound object
    # will be the first named argument if there are any, otherwise the
    # first vararg (if any).
    if argspec.args:
        selfy = kwargs.get(argspec.args[0])
    elif arguments:
        selfy = arguments[0]
    test_runner = new_style_executor(selfy)

    for example in reversed(getattr(
        wrapped_test, 'hypothesis_explicit_examples', ()
    )):
        if example.args:
            if len(example.args) > len(original_argspec.args):
                raise InvalidArgument(
                    'example has too many arguments for test. '
                    'Expected at most %d but got %d' % (
                        len(original_argspec.args), len(example.args)))
            # Positional example values bind to the trailing arguments.
            example_kwargs = dict(zip(
                original_argspec.args[-len(example.args):],
                example.args
            ))
        else:
            # Copy before merging in the call's own kwargs below. Updating
            # example.kwargs in place would permanently alter the registered
            # example and keep the caller's arguments alive between calls.
            example_kwargs = dict(example.kwargs)
        if Phase.explicit not in settings.phases:
            continue
        example_kwargs.update(kwargs)
        # Note: Test may mutate arguments and we can't rerun explicit
        # examples, so we have to calculate the failure message at this
        # point rather than later.
        message_on_failure = 'Falsifying example: %s(%s)' % (
            test.__name__, arg_string(test, arguments, example_kwargs)
        )
        try:
            with BuildContext(None) as b:
                test_runner(
                    None,
                    lambda data: test(*arguments, **example_kwargs)
                )
        except BaseException:
            traceback.print_exc()
            report(message_on_failure)
            for n in b.notes:
                report(n)
            raise
    if settings.max_examples <= 0:
        return

    arguments = tuple(arguments)

    # Strategy producing (positional args, keyword args) for one test call:
    # the positional args are fixed, the generated kwargs are merged with
    # the ones supplied by the caller.
    given_specifier = sd.tuples(
        sd.just(arguments),
        sd.fixed_dictionaries(generator_kwargs).map(
            lambda args: dict(args, **kwargs)
        )
    )

    def fail_health_check(message, label):
        """Raise FailedHealthCheck unless ``label`` is suppressed."""
        if label in settings.suppress_health_check:
            return
        message += (
            '\nSee https://hypothesis.readthedocs.io/en/latest/health'
            'checks.html for more information about this. '
        )
        message += (
            'If you want to disable just this health check, add %s '
            'to the suppress_health_check settings for this test.'
        ) % (label,)
        raise FailedHealthCheck(message)

    search_strategy = given_specifier
    if selfy is not None:
        search_strategy = WithRunner(search_strategy, selfy)

    search_strategy.validate()

    perform_health_check = settings.perform_health_check
    perform_health_check &= Settings.default.perform_health_check

    from hypothesis.internal.conjecture.data import ConjectureData, \
        Status, StopTest
    if not (
        Phase.reuse in settings.phases or
        Phase.generate in settings.phases
    ):
        return

    if perform_health_check:
        health_check_random = Random(random.getrandbits(128))
        # We "pre warm" the health check with one draw to give it some
        # time to calculate any cached data. This prevents the case
        # where the first draw of the health check takes ages because
        # of loading unicode data the first time.
        data = ConjectureData(
            max_length=settings.buffer_size,
            draw_bytes=lambda data, n, distribution:
            distribution(health_check_random, n)
        )
        with Settings(settings, verbosity=Verbosity.quiet):
            try:
                test_runner(data, reify_and_execute(
                    search_strategy,
                    lambda *args, **kwargs: None,
                ))
            except BaseException:
                # Deliberately best-effort: the warm-up draw only exists to
                # populate caches, so its outcome is ignored.
                pass
        count = 0
        overruns = 0
        filtered_draws = 0
        start = time.time()
        # Draw until 10 examples were generated, one second passed, or one
        # of the failure counters reached its limit.
        while (
            count < 10 and time.time() < start + 1 and
            filtered_draws < 50 and overruns < 20
        ):
            try:
                data = ConjectureData(
                    max_length=settings.buffer_size,
                    draw_bytes=lambda data, n, distribution:
                    distribution(health_check_random, n)
                )
                with Settings(settings, verbosity=Verbosity.quiet):
                    test_runner(data, reify_and_execute(
                        search_strategy,
                        lambda *args, **kwargs: None,
                    ))
                count += 1
            except UnsatisfiedAssumption:
                filtered_draws += 1
            except StopTest:
                if data.status == Status.INVALID:
                    filtered_draws += 1
                else:
                    assert data.status == Status.OVERRUN
                    overruns += 1
            except InvalidArgument:
                raise
            except Exception:
                if (
                    HealthCheck.exception_in_generation in
                    settings.suppress_health_check
                ):
                    raise
                report(traceback.format_exc())
                if test_runner is default_new_style_executor:
                    fail_health_check(
                        'An exception occurred during data '
                        'generation in initial health check. '
                        'This indicates a bug in the strategy. '
                        'This could either be a Hypothesis bug or '
                        "an error in a function you've passed to "
                        'it to construct your data.',
                        HealthCheck.exception_in_generation,
                    )
                else:
                    fail_health_check(
                        'An exception occurred during data '
                        'generation in initial health check. '
                        'This indicates a bug in the strategy. '
                        'This could either be a Hypothesis bug or '
                        'an error in a function you\'ve passed to '
                        'it to construct your data. Additionally, '
                        'you have a custom executor, which means '
                        'that this could be your executor failing '
                        'to handle a function which returns None. ',
                        HealthCheck.exception_in_generation,
                    )
        if overruns >= 20 or (
            not count and overruns > 0
        ):
            fail_health_check((
                'Examples routinely exceeded the max allowable size. '
                '(%d examples overran while generating %d valid ones)'
                '. Generating examples this large will usually lead to'
                ' bad results. You should try setting average_size or '
                'max_size parameters on your collections and turning '
                'max_leaves down on recursive() calls.') % (
                overruns, count
            ), HealthCheck.data_too_large)
        if filtered_draws >= 50 or (
            not count and filtered_draws > 0
        ):
            fail_health_check((
                'It looks like your strategy is filtering out a lot '
                'of data. Health check found %d filtered examples but '
                'only %d good ones. This will make your tests much '
                'slower, and also will probably distort the data '
                'generation quite a lot. You should adapt your '
                'strategy to filter less. This can also be caused by '
                'a low max_leaves parameter in recursive() calls') % (
                filtered_draws, count
            ), HealthCheck.filter_too_much)
        runtime = time.time() - start
        if runtime > 1.0 or count < 10:
            # The message previously rendered as "e.g.average_size" because
            # the space between the two adjacent literals was missing.
            fail_health_check((
                'Data generation is extremely slow: Only produced '
                '%d valid examples in %.2f seconds (%d invalid ones '
                'and %d exceeded maximum size). Try decreasing '
                "size of the data you're generating (with e.g. "
                'average_size or max_leaves parameters).'
            ) % (count, runtime, filtered_draws, overruns),
                HealthCheck.too_slow,
            )
    # One-element lists act as mutable cells that evaluate_test_data can
    # write to from its nested scope.
    last_exception = [None]
    # NOTE(review): nothing visible in this function ever assigns
    # repr_for_last_exception[0], so test_is_flaky below receives None.
    # Confirm whether that is intended.
    repr_for_last_exception = [None]

    def evaluate_test_data(data):
        """Run the test on ``data`` and mark the data with the outcome."""
        try:
            result = test_runner(data, reify_and_execute(
                search_strategy, test,
            ))
            if result is not None and settings.perform_health_check:
                fail_health_check((
                    'Tests run under @given should return None, but '
                    '%s returned %r instead.'
                ) % (test.__name__, result), HealthCheck.return_value)
            return False
        except UnsatisfiedAssumption:
            data.mark_invalid()
        except (
            HypothesisDeprecationWarning, FailedHealthCheck,
            StopTest,
        ):
            raise
        except Exception:
            last_exception[0] = traceback.format_exc()
            verbose_report(last_exception[0])
            data.mark_interesting()

    from hypothesis.internal.conjecture.engine import ConjectureRunner

    falsifying_example = None
    database_key = str_to_bytes(fully_qualified_name(test))
    start_time = time.time()
    runner = ConjectureRunner(
        evaluate_test_data, settings=settings, random=random,
        database_key=database_key,
    )
    runner.run()
    note_engine_for_statistics(runner)
    run_time = time.time() - start_time
    # A timeout setting that is not positive never counts as timed out.
    timed_out = (
        settings.timeout > 0 and
        run_time >= settings.timeout
    )
    if runner.last_data is None:
        return
    if runner.last_data.status == Status.INTERESTING:
        falsifying_example = runner.last_data.buffer
        if settings.database is not None:
            settings.database.save(
                database_key, falsifying_example
            )
    else:
        if runner.valid_examples < min(
            settings.min_satisfying_examples,
            settings.max_examples,
        ):
            if timed_out:
                raise Timeout((
                    'Ran out of time before finding a satisfying '
                    'example for '
                    '%s. Only found %d examples in ' +
                    '%.2fs.'
                ) % (
                    get_pretty_function_description(test),
                    runner.valid_examples, run_time
                ))
            else:
                raise Unsatisfiable((
                    'Unable to satisfy assumptions of hypothesis '
                    '%s. Only %d examples considered '
                    'satisfied assumptions'
                ) % (
                    get_pretty_function_description(test),
                    runner.valid_examples,))
        return

    assert last_exception[0] is not None

    # First replay: run the real test on the falsifying buffer. Normally
    # this raises the test's own error, which propagates to the caller.
    try:
        with settings:
            test_runner(
                ConjectureData.for_buffer(falsifying_example),
                reify_and_execute(
                    search_strategy, test,
                    print_example=True, is_final=True
                ))
    except (UnsatisfiedAssumption, StopTest):
        report(traceback.format_exc())
        raise Flaky(
            'Unreliable assumption: An example which satisfied '
            'assumptions on the first run now fails it.'
        )

    # Only reached if the first replay did not raise.
    report(
        'Failed to reproduce exception. Expected: \n' +
        last_exception[0],
    )

    filter_message = (
        'Unreliable test data: Failed to reproduce a failure '
        'and then when it came to recreating the example in '
        'order to print the test data with a flaky result '
        'the example was filtered out (by e.g. a '
        'call to filter in your strategy) when we didn\'t '
        'expect it to be.'
    )

    # Second replay: same buffer, but the test body is replaced by the
    # callable built by test_is_flaky.
    try:
        test_runner(
            ConjectureData.for_buffer(falsifying_example),
            reify_and_execute(
                search_strategy,
                test_is_flaky(test, repr_for_last_exception[0]),
                print_example=True, is_final=True
            ))
    except (UnsatisfiedAssumption, StopTest):
        raise Flaky(filter_message)
def inconsistent_generation():
    """Raise ``Flaky`` to signal that data generation differed between runs."""
    explanation = "".join([
        "Inconsistent data generation! Data generation behaved differently ",
        "between different runs. Is your data generation depending on external ",
        "state?",
    ])
    raise Flaky(explanation)
def run(self):
    """Run the Conjecture engine on the test, then replay and report failures.

    A ``ConjectureRunner`` is built around ``self.evaluate_test_data`` and
    run. ``self.used_examples_from_database`` is copied from the runner even
    if the run raises. When the runner found interesting examples they are
    stored on ``self.falsifying_examples`` (ordered by ``sort_key`` of their
    buffer, largest first) and each one is re-executed through
    ``self.execute`` with ``is_final=True`` so that it gets reported.

    Returns:
        None. Returns early when the runner made no calls or when there are
        no falsifying examples.

    Raises:
        Unsatisfiable: the runner found no interesting examples and
            ``runner.valid_examples`` is zero.
        Flaky: several falsifying examples were replayed and at least one of
            them set ``self.__was_flaky``.
        MultipleFailures: several falsifying examples were replayed and none
            of them was flagged as flaky.
        BaseException: with a single falsifying example, whatever
            ``self.execute`` raises is propagated unchanged.
    """
    # Tell pytest to omit the body of this function from tracebacks
    __tracebackhide__ = True
    if global_force_seed is None:
        database_key = function_digest(self.test)
    else:
        database_key = None
    runner = ConjectureRunner(
        self.evaluate_test_data,
        settings=self.settings,
        random=self.random,
        database_key=database_key,
    )
    try:
        runner.run()
    finally:
        # Recorded in ``finally`` so the attribute is set even when the
        # engine run raises; no second assignment is needed afterwards.
        self.used_examples_from_database = runner.used_examples_from_database
    note_engine_for_statistics(runner)

    if runner.call_count == 0:
        return
    if runner.interesting_examples:
        # ``sorted`` accepts any iterable, so the dict view is passed directly
        # instead of being copied into an intermediate list first.
        self.falsifying_examples = sorted(
            runner.interesting_examples.values(),
            key=lambda d: sort_key(d.buffer),
            reverse=True,
        )
    elif runner.valid_examples == 0:
        raise Unsatisfiable(
            "Unable to satisfy assumptions of hypothesis %s."
            % (get_pretty_function_description(self.test), ))

    if not self.falsifying_examples:
        return

    self.failed_normally = True

    flaky = 0

    for falsifying_example in self.falsifying_examples:
        ran_example = ConjectureData.for_buffer(falsifying_example.buffer)
        self.__was_flaky = False
        assert falsifying_example.__expected_exception is not None
        try:
            self.execute(
                ran_example,
                print_example=True,
                is_final=True,
                expected_failure=(
                    falsifying_example.__expected_exception,
                    falsifying_example.__expected_traceback,
                ),
            )
        except (UnsatisfiedAssumption, StopTest):
            report(traceback.format_exc())
            self.__flaky(
                "Unreliable assumption: An example which satisfied "
                "assumptions on the first run now fails it.")
        except BaseException as e:
            # A lone failure is reported by letting it propagate; with
            # several failures each traceback is printed manually instead.
            if len(self.falsifying_examples) <= 1:
                raise
            tb = get_trimmed_traceback()
            report("".join(traceback.format_exception(type(e), e, tb)))
        finally:  # pragma: no cover
            # This section is in fact entirely covered by the tests in
            # test_reproduce_failure, but it seems to trigger a lovely set
            # of coverage bugs: The branches show up as uncovered (despite
            # definitely being covered - you can add an assert False else
            # branch to verify this and see it fail - and additionally the
            # second branch still complains about lack of coverage even if
            # you add a pragma: no cover to it!
            # See https://bitbucket.org/ned/coveragepy/issues/623/
            if self.settings.print_blob is not PrintSettings.NEVER:
                failure_blob = encode_failure(falsifying_example.buffer)
                # Have to use the example we actually ran, not the original
                # falsifying example! Otherwise we won't catch problems
                # where the repr of the generated example doesn't parse.
                can_use_repr = ran_example.can_reproduce_example_from_repr
                if self.settings.print_blob is PrintSettings.ALWAYS or (
                    self.settings.print_blob is PrintSettings.INFER
                    and self.settings.verbosity >= Verbosity.normal
                    and not can_use_repr
                    and len(failure_blob) < 200
                ):
                    report((
                        "\nYou can reproduce this example by temporarily "
                        "adding @reproduce_failure(%r, %r) as a decorator "
                        "on your test case") % (__version__, failure_blob))
            if self.__was_flaky:
                flaky += 1

    # If we only have one example then we should have raised an error or
    # flaky prior to this point.
    assert len(self.falsifying_examples) > 1

    if flaky > 0:
        raise Flaky(
            ("Hypothesis found %d distinct failures, but %d of them "
             "exhibited some sort of flaky behaviour.")
            % (len(self.falsifying_examples), flaky))
    raise MultipleFailures(
        ("Hypothesis found %d distinct failures.")
        % (len(self.falsifying_examples)))
def wrapped_test(*arguments, **kwargs):
    """Run the wrapped ``test`` on explicit examples, then on generated data.

    The steps, in order:

    1. Choose a ``Random`` from the forced seed, the derandomize setting, or
       ``new_random()``.
    2. Run every entry of ``wrapped_test.hypothesis_explicit_examples`` (in
       reverse order), reporting a "Falsifying example" message and
       re-raising if one fails.
    3. Optionally run a health check on data generation, reporting problems
       through ``fail_health_check``.
    4. Ask ``best_satisfying_template`` for a falsifying template and replay
       it with ``print_example=True``.

    Returns:
        None. Returns early when ``best_satisfying_template`` raises
        ``NoSuchExample``.

    Raises:
        FailedHealthCheck: from ``fail_health_check`` when
            ``settings.strict`` is set, or when the test returns a non-None
            value while ``settings.perform_health_check`` is set.
        Flaky: when replaying the falsifying template raises
            ``UnsatisfiedAssumption``.
        BaseException: anything raised by an explicit example or by the
            replayed falsifying template is propagated.
    """
    settings = wrapped_test._hypothesis_internal_use_settings
    if wrapped_test._hypothesis_internal_use_seed is not None:
        random = Random(wrapped_test._hypothesis_internal_use_seed)
    elif settings.derandomize:
        random = Random(function_digest(test))
    else:
        random = new_random()

    import hypothesis.strategies as sd

    selfy = None
    arguments, kwargs = convert_positional_arguments(
        wrapped_test, arguments, kwargs)

    # If the test function is a method of some kind, the bound object
    # will be the first named argument if there are any, otherwise the
    # first vararg (if any).
    if argspec.args:
        selfy = kwargs.get(argspec.args[0])
    elif arguments:
        selfy = arguments[0]
    test_runner = executor(selfy)

    for example in reversed(
            getattr(wrapped_test, 'hypothesis_explicit_examples', ())):
        if example.args:
            example_kwargs = dict(
                zip(original_argspec.args[-len(example.args):],
                    example.args))
        else:
            # Copy before merging: updating ``example.kwargs`` in place would
            # leak this call's kwargs into the stored explicit example and so
            # into every later invocation of the test.
            example_kwargs = dict(example.kwargs)

        example_kwargs.update(kwargs)
        # Note: Test may mutate arguments and we can't rerun explicit
        # examples, so we have to calculate the failure message at this
        # point rather than later.
        message_on_failure = 'Falsifying example: %s(%s)' % (
            test.__name__, arg_string(test, arguments, example_kwargs))
        try:
            with BuildContext() as b:
                test_runner(lambda: test(*arguments, **example_kwargs))
        except BaseException:
            report(message_on_failure)
            for n in b.notes:
                report(n)
            raise

    arguments = tuple(arguments)

    given_specifier = sd.tuples(
        sd.just(arguments),
        sd.fixed_dictionaries(generator_kwargs).map(
            lambda args: dict(args, **kwargs)))

    def fail_health_check(message):
        """Raise (strict mode) or warn with a ``FailedHealthCheck``."""
        message += (
            '\nSee http://hypothesis.readthedocs.org/en/latest/health'
            'checks.html for more information about this.')
        if settings.strict:
            raise FailedHealthCheck(message)
        else:
            warnings.warn(FailedHealthCheck(message))

    search_strategy = given_specifier
    search_strategy.validate()

    if settings.database:
        storage = settings.database.storage(fully_qualified_name(test))
    else:
        storage = None

    start = time.time()
    # One-element list so the nested function below can flip the flag.
    warned_random = [False]
    perform_health_check = settings.perform_health_check
    if Settings.default is not None:
        perform_health_check &= Settings.default.perform_health_check

    if perform_health_check:
        initial_state = getglobalrandomstate()
        health_check_random = Random(random.getrandbits(128))
        count = 0
        bad_draws = 0
        filtered_draws = 0
        errors = 0
        # Draw up to 10 examples with a no-op test body, giving up after one
        # second or after 50 filtered / bad draws.
        while (count < 10 and time.time() < start + 1 and
               filtered_draws < 50 and bad_draws < 50):
            try:
                with Settings(settings, verbosity=Verbosity.quiet):
                    test_runner(
                        reify_and_execute(
                            search_strategy,
                            search_strategy.draw_template(
                                health_check_random,
                                search_strategy.draw_parameter(
                                    health_check_random,
                                )),
                            lambda *args, **kwargs: None,
                        ))
                count += 1
            except BadTemplateDraw:
                bad_draws += 1
            except UnsatisfiedAssumption:
                filtered_draws += 1
            except Exception:
                # Only the first traceback is printed to avoid flooding.
                if errors == 0:
                    report(traceback.format_exc())
                errors += 1
                if test_runner is default_executor:
                    fail_health_check(
                        'An exception occurred during data '
                        'generation in initial health check. '
                        'This indicates a bug in the strategy. '
                        'This could either be a Hypothesis bug or '
                        "an error in a function you've passed to "
                        'it to construct your data.')
                else:
                    fail_health_check(
                        'An exception occurred during data '
                        'generation in initial health check. '
                        'This indicates a bug in the strategy. '
                        'This could either be a Hypothesis bug or '
                        'an error in a function you\'ve passed to '
                        'it to construct your data. Additionally, '
                        'you have a custom executor, which means '
                        'that this could be your executor failing '
                        'to handle a function which returns None. ')
        if filtered_draws >= 50:
            fail_health_check((
                'It looks like your strategy is filtering out a lot '
                'of data. Health check found %d filtered examples but '
                'only %d good ones. This will make your tests much '
                'slower, and also will probably distort the data '
                'generation quite a lot. You should adapt your '
                'strategy to filter less.') % (filtered_draws, count))
        if bad_draws >= 50:
            fail_health_check(
                'Hypothesis is struggling to generate examples. '
                'This is often a sign of a recursive strategy which '
                'fans out too broadly. If you\'re using recursive, '
                'try to reduce the size of the recursive step or '
                'increase the maximum permitted number of leaves.')
        runtime = time.time() - start
        if runtime > 1.0 or count < 10:
            fail_health_check(
                ('Data generation is extremely slow: Only produced '
                 '%d valid examples in %.2f seconds. Try decreasing '
                 "size of the data you're generating (with e.g. "
                 'average_size or max_leaves parameters).')
                % (count, runtime))
        if getglobalrandomstate() != initial_state:
            warned_random[0] = True
            fail_health_check(
                'Data generation depends on global random module. '
                'This makes results impossible to replay, which '
                'prevents Hypothesis from working correctly. '
                'If you want to use methods from random, use '
                'randoms() from hypothesis.strategies to get an '
                'instance of Random you can use. Alternatively, you '
                'can use the random_module() strategy to explicitly '
                'seed the random module.')

    # One-element lists so ``is_template_example`` can write to them.
    last_exception = [None]
    repr_for_last_exception = [None]

    def is_template_example(xs):
        """Return True if running ``test`` on template ``xs`` raised.

        The formatted traceback and recorded repr of a failing run are
        stored in ``last_exception`` / ``repr_for_last_exception``.
        """
        if perform_health_check and not warned_random[0]:
            initial_state = getglobalrandomstate()
        record_repr = [None]
        try:
            result = test_runner(
                reify_and_execute(
                    search_strategy,
                    xs,
                    test,
                    record_repr=record_repr,
                ))
            if result is not None and settings.perform_health_check:
                raise FailedHealthCheck(
                    ('Tests run under @given should return None, but '
                     '%s returned %r instead.') % (test.__name__, result),
                    settings)
            return False
        except (HypothesisDeprecationWarning, FailedHealthCheck,
                UnsatisfiedAssumption):
            raise
        except Exception:
            last_exception[0] = traceback.format_exc()
            repr_for_last_exception[0] = record_repr[0]
            verbose_report(last_exception[0])
            return True
        finally:
            # ``initial_state`` is only bound when the first two conditions
            # held on entry; the short-circuit order below relies on that.
            if (not warned_random[0] and perform_health_check and
                    getglobalrandomstate() != initial_state):
                warned_random[0] = True
                fail_health_check(
                    'Your test used the global random module. '
                    'This is unlikely to work correctly. You should '
                    'consider using the randoms() strategy from '
                    'hypothesis.strategies instead. Alternatively, '
                    'you can use the random_module() strategy to '
                    'explicitly seed the random module.')

    is_template_example.__name__ = test.__name__
    is_template_example.__qualname__ = qualname(test)

    with settings:
        falsifying_template = None
        try:
            falsifying_template = best_satisfying_template(
                search_strategy,
                random,
                is_template_example,
                settings,
                storage,
                start_time=start,
            )
        except NoSuchExample:
            return

        assert last_exception[0] is not None

        # Replay the falsifying template so the failure is raised to the
        # caller with the example printed.
        try:
            test_runner(
                reify_and_execute(search_strategy,
                                  falsifying_template,
                                  test,
                                  print_example=True,
                                  is_final=True))
        except UnsatisfiedAssumption:
            report(traceback.format_exc())
            raise Flaky(
                'Unreliable assumption: An example which satisfied '
                'assumptions on the first run now fails it.')

        # Reaching this point means the replay did not raise.
        report(
            'Failed to reproduce exception. Expected: \n' +
            last_exception[0],
        )

        try:
            test_runner(
                reify_and_execute(search_strategy,
                                  falsifying_template,
                                  test_is_flaky(
                                      test, repr_for_last_exception[0]),
                                  print_example=True,
                                  is_final=True))
        except UnsatisfiedAssumption:
            raise Flaky(
                'Unreliable test data: Failed to reproduce a failure '
                'and then when it came to recreating the example in '
                'order to print the test data with a flaky result '
                'the example was filtered out (by e.g. a '
                'call to filter in your strategy) when we didn\'t '
                'expect it to be.')
def run_engine(self):
    """Drive the test through the Conjecture engine and report failures.

    The engine is run with ``self._execute_once_for_engine`` on database and
    generated input. Interesting examples it finds are stored on
    ``self.falsifying_examples`` and each is replayed through
    ``self.execute_once`` with ``is_final=True``.

    Raises ``Unsatisfiable`` when there were no interesting examples and no
    valid ones, and ``Flaky`` or ``MultipleFailures`` after replaying more
    than one failure. A single failure propagates from ``execute_once``.
    """
    # Tell pytest to omit the body of this function from tracebacks
    __tracebackhide__ = True

    try:
        database_key = self.wrapped_test._hypothesis_internal_database_key
    except AttributeError:
        database_key = (
            function_digest(self.test) if global_force_seed is None else None
        )

    engine = ConjectureRunner(
        self._execute_once_for_engine,
        settings=self.settings,
        random=self.random,
        database_key=database_key,
    )
    # Use the Conjecture engine to run the test function many times
    # on different inputs.
    engine.run()
    note_engine_for_statistics(engine)

    if engine.call_count == 0:
        return

    if engine.interesting_examples:
        found = list(engine.interesting_examples.values())
        found.sort(key=lambda d: sort_key(d.buffer), reverse=True)
        self.falsifying_examples = found
    elif engine.valid_examples == 0:
        raise Unsatisfiable(
            "Unable to satisfy assumptions of hypothesis %s."
            % (get_pretty_function_description(self.test), ))

    if not self.falsifying_examples:
        return
    if not self.settings.report_multiple_bugs:
        # Pretend that we only found one failure, by discarding the others.
        del self.falsifying_examples[:-1]

    # The engine found one or more failures, so we need to reproduce and
    # report them.
    self.failed_normally = True
    flaky_failures = 0

    for failure in self.falsifying_examples:
        extra = failure.extra_information
        replay = ConjectureData.for_buffer(failure.buffer)
        self.__was_flaky = False
        assert extra.__expected_exception is not None
        try:
            self.execute_once(
                replay,
                print_example=not self.is_find,
                is_final=True,
                expected_failure=(
                    extra.__expected_exception,
                    extra.__expected_traceback,
                ),
            )
        except (UnsatisfiedAssumption, StopTest):
            report(traceback.format_exc())
            self.__flaky(
                "Unreliable assumption: An example which satisfied "
                "assumptions on the first run now fails it.")
        except BaseException as exc:
            if len(self.falsifying_examples) <= 1:
                # There is only one failure, so we can report it by raising
                # it directly.
                raise
            # We are reporting multiple failures, so we need to manually
            # print each exception's stack trace and information.
            trimmed = get_trimmed_traceback()
            report("".join(
                traceback.format_exception(type(exc), exc, trimmed)))
        finally:  # pragma: no cover
            # Mostly useful for ``find`` and ensuring that objects that
            # hold on to a reference to ``data`` know that it's now been
            # finished and they shouldn't attempt to draw more data from
            # it.
            replay.freeze()

            # This section is in fact entirely covered by the tests in
            # test_reproduce_failure, but it seems to trigger a lovely set
            # of coverage bugs: The branches show up as uncovered (despite
            # definitely being covered - you can add an assert False else
            # branch to verify this and see it fail - and additionally the
            # second branch still complains about lack of coverage even if
            # you add a pragma: no cover to it!
            # See https://bitbucket.org/ned/coveragepy/issues/623/
            if self.settings.print_blob:
                report(
                    ("\nYou can reproduce this example by temporarily "
                     "adding @reproduce_failure(%r, %r) as a decorator "
                     "on your test case")
                    % (__version__, encode_failure(failure.buffer)))
            if self.__was_flaky:
                flaky_failures += 1

    # If we only have one example then we should have raised an error or
    # flaky prior to this point.
    assert len(self.falsifying_examples) > 1

    if flaky_failures > 0:
        raise Flaky(
            ("Hypothesis found %d distinct failures, but %d of them "
             "exhibited some sort of flaky behaviour.")
            % (len(self.falsifying_examples), flaky_failures))
    raise MultipleFailures(
        ("Hypothesis found %d distinct failures.")
        % (len(self.falsifying_examples)))
def run(self):
    """Run the engine once, then replay the falsifying example if any.

    When the engine's last data is interesting its buffer is stored on
    ``self.falsifying_example`` (and saved to the settings database when one
    is configured). Otherwise the number of valid examples is checked and
    ``Timeout`` or ``Unsatisfiable`` may be raised. A stored falsifying
    example is then replayed with ``print_example=True``; if the replay does
    not raise, the test is run again wrapped by ``test_is_flaky``. ``Flaky``
    is raised when either replay hits ``UnsatisfiedAssumption`` or
    ``StopTest``.
    """
    # Tell pytest to omit the body of this function from tracebacks
    __tracebackhide__ = True

    key = str_to_bytes(fully_qualified_name(self.test))
    self.start_time = time.time()
    engine = ConjectureRunner(
        self.evaluate_test_data,
        settings=self.settings,
        random=self.random,
        database_key=key,
    )
    engine.run()
    note_engine_for_statistics(engine)
    elapsed = time.time() - self.start_time
    hit_timeout = engine.exit_reason == ExitReason.timeout

    if engine.last_data is None:
        return

    if engine.last_data.status == Status.INTERESTING:
        self.falsifying_example = engine.last_data.buffer
        if self.settings.database is not None:
            self.settings.database.save(key, self.falsifying_example)
    else:
        if hit_timeout:
            note_deprecation(
                ('Your tests are hitting the settings timeout (%.2fs). '
                 'This functionality will go away in a future release '
                 'and you should not rely on it. Instead, try setting '
                 'max_examples to be some value lower than %d (the number '
                 'of examples your test successfully ran here). Or, if you '
                 'would prefer your tests to run to completion, regardless '
                 'of how long they take, you can set the timeout value to '
                 'hypothesis.unlimited.')
                % (self.settings.timeout, engine.valid_examples),
                self.settings)
        too_few = engine.valid_examples < min(
            self.settings.min_satisfying_examples,
            self.settings.max_examples,
        )
        # A finished run with at least one success is acceptable even when
        # fewer examples than requested were valid.
        if too_few and not (
            engine.exit_reason == ExitReason.finished
            and self.at_least_one_success
        ):
            if hit_timeout:
                raise Timeout(
                    ('Ran out of time before finding a satisfying '
                     'example for '
                     '%s. Only found %d examples in '
                     '%.2fs.')
                    % (get_pretty_function_description(self.test),
                       engine.valid_examples,
                       elapsed))
            raise Unsatisfiable(
                ('Unable to satisfy assumptions of hypothesis '
                 '%s. Only %d examples considered '
                 'satisfied assumptions')
                % (get_pretty_function_description(self.test),
                   engine.valid_examples, ))

    if self.falsifying_example is None:
        return

    assert self.last_exception is not None

    # First replay: expected to raise the original failure to the caller.
    try:
        with self.settings:
            self.test_runner(
                ConjectureData.for_buffer(self.falsifying_example),
                reify_and_execute(
                    self.search_strategy,
                    self.test,
                    print_example=True,
                    is_final=True,
                ))
    except (UnsatisfiedAssumption, StopTest):
        report(traceback.format_exc())
        raise Flaky('Unreliable assumption: An example which satisfied '
                    'assumptions on the first run now fails it.')

    # Getting here means the replay did not raise.
    report('Failed to reproduce exception. Expected: \n' + self.last_exception)

    filtered_out_message = (
        'Unreliable test data: Failed to reproduce a failure '
        'and then when it came to recreating the example in '
        'order to print the test data with a flaky result '
        'the example was filtered out (by e.g. a '
        'call to filter in your strategy) when we didn\'t '
        'expect it to be.')

    # Second replay: run the test wrapped by ``test_is_flaky``.
    try:
        self.test_runner(
            ConjectureData.for_buffer(self.falsifying_example),
            reify_and_execute(
                self.search_strategy,
                test_is_flaky(self.test, self.repr_for_last_exception),
                print_example=True,
                is_final=True,
            ))
    except (UnsatisfiedAssumption, StopTest):
        raise Flaky(filtered_out_message)
def test_or_flaky(*args, **kwargs):
    """Always raise ``Flaky`` carrying ``test`` and the call's arguments."""
    call_arguments = (args, kwargs)
    raise Flaky(test, call_arguments)
def test_or_flaky(*args, **kwargs):
    """Always raise ``Flaky`` naming ``test`` and the falsifying ``example``.

    The positional and keyword arguments are accepted but not used.
    """
    template = (
        'Hypothesis %r produces unreliable results: %r falsified it on'
        ' the first call but did not on a subsequent one'
    )
    details = (get_pretty_function_description(test), example)
    raise Flaky(template % details)