def verify_trace(self,
                 trace,
                 pipeline_start_date,
                 pipeline_end_date,
                 expected_chunks,
                 empty=False):
    """Assert that a recorded progress trace has the expected overall shape.

    Parameters
    ----------
    trace : sequence
        Recorded progress updates, in publication order. The first and
        last entries are compared against
        ``TestingProgressPublisher.TraceState`` values.
    pipeline_start_date, pipeline_end_date
        Bounds of the whole execution; the first and last updates are
        expected to report them as their ``execution_bounds``.
    expected_chunks : list of (start, stop) pairs
        Chunk bounds, in order, that the intermediate updates are expected
        to walk through.
    empty : bool, optional
        Whether a completely empty pipeline was run. In that case the final
        update carries no work and every intermediate update must still be
        in the 'init' state with no work.
    """
    execution_bounds = (pipeline_start_date, pipeline_end_date)

    # Progress may stall between consecutive publishes, but it must never
    # move backwards.
    for earlier, later in toolz.sliding_window(2, trace):
        self.assertGreaterEqual(
            later.percent_complete,
            earlier.percent_complete,
        )

    # The opening publish is emitted at the start of the first chunk, before
    # any work has been picked up.
    self.assertEqual(
        trace[0],
        TestingProgressPublisher.TraceState(
            state='init',
            percent_complete=0.0,
            execution_bounds=execution_bounds,
            current_chunk_bounds=expected_chunks[0],
            current_work=None,
        ),
    )

    # The closing publish reports success at 100%.
    #
    # The exact final work item is unknown, but it has to be a single
    # ComputableTerm: ComputableTerms are run one at a time, and a
    # LoadableTerm only appears in the graph when some ComputableTerm
    # depends on it. A completely empty pipeline is the one exception: its
    # final work is None.
    final_update = trace[-1]
    if empty:
        final_work = None
    else:
        final_work = [instance_of(ComputableTerm)]
    self.assertEqual(
        final_update,
        TestingProgressPublisher.TraceState(
            state='success',
            percent_complete=100.0,
            execution_bounds=execution_bounds,
            current_chunk_bounds=expected_chunks[-1],
            current_work=final_work,
        ),
    )

    # Everything between the first and last publish.
    intermediate = trace[1:-1]
    if empty:
        # An empty pipeline never leaves the 'init' state.
        for update in intermediate:
            self.assertEqual(update.state, 'init')
            self.assertIs(update.current_work, None)
    else:
        # Otherwise every intermediate update is either a load or a compute,
        # and its work is a list of terms of the matching kind.
        for update in intermediate:
            if update.state == 'loading':
                allowed_types = (LoadableTerm, AssetExists)
            elif update.state == 'computing':
                allowed_types = ComputableTerm
            else:
                raise AssertionError(
                    f"Unexpected state: {update.state}",
                )

            self.assertIsInstance(update.current_work, list)
            for term in update.current_work:
                self.assertIsInstance(term, allowed_types)

    # Split the intermediate updates into runs that share chunk bounds, and
    # check the progress reported at the end of each run.
    seen_chunks = []
    by_chunk = itertools.groupby(
        intermediate,
        attrgetter('current_chunk_bounds'),
    )
    for (chunk_start, chunk_stop), chunk_updates in by_chunk:
        seen_chunks.append((chunk_start, chunk_stop))

        chunk_updates = list(chunk_updates)
        expected_progress = self.expected_chunk_progress(
            pipeline_start_date,
            pipeline_end_date,
            chunk_stop,
        )
        reported_progress = chunk_updates[-1].percent_complete
        assert_almost_equal(reported_progress, expected_progress)

    self.assertEqual(seen_chunks, expected_chunks)
def verify_trace(self,
                 trace,
                 pipeline_start_date,
                 pipeline_end_date,
                 expected_loads,
                 expected_computes,
                 expected_chunks):
    """Assert that a progress trace with prepopulated terms looks right.

    Parameters
    ----------
    trace : sequence
        Recorded progress updates, in publication order. The first and
        last entries are compared against
        ``TestingProgressPublisher.TraceState`` values.
    pipeline_start_date, pipeline_end_date
        Bounds of the whole execution; the first and last updates are
        expected to report them as their ``execution_bounds``.
    expected_loads, expected_computes
        Accepted as part of the signature; not consulted by this method's
        body.
    expected_chunks : list of (start, stop) pairs
        Chunk bounds, in order, that the intermediate updates are expected
        to walk through.
    """
    execution_bounds = (pipeline_start_date, pipeline_end_date)

    # Progress may stall between consecutive publishes, but it must never
    # move backwards.
    for earlier, later in toolz.sliding_window(2, trace):
        self.assertGreaterEqual(
            later.percent_complete,
            earlier.percent_complete,
        )

    # The opening publish is a load for the first chunk that already carries
    # the precomputed terms, so some progress has been made by then.
    opening_update = trace[0]
    self.assertEqual(
        opening_update,
        TestingProgressPublisher.TraceState(
            state='loading',
            percent_complete=instance_of(float),
            execution_bounds=execution_bounds,
            current_chunk_bounds=expected_chunks[0],
            current_work=instance_of(list),
        ),
    )
    self.assertGreater(opening_update.percent_complete, 0.0)
    self.assertEqual(
        set(opening_update.current_work),
        {AssetExists(), PREPOPULATED_TERM},
    )

    # The closing publish reports success at 100%.
    #
    # The exact final work item is unknown, but it has to be a single
    # ComputableTerm: ComputableTerms are run one at a time, and a
    # LoadableTerm only appears in the graph when some ComputableTerm
    # depends on it.
    closing_update = trace[-1]
    self.assertEqual(
        closing_update,
        TestingProgressPublisher.TraceState(
            state='success',
            percent_complete=100.0,
            execution_bounds=execution_bounds,
            current_chunk_bounds=expected_chunks[-1],
            current_work=[instance_of(ComputableTerm)],
        ),
    )

    # Every update between the first and last publish is either a load or a
    # compute, and its work is a list of terms of the matching kind.
    intermediate = trace[1:-1]
    for update in intermediate:
        self.assertIsInstance(update.current_work, list)

        if update.state == 'loading':
            allowed_types = (LoadableTerm, AssetExists, PrepopulatedFactor)
        elif update.state == 'computing':
            allowed_types = ComputableTerm
        else:
            raise AssertionError(
                "Unexpected state: {}".format(update.state),
            )

        for term in update.current_work:
            self.assertIsInstance(term, allowed_types)

    # Split the intermediate updates into runs that share chunk bounds, and
    # check the progress reported at the end of each run.
    seen_chunks = []
    by_chunk = itertools.groupby(
        intermediate,
        attrgetter('current_chunk_bounds'),
    )
    for (chunk_start, chunk_stop), chunk_updates in by_chunk:
        seen_chunks.append((chunk_start, chunk_stop))

        chunk_updates = list(chunk_updates)
        expected_progress = self.expected_chunk_progress(
            pipeline_start_date,
            pipeline_end_date,
            chunk_stop,
        )
        reported_progress = chunk_updates[-1].percent_complete
        assert_almost_equal(reported_progress, expected_progress)

    self.assertEqual(seen_chunks, expected_chunks)