def test_no_window_context_fails(self):
  expected_timestamp = timestamp.Timestamp(5)
  # Assuming the default window function is window.GlobalWindows.
  expected_window = window.GlobalWindow()

  class AddTimestampDoFn(beam.DoFn):
    def process(self, element):
      yield window.TimestampedValue(element, expected_timestamp)

  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_windows = [
      TestWindowedValue(kv, expected_timestamp, [expected_window])
      for kv in data
  ]
  before_identity = (
      pipeline
      | 'start' >> beam.Create(data)
      | 'add_timestamps' >> beam.ParDo(AddTimestampDoFn()))
  assert_that(
      before_identity,
      equal_to(expected_windows),
      label='before_identity',
      reify_windows=True)
  after_identity = (
      before_identity
      | 'window' >> beam.WindowInto(
          beam.transforms.util._IdentityWindowFn(coders.GlobalWindowCoder()))
      # This DoFn will return TimestampedValues, making
      # WindowFn.AssignContext passed to IdentityWindowFn
      # contain a window of None. IdentityWindowFn should
      # raise an exception.
      | 'add_timestamps2' >> beam.ParDo(AddTimestampDoFn()))
  assert_that(
      after_identity,
      equal_to(expected_windows),
      label='after_identity',
      reify_windows=True)
  with self.assertRaisesRegexp(ValueError,
                               r'window.*None.*add_timestamps2'):
    pipeline.run()

def test_read_messages_success(self, mock_pubsub):
  data = 'data'
  message_id = 'message_id'
  publish_time = '2018-03-12T13:37:01.234567Z'
  attributes = {'key': 'value'}
  payloads = [
      create_client_message(data, message_id, attributes, publish_time)
  ]
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp(1520861821.234567), [window.GlobalWindow()])
  ]
  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (
      p
      | ReadFromPubSub(
          'projects/fakeprj/topics/a_topic',
          None,
          'a_label',
          with_attributes=True))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()

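# A small arithmetic check (hypothetical helper, not from the test suite)
# showing where the expected Timestamp above comes from: it is simply the
# RFC 3339 publish_time converted to microseconds since the Unix epoch.
def _sketch_publish_time_to_timestamp(
    publish_time='2018-03-12T13:37:01.234567Z'):
  import datetime
  dt = datetime.datetime.strptime(publish_time, '%Y-%m-%dT%H:%M:%S.%fZ')
  # Integer microseconds avoid float rounding near microsecond precision.
  micros = (dt - datetime.datetime(1970, 1, 1)) // datetime.timedelta(
      microseconds=1)
  return timestamp.Timestamp(micros=micros)  # == Timestamp(1520861821.234567)
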
def test_basic_execution(self):
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements(['a', 'b', 'c'])
      .advance_watermark_to(20)
      .add_elements(['d'])
      .add_elements(['e'])
      .advance_processing_time(10)
      .advance_watermark_to(300)
      .add_elements([TimestampedValue('late', 12)])
      .add_elements([TimestampedValue('last', 310)])
      .advance_watermark_to_infinity())  # yapf: disable

  class RecordFn(beam.DoFn):
    def process(
        self,
        element=beam.DoFn.ElementParam,
        timestamp=beam.DoFn.TimestampParam):
      yield (element, timestamp)

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    my_record_fn = RecordFn()
    records = p | test_stream | beam.ParDo(my_record_fn)

    assert_that(
        records,
        equal_to([
            ('a', timestamp.Timestamp(10)),
            ('b', timestamp.Timestamp(10)),
            ('c', timestamp.Timestamp(10)),
            ('d', timestamp.Timestamp(20)),
            ('e', timestamp.Timestamp(20)),
            ('late', timestamp.Timestamp(12)),
            ('last', timestamp.Timestamp(310)),
        ]))

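# A minimal standalone sketch of the same TestStream pattern, assuming the
# imports used by the test above; the helper name is illustrative. It shows
# the one behavior the test relies on: a TimestampedValue carries its own
# event timestamp, which beam.DoFn.TimestampParam then observes downstream.
def _sketch_timestamp_capture():
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([TimestampedValue('x', 7)])
      .advance_watermark_to_infinity())
  with TestPipeline(options=options) as p:
    records = (
        p
        | stream
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
    assert_that(records, equal_to([('x', timestamp.Timestamp(7))]))
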
def clear(self):
  # Emit a sentinel timestamp one millisecond past the maximum valid Beam
  # timestamp so the cleared state cannot collide with a real event time.
  dummy_millis = int(common_urns.constants.MAX_TIMESTAMP_MILLIS.constant) + 1
  clear_ts = timestamp.Timestamp(micros=dummy_millis * 1000)
  self._receiver.receive(
      windowed_value.WindowedValue(
          (self._key, dict(timestamp=clear_ts)), 0, (self._window, )))

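# A sanity sketch for the sentinel above, assuming timestamp.MAX_TIMESTAMP is
# derived from the same MAX_TIMESTAMP_MILLIS constant: the cleared timestamp
# deliberately lands past every valid Beam timestamp.
def _sketch_clear_sentinel():
  dummy_millis = int(common_urns.constants.MAX_TIMESTAMP_MILLIS.constant) + 1
  clear_ts = timestamp.Timestamp(micros=dummy_millis * 1000)
  assert clear_ts > timestamp.MAX_TIMESTAMP
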
def test_triggering_frequency(self, is_streaming, with_auto_sharding):
  destination = 'project1:dataset1.table1'

  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  # Insert a fake clock to work with auto-sharding which needs a processing
  # time timer.
  class _FakeClock(object):
    def __init__(self, now=time.time()):
      self._now = now

    def __call__(self):
      return self._now

  start_time = timestamp.Timestamp(0)
  bq_client.test_clock = _FakeClock(now=start_time)

  triggering_frequency = 20 if is_streaming else None
  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      temp_file_format=bigquery_tools.FileFormat.JSON,
      is_streaming_pipeline=is_streaming,
      triggering_frequency=triggering_frequency,
      with_auto_sharding=with_auto_sharding)

  # Need to test this with the DirectRunner to avoid serializing mocks.
  with TestPipeline(
      runner='BundleBasedDirectRunner',
      options=StandardOptions(streaming=is_streaming)) as p:
    if is_streaming:
      _SIZE = len(_ELEMENTS)
      first_batch = [
          TimestampedValue(value, start_time + i + 1)
          for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
      ]
      second_batch = [
          TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
          for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
      ]
      # Advance processing time between batches of input elements to fire
      # the user triggers. Intentionally advance the processing time twice
      # for the auto-sharding case since we need to first fire the timer and
      # then fire the trigger.
      test_stream = (
          TestStream()
          .advance_watermark_to(start_time)
          .add_elements(first_batch)
          .advance_processing_time(30)
          .advance_processing_time(30)
          .add_elements(second_batch)
          .advance_processing_time(30)
          .advance_processing_time(30)
          .advance_watermark_to_infinity())
      input = p | test_stream
    else:
      input = p | beam.Create(_ELEMENTS)

    outputs = input | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())
    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    # Check that all files exist.
    _ = (
        files
        | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # Expect two load jobs are generated in the streaming case due to the
    # triggering frequency. Grouping is per trigger so we expect two entries
    # in the output as opposed to one.
    file_count = files | combiners.Count.Globally().without_defaults()
    expected_file_count = [1, 1] if is_streaming else [1]
    expected_destinations = (
        [destination, destination] if is_streaming else [destination])
    expected_jobs = (
        [job_reference, job_reference] if is_streaming else [job_reference])

    assert_that(file_count, equal_to(expected_file_count), label='CountFiles')
    assert_that(
        destinations,
        equal_to(expected_destinations),
        label='CheckDestinations')
    assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')

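# A standalone sketch of the fake-clock pattern used above (illustrative
# name, standard library only): freezing "now" makes processing-time timers
# fire deterministically in tests instead of following the wall clock.
class _SketchClock(object):
  def __init__(self, now=0.0):
    self._now = now

  def __call__(self):
    # Callers treat the clock as a zero-argument time source, like time.time.
    return self._now

  def advance(self, seconds):
    self._now += seconds
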
def test_track_timestamp(self):
  estimator = ThreadsafeWatermarkEstimator(ManualWatermarkEstimator(None))
  estimator.observe_timestamp(timestamp.Timestamp(10))
  self.assertIsNone(estimator.current_watermark())
  estimator.set_watermark(timestamp.Timestamp(20))
  self.assertEqual(estimator.current_watermark(), timestamp.Timestamp(20))

def test_get_estimator_state(self):
  estimator = ThreadsafeWatermarkEstimator(ManualWatermarkEstimator(None))
  self.assertIsNone(estimator.get_estimator_state())
  estimator.set_watermark(timestamp.Timestamp(10))
  self.assertEqual(estimator.get_estimator_state(), timestamp.Timestamp(10))

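# A minimal usage sketch covering both tests above, assuming the same
# ManualWatermarkEstimator semantics they exercise: observe_timestamp does
# not move the manual estimator, and only set_watermark advances the
# reported watermark (which doubles as the estimator state).
def _sketch_manual_watermark():
  est = ThreadsafeWatermarkEstimator(ManualWatermarkEstimator(None))
  est.observe_timestamp(timestamp.Timestamp(10))  # ignored by design
  est.set_watermark(timestamp.Timestamp(20))
  assert est.current_watermark() == timestamp.Timestamp(20)
  assert est.get_estimator_state() == timestamp.Timestamp(20)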