Example #1
File: util_test.py  Project: yarbelk/beam
    def test_no_window_context_fails(self):
        expected_timestamp = timestamp.Timestamp(5)
        # Assuming the default window function is window.GlobalWindows.
        expected_window = window.GlobalWindow()

        class AddTimestampDoFn(beam.DoFn):
            def process(self, element):
                yield window.TimestampedValue(element, expected_timestamp)

        pipeline = TestPipeline()
        data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
        expected_windows = [
            TestWindowedValue(kv, expected_timestamp, [expected_window])
            for kv in data
        ]
        before_identity = (
            pipeline
            | 'start' >> beam.Create(data)
            | 'add_timestamps' >> beam.ParDo(AddTimestampDoFn()))
        assert_that(before_identity,
                    equal_to(expected_windows),
                    label='before_identity',
                    reify_windows=True)
        after_identity = (
            before_identity
            | 'window' >> beam.WindowInto(
                beam.transforms.util._IdentityWindowFn(
                    coders.GlobalWindowCoder()))
            # This DoFn returns TimestampedValues, so the
            # WindowFn.AssignContext passed to IdentityWindowFn will
            # contain a window of None, and IdentityWindowFn should
            # raise an exception.
            | 'add_timestamps2' >> beam.ParDo(AddTimestampDoFn()))
        assert_that(after_identity,
                    equal_to(expected_windows),
                    label='after_identity',
                    reify_windows=True)
        with self.assertRaisesRegex(
                ValueError, r'window.*None.*add_timestamps2'):
            pipeline.run()
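These snippets come from Beam's own test suite, so the surrounding import block is not shown. A minimal sketch of the imports Example #1 appears to rely on, assuming recent Apache Beam module paths (treat the exact list as an assumption rather than the original file's header):

import apache_beam as beam
from apache_beam import coders
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import TestWindowedValue, assert_that, equal_to
from apache_beam.transforms import window
from apache_beam.utils import timestamp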
Example #2
  def test_read_messages_success(self, mock_pubsub):
    data = 'data'
    message_id = 'message_id'
    publish_time = '2018-03-12T13:37:01.234567Z'
    attributes = {'key': 'value'}
    payloads = [create_client_message(
        data, message_id, attributes, publish_time)]
    expected_elements = [
        TestWindowedValue(PubsubMessage(data, attributes),
                          timestamp.Timestamp(1520861821.234567),
                          [window.GlobalWindow()])]

    mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    pcoll = (p
             | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                              None, 'a_label', with_attributes=True))
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()
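The expected timestamp.Timestamp(1520861821.234567) is simply the RFC 3339 publish time '2018-03-12T13:37:01.234567Z' converted to Unix seconds. A quick stdlib-only sanity check of that conversion (illustrative, no Beam dependency):

from datetime import datetime, timezone

# 2018-03-12T13:37:01.234567Z as an aware datetime.
publish_time = datetime(2018, 3, 12, 13, 37, 1, 234567, tzinfo=timezone.utc)
unix_seconds = publish_time.timestamp()
assert abs(unix_seconds - 1520861821.234567) < 1e-6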
Example #3
  def test_basic_execution(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)])
                   .advance_watermark_to_infinity())  # yapf: disable

    class RecordFn(beam.DoFn):
      def process(
          self,
          element=beam.DoFn.ElementParam,
          timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      my_record_fn = RecordFn()
      records = p | test_stream | beam.ParDo(my_record_fn)

      assert_that(
          records,
          equal_to([
              ('a', timestamp.Timestamp(10)),
              ('b', timestamp.Timestamp(10)),
              ('c', timestamp.Timestamp(10)),
              ('d', timestamp.Timestamp(20)),
              ('e', timestamp.Timestamp(20)),
              ('late', timestamp.Timestamp(12)),
              ('last', timestamp.Timestamp(310)),
          ]))
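The integers passed to advance_watermark_to and Timestamp above are seconds since the Unix epoch, so 'late' (event time 12) arrives after the watermark has already advanced to 300, while 'last' (310) is ahead of it. A minimal sketch of the Timestamp/Duration arithmetic involved, assuming apache_beam.utils.timestamp (variable names are illustrative):

from apache_beam.utils.timestamp import Duration, Timestamp

watermark = Timestamp(300)   # watermark after advance_watermark_to(300)
late = Timestamp(12)         # event time of the 'late' element
last = Timestamp(310)        # event time of the 'last' element
assert late < watermark                           # behind the watermark
assert last > watermark                           # ahead of the watermark
assert late + Duration(seconds=288) == watermark  # Timestamp + Duration -> Timestamp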
Example #4
  def clear(self):
    # Emit a sentinel timestamp one millisecond past MAX_TIMESTAMP to
    # represent the clear operation.
    dummy_millis = int(common_urns.constants.MAX_TIMESTAMP_MILLIS.constant) + 1
    clear_ts = timestamp.Timestamp(micros=dummy_millis * 1000)
    self._receiver.receive(
        windowed_value.WindowedValue(
            (self._key, dict(timestamp=clear_ts)), 0, (self._window,)))
Example #5
    def test_triggering_frequency(self, is_streaming, with_auto_sharding):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job
        bq_client.jobs.Insert.return_value = result_job

        # Insert a fake clock to work with auto-sharding, which needs a
        # processing-time timer.
        class _FakeClock(object):
            def __init__(self, now=time.time()):
                self._now = now

            def __call__(self):
                return self._now

        start_time = timestamp.Timestamp(0)
        bq_client.test_clock = _FakeClock(now=start_time)

        triggering_frequency = 20 if is_streaming else None
        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            temp_file_format=bigquery_tools.FileFormat.JSON,
            is_streaming_pipeline=is_streaming,
            triggering_frequency=triggering_frequency,
            with_auto_sharding=with_auto_sharding)

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline(
                runner='BundleBasedDirectRunner',
                options=StandardOptions(streaming=is_streaming)) as p:
            if is_streaming:
                _SIZE = len(_ELEMENTS)
                first_batch = [
                    TimestampedValue(value, start_time + i + 1)
                    for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
                ]
                second_batch = [
                    TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
                    for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
                ]
                # Advance processing time between batches of input elements to fire the
                # user triggers. Intentionally advance the processing time twice for the
                # auto-sharding case since we need to first fire the timer and then
                # fire the trigger.
                test_stream = (
                    TestStream()
                    .advance_watermark_to(start_time)
                    .add_elements(first_batch)
                    .advance_processing_time(30)
                    .advance_processing_time(30)
                    .add_elements(second_batch)
                    .advance_processing_time(30)
                    .advance_processing_time(30)
                    .advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)
            outputs = input | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())
            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            # Check that all files exist.
            _ = (files
                 | beam.Map(
                     lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # Expect two load jobs to be generated in the streaming case due
            # to the triggering frequency. Grouping is per trigger, so we
            # expect two entries in the output instead of one.
            file_count = files | combiners.Count.Globally().without_defaults()
            expected_file_count = [1, 1] if is_streaming else [1]
            expected_destinations = ([destination, destination]
                                     if is_streaming else [destination])
            expected_jobs = ([job_reference, job_reference]
                             if is_streaming else [job_reference])
            assert_that(file_count,
                        equal_to(expected_file_count),
                        label='CountFiles')
            assert_that(destinations,
                        equal_to(expected_destinations),
                        label='CheckDestinations')
            assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')
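In the streaming branch above, event times are built by adding plain integers (interpreted as seconds) to a Timestamp and wrapping each element in a TimestampedValue. A small sketch of that pattern, assuming the usual Beam module paths (the element values here are made up):

from apache_beam.transforms.window import TimestampedValue
from apache_beam.utils.timestamp import Timestamp

start_time = Timestamp(0)
batch = [
    TimestampedValue(value, start_time + i + 1)   # ints added to a Timestamp are seconds
    for i, value in enumerate(['row-a', 'row-b'])
]
assert batch[1].timestamp == Timestamp(2)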
Example #6
  def test_track_timestamp(self):
    estimator = ThreadsafeWatermarkEstimator(ManualWatermarkEstimator(None))
    estimator.observe_timestamp(timestamp.Timestamp(10))
    self.assertIsNone(estimator.current_watermark())
    estimator.set_watermark(timestamp.Timestamp(20))
    self.assertEqual(estimator.current_watermark(), timestamp.Timestamp(20))
Example #7
  def test_get_estimator_state(self):
    estimator = ThreadsafeWatermarkEstimator(ManualWatermarkEstimator(None))
    self.assertIsNone(estimator.get_estimator_state())
    estimator.set_watermark(timestamp.Timestamp(10))
    self.assertEqual(estimator.get_estimator_state(), timestamp.Timestamp(10))
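Like the clear() snippet in Example #4, both watermark-estimator tests ultimately just construct and compare Timestamp values. A short sketch of the equivalences they rely on, assuming apache_beam.utils.timestamp (MAX_TIMESTAMP is that module's upper-bound constant):

from apache_beam.utils.timestamp import MAX_TIMESTAMP, Timestamp

assert Timestamp(5) == Timestamp(micros=5 * 1000000)  # seconds and micros name the same instant
assert Timestamp(20) > Timestamp(10)                  # Timestamps are totally ordered
assert Timestamp(310) < MAX_TIMESTAMP                 # concrete event times precede MAX_TIMESTAMP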