def test_gbk_execution(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['a', 'b', 'c'])
                 .advance_watermark_to(20)
                 .add_elements(['d'])
                 .add_elements(['e'])
                 .advance_processing_time(10)
                 .advance_watermark_to(300)
                 .add_elements([TimestampedValue('late', 12)])
                 .add_elements([TimestampedValue('last', 310)]))

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (p
             | test_stream
             | beam.WindowInto(FixedWindows(15))
             | beam.Map(lambda x: ('k', x))
             | beam.GroupByKey())
  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.
  assert_that(records, equal_to([
      ('k', ['a', 'b', 'c']),
      ('k', ['d', 'e']),
      ('k', ['late']),
      ('k', ['last'])]))
  p.run()
def test_setting_timestamp(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
  items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

  def extract_timestamp_from_log_entry(entry):
    return entry[1]

  # [START setting_timestamp]
  class AddTimestampDoFn(beam.DoFn):

    def process(self, element):
      # Extract the numeric Unix seconds-since-epoch timestamp to be
      # associated with the current log entry.
      unix_timestamp = extract_timestamp_from_log_entry(element)
      # Wrap and emit the current entry and new timestamp in a
      # TimestampedValue.
      yield beam.window.TimestampedValue(element, unix_timestamp)

  timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
  # [END setting_timestamp]

  fixed_windowed_items = (
      timestamped_items | 'window' >> beam.WindowInto(
          beam.window.FixedWindows(60)))
  summed = (fixed_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  assert_that(unkeyed, equal_to([42, 187]))
  p.run()
def test_timestamped_with_combiners(self):
  p = TestPipeline()
  result = (p
            # Create some initial test values.
            | 'start' >> Create([(k, k) for k in range(10)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows since the timestamps
            # from Create are not spaced 5 ms apart and very likely they all
            # fall into the same window.
            | 'w' >> WindowInto(FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # Now there are values 5 ms apart and since Map propagates the
            # windowing function from input to output the output PCollection
            # will have elements falling into different 5ms windows.
            | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
            # We add a 'key' to each value representing the index of the
            # window. This is important since there is no guarantee of
            # order for the elements of a PCollection.
            | Map(lambda v: (v // 5, v)))
  # Sum all elements associated with a key and window. Although it
  # is called CombinePerKey it is really CombinePerKeyAndWindow the
  # same way GroupByKey is really GroupByKeyAndWindow.
  sum_per_window = result | CombinePerKey(sum)
  # Compute mean per key and window.
  mean_per_window = result | combiners.Mean.PerKey()
  assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
              label='assert:sum')
  assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
              label='assert:mean')
  p.run()
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

  # Partition the PCollection into three pieces.
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1 = partitioned[0]
  pcoll2 = partitioned[1]
  pcoll3 = partitioned[2]

  # Flatten them back into 1
  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)
  # [START model_multiple_pcollections_flatten]
  merged = (
      (pcoll1, pcoll2, pcoll3)
      # A tuple (or list) of PCollections can be "piped" directly into a
      # Flatten transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)
  p.run()
def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.
  To override the expand method, define a method "expand" that takes a
  PCollection as its only parameter and returns a PCollection.
  """
  import re
  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              | beam.Map(lambda word_c: '%s: %s' % word_c))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()
def pipeline_logging(lines, output):
  """Logging Pipeline Messages."""
  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(lines)
   | beam.ParDo(ExtractWordsFn())
   | beam.io.WriteToText(output))
  p.run()
class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('IT')
  def test_streaming_wordcount_it(self):
    # Build expected dataset.
    expected_msg = [('%d: 1' % num) for num in range(DEFAULT_INPUT_NUMBERS)]

    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(self.project,
                                               OUTPUT_SUB + self.uuid,
                                               expected_msg,
                                               timeout=400)
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_read_from_text_file_pattern(self):
  pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
  assert len(expected_data) == 40
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(pattern)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i

  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  ([by_decile[d] for d in range(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))

  p.run()
def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
  pipeline = TestPipeline()
  pcoll = pipeline | beam.io.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
  pipeline.run()
def test_basic_execution_sideinputs(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (p
                 | 'main TestStream' >> TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['e']))
  side_stream = (p
                 | 'side TestStream' >> TestStream()
                 .add_elements([window.TimestampedValue(2, 2)])
                 .add_elements([window.TimestampedValue(1, 1)])
                 .add_elements([window.TimestampedValue(7, 7)])
                 .add_elements([window.TimestampedValue(4, 4)]))

  class RecordFn(beam.DoFn):

    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (main_stream  # pylint: disable=unused-variable
             | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

  assert_that(records, equal_to([('e', Timestamp(10), [2, 1, 7, 4])]))

  p.run()
def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  project = test_pipeline.get_option('project')
  dataset = 'BigQueryTornadoesIT'
  table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT month, tornado_count FROM `%s`' % output_table
  pipeline_verifiers = [PipelineStateMatcher(),
                        BigqueryMatcher(
                            project=project,
                            query=query,
                            checksum=self.DEFAULT_CHECKSUM)]
  extra_opts = {'output': output_table,
                'on_success_matcher': all_of(*pipeline_verifiers)}

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
def test_read_messages_timestamp_attribute_milli_success(self, mock_pubsub):
  data = b'data'
  attributes = {'time': '1337'}
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp(micros=int(attributes['time']) * 1000),
          [window.GlobalWindow()]),
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, None,
               with_attributes=True, timestamp_attribute='time'))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls([
      mock.call(mock.ANY, [ack_id])])
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple."""
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', '*****@*****.**') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary', '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones': an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', '*****@*****.**') and
  # ('joe', '*****@*****.**'), then 'result' will contain the element
  # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info(name_info):
    (name, info) = name_info
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
def test_read_messages_timestamp_attribute_rfc3339_success(self, mock_pubsub):
  data = 'data'
  message_id = 'message_id'
  attributes = {'time': '2018-03-12T13:37:01.234567Z'}
  publish_time = '2018-03-12T13:37:01.234567Z'
  payloads = [
      create_client_message(data, message_id, attributes, publish_time)]
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp.from_rfc3339(attributes['time']),
          [window.GlobalWindow()]),
  ]

  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, 'a_label',
               with_attributes=True, timestamp_attribute='time'))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
def test_on_direct_runner(self):
  class FakeSink(NativeSink):
    """A fake sink outputting a number of elements."""

    def __init__(self):
      self.written_values = []
      self.writer_instance = FakeSinkWriter(self.written_values)

    def writer(self):
      return self.writer_instance

  class FakeSinkWriter(NativeSinkWriter):
    """A fake sink writer for testing."""

    def __init__(self, written_values):
      self.written_values = written_values

    def __enter__(self):
      return self

    def __exit__(self, *unused_args):
      pass

    def Write(self, value):
      self.written_values.append(value)

  p = TestPipeline()
  sink = FakeSink()
  p | Create(['a', 'b', 'c']) | _NativeWrite(sink)  # pylint: disable=expression-not-assigned
  p.run()

  self.assertEqual(['a', 'b', 'c'], sink.written_values)
def test_basic_execution(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['a', 'b', 'c'])
                 .advance_watermark_to(20)
                 .add_elements(['d'])
                 .add_elements(['e'])
                 .advance_processing_time(10)
                 .advance_watermark_to(300)
                 .add_elements([TimestampedValue('late', 12)])
                 .add_elements([TimestampedValue('last', 310)]))

  class RecordFn(beam.DoFn):

    def process(self, element=beam.DoFn.ElementParam,
                timestamp=beam.DoFn.TimestampParam):
      yield (element, timestamp)

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  my_record_fn = RecordFn()
  records = p | test_stream | beam.ParDo(my_record_fn)
  assert_that(records, equal_to([
      ('a', timestamp.Timestamp(10)),
      ('b', timestamp.Timestamp(10)),
      ('c', timestamp.Timestamp(10)),
      ('d', timestamp.Timestamp(20)),
      ('e', timestamp.Timestamp(20)),
      ('late', timestamp.Timestamp(12)),
      ('last', timestamp.Timestamp(310)),
  ]))
  p.run()
def test_no_window_context_fails(self):
  expected_timestamp = timestamp.Timestamp(5)
  # Assuming the default window function is window.GlobalWindows.
  expected_window = window.GlobalWindow()

  class AddTimestampDoFn(beam.DoFn):

    def process(self, element):
      yield window.TimestampedValue(element, expected_timestamp)

  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_windows = [
      TestWindowedValue(kv, expected_timestamp, [expected_window])
      for kv in data]
  before_identity = (pipeline
                     | 'start' >> beam.Create(data)
                     | 'add_timestamps' >> beam.ParDo(AddTimestampDoFn()))
  assert_that(before_identity, equal_to(expected_windows),
              label='before_identity', reify_windows=True)
  after_identity = (before_identity
                    | 'window' >> beam.WindowInto(
                        beam.transforms.util._IdentityWindowFn(
                            coders.GlobalWindowCoder()))
                    # This DoFn will return TimestampedValues, making
                    # WindowFn.AssignContext passed to IdentityWindowFn
                    # contain a window of None. IdentityWindowFn should
                    # raise an exception.
                    | 'add_timestamps2' >> beam.ParDo(AddTimestampDoFn()))
  assert_that(after_identity, equal_to(expected_windows),
              label='after_identity', reify_windows=True)
  with self.assertRaisesRegexp(ValueError,
                               r'window.*None.*add_timestamps2'):
    pipeline.run()
def test_window_preserved(self):
  expected_timestamp = timestamp.Timestamp(5)
  expected_window = window.IntervalWindow(1.0, 2.0)

  class AddWindowDoFn(beam.DoFn):

    def process(self, element):
      yield WindowedValue(
          element, expected_timestamp, [expected_window])

  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_windows = [
      TestWindowedValue(kv, expected_timestamp, [expected_window])
      for kv in data]
  before_identity = (pipeline
                     | 'start' >> beam.Create(data)
                     | 'add_windows' >> beam.ParDo(AddWindowDoFn()))
  assert_that(before_identity, equal_to(expected_windows),
              label='before_identity', reify_windows=True)
  after_identity = (before_identity
                    | 'window' >> beam.WindowInto(
                        beam.transforms.util._IdentityWindowFn(
                            coders.IntervalWindowCoder())))
  assert_that(after_identity, equal_to(expected_windows),
              label='after_identity', reify_windows=True)
  pipeline.run()
def test_reshuffle_window_fn_preserved(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [
      ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
      ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
      ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
      ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
      ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
      ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]]
  expected_merged_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [
      ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
      ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
      ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
      ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]]
  before_reshuffle = (pipeline
                      | 'start' >> beam.Create(data)
                      | 'add_timestamp' >> beam.Map(
                          lambda v: TimestampedValue(v, v[1]))
                      | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
  assert_that(before_reshuffle, equal_to(expected_windows),
              label='before_reshuffle', reify_windows=True)
  after_reshuffle = (before_reshuffle | 'reshuffle' >> beam.Reshuffle())
  assert_that(after_reshuffle, equal_to(expected_windows),
              label='after_reshuffle', reify_windows=True)
  after_group = (after_reshuffle | 'group_by_key' >> beam.GroupByKey())
  assert_that(after_group, equal_to(expected_merged_windows),
              label='after_group', reify_windows=True)
  pipeline.run()
def run_bq_pipeline(argv=None):
  """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--query', required=True,
                      help='Query to process for the table.')
  parser.add_argument('--output', required=True,
                      help='Output BQ table to write results to.')
  parser.add_argument('--output_schema', dest='output_schema', required=True,
                      help='Schema for output BQ table.')
  parser.add_argument('--use_standard_sql', action='store_true',
                      dest='use_standard_sql',
                      help='Use standard SQL syntax for the query.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  table_schema = parse_table_schema_from_json(known_args.output_schema)

  p = TestPipeline(options=PipelineOptions(pipeline_args))

  # pylint: disable=expression-not-assigned
  # pylint: disable=bad-continuation
  (p | 'read' >> beam.io.Read(beam.io.BigQuerySource(
      query=known_args.query,
      use_standard_sql=known_args.use_standard_sql))
   | 'write' >> beam.io.Write(beam.io.BigQuerySink(
       known_args.output,
       schema=table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

  result = p.run()
  result.wait_until_finish()
def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()
def run_pipeline(self, count_implementation, factor=1):
  p = TestPipeline()
  words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
  result = words | count_implementation
  assert_that(
      result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
  p.run()
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join([self.test_pipeline.get_option('output'),
                            self.uuid,
                            'results'])

  @attr('IT')
  def test_user_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    file_verifier = FileChecksumMatcher(self.output + '*-of-*',
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'input': self.DEFAULT_INPUT_FILE,
                  'output': self.output + '/user-score',
                  'on_success_matcher': all_of(state_verifier,
                                               file_verifier)}

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_compute_top_sessions(self):
  p = TestPipeline()
  edits = p | beam.Create(self.EDITS)
  result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

  assert_that(result, equal_to(self.EXPECTED))
  p.run()
def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
  data = 'data'
  attributes = {}
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  publish_time = '2018-03-12T13:37:01.234567Z'
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp.from_rfc3339(publish_time),
          [window.GlobalWindow()]),
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, None,
               with_attributes=True, timestamp_attribute='nonexistent'))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls([
      mock.call(mock.ANY, [ack_id])])
def test_ptransform_override_type_hints(self):

  class NoTypeHintOverride(PTransformOverride):

    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      return ToStringParDo()

  class WithTypeHintOverride(PTransformOverride):

    def matches(self, applied_ptransform):
      return isinstance(applied_ptransform.transform, DoubleParDo)

    def get_replacement_transform(self, ptransform):
      return (ToStringParDo()
              .with_input_types(int)
              .with_output_types(str))

  for override, expected_type in [(NoTypeHintOverride(), typehints.Any),
                                  (WithTypeHintOverride(), str)]:
    p = TestPipeline()
    pcoll = (p
             | beam.Create([1, 2, 3])
             | 'Operate' >> DoubleParDo()
             | 'NoOp' >> beam.Map(lambda x: x))

    p.replace_all([override])
    self.assertEqual(pcoll.producer.inputs[0].element_type, expected_type)
def test_read_from_text_single_file(self):
  file_name, expected_data = write_data(5)
  assert len(expected_data) == 5
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def test_model_use_and_query_metrics(self):
  """DebuggingWordCount example snippets."""
  import re

  p = TestPipeline()  # Use TestPipeline for testing.
  words = p | beam.Create(['albert', 'sam', 'mark', 'sarah',
                           'swati', 'daniel', 'andrea'])

  # pylint: disable=unused-variable
  # [START metrics_usage_example]
  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regex."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics to count unmatched words, and know the distribution of
      # word lengths in the input PCollection.
      self.word_len_dist = Metrics.distribution(self.__class__,
                                                'word_len_dist')
      self.unmatched_words = Metrics.counter(self.__class__,
                                             'unmatched_words')

    def process(self, element):
      word = element
      self.word_len_dist.update(len(word))
      if re.match(self.pattern, word):
        yield element
      else:
        self.unmatched_words.inc()

  filtered_words = (
      words | 'FilterText' >> beam.ParDo(FilterTextFn('s.*')))
  # [END metrics_usage_example]
  # pylint: enable=unused-variable

  # [START metrics_check_values_example]
  result = p.run()
  result.wait_until_finish()

  custom_distribution = result.metrics().query(
      MetricsFilter().with_name('word_len_dist'))['distributions']
  custom_counter = result.metrics().query(
      MetricsFilter().with_name('unmatched_words'))['counters']

  if custom_distribution:
    logging.info('The average word length was %d',
                 custom_distribution[0].committed.mean)
  if custom_counter:
    logging.info('There were %d words that did not match the filter.',
                 custom_counter[0].committed)
  # [END metrics_check_values_example]

  # There should be 4 words that did not match
  self.assertEqual(custom_counter[0].committed, 4)
  # The shortest word is 3 characters, the longest is 6
  self.assertEqual(custom_distribution[0].committed.min, 3)
  self.assertEqual(custom_distribution[0].committed.max, 6)
def test_read_all_single_file(self):
  file_name, expected_data = write_data(5)
  assert len(expected_data) == 5
  pipeline = TestPipeline()
  pcoll = (pipeline
           | 'Create' >> Create([file_name])
           | 'ReadAll' >> ReadAllFromText())
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def setUpClass(cls):
  cls.test_pipeline = TestPipeline(is_integration_test=True)
  cls.args = cls.test_pipeline.get_full_options_as_args()
  cls.runner_name = type(cls.test_pipeline.runner).__name__
  cls.project = cls.test_pipeline.get_option('project')
def test_create_singleton_pcollection(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label' >> Create([[1, 2, 3]])
  assert_that(pcoll, equal_to([[1, 2, 3]]))
  pipeline.run()
def test_match_group_name_pattern(self):
  with TestPipeline() as p:
    rc = re.compile("x (?P<namedgroup>[xyz]*)")
    result = (p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
              | util.Regex.matches(rc, 'namedgroup'))
    assert_that(result, equal_to(("xxx", "yyy", "zzz")))
def test_match_group_kv_none(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["x y z"])
              | util.Regex.matches_kv("a (b) (c)", 1, 2))
    assert_that(result, equal_to([]))
def test_match_kv_group_name_none(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["x y z"])
              | util.Regex.matches_kv(
                  "a (?P<keyname>b) (?P<valuename>c)",
                  'keyname', 'valuename'))
    assert_that(result, equal_to([]))
def test_replace_first_mixed(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["abc", "xjx", "yjy", "zjz", "def"])
              | util.Regex.replace_first("[xyz]", 'new'))
    assert_that(result, equal_to(["abc", "newjx", "newjy", "newjz", "def"]))
def test_tostring_iterables(self):
  with TestPipeline() as p:
    result = (p | beam.Create([("one", "two", "three"),
                               ("four", "five", "six")])
              | util.ToString.Iterables())
    assert_that(result, equal_to(["one,two,three", "four,five,six"]))
def test_tostring_elements(self):
  with TestPipeline() as p:
    result = (p | beam.Create([1, 1, 2, 3])
              | util.ToString.Element())
    assert_that(result, equal_to(["1", "1", "2", "3"]))
def test_match_group(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
              | util.Regex.matches("x ([xyz]*)", 1))
    assert_that(result, equal_to(("xxx", "yyy", "zzz")))
def test_pipeline_read_file_pattern_large(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromVcf(
      os.path.join(get_full_dir(), 'valid-*.vcf'))
  assert_that(pcoll, _count_equals_to(9900))
  pipeline.run()
def test_timestamp_param_map(self):
  with TestPipeline() as p:
    assert_that(
        p | Create([1, 2]) | beam.Map(lambda _, t=DoFn.TimestampParam: t),
        equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
def test_multiple_outputs_with_watermark_advancement(self):
  """Tests that the TestStream can independently control output watermarks."""

  # Purposely set the watermark of numbers to 20 then letters to 5 to test
  # that the watermark advancement is per PCollection.
  #
  # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
  # emitted at different times so that they will have different windows. The
  # watermark advancement is checked by checking their windows. If the
  # watermark does not advance, then the windows will be [-inf, -inf). If the
  # windows do not advance separately, then the PCollections will both be
  # windowed in [15, 30).
  letters_elements = [
      TimestampedValue('a', 6),
      TimestampedValue('b', 7),
      TimestampedValue('c', 8),
  ]
  numbers_elements = [
      TimestampedValue('1', 21),
      TimestampedValue('2', 22),
      TimestampedValue('3', 23),
  ]
  test_stream = (TestStream()
                 .advance_watermark_to(0, tag='letters')
                 .advance_watermark_to(0, tag='numbers')
                 .advance_watermark_to(20, tag='numbers')
                 .advance_watermark_to(5, tag='letters')
                 .add_elements(letters_elements, tag='letters')
                 .advance_watermark_to(10, tag='letters')
                 .add_elements(numbers_elements, tag='numbers')
                 .advance_watermark_to(30, tag='numbers'))
  options = StandardOptions(streaming=True)
  p = TestPipeline(is_integration_test=True, options=options)

  main = p | test_stream

  # Use an AfterWatermark trigger with an early firing to test that the
  # watermark is advancing properly and that the element is being emitted in
  # the correct window.
  letters = (
      main['letters']
      | 'letter windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'letter with key' >> beam.Map(lambda x: ('k', x))
      | 'letter gbk' >> beam.GroupByKey())

  numbers = (
      main['numbers']
      | 'number windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'number with key' >> beam.Map(lambda x: ('k', x))
      | 'number gbk' >> beam.GroupByKey())

  # The letters were emitted when the watermark was at 5, thus we expect to
  # see the elements in the [0, 15) window. We used an early trigger to make
  # sure that the ON_TIME empty pane was also emitted with a TestStream.
  # This pane has no data because the early trigger causes the elements to
  # fire before the end of the window and because the accumulation mode
  # discards any data after the trigger fired.
  expected_letters = {
      window.IntervalWindow(0, 15): [
          ('k', ['a', 'b', 'c']),
          ('k', []),
      ],
  }

  # Same here, except the numbers were emitted at watermark = 20, thus they
  # are in the [15, 30) window.
  expected_numbers = {
      window.IntervalWindow(15, 30): [
          ('k', ['1', '2', '3']),
          ('k', []),
      ],
  }
  assert_that(letters,
              equal_to_per_window(expected_letters),
              label='letters assert per window')
  assert_that(numbers,
              equal_to_per_window(expected_numbers),
              label='numbers assert per window')

  p.run()
def test_tostring_iterables_with_delimeter(self):
  with TestPipeline() as p:
    data = [("one", "two", "three"), ("four", "five", "six")]
    result = (p | beam.Create(data) | util.ToString.Iterables("\t"))
    assert_that(result, equal_to(["one\ttwo\tthree", "four\tfive\tsix"]))
class CrossLanguageKinesisIOTest(unittest.TestCase):

  @unittest.skipUnless(
      TestPipeline().get_option('aws_kinesis_stream'),
      'Cannot test on real aws without pipeline options provided')
  def test_kinesis_io_roundtrip(self):
    # TODO: enable this test for localstack once
    # https://github.com/apache/beam/issues/20416 is resolved
    self.run_kinesis_write()
    self.run_kinesis_read()

  @unittest.skipIf(
      TestPipeline().get_option('aws_kinesis_stream'),
      'Do not test on localstack when pipeline options were provided')
  def test_kinesis_write(self):
    # TODO: remove this test once
    # https://github.com/apache/beam/issues/20416 is resolved
    self.run_kinesis_write()
    records = self.kinesis_helper.read_from_stream(self.aws_kinesis_stream)
    self.assertEqual(
        sorted(records),
        sorted([RECORD + str(i).encode() for i in range(NUM_RECORDS)]))

  def run_kinesis_write(self):
    with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
      p.not_use_test_runner_api = True
      _ = (
          p
          | 'Impulse' >> beam.Impulse()
          | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=bad-option-value
          | 'Map to bytes' >> beam.Map(
              lambda x: RECORD + str(x).encode()).with_output_types(bytes)
          | 'WriteToKinesis' >> WriteToKinesis(
              stream_name=self.aws_kinesis_stream,
              aws_access_key=self.aws_access_key,
              aws_secret_key=self.aws_secret_key,
              region=self.aws_region,
              service_endpoint=self.aws_service_endpoint,
              verify_certificate=(not self.use_localstack),
              partition_key='1',
              producer_properties=self.producer_properties,
          ))

  def run_kinesis_read(self):
    records = [RECORD + str(i).encode() for i in range(NUM_RECORDS)]

    with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
      result = (
          p
          | 'ReadFromKinesis' >> ReadDataFromKinesis(
              stream_name=self.aws_kinesis_stream,
              aws_access_key=self.aws_access_key,
              aws_secret_key=self.aws_secret_key,
              region=self.aws_region,
              service_endpoint=self.aws_service_endpoint,
              verify_certificate=not self.use_localstack,
              max_num_records=NUM_RECORDS,
              max_read_time=MAX_READ_TIME,
              request_records_limit=REQUEST_RECORDS_LIMIT,
              watermark_policy=WatermarkPolicy.ARRIVAL_TIME,
              watermark_idle_duration_threshold=MAX_READ_TIME,
              initial_position_in_stream=InitialPositionInStream.AT_TIMESTAMP,
              initial_timestamp_in_stream=NOW_MILLIS,
          ).with_output_types(bytes))
      assert_that(result, equal_to(records))

  def set_localstack(self):
    self.localstack = DockerContainer('localstack/localstack:{}'
                                      .format(LOCALSTACK_VERSION))\
        .with_env('SERVICES', 'kinesis')\
        .with_env('KINESIS_PORT', '4568')\
        .with_env('USE_SSL', 'true')\
        .with_exposed_ports(4568)\
        .with_volume_mapping('/var/run/docker.sock', '/var/run/docker.sock',
                             'rw')

    # Repeat if ReadTimeout is raised.
    for i in range(4):
      try:
        self.localstack.start()
        break
      except Exception as e:  # pylint: disable=bare-except
        if i == 3:
          logging.error('Could not initialize localstack container')
          raise e

    self.aws_service_endpoint = 'https://{}:{}'.format(
        self.localstack.get_container_host_ip(),
        self.localstack.get_exposed_port('4568'),
    )

  def setUp(self):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--aws_kinesis_stream',
        default='beam_kinesis_xlang',
        help='Kinesis stream name',
    )
    parser.add_argument(
        '--aws_access_key',
        default='accesskey',
        help=('Aws access key'),
    )
    parser.add_argument(
        '--aws_secret_key',
        default='secretkey',
        help='Aws secret key',
    )
    parser.add_argument(
        '--aws_region',
        default='us-east-1',
        help='Aws region',
    )
    parser.add_argument(
        '--aws_service_endpoint',
        default=None,
        help='Url to external aws endpoint',
    )
    parser.add_argument(
        '--use_real_aws',
        default=False,
        dest='use_real_aws',
        action='store_true',
        help='Flag whether to use real aws for the tests purpose',
    )
    parser.add_argument(
        '--expansion_service',
        help='Url to externally launched expansion service.',
    )

    pipeline = TestPipeline()
    argv = pipeline.get_full_options_as_args()
    known_args, self.pipeline_args = parser.parse_known_args(argv)

    self.aws_kinesis_stream = known_args.aws_kinesis_stream
    self.aws_access_key = known_args.aws_access_key
    self.aws_secret_key = known_args.aws_secret_key
    self.aws_region = known_args.aws_region
    self.aws_service_endpoint = known_args.aws_service_endpoint
    self.use_localstack = not known_args.use_real_aws
    self.expansion_service = known_args.expansion_service
    self.producer_properties = {
        'CollectionMaxCount': str(NUM_RECORDS),
        'ConnectTimeout': str(MAX_READ_TIME),
    }

    if self.use_localstack:
      self.set_localstack()

    self.kinesis_helper = KinesisHelper(
        self.aws_access_key,
        self.aws_secret_key,
        self.aws_region,
        self.aws_service_endpoint.replace('https', 'http')
        if self.aws_service_endpoint else None,
    )

    if self.use_localstack:
      self.kinesis_helper.create_stream(self.aws_kinesis_stream)

  def tearDown(self):
    if self.use_localstack:
      self.kinesis_helper.delete_stream(self.aws_kinesis_stream)

      try:
        self.localstack.stop()
      except:  # pylint: disable=bare-except
        logging.error('Could not stop the localstack container')
def test_tostring_kvs(self):
  with TestPipeline() as p:
    result = (p | beam.Create([("one", 1), ("two", 2)])
              | util.ToString.Kvs())
    assert_that(result, equal_to(["one,1", "two,2"]))
  DockerContainer = None
# pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports

LOCALSTACK_VERSION = '0.11.3'
NUM_RECORDS = 10
MAX_READ_TIME = 5 * 60 * 1000  # 5min
NOW_SECONDS = time.time()
NOW_MILLIS = NOW_SECONDS * 1000
REQUEST_RECORDS_LIMIT = 1000
RECORD = b'record' + str(uuid.uuid4()).encode()


@unittest.skipUnless(DockerContainer, 'testcontainers is not installed.')
@unittest.skipUnless(boto3, 'boto3 is not installed.')
@unittest.skipUnless(
    TestPipeline().get_pipeline_options().view_as(StandardOptions).runner,
    'Do not run this test on precommit suites.')
class CrossLanguageKinesisIOTest(unittest.TestCase):

  @unittest.skipUnless(
      TestPipeline().get_option('aws_kinesis_stream'),
      'Cannot test on real aws without pipeline options provided')
  def test_kinesis_io_roundtrip(self):
    # TODO: enable this test for localstack once
    # https://github.com/apache/beam/issues/20416 is resolved
    self.run_kinesis_write()
    self.run_kinesis_read()

  @unittest.skipIf(
      TestPipeline().get_option('aws_kinesis_stream'),
      'Do not test on localstack when pipeline options were provided')
  def test_kinesis_write(self):
def test_tostring_kvs_delimeter(self):
  with TestPipeline() as p:
    result = (p | beam.Create([("one", 1), ("two", 2)])
              | util.ToString.Kvs("\t"))
    assert_that(result, equal_to(["one\t1", "two\t2"]))
def test_replace_all(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["xj", "yj", "zj"])
              | util.Regex.replace_all("[xyz]", "new"))
    assert_that(result, equal_to(["newj", "newj", "newj"]))
def test_pipeline_read_single_file_large(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromVcf(
      get_full_file_path('valid-4.0.vcf'))
  assert_that(pcoll, _count_equals_to(5))
  pipeline.run()
def test_match_kv_group_names_pattern(self):
  with TestPipeline() as p:
    rc = re.compile("a (?P<keyname>b) (?P<valuename>c)")
    result = (p | beam.Create(["a b c"])
              | util.Regex.matches_kv(rc, 'keyname', 'valuename'))
    assert_that(result, equal_to([("b", "c")]))
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = {
      # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
      # label_ids, nor writing timestamp attributes. Once these features
      # exist, TestDirectRunner and TestDataflowRunner should behave
      # identically.
      'TestDirectRunner': [
          PubsubMessage('data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute,
          # the IT pipeline writes back the timestamp of each element (as
          # reported by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
      'TestDataflowRunner': [
          # Use ID_LABEL attribute to deduplicate messages with the same ID.
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute,
          # the IT pipeline writes back the timestamp of each element (as
          # reported by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          })
      ],
  }
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage(
              'data002-seen',
              {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              }),
      ],
      'TestDataflowRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage(
              'data002-seen',
              {
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              }),
      ],
  }

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created(
        [self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def tearDown(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output. These
    # verifications run on a (remote) worker.
    # Expect the state to be RUNNING since a streaming pipeline is usually
    # never DONE. The test runner will cancel the pipeline after verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      expected_messages = [
          pubsub_msg.data for pubsub_msg in expected_messages
      ]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        OUTPUT_SUB + self.uuid,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {
        'input_subscription': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    for msg in self.INPUT_MESSAGES[self.runner_name]:
      self.input_topic.publish(msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
def test_match_group_empty(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["a", "b", "c", "d"])
              | util.Regex.matches("x (?P<namedgroup>[xyz]*)", 'namedgroup'))
    assert_that(result, equal_to([]))
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = [
      # Use ID_LABEL attribute to deduplicate messages with the same ID.
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the IT
      # pipeline writes back the timestamp of each element (as reported by
      # Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
      PubsubMessage('data002', {
          TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
      }),
  ]
  EXPECTED_OUTPUT_MESSAGES = [
      PubsubMessage('data001-seen', {'processed': 'IT'}),
      PubsubMessage('data002-seen', {
          TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
          'processed': 'IT',
      }),
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def tearDown(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Build expected dataset.
    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES
    if not with_attributes:
      expected_messages = [pubsub_msg.data
                           for pubsub_msg in expected_messages]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        OUTPUT_SUB + self.uuid,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    for msg in self.INPUT_MESSAGES:
      self.input_topic.publish(msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
def test_reshuffle_contents_unchanged(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
  result = (pipeline
            | beam.Create(data)
            | beam.Reshuffle())
  assert_that(result, equal_to(data))
  pipeline.run()
def test_fake_read(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
  assert_that(pcoll, equal_to([1, 2, 3]))
  pipeline.run()
def test_constant_k(self):
  with TestPipeline() as p:
    pc = p | beam.Create(self.l)
    with_keys = pc | util.WithKeys('k')
    assert_that(with_keys, equal_to([('k', 1), ('k', 2), ('k', 3)]))
def test_callable_k(self):
  with TestPipeline() as p:
    pc = p | beam.Create(self.l)
    with_keys = pc | util.WithKeys(lambda x: x * x)
    assert_that(with_keys, equal_to([(1, 1), (4, 2), (9, 3)]))
def test_apply_custom_transform(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'pcoll' >> Create([1, 2, 3])
  result = pcoll | PipelineTest.CustomTransform()
  assert_that(result, equal_to([2, 3, 4]))
  pipeline.run()
def test_match_none(self):
  with TestPipeline() as p:
    result = (p | beam.Create(["a", "b", "c", "d"])
              | util.Regex.matches("[xyz]"))
    assert_that(result, equal_to([]))