def individual_test_per_key_dd(sampleFn, n):
  """Check that a sampling transform reports its size ``n`` as display data."""
  for transform in [sampleFn(n)]:
    display_data = DisplayData.create_from(transform)
    matcher = DisplayDataItemMatcher('n', transform._n)
    hc.assert_that(display_data.items, hc.contains_inanyorder(matcher))
def test_query_only_display_data(self):
  """A query-based BigQuery source reports the query and validation flag."""
  source = beam.io.BigQuerySource(query='my_query')
  matchers = [
      DisplayDataItemMatcher('validation', False),
      DisplayDataItemMatcher('query', 'my_query'),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_display_data(self):
  """The payload sink exposes its topic as display data."""
  topic = 'projects/fakeprj/topics/a_topic'
  sink = _PubSubPayloadSink(topic)
  display_data = DisplayData.create_from(sink)
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', topic)))
def test_date_partitioned_table_name(self):
  """A partition-decorated table name is reported verbatim."""
  table = 'dataset.table$20030102'
  source = beam.io.BigQuerySource(table, validate=True)
  matchers = [
      DisplayDataItemMatcher('validation', True),
      DisplayDataItemMatcher('table', table),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_display_data_no_subscription(self):
  """A topic-only source reports just the topic."""
  display_data = DisplayData.create_from(PubSubSource('a_topic'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', 'a_topic')))
def test_sink_display_data(self):
  """Avro sink reports schema, sharded file pattern, codec and compression."""
  sink = _create_avro_sink(
      'some_avro_sink', self.SCHEMA, 'null', '.end', 0, None,
      'application/x-avro', use_fastavro=self.use_fastavro)
  matchers = [
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
      DisplayDataItemMatcher('codec', 'null'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def test_project_table_display_data(self):
  """A project-qualified sink table spec is reported verbatim."""
  sink = beam.io.BigQuerySink('PROJECT:dataset.table')
  matchers = [
      DisplayDataItemMatcher('table', 'PROJECT:dataset.table'),
      DisplayDataItemMatcher('validation', False),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def _add_step(self, step_kind, step_label, transform_node, side_tags=()): """Creates a Step object and adds it to the cache.""" # Import here to avoid adding the dependency for local running scenarios. # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient step = apiclient.Step(step_kind, self._get_unique_step_name()) self.job.proto.steps.append(step.proto) step.add_property(PropertyNames.USER_NAME, step_label) # Cache the node/step association for the main output of the transform node. self._cache.cache_output(transform_node, None, step) # If side_tags is not () then this is a multi-output transform node and we # need to cache the (node, tag, step) for each of the tags used to access # the outputs. This is essential because the keys used to search in the # cache always contain the tag. for tag in side_tags: self._cache.cache_output(transform_node, tag, step) # Finally, we add the display data items to the pipeline step. # If the transform contains no display data then an empty list is added. step.add_property( PropertyNames.DISPLAY_DATA, [item.get_dict() for item in DisplayData.create_from(transform_node.transform).items]) return step
def test_display_data(self):
  """The sink exposes its topic as display data."""
  display_data = DisplayData.create_from(PubSubSink('a_topic'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', 'a_topic')))
def test_combine_globally_display_data(self):
  """CombineGlobally surfaces the combine fn class and its configuration."""
  dd = DisplayData.create_from(beam.CombineGlobally(combine.Smallest(5)))
  matchers = [
      DisplayDataItemMatcher('combine_fn', combine.Smallest),
      DisplayDataItemMatcher('n', 5),
      DisplayDataItemMatcher('compare', 'gt'),
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*matchers))
def individual_test_per_key_dd(combineFn):
  """Check the display data reported by CombinePerKey for one combine fn."""
  dd = DisplayData.create_from(beam.CombinePerKey(combineFn))
  matchers = [
      DisplayDataItemMatcher('combine_fn', combineFn.__class__),
      DisplayDataItemMatcher('n', combineFn._n),
      DisplayDataItemMatcher('compare', combineFn._compare.__name__),
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*matchers))
def test_display_data(self):
  """Payload source reports topic, subscription and id label."""
  source = _PubSubPayloadSource('a_topic', 'a_subscription', 'a_label')
  matchers = [
      DisplayDataItemMatcher('topic', 'a_topic'),
      DisplayDataItemMatcher('subscription', 'a_subscription'),
      DisplayDataItemMatcher('id_label', 'a_label'),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_single_file_display_data(self):
  """A concrete file path shows up as the source's file pattern."""
  file_name, _ = write_data(10)
  source = LineSource(file_name)
  matchers = [
      DisplayDataItemMatcher('file_pattern', file_name),
      DisplayDataItemMatcher('compression', 'auto'),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_display_data_no_subscription(self):
  """Topic-only source reports topic plus the with_attributes default."""
  source = _PubSubSource('projects/fakeprj/topics/a_topic')
  matchers = [
      DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
      DisplayDataItemMatcher('with_attributes', False),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_source_creation_display_data(self):
  """An unvalidated pattern is reported together with auto compression."""
  pattern = 'dummy_pattern'
  source = LineSource(pattern, validate=False)
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', pattern),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_basic_combiners_display_data(self):
  """TupleCombineFn lists the names of its component combiners."""
  transform = beam.CombineGlobally(
      combine.TupleCombineFn(max, combine.MeanCombineFn(), sum))
  matchers = [
      DisplayDataItemMatcher('combine_fn', combine.TupleCombineFn),
      DisplayDataItemMatcher('combiners', "['max', 'MeanCombineFn', 'sum']"),
  ]
  hc.assert_that(
      DisplayData.create_from(transform).items,
      hc.contains_inanyorder(*matchers))
def test_read_display_data(self):
  """ReadFromAvro reports only the generic file-source display data."""
  pattern = 'some_avro_source'
  read = avroio.ReadFromAvro(pattern, validate=False)
  # No extra avro parameters for AvroSource.
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', pattern),
  ]
  hc.assert_that(
      DisplayData.create_from(read).items, hc.contains_inanyorder(*matchers))
def test_unicode_type_display_data(self):
  """Unicode display data values must be reported with type 'STRING'."""
  class MyDoFn(beam.DoFn):
    def display_data(self):
      # The builtin `unicode(...)` exists only on Python 2 and raises
      # NameError on Python 3; a u'' literal denotes unicode text on both
      # major versions, so the tested value is unchanged.
      return {'unicode_string': u'my string',
              'unicode_literal_string': u'my literal string'}

  fn = MyDoFn()
  dd = DisplayData.create_from(fn)
  for item in dd.items:
    self.assertEqual(item.type, 'STRING')
def test_display_data_topic(self):
  """Topic-based payload source reports its topic and id label."""
  source = _PubSubPayloadSource(
      'projects/fakeprj/topics/a_topic', None, 'a_label')
  matchers = [
      DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
      DisplayDataItemMatcher('id_label', 'a_label'),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_display_data_subscription(self):
  """Subscription-based source reports its subscription and id label."""
  source = _PubSubSource(
      None, 'projects/fakeprj/subscriptions/a_subscription', 'a_label')
  matchers = [
      DisplayDataItemMatcher(
          'subscription', 'projects/fakeprj/subscriptions/a_subscription'),
      DisplayDataItemMatcher('id_label', 'a_label'),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_display_data(self):
  """The sink reports every configured attribute as display data."""
  sink = _PubSubSink(
      'projects/fakeprj/topics/a_topic', id_label='id',
      with_attributes=False, timestamp_attribute='time')
  matchers = [
      DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
      DisplayDataItemMatcher('id_label', 'id'),
      DisplayDataItemMatcher('with_attributes', False),
      DisplayDataItemMatcher('timestamp_attribute', 'time'),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def individual_test_per_key_dd(sampleFn, args, kwargs):
  """Check fn/args/kwargs display data for one constructed transform."""
  for transform in [sampleFn(*args, **kwargs)]:
    matchers = [DisplayDataItemMatcher('fn', transform._fn.__name__)]
    if args:
      matchers.append(DisplayDataItemMatcher('args', str(args)))
    if kwargs:
      matchers.append(DisplayDataItemMatcher('kwargs', str(kwargs)))
    hc.assert_that(
        DisplayData.create_from(transform).items,
        hc.contains_inanyorder(*matchers))
def test_drop_if_none(self):
  """drop_if_none/drop_if_default prune items matching their condition."""
  class MyDoFn(beam.DoFn):
    def display_data(self):
      return {'some_val': DisplayDataItem('something').drop_if_none(),
              'non_val': DisplayDataItem(None).drop_if_none(),
              'def_val': DisplayDataItem(True).drop_if_default(True),
              'nodef_val': DisplayDataItem(True).drop_if_default(False)}

  matchers = [
      DisplayDataItemMatcher('some_val', 'something'),
      DisplayDataItemMatcher('nodef_val', True),
  ]
  hc.assert_that(
      DisplayData.create_from(MyDoFn()).items,
      hc.contains_inanyorder(*matchers))
def test_table_reference_display_data(self):
  """Every supported table-spec form is reported verbatim."""
  for table_spec in ('dataset.table',
                     'project:dataset.table',
                     'xyz.com:project:dataset.table'):
    source = beam.io.BigQuerySource(table_spec)
    matchers = [
        DisplayDataItemMatcher('validation', False),
        DisplayDataItemMatcher('table', table_spec),
    ]
    hc.assert_that(
        DisplayData.create_from(source).items,
        hc.contains_inanyorder(*matchers))
def test_file_sink_display_data(self):
  """The file-based sink reports compression and its sharded file pattern."""
  temp_path = os.path.join(self._new_tempdir(), 'display')
  sink = MyFileBasedSink(
      temp_path, file_name_suffix='.output', coder=coders.ToStringCoder())
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher(
          'file_pattern',
          '{}{}'.format(temp_path,
                        '-%(shard_num)05d-of-%(num_shards)05d.output')),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def individual_test_per_key_dd(sampleFn, args, kwargs):
  """Verify fn/args/kwargs display data on the sampled transform."""
  for xform in [sampleFn(*args, **kwargs)]:
    dd = DisplayData.create_from(xform)
    matchers = [DisplayDataItemMatcher('fn', xform._fn.__name__)]
    for key, value in (('args', args), ('kwargs', kwargs)):
      if value:
        matchers.append(DisplayDataItemMatcher(key, str(value)))
    hc.assert_that(dd.items, hc.contains_inanyorder(*matchers))
def test_source_display_data(self):
  """Avro source exposes only generic file-source display data."""
  pattern = 'some_avro_source'
  source = _create_avro_source(pattern, validate=False)
  # No extra avro parameters for AvroSource.
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', pattern),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_subcomponent(self):
  """Display data of a wrapped DoFn is collected under its own namespace."""
  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

  dofn = SpecialDoFn()
  pardo = beam.ParDo(dofn)
  dd = DisplayData.create_from(pardo)
  dofn_nspace = '{}.{}'.format(dofn.__module__, dofn.__class__.__name__)
  pardo_nspace = '{}.{}'.format(pardo.__module__, pardo.__class__.__name__)
  matchers = [
      DisplayDataItemMatcher('dofn_value', 42, dofn_nspace),
      DisplayDataItemMatcher('fn', SpecialDoFn, pardo_nspace),
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*matchers))
def test_write_display_data(self):
  """WriteToAvro reports schema, pattern, codec and compression defaults."""
  write = avroio.WriteToAvro(
      'some_avro_sink', self.SCHEMA, use_fastavro=self.use_fastavro)
  matchers = [
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d'),
      DisplayDataItemMatcher('codec', 'deflate'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(
      DisplayData.create_from(write).items,
      hc.contains_inanyorder(*matchers))
def test_table_reference_display_data(self):
  """Plain, project- and domain-qualified table specs are all verbatim."""
  specs = [
      'dataset.table',
      'project:dataset.table',
      'xyz.com:project:dataset.table',
  ]
  for spec in specs:
    dd = DisplayData.create_from(beam.io.BigQuerySource(spec))
    hc.assert_that(
        dd.items,
        hc.contains_inanyorder(
            DisplayDataItemMatcher('validation', False),
            DisplayDataItemMatcher('table', spec)))
def test_source_display_data(self):
  """Avro source reports only generic file-source display data."""
  pattern = 'some_avro_source'
  source = _create_avro_source(
      pattern, validate=False, use_fastavro=self.use_fastavro)
  # No extra avro parameters for AvroSource.
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', pattern),
  ]
  hc.assert_that(
      DisplayData.create_from(source).items,
      hc.contains_inanyorder(*matchers))
def test_write_display_data(self):
  """WriteToParquet reports codec, schema, buffer size and file pattern."""
  write = WriteToParquet('some_parquet_sink', self.SCHEMA)
  matchers = [
      DisplayDataItemMatcher('codec', 'none'),
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher('row_group_buffer_size', str(64 * 1024 * 1024)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(
      DisplayData.create_from(write).items,
      hc.contains_inanyorder(*matchers))
def test_file_sink_display_data(self):
  """The file sink reports compression and its sharded file pattern."""
  temp_path = os.path.join(self._new_tempdir(), 'display')
  sink = MyFileSink(
      temp_path, file_name_suffix='.foo', coder=coders.ToStringCoder())
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher(
          'file_pattern',
          '{}{}'.format(temp_path,
                        '-%(shard_num)05d-of-%(num_shards)05d.foo')),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def test_read_display_data(self):
  """ReadFromAvro reports only generic file-source display data."""
  pattern = 'some_avro_source'
  read = avroio.ReadFromAvro(
      pattern, validate=False, use_fastavro=self.use_fastavro)
  # No extra avro parameters for AvroSource.
  matchers = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', pattern),
  ]
  hc.assert_that(
      DisplayData.create_from(read).items, hc.contains_inanyorder(*matchers))
def individual_test_per_key_dd(sampleFn, args, kwargs):
  """Check display data for per-key and global wrappings of sampleFn."""
  transforms = [beam.CombinePerKey(sampleFn(*args, **kwargs)),
                beam.CombineGlobally(sampleFn(*args, **kwargs))]
  for transform in transforms:
    matchers = [
        DisplayDataItemMatcher('fn', sampleFn.fn.__name__),
        DisplayDataItemMatcher('combine_fn', transform.fn.__class__),
    ]
    if len(args) > 0:
      matchers.append(DisplayDataItemMatcher('args', str(args)))
    if len(kwargs) > 0:
      matchers.append(DisplayDataItemMatcher('kwargs', str(kwargs)))
    hc.assert_that(
        DisplayData.create_from(transform).items,
        hc.contains_inanyorder(*matchers))
def test_sink_display_data(self):
  """Avro sink reports schema, sharded pattern, codec and compression."""
  sink = _create_avro_sink(
      'some_avro_sink', self.SCHEMA, 'null', '.end', 0, None,
      'application/x-avro')
  matchers = [
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
      DisplayDataItemMatcher('codec', 'null'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def test_drop_if_none(self):
  """Items dropped by drop_if_none/drop_if_default do not appear."""
  class MyDoFn(beam.DoFn):
    def display_data(self):
      return {
          'some_val': DisplayDataItem('something').drop_if_none(),
          'non_val': DisplayDataItem(None).drop_if_none(),
          'def_val': DisplayDataItem(True).drop_if_default(True),
          'nodef_val': DisplayDataItem(True).drop_if_default(False),
      }

  dd = DisplayData.create_from(MyDoFn())
  hc.assert_that(
      dd.items,
      hc.contains_inanyorder(
          DisplayDataItemMatcher('some_val', 'something'),
          DisplayDataItemMatcher('nodef_val', True)))
def test_sink_display_data(self):
  """Parquet sink reports schema, pattern, codec, buffer size, compression."""
  sink = _create_parquet_sink(
      'some_parquet_sink', self.SCHEMA, 'none', 1024 * 1024, 1000, False,
      '.end', 0, None, 'application/x-parquet')
  matchers = [
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
      DisplayDataItemMatcher('codec', 'none'),
      DisplayDataItemMatcher('row_group_buffer_size', str(1024 * 1024)),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(
      DisplayData.create_from(sink).items, hc.contains_inanyorder(*matchers))
def test_base_cases(self):
  """Tests basic display data cases (key:value, key:dict).

  It does not test subcomponent inclusion.
  """
  class MyDoFn(beam.DoFn):
    def __init__(self, my_display_data=None):
      self.my_display_data = my_display_data

    def process(self, element):
      yield element + 1

    def display_data(self):
      return {'static_integer': 120,
              'static_string': 'static me!',
              'complex_url': DisplayDataItem('github.com',
                                             url='http://github.com',
                                             label='The URL'),
              'python_class': HasDisplayData,
              'my_dd': self.my_display_data}

  now = datetime.now()
  fn = MyDoFn(my_display_data=now)
  dd = DisplayData.create_from(fn)
  nspace = '{}.{}'.format(fn.__module__, fn.__class__.__name__)
  matchers = [
      DisplayDataItemMatcher(key='complex_url', value='github.com',
                             namespace=nspace, label='The URL'),
      DisplayDataItemMatcher(key='my_dd', value=now, namespace=nspace),
      DisplayDataItemMatcher(key='python_class', value=HasDisplayData,
                             namespace=nspace, shortValue='HasDisplayData'),
      DisplayDataItemMatcher(key='static_integer', value=120,
                             namespace=nspace),
      DisplayDataItemMatcher(key='static_string', value='static me!',
                             namespace=nspace),
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*matchers))
def test_write_display_data(self):
  """WriteToAvro defaults: deflate codec, uncompressed file compression."""
  write = avroio.WriteToAvro('some_avro_sink', self.SCHEMA)
  matchers = [
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d'),
      DisplayDataItemMatcher('codec', 'deflate'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(
      DisplayData.create_from(write).items,
      hc.contains_inanyorder(*matchers))
def test_perkey_display_data(self):
  """PerKey display data matches the shared matcher helper."""
  transform = beam.ApproximateQuantiles.PerKey(3, key=len, reverse=True)
  matchers = self._display_data_matcher(transform)
  hc.assert_that(
      DisplayData.create_from(transform).items,
      hc.contains_inanyorder(*matchers))
def test_display_data_no_subscription(self):
  """A topic-only source reports just its topic."""
  display_data = DisplayData.create_from(PubSubSource('a_topic'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', 'a_topic')))
def test_display_data(self):
  """Each test case's flags must produce that case's expected display data."""
  for case in PipelineOptionsTest.TEST_CASES:
    dd = DisplayData.create_from(PipelineOptions(flags=case['flags']))
    hc.assert_that(dd.items, hc.contains_inanyorder(*case['display_data']))
def test_display_data(self):
  """The sink reports its topic as display data."""
  display_data = DisplayData.create_from(PubSubSink('a_topic'))
  hc.assert_that(
      display_data.items,
      hc.contains_inanyorder(DisplayDataItemMatcher('topic', 'a_topic')))