def test_partition(self, test_pipeline=None):
  """Run the shared cross-language partition pipeline.

  Args:
    test_pipeline: Pipeline to run the check on. When it is falsy a new
      ``TestPipeline`` is created.
  """
  pipeline = test_pipeline if test_pipeline else TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  CrossLanguageTestPipelines().run_partition(pipeline)
def test_combine_globally(self):
  """Apply the external ``TEST_COMGL_URN`` transform to [1, 2, 3].

  The output is expected to be the single element 6.
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    numbers = p | beam.Create([1, 2, 3]).with_output_types(int)
    res = numbers | beam.ExternalTransform(
        TEST_COMGL_URN, None, ValidateRunnerXlangTest.expansion_service)
    assert_that(res, equal_to([6]))
def test_prefix(self):
  """Apply the external ``TEST_PREFIX_URN`` transform to ['a', 'b'].

  The transform is configured with the payload ``{'data': u'0'}`` and the
  output is expected to be ['0a', '0b'].
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    letters = p | beam.Create(['a', 'b']).with_output_types(unicode)
    res = letters | beam.ExternalTransform(
        TEST_PREFIX_URN,
        ImplicitSchemaPayloadBuilder({'data': u'0'}),
        ValidateRunnerXlangTest.expansion_service)
    assert_that(res, equal_to(['0a', '0b']))
def test_partition(self):
  """Apply the external ``TEST_PARTITION_URN`` transform to 1..6.

  Output '0' is expected to hold the even numbers and output '1' the odd
  numbers.
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    numbers = p | beam.Create([1, 2, 3, 4, 5, 6]).with_output_types(int)
    res = numbers | beam.ExternalTransform(
        TEST_PARTITION_URN, None, ValidateRunnerXlangTest.expansion_service)
    assert_that(res['0'], equal_to([2, 4, 6]), label='check_even')
    assert_that(res['1'], equal_to([1, 3, 5]), label='check_odd')
def test_flatten(self):
  """Apply the external ``TEST_FLATTEN_URN`` transform to two collections.

  The inputs [1, 2, 3] and [4, 5, 6] are passed as a tuple and the output is
  expected to contain all six elements.
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    col1 = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
    col2 = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
    inputs = (col1, col2)
    res = inputs | beam.ExternalTransform(
        TEST_FLATTEN_URN, None, ValidateRunnerXlangTest.expansion_service)
    assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
def test_combine_per_key(self):
  """Apply the external ``TEST_COMPK_URN`` transform to keyed integers.

  The input is [('a', 1), ('a', 2), ('b', 3)] and the output is expected to
  be [('a', 3), ('b', 3)].
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    keyed = p | beam.Create([('a', 1), ('a', 2), ('b', 3)]).with_output_types(
        typing.Tuple[unicode, int])
    res = keyed | beam.ExternalTransform(
        TEST_COMPK_URN, None, ValidateRunnerXlangTest.expansion_service)
    assert_that(res, equal_to([('a', 3), ('b', 3)]))
def run_pipeline(
    pipeline_options, expansion_service_port, wait_until_finish=True):
  """Build and run a count-filtered-letters pipeline with external transforms.

  Args:
    pipeline_options: Options the ``TestPipeline`` is constructed with.
    expansion_service_port: Port on localhost used as the expansion service
      address for both external transforms.
    wait_until_finish: When true, wait on the pipeline result before
      returning.
  """
  # The actual definitions of these transforms is in
  # org.apache.beam.runners.core.construction.TestExpansionService.
  filter_urn = "beam:transforms:xlang:filter_less_than_eq"
  count_urn = "beam:transforms:xlang:count"

  pipeline = TestPipeline(options=pipeline_options)

  # save_main_session is switched on because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module
  # level).
  setup_options = pipeline.get_pipeline_options().view_as(SetupOptions)
  setup_options.save_main_session = True

  address = 'localhost:%s' % str(expansion_service_port)

  # Each lambda-based Map stays on its own source line.
  code_points = (
      pipeline
      | beam.Create(list('aaabccxyyzzz'))
      | beam.Map(unicode)
      # TODO(BEAM-6587): Use strings directly rather than ints.
      | beam.Map(lambda ch: int(ord(ch))))
  counted = (
      code_points
      | beam.ExternalTransform(filter_urn, b'middle', address)
      | beam.ExternalTransform(count_urn, None, address))
  res = (
      counted
      # TODO(BEAM-6587): Remove when above is removed.
      | beam.Map(lambda pair: (chr(pair[0]), pair[1]))
      | beam.Map(lambda pair: '%s: %s' % pair))

  assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

  result = pipeline.run()
  if wait_until_finish:
    result.wait_until_finish()
def test_group_by_key(self):
  """Apply the external ``TEST_GBK_URN`` transform to keyed strings.

  Each grouped result is formatted as ``key:v1,v2`` with the values sorted,
  and the output is expected to be ['0:1,2', '1:3'].
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    keyed = p | beam.Create(
        [(0, "1"), (0, "2"), (1, "3")],
        reshuffle=False).with_output_types(typing.Tuple[int, unicode])
    grouped = keyed | beam.ExternalTransform(
        TEST_GBK_URN, None, ValidateRunnerXlangTest.expansion_service)
    res = grouped | beam.Map(
        lambda kv: "{}:{}".format(kv[0], ','.join(sorted(kv[1]))))
    assert_that(res, equal_to(['0:1,2', '1:3']))
def test_multi_input_output_with_sideinput(self):
  """Apply the external ``TEST_MULTI_URN`` transform to named inputs.

  Three collections are passed under the keys ``main1``, ``main2`` and
  ``side``; the ``main`` and ``side`` outputs are then checked.
  """
  pipeline = TestPipeline()

  # The expansion jar is handed to the pipeline as a ``jar_packages``
  # experiment.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments.append(
      'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
  pipeline.not_use_test_runner_api = True

  with pipeline as p:
    main1 = p | 'Main1' >> beam.Create(
        ['a', 'bb'], reshuffle=False).with_output_types(unicode)
    main2 = p | 'Main2' >> beam.Create(
        ['x', 'yy', 'zzz'], reshuffle=False).with_output_types(unicode)
    side = p | 'Side' >> beam.Create(['s']).with_output_types(unicode)

    named_inputs = {'main1': main1, 'main2': main2, 'side': side}
    res = named_inputs | beam.ExternalTransform(
        TEST_MULTI_URN, None, ValidateRunnerXlangTest.expansion_service)

    assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
    assert_that(res['side'], equal_to(['ss']), label='CheckSide')
def make_test_pipeline():
  """Create a ``TestPipeline`` configured with the SQL expansion-service jar.

  Returns:
    A ``TestPipeline`` whose debug experiments list is replaced by a single
    ``jar_packages=<path>`` entry pointing at the shadow jar.
  """
  jar_path = subprocess_server.JavaJarServer.path_to_beam_jar(
      ":sdks:java:extensions:sql:expansion-service:shadowJar")
  pipeline = TestPipeline()

  # TODO(BEAM-9238): Remove this when it's no longer needed for artifact
  # staging.
  debug_options = pipeline.get_pipeline_options().view_as(DebugOptions)
  debug_options.experiments = ['jar_packages=' + jar_path]

  return pipeline
def test_write(self):
  """Write three Avro records through the external Parquet write transform.

  The expansion jar path and the expansion service port are read from the
  ``EXPANSION_JAR`` and ``EXPANSION_PORT`` environment variables.

  A ``RuntimeError`` whose message contains ``PARQUET_WRITE_URN`` only
  produces a printed notice; any other ``RuntimeError`` is re-raised.
  """
  expansion_jar = os.environ.get('EXPANSION_JAR')
  port = os.environ.get('EXPANSION_PORT')
  address = 'localhost:%s' % port

  try:
    test_pipeline = TestPipeline()
    test_pipeline.get_pipeline_options().view_as(
        DebugOptions).experiments.append('jar_packages=' + expansion_jar)
    test_pipeline.not_use_test_runner_api = True

    with test_pipeline as p:
      _ = (
          p
          | beam.Create([
              AvroRecord({"name": "abc"}),
              AvroRecord({"name": "def"}),
              AvroRecord({"name": "ghi"})
          ])
          | beam.ExternalTransform(
              PARQUET_WRITE_URN, b'/tmp/test.parquet', address))
  except RuntimeError as e:
    # The URN is a literal identifier, so test for it as a plain substring
    # rather than interpreting it as a regular expression pattern.
    if PARQUET_WRITE_URN not in str(e):
      # Bare raise re-raises the active exception with its traceback intact.
      raise
    print("looks like URN not implemented in expansion service, skipping.")
class CombineFnLifecycleTest(unittest.TestCase):
  """Runs the combine and pardo lifecycle checks on an integration pipeline."""
  def setUp(self):
    """Create a fresh integration-test pipeline for each test."""
    self.pipeline = TestPipeline(is_integration_test=True)

  @skip_unless_v2
  def test_combine(self):
    """Run ``run_combine`` with its default arguments."""
    run_combine(self.pipeline)

  @skip_unless_v2
  def test_non_liftable_combine(self):
    """Run ``run_combine`` with ``lift_combiners`` switched off."""
    run_combine(self.pipeline, lift_combiners=False)

  @skip_unless_v2
  def test_combining_value_state(self):
    """Run ``run_pardo``, skipping when the runner is a DataflowRunner."""
    standard_options = self.pipeline.get_pipeline_options().view_as(
        StandardOptions)
    runner_name = standard_options.runner
    if 'DataflowRunner' in runner_name:
      self.skipTest('BEAM-11793')
    run_pardo(self.pipeline)
def test_to_from_runner_api(self):
  """Checks that WriteToBigQuery survives a runner API round trip.

  This is not meant to be a change-detector test. It only covers the
  parameters with the more complicated serialization logic: ValueProviders,
  callables, and side inputs.
  """
  output_table = 'test_project:output_table'
  pipeline = TestPipeline()

  # Side input parameter under test.
  table_pairs = pipeline | "MakeTable" >> beam.Create(
      [('table', output_table)])
  table_side_input = beam.pvalue.AsDict(table_pairs)

  # Value provider parameter under test.
  schema_provider = value_provider.StaticValueProvider(str, '"a:str"')

  original = WriteToBigQuery(
      table=lambda _, side_input: side_input['table'],
      table_side_inputs=(table_side_input, ),
      schema=schema_provider)

  # pylint: disable=expression-not-assigned
  pipeline | 'MyWriteToBigQuery' >> original

  # Rebuild the pipeline from its proto and serialize it again from an empty
  # context. This ensures that the serialization code ran.
  rebuilt = TestPipeline.from_runner_api(
      pipeline.to_runner_api(),
      pipeline.runner,
      pipeline.get_pipeline_options())
  pipeline_proto, context = rebuilt.to_runner_api(return_context=True)

  # Locate the transform in the context by its unique name.
  matching_ids = []
  for transform_id, proto in pipeline_proto.components.transforms.items():
    if proto.unique_name == 'MyWriteToBigQuery':
      matching_ids.append(transform_id)
  write_to_bq_id = matching_ids[0]

  deserialized = context.transforms.get_by_id(write_to_bq_id).transform
  self.assertIsInstance(deserialized, WriteToBigQuery)

  # The value provider must round-trip.
  self.assertEqual(original.schema, deserialized.schema)

  # The callable must round-trip.
  resolved_table = deserialized._table(None, {'table': output_table})
  self.assertEqual(resolved_table, output_table)

  # The side input must round-trip.
  self.assertEqual(
      len(original.table_side_inputs), len(deserialized.table_side_inputs))
  original_data = original.table_side_inputs[0]._side_input_data()
  deserialized_data = deserialized.table_side_inputs[0]._side_input_data()
  for field in ('access_pattern', 'window_mapping_fn', 'view_fn'):
    self.assertEqual(
        getattr(original_data, field), getattr(deserialized_data, field))
class TaxirideIT(unittest.TestCase):
  """Integration tests for the taxiride aggregation and enrich pipelines.

  Each test writes its output below a unique directory under the pipeline's
  ``temp_location`` option and compares it with a truth CSV stored in the
  ``data`` directory next to this file. The output directory is deleted in
  ``tearDown``.
  """

  # Input file pattern shared by both pipelines.
  _INPUT_PATTERN = 'gs://apache-beam-samples/nyc_taxi/2018/*.csv'

  def setUp(self):
    """Create the pipeline and pick a unique output location."""
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.outdir = (
        self.test_pipeline.get_option('temp_location') + '/taxiride_it-' +
        str(uuid.uuid4()))
    self.output_path = os.path.join(self.outdir, 'output.csv')

  def tearDown(self):
    """Delete the output directory created for this test."""
    FileSystems.delete([self.outdir + '/'])

  def _read_expected(self, filename, sort_column):
    """Load a truth CSV from ``data/`` sorted by ``sort_column``.

    Args:
      filename: Name of the CSV file inside the ``data`` directory.
      sort_column: Column to sort by before the index is reset.

    Returns:
      The sorted DataFrame with a fresh index.
    """
    truth_path = os.path.join(os.path.dirname(__file__), 'data', filename)
    # Lines starting with '#' in the truth file are treated as comments.
    expected = pd.read_csv(truth_path, comment='#')
    return expected.sort_values(sort_column).reset_index(drop=True)

  def _read_result(self, sort_column):
    """Read every file matching ``output_path*`` into one sorted DataFrame.

    Args:
      sort_column: Column to sort by before the index is reset.

    Returns:
      The concatenated, sorted DataFrame with a fresh index.
    """
    def read_csv(path):
      with FileSystems.open(path) as fp:
        return pd.read_csv(fp)

    matches = FileSystems.match([f'{self.output_path}*'])[0].metadata_list
    result = pd.concat(read_csv(metadata.path) for metadata in matches)
    return result.sort_values(sort_column).reset_index(drop=True)

  @pytest.mark.it_postcommit
  def test_aggregation(self):
    """Run the aggregation pipeline and compare its output to the truth."""
    taxiride.run_aggregation_pipeline(
        self.test_pipeline, self._INPUT_PATTERN, self.output_path)

    # Verify
    expected = self._read_expected(
        'taxiride_2018_aggregation_truth.csv', 'DOLocationID')
    result = self._read_result('DOLocationID')
    pd.testing.assert_frame_equal(expected, result)

  @pytest.mark.it_postcommit
  def test_enrich(self):
    """Run the enrich pipeline and compare its output to the truth."""
    # Standard workers OOM with the enrich pipeline
    self.test_pipeline.get_pipeline_options().view_as(
        WorkerOptions).machine_type = 'e2-highmem-2'
    taxiride.run_enrich_pipeline(
        self.test_pipeline, self._INPUT_PATTERN, self.output_path)

    # Verify
    expected = self._read_expected('taxiride_2018_enrich_truth.csv', 'Borough')
    result = self._read_result('Borough')
    pd.testing.assert_frame_equal(expected, result)
class LoadTest(object):
  """Base class for all integration and performance tests which export
  metrics to external databases: BigQuery or/and InfluxDB.

  Refer to :class:`~apache_beam.testing.load_tests.LoadTestOptions` for more
  information on the required pipeline options.

  If using InfluxDB with Basic HTTP authentication enabled, provide the
  following environment options: `INFLUXDB_USER` and `INFLUXDB_USER_PASSWORD`.
  """
  def __init__(self, metrics_namespace=None):
    """Create the test pipeline and the metrics reader.

    Exits the process with status 1 when the ``input_options`` pipeline
    option is missing.

    Args:
      metrics_namespace: Namespace used to filter published metrics. When
        falsy, the ``metrics_table`` option is used, or ``'default'`` if
        that is not set either.
    """
    # Be sure to set blocking to false for timeout_ms to work properly
    self.pipeline = TestPipeline(is_integration_test=True, blocking=False)
    assert not self.pipeline.blocking

    options = self.pipeline.get_pipeline_options().view_as(LoadTestOptions)
    self.timeout_ms = options.timeout_ms
    self.input_options = options.input_options

    if metrics_namespace:
      self.metrics_namespace = metrics_namespace
    else:
      self.metrics_namespace = options.metrics_table or 'default'

    publish_to_bq = options.publish_to_big_query
    if publish_to_bq is None:
      logging.info(
          'Missing --publish_to_big_query option. Metrics will not '
          'be published to BigQuery.')
    if options.input_options is None:
      logging.error('--input_options argument is required.')
      sys.exit(1)

    gcloud_options = self.pipeline.get_pipeline_options().view_as(
        GoogleCloudOptions)
    self.project_id = gcloud_options.project

    self._metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=self.project_id,
        bq_table=options.metrics_table,
        bq_dataset=options.metrics_dataset,
        namespace=self.metrics_namespace,
        influxdb_options=InfluxDBMetricsPublisherOptions(
            options.influx_measurement,
            options.influx_db_name,
            options.influx_hostname,
            os.getenv('INFLUXDB_USER'),
            os.getenv('INFLUXDB_USER_PASSWORD')),
        # Apply filter to prevent system metrics from being published
        filters=MetricsFilter().with_namespace(self.metrics_namespace))

  def test(self):
    """An abstract method where the pipeline definition should be put."""
    pass

  def cleanup(self):
    """An abstract method that executes after the test method."""
    pass

  def run(self):
    """Run ``test``, publish metrics, and always call ``cleanup``.

    If ``test`` did not set ``self.result``, the pipeline is run here and
    waited on for up to ``timeout_ms``.
    """
    try:
      self.test()
      if not hasattr(self, 'result'):
        self.result = self.pipeline.run()
        # Defaults to waiting forever, unless timeout_ms has been set
        self.result.wait_until_finish(duration=self.timeout_ms)
      self._metrics_monitor.publish_metrics(self.result)
    finally:
      self.cleanup()

  def parse_synthetic_source_options(self, options=None):
    """Translate input options into the synthetic source's option names.

    Args:
      options: Mapping of input options. When falsy, ``self.input_options``
        is used.

    Returns:
      A dict keyed by the camelCase synthetic source option names. Missing
      optional entries fall back to 0 (or ``'const'`` for the bundle size
      distribution type); missing record/key/value sizes are ``None``.
    """
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def get_option_or_default(self, opt_name, default=0):
    """Returns a testing option or a default value if it was not provided.

    The returned value is cast to the type of the default value. When
    ``default`` is ``None`` there is no type to cast to, so a provided
    option is returned unchanged.

    Args:
      opt_name: Name of the option to look up on the pipeline.
      default: Value returned when the option is absent; its type also
        decides how a provided option is converted.

    Raises:
      ValueError, TypeError: Whatever the conversion ``type(default)(option)``
        raises for an option that cannot be converted.
    """
    option = self.pipeline.get_option(
        opt_name, bool_option=isinstance(default, bool))
    if option is None:
      return default
    if default is None:
      # NoneType(option) would raise TypeError; hand the raw option back.
      return option
    return type(default)(option)
class LoadTest(object):
  """Base harness for load tests that publish their metrics."""
  def __init__(self):
    """Create the integration pipeline and the metrics reader.

    Exits the process with status 1 when the ``input_options`` pipeline
    option is missing.
    """
    self.pipeline = TestPipeline(is_integration_test=True)

    load_options = self.pipeline.get_pipeline_options().view_as(
        LoadTestOptions)
    self.input_options = load_options.input_options

    # Fall back to 'default' when no metrics table was configured.
    table_name = load_options.metrics_table
    self.metrics_namespace = table_name if table_name else 'default'

    publish_to_bq = load_options.publish_to_big_query
    if publish_to_bq is None:
      logging.info(
          'Missing --publish_to_big_query option. Metrics will not '
          'be published to BigQuery.')

    if load_options.input_options is None:
      logging.error('--input_options argument is required.')
      sys.exit(1)

    cloud_options = self.pipeline.get_pipeline_options().view_as(
        GoogleCloudOptions)
    self.project_id = cloud_options.project

    # Apply filter to prevent system metrics from being published
    namespace_filter = MetricsFilter().with_namespace(self.metrics_namespace)
    self._metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=self.project_id,
        bq_table=load_options.metrics_table,
        bq_dataset=load_options.metrics_dataset,
        filters=namespace_filter)

  def test(self):
    """An abstract method where the pipeline definition should be put."""
    pass

  def cleanup(self):
    """An abstract method that executes after the test method."""
    pass

  def run(self):
    """Run ``test``, publish metrics, and always call ``cleanup``.

    If ``test`` did not set ``self.result``, the pipeline is run and waited
    on here.
    """
    try:
      self.test()
      result_missing = not hasattr(self, 'result')
      if result_missing:
        self.result = self.pipeline.run()
        self.result.wait_until_finish()
      self._metrics_monitor.publish_metrics(self.result)
    finally:
      self.cleanup()

  def parse_synthetic_source_options(self, options=None):
    """Translate input options into the synthetic source's option names.

    Args:
      options: Mapping of input options. When falsy, ``self.input_options``
        is used.

    Returns:
      A dict keyed by the camelCase synthetic source option names.
    """
    source = options if options else self.input_options

    bundle_size_distribution = {
        'type': source.get('bundle_size_distribution_type', 'const'),
        'param': source.get('bundle_size_distribution_param', 0),
    }
    parsed = {
        'numRecords': source.get('num_records'),
        'keySizeBytes': source.get('key_size'),
        'valueSizeBytes': source.get('value_size'),
        'hotKeyFraction': source.get('hot_key_fraction', 0),
        'numHotKeys': source.get('num_hot_keys', 0),
        'bundleSizeDistribution': bundle_size_distribution,
        'forceNumInitialBundles': source.get('force_initial_num_bundles', 0),
    }
    return parsed

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option as an integer, or the default.

    The default is returned when converting the option raises ``TypeError``
    (as ``int(None)`` does).
    """
    raw_value = self.pipeline.get_option(opt_name)
    try:
      converted = int(raw_value)
    except TypeError:
      return default
    return converted