Example #1
 def test_partition(self, test_pipeline=None):
     if not test_pipeline:
         test_pipeline = TestPipeline()
         test_pipeline.get_pipeline_options().view_as(
             DebugOptions).experiments.append(
                 'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
         test_pipeline.not_use_test_runner_api = True
     CrossLanguageTestPipelines().run_partition(test_pipeline)
 def test_combine_globally(self):
     test_pipeline = TestPipeline()
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments.append(
             'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         res = (p
                | beam.Create([1, 2, 3]).with_output_types(int)
                | beam.ExternalTransform(
                    TEST_COMGL_URN, None,
                    ValidateRunnerXlangTest.expansion_service))
         assert_that(res, equal_to([6]))
 def test_prefix(self):
     test_pipeline = TestPipeline()
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments.append(
             'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         res = (p
                | beam.Create(['a', 'b']).with_output_types(unicode)
                | beam.ExternalTransform(
                    TEST_PREFIX_URN,
                    ImplicitSchemaPayloadBuilder({'data': u'0'}),
                    ValidateRunnerXlangTest.expansion_service))
         assert_that(res, equal_to(['0a', '0b']))
 def test_partition(self):
     test_pipeline = TestPipeline()
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments.append(
             'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         res = (p
                | beam.Create([1, 2, 3, 4, 5, 6]).with_output_types(int)
                | beam.ExternalTransform(
                    TEST_PARTITION_URN, None,
                    ValidateRunnerXlangTest.expansion_service))
         assert_that(res['0'], equal_to([2, 4, 6]), label='check_even')
         assert_that(res['1'], equal_to([1, 3, 5]), label='check_odd')
 def test_flatten(self):
     test_pipeline = TestPipeline()
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments.append(
             'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
         col1 = p | 'col1' >> beam.Create([1, 2, 3]).with_output_types(int)
         col2 = p | 'col2' >> beam.Create([4, 5, 6]).with_output_types(int)
         res = ((col1, col2)
                | beam.ExternalTransform(
                    TEST_FLATTEN_URN, None,
                    ValidateRunnerXlangTest.expansion_service))
         assert_that(res, equal_to([1, 2, 3, 4, 5, 6]))
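Every test above repeats the same setup: create a TestPipeline, append a jar_packages experiment pointing at the expansion-service jar, and disable the test runner API. As a rough sketch (the helper name is hypothetical, not part of the source), that boilerplate could be factored into a single function:

 def make_xlang_test_pipeline(expansion_jar):
     # Hypothetical helper mirroring the setup repeated in the tests above:
     # a TestPipeline wired to a Java expansion service jar.
     test_pipeline = TestPipeline()
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments.append('jar_packages=' + expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     return test_pipeline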
Example #6
 def test_combine_per_key(self):
   test_pipeline = TestPipeline()
   test_pipeline.get_pipeline_options().view_as(
       DebugOptions).experiments.append(
           'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
   test_pipeline.not_use_test_runner_api = True
   with test_pipeline as p:
     res = (
         p
         | beam.Create([('a', 1), ('a', 2), ('b', 3)]).with_output_types(
             typing.Tuple[unicode, int])
         | beam.ExternalTransform(
             TEST_COMPK_URN, None, ValidateRunnerXlangTest.expansion_service))
     assert_that(res, equal_to([('a', 3), ('b', 3)]))
Example #7
    def run_pipeline(pipeline_options,
                     expansion_service_port,
                     wait_until_finish=True):
        # The actual definitions of these transforms are in
        # org.apache.beam.runners.core.construction.TestExpansionService.
        TEST_COUNT_URN = "beam:transforms:xlang:count"
        TEST_FILTER_URN = "beam:transforms:xlang:filter_less_than_eq"

        # Run a simple count-filtered-letters pipeline.
        p = TestPipeline(options=pipeline_options)

        # We use the save_main_session option because one or more DoFns in this
        # workflow rely on global context (e.g., a module imported at module
        # level).
        p.get_pipeline_options().view_as(SetupOptions).save_main_session = True

        address = 'localhost:%s' % str(expansion_service_port)
        res = (
            p
            | beam.Create(list('aaabccxyyzzz'))
            | beam.Map(unicode)
            # TODO(BEAM-6587): Use strings directly rather than ints.
            | beam.Map(lambda x: int(ord(x)))
            | beam.ExternalTransform(TEST_FILTER_URN, b'middle', address)
            | beam.ExternalTransform(TEST_COUNT_URN, None, address)
            # TODO(BEAM-6587): Remove when above is removed.
            | beam.Map(lambda kv: (chr(kv[0]), kv[1]))
            | beam.Map(lambda kv: '%s: %s' % kv))

        assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

        result = p.run()
        if wait_until_finish:
            result.wait_until_finish()
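A minimal invocation sketch for run_pipeline; the runner flag and port below are assumptions, not taken from the source:

 from apache_beam.options.pipeline_options import PipelineOptions

 # Illustrative values: pick the runner and expansion service port that
 # match your environment.
 pipeline_options = PipelineOptions(['--runner=PortableRunner'])
 run_pipeline(pipeline_options, expansion_service_port=8097)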
Example #8
 def test_group_by_key(self):
   test_pipeline = TestPipeline()
   test_pipeline.get_pipeline_options().view_as(
       DebugOptions).experiments.append(
           'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
   test_pipeline.not_use_test_runner_api = True
   with test_pipeline as p:
     res = (
         p
         | beam.Create([(0, "1"), (0, "2"),
                        (1, "3")], reshuffle=False).with_output_types(
                            typing.Tuple[int, unicode])
         | beam.ExternalTransform(
             TEST_GBK_URN, None, ValidateRunnerXlangTest.expansion_service)
         | beam.Map(lambda x: "{}:{}".format(x[0], ','.join(sorted(x[1])))))
     assert_that(res, equal_to(['0:1,2', '1:3']))
Example #9
 def test_multi_input_output_with_sideinput(self):
   test_pipeline = TestPipeline()
   test_pipeline.get_pipeline_options().view_as(
       DebugOptions).experiments.append(
           'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
   test_pipeline.not_use_test_runner_api = True
   with test_pipeline as p:
     main1 = p | 'Main1' >> beam.Create(
         ['a', 'bb'], reshuffle=False).with_output_types(unicode)
     main2 = p | 'Main2' >> beam.Create(
         ['x', 'yy', 'zzz'], reshuffle=False).with_output_types(unicode)
     side = p | 'Side' >> beam.Create(['s']).with_output_types(unicode)
     res = dict(
         main1=main1, main2=main2, side=side) | beam.ExternalTransform(
             TEST_MULTI_URN, None, ValidateRunnerXlangTest.expansion_service)
     assert_that(res['main'], equal_to(['as', 'bbs', 'xs', 'yys', 'zzzs']))
     assert_that(res['side'], equal_to(['ss']), label='CheckSide')
Example #10
 def make_test_pipeline():
     path_to_jar = subprocess_server.JavaJarServer.path_to_beam_jar(
         ":sdks:java:extensions:sql:expansion-service:shadowJar")
     test_pipeline = TestPipeline()
     # TODO(BEAM-9238): Remove this when it's no longer needed for artifact
     # staging.
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments = ['jar_packages=' + path_to_jar]
     return test_pipeline
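One plausible way to use the returned pipeline is with SqlTransform, which the resolved expansion-service jar provides; the query and rows below are illustrative:

 import apache_beam as beam
 from apache_beam.transforms.sql import SqlTransform

 # Illustrative usage, assuming the SQL expansion service jar resolved above.
 with make_test_pipeline() as p:
     _ = (
         p
         | beam.Create([beam.Row(id=1, name='a'), beam.Row(id=2, name='b')])
         | SqlTransform("SELECT id, name FROM PCOLLECTION"))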
Example #11
 def test_write(self):
   expansion_jar = os.environ.get('EXPANSION_JAR')
   port = os.environ.get('EXPANSION_PORT')
   address = 'localhost:%s' % port
   try:
     test_pipeline = TestPipeline()
     test_pipeline.get_pipeline_options().view_as(
         DebugOptions).experiments.append('jar_packages='+expansion_jar)
     test_pipeline.not_use_test_runner_api = True
     with test_pipeline as p:
       _ = p \
         | beam.Create([
             AvroRecord({"name": "abc"}), AvroRecord({"name": "def"}),
             AvroRecord({"name": "ghi"})]) \
         | beam.ExternalTransform(
             PARQUET_WRITE_URN, b'/tmp/test.parquet', address)
   except RuntimeError as e:
     if re.search(PARQUET_WRITE_URN, str(e)):
       print("looks like URN not implemented in expansion service, skipping.")
     else:
       raise e
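The test reads the expansion service location from the environment, so EXPANSION_JAR and EXPANSION_PORT must be set before it runs; the values below are placeholders only:

 import os

 # Placeholders: point these at a real expansion service jar and port.
 os.environ.setdefault('EXPANSION_JAR', '/path/to/expansion-service.jar')
 os.environ.setdefault('EXPANSION_PORT', '8097')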
Example #12
class CombineFnLifecycleTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = TestPipeline(is_integration_test=True)

    @skip_unless_v2
    def test_combine(self):
        run_combine(self.pipeline)

    @skip_unless_v2
    def test_non_liftable_combine(self):
        run_combine(self.pipeline, lift_combiners=False)

    @skip_unless_v2
    def test_combining_value_state(self):
        if ('DataflowRunner' in self.pipeline.get_pipeline_options().view_as(
                StandardOptions).runner):
            self.skipTest('BEAM-11793')
        run_pardo(self.pipeline)
Example #13
  def test_to_from_runner_api(self):
    """Tests that serialization of WriteToBigQuery is correct.

    This is not intended to be a change-detector test. As such, this only tests
    the more complicated serialization logic of parameters: ValueProviders,
    callables, and side inputs.
    """
    FULL_OUTPUT_TABLE = 'test_project:output_table'

    p = TestPipeline()

    # Used for testing side input parameters.
    table_record_pcv = beam.pvalue.AsDict(
        p | "MakeTable" >> beam.Create([('table', FULL_OUTPUT_TABLE)]))

    # Used for testing value provider parameters.
    schema = value_provider.StaticValueProvider(str, '"a:str"')

    original = WriteToBigQuery(
        table=lambda _, side_input: side_input['table'],
        table_side_inputs=(table_record_pcv, ),
        schema=schema)

    # pylint: disable=expression-not-assigned
    p | 'MyWriteToBigQuery' >> original

    # Run the pipeline through to generate a pipeline proto from an empty
    # context. This ensures that the serialization code ran.
    pipeline_proto, context = TestPipeline.from_runner_api(
        p.to_runner_api(), p.runner, p.get_pipeline_options()).to_runner_api(
            return_context=True)

    # Find the transform from the context.
    write_to_bq_id = [
        k for k, v in pipeline_proto.components.transforms.items()
        if v.unique_name == 'MyWriteToBigQuery'
    ][0]
    deserialized_node = context.transforms.get_by_id(write_to_bq_id)
    deserialized = deserialized_node.transform
    self.assertIsInstance(deserialized, WriteToBigQuery)

    # Test that the serialization of a value provider is correct.
    self.assertEqual(original.schema, deserialized.schema)

    # Test that the serialization of a callable is correct.
    self.assertEqual(
        deserialized._table(None, {'table': FULL_OUTPUT_TABLE}),
        FULL_OUTPUT_TABLE)

    # Test that the serialization of a side input is correct.
    self.assertEqual(
        len(original.table_side_inputs), len(deserialized.table_side_inputs))
    original_side_input_data = original.table_side_inputs[0]._side_input_data()
    deserialized_side_input_data = deserialized.table_side_inputs[
        0]._side_input_data()
    self.assertEqual(
        original_side_input_data.access_pattern,
        deserialized_side_input_data.access_pattern)
    self.assertEqual(
        original_side_input_data.window_mapping_fn,
        deserialized_side_input_data.window_mapping_fn)
    self.assertEqual(
        original_side_input_data.view_fn, deserialized_side_input_data.view_fn)
Example #14
class TaxirideIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.outdir = (
        self.test_pipeline.get_option('temp_location') + '/taxiride_it-' +
        str(uuid.uuid4()))
    self.output_path = os.path.join(self.outdir, 'output.csv')

  def tearDown(self):
    FileSystems.delete([self.outdir + '/'])

  @pytest.mark.it_postcommit
  def test_aggregation(self):
    taxiride.run_aggregation_pipeline(
        self.test_pipeline,
        'gs://apache-beam-samples/nyc_taxi/2018/*.csv',
        self.output_path)

    # Verify
    expected = pd.read_csv(
        os.path.join(
            os.path.dirname(__file__),
            'data',
            'taxiride_2018_aggregation_truth.csv'),
        comment='#')
    expected = expected.sort_values('DOLocationID').reset_index(drop=True)

    def read_csv(path):
      with FileSystems.open(path) as fp:
        return pd.read_csv(fp)

    result = pd.concat(
        read_csv(metadata.path) for metadata in FileSystems.match(
            [f'{self.output_path}*'])[0].metadata_list)
    result = result.sort_values('DOLocationID').reset_index(drop=True)

    pd.testing.assert_frame_equal(expected, result)

  @pytest.mark.it_postcommit
  def test_enrich(self):
    # Standard workers OOM with the enrich pipeline
    self.test_pipeline.get_pipeline_options().view_as(
        WorkerOptions).machine_type = 'e2-highmem-2'

    taxiride.run_enrich_pipeline(
        self.test_pipeline,
        'gs://apache-beam-samples/nyc_taxi/2018/*.csv',
        self.output_path)

    # Verify
    expected = pd.read_csv(
        os.path.join(
            os.path.dirname(__file__), 'data',
            'taxiride_2018_enrich_truth.csv'),
        comment='#')
    expected = expected.sort_values('Borough').reset_index(drop=True)

    def read_csv(path):
      with FileSystems.open(path) as fp:
        return pd.read_csv(fp)

    result = pd.concat(
        read_csv(metadata.path) for metadata in FileSystems.match(
            [f'{self.output_path}*'])[0].metadata_list)
    result = result.sort_values('Borough').reset_index(drop=True)

    pd.testing.assert_frame_equal(expected, result)
Example #15
class LoadTest(object):
    """Base class for all integration and performance tests which export
  metrics to external databases: BigQuery or/and InfluxDB.

  Refer to :class:`~apache_beam.testing.load_tests.LoadTestOptions` for more
  information on the required pipeline options.

  If using InfluxDB with Basic HTTP authentication enabled, provide the
  following environment options: `INFLUXDB_USER` and `INFLUXDB_USER_PASSWORD`.
  """
    def __init__(self, metrics_namespace=None):
        # Be sure to set blocking to false for timeout_ms to work properly
        self.pipeline = TestPipeline(is_integration_test=True, blocking=False)
        assert not self.pipeline.blocking

        options = self.pipeline.get_pipeline_options().view_as(LoadTestOptions)
        self.timeout_ms = options.timeout_ms
        self.input_options = options.input_options

        if metrics_namespace:
            self.metrics_namespace = metrics_namespace
        else:
            self.metrics_namespace = options.metrics_table \
              if options.metrics_table else 'default'

        publish_to_bq = options.publish_to_big_query
        if publish_to_bq is None:
            logging.info(
                'Missing --publish_to_big_query option. Metrics will not '
                'be published to BigQuery.')
        if options.input_options is None:
            logging.error('--input_options argument is required.')
            sys.exit(1)

        gcloud_options = self.pipeline.get_pipeline_options().view_as(
            GoogleCloudOptions)
        self.project_id = gcloud_options.project

        self._metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=self.project_id,
            bq_table=options.metrics_table,
            bq_dataset=options.metrics_dataset,
            namespace=self.metrics_namespace,
            influxdb_options=InfluxDBMetricsPublisherOptions(
                options.influx_measurement, options.influx_db_name,
                options.influx_hostname, os.getenv('INFLUXDB_USER'),
                os.getenv('INFLUXDB_USER_PASSWORD')),
            # Apply filter to prevent system metrics from being published
            filters=MetricsFilter().with_namespace(self.metrics_namespace))

    def test(self):
        """An abstract method where the pipeline definition should be put."""
        pass

    def cleanup(self):
        """An abstract method that executes after the test method."""
        pass

    def run(self):
        try:
            self.test()
            if not hasattr(self, 'result'):
                self.result = self.pipeline.run()
                # Defaults to waiting forever, unless timeout_ms has been set
                self.result.wait_until_finish(duration=self.timeout_ms)
            self._metrics_monitor.publish_metrics(self.result)
        finally:
            self.cleanup()

    def parse_synthetic_source_options(self, options=None):
        if not options:
            options = self.input_options
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'hotKeyFraction': options.get('hot_key_fraction', 0),
            'numHotKeys': options.get('num_hot_keys', 0),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': options.get('force_initial_num_bundles',
                                                  0)
        }

    def get_option_or_default(self, opt_name, default=0):
        """Returns a testing option or a default value if it was not provided.

    The returned value is cast to the type of the default value.
    """
        option = self.pipeline.get_option(opt_name,
                                          bool_option=type(default) == bool)
        if option is None:
            return default
        return type(default)(option)
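A minimal sketch of how this base class is typically subclassed; the class name and transforms below are illustrative, not from the source:

 import apache_beam as beam

 class ExampleLoadTest(LoadTest):
     # Illustrative subclass: run() executes the pipeline built in test()
     # and publishes its metrics afterwards.
     def __init__(self):
         super().__init__(metrics_namespace='example_load_test')

     def test(self):
         _ = (
             self.pipeline
             | beam.Create(range(self.get_option_or_default('fanout', 1000)))
             | beam.Map(lambda x: x * 2))

 if __name__ == '__main__':
     ExampleLoadTest().run()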
Example #16
class LoadTest(object):
    def __init__(self):
        self.pipeline = TestPipeline(is_integration_test=True)

        load_test_options = self.pipeline.get_pipeline_options().view_as(
            LoadTestOptions)
        self.input_options = load_test_options.input_options
        self.metrics_namespace = load_test_options.metrics_table or 'default'
        publish_to_bq = load_test_options.publish_to_big_query
        if publish_to_bq is None:
            logging.info(
                'Missing --publish_to_big_query option. Metrics will not '
                'be published to BigQuery.')
        if load_test_options.input_options is None:
            logging.error('--input_options argument is required.')
            sys.exit(1)

        gcloud_options = self.pipeline.get_pipeline_options().view_as(
            GoogleCloudOptions)
        self.project_id = gcloud_options.project

        self._metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=self.project_id,
            bq_table=load_test_options.metrics_table,
            bq_dataset=load_test_options.metrics_dataset,
            # Apply filter to prevent system metrics from being published
            filters=MetricsFilter().with_namespace(self.metrics_namespace))

    def test(self):
        """An abstract method where the pipeline definition should be put."""
        pass

    def cleanup(self):
        """An abstract method that executes after the test method."""
        pass

    def run(self):
        try:
            self.test()
            if not hasattr(self, 'result'):
                self.result = self.pipeline.run()
                self.result.wait_until_finish()
            self._metrics_monitor.publish_metrics(self.result)
        finally:
            self.cleanup()

    def parse_synthetic_source_options(self, options=None):
        if not options:
            options = self.input_options
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'hotKeyFraction': options.get('hot_key_fraction', 0),
            'numHotKeys': options.get('num_hot_keys', 0),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': options.get('force_initial_num_bundles',
                                                  0)
        }

    def get_option_or_default(self, opt_name, default=0):
        """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
        option = self.pipeline.get_option(opt_name)
        try:
            return int(option)
        except TypeError:
            return default
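For reference, parse_synthetic_source_options maps a parsed --input_options dictionary onto the synthetic source configuration; with an illustrative input it behaves roughly like this:

 # Illustrative input; the keys mirror what parse_synthetic_source_options reads.
 input_options = {'num_records': 1000, 'key_size': 10, 'value_size': 90}
 # Given this input, the method above returns:
 # {'numRecords': 1000, 'keySizeBytes': 10, 'valueSizeBytes': 90,
 #  'hotKeyFraction': 0, 'numHotKeys': 0,
 #  'bundleSizeDistribution': {'type': 'const', 'param': 0},
 #  'forceNumInitialBundles': 0}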