def test_select_partitions_calls_select_partitions_with_params(
        self, mock_select_partitions):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        select_partitions_params = aggregate_params.SelectPartitionsParams(
            max_partitions_contributed=2, budget_weight=0.5)
        partition_extractor = lambda x: f"pk:{x // 10}"

        # Act
        transformer = private_beam.SelectPartitions(
            select_partitions_params=select_partitions_params,
            partition_extractor=partition_extractor,
            label="Test select partitions")
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_select_partitions.assert_called_once()
        args = mock_select_partitions.call_args[0]
        self.assertEqual(args[1], select_partitions_params)

def test_map_returns_correct_results_and_accountant(self):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol_input = [(1, 2), (2, 3), (3, 4), (4, 5)]
        pcol = pipeline | 'Create produce' >> beam.Create(pcol_input)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformed = private_collection | private_beam.Map(
            fn=lambda x: x[1]**2)

        # Assert
        self.assertIsInstance(transformed, private_beam.PrivatePCollection)
        beam_util.assert_that(
            transformed._pcol,
            beam_util.equal_to(
                map(lambda x: (PrivateBeamTest.privacy_id_extractor(x),
                               x[1]**2), pcol_input)))
        self.assertEqual(transformed._budget_accountant, budget_accountant)

def test_flatmap_returns_correct_results_and_accountant(self):

    def flat_map_fn(x):
        return [(x[0], x[1] + i) for i in range(2)]

    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol_input = [(1, 2), (2, 3), (3, 4)]
        pcol = pipeline | 'Create produce' >> beam.Create(pcol_input)
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformed = private_collection | private_beam.FlatMap(flat_map_fn)

        # Assert
        self.assertIsInstance(transformed, private_beam.PrivatePCollection)
        beam_util.assert_that(
            transformed._pcol,
            beam_util.equal_to([('pid:(1, 2)', (1, 2)), ('pid:(1, 2)', (1, 3)),
                                ('pid:(2, 3)', (2, 3)), ('pid:(2, 3)', (2, 4)),
                                ('pid:(3, 4)', (3, 4)), ('pid:(3, 4)',
                                                         (3, 5))]))
        self.assertEqual(transformed._budget_accountant, budget_accountant)

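# The tests above reference PrivateBeamTest.privacy_id_extractor without
# showing it. A minimal sketch, reconstructed from the expected keys
# ('pid:(1, 2)', ...) in the assertions above -- an assumption, not the
# verbatim source:
class PrivateBeamTest:  # illustrative stub of the test class

    @staticmethod
    def privacy_id_extractor(x):
        # Pairs each element with a deterministic privacy-id key.
        return f'pid:{x}'
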
def create_pipeline(self):
    # Must be GRPC so we can send data and split requests concurrently
    # with the bundle process request.
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(
            default_environment=beam_runner_api_pb2.Environment(
                urn=python_urns.EMBEDDED_PYTHON_GRPC)))

def _make_beam_pipeline(self) -> beam_Pipeline:  # pytype: disable=invalid-annotation
    """Makes beam pipeline."""
    if not beam:
        raise Exception(
            'Apache Beam must be installed to use this functionality.')
    # pylint: disable=g-import-not-at-top
    from apache_beam.options.pipeline_options import DirectOptions
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.options.pipeline_options import StandardOptions
    from apache_beam.runners.portability import fn_api_runner
    # pylint: enable=g-import-not-at-top
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
        return beam.Pipeline(argv=self._beam_pipeline_args)

    # TODO(b/159468583): move this warning to Beam.
    direct_running_mode = pipeline_options.view_as(
        DirectOptions).direct_running_mode
    direct_num_workers = pipeline_options.view_as(
        DirectOptions).direct_num_workers
    if direct_running_mode == 'in_memory' and direct_num_workers != 1:
        absl.logging.warning(
            'If direct_num_workers is not equal to 1, direct_running_mode '
            'should be `multi_processing` or `multi_threading` instead of '
            '`in_memory` in order for it to have the desired worker '
            'parallelism effect.')

    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner())

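# For context: the DirectOptions flags inspected above are ordinary pipeline
# arguments. A minimal sketch of passing them (the flag values here are
# illustrative, not taken from the source):
from apache_beam.options.pipeline_options import DirectOptions, PipelineOptions

# direct_num_workers > 1 only takes effect with a running mode other than
# the default 'in_memory', which is exactly what the warning above checks.
opts = PipelineOptions([
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=4',
])
print(opts.view_as(DirectOptions).direct_num_workers)  # prints 4
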
def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    # TODO(b/142684737): refactor when beam supports multi-processing by args.
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    parallelism = pipeline_options.view_as(DirectOptions).direct_num_workers
    if parallelism == 0:
        try:
            parallelism = multiprocessing.cpu_count()
        except NotImplementedError as e:
            absl.logging.warning('Cannot get cpu count: %s' % e)
            parallelism = 1
        pipeline_options.view_as(
            DirectOptions).direct_num_workers = parallelism
    absl.logging.info('Using %d process(es) for Beam pipeline execution.' %
                      parallelism)

    if parallelism > 1:
        if beam_runner_api_pb2:
            env = beam_runner_api_pb2.Environment(
                urn=python_urns.SUBPROCESS_SDK,
                payload=b'%s -m apache_beam.runners.worker.sdk_worker_main' %
                (sys.executable or sys.argv[0]).encode('ascii'))
        else:
            env = environments.SubprocessSDKEnvironment(
                command_string='%s -m apache_beam.runners.worker.sdk_worker_main'
                % (sys.executable or sys.argv[0]))
        return beam.Pipeline(
            options=pipeline_options,
            runner=fn_api_runner.FnApiRunner(default_environment=env))

    return beam.Pipeline(argv=self._beam_pipeline_args)

def benchmarkAnalyzeAndTransformDataset(self):
    """Benchmark AnalyzeAndTransformDataset.

    Runs AnalyzeAndTransformDataset in a Beam pipeline. Records the wall time
    taken for the whole pipeline.
    """
    common_variables = _get_common_variables(self._dataset)
    pipeline = beam.Pipeline(runner=fn_api_runner.FnApiRunner())
    _ = pipeline | _AnalyzeAndTransformDataset(
        self._dataset, common_variables.tf_metadata_schema,
        common_variables.preprocessing_fn,
        common_variables.transform_input_dataset_metadata)

    start = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    end = time.time()
    delta = end - start

    self.report_benchmark(
        name=benchmark_utils.with_dataset_prefix(
            "benchmarkAnalyzeAndTransformDataset", FLAGS.dataset),
        iters=1,
        wall_time=delta,
        extras={"num_examples": self._dataset.num_examples()})

def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
        return beam.Pipeline(argv=self._beam_pipeline_args)
    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner())

def compute_on_beam():
    runner = fn_api_runner.FnApiRunner()  # local runner
    with beam.Pipeline(runner=runner) as pipeline:
        movie_views = pipeline | beam.io.ReadFromText(
            FLAGS.input_file) | beam.ParDo(ParseFile())
        pipeline_backend = pipeline_dp.BeamBackend()
        dp_result = calculate_private_result(movie_views, pipeline_backend)
        dp_result | beam.io.WriteToText(FLAGS.output_file)

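# calculate_private_result is not shown in this snippet. A minimal sketch of
# what such a function typically looks like with PipelineDP's DPEngine; the
# budget, parameter values, and extractors below are illustrative assumptions,
# not the source's implementation:
def calculate_private_result(movie_views, backend):
    budget_accountant = pipeline_dp.NaiveBudgetAccountant(
        total_epsilon=1, total_delta=1e-6)
    dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)
    params = pipeline_dp.AggregateParams(
        noise_kind=pipeline_dp.NoiseKind.LAPLACE,
        metrics=[pipeline_dp.Metrics.COUNT],
        max_partitions_contributed=2,
        max_contributions_per_partition=1)
    data_extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda mv: mv.user_id,
        partition_extractor=lambda mv: mv.movie_id,
        value_extractor=lambda mv: mv.rating)
    result = dp_engine.aggregate(movie_views, params, data_extractors)
    # Budgets must be computed before the pipeline runs.
    budget_accountant.compute_budgets()
    return result
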
def compute_on_beam():
    runner = fn_api_runner.FnApiRunner()  # local runner
    public_partitions = get_public_partitions()
    with beam.Pipeline(runner=runner) as pipeline:
        movie_views = pipeline | beam.io.ReadFromText(
            FLAGS.input_file) | beam.ParDo(ParseFile())
        pipeline_operations = pipeline_dp.BeamOperations()
        dp_result = calc_dp_rating_metrics(movie_views, pipeline_operations,
                                           public_partitions)
        dp_result | beam.io.WriteToText(FLAGS.output_file)

def run(self):
    with JobLogHandler(self._log_queue):
        try:
            fn_api_runner.FnApiRunner(
                use_grpc=self._use_grpc,
                sdk_harness_factory=self._sdk_harness_factory
            ).run_via_runner_api(self._pipeline_proto)
            self.state = beam_job_api_pb2.JobState.DONE
        except:  # pylint: disable=bare-except
            logging.exception("Error running pipeline.")
            traceback.print_exc()
            self.state = beam_job_api_pb2.JobState.FAILED

def test_dofn_lifecycle(self):
    from apache_beam.runners.direct import direct_runner
    from apache_beam.runners.portability import fn_api_runner
    runners = [
        direct_runner.BundleBasedDirectRunner(),
        fn_api_runner.FnApiRunner()
    ]
    for r in runners:
        with TestPipeline(runner=r) as p:
            _ = (p
                 | 'Start' >> beam.Create([1, 2, 3])
                 | 'Do' >> beam.ParDo(CallSequenceEnforcingDoFn()))

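# CallSequenceEnforcingDoFn is defined elsewhere in the test module. A minimal
# sketch of the pattern it checks, assuming it simply asserts that Beam invokes
# the DoFn lifecycle hooks (setup -> start_bundle -> process -> finish_bundle)
# in order; this is an illustrative reconstruction, not the verbatim source:
import apache_beam as beam

class CallSequenceEnforcingDoFn(beam.DoFn):

    def __init__(self):
        self._setup_called = False
        self._bundle_started = False

    def setup(self):
        assert not self._setup_called, 'setup should only run once'
        self._setup_called = True

    def start_bundle(self):
        assert self._setup_called, 'setup must precede start_bundle'
        self._bundle_started = True

    def process(self, element):
        assert self._bundle_started, 'start_bundle must precede process'
        yield element

    def finish_bundle(self):
        self._bundle_started = False
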
def run(self):
    with JobLogHandler(self._log_queues):
        try:
            fn_api_runner.FnApiRunner().run_via_runner_api(
                self._pipeline_proto)
            logging.info('Successfully completed job.')
            self.state = beam_job_api_pb2.JobState.DONE
        except:  # pylint: disable=bare-except
            # logging.exception already records the active stack trace.
            logging.exception('Error running pipeline.')
            self.state = beam_job_api_pb2.JobState.FAILED
            raise

def _run_job(self):
    self.set_state(beam_job_api_pb2.JobState.RUNNING)
    with JobLogHandler(self._log_queues):
        try:
            result = fn_api_runner.FnApiRunner(
                provision_info=self._provision_info).run_via_runner_api(
                    self._pipeline_proto)
            _LOGGER.info('Successfully completed job.')
            self.set_state(beam_job_api_pb2.JobState.DONE)
            self.result = result
        except:  # pylint: disable=bare-except
            # _LOGGER.exception already records the active stack trace.
            _LOGGER.exception('Error running pipeline.')
            self.set_state(beam_job_api_pb2.JobState.FAILED)
            raise

def main(unused_argv):
    # Setup Beam

    # Here, we use a local Beam runner.
    # For a truly distributed calculation, connect to a Beam cluster (e.g.
    # running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:
        # Define the privacy budget available for our computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=1e-6)

        # Load and parse input data
        df = pd.read_csv(FLAGS.input_file)
        df.rename(inplace=True,
                  columns={
                      'VisitorId': 'user_id',
                      'Time entered': 'enter_time',
                      'Time spent (minutes)': 'spent_minutes',
                      'Money spent (euros)': 'spent_money',
                      'Day': 'day'
                  })
        restaraunt_visits_rows = [index_row[1] for index_row in df.iterrows()]
        beam_data = pipeline | beam.Create(restaraunt_visits_rows)

        # Wrap Beam's PCollection into its private version
        private_restaraunt_visits = beam_data | private_beam.MakePrivate(
            budget_accountant=budget_accountant,
            privacy_id_extractor=lambda row: row.user_id)

        # Calculate the private sum
        dp_result = private_restaraunt_visits | private_beam.Sum(
            SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                      max_partitions_contributed=7,
                      max_contributions_per_partition=2,
                      min_value=1,
                      max_value=100,
                      budget_weight=1,
                      public_partitions=None,
                      partition_extractor=lambda row: row.day,
                      value_extractor=lambda row: row.spent_money))
        budget_accountant.compute_budgets()

        # Save the results
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0

def benchmarkMiniPipelineBatched(self):
    """Benchmark a batched "mini" TFMA - predict, slice and compute metrics.

    Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
    taken for the whole pipeline.
    """
    self._init_model()
    pipeline = beam.Pipeline(runner=fn_api_runner.FnApiRunner())
    tfx_io = test_util.InMemoryTFExampleRecord(
        schema=benchmark_utils.read_schema(
            self._dataset.tf_metadata_schema_path()),
        raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    raw_data = (
        pipeline
        | "Examples" >> beam.Create(
            self._dataset.read_raw_dataset(
                deserialize=False, limit=MAX_NUM_EXAMPLES))
        | "BatchExamples" >> tfx_io.BeamSource()
        | "InputsToExtracts" >> tfma.BatchedInputsToExtracts())
    _ = (
        raw_data
        | "BatchedInputExtractor" >> batched_input_extractor
        .BatchedInputExtractor(eval_config=self._eval_config).ptransform
        | "V2BatchedPredictExtractor" >>
        batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform
        | "UnbatchExtractor" >> unbatch_extractor.UnbatchExtractor().ptransform
        | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform
        | "V2ComputeMetricsAndPlots" >>
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform)

    start = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    end = time.time()
    delta = end - start

    self.report_benchmark(
        iters=1,
        wall_time=delta,
        extras={
            "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
        })

def main(unused_argv):
    # Setup Beam

    # Here, we use a local Beam runner.
    # For a truly distributed calculation, connect to a Beam cluster (e.g.
    # running on some cloud provider).
    runner = fn_api_runner.FnApiRunner()  # Local Beam runner
    with beam.Pipeline(runner=runner) as pipeline:
        # Define the privacy budget available for our computation.
        budget_accountant = pipeline_dp.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=1e-6)

        # Load and parse input data
        movie_views_pcol = pipeline | \
            beam.io.ReadFromText(FLAGS.input_file) | \
            beam.ParDo(ParseFile())

        # Wrap Beam's PCollection into its private version
        private_movie_views = (movie_views_pcol
                               | 'Create private collection' >> MakePrivate(
                                   budget_accountant=budget_accountant,
                                   privacy_id_extractor=lambda mv: mv.user_id))

        # Calculate the private sum
        dp_result = private_movie_views | "Private Sum" >> private_beam.Sum(
            SumParams(
                # Limits to how much one user can contribute:
                # .. at most two movies rated per user
                max_partitions_contributed=2,
                # .. at most one rating for each movie
                max_contributions_per_partition=1,
                # .. with minimal rating of "1"
                min_value=1,
                # .. and maximum rating of "5"
                max_value=5,
                # The aggregation key: we're grouping data by movies
                partition_extractor=lambda mv: mv.movie_id,
                # The value we're aggregating: we're summing up ratings
                value_extractor=lambda mv: mv.rating))
        budget_accountant.compute_budgets()

        # Save the results
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0

def test_sum_calls_aggregate_with_params(self, mock_aggregate):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            float(i) for i in range(1, 7))
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        sum_params = aggregate_params.SumParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            max_contributions_per_partition=3,
            min_value=1,
            max_value=5,
            budget_weight=1,
            public_partitions=[],
            partition_extractor=lambda x: f"pk:{x // 10}",
            value_extractor=lambda x: x)

        # Act
        transformer = private_beam.Sum(sum_params=sum_params)
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.SUM],
            max_partitions_contributed=sum_params.max_partitions_contributed,
            max_contributions_per_partition=sum_params
            .max_contributions_per_partition,
            min_value=sum_params.min_value,
            max_value=sum_params.max_value,
            public_partitions=sum_params.public_partitions)
        self.assertEqual(params, args[1])

def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
        return beam.Pipeline(argv=self._beam_pipeline_args)

    # TODO(b/159468583): move this warning to Beam.
    direct_running_mode = pipeline_options.view_as(
        DirectOptions).direct_running_mode
    direct_num_workers = pipeline_options.view_as(
        DirectOptions).direct_num_workers
    if direct_running_mode == 'in_memory' and direct_num_workers != 1:
        absl.logging.warning(
            'If direct_num_workers is not equal to 1, direct_running_mode '
            'should be `multi_processing` or `multi_threading` instead of '
            '`in_memory` in order for it to have the desired worker '
            'parallelism effect.')

    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner())

def test_make_private_transform_succeeds(self):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)

        # Act
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Assert
        self.assertIsInstance(private_collection,
                              private_beam.PrivatePCollection)
        self.assertEqual(private_collection._budget_accountant,
                         budget_accountant)

def test_transform_with_return_anonymized_enabled_returns_pcollection(self):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act
        transformed = private_collection | SimplePrivatePTransform(
            return_anonymized=True)

        # Assert
        self.assertIsInstance(transformed, pvalue.PCollection)

def benchmarkMiniPipelineUnbatched(self):
    """Benchmark an unbatched "mini" TFMA - predict, slice and compute metrics.

    Runs a "mini" version of TFMA in a Beam pipeline. Records the wall time
    taken for the whole pipeline.
    """
    self._init_model()
    pipeline = beam.Pipeline(runner=fn_api_runner.FnApiRunner())
    raw_data = (
        pipeline
        | "Examples" >> beam.Create(
            self._dataset.read_raw_dataset(
                deserialize=False, limit=MAX_NUM_EXAMPLES))
        | "InputsToExtracts" >> tfma.InputsToExtracts())
    _ = (
        raw_data
        | "InputExtractor" >> input_extractor.InputExtractor(
            eval_config=self._eval_config).ptransform
        | "V2PredictExtractor" >> predict_extractor_v2.PredictExtractor(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform
        | "SliceKeyExtractor" >> tfma.extractors.SliceKeyExtractor().ptransform
        | "V2ComputeMetricsAndPlots" >>
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=self._eval_config,
            eval_shared_model=self._eval_shared_model).ptransform)

    start = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    end = time.time()
    delta = end - start

    self.report_benchmark(
        iters=1,
        wall_time=delta,
        extras={
            "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
        })

def test_private_collection_with_non_private_transform_throws_error(self):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        # Act and Assert
        with self.assertRaises(TypeError) as context:
            (private_collection | 'Non private transform on '
             'PrivatePCollection' >> beam.Map(lambda x: x))
        self.assertIsInstance(private_collection,
                              private_beam.PrivatePCollection)
        self.assertTrue("private_transform should be of type "
                        "PrivatePTransform but is " in str(context.exception))

def test_privacy_id_count_calls_aggregate_with_params(self, mock_aggregate):
    runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=runner) as pipeline:
        # Arrange
        pcol = pipeline | 'Create produce' >> beam.Create(
            [1, 2, 3, 4, 5, 6])
        budget_accountant = budget_accounting.NaiveBudgetAccountant(
            total_epsilon=1, total_delta=0.01)
        private_collection = (
            pcol | 'Create private collection' >> private_beam.MakePrivate(
                budget_accountant=budget_accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

        privacy_id_count_params = aggregate_params.PrivacyIdCountParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            max_partitions_contributed=2,
            budget_weight=1,
            partition_extractor=lambda x: f"pk:{x // 10}")

        # Act
        transformer = private_beam.PrivacyIdCount(
            privacy_id_count_params=privacy_id_count_params)
        private_collection | transformer

        # Assert
        self.assertEqual(transformer._budget_accountant, budget_accountant)
        mock_aggregate.assert_called_once()
        args = mock_aggregate.call_args[0]

        params = pipeline_dp.AggregateParams(
            noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
            metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
            max_partitions_contributed=privacy_id_count_params
            .max_partitions_contributed,
            max_contributions_per_partition=1,
            public_partitions=privacy_id_count_params.public_partitions)
        self.assertEqual(args[1], params)

def create_pipeline(self):
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(use_runner_protos=False))

def create_pipeline(self):
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(use_grpc=False, bundle_repeat=3))

def create_pipeline(self):
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(
            use_grpc=True,
            sdk_harness_factory=functools.partial(
                sdk_worker.SdkHarness, worker_count=2)))

def create_pipeline(self):
    return beam.Pipeline(runner=fn_api_runner.FnApiRunner(use_grpc=True))

def create_pipeline(self):
    return beam.Pipeline(
        runner=fn_api_runner.FnApiRunner(
            default_environment=beam_runner_api_pb2.Environment(
                urn=python_urns.EMBEDDED_PYTHON_GRPC, payload=b'2')))

def create_pipeline(self):
    return beam.Pipeline(runner=fn_api_runner.FnApiRunner())
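
# The create_pipeline variants above are test fixtures. A minimal sketch of
# how such a fixture is typically exercised; the transform and expected
# values are illustrative, not from the source:
import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

def test_simple_map(self):
    # Runs a trivial transform through whichever FnApiRunner configuration
    # the fixture provides; assert_that is checked when the pipeline exits
    # the context manager.
    with self.create_pipeline() as p:
        result = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x + 1)
        assert_that(result, equal_to([2, 3, 4]))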