def test_metric_filter_step_matching(self):
  name = MetricName('ns1', 'name1')
  filter = MetricsFilter().with_step('Step1')

  key = MetricKey('Step1', name)
  self.assertTrue(MetricResults.matches(filter, key))

  key = MetricKey('Step10', name)
  self.assertFalse(MetricResults.matches(filter, key))

  key = MetricKey('Step10/Step1', name)
  self.assertTrue(MetricResults.matches(filter, key))

  key = MetricKey('Top1/Outer1/Inner1', name)

  filter = MetricsFilter().with_step('Top1/Outer1/Inner1')
  self.assertTrue(MetricResults.matches(filter, key))

  filter = MetricsFilter().with_step('Top1/Outer1')
  self.assertTrue(MetricResults.matches(filter, key))

  filter = MetricsFilter().with_step('Outer1/Inner1')
  self.assertTrue(MetricResults.matches(filter, key))

  filter = MetricsFilter().with_step('Top1/Inner1')
  self.assertFalse(MetricResults.matches(filter, key))
def word_count_with_metrics(text_input, dataflow_pipeline, text_output=beam_output):
  word_count(
      text_input=text_input,
      text_output=text_output,
      dataflow_pipeline=dataflow_pipeline,
  )
  result = dataflow_pipeline.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, "has_job")  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name("empty_lines")
    query_result = result.metrics().query(empty_lines_filter)
    if query_result["counters"]:
      empty_lines_counter = query_result["counters"][0]
      logging.info("number of empty lines: %d", empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name("word_len_dist")
    query_result = result.metrics().query(word_lengths_filter)
    if query_result["distributions"]:
      word_lengths_dist = query_result["distributions"][0]
      logging.info("average word length: %d", word_lengths_dist.result.mean)
def test_counted_metrics(self):
  pipeline = TestPipeline()
  examples = [1, 5, 3, 10]
  pcoll = pipeline | 'start' >> beam.Create(examples)
  _ = pcoll | base.RunInference(FakeModelHandler())
  run_result = pipeline.run()
  run_result.wait_until_finish()

  metric_results = (run_result.metrics().query(
      MetricsFilter().with_name('num_inferences')))
  num_inferences_counter = metric_results['counters'][0]
  self.assertEqual(num_inferences_counter.committed, 4)

  inference_request_batch_size = run_result.metrics().query(
      MetricsFilter().with_name('inference_request_batch_size'))
  self.assertTrue(inference_request_batch_size['distributions'])
  self.assertEqual(
      inference_request_batch_size['distributions'][0].result.sum, 4)

  inference_request_batch_byte_size = run_result.metrics().query(
      MetricsFilter().with_name('inference_request_batch_byte_size'))
  self.assertTrue(inference_request_batch_byte_size['distributions'])
  self.assertGreaterEqual(
      inference_request_batch_byte_size['distributions'][0].result.sum,
      len(pickle.dumps(examples)))

  inference_request_batch_byte_size = run_result.metrics().query(
      MetricsFilter().with_name('model_byte_size'))
  self.assertTrue(inference_request_batch_byte_size['distributions'])
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  # Actually run the pipeline (all operations above are deferred).
  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.committed)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.committed.mean)
def test_model_use_and_query_metrics(self):
  """DebuggingWordCount example snippets."""
  import re

  p = TestPipeline()  # Use TestPipeline for testing.
  words = p | beam.Create(
      ['albert', 'sam', 'mark', 'sarah', 'swati', 'daniel', 'andrea'])

  # pylint: disable=unused-variable
  # [START metrics_usage_example]
  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regex."""
    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics to count unmatched words, and know the distribution of
      # word lengths in the input PCollection.
      self.word_len_dist = Metrics.distribution(self.__class__,
                                                'word_len_dist')
      self.unmatched_words = Metrics.counter(self.__class__,
                                             'unmatched_words')

    def process(self, element):
      word = element
      self.word_len_dist.update(len(word))
      if re.match(self.pattern, word):
        yield element
      else:
        self.unmatched_words.inc()

  filtered_words = (
      words | 'FilterText' >> beam.ParDo(FilterTextFn('s.*')))
  # [END metrics_usage_example]
  # pylint: enable=unused-variable

  # [START metrics_check_values_example]
  result = p.run()
  result.wait_until_finish()

  custom_distribution = result.metrics().query(
      MetricsFilter().with_name('word_len_dist'))['distributions']
  custom_counter = result.metrics().query(
      MetricsFilter().with_name('unmatched_words'))['counters']

  if custom_distribution:
    logging.info('The average word length was %d',
                 custom_distribution[0].committed.mean)
  if custom_counter:
    logging.info('There were %d words that did not match the filter.',
                 custom_counter[0].committed)
  # [END metrics_check_values_example]

  # There should be 4 words that did not match
  self.assertEqual(custom_counter[0].committed, 4)
  # The shortest word is 3 characters, the longest is 6
  self.assertEqual(custom_distribution[0].committed.min, 3)
  self.assertEqual(custom_distribution[0].committed.max, 6)
def run(argv=None, save_main_session=True):
  '''Main entry point; defines and runs the wordcount pipeline.'''
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(
      SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  processed_users = (lines
                     | 'splits' >> beam.Map(split_and_lower)
                     | 'noNum' >> beam.Map(no_num_format)
                     | 'formatOut' >> beam.Map(format_output))

  processed_users | 'uniqueUser' >> beam.Distinct(
  ) | 'writeUnique' >> WriteToText(known_args.output, file_name_suffix='.csv')

  schema = avro.schema.parse(open("user.avsc", "rb").read())
  processed_users | 'avro_write' >> beam.io.avroio.WriteToAvro(
      'output_avro', schema, file_name_suffix='.avro')

  reader = DataFileReader(open("output_avro-00000-of-00001.avro", "rb"),
                          DatumReader())
  for user in reader:
    print(user)
  reader.close()

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
def test_user_counter_using_pardo(self):
  class SomeDoFn(beam.DoFn):
    """A custom dummy DoFn using yield."""
    static_counter_elements = metrics.Metrics.counter(
        "SomeDoFn", 'metrics_static_counter_element')

    def __init__(self):
      self.user_counter_elements = metrics.Metrics.counter(
          self.__class__, 'metrics_user_counter_element')

    def process(self, element):
      self.static_counter_elements.inc(2)
      self.user_counter_elements.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      yield element

  pipeline = TestPipeline()
  nums = pipeline | 'Input' >> beam.Create([1, 2, 3, 4])
  results = nums | 'ApplyPardo' >> beam.ParDo(SomeDoFn())
  assert_that(results, equal_to([1, 2, 3, 4]))

  res = pipeline.run()
  res.wait_until_finish()

  # Verify static counter.
  metric_results = (res.metrics().query(MetricsFilter().with_metric(
      SomeDoFn.static_counter_elements)))
  outputs_static_counter = metric_results['counters'][0]

  self.assertEqual(outputs_static_counter.key.metric.name,
                   'metrics_static_counter_element')
  self.assertEqual(outputs_static_counter.committed, 8)

  # Verify user counter.
  metric_results = (res.metrics().query(
      MetricsFilter().with_name('metrics_user_counter_element')))
  outputs_user_counter = metric_results['counters'][0]

  self.assertEqual(outputs_user_counter.key.metric.name,
                   'metrics_user_counter_element')
  self.assertEqual(outputs_user_counter.committed, 4)

  # Verify user distribution counter.
  metric_results = res.metrics().query()
  matcher = MetricResultMatcher(
      step='ApplyPardo',
      namespace=hc.contains_string('SomeDoFn'),
      name='element_dist',
      committed=DistributionMatcher(
          sum_value=hc.greater_than_or_equal_to(0),
          count_value=hc.greater_than_or_equal_to(0),
          min_value=hc.greater_than_or_equal_to(0),
          max_value=hc.greater_than_or_equal_to(0)))
  hc.assert_that(metric_results['distributions'],
                 hc.contains_inanyorder(matcher))
def test_metric_filter_name_matching(self):
  filter = MetricsFilter().with_name('name1').with_namespace('ns1')
  name = MetricName('ns1', 'name1')
  key = MetricKey('step1', name)
  self.assertTrue(MetricResults.matches(filter, key))

  filter = MetricsFilter().with_name('name1')
  name = MetricName('ns1', 'name1')
  key = MetricKey('step1', name)
  self.assertTrue(MetricResults.matches(filter, key))
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  known_args = PipelineOptions().view_as(WordcountOptions)
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
def get_distributions_metric(result, counter_name):
  metrics_filter = MetricsFilter().with_name(counter_name)
  query_result = result.metrics().query(metrics_filter)
  if query_result['distributions']:
    return query_result['distributions'][0].committed
  else:
    return None
def test_user_counter_using_pardo(self):
  class SomeDoFn(beam.DoFn):
    """A custom dummy DoFn using yield."""
    def __init__(self):
      self.user_counter_elements = metrics.Metrics.counter(
          self.__class__, 'metrics_user_counter_element')

    def process(self, element):
      self.user_counter_elements.inc()
      yield element

  pipeline = TestPipeline()
  nums = pipeline | 'Input' >> beam.Create([1, 2, 3, 4])
  results = nums | 'ApplyPardo' >> beam.ParDo(SomeDoFn())
  assert_that(results, equal_to([1, 2, 3, 4]))

  res = pipeline.run()
  res.wait_until_finish()

  metric_results = (res.metrics().query(
      MetricsFilter().with_name('metrics_user_counter_element')))
  outputs_counter = metric_results['counters'][0]
  self.assertEqual(outputs_counter.key.metric.name,
                   'metrics_user_counter_element')
  self.assertEqual(outputs_counter.committed, 4)
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input', dest='input', help='Input file to process')
  parser.add_argument('--output', dest='output',
                      help='Output file to write results')
  args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  lines = p | 'read' >> ReadFromText(args.input)

  words = (lines
           | 'split' >> beam.ParDo(WordExtractingDoFn())
           | 'map' >> beam.Map(lambda x: (x, 1))
           | 'group' >> beam.GroupByKey()
           | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  words | 'write' >> WriteToText(args.output)

  result = p.run()
  result.wait_until_finish()

  if not hasattr(result, 'has_job'):
    words = MetricsFilter().with_name('words')
    query_result = result.metrics().query(words)
    print(query_result['counters'][0].result)
def test_spanner_write_mutation_groups(self, mock_batch_snapshot_class,
                                       mock_batch_checkout):
  ks = spanner.KeySet(keys=[[1233], [1234]])
  mutation_groups = [
      MutationGroup([
          WriteMutation.insert("roles", ("key", "rolename"),
                               [('9001233', "mutations-inset-1233")]),
          WriteMutation.insert("roles", ("key", "rolename"),
                               [('9001234', "mutations-inset-1234")])
      ]),
      MutationGroup([
          WriteMutation.update(
              "roles", ("key", "rolename"),
              [('9001234', "mutations-inset-9001233-updated")])
      ]),
      MutationGroup([WriteMutation.delete("roles", ks)])
  ]

  p = TestPipeline()
  _ = (p
       | beam.Create(mutation_groups)
       | WriteToSpanner(project_id=TEST_PROJECT_ID,
                        instance_id=TEST_INSTANCE_ID,
                        database_id=_generate_database_name(),
                        max_batch_size_bytes=100))
  res = p.run()
  res.wait_until_finish()

  metric_results = res.metrics().query(
      MetricsFilter().with_name('SpannerBatches'))
  batches_counter = metric_results['counters'][0]

  self.assertEqual(batches_counter.committed, 3)
  self.assertEqual(batches_counter.attempted, 3)
def test_bigtable_write(self):
  number = self.number
  pipeline_args = self.test_pipeline.options_list
  pipeline_options = PipelineOptions(pipeline_args)

  with beam.Pipeline(options=pipeline_options) as pipeline:
    config_data = {
        'project_id': self.project,
        'instance_id': self.instance,
        'table_id': self.table
    }
    _ = (pipeline
         | 'Generate Direct Rows' >> GenerateTestRows(number, **config_data))

  assert pipeline.result.state == PipelineState.DONE

  read_rows = self.table.read_rows()
  assert len([_ for _ in read_rows]) == number

  if not hasattr(pipeline.result, 'has_job') or pipeline.result.has_job:
    read_filter = MetricsFilter().with_name('Written Row')
    query_result = pipeline.result.metrics().query(read_filter)

    if query_result['counters']:
      read_counter = query_result['counters'][0]
      logging.info('Number of Rows: %d', read_counter.committed)
      assert read_counter.committed == number
def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None,
                  publish_to_bq=None,
                  metrics_dataset=None,
                  metrics_table=None,
                  project=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery
    for_eval: Query for eval set rows from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
  """
  namespace = metrics_table
  pipeline = beam.Pipeline(argv=pipeline_args)
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace),
    )

  query = taxi.make_sql(
      table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(namespace))
      | 'ConvertToTFDVInput' >> beam.Map(
          lambda x: {key: np.asarray([x[key]])
                     for key in x if x[key] is not None}))

  _ = (
      raw_data
      | 'GenerateStatistics' >> tfdv.GenerateStatistics()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(namespace))
      | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
          stats_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList)))

  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
def main(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument('project_id', help='Google Cloud project ID')
  parser.add_argument('subscription_name', help='Pub/Sub subscription name')
  known_args, pipeline_args = parser.parse_known_args(argv)

  dataflow_sub(known_args.project_id, known_args.subscription_name)

  p = build_pipeline(
      project_id=known_args.project_id,
      input_subscription=known_args.subscription_name,
      output_subscription=known_args.output,
      pipeline_args=pipeline_args,
  )

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
def get_pipeline_metric(results, metric_name, index=0, result_type='counters'):
  metric_filter = MetricsFilter().with_name(metric_name)
  query_result = results.metrics().query(metric_filter)
  try:
    return query_result[result_type][index].committed
  except IndexError:
    logging.info('No key in metrics for %s at index %s, returning 0',
                 metric_name, index)
    return 0
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument('--kind', dest='kind', required=True,
                      help='Datastore Kind')
  parser.add_argument('--namespace', dest='namespace',
                      help='Datastore Namespace')
  parser.add_argument('--ancestor', dest='ancestor', default='root',
                      help='The ancestor key name for all entities.')
  parser.add_argument('--output', dest='output', required=True,
                      help='Output file to write results to.')
  parser.add_argument('--read_only', action='store_true',
                      help='Read an existing dataset, do not write first')
  parser.add_argument(
      '--num_shards',
      dest='num_shards',
      type=int,
      # If the system should choose automatically.
      default=0,
      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

  # Write to Datastore if the `read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(gcloud_options.project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(gcloud_options.project, known_args,
                               pipeline_options)

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
def testTelemetry(self, decode_examples: bool):
  example_path = self._get_output_data_dir('examples')
  self._prepare_multihead_examples(example_path)
  model_path = self._get_output_data_dir('model')
  self._build_multihead_model(model_path)
  inference_spec_type = model_spec_pb2.InferenceSpecType(
      saved_model_spec=model_spec_pb2.SavedModelSpec(
          model_path=model_path, signature_name=['classify_sum']))
  pipeline = self._make_beam_pipeline()
  _ = (
      pipeline
      | 'ReadExamples' >> beam.io.ReadFromTFRecord(example_path)
      | 'MaybeDecode' >> beam.Map(
          lambda x: x if decode_examples else tf.train.Example.FromString(x))
      | 'RunInference' >> run_inference.RunInferenceImpl(inference_spec_type))
  run_result = pipeline.run()
  run_result.wait_until_finish()

  num_inferences = run_result.metrics().query(
      MetricsFilter().with_name('num_inferences'))
  self.assertTrue(num_inferences['counters'])
  self.assertEqual(num_inferences['counters'][0].result, 2)

  inference_request_batch_size = run_result.metrics().query(
      MetricsFilter().with_name('inference_request_batch_size'))
  self.assertTrue(inference_request_batch_size['distributions'])
  self.assertEqual(
      inference_request_batch_size['distributions'][0].result.sum, 2)

  inference_request_batch_byte_size = run_result.metrics().query(
      MetricsFilter().with_name('inference_request_batch_byte_size'))
  self.assertTrue(inference_request_batch_byte_size['distributions'])
  self.assertEqual(
      inference_request_batch_byte_size['distributions'][0].result.sum,
      sum(element.ByteSize() for element in self._multihead_examples))

  inference_batch_latency_micro_secs = run_result.metrics().query(
      MetricsFilter().with_name('inference_batch_latency_micro_secs'))
  self.assertTrue(inference_batch_latency_micro_secs['distributions'])
  self.assertGreaterEqual(
      inference_batch_latency_micro_secs['distributions'][0].result.sum, 0)

  load_model_latency_milli_secs = run_result.metrics().query(
      MetricsFilter().with_name('load_model_latency_milli_secs'))
  self.assertTrue(load_model_latency_milli_secs['distributions'])
  self.assertGreaterEqual(
      load_model_latency_milli_secs['distributions'][0].result.sum, 0)
def test_direct_runner_metrics(self):
  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      gauge = Metrics.gauge(self.__class__, 'latest_element')
      gauge.set(element)
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  p = Pipeline(DirectRunner())
  pcoll = (p
           | beam.Create([1, 2, 3, 4, 5], reshuffle=False)
           | 'Do' >> beam.ParDo(MyDoFn()))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()

  metrics = result.metrics().query(MetricsFilter().with_step('Do'))
  namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

  gauge_result = metrics['gauges'][0]
  hc.assert_that(
      gauge_result.key,
      hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
  hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
  hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
def run(input_file, output_file):
  """Main entry point; defines and runs the wordcount pipeline."""
  options = PipelineOptions()
  options.view_as(StandardOptions).runner = 'DirectRunner'
  p = beam.Pipeline(options=options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(input_file)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(output_file)

  # Actually run the pipeline (all operations above are deferred).
  result = p.run()
  result.wait_until_finish()

  word_lengths_filter = MetricsFilter().with_name('word_len_dist')
  query_result = result.metrics().query(word_lengths_filter)
  if query_result['distributions']:
    word_lengths_dist = query_result['distributions'][0]
    print 'average word length: %d' % word_lengths_dist.committed.mean

  num_words_filter = MetricsFilter().with_name('num_words')
  query_result = result.metrics().query(num_words_filter)
  if query_result['counters']:
    total_words = query_result['counters'][0]
    print 'Number of total words: ' + str(total_words.committed)
def test_timing_metrics(self):
  pipeline = TestPipeline()
  examples = [1, 5, 3, 10]
  pcoll = pipeline | 'start' >> beam.Create(examples)
  fake_clock = FakeClock()
  _ = pcoll | base.RunInference(
      FakeModelHandler(clock=fake_clock), clock=fake_clock)
  res = pipeline.run()
  res.wait_until_finish()

  metric_results = (res.metrics().query(
      MetricsFilter().with_name('inference_batch_latency_micro_secs')))
  batch_latency = metric_results['distributions'][0]
  self.assertEqual(batch_latency.result.count, 3)
  self.assertEqual(batch_latency.result.mean, 3000)

  metric_results = (res.metrics().query(
      MetricsFilter().with_name('load_model_latency_milli_secs')))
  load_model_latency = metric_results['distributions'][0]
  self.assertEqual(load_model_latency.result.count, 1)
  self.assertEqual(load_model_latency.result.mean, 500)
def get_counter_values(pipeline_result, names, wait_until_finish=True):
  if wait_until_finish:
    pipeline_result.wait_until_finish()

  counter_values = dict()
  for name in names:
    counter = pipeline_result.metrics().query(
        MetricsFilter().with_name(name))['counters']
    assert len(counter) <= 1
    if len(counter) == 1:
      counter_values[name] = counter[0].committed
  return counter_values
def testWriteSplitCounter(self):
  count = 10

  def Pipeline(root):
    data = [tf.train.Example()] * count
    _ = (root
         | beam.Create(data)
         | base_example_gen_executor._WriteSplit(self._output_data_dir))

  run_result = direct_runner.DirectRunner().run(Pipeline)
  run_result.wait_until_finish()

  num_instances = run_result.metrics().query(
      MetricsFilter().with_name('num_instances'))
  self.assertTrue(num_instances['counters'])
  self.assertEqual(len(num_instances['counters']), 1)
  self.assertEqual(num_instances['counters'][0].result, count)
def testWriteSplitCounter_WithTFRECORDS_GZIP(self):
  count = 10

  def Pipeline(root):
    data = [tf.train.Example()] * count
    _ = (root
         | beam.Create(data)
         | write_split.WriteSplit(self._output_data_dir,
                                  example_gen_pb2.FORMAT_TFRECORDS_GZIP))

  run_result = direct_runner.DirectRunner().run(Pipeline)
  run_result.wait_until_finish()

  num_instances = run_result.metrics().query(
      MetricsFilter().with_name('num_instances'))

  self.assertTrue(
      fileio.exists(
          os.path.join(self._output_data_dir,
                       'data_tfrecord-00000-of-00001.gz')))
  self.assertTrue(num_instances['counters'])
  self.assertEqual(len(num_instances['counters']), 1)
  self.assertEqual(num_instances['counters'][0].result, count)
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
    publish_to_bq:
    project:
    metrics_dataset:
    metrics_table:

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """

  if big_query_table is None:
    raise ValueError('--big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace))

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >> beam.Map(
          lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)

  # Prepare arguments for Extract, Evaluate and Write steps
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)
  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)

  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(metrics_namespace)))

  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Ensure that the experiment flag is set explicitly by the user.
  debug_options = pipeline_options.view_as(DebugOptions)
  use_fn_api = (
      debug_options.experiments and 'beam_fn_api' in debug_options.experiments)
  assert use_fn_api, 'Enable the beam_fn_api experiment in order to run this example.'

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group_and_sum' >> beam.CombinePerKey(sum))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %s' % (word, count)

  # pylint: disable=unused-variable
  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  # TODO(BEAM-2887): Enable after the issue is fixed.
  # output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.committed)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.committed.mean)
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  namespace = metrics_table
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace))

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  pipeline = beam.Pipeline(argv=pipeline_args)
  with tft_beam.Context(temp_dir=working_dir):
    query = taxi.make_sql(input_handle, max_rows, for_eval=False)
    raw_data = (
        pipeline
        | 'ReadBigQuery' >> ReadFromBigQuery(
            query=query, project=project, use_standard_sql=True)
        | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
    decode_transform = beam.Map(
        taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

    if transform_dir is None:
      decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
      transform_fn = (
          (decoded_data, raw_data_metadata) |
          ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

      _ = (
          transform_fn |
          ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
    else:
      transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

    # Shuffling the data before materialization will improve Training
    # effectiveness downstream. Here we shuffle the raw_data (as opposed to
    # decoded data) since it has a compact representation.
    shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

    decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
    (transformed_data, transformed_metadata) = (
        ((decoded_data, raw_data_metadata), transform_fn) |
        'Transform' >> tft_beam.TransformDataset())

    coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
    _ = (
        transformed_data
        | 'SerializeExamples' >> beam.Map(coder.encode)
        | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
        | 'WriteExamples' >> beam.io.WriteToTFRecord(
            os.path.join(working_dir, outfile_prefix),
            file_name_suffix='.gz'))

    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
      metrics_monitor.publish_metrics(result)
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument('--output', dest='output', required=True,
                      help='Output file to write results to.')
  parser.add_argument('--format', dest='format', default='text',
                      help='Supported output file formats: %s.' % FORMATS)
  known_args, pipeline_args = parser.parse_known_args(argv)

  if known_args.format not in FORMATS:
    raise ValueError('--format should be one of: %s' % FORMATS)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_text(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  # Format the counts into a PCollection of dictionary strings.
  def format_dict(word_count):
    (word, count) = word_count
    row = dict(zip(HEADER, [word, count]))
    return row

  if known_args.format == 'text':
    output = counts | 'format text' >> beam.Map(format_text)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write text' >> WriteToText(known_args.output)
  elif known_args.format == 'avro':
    output = counts | 'format avro' >> beam.Map(format_dict)
    schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write avro' >> WriteToAvro(
        file_path_prefix=known_args.output,
        schema=schema,
        codec=DEFAULT_CODEC)
  else:
    output = counts | 'format parquet' >> beam.Map(format_dict)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write parquet' >> WriteToParquet(
        file_path_prefix=known_args.output,
        schema=PARQUET_SCHEMA,
        codec=DEFAULT_CODEC)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
        if y < 200.0:
          self.word_counter.set(y)
        yield element


class Print_Row(beam.DoFn):
  def process(self, element):
    print(element)


# Running locally in the DirectRunner.
with beam.Pipeline() as pipeline:
  (pipeline
   | 'Read lines' >> beam.io.ReadFromText(file_in)
   | 'Par Do' >> beam.ParDo(Head())
   | 'Par D1' >> beam.ParDo(Split())
   | 'Par D2' >> beam.ParDo(Filter())
   | 'Par D3' >> beam.Map(print))

pr = pipeline.run()
pr.wait_until_finish()

empty_lines_filter = MetricsFilter().with_name('empty_lines')
query_result = pr.metrics().query(empty_lines_filter)
print(query_result)

word_lengths_filter = MetricsFilter().with_name('word_lengths')
query_result = pr.metrics().query(word_lengths_filter)
print(query_result)

tot_len_filter = MetricsFilter().with_name('total_words')
query_result = pr.metrics().query(tot_len_filter)
print(query_result)