class FilterTextFn(beam.DoFn):
  """A DoFn that filters for a specific key based on a regular expression."""

  # A custom aggregator can track values in your pipeline as it runs. Those
  # values will be displayed in the Dataflow Monitoring UI when this pipeline
  # is run using the Dataflow service. The aggregators below track the number
  # of matched and unmatched words. Learn more about the Dataflow Monitoring
  # UI at https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf
  matched_words = beam.Aggregator('matched_words')
  unmatched_words = beam.Aggregator('unmatched_words')

  def __init__(self, pattern):
    super(FilterTextFn, self).__init__()
    self.pattern = pattern

  def process(self, context):
    word, _ = context.element
    if re.match(self.pattern, word):
      # Log at INFO level each element we match. When executing this pipeline
      # using the Dataflow service, these log lines will appear in the Cloud
      # Logging UI.
      logging.info('Matched %s', word)
      # Add 1 to the custom aggregator matched_words.
      context.aggregate_to(self.matched_words, 1)
      yield context.element
    else:
      # Log at DEBUG level each element that is not matched. Different log
      # levels can be used to control the verbosity of logging, providing an
      # effective mechanism to filter less important information. Note that
      # currently only INFO and higher-level logs are emitted to Cloud
      # Logging, so this log message will not be visible there.
      logging.debug('Did not match %s', word)
      # Add 1 to the custom aggregator unmatched_words.
      context.aggregate_to(self.unmatched_words, 1)
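For context, here is a minimal sketch of how FilterTextFn might be wired into a pipeline with beam.ParDo. The upstream counts PCollection of (word, count) pairs and the filter pattern are assumptions for illustration, not part of the snippet above:

    # Hypothetical usage: 'counts' is assumed to be a PCollection of
    # (word, count) tuples produced earlier in the pipeline.
    filtered_words = (
        counts
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

When this runs on the Dataflow service, the matched_words and unmatched_words aggregators appear in the Monitoring UI alongside the step that executes the ParDo.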
import apache_beam as beam
try:
  from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
  from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf
from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io

from google.cloud.ml.io import SaveFeatures

slim = tf.contrib.slim

error_count = beam.Aggregator('errorCount')
missing_label_count = beam.Aggregator('missingLabelCount')
csv_rows_count = beam.Aggregator('csvRowsCount')
labels_count = beam.Aggregator('labelsCount')
labels_without_ids = beam.Aggregator('labelsWithoutIds')
existing_file = beam.Aggregator('existingFile')
non_existing_file = beam.Aggregator('nonExistingFile')
skipped_empty_line = beam.Aggregator('skippedEmptyLine')
embedding_good = beam.Aggregator('embedding_good')
embedding_bad = beam.Aggregator('embedding_bad')
incompatible_image = beam.Aggregator('incompatible_image')
invalid_uri = beam.Aggregator('invalid_file_name')
unlabeled_image = beam.Aggregator('unlabeled_image')
unknown_label = beam.Aggregator('unknown_label')
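These aggregators are declared at module level so that any DoFn in the preprocessing pipeline can report to them. Below is a minimal, hypothetical sketch of how a couple of them might be incremented from inside a DoFn; the CSV-row input format and the DoFn itself are assumptions for illustration:

    class ReadCsvRowDoFn(beam.DoFn):  # hypothetical illustration
      def process(self, context):
        line = context.element
        if not line.strip():
          # Count blank input lines so they surface in the Monitoring UI.
          context.aggregate_to(skipped_empty_line, 1)
          return
        # Count every non-empty CSV row that reaches this stage.
        context.aggregate_to(csv_rows_count, 1)
        yield line.split(',')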
# """A word-counting workflow.""" from __future__ import absolute_import import argparse import logging import re import apache_beam as beam from apache_beam.io import ReadFromText from apache_beam.io import WriteToText from apache_beam.utils.pipeline_options import PipelineOptions from apache_beam.utils.pipeline_options import SetupOptions empty_line_aggregator = beam.Aggregator('emptyLines') average_word_size_aggregator = beam.Aggregator('averageWordLength', beam.combiners.MeanCombineFn(), float) class WordExtractingDoFn(beam.DoFn): """Parse each line of input text into words.""" def process(self, context): """Returns an iterator over the words of this element. The element is a line of text. If the line is blank, note that, too. Args: context: the call-specific context: data and aggregator.