Example #1
import logging
import re

import apache_beam as beam


class FilterTextFn(beam.DoFn):
  """A DoFn that filters for a specific key based on a regular expression."""

  # A custom aggregator can track values in your pipeline as it runs. Those
  # values will be displayed in the Dataflow Monitoring UI when this pipeline
  # is run using the Dataflow service. The aggregators below track the number
  # of matched and unmatched words. Learn more about the Dataflow Monitoring
  # UI at https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf.
  matched_words = beam.Aggregator('matched_words')
  unmatched_words = beam.Aggregator('unmatched_words')

  def __init__(self, pattern):
    super(FilterTextFn, self).__init__()
    self.pattern = pattern

  def process(self, context):
    word, _ = context.element
    if re.match(self.pattern, word):
      # Log at INFO level each element we match. When executing this pipeline
      # using the Dataflow service, these log lines will appear in the Cloud
      # Logging UI.
      logging.info('Matched %s', word)
      context.aggregate_to(self.matched_words, 1)
      yield context.element
    else:
      # Log at the "DEBUG" level each element that is not matched. Different
      # log levels can be used to control the verbosity of logging, providing
      # an effective mechanism to filter out less important information.
      # Note that currently only "INFO" and higher level logs are emitted to
      # the Cloud Logger, so this log message will not be visible there.
      logging.debug('Did not match %s', word)
      context.aggregate_to(self.unmatched_words, 1)
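
A minimal usage sketch, assuming the Aggregator-era Beam SDK (pre-2.0, where beam.Aggregator and context.aggregate_to still exist); the input file, output path, pattern, and runner name below are illustrative assumptions, not part of the original example:

from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

# Hypothetical wiring: file names, pattern, and runner are assumptions.
p = beam.Pipeline('DirectRunner')
(p
 | ReadFromText('words.txt')           # one word per line (assumed layout)
 | beam.Map(lambda w: (w, 1))          # FilterTextFn expects (word, _) pairs
 | beam.ParDo(FilterTextFn(r'^s.*'))   # keep only words starting with "s"
 | WriteToText('filtered_words'))
p.run()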
Example #2
File: snippets.py Project: wikier/beam
  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    # A custom aggregator can track values in your pipeline as it runs. Create
    # custom aggregators matched_words and unmatched_words.
    matched_words = beam.Aggregator('matched_words')
    unmatched_words = beam.Aggregator('unmatched_words')

    def __init__(self, pattern):
      self.pattern = pattern

    def process(self, context):
      word, _ = context.element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this pipeline
        # using the Dataflow service, these log lines will appear in the Cloud
        # Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom aggregator matched_words
        context.aggregate_to(self.matched_words, 1)
        yield context.element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging, providing
        # an effective mechanism to filter out less important information. Note
        # that currently only "INFO" and higher level logs are emitted to the
        # Cloud Logger, so this log message will not be visible there.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom aggregator unmatched_words
        context.aggregate_to(self.unmatched_words, 1)
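
Once the pipeline has run, the matched/unmatched totals can be read back from the pipeline result. A hedged sketch, assuming the Aggregator-era SDK's aggregated_values() query on the pipeline result (the exact return shape varied by runner, and the pipeline wiring around it is hypothetical):

# Sketch: p is a pipeline that applied FilterTextFn via beam.ParDo.
result = p.run()
print(result.aggregated_values(FilterTextFn.matched_words))
print(result.aggregated_values(FilterTextFn.unmatched_words))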
Example #3
import apache_beam as beam
try:
    from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
    from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf

from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io
from google.cloud.ml.io import SaveFeatures

slim = tf.contrib.slim

error_count = beam.Aggregator('errorCount')
missing_label_count = beam.Aggregator('missingLabelCount')
csv_rows_count = beam.Aggregator('csvRowsCount')
labels_count = beam.Aggregator('labelsCount')
labels_without_ids = beam.Aggregator('labelsWithoutIds')
existing_file = beam.Aggregator('existingFile')
non_existing_file = beam.Aggregator('nonExistingFile')
skipped_empty_line = beam.Aggregator('skippedEmptyLine')
embedding_good = beam.Aggregator('embedding_good')
embedding_bad = beam.Aggregator('embedding_bad')
incompatible_image = beam.Aggregator('incompatible_image')
invalid_uri = beam.Aggregator('invalid_file_name')
unlabeled_image = beam.Aggregator('unlabeled_image')
unknown_label = beam.Aggregator('unknown_label')
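
These module-level aggregators are meant to be incremented from inside the pipeline's DoFns. A hedged sketch of that pattern with a hypothetical DoFn (the CSV layout and field positions are assumptions, not taken from this project):

class CountRowsDoFn(beam.DoFn):
  """Hypothetical DoFn illustrating how the aggregators above are used."""

  def process(self, context):
    line = context.element
    if not line.strip():
      # Count and skip blank input lines.
      context.aggregate_to(skipped_empty_line, 1)
      return
    context.aggregate_to(csv_rows_count, 1)
    row = line.split(',')
    if len(row) < 2 or not row[1]:
      # No label column: count the row and drop it.
      context.aggregate_to(missing_label_count, 1)
      return
    yield row[0], row[1]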

Example #4
"""A word-counting workflow."""

from __future__ import absolute_import

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions

empty_line_aggregator = beam.Aggregator('emptyLines')
average_word_size_aggregator = beam.Aggregator('averageWordLength',
                                               beam.combiners.MeanCombineFn(),
                                               float)
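
# Note: empty_line_aggregator above accumulates by summing each reported
# value, whereas this aggregator is built on beam.combiners.MeanCombineFn,
# so every word length passed to context.aggregate_to contributes to a
# running mean (reported as a float) rather than a running total.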


class WordExtractingDoFn(beam.DoFn):
    """Parse each line of input text into words."""
    def process(self, context):
        """Returns an iterator over the words of this element.

        The element is a line of text. If the line is blank, note that, too.

        Args:
          context: the call-specific context: data and aggregator.