Example #1
  def test_setting_timestamp(self):
    p = TestPipeline()
    unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
    items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

    def extract_timestamp_from_log_entry(entry):
      return entry[1]

    # [START setting_timestamp]
    class AddTimestampDoFn(beam.DoFn):

      def process(self, element):
        # Extract the numeric Unix seconds-since-epoch timestamp to be
        # associated with the current log entry.
        unix_timestamp = extract_timestamp_from_log_entry(element)
        # Wrap and emit the current entry and new timestamp in a
        # TimestampedValue.
        yield beam.TimestampedValue(element, unix_timestamp)

    timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
    # [END setting_timestamp]
    fixed_windowed_items = (
        timestamped_items | 'window' >> beam.WindowInto(
            beam.window.FixedWindows(60)))
    summed = (fixed_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    beam.assert_that(unkeyed, beam.equal_to([42, 187]))
    p.run()
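These snippets target a pre-2.x Apache Beam in which assert_that and equal_to were exposed directly on the beam module. A minimal sketch of the same windowed-sum check written against the current testing API, assuming Beam 2.x where the matchers live in apache_beam.testing.util:

import apache_beam as beam
from apache_beam import window
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
  unkeyed = (
      p
      | beam.Create([12, 30, 60, 61, 66])
      | 'key' >> beam.Map(lambda x: ('k', x))
      # Use each value as its own event-time timestamp.
      | 'timestamp' >> beam.Map(lambda kv: window.TimestampedValue(kv, kv[1]))
      | 'window' >> beam.WindowInto(window.FixedWindows(60))
      | 'group' >> beam.GroupByKey()
      | 'combine' >> beam.CombineValues(sum)
      | 'unkey' >> beam.Map(lambda kv: kv[1]))
  # 12 and 30 fall in window [0, 60) -> 42; 60, 61 and 66 fall in [60, 120) -> 187.
  assert_that(unkeyed, equal_to([42, 187]))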
Example #2
    def test_create_groups(self):
        p = TestPipeline()

        group_ids_pcoll = p | 'CreateGroupIds' >> beam.Create(['A', 'B', 'C'])
        corpus_pcoll = p | 'CreateCorpus' >> beam.Create([{
            'f': 'corpus1'
        }, {
            'f': 'corpus2'
        }, {
            'f': 'corpus3'
        }])
        words_pcoll = p | 'CreateWords' >> beam.Create([{
            'f': 'word1'
        }, {
            'f': 'word2'
        }, {
            'f': 'word3'
        }])
        ignore_corpus_pcoll = p | 'CreateIgnoreCorpus' >> beam.Create(
            ['corpus1'])
        ignore_word_pcoll = p | 'CreateIgnoreWord' >> beam.Create(['word1'])

        groups = bigquery_side_input.create_groups(group_ids_pcoll,
                                                   corpus_pcoll, words_pcoll,
                                                   ignore_corpus_pcoll,
                                                   ignore_word_pcoll)

        beam.assert_that(
            groups,
            beam.equal_to([('A', 'corpus2', 'word2'),
                           ('B', 'corpus2', 'word2'),
                           ('C', 'corpus2', 'word2')]))
        p.run()
Example #3
 def test_basics(self):
     p = TestPipeline()
     rows = (p | 'create' >> beam.Create([{
         'month': 1,
         'day': 1,
         'tornado': False
     }, {
         'month': 1,
         'day': 2,
         'tornado': True
     }, {
         'month': 1,
         'day': 3,
         'tornado': True
     }, {
         'month': 2,
         'day': 1,
         'tornado': True
     }]))
     results = bigquery_tornadoes.count_tornadoes(rows)
     beam.assert_that(
         results,
         beam.equal_to([{
             'month': 1,
             'tornado_count': 2
         }, {
             'month': 2,
             'tornado_count': 1
         }]))
     p.run().wait_until_finish()
Example #4
 def test_user_score(self):
   with TestPipeline() as p:
     result = (
         p | beam.Create(UserScoreTest.SAMPLE_DATA) | user_score.UserScore())
     beam.assert_that(result, beam.equal_to([
         ('user1_team1', 50), ('user2_team2', 2), ('user3_team3', 8),
         ('user4_team3', 5)]))
Example #5
    def test_compute_top_sessions(self):
        p = TestPipeline()
        edits = p | beam.Create(self.EDITS)
        result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

        beam.assert_that(result, beam.equal_to(self.EXPECTED))
        p.run()
Example #6
 def test_user_score(self):
   with TestPipeline() as p:
     result = (
         p | beam.Create(UserScoreTest.SAMPLE_DATA) | user_score.UserScore())
     beam.assert_that(result, beam.equal_to([
         ('user1_team1', 50), ('user2_team2', 2), ('user3_team3', 8),
         ('user4_team3', 5)]))
Example #7
    def test_setting_timestamp(self):
        p = TestPipeline()
        unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
        items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

        def extract_timestamp_from_log_entry(entry):
            return entry[1]

        # [START setting_timestamp]
        class AddTimestampDoFn(beam.DoFn):
            def process(self, element):
                # Extract the numeric Unix seconds-since-epoch timestamp to be
                # associated with the current log entry.
                unix_timestamp = extract_timestamp_from_log_entry(element)
                # Wrap and emit the current entry and new timestamp in a
                # TimestampedValue.
                yield beam.TimestampedValue(element, unix_timestamp)

        timestamped_items = items | 'timestamp' >> beam.ParDo(
            AddTimestampDoFn())
        # [END setting_timestamp]
        fixed_windowed_items = (
            timestamped_items
            | 'window' >> beam.WindowInto(beam.window.FixedWindows(60)))
        summed = (fixed_windowed_items
                  | 'group' >> beam.GroupByKey()
                  | 'combine' >> beam.CombineValues(sum))
        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
        beam.assert_that(unkeyed, beam.equal_to([42, 187]))
        p.run()
Example #8
  def test_compute_top_sessions(self):
    p = TestPipeline()
    edits = p | beam.Create(self.EDITS)
    result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

    beam.assert_that(result, beam.equal_to(self.EXPECTED))
    p.run()
Example #9
 def test_basic(self):
   """Test that the correct result is returned for a simple dataset."""
   results = self._get_result_for_month(1)
   beam.assert_that(
       results,
       beam.equal_to([{'year': 2010, 'month': 1, 'day': 1, 'mean_temp': 3},
                      {'year': 2012, 'month': 1, 'day': 2, 'mean_temp': 3}]))
   results.pipeline.run()
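_get_result_for_month is a fixture of the surrounding test class (the weather-filters cookbook test) and is not shown in these excerpts. A purely hypothetical sketch of what it might look like, assuming the module under test exposes a filter_cold_days(rows, month) helper and the class keeps its sample rows in self.input_data (both names assumed, not taken from the original):

  def _get_result_for_month(self, month):
    # Hypothetical reconstruction: run the month filter under test over the
    # class's sample rows and hand back the resulting PCollection.
    p = TestPipeline()
    rows = p | 'create' >> beam.Create(self.input_data)
    return filters.filter_cold_days(rows, month)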
Example #10
 def test_process_gzip_auto(self):
   path = os.path.join(self._new_tempdir(), 'result.gz')
   self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
   with TestPipeline() as p:
     result = (p
               | ReadFromTFRecord(
                   path, compression_type=fileio.CompressionTypes.AUTO))
     beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #11
 def test_process_gzip(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | ReadFromTFRecord(
                       path, compression_type=CompressionTypes.GZIP))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #12
 def test_process_gzip_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as p:
         result = (p
                   | ReadFromTFRecord(
                       path, compression_type=fileio.CompressionTypes.AUTO))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
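_new_tempdir, _write_file_gzip and FOO_BAR_RECORD_BASE64 are fixtures from Beam's TFRecord test module and are not reproduced here. A rough, hypothetical sketch of what such helpers could look like (the base64 payload itself is elided; it encodes a TFRecord file holding the records b'foo' and b'bar'):

import base64
import gzip
import tempfile

FOO_BAR_RECORD_BASE64 = '...'  # base64 of a TFRecord file with b'foo', b'bar'

def _new_tempdir(self):
  # Hypothetical: hand out a scratch directory for the test to write into.
  return tempfile.mkdtemp()

def _write_file_gzip(self, path, base64_records):
  # Hypothetical: decode the canned TFRecord bytes and write them gzipped.
  with gzip.open(path, 'wb') as f:
    f.write(base64.b64decode(base64_records))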
Example #13
 def test_basic(self):
   """Test that the correct result is returned for a simple dataset."""
   results = self._get_result_for_month(1)
   beam.assert_that(
       results,
       beam.equal_to([{'year': 2010, 'month': 1, 'day': 1, 'mean_temp': 3},
                      {'year': 2012, 'month': 1, 'day': 2, 'mean_temp': 3}]))
   results.pipeline.run()
Example #14
  def test_hourly_team_score(self):
   with TestPipeline() as p:
     result = (p
               | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
               | hourly_team_score.HourlyTeamScore(
                   start_min='2015-11-16-15-20',
                   stop_min='2015-11-16-17-20',
                   window_duration=60))
     beam.assert_that(result, beam.equal_to([
         ('team1', 18), ('team2', 2), ('team3', 13)]))
Example #15
 def test_process_gzip(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.GZIP)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #16
 def test_hourly_team_score(self):
     with TestPipeline() as p:
         result = (p
                   | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
                   | hourly_team_score.HourlyTeamScore(
                       start_min='2015-11-16-15-20',
                       stop_min='2015-11-16-17-20',
                       window_duration=60))
         beam.assert_that(
             result,
             beam.equal_to([('team1', 18), ('team2', 2), ('team3', 13)]))
Example #17
 def test_process_gzip(self):
   path = os.path.join(self._new_tempdir(), 'result')
   self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
   with TestPipeline() as p:
     result = (p
               | beam.Read(
                   _TFRecordSource(
                       path,
                       coder=coders.BytesCoder(),
                       compression_type=fileio.CompressionTypes.GZIP)))
     beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #18
 def test_process_single(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file(path, FOO_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo']))
Example #19
 def test_process_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.io.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.AUTO,
                           validate=True)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #20
 def test_process_single(self):
   path = os.path.join(self._new_tempdir(), 'result')
   self._write_file(path, FOO_RECORD_BASE64)
   with TestPipeline() as p:
     result = (p
               | beam.Read(
                   _TFRecordSource(
                       path,
                       coder=coders.BytesCoder(),
                       compression_type=CompressionTypes.AUTO,
                       validate=True)))
     beam.assert_that(result, beam.equal_to(['foo']))
Example #21
  def test_end2end(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    # Generate a TFRecord file.
    with TestPipeline() as p:
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
      beam.assert_that(actual_data, beam.equal_to(expected_data))
Example #22
  def test_end2end(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    # Generate a TFRecord file.
    with TestPipeline() as p:
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
      beam.assert_that(actual_data, beam.equal_to(expected_data))
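create_inputs above is another fixture from the same TFRecord test class. Any generator of records the coder can serialize would satisfy this round-trip test; a hypothetical stand-in:

import random
import string

def create_inputs(self):
  # Hypothetical: one short random string per element.
  return ''.join(random.choice(string.ascii_letters)
                 for _ in range(random.randint(5, 20)))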
Example #23
def model_textio_compressed(renames, expected):
    """Using a Read Transform to read compressed text files."""
    p = TestPipeline()

    # [START model_textio_write_compressed]
    lines = p | 'ReadFromText' >> beam.io.ReadFromText(
        '/path/to/input-*.csv.gz',
        compression_type=beam.io.filesystem.CompressionTypes.GZIP)
    # [END model_textio_write_compressed]

    beam.assert_that(lines, beam.equal_to(expected))
    p.visit(SnippetUtils.RenameFiles(renames))
    p.run().wait_until_finish()
Example #24
    def test_end2end_auto_compression_unsharded(self):
        file_path_prefix = os.path.join(self._new_tempdir(), 'result')

        # Generate a TFRecord file.
        with beam.Pipeline(DirectRunner()) as p:
            expected_data = [self.create_inputs() for _ in range(0, 10)]
            _ = p | beam.Create(expected_data) | WriteToTFRecord(
                file_path_prefix + '.gz', shard_name_template='')

        # Read the file back and compare.
        with beam.Pipeline(DirectRunner()) as p:
            actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
            beam.assert_that(actual_data, beam.equal_to(expected_data))
Example #25
def model_textio_compressed(renames, expected):
  """Using a Read Transform to read compressed text files."""
  p = TestPipeline()

  # [START model_textio_write_compressed]
  lines = p | 'ReadFromText' >> beam.io.ReadFromText(
      '/path/to/input-*.csv.gz',
      compression_type=beam.io.filesystem.CompressionTypes.GZIP)
  # [END model_textio_write_compressed]

  beam.assert_that(lines, beam.equal_to(expected))
  p.visit(SnippetUtils.RenameFiles(renames))
  p.run().wait_until_finish()
Example #26
    def test_pardo_side_input(self):
        p = TestPipeline()
        words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = (words
                        | beam.Map(len)
                        | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

        # Call with explicit side inputs.
        small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0,
                                                      3)

        # A single deferred side input.
        larger_than_average = (words | 'large' >> beam.FlatMap(
            filter_using_length, lower_bound=pvalue.AsSingleton(avg_word_len)))

        # Mix and match.
        small_but_nontrivial = words | beam.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        beam.assert_that(small_words, beam.equal_to(['a', 'bb', 'ccc']))
        beam.assert_that(larger_than_average,
                         beam.equal_to(['ccc', 'dddd']),
                         label='larger_than_average')
        beam.assert_that(small_but_nontrivial,
                         beam.equal_to(['bb']),
                         label='small_but_not_trivial')
        p.run()
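A quick plain-Python cross-check of the three assertions above; the deferred side input is the mean word length of ['a', 'bb', 'ccc', 'dddd'], i.e. 2.5:

words = ['a', 'bb', 'ccc', 'dddd']
avg = sum(len(w) for w in words) / float(len(words))  # 2.5
assert [w for w in words if 0 <= len(w) <= 3] == ['a', 'bb', 'ccc']
assert [w for w in words if avg <= len(w)] == ['ccc', 'dddd']
assert [w for w in words if 2 <= len(w) <= avg] == ['bb']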
Example #27
    def test_end2end_read_write_read(self):
        path = os.path.join(self._new_tempdir(), 'result')
        with TestPipeline() as p:
            # Initial read to validate the pipeline doesn't fail before the file is
            # created.
            _ = p | ReadFromTFRecord(path + '-*', validate=False)
            expected_data = [self.create_inputs() for _ in range(0, 10)]
            _ = p | beam.Create(expected_data) | WriteToTFRecord(
                path, file_name_suffix='.gz')

        # Read the file back and compare.
        with TestPipeline() as p:
            actual_data = p | ReadFromTFRecord(path + '-*', validate=True)
            beam.assert_that(actual_data, beam.equal_to(expected_data))
Example #28
  def test_end2end_read_write_read(self):
    path = os.path.join(self._new_tempdir(), 'result')
    with TestPipeline() as p:
      # Initial read to validate the pipeline doesn't fail before the file is
      # created.
      _ = p | ReadFromTFRecord(path + '-*', validate=False)
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(
          path, file_name_suffix='.gz')

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(path+'-*', validate=True)
      beam.assert_that(actual_data, beam.equal_to(expected_data))
Example #29
    def test_composite(self):

        # [START model_composite_transform]
        class ComputeWordLengths(beam.PTransform):
            def expand(self, pcoll):
                # transform logic goes here
                return pcoll | beam.Map(lambda x: len(x))

        # [END model_composite_transform]

        p = TestPipeline()
        lengths = p | beam.Create(["a", "ab", "abc"]) | ComputeWordLengths()
        beam.assert_that(lengths, beam.equal_to([1, 2, 3]))
        p.run()
Example #30
    def test_combine_per_key_with_custom_callable(self):
        """CombinePerKey using a custom function reducing iterables."""
        def multiply(values):
            result = 1
            for v in values:
                result *= v
            return result

        result = (TestPipeline()
                  | beam.Create(CombinersTest.SAMPLE_DATA)
                  | beam.CombinePerKey(multiply))

        beam.assert_that(result,
                         beam.equal_to([('a', 6), ('b', 200), ('c', 100)]))
        result.pipeline.run()
Example #31
  def test_pardo_side_input(self):
    p = TestPipeline()
    words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

    # [START model_pardo_side_input]
    # Callable takes additional arguments.
    def filter_using_length(word, lower_bound, upper_bound=float('inf')):
      if lower_bound <= len(word) <= upper_bound:
        yield word

    # Construct a deferred side input.
    avg_word_len = (words
                    | beam.Map(len)
                    | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

    # Call with explicit side inputs.
    small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)

    # A single deferred side input.
    larger_than_average = (words | 'large' >> beam.FlatMap(
        filter_using_length,
        lower_bound=pvalue.AsSingleton(avg_word_len)))

    # Mix and match.
    small_but_nontrivial = words | beam.FlatMap(filter_using_length,
                                                lower_bound=2,
                                                upper_bound=pvalue.AsSingleton(
                                                    avg_word_len))
    # [END model_pardo_side_input]

    beam.assert_that(small_words, beam.equal_to(['a', 'bb', 'ccc']))
    beam.assert_that(larger_than_average, beam.equal_to(['ccc', 'dddd']),
                     label='larger_than_average')
    beam.assert_that(small_but_nontrivial, beam.equal_to(['bb']),
                     label='small_but_not_trivial')
    p.run()
Example #32
    def test_combine_per_key_with_callable(self):
        """CombinePerKey using a standard callable reducing iterables.

        A common case for Dataflow combiners is to sum (or max or min) over the
        values of each key. Such standard functions can be used directly as
        combiner functions. In fact, any function "reducing" an iterable to a
        single value can be used.
        """
        result = (TestPipeline()
                  | beam.Create(CombinersTest.SAMPLE_DATA)
                  | beam.CombinePerKey(sum))

        beam.assert_that(result,
                         beam.equal_to([('a', 6), ('b', 30), ('c', 100)]))
        result.pipeline.run()
Example #33
 def test_tfidf_transform(self):
   p = TestPipeline()
   uri_to_line = p | 'create sample' >> beam.Create(
       [('1.txt', 'abc def ghi'),
        ('2.txt', 'abc def'),
        ('3.txt', 'abc')])
   result = (
       uri_to_line
       | tfidf.TfIdf()
       | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
   beam.assert_that(result, beam.equal_to(EXPECTED_RESULTS))
   # Run the pipeline. Note that the assert_that above adds to the pipeline
   # a check that the result PCollection contains expected values. To actually
   # trigger the check the pipeline must be run.
   p.run()
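The final Map above relies on Python 2 tuple-parameter unpacking (lambda (word, (uri, tfidf)): ...), which Python 3 removed (PEP 3113). A sketch of the same reshaping written so it runs on either version:

def flatten_entry(entry):
  # entry is (word, (uri, tfidf_score)); emit a flat (word, uri, score) tuple.
  word, (uri, score) = entry
  return word, uri, score

# result = uri_to_line | tfidf.TfIdf() | beam.Map(flatten_entry)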
Example #34
def run(argv=None):
    """Runs the debugging wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection, count the occurrences of
    # each word and filter by a list of words.
    filtered_words = (
        p | 'read' >> ReadFromText(known_args.input)
        | CountWords()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # assert_that is a convenient PTransform that checks a PCollection has an
    # expected value. Asserts are best used in unit tests with small data sets but
    # is demonstrated here as a teaching tool.
    #
    # Note assert_that does not provide any output and that successful completion
    # of the Pipeline implies that the expectations were met. Learn more at
    # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to
    # test your pipeline.
    beam.assert_that(filtered_words,
                     beam.equal_to([('Flourish', 3), ('stomach', 1)]))

    # Format the counts into a PCollection of strings and write the output using a
    # "Write" transform that has side effects.
    # pylint: disable=unused-variable
    output = (filtered_words
              | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
              | 'write' >> WriteToText(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    p.run().wait_until_finish()
Example #35
 def test_setting_global_window(self):
   p = TestPipeline()
   unkeyed_items = p | beam.Create([2, 11, 16, 27])
   items = (unkeyed_items
            | 'key' >> beam.Map(
                lambda x: beam.window.TimestampedValue(('k', x), x)))
   # [START setting_global_window]
   from apache_beam import window
   session_windowed_items = (
       items | 'window' >> beam.WindowInto(window.GlobalWindows()))
   # [END setting_global_window]
   summed = (session_windowed_items
             | 'group' >> beam.GroupByKey()
             | 'combine' >> beam.CombineValues(sum))
   unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
   beam.assert_that(unkeyed, beam.equal_to([56]))
   p.run()
Example #36
def run(argv=None):
  """Runs the debugging wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection, count the occurrences of
  # each word and filter by a list of words.
  filtered_words = (
      p | 'read' >> ReadFromText(known_args.input)
      | CountWords()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # assert_that is a convenient PTransform that checks a PCollection has an
  # expected value. Asserts are best used in unit tests with small data sets but
  # is demonstrated here as a teaching tool.
  #
  # Note assert_that does not provide any output and that successful completion
  # of the Pipeline implies that the expectations were met. Learn more at
  # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to
  # test your pipeline.
  beam.assert_that(
      filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)]))

  # Format the counts into a PCollection of strings and write the output using a
  # "Write" transform that has side effects.
  # pylint: disable=unused-variable
  output = (filtered_words
            | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
            | 'write' >> WriteToText(known_args.output))

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
Example #37
 def test_setting_fixed_windows(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_fixed_windows]
     from apache_beam import window
     fixed_windowed_items = (
         items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
     # [END setting_fixed_windows]
     summed = (fixed_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     beam.assert_that(unkeyed, beam.equal_to([110, 215, 120]))
     p.run()
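A plain-Python cross-check of the expected sums: FixedWindows(60) buckets each timestamp by t // 60, and equal_to compares without regard to order:

from collections import defaultdict

sums = defaultdict(int)
for t in [22, 33, 55, 100, 115, 120]:
  sums[t // 60] += t          # window 0 is [0, 60), window 1 is [60, 120), ...
print(sorted(sums.values()))  # [110, 120, 215]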
Example #38
 def test_setting_global_window(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([2, 11, 16, 27])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_global_window]
     from apache_beam import window
     session_windowed_items = (
         items | 'window' >> beam.WindowInto(window.GlobalWindows()))
     # [END setting_global_window]
     summed = (session_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     beam.assert_that(unkeyed, beam.equal_to([56]))
     p.run()
Example #39
 def test_setting_fixed_windows(self):
   p = TestPipeline()
   unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
   items = (unkeyed_items
            | 'key' >> beam.Map(
                lambda x: beam.window.TimestampedValue(('k', x), x)))
   # [START setting_fixed_windows]
   from apache_beam import window
   fixed_windowed_items = (
       items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
   # [END setting_fixed_windows]
   summed = (fixed_windowed_items
             | 'group' >> beam.GroupByKey()
             | 'combine' >> beam.CombineValues(sum))
   unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
   beam.assert_that(unkeyed, beam.equal_to([110, 215, 120]))
   p.run()
Example #40
  def test_end2end_example_proto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(range(3))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with TestPipeline() as p:
      _ = p | beam.Create([example]) | WriteToTFRecord(
          file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = (p | ReadFromTFRecord(
          file_path_prefix + '-*',
          coder=beam.coders.ProtoCoder(example.__class__)))
      beam.assert_that(actual_data, beam.equal_to([example]))
Example #41
  def test_end2end_example_proto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(range(3))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with TestPipeline() as p:
      _ = p | beam.Create([example]) | WriteToTFRecord(
          file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = (p | ReadFromTFRecord(
          file_path_prefix + '-*',
          coder=beam.coders.ProtoCoder(example.__class__)))
      beam.assert_that(actual_data, beam.equal_to([example]))
Example #42
 def test_setting_sliding_windows(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([2, 16, 23])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_sliding_windows]
     from apache_beam import window
     sliding_windowed_items = (
         items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
     # [END setting_sliding_windows]
     summed = (sliding_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     beam.assert_that(unkeyed,
                      beam.equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
     p.run()
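The ten expected sums follow from SlidingWindows(30, 5): an element with timestamp t lands in every window [s, s + 30) whose start s is a multiple of 5 with s <= t < s + 30, i.e. six windows per element. A plain-Python cross-check:

from collections import defaultdict

sums = defaultdict(int)
for t in [2, 16, 23]:
  last_start = (t // 5) * 5             # latest window start at or before t
  for s in range(last_start - 25, last_start + 1, 5):
    sums[s] += t
print(sorted(sums.values()))  # [2, 2, 2, 18, 23, 39, 39, 39, 41, 41]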
Example #43
 def test_setting_sliding_windows(self):
   p = TestPipeline()
   unkeyed_items = p | beam.Create([2, 16, 23])
   items = (unkeyed_items
            | 'key' >> beam.Map(
                lambda x: beam.window.TimestampedValue(('k', x), x)))
   # [START setting_sliding_windows]
   from apache_beam import window
   sliding_windowed_items = (
       items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
   # [END setting_sliding_windows]
   summed = (sliding_windowed_items
             | 'group' >> beam.GroupByKey()
             | 'combine' >> beam.CombineValues(sum))
   unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
   beam.assert_that(unkeyed,
                    beam.equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
   p.run()
Example #44
def run():
    pipeline = beam.Pipeline()

    filtered_words = (
        pipeline | 'read' >> ReadFromText('data/king_arthur.txt')
        | CountWords()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Camelot|Excalibur'))
    )

    beam.assert_that(
        filtered_words, beam.equal_to([('Camelot', 33), ('Excalibur', 17)])
    )

    output = filtered_words | 'format' >> beam.Map(
        lambda (word, count): '{}: {}'.format(word, count)
    )
    output | 'write' >> WriteToText('debugging-wordcount', '.txt')

    pipeline.run().wait_until_finish()
Example #45
 def test_create_transform(self):
   with TestPipeline() as p:
     assert_that(p | Create(range(10)), equal_to(range(10)))
Example #46
def examples_wordcount_debugging(renames):
    """DebuggingWordCount example snippets."""
    import re

    import apache_beam as beam

    # [START example_wordcount_debugging_logging]
    # [START example_wordcount_debugging_aggregators]
    import logging

    class FilterTextFn(beam.DoFn):
        """A DoFn that filters for a specific key based on a regular expression."""
        def __init__(self, pattern):
            self.pattern = pattern
            # A custom metric can track values in your pipeline as it runs. Create
            # custom metrics matched_words and unmatched_words.
            self.matched_words = Metrics.counter(self.__class__,
                                                 'matched_words')
            self.unmatched_words = Metrics.counter(self.__class__,
                                                   'unmatched_words')

        def process(self, element):
            word, _ = element
            if re.match(self.pattern, word):
                # Log at INFO level each element we match. When executing this pipeline
                # using the Dataflow service, these log lines will appear in the Cloud
                # Logging UI.
                logging.info('Matched %s', word)

                # Add 1 to the custom metric counter matched_words
                self.matched_words.inc()
                yield element
            else:
                # Log at the "DEBUG" level each element that is not matched. Different
                # log levels can be used to control the verbosity of logging providing
                # an effective mechanism to filter less important information. Note
                # currently only "INFO" and higher level logs are emitted to the Cloud
                # Logger. This log message will not be visible in the Cloud Logger.
                logging.debug('Did not match %s', word)

                # Add 1 to the custom metric counter unmatched_words
                self.unmatched_words.inc()

    # [END example_wordcount_debugging_logging]
    # [END example_wordcount_debugging_aggregators]

    p = TestPipeline()  # Use TestPipeline for testing.
    filtered_words = (
        p
        | beam.io.ReadFromText(
            'gs://dataflow-samples/shakespeare/kinglear.txt')
        | 'ExtractWords' >> beam.FlatMap(
            lambda x: re.findall(r'[A-Za-z\']+', x))
        | beam.combiners.Count.PerElement()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # [START example_wordcount_debugging_assert]
    beam.assert_that(filtered_words,
                     beam.equal_to([('Flourish', 3), ('stomach', 1)]))
    # [END example_wordcount_debugging_assert]

    output = (filtered_words
              | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
              | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
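The matched_words counter defined above can be read back from the pipeline result once it has run. A sketch of what that could look like in place of the plain p.run() call, assuming a runner that reports committed metrics (e.g. the DirectRunner):

    from apache_beam.metrics.metric import MetricsFilter

    result = p.run()
    result.wait_until_finish()
    matched = result.metrics().query(
        MetricsFilter().with_name('matched_words'))['counters']
    if matched:
        logging.info('matched_words = %d', matched[0].committed)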
Example #47
def model_custom_source(count):
    """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting from 0
  up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly using the ``beam.io.Read``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.

  """

    import apache_beam as beam
    from apache_beam.io import iobase
    from apache_beam.io.range_trackers import OffsetRangeTracker
    from apache_beam.transforms.core import PTransform
    from apache_beam.utils.pipeline_options import PipelineOptions

    # Defining a new source.
    # [START model_custom_source_new_source]
    class CountingSource(iobase.BoundedSource):
        def __init__(self, count):
            self._count = count

        def estimate_size(self):
            return self._count

        def get_range_tracker(self, start_position, stop_position):
            if start_position is None:
                start_position = 0
            if stop_position is None:
                stop_position = self._count

            return OffsetRangeTracker(start_position, stop_position)

        def read(self, range_tracker):
            for i in range(self._count):
                if not range_tracker.try_claim(i):
                    return
                yield i

        def split(self,
                  desired_bundle_size,
                  start_position=None,
                  stop_position=None):
            if start_position is None:
                start_position = 0
            if stop_position is None:
                stop_position = self._count

            bundle_start = start_position
            while bundle_start < self._count:
                bundle_stop = min(self._count,
                                  bundle_start + desired_bundle_size)
                yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                          source=self,
                                          start_position=bundle_start,
                                          stop_position=bundle_stop)
                bundle_start = bundle_stop

    # [END model_custom_source_new_source]

    # Using the source in an example pipeline.
    # [START model_custom_source_use_new_source]
    p = beam.Pipeline(options=PipelineOptions())
    numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
    # [END model_custom_source_use_new_source]

    lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
    beam.assert_that(
        lines,
        beam.equal_to(['line ' + str(number) for number in range(0, count)]))

    p.run().wait_until_finish()

    # We recommend prefixing Source class names with an underscore to discourage
    # using the Source class directly when a PTransform for the source is
    # available. We simulate that here by simply extending the previous Source
    # class.
    class _CountingSource(CountingSource):
        pass

    # [START model_custom_source_new_ptransform]
    class ReadFromCountingSource(PTransform):
        def __init__(self, count, **kwargs):
            super(ReadFromCountingSource, self).__init__(**kwargs)
            self._count = count

        def expand(self, pcoll):
            return pcoll | iobase.Read(_CountingSource(self._count))

    # [END model_custom_source_new_ptransform]

    # [START model_custom_source_use_ptransform]
    p = beam.Pipeline(options=PipelineOptions())
    numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
    # [END model_custom_source_use_ptransform]

    lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
    beam.assert_that(
        lines,
        beam.equal_to(['line ' + str(number) for number in range(0, count)]))

    # Don't test runner api due to pickling errors.
    p.run(test_runner_api=False).wait_until_finish()
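CountingSource can also be exercised without building a pipeline, using Beam's source test helpers. A small sketch that could sit inside the function above, after the class definition (assumes apache_beam.io.source_test_utils is available):

    from apache_beam.io import source_test_utils

    # read_from_source drives the source through its own range tracker and
    # returns every record it emits.
    records = source_test_utils.read_from_source(CountingSource(5))
    assert records == [0, 1, 2, 3, 4]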
Example #48
 def test_basic_empty_missing(self):
   """Test that the correct empty result is returned for a missing month."""
   results = self._get_result_for_month(4)
   beam.assert_that(results, beam.equal_to([]))
   results.pipeline.run()
Example #49
 def test_create_transform(self):
   with TestPipeline() as p:
     assert_that(p | Create(range(10)), equal_to(range(10)))
Example #50
def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets."""
  import re

  import apache_beam as beam

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics matched_words and unmatched_words.
      self.matched_words = Metrics.counter(self.__class__, 'matched_words')
      self.unmatched_words = Metrics.counter(
          self.__class__, 'unmatched_words')

    def process(self, element):
      word, _ = element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this pipeline
        # using the Dataflow service, these log lines will appear in the Cloud
        # Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom metric counter matched_words
        self.matched_words.inc()
        yield element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging providing
        # an effective mechanism to filter less important information. Note
        # currently only "INFO" and higher level logs are emitted to the Cloud
        # Logger. This log message will not be visible in the Cloud Logger.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom metric counter unmatched_words
        self.unmatched_words.inc()
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = TestPipeline()  # Use TestPipeline for testing.
  filtered_words = (
      p
      | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      | beam.combiners.Count.PerElement()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  beam.assert_that(
      filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  output = (filtered_words
            | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
            | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
Example #51
def model_custom_source(count):
  """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting from 0
  up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly using the ``beam.io.Read``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.

  """

  # Using the source in an example pipeline.
  # [START model_custom_source_use_new_source]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
  # [END model_custom_source_use_new_source]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()

  # We recommend prefixing Source class names with an underscore to discourage
  # using the Source class directly when a PTransform for the source is
  # available. We simulate that here by simply extending the previous Source
  # class.
  class _CountingSource(CountingSource):
    pass

  # [START model_custom_source_new_ptransform]
  class ReadFromCountingSource(PTransform):

    def __init__(self, count, **kwargs):
      super(ReadFromCountingSource, self).__init__(**kwargs)
      self._count = count

    def expand(self, pcoll):
      return pcoll | iobase.Read(_CountingSource(self._count))
  # [END model_custom_source_new_ptransform]

  # [START model_custom_source_use_ptransform]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
  # [END model_custom_source_use_ptransform]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  # Don't test runner api due to pickling errors.
  p.run(test_runner_api=False).wait_until_finish()
Example #52

class TfIdfTest(unittest.TestCase):
    def create_file(self, path, contents):
        logging.info('Creating temp file: %s', path)
        with open(path, 'w') as f:
            f.write(contents)

    def test_tfidf_transform(self):
        p = TestPipeline()
        uri_to_line = p | 'create sample' >> beam.Create(
            [('1.txt', 'abc def ghi'), ('2.txt', 'abc def'), ('3.txt', 'abc')])
        result = (uri_to_line
                  | tfidf.TfIdf()
                  | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
        beam.assert_that(result, beam.equal_to(EXPECTED_RESULTS))
        # Run the pipeline. Note that the assert_that above adds to the pipeline
        # a check that the result PCollection contains expected values. To actually
        # trigger the check the pipeline must be run.
        p.run()

    def test_basics(self):
        # Setup the files with expected content.
        temp_folder = tempfile.mkdtemp()
        self.create_file(os.path.join(temp_folder, '1.txt'), 'abc def ghi')
        self.create_file(os.path.join(temp_folder, '2.txt'), 'abc def')
        self.create_file(os.path.join(temp_folder, '3.txt'), 'abc')
        tfidf.run([
            '--uris=%s/*' % temp_folder, '--output',
            os.path.join(temp_folder, 'result')
        ])
Example #53
 def test_basic_empty(self):
   """Test that the correct empty result is returned for a simple dataset."""
   results = self._get_result_for_month(3)
   beam.assert_that(results, beam.equal_to([]))
   results.pipeline.run()
Example #54
def run(argv=None, assert_results=None):

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_email',
      required=True,
      help='Email database, with each line formatted as "name<TAB>email".')
  parser.add_argument(
      '--input_phone',
      required=True,
      help='Phonebook, with each line formatted as "name<TAB>phone number".')
  parser.add_argument(
      '--input_snailmail',
      required=True,
      help='Address database, with each line formatted as "name<TAB>address".')
  parser.add_argument('--output_tsv',
                      required=True,
                      help='Tab-delimited output file.')
  parser.add_argument('--output_stats',
                      required=True,
                      help='Output file for statistics about the input.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Helper: read a tab-separated key-value mapping from a text file, escape all
  # quotes/backslashes, and convert it to a PCollection of (key, value) pairs.
  def read_kv_textfile(label, textfile):
    return (p
            | 'Read: %s' % label >> ReadFromText(textfile)
            | 'Backslash: %s' % label >> beam.Map(
                lambda x: re.sub(r'\\', r'\\\\', x))
            | 'EscapeQuotes: %s' % label >> beam.Map(
                lambda x: re.sub(r'"', r'\"', x))
            | 'Split: %s' % label >> beam.Map(
                lambda x: re.split(r'\t+', x, 1)))

  # Read input databases.
  email = read_kv_textfile('email', known_args.input_email)
  phone = read_kv_textfile('phone', known_args.input_phone)
  snailmail = read_kv_textfile('snailmail', known_args.input_snailmail)

  # Group together all entries under the same name.
  grouped = (email, phone, snailmail) | 'group_by_name' >> beam.CoGroupByKey()

  # Prepare tab-delimited output; something like this:
  # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
  tsv_lines = grouped | beam.Map(
      lambda (name, (email, phone, snailmail)): '\t'.join(
          ['"%s"' % name,
           '"%s"' % ','.join(email),
           '"%s"' % ','.join(phone),
           '"%s"' % next(iter(snailmail), '')]))

  # Compute some stats about our database of people.
  luddites = grouped | beam.Filter(  # People without email.
      lambda (name, (email, phone, snailmail)): not next(iter(email), None))
  writers = grouped | beam.Filter(   # People without phones.
      lambda (name, (email, phone, snailmail)): not next(iter(phone), None))
  nomads = grouped | beam.Filter(    # People without addresses.
      lambda (name, (email, phone, snailmail)): not next(iter(snailmail), None))

  num_luddites = luddites | 'Luddites' >> beam.combiners.Count.Globally()
  num_writers = writers | 'Writers' >> beam.combiners.Count.Globally()
  num_nomads = nomads | 'Nomads' >> beam.combiners.Count.Globally()

  # Write tab-delimited output.
  # pylint: disable=expression-not-assigned
  tsv_lines | 'WriteTsv' >> WriteToText(known_args.output_tsv)

  # TODO(silviuc): Move the assert_results logic to the unit test.
  if assert_results is not None:
    expected_luddites, expected_writers, expected_nomads = assert_results
    beam.assert_that(num_luddites, beam.equal_to([expected_luddites]),
                     label='assert:luddites')
    beam.assert_that(num_writers, beam.equal_to([expected_writers]),
                     label='assert:writers')
    beam.assert_that(num_nomads, beam.equal_to([expected_nomads]),
                     label='assert:nomads')
  # Execute pipeline.
  return p.run()
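For reference, CoGroupByKey over the tuple of three keyed PCollections above yields, per name, a tuple of three iterables in input order (emails, phones, snailmail addresses). A tiny standalone illustration with made-up toy data (names and values are placeholders, not from the original):

import apache_beam as beam

with beam.Pipeline() as p:
  emails = p | 'emails' >> beam.Create([('alice', 'alice@example.com')])
  phones = p | 'phones' >> beam.Create([('alice', '555-0100')])
  addresses = p | 'addresses' >> beam.Create([('bob', '12 Main St')])
  grouped = (emails, phones, addresses) | beam.CoGroupByKey()
  # Each element looks roughly like:
  #   ('alice', (['alice@example.com'], ['555-0100'], []))
  #   ('bob', ([], [], ['12 Main St']))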
Example #55
def model_custom_source(count):
  """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting from 0
  up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly using the ``beam.io.Read``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.

  """

  import apache_beam as beam
  from apache_beam.io import iobase
  from apache_beam.io.range_trackers import OffsetRangeTracker
  from apache_beam.transforms.core import PTransform
  from apache_beam.utils.pipeline_options import PipelineOptions

  # Defining a new source.
  # [START model_custom_source_new_source]
  class CountingSource(iobase.BoundedSource):

    def __init__(self, count):
      self._count = count

    def estimate_size(self):
      return self._count

    def get_range_tracker(self, start_position, stop_position):
      if start_position is None:
        start_position = 0
      if stop_position is None:
        stop_position = self._count

      return OffsetRangeTracker(start_position, stop_position)

    def read(self, range_tracker):
      for i in range(self._count):
        if not range_tracker.try_claim(i):
          return
        yield i

    def split(self, desired_bundle_size, start_position=None,
              stop_position=None):
      if start_position is None:
        start_position = 0
      if stop_position is None:
        stop_position = self._count

      bundle_start = start_position
      while bundle_start < self._count:
        bundle_stop = min(self._count, bundle_start + desired_bundle_size)
        yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=bundle_stop)
        bundle_start = bundle_stop
  # [END model_custom_source_new_source]

  # Using the source in an example pipeline.
  # [START model_custom_source_use_new_source]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
  # [END model_custom_source_use_new_source]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()

  # We recommend prefixing Source class names with an underscore to discourage
  # using the Source class directly when a PTransform for the source is
  # available. We simulate that here by simply extending the previous Source
  # class.
  class _CountingSource(CountingSource):
    pass

  # [START model_custom_source_new_ptransform]
  class ReadFromCountingSource(PTransform):

    def __init__(self, count, **kwargs):
      super(ReadFromCountingSource, self).__init__(**kwargs)
      self._count = count

    def expand(self, pcoll):
      return pcoll | iobase.Read(_CountingSource(self._count))
  # [END model_custom_source_new_ptransform]

  # [START model_custom_source_use_ptransform]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
  # [END model_custom_source_use_ptransform]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()