Example #1
 def test_create_do_with_side_in_memory_write(self):
   elements = ['abc', 'def', 'ghi']
   side_elements = ['x', 'y', 'z']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=3),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, side)]),
               tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                             (False, None))),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(
                   inmemory.InMemorySource(
                       elements=[pickler.dumps(e) for e in side_elements],
                       start_index=None,
                       end_index=None),
                   tag='inmemory')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(
           output_buffer=output_buffer,
           input=(1, 0),
           output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a singleton, therefore we should
   # see only the first element appended.
   self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
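All of these examples build their pipelines through a make_map_task helper that is not shown in this excerpt. A minimal sketch of what it could look like follows; note that the exact maptask.MapTask constructor signature (operations plus a stage name and per-operation system names) is an assumption, not something the tests confirm.

def make_map_task(operations):
  # Assumption: MapTask(operations, stage_name, system_names); the real
  # constructor in this SDK version may take different arguments.
  return maptask.MapTask(operations, 'test_stage',
                         ['op_%d' % ix for ix in range(len(operations))])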
Example #2
    def test_concat_source_to_shuffle_sink(self):
        work = workitem.get_work_items(
            get_concat_source_to_shuffle_sink_message())
        self.assertIsNotNone(work)
        expected_sub_sources = []
        expected_sub_sources.append(
            io.TextFileSource(file_path='gs://sort_g/input_small_files/'
                              'ascii_sort_1MB_input.0000006',
                              start_offset=0,
                              end_offset=1000000,
                              strip_trailing_newlines=True,
                              coder=CODER))
        expected_sub_sources.append(
            io.TextFileSource(file_path='gs://sort_g/input_small_files/'
                              'ascii_sort_1MB_input.0000007',
                              start_offset=0,
                              end_offset=1000000,
                              strip_trailing_newlines=True,
                              coder=CODER))

        expected_concat_source = concat_reader.ConcatSource(
            expected_sub_sources)

        self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
            maptask.WorkerRead(expected_concat_source, output_coders=[CODER]),
            maptask.WorkerDoFn(serialized_fn='code',
                               output_tags=['out'],
                               input=(1, 0),
                               side_inputs=[],
                               output_coders=[CODER]),
            maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                       shuffle_writer_config='opaque',
                                       input=(1, 0),
                                       output_coders=(CODER, ))
        ]))
Example #3
 def test_create_do_with_side_text_file_write(self):
   input_path = self.create_temp_file('x\ny\n')
   elements = ['aa', 'bb']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(fileio.TextFileSource(
                   file_path=input_path, start_offset=None, end_offset=None,
                   strip_trailing_newlines=True,
                   coder=coders.StrUtf8Coder()),
                                             tag='textfile')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))]))
   # The side source was specified as a collection, therefore we should
   # see all elements of the side source.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
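The pickle_with_side_inputs helper used throughout is likewise defined outside this excerpt. A plausible sketch is below; the wire format (a pickled (fn, args, kwargs, tags_and_types) tuple) is an assumption about what the worker-side deserializer expects.

def pickle_with_side_inputs(fn, tag_and_type=None):
  # Assumption: the executor unpickles a (fn, args, kwargs, tags_and_types)
  # tuple; tags_and_types pairs each side-input tag with its view type.
  tags_and_types = [tag_and_type] if tag_and_type is not None else []
  return pickler.dumps((fn, [], {}, tags_and_types))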
Example #4
 def test_read_do_shuffle_write(self):
     input_path = self.create_temp_file('a\nb\nc\nd\n')
     work_spec = [
         maptask.WorkerRead(fileio.TextFileSource(
             file_path=input_path,
             start_offset=0,
             end_offset=8,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='none',
                                    input=(1, 0),
                                    output_coders=(self.SHUFFLE_CODER, ))
     ]
     shuffle_sink_mock = mock.MagicMock()
     executor.MapTaskExecutor().execute(make_map_task(work_spec),
                                        test_shuffle_sink=shuffle_sink_mock)
     # Make sure we have seen all the (k, v) writes.
     shuffle_sink_mock.writer().Write.assert_has_calls([
         mock.call('a', '', 1),
         mock.call('b', '', 1),
         mock.call('c', '', 1),
         mock.call('d', '', 1)
     ])
Example #5
 def test_read_do_write(self):
   input_path = self.create_temp_file('01234567890123456789\n0123456789')
   output_path = '%s.out' % input_path
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           fileio.TextFileSource(file_path=input_path,
                                 start_offset=0,
                                 end_offset=15,
                                 strip_trailing_newlines=True,
                                 coder=coders.StrUtf8Coder()),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
Example #6
 def test_create_do_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               # Start at the last element.
               start_index=2,
               # Go beyond the end to test that case is handled.
               end_index=15),
           output_coders=[coders.ToStringCoder()]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi\n', f.read())
Example #7
    def test_create_do_avro_write(self):
        output_path = self.create_temp_file('n/a')
        elements = ['abc', 'def', 'ghi']
        work_item = workitem.BatchWorkItem(None)

        work_item.map_task = make_map_task([
            maptask.WorkerRead(
                inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=2,  # Start at the last element.
                    end_index=3),
                output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=None,
                               output_coders=[self.OUTPUT_CODER]),
            make_text_sink(output_path,
                           input=(1, 0),
                           coder=coders.Base64PickleCoder())
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
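Several examples (#7 above, plus #8 and #19 below) abbreviate the text-sink step through a make_text_sink helper. Given the expanded WorkerWrite/TextFileSink form visible in examples #5 and #6, a sketch could look like this; the ToStringCoder default is a guess.

def make_text_sink(output_path, input, coder=coders.ToStringCoder()):
  # Mirrors the expanded WorkerWrite/TextFileSink step from examples #5/#6;
  # the default coder is an assumption.
  return maptask.WorkerWrite(
      fileio.TextFileSink(file_path_prefix=output_path,
                          append_trailing_newlines=True,
                          coder=coder),
      input=input,
      output_coders=(coder,))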
Example #8
 def test_read_do_write_with_start_bundle(self):
     input_path = self.create_temp_file('01234567890123456789\n0123456789')
     output_path = '%s.out' % input_path
     finish_path = '%s.finish' % input_path
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(fileio.TextFileSource(
                 file_path=input_path,
                 start_offset=0,
                 end_offset=15,
                 strip_trailing_newlines=True,
                 coder=coders.StrUtf8Coder()),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                 DoFnUsingStartBundle(finish_path)),
                                output_tags=['out'],
                                output_coders=[self.OUTPUT_CODER],
                                input=(0, 0),
                                side_inputs=None),
             make_text_sink(output_path, input=(1, 0))
         ]))
     with open(output_path) as f:
         self.assertEqual('XYZ: 01234567890123456789\n', f.read())
      # Check that the finish_bundle method of the custom DoFn object left
      # the expected side effect by writing a file with specific content.
     with open(finish_path) as f:
         self.assertEqual('finish called.', f.read())
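DoFnUsingStartBundle is a custom DoFn defined elsewhere in the test module. A sketch consistent with the assertions above follows; the context-passing hook signatures and the ptransform.DoFn base class are assumptions about this SDK generation.

class DoFnUsingStartBundle(ptransform.DoFn):
  """Prefixes each element in process() and leaves a file as evidence
  that finish_bundle() ran."""

  def __init__(self, finish_path):
    self.finish_path = finish_path

  def start_bundle(self, context):
    # State initialized here is visible to process() within the bundle.
    self.prefix = 'XYZ: '

  def process(self, context):
    yield self.prefix + context.element

  def finish_bundle(self, context):
    # The test checks for exactly this file content.
    with open(self.finish_path, 'w') as f:
      f.write('finish called.')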
Example #9
 def test_in_memory_source_to_flatten(self):
     work = workitem.get_work_items(
         get_in_memory_source_to_flatten_message())
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerRead(inmemory.InMemorySource(
             start_index=1,
             end_index=3,
             elements=[
                 base64.b64decode(v['value']) for v in IN_MEMORY_ELEMENTS
             ],
             coder=CODER),
                            output_coders=[CODER]),
         maptask.WorkerFlatten(inputs=[(0, 0)], output_coders=[CODER])
     ]))
Example #10
    def test_create_do_with_collection_side_bigquery_write(self):
        elements = ['aa', 'bb']
        side_elements = ['x', 'y']
        output_buffer = []
        patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
        with mock.patch(target=patch_target) as mock_class:
            # Setup the reader so it will yield the values in 'side_elements'.
            reader_mock = mock_class.return_value
            reader_mock.__enter__.return_value = reader_mock
            # Use a lambda so that multiple readers can be created, each reading the
            # entirety of the side elements.
            reader_mock.__iter__.side_effect = lambda: (x
                                                        for x in side_elements)

            executor.MapTaskExecutor().execute(
                make_map_task([
                    maptask.WorkerRead(inmemory.InMemorySource(
                        elements=[pickler.dumps(e) for e in elements],
                        start_index=0,
                        end_index=3),
                                       output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                        ptransform.CallableWrapperDoFn(
                            lambda x, side: ['%s:%s' % (x, s) for s in side]),
                        tag_and_type=('bigquery',
                                      pvalue.IterablePCollectionView, ())),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=[
                                   maptask.WorkerSideInputSource(
                                       bigquery.BigQuerySource(
                                           project='project',
                                           dataset='dataset',
                                           table='table',
                                           coder=get_bigquery_source_coder()),
                                       tag='bigquery')
                               ],
                               output_coders=[self.OUTPUT_CODER]),
                    maptask.WorkerInMemoryWrite(
                        output_buffer=output_buffer,
                        input=(1, 0),
                        output_coders=(self.OUTPUT_CODER, ))
                ]))
        # The side source was specified as a collection, therefore we should
        # see all elements of the side source.
        self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'],
                         sorted(output_buffer))
Example #11
    def test_in_memory_source_progress_reporting(self):
        elements = [101, 201, 301, 401, 501, 601, 701]
        output_buffer = []
        source = ProgressRequestRecordingInMemorySource(
            elements=[pickler.dumps(e) for e in elements])
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                        input=(0, 0),
                                        output_coders=(self.OUTPUT_CODER, ))
        ])
        executor.MapTaskExecutor(work_item.map_task).execute()
        self.assertEqual(elements, output_buffer)

        expected_progress_record = range(len(elements))
        self.assertEqual(expected_progress_record,
                         source.last_reader.progress_record)
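ProgressRequestRecordingInMemorySource (used here and in example #15) is another helper that is not shown. One way to structure it, assuming the executor polls the reader through a get_progress() hook; both that hook name and the wrapping approach are assumptions, and the _RecordingReader class is hypothetical.

class ProgressRequestRecordingInMemorySource(inmemory.InMemorySource):
  # Remembers the last reader handed out so tests can inspect its log.

  def reader(self):
    self.last_reader = _RecordingReader(
        super(ProgressRequestRecordingInMemorySource, self).reader())
    return self.last_reader


class _RecordingReader(object):
  # Delegates to the wrapped reader and logs every reported progress value.

  def __init__(self, reader):
    self._reader = reader
    self.progress_record = []

  def get_progress(self):
    progress = self._reader.get_progress()
    self.progress_record.append(progress)
    return progress

  def __enter__(self):
    self._reader.__enter__()
    return self

  def __exit__(self, *exc_info):
    return self._reader.__exit__(*exc_info)

  def __iter__(self):
    return iter(self._reader)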
Example #12
 def test_create_do_with_side_avro_file_write(self):
     input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
     input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
     elements = ['aa', 'bb']
     output_buffer = []
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(inmemory.InMemorySource(
                 elements=[pickler.dumps(e) for e in elements],
                 start_index=0,
                 end_index=2),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(
                 serialized_fn=pickle_with_side_inputs(
                     ptransform.CallableWrapperDoFn(
                         lambda x, side: ['%s:%s' % (x, s) for s in side]),
                     tag_and_type=('sometag',
                                   pvalue.IterablePCollectionView, ())),
                 output_tags=['out'],
                 input=(0, 0),
                  # Note that the two side inputs share the same tag. This is
                  # quite common for intermediate PCollections used as side
                  # inputs that are saved as Avro files. The files will contain
                  # the sharded PCollection.
                 side_inputs=[
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path1,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag'),
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path2,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag')
                 ],
                 output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerInMemoryWrite(
                 output_buffer=output_buffer,
                 input=(1, 0),
                 output_coders=(self.OUTPUT_CODER, ))
         ]))
      # The side sources were specified as a collection, therefore we should
      # see all elements from both side sources.
     self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                      sorted(output_buffer))
Example #13
 def test_in_memory_source_to_text_sink(self):
     work = workitem.get_work_items(
         get_in_memory_source_to_text_sink_message())
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerRead(inmemory.InMemorySource(
             start_index=1,
             end_index=3,
             elements=[
                 base64.b64decode(v['value']) for v in IN_MEMORY_ELEMENTS
             ],
             coder=CODER),
                            output_coders=[CODER]),
         maptask.WorkerWrite(fileio.NativeTextFileSink(
             file_path_prefix='gs://somefile',
             append_trailing_newlines=True,
             coder=CODER),
                             input=(0, 0),
                             output_coders=(CODER, ))
     ]))
Example #14
 def test_pgbk(self):
   elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
        inmemory.InMemorySource(elements=[pickler.dumps(e) for e in elements],
                                   start_index=0,
                                   end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerPartialGroupByKey(
           combine_fn=None,
           input=(0, 0),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))
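For reference, the grouping relation this assertion checks can be reproduced with a plain dictionary. The snippet below only illustrates the expected input/output pairing, not the worker's actual partial group-by-key implementation.

from collections import defaultdict


def group_pairs(pairs):
  # Collect values per key, preserving encounter order within each key.
  grouped = defaultdict(list)
  for key, value in pairs:
    grouped[key].append(value)
  return sorted(grouped.items())

assert group_pairs([('a', 1), ('b', 2), ('a', 3), ('a', 4)]) == [
    ('a', [1, 3, 4]), ('b', [2])]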
Example #15
  def test_in_memory_source_progress_reporting(self):
    elements = [101, 201, 301, 401, 501, 601, 701]
    output_buffer = []
    source = ProgressRequestRecordingInMemorySource(
        elements=[pickler.dumps(e) for e in elements])
    executor.MapTaskExecutor().execute(make_map_task([
        maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                    input=(0, 0),
                                    output_coders=(self.OUTPUT_CODER,))
    ]))
    self.assertEqual(elements, output_buffer)

    expected_progress_record = []
    len_elements = len(elements)
    for i in range(len_elements):
      expected_progress_record.append(float(i + 1) / len_elements)

    self.assertEqual(expected_progress_record,
                     source.last_reader.progress_record)
Example #16
 def test_text_source_to_shuffle_sink(self):
     work = workitem.get_work_items(
         get_text_source_to_shuffle_sink_message())
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerRead(io.TextFileSource(file_path='gs://somefile',
                                              start_offset=123,
                                              end_offset=123123,
                                              strip_trailing_newlines=True,
                                              coder=CODER),
                            output_coders=[CODER]),
         maptask.WorkerDoFn(serialized_fn='code',
                            output_tags=['out'],
                            input=(1, 0),
                            side_inputs=[],
                            output_coders=[CODER]),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='opaque',
                                    input=(1, 0),
                                    output_coders=(CODER, ))
     ]))
Example #17
 def test_combine(self):
   elements = [('a', [1, 2, 3]), ('b', [10])]
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=100),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerCombineFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CombineFn.from_callable(sum)),
                               phase='all',
                               input=(0, 0),
                               output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))
   ]))
   self.assertEqual([('a', 6), ('b', 10)], output_buffer)
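With phase='all', the CombineFn runs over each key's complete value list in a single step. The snippet below only illustrates that expected input/output relation; it is not the worker code path.

elements = [('a', [1, 2, 3]), ('b', [10])]
combined = [(key, sum(values)) for key, values in elements]
assert combined == [('a', 6), ('b', 10)]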
Example #18
  def test_create_do_with_singleton_side_bigquery_write(self):
    elements = ['abc', 'def', 'ghi']
    side_elements = ['x', 'y', 'z']
    output_buffer = []
    patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
    with mock.patch(target=patch_target) as mock_class:
      # Setup the reader so it will yield the values in 'side_elements'.
      reader_mock = mock_class.return_value
      reader_mock.__enter__.return_value = reader_mock
      reader_mock.__iter__.return_value = (x for x in side_elements)

      pickled_elements = [pickler.dumps(e) for e in elements]
      executor.MapTaskExecutor().execute(make_map_task([
          maptask.WorkerRead(
              inmemory.InMemorySource(elements=pickled_elements,
                                      start_index=0,
                                      end_index=3),
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerDoFn(
              serialized_fn=pickle_with_side_inputs(
                  ptransform.CallableWrapperDoFn(
                      lambda x, side: ['%s:%s' % (x, side)]),
                  tag_and_type=('bigquery', pvalue.SingletonPCollectionView,
                                (False, None))),
              output_tags=['out'], input=(0, 0),
              side_inputs=[
                  maptask.WorkerSideInputSource(
                      bigquery.BigQuerySource(
                          project='project',
                          dataset='dataset',
                          table='table',
                          coder=get_bigquery_source_coder()),
                      tag='bigquery')],
              output_coders=[self.OUTPUT_CODER]),
          maptask.WorkerInMemoryWrite(
              output_buffer=output_buffer,
              input=(1, 0),
              output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a singleton, therefore we should
    # see only the first element appended.
    self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
Example #19
    def test_read_do_write_with_undeclared_output(self):
        input_path = self.create_temp_file('01234567890123456789\n0123456789')
        output_path = '%s.out' % input_path
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(fileio.TextFileSource(
                file_path=input_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                DoFnUsingWithUndeclaredSideOutput()),
                               output_tags=['out'],
                               output_coders=[self.OUTPUT_CODER],
                               input=(0, 0),
                               side_inputs=None),
            make_text_sink(output_path, input=(1, 0))
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('01234567890123456789\n', f.read())
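Finally, DoFnUsingWithUndeclaredSideOutput is also defined outside this excerpt. A sketch consistent with the assertion above follows; using pvalue.SideOutputValue as the tagging mechanism is an assumption about this SDK version.

class DoFnUsingWithUndeclaredSideOutput(ptransform.DoFn):
  # Emits each element on the main output and also on a side-output tag
  # that was never declared in output_tags.

  def process(self, context):
    yield context.element
    # Assumption: SideOutputValue(tag, value) routes to a tagged output.
    yield pvalue.SideOutputValue('undeclared', context.element)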