Example #1
    def test_create_do_avro_write(self):
        output_path = self.create_temp_file('n/a')
        elements = ['abc', 'def', 'ghi']
        work_item = workitem.BatchWorkItem(None)

        work_item.map_task = make_map_task([
            maptask.WorkerRead(
                inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=2,  # Start at the last element.
                    end_index=3),
                output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=None,
                               output_coders=[self.OUTPUT_CODER]),
            make_text_sink(output_path,
                           input=(1, 0),
                           coder=coders.Base64PickleCoder())
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: ghi', pickler.loads(f.read().strip()))
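
The sink writes with coders.Base64PickleCoder and the assertion decodes with pickler.loads, so the test relies on a base64-over-pickle round trip. A stdlib model of that round trip (the coder's exact encoding is an assumption here, and the helper names are illustrative):

import base64
import pickle

def encode(value):
    # Plausible write-side behavior of a base64/pickle coder.
    return base64.b64encode(pickle.dumps(value))

def decode(data):
    return pickle.loads(base64.b64decode(data))

# Only the slice [2:3] of the input reaches the DoFn (start_index=2,
# end_index=3), so the sink holds a single encoded record.
elements = ['abc', 'def', 'ghi']
records = [encode('XYZ: %s' % e) for e in elements[2:3]]
assert decode(records[0]) == 'XYZ: ghi'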
Example #2
 def test_create_do_with_side_in_memory_write(self):
     elements = ['abc', 'def', 'ghi']
     side_elements = ['x', 'y', 'z']
     output_buffer = []
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task([
         maptask.WorkerRead(inmemory.InMemorySource(
             elements=[pickler.dumps(e) for e in elements],
             start_index=0,
             end_index=3),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(
             serialized_fn=pickle_with_side_inputs(
                 ptransform.CallableWrapperDoFn(
                     lambda x, side: ['%s:%s' % (x, side)]),
                 tag_and_type=('inmemory', pvalue.SingletonPCollectionView,
                               (False, None))),
             output_tags=['out'],
             input=(0, 0),
             side_inputs=[
                 maptask.WorkerSideInputSource(inmemory.InMemorySource(
                     elements=[pickler.dumps(e) for e in side_elements],
                     start_index=None,
                     end_index=None),
                                               tag='inmemory')
             ],
             output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                     input=(1, 0),
                                     output_coders=(self.OUTPUT_CODER, ))
     ])
     executor.MapTaskExecutor(work_item.map_task).execute()
     # The side source was specified as a singleton, so each main element
     # should pair with only the first side element.
     self.assertEqual(['abc:x', 'def:x', 'ghi:x'], output_buffer)
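
The assertion follows from the singleton view semantics: a SingletonPCollectionView hands the DoFn a single value, which with this in-memory source is the first side element. A plain-Python model:

def pair_with_singleton(main, side):
    # Singleton view: exactly one side value is consumed.
    singleton = side[0]
    return ['%s:%s' % (x, singleton) for x in main]

assert pair_with_singleton(['abc', 'def', 'ghi'], ['x', 'y', 'z']) == [
    'abc:x', 'def:x', 'ghi:x']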
Example #3
 def test_shuffle_read_do_write(self):
     output_path = self.create_temp_file('n/a')
     work_spec = [
         maptask.WorkerGroupingShuffleRead(
             shuffle_reader_config='none',
             start_shuffle_position='aaa',
             end_shuffle_position='zzz',
             coder=self.SHUFFLE_CODER,
             output_coders=[self.SHUFFLE_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(
                 lambda kv: [str((kv[0], v)) for v in kv[1]])),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         make_text_sink(output_path, input=(1, 0))
     ]
     shuffle_source_mock = mock.MagicMock()
     shuffle_source_mock.reader().__enter__().__iter__.return_value = [
         (10, [1, 2]), (20, [3])
     ]
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task(work_spec)
     executor.MapTaskExecutor(
         work_item.map_task,
         test_shuffle_source=shuffle_source_mock).execute()
     with open(output_path) as f:
         self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())
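
The one-line mock setup works because MagicMock returns the same child mock for repeated identical lookups, so the executor's own 'with source.reader() as r: for kv in r' sequence sees exactly the configured values. A self-contained illustration:

import mock  # on Python 3, unittest.mock behaves the same way

source = mock.MagicMock()
source.reader().__enter__().__iter__.return_value = [(10, [1, 2]), (20, [3])]

# The same chain the executor would follow:
with source.reader() as reader:
    assert list(reader) == [(10, [1, 2]), (20, [3])]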
Example #4
 def test_read_do_shuffle_write(self):
     input_path = self.create_temp_file('a\nb\nc\nd\n')
     work_spec = [
         maptask.WorkerRead(fileio.TextFileSource(
             file_path=input_path,
             start_offset=0,
             end_offset=8,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='none',
                                    input=(1, 0),
                                    output_coders=(self.SHUFFLE_CODER, ))
     ]
     shuffle_sink_mock = mock.MagicMock()
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task(work_spec)
     executor.MapTaskExecutor(
         work_item.map_task, test_shuffle_sink=shuffle_sink_mock).execute()
     # Make sure we have seen all the (k, v) writes.
     shuffle_sink_mock.writer().Write.assert_has_calls([
         mock.call('a', '', 1),
         mock.call('b', '', 1),
         mock.call('c', '', 1),
         mock.call('d', '', 1)
     ])
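
assert_has_calls checks that the given calls appear as a contiguous subsequence of the mock's recorded calls, so incidental calls around the four writes would not fail the test. A minimal illustration:

import mock

sink = mock.MagicMock()
writer = sink.writer()
writer.Write('a', '', 1)
writer.Write('b', '', 1)

# Passes: both calls appear, in order, among Write's recorded calls.
sink.writer().Write.assert_has_calls(
    [mock.call('a', '', 1), mock.call('b', '', 1)])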
Example #5
    def test_read_do_write_with_start_bundle(self):
        input_path = self.create_temp_file('01234567890123456789\n0123456789')
        output_path = '%s.out' % input_path
        finish_path = '%s.finish' % input_path
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(fileio.TextFileSource(
                file_path=input_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                DoFnUsingStartBundle(finish_path)),
                               output_tags=['out'],
                               output_coders=[self.OUTPUT_CODER],
                               input=(0, 0),
                               side_inputs=None),
            make_text_sink(output_path, input=(1, 0))
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: 01234567890123456789\n', f.read())
        # Check that the finish_bundle method of the custom DoFn left the
        # expected side effect by writing a file with the expected contents.
        with open(finish_path) as f:
            self.assertEqual('finish called.', f.read())
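
DoFnUsingStartBundle is a helper defined elsewhere in the test module. A plausible sketch of its shape, inferred from the assertions above (the base class and the context attribute are assumptions, not the SDK's confirmed API):

class DoFnUsingStartBundle(object):

    def __init__(self, finish_path):
        self.finish_path = finish_path

    def start_bundle(self, context):
        # State prepared per bundle; the 'XYZ: ' prefix in the expected
        # output suggests it is set up here.
        self.prefix = 'XYZ: '

    def process(self, context):
        yield self.prefix + context.element

    def finish_bundle(self, context):
        # The side effect the test checks for via finish_path.
        with open(self.finish_path, 'w') as f:
            f.write('finish called.')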
Example #6
    def test_read_do_write(self):
        input_path = self.create_temp_file('01234567890123456789\n0123456789')
        output_path = '%s.out' % input_path

        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(fileio.TextFileSource(
                file_path=input_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                               output_tags=['out'],
                               output_coders=[self.OUTPUT_CODER],
                               input=(0, 0),
                               side_inputs=None),
            make_text_sink(output_path, input=(1, 0))
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        with open(output_path) as f:
            self.assertEqual('XYZ: 01234567890123456789\n', f.read())
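
Only the first of the two input lines comes out, which matches the usual source-splitting convention: a range-restricted reader owns the records that start inside [start_offset, end_offset). A stdlib model of that rule (an assumption about TextFileSource, not its verified implementation):

def read_range(data, start, end):
    offset = 0
    for line in data.splitlines(True):
        if start <= offset < end:
            yield line.rstrip('\n')  # models strip_trailing_newlines=True
        offset += len(line)

data = '01234567890123456789\n0123456789'
# The first record starts at byte 0 (inside [0, 15)); the second starts
# at byte 21 (outside), so only one record is read.
assert list(read_range(data, 0, 15)) == ['01234567890123456789']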
Example #7
    def test_create_do_with_collection_side_bigquery_write(self):
        elements = ['aa', 'bb']
        side_elements = ['x', 'y']
        output_buffer = []
        patch_target = 'google.cloud.dataflow.io.bigquery.BigQueryReader'
        with mock.patch(target=patch_target) as mock_class:
            # Set up the reader so it will yield the values in 'side_elements'.
            reader_mock = mock_class.return_value
            reader_mock.__enter__.return_value = reader_mock
            # Use a lambda so that each iteration over the reader yields a
            # fresh generator covering all of the side elements.
            reader_mock.__iter__.side_effect = lambda: (x
                                                        for x in side_elements)

            work_item = workitem.BatchWorkItem(None)
            work_item.map_task = make_map_task([
                maptask.WorkerRead(inmemory.InMemorySource(
                    elements=[pickler.dumps(e) for e in elements],
                    start_index=0,
                    end_index=3),
                                   output_coders=[self.OUTPUT_CODER]),
                maptask.WorkerDoFn(
                    serialized_fn=pickle_with_side_inputs(
                        ptransform.CallableWrapperDoFn(
                            lambda x, side: ['%s:%s' % (x, s) for s in side]),
                        tag_and_type=('bigquery',
                                      pvalue.IterablePCollectionView, ())),
                    output_tags=['out'],
                    input=(0, 0),
                    side_inputs=[
                        maptask.WorkerSideInputSource(bigquery.BigQuerySource(
                            project='project',
                            dataset='dataset',
                            table='table',
                            coder=get_bigquery_source_coder()),
                                                      tag='bigquery')
                    ],
                    output_coders=[self.OUTPUT_CODER]),
                maptask.WorkerInMemoryWrite(
                    output_buffer=output_buffer,
                    input=(1, 0),
                    output_coders=(self.OUTPUT_CODER, ))
            ])
            executor.MapTaskExecutor(work_item.map_task).execute()
        # The side source was specified as a collection, so we should see
        # all elements of the side source.
        self.assertEqual(['aa:x', 'aa:y', 'bb:x', 'bb:y'],
                         sorted(output_buffer))
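
The side_effect lambda is what lets the side input be read more than once: each call to iter() on the reader produces a fresh generator. Had a single generator been set as a plain return_value, a second pass would find it exhausted. A self-contained illustration:

import mock

reader = mock.MagicMock()
reader.__iter__.side_effect = lambda: (x for x in ['x', 'y'])

assert list(reader) == ['x', 'y']
assert list(reader) == ['x', 'y']  # a second pass still sees everything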
Example #8
    def test_in_memory_source_progress_reporting(self):
        elements = [101, 201, 301, 401, 501, 601, 701]
        output_buffer = []
        source = ProgressRequestRecordingInMemorySource(
            elements=[pickler.dumps(e) for e in elements])
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(source, output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                        input=(0, 0),
                                        output_coders=(self.OUTPUT_CODER, ))
        ])
        executor.MapTaskExecutor(work_item.map_task).execute()
        self.assertEqual(elements, output_buffer)

        expected_progress_record = list(range(len(elements)))
        self.assertEqual(expected_progress_record,
                         source.last_reader.progress_record)
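
ProgressRequestRecordingInMemorySource is a test helper defined elsewhere. Judging by the assertion, its reader records the current element index each time the harness asks for progress, once per element here. A minimal stand-in (all names below are illustrative):

class RecordingReader(object):

    def __init__(self, elements):
        self.elements = elements
        self.progress_record = []
        self.index = -1

    def __iter__(self):
        for self.index, element in enumerate(self.elements):
            yield element

    def request_progress(self):
        # Called once per element in this scenario.
        self.progress_record.append(self.index)

reader = RecordingReader([101, 201, 301])
for _ in reader:
    reader.request_progress()
assert reader.progress_record == [0, 1, 2]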
Example #9
 def test_create_do_with_side_avro_file_write(self):
     input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
     input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
     elements = ['aa', 'bb']
     output_buffer = []
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task([
         maptask.WorkerRead(inmemory.InMemorySource(
             elements=[pickler.dumps(e) for e in elements],
             start_index=0,
             end_index=2),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(
             serialized_fn=pickle_with_side_inputs(
                 ptransform.CallableWrapperDoFn(
                     lambda x, side: ['%s:%s' % (x, s) for s in side]),
                 tag_and_type=('sometag', pvalue.IterablePCollectionView,
                               ())),
             output_tags=['out'],
             input=(0, 0),
             # Note that the two side inputs have the same tag. This is quite
             # common for intermediate PCollections used as side inputs that
             # are saved as AVRO files. Together the files hold the sharded
             # PCollection.
             side_inputs=[
                 maptask.WorkerSideInputSource(fileio.TextFileSource(
                     file_path=input_path1,
                     coder=coders.Base64PickleCoder()),
                                               tag='sometag'),
                 maptask.WorkerSideInputSource(fileio.TextFileSource(
                     file_path=input_path2,
                     coder=coders.Base64PickleCoder()),
                                               tag='sometag')
             ],
             output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                     input=(1, 0),
                                     output_coders=(self.OUTPUT_CODER, ))
     ])
     executor.MapTaskExecutor(work_item.map_task).execute()
     # The side source was specified as a collection, so we should see
     # both side elements paired with each main element.
     self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                      sorted(output_buffer))
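
Both side sources above share the tag 'sometag', and the expected output shows that same-tag sources are read as one concatenated iterable side input. A plain-Python model:

side = ['x'] + ['y']  # contents of input_path1 and input_path2, same tag
main = ['aa', 'bb']
assert sorted('%s:%s' % (x, s) for x in main for s in side) == [
    'aa:x', 'aa:y', 'bb:x', 'bb:y']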
Example #10
    def test_progress_reporter_reports_progress(self, mock_report_response,
                                                mock_next_progress):  # pylint: disable=unused-argument
        work_item = workitem.BatchWorkItem(proto=mock.MagicMock(),
                                           map_task=mock.MagicMock())
        mock_work_executor = mock.MagicMock()
        mock_batch_worker = mock.MagicMock()
        mock_client = mock.MagicMock()

        mock_next_progress.return_value = 1

        progress_reporter = batchworker.ProgressReporter(
            work_item, mock_work_executor, mock_batch_worker, mock_client)
        progress_reporter.start_reporting_progress()
        time.sleep(10)
        progress_reporter.stop_reporting_progress()
        mock_client.report_status.assert_called_with(mock.ANY, mock.ANY,
                                                     mock.ANY, mock.ANY,
                                                     mock.ANY, mock.ANY,
                                                     mock.ANY)
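
The mock_report_response and mock_next_progress parameters are presumably supplied by mock.patch decorators that this excerpt does not show. The start/stop calls fit a background-thread reporter that reports on an interval; a minimal sketch of that pattern (an illustration, not batchworker.ProgressReporter itself):

import threading
import time

class PeriodicReporter(object):
    """Illustrative stand-in for the start/stop reporting pattern."""

    def __init__(self, report_fn, interval=1.0):
        self.report_fn = report_fn
        self.interval = interval
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run)

    def _run(self):
        # Report once per interval until asked to stop.
        while not self._stop.wait(self.interval):
            self.report_fn()

    def start_reporting_progress(self):
        self._thread.start()

    def stop_reporting_progress(self):
        self._stop.set()
        self._thread.join()

reports = []
reporter = PeriodicReporter(lambda: reports.append(time.time()), interval=0.1)
reporter.start_reporting_progress()
time.sleep(0.35)
reporter.stop_reporting_progress()
assert reports  # at least one report happened while running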
Example #11
 def test_pgbk(self):
     elements = [('a', 1), ('b', 2), ('a', 3), ('a', 4)]
     output_buffer = []
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task([
         maptask.WorkerRead(inmemory.InMemorySource(
             elements=[pickler.dumps(e) for e in elements],
             start_index=0,
             end_index=100),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerPartialGroupByKey(combine_fn=None,
                                         input=(0, 0),
                                         output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                     input=(1, 0),
                                     output_coders=(self.OUTPUT_CODER, ))
     ])
     executor.MapTaskExecutor(work_item.map_task).execute()
     self.assertEqual([('a', [1, 3, 4]), ('b', [2])], sorted(output_buffer))
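
Because all four elements arrive in a single bundle, the partial group-by-key step groups them completely before the in-memory write. A plain-Python model of that grouping:

def partial_group_by_key(pairs):
    grouped = {}
    for key, value in pairs:
        grouped.setdefault(key, []).append(value)
    return sorted(grouped.items())

assert partial_group_by_key([('a', 1), ('b', 2), ('a', 3), ('a', 4)]) == [
    ('a', [1, 3, 4]), ('b', [2])]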
Example #12
 def test_combine(self):
     elements = [('a', [1, 2, 3]), ('b', [10])]
     output_buffer = []
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task([
         maptask.WorkerRead(inmemory.InMemorySource(
             elements=[pickler.dumps(e) for e in elements],
             start_index=0,
             end_index=100),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerCombineFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CombineFn.from_callable(sum)),
                                 phase='all',
                                 input=(0, 0),
                                 output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                     input=(1, 0),
                                     output_coders=(self.OUTPUT_CODER, ))
     ])
     executor.MapTaskExecutor(work_item.map_task).execute()
     self.assertEqual([('a', 6), ('b', 10)], output_buffer)
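
With phase='all', the combine step applies the wrapped callable (sum here) to each key's value list in one shot:

elements = [('a', [1, 2, 3]), ('b', [10])]
assert [(k, sum(vs)) for k, vs in elements] == [('a', 6), ('b', 10)]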
Example #13
 def test_ungrouped_shuffle_read_and_write(self):
     output_path = self.create_temp_file('n/a')
     work_spec = [
         maptask.WorkerUngroupedShuffleRead(
             shuffle_reader_config='none',
             start_shuffle_position='aaa',
             end_shuffle_position='zzz',
             coder=self.SHUFFLE_CODER,
             output_coders=[self.SHUFFLE_CODER]),
         make_text_sink(output_path, input=(0, 0))
     ]
     shuffle_source_mock = mock.MagicMock()
     shuffle_source_mock.reader().__enter__().__iter__.return_value = [
         1, 2, 3
     ]
     work_item = workitem.BatchWorkItem(None)
     work_item.map_task = make_map_task(work_spec)
     executor.MapTaskExecutor(
         work_item.map_task,
         test_shuffle_source=shuffle_source_mock).execute()
     with open(output_path) as f:
         self.assertEqual('1\n2\n3\n', f.read())
Example #14
    def test_create_do_with_side_text_file_write(self):
        input_path = self.create_temp_file('x\ny\n')
        elements = ['aa', 'bb']
        output_buffer = []
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(inmemory.InMemorySource(
                elements=[pickler.dumps(e) for e in elements],
                start_index=0,
                end_index=2),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    lambda x, side: ['%s:%s' % (x, s) for s in side]),
                tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
                               output_tags=['out'],
                               input=(0, 0),
                               side_inputs=[
                                   maptask.WorkerSideInputSource(
                                       fileio.TextFileSource(
                                           file_path=input_path,
                                           start_offset=None,
                                           end_offset=None,
                                           strip_trailing_newlines=True,
                                           coder=coders.StrUtf8Coder()),
                                       tag='textfile')
                               ],
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                        input=(1, 0),
                                        output_coders=(self.OUTPUT_CODER, ))
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
        # The side source was specified as a collection, so we should see
        # all elements of the side source.
        self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                         sorted(output_buffer))
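
In contrast to the singleton view of Example #2, an IterablePCollectionView exposes the whole side collection, so every main element is paired with every side element:

main, side = ['aa', 'bb'], ['x', 'y']
result = ['%s:%s' % (x, s) for x in main for s in side]
assert sorted(result) == ['aa:x', 'aa:y', 'bb:x', 'bb:y']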