Exemplo n.º 1
0
def make_text_sink(output_path, input, coder=coders.ToStringCoder()):
    """Return a WorkerWrite operation targeting a native text file sink.

    Args:
      output_path: File path prefix the sink writes to.
      input: The upstream operation output this write consumes
        (presumably an (operation_index, output_index) pair, as used by
        the callers in this file).
      coder: Coder applied by the sink and advertised as the write's
        output coder.
    """
    sink = fileio.NativeTextFileSink(
        file_path_prefix=output_path,
        append_trailing_newlines=True,
        coder=coder)
    return maptask.WorkerWrite(sink, input=input, output_coders=(coder,))
Exemplo n.º 2
0
 def test_shuffle_read_do_write(self):
   """Runs a grouped shuffle read -> DoFn -> text sink pipeline end to end.

   A mocked shuffle source yields (key, [values]) groups; the DoFn renders
   each (key, value) pair as a string, and the results land in a temporary
   text file whose contents are then verified.
   """
   output_path = self.create_temp_file('n/a')
   work_spec = [
       maptask.WorkerGroupingShuffleRead(shuffle_reader_config='none',
                                         start_shuffle_position='aaa',
                                         end_shuffle_position='zzz',
                                         coder=self.SHUFFLE_CODER,
                                         output_coders=[self.SHUFFLE_CODER]),
       # Take a single pair argument instead of Python 2's tuple-parameter
       # unpacking (`lambda (k, vs): ...`), which is a syntax error under
       # Python 3 (removed by PEP 3113); behavior is identical.
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(
               lambda kv: [str((kv[0], v)) for v in kv[1]])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]
   # Mocked shuffle reader produces two groups: key 10 with two values and
   # key 20 with one value.
   shuffle_source_mock = mock.MagicMock()
   shuffle_source_mock.reader().__enter__().__iter__.return_value = [
       (10, [1, 2]), (20, [3])]
   executor.MapTaskExecutor().execute(
       make_map_task(work_spec),
       test_shuffle_source=shuffle_source_mock)
   with open(output_path) as f:
     self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())
Exemplo n.º 3
0
 def test_create_do_write(self):
   """In-memory source (end index past the data) -> DoFn -> text sink."""
   out_path = self.create_temp_file('n/a')
   data = ['abc', 'def', 'ghi']
   read_op = maptask.WorkerRead(
       inmemory.InMemorySource(
           elements=[pickler.dumps(item) for item in data],
           # Begin at the final element.
           start_index=2,
           # Deliberately beyond the end to verify that case is handled.
           end_index=15),
       output_coders=[coders.ToStringCoder()])
   dofn_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)
   write_op = maptask.WorkerWrite(
       fileio.TextFileSink(file_path_prefix=out_path,
                           append_trailing_newlines=True,
                           coder=coders.ToStringCoder()),
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))
   executor.MapTaskExecutor().execute(
       make_map_task([read_op, dofn_op, write_op]))
   # Only the last element ('ghi') falls inside [2, 15).
   with open(out_path) as f:
     self.assertEqual('XYZ: ghi\n', f.read())
Exemplo n.º 4
0
 def test_read_do_write_with_start_bundle(self):
   """Text read -> DoFn with bundle hooks -> text sink, end to end."""
   in_path = self.create_temp_file('01234567890123456789\n0123456789')
   out_path = '%s.out' % in_path
   finish_marker = '%s.finish' % in_path
   read_op = maptask.WorkerRead(
       fileio.TextFileSource(file_path=in_path,
                             start_offset=0,
                             end_offset=15,
                             strip_trailing_newlines=True,
                             coder=coders.StrUtf8Coder()),
       output_coders=[self.OUTPUT_CODER])
   dofn_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(
           DoFnUsingStartBundle(finish_marker)),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)
   write_op = maptask.WorkerWrite(
       fileio.TextFileSink(file_path_prefix=out_path,
                           append_trailing_newlines=True,
                           coder=coders.ToStringCoder()),
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))
   executor.MapTaskExecutor().execute(
       make_map_task([read_op, dofn_op, write_op]))
   with open(out_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
   # finish_bundle on the custom DoFn should have left its side-effect
   # behind: a marker file with this exact content.
   with open(finish_marker) as f:
     self.assertEqual('finish called.', f.read())
Exemplo n.º 5
0
 def test_read_do_write(self):
   """Text file read -> DoFn -> text sink, end to end."""
   in_path = self.create_temp_file('01234567890123456789\n0123456789')
   out_path = '%s.out' % in_path
   read_op = maptask.WorkerRead(
       fileio.TextFileSource(file_path=in_path,
                             start_offset=0,
                             end_offset=15,
                             strip_trailing_newlines=True,
                             coder=coders.StrUtf8Coder()),
       output_coders=[self.OUTPUT_CODER])
   dofn_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)
   write_op = maptask.WorkerWrite(
       fileio.TextFileSink(file_path_prefix=out_path,
                           append_trailing_newlines=True,
                           coder=coders.ToStringCoder()),
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))
   executor.MapTaskExecutor().execute(
       make_map_task([read_op, dofn_op, write_op]))
   # Only the first line starts within [0, 15), so exactly one record out.
   with open(out_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
 def test_ungrouped_shuffle_source_to_text_sink(self):
     work = workitem.get_work_items(
         get_shuffle_source_to_text_sink_message(
             UNGROUPED_SHUFFLE_SOURCE_SPEC))
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerUngroupedShuffleRead(start_shuffle_position='opaque',
                                            end_shuffle_position='opaque',
                                            shuffle_reader_config='opaque',
                                            coder=CODER,
                                            output_coders=[CODER]),
         maptask.WorkerWrite(fileio.NativeTextFileSink(
             file_path_prefix='gs://somefile',
             append_trailing_newlines=True,
             coder=CODER),
                             input=(0, 0),
                             output_coders=(CODER, ))
     ]))
 def test_in_memory_source_to_text_sink(self):
     work = workitem.get_work_items(
         get_in_memory_source_to_text_sink_message())
     self.assertEqual((work.proto.id, work.map_task.operations), (1234, [
         maptask.WorkerRead(inmemory.InMemorySource(
             start_index=1,
             end_index=3,
             elements=[
                 base64.b64decode(v['value']) for v in IN_MEMORY_ELEMENTS
             ],
             coder=CODER),
                            output_coders=[CODER]),
         maptask.WorkerWrite(fileio.NativeTextFileSink(
             file_path_prefix='gs://somefile',
             append_trailing_newlines=True,
             coder=CODER),
                             input=(0, 0),
                             output_coders=(CODER, ))
     ]))
Exemplo n.º 8
0
 def test_ungrouped_shuffle_read_and_write(self):
   """Ungrouped shuffle read (mocked) -> text sink, end to end."""
   out_path = self.create_temp_file('n/a')
   read_op = maptask.WorkerUngroupedShuffleRead(
       shuffle_reader_config='none',
       start_shuffle_position='aaa',
       end_shuffle_position='zzz',
       coder=self.SHUFFLE_CODER,
       output_coders=[self.SHUFFLE_CODER])
   write_op = maptask.WorkerWrite(
       fileio.TextFileSink(file_path_prefix=out_path,
                           append_trailing_newlines=True,
                           coder=coders.ToStringCoder()),
       input=(0, 0),
       output_coders=(coders.ToStringCoder(),))
   # The mocked shuffle reader yields three bare values (no grouping).
   source_mock = mock.MagicMock()
   source_mock.reader().__enter__().__iter__.return_value = [1, 2, 3]
   executor.MapTaskExecutor().execute(
       make_map_task([read_op, write_op]),
       test_shuffle_source=source_mock)
   with open(out_path) as f:
     self.assertEqual('1\n2\n3\n', f.read())