示例#1
0
 def test_create_do_write(self):
   """Run an in-memory read -> DoFn -> text-sink map task and check the file.

   The source holds three pickled elements but is read from index 2 up to an
   end index past the last element, so only the final element is expected to
   reach the sink, prefixed by the DoFn.
   """
   sink_path = self.create_temp_file('n/a')
   raw_values = ['abc', 'def', 'ghi']
   task_executor = executor.MapTaskExecutor()

   # Start at the last element and go beyond the end to test that case is
   # handled.
   source = inmemory.InMemorySource(
       elements=[pickler.dumps(value) for value in raw_values],
       start_index=2,
       end_index=15)
   read_op = maptask.WorkerRead(
       source, output_coders=[coders.ToStringCoder()])

   prefix_fn = ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])
   do_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(prefix_fn),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)

   text_sink = fileio.TextFileSink(file_path_prefix=sink_path,
                                   append_trailing_newlines=True,
                                   coder=coders.ToStringCoder())
   write_op = maptask.WorkerWrite(
       text_sink,
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))

   task_executor.execute(make_map_task([read_op, do_op, write_op]))

   with open(sink_path) as result_file:
     written = result_file.read()
   self.assertEqual('XYZ: ghi\n', written)
示例#2
0
 def test_shuffle_read_do_write(self):
   """Run a grouping-shuffle read -> DoFn -> text-sink map task.

   The shuffle source is replaced by a MagicMock whose reader yields two
   (key, values) pairs. The DoFn emits one str((key, value)) per value, and
   the test checks the lines written to the sink file.
   """
   output_path = self.create_temp_file('n/a')
   work_spec = [
       maptask.WorkerGroupingShuffleRead(shuffle_reader_config='none',
                                         start_shuffle_position='aaa',
                                         end_shuffle_position='zzz',
                                         coder=self.SHUFFLE_CODER,
                                         output_coders=[self.SHUFFLE_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(
               # Index into the (key, values) pair instead of unpacking it in
               # the parameter list: tuple parameters are a syntax error on
               # Python 3 (PEP 3113), while indexing works on Python 2 and 3.
               lambda kv: [str((kv[0], v)) for v in kv[1]])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]
   # Iterating the entered reader of the mocked source yields these grouped
   # (key, values) records.
   shuffle_source_mock = mock.MagicMock()
   shuffle_source_mock.reader().__enter__().__iter__.return_value = [
       (10, [1, 2]), (20, [3])]
   executor.MapTaskExecutor().execute(
       make_map_task(work_spec),
       test_shuffle_source=shuffle_source_mock)
   with open(output_path) as f:
     self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())
示例#3
0
 def test_read_do_write_with_start_bundle(self):
   """Run a text read -> DoFnUsingStartBundle -> text-sink map task.

   The input file has two lines and is read over offsets 0-15. The test
   expects only the first line, with an 'XYZ: ' prefix, in the output file,
   plus a separate "finish" file left behind by the DoFn
   (DoFnUsingStartBundle is defined outside this method).
   """
   source_path = self.create_temp_file('01234567890123456789\n0123456789')
   sink_path = '%s.out' % source_path
   finish_marker_path = '%s.finish' % source_path
   task_executor = executor.MapTaskExecutor()

   text_source = fileio.TextFileSource(file_path=source_path,
                                       start_offset=0,
                                       end_offset=15,
                                       strip_trailing_newlines=True,
                                       coder=coders.StrUtf8Coder())
   read_op = maptask.WorkerRead(text_source,
                                output_coders=[self.OUTPUT_CODER])

   do_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(
           DoFnUsingStartBundle(finish_marker_path)),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)

   text_sink = fileio.TextFileSink(file_path_prefix=sink_path,
                                   append_trailing_newlines=True,
                                   coder=coders.ToStringCoder())
   write_op = maptask.WorkerWrite(
       text_sink,
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))

   task_executor.execute(make_map_task([read_op, do_op, write_op]))

   with open(sink_path) as result_file:
     written = result_file.read()
   self.assertEqual('XYZ: 01234567890123456789\n', written)

   # The custom DoFn's finish_bundle method is expected to leave a side
   # effect: a file with a specific content at the finish path.
   with open(finish_marker_path) as marker_file:
     marker_text = marker_file.read()
   self.assertEqual('finish called.', marker_text)
示例#4
0
 def test_read_do_write(self):
   """Run a text read -> DoFn -> text-sink map task and check the output.

   The input file has two lines and is read over offsets 0-15. The test
   expects only the first line, prefixed with 'XYZ: ' by the DoFn, to be
   written to the output file.
   """
   source_path = self.create_temp_file('01234567890123456789\n0123456789')
   sink_path = '%s.out' % source_path
   task_executor = executor.MapTaskExecutor()

   text_source = fileio.TextFileSource(file_path=source_path,
                                       start_offset=0,
                                       end_offset=15,
                                       strip_trailing_newlines=True,
                                       coder=coders.StrUtf8Coder())
   read_op = maptask.WorkerRead(text_source,
                                output_coders=[self.OUTPUT_CODER])

   prefix_fn = ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])
   do_op = maptask.WorkerDoFn(
       serialized_fn=pickle_with_side_inputs(prefix_fn),
       output_tags=['out'],
       output_coders=[self.OUTPUT_CODER],
       input=(0, 0),
       side_inputs=None)

   text_sink = fileio.TextFileSink(file_path_prefix=sink_path,
                                   append_trailing_newlines=True,
                                   coder=coders.ToStringCoder())
   write_op = maptask.WorkerWrite(
       text_sink,
       input=(1, 0),
       output_coders=(coders.ToStringCoder(),))

   task_executor.execute(make_map_task([read_op, do_op, write_op]))

   with open(sink_path) as result_file:
     written = result_file.read()
   self.assertEqual('XYZ: 01234567890123456789\n', written)
示例#5
0
    def test_write_gzip_file(self):
        """Write self.lines through a DEFLATE TextFileSink and read them back.

        The file at self.path is reopened with gzip.GzipFile and its lines
        are compared with self.lines.
        """
        # NOTE(review): the sink is configured with DEFLATE but the file is
        # read back as gzip -- presumably DEFLATE means gzip in this fileio
        # version; confirm against CompressionTypes.
        deflate = fileio.CompressionTypes.DEFLATE
        gzip_sink = fileio.TextFileSink(self.path, compression_type=deflate)
        self._write_lines(gzip_sink, self.lines)

        with gzip.GzipFile(self.path, 'r') as compressed_file:
            written_lines = compressed_file.read().splitlines()
        self.assertEqual(written_lines, self.lines)
示例#6
0
 def test_write_entire_file(self):
   """Write three lines through a TextFileSink writer and read them back.

   Each line is passed to the writer's Write method; the temp file is then
   reopened and its lines are compared with what was written.
   """
   expected_lines = ['First', 'Second', 'Third']
   target_path = self.create_temp_file()
   text_sink = fileio.TextFileSink(target_path)

   with text_sink.writer() as sink_writer:
     for text in expected_lines:
       sink_writer.Write(text)

   with open(target_path, 'r') as written_file:
     written_lines = written_file.read().splitlines()
   self.assertEqual(written_lines, expected_lines)
示例#7
0
 def test_ungrouped_shuffle_read_and_write(self):
   """Run an ungrouped-shuffle read -> text-sink map task and check the file.

   The shuffle source is replaced by a MagicMock whose reader yields 1, 2
   and 3; the test expects each value on its own line in the output file.
   """
   sink_path = self.create_temp_file('n/a')

   shuffle_read_op = maptask.WorkerUngroupedShuffleRead(
       shuffle_reader_config='none',
       start_shuffle_position='aaa',
       end_shuffle_position='zzz',
       coder=self.SHUFFLE_CODER,
       output_coders=[self.SHUFFLE_CODER])

   text_sink = fileio.TextFileSink(file_path_prefix=sink_path,
                                   append_trailing_newlines=True,
                                   coder=coders.ToStringCoder())
   write_op = maptask.WorkerWrite(
       text_sink,
       input=(0, 0),
       output_coders=(coders.ToStringCoder(),))

   work_spec = [shuffle_read_op, write_op]

   # Iterating the entered reader of the mocked source yields these records.
   fake_shuffle_source = mock.MagicMock()
   fake_shuffle_source.reader().__enter__().__iter__.return_value = [1, 2, 3]

   executor.MapTaskExecutor().execute(
       make_map_task(work_spec),
       test_shuffle_source=fake_shuffle_source)

   with open(sink_path) as result_file:
     written = result_file.read()
   self.assertEqual('1\n2\n3\n', written)
示例#8
0
    def test_write_text_file(self):
        """Write self.lines through a TextFileSink and read them back.

        The sink is built from self.path alone; the file is then reopened
        with the built-in open and its lines are compared with self.lines.
        """
        text_sink = fileio.TextFileSink(self.path)
        self._write_lines(text_sink, self.lines)

        with open(self.path, 'r') as written_file:
            written_lines = written_file.read().splitlines()
        self.assertEqual(written_lines, self.lines)