def test_create_do_write(self):
    """In-memory source -> DoFn -> text sink; only the final element survives.

    The source starts at the last element and uses an end index past the
    end of the data to exercise out-of-range handling.
    """
    output_path = self.create_temp_file('n/a')
    elements = ['abc', 'def', 'ghi']
    source = inmemory.InMemorySource(
        elements=[pickler.dumps(e) for e in elements],
        # Start at the last element.
        start_index=2,
        # Go beyond the end to test that case is handled.
        end_index=15)
    operations = [
        maptask.WorkerRead(source, output_coders=[coders.ToStringCoder()]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        maptask.WorkerWrite(
            fileio.TextFileSink(file_path_prefix=output_path,
                                append_trailing_newlines=True,
                                coder=coders.ToStringCoder()),
            input=(1, 0),
            output_coders=(coders.ToStringCoder(),)),
    ]
    executor.MapTaskExecutor().execute(make_map_task(operations))
    with open(output_path) as f:
        self.assertEqual('XYZ: ghi\n', f.read())
def test_shuffle_read_do_write(self):
    """Grouped shuffle read -> DoFn flattening each (key, values) -> text sink."""
    output_path = self.create_temp_file('n/a')
    operations = [
        maptask.WorkerGroupingShuffleRead(
            shuffle_reader_config='none',
            start_shuffle_position='aaa',
            end_shuffle_position='zzz',
            coder=self.SHUFFLE_CODER,
            output_coders=[self.SHUFFLE_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(
                    # kv is a (key, values) pair; emit one line per value.
                    lambda kv: [str((kv[0], v)) for v in kv[1]])),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        maptask.WorkerWrite(
            fileio.TextFileSink(file_path_prefix=output_path,
                                append_trailing_newlines=True,
                                coder=coders.ToStringCoder()),
            input=(1, 0),
            output_coders=(coders.ToStringCoder(),)),
    ]
    # Stub out the shuffle source with two grouped records.
    shuffle_source_mock = mock.MagicMock()
    shuffle_source_mock.reader().__enter__().__iter__.return_value = [
        (10, [1, 2]), (20, [3])]
    executor.MapTaskExecutor().execute(
        make_map_task(operations),
        test_shuffle_source=shuffle_source_mock)
    with open(output_path) as f:
        self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())
def test_read_do_write_with_start_bundle(self):
    """Verifies that a DoFn's bundle lifecycle hooks run during execution.

    DoFnUsingStartBundle writes a marker file from finish_bundle; the test
    checks both the pipeline output and that side effect.
    """
    input_path = self.create_temp_file('01234567890123456789\n0123456789')
    output_path = '%s.out' % input_path
    finish_path = '%s.finish' % input_path
    operations = [
        maptask.WorkerRead(
            fileio.TextFileSource(file_path=input_path,
                                  start_offset=0,
                                  end_offset=15,
                                  strip_trailing_newlines=True,
                                  coder=coders.StrUtf8Coder()),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                DoFnUsingStartBundle(finish_path)),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        maptask.WorkerWrite(
            fileio.TextFileSink(file_path_prefix=output_path,
                                append_trailing_newlines=True,
                                coder=coders.ToStringCoder()),
            input=(1, 0),
            output_coders=(coders.ToStringCoder(),)),
    ]
    executor.MapTaskExecutor().execute(make_map_task(operations))
    with open(output_path) as f:
        self.assertEqual('XYZ: 01234567890123456789\n', f.read())
    # The DoFn's finish_bundle method should have left this marker file
    # behind with a specific content.
    with open(finish_path) as f:
        self.assertEqual('finish called.', f.read())
def test_read_do_write(self):
    """Text source (first line only, via end_offset) -> DoFn -> text sink."""
    input_path = self.create_temp_file('01234567890123456789\n0123456789')
    output_path = '%s.out' % input_path
    operations = [
        maptask.WorkerRead(
            fileio.TextFileSource(file_path=input_path,
                                  start_offset=0,
                                  end_offset=15,
                                  strip_trailing_newlines=True,
                                  coder=coders.StrUtf8Coder()),
            output_coders=[self.OUTPUT_CODER]),
        maptask.WorkerDoFn(
            serialized_fn=pickle_with_side_inputs(
                ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
            output_tags=['out'],
            output_coders=[self.OUTPUT_CODER],
            input=(0, 0),
            side_inputs=None),
        maptask.WorkerWrite(
            fileio.TextFileSink(file_path_prefix=output_path,
                                append_trailing_newlines=True,
                                coder=coders.ToStringCoder()),
            input=(1, 0),
            output_coders=(coders.ToStringCoder(),)),
    ]
    executor.MapTaskExecutor().execute(make_map_task(operations))
    with open(output_path) as f:
        self.assertEqual('XYZ: 01234567890123456789\n', f.read())
def test_write_gzip_file(self):
    """Writes through a compressed sink and reads the result back with gzip."""
    # NOTE(review): the constant is named DEFLATE but the file is read back
    # with gzip.GzipFile, so the sink presumably emits gzip framing for this
    # compression type — confirm against the fileio implementation.
    sink = fileio.TextFileSink(
        self.path, compression_type=fileio.CompressionTypes.DEFLATE)
    self._write_lines(sink, self.lines)
    with gzip.GzipFile(self.path, 'r') as compressed:
        self.assertEqual(compressed.read().splitlines(), self.lines)
def test_write_entire_file(self):
    """Writes lines one at a time via the sink's writer, then reads them back."""
    expected = ['First', 'Second', 'Third']
    file_path = self.create_temp_file()
    sink = fileio.TextFileSink(file_path)
    with sink.writer() as writer:
        for line in expected:
            writer.Write(line)
    with open(file_path, 'r') as f:
        self.assertEqual(f.read().splitlines(), expected)
def test_ungrouped_shuffle_read_and_write(self):
    """Ungrouped shuffle read piped straight into a text sink."""
    output_path = self.create_temp_file('n/a')
    operations = [
        maptask.WorkerUngroupedShuffleRead(
            shuffle_reader_config='none',
            start_shuffle_position='aaa',
            end_shuffle_position='zzz',
            coder=self.SHUFFLE_CODER,
            output_coders=[self.SHUFFLE_CODER]),
        maptask.WorkerWrite(
            fileio.TextFileSink(file_path_prefix=output_path,
                                append_trailing_newlines=True,
                                coder=coders.ToStringCoder()),
            input=(0, 0),
            output_coders=(coders.ToStringCoder(),)),
    ]
    # Stub shuffle source yielding three ungrouped values.
    shuffle_source_mock = mock.MagicMock()
    shuffle_source_mock.reader().__enter__().__iter__.return_value = [1, 2, 3]
    executor.MapTaskExecutor().execute(
        make_map_task(operations),
        test_shuffle_source=shuffle_source_mock)
    with open(output_path) as f:
        self.assertEqual('1\n2\n3\n', f.read())
def test_write_text_file(self):
    """Writes self.lines through an uncompressed text sink and verifies the file."""
    sink = fileio.TextFileSink(self.path)
    self._write_lines(sink, self.lines)
    with open(self.path, 'r') as written:
        self.assertEqual(written.read().splitlines(), self.lines)