Example #1
    def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                                 records_to_read, file_path):
        source = fileio.TextFileSource(file_path, start_offset, end_offset)

        records_of_first_split = ''

        with source.reader() as reader:
            reader_iter = iter(reader)
            i = 0

            try:
                while i < records_to_read:
                    records_of_first_split += next(reader_iter)
                    i += 1
            except StopIteration:
                # Invalid case: the given source does not contain this many records.
                return

            last_record_start_after_reading = reader.range_tracker.last_record_start

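            # A split is rejected (expected response None) when the proposed
            # stop position is at or before the start of the last record read,
            # coincides with the current range boundaries, or when reading has
            # not started yet.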
            if stop_offset <= last_record_start_after_reading:
                expected_split_response = None
            elif stop_offset == start_offset or stop_offset == end_offset:
                expected_split_response = None
            elif records_to_read == 0:
                expected_split_response = None  # unstarted
            else:
                expected_split_response = iobase.DynamicSplitResultWithPosition(
                    stop_position=iobase.ReaderPosition(
                        byte_offset=stop_offset))

            split_response = self.try_splitting_reader_at(
                reader,
                iobase.DynamicSplitRequest(progress=iobase.ReaderProgress(
                    iobase.ReaderPosition(byte_offset=stop_offset))),
                expected_split_response)

            # Reading remaining records from the updated reader.
            for line in reader:
                records_of_first_split += line

        if split_response is not None:
            # The total contents read from the two splits should equal the
            # contents read from the original source.
            records_of_original = ''
            records_of_second_split = ''

            with source.reader() as original_reader:
                for line in original_reader:
                    records_of_original += line

            new_source = fileio.TextFileSource(
                file_path, split_response.stop_position.byte_offset,
                end_offset)
            with new_source.reader() as reader:
                for line in reader:
                    records_of_second_split += line

            self.assertEqual(records_of_original,
                             records_of_first_split + records_of_second_split)
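
A minimal sketch of invoking this helper, assuming a 25-byte file of five newline-terminated 5-byte records (the path and all offset values below are hypothetical):

    # Hypothetical call: read two records, then propose a split at byte 15.
    # After two reads last_record_start is 5, so stop_offset=15 should yield
    # a successful split into [0, 15) and [15, 25).
    self.run_update_stop_position(
        start_offset=0, end_offset=25, stop_offset=15,
        records_to_read=2, file_path='/tmp/five_records.txt')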
Example #2
 def test_read_do_shuffle_write(self):
     input_path = self.create_temp_file('a\nb\nc\nd\n')
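     # The input file is exactly 8 bytes ('a\nb\nc\nd\n'), so the end_offset
     # of 8 below covers the entire file.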
     work_spec = [
         maptask.WorkerRead(fileio.TextFileSource(
             file_path=input_path,
             start_offset=0,
             end_offset=8,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder()),
                            output_coders=[self.OUTPUT_CODER]),
         maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
             ptransform.CallableWrapperDoFn(lambda x: [(x, 1)])),
                            output_tags=['out'],
                            output_coders=[self.OUTPUT_CODER],
                            input=(0, 0),
                            side_inputs=None),
         maptask.WorkerShuffleWrite(shuffle_kind='group_keys',
                                    shuffle_writer_config='none',
                                    input=(1, 0),
                                    output_coders=(self.SHUFFLE_CODER, ))
     ]
     shuffle_sink_mock = mock.MagicMock()
     executor.MapTaskExecutor().execute(make_map_task(work_spec),
                                        test_shuffle_sink=shuffle_sink_mock)
     # Make sure we have seen all of the (key, secondary key, value) writes.
     shuffle_sink_mock.writer().Write.assert_has_calls([
         mock.call('a', '', 1),
         mock.call('b', '', 1),
         mock.call('c', '', 1),
         mock.call('d', '', 1)
     ])
Example #3
 def test_read_do_write_with_start_bundle(self):
     input_path = self.create_temp_file('01234567890123456789\n0123456789')
     output_path = '%s.out' % input_path
     finish_path = '%s.finish' % input_path
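     # The second record starts at byte 21, past the end_offset of 15 used
     # below, so only the first record is read.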
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(fileio.TextFileSource(
                 file_path=input_path,
                 start_offset=0,
                 end_offset=15,
                 strip_trailing_newlines=True,
                 coder=coders.StrUtf8Coder()),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                 DoFnUsingStartBundle(finish_path)),
                                output_tags=['out'],
                                output_coders=[self.OUTPUT_CODER],
                                input=(0, 0),
                                side_inputs=None),
             make_text_sink(output_path, input=(1, 0))
         ]))
     with open(output_path) as f:
         self.assertEqual('XYZ: 01234567890123456789\n', f.read())
     # Check that the finish_bundle method of the custom DoFn object left the
     # expected side effect by writing a file with the expected content.
     with open(finish_path) as f:
         self.assertEqual('finish called.', f.read())
Example #4
 def test_create_do_with_side_text_file_write(self):
   input_path = self.create_temp_file('x\ny\n')
   elements = ['aa', 'bb']
   output_buffer = []
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               start_index=0,
               end_index=2),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(
           serialized_fn=pickle_with_side_inputs(
               ptransform.CallableWrapperDoFn(
                   lambda x, side: ['%s:%s' % (x, s) for s in side]),
               tag_and_type=('textfile', pvalue.IterablePCollectionView, ())),
           output_tags=['out'], input=(0, 0),
           side_inputs=[
               maptask.WorkerSideInputSource(fileio.TextFileSource(
                   file_path=input_path, start_offset=None, end_offset=None,
                   strip_trailing_newlines=True,
                   coder=coders.StrUtf8Coder()),
                                             tag='textfile')],
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerInMemoryWrite(output_buffer=output_buffer,
                                   input=(1, 0),
                                   output_coders=(self.OUTPUT_CODER,))]))
    # The side source was specified as a collection, therefore we should see
    # all elements of the side source.
   self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                    sorted(output_buffer))
Example #5
 def test_read_do_write(self):
   input_path = self.create_temp_file('01234567890123456789\n0123456789')
   output_path = '%s.out' % input_path
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           fileio.TextFileSource(file_path=input_path,
                                 start_offset=0,
                                 end_offset=15,
                                 strip_trailing_newlines=True,
                                 coder=coders.StrUtf8Coder()),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
Example #6
 def test_read_entire_file(self):
     lines = ['First', 'Second', 'Third']
     source = fileio.TextFileSource(
         file_path=self.create_temp_file('\n'.join(lines)))
     read_lines = []
     with source.reader() as reader:
         for line in reader:
             read_lines.append(line)
     self.assertEqual(read_lines, lines)
Example #7
 def test_create_do_with_side_avro_file_write(self):
     input_path1 = self.create_temp_file('%s\n' % pickler.dumps('x'))
     input_path2 = self.create_temp_file('%s\n' % pickler.dumps('y'))
     elements = ['aa', 'bb']
     output_buffer = []
     executor.MapTaskExecutor().execute(
         make_map_task([
             maptask.WorkerRead(inmemory.InMemorySource(
                 elements=[pickler.dumps(e) for e in elements],
                 start_index=0,
                 end_index=2),
                                output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerDoFn(
                 serialized_fn=pickle_with_side_inputs(
                     ptransform.CallableWrapperDoFn(
                         lambda x, side: ['%s:%s' % (x, s) for s in side]),
                     tag_and_type=('sometag',
                                   pvalue.IterablePCollectionView, ())),
                 output_tags=['out'],
                 input=(0, 0),
                 # Note that the two side inputs have the same tag. This is quite
                 # common for intermediary PCollections used as side inputs that
                 # are saved as AVRO files. The files will contain the sharded
                 # PCollection.
                 side_inputs=[
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path1,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag'),
                     maptask.WorkerSideInputSource(fileio.TextFileSource(
                         file_path=input_path2,
                         coder=coders.Base64PickleCoder()),
                                                   tag='sometag')
                 ],
                 output_coders=[self.OUTPUT_CODER]),
             maptask.WorkerInMemoryWrite(
                 output_buffer=output_buffer,
                 input=(1, 0),
                 output_coders=(self.OUTPUT_CODER, ))
         ]))
      # The side source was specified as a collection, therefore we should see
      # all elements of the side source.
     self.assertEqual([u'aa:x', u'aa:y', u'bb:x', u'bb:y'],
                      sorted(output_buffer))
Example #8
 def read_with_offsets(self, input_lines, output_lines,
                       start_offset=None, end_offset=None):
   source = fileio.TextFileSource(
       file_path=self.create_temp_file('\n'.join(input_lines)),
       start_offset=start_offset, end_offset=end_offset)
   read_lines = []
   with source.reader() as reader:
     for line in reader:
       read_lines.append(line)
   self.assertEqual(read_lines, output_lines)
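
For instance, a start offset that falls inside the first record should make reading begin at the next record boundary (the values below are illustrative):

    # Hypothetical call: 'aa\nbb\ncc' has records at byte offsets 0, 3 and 6;
    # start_offset=1 lands inside the first record, so only the later records
    # are expected.
    self.read_with_offsets(['aa', 'bb', 'cc'], ['bb', 'cc'], start_offset=1)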
Example #9
  def test_progress_entire_file(self):
    lines = ['First', 'Second', 'Third']
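    # '\n'.join(lines) is 'First\nSecond\nThird'; the records start at byte
    # offsets 0, 6 and 13.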
    source = fileio.TextFileSource(
        file_path=self.create_temp_file('\n'.join(lines)))
    progress_record = []
    with source.reader() as reader:
      self.assertEqual(-1, reader.get_progress().position.byte_offset)
      for line in reader:
        self.assertIsNotNone(line)
        progress_record.append(reader.get_progress().position.byte_offset)
      self.assertEqual(13, reader.get_progress().position.byte_offset)

    self.assertEqual(len(progress_record), 3)
    self.assertEqual(progress_record, [0, 6, 13])
Example #10
  def progress_with_offsets(self, input_lines,
                            start_offset=None, end_offset=None):
    source = fileio.TextFileSource(
        file_path=self.create_temp_file('\n'.join(input_lines)),
        start_offset=start_offset, end_offset=end_offset)
    progress_record = []
    with source.reader() as reader:
      self.assertEqual(reader.get_progress().position.byte_offset, -1)
      for line in reader:
        self.assertIsNotNone(line)
        progress_record.append(reader.get_progress().position.byte_offset)

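    # Reported progress must strictly increase as successive records are read.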
    previous = 0
    for current in progress_record:
      self.assertGreater(current, previous)
      previous = current
Example #11
    def test_update_stop_position_percent_complete_for_position(self):
        lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
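        # Joined with newlines, the records start at byte offsets 0, 5, 10,
        # 15 and 20, and the file is 24 bytes long.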
        source = fileio.TextFileSource(
            file_path=self.create_temp_file('\n'.join(lines)))
        with source.reader() as reader:
            # Read the first three records.
            reader_iter = iter(reader)
            next(reader_iter)
            next(reader_iter)
            next(reader_iter)

            # Splitting at the start of the range or beyond its end should be
            # unsuccessful.
            self.try_splitting_reader_at(
                reader,
                iobase.DynamicSplitRequest(
                    iobase.ReaderProgress(position=iobase.ReaderPosition(
                        byte_offset=0))), None)
            self.try_splitting_reader_at(
                reader,
                iobase.DynamicSplitRequest(
                    iobase.ReaderProgress(position=iobase.ReaderPosition(
                        byte_offset=25))), None)

            # Splitting at positions at or before the start offset of the last
            # record read should also be unsuccessful.
            self.try_splitting_reader_at(
                reader,
                iobase.DynamicSplitRequest(
                    iobase.ReaderProgress(position=iobase.ReaderPosition(
                        byte_offset=5))), None)
            self.try_splitting_reader_at(
                reader,
                iobase.DynamicSplitRequest(
                    iobase.ReaderProgress(position=iobase.ReaderPosition(
                        byte_offset=10))), None)

            # Splitting at a position after the start offset of the last record
            # read should be successful.
            self.try_splitting_reader_at(
                reader,
                iobase.DynamicSplitRequest(
                    iobase.ReaderProgress(position=iobase.ReaderPosition(
                        byte_offset=15))),
                iobase.DynamicSplitResultWithPosition(
                    iobase.ReaderPosition(byte_offset=15)))
Example #12
    def test_read_do_write_with_undeclared_output(self):
        input_path = self.create_temp_file('01234567890123456789\n0123456789')
        output_path = '%s.out' % input_path
        work_item = workitem.BatchWorkItem(None)
        work_item.map_task = make_map_task([
            maptask.WorkerRead(fileio.TextFileSource(
                file_path=input_path,
                start_offset=0,
                end_offset=15,
                strip_trailing_newlines=True,
                coder=coders.StrUtf8Coder()),
                               output_coders=[self.OUTPUT_CODER]),
            maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
                DoFnUsingWithUndeclaredSideOutput()),
                               output_tags=['out'],
                               output_coders=[self.OUTPUT_CODER],
                               input=(0, 0),
                               side_inputs=None),
            make_text_sink(output_path, input=(1, 0))
        ])

        executor.MapTaskExecutor(work_item.map_task).execute()
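        # Only the declared 'out' output reaches the text sink; the undeclared
        # side output emitted by the DoFn must not break execution.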
        with open(output_path) as f:
            self.assertEqual('01234567890123456789\n', f.read())