def run_update_stop_position(self, start_offset, end_offset, stop_offset,
                             records_to_read, file_path):
    """Exercise one dynamic-split scenario on a text file source.

    Reads ``records_to_read`` records from a ``TextFileSource`` built from
    ``file_path``, ``start_offset`` and ``end_offset``, then requests a split
    at ``stop_offset`` and checks the response via
    ``self.try_splitting_reader_at``. When the split is accepted, verifies
    that the records of the first split followed by the records of the
    residual source equal the records of the original, unsplit source.

    Args:
        start_offset: Start offset handed to the source under test.
        end_offset: End offset handed to the source under test.
        stop_offset: Byte offset at which the dynamic split is requested.
        records_to_read: Number of records to consume before splitting.
        file_path: Path of the file backing the source.

    Returns:
        None. Returns early (without asserting anything) if the source holds
        fewer than ``records_to_read`` records.
    """
    source = fileio.TextFileSource(file_path, start_offset, end_offset)
    first_split_parts = []

    with source.reader() as reader:
        reader_iter = iter(reader)
        remaining = records_to_read
        try:
            while remaining > 0:
                first_split_parts.append(next(reader_iter))
                remaining -= 1
        except StopIteration:
            # Invalid case, given source does not contain this many records.
            return

        last_record_start = reader.range_tracker.last_record_start

        # The split is expected to be refused when the requested offset is
        # not past the start of the last record read, when it coincides with
        # either end of the source's range, or when nothing has been read
        # yet (unstarted reader).
        split_must_fail = (
            stop_offset <= last_record_start
            or stop_offset == start_offset
            or stop_offset == end_offset
            or records_to_read == 0)
        if split_must_fail:
            expected_split_response = None
        else:
            expected_split_response = iobase.DynamicSplitResultWithPosition(
                stop_position=iobase.ReaderPosition(byte_offset=stop_offset))

        split_response = self.try_splitting_reader_at(
            reader,
            iobase.DynamicSplitRequest(progress=iobase.ReaderProgress(
                iobase.ReaderPosition(byte_offset=stop_offset))),
            expected_split_response)

        # Reading remaining records from the updated reader.
        first_split_parts.extend(reader)

    if split_response is None:
        return

    # Total contents received by reading the two splits should be equal to
    # the result obtained by reading the original source.
    records_of_first_split = ''.join(first_split_parts)

    with source.reader() as original_reader:
        records_of_original = ''.join(original_reader)

    residual_source = fileio.TextFileSource(
        file_path, split_response.stop_position.byte_offset, end_offset)
    with residual_source.reader() as residual_reader:
        records_of_second_split = ''.join(residual_reader)

    self.assertEqual(records_of_original,
                     records_of_first_split + records_of_second_split)
def request_dynamic_split(self, dynamic_split_request):
    """Try to split this reader at the point described by the request.

    The split point is taken from ``progress.position`` when present;
    otherwise it is derived from ``progress.percent_complete`` through the
    range tracker's ``position_at_fraction``.

    Args:
        dynamic_split_request: Request whose ``progress`` carries either a
            ``position`` or a ``percent_complete``. Must not be None.

    Returns:
        A ``DynamicSplitResultWithPosition`` wrapping the split position if
        ``self.range_tracker.try_split`` accepts it; None if the request
        carries neither a position nor a usable percentage, or if the range
        tracker refuses the split.
    """
    assert dynamic_split_request is not None
    progress = dynamic_split_request.progress
    split_position = progress.position

    if split_position is None:
        percent_complete = progress.percent_complete
        if percent_complete is None:
            logging.warning(
                'TextReader requires either a position or a percentage of work to '
                'be complete to perform a dynamic split request. Requested: %r',
                dynamic_split_request)
            return None
        # Only fractions strictly inside (0, 1) describe a usable split point.
        if percent_complete <= 0 or percent_complete >= 1:
            logging.warning(
                'FileBasedReader cannot be split since the provided percentage '
                'of work to be completed is out of the valid range (0, '
                '1). Requested: %r', dynamic_split_request)
            return None
        # Translate the fraction into a byte offset using the range tracker.
        split_position = iobase.ReaderPosition()
        split_position.byte_offset = (
            self.range_tracker.position_at_fraction(percent_complete))

    if not self.range_tracker.try_split(split_position.byte_offset):
        return None
    return iobase.DynamicSplitResultWithPosition(split_position)
def request_dynamic_split(self, dynamic_split_request):
    """Try to split this shuffle reader at the requested shuffle position.

    Args:
        dynamic_split_request: Request whose ``progress.position`` must carry
            an encoded ``shuffle_position``. Must not be None.

    Returns:
        A ``DynamicSplitResultWithPosition`` holding the requested (still
        encoded) shuffle position if the range tracker accepts the split;
        None if the request has no position, the position has no shuffle
        position, or the range tracker refuses the split.
    """
    assert dynamic_split_request is not None
    requested_position = dynamic_split_request.progress.position

    if requested_position is None:
        logging.warning(
            'GroupingShuffleReader only supports split at a Position.'
            ' Requested: %r', dynamic_split_request)
        return None

    encoded_shuffle_position = requested_position.shuffle_position
    if encoded_shuffle_position is None:
        logging.warning(
            'GroupingShuffleReader only supports split at a shuffle'
            ' position. Requested: %r', requested_position)
        return None

    # The range tracker is given the decoded position; the result handed
    # back to the caller keeps the encoded form it was requested with.
    if not self._range_tracker.try_split_at_position(
            _shuffle_decode(encoded_shuffle_position)):
        logging.info('Refusing to split GroupedShuffleReader %r at %s', self,
                     encoded_shuffle_position)
        return None

    logging.info('Split GroupedShuffleReader at %s', encoded_shuffle_position)
    return iobase.DynamicSplitResultWithPosition(
        iobase.ReaderPosition(shuffle_position=encoded_shuffle_position))
def test_dynamic_split_result_with_position_to_cloud_stop_position(self):
    """A byte-offset split result converts to a cloud Position with that offset."""
    split_result = iobase.DynamicSplitResultWithPosition(
        iobase.ReaderPosition(byte_offset=9999))

    # Bind the long converter name once to keep the call readable.
    to_cloud_stop_position = (
        apiclient.dynamic_split_result_with_position_to_cloud_stop_position)
    cloud_position = to_cloud_stop_position(split_result)

    self.assertIsNotNone(cloud_position)
    self.assertIsInstance(cloud_position, dataflow.Position)
    self.assertEqual(9999, cloud_position.byteOffset)
def test_update_stop_position_percent_complete_for_position(self):
    """Position-based split requests on a partially read text source.

    The temp file holds five 4-character lines joined by newlines (24
    characters in total). After three records have been read, split
    requests at offsets 0, 25, 5 and 10 are expected to be refused and a
    request at offset 15 is expected to succeed.
    """
    lines = ['aaaa', 'bbbb', 'cccc', 'dddd', 'eeee']
    source = fileio.TextFileSource(
        file_path=self.create_temp_file('\n'.join(lines)))

    def split_request_at(byte_offset):
        # Build a position-based split request for the given byte offset.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(byte_offset=byte_offset)))

    with source.reader() as reader:
        # Reading three lines.
        reader_iter = iter(reader)
        for _ in range(3):
            next(reader_iter)

        # Splitting at either end of the range should be unsuccessful.
        for range_end_offset in (0, 25):
            self.try_splitting_reader_at(
                reader, split_request_at(range_end_offset), None)

        # Splitting at positions on or before the start offset of the last
        # record read should be unsuccessful.
        for consumed_offset in (5, 10):
            self.try_splitting_reader_at(
                reader, split_request_at(consumed_offset), None)

        # Splitting at a position after the start offset of the last record
        # should be successful.
        self.try_splitting_reader_at(
            reader,
            split_request_at(15),
            iobase.DynamicSplitResultWithPosition(
                iobase.ReaderPosition(byte_offset=15)))
def request_dynamic_split(self, dynamic_split_request):
    """Try to split this in-memory reader at the requested record index.

    Args:
        dynamic_split_request: Request whose ``progress.position`` must carry
            a ``record_index``. Must not be None.

    Returns:
        A ``DynamicSplitResultWithPosition`` wrapping the requested position
        if ``self._range_tracker.try_split`` accepts the index; None if the
        request has no position, the position has no record index, or the
        range tracker refuses the split.
    """
    assert dynamic_split_request is not None
    requested_position = dynamic_split_request.progress.position

    if requested_position is None:
        logging.debug(
            'InMemory reader only supports split requests that are '
            'based on positions. Received : %r', dynamic_split_request)
        return None

    record_index = requested_position.record_index
    if record_index is None:
        logging.debug(
            'InMemory reader only supports split requests that are '
            'based on index positions. Received : %r', dynamic_split_request)
        return None

    if not self._range_tracker.try_split(record_index):
        return None
    return iobase.DynamicSplitResultWithPosition(requested_position)
def test_dynamic_splitting_with_range(self):
    """Split requests on a shuffle source bounded by positions '0' and '3'.

    After one record has been read, requests at the encoded positions '0',
    '3' and '4' are expected to be refused, and a request at '2' is
    expected to succeed.
    """
    source = GroupedShuffleSource(
        config_bytes='not used',
        coder=Base64Coder(),
        start_position=base64.urlsafe_b64encode('0'),
        end_position=base64.urlsafe_b64encode('3'))
    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    def position_at(key):
        # Reader position carrying the URL-safe base64 encoding of ``key``.
        return iobase.ReaderPosition(
            shuffle_position=base64.urlsafe_b64encode(key))

    def split_request_at(key):
        # Position-based split request for the encoded ``key``.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=position_at(key)))

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
        reader_iter = iter(reader)
        next(reader_iter)

        # Cannot split if split request is out of range.
        for out_of_range_key in ('0', '3', '4'):
            self.try_splitting_reader_at(
                reader, split_request_at(out_of_range_key), None)

        # Successful split.
        self.try_splitting_reader_at(
            reader,
            split_request_at('2'),
            iobase.DynamicSplitResultWithPosition(position_at('2')))
def test_dynamic_splitting(self):
    """Split requests on an unbounded shuffle source, before and after reading.

    A request on the unstarted reader is expected to be refused. After two
    records have been read, requests at the encoded positions '0' and '1'
    are expected to be refused and a request at '3' is expected to succeed.
    """
    source = GroupedShuffleSource(
        config_bytes='not used', coder=Base64Coder())
    chunks = [TEST_CHUNK1, TEST_CHUNK2]

    def position_at(key):
        # Reader position carrying the URL-safe base64 encoding of ``key``.
        return iobase.ReaderPosition(
            shuffle_position=base64.urlsafe_b64encode(key))

    def split_request_at(key):
        # Position-based split request for the encoded ``key``.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(position=position_at(key)))

    with source.reader(test_reader=FakeShuffleReader(chunks)) as reader:
        # Cannot split an unstarted reader.
        self.try_splitting_reader_at(reader, split_request_at('1'), None)

        reader_iter = iter(reader)
        for _ in range(2):
            next(reader_iter)

        # Cannot split since the provided split position is smaller than or
        # equal to the current position '1'.
        for stale_key in ('0', '1'):
            self.try_splitting_reader_at(
                reader, split_request_at(stale_key), None)

        # Successful split.
        self.try_splitting_reader_at(
            reader,
            split_request_at('3'),
            iobase.DynamicSplitResultWithPosition(position_at('3')))
def test_in_memory_source_dynamic_split(self):
    """Record-index split requests on a six-element in-memory source.

    Covers four scenarios, each on a fresh reader: an unstarted reader, an
    out-of-range index, an index the reader has already passed, and two
    successive successful splits (at index 4, then at index 2).
    """
    source = inmemory.InMemorySource([10, 20, 30, 40, 50, 60],
                                     coder=FakeCoder())

    def split_request_at(record_index):
        # Position-based split request for the given record index.
        return iobase.DynamicSplitRequest(
            iobase.ReaderProgress(
                position=iobase.ReaderPosition(record_index=record_index)))

    def split_result_at(record_index):
        # The response expected when a split at ``record_index`` succeeds.
        return iobase.DynamicSplitResultWithPosition(
            stop_position=iobase.ReaderPosition(record_index=record_index))

    # Unstarted reader.
    with source.reader() as reader:
        self.try_splitting_reader_at(reader, split_request_at(2), None)

    # Proposed split position out of range.
    with source.reader() as reader:
        reader_iter = iter(reader)
        next(reader_iter)
        for out_of_range_index in (-1, 10):
            self.try_splitting_reader_at(
                reader, split_request_at(out_of_range_index), None)

    # Already read past proposed split position.
    with source.reader() as reader:
        reader_iter = iter(reader)
        for _ in range(3):
            next(reader_iter)
        for passed_index in (1, 2):
            self.try_splitting_reader_at(
                reader, split_request_at(passed_index), None)

    # Successful split, followed by a second split at an earlier index on
    # the same reader.
    with source.reader() as reader:
        reader_iter = iter(reader)
        next(reader_iter)
        for accepted_index in (4, 2):
            self.try_splitting_reader_at(
                reader,
                split_request_at(accepted_index),
                split_result_at(accepted_index))