def test_split_points(self):
  """Verifies split_points() reporting while reading a multi-block Avro file.

  Writes 12000 records (which span three Avro blocks), reads them through a
  single split's range tracker, and checks that the tracker reports unknown
  remaining split points at the start and exactly one remaining split point
  (the current block) near the end.
  """
  file_name = self._write_data(count=12000)
  source = AvroSource(file_name)
  splits = [
      split for split in source.split(desired_bundle_size=float('inf'))
  ]
  assert len(splits) == 1

  range_tracker = splits[0].source.get_range_tracker(
      splits[0].start_position, splits[0].stop_position)

  # Record the tracker's split-point report after every record read.
  split_points_report = []
  for _ in splits[0].source.read(range_tracker):
    split_points_report.append(range_tracker.split_points())

  # There are a total of three blocks. Each block has more than 10 records.

  # When reading records of the first block, range_tracker.split_points()
  # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
  self.assertEqual(
      split_points_report[:10],
      [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10)

  # When reading records of last block, range_tracker.split_points() should
  # return (2, 1)
  self.assertEqual(split_points_report[-10:], [(2, 1)] * 10)
def test_read_reantrant_with_splitting(self):
  """Checks that a split source can be read reentrantly.

  NOTE(review): "reantrant" in the method name is a typo for "reentrant";
  kept as-is since the name is what the test runner discovers.
  """
  file_name = self._write_data()
  source = AvroSource(file_name)

  splits = list(source.split(desired_bundle_size=100000))
  assert len(splits) == 1

  first = splits[0]
  source_test_utils.assertReentrantReadsSucceed(
      (first.source, first.start_position, first.stop_position))
def test_source_display_data(self):
  """Verifies the display data emitted by AvroSource."""
  source = AvroSource('some_avro_source', validate=False)
  dd = DisplayData.create_from(source)

  # AvroSource contributes no avro-specific display items, only the
  # generic file-source ones.
  expected_items = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', 'some_avro_source'),
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
def test_dynamic_work_rebalancing_exhaustive(self):
  """Exhaustively tests dynamic work rebalancing on a tiny Avro file.

  Shrinks avro's block sync interval so the exhaustive split-at-fraction
  test finishes quickly, restoring the original value afterwards.
  """
  old_sync_interval = avro.datafile.SYNC_INTERVAL
  try:
    avro.datafile.SYNC_INTERVAL = 2
    file_name = self._write_data(count=5)
    source = AvroSource(file_name)
    splits = list(source.split(desired_bundle_size=float('inf')))
    assert len(splits) == 1
    source_test_utils.assert_split_at_fraction_exhaustive(splits[0].source)
  finally:
    # Always restore the module-level setting so other tests are unaffected.
    avro.datafile.SYNC_INTERVAL = old_sync_interval
def test_corrupted_file(self):
  """Verifies that reading a file with a corrupted sync marker fails.

  Flips the last character of a valid Avro file (which is also the last
  character of the final sync marker) and checks that reading raises a
  ValueError mentioning the unexpected sync marker.
  """
  file_name = self._write_data()
  with open(file_name, 'rb') as f:
    data = f.read()

  # Corrupt the last character of the file which is also the last character
  # of the last sync_marker.
  # NOTE(review): this string/bytes handling and `exception.message` below
  # are Python-2 specific — confirm before porting to Python 3.
  corrupted_data = data[:-1] + ('A' if data[-1] == 'B' else 'B')

  with tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template) as f:
    f.write(corrupted_data)
    corrupted_file_name = f.name

  source = AvroSource(corrupted_file_name)
  with self.assertRaises(ValueError) as exn:
    source_test_utils.read_from_source(source, None, None)
  # Assert the error message prefix directly rather than via str.find().
  self.assertTrue(
      exn.exception.message.startswith('Unexpected sync marker'))
def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                   expected_result):
  """Reads an Avro source, optionally splitting it, and checks the result.

  Args:
    pattern: file pattern passed to AvroSource.
    desired_bundle_size: bundle size to use when splitting; must be truthy
      when perform_splitting is set.
    perform_splitting: if True, split the source and compare each split
      against the unsplit reference source; otherwise read directly and
      compare the records against expected_result.
    expected_result: records expected from a direct (non-splitting) read.
  """
  source = AvroSource(pattern)

  if perform_splitting:
    assert desired_bundle_size
    splits = [
        split for split in source.split(
            desired_bundle_size=desired_bundle_size)
    ]
    if len(splits) < 2:
      raise ValueError('Test is trivial. Please adjust it so that at least '
                       'two splits get generated')

    sources_info = [(split.source, split.start_position, split.stop_position)
                    for split in splits]
    # Each split must yield the same records as the unsplit source.
    source_test_utils.assertSourcesEqualReferenceSource(
        (source, None, None), sources_info)
  else:
    # Direct read: records are compared here, not on the splitting path,
    # since the splitting path validates via the reference source instead.
    read_records = source_test_utils.readFromSource(source, None, None)
    self.assertItemsEqual(expected_result, read_records)
def test_read_reentrant_without_splitting(self):
  """Checks that the unsplit source can be read reentrantly."""
  source = AvroSource(self._write_data())
  source_test_utils.assertReentrantReadsSucceed((source, None, None))