def test_estimate_size(self):
    """Check that a ConcatSource over three RangeSources estimates size 1000."""
    sub_sources = [
        RangeSource(0, 10),
        RangeSource(10, 100),
        RangeSource(100, 1000),
    ]
    estimated = ConcatSource(sub_sources).estimate_size()
    self.assertEqual(estimated, 1000)
def test_estimate_size(self):
    """Check the estimated size of a ConcatSource built from three DummySources.

    Each DummySource wraps a 10-value range, and the expected estimate is 30.
    """
    sources = [
        TestConcatSource.DummySource(range(start, start + 10))
        for start in [0, 10, 20]
    ]
    concat = ConcatSource(sources)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest versions no longer provide.
    self.assertEqual(30, concat.estimate_size())
def test_read(self):
    """Read a ConcatSource of three DummySources and expect the values 0..29."""
    sub_sources = []
    for start in [0, 10, 20]:
        sub_sources.append(
            TestConcatSource.DummySource(range(start, start + 10)))
    concat = ConcatSource(sub_sources)
    tracker = concat.get_range_tracker(None, None)
    actual = list(concat.read(tracker))
    # Order-insensitive comparison of the values read.
    self.assertCountEqual(list(range(30)), actual)
def test_read(self):
    """Read a ConcatSource of three DummySources and expect the values 0..29."""
    sources = [
        TestConcatSource.DummySource(range(start, start + 10))
        for start in [0, 10, 20]
    ]
    concat = ConcatSource(sources)
    range_tracker = concat.get_range_tracker(None, None)
    read_data = list(concat.read(range_tracker))
    # assertItemsEqual only exists on Python 2; assertCountEqual is its
    # Python 3 name and performs the same order-insensitive comparison.
    self.assertCountEqual(list(range(30)), read_data)
def test_position_at_fration(self):
    """Check position_at_fraction on a ConcatSource of weighted SourceBundles.

    The first group of assertions uses the default range tracker; the second
    uses a tracker restricted to the positions (1, None) .. (3, None).
    """
    ranges = [(0, 4), (4, 16), (16, 24), (24, 32)]
    # The loop variables are unpacked as start/stop so the builtin ``range``
    # is not shadowed.
    source = ConcatSource([
        iobase.SourceBundle(
            (stop - start) / 32., RangeSource(start, stop), None, None)
        for start, stop in ranges
    ])

    # assertEqual replaces the deprecated assertEquals alias throughout.
    range_tracker = source.get_range_tracker()
    self.assertEqual(range_tracker.position_at_fraction(0), (0, 0))
    self.assertEqual(range_tracker.position_at_fraction(.01), (0, 1))
    self.assertEqual(range_tracker.position_at_fraction(.1), (0, 4))
    self.assertEqual(range_tracker.position_at_fraction(.125), (1, 4))
    self.assertEqual(range_tracker.position_at_fraction(.2), (1, 7))
    self.assertEqual(range_tracker.position_at_fraction(.7), (2, 23))
    self.assertEqual(range_tracker.position_at_fraction(.75), (3, 24))
    self.assertEqual(range_tracker.position_at_fraction(.8), (3, 26))
    self.assertEqual(range_tracker.position_at_fraction(1), (4, None))

    range_tracker = source.get_range_tracker((1, None), (3, None))
    self.assertEqual(range_tracker.position_at_fraction(0), (1, 4))
    self.assertEqual(range_tracker.position_at_fraction(.01), (1, 5))
    self.assertEqual(range_tracker.position_at_fraction(.5), (1, 14))
    self.assertEqual(range_tracker.position_at_fraction(.599), (1, 16))
    self.assertEqual(range_tracker.position_at_fraction(.601), (2, 17))
    self.assertEqual(range_tracker.position_at_fraction(1), (3, None))
def test_fraction_consumed_at_end(self):
    """Expect fraction_consumed() == 1.0 for a tracker starting at (2, None)."""
    sub_sources = [RangeSource(0, 2), RangeSource(2, 4)]
    concat = ConcatSource(sub_sources)
    tracker = concat.get_range_tracker((2, None), None)
    consumed = tracker.fraction_consumed()
    self.assertEqual(consumed, 1.0)
def test_position_at_fration(self):
    """Check position_at_fraction on a ConcatSource of weighted SourceBundles.

    Two trackers are exercised in turn: the default one, and one restricted
    to the positions (1, None) .. (3, None).
    """
    bounds = [(0, 4), (4, 16), (16, 24), (24, 32)]
    bundles = []
    for lo, hi in bounds:
        weight = (hi - lo) / 32.
        bundles.append(
            iobase.SourceBundle(weight, RangeSource(lo, hi), None, None))
    source = ConcatSource(bundles)

    # (fraction, expected position) pairs for the unrestricted tracker.
    full_cases = [
        (0, (0, 0)),
        (.01, (0, 1)),
        (.1, (0, 4)),
        (.125, (1, 4)),
        (.2, (1, 7)),
        (.7, (2, 23)),
        (.75, (3, 24)),
        (.8, (3, 26)),
        (1, (4, None)),
    ]
    tracker = source.get_range_tracker()
    for fraction, expected in full_cases:
        self.assertEqual(tracker.position_at_fraction(fraction), expected)

    # (fraction, expected position) pairs for the restricted tracker.
    restricted_cases = [
        (0, (1, 4)),
        (.01, (1, 5)),
        (.5, (1, 14)),
        (.599, (1, 16)),
        (.601, (2, 17)),
        (1, (3, None)),
    ]
    tracker = source.get_range_tracker((1, None), (3, None))
    for fraction, expected in restricted_cases:
        self.assertEqual(tracker.position_at_fraction(fraction), expected)
def test_single_source(self):
    """Read a one-element ConcatSource in full, from a start, and up to a stop."""
    read_all = source_test_utils.readFromSource
    range10 = RangeSource(0, 10)
    # Expected values are materialized with list(): on Python 3 a list never
    # compares equal to a range object. assertEqual replaces the deprecated
    # assertEquals alias.
    self.assertEqual(read_all(ConcatSource([range10])), list(range(10)))
    self.assertEqual(
        read_all(ConcatSource([range10]), (0, 5)), list(range(5, 10)))
    self.assertEqual(
        read_all(ConcatSource([range10]), None, (0, 5)), list(range(5)))
def test_source_with_empty_ranges(self):
    """Read ConcatSources that mix empty RangeSources with a 0..9 RangeSource."""
    read_all = source_test_utils.readFromSource
    empty = RangeSource(0, 0)
    # assertEqual replaces the deprecated assertEquals alias throughout.
    self.assertEqual(read_all(empty), [])

    range10 = RangeSource(0, 10)
    # Ranges are turned into lists before comparing or concatenating:
    # range objects do not support ``+`` and never equal a list on Python 3.
    expected_once = list(range(10))
    self.assertEqual(
        read_all(ConcatSource([empty, empty, range10])), expected_once)
    self.assertEqual(
        read_all(ConcatSource([empty, range10, empty])), expected_once)
    self.assertEqual(
        read_all(ConcatSource([range10, empty, range10, empty])),
        expected_once + expected_once)
def _get_concat_source(self):
    """Return the ConcatSource for this source, building and caching it once.

    When ``self._concat_source`` is still None, the patterns are matched via
    ``FileSystems.match`` and each matched file with a non-zero size is
    wrapped in a ``_SingleFileSource``. The resulting ConcatSource is stored
    on ``self._concat_source`` and returned.
    """
    if self._concat_source is not None:
        return self._concat_source

    per_file_sources = []
    for result in FileSystems.match(self._patterns.get()):
        # One pickled round-trip copy of this source per match result.
        source_copy = pickler.loads(pickler.dumps(self))
        for metadata in result.metadata_list:
            path = metadata.path
            size = metadata.size_in_bytes
            if size == 0:
                # Ignoring empty file.
                continue

            # We determine splittability of this specific file.
            can_split = (
                self.splittable and
                _determine_splittability_from_compression_type(
                    path, self._compression_type))
            per_file_sources.append(
                _SingleFileSource(
                    source_copy,
                    path,
                    0,
                    size,
                    min_bundle_size=self._min_bundle_size,
                    splittable=can_split))

    self._concat_source = ConcatSource(per_file_sources)
    return self._concat_source
def test_split(self):
    """Split a ConcatSource of three DummySources and read every split back.

    Expects six splits whose combined contents are the values 0..29.
    """
    sources = [
        TestConcatSource.DummySource(list(range(start, start + 10)))
        for start in [0, 10, 20]
    ]
    concat = ConcatSource(sources)
    splits = list(concat.split())
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(6, len(splits))

    # Reading all splits
    read_data = []
    for split in splits:
        range_tracker_for_split = split.source.get_range_tracker(
            split.start_position, split.stop_position)
        read_data.extend(split.source.read(range_tracker_for_split))
    self.assertCountEqual(list(range(30)), read_data)
def test_split(self):
    """Split a ConcatSource of three DummySources and read every split back.

    Expects six splits whose combined contents are the values 0..29.
    """
    sources = [
        TestConcatSource.DummySource(range(start, start + 10))
        for start in [0, 10, 20]
    ]
    concat = ConcatSource(sources)
    splits = list(concat.split())
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(6, len(splits))

    # Reading all splits
    read_data = []
    for split in splits:
        range_tracker_for_split = split.source.get_range_tracker(
            split.start_position, split.stop_position)
        read_data.extend(split.source.read(range_tracker_for_split))
    # assertItemsEqual only exists on Python 2; assertCountEqual is its
    # Python 3 name and performs the same order-insensitive comparison.
    self.assertCountEqual(list(range(30)), read_data)
def test_conact_source_exhaustive(self):
    """Run the exhaustive split-at-fraction check on three disjoint ranges."""
    sub_sources = [
        RangeSource(0, 10),
        RangeSource(100, 110),
        RangeSource(1000, 1010),
    ]
    concat = ConcatSource(sub_sources)
    source_test_utils.assert_split_at_fraction_exhaustive(concat)
def test_run_concat_direct(self):
    """Read a three-range ConcatSource in a TestPipeline and expect 0..999."""
    sub_sources = [
        RangeSource(0, 10),
        RangeSource(10, 100),
        RangeSource(100, 1000),
    ]
    concat = ConcatSource(sub_sources)
    expected = list(range(1000))
    with TestPipeline() as pipeline:
        values = pipeline | beam.io.Read(concat)
        assert_that(values, equal_to(expected))
def test_run_concat_direct(self):
    """Read a three-range ConcatSource in a TestPipeline and expect 0..999."""
    source = ConcatSource([
        RangeSource(0, 10),
        RangeSource(10, 100),
        RangeSource(100, 1000),
    ])
    # The context manager takes over from the explicit pipeline.run() call:
    # the pipeline is run when the block exits.
    with TestPipeline() as pipeline:
        pcoll = pipeline | beam.Read(source)
        # The expected values are passed as a concrete list rather than a
        # lazy range object.
        assert_that(pcoll, equal_to(list(range(1000))))
def test_source_with_empty_ranges_exhastive(self):
    """Run the exhaustive split-at-fraction check with empty sources mixed in."""
    empty = RangeSource(0, 0)
    sub_sources = [
        empty,
        RangeSource(0, 10),
        empty,
        empty,
        RangeSource(10, 13),
        RangeSource(13, 17),
        empty,
    ]
    concat = ConcatSource(sub_sources)
    source_test_utils.assertSplitAtFractionExhaustive(concat)
def test_concat_source_split(self):
    """Split a single-source ConcatSource restriction into SourceBundles.

    Expects every split to be a SourceBundle and the bundle bounds to be
    (0, 2) and (2, 4).
    """
    unused_element = None
    concat_source = ConcatSource([self.initial_range_source])
    provider_cls = (
        iobase._SDFBoundedSourceWrapper._SDFBoundedSourceRestrictionProvider)
    concat_provider = provider_cls(concat_source, desired_chunk_size=2)
    # The initial restriction comes from the provider stored on the test
    # instance, not from the provider built above.
    restriction = self.sdf_restriction_provider.initial_restriction(
        unused_element)

    bundles = list(concat_provider.split(unused_element, restriction))
    self.assertTrue(
        all(isinstance(bundle, SourceBundle) for bundle in bundles))

    actual_bounds = []
    for bundle in bundles:
        actual_bounds.append((bundle.start_position, bundle.stop_position))
    self.assertEqual([(0, 2), (2, 4)], actual_bounds)
def test_empty_source(self):
    """Expect an empty read from empty ConcatSources and from empty sub-ranges."""
    read_all = source_test_utils.read_from_source
    empty = RangeSource(0, 0)
    # ConcatSources with no data at all.
    for sub_sources in ([], [empty], [empty, empty]):
        self.assertEqual(read_all(ConcatSource(sub_sources)), [])

    range10 = RangeSource(0, 10)
    # (sub-sources, start position, stop position) triples that should
    # all read back as empty.
    empty_windows = [
        ([range10], (0, None), (0, 0)),
        ([range10], (0, 10), (1, None)),
        ([range10, range10], (0, 10), (1, 0)),
    ]
    for sub_sources, start, stop in empty_windows:
        self.assertEqual(read_all(ConcatSource(sub_sources), start, stop), [])
def test_conact_source(self):
    """Exercise reading, fractions, claims and splits on a four-range ConcatSource.

    The claim/split assertions run against a single range tracker and are
    kept in their original order.
    """
    concat = ConcatSource([
        RangeSource(0, 4),
        RangeSource(4, 8),
        RangeSource(8, 12),
        RangeSource(12, 16),
    ])

    everything = list(concat.read(concat.get_range_tracker()))
    self.assertEqual(everything, list(range(16)))
    window = list(concat.read(concat.get_range_tracker((1, None), (2, 10))))
    self.assertEqual(window, list(range(4, 10)))

    tracker = concat.get_range_tracker(None, None)
    for fraction, expected in [(0, (0, 0)), (.5, (2, 8)), (.625, (2, 10))]:
        self.assertEqual(tracker.position_at_fraction(fraction), expected)

    # Simulate a read.
    self.assertEqual(tracker.try_claim((0, None)), True)
    self.assertEqual(tracker.sub_range_tracker(0).try_claim(2), True)
    self.assertEqual(tracker.fraction_consumed(), 0.125)

    self.assertEqual(tracker.try_claim((1, None)), True)
    self.assertEqual(tracker.sub_range_tracker(1).try_claim(6), True)
    self.assertEqual(tracker.fraction_consumed(), 0.375)
    self.assertEqual(tracker.try_split((0, 1)), None)
    self.assertEqual(tracker.try_split((1, 5)), None)

    self.assertEqual(tracker.try_split((3, 14)), ((3, None), 0.75))
    self.assertEqual(tracker.try_claim((3, None)), False)
    self.assertEqual(tracker.sub_range_tracker(1).try_claim(7), True)
    self.assertEqual(tracker.try_claim((2, None)), True)
    self.assertEqual(tracker.sub_range_tracker(2).try_claim(9), True)

    self.assertEqual(tracker.try_split((2, 8)), None)
    self.assertEqual(tracker.try_split((2, 11)), ((2, 11), 11. / 12))
    self.assertEqual(tracker.sub_range_tracker(2).try_claim(10), True)
    self.assertEqual(tracker.sub_range_tracker(2).try_claim(11), False)
def test_conact_source(self):
    """Exercise reading, fractions, claims and splits on a four-range ConcatSource.

    The claim/split assertions run against a single range tracker, so their
    order matters.
    """
    source = ConcatSource([
        RangeSource(0, 4),
        RangeSource(4, 8),
        RangeSource(8, 12),
        RangeSource(12, 16),
    ])
    # Expected values are materialized with list(): on Python 3 a list never
    # compares equal to a range object.
    self.assertEqual(
        list(source.read(source.get_range_tracker())), list(range(16)))
    self.assertEqual(
        list(source.read(source.get_range_tracker((1, None), (2, 10)))),
        list(range(4, 10)))

    range_tracker = source.get_range_tracker(None, None)
    self.assertEqual(range_tracker.position_at_fraction(0), (0, 0))
    self.assertEqual(range_tracker.position_at_fraction(.5), (2, 8))
    self.assertEqual(range_tracker.position_at_fraction(.625), (2, 10))

    # Simulate a read.
    self.assertEqual(range_tracker.try_claim((0, None)), True)
    self.assertEqual(range_tracker.sub_range_tracker(0).try_claim(2), True)
    self.assertEqual(range_tracker.fraction_consumed(), 0.125)

    self.assertEqual(range_tracker.try_claim((1, None)), True)
    self.assertEqual(range_tracker.sub_range_tracker(1).try_claim(6), True)
    self.assertEqual(range_tracker.fraction_consumed(), 0.375)
    self.assertEqual(range_tracker.try_split((0, 1)), None)
    self.assertEqual(range_tracker.try_split((1, 5)), None)

    self.assertEqual(range_tracker.try_split((3, 14)), ((3, None), 0.75))
    self.assertEqual(range_tracker.try_claim((3, None)), False)
    self.assertEqual(range_tracker.sub_range_tracker(1).try_claim(7), True)
    self.assertEqual(range_tracker.try_claim((2, None)), True)
    self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(9), True)

    self.assertEqual(range_tracker.try_split((2, 8)), None)
    self.assertEqual(range_tracker.try_split((2, 11)), ((2, 11), 11. / 12))
    self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(10), True)
    self.assertEqual(range_tracker.sub_range_tracker(2).try_claim(11), False)
def test_estimate_size(self):
    """Check the estimated size of a ConcatSource built from three DummySources.

    Each DummySource wraps a 10-value range, and the expected estimate is 30.
    """
    sources = [
        TestConcatSource.DummySource(range(start, start + 10))
        for start in [0, 10, 20]
    ]
    concat = ConcatSource(sources)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest versions no longer provide.
    self.assertEqual(30, concat.estimate_size())
def test_estimate_size(self):
    """Check that a ConcatSource over three RangeSources estimates size 1000."""
    first = RangeSource(0, 10)
    second = RangeSource(10, 100)
    third = RangeSource(100, 1000)
    concat = ConcatSource([first, second, third])
    self.assertEqual(concat.estimate_size(), 1000)