def test_split_respects_min_num_splits(self):
   range = OffsetRange(10, 100)
   splits = list(range.split(desired_num_offsets_per_split=5,
                             min_num_offsets_per_split=25))
   self.assertEqual(3, len(splits))
   self.assertIn(OffsetRange(10, 35), splits)
   self.assertIn(OffsetRange(35, 60), splits)
   self.assertIn(OffsetRange(60, 100), splits)
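As a minimal standalone sketch (assuming the OffsetRange used by these tests is apache_beam.io.restriction_trackers.OffsetRange), the splitting behaviour exercised above can be reproduced directly:

from apache_beam.io.restriction_trackers import OffsetRange

r = OffsetRange(10, 100)
# Splits are sized max(desired, min); a final remainder shorter than the
# minimum (or only a small fraction of the desired size) is folded into the
# last split instead of being emitted on its own.
for split in r.split(desired_num_offsets_per_split=5,
                     min_num_offsets_per_split=25):
  print(split.start, split.stop)  # (10, 35), (35, 60), (60, 100)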
Example #2
    def process(self, element, *args, **kwargs):
        # Expand the file pattern in 'element' and pair each matched file's
        # metadata with the offset range(s) that should be read from it.
        match_results = FileSystems.match([element])
        for metadata in match_results[0].metadata_list:
            splittable = (self._splittable
                          and _determine_splittability_from_compression_type(
                              metadata.path, self._compression_type))

            if splittable:
                for split in OffsetRange(0, metadata.size_in_bytes).split(
                        self._desired_bundle_size, self._min_bundle_size):
                    yield (metadata, split)
            else:
                yield (metadata,
                       OffsetRange(
                           0,
                           range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
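The _determine_splittability_from_compression_type helper is not shown in this example. A hedged sketch of the check it plausibly performs (the exact logic is an assumption), using CompressionTypes from apache_beam.io.filesystem:

from apache_beam.io.filesystem import CompressionTypes

def _determine_splittability_from_compression_type(file_path, compression_type):
    # Assumed logic: with AUTO, infer the compression from the file extension;
    # only uncompressed files can safely be read from an arbitrary offset.
    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(file_path)
    return compression_type == CompressionTypes.UNCOMPRESSED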
Example #3
 def test_split_respects_desired_num_splits(self):
   range = OffsetRange(10, 100)
   splits = list(range.split(desired_num_offsets_per_split=25))
   self.assertEqual(4, len(splits))
   self.assertIn(OffsetRange(10, 35), splits)
   self.assertIn(OffsetRange(35, 60), splits)
   self.assertIn(OffsetRange(60, 85), splits)
   self.assertIn(OffsetRange(85, 100), splits)
Example #4
 def test_split_no_small_split_at_end(self):
   range = OffsetRange(10, 90)
   splits = list(range.split(desired_num_offsets_per_split=25))
   self.assertEqual(3, len(splits))
   self.assertIn(OffsetRange(10, 35), splits)
   self.assertIn(OffsetRange(35, 60), splits)
   self.assertIn(OffsetRange(60, 90), splits)
Example #5
    def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
        if start_offset is None:
            start_offset = self._start_offset
        if stop_offset is None:
            stop_offset = self._stop_offset

        if self._splittable:
            splits = OffsetRange(start_offset,
                                 stop_offset).split(desired_bundle_size,
                                                    self._min_bundle_size)
            for split in splits:
                yield iobase.SourceBundle(
                    split.stop - split.start,
                    _SingleFileSource(
                        # Copying this so that each sub-source gets a fresh instance.
                        pickler.loads(pickler.dumps(self._file_based_source)),
                        self._file_name,
                        split.start,
                        split.stop,
                        min_bundle_size=self._min_bundle_size,
                        splittable=self._splittable),
                    split.start,
                    split.stop)
        else:
            # Return a single sub-source with the end offset set to
            # OFFSET_INFINITY (so that all data of the source gets read), since
            # this source is unsplittable. Using the file size as the end
            # offset would be wrong for certain unsplittable sources, e.g.
            # compressed sources.
            yield iobase.SourceBundle(
                stop_offset - start_offset,
                _SingleFileSource(
                    self._file_based_source,
                    self._file_name,
                    start_offset,
                    range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
                    min_bundle_size=self._min_bundle_size,
                    splittable=self._splittable), start_offset,
                range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
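A hedged usage sketch of consuming the bundles yielded by split; 'single_file_source' is a placeholder for an already-constructed _SingleFileSource, and SourceBundle is the namedtuple defined in apache_beam.io.iobase:

# Hypothetical driver code: 'single_file_source' is assumed to exist already.
bundles = list(single_file_source.split(desired_bundle_size=64 * 1024 * 1024))
for bundle in bundles:
    # Each SourceBundle carries a weight plus the sub-source and its offsets.
    print(bundle.weight, bundle.start_position, bundle.stop_position)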
Example #6
  def test_create(self):
    OffsetRange(0, 10)
    OffsetRange(10, 100)

    with self.assertRaises(ValueError):
      # A range whose stop offset is smaller than its start is rejected.
      OffsetRange(10, 9)