Example #1
0
    def test_split_points(self):
        """Check RangeTracker.split_points() progression while reading Avro.

        Writes 12000 records (three Avro blocks), reads the single split,
        and records split_points() after every record.
        """
        file_name = self._write_data(count=12000)
        source = AvroSource(file_name)

        splits = [
            split for split in source.split(desired_bundle_size=float('inf'))
        ]
        assert len(splits) == 1

        range_tracker = splits[0].source.get_range_tracker(
            splits[0].start_position, splits[0].stop_position)

        split_points_report = []

        for _ in splits[0].source.read(range_tracker):
            split_points_report.append(range_tracker.split_points())

        # There are a total of three blocks. Each block has more than 10 records.

        # When reading records of the first block, range_tracker.split_points()
        # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
        # NOTE: assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual instead.
        self.assertEqual(split_points_report[:10],
                         [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10)

        # When reading records of last block, range_tracker.split_points() should
        # return (2, 1)
        self.assertEqual(split_points_report[-10:], [(2, 1)] * 10)
Example #2
0
  def test_split_points(self):
    """Check RangeTracker.split_points() progression while reading Avro.

    Writes 12000 records (three Avro blocks), reads the single split,
    and records split_points() after every record.
    """
    file_name = self._write_data(count=12000)
    source = AvroSource(file_name)

    splits = [
        split
        for split in source.split(desired_bundle_size=float('inf'))
    ]
    assert len(splits) == 1

    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)

    split_points_report = []

    for _ in splits[0].source.read(range_tracker):
      split_points_report.append(range_tracker.split_points())

    # There are a total of three blocks. Each block has more than 10 records.

    # When reading records of the first block, range_tracker.split_points()
    # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
    # NOTE: assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual instead.
    self.assertEqual(
        split_points_report[:10],
        [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10)

    # When reading records of last block, range_tracker.split_points() should
    # return (2, 1)
    self.assertEqual(split_points_report[-10:], [(2, 1)] * 10)
Example #3
0
 def test_read_reantrant_with_splitting(self):
     """Reentrant reads over a split Avro source must succeed.

     NOTE: 'reantrant' is a typo for 'reentrant' in the method name; it is
     kept unchanged so test discovery/selection by name is unaffected.
     """
     data_path = self._write_data()
     avro_source = AvroSource(data_path)
     bundle_splits = list(avro_source.split(desired_bundle_size=100000))
     assert len(bundle_splits) == 1
     only_split = bundle_splits[0]
     source_test_utils.assertReentrantReadsSucceed(
         (only_split.source, only_split.start_position,
          only_split.stop_position))
Example #4
0
 def test_read_reantrant_with_splitting(self):
   """Reentrant reads over a split Avro source must succeed.

   NOTE: 'reantrant' is a typo for 'reentrant' in the method name; it is
   kept unchanged so test discovery/selection by name is unaffected.
   """
   data_path = self._write_data()
   avro_source = AvroSource(data_path)
   all_splits = list(avro_source.split(desired_bundle_size=100000))
   assert len(all_splits) == 1
   first = all_splits[0]
   source_test_utils.assert_reentrant_reads_succeed(
       (first.source, first.start_position, first.stop_position))
Example #5
0
 def test_dynamic_work_rebalancing_exhaustive(self):
   """Exhaustively verify split-at-fraction on a small Avro file.

   The Avro sync interval is shrunk so that the exhaustive dynamic work
   rebalancing test completes in an acceptable amount of time; the
   original value is always restored afterwards.
   """
   saved_interval = avro.datafile.SYNC_INTERVAL
   try:
     avro.datafile.SYNC_INTERVAL = 2
     data_path = self._write_data(count=5)
     avro_source = AvroSource(data_path)
     all_splits = list(
         avro_source.split(desired_bundle_size=float('inf')))
     assert len(all_splits) == 1
     source_test_utils.assert_split_at_fraction_exhaustive(
         all_splits[0].source)
   finally:
     avro.datafile.SYNC_INTERVAL = saved_interval
Example #6
0
 def test_dynamic_work_rebalancing_exhaustive(self):
   """Exhaustively verify split-at-fraction on a small Avro file.

   A tiny sync interval keeps the block count low enough for the
   exhaustive rebalancing check to finish quickly; the previous
   interval is restored in a finally block.
   """
   previous_interval = avro.datafile.SYNC_INTERVAL
   try:
     avro.datafile.SYNC_INTERVAL = 2
     path = self._write_data(count=5)
     src = AvroSource(path)
     generated_splits = [s for s in src.split(desired_bundle_size=float('inf'))]
     assert len(generated_splits) == 1
     source_test_utils.assert_split_at_fraction_exhaustive(
         generated_splits[0].source)
   finally:
     avro.datafile.SYNC_INTERVAL = previous_interval
Example #7
0
    def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                       expected_result):
        """Read `pattern` via AvroSource and compare against `expected_result`.

        Args:
          pattern: file pattern handed to AvroSource.
          desired_bundle_size: bundle size used when splitting; must be
            truthy when perform_splitting is set.
          perform_splitting: if True, split the source and verify the splits
            collectively match a reference read of the unsplit source;
            otherwise read directly and compare to expected_result.
          expected_result: records expected from a direct (unsplit) read.
        """
        source = AvroSource(pattern)

        read_records = []
        if perform_splitting:
            assert desired_bundle_size
            splits = [
                split for split in source.split(
                    desired_bundle_size=desired_bundle_size)
            ]
            if len(splits) < 2:
                raise ValueError(
                    'Test is trivial. Please adjust it so that at least '
                    'two splits get generated')

            sources_info = [(split.source, split.start_position,
                             split.stop_position) for split in splits]
            source_test_utils.assertSourcesEqualReferenceSource(
                (source, None, None), sources_info)
        else:
            read_records = source_test_utils.readFromSource(source, None, None)
            # assertItemsEqual existed only in Python 2 unittest; the
            # Python 3 equivalent (order-insensitive comparison) is
            # assertCountEqual.
            self.assertCountEqual(expected_result, read_records)
Example #8
0
  def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                     expected_result):
    """Read `pattern` via AvroSource and compare against `expected_result`.

    Args:
      pattern: file pattern handed to AvroSource.
      desired_bundle_size: bundle size used when splitting; must be truthy
        when perform_splitting is set.
      perform_splitting: if True, split the source and verify the splits
        collectively match a reference read of the unsplit source;
        otherwise read directly and compare to expected_result.
      expected_result: records expected from a direct (unsplit) read.
    """
    source = AvroSource(pattern)

    read_records = []
    if perform_splitting:
      assert desired_bundle_size
      splits = [
          split
          for split in source.split(desired_bundle_size=desired_bundle_size)
      ]
      if len(splits) < 2:
        raise ValueError('Test is trivial. Please adjust it so that at least '
                         'two splits get generated')

      sources_info = [
          (split.source, split.start_position, split.stop_position)
          for split in splits
      ]
      source_test_utils.assert_sources_equal_reference_source(
          (source, None, None), sources_info)
    else:
      read_records = source_test_utils.read_from_source(source, None, None)
      # assertItemsEqual existed only in Python 2 unittest; the Python 3
      # equivalent (order-insensitive comparison) is assertCountEqual.
      self.assertCountEqual(expected_result, read_records)