def test_randomness(self):
    """Shuffling 100 numbers with capacity 16 reorders but keeps them all."""
    ordered = list(range(100))
    shuffled = self.shuffle_numbers(RandomShuffler(16), ordered)

    # Nothing dropped or duplicated in terms of count.
    self.assertEqual(100, len(shuffled))
    # The output order differs from the input order.
    self.assertNotEqual(ordered, shuffled)
    # Sorting the output recovers exactly the input values.
    self.assertEqual(ordered, sorted(shuffled))
def test_randomness():
    """Shuffling 100 numbers with capacity 16 reorders but keeps them all."""
    ordered = list(range(100))
    shuffled = shuffle_numbers(RandomShuffler(16), ordered)

    # Same number of elements comes out as went in.
    assert len(shuffled) == 100
    # The output order differs from the input order.
    assert ordered != shuffled
    # Sorting the output recovers exactly the input values.
    assert ordered == sorted(shuffled)
def __iter__(self):
    """Yield converted rows from the row groups that belong to this rank.

    Row groups are counted across every file in ``self.files``; a group is
    read only when ``count % self.world_size == self.rank``. Each row is
    pushed into a ``RandomShuffler`` (capacity 1 when ``self.shuffle`` is
    falsy) and popped rows are passed through ``self._convert`` together
    with ``self.spark_row_metadata``.
    """
    capacity = self.shuffler_capacity if self.shuffle else 1
    shuffler = RandomShuffler(capacity, self.seed)

    def pop_converted():
        # Take one row out of the shuffler and convert it for the caller.
        return self._convert(shuffler.pop().to_dict(), self.spark_row_metadata)

    groups_seen = 0
    for uri in self.files:
        fs, path = FileSystem.from_uri(uri)
        with fs.open_input_file(path) as fobj:
            parquet = pg.ParquetFile(fobj)
            for group_idx in range(parquet.num_row_groups):
                # Row-group level bucketing by a running counter modulo the
                # world size.
                # Upsides:
                #  - no communication is needed to set up the distributed
                #    policy;
                #  - little memory and no startup cost, since row groups
                #    are not collected up front.
                # Downside:
                #  - when the world size is much larger than the average
                #    number of row groups, many file opens are wasted.
                groups_seen += 1
                mine = groups_seen % self.world_size == self.rank
                if not mine:
                    continue
                table = parquet.read_row_group(group_idx, columns=self.columns)
                for batch in table.to_batches():  # type: RecordBatch
                    # TODO: read batches not using pandas
                    for _, row in batch.to_pandas().iterrows():
                        shuffler.append(row)
                        # Keep the buffer from growing past its capacity.
                        while shuffler.full():
                            yield pop_converted()
    # Drain whatever is still buffered once every file has been visited.
    while shuffler:
        yield pop_converted()
def test_randomness_with_large_capacity(self):
    """Capacity (128) exceeds the number of elements (100) being shuffled."""
    ordered = list(range(100))
    shuffled = self.shuffle_numbers(RandomShuffler(128), ordered)

    # Same number of elements comes out as went in.
    self.assertEqual(100, len(shuffled))
    # The output order differs from the input order.
    self.assertNotEqual(ordered, shuffled)
    # Sorting the output recovers exactly the input values.
    self.assertEqual(ordered, sorted(shuffled))
def test_fifo_with_single_item():
    """A capacity-1 shuffler is full after one append and returns that item."""
    single = RandomShuffler(capacity=1)
    single.append(1)

    # After one append the shuffler is truthy, full, and of length one.
    assert single
    assert single.full()
    assert len(single) == 1

    # Popping hands back the only item and leaves the shuffler not full.
    assert single.pop() == 1
    assert not single.full()
def test_randomness_with_large_capacity():
    """Capacity (128) exceeds the number of elements (100) being shuffled."""
    ordered = list(range(100))
    shuffled = shuffle_numbers(RandomShuffler(128), ordered)

    # Same number of elements comes out as went in.
    assert len(shuffled) == 100
    # The output order differs from the input order.
    assert ordered != shuffled
    # Sorting the output recovers exactly the input values.
    assert ordered == sorted(shuffled)
def test_fifo(self):
    """With capacity 1 the numbers come back in the order they went in."""
    fifo = RandomShuffler(capacity=1)
    output = self.shuffle_numbers(fifo, range(100))

    in_order = list(range(100))
    self.assertEqual(in_order, output)
def test_fifo():
    """With capacity 1 the numbers must come back in insertion order."""
    shuffler = RandomShuffler(capacity=1)
    returned = shuffle_numbers(shuffler, range(100))

    assert len(returned) == 100
    # A length check alone would also pass for a shuffled result; the FIFO
    # property is about order, so compare against the input sequence.
    assert returned == list(range(100))