def assert_arrow_memory_increases():
    import gc

    gc.collect()
    previous_allocated_memory = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - previous_allocated_memory > 0, "Arrow memory didn't increase."
def test_export_import_type():
    c_schema = ffi.new("struct ArrowSchema*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    typ = pa.list_(pa.int32())
    typ._export_to_c(ptr_schema)
    assert pa.total_allocated_bytes() > old_allocated
    # Delete and recreate C++ object from exported pointer
    del typ
    assert pa.total_allocated_bytes() > old_allocated
    typ_new = pa.DataType._import_from_c(ptr_schema)
    assert typ_new == pa.list_(pa.int32())
    assert pa.total_allocated_bytes() == old_allocated
    # Now released
    with assert_schema_released:
        pa.DataType._import_from_c(ptr_schema)

    # Invalid format string
    pa.int32()._export_to_c(ptr_schema)
    bad_format = ffi.new("char[]", b"zzz")
    c_schema.format = bad_format
    with pytest.raises(ValueError, match="Invalid or unsupported format string"):
        pa.DataType._import_from_c(ptr_schema)
    # Now released
    with assert_schema_released:
        pa.DataType._import_from_c(ptr_schema)
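# The C data interface tests in this collection rely on module-level helpers
# from pyarrow's test suite (ffi comes from pyarrow.cffi) that are not shown
# here. A minimal sketch of plausible definitions follows; the exact match
# strings are an assumption, not quoted from the original module.
import pytest

assert_schema_released = pytest.raises(
    ValueError, match="Cannot import released ArrowSchema")

assert_array_released = pytest.raises(
    ValueError, match="Cannot import released ArrowArray")

assert_stream_released = pytest.raises(
    ValueError, match="Cannot import released ArrowArrayStream")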
def assert_arrow_memory_doesnt_increase():
    import gc

    gc.collect()
    previous_allocated_memory = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - previous_allocated_memory <= 0, "Arrow memory wasn't expected to increase."
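# A minimal usage sketch for the two yield-based assertions above
# (assert_arrow_memory_increases / assert_arrow_memory_doesnt_increase),
# assuming they are wrapped with contextlib.contextmanager as yield-based
# helpers usually are; the test body is illustrative, not from the original suite.
import pyarrow as pa


def test_in_memory_table_allocates_arrow_memory():
    with assert_arrow_memory_increases():
        # Converting a Python list allocates buffers from Arrow's default pool,
        # and `table` stays alive past the end of the block, so the delta is > 0.
        table = pa.table({"x": list(range(100_000))})  # noqa: F841

    del table

    with assert_arrow_memory_doesnt_increase():
        # Pure Python work does not touch the Arrow memory pool.
        _ = sum(range(100_000))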
def check_export_import_schema(schema_factory):
    c_schema = ffi.new("struct ArrowSchema*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    schema_factory()._export_to_c(ptr_schema)
    assert pa.total_allocated_bytes() > old_allocated
    # Delete and recreate C++ object from exported pointer
    schema_new = pa.Schema._import_from_c(ptr_schema)
    assert schema_new == schema_factory()
    assert pa.total_allocated_bytes() == old_allocated
    del schema_new
    assert pa.total_allocated_bytes() == old_allocated
    # Now released
    with assert_schema_released:
        pa.Schema._import_from_c(ptr_schema)

    # Not a struct type
    pa.int32()._export_to_c(ptr_schema)
    with pytest.raises(ValueError, match="ArrowSchema describes non-struct type"):
        pa.Schema._import_from_c(ptr_schema)
    # Now released
    with assert_schema_released:
        pa.Schema._import_from_c(ptr_schema)
def no_pyarrow_leak():
    # No leak of C++ memory
    old_allocation = pa.total_allocated_bytes()
    try:
        yield
    finally:
        assert pa.total_allocated_bytes() == old_allocation
def test_cast_kernel_on_extension_arrays():
    # test array casting
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(IntegerType(), storage)

    # test that no allocation happens during identity cast
    allocated_before_cast = pa.total_allocated_bytes()
    casted = arr.cast(pa.int64())
    assert pa.total_allocated_bytes() == allocated_before_cast

    cases = [
        (pa.int64(), pa.Int64Array),
        (pa.int32(), pa.Int32Array),
        (pa.int16(), pa.Int16Array),
        (pa.uint64(), pa.UInt64Array),
        (pa.uint32(), pa.UInt32Array),
        (pa.uint16(), pa.UInt16Array),
    ]
    for typ, klass in cases:
        casted = arr.cast(typ)
        assert casted.type == typ
        assert isinstance(casted, klass)

    # test chunked array casting
    arr = pa.chunked_array([arr, arr])
    casted = arr.cast(pa.int16())
    assert casted.type == pa.int16()
    assert isinstance(casted, pa.ChunkedArray)
def test_pandas_self_destruct(self):
    import pyarrow as pa

    rows = 2 ** 10
    cols = 4
    expected_bytes = rows * cols * 8
    df = self.spark.range(0, rows).select(*[rand() for _ in range(cols)])
    # Test the self_destruct behavior by testing _collect_as_arrow directly
    allocation_before = pa.total_allocated_bytes()
    batches = df._collect_as_arrow(split_batches=True)
    table = pa.Table.from_batches(batches)
    del batches
    pdf_split = table.to_pandas(self_destruct=True, split_blocks=True, use_threads=False)
    allocation_after = pa.total_allocated_bytes()
    difference = allocation_after - allocation_before
    # Should be around 1x the data size (table should not hold on to any memory)
    self.assertGreaterEqual(difference, 0.9 * expected_bytes)
    self.assertLessEqual(difference, 1.1 * expected_bytes)
    with self.sql_conf({"spark.sql.execution.arrow.pyspark.selfDestruct.enabled": False}):
        no_self_destruct_pdf = df.toPandas()
        # Note while memory usage is 2x data size here (both table and pdf hold on to
        # memory), in this case Arrow still only tracks 1x worth of memory (since the
        # batches are not allocated by Arrow in this case), so we can't make any
        # assertions here
    with self.sql_conf({"spark.sql.execution.arrow.pyspark.selfDestruct.enabled": True}):
        self_destruct_pdf = df.toPandas()
    assert_frame_equal(pdf_split, no_self_destruct_pdf)
    assert_frame_equal(pdf_split, self_destruct_pdf)
def tearDown(self):
    self.bridge.close()
    gc.collect()
    diff_python = pa.total_allocated_bytes() - self.old_allocated_python
    self.assertEqual(
        pa.total_allocated_bytes(), self.old_allocated_python,
        f"PyArrow memory was not adequately released: {diff_python} bytes lost"
    )
def assert_pyarrow_memory_released(self):
    self.run_gc()
    old_allocated = pa.total_allocated_bytes()
    yield
    self.run_gc()
    diff = pa.total_allocated_bytes() - old_allocated
    self.assertEqual(
        pa.total_allocated_bytes(), old_allocated,
        f"PyArrow memory was not adequately released: {diff} bytes lost")
def test_read_table(in_memory, dataset, arrow_file):
    filename = arrow_file
    previous_allocated_memory = pa.total_allocated_bytes()
    table = ArrowReader.read_table(filename, in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
    assert increased_allocated_memory == in_memory
def test_garbage_collection(self):
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()

    bytes_before = pyarrow.total_allocated_bytes()
    pyarrow.from_pylist([1, None, 3, None])
    gc.collect()
    assert pyarrow.total_allocated_bytes() == bytes_before
def test_garbage_collection():
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()

    bytes_before = pa.total_allocated_bytes()
    pa.array([1, None, 3, None])
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before
def test_read_files(in_memory, dataset, arrow_file):
    filename = arrow_file
    reader = ArrowReader("", None)
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset_kwargs = reader.read_files([{"filename": filename}], in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert dataset_kwargs.keys() == set(["arrow_table", "data_files", "info", "split"])
    table = dataset_kwargs["arrow_table"]
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
    assert increased_allocated_memory == in_memory
def test_string_python(self):
    """
    Python -> Rust -> Python
    """
    old_allocated = pyarrow.total_allocated_bytes()
    a = pyarrow.array(["a", None, "ccc"])
    b = arrow_pyarrow_integration_testing.substring(a, 1)
    self.assertEqual(b, pyarrow.array(["", None, "cc"]))
    del a
    del b
    # No leak of C++ memory
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def test_primitive_python(self):
    """
    Python -> Rust -> Python
    """
    old_allocated = pyarrow.total_allocated_bytes()
    a = pyarrow.array([1, 2, 3])
    b = arrow_pyarrow_integration_testing.double(a)
    self.assertEqual(b, pyarrow.array([2, 4, 6]))
    del a
    del b
    # No leak of C++ memory
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def test_different_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    # it works
    array = pa.array([1, None, 3, None],  # noqa
                     memory_pool=pa.jemalloc_memory_pool())
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before_default
    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
def test_import_primitive(self):
    """
    Python -> Rust
    """
    old_allocated = pyarrow.total_allocated_bytes()
    a = pyarrow.array([2, None, 6])
    is_correct = arrow_pyarrow_integration_testing.import_primitive(a)
    self.assertTrue(is_correct)
    # No leak of C++ memory
    del a
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def test_export_primitive(self):
    """
    Python -> Rust
    """
    old_allocated = pyarrow.total_allocated_bytes()
    expected = pyarrow.array([2, None, 6])
    result = arrow_pyarrow_integration_testing.export_primitive()
    self.assertEqual(expected, result)
    # No leak of C++ memory
    del expected
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def test_different_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    # it works
    array = pa.array(
        [1, None, 3, None],  # noqa
        memory_pool=pa.jemalloc_memory_pool())
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before_default
    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
def test_time32_python(self):
    """
    Python -> Rust -> Python
    """
    old_allocated = pyarrow.total_allocated_bytes()
    a = pyarrow.array([None, 1, 2], pyarrow.time32('s'))
    b = arrow_pyarrow_integration_testing.concatenate(a)
    expected = pyarrow.array([None, 1, 2] + [None, 1, 2], pyarrow.time32('s'))
    self.assertEqual(b, expected)
    del a
    del b
    del expected
    # No leak of C++ memory
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def test_primitive_rust(self):
    """
    Rust -> Python -> Rust
    """
    old_allocated = pyarrow.total_allocated_bytes()

    def double(array):
        array = array.to_pylist()
        return pyarrow.array([x * 2 if x is not None else None for x in array])

    is_correct = arrow_pyarrow_integration_testing.double_py(double)
    self.assertTrue(is_correct)
    # No leak of C++ memory
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def assert_pyarrow_memory_released(self):
    self.run_gc()
    old_allocated = pa.total_allocated_bytes()
    old_go_allocated = cgotest.totalAllocated()
    yield
    self.run_gc()
    diff = pa.total_allocated_bytes() - old_allocated
    godiff = cgotest.totalAllocated() - old_go_allocated
    self.assertEqual(
        pa.total_allocated_bytes(), old_allocated,
        f"PyArrow memory was not adequately released: {diff} bytes lost")
    self.assertEqual(
        cgotest.totalAllocated(), old_go_allocated,
        f"Go memory was not properly released: {godiff} bytes lost")
def test_generator_based_builder_as_dataset(in_memory, tmp_path):
    cache_dir = tmp_path / "data"
    cache_dir.mkdir()
    cache_dir = str(cache_dir)
    dummy_builder = DummyGeneratorBasedBuilder(cache_dir=cache_dir, name="dummy")
    dummy_builder.download_and_prepare(try_from_hf_gcs=False, download_mode=FORCE_REDOWNLOAD)
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset = dummy_builder.as_dataset("train", in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert dataset.data.to_pydict() == {"text": ["foo"] * 100}
    assert increased_allocated_memory == in_memory
def test_export_import_batch_reader(reader_factory):
    c_stream = ffi.new("struct ArrowArrayStream*")
    ptr_stream = int(ffi.cast("uintptr_t", c_stream))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    _export_import_batch_reader(ptr_stream, reader_factory)

    assert pa.total_allocated_bytes() == old_allocated

    # Now released
    with assert_stream_released:
        pa.RecordBatchReader._import_from_c(ptr_stream)
def test_list_array(self):
    """
    Python -> Rust -> Python
    """
    old_allocated = pyarrow.total_allocated_bytes()
    a = pyarrow.array([[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64()))
    b = arrow_pyarrow_integration_testing.round_trip(a)

    b.validate(full=True)
    assert a.to_pylist() == b.to_pylist()
    assert a.type == b.type
    del a
    del b
    # No leak of C++ memory
    self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def rebook(self):
    """Rebook histograms for timers or profiles.

    Typically called after random sampling of a data chunk.
    """
    self.__logger.info("Rebook")

    self.gate.hbook.rebook()  # Resets all histograms!

    self.__logger.info("artemis: allocated before reset %i", pa.total_allocated_bytes())
    self.datahandler.reset()
    self.__logger.info("artemis: allocated after reset %i", pa.total_allocated_bytes())

    # Reset all meta data needed for processing all job info
    self.reset_job_summary()
def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory, caplog):
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, keep_in_memory=keep_in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert len(dataset) == 2
    assert increased_allocated_memory == keep_in_memory
    for offline_simulation_mode in list(OfflineSimulationMode):
        with offline(offline_simulation_mode):
            caplog.clear()
            # Load dataset from cache
            dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir)
            assert len(dataset) == 2
            assert "Using the latest cached version of the module" in caplog.text
    with pytest.raises(FileNotFoundError) as exc_info:
        datasets.load_dataset("_dummy")
    assert "at " + os.path.join("_dummy", "_dummy.py") in str(exc_info.value)
def test_default_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    old_memory_pool = pa.default_memory_pool()
    pa.set_memory_pool(pa.jemalloc_memory_pool())

    array = pa.array([1, None, 3, None])  # noqa

    pa.set_memory_pool(old_memory_pool)
    gc.collect()

    assert pa.total_allocated_bytes() == bytes_before_default
    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
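# A minimal sketch, not taken from the test files above: jemalloc is an optional
# component of Arrow builds, so the jemalloc pool tests are normally guarded by
# a skip condition along these lines. The HAVE_JEMALLOC / jemalloc_only names
# are illustrative, not from the original suite.
import pytest
import pyarrow as pa

try:
    pa.jemalloc_memory_pool()
    HAVE_JEMALLOC = True
except NotImplementedError:
    HAVE_JEMALLOC = False

jemalloc_only = pytest.mark.skipif(
    not HAVE_JEMALLOC, reason="jemalloc memory pool not available in this Arrow build")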
def test_batch_lifetime(self):
    gc.collect()
    old_allocated = pa.total_allocated_bytes()

    # Memory occupation should not grow with CSV file size
    def check_one_batch(reader, expected):
        batch = reader.read_next_batch()
        assert batch.to_pydict() == expected

    rows = b"10,11\n12,13\n14,15\n16,17\n"
    read_options = ReadOptions()
    read_options.column_names = ['a', 'b']
    read_options.block_size = 6
    reader = self.open_bytes(rows, read_options=read_options)
    check_one_batch(reader, {'a': [10], 'b': [11]})
    allocated_after_first_batch = pa.total_allocated_bytes()
    check_one_batch(reader, {'a': [12], 'b': [13]})
    assert pa.total_allocated_bytes() == allocated_after_first_batch
    check_one_batch(reader, {'a': [14], 'b': [15]})
    assert pa.total_allocated_bytes() == allocated_after_first_batch
    check_one_batch(reader, {'a': [16], 'b': [17]})
    assert pa.total_allocated_bytes() == allocated_after_first_batch
    with pytest.raises(StopIteration):
        reader.read_next_batch()
    assert pa.total_allocated_bytes() == old_allocated
    reader = None
    assert pa.total_allocated_bytes() == old_allocated
def test_export_import_field():
    c_schema = ffi.new("struct ArrowSchema*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    field = pa.field("test", pa.list_(pa.int32()), nullable=True)
    field._export_to_c(ptr_schema)
    assert pa.total_allocated_bytes() > old_allocated
    # Delete and recreate C++ object from exported pointer
    del field
    assert pa.total_allocated_bytes() > old_allocated

    field_new = pa.Field._import_from_c(ptr_schema)
    assert field_new == pa.field("test", pa.list_(pa.int32()), nullable=True)
    assert pa.total_allocated_bytes() == old_allocated

    # Now released
    with assert_schema_released:
        pa.Field._import_from_c(ptr_schema)
def read(self):
    start_time = arrow.utcnow()
    if self.logger:
        self.logger.debug('Reading %s', self.filename)
    table = pq.read_table(self.filename)
    read_time = (arrow.utcnow() - start_time).total_seconds()
    n_bytes = pa.total_allocated_bytes()
    if self.logger:
        self.logger.debug('Reading %s sequences (%sB) took %.1f s (%sB/s).',
                          engr_notation(len(table), powers_of_2=False),
                          engr_notation(n_bytes, digits=2),
                          read_time,
                          engr_notation(n_bytes / read_time, digits=2))
    return table
def test_total_bytes_allocated():
    assert pyarrow.total_allocated_bytes() == 0
def test_default_allocated_bytes():
    pool = pa.default_memory_pool()
    with allocate_bytes(pool, 1024):
        check_allocated_bytes(pool)
        assert pool.bytes_allocated() == pa.total_allocated_bytes()
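# The allocate_bytes and check_allocated_bytes helpers used by the test above
# come from pyarrow's own test module and are not shown here. A hedged sketch
# of what they plausibly do, built only on public APIs (pa.allocate_buffer);
# the names and exact behavior are assumptions, not the original definitions.
import contextlib

import pyarrow as pa


@contextlib.contextmanager
def allocate_bytes(pool, nbytes):
    """Temporarily allocate nbytes from the given pool, releasing them on exit."""
    buf = pa.allocate_buffer(nbytes, memory_pool=pool)
    try:
        yield
    finally:
        buf = None  # drop the only reference so the pool's count falls back


def check_allocated_bytes(pool):
    """While allocate_bytes() is active, the pool should report a live allocation."""
    assert pool.bytes_allocated() > 0
    assert pool.max_memory() >= pool.bytes_allocated()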