Example No. 1
from contextlib import contextmanager

import pyarrow as pa


@contextmanager
def assert_arrow_memory_increases():
    import gc

    gc.collect()
    previous_allocated_memory = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - previous_allocated_memory > 0, "Arrow memory didn't increase."
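A minimal usage sketch (the wrapped statement is illustrative; any code that allocates from the Arrow pool and keeps the result alive works):

with assert_arrow_memory_increases():
    table = pa.table({"x": [1, 2, 3]})  # allocates Arrow buffers that outlive the block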
Example No. 2
def test_export_import_type():
    c_schema = ffi.new("struct ArrowSchema*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    typ = pa.list_(pa.int32())
    typ._export_to_c(ptr_schema)
    assert pa.total_allocated_bytes() > old_allocated
    # Delete and recreate C++ object from exported pointer
    del typ
    assert pa.total_allocated_bytes() > old_allocated
    typ_new = pa.DataType._import_from_c(ptr_schema)
    assert typ_new == pa.list_(pa.int32())
    assert pa.total_allocated_bytes() == old_allocated
    # Now released
    with assert_schema_released:
        pa.DataType._import_from_c(ptr_schema)

    # Invalid format string
    pa.int32()._export_to_c(ptr_schema)
    bad_format = ffi.new("char[]", b"zzz")
    c_schema.format = bad_format
    with pytest.raises(ValueError,
                       match="Invalid or unsupported format string"):
        pa.DataType._import_from_c(ptr_schema)
    # Now released
    with assert_schema_released:
        pa.DataType._import_from_c(ptr_schema)
Example No. 3
from contextlib import contextmanager

@contextmanager
def assert_arrow_memory_doesnt_increase():
    import gc

    gc.collect()
    previous_allocated_memory = pa.total_allocated_bytes()
    yield
    assert pa.total_allocated_bytes() - previous_allocated_memory <= 0, "Arrow memory wasn't expected to increase."
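A complementary usage sketch, relying on the fact that slicing an Arrow array is zero-copy and so allocates no new buffers:

arr = pa.array([1, 2, 3, 4])
with assert_arrow_memory_doesnt_increase():
    view = arr.slice(1, 2)  # zero-copy view over the existing buffers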
Example No. 4
def check_export_import_schema(schema_factory):
    c_schema = ffi.new("struct ArrowSchema*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    schema_factory()._export_to_c(ptr_schema)
    assert pa.total_allocated_bytes() > old_allocated
    # Delete and recreate C++ object from exported pointer
    schema_new = pa.Schema._import_from_c(ptr_schema)
    assert schema_new == schema_factory()
    assert pa.total_allocated_bytes() == old_allocated
    del schema_new
    assert pa.total_allocated_bytes() == old_allocated
    # Now released
    with assert_schema_released:
        pa.Schema._import_from_c(ptr_schema)

    # Not a struct type
    pa.int32()._export_to_c(ptr_schema)
    with pytest.raises(ValueError,
                       match="ArrowSchema describes non-struct type"):
        pa.Schema._import_from_c(ptr_schema)
    # Now released
    with assert_schema_released:
        pa.Schema._import_from_c(ptr_schema)
Example No. 5
@contextmanager  # assumed decoration: the bare yield implies context-manager use
def no_pyarrow_leak():
    # No leak of C++ memory
    old_allocation = pa.total_allocated_bytes()
    try:
        yield
    finally:
        assert pa.total_allocated_bytes() == old_allocation
Example No. 6
def test_cast_kernel_on_extension_arrays():
    # test array casting
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(IntegerType(), storage)

    # test that no allocation happens during identity cast
    allocated_before_cast = pa.total_allocated_bytes()
    casted = arr.cast(pa.int64())
    assert pa.total_allocated_bytes() == allocated_before_cast

    cases = [
        (pa.int64(), pa.Int64Array),
        (pa.int32(), pa.Int32Array),
        (pa.int16(), pa.Int16Array),
        (pa.uint64(), pa.UInt64Array),
        (pa.uint32(), pa.UInt32Array),
        (pa.uint16(), pa.UInt16Array)
    ]
    for typ, klass in cases:
        casted = arr.cast(typ)
        assert casted.type == typ
        assert isinstance(casted, klass)

    # test chunked array casting
    arr = pa.chunked_array([arr, arr])
    casted = arr.cast(pa.int16())
    assert casted.type == pa.int16()
    assert isinstance(casted, pa.ChunkedArray)
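IntegerType is not defined in this excerpt; a plausible minimal definition, sketched against the pyarrow ExtensionType API (the extension name is an assumption):

class IntegerType(pa.ExtensionType):
    def __init__(self):
        super().__init__(pa.int64(), "example.integer")

    def __arrow_ext_serialize__(self):
        return b""  # no parameters to serialize

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()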
Example No. 7
    def test_pandas_self_destruct(self):
        import pyarrow as pa
        rows = 2 ** 10
        cols = 4
        expected_bytes = rows * cols * 8
        df = self.spark.range(0, rows).select(*[rand() for _ in range(cols)])
        # Test the self_destruct behavior by testing _collect_as_arrow directly
        allocation_before = pa.total_allocated_bytes()
        batches = df._collect_as_arrow(split_batches=True)
        table = pa.Table.from_batches(batches)
        del batches
        pdf_split = table.to_pandas(self_destruct=True, split_blocks=True, use_threads=False)
        allocation_after = pa.total_allocated_bytes()
        difference = allocation_after - allocation_before
        # Should be around 1x the data size (table should not hold on to any memory)
        self.assertGreaterEqual(difference, 0.9 * expected_bytes)
        self.assertLessEqual(difference, 1.1 * expected_bytes)

        with self.sql_conf({"spark.sql.execution.arrow.pyspark.selfDestruct.enabled": False}):
            no_self_destruct_pdf = df.toPandas()
            # Note while memory usage is 2x data size here (both table and pdf hold on to
            # memory), in this case Arrow still only tracks 1x worth of memory (since the
            # batches are not allocated by Arrow in this case), so we can't make any
            # assertions here

        with self.sql_conf({"spark.sql.execution.arrow.pyspark.selfDestruct.enabled": True}):
            self_destruct_pdf = df.toPandas()

        assert_frame_equal(pdf_split, no_self_destruct_pdf)
        assert_frame_equal(pdf_split, self_destruct_pdf)
Example No. 8
    def tearDown(self):
        self.bridge.close()
        gc.collect()
        diff_python = pa.total_allocated_bytes() - self.old_allocated_python
        self.assertEqual(
            pa.total_allocated_bytes(), self.old_allocated_python,
            f"PyArrow memory was not adequately released: {diff_python} bytes lost"
        )
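The baseline self.old_allocated_python has to be captured before the test body runs; a matching setUp sketch (an assumption, mirroring the tearDown above):

    def setUp(self):
        gc.collect()
        self.old_allocated_python = pa.total_allocated_bytes()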
Example No. 9
    def assert_pyarrow_memory_released(self):
        self.run_gc()
        old_allocated = pa.total_allocated_bytes()
        yield
        self.run_gc()
        diff = pa.total_allocated_bytes() - old_allocated
        self.assertEqual(
            pa.total_allocated_bytes(), old_allocated,
            f"PyArrow memory was not adequately released: {diff} bytes lost")
Example No. 10
def test_read_table(in_memory, dataset, arrow_file):
    filename = arrow_file
    previous_allocated_memory = pa.total_allocated_bytes()
    table = ArrowReader.read_table(filename, in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
    assert increased_allocated_memory == in_memory
Example No. 11
    def test_garbage_collection(self):
        import gc

        # Force the cyclic garbage collector to run
        gc.collect()

        bytes_before = pyarrow.total_allocated_bytes()
        pyarrow.from_pylist([1, None, 3, None])
        gc.collect()
        assert pyarrow.total_allocated_bytes() == bytes_before
Example No. 12
def test_garbage_collection():
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()

    bytes_before = pa.total_allocated_bytes()
    pa.array([1, None, 3, None])
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before
Example No. 13
def test_read_files(in_memory, dataset, arrow_file):
    filename = arrow_file
    reader = ArrowReader("", None)
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset_kwargs = reader.read_files([{"filename": filename}], in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert dataset_kwargs.keys() == set(["arrow_table", "data_files", "info", "split"])
    table = dataset_kwargs["arrow_table"]
    assert table.shape == dataset.data.shape
    assert set(table.column_names) == set(dataset.data.column_names)
    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict
    assert increased_allocated_memory == in_memory
Example No. 14
    def test_string_python(self):
        """
        Python -> Rust -> Python
        """
        old_allocated = pyarrow.total_allocated_bytes()
        a = pyarrow.array(["a", None, "ccc"])
        b = arrow_pyarrow_integration_testing.substring(a, 1)
        self.assertEqual(b, pyarrow.array(["", None, "cc"]))
        del a
        del b
        # No leak of C++ memory
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 15
    def test_primitive_python(self):
        """
        Python -> Rust -> Python
        """
        old_allocated = pyarrow.total_allocated_bytes()
        a = pyarrow.array([1, 2, 3])
        b = arrow_pyarrow_integration_testing.double(a)
        self.assertEqual(b, pyarrow.array([2, 4, 6]))
        del a
        del b
        # No leak of C++ memory
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 16
def test_different_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    # it works
    array = pa.array([1, None, 3, None],  # noqa
                     memory_pool=pa.jemalloc_memory_pool())
    gc.collect()
    assert pa.total_allocated_bytes() == bytes_before_default
    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
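jemalloc is an optional allocator that not every pyarrow build ships; a hedged guard sketch for such tests (the skip message is illustrative):

try:
    pa.jemalloc_memory_pool()
except NotImplementedError:
    pytest.skip("pyarrow was built without jemalloc support")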
Example No. 17
    def test_import_primitive(self):
        """
        Python -> Rust
        """
        old_allocated = pyarrow.total_allocated_bytes()

        a = pyarrow.array([2, None, 6])

        is_correct = arrow_pyarrow_integration_testing.import_primitive(a)
        self.assertTrue(is_correct)
        # No leak of C++ memory
        del a
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 18
    def test_export_primitive(self):
        """
        Python -> Rust
        """
        old_allocated = pyarrow.total_allocated_bytes()

        expected = pyarrow.array([2, None, 6])

        result = arrow_pyarrow_integration_testing.export_primitive()
        self.assertEqual(expected, result)
        # No leak of C++ memory
        del expected
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 19
    def test_time32_python(self):
        """
        Python -> Rust -> Python
        """
        old_allocated = pyarrow.total_allocated_bytes()
        a = pyarrow.array([None, 1, 2], pyarrow.time32('s'))
        b = arrow_pyarrow_integration_testing.concatenate(a)
        expected = pyarrow.array([None, 1, 2] + [None, 1, 2], pyarrow.time32('s'))
        self.assertEqual(b, expected)
        del a
        del b
        del expected
        # No leak of C++ memory
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 20
    def test_primitive_rust(self):
        """
        Rust -> Python -> Rust
        """
        old_allocated = pyarrow.total_allocated_bytes()

        def double(array):
            array = array.to_pylist()
            return pyarrow.array([x * 2 if x is not None else None for x in array])

        is_correct = arrow_pyarrow_integration_testing.double_py(double)
        self.assertTrue(is_correct)
        # No leak of C++ memory
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 21
    def assert_pyarrow_memory_released(self):
        self.run_gc()
        old_allocated = pa.total_allocated_bytes()
        old_go_allocated = cgotest.totalAllocated()
        yield
        self.run_gc()
        diff = pa.total_allocated_bytes() - old_allocated
        godiff = cgotest.totalAllocated() - old_go_allocated
        self.assertEqual(
            pa.total_allocated_bytes(), old_allocated,
            f"PyArrow memory was not adequately released: {diff} bytes lost")
        self.assertEqual(
            cgotest.totalAllocated(), old_go_allocated,
            f"Go memory was not properly released: {godiff} bytes lost")
Example No. 22
def test_generator_based_builder_as_dataset(in_memory, tmp_path):
    cache_dir = tmp_path / "data"
    cache_dir.mkdir()
    cache_dir = str(cache_dir)
    dummy_builder = DummyGeneratorBasedBuilder(cache_dir=cache_dir,
                                               name="dummy")
    dummy_builder.download_and_prepare(try_from_hf_gcs=False,
                                       download_mode=FORCE_REDOWNLOAD)
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset = dummy_builder.as_dataset("train", in_memory=in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() -
                                  previous_allocated_memory) > 0
    assert dataset.data.to_pydict() == {"text": ["foo"] * 100}
    assert increased_allocated_memory == in_memory
Example No. 23
def test_export_import_batch_reader(reader_factory):
    c_stream = ffi.new("struct ArrowArrayStream*")
    ptr_stream = int(ffi.cast("uintptr_t", c_stream))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    _export_import_batch_reader(ptr_stream, reader_factory)

    assert pa.total_allocated_bytes() == old_allocated

    # Now released
    with assert_stream_released:
        pa.RecordBatchReader._import_from_c(ptr_stream)
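_export_import_batch_reader is elided here; a plausible sketch of what it does (the body and the factory signature are assumptions):

def _export_import_batch_reader(ptr_stream, reader_factory):
    reader = reader_factory()
    reader._export_to_c(ptr_stream)
    # Delete and recreate the C++ object from the exported pointer
    del reader
    reader_new = pa.RecordBatchReader._import_from_c(ptr_stream)
    reader_new.read_all()  # drain the stream so its buffers are freed on exit
    del reader_new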
Example No. 24
    def test_list_array(self):
        """
        Python -> Rust -> Python
        """
        old_allocated = pyarrow.total_allocated_bytes()
        a = pyarrow.array([[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64()))
        b = arrow_pyarrow_integration_testing.round_trip(a)

        b.validate(full=True)
        assert a.to_pylist() == b.to_pylist()
        assert a.type == b.type
        del a
        del b
        # No leak of C++ memory
        self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
Example No. 25
    def rebook(self):
        """Rebook histograms for timers or profiles.
        Typically called after random sampling of a data chunk.
        """
        self.__logger.info("Rebook")

        self.gate.hbook.rebook()  # Resets all histograms!

        self.__logger.info("artemis: allocated before reset %i",
                           pa.total_allocated_bytes())
        self.datahandler.reset()
        self.__logger.info("artemis: allocated after reset %i",
                           pa.total_allocated_bytes())

        # Reset all meta data needed for processing all job info
        self.reset_job_summary()
Example No. 26
def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory, caplog):
    previous_allocated_memory = pa.total_allocated_bytes()
    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, keep_in_memory=keep_in_memory)
    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
    assert len(dataset) == 2
    assert increased_allocated_memory == keep_in_memory
    for offline_simulation_mode in list(OfflineSimulationMode):
        with offline(offline_simulation_mode):
            caplog.clear()
            # Load dataset from cache
            dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir)
            assert len(dataset) == 2
            assert "Using the latest cached version of the module" in caplog.text
    with pytest.raises(FileNotFoundError) as exc_info:
        datasets.load_dataset("_dummy")
    assert "at " + os.path.join("_dummy", "_dummy.py") in str(exc_info.value)
Example No. 27
def test_default_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    old_memory_pool = pa.default_memory_pool()
    pa.set_memory_pool(pa.jemalloc_memory_pool())

    array = pa.array([1, None, 3, None])  # noqa

    pa.set_memory_pool(old_memory_pool)
    gc.collect()

    assert pa.total_allocated_bytes() == bytes_before_default

    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
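pa.set_memory_pool swaps a process-global default, so an exception between the two calls would leave the jemalloc pool installed; a safer sketch restores it in a finally block:

old_pool = pa.default_memory_pool()
pa.set_memory_pool(pa.jemalloc_memory_pool())
try:
    array = pa.array([1, None, 3, None])
finally:
    pa.set_memory_pool(old_pool)  # always restore the global default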
Example No. 28
    def test_batch_lifetime(self):
        gc.collect()
        old_allocated = pa.total_allocated_bytes()

        # Memory occupation should not grow with CSV file size
        def check_one_batch(reader, expected):
            batch = reader.read_next_batch()
            assert batch.to_pydict() == expected

        rows = b"10,11\n12,13\n14,15\n16,17\n"
        read_options = ReadOptions()
        read_options.column_names = ['a', 'b']
        read_options.block_size = 6
        reader = self.open_bytes(rows, read_options=read_options)
        check_one_batch(reader, {'a': [10], 'b': [11]})
        allocated_after_first_batch = pa.total_allocated_bytes()
        check_one_batch(reader, {'a': [12], 'b': [13]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        check_one_batch(reader, {'a': [14], 'b': [15]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        check_one_batch(reader, {'a': [16], 'b': [17]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        with pytest.raises(StopIteration):
            reader.read_next_batch()
        assert pa.total_allocated_bytes() == old_allocated
        reader = None
        assert pa.total_allocated_bytes() == old_allocated
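self.open_bytes is not shown, and ReadOptions comes from pyarrow.csv; a minimal sketch of such a helper (an assumption):

    def open_bytes(self, data, read_options=None):
        import pyarrow.csv as csv
        return csv.open_csv(pa.py_buffer(data), read_options=read_options)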
Example No. 29
def test_export_import_field():
    c_schema = ffi.new("struct ArrowSchema*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))

    gc.collect()  # Make sure no Arrow data dangles in a ref cycle
    old_allocated = pa.total_allocated_bytes()

    field = pa.field("test", pa.list_(pa.int32()), nullable=True)
    field._export_to_c(ptr_schema)
    assert pa.total_allocated_bytes() > old_allocated
    # Delete and recreate C++ object from exported pointer
    del field
    assert pa.total_allocated_bytes() > old_allocated

    field_new = pa.Field._import_from_c(ptr_schema)
    assert field_new == pa.field("test", pa.list_(pa.int32()), nullable=True)
    assert pa.total_allocated_bytes() == old_allocated

    # Now released
    with assert_schema_released:
        pa.Field._import_from_c(ptr_schema)
Example No. 30
    def read(self):
        # Note: `arrow` here is the arrow datetime library, not pyarrow.
        start_time = arrow.utcnow()
        if self.logger:
            self.logger.debug('Reading %s', self.filename)
        table = pq.read_table(self.filename)
        read_time = (arrow.utcnow() - start_time).total_seconds()
        # total_allocated_bytes() reports pool-wide allocation, not just this table.
        n_bytes = pa.total_allocated_bytes()
        if self.logger:
            self.logger.debug('Reading %s sequences (%sB) took %.1f s (%sB/s).',
                              engr_notation(len(table), powers_of_2=False),
                              engr_notation(n_bytes, digits=2),
                              read_time,
                              engr_notation(n_bytes / read_time, digits=2))
        return table
Example No. 31
def test_total_bytes_allocated():
    # Only holds while no Arrow pool allocations are alive, e.g. in a fresh process
    assert pyarrow.total_allocated_bytes() == 0
Example No. 32
def test_default_allocated_bytes():
    pool = pa.default_memory_pool()
    with allocate_bytes(pool, 1024):
        check_allocated_bytes(pool)
        assert pool.bytes_allocated() == pa.total_allocated_bytes()
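allocate_bytes and check_allocated_bytes are elided; plausible sketches consistent with the usage above (assumptions, not the original helpers):

from contextlib import contextmanager

@contextmanager
def allocate_bytes(pool, nbytes):
    # Hold a buffer of nbytes from the given pool for the duration of the block
    buf = pa.allocate_buffer(nbytes, memory_pool=pool)
    try:
        yield
    finally:
        buf = None  # release the allocation

def check_allocated_bytes(pool):
    # The pool should currently report the bytes held by allocate_bytes
    assert pool.bytes_allocated() > 0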