def test_fingerprint_when_transform_version_changes():
    """Check that the ``version`` passed to ``fingerprint_transform`` affects the fingerprint.

    The same transform is declared three times: without a version, with
    version ``"1.0.0"`` and with version ``"2.0.0"``. Each variant is applied
    to a dataset built from identical data, and the three resulting
    fingerprints must all differ.

    The comparison is done on the ``_fingerprint`` attribute of each returned
    dataset rather than on the dataset objects themselves, so the assertion
    actually depends on the fingerprint values.
    """
    data = {"a": [0, 1, 2]}

    # The class is deliberately re-declared under the same name each time;
    # presumably this keeps everything but the version identical between the
    # three variants -- TODO confirm against ``fingerprint_transform``.
    class DummyDatasetChild(datasets.Dataset):
        @fingerprint_transform(inplace=False)
        def func(self, new_fingerprint):
            return DummyDatasetChild(self.data, fingerprint=new_fingerprint)

    fingerprint_no_version = DummyDatasetChild(InMemoryTable.from_pydict(data)).func()._fingerprint

    class DummyDatasetChild(datasets.Dataset):
        @fingerprint_transform(inplace=False, version="1.0.0")
        def func(self, new_fingerprint):
            return DummyDatasetChild(self.data, fingerprint=new_fingerprint)

    fingerprint_1 = DummyDatasetChild(InMemoryTable.from_pydict(data)).func()._fingerprint

    class DummyDatasetChild(datasets.Dataset):
        @fingerprint_transform(inplace=False, version="2.0.0")
        def func(self, new_fingerprint):
            return DummyDatasetChild(self.data, fingerprint=new_fingerprint)

    fingerprint_2 = DummyDatasetChild(InMemoryTable.from_pydict(data)).func()._fingerprint

    # Three distinct values in the set means no two variants share a fingerprint.
    assert len({fingerprint_no_version, fingerprint_1, fingerprint_2}) == 3
def test_in_memory_table_pickle_big_table():
    """Round-trip a very large ``InMemoryTable`` through pickle and check its length.

    The table has a single column ``"col"`` filled with zeros. Its row count
    is 4 GiB expressed in bits divided by 64 (presumably the bit width of each
    stored value -- TODO confirm the inferred column type).
    """
    row_count = (4 * 8 << 30) // 64
    payload = InMemoryTable.from_pydict({"col": [0] * row_count})
    expected_length = len(payload)

    # Rebind the same name at every step so the previous object (table, then
    # pickled bytes) loses its last reference as soon as it is replaced.
    payload = pickle.dumps(payload)
    payload = pickle.loads(payload)

    assert len(payload) == expected_length
def test_fingerprint_in_multiprocessing():
    """Check that transform fingerprints are the same in-process and through a ``Pool``.

    ``func1`` applied to the same dataset must always give the reference
    fingerprint, whether called directly or via ``Pool.apply_async``;
    ``func2`` must always give a different one.
    """
    dataset = DatasetChild(InMemoryTable.from_pydict({"a": [0, 1, 2]}))
    expected_fingerprint = dataset.func1()._fingerprint

    # Direct calls in the current process.
    repeated = dataset.func1()
    assert expected_fingerprint == repeated._fingerprint
    other = dataset.func2()
    assert expected_fingerprint != other._fingerprint

    # Same checks with the transforms executed by pool workers.
    with Pool(2) as pool:
        repeated_in_worker = pool.apply_async(dataset.func1).get()
        assert expected_fingerprint == repeated_in_worker._fingerprint
        other_in_worker = pool.apply_async(dataset.func2).get()
        assert expected_fingerprint != other_in_worker._fingerprint
def test_in_memory_table_from_pydict(in_memory_pa_table):
    """Check ``InMemoryTable.from_pydict`` against ``pa.Table.from_pydict``.

    Args:
        in_memory_pa_table: Table-like fixture; only its ``to_pydict()``
            method is used, to obtain the input dictionary.
    """
    columns = in_memory_pa_table.to_pydict()

    # Only the construction runs inside the memory-tracking context.
    with assert_arrow_memory_increases():
        built = InMemoryTable.from_pydict(columns)

    assert isinstance(built, InMemoryTable)
    reference = pa.Table.from_pydict(columns)
    assert built.table == reference