def test_memory_usage(tmpdir, compress):
    # Verify memory stays within expected bounds.
    filename = tmpdir.join('test.pkl').strpath
    small_array = np.ones((10, 10))
    big_array = np.ones(shape=100 * int(1e6), dtype=np.uint8)
    small_matrix = np.matrix(small_array)
    big_matrix = np.matrix(big_array)

    for obj in (small_array, big_array, small_matrix, big_matrix):
        size = obj.nbytes / 1e6
        obj_filename = filename + str(np.random.randint(0, 1000))
        mem_used = memory_used(numpy_pickle.dump,
                               obj, obj_filename, compress=compress)

        # The memory used to dump the object shouldn't exceed the buffer
        # size used to write array chunks (16MB).
        write_buf_size = _IO_BUFFER_SIZE + 16 * 1024 ** 2 / 1e6
        assert mem_used <= write_buf_size

        mem_used = memory_used(numpy_pickle.load, obj_filename)
        # Memory used should be less than array size + buffer size used to
        # read the array chunk by chunk.
        read_buf_size = 32 + _IO_BUFFER_SIZE  # MiB
        assert mem_used < size + read_buf_size
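
# The memory tests in this section call a `memory_used` helper that is not
# defined here. The sketch below is a minimal stand-in, assuming the
# third-party `memory_profiler` package is available; the helper actually
# used by the suite may differ in detail.
def memory_used(func, *args, **kwargs):
    """Return the peak memory increase, in MB, while running func."""
    import gc

    from memory_profiler import memory_usage

    gc.collect()  # Start from a clean heap so the baseline is meaningful.
    mem_use = memory_usage((func, args, kwargs), interval=.001)
    return max(mem_use) - min(mem_use)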
def test_compressed_pickle_dump_and_load(tmpdir):
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('>i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.arange(5, dtype=np.dtype('>f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     np.arange(256, dtype=np.uint8).tobytes(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     np.matrix([0, 1, 2], dtype=np.dtype('>i8')),
                     u"C'est l'\xe9t\xe9 !"]

    fname = tmpdir.join('temp.pkl.gz').strpath

    dumped_filenames = numpy_pickle.dump(expected_list, fname, compress=1)
    assert len(dumped_filenames) == 1

    result_list = numpy_pickle.load(fname)
    for result, expected in zip(result_list, expected_list):
        if isinstance(expected, np.ndarray):
            assert result.dtype == expected.dtype
            np.testing.assert_equal(result, expected)
        else:
            assert result == expected
def test_compressed_pickle_dump_and_load():
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('>i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.arange(5, dtype=np.dtype('>f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     np.matrix([0, 1, 2], dtype=np.dtype('>i8')),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        fname = f.name

    try:
        dumped_filenames = numpy_pickle.dump(expected_list, fname,
                                             compress=1)
        assert len(dumped_filenames) == 1

        result_list = numpy_pickle.load(fname)
        for result, expected in zip(result_list, expected_list):
            if isinstance(expected, np.ndarray):
                assert result.dtype == expected.dtype
                np.testing.assert_equal(result, expected)
            else:
                assert result == expected
    finally:
        os.remove(fname)
def test_joblib_pickle_across_python_versions():
    # We need to be specific about dtypes in particular endianness
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    # Test all the compressed and non-compressed pickles in
    # joblib/test/data. These pickles were generated by the
    # joblib/test/data/create_numpy_pickle.py script for the relevant
    # python, joblib and numpy versions.
    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))

    pickle_extensions = ('.pkl', '.gz', '.gzip', '.bz2', '.xz', '.lzma',
                         '.lz4')
    pickle_filenames = [os.path.join(test_data_dir, fn)
                        for fn in os.listdir(test_data_dir)
                        if any(fn.endswith(ext)
                               for ext in pickle_extensions)]

    for fname in pickle_filenames:
        _check_pickle(fname, expected_list)
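
# `_check_pickle` is called above but not defined in this section. A minimal
# sketch of the load-and-compare logic it needs, mirroring
# test_compressed_pickle_dump_and_load; the real helper presumably also
# handles warnings raised for pickles written by old joblib versions.
def _check_pickle(filename, expected_list):
    result_list = numpy_pickle.load(filename)
    for result, expected in zip(result_list, expected_list):
        if isinstance(expected, np.ndarray):
            assert result.dtype == expected.dtype
            np.testing.assert_equal(result, expected)
        else:
            assert result == expected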
def test_file_handle_persistence():
    objs = [np.random.random((10, 10)),
            "some data",
            np.matrix([0, 1, 2])]
    fobjs = [bz2.BZ2File, gzip.GzipFile]
    if PY3_OR_LATER:
        import lzma
        fobjs += [lzma.LZMAFile]
    filename = env['filename'] + str(random.randint(0, 1000))

    for obj in objs:
        for fobj in fobjs:
            with fobj(filename, 'wb') as f:
                numpy_pickle.dump(obj, f)

            # Reloading with the same file object type avoids a second,
            # internal decompression pass.
            with fobj(filename, 'rb') as f:
                obj_reloaded = numpy_pickle.load(f)

            # When needed, the correct decompressor should be selected
            # automatically when passing a raw file handle.
            with open(filename, 'rb') as f:
                obj_reloaded_2 = numpy_pickle.load(f)

            if isinstance(obj, np.ndarray):
                np.testing.assert_array_equal(obj_reloaded, obj)
                np.testing.assert_array_equal(obj_reloaded_2, obj)
            else:
                assert obj_reloaded == obj
                assert obj_reloaded_2 == obj

    os.remove(filename)
def test_joblib_pickle_across_python_versions():
    # We need to be specific about dtypes in particular endianness
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    # Testing all the compressed and non compressed
    # pickles in joblib/test/data. These pickles were generated by
    # the joblib/test/data/create_numpy_pickle.py script for the
    # relevant python, joblib and numpy versions.
    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))

    pickle_extensions = ('.pkl', '.gz', '.gzip', '.bz2', '.xz', '.lzma')
    pickle_filenames = [os.path.join(test_data_dir, fn)
                        for fn in os.listdir(test_data_dir)
                        if any(fn.endswith(ext)
                               for ext in pickle_extensions)]

    for fname in pickle_filenames:
        _check_pickle(fname, expected_list)
def test_file_handle_persistence(tmpdir):
    objs = [np.random.random((10, 10)),
            "some data",
            np.matrix([0, 1, 2])]
    fobjs = [bz2.BZ2File, gzip.GzipFile]
    if lzma is not None:
        fobjs += [lzma.LZMAFile]
    filename = tmpdir.join('test.pkl').strpath

    for obj in objs:
        for fobj in fobjs:
            with fobj(filename, 'wb') as f:
                numpy_pickle.dump(obj, f)

            # Reloading with the same file object type avoids a second,
            # internal decompression pass.
            with fobj(filename, 'rb') as f:
                obj_reloaded = numpy_pickle.load(f)

            # When needed, the correct decompressor should be selected
            # automatically when passing a raw file handle.
            with open(filename, 'rb') as f:
                obj_reloaded_2 = numpy_pickle.load(f)

            if isinstance(obj, np.ndarray):
                np.testing.assert_array_equal(obj_reloaded, obj)
                np.testing.assert_array_equal(obj_reloaded_2, obj)
            else:
                assert obj_reloaded == obj
                assert obj_reloaded_2 == obj
def test_joblib_decompression_format_support():
    # We need to be specific about dtypes in particular endianness
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    extensions = ('.gz', '.gzip', '.bz2', '.xz', '.lzma')
    # Collect every test-data file whose name ends with a supported
    # compressed extension.
    compress_filenames_list = [glob.glob(os.path.join(test_data_dir,
                                                      '*' + ext))
                               for ext in extensions]
    compress_filenames = sum(compress_filenames_list, [])

    for fname in compress_filenames:
        _check_compression_format(fname, expected_list)
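
# `_check_compression_format` is likewise not defined in this section. A
# plausible minimal sketch: check that the on-disk magic bytes match the
# file extension, then verify the content round-trips through
# numpy_pickle.load. The magic-byte table is standard for these formats,
# but treating this as the helper's actual behavior is an assumption.
def _check_compression_format(filename, expected_list):
    magics = {'.gz': b'\x1f\x8b', '.gzip': b'\x1f\x8b', '.bz2': b'BZh',
              '.xz': b'\xfd7zXZ', '.lzma': b'\x5d\x00'}
    ext = os.path.splitext(filename)[1]
    with open(filename, 'rb') as f:
        assert f.read(6).startswith(magics[ext])
    result_list = numpy_pickle.load(filename)
    for result, expected in zip(result_list, expected_list):
        np.testing.assert_equal(result, expected)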
def test_file_handle_persistence():
    objs = [np.random.random((10, 10)),
            "some data",
            np.matrix([0, 1, 2])]
    fobjs = [open]
    if not PY26:
        fobjs += [bz2.BZ2File, gzip.GzipFile]
    if PY3_OR_LATER:
        import lzma
        fobjs += [lzma.LZMAFile]
    filename = env['filename'] + str(random.randint(0, 1000))

    for obj in objs:
        for fobj in fobjs:
            with fobj(filename, 'wb') as f:
                numpy_pickle.dump(obj, f)

            # Reloading with the same file object type avoids a second,
            # internal decompression pass.
            with fobj(filename, 'rb') as f:
                obj_reloaded = numpy_pickle.load(f)

            # When needed, the correct decompressor should be selected
            # automatically when passing a raw file handle.
            with open(filename, 'rb') as f:
                obj_reloaded_2 = numpy_pickle.load(f)

            if isinstance(obj, np.ndarray):
                np.testing.assert_array_equal(obj_reloaded, obj)
                np.testing.assert_array_equal(obj_reloaded_2, obj)
            else:
                nose.tools.assert_equal(obj_reloaded, obj)
                nose.tools.assert_equal(obj_reloaded_2, obj)

    os.remove(filename)
def test_joblib_decompression_format_support():
    # We need to be specific about dtypes in particular endianness
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    extensions = ('.gz', '.gzip', '.bz2', '.xz', '.lzma')
    # Collect every test-data file whose name ends with a supported
    # compressed extension.
    compress_filenames_list = [glob.glob(os.path.join(test_data_dir,
                                                      '*' + ext))
                               for ext in extensions]
    compress_filenames = sum(compress_filenames_list, [])

    for fname in compress_filenames:
        _check_compression_format(fname, expected_list)
def test_numpy_persistence():
    filename = env['filename']
    rnd = np.random.RandomState(0)
    a = rnd.random_sample((10, 2))

    for compress in (False, True, 0, 3):
        # We use 'a.T' to have a non C-contiguous array.
        for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
            # Change the file name to avoid side effects between tests
            this_filename = filename + str(random.randint(0, 1000))
            filenames = numpy_pickle.dump(obj, this_filename,
                                          compress=compress)

            # All is cached in one file
            nose.tools.assert_equal(len(filenames), 1)
            # Check that only one file was created
            nose.tools.assert_equal(filenames[0], this_filename)
            # Check that this file does exist
            nose.tools.assert_true(
                os.path.exists(os.path.join(env['dir'], filenames[0])))

            # Unpickle the object
            obj_ = numpy_pickle.load(this_filename)
            # Check that the items are indeed arrays
            for item in obj_:
                nose.tools.assert_true(isinstance(item, np.ndarray))
            # And finally, check that all the values are equal.
            np.testing.assert_array_equal(np.array(obj), np.array(obj_))

        # Now test with array subclasses
        for obj in (np.matrix(np.zeros(10)),
                    np.memmap(filename + str(random.randint(0, 1000)) +
                              'mmap',
                              mode='w+', shape=4, dtype=np.float64)):
            this_filename = filename + str(random.randint(0, 1000))
            filenames = numpy_pickle.dump(obj, this_filename,
                                          compress=compress)
            # All is cached in one file
            nose.tools.assert_equal(len(filenames), 1)

            obj_ = numpy_pickle.load(this_filename)
            if (type(obj) is not np.memmap and
                    hasattr(obj, '__array_prepare__')):
                # We don't reconstruct memmaps
                nose.tools.assert_true(isinstance(obj_, type(obj)))

                np.testing.assert_array_equal(obj_, obj)

        # Test with an object containing multiple numpy arrays
        obj = ComplexTestObject()
        filenames = numpy_pickle.dump(obj, this_filename,
                                      compress=compress)
        # All is cached in one file
        nose.tools.assert_equal(len(filenames), 1)

        obj_loaded = numpy_pickle.load(this_filename)
        nose.tools.assert_true(isinstance(obj_loaded, type(obj)))
        np.testing.assert_array_equal(obj_loaded.array_float,
                                      obj.array_float)
        np.testing.assert_array_equal(obj_loaded.array_int,
                                      obj.array_int)
        np.testing.assert_array_equal(obj_loaded.array_obj,
                                      obj.array_obj)
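
# `ComplexTestObject` is used by the persistence tests but is not defined in
# this section. A minimal stand-in consistent with the attributes the
# assertions check; the exact array contents are illustrative.
class ComplexTestObject:
    """An object holding several numpy arrays as attributes."""

    def __init__(self):
        self.array_float = np.arange(100, dtype='float64')
        self.array_int = np.ones(100, dtype='int32')
        self.array_obj = np.array(['a', 10, 20.0], dtype='object')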
def test_in_memory_persistence():
    objs = [np.random.random((10, 10)),
            "some data",
            np.matrix([0, 1, 2])]
    for obj in objs:
        f = io.BytesIO()
        numpy_pickle.dump(obj, f)
        obj_reloaded = numpy_pickle.load(f)
        if isinstance(obj, np.ndarray):
            np.testing.assert_array_equal(obj_reloaded, obj)
        else:
            assert obj_reloaded == obj
def test_numpy_persistence(tmpdir, compress):
    filename = tmpdir.join('test.pkl').strpath
    rnd = np.random.RandomState(0)
    a = rnd.random_sample((10, 2))

    # We use 'a.T' to have a non C-contiguous array.
    for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
        filenames = numpy_pickle.dump(obj, filename, compress=compress)

        # All is cached in one file
        assert len(filenames) == 1
        # Check that only one file was created
        assert filenames[0] == filename
        # Check that this file does exist
        assert os.path.exists(filenames[0])

        # Unpickle the object
        obj_ = numpy_pickle.load(filename)
        # Check that the items are indeed arrays
        for item in obj_:
            assert isinstance(item, np.ndarray)
        # And finally, check that all the values are equal.
        np.testing.assert_array_equal(np.array(obj), np.array(obj_))

    # Now test with array subclasses
    for obj in (np.matrix(np.zeros(10)),
                np.memmap(filename + 'mmap',
                          mode='w+', shape=4, dtype=np.float64)):
        filenames = numpy_pickle.dump(obj, filename, compress=compress)
        # All is cached in one file
        assert len(filenames) == 1

        obj_ = numpy_pickle.load(filename)
        if (type(obj) is not np.memmap and
                hasattr(obj, '__array_prepare__')):
            # We don't reconstruct memmaps
            assert isinstance(obj_, type(obj))

            np.testing.assert_array_equal(obj_, obj)

    # Test with an object containing multiple numpy arrays
    obj = ComplexTestObject()
    filenames = numpy_pickle.dump(obj, filename, compress=compress)
    # All is cached in one file
    assert len(filenames) == 1

    obj_loaded = numpy_pickle.load(filename)
    assert isinstance(obj_loaded, type(obj))
    np.testing.assert_array_equal(obj_loaded.array_float, obj.array_float)
    np.testing.assert_array_equal(obj_loaded.array_int, obj.array_int)
    np.testing.assert_array_equal(obj_loaded.array_obj, obj.array_obj)
def test_compressed_pickle_dump_and_load():
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('>i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.arange(5, dtype=np.dtype('>f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # among versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     np.matrix([0, 1, 2], dtype=np.dtype('>i8')),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        fname = f.name

    try:
        dumped_filenames = numpy_pickle.dump(expected_list, fname,
                                             compress=1)
        nose.tools.assert_equal(len(dumped_filenames), 1)

        result_list = numpy_pickle.load(fname)
        for result, expected in zip(result_list, expected_list):
            if isinstance(expected, np.ndarray):
                nose.tools.assert_equal(result.dtype, expected.dtype)
                np.testing.assert_equal(result, expected)
            else:
                nose.tools.assert_equal(result, expected)
    finally:
        os.remove(fname)
def test_memory_usage():
    # Verify memory stays within expected bounds.
    filename = env['filename']
    small_array = np.ones((10, 10))
    big_array = np.ones(shape=100 * int(1e6), dtype=np.uint8)
    small_matrix = np.matrix(small_array)
    big_matrix = np.matrix(big_array)

    for compress in (True, False):
        for obj in (small_array, big_array, small_matrix, big_matrix):
            size = obj.nbytes / 1e6
            obj_filename = filename + str(np.random.randint(0, 1000))
            mem_used = memory_used(numpy_pickle.dump,
                                   obj, obj_filename, compress=compress)

            # The memory used to dump the object shouldn't exceed the
            # buffer size used to write array chunks (16MB).
            write_buf_size = _IO_BUFFER_SIZE + 16 * 1024 ** 2 / 1e6
            assert mem_used <= write_buf_size

            mem_used = memory_used(numpy_pickle.load, obj_filename)
            # Memory used should be less than array size + buffer size
            # used to read the array chunk by chunk.
            read_buf_size = 32 + _IO_BUFFER_SIZE  # MiB
            assert mem_used < size + read_buf_size
def test_numpy_persistence():
    filename = env['filename']
    rnd = np.random.RandomState(0)
    a = rnd.random_sample((10, 2))

    for compress, cache_size in ((0, 0), (1, 0), (1, 10)):
        # We use 'a.T' to have a non C-contiguous array.
        for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
            # Change the file name to avoid side effects between tests
            this_filename = filename + str(random.randint(0, 1000))
            filenames = numpy_pickle.dump(obj, this_filename,
                                          compress=compress,
                                          cache_size=cache_size)
            # Check that one file was created per array
            if not compress:
                nose.tools.assert_equal(len(filenames), len(obj) + 1)
            # Check that these files do exist
            for file in filenames:
                nose.tools.assert_true(
                    os.path.exists(os.path.join(env['dir'], file)))

            # Unpickle the object
            obj_ = numpy_pickle.load(this_filename)
            # Check that the items are indeed arrays
            for item in obj_:
                nose.tools.assert_true(isinstance(item, np.ndarray))
            # And finally, check that all the values are equal.
            nose.tools.assert_true(
                np.all(np.array(obj) == np.array(obj_)))

        # Now test with array subclasses
        for obj in (np.matrix(np.zeros(10)),
                    np.core.multiarray._reconstruct(np.memmap, (),
                                                    np.float64)):
            this_filename = filename + str(random.randint(0, 1000))
            filenames = numpy_pickle.dump(obj, this_filename,
                                          compress=compress,
                                          cache_size=cache_size)
            obj_ = numpy_pickle.load(this_filename)
            if (type(obj) is not np.memmap and
                    hasattr(obj, '__array_prepare__')):
                # We don't reconstruct memmaps
                nose.tools.assert_true(isinstance(obj_, type(obj)))

    # Finally smoke test the warning in case of compress + mmap_mode
    this_filename = filename + str(random.randint(0, 1000))
    numpy_pickle.dump(a, this_filename, compress=1)
    numpy_pickle.load(this_filename, mmap_mode='r')
def test_numpy_persistence():
    filename = env['filename']
    rnd = np.random.RandomState(0)
    a = rnd.random_sample((10, 2))

    for compress in (False, True, 0, 3):
        # We use 'a.T' to have a non C-contiguous array.
        for index, obj in enumerate(((a,), (a.T,), (a, a), [a, a, a])):
            # Change the file name to avoid side effects between tests
            this_filename = filename + str(random.randint(0, 1000))
            filenames = numpy_pickle.dump(obj, this_filename,
                                          compress=compress)

            # All is cached in one file
            nose.tools.assert_equal(len(filenames), 1)
            # Check that only one file was created
            nose.tools.assert_equal(filenames[0], this_filename)
            # Check that this file does exist
            nose.tools.assert_true(
                os.path.exists(os.path.join(env['dir'], filenames[0])))

            # Unpickle the object
            obj_ = numpy_pickle.load(this_filename)
            # Check that the items are indeed arrays
            for item in obj_:
                nose.tools.assert_true(isinstance(item, np.ndarray))
            # And finally, check that all the values are equal.
            np.testing.assert_array_equal(np.array(obj), np.array(obj_))

        # Now test with array subclasses
        for obj in (np.matrix(np.zeros(10)),
                    np.memmap(filename + str(random.randint(0, 1000)) +
                              'mmap',
                              mode='w+', shape=4, dtype=np.float64)):
            this_filename = filename + str(random.randint(0, 1000))
            filenames = numpy_pickle.dump(obj, this_filename,
                                          compress=compress)
            # All is cached in one file
            nose.tools.assert_equal(len(filenames), 1)

            obj_ = numpy_pickle.load(this_filename)
            if (type(obj) is not np.memmap and
                    hasattr(obj, '__array_prepare__')):
                # We don't reconstruct memmaps
                nose.tools.assert_true(isinstance(obj_, type(obj)))

                np.testing.assert_array_equal(obj_, obj)

        # Test with an object containing multiple numpy arrays
        obj = ComplexTestObject()
        filenames = numpy_pickle.dump(obj, this_filename,
                                      compress=compress)
        # All is cached in one file
        nose.tools.assert_equal(len(filenames), 1)

        obj_loaded = numpy_pickle.load(this_filename)
        nose.tools.assert_true(isinstance(obj_loaded, type(obj)))
        np.testing.assert_array_equal(obj_loaded.array_float,
                                      obj.array_float)
        np.testing.assert_array_equal(obj_loaded.array_int,
                                      obj.array_int)
        np.testing.assert_array_equal(obj_loaded.array_obj,
                                      obj.array_obj)