def test_compressed_pickle_dump_and_load():
    expected_list = [np.arange(5, dtype=np.int64),
                     np.arange(5, dtype=np.float64),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        fname = f.name

    try:
        numpy_pickle.dump(expected_list, fname, compress=1)
        result_list = numpy_pickle.load(fname)
        for result, expected in zip(result_list, expected_list):
            if isinstance(expected, np.ndarray):
                nose.tools.assert_equal(result.dtype, expected.dtype)
                np.testing.assert_equal(result, expected)
            else:
                nose.tools.assert_equal(result, expected)
    finally:
        os.remove(fname)

def test_hash_object_dtype(): """ Make sure that ndarrays with dtype `object' hash correctly.""" a = np.array([np.arange(i) for i in range(6)], dtype=object) b = np.array([np.arange(i) for i in range(6)], dtype=object) assert hash(a) == hash(b)
def test_compressed_pickle_dump_and_load():
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('>i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.arange(5, dtype=np.dtype('>f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # across versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     np.matrix([0, 1, 2], dtype=np.dtype('>i8')),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        fname = f.name

    try:
        dumped_filenames = numpy_pickle.dump(expected_list, fname,
                                             compress=1)
        assert len(dumped_filenames) == 1
        result_list = numpy_pickle.load(fname)
        for result, expected in zip(result_list, expected_list):
            if isinstance(expected, np.ndarray):
                assert result.dtype == expected.dtype
                np.testing.assert_equal(result, expected)
            else:
                assert result == expected
    finally:
        os.remove(fname)

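# A minimal standalone check of the .tostring()/.tobytes() equivalence the
# comments above rely on (helper name illustrative; assumes NumPy < 2.0,
# since the alias is deprecated from 1.19 and removed in 2.0).
def example_tostring_tobytes_equivalence():
    import numpy as np
    a = np.arange(256, dtype=np.uint8)
    # Both calls serialize the raw array buffer to a bytes object.
    assert a.tostring() == a.tobytes()
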
def test_joblib_pickle_across_python_versions():
    # We need to be specific about dtypes, in particular endianness,
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # across versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    # Testing all the compressed and non compressed
    # pickles in joblib/test/data. These pickles were generated by
    # the joblib/test/data/create_numpy_pickle.py script for the
    # relevant python, joblib and numpy versions.
    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))

    pickle_extensions = ('.pkl', '.gz', '.gzip', '.bz2', '.xz', '.lzma')
    pickle_filenames = [os.path.join(test_data_dir, fn)
                        for fn in os.listdir(test_data_dir)
                        if any(fn.endswith(ext)
                               for ext in pickle_extensions)]

    for fname in pickle_filenames:
        _check_pickle(fname, expected_list)

def test_hash_object_dtype(): """ Make sure that ndarrays with dtype `object' hash correctly.""" a = np.array([np.arange(i) for i in range(6)], dtype=object) b = np.array([np.arange(i) for i in range(6)], dtype=object) nose.tools.assert_equal(hash(a), hash(b))
def test_joblib_pickle_across_python_versions():
    # XXX: temporarily disable this test on non little-endian machines
    if sys.byteorder != 'little':
        raise nose.SkipTest('Skipping this test on non little-endian '
                            'machines')

    # We need to be specific about dtypes, in particular endianness,
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    # Testing all the *.gz and *.pkl (compressed and non compressed
    # pickles) in joblib/test/data. These pickles were generated by
    # the joblib/test/data/create_numpy_pickle.py script for the
    # relevant python, joblib and numpy versions.
    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    data_filenames = glob.glob(os.path.join(test_data_dir, '*.gz'))
    data_filenames += glob.glob(os.path.join(test_data_dir, '*.pkl'))

    for fname in data_filenames:
        _check_pickle(fname, expected_list)

def test_joblib_decompression_format_support():
    # We need to be specific about dtypes, in particular endianness,
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # across versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    extensions = ('.gz', '.gzip', '.bz2', '.xz', '.lzma')
    compress_filenames_list = [glob.glob(os.path.join(test_data_dir,
                                                      '*' + ext))
                               for ext in extensions]
    compress_filenames = sum(compress_filenames_list, [])

    for fname in compress_filenames:
        _check_compression_format(fname, expected_list)

def test_compressed_pickle_dump_and_load(tmpdir):
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('>i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.arange(5, dtype=np.dtype('>f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     np.arange(256, dtype=np.uint8).tobytes(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # across versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     np.matrix([0, 1, 2], dtype=np.dtype('>i8')),
                     u"C'est l'\xe9t\xe9 !"]

    fname = tmpdir.join('temp.pkl.gz').strpath
    dumped_filenames = numpy_pickle.dump(expected_list, fname, compress=1)
    assert len(dumped_filenames) == 1
    result_list = numpy_pickle.load(fname)
    for result, expected in zip(result_list, expected_list):
        if isinstance(expected, np.ndarray):
            assert result.dtype == expected.dtype
            np.testing.assert_equal(result, expected)
        else:
            assert result == expected

def test_compressed_pickle_dump_and_load():
    # XXX: temporarily disable this test on non little-endian machines
    if sys.byteorder != 'little':
        raise nose.SkipTest('Skipping this test on non little-endian '
                            'machines')

    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        fname = f.name

    # Need to test both code branches (whether array size is greater
    # or smaller than cache_size)
    for cache_size in [0, 1e9]:
        try:
            dumped_filenames = numpy_pickle.dump(
                expected_list, fname, compress=1, cache_size=cache_size)
            result_list = numpy_pickle.load(fname)
            for result, expected in zip(result_list, expected_list):
                if isinstance(expected, np.ndarray):
                    nose.tools.assert_equal(result.dtype, expected.dtype)
                    np.testing.assert_equal(result, expected)
                else:
                    nose.tools.assert_equal(result, expected)
        finally:
            for fn in dumped_filenames:
                os.remove(fn)

def create_objects_to_hash():
    rng = np.random.RandomState(42)
    # Being explicit about dtypes in order to avoid
    # architecture-related differences. Also using 'f4' rather than
    # 'f8' for float arrays because 'f8' arrays generated by
    # rng.randn don't seem to be bit-identical on 32bit and
    # 64bit machines.
    to_hash_list = [
        rng.randint(-1000, high=1000, size=50).astype('<i8'),
        tuple(rng.randn(3).astype('<f4') for _ in range(5)),
        [rng.randn(3).astype('<f4') for _ in range(5)],
        {-3333: rng.randn(3, 5).astype('<f4'),
         0: [rng.randint(10, size=20).astype('<i8'),
             rng.randn(10).astype('<f4')]},
        # Non regression cases for
        # https://github.com/joblib/joblib/issues/308
        np.arange(100, dtype='<i8').reshape((10, 10)),
        # Fortran contiguous array
        np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
        # Non contiguous array
        np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
    ]
    return to_hash_list

def test_joblib_pickle_across_python_versions():
    # We need to be specific about dtypes, in particular endianness,
    # because the pickles can be generated on one architecture and
    # the tests run on another one. See
    # https://github.com/joblib/joblib/issues/279.
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # across versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     u"C'est l'\xe9t\xe9 !"]

    # Testing all the compressed and non compressed
    # pickles in joblib/test/data. These pickles were generated by
    # the joblib/test/data/create_numpy_pickle.py script for the
    # relevant python, joblib and numpy versions.
    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))

    pickle_extensions = ('.pkl', '.gz', '.gzip', '.bz2', '.xz', '.lzma',
                         '.lz4')
    pickle_filenames = [os.path.join(test_data_dir, fn)
                        for fn in os.listdir(test_data_dir)
                        if any(fn.endswith(ext)
                               for ext in pickle_extensions)]

    for fname in pickle_filenames:
        _check_pickle(fname, expected_list)

def test_hashes_stay_the_same_with_numpy_objects():
    # We want to make sure that hashes don't change with joblib
    # version. For end users, that would mean that they have to
    # regenerate their cache from scratch, which potentially means
    # lengthy recomputations.
    rng = np.random.RandomState(42)
    # Being explicit about dtypes in order to avoid
    # architecture-related differences. Also using 'f4' rather than
    # 'f8' for float arrays because 'f8' arrays generated by
    # rng.randn don't seem to be bit-identical on 32bit and
    # 64bit machines.
    to_hash_list = [
        rng.randint(-1000, high=1000, size=50).astype('<i8'),
        tuple(rng.randn(3).astype('<f4') for _ in range(5)),
        [rng.randn(3).astype('<f4') for _ in range(5)],
        {-3333: rng.randn(3, 5).astype('<f4'),
         0: [rng.randint(10, size=20).astype('<i8'),
             rng.randn(10).astype('<f4')]},
        # Non regression cases for
        # https://github.com/joblib/joblib/issues/308.
        # Generated with joblib 0.9.4.
        np.arange(100, dtype='<i8').reshape((10, 10)),
        # Fortran contiguous array
        np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
        # Non contiguous array
        np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
    ]

    # These expected results have been generated with joblib 0.9.0
    expected_dict = {
        'py2': ['80f2387e7752abbda2658aafed49e086',
                '0d700f7f25ea670fd305e4cd93b0e8cd',
                '83a2bdf843e79e4b3e26521db73088b9',
                '63e0efd43c0a9ad92a07e8ce04338dd3',
                '03fef702946b602c852b8b4e60929914',
                '07074691e90d7098a85956367045c81e',
                'd264cf79f353aa7bbfa8349e3df72d8f'],
        'py3': ['10a6afc379ca2708acfbaef0ab676eab',
                '988a7114f337f381393025911ebc823b',
                'c6809f4b97e35f2fa0ee8d653cbd025c',
                'b3ad17348e32728a7eb9cda1e7ede438',
                '927b3e6b0b6a037e8e035bda134e0b05',
                '108f6ee98e7db19ea2006ffd208f4bf1',
                'bd48ccaaff28e16e6badee81041b7180'],
    }

    py_version_str = 'py3' if PY3_OR_LATER else 'py2'
    expected_list = expected_dict[py_version_str]

    for to_hash, expected in zip(to_hash_list, expected_list):
        assert hash(to_hash) == expected

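# For context, a small sketch (helper name illustrative) of how such digests
# are produced: joblib.hash returns a stable hex digest (md5 by default) for
# equal inputs, which is what the pinned values above verify across joblib
# versions.
def example_joblib_hash_digest():
    import numpy as np
    from joblib import hash as joblib_hash
    digest = joblib_hash(np.arange(100, dtype='<i8').reshape((10, 10)))
    # md5 hex digests are 32 characters long.
    assert isinstance(digest, str) and len(digest) == 32
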
def test_compressed_pickle_python_2_3_compatibility():
    expected_list = [np.arange(5, dtype=np.int64),
                     np.arange(5, dtype=np.float64),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    # These files have been generated with the
    # joblib/test/data/create_numpy_pickle.py script for the relevant
    # python and joblib versions
    basenames = ['joblib_0.8.4_compressed_pickle_py27.gz',
                 'joblib_0.9.0_compressed_pickle_py27.gz',
                 'joblib_0.8.4_compressed_pickle_py33.gz',
                 'joblib_0.9.0_compressed_pickle_py33.gz',
                 'joblib_0.8.4_compressed_pickle_py34.gz',
                 'joblib_0.9.0_compressed_pickle_py34.gz']
    data_filenames = [os.path.join(test_data_dir, bname)
                      for bname in basenames]

    for fname in data_filenames:
        version_match = re.match(r'.+py(\d)(\d).gz', fname)
        python_version_used_for_writing = tuple(
            [int(each) for each in version_match.groups()])
        python_version_used_for_reading = sys.version_info[:2]

        python_version_to_default_pickle_protocol = {
            (2, 6): 2, (2, 7): 2,
            (3, 0): 3, (3, 1): 3, (3, 2): 3, (3, 3): 3, (3, 4): 4}
        pickle_reading_protocol = python_version_to_default_pickle_protocol[
            python_version_used_for_reading]
        pickle_writing_protocol = python_version_to_default_pickle_protocol[
            python_version_used_for_writing]
        if ('0.8.4' not in fname or
                pickle_reading_protocol >= pickle_writing_protocol):
            result_list = numpy_pickle.load(fname)
            for result, expected in zip(result_list, expected_list):
                if isinstance(expected, np.ndarray):
                    nose.tools.assert_equal(result.dtype, expected.dtype)
                    np.testing.assert_equal(result, expected)
                else:
                    nose.tools.assert_equal(result, expected)
        else:
            # For joblib <= 0.8.4, compressed pickles written with
            # python `version = v` cannot be read by python with
            # `version < v` because of differences in the default
            # pickle protocol (2 for python 2, 3 for python 3.3 and 4
            # for python 3.4)
            try:
                numpy_pickle.load(fname)
                raise AssertionError('Numpy pickle loading should '
                                     'have raised a ValueError exception')
            except ValueError as e:
                nose.tools.assert_true(
                    'unsupported pickle protocol' in str(e.args))

def test_numpy_datetime_array():
    # memoryview is not supported for some dtypes e.g. datetime64
    # see https://github.com/joblib/joblib/issues/188 for more details
    dtypes = ['datetime64[s]', 'timedelta64[D]']
    a_hash = hash(np.arange(10))
    arrays = (np.arange(0, 10, dtype=dtype) for dtype in dtypes)
    for array in arrays:
        assert hash(array) != a_hash

def test_numpy_datetime_array():
    # memoryview is not supported for some dtypes e.g. datetime64
    # see https://github.com/joblib/joblib/issues/188 for more details
    dtypes = ['datetime64[s]', 'timedelta64[D]']
    a_hash = hash(np.arange(10))
    arrays = (np.arange(0, 10, dtype=dtype) for dtype in dtypes)
    for array in arrays:
        nose.tools.assert_not_equal(hash(array), a_hash)

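# Illustration (helper name illustrative) of why datetime64 needs a special
# code path in the hasher: such dtypes are not expressible through the
# buffer protocol, so taking a memoryview of the array raises; the exact
# exception type and message vary across NumPy versions.
def example_datetime64_memoryview_fails():
    import numpy as np
    try:
        memoryview(np.arange(10, dtype='datetime64[s]'))
    except (ValueError, TypeError) as exc:
        print('memoryview failed as expected:', exc)
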
def test_hashes_stay_the_same_with_numpy_objects():
    # We want to make sure that hashes don't change with joblib
    # version. For end users, that would mean that they have to
    # regenerate their cache from scratch, which potentially means
    # lengthy recomputations.
    rng = np.random.RandomState(42)
    # Being explicit about dtypes in order to avoid
    # architecture-related differences. Also using 'f4' rather than
    # 'f8' for float arrays because 'f8' arrays generated by
    # rng.randn don't seem to be bit-identical on 32bit and
    # 64bit machines.
    to_hash_list = [
        rng.randint(-1000, high=1000, size=50).astype('<i8'),
        tuple(rng.randn(3).astype('<f4') for _ in range(5)),
        [rng.randn(3).astype('<f4') for _ in range(5)],
        {-3333: rng.randn(3, 5).astype('<f4'),
         0: [rng.randint(10, size=20).astype('<i8'),
             rng.randn(10).astype('<f4')]},
        # Non regression cases for
        # https://github.com/joblib/joblib/issues/308.
        # Generated with joblib 0.9.4.
        np.arange(100, dtype='<i8').reshape((10, 10)),
        # Fortran contiguous array
        np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
        # Non contiguous array
        np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
    ]

    # These expected results have been generated with joblib 0.9.0
    expected_dict = {'py2': ['80f2387e7752abbda2658aafed49e086',
                             '0d700f7f25ea670fd305e4cd93b0e8cd',
                             '83a2bdf843e79e4b3e26521db73088b9',
                             '63e0efd43c0a9ad92a07e8ce04338dd3',
                             '03fef702946b602c852b8b4e60929914',
                             '07074691e90d7098a85956367045c81e',
                             'd264cf79f353aa7bbfa8349e3df72d8f'],
                     'py3': ['10a6afc379ca2708acfbaef0ab676eab',
                             '988a7114f337f381393025911ebc823b',
                             'c6809f4b97e35f2fa0ee8d653cbd025c',
                             'b3ad17348e32728a7eb9cda1e7ede438',
                             '927b3e6b0b6a037e8e035bda134e0b05',
                             '108f6ee98e7db19ea2006ffd208f4bf1',
                             'bd48ccaaff28e16e6badee81041b7180']}

    py_version_str = 'py3' if PY3_OR_LATER else 'py2'
    expected_list = expected_dict[py_version_str]

    for to_hash, expected in zip(to_hash_list, expected_list):
        yield assert_equal, hash(to_hash), expected

def test_hash_numpy_noncontiguous():
    a = np.asarray(np.arange(6000).reshape((1000, 2, 3)),
                   order='F')[:, :1, :]
    b = np.ascontiguousarray(a)
    nose.tools.assert_not_equal(hash(a), hash(b))

    c = np.asfortranarray(a)
    nose.tools.assert_not_equal(hash(a), hash(c))

def test_hash_numpy_noncontiguous():
    a = np.asarray(np.arange(6000).reshape((1000, 2, 3)),
                   order='F')[:, :1, :]
    b = np.ascontiguousarray(a)
    assert hash(a) != hash(b)

    c = np.asfortranarray(a)
    assert hash(a) != hash(c)

def test_memmapping_temp_folder_thread_safety():
    # Concurrent calls to Parallel with the loky backend will use the same
    # executor, and thus the same reducers. Make sure that those reducers
    # use different temporary folders depending on which Parallel objects
    # called them, which is necessary to limit potential race conditions
    # during the garbage collection of temporary memmaps.
    array = np.arange(int(1e2))

    temp_dirs_thread_1 = set()
    temp_dirs_thread_2 = set()

    def concurrent_get_filename(array, temp_dirs):
        with Parallel(backend='loky', n_jobs=2, max_nbytes=10) as p:
            for i in range(10):
                [filename] = p(delayed(getattr)(array, 'filename')
                               for _ in range(1))
                temp_dirs.add(os.path.dirname(filename))

    t1 = threading.Thread(target=concurrent_get_filename,
                          args=(array, temp_dirs_thread_1))
    t2 = threading.Thread(target=concurrent_get_filename,
                          args=(array, temp_dirs_thread_2))

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    assert len(temp_dirs_thread_1) == 1
    assert len(temp_dirs_thread_2) == 1

    assert temp_dirs_thread_1 != temp_dirs_thread_2

def test_managed_backend_reuse_temp_folder(backend):
    # Test that calls to a managed parallel object reuse the same memmaps.
    array = np.arange(int(1e2))

    with Parallel(n_jobs=2, backend=backend, max_nbytes=10) as p:
        [filename_1] = p(delayed(getattr)(array, 'filename')
                         for _ in range(1))
        [filename_2] = p(delayed(getattr)(array, 'filename')
                         for _ in range(1))
    assert os.path.dirname(filename_2) == os.path.dirname(filename_1)

def test_direct_mmap(tmpdir):
    testfile = str(tmpdir.join('arr.dat'))
    a = np.arange(10, dtype='uint8')
    a.tofile(testfile)

    def _read_array():
        with open(testfile, 'rb') as fd:
            mm = mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ,
                           offset=0)
        return np.ndarray((10,), dtype=np.uint8, buffer=mm, offset=0)

    def func(x):
        return x ** 2

    arr = _read_array()

    # this is expected to work and gives the reference
    ref = Parallel(n_jobs=2)(delayed(func)(x) for x in [a])

    # now test that it works with the mmap array
    results = Parallel(n_jobs=2)(delayed(func)(x) for x in [arr])
    np.testing.assert_array_equal(results, ref)

    # also test with a mmap array read in the subprocess
    def worker():
        return _read_array()

    results = Parallel(n_jobs=2)(delayed(worker)() for _ in range(1))
    np.testing.assert_array_equal(results[0], arr)

def test_compressed_pickle_dump_and_load():
    expected_list = [np.arange(5, dtype=np.int64),
                     np.arange(5, dtype=np.float64),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        numpy_pickle.dump(expected_list, f.name, compress=1)
        result_list = numpy_pickle.load(f.name)
        for result, expected in zip(result_list, expected_list):
            if isinstance(expected, np.ndarray):
                nose.tools.assert_equal(result.dtype, expected.dtype)
                np.testing.assert_equal(result, expected)
            else:
                nose.tools.assert_equal(result, expected)

def test_hashes_stay_the_same_with_numpy_objects():
    # We want to make sure that hashes don't change with joblib
    # version. For end users, that would mean that they have to
    # regenerate their cache from scratch, which potentially means
    # lengthy recomputations.
    rng = np.random.RandomState(42)
    # Being explicit about dtypes in order to avoid
    # architecture-related differences. Also using 'f4' rather than
    # 'f8' for float arrays because 'f8' arrays generated by
    # rng.randn don't seem to be bit-identical on 32bit and
    # 64bit machines.
    to_hash_list = [
        rng.randint(-1000, high=1000, size=50).astype('<i8'),
        tuple(rng.randn(3).astype('<f4') for _ in range(5)),
        [rng.randn(3).astype('<f4') for _ in range(5)],
        {-3333: rng.randn(3, 5).astype('<f4'),
         0: [rng.randint(10, size=20).astype('<i8'),
             rng.randn(10).astype('<f4')]},
        # Non regression cases for
        # https://github.com/joblib/joblib/issues/308.
        # Generated with joblib 0.9.4.
        np.arange(100, dtype='<i8').reshape((10, 10)),
        # Fortran contiguous array
        np.asfortranarray(np.arange(100, dtype='<i8').reshape((10, 10))),
        # Non contiguous array
        np.arange(100, dtype='<i8').reshape((10, 10))[:, :2],
    ]

    # These expected results have been generated with joblib 0.9.0
    expected_hashes = ['10a6afc379ca2708acfbaef0ab676eab',
                       '988a7114f337f381393025911ebc823b',
                       'c6809f4b97e35f2fa0ee8d653cbd025c',
                       'b3ad17348e32728a7eb9cda1e7ede438',
                       '927b3e6b0b6a037e8e035bda134e0b05',
                       '108f6ee98e7db19ea2006ffd208f4bf1',
                       'bd48ccaaff28e16e6badee81041b7180']

    for to_hash, expected in zip(to_hash_list, expected_hashes):
        assert hash(to_hash) == expected

def test_joblib_pickle_across_python_versions():
    expected_list = [np.arange(5, dtype=np.int64),
                     np.arange(5, dtype=np.float64),
                     np.array([1, 'abc', {'a': 1, 'b': 2}]),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    # Testing all the *.gz and *.pkl (compressed and non compressed
    # pickles) in joblib/test/data. These pickles were generated by
    # the joblib/test/data/create_numpy_pickle.py script for the
    # relevant python, joblib and numpy versions.
    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    data_filenames = glob.glob(os.path.join(test_data_dir, '*.gz'))
    data_filenames += glob.glob(os.path.join(test_data_dir, '*.pkl'))

    for fname in data_filenames:
        _check_pickle(fname, expected_list)

def test_parallel_isolated_temp_folders(backend):
    # Test that consecutive Parallel calls use isolated subfolders, even
    # for the loky backend that reuses its executor instance across calls.
    array = np.arange(int(1e2))
    [filename_1] = Parallel(n_jobs=2, backend=backend, max_nbytes=10)(
        delayed(getattr)(array, 'filename') for _ in range(1))
    [filename_2] = Parallel(n_jobs=2, backend=backend, max_nbytes=10)(
        delayed(getattr)(array, 'filename') for _ in range(1))
    assert os.path.dirname(filename_2) != os.path.dirname(filename_1)

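# A small sketch of the auto-memmapping behavior these temp-folder tests
# exercise (helper name illustrative; assumes a process-based backend such
# as loky): with max_nbytes this low, Parallel dumps the input array to a
# temporary memmap, so workers receive np.memmap instances.
def example_auto_memmapping():
    import numpy as np
    from joblib import Parallel, delayed
    array = np.arange(100)
    [result_type] = Parallel(n_jobs=2, max_nbytes=10)(
        delayed(type)(array) for _ in range(1))
    print(result_type)  # expected: <class 'numpy.memmap'>
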
def test_numpy_array_byte_order_mismatch_detection():
    # List of numpy arrays with big endian byteorder.
    be_arrays = [np.array([(1, 2.0), (3, 4.0)],
                          dtype=[('', '>i8'), ('', '>f8')]),
                 np.arange(3, dtype=np.dtype('>i8')),
                 np.arange(3, dtype=np.dtype('>f8'))]

    # Verify the byteorder mismatch is correctly detected.
    for array in be_arrays:
        if sys.byteorder == 'big':
            assert not _is_numpy_array_byte_order_mismatch(array)
        else:
            assert _is_numpy_array_byte_order_mismatch(array)
        converted = _ensure_native_byte_order(array)
        if converted.dtype.fields:
            for f in converted.dtype.fields.values():
                assert f[0].byteorder == '='
        else:
            assert converted.dtype.byteorder == '='

    # List of numpy arrays with little endian byteorder.
    le_arrays = [np.array([(1, 2.0), (3, 4.0)],
                          dtype=[('', '<i8'), ('', '<f8')]),
                 np.arange(3, dtype=np.dtype('<i8')),
                 np.arange(3, dtype=np.dtype('<f8'))]

    # Verify the byteorder mismatch is correctly detected.
    for array in le_arrays:
        if sys.byteorder == 'little':
            assert not _is_numpy_array_byte_order_mismatch(array)
        else:
            assert _is_numpy_array_byte_order_mismatch(array)
        converted = _ensure_native_byte_order(array)
        if converted.dtype.fields:
            for f in converted.dtype.fields.values():
                assert f[0].byteorder == '='
        else:
            assert converted.dtype.byteorder == '='

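# A minimal sketch of the native-byte-order conversion checked above, using
# only public NumPy API (dtype.newbyteorder); NumPy canonicalizes
# native-endian dtypes so their byteorder reads '=' (helper name
# illustrative).
def example_native_byte_order_conversion():
    import numpy as np
    be = np.arange(3, dtype=np.dtype('>i8'))
    native = be.astype(be.dtype.newbyteorder('='))
    assert native.dtype.byteorder == '='
    np.testing.assert_array_equal(native, be)
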
def test_compressed_pickle_dump_and_load():
    expected_list = [np.arange(5, dtype=np.dtype('<i8')),
                     np.arange(5, dtype=np.dtype('>i8')),
                     np.arange(5, dtype=np.dtype('<f8')),
                     np.arange(5, dtype=np.dtype('>f8')),
                     np.array([1, 'abc', {'a': 1, 'b': 2}], dtype='O'),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     # np.matrix is a subclass of np.ndarray, here we want
                     # to verify this type of object is correctly unpickled
                     # across versions.
                     np.matrix([0, 1, 2], dtype=np.dtype('<i8')),
                     np.matrix([0, 1, 2], dtype=np.dtype('>i8')),
                     u"C'est l'\xe9t\xe9 !"]

    with tempfile.NamedTemporaryFile(suffix='.gz', dir=env['dir']) as f:
        fname = f.name

    try:
        dumped_filenames = numpy_pickle.dump(expected_list, fname,
                                             compress=1)
        nose.tools.assert_equal(len(dumped_filenames), 1)
        result_list = numpy_pickle.load(fname)
        for result, expected in zip(result_list, expected_list):
            if isinstance(expected, np.ndarray):
                nose.tools.assert_equal(result.dtype, expected.dtype)
                np.testing.assert_equal(result, expected)
            else:
                nose.tools.assert_equal(result, expected)
    finally:
        os.remove(fname)

def test_high_dimension_memmap_array_reducing(tmpdir):
    assert_array_equal = np.testing.assert_array_equal

    filename = tmpdir.join('test.mmap').strpath

    # Create a high dimensional memmap
    a = np.memmap(filename, dtype=np.float64, shape=(100, 15, 15, 3),
                  mode='w+')
    a[:] = np.arange(100 * 15 * 15 * 3).reshape(a.shape)

    # Create some slices/indices at various dimensions
    b = a[0:10]
    c = a[:, 5:10]
    d = a[:, :, :, 0]
    e = a[1:3:4]

    # Array reducer with auto dumping disabled
    reducer = ArrayMemmapForwardReducer(None, tmpdir.strpath, 'c', True)

    def reconstruct_array_or_memmap(x):
        cons, args = reducer(x)
        return cons(*args)

    a_reconstructed = reconstruct_array_or_memmap(a)
    assert has_shareable_memory(a_reconstructed)
    assert isinstance(a_reconstructed, np.memmap)
    assert_array_equal(a_reconstructed, a)

    b_reconstructed = reconstruct_array_or_memmap(b)
    assert has_shareable_memory(b_reconstructed)
    assert_array_equal(b_reconstructed, b)

    c_reconstructed = reconstruct_array_or_memmap(c)
    assert has_shareable_memory(c_reconstructed)
    assert_array_equal(c_reconstructed, c)

    d_reconstructed = reconstruct_array_or_memmap(d)
    assert has_shareable_memory(d_reconstructed)
    assert_array_equal(d_reconstructed, d)

    e_reconstructed = reconstruct_array_or_memmap(e)
    assert has_shareable_memory(e_reconstructed)
    assert_array_equal(e_reconstructed, e)

def test_pickle_in_socket():
    # test that joblib can pickle in sockets
    test_array = np.arange(10)
    _ADDR = ("localhost", 12345)
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(_ADDR)
    listener.listen(1)
    client = socket.create_connection(_ADDR)
    server, client_addr = listener.accept()

    with server.makefile("wb") as sf:
        numpy_pickle.dump(test_array, sf)
    with client.makefile("rb") as cf:
        array_reloaded = numpy_pickle.load(cf)

    np.testing.assert_array_equal(array_reloaded, test_array)

def test_high_dimension_memmap_array_reducing():
    assert_array_equal = np.testing.assert_array_equal

    filename = os.path.join(TEMP_FOLDER, 'test.mmap')

    # Create a high dimensional memmap
    a = np.memmap(filename, dtype=np.float64, shape=(100, 15, 15, 3),
                  mode='w+')
    a[:] = np.arange(100 * 15 * 15 * 3).reshape(a.shape)

    # Create some slices/indices at various dimensions
    b = a[0:10]
    c = a[:, 5:10]
    d = a[:, :, :, 0]
    e = a[1:3:4]

    def reconstruct_memmap(x):
        cons, args = reduce_memmap(x)
        res = cons(*args)
        return res

    a_reconstructed = reconstruct_memmap(a)
    assert_true(has_shareable_memory(a_reconstructed))
    assert_true(isinstance(a_reconstructed, np.memmap))
    assert_array_equal(a_reconstructed, a)

    b_reconstructed = reconstruct_memmap(b)
    assert_true(has_shareable_memory(b_reconstructed))
    assert_array_equal(b_reconstructed, b)

    c_reconstructed = reconstruct_memmap(c)
    assert_true(has_shareable_memory(c_reconstructed))
    assert_array_equal(c_reconstructed, c)

    d_reconstructed = reconstruct_memmap(d)
    assert_true(has_shareable_memory(d_reconstructed))
    assert_array_equal(d_reconstructed, d)

    e_reconstructed = reconstruct_memmap(e)
    assert_true(has_shareable_memory(e_reconstructed))
    assert_array_equal(e_reconstructed, e)

def test_high_dimension_memmap_array_reducing(tmpdir):
    assert_array_equal = np.testing.assert_array_equal

    filename = tmpdir.join('test.mmap').strpath

    # Create a high dimensional memmap
    a = np.memmap(filename, dtype=np.float64, shape=(100, 15, 15, 3),
                  mode='w+')
    a[:] = np.arange(100 * 15 * 15 * 3).reshape(a.shape)

    # Create some slices/indices at various dimensions
    b = a[0:10]
    c = a[:, 5:10]
    d = a[:, :, :, 0]
    e = a[1:3:4]

    def reconstruct_memmap(x):
        cons, args = reduce_memmap(x)
        res = cons(*args)
        return res

    a_reconstructed = reconstruct_memmap(a)
    assert has_shareable_memory(a_reconstructed)
    assert isinstance(a_reconstructed, np.memmap)
    assert_array_equal(a_reconstructed, a)

    b_reconstructed = reconstruct_memmap(b)
    assert has_shareable_memory(b_reconstructed)
    assert_array_equal(b_reconstructed, b)

    c_reconstructed = reconstruct_memmap(c)
    assert has_shareable_memory(c_reconstructed)
    assert_array_equal(c_reconstructed, c)

    d_reconstructed = reconstruct_memmap(d)
    assert has_shareable_memory(d_reconstructed)
    assert_array_equal(d_reconstructed, d)

    e_reconstructed = reconstruct_memmap(e)
    assert has_shareable_memory(e_reconstructed)
    assert_array_equal(e_reconstructed, e)

def test_pickle_in_socket():
    # test that joblib can pickle in sockets
    if not PY3_OR_LATER:
        raise SkipTest("Cannot peek or seek in socket in python 2.")
    test_array = np.arange(10)
    _ADDR = ("localhost", 12345)
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind(_ADDR)
    listener.listen(1)
    client = socket.create_connection(_ADDR)
    server, client_addr = listener.accept()

    with server.makefile("wb") as sf:
        numpy_pickle.dump(test_array, sf)
    with client.makefile("rb") as cf:
        array_reloaded = numpy_pickle.load(cf)

    np.testing.assert_array_equal(array_reloaded, test_array)

def test_workaround_against_bad_memmap_with_copied_buffers(tmpdir):
    """Check that memmaps with a bad buffer are returned as regular arrays

    Unary operations and ufuncs on memmap instances return a new memmap
    instance with an in-memory buffer (probably a numpy bug).
    """
    assert_array_equal = np.testing.assert_array_equal

    p = MemmapingPool(3, max_nbytes=10, temp_folder=tmpdir.strpath)
    try:
        # Send a complex, large-ish view on an array that will be
        # converted to a memmap in the worker process
        a = np.asarray(np.arange(6000).reshape((1000, 2, 3)),
                       order='F')[:, :1, :]

        # Call a non-inplace multiply operation on the worker and memmap
        # and send it back to the parent.
        b = p.apply_async(_worker_multiply, args=(a, 3)).get()
        assert not has_shareable_memory(b)
        assert_array_equal(b, 3 * a)
    finally:
        p.terminate()
        del p

def test_workaround_against_bad_memmap_with_copied_buffers(factory, tmpdir):
    """Check that memmaps with a bad buffer are returned as regular arrays

    Unary operations and ufuncs on memmap instances return a new memmap
    instance with an in-memory buffer (probably a numpy bug).
    """
    assert_array_equal = np.testing.assert_array_equal

    p = factory(3, max_nbytes=10, temp_folder=tmpdir.strpath)
    try:
        # Send a complex, large-ish view on an array that will be
        # converted to a memmap in the worker process
        a = np.asarray(np.arange(6000).reshape((1000, 2, 3)),
                       order='F')[:, :1, :]

        # Call a non-inplace multiply operation on the worker and memmap
        # and send it back to the parent.
        b = p.apply_async(_worker_multiply, args=(a, 3)).get()
        assert not has_shareable_memory(b)
        assert_array_equal(b, 3 * a)
    finally:
        p.terminate()
        del p

def __init__(self):
    self.array_float = np.arange(100, dtype='float64')
    self.array_int = np.ones(100, dtype='int32')
    self.array_obj = np.array(['a', 10, 20.0], dtype='object')

def test_numpy_datetime_array(dtype):
    # memoryview is not supported for some dtypes e.g. datetime64
    # see https://github.com/joblib/joblib/issues/188 for more details
    a_hash = hash(np.arange(10))
    array = np.arange(0, 10, dtype=dtype)
    assert hash(array) != a_hash

def test_memmap_based_array_reducing(tmpdir):
    """Check that it is possible to reduce a memmap backed array"""
    assert_array_equal = np.testing.assert_array_equal
    filename = tmpdir.join('test.mmap').strpath

    # Create a file larger than what will be used by a
    buffer = np.memmap(filename, dtype=np.float64, shape=500, mode='w+')

    # Fill the original buffer with negative markers to detect over- or
    # underflow in case of test failures
    buffer[:] = -1.0 * np.arange(buffer.shape[0], dtype=buffer.dtype)
    buffer.flush()

    # Memmap a 3D Fortran-ordered array onto an offset subsection of the
    # previous buffer
    a = np.memmap(filename, dtype=np.float64, shape=(3, 5, 4),
                  mode='r+', order='F', offset=4)
    a[:] = np.arange(60).reshape(a.shape)

    # Build various views that share the buffer with the original memmap

    # b is a sliced memmap view on a memmap instance
    b = a[1:-1, 2:-1, 2:4]

    # c and d are array views
    c = np.asarray(b)
    d = c.T

    # Array reducer with auto dumping disabled
    reducer = ArrayMemmapReducer(None, tmpdir.strpath, 'c')

    def reconstruct_array(x):
        cons, args = reducer(x)
        return cons(*args)

    def reconstruct_memmap(x):
        cons, args = reduce_memmap(x)
        return cons(*args)

    # Reconstruct original memmap
    a_reconstructed = reconstruct_memmap(a)
    assert has_shareable_memory(a_reconstructed)
    assert isinstance(a_reconstructed, np.memmap)
    assert_array_equal(a_reconstructed, a)

    # Reconstruct strided memmap view
    b_reconstructed = reconstruct_memmap(b)
    assert has_shareable_memory(b_reconstructed)
    assert_array_equal(b_reconstructed, b)

    # Reconstruct array views on memmap base
    c_reconstructed = reconstruct_array(c)
    assert not isinstance(c_reconstructed, np.memmap)
    assert has_shareable_memory(c_reconstructed)
    assert_array_equal(c_reconstructed, c)

    d_reconstructed = reconstruct_array(d)
    assert not isinstance(d_reconstructed, np.memmap)
    assert has_shareable_memory(d_reconstructed)
    assert_array_equal(d_reconstructed, d)

    # Test graceful degradation on fake memmap instances with in-memory
    # buffers
    a3 = a * 3
    assert not has_shareable_memory(a3)
    a3_reconstructed = reconstruct_memmap(a3)
    assert not has_shareable_memory(a3_reconstructed)
    assert not isinstance(a3_reconstructed, np.memmap)
    assert_array_equal(a3_reconstructed, a * 3)

    # Test graceful degradation on arrays derived from fake memmap
    # instances
    b3 = np.asarray(a3)
    assert not has_shareable_memory(b3)

    b3_reconstructed = reconstruct_array(b3)
    assert isinstance(b3_reconstructed, np.ndarray)
    assert not has_shareable_memory(b3_reconstructed)
    assert_array_equal(b3_reconstructed, b3)

def test_compressed_pickle_python_2_3_compatibility():
    expected_list = [np.arange(5, dtype=np.int64),
                     np.arange(5, dtype=np.float64),
                     # .tostring actually returns bytes and is a
                     # compatibility alias for .tobytes which was
                     # added in 1.9.0
                     np.arange(256, dtype=np.uint8).tostring(),
                     u"C'est l'\xe9t\xe9 !"]

    test_data_dir = os.path.dirname(os.path.abspath(data.__file__))
    # These files have been generated with the
    # joblib/test/data/create_numpy_pickle.py script for the relevant
    # python and joblib versions
    basenames = ['joblib_0.8.4_compressed_pickle_py27.gz',
                 'joblib_0.9.0_compressed_pickle_py27.gz',
                 'joblib_0.8.4_compressed_pickle_py33.gz',
                 'joblib_0.9.0_compressed_pickle_py33.gz',
                 'joblib_0.8.4_compressed_pickle_py34.gz',
                 'joblib_0.9.0_compressed_pickle_py34.gz']
    data_filenames = [os.path.join(test_data_dir, bname)
                      for bname in basenames]

    for fname in data_filenames:
        version_match = re.match(r'.+py(\d)(\d).gz', fname)
        py_version_used_for_writing = tuple(
            [int(each) for each in version_match.groups()])
        py_version_used_for_reading = sys.version_info[:2]

        # Use pickle protocol 4 for Python 3.4 and later
        py_version_to_default_pickle_protocol = {
            (2, 6): 2, (2, 7): 2,
            (3, 0): 3, (3, 1): 3, (3, 2): 3, (3, 3): 3}
        pickle_reading_protocol = py_version_to_default_pickle_protocol.get(
            py_version_used_for_reading, 4)
        pickle_writing_protocol = py_version_to_default_pickle_protocol.get(
            py_version_used_for_writing, 4)
        if ('0.8.4' not in fname or
                pickle_reading_protocol >= pickle_writing_protocol):
            result_list = numpy_pickle.load(fname)
            for result, expected in zip(result_list, expected_list):
                if isinstance(expected, np.ndarray):
                    nose.tools.assert_equal(result.dtype, expected.dtype)
                    np.testing.assert_equal(result, expected)
                else:
                    nose.tools.assert_equal(result, expected)
        else:
            # For joblib <= 0.8.4, compressed pickles written with
            # python `version = v` cannot be read by python with
            # `version < v` because of differences in the default
            # pickle protocol (2 for python 2, 3 for python 3.3 and 4
            # for python 3.4)
            try:
                numpy_pickle.load(fname)
                raise AssertionError('Numpy pickle loading should '
                                     'have raised a ValueError exception')
            except ValueError as e:
                nose.tools.assert_true(
                    'unsupported pickle protocol' in str(e.args))

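# For reference, the protocol table above mirrors the highest pickle
# protocol available to each interpreter at the time (2 on Python 2.7, 3 on
# 3.0-3.3, 4 on 3.4-3.7; Python 3.8+ adds protocol 5), which is presumably
# what old joblib releases used when writing these fixtures; the running
# interpreter's values can be inspected directly (helper name illustrative).
def example_inspect_pickle_protocols():
    import pickle
    import sys
    print(sys.version_info[:2], pickle.HIGHEST_PROTOCOL,
          pickle.DEFAULT_PROTOCOL)
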