def test_n_threads_agnosticism(
    PairwiseDistancesReduction,
    seed,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are invariant w.r.t. the number of OpenMP threads."""
    # Results should not depend on the number of threads
    random_state = np.random.RandomState(seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
    else:
        # Scaling the radius slightly with the numbers of dimensions
        parameter = 10 ** np.log(n_features)

    # Reference run with the default number of threads.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        return_distance=True,
    )

    # Re-run with OpenMP restricted to a single thread; results must match.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = PairwiseDistancesReduction.compute(
            X, Y, parameter, return_distance=True
        )

    ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)
def test_strategies_consistency(
    PairwiseDistancesReduction,
    metric,
    n_samples,
    seed,
    n_features=10,
    dtype=np.float64,
):
    """Check that the 'parallel_on_X' and 'parallel_on_Y' strategies agree."""
    random_state = np.random.RandomState(seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
    else:
        # Scaling the radius slightly with the numbers of dimensions
        parameter = 10 ** np.log(n_features)

    # Use a small chunk size relative to n_samples to be sure to use
    # parallelization in both runs.
    dist_par_X, indices_par_X = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0],
        # To be sure to use parallelization
        chunk_size=n_samples // 4,
        strategy="parallel_on_X",
        return_distance=True,
    )

    dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0],
        # To be sure to use parallelization
        chunk_size=n_samples // 4,
        strategy="parallel_on_Y",
        return_distance=True,
    )

    ASSERT_RESULT[PairwiseDistancesReduction](
        dist_par_X,
        dist_par_Y,
        indices_par_X,
        indices_par_Y,
    )
def test_memmap_backed_data(
    metric,
    PairwiseDistancesReduction,
    n_samples=512,
    n_features=100,
    dtype=np.float64,
):
    """Check that read-only (memmap-backed) datasets give the same results."""
    # Results must not depend on the datasets writability
    random_state = np.random.RandomState(0)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    # Create read only datasets
    X_mm, Y_mm = create_memmap_backed_data([X, Y])

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    # Reference run on the writable arrays.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        metric=metric,
        return_distance=True,
    )

    # Same computation on the memmap-backed, read-only arrays.
    dist_mm, indices_mm = PairwiseDistancesReduction.compute(
        X_mm,
        Y_mm,
        parameter,
        metric=metric,
        return_distance=True,
    )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
        ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters
    )
def test_chunk_size_agnosticism(
    global_random_seed,
    PairwiseDistancesReduction,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are invariant w.r.t. the chunk size."""
    # Results must not depend on the chunk size
    random_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    # Reference run with the default chunk size.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        metric="manhattan",
        return_distance=True,
    )

    # Re-run with an explicit chunk size; results must match.
    dist, indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        chunk_size=chunk_size,
        metric="manhattan",
        return_distance=True,
    )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
def test_n_threads_agnosticism(
    global_random_seed,
    PairwiseDistancesReduction,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are invariant w.r.t. the number of OpenMP threads."""
    # Results must not depend on the number of threads
    random_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    # Reference run with the default number of threads.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        return_distance=True,
    )

    # Re-run with OpenMP restricted to a single thread; results must match.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = PairwiseDistancesReduction.compute(
            X, Y, parameter, return_distance=True
        )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
def test_chunk_size_agnosticism(
    global_random_seed,
    PairwiseDistancesReduction,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are invariant w.r.t. the chunk size."""
    # Results should not depend on the chunk size
    random_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
    else:
        # Scaling the radius slightly with the numbers of dimensions
        parameter = 10 ** np.log(n_features)

    # Reference run with the default chunk size.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        return_distance=True,
    )

    # Re-run with an explicit chunk size; results must match.
    dist, indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        chunk_size=chunk_size,
        return_distance=True,
    )

    ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices)