def pytest_runtest_setup(item):
    """Set the number of openmp threads based on the number of workers
    xdist is using to prevent oversubscription.

    Parameters
    ----------
    item : pytest item
        item to be processed
    """
    xdist_worker_count = environ.get("PYTEST_XDIST_WORKER_COUNT")
    if xdist_worker_count is None:
        # returns if pytest-xdist is not installed
        return
    else:
        xdist_worker_count = int(xdist_worker_count)

    openmp_threads = _openmp_effective_n_threads()
    threads_per_worker = max(openmp_threads // xdist_worker_count, 1)
    threadpool_limits(threads_per_worker, user_api="openmp")
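# Illustration (not part of the hook above, added for clarity): with, say, 8
# effective OpenMP threads and `pytest -n 4` (so PYTEST_XDIST_WORKER_COUNT=4),
# each xdist worker ends up capped at 8 // 4 = 2 OpenMP threads, so the workers
# together do not oversubscribe the machine. The cap actually applied can be
# inspected at runtime with threadpool_info():
from threadpoolctl import threadpool_info

for lib in threadpool_info():
    if lib["user_api"] == "openmp":
        print(lib["prefix"], lib["num_threads"])  # e.g. "libgomp 2" under `pytest -n 4`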
def _init(self):
    log.info('Initializing envs for env runner %d...', self.worker_idx)

    if self.cfg.force_envs_single_thread:
        from threadpoolctl import threadpool_limits
        threadpool_limits(limits=1, user_api=None)

    if self.cfg.set_workers_cpu_affinity:
        set_process_cpu_affinity(self.worker_idx, self.cfg.num_workers)
    psutil.Process().nice(min(self.cfg.default_niceness + 10, 20))

    self.env_runners = []
    for split_idx in range(self.num_splits):
        env_runner = VectorEnvRunner(
            self.cfg, self.vector_size // self.num_splits, self.worker_idx, split_idx,
            self.num_agents, self.shared_buffers, self.reward_shaping,
        )
        env_runner.init()
        self.env_runners.append(env_runner)
def test_openmp_limit_num_threads(num_threads):
    # checks that OpenMP effectively uses the number of threads requested by
    # the context manager
    from ._openmp_test_helper import check_openmp_num_threads

    old_num_threads = check_openmp_num_threads(100)

    with threadpool_limits(limits=num_threads):
        assert check_openmp_num_threads(100) in (num_threads, old_num_threads)
    assert check_openmp_num_threads(100) == old_num_threads
def test_openmp_nesting(nthreads_outer):
    # checks that OpenMP effectively uses the number of threads requested by
    # the context manager
    from ._openmp_test_helper import check_nested_openmp_loops
    from ._openmp_test_helper import get_inner_compiler
    from ._openmp_test_helper import get_outer_compiler

    inner_cc = get_inner_compiler()
    outer_cc = get_outer_compiler()

    outer_num_threads, inner_num_threads = check_nested_openmp_loops(10)

    original_infos = threadpool_info()
    openmp_infos = [info for info in original_infos
                    if info["user_api"] == "openmp"]

    if "gcc" in (inner_cc, outer_cc):
        assert "libgomp" in [info["prefix"] for info in openmp_infos]

    if "clang" in (inner_cc, outer_cc):
        assert "libomp" in [info["prefix"] for info in openmp_infos]

    if inner_cc == outer_cc:
        # The OpenMP runtime should be shared by default, meaning that the
        # inner loop should automatically be run serially by the OpenMP
        # runtime.
        assert inner_num_threads == 1
    else:
        # There should be at least 2 OpenMP runtimes detected.
        assert len(openmp_infos) >= 2

    with threadpool_limits(limits=1) as threadpoolctx:
        max_threads = threadpoolctx.get_original_num_threads()['openmp']
        nthreads = effective_num_threads(nthreads_outer, max_threads)

        outer_num_threads, inner_num_threads = \
            check_nested_openmp_loops(10, nthreads)

    # The original state of all threadpools should have been restored.
    assert threadpool_info() == original_infos

    # The number of threads available in the outer loop should not have been
    # decreased:
    assert outer_num_threads == nthreads

    # The number of threads available in the inner loop should have been set
    # to 1 to avoid oversubscription and preserve performance:
    if inner_cc != outer_cc:
        if inner_num_threads != 1:
            # XXX: this does not always work when nesting independent OpenMP
            # implementations. See: https://github.com/jeremiedbb/Nested_OpenMP
            pytest.xfail("Inner OpenMP num threads was %d instead of 1"
                         % inner_num_threads)
    assert inner_num_threads == 1
def optimize_rbf(self, cvs, max_samples=3000):
    X = cvs[0][0]
    y = cvs[0][1].reshape(-1, 1)
    X_val = cvs[0][2]
    y_val = cvs[0][3].reshape(-1, 1)
    X_test = cvs[0][4]
    y_test = cvs[0][5].reshape(-1, 1)

    if X.shape[0] > max_samples:
        ind = np.random.randint(low=0, high=X.shape[0], size=max_samples)
        X = X[ind]
        y = y[ind]

    if self.GA:
        with threadpool_limits(limits=1):
            self.optimize_with_deep(X, y, X_val, y_val, X_test, y_test, self.rated)
    else:
        with threadpool_limits(limits=1):
            self.optimize_common_width(X, y, X_val, y_val, X_test, y_test, self.rated)

    return self.to_dict()
def _correlation_alignment(s: daskarr, t: daskarr, nthreads: int) -> daskarr:
    from scipy.linalg import fractional_matrix_power as fmp
    from threadpoolctl import threadpool_limits

    s_cov = show_progress(_cov_diaged(s), "CORAL: Computing source covariance", nthreads)
    t_cov = show_progress(_cov_diaged(t), "CORAL: Computing target covariance", nthreads)
    logger.info("Calculating fractional power of covariance matrices. This might take a while... ")
    with threadpool_limits(limits=nthreads):
        a_coral = np.dot(fmp(s_cov, -0.5), fmp(t_cov, 0.5))
    logger.info("Fractional power calculation complete")
    return daskarr.dot(s, a_coral)
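# Minimal standalone sketch of the same CORAL recoloring step on dense NumPy
# arrays. The toy data and the 1e-4 diagonal jitter are assumptions made for
# this illustration; the function above operates on dask arrays via _cov_diaged.
import numpy as np
from scipy.linalg import fractional_matrix_power as fmp
from threadpoolctl import threadpool_limits

rng = np.random.default_rng(0)
s = rng.normal(size=(500, 20))   # source features
t = rng.normal(size=(600, 20))   # target features
s_cov = np.cov(s, rowvar=False) + 1e-4 * np.eye(20)  # diagonal jitter keeps the matrix well-conditioned
t_cov = np.cov(t, rowvar=False) + 1e-4 * np.eye(20)
with threadpool_limits(limits=4):  # cap BLAS/LAPACK threads during the matrix power computations
    a_coral = np.dot(fmp(s_cov, -0.5), fmp(t_cov, 0.5))
s_aligned = s @ a_coral  # source data re-colored to match the target covariance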
def compute_all_runtimes():
    # limit multi-threading; NO parallelization
    threadpoolctl.threadpool_limits(limits=1)

    # output
    fn = OUTDIR / ('cached.csv' if USE_CACHED else 'uncached.csv')
    fn.parent.mkdir(exist_ok=True, parents=True)
    cols = ['parcellation', 'scale', 'spatnull', 'runtime']
    data = pd.read_csv(fn).to_dict('records') if fn.exists() else []

    for spatnull in simnulls.SPATNULLS:
        for parc, scale in PARCS:
            if parc == "vertex" and spatnull not in simnulls.VERTEXWISE:
                continue
            for repeat in range(N_REPEAT):
                if output_exists(data, parc, scale, spatnull, repeat):
                    continue
                data.append(get_runtime(parc, scale, spatnull))
                pd.DataFrame(data)[cols].to_csv(fn, index=False)

    return fn
def test_shipped_openblas():
    all_openblases = [ctypes.CDLL(path) for path in libopenblas_paths]
    original_num_threads = [blas.openblas_get_num_threads() for blas in all_openblases]

    with threadpool_limits(1):
        for openblas in all_openblases:
            assert openblas.openblas_get_num_threads() == 1

    assert original_num_threads == [openblas.openblas_get_num_threads()
                                    for openblas in all_openblases]
def _wrapper(*args):
    with threadpool_limits(limits=1, user_api='blas'):
        # writes results of statistical_inefficiency to destination memmap (array_fn)
        seqs, truncate_acf, mact, I, J, array_fn, start, stop = args[0]
        array = np.memmap(array_fn, mode='r+', dtype=np.float64)
        partial = np.empty(len(I))
        for n, (i, j) in enumerate(zip(I, J)):
            s = _indicator_multitraj(seqs, i, j)
            partial[n] = statistical_inefficiency(s, truncate_acf=truncate_acf, mact=mact)
        array[start:stop] = partial
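# Sketch of how a worker like _wrapper might be dispatched (the pool setup is
# not part of the original snippet; the names below are illustrative). Each
# process fills its own [start, stop) slice of the shared memmap, and the
# limits=1 BLAS cap inside _wrapper keeps the worker processes from
# oversubscribing the available cores.
import multiprocessing as mp

def _run_all(jobs, n_workers=4):
    # jobs: iterable of (seqs, truncate_acf, mact, I, J, array_fn, start, stop) tuples
    with mp.Pool(n_workers) as pool:
        pool.map(_wrapper, jobs)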
def test_shipped_openblas():
    # checks that OpenBLAS effectively uses the number of threads requested by
    # the context manager
    original_info = ThreadpoolController().info()
    openblas_controller = ThreadpoolController().select(internal_api="openblas")

    with threadpool_limits(1):
        for lib_controller in openblas_controller.lib_controllers:
            assert lib_controller.num_threads == 1

    assert ThreadpoolController().info() == original_info
def test_get_original_num_threads(limit):
    with threadpool_limits(limits=2, user_api='blas') as ctl:
        # set different blas num threads to start with (when multiple openblas)
        if ctl._original_limits:
            ctl._original_limits[0]['set_num_threads'](1)

        original_infos = threadpool_info()

        with threadpool_limits(limits=limit, user_api='blas') as threadpoolctx:
            original_num_threads = threadpoolctx.get_original_num_threads()

            assert 'openmp' not in original_num_threads

            if 'blas' in [module['user_api'] for module in original_infos]:
                assert original_num_threads['blas'] >= 1
            else:
                assert original_num_threads['blas'] is None

            if len(libopenblas_paths) >= 2:
                with pytest.warns(None, match='Multiple value possible'):
                    expected = min([module['num_threads']
                                    for module in original_infos])
                    assert original_num_threads['blas'] == expected
def test_shipped_openblas():
    # checks that OpenBLAS effectively uses the number of threads requested by
    # the context manager
    original_info = _threadpool_info()
    openblas_modules = original_info.get_modules("internal_api", "openblas")

    with threadpool_limits(1):
        for module in openblas_modules:
            assert module.get_num_threads() == 1

    assert original_info == _threadpool_info()
def create_pool(self, ncores=None, threadpool_limit=None):
    """Creates a reusable pool

    Parameters
    ----------
    ncores : int, optional
        Number of cores. Defaults to pathos' default, which is the number of cores.
    threadpool_limit : int, optional
        Number of threads that numpy uses independently of pathos. Only used if
        `threadpoolctl` is installed. Defaults to one.
    """
    import pathos

    if hasattr(self, "pool"):
        ncores = ncores or self.pool.ncpus
        self.pool.close()
    else:
        # pools should carry their count with them
        ncores = ncores or pathos.multiprocessing.cpu_count()

    if threadpool_limit:
        self.threadpool_limit = threadpool_limit
    elif not hasattr(self, "threadpool_limit"):
        self.threadpool_limit = 1

    try:
        from threadpoolctl import threadpool_limits
        threadpool_limits(limits=self.threadpool_limit)
    except ImportError:
        print(
            "[create_pool:]".ljust(15, " ")
            + " Could not import package `threadpoolctl` to limit numpy multithreading. "
            "This might reduce multiprocessing performance."
        )

    self.pool = pathos.pools.ProcessPool(ncores)
    self.pool.clear()

    return self.pool
def apply_affine_transform(x, seg=None, transform_matrix=None, crop_shape=None,
                           fill_mode='nearest', cval=0., order=1):
    """Applies an affine transformation specified by the parameters given.

    # Arguments
        x: 4D numpy array, single image, multimodalities (Modality*H*W*D)
        fill_mode: Points outside the boundaries of the input are filled
            according to the given mode
            (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
        cval: Value used for points outside the boundaries of the input
            if `mode='constant'`.
        order: int, order of interpolation

    # Returns
        The transformed version of the input.
    """
    if scipy is None:
        raise ImportError('Image transformations require SciPy. '
                          'Install SciPy.')

    if transform_matrix is not None:
        channels, h, w, d = x.shape
        transform_matrix = transform_matrix_offset_center(transform_matrix, h, w, d)
        coords = create_coordinate_mesh(x.shape[1:], crop_shape)

        # Multiplication between coords and transform matrix
        with threadpool_limits(limits=1, user_api='blas'):
            trf_coords = coords.reshape(coords.shape[0], -1)
            trf_coords = np.matmul(transform_matrix, trf_coords)
            trf_coords = trf_coords.reshape(*coords.shape)
        trf_coords = trf_coords[:-1, :, :, :]

        # Interpolation
        res = [scipy.ndimage.map_coordinates(x[channel, ...], trf_coords,
                                             order=order, mode=fill_mode, cval=cval)
               for channel in range(channels)]
        x = np.stack(res, axis=0)
        x[x < 1e-3] = 0

        if seg is not None:
            labels = seg.shape[0]
            res = [scipy.ndimage.map_coordinates(seg[label, ...], trf_coords,
                                                 order=order, mode=fill_mode, cval=cval)
                   for label in range(labels)]
            seg = np.stack(res, axis=0)
            seg[seg > 0.5] = 1
            seg[seg < 0.5] = 0

    return x, seg
def net_train_and_predict(X_train, y_train, X_pred, alpha, n_jobs, random_state, verbose=False):
    start_time = time.perf_counter()

    scaler_x = MaxStdScaler()
    X_train = scaler_x.fit_transform(X_train)
    scaler_y = MaxStdScaler(factor=15.0)
    y_train = scaler_y.fit_transform(y_train)

    regressor = MLPRegressor(
        hidden_layer_sizes=(100, 75, 50, 25),
        activation="relu",
        solver="sgd",
        learning_rate="adaptive",
        alpha=alpha,
        random_state=random_state,
    )

    with threadpool_limits(limits=n_jobs):
        regressor.fit(X_train, y_train)
    logger.info(regressor.loss_)

    with threadpool_limits(limits=n_jobs):
        y_pred = scaler_y.inverse_transform(
            regressor.predict(scaler_x.transform(X_pred)), copy=False)

    end_time = time.perf_counter()
    if verbose:
        logger.info(
            "Deep regressor training and predicting finished. Time spent = {:.2f}s."
            .format(end_time - start_time))

    return y_pred
def test_multiple_shipped_openblas():
    libopenblas = [ctypes.CDLL(path) for path in libopenblas_paths]
    old_limits = [blas.openblas_get_num_threads() for blas in libopenblas]

    with threadpool_limits(1):
        assert all([blas.openblas_get_num_threads() == 1 for blas in libopenblas])

    assert all([blas.openblas_get_num_threads() == l
                for blas, l in zip(libopenblas, old_limits)])
def test_get_original_num_threads(limit):
    # Tests the method get_original_num_threads of the context manager
    with threadpool_limits(limits=2, user_api="blas") as ctl:
        # set different blas num threads to start with (when multiple openblas)
        if ctl._original_info:
            ctl._original_info.modules[0].set_num_threads(1)

        original_info = _threadpool_info()

        with threadpool_limits(limits=limit, user_api="blas") as threadpoolctx:
            original_num_threads = threadpoolctx.get_original_num_threads()

            assert "openmp" not in original_num_threads

            blas_info = original_info.get_modules("user_api", "blas")
            if blas_info:
                expected = min(module.num_threads for module in blas_info)
                assert original_num_threads["blas"] == expected
            else:
                assert original_num_threads["blas"] is None

            if len(libopenblas_paths) >= 2:
                with pytest.warns(None, match="Multiple value possible"):
                    threadpoolctx.get_original_num_threads()
def _generate_impl_worker(args):
    with threadpool_limits(limits=1, user_api='blas'):
        i, Xi, L0, U0, c, eps, k = args

        T0 = 0    # start time
        T1 = 40   # end time
        nT = 401  # number of time points
        T = np.linspace(T0, T1, nT)  # time points

        sol = solve_ivp(_rhs, [T0, T1], Xi, t_eval=T, args=(L0, U0, c, eps, k))
        sol.y[0, :] = np.mod(sol.y[0, :], 20)  # the domain is periodic in x-direction

    return i, sol.y
def _init(self):
    """
    Initialize env runners, which actually do all the work. We also do some utility work here,
    e.g. setting process affinity (a performance optimization).
    """
    log.info('Initializing envs for env runner %d...', self.worker_idx)

    if self.cfg.force_envs_single_thread:
        from threadpoolctl import threadpool_limits
        threadpool_limits(limits=1, user_api=None)

    if self.cfg.set_workers_cpu_affinity:
        set_process_cpu_affinity(self.worker_idx, self.cfg.num_workers)
    psutil.Process().nice(min(self.cfg.default_niceness + 10, 20))

    self.env_runners = []
    for split_idx in range(self.num_splits):
        env_runner = VectorEnvRunner(
            self.cfg, self.vector_size // self.num_splits, self.worker_idx, split_idx,
            self.num_agents, self.shared_buffers, self.reward_shaping,
        )
        env_runner.init()
        self.env_runners.append(env_runner)
def test_get_original_num_threads(limit):
    # Tests the method get_original_num_threads of the context manager
    with threadpool_limits(limits=2, user_api="blas") as ctx:
        # set different blas num threads to start with (when multiple openblas)
        if len(ctx._controller.select(user_api="blas")) > 1:
            ctx._controller.lib_controllers[0].set_num_threads(1)

        original_info = ThreadpoolController().info()

        with threadpool_limits(limits=limit, user_api="blas") as threadpoolctx:
            original_num_threads = threadpoolctx.get_original_num_threads()

            assert "openmp" not in original_num_threads

            blas_info = select(original_info, user_api="blas")
            if blas_info:
                expected = min(lib_info["num_threads"] for lib_info in blas_info)
                assert original_num_threads["blas"] == expected
            else:
                assert original_num_threads["blas"] is None

            if len(libopenblas_paths) >= 2:
                with pytest.warns(None, match="Multiple value possible"):
                    threadpoolctx.get_original_num_threads()
def __init__(self, num_threads=1):
    # Get the current number of threads here, so we can set it back when we're done.
    from ..utilities import get_omp_threads
    self.orig_num_threads = get_omp_threads()
    self.temp_num_threads = num_threads

    # If threadpoolctl is installed, use that too, since it will also set BLAS libraries
    # to be single threaded. This makes it so you don't need to set the environment
    # variables OPENBLAS_NUM_THREADS=1 or MKL_NUM_THREADS=1, etc.
    try:
        import threadpoolctl
    except ImportError:
        self.tpl = None
    else:  # pragma: no cover  (Not installed on Travis currently.)
        self.tpl = threadpoolctl.threadpool_limits(num_threads)
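# Sketch of the matching enter/exit methods, which are not part of the snippet
# above. It assumes a set_omp_threads counterpart to get_omp_threads exists in
# the same utilities module, and relies on restore_original_limits(), which the
# object returned by threadpool_limits provides (see the manual-restore test
# further down).
def __enter__(self):
    from ..utilities import set_omp_threads  # assumed counterpart to get_omp_threads
    set_omp_threads(self.temp_num_threads)
    return self

def __exit__(self, exc_type, exc_value, traceback):
    from ..utilities import set_omp_threads  # assumed counterpart to get_omp_threads
    set_omp_threads(self.orig_num_threads)
    if self.tpl is not None:
        self.tpl.restore_original_limits()  # undo the BLAS limits applied in __init__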
def calculate_diffusion_map(
    W: csr_matrix,
    n_components: int,
    solver: str,
    max_t: int,
    n_jobs: int,
    random_state: int,
) -> Tuple[np.array, np.array, np.array]:
    assert issparse(W)

    nc, labels = connected_components(W, directed=True, connection="strong")
    logger.info("Calculating connected components is done.")

    assert nc == 1

    W_norm, diag, diag_half = calculate_normalized_affinity(
        W.astype(np.float64)
    )  # use double precision to guarantee reproducibility
    logger.info("Calculating normalized affinity matrix is done.")

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        if solver == "eigsh":
            np.random.seed(random_state)
            v0 = np.random.uniform(-1.0, 1.0, W_norm.shape[0])
            Lambda, U = eigsh(W_norm, k=n_components, v0=v0)
            Lambda = Lambda[::-1]
            U = U[:, ::-1]
        else:
            assert solver == "randomized"
            U, S, VT = randomized_svd(
                W_norm, n_components=n_components, random_state=random_state
            )
            signs = np.sign((U * VT.transpose()).sum(axis=0))  # get eigenvalue signs
            Lambda = signs * S  # get eigenvalues

    # remove the first eigen value and vector
    Lambda = Lambda[1:]
    U = U[:, 1:]
    Phi = U / diag_half[:, np.newaxis]

    if max_t == -1:
        Lambda_new = Lambda / (1.0 - Lambda)
    else:
        # Find the knee point
        x = np.array(range(1, max_t + 1), dtype=float)
        y = np.array([calc_von_neumann_entropy(Lambda, t) for t in x])
        t = x[find_knee_point(x, y)]
        logger.info("Detected knee point at t = {:.0f}.".format(t))

        # U_df = U * Lambda  # symmetric diffusion component
        Lambda_new = Lambda * ((1.0 - Lambda ** t) / (1.0 - Lambda))
    Phi_pt = Phi * Lambda_new  # asym pseudo component

    return Phi_pt, Lambda, Phi  # , U_df, W_norm
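# Note on the eigenvalue weighting above (added for clarity, not in the original):
# Lambda * (1 - Lambda**t) / (1 - Lambda) is the closed form of the geometric sum
# Lambda + Lambda**2 + ... + Lambda**t, i.e. the spectrum of the diffusion operator
# accumulated over t steps, while Lambda / (1 - Lambda) in the max_t == -1 branch is
# its t -> infinity limit for |Lambda| < 1.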
def _fit_kmeans(self, do_ann_fit):
    from sklearn.cluster import MiniBatchKMeans

    if do_ann_fit is False:
        return None

    kmeans = MiniBatchKMeans(n_clusters=self.nClusters, random_state=self.randState,
                             batch_size=self.batchSize)
    with threadpool_limits(limits=self.nthreads):
        for i in self.iter_blocks(msg='Fitting kmeans'):
            kmeans.partial_fit(self.reducer(i))

        temp = []
        for i in self.iter_blocks(msg='Estimating seed partitions'):
            temp.extend(kmeans.predict(self.reducer(i)))
        self.clusterLabels = np.array(temp)

    return kmeans
def solve(self, t=None):
    if self.t0 > T0_UPPER_BOUND:
        self.t0 = 0

    dt = self.getParam('dt')
    if t is None:
        t = self.t0 + np.linspace(0, SOLVE_EVERY_TI * dt, SOLVE_EVERY_TI + 1)  # change this
    else:
        t += self.t0
    # print(t, dt, self.initCond)

    with threadpool_limits(limits=1):
        if self.dim == 1 or self.dim == 0:
            self.sol = self.solver.solve(self.wrhs, (t[0], t[-1]), self.initCond, t)
        elif self.dim == 2:
            self.sol = self.solver.solve(self.wrhs, (t[0], t[-1]),
                                         self.initCond.reshape(self.N * self.N), t)

    self.t0 = t[-1]
def test_threadpool_limits_by_prefix(prefix, limit):
    # Check that the maximum number of threads can be set by prefix
    original_info = _threadpool_info()

    modules_matching_prefix = original_info.get_modules("prefix", prefix)
    if not modules_matching_prefix:
        pytest.skip("Requires {} runtime".format(prefix))

    with threadpool_limits(limits={prefix: limit}):
        for module in modules_matching_prefix:
            if is_old_openblas(module):
                continue
            # threadpool_limits only sets an upper bound on the number of
            # threads.
            assert 0 < module.get_num_threads() <= limit

    assert _threadpool_info() == original_info
def test_threadpool_limits_by_prefix(prefix, limit):
    # Check that the maximum number of threads can be set by prefix
    controller = ThreadpoolController()
    original_info = controller.info()

    controller_matching_prefix = controller.select(prefix=prefix)
    if not controller_matching_prefix:
        pytest.skip(f"Requires {prefix} runtime")

    with threadpool_limits(limits={prefix: limit}):
        for lib_controller in controller_matching_prefix.lib_controllers:
            if is_old_openblas(lib_controller):
                continue
            # threadpool_limits only sets an upper bound on the number of
            # threads.
            assert 0 < lib_controller.num_threads <= limit

    assert ThreadpoolController().info() == original_info
def main():
    with contextlib.ExitStack() as stack:
        # Limit numpy/blas etc threads to 1, as we obtain
        # our parallelism with dask threads
        stack.enter_context(threadpool_limits(limits=1))

        args = create_parser().parse_args()

        # Configure dask pool
        if args.nworkers <= 1:
            log.warning("Entering single threaded mode per user request!")
            dask.config.set(scheduler='single-threaded')
        else:
            stack.enter_context(dask.config.set(pool=ThreadPool(args.nworkers)))

        _main(args)
def _start(self):
    if not self.was_initialized:
        self._finish()
        self.abort_event.clear()

        logging.debug("starting workers")
        self._queue_ctr = 0
        self._end_ctr = 0

        if hasattr(self.generator, 'was_initialized'):
            self.generator.was_initialized = False

        with threadpool_limits(limits=1, user_api="blas"):
            for i in range(self.num_processes):
                self._queues.append(Queue(self.num_cached_per_queue))
                self._processes.append(Process(target=producer, args=(
                    self._queues[i], self.generator, self.transform, i, self.seeds[i],
                    self.abort_event)))
                self._processes[-1].daemon = True
                self._processes[-1].start()

        if torch is not None and torch.cuda.is_available():
            gpu = torch.cuda.current_device()
        else:
            gpu = None

        # more caching = more performance. But don't cache too much or your RAM will hate you
        self.pin_memory_queue = thrQueue(max(3, self.num_cached_per_queue * self.num_processes // 2))

        self.pin_memory_thread = threading.Thread(
            target=results_loop,
            args=(self._queues, self.pin_memory_queue, self.abort_event,
                  self.pin_memory, gpu, self.wait_time, self._processes))
        self.pin_memory_thread.daemon = True
        self.pin_memory_thread.start()

        self.was_initialized = True
    else:
        logging.debug("MultiThreadedGenerator Warning: start() has been called but it has already been "
                      "initialized previously")
def main():
    ### Select non-interactive backend
    # matplotlib.use('Agg') would not work here, due to import order:
    # the console_scripts entry point design means that 'riptide' is always imported first,
    # importing everything else in riptide's __init__.py, which ends up setting the backend
    # before the first line of this script is reached.
    # Another alternative is to call the pipeline command with the MPLBACKEND=Agg prefix.
    # NOTE: We don't do this at the top of the script, in case someone wants
    # to import the Pipeline class without switching backends
    import matplotlib.pyplot as plt
    plt.switch_backend('Agg')

    # NOTE (IMPORTANT): Force all numpy libraries to use a single thread/CPU.
    # Each DM trial is assigned to a different process, and for optimal
    # performance, each process should be limited to 1 CPU
    with threadpoolctl.threadpool_limits(limits=1):
        run_program()
def test_threadpool_limits_manual_restore():
    # Check that threadpool_limits can be used as an object which holds the
    # original state of the threadpools and that can be restored thanks to the
    # dedicated restore_original_limits method
    original_info = ThreadpoolController().info()

    limits = threadpool_limits(limits=1)
    try:
        for lib_controller in ThreadpoolController().lib_controllers:
            if is_old_openblas(lib_controller):
                continue
            assert lib_controller.num_threads == 1
    finally:
        # Restore the original limits so that this test does not have any
        # side-effect.
        limits.restore_original_limits()

    assert ThreadpoolController().info() == original_info
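# Usage sketch of the manual-restore pattern exercised by the test above
# (the work inside the try block is a placeholder):
from threadpoolctl import threadpool_limits

limits = threadpool_limits(limits=1)  # limits take effect immediately on construction
try:
    ...  # run the single-threaded BLAS/OpenMP workload here
finally:
    limits.restore_original_limits()  # put every thread pool back the way it was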