def _compute_combination(gen):
    # materialise the generator so it can be iterated twice below
    pairs = list(gen)
    # handle whether tqdm is installed or not.
    if is_tqdm_installed():
        from tqdm import tqdm
        _generator = tqdm(pairs, position=0)
    else:
        _generator = pairs
    F = [_ratio_and_distance(a, b, True) for a, b in _generator]
    res = pd.DataFrame(pairs, columns=["x", "y"])
    res["L"] = F
    return res
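
# `is_tqdm_installed` is assumed throughout this module but is not defined in
# this section; a minimal sketch (hypothetical implementation) would probe for
# the package with a guarded import.
def _is_tqdm_installed_sketch():
    try:
        import tqdm  # noqa: F401
        return True
    except ImportError:
        return False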
def _map_comp(f, *args):
    # with no arguments there is nothing to iterate over; just call f.
    N = len(args)
    if N == 0:
        return f()
    narg = len(args[0])
    # handle whether tqdm is installed or not.
    if is_tqdm_installed():
        from tqdm import tqdm
        _gen = (tqdm(args[0], position=0, total=narg) if N == 1 else
                tqdm(it.zip_longest(*args), position=0, total=narg))
    else:
        _gen = args[0] if N == 1 else it.zip_longest(*args)
    # a single iterable maps directly; several iterables are zipped and unpacked into f
    if N == 1:
        return [f(arg) for arg in _gen]
    else:
        return [f(*arg) for arg in _gen]
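
# usage sketch for _map_comp (hypothetical values): a single iterable maps
# directly, while several iterables are zipped and unpacked into f.
#   _map_comp(abs, [-1, -2])               # -> [1, 2]
#   _map_comp(pow, [1, 2, 3], [2, 2, 2])   # -> [1, 4, 9]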
def _corr_combination(data, comb, niter, covar, cart_z, method, output, verbose):
    # `comb` is an iterable, so `niter` carries the number of combinations
    # and is passed to tqdm to set the progress-bar length.
    if covar is not None and cart_z:
        # every (x, y) pair is crossed with every covariate
        niter *= len(covar)
    # handle whether tqdm is installed to decide on a progress bar.
    if is_tqdm_installed():
        from tqdm import tqdm
        # wrap the generator in tqdm
        if covar is not None and cart_z:
            _generator = tqdm(it.product(comb, covar), position=0, total=niter)
        else:
            _generator = tqdm(comb, position=0, total=niter)
    else:
        # there is no tqdm
        if covar is not None and cart_z:
            _generator = it.product(comb, covar)
        else:
            _generator = comb

    # with no covariates, simple correlation.
    if covar is None:
        # select the appropriate function rho.
        rho = _bicorr_inner_score if output == "score" else _bicorr_inner_full
        # iterate and calculate rho
        result_k = [rho(data[x], data[y], method) for x, y in _generator]
    elif cart_z:
        # cartesian case: pair each (x, y) combination with each single covariate z
        result_k = [
            _partial_bicorr_inner(data, x, y, z, method=method, output=output)
            for (x, y), z in _generator
        ]
    else:
        # otherwise do all pairwise correlations with a fixed covariate matrix
        result_k = [
            _partial_bicorr_inner(data, x, y, covar, method=method, output=output)
            for x, y in _generator
        ]
    # we should have a list of dicts - assemble the records
    return pd.DataFrame.from_records(result_k)
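
# usage sketch for _corr_combination (hypothetical frame, columns and method):
# the caller supplies the pairwise combinations and their count so the
# progress-bar length can be set up front.
def _corr_combination_example():
    import numpy as np
    cols = ["x1", "x2", "x3"]
    df = pd.DataFrame(np.random.rand(50, 3), columns=cols)
    comb = it.combinations(cols, 2)
    niter = 3  # C(3, 2) pairwise combinations
    return _corr_combination(df, comb, niter, covar=None, cart_z=False,
                             method="spearman", output="score", verbose=0)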
def _parallel_list_comprehension(f, *args):
    from joblib import Parallel, cpu_count, delayed

    N = len(args)
    if N == 0:
        return f()
    n = len(args[0])
    # cap the number of workers at the number of tasks
    ncpu = n if n < cpu_count() else (cpu_count() - 1)
    if is_tqdm_installed():
        # use tqdm to wrap around our iterable
        _Threaded = TqdmParallel(use_tqdm=True, total=n)
    else:
        _Threaded = Parallel
    if N == 1:
        um = _Threaded(ncpu)(delayed(f)(arg) for arg in args[0])
    else:
        um = _Threaded(ncpu)(delayed(f)(*arg) for arg in it.zip_longest(*args))
    return um
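
# `TqdmParallel` is used above but never defined in this section. A minimal
# sketch of the contract the callers rely on (hypothetical implementation, not
# necessarily the project's actual class): calling the instance with a job
# count returns a joblib.Parallel whose progress hook advances a tqdm bar.
class _TqdmParallelSketch:
    def __init__(self, use_tqdm=True, total=None):
        self.use_tqdm = use_tqdm
        self.total = total

    def __call__(self, n_jobs):
        from joblib import Parallel
        from tqdm import tqdm

        bar = tqdm(total=self.total, position=0, disable=not self.use_tqdm)

        class _ProgressParallel(Parallel):
            # joblib calls print_progress as tasks complete; mirror the
            # completed-task count into the tqdm bar.
            def print_progress(self):
                bar.n = self.n_completed_tasks
                bar.refresh()

        return _ProgressParallel(n_jobs=n_jobs)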
def umappcc(fn: str, f: Callable, *args):
    """Performs Map comprehension with Parallelism and Caching by Chunks.

    That is to say that the first time this runs, function f(*args) is called,
    storing a cache file. The second time and onwards, the resulting cached
    file is read and no execution takes place.

    Further to this, 'by-chunks' means that each step is stored separately as
    a file and concatenated together at the end. This means that if a program
    stops halfway through execution, when re-run, it restarts from the last
    cached element, which is incredibly useful during debugging and prototype
    development.

    This assumes each iteration of the list comprehension is independent of
    the others.

    Parameters
    ----------
    fn : str
        The path and filename.
    f : function
        The function to call
    *args : list-like
        Arguments to pass as f(*args)

    Returns
    -------
    res : Any
        The results from f(*args) or from file

    Examples
    --------
    See `turb.utils.umap` for examples.
    """
    from joblib import Parallel, cpu_count, delayed

    if os.path.isfile(fn):
        return _load_file(fn)
    else:
        # pre-compute iterable
        its = list(it.zip_longest(*args))
        n = len(its)
        ncpu = n if n < cpu_count() else (cpu_count() - 1)
        # check the directory actually exists before continuing
        check_file_path(fn, False, True, 0)
        # use tqdm for display.
        if is_tqdm_installed():
            # use our custom tqdm parallel class
            _Threaded = TqdmParallel(use_tqdm=True, total=n)
        else:
            _Threaded = Parallel
        # create a cache directory in the directory below where to plant the file
        relfile, abscachedir = _create_cache_directory(fn)
        # do the list comprehension using parallelism
        um = _Threaded(ncpu)(
            delayed(_mini_cache)(add_suf(relfile, str(i)), f, *arg)
            for i, arg in enumerate(its))
        # save the final version
        _write_file(um, fn)
        # delete the temp versions
        _delete_temps(abscachedir)
        return um
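
# usage sketch for umappcc (hypothetical path and worker function): the first
# call fans `run_trial` out over `seeds` in parallel, caching one chunk per
# element; any later call simply reloads "out/trials.pkl".
#   results = umappcc("out/trials.pkl", run_trial, seeds)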
def umapcc(fn: str, f: Callable, *args):
    """Performs Map comprehension with Caching by Chunks.

    That is to say that the first time this runs, function f(*args) is called,
    storing a cache file. The second time and onwards, the resulting cached
    file is read and no execution takes place.

    Further to this, 'by-chunks' means that each step is stored separately as
    a file and concatenated together at the end. The intermediate caches are
    removed at the end of the process automatically. If the program crashes
    part-way through this, re-running will resume from the last stored chunk.

    Parameters
    ----------
    fn : str
        The path and filename.
    f : function
        The function to call
    *args : list-like
        Arguments to pass as f(*args)

    Returns
    -------
    res : Any
        The results from f(*args) or from file

    Examples
    --------
    See `turb.utils.umap` for examples.
    """
    if os.path.isfile(fn):
        return _load_file(fn)
    else:
        # pre-compute iterable
        its = list(it.zip_longest(*args))
        n = len(its)
        # check the directory actually exists before continuing
        check_file_path(fn, False, True, 0)
        # use tqdm for display.
        if is_tqdm_installed():
            from tqdm import tqdm
            _generator = enumerate(tqdm(its, position=0, total=n))
        else:
            _generator = enumerate(its)
        # create a cache directory in the directory below where to plant the file
        relfile, abscachedir = _create_cache_directory(fn)
        # run and do chunked caching, using the item cache
        um = [
            _mini_cache(add_suf(relfile, str(i)), f, *arg)
            for i, arg in _generator
        ]
        # save the final version
        _write_file(um, fn)
        # delete the temp files
        _delete_temps(abscachedir)
        return um
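
# `_mini_cache` is relied on by both umapcc and umappcc but is not shown in
# this section. A minimal sketch of its contract, reusing the module's own
# _load_file/_write_file helpers (hypothetical implementation): reload the
# chunk if its file already exists, otherwise compute and store it.
def _mini_cache_sketch(fn, f, *args):
    if os.path.isfile(fn):
        return _load_file(fn)
    res = f(*args)
    _write_file(res, fn)
    return res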