def brute_force_pairs(df_0, df_1, threshold_point=2, n_jobs=-2, tqdm=True):
    """Score an alignment between every tile of `df_0` and every site of
    `df_1` using `evaluate_match`, parallelizing over tiles. Returns one row
    per (tile, site) pair, sorted by score in descending order.
    """
    work = df_1.groupby('site')
    if tqdm:
        from tqdm import tqdm_notebook as tqdn
        work = tqdn(work, 'site')

    arr = []
    for site, df_s in work:

        def work_on(df_t):
            rotation, translation, score = evaluate_match(
                df_t, df_s, threshold_point=threshold_point)
            determinant = None if rotation is None else np.linalg.det(rotation)
            return pd.Series({'rotation': rotation,
                              'translation': translation,
                              'score': score,
                              'determinant': determinant})

        # collect one result table per site
        (df_0
         .pipe(ops.utils.gb_apply_parallel, 'tile', work_on, n_jobs=n_jobs)
         .assign(site=site)
         .pipe(arr.append)
         )

    return (pd.concat(arr).reset_index()
            .sort_values('score', ascending=False))
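# Usage sketch (hypothetical inputs): `df_0` and `df_1` are point tables with
# 'tile' and 'site' grouping columns plus whatever coordinate columns
# `evaluate_match` consumes. Since results are sorted by score, keeping the
# first row per tile yields a candidate tile-to-site assignment.
#
#   df_matches = brute_force_pairs(df_tile_points, df_site_points)
#   df_best = df_matches.drop_duplicates('tile')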
def find_group_cliques(df_input, prefix_length=12, edit_distance=2,
                       gene_id=GENE_ID, n_cores=-2):
    """Select a subset of sgRNAs whose prefixes are mutually distinguishable
    within `edit_distance`, balancing the number of guides kept per gene.
    """
    from joblib import Parallel, delayed
    from tqdm import tqdm_notebook as tqdn

    # take the first prefix_length + 1 bases of each sgRNA
    prefixes = df_input['sgRNA'].str[:prefix_length + 1].pipe(list)
    hash_buckets = build_khash(tqdn(prefixes, 'hash'), edit_distance)
    print('hashed')

    # wrap each bucket in a singleton list so each parallel task
    # computes distances for one bucket
    arr = [[x] for x in hash_buckets]
    results = Parallel(n_cores)(
        delayed(sparse_dist)(bucket, threshold=edit_distance,
                             distance_func=distance_prefix)
        for bucket in tqdn(arr, 'distance'))
    print('distanced')

    # merge the per-bucket sparse distance dictionaries
    distances = dict()
    for result in results:
        distances.update(result)
    sparse_distance = sparse_view(prefixes, distances)

    selected = maxy_clique_groups(sparse_distance,
                                  df_input[gene_id].pipe(list),
                                  df_input['sgRNAs_per_gene'].pipe(list))
    return df_input.iloc[selected]
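# Usage sketch (hypothetical library table): `df_design` needs an 'sgRNA'
# column, a gene identifier column matching `gene_id`, and an
# 'sgRNAs_per_gene' column giving the per-gene quota.
#
#   df_selected = find_group_cliques(df_design, prefix_length=12,
#                                    edit_distance=2)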
def groupby_apply2(df_1, df_2, cols, f):
    """Apply a function `f` that takes two dataframes and returns a dataframe.

    Groups inputs by `cols`, evaluates for each group, and concatenates the
    result. Assumes every group key present in `df_1` also appears in `df_2`.
    """
    from tqdm import tqdm_notebook as tqdn

    d_1 = {k: v for k, v in df_1.groupby(cols)}
    d_2 = {k: v for k, v in df_2.groupby(cols)}

    arr = []
    for k in tqdn(d_1):
        arr.append(f(d_1[k], d_2[k]))

    return pd.concat(arr)
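# Usage sketch (hypothetical dataframes): join two per-well tables group by
# group. `f` receives the matching group from each input and returns one
# dataframe.
#
#   df_merged = groupby_apply2(df_a, df_b, ['well', 'tile'],
#                              lambda a, b: a.merge(b, on='cell'))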
def applyIJ_parallel(f, arr, n_jobs=-2, backend='threading', tqdm=False,
                     *args, **kwargs):
    """Apply a function that expects 2D input to the trailing two dimensions
    of an array, parallelizing computation across 2D frames. The function
    must output an array whose shape depends only on the input shape.
    """
    from joblib import Parallel, delayed

    h, w = arr.shape[-2:]
    reshaped = arr.reshape((-1, h, w))

    if tqdm:
        from tqdm import tqdm_notebook as tqdn
        work = tqdn(reshaped, 'frame')
    else:
        work = reshaped

    arr_ = Parallel(n_jobs=n_jobs, backend=backend)(
        delayed(f)(frame, *args, **kwargs) for frame in work)

    output_shape = arr.shape[:-2] + arr_[0].shape
    return np.array(arr_).reshape(output_shape)
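# Usage sketch: smooth each 2D frame of a (cycles, channels, h, w) stack.
# `scipy.ndimage.gaussian_filter` preserves the frame shape, satisfying the
# fixed-output-shape requirement.
#
#   from scipy.ndimage import gaussian_filter
#   stack = np.zeros((3, 4, 128, 128), dtype='float32')
#   smoothed = applyIJ_parallel(gaussian_filter, stack, sigma=2)
#   assert smoothed.shape == stack.shape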
def csv_frame(files_or_search, tqdm=False, **kwargs):
    """Convenience function; pass either a list of files or a glob wildcard
    search term. Empty csv files are skipped. Keyword arguments are forwarded
    to `pd.read_csv`.
    """
    from glob import glob
    from natsort import natsorted

    def read_csv(f):
        try:
            return pd.read_csv(f, **kwargs)
        except pd.errors.EmptyDataError:
            return None

    if isinstance(files_or_search, str):
        files = natsorted(glob(files_or_search))
    else:
        files = files_or_search

    if tqdm:
        from tqdm import tqdm_notebook as tqdn
        return pd.concat([read_csv(f) for f in tqdn(files)], sort=True)
    else:
        return pd.concat([read_csv(f) for f in files], sort=True)
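# Usage sketch (hypothetical paths): concatenate all matching csv files into
# a single dataframe.
#
#   df_all = csv_frame('process/*.phenotype.csv', tqdm=True)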
def parallel_levenshtein_group(group, dist_func=None, n_cores=-2):
    """Compute all pairwise distances within `group`, parallelizing over the
    first string of each pair. Defaults to Levenshtein distance.
    """
    from joblib import Parallel, delayed
    from tqdm import tqdm_notebook as tqdn

    if dist_func is None:
        import Levenshtein
        dist_func = Levenshtein.distance

    # each string is compared against the strings that follow it,
    # so every unordered pair is measured exactly once
    remainders = [group[i + 1:] for i, _ in enumerate(group)]

    def measure_distances(string, remainder):
        arr = []
        for test_string in remainder:
            d = dist_func(string, test_string)
            if d < 2:
                # flag near-identical pairs as they are found
                print(string, test_string)
            arr.append(d)
        return arr

    results = Parallel(n_cores)(
        delayed(measure_distances)(*subset)
        for subset in tqdn(zip(group, remainders), total=len(group)))

    distances = []
    for result in results:
        distances.extend(result)
    return distances
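# Usage sketch: pairwise distances for a small list of barcodes; pairs closer
# than 2 edits are printed as they are found.
#
#   barcodes = ['ACGTACGT', 'ACGTACGA', 'TTTTCCCC']
#   distances = parallel_levenshtein_group(barcodes)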