def select_informative_positions(data, incid_thresh):
    # Keep positions where the minor allele is observed in more than
    # `incid_thresh` of libraries.  `data` is a counts array with dims
    # (library_id, position, allele); `idxwhere` is the codebase helper
    # that returns the index labels of a boolean Series that are True.
    minor_allele_incid = (data > 0).mean("library_id").min("allele")
    informative_positions = idxwhere(
        minor_allele_incid.to_series() > incid_thresh
    )
    return informative_positions
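# Minimal usage sketch (not from the original codebase): builds a toy counts
# array and a stand-in `idxwhere` helper to show the expected inputs; all
# names and values below are illustrative assumptions.
import numpy as np
import xarray as xr


def idxwhere(condition):
    # Stand-in for the codebase helper: labels where a boolean Series is True.
    return list(condition[condition].index)


counts = xr.DataArray(
    np.random.default_rng(0).poisson(2, size=(5, 20, 2)),
    dims=("library_id", "position", "allele"),
    coords={
        "library_id": [f"lib{i}" for i in range(5)],
        "position": np.arange(20),
        "allele": ["ref", "alt"],
    },
)
positions = select_informative_positions(counts, incid_thresh=0.2)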
f"Found {npos_available} informative positions with minor " f"allele incidence of >{args.incid_thresh}" ) npos = min(args.npos, npos_available) info(f"Randomly sampling {npos} positions.") position_ss = np.random.choice( informative_positions, size=npos, replace=False, ) info("Filtering libraries.") suff_cvrg_samples = idxwhere( ( ( data.sel(position=informative_positions).sum(["allele"]) > 0 ).mean("position") > args.cvrg_thresh ).to_series() ) nlibs = len(suff_cvrg_samples) info( f"Found {nlibs} libraries with >{args.cvrg_thresh:0.1%} " f"of informative positions covered." ) info("Constructing input data.") data_fit = data.sel(library_id=suff_cvrg_samples, position=position_ss) m_ss = data_fit.sum("allele") n, g_ss = m_ss.shape y_obs_ss = data_fit.sel(allele="alt")
cvrg = trsnf_data.sum("allele") trsnf_data = (trsnf_data + 1) / (cvrg + 2) trsnf_data = trsnf_data.sel(allele="alt") * 2 - 1 info(f"Clustering {nlibs} metagenotypes at a {dist_thresh} " f"maximum cosine distance threshold across {npos} positions.") clust = AgglomerativeClustering( n_clusters=None, affinity="cosine", linkage="complete", distance_threshold=dist_thresh, ).fit(trsnf_data) clust = pd.Series(clust.labels_, index=trsnf_data.library_id) clust_size = clust.value_counts() large_clusts = idxwhere(clust_size >= clust_size_thresh) nclusts = len(large_clusts) total_clust_libs = clust_size.loc[large_clusts].sum() info(f"Found {nclusts} ('large') clusters with at least " f"{clust_size_thresh} members, encompassing a total of " f"{total_clust_libs} libraries.") info(f"Constructing cluster-by-position matrix indicating " f"which large clusters have counts at each position in " f">{clust_pos_frac_thresh:.1%} of samples.") cvrg_subset = cvrg.to_pandas().loc[clust.index] clust_pos_frac = {} for c in large_clusts: clust_pos_frac[c] = (cvrg_subset.loc[clust == c] > 0).mean() clust_pos_frac = pd.DataFrame(clust_pos_frac, index=inf_positions) clust_pos_incid = (clust_pos_frac >= clust_pos_frac_thresh).astype(int)
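# Worked example of the transform above (illustrative numbers only): a
# position with 3 alt reads out of 4 total maps to (3 + 1) / (4 + 2) = 0.667
# and then 0.667 * 2 - 1 = 0.333, while an uncovered position (0 of 0 reads)
# maps to (0 + 1) / (0 + 2) = 0.5 and then 0.0, so uncovered positions land
# at zero and drop out of the inner products that define the cosine distance.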
def xr_idxwhere(x):
    # xarray counterpart of idxwhere: operates on a boolean DataArray.
    return idxwhere(x.to_series())
def filter_samples(pileup, min_median_coverage):
    # `pileup` is a position-by-library count table whose columns carry a
    # 'sample_id' level; keep the samples whose median per-position coverage
    # (summed within each sample's column group) meets the threshold.
    median_coverage = (
        pileup.groupby(level='sample_id', axis='columns')
        .sum()
        .median()
    )
    return pileup.loc[:, idxwhere(median_coverage >= min_median_coverage)]
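# Minimal usage sketch (illustrative only): a toy pileup with a two-level
# column index; the "allele" level, sample names, and counts are assumptions
# made for this example, and `idxwhere` is the same stand-in helper as above.
import numpy as np
import pandas as pd


def idxwhere(condition):
    return list(condition[condition].index)


columns = pd.MultiIndex.from_product(
    [["sampleA", "sampleB"], ["ref", "alt"]], names=["sample_id", "allele"]
)
pileup = pd.DataFrame(
    np.random.default_rng(0).poisson(3, size=(10, 4)),
    index=pd.Index(range(10), name="position"),
    columns=columns,
)
# Drops any sample whose median summed coverage falls below 2.
filtered = filter_samples(pileup, min_median_coverage=2)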