Exemplo n.º 1
0
def select_informative_positions(data, incid_thresh):
    """Select positions whose minor allele is observed often enough.

    Parameters
    ----------
    data
        Count array with at least ``library_id`` and ``allele`` dimensions
        (presumably an :class:`xarray.DataArray`; confirm against callers).
    incid_thresh : float
        A position is kept only if its least-frequent allele is present
        (count > 0) in strictly more than this fraction of libraries.

    Returns
    -------
    Index labels of the retained positions, as produced by ``idxwhere``.
    """
    # Fraction of libraries in which each allele is observed at all.
    presence_fraction = (data > 0).mean("library_id")
    # The rarest allele's incidence determines informativeness.
    minor_incidence = presence_fraction.min("allele")
    return idxwhere(minor_incidence.to_series() > incid_thresh)
Exemplo n.º 2
0
        f"Found {npos_available} informative positions with minor "
        f"allele incidence of >{args.incid_thresh}"
    )
    npos = min(args.npos, npos_available)
    info(f"Randomly sampling {npos} positions.")
    position_ss = np.random.choice(
        informative_positions,
        size=npos,
        replace=False,
    )

    info("Filtering libraries.")
    suff_cvrg_samples = idxwhere(
        (
            (
                data.sel(position=informative_positions).sum(["allele"]) > 0
            ).mean("position")
            > args.cvrg_thresh
        ).to_series()
    )
    nlibs = len(suff_cvrg_samples)
    info(
        f"Found {nlibs} libraries with >{args.cvrg_thresh:0.1%} "
        f"of informative positions covered."
    )

    info("Constructing input data.")
    data_fit = data.sel(library_id=suff_cvrg_samples, position=position_ss)
    m_ss = data_fit.sum("allele")
    n, g_ss = m_ss.shape
    y_obs_ss = data_fit.sel(allele="alt")
Exemplo n.º 3
0
    cvrg = trsnf_data.sum("allele")
    trsnf_data = (trsnf_data + 1) / (cvrg + 2)
    trsnf_data = trsnf_data.sel(allele="alt") * 2 - 1

    info(f"Clustering {nlibs} metagenotypes at a {dist_thresh} "
         f"maximum cosine distance threshold across {npos} positions.")
    clust = AgglomerativeClustering(
        n_clusters=None,
        affinity="cosine",
        linkage="complete",
        distance_threshold=dist_thresh,
    ).fit(trsnf_data)
    clust = pd.Series(clust.labels_, index=trsnf_data.library_id)

    clust_size = clust.value_counts()
    large_clusts = idxwhere(clust_size >= clust_size_thresh)
    nclusts = len(large_clusts)
    total_clust_libs = clust_size.loc[large_clusts].sum()
    info(f"Found {nclusts} ('large') clusters with at least "
         f"{clust_size_thresh} members, encompassing a total of "
         f"{total_clust_libs} libraries.")

    info(f"Constructing cluster-by-position matrix indicating "
         f"which large clusters have counts at each position in "
         f">{clust_pos_frac_thresh:.1%} of samples.")
    cvrg_subset = cvrg.to_pandas().loc[clust.index]
    clust_pos_frac = {}
    for c in large_clusts:
        clust_pos_frac[c] = (cvrg_subset.loc[clust == c] > 0).mean()
    clust_pos_frac = pd.DataFrame(clust_pos_frac, index=inf_positions)
    clust_pos_incid = (clust_pos_frac >= clust_pos_frac_thresh).astype(int)
Exemplo n.º 4
0
def xr_idxwhere(x):
    """Apply ``idxwhere`` to an xarray-like object.

    Converts *x* to a pandas Series first, then delegates to ``idxwhere``
    to obtain the index labels where the values are truthy.
    """
    as_series = x.to_series()
    return idxwhere(as_series)
Exemplo n.º 5
0
def filter_samples(pileup, min_median_coverage):
    """Keep only samples whose median per-position coverage is sufficient.

    Parameters
    ----------
    pileup
        DataFrame with a ``sample_id`` level in its column index; columns
        are summed within each sample to obtain total coverage per row.
    min_median_coverage : numeric
        Minimum acceptable median of a sample's summed coverage.

    Returns
    -------
    The input restricted to columns of samples meeting the threshold.
    """
    # Total coverage per sample at each position, then its median per sample.
    per_sample_coverage = pileup.groupby(level='sample_id', axis='columns').sum()
    sample_medians = per_sample_coverage.median()
    keep = idxwhere(sample_medians >= min_median_coverage)
    return pileup.loc[:, keep]