def load_crosses(self, seq_id, cross_id, field):
    # Not available via intake 22 Oct 2020
    crosses_path = self.release_dir / "site_filters" / "crosses_stats"
    crosses_store = self.gcs.get_mapper(crosses_path.as_posix())
    crosses_group = zarr.Group(crosses_store)
    return da.from_zarr(crosses_group[seq_id][field][cross_id])
def _benchmark_load_zarr_datasets(self, zarr_paths):
    callsets = []
    self.benchmark_profiler.start_benchmark(operation_name="Load Zarr Dataset")
    for zarr_path in zarr_paths:
        store = zarr.DirectoryStore(zarr_path)
        callset = zarr.Group(store=store, read_only=True)
        callsets.append(callset)
    self.benchmark_profiler.end_benchmark()
    return callsets
def open_group(path):
    if path.endswith("h5"):
        return h5py.File(path, "r")
    elif path.endswith(("zarr2", "zarr")):
        return zarr.open_group(path, "r")
    elif path.endswith("zip"):
        zz = zarr.ZipStore(path)
        return zarr.Group(zz)
    else:
        raise ValueError(
            "Bad filepath provided: {0}. Only hdf5/zarr supported.".format(path))
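# Hypothetical usage sketch for open_group above. The three example paths are
# illustrative assumptions (not part of the original snippet) and are expected
# to already exist on disk. Whichever backend is returned (h5py.File or a zarr
# group), the object supports dict-style access, so downstream code can stay
# format-agnostic.
for example_path in ("callset.h5", "callset.zarr", "callset.zip"):
    grp = open_group(example_path)
    print(example_path, list(grp))  # top-level dataset/group names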
def execute(cls, ctx, op):
    import zarr

    fs = get_fs(op.path, None)
    fs_map = FSMap(op.path, fs)
    group = zarr.Group(store=fs_map, path=op.group)
    array = group[op.dataset]
    to_store = ctx[op.inputs[0].key]
    axis_offsets = op.axis_offsets
    shape = to_store.shape
    array[tuple(
        slice(offset, offset + size)
        for offset, size in zip(axis_offsets, shape))] = to_store
    ctx[op.outputs[0].key] = np.empty((0,) * to_store.ndim,
                                      dtype=to_store.dtype)
def load_variants_array(self, seq_id, field="POS", mask=None):
    """ release_pa """
    path = self.release_dir / "snp_genotypes" / "all" / "sites"
    # need to open as mapping if this on cloud
    storez = self.gcs.get_mapper(path.as_posix())
    calldata = zarr.Group(storez)
    arr = da.from_zarr(calldata[f"{seq_id}/variants/{field}"])
    if mask is not None:
        assert isinstance(mask, da.core.Array), "mask must be a dask_array"
        arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()
    return arr
def load_calldata_by_sampleset(self, seq_id, sampleset, field="GT", mask=None):
    if isinstance(sampleset, str):
        path = self.release_dir / "snp_genotypes" / "all" / sampleset
        print(path)
        # need to open as mapping if this on cloud
        storez = self.gcs.get_mapper(path.as_posix())
        calldata = zarr.Group(storez)
        arr = da.from_zarr(calldata[f"{seq_id}/calldata/{field}"])
    elif isinstance(sampleset, list):
        arr = da.concatenate([
            self.load_calldata_by_sampleset(seq_id, s, field=field, mask=None)
            for s in sampleset
        ], axis=1)
    else:
        raise ValueError("sampleset must be a string, or a list of strings")
    if mask is not None:
        assert isinstance(mask, da.core.Array), "mask must be a dask_array"
        arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()
    if field == "GT":
        arr = allel.GenotypeDaskArray(arr)
    return arr
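# Minimal, self-contained sketch of the masking step used by the loaders above:
# how a boolean dask array filters the first axis of a genotype-shaped array,
# and why compute_chunk_sizes() is called afterwards. The shapes and values are
# toy assumptions for illustration only.
import dask.array as da
import numpy as np

arr = da.from_array(np.arange(24).reshape(6, 2, 2), chunks=(3, 2, 2))
mask = da.from_array(np.array([True, False, True, True, False, True]), chunks=3)

filtered = da.compress(mask, arr, axis=0)
# After compress, chunk sizes along axis 0 are unknown (nan); calling
# compute_chunk_sizes() resolves them so downstream operations that need
# concrete chunk shapes keep working.
filtered = filtered.compute_chunk_sizes()
print(filtered.shape)  # (4, 2, 2)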
def main():
    parser = argparse.ArgumentParser(
        description=
        'This script takes a single zarr zipstore, and estimates the contamination rate, providing a log '
        'likelihood ratio vs the null model')
    parser.add_argument(
        '--input',
        required=True,
        help=
        'Path to zarr file containing genotypes and allele depths, zipped Zarr file with data for a single sample. '
        'This should follow the standard format of {sample}/{seqid}/calldata/GT and {sample}/{seqid}/calldata/AD.')
    parser.add_argument(
        '--sites',
        required=True,
        help=
        'Path to zarr describing which sites in `input` were genotyped. This is used to match the `input` to the '
        'allele frequencies below. variants/POS is required.')
    parser.add_argument(
        '--allele-frequencies',
        required=True,
        help=
        'path to zarr file describing allele frequencies. This has two purposes: 1) to select SNPs to downsample '
        'to, based on the `minimum_af` argument. 2) To provide a prior expectation on the frequency of genotypes. '
        'The first level of the zarr file should be groups for seqids, with each containing `POS` (position) and '
        'AF (allele frequencies). The shape of the AF array must be Nx4, where N is the size of the 1D POS array. '
        'The order of alleles *must* correspond to the coding in the input data. There is no requirement to have a '
        'similar shape to the input genotypes, although a minimum level of intersection is required!')
    parser.add_argument('--seqid',
                        required=True,
                        nargs='+',
                        help='name of chromosome(s) or contig(s) to process.')
    parser.add_argument('--output',
                        required=True,
                        help='path to output file stem')
    parser.add_argument('--downsample',
                        required=False,
                        default=20000,
                        help='number of sites to consider.',
                        type=int)
    parser.add_argument(
        '--minimum-af',
        required=False,
        default=0.05,
        help=
        'minimum minor allele frequency in reference population to consider. Sites with higher MAF are more '
        'powerful at detecting contamination',
        type=float)
    parser.add_argument('--sequence-error-rate',
                        required=False,
                        default=1e-3,
                        help='probability of observing a non REF/ALT base',
                        type=float)
    parser.add_argument(
        '--minimum-coverage',
        required=False,
        default=10,
        help='minimum read depth to use. '
        'Low depths have low power to detect contamination',
        type=int)
    parser.add_argument('--plot', dest='plot', action='store_true')
    parser.add_argument('--no-plot', dest='plot', action='store_false')
    parser.add_argument('--log', dest='log', action='store_true')
    parser.add_argument('--no-log', dest='log', action='store_false')
    parser.set_defaults(plot=True, log=False)

    try:
        args = {
            "input": snakemake.input.input,
            "sites": snakemake.input.sites,
            "allele_frequencies": snakemake.input.allele_frequencies,
            "seqid": snakemake.params.seqid,
            "output": snakemake.params.stem,
            "minimum_af": snakemake.params.minimum_af,
            "minimum_coverage": snakemake.params.minimum_coverage,
            "sequence_error_rate": snakemake.params.seq_err_rate,
            "downsample": snakemake.params.downsample,
            "plot": snakemake.params.plot,
            "log": snakemake.params.log
        }
        log("Args read via snakemake")
    except NameError:
        args = vars(parser.parse_args())
        log("Args read via command line")

    seqids = args['seqid']
    sequence_error_rate = args['sequence_error_rate']
    downsample_n = args["downsample"]
    minimum_minor_af = args["minimum_af"]

    output_csv = args['output'] + ".contamination.csv"
    output_png = args['output'] + ".allele_balance.png"
    output_log = args["output"] + ".{alpha}.log"

    sample_store = zarr.ZipStore(args["input"], mode="r")
    sample_callset = zarr.Group(sample_store)

    sites = zarr.ZipStore(args["sites"], mode="r")
    variant_sites = zarr.Group(sites)

    sample = next(iter(sample_callset))

    concatenated_sample_callset, _ = concatenate_arrays(
        sample_callset[sample], seqids, paths=["calldata/GT", "calldata/AD"])
    gt = allel.GenotypeArray(concatenated_sample_callset["calldata/GT"])
    ad = concatenated_sample_callset["calldata/AD"]

    concatenated_sites, concatenated_site_shapes = concatenate_arrays(
        variant_sites, seqids, ["variants/POS"])
    pos = concatenated_sites["variants/POS"]

    assert pos.shape[0] == gt.shape[0] == ad.shape[
        0], "Shape inconsistency. {0}, {1}, {2}".format(
            pos.shape, gt.shape, ad.shape)

    # load allele frequencies required to compute weights
    allele_frequencies_z = zarr.open_group(args['allele_frequencies'], "r")
    concatenated_af_arrays, concatenated_af_shapes = concatenate_arrays(
        allele_frequencies_z, seqids, ["POS", "AF"])
    af_pos = concatenated_af_arrays["POS"]
    # This is a 2D array of the frequency of the ALT allele in some other dataset.
    af_val = concatenated_af_arrays["AF"]
    assert af_val.shape[
        1] == 4, "Allele frequencies must contain all 4 alleles, even if unobserved."

    # for the sample_gt: Keep if
    # a) in af, b) is_called and c) is_biallelic
    # step 1 find the intersection; this works on multi indexes
    loc_gt, loc_af = locate_intersection(pos, concatenated_site_shapes,
                                         af_pos, concatenated_af_shapes)

    flt_af_val = np.compress(loc_af, af_val, axis=0)
    flt_gt = np.compress(loc_gt, gt, axis=0)
    flt_ad = np.compress(loc_gt, ad, axis=0)

    # now we need to filter both by is biallelic and is called.
    is_bial_ref_pop = np.count_nonzero(flt_af_val, axis=1) == 2
    is_called = flt_gt.is_called()[:, 0]

    # compress the intersection by the AND of these
    keep_loc = is_called & is_bial_ref_pop
    alt_frequency_pass = np.compress(keep_loc, flt_af_val, axis=0)
    allele_depth_pass = np.compress(keep_loc, flt_ad, axis=0)

    # recode the allele depth to 0/1.
    # find the "alt" column.
log("Ordering alleles by frequency for REF/ALT/ERR") min_cov_reached = allele_depth_pass[:, 0].sum( axis=1) >= args['minimum_coverage'] ix_cols_sort = np.argsort(alt_frequency_pass, axis=1)[:, ::-1] # indices of all rows ix_rows = np.arange(alt_frequency_pass.shape[0]) # apply the sorting operation allele_depth_pass_reordered = np.squeeze(allele_depth_pass)[ ix_rows[:, np.newaxis], ix_cols_sort] # Define allele counts: sum final 2 columns, representing ref/alt/error allele_depths = allele_depth_pass_reordered[:, :3] allele_depths[:, 2] = allele_depths[:, 2] + allele_depth_pass_reordered[:, 3] assert allele_depths.shape[1] == 3 # issue with some samples having a third allele (ie not in phase 2) discovered at high frequency # Filter sites where more than 10% of reads look like errors. probably_biallelic = allele_depth_pass_reordered[:, 2] < ( .1 * allele_depth_pass_reordered.sum(axis=1)) # step 2 create the 0/1/2 from the allele frequencies. major_af = alt_frequency_pass.max(axis=1) # select the values with the highest MAF. log("Selecting variants on which to perform analysis") while True: eligible = probably_biallelic & min_cov_reached & ( (1 - major_af) > minimum_minor_af) if eligible.sum() > downsample_n: break minimum_minor_af -= 0.01 if minimum_minor_af < 0: log("Insufficient variants meet criteria to compute contamination. n={0}, min={1}" .format(eligible.sum(), downsample_n)) break res = pd.DataFrame(index=[sample], columns=["LLR", "LL", "pc_contam"]) if eligible.sum() > downsample_n: log("Downsample from {0} to {1}".format(eligible.sum(), downsample_n)) ix_ds = np.sort( np.random.choice(np.where(eligible)[0], size=downsample_n)) major_af = np.take(major_af, ix_ds, axis=0) allele_depths = np.take(allele_depths, ix_ds, axis=0) genotype_weights = np.log(determine_weights(major_af)) log("estimating contamination...") xv = minimize_scalar(compute_likelihood, args=(sequence_error_rate, allele_depths, genotype_weights, args["log"], output_log), bounds=(0, 0.5), method="Bounded", options={"xatol": 1e-6}) # compute the likelihood at alpha = 0, to report likelihood ratio. null = compute_likelihood(0.0, sequence_error_rate, allele_depths, genotype_weights, args["log"], output_log) # return the llr / ll / estimate res.loc[sample] = -min(xv.fun - null, 0), -xv.fun, xv.x * 100 if args['plot']: plot_allele_balance(flt_gt, flt_ad, output_png, res.iloc[0]) res.to_csv(output_csv)
import gzip
import os
from itertools import compress

import gcsfs
import numpy as np
import pandas as pd
import zarr

gcs_bucket_fs = gcsfs.GCSFileSystem(project='malariagen-jupyterhub',
                                    token='anon',
                                    access='read_only')

geno_bi_path = os.path.join(
    "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic"
)
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_bi_path)
callset_biallel = zarr.Group(gcsacmap, read_only=True)

metadata = pd.read_csv("samples.meta.txt", sep="\t")
pop_selection = metadata.population.isin({
    'GHcol', 'GHgam', 'BFgam', 'BFcol', 'GM', 'GW', 'GNgam', 'GNcol', 'CIcol'
}).values

callset_fn = callset_biallel


def get_consecutive_true(a):
    if a.sum() == 0:
        return 0
    else:
        return np.diff(
            np.where(np.concatenate(
                ([a[0]], a[:-1] != a[1:], [True])))[0])[::2].max()
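# Small worked example for get_consecutive_true above, using toy values:
# the longest run of consecutive True entries in this boolean array is 3
# (positions 3-5), so the function returns 3.
example = np.array([True, True, False, True, True, True, False])
print(get_consecutive_true(example))  # -> 3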
def load_mask(self, seq_id, mask_id, filters_model="dt_20200416"):
    mask_path = self.release_dir / "site_filters" / filters_model / mask_id
    mask_store = self.gcs.get_mapper(mask_path.as_posix())
    mask_group = zarr.Group(mask_store)
    return da.from_zarr(mask_group[seq_id]["variants/filter_pass"])
def open_zarr_dataset(zarr_path):
    store = zarr.DirectoryStore(zarr_path)
    callset = zarr.Group(store=store, read_only=True)
    return callset
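# Hypothetical usage sketch for open_zarr_dataset above; the directory name and
# the "3R/calldata/GT" key are illustrative assumptions about the hierarchy.
callset = open_zarr_dataset("callset.zarr")
gt = callset["3R/calldata/GT"]  # zarr array; data are only read when sliced
print(gt.shape, gt.dtype)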
def load_filter_n(self, seq_id):
    path = self.release_dir / "accessibility" / "accessibility.zarr"
    storez = self.gcs.get_mapper(path.as_posix())
    zarrdata = zarr.Group(storez)
    return da.from_zarr(zarrdata[f"{seq_id}/filter_n"])
def main():
    parser = argparse.ArgumentParser(
        description=
        'Given a zipped Zarr store summarize information about the coverage/calls')
    parser.add_argument(
        '--input',
        required=True,
        help='path to Zarr store containing calldata for a single sample')
    parser.add_argument('--seqid',
                        required=False,
                        help='name of chromosomes or contigs to process',
                        nargs="+",
                        default=["2R", "2L", "3R", "3L", "X"],
                        type=str)
    parser.add_argument(
        '--output',
        required=True,
        help='path to output basename for stats table and coverage histogram')

    try:
        args = {
            "input": snakemake.input.input,
            "seqid": snakemake.params.seqid,
            "output": snakemake.params.stem
        }
        log("Args read via snakemake")
    except NameError:
        args = vars(parser.parse_args())
        log("Args read via command line")

    zfn = args['input']
    store = zarr.ZipStore(zfn, mode="r")
    callset = zarr.Group(store)

    csv_out = args['output'] + ".callstats.csv"
    npy_out = args['output'] + ".covhist.npz"

    # Holders for data
    df_cols = [
        "nSitesCalled", "nHomRef", "nHet", "nHetRef", "nHomAlt",
        "nNonRefAlleles"
    ]
    ad_df = pd.DataFrame(index=args['seqid'], columns=df_cols)
    cov_hist = dict()

    sample = next(iter(callset))

    for chrom in args['seqid']:
        gt = allel.GenotypeArray(callset[sample][chrom]["calldata/GT"])
        is_called = np.squeeze(gt.is_called())

        # if no coverage at all
        if is_called.sum() == 0:
            ad_df.loc[chrom] = 0
            cov_hist[chrom] = np.array([is_called.shape[0]])
            continue

        all_allele_depths = callset[sample][chrom]["calldata/AD"][:, 0]

        # To calculate DP at each position/sample we need to filter missing genotypes.
        allele_depths_with_cov = np.compress(is_called,
                                             all_allele_depths,
                                             axis=0)

        # Histogram of coverage
        sum_by_alt = allele_depths_with_cov.sum(axis=1)
        bc = np.bincount(sum_by_alt, minlength=251)
        bc[0] = (~is_called).sum()
        cov_hist[chrom] = bc

        nHomRef = gt.count_hom_ref()
        nHet = gt.count_het()
        nHetRef = gt.count_het(allele=0)
        nHomAlt = gt.count_hom_alt()
        nNonRefAlleles = (2 * nHomAlt) + nHetRef + (2 * (nHet - nHetRef))
        total_calls = is_called.sum()

        ad_df.loc[chrom] = [
            total_calls, nHomRef, nHet, nHetRef, nHomAlt, nNonRefAlleles
        ]

    ad_df.to_csv(csv_out)
    np.savez(npy_out, **cov_hist)