Example #1
    def load_crosses(self, seq_id, cross_id, field):

        # Not available via intake 22 Oct 2020

        crosses_path = self.release_dir / "site_filters" / "crosses_stats"
        crosses_store = self.gcs.get_mapper(crosses_path.as_posix())
        crosses_group = zarr.Group(crosses_store)
        return da.from_zarr(crosses_group[seq_id][field][cross_id])
Example #2
    def _benchmark_load_zarr_datasets(self, zarr_paths):
        callsets = []
        self.benchmark_profiler.start_benchmark(
            operation_name="Load Zarr Dataset")
        for zarr_path in zarr_paths:
            store = zarr.DirectoryStore(zarr_path)
            callset = zarr.Group(store=store, read_only=True)
            callsets.append(callset)
        self.benchmark_profiler.end_benchmark()
        return callsets
def open_group(path):
    if path.endswith("h5"):
        return h5py.File(path, "r")
    elif path.endswith(("zarr2", "zarr")):
        return zarr.open_group(path, "r")
    elif path.endswith("zip"):
        zz = zarr.ZipStore(path)
        return zarr.Group(zz)
    else:
        raise ValueError(
            "Bad filepath provided: {0}. Only hdf5/zarr supported.".format(
                path))
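A quick usage sketch for the dispatcher above; the file names are placeholders, not from the original example:

# hypothetical paths; open_group picks the backend from the extension
callset_h5 = open_group("callset.h5")      # opened with h5py
callset_zarr = open_group("callset.zarr")  # opened with zarr.open_group
callset_zip = open_group("callset.zip")    # opened via zarr.ZipStore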
Example #4
    def execute(cls, ctx, op):
        import zarr

        fs = get_fs(op.path, None)
        fs_map = FSMap(op.path, fs)

        group = zarr.Group(store=fs_map, path=op.group)
        array = group[op.dataset]

        to_store = ctx[op.inputs[0].key]
        axis_offsets = op.axis_offsets
        shape = to_store.shape

        # write this chunk into its region of the target zarr array, using the
        # chunk's axis offsets to build the destination slices
        array[tuple(
            slice(offset, offset + size)
            for offset, size in zip(axis_offsets, shape))] = to_store

        # the op produces no real output; emit an empty placeholder of the right dtype
        ctx[op.outputs[0].key] = np.empty((0, ) * to_store.ndim,
                                          dtype=to_store.dtype)
    def load_variants_array(self, seq_id, field="POS", mask=None):
        """
        release_pa

        """

        path = self.release_dir / "snp_genotypes" / "all" / "sites"

        # need to open as a mapping if this is on cloud storage
        storez = self.gcs.get_mapper(path.as_posix())
        calldata = zarr.Group(storez)

        arr = da.from_zarr(calldata[f"{seq_id}/variants/{field}"])

        if mask is not None:

            assert isinstance(mask, da.core.Array), "mask must be a dask_array"
            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        return arr
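As an illustration only (the calling object and argument values are placeholders), the mask argument is expected to be a dask boolean array over variants, such as the site-filter mask loaded by the load_mask example further down, assuming both methods belong to the same data-access class:

# hypothetical usage: keep only positions passing a site filter
mask = data.load_mask("2R", mask_id="gamb_colu")
pos_pass = data.load_variants_array("2R", field="POS", mask=mask)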
    def load_calldata_by_sampleset(self,
                                   seq_id,
                                   sampleset,
                                   field="GT",
                                   mask=None):

        if isinstance(sampleset, str):

            path = self.release_dir / "snp_genotypes" / "all" / sampleset
            print(path)

            # need to open as a mapping if this is on cloud storage
            storez = self.gcs.get_mapper(path.as_posix())
            calldata = zarr.Group(storez)

            arr = da.from_zarr(calldata[f"{seq_id}/calldata/{field}"])

        elif isinstance(sampleset, list):
            arr = da.concatenate([
                self.load_calldata_by_sampleset(
                    seq_id, s, field=field, mask=None) for s in sampleset
            ],
                                 axis=1)
        else:
            raise ValueError(
                "sampleset must be a string, or a list of strings")

        if mask is not None:

            assert isinstance(mask, da.core.Array), "mask must be a dask_array"

            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        if field == "GT":
            arr = allel.GenotypeDaskArray(arr)

        return arr
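A brief usage sketch (the calling object and sampleset names are placeholders, not from the original example): passing a list of samplesets concatenates calldata along the samples axis, and field="GT" wraps the result as a scikit-allel GenotypeDaskArray:

# hypothetical usage across two samplesets
gt = data.load_calldata_by_sampleset("3R", ["sampleset-A", "sampleset-B"], field="GT")
ac = gt.count_alleles().compute()  # allele counts computed lazily via dask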
def main():

    parser = argparse.ArgumentParser(
        description=
        'This script takes a single zarr zipstore and estimates the contamination rate, providing a log '
        'likelihood ratio vs the null model')

    parser.add_argument(
        '--input',
        required=True,
        help=
        'Path to a zipped Zarr file containing genotypes and allele depths for a single sample. '
        'This should follow the standard format of {sample}/{seqid}/calldata/GT and {sample}/{seqid}/calldata/AD.'
    )

    parser.add_argument(
        '--sites',
        required=True,
        help=
        'Path to Zarr store describing which sites in `input` were genotyped. This is used to match the `input` to the '
        'allele frequencies below. variants/POS is required.')

    parser.add_argument(
        '--allele-frequencies',
        required=True,
        help=
        'Path to Zarr store describing allele frequencies. This has two purposes: 1) to select SNPs to downsample '
        'to, based on the `minimum_af` argument; 2) to provide a prior expectation on the frequency of genotypes. '
        'The first level of the Zarr store should contain groups for seqids, each containing `POS` (position) and '
        '`AF` (allele frequencies). The shape of the AF array must be Nx4, where N is the size of the 1D POS array. '
        'The order of alleles *must* correspond to the coding in the input data. There is no requirement to have a '
        'similar shape to the input genotypes, although a minimum level of intersection is required!'
    )

    parser.add_argument('--seqid',
                        required=True,
                        nargs='+',
                        help='name of chromosome(s) or contig(s) to process. ')

    parser.add_argument('--output',
                        required=True,
                        help='path to output file stem')

    parser.add_argument('--downsample',
                        required=False,
                        default=20000,
                        help='number of sites to consider.',
                        type=int)

    parser.add_argument(
        '--minimum-af',
        required=False,
        default=0.05,
        help=
        'minimum minor allele frequency in reference population to consider. Sites with higher MAF are more '
        'powerful at detecting contamination',
        type=float)

    parser.add_argument('--sequence-error-rate',
                        required=False,
                        default=1e-3,
                        help='probability of observing a non REF/ALT base',
                        type=float)

    parser.add_argument(
        '--minimum-coverage',
        required=False,
        default=10,
        help=
        'minimum read depth to use. Low depths have low power to detect contamination',
        type=int)

    parser.add_argument('--plot', dest='plot', action='store_true')
    parser.add_argument('--no-plot', dest='plot', action='store_false')

    parser.add_argument('--log', dest='log', action='store_true')
    parser.add_argument('--no-log', dest='log', action='store_false')

    parser.set_defaults(plot=True, log=False)

    try:
        args = {
            "input": snakemake.input.input,
            "sites": snakemake.input.sites,
            "allele_frequencies": snakemake.input.allele_frequencies,
            "seqid": snakemake.params.seqid,
            "output": snakemake.params.stem,
            "minimum_af": snakemake.params.minimum_af,
            "minimum_coverage": snakemake.params.minimum_coverage,
            "sequence_error_rate": snakemake.params.seq_err_rate,
            "downsample": snakemake.params.downsample,
            "plot": snakemake.params.plot,
            "log": snakemake.params.log
        }
        log("Args read via snakemake")
    except NameError:
        args = vars(parser.parse_args())
        log("Args read via command line")

    seqids = args['seqid']
    sequence_error_rate = args['sequence_error_rate']
    downsample_n = args["downsample"]
    minimum_minor_af = args["minimum_af"]

    output_csv = args['output'] + ".contamination.csv"
    output_png = args['output'] + ".allele_balance.png"
    output_log = args["output"] + ".{alpha}.log"

    sample_store = zarr.ZipStore(args["input"], mode="r")
    sample_callset = zarr.Group(sample_store)

    sites = zarr.ZipStore(args["sites"], mode="r")
    variant_sites = zarr.Group(sites)
    sample = next(iter(sample_callset))

    concatenated_sample_callset, _ = concatenate_arrays(
        sample_callset[sample], seqids, paths=["calldata/GT", "calldata/AD"])

    gt = allel.GenotypeArray(concatenated_sample_callset["calldata/GT"])
    ad = concatenated_sample_callset["calldata/AD"]

    concatenated_sites, concatenated_site_shapes = concatenate_arrays(
        variant_sites, seqids, ["variants/POS"])
    pos = concatenated_sites["variants/POS"]
    assert pos.shape[0] == gt.shape[0] == ad.shape[
        0], "Shape inconsistency. {0}, {1}, {2}".format(
            pos.shape, gt.shape, ad.shape)

    # load allele frequencies required to compute weights
    allele_frequencies_z = zarr.open_group(args['allele_frequencies'], "r")
    concatenated_af_arrays, concatenated_af_shapes = concatenate_arrays(
        allele_frequencies_z, seqids, ["POS", "AF"])
    af_pos = concatenated_af_arrays["POS"]
    # This is a 2D array of the frequency of the ALT allele in some other dataset.
    af_val = concatenated_af_arrays["AF"]
    assert af_val.shape[
        1] == 4, "Allele frequencies must contain all 4 alleles, even if unobserved."

    # for the sample_gt: Keep if
    # a) in af, b) is_called and c) is_biallelic
    # step 1 find the intersection this works on multi indexes
    loc_gt, loc_af = locate_intersection(pos, concatenated_site_shapes, af_pos,
                                         concatenated_af_shapes)

    flt_af_val = np.compress(loc_af, af_val, axis=0)
    flt_gt = np.compress(loc_gt, gt, axis=0)
    flt_ad = np.compress(loc_gt, ad, axis=0)

    # now we need to filter both by is biallelic and is called.
    is_bial_ref_pop = np.count_nonzero(flt_af_val, axis=1) == 2
    is_called = flt_gt.is_called()[:, 0]

    # compress the intersection by the AND of these
    keep_loc = is_called & is_bial_ref_pop
    alt_frequency_pass = np.compress(keep_loc, flt_af_val, axis=0)
    allele_depth_pass = np.compress(keep_loc, flt_ad, axis=0)

    # recode the allele depth to 0/1.
    # find the "alt" column.
    log("Ordering alleles by frequency for REF/ALT/ERR")
    min_cov_reached = allele_depth_pass[:, 0].sum(
        axis=1) >= args['minimum_coverage']

    ix_cols_sort = np.argsort(alt_frequency_pass, axis=1)[:, ::-1]

    # indices of all rows
    ix_rows = np.arange(alt_frequency_pass.shape[0])

    # apply the sorting operation
    allele_depth_pass_reordered = np.squeeze(allele_depth_pass)[
        ix_rows[:, np.newaxis], ix_cols_sort]

    # Define allele counts: sum final 2 columns, representing ref/alt/error
    allele_depths = allele_depth_pass_reordered[:, :3]
    allele_depths[:,
                  2] = allele_depths[:, 2] + allele_depth_pass_reordered[:, 3]
    assert allele_depths.shape[1] == 3

    # issue with some samples having a third allele (ie not in phase 2) discovered at high frequency
    # Filter sites where more than 10% of reads look like errors.
    probably_biallelic = allele_depth_pass_reordered[:, 2] < (
        .1 * allele_depth_pass_reordered.sum(axis=1))

    # step 2 create the 0/1/2 from the allele frequencies.
    major_af = alt_frequency_pass.max(axis=1)

    # select the values with the highest MAF.
    log("Selecting variants on which to perform analysis")
    while True:
        eligible = probably_biallelic & min_cov_reached & (
            (1 - major_af) > minimum_minor_af)
        if eligible.sum() > downsample_n:
            break

        minimum_minor_af -= 0.01
        if minimum_minor_af < 0:
            log("Insufficient variants meet criteria to compute contamination. n={0}, min={1}"
                .format(eligible.sum(), downsample_n))
            break

    res = pd.DataFrame(index=[sample], columns=["LLR", "LL", "pc_contam"])

    if eligible.sum() > downsample_n:
        log("Downsample from {0} to {1}".format(eligible.sum(), downsample_n))

        ix_ds = np.sort(
            np.random.choice(np.where(eligible)[0], size=downsample_n))

        major_af = np.take(major_af, ix_ds, axis=0)
        allele_depths = np.take(allele_depths, ix_ds, axis=0)

        genotype_weights = np.log(determine_weights(major_af))

        log("estimating contamination...")
        xv = minimize_scalar(compute_likelihood,
                             args=(sequence_error_rate, allele_depths,
                                   genotype_weights, args["log"], output_log),
                             bounds=(0, 0.5),
                             method="Bounded",
                             options={"xatol": 1e-6})

        # compute the likelihood at alpha = 0, to report likelihood ratio.
        null = compute_likelihood(0.0, sequence_error_rate, allele_depths,
                                  genotype_weights, args["log"], output_log)

        # return the llr / ll / estimate
        res.loc[sample] = -min(xv.fun - null, 0), -xv.fun, xv.x * 100

        if args['plot']:
            plot_allele_balance(flt_gt, flt_ad, output_png, res.iloc[0])

    res.to_csv(output_csv)
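The helpers used above (concatenate_arrays, locate_intersection, determine_weights, compute_likelihood, plot_allele_balance, log) are defined elsewhere in the script and are not shown here. Purely to illustrate the genotype-prior step, a minimal determine_weights might look like the sketch below, assuming Hardy-Weinberg proportions given the major allele frequency; the actual implementation may differ:

import numpy as np

def determine_weights(major_af):
    # Sketch only: prior probabilities of the three genotype classes
    # (hom-major, het, hom-minor) under Hardy-Weinberg equilibrium,
    # given the major allele frequency p at each site.
    p = np.asarray(major_af, dtype=float)
    q = 1.0 - p
    return np.stack([p ** 2, 2.0 * p * q, q ** 2], axis=1)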
Example #8
import gzip
import pandas as pd
import numpy as np  # used by get_consecutive_true below
import os
import gcsfs
import zarr
from itertools import compress

gcs_bucket_fs = gcsfs.GCSFileSystem(project='malariagen-jupyterhub',
                                    token='anon',
                                    access='read_only')

geno_bi_path = os.path.join(
    "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic"
)
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_bi_path)
callset_biallel = zarr.Group(gcsacmap, read_only=True)
metadata = pd.read_csv("samples.meta.txt", sep="\t")
pop_selection = metadata.population.isin({
    'GHcol', 'GHgam', 'BFgam', 'BFcol', 'GM', 'GW', 'GNgam', 'GNcol', 'CIcol'
}).values
callset_fn = callset_biallel


def get_consecutive_true(a):
    if a.sum() == 0:
        return 0
    else:
        return np.diff(
            np.where(np.concatenate(
                ([a[0]], a[:-1] != a[1:], [True])))[0])[::2].max()
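get_consecutive_true returns the length of the longest run of consecutive True values in a boolean array. A small illustrative check, not part of the original example:

import numpy as np

a = np.array([True, True, False, True, True, True, False])
print(get_consecutive_true(a))  # -> 3, the longest run of True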
    def load_mask(self, seq_id, mask_id, filters_model="dt_20200416"):

        mask_path = self.release_dir / "site_filters" / filters_model / mask_id
        mask_store = self.gcs.get_mapper(mask_path.as_posix())
        mask_group = zarr.Group(mask_store)
        return da.from_zarr(mask_group[seq_id]["variants/filter_pass"])
def open_zarr_dataset(zarr_path):
    store = zarr.DirectoryStore(zarr_path)
    callset = zarr.Group(store=store, read_only=True)
    return callset
Example #11
    def load_filter_n(self, seq_id):

        path = self.release_dir / "accessibility" / "accessibility.zarr"
        storez = self.gcs.get_mapper(path.as_posix())
        zarrdata = zarr.Group(storez)
        return da.from_zarr(zarrdata[f"{seq_id}/filter_n"])
Example #12
def main():

    parser = argparse.ArgumentParser(
        description=
        'Given a zipped Zarr store, summarize information about the coverage/calls'
    )

    parser.add_argument(
        '--input',
        required=True,
        help='path to Zarr store containing calldata for a single sample')

    parser.add_argument('--seqid',
                        required=False,
                        help='name of chromosomes or contigs to process',
                        nargs="+",
                        default=["2R", "2L", "3R", "3L", "X"],
                        type=str)

    parser.add_argument(
        '--output',
        required=True,
        help='path to output basename for stats table and coverage histogram')

    try:
        args = {
            "input": snakemake.input.input,
            "seqid": snakemake.params.seqid,
            "output": snakemake.params.stem
        }
        log("Args read via snakemake")
    except NameError:
        args = vars(parser.parse_args())
        log("Args read via command line")

    zfn = args['input']
    store = zarr.ZipStore(zfn, mode="r")
    callset = zarr.Group(store)

    csv_out = args['output'] + ".callstats.csv"
    npy_out = args['output'] + ".covhist.npz"

    # Holders for data
    df_cols = [
        "nSitesCalled", "nHomRef", "nHet", "nHetRef", "nHomAlt",
        "nNonRefAlleles"
    ]
    ad_df = pd.DataFrame(index=args['seqid'], columns=df_cols)

    cov_hist = dict()

    sample = next(iter(callset))

    for chrom in args['seqid']:

        gt = allel.GenotypeArray(callset[sample][chrom]["calldata/GT"])

        is_called = np.squeeze(gt.is_called())

        # if no coverage at all
        if is_called.sum() == 0:
            ad_df.loc[chrom] = 0
            cov_hist[chrom] = np.array([is_called.shape[0]])
            continue

        all_allele_depths = callset[sample][chrom]["calldata/AD"][:, 0]

        # To calculate DP at each position/sample we need to filter missing genotypes.
        allele_depths_with_cov = np.compress(is_called,
                                             all_allele_depths,
                                             axis=0)

        # Histogram of coverage
        sum_by_alt = allele_depths_with_cov.sum(axis=1)
        bc = np.bincount(sum_by_alt, minlength=251)
        bc[0] = (~is_called).sum()
        cov_hist[chrom] = bc

        nHomRef = gt.count_hom_ref()
        nHet = gt.count_het()
        nHetRef = gt.count_het(allele=0)
        nHomAlt = gt.count_hom_alt()

        nNonRefAlleles = (2 * nHomAlt) + nHetRef + (2 * (nHet - nHetRef))
        total_calls = is_called.sum()

        ad_df.loc[chrom] = [
            total_calls, nHomRef, nHet, nHetRef, nHomAlt, nNonRefAlleles
        ]

    ad_df.to_csv(csv_out)
    np.savez(npy_out, **cov_hist)
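As a small sketch of how the two outputs written above might be consumed downstream (the "sample" stem is a placeholder; the script writes whatever stem is passed via --output):

import numpy as np
import pandas as pd

stats = pd.read_csv("sample.callstats.csv", index_col=0)  # per-chromosome call stats
hist = np.load("sample.covhist.npz")                      # per-chromosome coverage histograms

for chrom in hist.files:
    counts = hist[chrom]
    depth = np.arange(counts.size)
    mean_cov = (depth * counts).sum() / counts.sum()
    print(chrom, "mean coverage:", round(mean_cov, 2))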