Example #1
File: core.py Project: niemasd/pixy
def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2,
                              sites_list_chunk):

    # a string representation of the target region of the current window
    window_region = chromosome + ":" + str(window_pos_1) + "-" + str(
        window_pos_2)

    # read in data from the source VCF for the current window
    callset = allel.read_vcf(args.vcf,
                             region=window_region,
                             fields=[
                                 'CHROM', 'POS', 'calldata/GT',
                                 'variants/is_snp', 'variants/numalt'
                             ])

    # keep track of whether the callset was empty (no sites for this range in the VCF)
    # used by compute_summary_stats to add info about completely missing sites
    if callset is None:
        callset_is_none = True
        gt_array = None
        pos_array = None

    else:
        # if the callset is NOT empty (None), continue with pipeline
        callset_is_none = False

        # convert to a genotype array object
        gt_array = allel.GenotypeArray(
            allel.GenotypeDaskArray(callset['calldata/GT']))

        # build an array of positions for the region
        pos_array = allel.SortedIndex(callset['variants/POS'])

        # create a mask for biallelic snps and invariant sites
        snp_invar_mask = np.logical_or(
            np.logical_and(callset['variants/is_snp'][:] == 1,
                           callset['variants/numalt'][:] == 1),
            callset['variants/numalt'][:] == 0)

        # remove rows that are NOT snps or invariant sites from the genotype array
        gt_array = np.delete(gt_array,
                             np.where(np.invert(snp_invar_mask)),
                             axis=0)
        gt_array = allel.GenotypeArray(gt_array)

        # select rows that ARE snps or invariant sites in the position array
        pos_array = pos_array[snp_invar_mask]

        # if a list of target sites was specified, mask out all non-target sites
        if sites_list_chunk is not None:
            gt_array = mask_non_target_sites(gt_array, pos_array,
                                             sites_list_chunk)

        # extra 'none' check to catch cases where every site was removed by the mask
        if len(gt_array) == 0:
            callset_is_none = True
            gt_array = None
            pos_array = None

    return callset_is_none, gt_array, pos_array
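A minimal usage sketch for the function above, assuming the VCF is bgzipped and tabix-indexed so that region queries work; the path, chromosome name, and window bounds are hypothetical:

from types import SimpleNamespace

args = SimpleNamespace(vcf="data/example.vcf.gz")  # hypothetical path
empty, gt_array, pos_array = read_and_filter_genotypes(
    args, "chr1", 1, 10000, sites_list_chunk=None)
if not empty:
    print(gt_array.n_variants, "usable sites in this window")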
Example #2
def get_genotype_array_concat(callsets,
                              genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    if len(callsets) == 1:
        # Only one callset provided. No need for concatenation
        callset = callsets[0]
        return get_genotype_array(callset=callset,
                                  genotype_array_type=genotype_array_type)

    gt_list = []

    # Get genotype data for each callset
    for callset in callsets:
        gt = get_callset_genotype_data(callset)
        if genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Encapsulate underlying zarr array with a chunked dask array
            gt = da.from_array(gt, chunks=gt.chunks)
        gt_list.append(gt)

    if genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        combined_gt = da.concatenate(gt_list, axis=0)
        combined_gt = allel.GenotypeDaskArray(combined_gt)
    elif genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        combined_gt = allel.GenotypeChunkedArray(
            np.concatenate(gt_list, axis=0))
    elif genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        combined_gt = allel.GenotypeArray(np.concatenate(gt_list, axis=0))
    else:
        raise ValueError(
            'Error: Invalid option specified for genotype_array_type.')

    return combined_gt
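A hedged usage sketch: concatenating per-chromosome callsets along the variants axis. The zarr paths are hypothetical; config and get_callset_genotype_data are the helpers referenced in the code above:

import zarr

callsets = [zarr.open_group("chr1.zarr", mode="r"),  # hypothetical paths
            zarr.open_group("chr2.zarr", mode="r")]
combined_gt = get_genotype_array_concat(
    callsets, genotype_array_type=config.GENOTYPE_ARRAY_DASK)
print(combined_gt.shape)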
Example #3
def get_singletons(zarr_folder, chrom, samples, start=-9, stop=-9):
    callset = zarr.open_group(zarr_folder, mode='r')

    pos = callset[chrom]['variants']['POS']
    ref = callset[chrom]['variants']['REF']
    alt = callset[chrom]['variants']['ALT']
    ids = callset[chrom]['variants']['ID']

    gt = allel.GenotypeDaskArray(
        callset[str(chrom)]['calldata']['GT'])  # Retrieve genotype data
    gt = gt.take(samples,
                 axis=1).compute()  # subset data to samples of interest

    ac = gt.count_alleles()

    if start == -9: start = min(pos)
    if stop == -9: stop = max(pos)

    flt = ac.is_singleton(1)
    pos2 = pos.get_mask_selection(flt)
    gf = gt.compress(flt, axis=0)
    sing_dict = {p: i for p, i in zip(pos2, np.where(gf.is_het())[1])}
    ind_dict = {}
    for key, value in sing_dict.items():
        if value in ind_dict:
            ind_dict[value].append(key)
        else:
            ind_dict[value] = [key]

    return ind_dict, gt, ids, ref, alt, pos, start, stop
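A usage sketch mapping each sample index to the positions of its singletons; the zarr path, chromosome key, and sample indices are hypothetical:

ind_dict, gt, ids, ref, alt, pos, start, stop = get_singletons(
    "calls.zarr", chrom="2L", samples=[0, 1, 2])
for sample_idx, singleton_positions in ind_dict.items():
    print(sample_idx, len(singleton_positions))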
Example #4
def __main__():

    parser = arg.ArgumentParser()
    parser.add_argument('--chr', dest='chrom')
    args = parser.parse_args()

    # read in extra data
    bed = pd.read_csv(
        '/psych/ripke/vasa/reference_data/ldetect-data/EUR/fourier_ls-chr{}.bed'
        .format(args.chrom),
        sep=r'\s+')
    eur_samples = pd.read_csv(
        '/psych/ripke/1000Genomes_reference/1KG_Oct14/1000GP_Phase3_sr_0517d/integrated_call_samples_v3.20130502.ALL.panel.fam.EUR',
        sep='\t',
        names=['fid', 'iid', 'mid', 'pid', 'sex', 'pheno'],
        header=None)

    # read in genotype data
    zarr_path = '/psych/ripke/vasa/reference_data/1000G/loc.ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.zarr'.format(
        args.chrom)
    callset = zarr.open_group(zarr_path, mode='r')
    pos = ska.SortedIndex(callset['variants/POS'])

    callset_samples = list(callset['samples'][:])
    eur_samples['callset_index'] = [
        callset_samples.index(s) for s in eur_samples['iid']
    ]

    gt = callset['calldata/GT']
    gt_da = ska.GenotypeDaskArray(gt)

    print('Subsetting to europeans')
    eur_da = gt_da.take(eur_samples['callset_index'].values, axis=1)
    eur_ac = eur_da.count_alleles()

    print('Filtering european singletons and invariants')
    flt = (eur_ac.max_allele() == 1) & (eur_ac[:, :2].min(axis=1) > 1)
    flt_mask = flt.compute()
    flt_da = eur_da.compress(flt_mask, axis=0).compute()

    # update variant index
    pos = pos[flt_mask]


    print('Counting region window sizes: ')
    bed['num_variants'] = np.nan
    for i, region in bed.iterrows():
        print('\t{} of {}'.format(i, bed.shape[0]))
        loc_region = pos.locate_range(region['start'], region['stop'])
        bed.loc[i, ['num_variants']] = flt_da[loc_region, :, :].n_variants

    bed.to_csv('data/1000G_eur_chr{}_region_variant_counts.tsv'.format(
        args.chrom),
               sep='\t')
Example #5
def get_genotype_array(callset,
                       genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    gtz = get_callset_genotype_data(callset)

    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        return allel.GenotypeArray(gtz)
    elif genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        return allel.GenotypeDaskArray(gtz)
    elif genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        return allel.GenotypeChunkedArray(gtz)
    else:
        return None
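A minimal usage sketch (the zarr path is hypothetical). The dask variant is lazy and cheap to construct; calling .compute() materialises an in-memory GenotypeArray:

import zarr

callset = zarr.open_group("calls.zarr", mode="r")  # hypothetical path
gt_dask = get_genotype_array(callset,
                             genotype_array_type=config.GENOTYPE_ARRAY_DASK)
if gt_dask is not None:
    gt = gt_dask.compute()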
Example #6
def get_biallelic(zarr_folder, chrom, samples):
    callset = zarr.open_group(zarr_folder, mode='r')

    gt = allel.GenotypeDaskArray(
        callset[str(chrom)]['calldata']['GT'])  # Retrieve genotype data
    gt = gt.take(samples,
                 axis=1).compute()  # subset data to samples of interest

    ac = gt.count_alleles()
    sites = ac.is_biallelic_01()[:]

    return sites
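A sketch of how the returned boolean mask might be used; the zarr path and sample indices are hypothetical:

sites = get_biallelic("calls.zarr", chrom="2L", samples=[0, 1, 2])
print(int(sites.sum()), "biallelic 0/1 sites out of", sites.shape[0])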
Example #7
def load_arrays_noncoding_and_centromeres(local_path,
                                          _set,
                                          chrom,
                                          coding_reg_df,
                                          sitefilter='gamb_colu',
                                          filter_centro=True):
    """
    This function reads and filters a genotyping array to the noncoding,
    noncentromeric regions, and applies a site filter depending on whether
    the samples are arabiensis (arab) or gambiae/coluzzii (gamb_colu).
    """
    Ag_array = zarr.open_array(
        f"{local_path}/snp_genotypes/all/{_set}/{chrom}/calldata/GT/",
        mode='r')
    filters = zarr.open(
        f"{local_path}/site_filters/dt_20200416/{sitefilter}/{chrom}/variants/filter_pass",
        mode="r")
    positions = zarr.open_array(
        f"{local_path}/snp_genotypes/all/sites/{chrom}/variants/POS/",
        mode='r')
    positions = positions[:][filters[:]]
    geno = allel.GenotypeDaskArray(Ag_array)
    geno = geno[filters[:]]

    if filter_centro is True:
        if chrom == '2L':
            centromere = (positions > 3000000)
        elif chrom == '2R':
            centromere = (positions < 57000000)
        elif chrom == '3L':
            centromere = (positions > 2000000)
        elif chrom == '3R':
            centromere = (positions < 50000000)
        elif chrom == 'X':
            centromere = (positions < 21000000)

        positions = allel.SortedIndex(positions[centromere])
    else:
        positions = allel.SortedIndex(positions)

    #get boolean array for positions that are coding - allel.locate_ranges is fast!
    coding = positions.locate_ranges(coding_reg_df.start,
                                     coding_reg_df.end,
                                     strict=False)
    #compress to get noncoding SNPs and remove centromeric regions of low recombination
    #centromere bounds are currently chosen by eye from fig. 1 of the ag1000g phase 1 paper

    if filter_centro is True:
        geno = geno.compress(centromere, axis=0)
    # we want noncoding regions, so '~' inverts the boolean mask
    geno = geno.compress(~coding, axis=0)
    positions = positions[~coding]

    return (geno, positions)
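A hedged usage sketch: coding_reg_df only needs numeric start and end columns; the local path, sample-set name, and coordinates below are hypothetical:

import pandas as pd

coding_reg_df = pd.DataFrame({"start": [100000, 500000],
                              "end": [120000, 530000]})
geno, positions = load_arrays_noncoding_and_centromeres(
    "/path/to/ag1000g", "AG1000G-X", "2L", coding_reg_df,  # hypothetical
    sitefilter="gamb_colu", filter_centro=True)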
Example #8
def gtstats(calls, pop2color, n_variants):
    """
    """
    gtd = allel.GenotypeDaskArray(calls['calldata/GT'])
    pc_missing = gtd.count_missing(axis=0)[:].compute()  # per sample
    miss = gtd.count_missing(axis=1)[:].compute()
    pc_het = gtd.count_het(axis=0)[:].compute()  # per sample
    dep = calls['calldata/DP']
    dp = np.mean(dep[:, :], axis=0)
    ap.plotstats(pc_het / n_variants, 'Heterozygous', pop2color)
    ap.plotstats(pc_missing / n_variants, 'Missing', pop2color)
    ap.plotstats(dp, 'Depth', pop2color)
    return (miss)
Example #9
def get_ACdata(zarr_folder, chrom, samples, start=-9, stop=-9):
    callset = zarr.open_group(zarr_folder, mode='r')

    pos = callset[chrom]['variants']['POS']

    gt = allel.GenotypeDaskArray(
        callset[str(chrom)]['calldata']['GT'])  # Retrieve genotype data
    gt = gt.take(samples,
                 axis=1).compute()  # subset data to samples of interest

    ac = gt.count_alleles()

    if start == -9: start = min(pos)
    if stop == -9: stop = max(pos)

    return ac, pos, start, stop
Example #10
def get_genotype_data(callset):
    genotype_ref_name = ''

    # Ensure 'calldata' is within the callset
    if 'calldata' in callset:
        # Try to find either GT or genotype in calldata
        if 'GT' in callset['calldata']:
            genotype_ref_name = 'GT'
        elif 'genotype' in callset['calldata']:
            genotype_ref_name = 'genotype'
        else:
            return None
    else:
        return None

    gtz = callset['calldata'][genotype_ref_name]

    return allel.GenotypeDaskArray(gtz)
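A short sketch (the zarr path is hypothetical); the helper returns None when the callset has neither a GT nor a genotype field, so callers should check before use:

import zarr

callset = zarr.open_group("calls.zarr", mode="r")  # hypothetical path
gt = get_genotype_data(callset)
if gt is not None:
    print(gt.shape)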
Example #11
    def test_ld(self):
        ''' unit test for ldshrink '''

        input_hdf = "/home/nwknoblauch/Dropbox/Repos/LD_dask/test_data/reference_genotype.h5"
        callset = h5.File(input_hdf, mode='r')
        ref_geno = allel.GenotypeDaskArray(callset['calldata/GT'])
        vt = allel.VariantChunkedTable(callset['variants'])
        map_data = vt['MAP']
        geno_ac = ref_geno.to_n_alt().T.compute()
        m = 85
        Ne = 11490.672741
        cutoff = 0.001
        test_R_file = "test_data/reference_ld.txt"
        sub_X = geno_ac[:, :4]
        sub_map = map_data[:4]
        est_r = lddask.ld.ldshrink(sub_X, sub_map, m, Ne, cutoff)
        true_r = np.loadtxt(test_R_file, delimiter="\t")
        sub_true_r = true_r[:4, :4]
        assert np.allclose(sub_true_r, est_r)
Example #12
    def load_calldata_by_sampleset(self,
                                   seq_id,
                                   sampleset,
                                   field="GT",
                                   mask=None):

        if isinstance(sampleset, str):

            path = self.release_dir / "snp_genotypes" / "all" / sampleset
            print(path)

            # need to open as mapping if this on cloud
            storez = self.gcs.get_mapper(path.as_posix())
            calldata = zarr.Group(storez)

            arr = da.from_zarr(calldata[f"{seq_id}/calldata/{field}"])

        elif isinstance(sampleset, list):
            arr = da.concatenate([
                self.load_calldata_by_sampleset(
                    seq_id, s, field=field, mask=None) for s in sampleset
            ],
                                 axis=1)
        else:
            raise ValueError(
                "sampleset must be a string, or a list of strings")

        if mask is not None:

            assert isinstance(mask, da.core.Array), "mask must be a dask_array"

            arr = da.compress(mask, arr, axis=0).compute_chunk_sizes()

        if field == "GT":
            arr = allel.GenotypeDaskArray(arr)

        return arr
Example #13
    def collect_metrics(self, dataset, metadataObj, indels_flag):
        pos = dataset[self.seq_id]['variants']['POS'][:]  # numpy.ndarray
        gts = allel.GenotypeDaskArray(
            dataset[self.seq_id]['calldata']
            ['GT'])  # allel.model.dask.GenotypeDaskArray
        acs = gts.count_alleles(
            max_allele=3).compute()  # allel.model.ndarray.AlleleCountsArray
        is_snp = np.array(
            dataset[self.seq_id]['variants']['is_snp']
        )  # zarr.core.array ; len(REF) == 1 && len(ALT) == 1 (excludes SNP+other)
        is_var = acs.is_variant()  # numpy.ndarray; AlternateCall >= 1
        # Missingness of SNP/non-SNP variants/invariants
        self.missingness_counter_by_label = get_missingness_counter_by_label(
            gts, is_var, is_snp)
        # snps numalts
        numalts = dataset[self.seq_id]['variants']['numalt'][:]
        self.snp_numalt_counter = get_snp_numalt_counter(numalts, is_snp)
        # variant_type_counter_by_idx
        if indels_flag and np.any(~is_snp & is_var):
            reflen = np.frompyfunc(len, 1, 1)(
                dataset[self.seq_id]['variants']['REF'][:])
            self.variant_type_counter_by_idx = get_variant_counter_by_idx(
                reflen, pos, is_snp)
        # sample_counts_by_gt
        self.sample_counts_by_gt = get_sample_counts_by_gt(
            gts, metadataObj, is_snp)
        # sample_snp_dp_counter_by_sample_id
        # https://matplotlib.org/3.1.0/gallery/statistics/multiple_histograms_side_by_side.html#sphx-glr-gallery-statistics-multiple-histograms-side-by-side-py
        sample_snp_dps = dataset[self.seq_id]['calldata']['DP'][:][is_snp]
        self.sample_snp_dp_counter_by_sample_id = get_sample_snp_dp_counter_by_sample_id(
            sample_snp_dps, metadataObj)
        # SNP density over windows
        counts, windows = allel.windowed_count(
            pos[:][is_snp], size=WINDOWSIZE, start=1,
            stop=self.seq_length)  # seq_length
        self.snp_densities = np.round(counts / WINDOWSIZE, 4)
        # biallelic SNPs
        is_biallelic = acs.is_biallelic()[:]
        is_biallelic_snp = (is_biallelic & is_snp)
        self.biallelic_snp_singletons_count = np.count_nonzero(
            (acs.max_allele() == 1) & acs.is_singleton(1))
        # TS/TV
        biallelic_snps_REF = np.array(
            dataset[self.seq_id]['variants']['REF'][:][is_biallelic_snp],
            dtype="|S2")
        biallelic_snps_ALT = np.array(
            dataset[self.seq_id]['variants']['ALT'][:, 0][is_biallelic_snp],
            dtype="|S2")

        biallelic_snps_DP = dataset[
            self.seq_id]['variants']['DP'][:][is_biallelic_snp]
        biallelic_snps_QUAL = dataset[
            self.seq_id]['variants']['QUAL'][:][is_biallelic_snp]
        biallelic_snps_acs = acs[is_biallelic_snp]
        self.biallelic_snps_mutations = np.char.add(biallelic_snps_REF,
                                                    biallelic_snps_ALT)
        self.biallelic_snps_QUAL_DPs = biallelic_snps_QUAL / biallelic_snps_DP
        self.biallelic_snps_count = len(biallelic_snps_acs)
        biallelic_snps_gts = gts[is_biallelic_snp].compute()
        biallelic_snps_allelecounts_subpops = biallelic_snps_gts.count_alleles_subpops(
            metadataObj.sample_ids_by_pop_id, max_allele=1
        )  # max_allele=1, otherwise error in allel.stats.sf._check_ac_n()
        self.biallelic_snps_seg_count_by_pop_id = {
            pop_id: biallelic_snps_allelecounts.count_segregating()
            for pop_id, biallelic_snps_allelecounts in
            biallelic_snps_allelecounts_subpops.items()
        }
        is_segregating_biallelic_snp = biallelic_snps_allelecounts_subpops[
            'all'].is_segregating()[:]
        self.segregating_biallelic_snp_acs_by_pop_id = {
            pop_id: biallelic_snps_allelecounts_subpops[pop_id]
            [is_segregating_biallelic_snp]
            for pop_id in metadataObj.pop_ids_order
        }
Example #14
         ac,
         ac2,
         size=winsize,
         start=start,
         stop=stop,
         step=int(winsize / 2))
     new_dat = format_results(stat=Fst_Pat,
                              stat_name="Fst_Pat",
                              chrom=chrom,
                              windows=windows,
                              nvar=counts,
                              pop=pop)
     df_list.append(new_dat)
 if 'r2' in args.s:
     ct = allel.GenotypeDaskArray(
         callset[str(chrom)]['calldata']['GT'])
     ct = ct.take(loc_samples, axis=1).compute()
     ct = ct.compress(biallelic, axis=0)
     ct = ct.to_n_alt(fill=-1)
     r2, windows, counts = allel.windowed_r_squared(
         pos,
         ct,
         size=winsize,
         start=start,
         stop=stop,
         step=int(winsize / 2),
         fill=-9)
     new_dat = format_results(stat=r2,
                              stat_name="r2",
                              chrom=chrom,
                              windows=windows,
Example #15
mapping = polarize_map(callset, ancestral_sequence)

# calculating mutation rate in windows - this should probably be refactored into a function, but starting inline for now
gt_outgroup = gt.compress(callables*biallelic*ancestral, axis = 0).take(outgroup, axis = 1)
ps_outgroup = allel.SortedIndex(callset['variants/POS']).compress(callables*biallelic*ancestral)
gt_outgroup_allele_count = gt_outgroup.count_alleles()
polymorphic_loci = (gt_outgroup_allele_count[:, 0] != 0)*(gt_outgroup_allele_count[:, 1] != 0)
mutrate(ps_outgroup, polymorphic_loci, chrom_number)


# boolean numpy array encoding whether each position in the genome (after
# filtering and polarizing) is variant in the outgroup individuals,
# i.e. carries more than 0 alleles of type "1" (here, the derived allele)
variant_loci_outgroup = (allel.GenotypeDaskArray(callset['/calldata/GT'])
                         .map_alleles(mapping)
                         .compress(callables * biallelic * ancestral, axis=0)
                         .take(outgroup, axis=1)
                         .count_alleles()
                         .is_variant()
                         .compute())

# genotype array, polarized and with sites filtered, for the ingroup individuals
gt_ingroup = (allel.GenotypeDaskArray(callset['/calldata/GT'])
              .map_alleles(mapping)
              .compress(callables * biallelic * ancestral, axis=0)
              .take(ingroup, axis=1)
              .compute())


#Write the observation file per ind
obs(ps, gt_ingroup, variant_loci_outgroup, chrom_number, ingroup_names, dir_name)
Example #16
samples_HGDP = list(callset_HGDP["{}/samples".format(chrom)][:])
samples_1KGP = list(callset_1KGP["{}/samples".format(chrom)][:])

#outgroup and ingroup individuals index
outgroup_index_HGDP = get_outgroup_index_HGDP(samples_HGDP)
outgroup_index_1KGP = get_outgroup_index_1KGP(samples_1KGP)
ingroup_index = get_ingroup_index(samples_HGDP, ingroup_names)

#mapping array to polarize SNPS
mapping = polarize_map(callset_HGDP, chrom)

#boolean numpy array encoding whether a position in the genome (after filtering and polarizing) is variant
#in the outgroup individuals (more than 0 alleles of type "1", in this case, derived)
variant_loci_outgroup_HGDP = (allel.GenotypeDaskArray(callset_HGDP[
    '{}/calldata/GT'.format(chrom)]).map_alleles(mapping).compress(
        callables * biallelic * ancestral,
        axis=0).take(outgroup_index_HGDP,
                     axis=1).count_alleles().is_variant().compute())

#boolean numpy arrays encoding, for each position in the 1KGP VCF (and in the HGDP data), whether it occurs in both datasets
intersect_loci_1KGP, intersect_loci_HGDP = allel.SortedIndex(
    callset_1KGP["{}/variants/POS".format(chrom)]).locate_intersection(ps)

#boolean numpy array encoding whether each intersecting position is variant in the 1KGP outgroup individuals
variant_loci_outgroup_1KGP = (allel.GenotypeDaskArray(
    callset_1KGP['{}/calldata/GT'.format(chrom)]).compress(
        intersect_loci_1KGP,
        axis=0).take(outgroup_index_1KGP,
                     axis=1).count_alleles().is_variant().compute())

variant_loci_outgroup_HGDP[intersect_loci_HGDP] += variant_loci_outgroup_1KGP
Example #17
species = samples_df.groupby('sp_sex').indices
species_ix = {k: list(v) for k, v in species.items()}
unknown_subpops = ['GM_F', 'GW_F', 'KE_F']
main_species = ['An. coluzzii_F', 'An. gambiae_F']

def compute_divergence(allele_freqs):
	sum_alt = allele_freqs.sum(axis=0)
	return (sum_alt[1:].sum())
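# Worked example: assuming allele_freqs has one row per variant and one
# column per allele, e.g. [[0.9, 0.1], [0.5, 0.5]], sum_alt is [1.4, 0.6]
# and the function returns 0.6, the summed non-reference frequency mass.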

window_size = 100000
vref_dxy_by_window = dict()
xpop_dxy_by_window = dict()

for chrom in chroms:
	print('\nChromosome ' + chrom)
	gt = allel.GenotypeDaskArray(phase2_ar1.callset_pass[chrom]['calldata/genotype'])
	ac = gt.count_alleles_subpops(subpops_ix)
	ac_species = gt.count_alleles_subpops(species_ix)

	pos = allel.SortedIndex(phase2_ar1.callset_pass[chrom]['variants/POS'])

	accessibility = phase2_ar1.accessibility[chrom]['is_accessible']
	eqa = allel.equally_accessible_windows(accessibility, window_size)

	# Use the middle of the window as the index
	window_middle = np.sum(eqa, axis=1)/2
	vref_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int), columns=list(subpops.keys()) + list(species.keys()))
	xpop_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int))

	# Calculate distance from the reference in each sub-population
	for pop in subpops.keys():
Example #18
def LoadRegion(
    callset,
    meta,  # minimally a dataframe with sample names as index
    region,
    min_FMTDP=0,
    filter_snp=False,
    filter_biallelic=False,
    max_missing_proportion=None,
    group_col=None,  # column name in meta used to identify groups for group_max_missing_proportion
    group_max_missing_proportion=None):
    # NOTE: returned meta should be in the same order and same length as the returned genotype array
    # determine the index in the full callset for all the samples in meta
    callset_all_sample_ids = list(list(callset.values())[0]['samples'])
    meta['callset_idx'] = [callset_all_sample_ids.index(x) for x in meta.index]
    meta['idx'] = np.arange(meta.shape[0])  # index in the genotype array

    ch, start, stop = str2range(region)
    print("Region:", region, '->', (ch, start, stop))

    pos = allel.SortedIndex(callset[ch]['variants/POS'])
    if pos.shape[0] == 0:  # return empty if nothing for chrom
        return [], [], meta
    # create the slice
    try:
        sl = pos.locate_range(start, stop)
        pos = pos[sl]
    except KeyError:
        pos = []
    if len(pos) == 0:  # no loci in slice
        return [], [], meta

    # load combined set of both groups
    sample_idxs = meta['callset_idx'].values
    g = allel.GenotypeDaskArray(callset[ch]['calldata/GT'])[sl].take(
        sample_idxs, axis=1)
    g = g.compute()  # need to convert GenotypeDaskArray to GenotypeArray

    ## Filtering
    num_loci_in = g.shape[0]
    flt = np.ones(num_loci_in, dtype=bool)
    ac = None
    print('total number of loci =', flt.shape[0])

    # filter genotypes on FMT:DP
    if min_FMTDP > 0:
        genoflt_FMTDP = callset[ch]['calldata/DP'][sl].take(sample_idxs,
                                                            axis=1) < min_FMTDP
        g[genoflt_FMTDP] = [-1, -1]
        tmp_num_calls = g.shape[0] * g.shape[1]
        tmp = np.count_nonzero(genoflt_FMTDP)
        print('{} genotype calls of {} ({:02.2f}%) fail FMT:DP filter'.format(
            tmp, tmp_num_calls, 100 * tmp / float(tmp_num_calls)))

    if filter_snp:
        flt_snp = np.all(np.logical_or(
            callset[ch]['variants/TYPE'][sl] == 'snp',
            callset[ch]['variants/TYPE'][sl] == ''),
                         axis=1)
        flt = flt & flt_snp
        print('=', np.count_nonzero(flt), 'passing previous filters & SNP')

    if filter_biallelic:
        if ac is None:
            ac = g.count_alleles()
        flt_biallelic = ac.allelism() == 2
        flt = flt & flt_biallelic
        print('=', np.count_nonzero(flt),
              'passing previous filters & biallelic')

    # filter max_missing (genotype calls)
    if max_missing_proportion is not None:
        max_missing = int(np.floor(g.shape[1] * max_missing_proportion))
        flt_max_missing = g.is_missing().sum(axis=1) <= max_missing
        tmp = np.count_nonzero(flt_max_missing)
        print("max missing proportion {} of {} is {}".format(
            max_missing_proportion, g.shape[1], max_missing))
        print("max missing passing loci = {} ({:2.2f}%)".format(
            tmp, 100 * tmp / flt_max_missing.shape[0]))
        flt = flt & flt_max_missing
        print('=', np.count_nonzero(flt),
              'passing previous filters & max_missing')

    if group_max_missing_proportion is not None and group_col is not None:
        gmmflt = np.ones(g.shape[0], dtype=bool)
        for grp in meta[group_col].unique():
            grp_meta = meta[meta[group_col] == grp]
            max_missing = int(
                np.floor(grp_meta.shape[0] * group_max_missing_proportion))
            print("### Group max missing filter:", grp)
            print(grp_meta['idx'])
            print("N =", grp_meta.shape[0])
            print("max missing =", max_missing)
            print("loci in =", flt.shape[0])
            f = g[:, grp_meta['idx']].is_missing().sum(axis=1) <= max_missing
            tmp = np.count_nonzero(f)
            print("passing loci = {} ({:2.2f}%)".format(
                tmp, 100 * tmp / f.shape[0]))
            gmmflt = gmmflt & f
        tmp = np.count_nonzero(gmmflt)
        print("passing all max missing filters {:d} of {:d} ({:.2f}%)".format(
            tmp, gmmflt.shape[0], 100 * tmp / gmmflt.shape[0]))
        flt = flt & gmmflt
        print('=', np.count_nonzero(flt),
              'passing previous filters & max_missing')

    # apply combined filter
    tmp = np.count_nonzero(flt)
    print("Passing all all filters {:d} of {:d} ({:.2f}%)".format(
        tmp, flt.shape[0], 100 * tmp / flt.shape[0]))
    return g.compress(flt, axis=0), pos.compress(flt, axis=0), meta
Example #19
    x = allele_depth.compute()
    n_disc = x[np.arange(0, x.shape[0], dtype=int), index]
    ad = x.sum(axis=1)
    return ad - n_disc, ad


chrom = snakemake.wildcards.chrom

phase2_callset_pass = zarr.open_group(snakemake.input.phase2_callset, mode="r")
pass_pos = allel.SortedIndex(phase2_callset_pass[chrom]["variants/POS"])

x_callset = h5py.File(snakemake.input.cross_callset, mode="r")
xdf = pd.read_table(snakemake.input.metadata, index_col=0)

x_pos = allel.SortedIndex(x_callset[chrom]['variants/POS'])
x_gt = allel.GenotypeDaskArray(x_callset[chrom]['calldata/genotype'])

x_ad = x_callset[chrom]['calldata/AD']
x_ad = da.from_array(x_ad, chunks=x_ad.chunks)

call_class = ("HOMREF", "HET", "HOMALT")
columns = pd.MultiIndex.from_product(
    (("ALL", "PASS"), call_class, ("SUCCESS", "N")))

# take sample names from the hdf5 file
sample_list = x_callset[chrom]["samples"][:].astype("U8").tolist()

# Drop samples that were not included in sequencing.
xdf = xdf.set_index("ox_code").reindex(sample_list).reset_index()
xdf = xdf.loc[xdf.cross.notna()]
xids = xdf.cross.unique()
Example #20
#Metadata for the samples present in the zarr data structure - Kasper has removed some of the samples.
samples_list = list(callset['chr1/samples'][:])
meta_data_samples = meta_data.loc[meta_data.PGDP_ID.isin(samples_list)].copy()
samples_callset_index = [
    samples_list.index(s) for s in meta_data_samples.PGDP_ID
]
meta_data_samples['callset_index'] = samples_callset_index


def het_counting(gt):
    return gt.count_het()


gt_zarr = callset["{}/calldata/GT".format(chrom)]
pos = callset["{}/variants/POS".format(chrom)]
gt = allel.GenotypeDaskArray(gt_zarr)
df_list = []
for i, row in meta_data_samples.iterrows():
    df = pd.DataFrame()
    individual = (gt.take([row.callset_index], axis=1))
    nnz, windows, counts = allel.windowed_statistic(pos,
                                                    individual,
                                                    statistic=het_counting,
                                                    size=window_size)
    df["het"] = nnz
    if i % 10 == 0:
        print(i)
    window_numbering = []
    df.insert(0, column="chr", value=chrom)
    window_numbering.extend(range(len(nnz)))
    df.insert(1, column="window", value=window_numbering)
Example #21
def main(args=None):

    if args is None:
        args = sys.argv[1:]

    # the ascii help image
    help_image = "█▀▀█ ░▀░ █░█ █░░█\n" "█░░█ ▀█▀ ▄▀▄ █▄▄█\n" "█▀▀▀ ▀▀▀ ▀░▀ ▄▄▄█\n"

    help_text = 'pixy: sensible estimates of pi and dxy from a VCF'
    version_text = 'version 0.95.0'

    # initialize arguments
    parser = argparse.ArgumentParser(
        description=help_image + help_text + '\n' + version_text,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version=version_text)
    parser.add_argument(
        '--stats',
        nargs='+',
        choices=['pi', 'dxy', 'fst'],
        help=
        'Which statistics to calculate from the VCF (pi, dxy, and/or fst, separated by spaces)',
        required=True)
    parser.add_argument('--vcf',
                        type=str,
                        nargs='?',
                        help='Path to the input VCF',
                        required=True)
    parser.add_argument('--zarr_path',
                        type=str,
                        nargs='?',
                        help='Folder in which to build the Zarr array(s)',
                        required=True)
    parser.add_argument(
        '--reuse_zarr',
        choices=['yes', 'no'],
        default='no',
        help='Use existing Zarr array(s) (saves time if re-running)')
    parser.add_argument('--populations',
                        type=str,
                        nargs='?',
                        help='Path to the populations file',
                        required=True)
    parser.add_argument(
        '--window_size',
        type=int,
        nargs='?',
        help='Window size in base pairs over which to calculate pi/dxy')
    parser.add_argument(
        '--chromosomes',
        type=str,
        nargs='?',
        default='all',
        help=
        'A single-quoted, comma separated list of chromosome(s) (e.g. \'X,1,2\')',
        required=False)
    parser.add_argument(
        '--interval_start',
        type=str,
        nargs='?',
        help=
        'The start of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.'
    )
    parser.add_argument(
        '--interval_end',
        type=str,
        nargs='?',
        help=
        'The end of the interval over which to calculate pi/dxy. Only valid when calculating over a single chromosome.'
    )
    parser.add_argument(
        '--variant_filter_expression',
        type=str,
        nargs='?',
        help=
        'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,GQ>=20\') to apply to SNPs',
        required=False)
    parser.add_argument(
        '--invariant_filter_expression',
        type=str,
        nargs='?',
        help=
        'A single-quoted, comma separated list of genotype filters (e.g. \'DP>=10,RGQ>=20\') to apply to invariant sites',
        required=False)
    parser.add_argument(
        '--outfile_prefix',
        type=str,
        nargs='?',
        default='./pixy_output',
        help='Path and prefix for the output file, e.g. path/to/outfile')
    parser.add_argument(
        '--bypass_filtration',
        choices=['yes', 'no'],
        default='no',
        help=
        'Bypass all variant filtration (for data lacking FORMAT fields, use with caution)'
    )
    parser.add_argument(
        '--bypass_invariant_check',
        choices=['yes', 'no'],
        default='no',
        help=
        'Allow computation of stats without invariant sites, will result in wildly incorrect estimates most of the time. Use with extreme caution.'
    )
    parser.add_argument(
        '--fst_maf_filter',
        default=0.05,
        type=float,
        nargs='?',
        help=
        'Minor allele frequency filter for FST calculations, with value 0.0-1.0 (default 0.05).'
    )

    # ag1000g test data
    # args = parser.parse_args('--stats fst --vcf data/vcf/multi_chr.vcf.gz --zarr_path data/vcf/multi --window_size 10000 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --outfile_prefix output/pixy_out'.split())

    # filter test data
    # args = parser.parse_args('--stats pi --vcf data/vcf/filter_test.vcf.gz --zarr_path data/vcf/filter_test --window_size 3 --populations data/vcf/ag1000/Ag1000_sampleIDs_popfile_3.txt --variant_filter_expression DP>=10,GQ>20 --invariant_filter_expression DP>=10,RGQ>20 --fst_maf_filter 0.05 --outfile_prefix output/pixy_out'.split())

    # catch arguments from the command line
    args = parser.parse_args()

    # CHECK FOR TABIX
    # (disabled until we implement site level and BED support)
    #tabix_path = shutil.which("tabix")

    #if tabix_path is None:
    #    warnings.warn('[pixy] WARNING: tabix is not installed (or cannot be located) -- this may reduce performance. install tabix with "conda install -c bioconda tabix"')
    #if not os.path.exists(args.vcf + ".tbi") and tabix_path is not None:
    #    raise Exception('[pixy] ERROR: your vcf is not indexed with tabix, index the bgzipped vcf with "tabix your.vcf.gz"')

    # VALIDATE ARGUMENTS

    print("[pixy] pixy " + version_text)
    print(
        "[pixy] Validating VCF and input parameters (this may take some time)..."
    )

    # expand all file paths
    args.vcf = os.path.expanduser(args.vcf)
    args.zarr_path = os.path.expanduser(args.zarr_path)
    args.populations = os.path.expanduser(args.populations)
    args.outfile_prefix = os.path.expanduser(args.outfile_prefix)

    # CHECK FOR EXISTENCE OF VCF AND POPFILES

    if os.path.exists(args.vcf) is not True:
        raise Exception('[pixy] ERROR: The specified VCF ' + str(args.vcf) +
                        ' does not exist')

    if os.path.exists(args.populations) is not True:
        raise Exception('[pixy] ERROR: The specified populations file ' +
                        str(args.populations) + ' does not exist')

    # VALIDATE FILTER EXPRESSIONS

    # get vcf header info
    vcf_headers = allel.read_vcf_headers(args.vcf)

    # skip invariant check if only asking for FST
    if len(args.stats) == 1 and (args.stats[0] == 'fst'):
        args.bypass_invariant_check = "yes"

    # if we are bypassing the invariant check, spoof in an invariant filter
    if args.bypass_invariant_check == "yes":
        args.invariant_filter_expression = "DP>=0"

    if args.bypass_filtration == 'no' and (
            args.variant_filter_expression is None
            or args.invariant_filter_expression is None):
        raise Exception(
            '[pixy] ERROR: One or more filter expression is missing. Provide two filter expressions, or set --bypass_filtration to \'yes\''
        )

    if args.bypass_filtration == 'no':
        # get the list of format fields and requested filter fields
        format_fields = vcf_headers.formats.keys()
        filter_fields = list()

        for x in args.variant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        for x in args.invariant_filter_expression.split(","):
            filter_fields.append(re.sub("[^A-Za-z]+", "", x))

        missing = list(set(filter_fields) - set(format_fields))

        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following genotype filters were requested but do not occur in the VCF: ',
                missing)
    else:
        print(
            "[pixy] WARNING: --bypass_filtration is set to \'yes\', genotype filtration will be not be performed."
        )

    # VALIDATE THE VCF

    # check if the vcf is zipped
    if args.vcf.endswith(".gz"):
        cat_prog = "gunzip -c "
    else:
        cat_prog = "cat "

    # check if the vcf contains any invariant sites
    # a very basic check: just looks for at least one invariant site in the alt field

    if args.bypass_invariant_check == 'no':
        alt_list = subprocess.check_output(
            cat_prog + args.vcf +
            " | grep -v '#' | head -n 10000 | awk '{print $5}' | sort | uniq",
            shell=True).decode("utf-8").split()
        if "." not in alt_list:
            raise Exception(
                '[pixy] ERROR: the provided VCF appears to contain no invariant sites (ALT = \".\"). This check can be bypassed via --bypass_invariant_check \'yes\'.'
            )
    else:
        if not (len(args.stats) == 1 and (args.stats[0] == 'fst')):
            print(
                "[pixy] EXTREME WARNING: --bypass_invariant_check is set to \'yes\', which assumes that your VCF contains invariant sites. Lack of invariant sites will result in incorrect estimates."
            )

    # check if requested chromosomes exist in vcf
    # defaults to all the chromosomes contained in the VCF (first data column)

    if args.chromosomes == 'all':
        chrom_list = subprocess.check_output(
            cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq",
            shell=True).decode("utf-8").split()
        chrom_all = chrom_list
    else:
        chrom_list = list(args.chromosomes.split(","))
        chrom_all = subprocess.check_output(
            cat_prog + args.vcf + " | grep -v '#' | awk '{print $1}' | uniq",
            shell=True).decode("utf-8").split()
        missing = list(set(chrom_list) - set(chrom_all))
        if len(missing) > 0:
            raise Exception(
                '[pixy] ERROR: the following chromosomes were requested but do not occur in the VCF: ',
                missing)

    # INTERVALS
    # check if intervals are correctly specified

    if args.interval_start is not None and args.interval_end is None:
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified'
        )

    if args.interval_start is None and args.interval_end is not None:
        raise Exception(
            '[pixy] ERROR: Both --interval_start and --interval_end must be specified'
        )

    if args.interval_start is not None and args.interval_end is not None and len(
            chrom_list) > 1:
        raise Exception(
            '[pixy] ERROR: --interval_start and --interval_end are not valid when calculating over multiple chromosomes. Remove both arguments or specify a single chromosome.'
        )

    # SAMPLES
    # check if requested samples exist in vcf

    # - parse + validate the population file
    # - format is IND POP (tab separated)
    # - throws an error if individuals are missing from VCF

    # read in the list of samples/populations
    poppanel = pandas.read_csv(args.populations,
                               sep='\t',
                               usecols=[0, 1],
                               names=['ID', 'Population'])

    # get a list of samples from the callset
    samples_list = vcf_headers.samples

    # make sure every indiv in the pop file is in the VCF callset
    IDs = list(poppanel['ID'])
    missing = list(set(IDs) - set(samples_list))

    # find the samples in the callset index by matching up the order of samples between the population file and the callset
    # also check if there are invalid samples in the popfile
    try:
        samples_callset_index = [samples_list.index(s) for s in poppanel['ID']]
    except ValueError as e:
        raise Exception(
            '[pixy] ERROR: the following samples are listed in the population file but not in the VCF: ',
            missing) from e
    else:
        poppanel['callset_index'] = samples_callset_index

        # use the popindices dictionary to keep track of the indices for each population
        popindices = {}
        popnames = poppanel.Population.unique()
        for name in popnames:
            popindices[name] = poppanel[poppanel.Population ==
                                        name].callset_index.values

    print("[pixy] Preparing for calculation of summary statistics: " +
          ','.join(map(str, args.stats)))
    print("[pixy] Data set contains " + str(len(popnames)) +
          " population(s), " + str(len(chrom_list)) + " chromosome(s), and " +
          str(len(IDs)) + " sample(s)")

    # initialize and remove any previous output files
    if os.path.exists(re.sub(r"[^\/]+$", "", args.outfile_prefix)) is not True:
        os.mkdir(re.sub(r"[^\/]+$", "", args.outfile_prefix))

    # initialize the output files for writing
    if 'pi' in args.stats:

        pi_file = str(args.outfile_prefix) + "_pi.txt"

        if os.path.exists(pi_file):
            os.remove(pi_file)

        outfile = open(pi_file, 'a')
        outfile.write("pop" + "\t" + "chromosome" + "\t" + "window_pos_1" +
                      "\t" + "window_pos_2" + "\t" + "avg_pi" + "\t" +
                      "no_sites" + "\t" + "count_diffs" + "\t" +
                      "count_comparisons" + "\t" + "count_missing" + "\n")
        outfile.close()

    if 'dxy' in args.stats:

        dxy_file = str(args.outfile_prefix) + "_dxy.txt"

        if os.path.exists(dxy_file):
            os.remove(dxy_file)

        outfile = open(dxy_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" +
                      "window_pos_1" + "\t" + "window_pos_2" + "\t" +
                      "avg_dxy" + "\t" + "no_sites" + "\t" + "count_diffs" +
                      "\t" + "count_comparisons" + "\t" + "count_missing" +
                      "\n")
        outfile.close()

    if 'fst' in args.stats:

        fst_file = str(args.outfile_prefix) + "_fst.txt"

        if os.path.exists(fst_file):
            os.remove(fst_file)

        outfile = open(fst_file, 'a')
        outfile.write("pop1" + "\t" + "pop2" + "\t" + "chromosome" + "\t" +
                      "window_pos_1" + "\t" + "window_pos_2" + "\t" +
                      "avg_wc_fst" + "\t" + "no_snps" + "\n")
        outfile.close()

    # initialize the folder structure for the zarr array
    if os.path.exists(args.zarr_path) is not True:
        pathlib.Path(args.zarr_path).mkdir(parents=True, exist_ok=True)

    # main loop for computing summary stats

    # time the calculations
    start_time = time.time()
    print("[pixy] Started calculations at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))

    for chromosome in chrom_list:

        # Zarr array conversion

        # the chromosome specific zarr path
        zarr_path = args.zarr_path + "/" + chromosome

        # determine the fields that will be included
        # TBD: just reading all fields currently
        # vcf_fields = ['variants/CHROM', 'variants/POS'] + ['calldata/' + s for s in np.unique(filter_fields)]

        # build region string (if using an interval)
        if args.interval_start is not None:
            targ_region = chromosome + ":" + str(
                args.interval_start) + "-" + str(args.interval_end)
        else:
            targ_region = chromosome

        # allow for reuse of previously calculated zarr arrays
        if args.reuse_zarr == 'yes' and os.path.exists(zarr_path):
            print("[pixy] Reusing existing zarr array for chromosome " +
                  chromosome + "...")
        elif args.reuse_zarr == 'no' or os.path.exists(zarr_path) is not True:
            print("[pixy] Building zarr array for chromosome " + chromosome +
                  "...")
            warnings.filterwarnings("ignore")
            allel.vcf_to_zarr(args.vcf,
                              zarr_path,
                              region=targ_region,
                              fields='*',
                              overwrite=True)
            warnings.resetwarnings()

        print("[pixy] Calculating statistics for chromosome " + targ_region +
              "...")

        # open the zarr
        callset = zarr.open_group(zarr_path, mode='r')

        # parse the filtration expression and build the boolean filter array

        # define an operator dictionary for parsing the operator strings
        ops = {
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
            "==": operator.eq
        }

        # determine the complete list of available calldata fields usable for filtration
        calldata_fields = sorted(callset['/calldata/'].array_keys())

        # check if bypassing filtration, otherwise filter
        if args.bypass_filtration == 'no':

            # VARIANT SITE FILTERS
            var_filters = []

            # iterate over each requested variant filter
            for x in args.variant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)
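                # e.g. x = 'DP>=10' parses to stat='DP', value=10, compare='>='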

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception(
                        "[pixy] ERROR: The requested filter \'" + stat +
                        "\' is not annotated in the input VCF FORMAT field"
                    ) from e
                else:
                    if type(var_filters) is list:
                        var_filters = ops[compare](callset['/calldata/' +
                                                           stat][:], value)
                    elif type(var_filters) is not list:
                        var_filters = np.logical_and(
                            var_filters,
                            ops[compare](callset['/calldata/' + stat][:],
                                         value))

            # create a mask for variants only
            # is snp is a site level (1d) array
            # np.tile below creates a column of "is_snp" once for each sample
            # (i.e. makes it the same dimensions as the genotype table)
            is_snp = np.array([callset['/variants/is_snp'][:].flatten()
                               ]).transpose()
            snp_mask = np.tile(is_snp, (1, var_filters.shape[1]))

            # force only variant sites (snps, remember we ignore indels) to be included in the filter
            var_filters = np.logical_and(var_filters, snp_mask)

            # INVARIANT SITE FILTERS
            invar_filters = []

            for x in args.invariant_filter_expression.split(","):
                stat = re.sub("[^A-Za-z]+", "", x)
                value = int(re.sub("[^0-9]+", "", x))
                compare = re.sub("[A-Za-z0-9]+", "", x)

                # check if the requested filter/format exists in the VCF
                try:
                    stat_index = calldata_fields.index(stat)
                except ValueError as e:
                    raise Exception(
                        "[pixy] ERROR: The requested filter \'" + stat +
                        "\' is not annotated in the input VCF") from e
                else:
                    if type(invar_filters) is list:
                        invar_filters = ops[compare](callset['/calldata/' +
                                                             stat][:], value)
                    elif type(invar_filters) is not list:
                        invar_filters = np.logical_and(
                            invar_filters,
                            ops[compare](callset['/calldata/' + stat][:],
                                         value))

            # create a mask for invariant sites by inverting the snp filter
            # join that to the invariant sites filter

            invar_filters = np.logical_and(invar_filters, np.invert(snp_mask))

            # join the variant and invariant filter masks (logical OR)
            filters = np.logical_or(invar_filters, var_filters)

        # applying the filter to the data
        # all the filters are in a boolean array ('filters' above)

        # first, recode the gt matrix as a Dask array (saves memory) -> packed
        # create a packed genotype array
        # this is a array with dims snps x samples
        # genotypes are represented by single byte codes
        # critically, as the same dims as the filters array below

        gt_array = allel.GenotypeArray(
            allel.GenotypeDaskArray(callset['/calldata/GT'])).to_packed()

        # apply filters
        # only if not bypassing filtration
        if args.bypass_filtration == 'no':
            # set all genotypes that fail filters (the inversion of the array)
            # to 'missing', 239 = -1 (i.e. missing) for packed arrays
            gt_array[np.invert(filters)] = 239

        # convert the packed array back to a GenotypeArray
        gt_array = allel.GenotypeArray.from_packed(gt_array)

        # build the position array
        pos_array = allel.SortedIndex(callset['/variants/POS'])

        # a mask for snps and invariant sites
        snp_invar_mask = np.logical_or(
            np.logical_and(callset['/variants/is_snp'][:] == 1,
                           callset['/variants/numalt'][:] == 1),
            callset['/variants/numalt'][:] == 0)

        # remove rows that are NOT snps or invariant sites from the genotype array
        gt_array = np.delete(gt_array,
                             np.where(np.invert(snp_invar_mask)),
                             axis=0)
        gt_array = allel.GenotypeArray(gt_array)

        # select rows that ARE snps or invariant sites in the position array
        pos_array = pos_array[snp_invar_mask]

        # Basic functions for comparing the genotypes at each site in a
        # region: counts differences out of sites with data.

        # For the given region: return average pi, # of differences,
        # # of comparisons, and # missing. This function loops over every
        # site in a region passed to it.
        def tallyRegion(gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for site in gt_region:
                vec = site.flatten()
                #now we have an individual site as a numpy.ndarray, pass it to the comparison function
                site_diffs, site_comps, missing = compareGTs(vec)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_pi = total_diffs / total_comps
            else:
                avg_pi = 0
            return (avg_pi, total_diffs, total_comps, total_missing)

        #For the given region: return average dxy, # of differences, # of comparisons, and # missing.
        # this function loops over every site in a region passed to it
        def dxyTallyRegion(pop1_gt_region, pop2_gt_region):
            total_diffs = 0
            total_comps = 0
            total_missing = 0
            for x in range(0, len(pop1_gt_region)):
                site1 = pop1_gt_region[x]
                site2 = pop2_gt_region[x]
                vec1 = site1.flatten()
                vec2 = site2.flatten()
                #now we have an individual site as 2 numpy.ndarrays, pass them to the comparison function
                site_diffs, site_comps, missing = dxyCompareGTs(vec1, vec2)
                total_diffs += site_diffs
                total_comps += site_comps
                total_missing += missing
            if total_comps > 0:
                avg_dxy = total_diffs / total_comps
            else:
                avg_dxy = 0
            return (avg_dxy, total_diffs, total_comps, total_missing)

        #Return the number of differences, the number of comparisons, and missing data count.
        def compareGTs(vec):  #for pi
            c = Counter(vec)
            diffs = c[1] * c[0]
            gts = c[1] + c[0]
            # anything that's not 1 or 0 is ignored and counted as missing
            missing = len(vec) - gts
            comps = int(special.comb(gts, 2))
            return (diffs, comps, missing)
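        # Worked example: vec = [0, 0, 1, -1] gives c[0] = 2 and c[1] = 1,
        # so diffs = 2 * 1 = 2, comps = C(3, 2) = 3 and missing = 1,
        # i.e. a per-site pi contribution of 2/3.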

        def dxyCompareGTs(vec1, vec2):  #for dxy
            c1 = Counter(vec1)
            c2 = Counter(vec2)
            gt1zeros = c1[0]
            gt1ones = c1[1]
            gts1 = c1[1] + c1[0]
            gt2zeros = c2[0]
            gt2ones = c2[1]
            gts2 = c2[1] + c2[0]
            # anything that's not 1 or 0 is ignored and counted as missing
            missing = (len(vec1) + len(vec2)) - (gts1 + gts2)
            diffs = (gt1zeros * gt2ones) + (gt1ones * gt2zeros)
            comps = gts1 * gts2
            return (diffs, comps, missing)
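        # Worked example: vec1 = [0, 0, 1, -1] and vec2 = [1, 1] give
        # diffs = (2 * 2) + (1 * 0) = 4, comps = 3 * 2 = 6 and missing = 1,
        # i.e. a per-site dxy contribution of 4/6.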

        # Interval specification check
        # check if computing over specific intervals (otherwise, compute over whole chromosome)

        # window size
        window_size = args.window_size

        # set intervals based on args
        if (args.interval_end is None):
            interval_end = max(pos_array)
        else:
            interval_end = int(args.interval_end)

        if (args.interval_start is None):
            interval_start = min(pos_array)
        else:
            interval_start = int(args.interval_start)

        if interval_start > interval_end:
            raise Exception("[pixy] ERROR: The specified interval start (" +
                            str(interval_start) +
                            ") exceeds the interval end (" +
                            str(interval_end) + ")")

        # catch misspecified intervals
        # TBD: harmonize this with the new interval method for the zarr array
        if (interval_end > max(pos_array)):
            print(
                "[pixy] WARNING: The specified interval end (" +
                str(interval_end) +
                ") exceeds the last position of the chromosome and has been substituted with "
                + str(max(pos_array)))
            interval_end = max(pos_array)

        if (interval_start < min(pos_array)):
            print(
                "[pixy] WARNING: The specified interval start (" +
                str(interval_start) +
                ") begins before the first position of the chromosome and has been substituted with "
                + str(min(pos_array)))
            interval_start = min(pos_array)

        if (interval_end - interval_start + 1) < window_size:
            print(
                "[pixy] WARNING: The requested interval (" +
                str(interval_start) + "-" + str(interval_end) +
                "), or the full extent of the VCF if no interval was specified, "
                "is smaller than the requested window size (" +
                str(window_size) + ")")
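
        # The window grid used below tiles the interval into windows of the
        # form [window_pos_1, window_pos_1 + window_size - 1], stepping by
        # window_size from interval_start; e.g. interval_start = 1 with
        # window_size = 10000 gives 1-10000, 10001-20000, and so on, with
        # the final window capped at interval_end.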

        # PI:
        # AVERAGE NUCLEOTIDE VARIATION WITHIN POPULATIONS

        # Compute pi over a chosen interval and window size

        if (args.populations is not None) and ('pi' in args.stats):

            # open the pi output file for appending
            outfile = open(pi_file, 'a')

            for pop in popnames:

                # initialize window_pos_2 (the end of the first window) for this population
                window_pos_2 = (interval_start + window_size) - 1

                # loop over populations and windows, compute stats and write to file
                for window_pos_1 in range(interval_start, interval_end,
                                          window_size):

                    # if the window has no sites, assign all NAs;
                    # otherwise calculate pi
                    # (bounds are inclusive, matching locate_range below)
                    if len(pos_array[(pos_array >= window_pos_1)
                                     & (pos_array <= window_pos_2)]) == 0:
                        avg_pi, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:

                        # pull out the genotypes for the window
                        loc_region = pos_array.locate_range(
                            window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # subset the window for the individuals in each population
                        gt_pop = gt_region1.take(popindices[pop], axis=1)
                        avg_pi, total_diffs, total_comps, total_missing = tallyRegion(
                            gt_pop)

                    outfile.write(
                        str(pop) + "\t" + str(chromosome) + "\t" +
                        str(window_pos_1) + "\t" + str(window_pos_2) + "\t" +
                        str(avg_pi) + "\t" + str(no_sites) + "\t" +
                        str(total_diffs) + "\t" + str(total_comps) + "\t" +
                        str(total_missing) + "\n")
                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

            # close the output file and print a completion message
            outfile.close()

            print("[pixy] Pi calculations for chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_pi.txt")

        # DXY:
        # AVERAGE NUCLEOTIDE VARIATION BETWEEN POPULATIONS

        if (args.populations is not None) and ('dxy' in args.stats):

            # create a list of all pairwise comparisons between populations in the popfile
            dxy_pop_list = list(combinations(popnames, 2))
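            # e.g. popnames = ['A', 'B', 'C'] gives
            # [('A', 'B'), ('A', 'C'), ('B', 'C')]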

            # open the dxy output file for appending
            outfile = open(dxy_file, 'a')

            # iterate over all population pairs and compute dxy
            for pop_pair in dxy_pop_list:
                pop1 = pop_pair[0]
                pop2 = pop_pair[1]

                # initialize window_pos_2 (the end of the first window) for this population pair
                window_pos_2 = (interval_start + window_size) - 1

                # perform the dxy calculation for all windows in the range
                for window_pos_1 in range(interval_start, interval_end,
                                          window_size):

                    # if the window has no sites, assign all NAs;
                    # otherwise calculate dxy
                    # (bounds are inclusive, matching locate_range below)
                    if len(pos_array[(pos_array >= window_pos_1)
                                     & (pos_array <= window_pos_2)]) == 0:
                        avg_dxy, total_diffs, total_comps, total_missing, no_sites = "NA", "NA", "NA", "NA", 0
                    else:
                        loc_region = pos_array.locate_range(
                            window_pos_1, window_pos_2)
                        gt_region1 = gt_array[loc_region]
                        no_sites = len(gt_region1)

                        # use the popGTs dictionary to keep track of this region's GTs for each population
                        popGTs = {}
                        for name in pop_pair:
                            gt_pop = gt_region1.take(popindices[name], axis=1)
                            popGTs[name] = gt_pop

                        pop1_gt_region1 = popGTs[pop1]
                        pop2_gt_region1 = popGTs[pop2]
                        avg_dxy, total_diffs, total_comps, total_missing = dxyTallyRegion(
                            pop1_gt_region1, pop2_gt_region1)

                    outfile.write(
                        str(pop1) + "\t" + str(pop2) + "\t" + str(chromosome) +
                        "\t" + str(window_pos_1) + "\t" + str(window_pos_2) +
                        "\t" + str(avg_dxy) + "\t" + str(no_sites) + "\t" +
                        str(total_diffs) + "\t" + str(total_comps) + "\t" +
                        str(total_missing) + "\n")

                    window_pos_2 += window_size

                    if window_pos_2 > interval_end:
                        window_pos_2 = interval_end

            outfile.close()
            print("[pixy] Dxy calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_dxy.txt")

        # FST:
        # WEIR AND COCKERHAM'S FST
        # This is a thin wrapper around the scikit-allel FST function

        if (args.populations is not None) and ('fst' in args.stats):

            # open the fst output file for appending
            outfile = open(fst_file, 'a')

            # determine all the possible population pairings
            pop_names = list(popindices.keys())
            fst_pop_list = list(combinations(pop_names, 2))

            # calculate allele frequencies and build the MAF filter mask
            # (sites are kept only if the alternate allele frequency exceeds the filter threshold)
            allele_counts = gt_array.count_alleles()
            allele_freqs = allele_counts.to_frequencies()
            maf_array = allele_freqs[:, 1] > args.fst_maf_filter

            # apply the maf filter to the genotype array
            gt_array_fst = gt_array[maf_array]
            gt_array_fst = allel.GenotypeArray(gt_array_fst)

            # apply the maf filter to the position array
            pos_array_fst = pos_array[maf_array]
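
            # Illustrative example: with args.fst_maf_filter = 0.05, a site
            # whose alternate allele frequency is 0.04 fails the mask above
            # and is dropped from both gt_array_fst and pos_array_fst before
            # the FST calculation.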

            # for each pair, compute fst
            for pop_pair in fst_pop_list:

                # the indices for the individuals in each population
                fst_pop_indices = [
                    popindices[pop_pair[0]].tolist(),
                    popindices[pop_pair[1]].tolist()
                ]

                # compute FST
                # windowed_weir_cockerham_fst can emit (likely spurious) divide-by-zero
                # and invalid-value warnings, so numpy floating-point warnings are silenced here
                # (this assumes the scikit-allel function is working as intended)
                np.seterr(divide='ignore', invalid='ignore')

                a, b, c = allel.windowed_weir_cockerham_fst(
                    pos_array_fst,
                    gt_array_fst,
                    subpops=fst_pop_indices,
                    size=args.window_size,
                    start=interval_start,
                    stop=interval_end)
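                # windowed_weir_cockerham_fst returns three arrays: a is the
                # average FST in each window, b holds the window start/stop
                # coordinates, and c is the number of variants in each window;
                # these are written out row by row below.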

                for fst, wind, snps in zip(a, b, c):
                    outfile.write(
                        str(pop_pair[0]) + "\t" + str(pop_pair[1]) + "\t" +
                        str(chromosome) + "\t" + str(wind[0]) + "\t" +
                        str(wind[1]) + "\t" + str(fst) + "\t" + str(snps) +
                        "\n")
            outfile.close()
            print("[pixy] Fst calculations chromosome " + chromosome +
                  " complete and written to " + args.outfile_prefix +
                  "_fst.txt")

    print("\n[pixy] All calculations complete at " +
          time.strftime("%H:%M:%S", time.localtime(start_time)))
    end_time = (time.time() - start_time)
    print("[pixy] Time elapsed: " +
          time.strftime("%H:%M:%S", time.gmtime(end_time)))