示例#1
0
def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts,
                                                      chunks):
    """Check windowed ``divergence`` against a scikit-allel moving statistic.

    A tree sequence is simulated, converted to a dataset with cohorts and
    windowed in blocks of 25 variants.  The off-diagonal divergence values
    are then compared with ``allel.mean_pairwise_difference_between`` summed
    over the same window size.
    """
    window_size = 25

    tree_seq = simulate_ts(sample_size, length=200)
    dataset = ts_to_dataset(tree_seq, chunks)  # type: ignore[no-untyped-call]
    dataset, _subsets = add_cohorts(
        dataset, tree_seq, n_cohorts)  # type: ignore[no-untyped-call]
    dataset = divergence(window(dataset, size=window_size))
    div = dataset["stat_divergence"].values

    # Only the off-diagonal entries are compared, so blank out the diagonal.
    diagonal = np.arange(2)
    div[:, diagonal, diagonal] = np.nan

    # Reference values via scikit-allel's moving_statistic.
    # (windowed_divergence is avoided because it treats the last window
    # differently.)
    sample_ids = tree_seq.samples()
    first_counts = count_variant_alleles(ts_to_dataset(
        tree_seq, samples=sample_ids[:1]))  # type: ignore[no-untyped-call]
    rest_counts = count_variant_alleles(ts_to_dataset(
        tree_seq, samples=sample_ids[1:]))  # type: ignore[no-untyped-call]
    between = allel.mean_pairwise_difference_between(
        first_counts["variant_allele_count"].values,
        rest_counts["variant_allele_count"].values,
        fill=0,
    )
    ska_div = allel.moving_statistic(between, np.sum, size=window_size)

    # TODO: investigate why numbers are different
    # scikit-allel omits the final window, so drop it from our result too.
    np.testing.assert_allclose(div[:-1], ska_div)
示例#2
0
def test_observed_heterozygosity__scikit_allel_comparison(
        n_variant, n_sample, missing_pct, window_size, seed):
    """Compare windowed observed heterozygosity with scikit-allel.

    A diploid genotype dataset is simulated, every sample is placed in a
    single cohort (index 0), and the dataset is windowed in blocks of
    ``window_size``.  The result is compared with
    ``allel.heterozygosity_observed`` summed per window by
    ``allel.moving_statistic``.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    # Assign all samples to one cohort.
    ds["sample_cohort"] = (
        ["samples"],
        np.zeros(n_sample, int),
    )
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values
    # Windows run along the variants axis, so whether a ragged final window
    # exists depends on the number of variants, not the number of samples.
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[0:-1]
    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )
    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
示例#3
0
def test_diversity__windowed(sample_size):
    """Check windowed ``diversity`` against tskit and scikit-allel.

    The diversity of cohort ``co_0`` in 25-variant windows is compared with
    tskit's windowed diversity and with scikit-allel's mean pairwise
    difference summed per window.
    """
    window_size = 25

    ts = simulate_ts(sample_size, length=200)
    cohort_ds, _subsets = add_cohorts(
        ts_to_dataset(ts),  # type: ignore[no-untyped-call]
        ts,
        cohort_key_names=["cohorts"])  # type: ignore[no-untyped-call]
    cohort_ds = diversity(window(cohort_ds, size=window_size))
    div = cohort_ds["stat_diversity"].sel(cohorts="co_0").compute()

    # tskit reference: place window breakpoints at every 25th site position
    # so that each window holds a fixed number of variants.
    site_positions = ts.tables.sites.position
    inner_breaks = site_positions[::window_size][1:]
    breakpoints = np.concatenate(([0], inner_breaks, [ts.sequence_length]))
    ts_div = ts.diversity(windows=breakpoints, span_normalise=False)
    np.testing.assert_allclose(div, ts_div)

    # scikit-allel reference via moving_statistic.
    # (windowed_diversity is avoided because it treats the last window
    # differently.)
    counts_ds = count_variant_alleles(
        ts_to_dataset(ts))  # type: ignore[no-untyped-call]
    allele_counts = counts_ds["variant_allele_count"].values
    per_variant = allel.mean_pairwise_difference(allele_counts, fill=0)
    ska_div = allel.moving_statistic(per_variant, np.sum, size=window_size)

    # scikit-allel omits the final window, so drop it from our result too.
    np.testing.assert_allclose(div[:-1], ska_div)
示例#4
0
def test_moving_statistic_1d(length, chunks, size, step, dtype):
    """Compare ``moving_statistic`` on a 1-D dask array with scikit-allel."""
    dask_values = da.from_array(np.arange(length, dtype=dtype), chunks=chunks)

    computed = moving_statistic(
        dask_values,
        np.sum,
        size=size,
        step=step,
        dtype=dask_values.dtype,
    ).compute()

    # scikit-allel omits the final window unless the windows tile the input
    # exactly, so trim ours to match in that case.
    windows_tile_exactly = length % size == 0 and size == step
    if not windows_tile_exactly:
        computed = computed[:-1]
    assert computed.dtype == dtype

    expected = allel.moving_statistic(np.arange(length),
                                      np.sum,
                                      size=size,
                                      step=step)

    np.testing.assert_equal(computed, expected)
示例#5
0
def test_moving_statistic_2d(length, chunks, size, step, dtype):
    """Compare ``moving_statistic`` on a 2-D dask array with scikit-allel."""
    n_columns = 3
    source = np.arange(length * n_columns, dtype=dtype).reshape(
        length, n_columns)

    def column_totals(block):
        # Collapse the windowed rows, keeping one total per column.
        return np.sum(block, axis=0)

    dask_values = da.from_array(source, chunks=chunks)
    computed = moving_statistic(
        dask_values,
        column_totals,
        size=size,
        step=step,
        dtype=dask_values.dtype,
    ).compute()

    # scikit-allel omits the final window unless the windows tile the input
    # exactly, so trim ours to match in that case.
    windows_tile_exactly = length % size == 0 and size == step
    if not windows_tile_exactly:
        computed = computed[:-1]
    assert computed.dtype == dtype

    expected = allel.moving_statistic(source,
                                      column_totals,
                                      size=size,
                                      step=step)

    np.testing.assert_equal(computed, expected)
示例#6
0
        qualflt=qualflt,
        missingfltprop=missingprop)

    #### Fst in windows ####
    # For every (sus, res) pair in `comparisons`, compute Hudson's Fst in
    # moving windows of SNPs and hand the result to the plotting helper.
    # NOTE(review): `comparisons`, `acsubpops`, `pos`, `chrom` and the three
    # window* sequences are defined outside this excerpt.
    for sus, res in comparisons:
        name = sus + "_" + res
        cohortText = f"{sus} v {res}"
        print(f"Calculating Fst values in sliding windows for {name}\n")

        # One pass per window configuration (name, size, step are zipped, so
        # the three sequences are consumed in lockstep).
        for wname, size, step in zip(windownames, windowsizes, windowsteps):
            FstArray = allel.moving_hudson_fst(acsubpops[sus],
                                               acsubpops[res],
                                               size=size,
                                               step=step)
            # Median of `pos` within each window, computed with the same
            # size/step as the Fst call above and passed as the midpoints.
            midpoint = allel.moving_statistic(pos,
                                              np.median,
                                              size=size,
                                              step=step)

            # Identifier combining the comparison name and the window name.
            cohortNoSpaceText = name + "." + wname
            rnaseqpop.plotWindowed(
                statName="Fst",
                cohortText=cohortText,
                cohortNoSpaceText=cohortNoSpaceText,
                values=FstArray,
                midpoints=midpoint,
                colour='dodgerblue',
                prefix="results/variantAnalysis/selection/fst",
                chrom=chrom,
                ylim=0.5,
                save=True)
示例#7
0
def selective_sweep(chroms,
                    pop,
                    samples,
                    haplo=True,
                    plot=False,
                    inaccessible=False):
    """Calculate Garud's H statistics in 1000-SNP windows for a population.

    The statistics are currently not standardised or normalised.

    Parameters
    ----------
    chroms : iterable
        Chromosome names; each is used to build the zarr store paths.
        Must be non-empty, since the return value uses loop variables.
    pop : str
        Population label, compared against ``samples.population``.
    samples : object
        Sample metadata exposing a ``population`` attribute that yields a
        boolean mask when compared with ``pop``.
    haplo : bool
        When exactly ``True``, the haplotype array is included in the result.
    plot : bool
        When exactly ``True``, an H12 line plot is saved for each chromosome.
    inaccessible : bool
        When exactly ``False`` the ``.pass`` store under ``/home`` is read;
        any other value selects the store under ``/media``.

    Returns
    -------
    tuple
        ``(pop_haplo, h12, median_pos, positions)`` if ``haplo`` is ``True``,
        otherwise ``(h12, median_pos, positions)``, with ``median_pos``
        rounded.  Only the values of the LAST chromosome in ``chroms`` are
        returned, because the return statement follows the loop.
    """
    for chrom in chroms:

        ############ Read zarrs #############
        if inaccessible is False:
            store_root = "/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass"
        else:
            store_root = "/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1"
        Ag_store = zarr.open_array(f"{store_root}/{chrom}/calldata/GT/",
                                   mode='r')
        positions = zarr.open_array(f"{store_root}/{chrom}/variants/POS",
                                    mode='r')[:]

        print("--------------------------------------------------")
        print(f"Zarrs loaded: {pop}, Chromosome {chrom}")

        ############ Load intro gen.array and compute statistics ###########
        ag_geno = allel.GenotypeChunkedArray(Ag_store)
        pop_bool = samples.population == pop

        print("Constructing HaplotypeArray")
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()

        print("Computing statistics")
        h1, h12, h123, h2_h1 = allel.moving_garud_h(pop_haplo, size=1000)
        median_pos = allel.moving_statistic(positions, np.median, size=1000)

        print(f"mean {chrom} h12", np.mean(h12))

        if plot is True:

            print("Producing figure")
            sns.set_palette("muted")
            xtick = np.arange(0, median_pos.max(), 1000000)
            plt.figure(figsize=(30, 10))
            # Pass the data by keyword: newer seaborn releases no longer
            # accept x/y positionally.
            sns.lineplot(
                x=median_pos,
                y=h12).set_title(f'{pop} {chrom} H12 in 1000 snp windows')
            plt.xticks(xtick)
            plt.savefig(f"../data/{pop}/{chrom}/{pop}_{chrom}_H12_scatter.png",
                        dpi=800)
            # Actually call close(); the bare attribute reference was a no-op
            # and left one open figure per chromosome.
            plt.close()

    if haplo is True:
        return (pop_haplo, h12, np.around(median_pos), positions)
    else:
        return (h12, np.around(median_pos), positions)
示例#8
0
    # Per-population list of windowed inbreeding-coefficient arrays.
    allcoef = defaultdict(list)

    # Summary statistics for each distinct value of metadata['treatment'].
    # NOTE(review): `seqdivdict`, `thetadict`, `coefdict`, `acsubpops`,
    # `subpops`, `geno`, `pos`, `ploidy` and `chrom` come from outside this
    # excerpt.
    for pop in metadata['treatment'].unique():

        # Sequence diversity
        seqdivdict[pop] = allel.sequence_diversity(pos, acsubpops[pop])

        # Wattersons theta
        thetadict[pop] = allel.watterson_theta(pos, acsubpops[pop])

        # Inbreeding coefficient (skipped when ploidy is 1)
        if ploidy > 1:
            # Restrict the genotypes to this population's samples (axis 1).
            gn = geno.take(subpops[pop], axis=1)
            # Inbreeding coefficient in windows of 1000 with a step of 100.
            coef = allel.moving_statistic(
                gn,
                statistic=allel.inbreeding_coefficient,
                size=1000,
                step=100)
            # NaN-ignoring mean along axis 1 gives one value per window;
            # the mean of those is stored as the population-level figure.
            coef = np.nanmean(coef, axis=1)
            coefdict[pop] = np.mean(coef)
            allcoef[pop].append(np.array(coef))

        print(f"{pop} | {chrom} | Nucleotide Diversity (Pi) =",
              seqdivdict[pop])
        print(f"{pop} | {chrom} | Wattersons Theta =", thetadict[pop])
        if ploidy > 1:
            print(f"{pop} | {chrom} | Inbreeding Coef =", np.mean(coef), "\n")

    # Store shallow copies of the per-population results under this chrom.
    seqdivdictchrom[chrom] = dict(seqdivdict)
    thetadictchrom[chrom] = dict(thetadict)
    if ploidy > 1: coefdictchrom[chrom] = dict(coefdict)
def _frame_d_axis(ax, xlim, title=None):
    """Style one D-statistic axis and draw the locus/inversion markers."""
    sns.despine(ax=ax, offset=10)
    if title is not None:
        ax.set_title(title)
    ax.set_xlim(*xlim)
    ax.set_ylim(-1, 1)
    ax.set_xlabel("Mb")
    ax.set_ylabel("D")
    ax.axhline(0, color='k', linestyle="--", label="")
    # Only the first line of each pair carries a legend label.
    ax.axvline(loc_start / 1e6, color='red', linestyle=":", label="Rdl")
    ax.axvline(loc_end / 1e6, color='red', linestyle=":", label="")
    ax.axvline(inv_start / 1e6,
               color='orange',
               linestyle=":",
               label="inversion")
    ax.axvline(inv_end / 1e6, color='orange', linestyle=":", label="")


def loop_D_statistic3(name,
                      popA_list,
                      popB_list,
                      popC_list,
                      popD_list,
                      popA_ac,
                      popB_ac,
                      popC_ac,
                      popD_ac,
                      pos,
                      block_len_snp,
                      step_len_snp,
                      cycle="C",
                      blen=100,
                      color=(
                          "blue", "darkorange", "turquoise", "crimson",
                          "magenta", "limegreen", "forestgreen", "slategray",
                          "orchid", "darkblue"
                      )):
    """Plot windowed Patterson's D for every (A, B, D) triple into one PDF.

    One page is written per combination of ``popD``, ``popB`` and ``popA``.
    Each page has a whole-chromosome panel and a panel zoomed on the locus,
    with one step line per population in ``popC_list``.  The zoomed panel's
    legend reports the block-jackknife D estimate within the locus, its
    standard error, Z-score and two-sided p-value.

    Reads the module-level names ``loc_start``, ``loc_end``, ``inv_start``,
    ``inv_end``, ``outdir``, ``outcode`` and ``chrom``.

    Parameters
    ----------
    name : str
        Suffix used in the output file name ``<outcode>.Dstat_<name>.pdf``.
    popA_list, popB_list, popC_list, popD_list : sequence
        Population keys to iterate over for each position of the test.
    popA_ac, popB_ac, popC_ac, popD_ac : mapping
        Allele counts per population key; only columns 0 and 1 are used.
    pos : array
        Variant positions, aligned with the rows of the allele counts.
    block_len_snp, step_len_snp : int
        Window size and step (in SNPs) for the moving statistics.
    cycle : str
        Accepted for interface compatibility; not used.
    blen : int
        Block length passed to ``allel.average_patterson_d``.
    color : sequence of str
        Accepted for interface compatibility; not used (line colours come
        from ``cm.rainbow``).  The default is an immutable tuple.

    Returns
    -------
    None
    """
    # First position of every window, used as the x coordinate of the steps.
    windows_pos = allel.moving_statistic(pos,
                                         statistic=lambda v: v[0],
                                         size=block_len_snp,
                                         step=step_len_snp)
    windows_mb = windows_pos / 1e6

    # Variants strictly inside the locus; the p-values focus on this region.
    is_locus = np.logical_and(pos > loc_start, pos < loc_end)

    colors = cm.rainbow(np.linspace(0, 1, len(popC_list)))

    # The context manager guarantees the PDF is finalised even if a
    # statistic or plotting call raises part-way through.
    with PdfPages("%s/%s.Dstat_%s.pdf" % (outdir, outcode, name)) as pdf:

        for popD in popD_list:

            for popB in popB_list:

                for popA in popA_list:

                    print("(((%s,%s),X),%s) chr" % (popA, popB, popD))

                    fig = plt.figure(figsize=(10, 2))

                    # whole chromosome: frame
                    ax1 = plt.subplot(1, 2, 1)
                    _frame_d_axis(ax1, (0, 50),
                                  title="Chr %s (((%s,%s),X),%s)" %
                                  (chrom, popA, popB, popD))

                    # zoomed region: frame
                    ax2 = plt.subplot(1, 4, 3)
                    _frame_d_axis(
                        ax2, (loc_start / 1e6 - 1, loc_end / 1e6 + 1))

                    # The A/B comparison is constant across popC, so test it
                    # once.  An (otherwise empty) page is still written when
                    # popA == popB, as before.
                    if popA != popB:

                        for cn, popC in enumerate(popC_list):

                            # block-wise patterson D (normalised)
                            admix_pd_n_win = allel.moving_patterson_d(
                                aca=popA_ac[popA][:, 0:2],
                                acb=popB_ac[popB][:, 0:2],
                                acc=popC_ac[popC][:, 0:2],
                                acd=popD_ac[popD][:, 0:2],
                                size=block_len_snp,
                                step=step_len_snp)

                            # whole chromosome: plot
                            ax1.step(windows_mb,
                                     admix_pd_n_win,
                                     color=colors[cn])

                            # estimated D in locus with pval
                            admix_pd_av_indup = allel.average_patterson_d(
                                aca=popA_ac[popA][:, 0:2][is_locus],
                                acb=popB_ac[popB][:, 0:2][is_locus],
                                acc=popC_ac[popC][:, 0:2][is_locus],
                                acd=popD_ac[popD][:, 0:2][is_locus],
                                blen=blen)
                            # convert Z-score (num of SD from 0) to pval
                            # (two-sided)
                            admix_pd_av_indup_pval = scipy.stats.norm.sf(
                                abs(admix_pd_av_indup[2])) * 2

                            # zoomed region: plot
                            ax2.step(
                                windows_mb,
                                admix_pd_n_win,
                                color=colors[cn],
                                where="post",
                                label="%s\nD = %.3f +/- %.3f | Z = %.3f | p = %.3E"
                                % (popC, admix_pd_av_indup[0],
                                   admix_pd_av_indup[1], admix_pd_av_indup[2],
                                   admix_pd_av_indup_pval))

                    ax2.axhline(0, color='k', linestyle="--", label="")
                    ax2.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))

                    # save pdf
                    pdf.savefig(fig, bbox_inches='tight')
                    # Release the figure: one is created per (D, B, A)
                    # triple and they would otherwise all stay in memory.
                    plt.close(fig)
# Per-cluster haplotype diversity: windowed along the chromosome, plus a
# leave-one-out estimate within the focus region.
# NOTE(review): `clu_list_ids_fil`, `oc_haploty_hap_seg`, `oc_hapvars_seg`,
# `popdich_clu`, `rmv_miss_bool`, `clu_varbool`, `clu_varbool_focus` and
# `mean_se_ci_report` are defined outside this excerpt.
fig = plt.figure(figsize=(8,12))
ax9 = plt.subplot(3, 1, 1)


# NOTE(review): `j` is not used in the visible part of the loop.
j=0
# Iterate over the filtered cluster ids followed by "no_wt" and "no_alt".
for i,clui in enumerate(np.append(clu_list_ids_fil,np.append("no_wt","no_alt"))):

    # which cluster
    clu_key = "cluster_"+str(clui)

    # which haplotypes to include in the cluster-wise analysis of selection:
    # members of this cluster that also pass `rmv_miss_bool`
    clu_sambool = np.isin(range(0,oc_haploty_hap_seg.n_haplotypes),test_elements=popdich_clu[clu_key])
    clu_sambool = np.logical_and(clu_sambool,rmv_miss_bool)

    # hap div along chromosome: first position of each window, and
    # haplotype diversity in the same windows (size 50, step 10)
    clu_pos_wib    = allel.moving_statistic(oc_hapvars_seg["POS"].subset(sel0=clu_varbool), statistic=lambda v: v[0], size=50, step=10)
    clu_hdi_wib    = allel.moving_haplotype_diversity(oc_haploty_hap_seg.subset(sel0=clu_varbool,sel1=clu_sambool), size=50, step=10)

    # hap div in focus region: recompute the diversity once per cluster
    # member, each time leaving that member (index k) out
    j_index = np.array(popdich_clu[clu_key]).tolist()
    j_run = len(j_index)
    j_hdi = np.zeros(shape=j_run)
    for k in range(j_run):
        j_sel1   = j_index[0:k] + j_index[k+1:j_run]
        j_hdi[k] = allel.haplotype_diversity(oc_haploty_hap_seg.subset(sel0=clu_varbool_focus, sel1=j_sel1))
    # presumably returns mean, SE, CI bounds and count, judging by how the
    # values are labelled below; verify against mean_se_ci_report
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_hdi)
    clu_label = "%s\nh = %.6f +/- %.6f SE, %.6f-%.6f CI95, n=%i" % (clu_key, j_av, j_se, j_cl, j_cu, j_nu)
    print(clu_label)

    # plot
    plt.subplot(3, 1, 1)
示例#11
0
def _read_sample_ids(path):
    """Return the first whitespace-separated field of each non-blank line."""
    with open(path, 'r') as handle:
        return [line.split()[0] for line in handle if line.strip()]


def getPCADist(vcf, fpop1, fpop2, window_size):
    """Compute a windowed PCA-based distance between two populations.

    For every chromosome in the VCF, sites with any missing genotype are
    dropped, sites where every allele column has a non-zero count are kept,
    and ``runPCA`` is evaluated in consecutive windows of ``window_size``
    SNPs.  The per-window results are written as a tab-separated table to
    ``calculatePCADist.out`` in the current directory.

    Parameters
    ----------
    vcf : str
        Path of the VCF file to read.
    fpop1, fpop2 : str
        Paths of text files whose first column lists the sample names of
        population 1 and population 2.  Blank lines are ignored.
    window_size : int or str
        Number of SNPs per window; converted with ``int``.

    Returns
    -------
    None
    """
    window_size = int(window_size)

    # Getting the samples.  A name listed in both files ends up as 'pop2'.
    Pops = {sample: 'pop1' for sample in _read_sample_ids(fpop1)}
    Pops.update({sample: 'pop2' for sample in _read_sample_ids(fpop2)})
    Samples = list(Pops)

    print("Reading vcf")
    callset = allel.read_vcf(
        vcf, ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'],
        samples=Samples)
    samples = callset['samples']
    chromosomes = callset['variants/CHROM']
    positions = callset['variants/POS']
    gts = callset['calldata/GT']
    idx = allel.ChromPosIndex(chromosomes, positions)
    # Distinct chromosome names in order of first appearance.
    chroms = list(dict.fromkeys(chromosomes))

    # Getting sample indices: group callset column indices by population.
    first_index = {}
    for column, sample in enumerate(samples):
        # Keep the first column for a repeated name, as list.index() would.
        first_index.setdefault(sample, column)
    dpops = defaultdict(list)
    for sample in samples:
        dpops[Pops.get(sample, 'other')].append(first_index[sample])

    print("Calculating pop distance from the centroid")
    Dist = []
    for chrom in chroms:
        chr_slice = idx.locate_key(chrom)
        chr_vars = positions[chr_slice]
        chr_gts = gts[chr_slice]

        # Filtering out rows (positions) with missing genotypes
        has_missing = allel.GenotypeArray(chr_gts).is_missing().any(axis=1)
        chr_nomissing = chr_gts[~has_missing]
        chr_vars_nomissing = chr_vars[~has_missing]

        # Retaining rows (positions) with segregating genotypes
        allele_seen = allel.GenotypeArray(chr_nomissing).count_alleles() > 0
        is_segregating = allele_seen.all(axis=1)
        chr_segregating = chr_nomissing[is_segregating]
        chr_vars_segregating = chr_vars_nomissing[is_segregating]

        # Converting genotypes to one code number
        chr_nalt = allel.GenotypeArray(chr_segregating).to_n_alt()

        # Calculating distance
        win_stat = allel.moving_statistic(chr_nalt,
                                          runPCA,
                                          size=window_size,
                                          pop_1=dpops['pop1'],
                                          pop_2=dpops['pop2'])
        flat_stat = np.concatenate(win_stat)
        # First and last SNP position of each consecutive window.
        starts = chr_vars_segregating[::window_size]
        stops = chr_vars_segregating[window_size - 1::window_size]
        wf = pd.DataFrame({
            'chrom': chrom,
            'dist': flat_stat,
            'SNP_start': starts[:len(flat_stat)],
            'SNP_stop': stops[:len(flat_stat)],
            'SNPs': window_size
        })
        Dist.append(wf)

    dW = pd.concat(Dist)
    dW['mid'] = dW['SNP_start'] + (dW['SNP_stop'] - dW['SNP_start']) / 2
    # Running window number across all chromosomes.
    dW['window'] = list(range(len(dW['dist'])))
    dW.to_csv('calculatePCADist.out',
              sep='\t',
              index=False,
              header=True,
              columns=[
                  'chrom', 'SNP_start', 'SNP_stop', 'mid', 'window', 'SNPs',
                  'dist'
              ])