Exemplo n.º 1
0
def countN1N2N3(gt, pops, mle):
    """Average the ``estimCandK`` estimates over haplotype comparisons.

    One haplotype of population A is compared with two haplotypes of
    population B: both haplotypes of the individual when B holds a single
    sample, otherwise every unordered pair of B haplotypes. For each
    comparison the joint SFS is padded to 2 x 3, reduced to the three counts
    ``n1``, ``n2`` and ``n3`` and handed to ``estimCandK``.

    Parameters
    ----------
    gt : genotype array
        Must support ``take(indices, axis=1)`` and ``to_haplotypes()``
        (presumably an ``allel.GenotypeArray`` -- confirm against caller).
    pops : sequence
        ``pops[0]`` and ``pops[1]`` are the sample (column) indices of
        populations A and B.
    mle : object
        Forwarded unchanged to ``estimCandK``.

    Returns
    -------
    tuple
        ``(mean of c_hat, mean of k_hat)`` over all comparisons.
    """
    clist = []
    klist = []
    # make gt arrays for each subpop, then haplotype arrays
    htA = gt.take(pops[0], axis=1).to_haplotypes()
    htB = gt.take(pops[1], axis=1).to_haplotypes()

    # The multiplication has to happen outside len(): with a NumPy index
    # array, len(pops[1] * 2) multiplies the indices elementwise and keeps
    # the length unchanged, so only one haplotype would be selected.
    n_hap_b = len(pops[1]) * 2
    if len(pops[1]) == 1:
        hap2_sets = [list(range(n_hap_b))]
    else:
        hap2_sets = list(combinations(range(n_hap_b), 2))

    # NOTE(review): only the first len(pops[0]) haplotype columns of A are
    # visited; if htA holds two columns per sample this skips half of them.
    # Left as in the original -- confirm the intent.
    for hap1 in range(len(pops[0])):
        ma = htA[:, [hap1]].count_alleles(max_allele=1)
        for hap2 in hap2_sets:
            mb = htB[:, hap2].count_alleles(max_allele=1)
            jsfs = allel.joint_sfs(ma[:, 1], mb[:, 1])
            # joint_sfs only spans the observed counts, so always pad to the
            # full 2 x 3 table instead of relying on an IndexError.
            z = np.zeros((2, 3), dtype=int)
            z[:jsfs.shape[0], :jsfs.shape[1]] = jsfs
            n3 = z[0, 2] + z[1, 0]
            n2 = z[0, 1] + z[1, 1]
            n1 = z[0, 0] + z[1, 2]
            c_hat, k_hat = estimCandK(n1, n2, n3, mle)
            clist.append(c_hat)
            klist.append(k_hat)
    return (np.mean(clist), np.mean(klist))
Exemplo n.º 2
0
def countN1N2N3(gt, pops, mle):
    """Average the ``estimCandK`` estimates over haplotype comparisons.

    One haplotype of population A is compared with two haplotypes of
    population B: both haplotypes of the individual when B holds a single
    sample, otherwise every unordered pair of B haplotypes. For each
    comparison the joint SFS is padded to 2 x 3, reduced to the three counts
    ``n1``, ``n2`` and ``n3`` and handed to ``estimCandK``.

    Parameters
    ----------
    gt : genotype array
        Must support ``take(indices, axis=1)`` and ``to_haplotypes()``
        (presumably an ``allel.GenotypeArray`` -- confirm against caller).
    pops : sequence
        ``pops[0]`` and ``pops[1]`` are the sample (column) indices of
        populations A and B.
    mle : object
        Forwarded unchanged to ``estimCandK``.

    Returns
    -------
    tuple
        ``(mean of c_hat, mean of k_hat)`` over all comparisons.
    """
    clist = []
    klist = []
    # make gt arrays for each subpop, then haplotype arrays
    htA = gt.take(pops[0], axis=1).to_haplotypes()
    htB = gt.take(pops[1], axis=1).to_haplotypes()

    # The multiplication has to happen outside len(): with a NumPy index
    # array, len(pops[1] * 2) multiplies the indices elementwise and keeps
    # the length unchanged, so only one haplotype would be selected.
    n_hap_b = len(pops[1]) * 2
    if len(pops[1]) == 1:
        hap2_sets = [list(range(n_hap_b))]
    else:
        hap2_sets = list(combinations(range(n_hap_b), 2))

    # NOTE(review): only the first len(pops[0]) haplotype columns of A are
    # visited; if htA holds two columns per sample this skips half of them.
    # Left as in the original -- confirm the intent.
    for hap1 in range(len(pops[0])):
        ma = htA[:, [hap1]].count_alleles(max_allele=1)
        for hap2 in hap2_sets:
            mb = htB[:, hap2].count_alleles(max_allele=1)
            jsfs = allel.joint_sfs(ma[:, 1], mb[:, 1])
            # joint_sfs only spans the observed counts, so always pad to the
            # full 2 x 3 table instead of relying on an IndexError.
            z = np.zeros((2, 3), dtype=int)
            z[:jsfs.shape[0], :jsfs.shape[1]] = jsfs
            n3 = z[0, 2] + z[1, 0]
            n2 = z[0, 1] + z[1, 1]
            n1 = z[0, 0] + z[1, 2]
            c_hat, k_hat = estimCandK(n1, n2, n3, mle)
            clist.append(c_hat)
            klist.append(k_hat)
    return (np.mean(clist), np.mean(klist))
Exemplo n.º 3
0
def joint_site_frequency_spectrum(genotypes1: np.ndarray, genotypes2: np.ndarray, population1: str='population1', population2: str='population2') -> np.ndarray:
    """Compute, plot and return the normalised joint SFS of two populations.

    Each genotype array has variants along axis 0; all remaining axes are
    flattened and summed to obtain one allele count per variant (so entries
    are presumably 0/1 alternate-allele indicators -- confirm with caller).

    Parameters
    ----------
    genotypes1, genotypes2 : np.ndarray
        Genotype arrays of the two populations.
    population1, population2 : str
        Labels passed to ``plot_joint_sfs`` and used (with spaces replaced
        by underscores) in the name of the PNG written to ``FIGURES_DIR``.

    Returns
    -------
    np.ndarray
        The joint SFS divided by its total, i.e. entries sum to one.
    """
    allele_counts1 = genotypes1.reshape(genotypes1.shape[0], -1).sum(1)
    allele_counts2 = genotypes2.reshape(genotypes2.shape[0], -1).sum(1)
    # Number of summed entries per variant. np.product was removed in
    # NumPy 2.0; np.prod is the supported spelling.
    n_chrom1 = np.prod(genotypes1.shape[1:])
    n_chrom2 = np.prod(genotypes2.shape[1:])
    joint_sfs = allel.joint_sfs(allele_counts1, allele_counts2, n_chrom1, n_chrom2)
    plot_joint_sfs(joint_sfs, population1, population2)
    figure_name = '{}.{}.joint_sfs.png'.format(population1.replace(' ', '_'), population2.replace(' ', '_'))
    plt.savefig(os.path.join(FIGURES_DIR, figure_name))
    # Clear the current figure so later plots do not draw on top of this one.
    plt.clf()
    return joint_sfs / joint_sfs.sum()
Exemplo n.º 4
0
def jsfsStats(gt, pops, chrm, fold=False, plot=False):
    """Joint site frequency spectrum summaries with scikit-allel.

    Up to 100000 variants are drawn at random (without replacement, kept in
    their original order). For every pair of populations the joint SFS is
    computed, zero-padded to its full size and collapsed into 23 sums.

    Each axis of the padded spectrum is split into the bins ``0``, ``1:3``,
    ``3:-3``, ``-3:-1`` and ``-1`` (last entry); the 23 values are the sums
    over selected combinations of those bins, in the order of the table
    defined in the body.

    Parameters
    ----------
    gt : genotype array
        Variants along axis 0, samples along axis 1; must support
        ``take`` and ``count_alleles``.
    pops : sequence of sequences
        Sample (column) indices of each population.
    chrm : object
        Unused; kept for interface compatibility.
    fold : bool
        Use the folded joint SFS instead of the unfolded one.
    plot : bool
        Draw each padded spectrum with ``allel.stats.plot_joint_sfs``.

    Returns
    -------
    list of np.ndarray
        One length-23 float array per population pair.
    """
    print("jsfs")
    n = 100000  # number of SNPs to choose randomly
    n_variants = gt.shape[0]
    # Take every variant when fewer than n are available.
    vidx = np.random.choice(n_variants, min(n, n_variants), replace=False)
    vidx.sort()
    gtr = gt.take(vidx, axis=0)

    low, mid, high = slice(1, 3), slice(3, -3), slice(-3, -1)
    # (row selector, column selector) of each summary, in output order.
    regions = [
        (0, low), (low, 0),
        (0, mid), (mid, 0),
        (0, high), (high, 0),
        (low, low), (low, mid), (mid, low),
        (high, mid), (mid, high),
        (low, high), (high, low),
        (mid, mid), (high, high),
        (0, -1), (-1, 0),
        (-1, low), (low, -1),
        (-1, mid), (mid, -1),
        (-1, high), (high, -1),
    ]

    jsfslist = []
    for i, j in combinations(pops, 2):
        ac1 = gtr.take(i, axis=1).count_alleles()
        ac2 = gtr.take(j, axis=1).count_alleles()
        # Two chromosomes per sample, as the unfolded padding has always
        # assumed.
        n_chrom_a, n_chrom_b = len(i) * 2, len(j) * 2
        if fold:
            # A folded spectrum spans minor-allele counts 0..n_chrom // 2.
            # The previous ``len(i) / 2`` produced a float (TypeError in
            # np.zeros under Python 3) and was too small to hold the result.
            dim_a, dim_b = n_chrom_a // 2, n_chrom_b // 2
            jsfs = allel.joint_sfs_folded(ac1, ac2)
        else:
            dim_a, dim_b = n_chrom_a, n_chrom_b
            jsfs = allel.joint_sfs(ac1[:, 1], ac2[:, 1])
        # allel only spans the observed counts; pad to the full spectrum so
        # the negative-index bins below refer to fixed allele counts.
        fs = np.zeros((dim_a + 1, dim_b + 1), dtype=int)
        fs[:jsfs.shape[0], :jsfs.shape[1]] = jsfs
        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_joint_sfs(fs, ax=ax)
        jsfsarray = np.array([np.sum(fs[rows, cols]) for rows, cols in regions],
                             dtype=float)
        jsfslist.append(jsfsarray)
    return jsfslist
Exemplo n.º 5
0
def ts_to_dadi_sfs(ts_path,
                   out_path,
                   out_path_nonvariant,
                   sample_size=20,
                   mask_file=None):
    '''
    Load a tree sequence and save its two-population joint SFS in dadi format.

    The first ``sample_size`` haplotype columns form population 0 and the
    remaining columns population 1. Two files are written: ``out_path``
    holds the spectrum as computed, ``out_path_nonvariant`` the same
    spectrum with entry [0, 0] set to the number of non-variant positions
    (remaining sequence length minus the number of sites in the spectrum).

    ts_path: path of the tree sequence file; when masking, the file name up
        to its first "." is used as the chromosome name.
    out_path: destination of the spectrum.
    out_path_nonvariant: destination of the spectrum with the [0, 0] entry.
    sample_size: number of haplotypes belonging to population 0.
    mask_file: optional tab-separated file without header whose columns are
        chromosome, interval start and interval end. Sites falling in an
        interval of this chromosome are dropped and the interval lengths
        are subtracted from the sequence length.
    '''
    ts = tskit.load(ts_path)

    haps = ts.genotype_matrix()

    total_length = ts.sequence_length

    # Masking. genotype_matrix() and ts.variants() are both per site, so the
    # mask must be sized by sites; sizing it by mutations breaks as soon as
    # a site carries more than one mutation.
    masked = np.full(ts.num_sites, False)
    if mask_file:
        mask_table = pd.read_csv(mask_file, sep="\t", header=None)
        chrom = ts_path.split("/")[-1].split(".")[0]
        sub = mask_table[mask_table[0] == chrom]
        mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
        snp_locs = [int(x.site.position) for x in ts.variants()]
        # IntervalIndex.contains is elementwise in current pandas (one flag
        # per interval), so reduce it to "inside any interval".
        in_mask = np.array([np.any(mask_ints.contains(x)) for x in snp_locs],
                           dtype=bool)
        masked = np.logical_or(masked, in_mask)
        # NOTE(review): overlapping intervals would be subtracted twice --
        # confirm the mask file holds disjoint intervals.
        total_length -= np.sum(mask_ints.length)

    retain = np.logical_not(masked)

    haps_pops_joint = np.array(haps[retain, :])

    # Break up the haplotypes into separate populations based on sample_size
    haps_pop0_joint = haps_pops_joint[:, :sample_size]
    haps_pop1_joint = haps_pops_joint[:, sample_size:]

    genotypes_pop0_joint = allel.HaplotypeArray(haps_pop0_joint).to_genotypes(
        ploidy=2)
    allele_counts_pop0_joint = genotypes_pop0_joint.count_alleles()
    genotypes_pop1_joint = allel.HaplotypeArray(haps_pop1_joint).to_genotypes(
        ploidy=2)
    allele_counts_pop1_joint = genotypes_pop1_joint.count_alleles()

    sfs_joint = allel.joint_sfs(allele_counts_pop0_joint[:, 1],
                                allele_counts_pop1_joint[:, 1])
    num_sites = sfs_joint.sum()
    sfs_joint = dadi.Spectrum(sfs_joint)
    sfs_joint.to_file(out_path)
    # need to get the number of nonvariant sites for the [0,0] entry
    sfs_joint[0, 0] = total_length - num_sites
    sfs_joint.to_file(out_path_nonvariant)
Exemplo n.º 6
0
def test_joint_sfs():
    """Check joint_sfs on mixed integer dtypes with warnings raised as errors.

    Regression test for https://github.com/cggh/scikit-allel/issues/144
    """
    # Turn every warning into an exception for the duration of the check.
    warnings.resetwarnings()
    warnings.simplefilter('error')
    try:
        dac1 = np.array([0, 1, 2, 3, 4])
        dac2 = np.array([1, 2, 1, 2, 3], dtype='u8')
        s = joint_sfs(dac1, dac2)
        e = [[0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
        assert_array_equal(e, s)
    finally:
        # Restore the filter even when the assertion fails, so the 'error'
        # setting cannot leak into tests that run afterwards.
        warnings.resetwarnings()
        warnings.simplefilter('always')
Exemplo n.º 7
0
def jsfs_stats(p1, gt, pos, fold):
    """Summarise the joint site frequency spectrum of two populations.

    The array returned by ``get_seg(gt, pos)`` is split by column: the first
    ``p1`` columns form population 1, the remaining columns population 2.
    The joint SFS of the two is computed with scikit-allel and passed to
    ``summarizejsfs``.

    Parameters
    ----------
    p1 : int
        Number of leading columns that belong to population 1.
    gt : array
        Genotype data handed to ``get_seg``.
    pos : array
        Positions handed to ``get_seg``.
    fold : bool
        If true use ``allel.joint_sfs_folded``, otherwise ``allel.joint_sfs``
        on the counts of allele 1.

    Returns
    -------
    props
        Whatever ``summarizejsfs`` returns for the spectrum.

    """
    seg_gt, _ = get_seg(gt, pos)
    n_columns = seg_gt.shape[1]
    first_pop = seg_gt.take(range(p1), axis=1)
    second_pop = seg_gt.take(range(p1, n_columns), axis=1)
    counts_a = first_pop.count_alleles()
    counts_b = second_pop.count_alleles()
    # NOTE(review): the column counts are passed as n1/n2; presumably each
    # column is one chromosome here -- confirm against what get_seg returns.
    size_a = first_pop.shape[1]
    size_b = second_pop.shape[1]
    if not fold:
        spectrum = allel.joint_sfs(counts_a[:, 1], counts_b[:, 1], size_a,
                                   size_b)
    else:
        spectrum = allel.joint_sfs_folded(counts_a, counts_b, size_a, size_b)
    return summarizejsfs(spectrum)
Exemplo n.º 8
0
def msprime_to_dadi_simulation(path, seed, org, chrom, sample_size=20):
    '''
    Simulate two populations with msprime and save their joint SFS for dadi.

    path: file the dadi spectrum is written to.
    seed: random seed given to msprime.simulate.
    org: "<species>_<model>" string; everything before the last underscore
        names an attribute of stdpopsim, the part after it names the model
        class on that attribute, which is instantiated without arguments.
    chrom: key into the species' genome.chromosomes.
    sample_size: number of samples drawn from each of populations 0 and 1.
    '''
    species_name, _, model_name = org.rpartition('_')
    species = getattr(stdpopsim, species_name)
    chromosome = species.genome.chromosomes[chrom]
    demographic_model = getattr(species, model_name)()

    pop0_samples = [msprime.Sample(population=0, time=0)] * sample_size
    pop1_samples = [msprime.Sample(population=1, time=0)] * sample_size
    tree_sequence = msprime.simulate(
        samples=pop0_samples + pop1_samples,
        recombination_map=chromosome.recombination_map(),
        mutation_rate=chromosome.default_mutation_rate,
        random_seed=seed,
        **demographic_model.asdict())
    haplotypes = np.array(tree_sequence.genotype_matrix())

    # Split the columns into the two populations and count alleles in each.
    allele_counts = []
    for columns in (haplotypes[:, :sample_size], haplotypes[:, sample_size:]):
        genotypes = allel.HaplotypeArray(columns).to_genotypes(ploidy=2)
        allele_counts.append(genotypes.count_alleles())

    spectrum = allel.joint_sfs(allele_counts[0][:, 1], allele_counts[1][:, 1])
    dadi.Spectrum(spectrum).to_file(path)