Exemplo n.º 1
0
def pca_fx(gnu, meta, nchr, pop2color, pcAll, population, bykary=False):
    """Run a Patterson-scaled PCA (10 components) on ``gnu`` and plot it.

    Parameters
    ----------
    gnu : genotype object transformed with ``.to_n_alt()``; ``gnu.shape[0]``
        is reported in the plot title as the variant count.
    meta : table exposing a ``Population`` column (and a ``ChromForm`` column
        when ``bykary`` is set).
    nchr : chromosome label, used only in the plot title.
    pop2color : dict mapping a sample label to a colour.  NOTE: when
        ``bykary`` is true the keys ``'Kiribina'`` and ``'Folonzo'`` are
        written into this dict in place.
    pcAll : if truthy, plot the component pairs (0, 1), (2, 3), ..., (8, 9);
        otherwise plot only the first pair.
    population : the string ``'All'`` to label every sample, or a collection
        of population names accepted by ``Series.isin``.
    bykary : if true, label samples by ``meta.ChromForm`` instead of by
        population.

    Returns
    -------
    tuple
        ``(coords, model)`` as returned by ``allel.pca``.
    """
    coords, model = allel.pca(gnu, n_components=10, scaler='patterson')
    title = "PCA Chr:{}, var:{}".format(nchr, gnu.shape[0])

    # Compare by value: ``is 'All'`` tested object identity, which only works
    # when the interpreter happens to intern the caller's string.  The
    # isinstance guard keeps list/array arguments out of an elementwise ``==``.
    if isinstance(population, str) and population == 'All':
        samples = meta.Population.values
    else:
        samples = meta.Population[meta.Population.isin(population)].values

    if bykary:
        s = meta.Population[meta.Population.isin(samples)].index.tolist()
        samples = meta.ChromForm[s].values
        pop2color['Kiribina'] = '#FF0000'
        pop2color['Folonzo'] = '#008000'

    if pcAll:
        # One figure per consecutive pair of the 10 components.
        for first_pc in range(0, 9, 2):
            fig_pca(coords, model, title, samples, pop2color, first_pc,
                    first_pc + 1)
    else:
        fig_pca(coords, model, title, samples, pop2color, 0, 1)
    hist_var(model, title)
    return (coords, model)
Exemplo n.º 2
0
def run_pca(inversion,
            vtbl,
            genotypes,
            variance_threshold=0.15,
            min_count=3,
            whole_inversion=True,
            buffer=0,
            samples_bool=None,
            inversionDict=inversionDict):
    """Run ``allel.pca`` on the genotypes selected for one inversion.

    A site-filter expression is built with ``construct_filter_expression``,
    evaluated against ``vtbl`` to get a per-site selector, and passed with
    ``samples_bool`` to ``filter_and_convert_genotypes``.  The first element
    of that helper's two-element result is fed to ``allel.pca`` with its
    default settings.

    Returns the ``(coords, model)`` pair produced by ``allel.pca``.
    """
    site_expression = construct_filter_expression(
        inversion,
        inversionDict,
        whole_inversion=whole_inversion,
        buffer=buffer,
    )
    site_selector = vtbl.eval(site_expression)

    # The second element of the helper's result is not needed here.
    alt_alleles, _discarded = filter_and_convert_genotypes(
        genotypes,
        sites_boolean=site_selector,
        samples_boolean=samples_bool,
        min_count=min_count,
        variance_threshold=variance_threshold,
    )

    coords, model = allel.pca(alt_alleles)
    return coords, model
Exemplo n.º 3
0
def pca(geno,
        chrom,
        ploidy,
        dataset,
        populations,
        samples,
        pop_colours,
        prune=True,
        scaler=None):
    """Compute a 10-component PCA for one chromosome/dataset and plot it.

    When ``ploidy`` is greater than 1 the genotypes are first converted with
    ``.to_n_alt()``.  When ``prune`` is exactly ``True`` the data is then
    passed through ``ld_prune`` (size=500, step=200, threshold=0.2).  The
    result is handed to ``fig_pca`` together with a title and an output path
    prefix derived from ``chrom`` and ``dataset``.  Nothing is returned.
    """
    # Both pruning branches need the same conversion, so do it once up front.
    if ploidy > 1:
        geno = geno.to_n_alt()
    if prune is True:
        geno = ld_prune(geno, size=500, step=200, threshold=0.2)

    coords1, model1 = allel.pca(geno, n_components=10, scaler=scaler)

    plot_title = f"PCA {chrom} {dataset}"
    output_prefix = f"results/variantAnalysis/pca/PCA-{chrom}-{dataset}"
    fig_pca(coords1,
            model1,
            plot_title,
            output_prefix,
            samples,
            pop_colours,
            sample_population=populations)
Exemplo n.º 4
0
def runPCA(genotypes, **kwargs):
    """Return distance-to-centroid ratios in a 2-component PCA space.

    A two-component ``allel.pca`` is computed, and each sample's Euclidean
    distance to the centroid of all coordinates is stored in a ``dst``
    column.  The ``dst`` values selected by ``kwargs['pop_2']`` are divided
    by the mean ``dst`` of the rows selected by ``kwargs['pop_1']``.

    Note that the ``pop_2`` distances are not averaged: the result keeps one
    value per selected row.
    """
    coords = allel.pca(genotypes, n_components=2)[0]
    centroid = np.mean(coords, axis=0)
    frame = pd.DataFrame(coords, columns=['x', 'y'])

    def _distance_to_centroid(row):
        return distance.euclidean([row['x'], row['y']], centroid)

    frame['dst'] = frame.apply(_distance_to_centroid, axis=1)

    reference_rows = kwargs['pop_1']
    focal_rows = kwargs['pop_2']
    reference_mean = np.mean(frame.loc[reference_rows, 'dst'])
    focal_distances = frame.loc[focal_rows, 'dst']
    return focal_distances / reference_mean
Exemplo n.º 5
0
def pca(directory, outfn, column, newVCF=False, samples=None, bs=20000):
    """
    Entry point for the PCA visualization.

    Data is prepared with ``prepData`` and sample metadata is fetched with
    ``fp.retrieveMetaData``.  A 10-component Patterson-scaled PCA is drawn
    via ``fig_pca``, labelled by the metadata column ``column``, and the
    current figure is saved to ``<directory>graphics/<outfn>_pca.jpg``.
    """
    # The second value returned by prepData is not used in this function.
    gn, _callset = prepData(directory, outfn, newVCF, samples, bs)

    metadata = fp.retrieveMetaData(samples, directory, outfn)

    coords1, model1 = allel.pca(gn, n_components=10, scaler='patterson')

    fig_pca(coords1,
            model1,
            'Conventional PCA.',
            sample_population=metadata[column])

    figure_path = directory + "graphics/" + outfn + "_pca.jpg"
    plt.savefig(figure_path)
Exemplo n.º 6
0
def sim_load_h5_to_PCA(h5_path):
    '''
    Load a callset from an HDF5 file, drop non-informative variants,
    LD-prune the remainder and fit a 2-component Patterson-scaled PCA.

    input: path of the HDF5 file; it must contain a 'calldata/GT' dataset
    output: PCA coordinates as returned by allel.pca (first element)

    Side effect: the coordinates are also written, comma-delimited, to
    'data_s//tgp_pca<k>.txt'.
    '''

    # Removing correlated features (LD pruning): each SNP is a feature and
    # SNPs tend to be correlated. This step can take a while.
    def ld_prune(gn, size, step, threshold=.1, n_iter=1):
        for i in range(n_iter):
            loc_unlinked = allel.locate_unlinked(gn,
                                                 size=size,
                                                 step=step,
                                                 threshold=threshold)
            n = np.count_nonzero(loc_unlinked)
            n_remove = gn.shape[0] - n
            print('iteration', i + 1, 'retaining', n, 'removing', n_remove,
                  'variants')
            gn = gn.compress(loc_unlinked, axis=0)
        return gn

    # Open the file in a context manager so the handle is always released;
    # the original left it open. All array work stays inside the block because
    # the chunked arrays may still read from the file lazily.
    with h5py.File(h5_path, mode='r') as callset:
        # Reference: http://alimanfoo.github.io/2015/09/28/fast-pca.html
        g = allel.GenotypeChunkedArray(callset['calldata/GT'])
        ac = g.count_alleles()[:]

        # Remove singletons and multiallelic SNPs; singletons are not
        # informative for PCA.
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        gf = g.compress(flt, axis=0)
        # 2-D matrix where each cell holds the number of non-reference
        # alleles per call.
        gn = gf.to_n_alt()

        # More than 3 iterations removes almost nothing further.
        gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=3)

        # PCA
        k = 2
        coords1, model1 = allel.pca(gnu, n_components=k, scaler='patterson')

    np.savetxt('data_s//tgp_pca' + str(k) + '.txt', coords1, delimiter=',')
    return coords1
Exemplo n.º 7
0
def pca(genotypes_012, subpops):
    """Filter, LD-prune and run a two-component Patterson PCA.

    :param genotypes_012, genotype matrix in 012 (scikit-allel alt_n format)
    :param subpops, dictionary with "domestic", "wild" and "captive" keys,
        each giving the row labels of that subpopulation
    :returns pd.DataFrame with columns "pc1", "pc2" and "population"
    """
    polymorphic = sim.utils.monomorphic_012_filter(genotypes_012)
    pruned = sim.utils.ld_prune(polymorphic)

    coords, model = allel.pca(pruned, n_components=2, scaler='patterson')

    columns = {
        "pc1": coords[:, 0],
        "pc2": coords[:, 1],
        "population": "",
    }
    pca_data = pd.DataFrame(columns)

    # Rows not covered by any of the three subpopulations keep the "" label.
    for label in ("domestic", "wild", "captive"):
        pca_data.loc[subpops[label], "population"] = label

    return pca_data
Exemplo n.º 8
0
def pca(directory, outfn, column, newVCF=False, samples=None, bs=20000):
    """
    Entry point for the PCA visualization.

    Reads ``<directory><outfn>.vcf`` with ``allel.read_vcf``, converts the
    genotype calls with ``transform`` and runs a 10-component
    Patterson-scaled PCA. The result is passed to ``fig_pca`` with samples
    labelled by the metadata column ``column``.

    ``newVCF`` and ``bs`` are accepted for signature compatibility but are
    not used by this implementation. ``samples`` is only forwarded to
    ``fp.retrieveMetaData``.
    """
    # A leftover, unused ``import pdb`` was removed from this function.
    callset = allel.read_vcf(directory + outfn + ".vcf")

    g = allel.GenotypeChunkedArray(callset['calldata/GT'])
    gn = transform(g)

    ## get metadata
    df = fp.retrieveMetaData(samples, directory, outfn)

    coords1, model1 = allel.pca(gn, n_components=10, scaler='patterson')

    fig_pca(directory,
            outfn,
            coords1,
            model1,
            'Conventional PCA.',
            sample_population=df[column])
Exemplo n.º 9
0
    def _benchmark_pca(self, gt):
        """Run the PCA benchmark pipeline on the genotype array ``gt``.

        Each timed stage is bracketed by ``self.benchmark_profiler``
        ``start_benchmark`` / ``end_benchmark`` calls.  Stages, in order:

        1. count alleles per variant;
        2. count multiallelic SNPs;
        3. count biallelic singletons;
        4. filter out singletons and multiallelic SNPs (skipped when the
           filter would remove everything);
        5. convert genotypes with ``to_n_alt()``;
        6. optionally take a random subset of variants (not timed);
        7. optionally LD-prune (not done for the Dask array type);
        8. conventional PCA via ``allel.pca``;
        9. randomized PCA via ``allel.randomized_pca``.

        All behaviour is driven by ``self.bench_conf``.  Intermediate results
        are deleted as soon as they are no longer needed and nothing is
        returned.  The process exits with status 1 if the configured
        genotype array type is not recognised during subsetting.
        """
        # Count alleles at each variant
        self.benchmark_profiler.start_benchmark('PCA: Count alleles')
        ac = gt.count_alleles()
        self.benchmark_profiler.end_benchmark()

        # Count number of multiallelic SNPs
        self.benchmark_profiler.start_benchmark('PCA: Count multiallelic SNPs')
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            num_multiallelic_snps = da.count_nonzero(
                ac.max_allele() > 1).compute()
        else:
            num_multiallelic_snps = np.count_nonzero(ac.max_allele() > 1)
        self.benchmark_profiler.end_benchmark()
        # The count itself is never used afterwards; it is discarded here.
        del num_multiallelic_snps

        # Count number of biallelic singletons
        self.benchmark_profiler.start_benchmark(
            'PCA: Count biallelic singletons')
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            num_biallelic_singletons = da.count_nonzero(
                (ac.max_allele() == 1) & ac.is_singleton(1)).compute()
        else:
            num_biallelic_singletons = np.count_nonzero((ac.max_allele() == 1)
                                                        & ac.is_singleton(1))
        self.benchmark_profiler.end_benchmark()
        # As above, the count is discarded without being used.
        del num_biallelic_singletons

        # Apply filtering to remove singletons and multiallelic SNPs
        # NOTE: the filter mask and its count are computed before the timer
        # starts, so only the take/compress step below is timed.
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        flt_count = np.count_nonzero(flt)
        self.benchmark_profiler.start_benchmark(
            'PCA: Remove singletons and multiallelic SNPs')
        if flt_count > 0:
            if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
                gf = gt.take(np.flatnonzero(flt), axis=0)
            else:
                gf = gt.compress(condition=flt, axis=0)
        else:
            # Don't apply filtering
            print(
                '[Exec][PCA] Cannot remove singletons and multiallelic SNPs as no data would remain. Skipping...'
            )
            gf = gt
        self.benchmark_profiler.end_benchmark()
        del ac, flt, flt_count

        # Transform genotype data into 2-dim matrix
        self.benchmark_profiler.start_benchmark(
            'PCA: Transform genotype data for PCA')
        gn = gf.to_n_alt()
        self.benchmark_profiler.end_benchmark()
        del gf

        # Randomly choose subset of SNPs
        # A configured subset size of -1 means "use every variant".
        if self.bench_conf.pca_subset_size == -1:
            print('[Exec][PCA] Including all ({}) variants for PCA.'.format(
                gn.shape[0]))
            gnr = gn
        else:
            # Never ask for more variants than are available.
            n = min(gn.shape[0], self.bench_conf.pca_subset_size)
            print(
                '[Exec][PCA] Including {} random variants for PCA.'.format(n))
            vidx = np.random.choice(gn.shape[0], n, replace=False)
            # Sort the sampled indices in place so variants keep their order.
            vidx.sort()
            if self.bench_conf.genotype_array_type in [
                    config.GENOTYPE_ARRAY_NORMAL, config.GENOTYPE_ARRAY_CHUNKED
            ]:
                gnr = gn.take(vidx, axis=0)
            elif self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
                gnr = gn[
                    vidx]  # Use indexing workaround since Dask Array's take() method is not working properly
            else:
                print(
                    '[Exec][PCA] Error: Unspecified genotype array type specified.'
                )
                exit(1)
            del vidx

        if self.bench_conf.pca_ld_enabled:
            if self.bench_conf.genotype_array_type != config.GENOTYPE_ARRAY_DASK:
                # Apply LD pruning to subset of SNPs
                size = self.bench_conf.pca_ld_pruning_size
                step = self.bench_conf.pca_ld_pruning_step
                threshold = self.bench_conf.pca_ld_pruning_threshold
                n_iter = self.bench_conf.pca_ld_pruning_number_iterations

                self.benchmark_profiler.start_benchmark(
                    'PCA: Apply LD pruning')
                gnu = self._pca_ld_prune(gnr,
                                         size=size,
                                         step=step,
                                         threshold=threshold,
                                         n_iter=n_iter)
                self.benchmark_profiler.end_benchmark()
            else:
                print(
                    '[Exec][PCA] Cannot apply LD pruning because Dask genotype arrays do not support this operation.'
                )
                gnu = gnr
        else:
            print('[Exec][PCA] LD pruning disabled. Skipping this operation.')
            gnu = gnr

        # Run PCA analysis
        pca_num_components = self.bench_conf.pca_number_components
        scaler = self.bench_conf.pca_data_scaler

        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Rechunk Dask array to work with Dask's svd function (single chunk for transposed column)
            gnu_pca_conv = gnu.rechunk({0: -1, 1: gt.values.chunksize[1]})
        else:
            gnu_pca_conv = gnu

        # Run conventional PCA analysis
        self.benchmark_profiler.start_benchmark(
            'PCA: Run conventional PCA analysis (scaler: {})'.format(
                scaler if scaler is not None else 'none'))
        coords, model = allel.pca(gnu_pca_conv,
                                  n_components=pca_num_components,
                                  scaler=scaler)
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Presumably forces the lazy Dask result to be evaluated inside
            # the timed region; the computed value is not kept.
            coords.compute()
        self.benchmark_profiler.end_benchmark()
        del gnu_pca_conv, coords, model

        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Rechunk Dask array to match original genotype chunk size
            gnu_pca_rand = gnu.rechunk(
                (gt.values.chunksize[0], gt.values.chunksize[1]))
        else:
            gnu_pca_rand = gnu

        # Run randomized PCA analysis
        self.benchmark_profiler.start_benchmark(
            'PCA: Run randomized PCA analysis (scaler: {})'.format(
                scaler if scaler is not None else 'none'))
        coords, model = allel.randomized_pca(gnu_pca_rand,
                                             n_components=pca_num_components,
                                             scaler=scaler)
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Same as for the conventional PCA: evaluate, then discard.
            coords.compute()
        self.benchmark_profiler.end_benchmark()
        del gnu_pca_rand, coords, model
Exemplo n.º 10
0
# Simulate with stdpopsim's msprime engine using a fixed seed; `model`,
# `new_contig` and `simsamples` are defined outside this excerpt.
engine = stdpopsim.get_engine('msprime')
sim = engine.simulate(model, new_contig, simsamples, seed=12345)
# Convert the simulated haplotype matrix into ploidy-2 genotypes.
sim_gen = allel.HaplotypeArray(sim.genotype_matrix()).to_genotypes(ploidy=2)
sim_pos = np.array([s.position for s in sim.sites()], dtype="int32")
# Keep only sites whose position appears in `keep` (defined elsewhere).
m2 = np.isin(sim_pos, keep)
sim_gen = sim_gen[m2, :, :]
sim_pos = sim_pos[m2]
# sim_gen=sim_gen[sim_pos<3.8e7,:,:]
# sim_pos=sim_pos[sim_pos<3.8e7]

# NOTE(review): sim_pos is rebound to the positions returned by the filter.
sim_dc_all, sim_dc, sim_ac_all, sim_ac, sim_pos = filter_genotypes(
    sim_gen, sim_pos)

##################### comparing PCA of real vs generated genotypes #######################
# Each allel.pca call returns a (coords, model) pair; only index [0] is used below.
realpca = allel.pca(
    np.transpose(dc) * 2, scaler=None, n_components=2
)  #*2 here to rescale real genotypes back to 0/1/2 to match binomial(2,...) used to bin genotypes.
genpca = allel.pca(np.transpose(bingen), scaler=None, n_components=2)
simpca = allel.pca(sim_dc, scaler=None, n_components=6)
sampledata = pd.read_csv("data/1kg/sample_metadata.txt", sep="\t")
# Real and generated coordinates side by side: two columns each.
df = pd.DataFrame(np.hstack((realpca[0], genpca[0])))
df.columns = ['realPC1', 'realPC2', 'genPC1', 'genPC2']
df['sampleID'] = samples
# Attach the sample metadata by matching on the sampleID column.
df = df.merge(sampledata, on='sampleID')
df.to_csv('out/1kg/1kg_decoder_PCA.csv', sep=",", index=False)

# Six-component coordinates of the simulated data.
simdf = pd.DataFrame(simpca[0])
simdf.columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
simdf['pop'] = np.concatenate(
    [np.repeat("YRI", 50),
     np.repeat("CEU", 50),
Exemplo n.º 11
0
Arquivo: pca.py Projeto: tle003/ipyrad
    def plot(self, pcs=[1, 2], ax=None, cmap=None, cdict=None, legend=True, title=None, outfile=None):
        """
        Do the PCA and plot it.

        Parameters
        ---------
        pcs: list of ints
            The two principal components to plot, 1-indexed (default PC1 and
            PC2). Only the first two entries are read.
        ax: matplotlib axis
            Axis to draw on. A new 6x5 figure is created when omitted.
        cmap: matplotlib colormap
            A colormap object or the name of one. Takes precedence over
            `cdict` when both are given. Defaults to 'Spectral' when neither
            is given.
        cdict: dictionary mapping pop names to colors
        legend: boolean, whether or not to show the legend
        title: optional axis title
        outfile: optional path; the figure is saved there as a PNG
            (best effort, a failure is only reported on stdout)

        Returns
        -------
        The matplotlib axis that was drawn on.

        Raises
        ------
        IPyradError
            If either requested PC is outside 1..ncomponents, or if `cmap`
            is a string that matplotlib cannot resolve.

        Side effects: may lower `self.ncomponents` to the number of samples,
        and stores the coordinates as a DataFrame in `self.pcs`.
        """
        ## Cap the number of components at the number of samples *before*
        ## validating `pcs`, so the bounds check reflects the PCA that is
        ## actually computed.
        if self.ncomponents > len(self.samples_vcforder):
            self.ncomponents = len(self.samples_vcforder)
            print("  INFO: # PCs > # samples. Forcing # PCs = {}".format(self.ncomponents))

        ## Specify which 2 pcs to plot, default is pc1 and pc2
        pc1 = pcs[0] - 1
        pc2 = pcs[1] - 1
        ## Check both indices against both bounds; the original only tested
        ## the lower bound of pc1 and the upper bound of pc2.
        max_index = self.ncomponents - 1
        if not (0 <= pc1 <= max_index and 0 <= pc2 <= max_index):
            raise IPyradError("PCs are 1-indexed. 1 is min & {} is max".format(self.ncomponents))

        ## Convert genotype data to allele count data
        ## We do this here because we might want to try different ways
        ## of accounting for missing data and "alt" allele counts treat
        ## missing data as "ref"
        allele_counts = self.genotypes.to_n_alt()

        ## Actually do the pca
        coords, model = allel.pca(allele_counts, n_components=self.ncomponents, scaler='patterson')

        self.pcs = pd.DataFrame(coords,
                                index=self.samples_vcforder,
                                columns=["PC{}".format(x) for x in range(1, self.ncomponents + 1)])

        ## Just allow folks to pass in the name of the cmap they want to use
        if isinstance(cmap, str):
            try:
                cmap = cm.get_cmap(cmap)
            except Exception:
                ## Not a bare except: KeyboardInterrupt/SystemExit must pass through.
                raise IPyradError("  Bad cmap value: {}".format(cmap))

        if not cmap and not cdict:
            if not self.quiet:
                print("  Using default cmap: Spectral")
            cmap = cm.get_cmap('Spectral')

        if cmap:
            if cdict:
                print("  Passing in both cmap and cdict defaults to using the cmap value.")
            ## One evenly spaced colour per population, in self.pops key order.
            popcolors = cmap(np.arange(len(self.pops))/len(self.pops))
            cdict = {i:j for i, j in zip(self.pops.keys(), popcolors)}

        ## `fig` stays None when the caller supplied the axis.
        fig = None
        if ax is None:
            fig = plt.figure(figsize=(6, 5))
            ax = fig.add_subplot(1, 1, 1)

        x = coords[:, pc1]
        y = coords[:, pc2]
        for pop in self.pops:
            ## Don't include pops with no samples, it makes the legend look stupid
            ## TODO: This doesn't prevent empty pops from showing up in the legend for some reason.
            if len(self.pops[pop]) > 0:
                mask = np.isin(self.samples_vcforder, self.pops[pop])
                ax.plot(x[mask], y[mask], marker='o', linestyle=' ', color=cdict[pop], label=pop, markersize=6, mec='k', mew=.5)

        ax.set_xlabel('PC%s (%.1f%%)' % (pc1+1, model.explained_variance_ratio_[pc1]*100))
        ax.set_ylabel('PC%s (%.1f%%)' % (pc2+1, model.explained_variance_ratio_[pc2]*100))

        if legend:
            ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

        if fig is not None:
            fig.tight_layout()

        if title:
            ax.set_title(title)

        if outfile:
            ## Saving is best effort: report a failure but still return the axis.
            try:
                plt.savefig(outfile, format="png", bbox_inches="tight")
            except Exception:
                print("  Saving pca.plot() failed to save figure to {}".format(outfile))

        return ax
Exemplo n.º 12
0
            'A6': sns.color_palette()[2],
            'N1': sns.color_palette()[3],
            'N4': sns.color_palette()[4],
            'N6': sns.color_palette()[5],
            'S1': sns.color_palette()[6],
            'S2': sns.color_palette()[7],
            'S5': sns.color_palette()[8]
    }


    ############

        # nests

    # PCA using SVD - LD-pruned data (59544 loci)
    # 10 components, Patterson scaling; returns (coordinates, fitted model).
    coords1var, model1var = al.pca(gnuVars, n_components=10, scaler='patterson')


    # Plot the LD-pruned PCA labelled by the 'nest' column of `ids`, using the
    # nest colours, and pass 'pca_LDprune.png' as the output filename.
    fig_pca(coords1var, model1var, 'LD-pruned PCA', pops = ids['nest'], pcols= nest_cols, filename= 'pca_LDprune.png')
    # which one is the outlier in the LD-pruned PCA?
    ##np.where(coords1var[:,0] > 200)
    ##ids.iloc[59]    # 101a_S1

    ######
    # pca without LD pruning (random subset of 100000 loci)
    # Same settings as above, applied to `gnrVars`.
    coords2var, model2var = al.pca(gnrVars, n_components=10, scaler='patterson')

    # pops
    #fig_pca(coords2var, model2var, 'Conventional PCA', pops = ids['pops'], pcols= pop_cols)

    # nests
Exemplo n.º 13
0
    subsample_nodes = [a for b in subsample_nodes
                       for a in b]  #flatten the list
    subsample_nodes = np.sort(np.array(subsample_nodes))
    # Reduce the tree sequence to the subsampled nodes, then add mutations.
    ts = ts.simplify(subsample_nodes)
    ts = msp.mutate(ts, args.mu)

    #get haplotypes and locations
    haps = ts.genotype_matrix()
    sample_inds = np.unique([ts.node(j).individual for j in ts.samples()])
    # First two location coordinates of every sampled individual.
    locs = [[ts.individual(x).location[0],
             ts.individual(x).location[1]] for x in sample_inds]

    #run a PCA
    genotype_counts = allel.HaplotypeArray(haps).to_genotypes(
        ploidy=2).to_allele_counts()
    # PCA on the per-sample count of allele 0; allel.pca returns (coords, model).
    pca = allel.pca(genotype_counts[:, :, 0])
    # One line per sample: the id twice, then the first 10 PC scores, each
    # value followed by a single space.  The context manager guarantees the
    # file is closed even if a write fails part-way through.
    with open(os.path.join(args.outdir, simname) + ".pca", "w") as pcfile:
        for i in range(args.nSamples):
            sample_id = "msp_" + str(i)
            scores = " ".join(str(pca[0][i][j]) for j in range(10))
            pcfile.write(sample_id + " " + sample_id + " " + scores + " \n")

    #write to VCF
    with open(os.path.join(args.outdir, simname) + ".vcf", "w") as vcf_file:
        ts.write_vcf(vcf_file, 2)

    #convert vcf to .ped (throwing error for opening temp files when run from command line on mac... switch to manual ped file creation?)
    sp.check_output([
        args.vcftools_path, "--vcf",
Exemplo n.º 14
0
)  #encoder.predict() returns [mean,sd,sample] for normal distributions describing sample locations in latent space, so [0] is fixed but [2] is stochastic given a set of weights.


#binning with binomial draws
def binomialBinGenotypes(pgen):
    """Return a copy of ``pgen`` with each entry replaced by a binomial draw.

    Every row of the 2-D input is treated as success probabilities for
    ``np.random.binomial`` with two trials, so each output value is 0, 1 or
    2.  The input is not modified; the result keeps the dtype of the copy.
    """
    binned = np.copy(pgen)
    n_rows = binned.shape[0]
    for row_idx in range(n_rows):
        probabilities = binned[row_idx, :]
        binned[row_idx, :] = np.random.binomial(2, probabilities)
    return binned


# Turn the generated values in `pgen` into 0/1/2 calls by binomial sampling.
bingen = binomialBinGenotypes(pgen)

#comparing PCA of real vs generated genotypes
# allel.pca returns (coords, model); both elements are used below.
realpca = allel.pca(
    np.transpose(dc) * 2, scaler="Patterson", n_components=2
)  #*2 here to rescale genotypes to 0/1/2 to match binomial(2,...) used to bin genotypes.
#genpca=allel.pca(np.transpose(bingen),scaler=None,n_components=2)[0] #run a separate PCA
genpca = realpca[1].transform(np.transpose(
    bingen))  #project generated coordinates into the "real" PC space
sampledata = pd.read_csv("data/hgdp/hgdp_sample_data.txt", sep="\t")
# Real and projected coordinates side by side: two columns each.
df = pd.DataFrame(np.hstack((realpca[0], genpca)))
df.columns = ['realPC1', 'realPC2', 'genPC1', 'genPC2']
df['sampleID'] = samples
# Attach the sample metadata by matching on the sampleID column.
df = df.merge(sampledata, on='sampleID')
df.to_csv('pca_decoder_test.csv', sep=",", index=False)

# Two side-by-side panels in a 6 x 2.75 figure.
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2)
fig.set_figwidth(6)
fig.set_figheight(2.75)
# Colour points by the integer code of each sample's 'region' value.
ax1.scatter(df['realPC1'], df['realPC2'], c=pd.factorize(df['region'])[0])
Exemplo n.º 15
0
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gn = gn.compress(loc_unlinked, axis=0)
    return gn

# LD-prune the `nAltSub` matrix: window size 200, step 50, threshold 0.1,
# repeated for 5 iterations.
gnu = ld_prune(nAltSub, size=200, step=50, threshold=.1, n_iter=5)

# Plot LD for the first 1000 rows of the pruned matrix only.
plot_ld(gnu[:1000], 'Figure 3. Pairwise LD after LD pruning.')



###############

## PCA using Singular Value Decomposition (SVD) and Patterson's scaling on LD-pruned data (see gnu.shape for dimensions)
coords1, model1 = al.pca(gnu, n_components=10, scaler='patterson')

# Distinct values of the 'pops' column of `ids`.
populations = ids['pops'].unique()
# One colour per population label, taken from the current seaborn palette.
pop_colours = {
    'A': sns.color_palette()[0],
    'N': sns.color_palette()[1],
    'S': sns.color_palette()[2],
}



def plot_pca_coords(coords, model, pc1, pc2, ax, pops, pcols):
    sns.despine(ax=ax, offset=5)
    x = coords[:, pc1]
    y = coords[:, pc2]
    for pop in pops.unique():
Exemplo n.º 16
0
    p['sd2'] = pred[1][:, 1]
    pred = p
else:
    pred = pd.DataFrame(pred[0])
    pred.columns = ['LD' + str(x + 1) for x in range(len(pred.columns))]
# Label the rows of `pred` with sample IDs and write them out tab-separated.
pred['sampleID'] = samples
pred.to_csv(out + '_latent_coords.txt', sep='\t', index=False)

# Delete the weights file unless the user asked to keep it.
# NOTE(review): shells out to `rm`, so this only works where `rm` exists.
if not save_weights:
    subprocess.check_output(['rm', out + "_weights.hdf5"])

if PCA:
    # The transpose of `dc` is what gets passed to allel.pca.
    pcdata = np.transpose(dc)
    t1 = time.time()
    print("running PCA")
    pca = allel.pca(pcdata, scaler=PCA_scaler, n_components=n_pc_axes)
    # Keep only the coordinates (first element of the result) as a DataFrame.
    pca = pd.DataFrame(pca[0])
    colnames = ['PC' + str(x + 1) for x in range(n_pc_axes)]
    pca.columns = colnames
    pca['sampleID'] = samples
    pca.to_csv(out + "_pca.txt", index=False, sep="\t")
    t2 = time.time()
    # Elapsed wall-clock time, covering both the PCA and the file write.
    pcatime = t2 - t1
    print("PCA run time: " + str(pcatime) + " seconds")

######### plots #########
#training history
#plt.switch_backend('agg')
fig = plt.figure(figsize=(3, 1.5), dpi=200)
plt.rcParams.update({'font.size': 7})
# Axes rectangle [0, 0, 1, 1] in figure coordinates, i.e. the whole figure.
ax1 = fig.add_axes([0, 0, 1, 1])
Exemplo n.º 17
0
# List the keys available in the loaded callset `muta`.
sorted(muta.keys())

# %%
# Size of the first axis of the genotype call array.
muta['calldata/GT'].shape[0]

# %%
# Wrap the raw genotype calls in a scikit-allel GenotypeArray.
gt = muta['calldata/GT']
gt = allel.GenotypeArray(gt)
len(gt)

# %%
# Convert genotype calls with to_n_alt() for use as PCA input.
gn = gt.to_n_alt()
gn

# %%
# 10-component PCA with no scaling applied (scaler=None).
coords1, model1 = allel.pca(gn, n_components=10, scaler=None)

# %%
# Tab-delimited file without a header row.
df_samples = pandas.read_csv('LL_pop.txt', delimiter='\t', header=None)
df_samples.head()

# %%
# Distinct values of the second column; presumably the population labels.
populations = df_samples.iloc[:, 1].unique()
len(populations)

# %%
pop_colours = {
    'French.alps': '#FF0000',
    'E.Greenland': '#008000',
    'Iceland': '#00FFFF',
    'W.Greenland': '#90EE90',
Exemplo n.º 18
0
# Write the `locs` array to <outdir>/<simname>_locs.txt.
np.savetxt(os.path.join(args.outdir,simname)+"_locs.txt",locs)

#run a PCA
# Haplotypes -> ploidy-2 genotypes -> per-sample allele counts.
genotype_counts=allel.HaplotypeArray(haps).to_genotypes(ploidy=2).to_allele_counts() #add arg for n pc's to keep, default is 10
#LD pruning function
def ld_prune(gn, size, step, threshold=.1, n_iter=1): #via http://alimanfoo.github.io/2015/09/28/fast-pca.html
    """Repeatedly drop variants flagged as linked by ``allel.locate_unlinked``.

    On each of ``n_iter`` passes the variants to retain are located with the
    given window ``size``, ``step`` and ``threshold``.  A progress line is
    printed and ``gn`` is compressed along axis 0 to the retained rows.
    Returns the pruned matrix; with ``n_iter`` of 0 the input is returned
    unchanged.
    """
    for pass_number in range(1, n_iter + 1):
        keep_mask = allel.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        n_kept = np.count_nonzero(keep_mask)
        n_dropped = gn.shape[0] - n_kept
        print('iteration', pass_number, 'retaining', n_kept, 'removing', n_dropped, 'variants')
        gn = gn.compress(keep_mask, axis=0)
    return gn
# LD-prune the per-sample count of allele 1 (window 200, step 100,
# threshold 0.1, one pass).
genotype_counts_pruned=ld_prune(genotype_counts[:,:,1],200,100,.1,1)

# Number of principal components to compute and write out; kept in one place
# so the PCA call and the output loop cannot drift apart.
n_pcs=10
pca=allel.pca(genotype_counts_pruned,n_components=n_pcs)
varexp=pca[1].explained_variance_ratio_
np.savetxt(os.path.join(args.outdir,simname)+".pca_var_explained",varexp) #write out proportion variance explained by PCs

# One line per sample: the id twice, then the PC scores, each value followed
# by a single space.  The context manager guarantees the file is closed even
# if a write fails part-way through.
with open(os.path.join(args.outdir,simname)+".pca","w") as pcfile:
    for i in range(args.nSamples):
        pcfile.write("msp_"+str(i)+" "+"msp_"+str(i)+" ")
        for j in range(n_pcs):
            pcfile.write(str(pca[0][i][j])+" ")
        pcfile.write("\n")

#write to VCF
with open(os.path.join(args.outdir,simname)+".vcf","w") as vcf_file:
    ts.write_vcf(vcf_file,2)