Пример #1
0
def generate_pca_from_snps(snps,
                           sample_population=None,
                           title="",
                           pop_colors=None,
                           plot_pca3=False):
    """Run randomized PCA on ``snps`` and draw the result as scatter panels.

    The PCA is computed with ``allel.randomized_pca(snps, scaler=None)`` and
    every panel is drawn by ``plot_pca_coords``.

    Parameters
    ----------
    snps
        Input matrix handed to ``allel.randomized_pca``. Its second axis
        must have one entry per element of ``sample_population``.
    sample_population
        Optional per-sample labels, forwarded to ``plot_pca_coords``
        (used there to colour the dots, per the original author's note).
    title
        Figure-level title.
    pop_colors
        Forwarded unchanged to ``plot_pca_coords``.
    plot_pca3
        When true, draw two side-by-side panels (index pairs 0/1 and 2/3,
        presumably principal-component indices); otherwise a single 0/1
        panel.

    Returns
    -------
    None
        The figure is created through ``plt`` and is not returned.
    """
    import allel

    # Labels, when supplied, have to line up with the second axis of ``snps``.
    if sample_population is not None:
        assert sample_population.shape[0] == snps.shape[1]

    coords, model = allel.randomized_pca(snps, scaler=None)

    # Each panel: (add_subplot position, first index, second index).
    if plot_pca3:
        figure_size = (10, 5)
        panels = [((1, 2, 1), 0, 1), ((1, 2, 2), 2, 3)]
    else:
        figure_size = (8, 5)
        panels = [((111,), 0, 1)]

    fig = plt.figure(figsize=figure_size)
    for position, first_pc, second_pc in panels:
        ax = fig.add_subplot(*position)
        plot_pca_coords(coords, model, first_pc, second_pc, ax,
                        sample_population, pop_colors)

    # The legend is attached to the last panel drawn, outside its right edge.
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    fig.suptitle(title, y=1.05)
    fig.tight_layout()
def apply_pca(g, outfile, seed, n, s):
    """Project ``g`` with randomized PCA and write the coordinates to disk.

    Args:
        g: Input data passed as-is to ``allel.randomized_pca``.
        outfile: Destination path for the coordinate matrix
            (written with ``np.savetxt``).
        seed: Value forwarded as ``random_state``.
        n: Number of principal components to compute.
        s: Scaler argument forwarded to ``allel.randomized_pca``.

    Returns:
        None. The coordinates are only saved, and the fitted model is
        discarded.
    """
    pca_options = {
        "n_components": n,
        "scaler": s,
        "random_state": seed,
    }
    low_dim_coords, _model = allel.randomized_pca(g, **pca_options)
    logging.info(
        f"Applied PCA with {n} components with scaler {s} and seed {seed}")

    np.savetxt(outfile, low_dim_coords)
    logging.info(f"Saved coordinates to {outfile}")
Пример #3
0
    def _benchmark_pca(self, gt):
        """Time each stage of a PCA workflow on the genotype data ``gt``.

        Stages, each wrapped in ``self.benchmark_profiler`` start/end calls:
        allele counting, counting multiallelic SNPs, counting biallelic
        singletons, filtering those variants out, conversion via
        ``to_n_alt()``, optional LD pruning, conventional PCA (``allel.pca``)
        and randomized PCA (``allel.randomized_pca``). A random variant
        subset may be drawn in between (not timed).

        All intermediate and final results are deleted; the method returns
        ``None`` and exists only for its timing side effects.

        Behavior is driven by ``self.bench_conf`` (``genotype_array_type``,
        ``pca_subset_size``, ``pca_ld_*``, ``pca_number_components``,
        ``pca_data_scaler``).

        :param gt: genotype data supporting ``count_alleles()``, ``take()``,
            ``compress()`` and ``to_n_alt()``; in Dask mode ``gt.values.chunksize``
            is also read. NOTE(review): presumably a scikit-allel genotype
            array type -- confirm against the caller.
        """
        # Count alleles at each variant
        self.benchmark_profiler.start_benchmark('PCA: Count alleles')
        ac = gt.count_alleles()
        self.benchmark_profiler.end_benchmark()

        # Count number of multiallelic SNPs
        self.benchmark_profiler.start_benchmark('PCA: Count multiallelic SNPs')
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Dask path: compute() is called so the work happens inside the timed region
            num_multiallelic_snps = da.count_nonzero(
                ac.max_allele() > 1).compute()
        else:
            num_multiallelic_snps = np.count_nonzero(ac.max_allele() > 1)
        self.benchmark_profiler.end_benchmark()
        # The count itself is not used; only the timing matters
        del num_multiallelic_snps

        # Count number of biallelic singletons
        self.benchmark_profiler.start_benchmark(
            'PCA: Count biallelic singletons')
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            num_biallelic_singletons = da.count_nonzero(
                (ac.max_allele() == 1) & ac.is_singleton(1)).compute()
        else:
            num_biallelic_singletons = np.count_nonzero((ac.max_allele() == 1)
                                                        & ac.is_singleton(1))
        self.benchmark_profiler.end_benchmark()
        # The count itself is not used; only the timing matters
        del num_biallelic_singletons

        # Apply filtering to remove singletons and multiallelic SNPs
        # Keep variants whose highest allele index is 1 and whose first two
        # allele counts are both greater than 1.
        # Note: the mask and its non-zero count are built BEFORE the timer
        # starts, so only the take()/compress() step below is timed.
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        flt_count = np.count_nonzero(flt)
        self.benchmark_profiler.start_benchmark(
            'PCA: Remove singletons and multiallelic SNPs')
        if flt_count > 0:
            if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
                # Dask path selects rows by integer index instead of a boolean mask
                gf = gt.take(np.flatnonzero(flt), axis=0)
            else:
                gf = gt.compress(condition=flt, axis=0)
        else:
            # Don't apply filtering
            print(
                '[Exec][PCA] Cannot remove singletons and multiallelic SNPs as no data would remain. Skipping...'
            )
            gf = gt
        self.benchmark_profiler.end_benchmark()
        del ac, flt, flt_count

        # Transform genotype data into 2-dim matrix
        self.benchmark_profiler.start_benchmark(
            'PCA: Transform genotype data for PCA')
        gn = gf.to_n_alt()
        self.benchmark_profiler.end_benchmark()
        del gf

        # Randomly choose subset of SNPs
        # (this step is not wrapped in a benchmark; -1 means "use everything")
        if self.bench_conf.pca_subset_size == -1:
            print('[Exec][PCA] Including all ({}) variants for PCA.'.format(
                gn.shape[0]))
            gnr = gn
        else:
            # Never request more variants than are available
            n = min(gn.shape[0], self.bench_conf.pca_subset_size)
            print(
                '[Exec][PCA] Including {} random variants for PCA.'.format(n))
            # Sample row indices without replacement and sort them ascending
            vidx = np.random.choice(gn.shape[0], n, replace=False)
            vidx.sort()
            if self.bench_conf.genotype_array_type in [
                    config.GENOTYPE_ARRAY_NORMAL, config.GENOTYPE_ARRAY_CHUNKED
            ]:
                gnr = gn.take(vidx, axis=0)
            elif self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
                gnr = gn[
                    vidx]  # Use indexing workaround since Dask Array's take() method is not working properly
            else:
                # Unknown array type: report and terminate the process with status 1
                print(
                    '[Exec][PCA] Error: Unspecified genotype array type specified.'
                )
                exit(1)
            del vidx

        # LD pruning is applied only when enabled AND the array is not Dask-backed;
        # in every other case the (sub)sampled matrix is passed through unchanged.
        if self.bench_conf.pca_ld_enabled:
            if self.bench_conf.genotype_array_type != config.GENOTYPE_ARRAY_DASK:
                # Apply LD pruning to subset of SNPs
                size = self.bench_conf.pca_ld_pruning_size
                step = self.bench_conf.pca_ld_pruning_step
                threshold = self.bench_conf.pca_ld_pruning_threshold
                n_iter = self.bench_conf.pca_ld_pruning_number_iterations

                self.benchmark_profiler.start_benchmark(
                    'PCA: Apply LD pruning')
                gnu = self._pca_ld_prune(gnr,
                                         size=size,
                                         step=step,
                                         threshold=threshold,
                                         n_iter=n_iter)
                self.benchmark_profiler.end_benchmark()
            else:
                print(
                    '[Exec][PCA] Cannot apply LD pruning because Dask genotype arrays do not support this operation.'
                )
                gnu = gnr
        else:
            print('[Exec][PCA] LD pruning disabled. Skipping this operation.')
            gnu = gnr

        # Run PCA analysis
        pca_num_components = self.bench_conf.pca_number_components
        scaler = self.bench_conf.pca_data_scaler

        # The rechunk below happens outside the timed region
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Rechunk Dask array to work with Dask's svd function (single chunk for transposed column)
            gnu_pca_conv = gnu.rechunk({0: -1, 1: gt.values.chunksize[1]})
        else:
            gnu_pca_conv = gnu

        # Run conventional PCA analysis
        self.benchmark_profiler.start_benchmark(
            'PCA: Run conventional PCA analysis (scaler: {})'.format(
                scaler if scaler is not None else 'none'))
        coords, model = allel.pca(gnu_pca_conv,
                                  n_components=pca_num_components,
                                  scaler=scaler)
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Force evaluation inside the timed region; the computed value is discarded
            coords.compute()
        self.benchmark_profiler.end_benchmark()
        del gnu_pca_conv, coords, model

        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Rechunk Dask array to match original genotype chunk size
            gnu_pca_rand = gnu.rechunk(
                (gt.values.chunksize[0], gt.values.chunksize[1]))
        else:
            gnu_pca_rand = gnu

        # Run randomized PCA analysis
        self.benchmark_profiler.start_benchmark(
            'PCA: Run randomized PCA analysis (scaler: {})'.format(
                scaler if scaler is not None else 'none'))
        coords, model = allel.randomized_pca(gnu_pca_rand,
                                             n_components=pca_num_components,
                                             scaler=scaler)
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            # Force evaluation inside the timed region; the computed value is discarded
            coords.compute()
        self.benchmark_profiler.end_benchmark()
        del gnu_pca_rand, coords, model
Пример #4
0
    fig_pca(coords2allVars, model2allVars, 'Conventional PCA without LD pruning', pops= ids['nest'], pcols= nest_cols, filename= 'pca_all.png')


    # pca with LD pruning, without Patterson's scaling
    coords3vars, model3vars = al.pca(gnuVars, n_components=10, scaler=None)


    # pops
    #fig_pca(coords3vars, model3vars, 'Conventional PCA LD-pruned variants without variance scaling', pops = ids['pops'], pcols= pop_cols)

    # nests
    fig_pca(coords3vars, model3vars, 'Conventional PCA LD-pruned variants without variance scaling.', pops = ids['nest'], pcols= nest_cols, filename= 'pca_LDprune_noPatterson.png')


    # randomized PCA with LD pruning
    coords5vars, model5vars = al.randomized_pca(gnuVars, n_components=10, scaler='patterson')


    # pops
    #fig_pca(coords5vars, model5vars, 'Randomized PCA', pops= ids['pops'], pcols= pop_cols)

    # nests
    fig_pca(coords5vars, model5vars, 'Randomized PCA LD-pruned variants', pops= ids['nest'], pcols= nest_cols, filename= 'pca_LDprune_rand.png')


    plotHeatPCs(coords1var, ids['nest'], PCs=5, filename= 'pca_LDprune_Heat.png')
    plotHeatPCs(coords2allVars, ids['nest'], PCs=5, filename= 'pca_all_Heat.png')

    ## get the Eigen values for PCAs

    # for all (segreg.) vars
Пример #5
0
## Conventional PCA on `gnr` (per the original note: the random 100k-locus
## subset), no LD pruning, Patterson's scaling.
coords2, model2 = al.pca(
    gnr,
    n_components=10,
    scaler='patterson',
)
fig_pca(
    coords2,
    model2,
    'Figure 5. Conventional PCA without LD pruning.',
    pops=ids['pops'],
    pcols=pop_colours,
)

## Same analysis on `nAltSub` (per the original note: the full set, gtseg),
## again with Patterson's scaling.
#   NOTE: probably too heavy to run on a laptop.
coords2all, model2all = al.pca(
    nAltSub,
    n_components=10,
    scaler='patterson',
)
fig_pca(
    coords2all,
    model2all,
    'Conventional PCA without LD pruning.',
    pops=ids['pops'],
    pcols=pop_colours,
)

## Conventional PCA on the LD-pruned data (`gnu`) with scaling switched off.
coords3, model3 = al.pca(
    gnu,
    n_components=10,
    scaler=None,
)
fig_pca(
    coords3,
    model3,
    'Figure 6. Conventional PCA without variance scaling.',
    pops=ids['pops'],
    pcols=pop_colours,
)

## Randomized PCA on the LD-pruned data (`gnu`) with Patterson's scaling.
coords5, model5 = al.randomized_pca(
    gnu,
    n_components=10,
    scaler='patterson',
)
fig_pca(
    coords5,
    model5,
    'Figure 8. Randomized PCA.',
    pops=ids['pops'],
    pcols=pop_colours,
)

## pca with even sample sizes NOTE: not really needed here, see alimanfoo's Fast PCA site
# (https://alimanfoo.github.io/2015/09/28/fast-pca.html)
# also: see alimanfoo's Fast-PCA post on an evaluation of the lower PCs in randomized PCA



## plot a heatmap where the color bar basically represents the correlation between the individuals and the respective principal component.
# NOTE: Comparisons across PCs are somewhat meaningless here, as the color only shows strength of correlation within a PC, not across PCs.


def plotHeatPCs(coords, ids, PCs=4):
    df = pd.DataFrame(coords[:,0:PCs].T, columns=ids, index= range(1,PCs+1))
    plt.subplots(figsize= (20,5))