# Example #1
# 0
def test_single_snp(args):
    """Run FaST-LMM single-SNP association tests for three traits.

    Reads the phenotype table, loads the test SNPs and the SNPs passed to
    ``single_snp`` as ``K0``, and then, for each of ``trait1``, ``trait2``
    and ``trait3``:

    * writes a headerless, tab-separated ``fid  iid  <trait>`` file for the
      rows whose ``type`` column equals ``'training'``,
    * runs ``fastlmm.association.single_snp`` on it,
    * stores the returned table with ``DataFrame.to_hdf`` under the key
      ``<trait>`` in ``<output_dir>/single_snp.<trait>``,
    * optionally saves a Manhattan plot to
      ``<output_dir>/manhattan.<trait>.pdf``.

    Args:
        args: Object exposing the attributes read below:
            ``phenotype_file`` (table with at least the columns ``id``,
            ``type`` and the three trait columns), ``sample_indices_file``
            (path, or ``None`` to use the ``'training'`` rows), ``snp_file``,
            ``k0_file``, ``output_dir``, ``GB_goal`` (forwarded to
            ``single_snp``) and ``manhattan`` (truthy to create the plots).

    Returns:
        None. All results are written to ``args.output_dir``.
    """
    from pysnptools.snpreader import Pheno
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    # Select the non-interactive Agg backend before pyplot is imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: %s', args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # Boolean mask of the training rows; reused for the default sample
    # selection and for the per-trait phenotype files.
    is_training = (phenotypes['type'] == 'training').values
    # (n, 2) array holding each sample id twice, one copy per column.
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.sample_indices_file is not None:
        logger.info('read indices from file: %s', args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(is_training)[0]
    logger.info('read SNP file (for test): %s', args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): %s', args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes[is_training].copy()
    # The sample id is written as both the family id and the individual id.
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: %s', pheno_file)
        df_pheno[['fid', 'iid', trait]].to_csv(pheno_file,
                                               index=False,
                                               sep='\t',
                                               header=False)
        pheno = Pheno(pheno_file)
        logger.info('run FastLMM for single SNP test for %s', trait)
        results_df = single_snp(test_snps,
                                pheno,
                                K0=K0,
                                count_A1=True,
                                GB_goal=args.GB_goal)
        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: %s', result_file)
        # Pass the key by keyword: the positional form is deprecated in
        # recent pandas releases.
        results_df.to_hdf(result_file, key=trait)

        if args.manhattan:
            plot_file = os.path.join(args.output_dir,
                                     'manhattan.%s.pdf' % trait)
            logger.info('create Manhattan plot: %s', plot_file)
            # Start from an empty figure so traits do not overlay each other.
            plt.clf()
            # DataFrame.as_matrix() was removed in pandas 1.0; selecting the
            # columns and taking .values works on old and new pandas alike.
            plot_data = results_df[["Chr", "ChrPos", "PValue"]].values
            flutil.manhattan_plot(plot_data,
                                  pvalue_line=1e-5,
                                  xaxis_unit_bp=False)
            plt.savefig(plot_file)
# Example #2
# 0
# Example inputs, kept for reference (the script itself reads the paths from
# ``args.bed_file`` and ``args.pheno_file``):
# bed_file = "/birl2/users/cbe453/arabidopsis-association/PLINK_manipulation/Seed_Oil_Composition_maf_ge_05_Fully_Merged_391_Subset_Final"
# pheno_file = "/birl2/data/P2IRC/GE2P/GWAS/arabidopsis/arabidopsis-pheno-files/BC16_0/bioBC_FA-BC16_0_plink.pheno"

# Perform the single_snp GWAS analysis.
# By default, FaST-LMM does not generate a proper output file, so the
# output_file_name option is required. An arbitrary RAM cap of 10G was set
# based on previous tests.
results_df = single_snp(args.bed_file,
                        args.pheno_file,
                        GB_goal=10,
                        count_A1=True,
                        output_file_name=args.out_file)

# Tools for visualization if you're equipped with Xquartz (my Desktop machine
# is not...). Imported only after the analysis so that the results file is
# already written if plotting is unavailable.
import os

import matplotlib.pyplot as plt
import fastlmm.util.util as flutil
from fastlmm.util.stats.plotp import qqplot

# Draw the Manhattan plot.
# DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns and
# taking .values works on old and new pandas alike.
flutil.manhattan_plot(results_df[["Chr", "ChrPos", "PValue"]].values,
                      pvalue_line=1e-5,
                      xaxis_unit_bp=False)
plt.title(args.plot_title)
plt.savefig(args.out_file + '.png')

# Draw the QQ plot.
# Prefix only the file name with 'qq_' so the plot lands next to the other
# outputs even when args.out_file contains a directory component; for a bare
# file name the resulting path is unchanged.
qq_dir, qq_base = os.path.split(args.out_file)
qqplot(results_df["PValue"].values,
       fileout=os.path.join(qq_dir, 'qq_' + qq_base + '.png'),
       title=args.plot_title.replace('Manhattan', 'Quantile-quantile'))

# Preview of the first rows; only has a visible effect in an interactive
# session or notebook.
results_df.head()
# Load FaST-LMM basic association test:
from fastlmm.association import single_snp
from pysnptools.snpreader import Ped
from pysnptools.snpreader import Pheno
from pysnptools.snpreader import wrap_plink_parser
import numpy as np
from sys import argv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import fastlmm.util.util as flutil

# Command line: <script> PED_FILE PHENO_FILE RESULTS_FILE MANHATTAN_PLOT
if len(argv) != 5:
    # Fail with a usage message instead of an opaque unpacking ValueError.
    raise SystemExit(
        'usage: %s PED_FILE PHENO_FILE RESULTS_FILE MANHATTAN_PLOT' % argv[0])
script, inped_file, inpheno_file, results_dataframe, output_manhattan = argv

# Load snp data:
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Loading variant data...")
ped_file = Ped(inped_file)
print("Loading phenotype data...")
pheno_fn = Pheno(inpheno_file)

# Run basic association test; the results table is also written to the
# file named by ``results_dataframe``.
print("Running FaST-LMM single_snp test...")
results_df = single_snp(test_snps=ped_file,
                        pheno=pheno_fn,
                        leave_out_one_chrom=0,
                        output_file_name=results_dataframe)

# DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns and
# taking .values works on old and new pandas alike.
flutil.manhattan_plot(results_df[["Chr", "ChrPos", "PValue"]].values,
                      pvalue_line=4.4e-7,
                      xaxis_unit_bp=True)
# The script runs on the non-interactive Agg backend, where plt.show() does
# not display anything, so write the current figure to the requested file.
plt.savefig(output_manhattan)