def multiple_loci_mixed_model_gwas(phenotype_id=5, pvalue_file_prefix='mlmm_results',
                                   result_files_prefix='mlmm_manhattan', max_num_steps=10,
                                   snp_priors=None):
    """
    Perform multiple loci mixed model GWAS for flowering time (phenotype_id=5
    in the phenotype file) in plants grown under 10C conditions.

    Parameters:
        phenotype_id -- ID of the phenotype; used both to coordinate the
            genotypes with the phenotypes and to fetch the phenotype values.
        pvalue_file_prefix -- forwarded to lm.mlmm as pval_file_prefix
            (the call is made with save_pvals=True).
        result_files_prefix -- forwarded to lm.mlmm as file_prefix.
        max_num_steps -- forwarded to lm.mlmm as num_steps.
        snp_priors -- forwarded to lm.mlmm unchanged.

    Returns:
        Whatever lm.mlmm returns.
    """
    import linear_models as lm
    import kinship

    # Load genotypes
    sd = load_a_thaliana_genotypes()

    # Load phenotypes
    phend = load_a_thaliana_phenotypes()

    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and
    # phenotypes, leaving only accessions (individuals) which overlap between both,
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS)
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform multiple loci mixed model GWAS.
    # The p-value files get their own prefix (pvalue_file_prefix); previously
    # result_files_prefix was passed here and pvalue_file_prefix was ignored.
    return lm.mlmm(phend.get_values(phenotype_id), K, sd=sd,
                   num_steps=max_num_steps, file_prefix=result_files_prefix,
                   save_pvals=True, pval_file_prefix=pvalue_file_prefix,
                   snp_priors=snp_priors)
def get_ibs_kinship_matrix(self, debug_filter=1, snp_dtype='int8', dtype='single', chunk_size=None):
    """
    Compute the IBS kinship matrix (un-scaled) for this object.

    The computation is delegated to kinship.calc_ibs_kinship, which is handed
    this object together with chunk_size.  The debug_filter, snp_dtype and
    dtype arguments are accepted but not used by this method.

    Currently it works only for binary kinship matrices.
    """
    log.debug('Starting kinship calculation')
    ibs_matrix = kinship.calc_ibs_kinship(self, chunk_size=chunk_size)
    return ibs_matrix
def mixed_model_gwas(phenotype_id=5, pvalue_file='mm_results.pvals',
                     manhattan_plot_file='mm_manhattan.png',
                     qq_plot_file_prefix='mm_qq'):
    """
    Run a mixed model (EMMAX) GWAS for flowering time (phenotype_id=5 in the
    phenotype file) in plants grown under 10C conditions.

    The p-values are written to pvalue_file, a Manhattan plot is saved to
    manhattan_plot_file and a QQ-plot is produced using qq_plot_file_prefix.
    Nothing is returned.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr

    # Genotype and phenotype data sets.
    genotypes = load_a_thaliana_genotypes()
    phenotype_data = load_a_thaliana_phenotypes()

    # Keep only accessions (individuals) present in both data sets, and only
    # the SNPs that remain polymorphic in that subset.
    genotypes.coordinate_w_phenotype_data(phenotype_data, phenotype_id)

    # IBS kinship from the coordinated SNPs.
    kinship_matrix = kinship.calc_ibs_kinship(genotypes.get_snps())

    # EMMAX scan.
    snps = genotypes.get_snps()
    trait_values = phenotype_data.get_values(phenotype_id)
    emmax_output = lm.emmax(snps, trait_values, kinship_matrix)

    # Wrap the p-values in a results object, then write them out and plot.
    gwas_result = gr.Result(scores=emmax_output['ps'], snps_data=genotypes)
    gwas_result.write_to_file(pvalue_file)
    gwas_result.plot_manhattan(png_file=manhattan_plot_file, percentile=90,
                               plot_bonferroni=True, neg_log_transform=True)
    gwas_result.plot_qq(qq_plot_file_prefix)
def lotus_mixed_model_gwas(phenotype_id=4,
                           phen_file='/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv',
                           gt_file='/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv',
                           pvalue_file='mm_results.pvals',
                           manhattan_plot_file='mm_manhattan.png',
                           qq_plot_file_prefix='mm_qq'):
    """
    Run a mixed model (EMMAX) GWAS on the Lotus data.

    Genotypes are parsed from gt_file and phenotypes from phen_file.  The
    p-values are written to pvalue_file, a Manhattan plot is saved to
    manhattan_plot_file and a QQ-plot is produced using qq_plot_file_prefix.
    Nothing is returned.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp

    # Genotypes.
    genotypes = dp.parse_snp_data(gt_file)

    # Phenotypes.
    import phenotypeData as phenotype_module
    phenotype_data = phenotype_module.parse_phenotype_file(phen_file, with_db_ids=False)

    # Keep only accessions (individuals) present in both data sets, and only
    # the SNPs that remain polymorphic in that subset.
    genotypes.coordinate_w_phenotype_data(phenotype_data, phenotype_id)

    # IBS kinship from the coordinated SNPs.
    kinship_matrix = kinship.calc_ibs_kinship(genotypes.get_snps())

    # EMMAX scan.
    snps = genotypes.get_snps()
    trait_values = phenotype_data.get_values(phenotype_id)
    emmax_output = lm.emmax(snps, trait_values, kinship_matrix)

    # Wrap the p-values in a results object, then write them out and plot.
    gwas_result = gr.Result(scores=emmax_output['ps'], snps_data=genotypes)
    gwas_result.write_to_file(pvalue_file)
    gwas_result.plot_manhattan(png_file=manhattan_plot_file, percentile=90,
                               plot_bonferroni=True, neg_log_transform=True)
    gwas_result.plot_qq(qq_plot_file_prefix)
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits (4 phenotypes x 2 environments).

    For every phenotype/environment pair this coordinates genotypes and
    phenotypes, computes a kinship matrix, runs an EMMAX F-test, prints the
    mean p-value and the elapsed time, and saves a QQ-plot and a Manhattan
    plot under /Users/bjarnivilhjalmsson/data/tmp/.

    Parameters:
        kinship_type -- 'ibs' or 'ibd'; selects kinship.calc_ibs_kinship or
            kinship.calc_ibd_kinship.
        phen_type -- 'means' or 'medians'; selects the 'Y_means' or
            'Y_medians' entry of the coordinated data.

    Raises:
        NotImplementedError -- if kinship_type or phen_type is not one of the
            supported values.  The check is done up front, before any module
            is imported or any data is loaded.
    """
    # Fail fast on unsupported options.  (The original raised the
    # `NotImplemented` constant here, which is not an exception class.)
    if kinship_type not in ('ibs', 'ibd'):
        raise NotImplementedError(
            "Unsupported kinship_type %r (expected 'ibs' or 'ibd')" % (kinship_type,))
    if phen_type not in ('means', 'medians'):
        raise NotImplementedError(
            "Unsupported phen_type %r (expected 'means' or 'medians')" % (phen_type,))

    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    # Chromosomes drawn in cyan; every other chromosome is drawn in magenta.
    cyan_chromosomes = ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']

    for phenotype in phenotypes:
        for env in envs:
            print('%s %s' % (phenotype, env))
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)

            print('Calculating kinship')
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            else:  # 'ibd' (validated above)
                K = kinship.calc_ibd_kinship(d['snps'])

            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            else:  # 'medians' (validated above)
                lmm = lm.LinearMixedModel(d['Y_medians'])
            lmm.add_random_effect(K)

            print('Running EMMAX')
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            secs = time.time() - s1
            if secs > 60:
                # Floor division keeps the minute count an integer.
                mins = int(secs) // 60
                secs = secs - mins * 60
                print('Took %d mins and %f seconds.' % (mins, secs))
            else:
                print('Took %f seconds.' % (secs,))

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']], result_labels=[label_str], line_colors=['green'],
                num_dots=1000, title=None, max_neg_log_val=6)

            # Build the Manhattan plot coordinates, one slice per chromosome.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            unique_chromosomes = sp.unique(chromosomes)
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in unique_chromosomes:
                colors.append('c' if chrom in cyan_chromosomes else 'm')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                # Offset each chromosome by the accumulated maxima of the
                # previous ones so they are laid out side by side.
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                y_log_pvals.append(-sp.log10(res['ps'][chrom_filter]))

            m = len(positions)
            # Threshold line drawn at -log10(1 / (20 * m)).
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Plot manhattan plots
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                plt.plot(xs, ys, c=color, ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k', linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png'
                % (kinship_type, phenotype, env, phen_type))
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Run a simple MLM GWAS for the 8 traits (4 phenotypes x 2 environments).

    For each phenotype/environment pair: coordinate genotypes with
    phenotypes, compute the kinship matrix selected by kinship_type ('ibs'
    or 'ibd'), fit a linear mixed model on the values selected by phen_type
    ('means' or 'medians'), run the EMMAX F-test, then save a QQ-plot and a
    Manhattan plot.  Raises NotImplementedError for any other kinship_type
    or phen_type once the first pair is being processed.
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    for phenotype in ['Protein', 'Sugar', 'Triglyceride', 'weight']:
        for env in ['mated', 'virgin']:
            print('%s %s' % (phenotype, env))
            start_time = time.time()
            data = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)

            print('Calculating kinship')
            if kinship_type == 'ibd':
                kinship_matrix = kinship.calc_ibd_kinship(data['snps'])
            elif kinship_type == 'ibs':
                kinship_matrix = kinship.calc_ibs_kinship(data['snps'])
            else:
                raise NotImplementedError

            if phen_type == 'medians':
                mixed_model = lm.LinearMixedModel(data['Y_medians'])
            elif phen_type == 'means':
                mixed_model = lm.LinearMixedModel(data['Y_means'])
            else:
                raise NotImplementedError
            mixed_model.add_random_effect(kinship_matrix)

            print('Running EMMAX')
            res = mixed_model.emmax_f_test(data['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            # Report the elapsed time, split into minutes and seconds when
            # it exceeds a minute.
            elapsed = time.time() - start_time
            if elapsed > 60:
                whole_minutes = int(elapsed) // 60
                elapsed = elapsed - whole_minutes * 60
                print('Took %d mins and %f seconds.' % (whole_minutes, elapsed))
            else:
                print('Took %f seconds.' % (elapsed,))

            # QQ-plot of the p-values.
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            qq_path = '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str)
            agr.plot_simple_qqplots_pvals(
                qq_path, [res['ps']], result_labels=[label_str],
                line_colors=['green'], num_dots=1000, title=None,
                max_neg_log_val=6)

            # Manhattan plot coordinates, one entry per chromosome.
            chromosomes = data['positions'][:, 0]
            positions = sp.array(data['positions'][:, 1], 'int32')
            chromosome_names = sp.unique(chromosomes)

            x_positions = []
            y_log_pvals = []
            colors = []
            offset = 0
            for chrom in chromosome_names:
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:
                    # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    # Toss U and Hets
                    colors.append('m')

                mask = sp.in1d(chromosomes, chrom)
                chrom_positions = positions[mask]
                x_positions.append(chrom_positions + offset)
                offset += chrom_positions.max()
                y_log_pvals.append(-sp.log10(res['ps'][mask]))

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Draw and save the Manhattan plot.
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                plt.plot(xs, ys, c=color, ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k',
                       linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                        (kinship_type, phenotype, env, phen_type))