def _test_():
    """
    Compare permutation-based EMMAX thresholds for simulated common,
    singleton and doubleton SNPs.

    Three simulated genotype sets are stacked (common first, then singletons,
    then doubletons), each row of the stacked matrix is centred and scaled,
    and 30 traits are simulated from it.  For every trait the function prints
    the REML pseudo-heritability, saves a phenotype histogram and QQ-plots
    under $HOME/tmp/test_<i>*, and runs 1000 EMMAX permutations per SNP class.
    The mean and standard deviation of the collected 5% thresholds are printed
    at the end.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)
    #Standardise every row of the stacked matrix (transpose, normalise columns, transpose back).
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print('%s %s' % (snps, snps.shape))

    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(snps, hdf5_file_prefix=file_prefix,
                                                          num_traits=30, p=0.1)

    #The kinship matrix depends only on the genotypes, so it is computed once
    #instead of once per simulated trait.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    #Nominal Bonferroni-style reference printed next to each permutation threshold.
    nominal_threshold = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)

        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        #Close (not just clear) the figure so that 30 figures do not pile up in memory.
        plt.close()

        #The slices follow the stacking order used above.
        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)

        #Cholesky permutations..
        for class_snps, thresholds in ((singleton_snps, singletons_thres),
                                       (doubleton_snps, doubletons_thres),
                                       (common_snps, common_thres)):
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print('%s %s' % (nominal_threshold, res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

        #ATT permutations (Implement)

        #PC permutations (Implement)

    print('%s %s' % (sp.mean(singletons_thres), sp.std(singletons_thres)))
    print('%s %s' % (sp.mean(doubletons_thres), sp.std(doubletons_thres)))
    print('%s %s' % (sp.mean(common_thres), sp.std(common_thres)))
def _test_scz_(file_prefix='/home/bv25/tmp/test'):
    """
    Compare permutation-based EMMAX thresholds for simulated common,
    singleton and doubleton SNPs (unstandardised genotypes, p=1.0).

    NOTE(review): despite the function name, no schizophrenia data is loaded
    here; all genotypes and traits are simulated.

    file_prefix: prefix for the simulated-trait file and for every plot that
        is written; defaults to the path that used to be hard-coded.

    For each of the 30 simulated traits the function prints the REML
    pseudo-heritability, saves a phenotype histogram and QQ-plots, and runs
    1000 EMMAX permutations per SNP class.  The mean and standard deviation of
    the collected 5% thresholds are printed at the end.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    #Stacking order (common, singletons, doubletons) is relied upon by the p-value slices below.
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)

    phen_list = phenotypes.simulate_traits(snps, hdf5_file_prefix=file_prefix, num_traits=30, p=1.0)

    #The kinship matrix depends only on the genotypes, so it is computed once
    #instead of once per simulated trait.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    #Nominal Bonferroni-style reference printed next to each permutation threshold.
    nominal_threshold = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)

        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        #Close (not just clear) the figure so that 30 figures do not pile up in memory.
        plt.close()

        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)

        #Now permutations..
        for class_snps, thresholds in ((singleton_snps, singletons_thres),
                                       (doubleton_snps, doubletons_thres),
                                       (common_snps, common_thres)):
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print('%s %s' % (nominal_threshold, res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

    print('%s %s' % (sp.mean(singletons_thres), sp.std(singletons_thres)))
    print('%s %s' % (sp.mean(doubletons_thres), sp.std(doubletons_thres)))
    print('%s %s' % (sp.mean(common_thres), sp.std(common_thres)))
def _test_scz_(file_prefix='/home/bv25/tmp/test'):
    """
    Compare permutation-based EMMAX thresholds for simulated common,
    singleton and doubleton SNPs (unstandardised genotypes, p=1.0).

    NOTE(review): despite the function name, no schizophrenia data is loaded
    here; all genotypes and traits are simulated.

    file_prefix: prefix for the simulated-trait file and for every plot that
        is written; defaults to the path that used to be hard-coded.

    For each of the 30 simulated traits the function prints the REML
    pseudo-heritability, saves a phenotype histogram and QQ-plots, and runs
    1000 EMMAX permutations per SNP class.  The mean and standard deviation of
    the collected 5% thresholds are printed at the end.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    #Stacking order (common, singletons, doubletons) is relied upon by the p-value slices below.
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)

    phen_list = phenotypes.simulate_traits(snps, hdf5_file_prefix=file_prefix, num_traits=30, p=1.0)

    #The kinship matrix depends only on the genotypes, so it is computed once
    #instead of once per simulated trait.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    #Nominal Bonferroni-style reference printed next to each permutation threshold.
    nominal_threshold = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)

        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        #Close (not just clear) the figure so that 30 figures do not pile up in memory.
        plt.close()

        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)

        #Now permutations..
        for class_snps, thresholds in ((singleton_snps, singletons_thres),
                                       (doubleton_snps, doubletons_thres),
                                       (common_snps, common_thres)):
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print('%s %s' % (nominal_threshold, res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

    print('%s %s' % (sp.mean(singletons_thres), sp.std(singletons_thres)))
    print('%s %s' % (sp.mean(doubletons_thres), sp.std(doubletons_thres)))
    print('%s %s' % (sp.mean(common_thres), sp.std(common_thres)))
def mixed_model_gwas(phenotype_id=5,
                     pvalue_file='mm_results.pvals',
                     manhattan_plot_file='mm_manhattan.png',
                     qq_plot_file_prefix='mm_qq'):
    """
    Run an EMMAX mixed-model GWAS on the A. thaliana example data.

    The default phenotype (phenotype_id=5 in the phenotype file) is flowering
    time for plants grown under 10C conditions.

    phenotype_id: id of the phenotype to map.
    pvalue_file: file the p-values are written to.
    manhattan_plot_file: PNG file for the Manhattan plot.
    qq_plot_file_prefix: file prefix handed to the QQ-plot routine.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr

    #Genotypes and phenotypes
    genotype_data = load_a_thaliana_genotypes()
    phenotype_data = load_a_thaliana_phenotypes()

    #Keep only the accessions (individuals) present in both data sets, and only the
    #SNPs that are still polymorphic in that subset.
    genotype_data.coordinate_w_phenotype_data(phenotype_data, phenotype_id)

    #IBS kinship matrix
    ibs_kinship = kinship.calc_ibs_kinship(genotype_data.get_snps())

    #Mixed model scan
    scan_snps = genotype_data.get_snps()
    trait_values = phenotype_data.get_values(phenotype_id)
    emmax_output = lm.emmax(scan_snps, trait_values, ibs_kinship)

    #Wrap the p-values in a result object, then write them out and plot them.
    gwas_result = gr.Result(scores=emmax_output['ps'], snps_data=genotype_data)
    gwas_result.write_to_file(pvalue_file)
    gwas_result.plot_manhattan(png_file=manhattan_plot_file,
                               percentile=90,
                               plot_bonferroni=True,
                               neg_log_transform=True)
    gwas_result.plot_qq(qq_plot_file_prefix)
def lotus_mixed_model_gwas(phenotype_id=4,
                           phen_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv',
                           gt_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv',
                           pvalue_file='mm_results.pvals',
                           manhattan_plot_file='mm_manhattan.png',
                           qq_plot_file_prefix='mm_qq'):
    """
    Run an EMMAX mixed-model GWAS on the Lotus data.

    phenotype_id: id of the phenotype to map.
    phen_file: phenotype file to parse.
    gt_file: genotype (SNP) file to parse.
    pvalue_file: file the p-values are written to.
    manhattan_plot_file: PNG file for the Manhattan plot.
    qq_plot_file_prefix: file prefix handed to the QQ-plot routine.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp

    #Genotypes
    genotype_data = dp.parse_snp_data(gt_file)

    #Phenotypes
    import phenotypeData as pd
    phenotype_data = pd.parse_phenotype_file(phen_file, with_db_ids=False)

    #Keep only the accessions (individuals) present in both data sets, and only the
    #SNPs that are still polymorphic in that subset.
    genotype_data.coordinate_w_phenotype_data(phenotype_data, phenotype_id)

    #IBS kinship matrix
    ibs_kinship = kinship.calc_ibs_kinship(genotype_data.get_snps())

    #Mixed model scan
    scan_snps = genotype_data.get_snps()
    trait_values = phenotype_data.get_values(phenotype_id)
    emmax_output = lm.emmax(scan_snps, trait_values, ibs_kinship)

    #Wrap the p-values in a result object, then write them out and plot them.
    gwas_result = gr.Result(scores=emmax_output['ps'], snps_data=genotype_data)
    gwas_result.write_to_file(pvalue_file)
    gwas_result.plot_manhattan(png_file=manhattan_plot_file,
                               percentile=90,
                               plot_bonferroni=True,
                               neg_log_transform=True)
    gwas_result.plot_qq(qq_plot_file_prefix)
def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict):
    """
    Map one phenotype with the requested method and write the result to file.

    p_i: id of the phenotype in phed.
    phed: phenotype data object; a deep copy is used, so the caller's object
        is left untouched.
    mapping_method: one of 'kw', 'ft' (raises NotImplementedError), 'emma',
        'emmax', 'emmax_step', 'emmax_anova', 'loc_glob_mm', 'lm', 'lm_step'
        or 'lm_anova'.  Any other value prints a message and returns.
    trans_method: transformation name; 'most_normal' is resolved to the
        transformation chosen by phed.most_normal_transformation.
    p_dict: run options.  Keys read here: run_id, remove_outliers,
        with_replicates, call_method_id, use_existing_results, region_plots,
        cand_genes_file, kinship_file, data_file, kinship_type, data_format,
        local_gwas, loc_glob_ws, emmax_perm, mac_threshold, with_betas,
        emmax_emma_num, num_steps, save_stepw_pvals, pvalue_filter,
        add_to_db and with_db_ids.

    Returns None.  The 'loc_glob_mm', 'emmax_step' and 'lm_step' methods write
    their own output and return before the generic result file is written.
    """
    import copy
    phed = copy.deepcopy(phed)
    phenotype_name = phed.get_name(p_i)
    phen_is_binary = phed.is_binary(p_i)
    if trans_method == 'most_normal':
        trans_method, shapiro_pval = phed.most_normal_transformation(p_i, perform_trans=False)
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name,
                                    mapping_method, trans_method, p_dict['remove_outliers'],
                                    p_dict['with_replicates'], p_dict['call_method_id'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)
    emmax_perm_threshold = None
    k = None

    res = None
    #Check whether result already exists.
    if p_dict['use_existing_results']:
        if p_dict['region_plots']:
            sd = _get_genotype_data_(p_dict)
            #prepare_data returns a pair (it is unpacked the same way further down).  Binding
            #the pair to a single name made the former "assert num_outliers != 0" a no-op, so
            #the count is unpacked and reported in the same non-fatal way as in the main path.
            num_outliers, _ = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'],
                                           p_dict['with_replicates'])
            if p_dict['remove_outliers'] and num_outliers == 0:
                print("No outliers were removed!")
            snps = sd.getSnps()
        else:
            snps = None

        print("\nChecking for existing results.")
        result_file = file_prefix + ".pvals"
        if os.path.isfile(result_file):
            res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
        else:
            result_file = file_prefix + ".scores"
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
        if res:
            print("Found existing results.. (%s)" % (result_file))
        sys.stdout.flush()

    #Loading candidate genes
    cand_genes = None
    tair_ids = None
    if p_dict['cand_genes_file']:
        cand_genes, tair_ids = gwaResults.load_cand_genes_file(p_dict['cand_genes_file'])

    if not res:  #If results weren't found in a file... then do GWA.
        #Loading data
        sd = _get_genotype_data_(p_dict)
        num_outliers, n_filtered_snps = prepare_data(sd, phed, p_i, trans_method,
                                                     p_dict['remove_outliers'],
                                                     p_dict['with_replicates'])

        #Do we need to calculate the K-matrix?
        if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm']:
            #Load genotype file (in binary format)
            sys.stdout.write("Retrieving the Kinship matrix K.\n")
            sys.stdout.flush()
            if p_dict['kinship_file']:  #Kinship file was supplied..
                print('Loading supplied kinship file: %s' % p_dict['kinship_file'])
                k = kinship.load_kinship_from_file(p_dict['kinship_file'], sd.accessions)
            else:
                print('Loading kinship file.')
                if p_dict['data_file'] is not None:
                    #NOTE(review): k stays None for any kinship_type other than 'ibs'/'ibd'.
                    if p_dict['kinship_type'] == 'ibs':
                        k = sd.get_ibs_kinship_matrix()
                    elif p_dict['kinship_type'] == 'ibd':
                        k = sd.get_ibd_kinship_matrix()
                else:
                    k = kinship.get_kinship(call_method_id=p_dict['call_method_id'],
                                            data_format=p_dict['data_format'],
                                            method=p_dict['kinship_type'],
                                            n_removed_snps=n_filtered_snps,
                                            remain_accessions=sd.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

        if p_dict['remove_outliers']:
            if num_outliers == 0:
                print("No outliers were removed!")

        phen_vals = phed.get_values(p_i)

        if p_dict['local_gwas']:  #Filter SNPs, etc..
            sd = snpsdata.SNPsDataSet([sd.get_region_snpsd(*p_dict['local_gwas'])],
                                      [p_dict['local_gwas'][0]], data_format=sd.data_format)
        snps = sd.getSnps()

        sys.stdout.write("Finished loading and handling data!\n")

        print("Plotting a histogram")
        p_her = None
        hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method,
                                             p_dict['remove_outliers'], p_dict['with_replicates'],
                                             p_dict['call_method_id'])
        hist_png_file = hist_file_prefix + "_hist.png"
        if k is not None:
            #Estimate the pseudo-heritability once and read both values from the same result.
            her_res = phed.get_pseudo_heritability(p_i, k)
            p_her = her_res['pseudo_heritability']
            p_her_pval = her_res['pval']
            phed.plot_histogram(p_i, png_file=hist_png_file, p_her=p_her, p_her_pval=p_her_pval)
        else:
            phed.plot_histogram(p_i, png_file=hist_png_file)

        print("Applying %s to data." % (mapping_method))
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if "kw" == mapping_method:
            if phen_is_binary:
                warnings.warn("Warning, applying KW to a binary phenotype")

            kw_res = util.kruskal_wallis(snps, phen_vals)
            pvals = kw_res['ps']
            kwargs['statistics'] = kw_res['ds']
            additional_columns.append('statistics')

        elif "ft" == mapping_method:
            raise NotImplementedError
#            pvals, or_est = run_fet(snps, phen_vals)
#            kwargs['odds_ratio_est'] = or_est
#            additional_columns.append('odds_ratio_est')

        else:  #Parametric tests below:
            if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step', 'emmax_anova', 'loc_glob_mm']:
                r = lm.mm_lrt_test(phen_vals, k)
                if r['pval'] > 0.05:
                    print("Performing EMMA, even though a mixed model does not fit the data significantly better")
                    print('p-value: %0.3f' % r['pval'])
                else:
                    print('The mixed model fits the data significantly better than the simple linear model.')
                    print('p-value: %f' % r['pval'])

            if mapping_method in ['loc_glob_mm']:
                res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix=file_prefix,
                                                      global_k=k, window_size=p_dict['loc_glob_ws'],
                                                      jump_size=p_dict['loc_glob_ws'] / 2,
                                                      kinship_method=p_dict['kinship_type'])
                res_file_name = file_prefix + '.csv'
                _write_res_dict_to_file_(res_file_name, res_dict)
                return
            elif mapping_method in ['emma']:
                res = lm.emma(snps, phen_vals, k)
            elif mapping_method in ['emmax']:
                if p_dict['emmax_perm']:
                    perm_sd = _get_genotype_data_(p_dict)
                    #Return value intentionally ignored; binding it used to overwrite
                    #num_outliers with a tuple.
                    prepare_data(perm_sd, phed, p_i, 'none', 0, p_dict['with_replicates'])
                    perm_sd.filter_mac_snps(p_dict['mac_threshold'])
                    t_snps = perm_sd.getSnps()
                    t_phen_vals = phed.get_values(p_i)
                    res = lm.emmax_perm_test(t_snps, t_phen_vals, k, p_dict['emmax_perm'])
                    emmax_perm_threshold = res['threshold_05'][0]
                    import pylab
                    #Histogram of the minimum permutation p-values, with the permutation
                    #threshold in green and the Bonferroni-style threshold in red.
                    hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6)
                    threshold = -sp.log10(emmax_perm_threshold)
                    b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0))
                    pylab.vlines(threshold, 0, max(hist_res[0]), color='g')
                    pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r')
                    pylab.savefig(file_prefix + 'perm_%d_min_pval_hist.png' % (p_dict['emmax_perm']),
                                  format='png')
                if p_dict['with_replicates']:
                    #Get values, with ecotypes, construct Z and do GWAM
                    phen_vals = phed.get_values(p_i)
                    Z = phed.get_incidence_matrix(p_i)
                    res = lm.emmax(snps, phen_vals, k, Z=Z, with_betas=p_dict['with_betas'],
                                   emma_num=p_dict['emmax_emma_num'])
                else:
                    res = lm.emmax(snps, phen_vals, k, with_betas=p_dict['with_betas'],
                                   emma_num=p_dict['emmax_emma_num'])

            elif mapping_method in ['emmax_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.emmax_step_wise(phen_vals, k, sd=sd, num_steps=p_dict['num_steps'],
                                         file_prefix=file_prefix, local=local,
                                         cand_gene_list=cand_genes,
                                         save_pvals=p_dict['save_stepw_pvals'],
                                         emma_num=p_dict['emmax_emma_num'])
                print('Step-wise EMMAX finished!')
                return
            elif mapping_method in ['lm_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.lm_step_wise(phen_vals, sd=sd, num_steps=p_dict['num_steps'],
                                      file_prefix=file_prefix, local=local,
                                      cand_gene_list=cand_genes,
                                      save_pvals=p_dict['save_stepw_pvals'])
                print('Step-wise LM finished!')
                return
            elif mapping_method in ['lm']:
                res = lm.linear_model(snps, phen_vals)
            elif mapping_method in ['emmax_anova']:
                res = lm.emmax_anova(snps, phen_vals, k)
            elif mapping_method in ['lm_anova']:
                res = lm.anova(snps, phen_vals)
            else:
                print("Mapping method %s was not found." % (mapping_method,))
                return

            if mapping_method in ['lm', 'emma', 'emmax']:
                kwargs['genotype_var_perc'] = res['var_perc']
                additional_columns.append('genotype_var_perc')
                if p_dict['with_betas'] or mapping_method in ['emma']:
                    #Transpose the per-SNP beta lists into one list per coefficient.
                    betas = [list(b) for b in zip(*res['betas'])]
                    kwargs['beta0'] = betas[0]
                    additional_columns.append('beta0')
                    if len(betas) > 1:
                        kwargs['beta1'] = betas[1]
                        additional_columns.append('beta1')
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

            if mapping_method in ['lm_anova', 'emmax_anova']:
                kwargs['genotype_var_perc'] = res['var_perc']
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

#        print 'Calculating SNP-phenotype correlations.'
#        kwargs['correlations'] = calc_correlations(snps, phen_vals)
#        additional_columns.append('correlations')
        print('Writing result to file.')
        res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd, name=result_name, **kwargs)
        if mapping_method in ["kw", "ft", "emma", 'lm', "emmax", 'emmax_anova', 'lm_anova']:
            result_file = file_prefix + ".pvals"
        else:
            result_file = file_prefix + ".scores"
        res.write_to_file(result_file, additional_columns, max_fraction=p_dict['pvalue_filter'])

    #add results to DB..
    if p_dict['add_to_db']:
        print('Adding results to DB.')
        if p_dict['with_db_ids']:
            db_pid = p_i
        else:
            db_pid = phed.get_db_pid(p_i)

        import results_2_db as rdb

        short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (p_dict['call_method_id'], db_pid, phenotype_name,
                                                     mapping_method, trans_method,
                                                     p_dict['remove_outliers'],
                                                     str(p_dict['with_replicates']))
        tm_id = transformation_method_dict[trans_method]
        #Best effort: a failed DB insert is reported but does not abort the run.
        try:
            rdb.add_results_to_db(result_file, short_name, p_dict['call_method_id'], db_pid,
                                  analysis_methods_dict[mapping_method], tm_id,
                                  remove_outliers=p_dict['remove_outliers'])
        except Exception as err_str:
            print('Failed inserting results into DB!')
            print(err_str)
def _test_():
    """
    Compare permutation-based EMMAX thresholds for simulated common,
    singleton and doubleton SNPs.

    Three simulated genotype sets are stacked (common first, then singletons,
    then doubletons), each row of the stacked matrix is centred and scaled,
    and 30 traits are simulated from it.  For every trait the function prints
    the REML pseudo-heritability, saves a phenotype histogram and QQ-plots
    under $HOME/tmp/test_<i>*, and runs 1000 EMMAX permutations per SNP class.
    The mean and standard deviation of the collected 5% thresholds are printed
    at the end.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)
    #Standardise every row of the stacked matrix (transpose, normalise columns, transpose back).
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print('%s %s' % (snps, snps.shape))

    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(snps, hdf5_file_prefix=file_prefix,
                                                          num_traits=30, p=0.1)

    #The kinship matrix depends only on the genotypes, so it is computed once
    #instead of once per simulated trait.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    #Nominal Bonferroni-style reference printed next to each permutation threshold.
    nominal_threshold = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)

        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        #Close (not just clear) the figure so that 30 figures do not pile up in memory.
        plt.close()

        #The slices follow the stacking order used above.
        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)

        #Cholesky permutations..
        for class_snps, thresholds in ((singleton_snps, singletons_thres),
                                       (doubleton_snps, doubletons_thres),
                                       (common_snps, common_thres)):
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print('%s %s' % (nominal_threshold, res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

        #ATT permutations (Implement)

        #PC permutations (Implement)

    print('%s %s' % (sp.mean(singletons_thres), sp.std(singletons_thres)))
    print('%s %s' % (sp.mean(doubletons_thres), sp.std(doubletons_thres)))
    print('%s %s' % (sp.mean(common_thres), sp.std(common_thres)))
def map_phenotype(p_i, phed, snps_data_file, mapping_method, trans_method, p_dict):
    """
    Two-step mapping of one phenotype: a single-SNP scan followed by a
    pairwise scan of the top SNPs.

    p_i: id of the phenotype in phed.
    phed: phenotype data object.
    snps_data_file: genotype file parsed with dataParsers.parse_snp_data.
    mapping_method: 'emmax' or 'lm'; anything else prints a message and calls
        sys.exit(2).
    trans_method: transformation name handed to gwa.prepare_data.
    p_dict: run options.  Keys read here: run_id, remove_outliers,
        data_format, debug_filter, call_method_id, kinship_file,
        use_existing_results, mac_threshold, second_step_number and
        region_plots.

    First step: load (or compute and pickle) the kinship matrix for 'emmax',
    reuse an existing .pvals/.scores file if allowed, otherwise run the scan,
    write the result and plot it.  Second step: keep the top
    p_dict['second_step_number'] SNPs, run (or load from a pickle) the
    two-SNP scan and draw the pair plots.  Region plots are drawn last when
    p_dict['region_plots'] is set.

    Returns None.
    """
    phenotype_name = phed.getPhenotypeName(p_i)
    phen_is_binary = phed.isBinary(p_i)
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name,
                                    mapping_method, trans_method, p_dict['remove_outliers'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)

    res = None
    sd = dataParsers.parse_snp_data(snps_data_file, format=p_dict['data_format'],
                                    filter=p_dict['debug_filter'])
    num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
    if p_dict['remove_outliers']:
        assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

    phen_vals = phed.getPhenVals(p_i)
    snps = sd.getSnps()
    if mapping_method in ['emmax']:
        #Load genotype file (in binary format)
        sys.stdout.write("Retrieving the Kinship matrix K.\n")
        sys.stdout.flush()
        k_file = env['data_dir'] + "kinship_matrix_cm" + str(p_dict['call_method_id']) + ".pickled"
        kinship_file = p_dict['kinship_file']
        if not kinship_file and os.path.isfile(k_file):  #Check if corresponding call_method_file is available
            kinship_file = k_file
        if kinship_file:  #Kinship file was somehow supplied..
            print('Loading supplied kinship')
            k = lm.load_kinship_from_file(kinship_file, sd.accessions)
        else:
            print("No kinship file was found. Generating kinship file: %s" % (k_file,))
            #Re-parse without the debug filter so the kinship is based on the full data set.
            sd = dataParsers.parse_snp_data(snps_data_file, format=p_dict['data_format'])
            snps = sd.getSnps()
            k_accessions = sd.accessions[:]
            if p_dict['debug_filter']:
                import random
                snps = random.sample(snps, int(p_dict['debug_filter'] * len(snps)))
            k = lm.calc_kinship(snps)
            #Context manager guarantees the handle is closed even if pickling fails.
            with open(k_file, 'w') as f:
                cPickle.dump([k, sd.accessions], f)
            num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
            #Restrict the kinship matrix to the accessions left after data preparation.
            k = lm.filter_k_for_accessions(k, k_accessions, sd.accessions)
        sys.stdout.flush()
        sys.stdout.write("Done!\n")

    if p_dict['remove_outliers']:
        assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

    #Check whether result already exists.
    if p_dict['use_existing_results']:
        print("\nChecking for existing results.")
        result_file = file_prefix + ".pvals"
        if os.path.isfile(result_file):
            res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
        else:
            result_file = file_prefix + ".scores"
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
        if res:
            print("Found existing results.. (%s)" % (result_file))
        sys.stdout.flush()

    if not res:  #If results weren't found in a file... then do GWA.
        sys.stdout.write("Finished loading and handling data!\n")

        print("FIRST STEP: Applying %s to data. " % (mapping_method))
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if mapping_method in ['emmax']:
            res = lm.emmax(snps, phen_vals, k)
        elif mapping_method in ['lm']:
            res = lm.linear_model(snps, phen_vals)
        else:
            print("Mapping method %s was not found." % (mapping_method,))
            sys.exit(2)

        #Only 'lm' and 'emmax' get past the check above, so the former per-method
        #conditions (including the unreachable 'kw'/'ft' plotting branch) are flattened.
        kwargs['genotype_var_perc'] = res['var_perc']
        #Transpose the per-SNP beta lists into one list per coefficient.
        betas = [list(b) for b in zip(*res['betas'])]
        kwargs['beta0'] = betas[0]
        kwargs['beta1'] = betas[1]
        additional_columns.append('genotype_var_perc')
        additional_columns.append('beta0')
        additional_columns.append('beta1')
        pvals = res['ps']
        sys.stdout.write("Done!\n")
        sys.stdout.flush()

        kwargs['correlations'] = calc_correlations(snps, phen_vals)
        additional_columns.append('correlations')

        res = gwaResults.Result(scores=pvals, snps_data=sd, name=result_name, **kwargs)

        result_file = file_prefix + ".pvals"
        res.write_to_file(result_file, additional_columns)

        print("Generating a GW plot.")
        sys.stdout.flush()
        png_file = file_prefix + "_gwa_plot.png"
        res.neg_log_trans()
        if res.filter_attr("mafs", p_dict['mac_threshold']) > 0:
            res.plot_manhattan(png_file=png_file, percentile=90, type="pvals",
                               ylab="$-$log$_{10}(p)$", plot_bonferroni=True)

        print("plotting histogram")
        hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method,
                                             p_dict['remove_outliers'])
        hist_png_file = hist_file_prefix + "_hist.png"
        phed.plot_histogram(p_i, pngFile=hist_png_file)
    else:
        res.neg_log_trans()
        assert res.filter_attr("mafs", p_dict['mac_threshold']), 'All SNPs have MAC smaller than threshold'

    print("SECOND STEP:")
    res.filter_top_snps(p_dict['second_step_number'])
    snps = res.snps
    positions = res.positions
    chromosomes = res.chromosomes

    #Checking res_file exists
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name,
                                    mapping_method, trans_method, p_dict['remove_outliers'],
                                    p_dict['second_step_number'])
    res_file = file_prefix + '_res.cpickled'
    if p_dict['use_existing_results'] and os.path.isfile(res_file):
        print('Found existing results for the second step... loading.')
        #NOTE(review): unpickling is only safe for files this pipeline wrote itself.
        with open(res_file, 'rb') as f:
            second_res = cPickle.load(f)
    else:
        if mapping_method == 'lm':
            second_res = lm.linear_model_two_snps(snps, phen_vals)
        elif mapping_method == 'emmax':
            second_res = lm.emmax_two_snps(snps, phen_vals, k)
        else:
            #Reachable when cached first-step results are used with an unsupported method;
            #previously this fell through to a NameError on second_res.
            print("Mapping method %s was not found." % (mapping_method,))
            sys.exit(2)

        #Pickling results..
        print('Saving results as pickled file: %s' % (res_file,))
        with open(res_file, 'wb') as f:
            cPickle.dump(second_res, f, protocol=2)

    #Plotting second step plots:
    score_array = -sp.log10(second_res['ps'])
    p3_score_array = -sp.log10(second_res['p3_ps'])
    p4_score_array = -sp.log10(second_res['p4_ps'])
    import plotResults as pr
    pr.plot_snp_pair_result(chromosomes, positions, score_array, file_prefix + '_scatter')
    pr.plot_snp_pair_result(chromosomes, positions, p3_score_array, file_prefix + '_p3_scatter')
    pr.plot_snp_pair_result(chromosomes, positions, p4_score_array, file_prefix + '_p4_scatter')

    if p_dict['region_plots']:
        import regionPlotter as rp
        regions_results = res.get_top_region_results(p_dict['region_plots'])
        plotter = rp.RegionPlotter()
        print("Starting region plots...")
        for reg_res in regions_results:
            chromosome = reg_res.chromosomes[0]
            caption = phenotype_name + "_c" + str(chromosome) + "_" + mapping_method
            #Common stem of all files written for this region.
            reg_prefix = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
                + "_e" + str(reg_res.positions[-1])
            png_file = reg_prefix + ".png"
            tair_file = reg_prefix + "_tair_info.txt"
            #NOTE(review): tair_ids is not defined anywhere in this function; unless it is a
            #module-level name this call raises NameError - confirm and load the candidate genes.
            plotter.plot_small_result([reg_res], png_file=png_file, highlight_gene_ids=tair_ids,
                                      caption=caption, tair_file=tair_file)

            #Plot Box-plot
            png_file = reg_prefix + "_box_plot.png"
            (marker, score, chromosome, pos) = reg_res.get_max_snp()
            marker_accessions = sd.accessions
            phed.plot_marker_box_plot(p_i, marker=marker, marker_accessions=marker_accessions,
                                      png_file=png_file, title="c" + str(chromosome) + "_p" + str(pos),
                                      marker_score=score, marker_missing_val=sd.missing_val)
(SNP_names[:, 0].astype("int"), SNP_names[:, 1].astype("int"), np.array(test_H), np.array(test_p), MAF)) test = np.transpose(test) header = 'Chromosome,Position,H,p-val,MAF' filename = "GWAS_KW_for_" + PHENO_file + "_" + str( sys.argv[2]).split("_")[2].replace(".npy", "") + ".csv" np.savetxt(filename, test, delimiter=",", header=header, fmt="%s") END_KW = datetime.now() print('\rKW completed in ' + str(END_KW - START)) if EMMAX: START = datetime.now() print('_____________________________________________________________') print('Starting the computation of EMMAX using mixmogam') mm_results = lm.emmax(np.transpose(SNP_data), Pheno_data, K_data, with_betas=True) betas = mm_results['betas'] betas = [[0, 0] if i is None else i for i in betas] betas = [[0, 0] if len(i) != 2 else i for i in betas] betas = np.hstack(betas).reshape(len(betas), 2) test = np.vstack( (SNP_names[:, 0].astype("int"), SNP_names[:, 1].astype("int"), betas[:, 1], mm_results['ps'], MAF)) test = np.transpose(test) header = 'Chromosome,Position,beta,p-val,MAF' filename = "GWAS_EMMAX_for_" + PHENO_file + "_" + str( sys.argv[2]).split("_")[2].replace(".npy", "") + ".csv" np.savetxt(filename, test, delimiter=",",