def _test_():
    """
    Compare EMMAX permutation thresholds for simulated rare and common SNPs.

    Three SNP sets are simulated (common, singleton, doubleton), stacked in
    that order and standardised row by row.  30 traits are then simulated
    with ``phenotypes.simulate_traits_w_snps_to_hdf5``.  For every trait the
    function prints the REML pseudo-heritability, runs ``lm.emmax`` on the
    stacked SNPs, saves a phenotype histogram and QQ-plots using the prefix
    ``$HOME/tmp/test_<i>``, and runs ``lm.emmax_perm_test`` (1000
    permutations) on each of the three original, unstandardised SNP sets.
    The mean and standard deviation of the collected ``threshold_05`` values
    are printed at the end, one line per SNP class.

    Requires the ``HOME`` environment variable; returns nothing.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)

    # Standardise every row of the stacked matrix (presumably one SNP per
    # row, given how the p-values are sliced below -- TODO confirm).
    snps_t = snps.T
    snps = ((snps_t - sp.mean(snps_t, 0)) / sp.std(snps_t, 0)).T
    print('%s %s' % (snps, snps.shape))

    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    # The kinship matrix depends only on the SNPs, so it is computed once
    # here rather than once per simulated trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        # The p-values follow the stacking order used for `snps` above.
        pval_groups = [ex_res['ps'][:1000],
                       ex_res['ps'][1000:2000],
                       ex_res['ps'][2000:]]
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i), pval_groups,
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Cholesky permutations..
        # Each line printed shows 0.05 / 1000 next to the permutation-based
        # threshold so the two can be compared by eye.
        for perm_snps, thresholds in ((singleton_snps, singletons_thres),
                                      (doubleton_snps, doubletons_thres),
                                      (common_snps, common_thres)):
            res = lm.emmax_perm_test(perm_snps, y, K, num_perm=1000)
            print('%s %s' % (1.0 / (20 * 1000.0), res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

        # TODO: ATT permutations (Implement)
        # TODO: PC permutations (Implement)

    for thresholds in (singletons_thres, doubletons_thres, common_thres):
        print('%s %s' % (sp.mean(thresholds), sp.std(thresholds)))
def _test_scz_():
    """
    Compare EMMAX permutation thresholds on simulated SNPs and traits.

    Despite its name, this function loads no external data set: it simulates
    common, singleton and doubleton SNPs, stacks them in that order (without
    standardising them) and simulates 30 traits with
    ``phenotypes.simulate_traits``.  For every trait it prints the REML
    pseudo-heritability, runs ``lm.emmax`` on the stacked SNPs, saves a
    phenotype histogram and QQ-plots using the prefix ``$HOME/tmp/test_<i>``,
    and runs ``lm.emmax_perm_test`` (1000 permutations) on each of the three
    SNP sets.  The mean and standard deviation of the collected
    ``threshold_05`` values are printed at the end, one line per SNP class.

    Requires the ``HOME`` environment variable; returns nothing.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)

    # Output goes below the current user's home directory (as in _test_)
    # instead of one developer's hard-coded /home/bv25 path.
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=1.0)

    # The kinship matrix depends only on the SNPs, so it is computed once
    # here rather than once per simulated trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        # The p-values follow the stacking order used for `snps` above.
        pval_groups = [ex_res['ps'][:1000],
                       ex_res['ps'][1000:2000],
                       ex_res['ps'][2000:]]
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i), pval_groups,
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Now permutations..
        # Each line printed shows 0.05 / 1000 next to the permutation-based
        # threshold so the two can be compared by eye.
        for perm_snps, thresholds in ((singleton_snps, singletons_thres),
                                      (doubleton_snps, doubletons_thres),
                                      (common_snps, common_thres)):
            res = lm.emmax_perm_test(perm_snps, y, K, num_perm=1000)
            print('%s %s' % (1.0 / (20 * 1000.0), res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

    for thresholds in (singletons_thres, doubletons_thres, common_thres):
        print('%s %s' % (sp.mean(thresholds), sp.std(thresholds)))
def _test_scz_():
    """
    Compare EMMAX permutation thresholds on simulated SNPs and traits.

    Despite its name, this function loads no external data set: it simulates
    common, singleton and doubleton SNPs, stacks them in that order (without
    standardising them) and simulates 30 traits with
    ``phenotypes.simulate_traits``.  For every trait it prints the REML
    pseudo-heritability, runs ``lm.emmax`` on the stacked SNPs, saves a
    phenotype histogram and QQ-plots using the prefix ``$HOME/tmp/test_<i>``,
    and runs ``lm.emmax_perm_test`` (1000 permutations) on each of the three
    SNP sets.  The mean and standard deviation of the collected
    ``threshold_05`` values are printed at the end, one line per SNP class.

    Requires the ``HOME`` environment variable; returns nothing.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)

    # Output goes below the current user's home directory (as in _test_)
    # instead of one developer's hard-coded /home/bv25 path.
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=1.0)

    # The kinship matrix depends only on the SNPs, so it is computed once
    # here rather than once per simulated trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        # The p-values follow the stacking order used for `snps` above.
        pval_groups = [ex_res['ps'][:1000],
                       ex_res['ps'][1000:2000],
                       ex_res['ps'][2000:]]
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i), pval_groups,
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Now permutations..
        # Each line printed shows 0.05 / 1000 next to the permutation-based
        # threshold so the two can be compared by eye.
        for perm_snps, thresholds in ((singleton_snps, singletons_thres),
                                      (doubleton_snps, doubletons_thres),
                                      (common_snps, common_thres)):
            res = lm.emmax_perm_test(perm_snps, y, K, num_perm=1000)
            print('%s %s' % (1.0 / (20 * 1000.0), res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

    for thresholds in (singletons_thres, doubletons_thres, common_thres):
        print('%s %s' % (sp.mean(thresholds), sp.std(thresholds)))
def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict):
    """
    Run (or reload) an association scan for one phenotype and store it.

    Parameters:
        p_i: phenotype identifier, passed to the ``phed`` accessors.
        phed: phenotype data object; it is deep-copied first, so all later
            calls operate on the copy.
        mapping_method: one of 'kw', 'emma', 'emmax', 'emmax_step',
            'lm_step', 'lm', 'emmax_anova', 'lm_anova' or 'loc_glob_mm'.
            'ft' raises NotImplementedError; any other value prints a
            message and returns.
        trans_method: transformation name; 'most_normal' is replaced by the
            name returned by ``phed.most_normal_transformation``.
        p_dict: run options.  Keys read here: 'run_id', 'remove_outliers',
            'with_replicates', 'call_method_id', 'use_existing_results',
            'region_plots', 'cand_genes_file', 'kinship_file', 'data_file',
            'kinship_type', 'data_format', 'local_gwas', 'loc_glob_ws',
            'emmax_perm', 'mac_threshold', 'with_betas', 'emmax_emma_num',
            'num_steps', 'save_stepw_pvals', 'pvalue_filter', 'add_to_db'
            and 'with_db_ids'.

    Side effects: prints progress, writes a histogram PNG and a ``.pvals``
    or ``.scores`` result file (plus method-specific files), and optionally
    inserts the result into the database.

    Returns None on every path.  The 'loc_glob_mm', 'emmax_step' and
    'lm_step' methods return right after their own output is written, i.e.
    before the result file and the database step.
    """
    import copy
    # Work on a private copy so later calls on `phed` cannot affect the
    # caller's object.
    phed = copy.deepcopy(phed)
    phenotype_name = phed.get_name(p_i)
    phen_is_binary = phed.is_binary(p_i)
    if trans_method == 'most_normal':
        trans_method, _shapiro_pval = phed.most_normal_transformation(
            p_i, perform_trans=False)
    file_prefix = _get_file_prefix_(
        p_dict['run_id'], p_i, phenotype_name, mapping_method, trans_method,
        p_dict['remove_outliers'], p_dict['with_replicates'],
        p_dict['call_method_id'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)
    k = None
    res = None

    # Check whether result already exists.
    if p_dict['use_existing_results']:
        if p_dict['region_plots']:
            sd = _get_genotype_data_(p_dict)
            # NOTE(review): the GWA branch below unpacks prepare_data() into
            # two values, whereas here its whole return value is compared
            # with 0, so this assert may never fire -- confirm against
            # prepare_data before relying on it.
            num_outliers = prepare_data(
                sd, phed, p_i, trans_method, p_dict['remove_outliers'],
                p_dict['with_replicates'])
            if p_dict['remove_outliers']:
                assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."
            snps = sd.getSnps()
        else:
            snps = None

        print("\nChecking for existing results.")
        # Prefer a p-value file and fall back to a score file.  When neither
        # exists, `result_file` is left pointing at the ".scores" name.
        for extension in (".pvals", ".scores"):
            result_file = file_prefix + extension
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file,
                                        name=result_name, snps=snps)
                break
        if res:
            print("Found existing results.. (%s)" % (result_file))
        sys.stdout.flush()

    # Loading candidate genes
    cand_genes = None
    if p_dict['cand_genes_file']:
        cand_genes, _tair_ids = gwaResults.load_cand_genes_file(
            p_dict['cand_genes_file'])

    if not res:  # If results weren't found in a file... then do GWA.
        # Loading data
        sd = _get_genotype_data_(p_dict)
        num_outliers, n_filtered_snps = prepare_data(
            sd, phed, p_i, trans_method, p_dict['remove_outliers'],
            p_dict['with_replicates'])

        # Do we need to calculate the K-matrix?
        if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step',
                              'loc_glob_mm']:
            sys.stdout.write("Retrieving the Kinship matrix K.\n")
            sys.stdout.flush()
            if p_dict['kinship_file']:  # Kinship file was supplied..
                print('Loading supplied kinship file: %s'
                      % p_dict['kinship_file'])
                k = kinship.load_kinship_from_file(p_dict['kinship_file'],
                                                   sd.accessions)
            else:
                print('Loading kinship file.')
                if p_dict['data_file'] is not None:
                    # `k` stays None for any other 'kinship_type' value.
                    if p_dict['kinship_type'] == 'ibs':
                        k = sd.get_ibs_kinship_matrix()
                    elif p_dict['kinship_type'] == 'ibd':
                        k = sd.get_ibd_kinship_matrix()
                else:
                    k = kinship.get_kinship(
                        call_method_id=p_dict['call_method_id'],
                        data_format=p_dict['data_format'],
                        method=p_dict['kinship_type'],
                        n_removed_snps=n_filtered_snps,
                        remain_accessions=sd.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

        if p_dict['remove_outliers'] and num_outliers == 0:
            print("No outliers were removed!")

        phen_vals = phed.get_values(p_i)

        if p_dict['local_gwas']:  # Filter SNPs, etc..
            sd = snpsdata.SNPsDataSet(
                [sd.get_region_snpsd(*p_dict['local_gwas'])],
                [p_dict['local_gwas'][0]], data_format=sd.data_format)
        snps = sd.getSnps()

        sys.stdout.write("Finished loading and handling data!\n")

        print("Plotting a histogram")
        # NOTE(review): this call passes one positional argument fewer than
        # the `file_prefix` call above (no mapping_method), so the remaining
        # arguments may land in shifted positions -- confirm against
        # _get_file_prefix_.
        hist_file_prefix = _get_file_prefix_(
            p_dict['run_id'], p_i, phenotype_name, trans_method,
            p_dict['remove_outliers'], p_dict['with_replicates'],
            p_dict['call_method_id'])
        hist_png_file = hist_file_prefix + "_hist.png"
        if k is not None:
            # Estimate the pseudo-heritability once and reuse both fields,
            # instead of running the estimation twice.
            her = phed.get_pseudo_heritability(p_i, k)
            phed.plot_histogram(p_i, png_file=hist_png_file,
                                p_her=her['pseudo_heritability'],
                                p_her_pval=her['pval'])
        else:
            phed.plot_histogram(p_i, png_file=hist_png_file)

        print("Applying %s to data." % (mapping_method))
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if "kw" == mapping_method:
            if phen_is_binary:
                warnings.warn("Warning, applying KW to a binary phenotype")
            kw_res = util.kruskal_wallis(snps, phen_vals)
            pvals = kw_res['ps']
            kwargs['statistics'] = kw_res['ds']
            additional_columns.append('statistics')
        elif "ft" == mapping_method:
            raise NotImplementedError
            # pvals, or_est = run_fet(snps, phen_vals)
            # kwargs['odds_ratio_est'] = or_est
            # additional_columns.append('odds_ratio_est')
        else:  # Parametric tests below:
            if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step',
                                  'emmax_anova', 'loc_glob_mm']:
                r = lm.mm_lrt_test(phen_vals, k)
                if r['pval'] > 0.05:
                    print("Performing EMMA, even though a mixed model does not fit the data significantly better")
                    print('p-value: %0.3f' % r['pval'])
                else:
                    print('The mixed model fits the data significantly better than the simple linear model.')
                    print('p-value: %f' % r['pval'])

            if mapping_method in ['loc_glob_mm']:
                res_dict = lm.local_vs_global_mm_scan(
                    phen_vals, sd, file_prefix=file_prefix, global_k=k,
                    window_size=p_dict['loc_glob_ws'],
                    jump_size=p_dict['loc_glob_ws'] / 2,
                    kinship_method=p_dict['kinship_type'])
                res_file_name = file_prefix + '.csv'
                _write_res_dict_to_file_(res_file_name, res_dict)
                return
            elif mapping_method in ['emma']:
                res = lm.emma(snps, phen_vals, k)
            elif mapping_method in ['emmax']:
                if p_dict['emmax_perm']:
                    # Permutation test on a separately loaded data set that
                    # is prepared with the 'none' transformation.
                    perm_sd = _get_genotype_data_(p_dict)
                    prepare_data(perm_sd, phed, p_i, 'none', 0,
                                 p_dict['with_replicates'])
                    perm_sd.filter_mac_snps(p_dict['mac_threshold'])
                    t_snps = perm_sd.getSnps()
                    t_phen_vals = phed.get_values(p_i)
                    res = lm.emmax_perm_test(t_snps, t_phen_vals, k,
                                             p_dict['emmax_perm'])
                    emmax_perm_threshold = res['threshold_05'][0]
                    import pylab
                    hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6)
                    threshold = -sp.log10(emmax_perm_threshold)
                    # 0.05 divided by the number of SNPs, on the same
                    # -log10 scale, for comparison with the line above.
                    b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0))
                    pylab.vlines(threshold, 0, max(hist_res[0]), color='g')
                    pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r')
                    pylab.savefig(
                        file_prefix + 'perm_%d_min_pval_hist.png'
                        % (p_dict['emmax_perm']), format='png')
                if p_dict['with_replicates']:
                    # Get values, with ecotypes, construct Z and do GWAM
                    phen_vals = phed.get_values(p_i)
                    Z = phed.get_incidence_matrix(p_i)
                    res = lm.emmax(snps, phen_vals, k, Z=Z,
                                   with_betas=p_dict['with_betas'],
                                   emma_num=p_dict['emmax_emma_num'])
                else:
                    res = lm.emmax(snps, phen_vals, k,
                                   with_betas=p_dict['with_betas'],
                                   emma_num=p_dict['emmax_emma_num'])
            elif mapping_method in ['emmax_step', 'lm_step']:
                # Both step-wise methods share the same set-up and return
                # as soon as the step-wise call has finished.
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = bool(p_dict['local_gwas'])
                if local:
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                if mapping_method == 'emmax_step':
                    lm.emmax_step_wise(
                        phen_vals, k, sd=sd, num_steps=p_dict['num_steps'],
                        file_prefix=file_prefix, local=local,
                        cand_gene_list=cand_genes,
                        save_pvals=p_dict['save_stepw_pvals'],
                        emma_num=p_dict['emmax_emma_num'])
                    print('Step-wise EMMAX finished!')
                else:
                    lm.lm_step_wise(
                        phen_vals, sd=sd, num_steps=p_dict['num_steps'],
                        file_prefix=file_prefix, local=local,
                        cand_gene_list=cand_genes,
                        save_pvals=p_dict['save_stepw_pvals'])
                    print('Step-wise LM finished!')
                return
            elif mapping_method in ['lm']:
                res = lm.linear_model(snps, phen_vals)
            elif mapping_method in ['emmax_anova']:
                res = lm.emmax_anova(snps, phen_vals, k)
            elif mapping_method in ['lm_anova']:
                res = lm.anova(snps, phen_vals)
            else:
                print("Mapping method %s was not found." % (mapping_method,))
                return

            if mapping_method in ['lm', 'emma', 'emmax']:
                kwargs['genotype_var_perc'] = res['var_perc']
                additional_columns.append('genotype_var_perc')
                if p_dict['with_betas'] or mapping_method in ['emma']:
                    # Transpose res['betas'] so that betas[j] holds the j-th
                    # coefficient of every entry.
                    betas = [list(column) for column in zip(*res['betas'])]
                    kwargs['beta0'] = betas[0]
                    additional_columns.append('beta0')
                    if len(betas) > 1:
                        kwargs['beta1'] = betas[1]
                        additional_columns.append('beta1')
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

            if mapping_method in ['lm_anova', 'emmax_anova']:
                kwargs['genotype_var_perc'] = res['var_perc']
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

        # print 'Calculating SNP-phenotype correlations.'
        # kwargs['correlations'] = calc_correlations(snps, phen_vals)
        # additional_columns.append('correlations')
        print('Writing result to file.')
        res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd,
                                name=result_name, **kwargs)
        if mapping_method in ["kw", "ft", "emma", 'lm', "emmax",
                              'emmax_anova', 'lm_anova']:
            result_file = file_prefix + ".pvals"
        else:
            result_file = file_prefix + ".scores"
        res.write_to_file(result_file, additional_columns,
                          max_fraction=p_dict['pvalue_filter'])

    # add results to DB..
    if p_dict['add_to_db']:
        print('Adding results to DB.')
        if p_dict['with_db_ids']:
            db_pid = p_i
        else:
            db_pid = phed.get_db_pid(p_i)

        import results_2_db as rdb

        short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (
            p_dict['call_method_id'], db_pid, phenotype_name, mapping_method,
            trans_method, p_dict['remove_outliers'],
            str(p_dict['with_replicates']))
        tm_id = transformation_method_dict[trans_method]
        try:
            rdb.add_results_to_db(
                result_file, short_name, p_dict['call_method_id'], db_pid,
                analysis_methods_dict[mapping_method], tm_id,
                remove_outliers=p_dict['remove_outliers'])
        except Exception as err_str:
            # Deliberately best-effort: a failed DB insert is reported but
            # does not abort the run.
            print('Failed inserting results into DB!')
            print(err_str)
def _test_():
    """
    Compare EMMAX permutation thresholds for simulated rare and common SNPs.

    Three SNP sets are simulated (common, singleton, doubleton), stacked in
    that order and standardised row by row.  30 traits are then simulated
    with ``phenotypes.simulate_traits_w_snps_to_hdf5``.  For every trait the
    function prints the REML pseudo-heritability, runs ``lm.emmax`` on the
    stacked SNPs, saves a phenotype histogram and QQ-plots using the prefix
    ``$HOME/tmp/test_<i>``, and runs ``lm.emmax_perm_test`` (1000
    permutations) on each of the three original, unstandardised SNP sets.
    The mean and standard deviation of the collected ``threshold_05`` values
    are printed at the end, one line per SNP class.

    Requires the ``HOME`` environment variable; returns nothing.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print(snps)

    # Standardise every row of the stacked matrix (presumably one SNP per
    # row, given how the p-values are sliced below -- TODO confirm).
    snps_t = snps.T
    snps = ((snps_t - sp.mean(snps_t, 0)) / sp.std(snps_t, 0)).T
    print('%s %s' % (snps, snps.shape))

    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    # The kinship matrix depends only on the SNPs, so it is computed once
    # here rather than once per simulated trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print('pseudo_heritability: %s' % (r1['pseudo_heritability'],))

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        # The p-values follow the stacking order used for `snps` above.
        pval_groups = [ex_res['ps'][:1000],
                       ex_res['ps'][1000:2000],
                       ex_res['ps'][2000:]]
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i), pval_groups,
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Cholesky permutations..
        # Each line printed shows 0.05 / 1000 next to the permutation-based
        # threshold so the two can be compared by eye.
        for perm_snps, thresholds in ((singleton_snps, singletons_thres),
                                      (doubleton_snps, doubletons_thres),
                                      (common_snps, common_thres)):
            res = lm.emmax_perm_test(perm_snps, y, K, num_perm=1000)
            print('%s %s' % (1.0 / (20 * 1000.0), res['threshold_05']))
            thresholds.append(res['threshold_05'][0])

        # TODO: ATT permutations (Implement)
        # TODO: PC permutations (Implement)

    for thresholds in (singletons_thres, doubletons_thres, common_thres):
        print('%s %s' % (sp.mean(thresholds), sp.std(thresholds)))