def _get_genotype_data_(p_dict):
    """
    Load the SNP data set described by a run-parameter dictionary.

    Keys read from p_dict: 'data_file', 'data_format', 'debug_filter' and,
    when 'data_file' is falsy, 'call_method_id'.

    A truthy 'data_file' is parsed with dataParsers.parse_snp_data;
    otherwise the data set is loaded by call method id through
    dataParsers.load_snps_call_method.

    Returns whatever the selected dataParsers loader returns.
    """
    if p_dict['data_file']:
        return dataParsers.parse_snp_data(p_dict['data_file'],
                                          format=p_dict['data_format'],
                                          filter=p_dict['debug_filter'])

    # No explicit file was given: fall back to the stored call method.
    # (A disabled override used to force 'diploid_int' for call methods
    # 78 and 79; the configured format is passed through unchanged.)
    call_method = p_dict['call_method_id']
    snp_format = p_dict['data_format']
    return dataParsers.load_snps_call_method(call_method,
                                             data_format=snp_format,
                                             debug_filter=p_dict['debug_filter'])
def run_gwas(pid, call_method_id, run_id, kinship_method, debug_filter=1):
    """
    Run the local-vs-global mixed-model analyses for one phenotype.

    Steps, in order:
      1. load the SNP data for call_method_id (optionally sub-sampled),
      2. load, average and transform ('most_normal') the phenotype,
      3. compute the global kinship matrix and plot a phenotype histogram,
      4. run the chromosome-vs-rest scan,
      5. run window scans for a fixed list of window sizes,
      6. run gene-centred scans for a fixed list of radii,
      7. filter SNPs with filter_mac_snps(15) and run a 10-step EMMAX
         step-wise scan.

    Result files are written under env.env['results_dir'].

    Parameters:
      pid             phenotype id (formatted with %d in file names).
      call_method_id  SNP call method id (formatted with %d in file names).
      run_id          string used as the prefix of every output file name.
      kinship_method  'ibd' or 'ibs'.
      debug_filter    values below 1 are passed to sd.sample_snps.

    Raises:
      ValueError  if kinship_method is not 'ibd' or 'ibs'.  Previously this
                  surfaced as a NameError on `global_k`, and only after the
                  data had been loaded.

    NOTE(review): another `run_gwas` with a different signature appears
    further down; if both live in the same module the later definition
    shadows this one -- confirm.
    """
    # Fail fast, before any expensive loading.
    if kinship_method not in ('ibd', 'ibs'):
        raise ValueError("Unsupported kinship_method %r (expected 'ibd' or 'ibs')"
                         % (kinship_method,))

    #LOAD DATA
    sd = dp.load_snps_call_method(call_method_id)
    if debug_filter < 1:
        sd.sample_snps(debug_filter)

    phenotype_file = env.env['phen_dir'] + 'phen_with_swedish_082211.csv'
    phed = pd.parse_phenotype_file(phenotype_file)
    phed.convert_to_averages()
    phen_name = phed.get_name(pid)
    sd.coordinate_w_phenotype_data(phed, pid)
    phed.transform(pid, 'most_normal')
    phen_vals = phed.get_values(pid)

    if kinship_method == 'ibd':
        global_k = sd.get_ibd_kinship_matrix()
    else:  # 'ibs' -- validated above
        global_k = sd.get_ibs_kinship_matrix()

    p_her = phed.get_pseudo_heritability(pid, global_k)
    hist_file = env.env['results_dir'] + '%s_%s_%d_%d_%s_hist.png' % \
        (run_id, kinship_method, call_method_id, pid, phen_name)
    phed.plot_histogram(pid, p_her=p_her, png_file=hist_file)

    #Set up GWAS

    #Chromosomes.
    res_dict = lm.chrom_vs_rest_mm(phen_vals, sd, kinship_method, global_k)
    print(res_dict)
    file_prefix = env.env['results_dir'] + '%s_loc_v_glob_chrom_%s_%d_%d_%s' % \
        (run_id, kinship_method, call_method_id, pid, phen_name)
    _write_res_dict_to_file_2_(file_prefix + '.csv', res_dict)

    #Now 'normal' window sizes
    for ws in [3000000, 1000000, 500000, 200000, 100000, 50000, 20000]:
        file_prefix = env.env['results_dir'] + '%s_loc_v_glob_%s_%d_%d_%d_%s' % \
            (run_id, kinship_method, call_method_id, ws, pid, phen_name)
        # The fifth argument is half the window size; explicit floor
        # division keeps it an integer.
        res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix, ws, ws // 2,
                                              kinship_method, global_k)
        _write_res_dict_to_file_(file_prefix + '.csv', res_dict)

    #Now gene-centralized.
    for radius in [20000, 10000, 5000]:
        file_prefix = env.env['results_dir'] + '%s_loc_v_glob_gene_%s_%d_%d_%d_%s' % \
            (run_id, kinship_method, call_method_id, radius, pid, phen_name)
        res_dict = lm.local_vs_global_gene_mm_scan(phen_vals, sd, file_prefix, radius,
                                                   kinship_method, global_k)
        _write_res_dict_to_file_3_(file_prefix + '.csv', res_dict)

    # Step-wise scan on the filtered SNP set.
    sd.filter_mac_snps(15)
    file_prefix = env.env['results_dir'] + '%s_emmax_stepwise_%s_%d_%d_%s' % \
        (run_id, kinship_method, call_method_id, pid, phen_name)
    lm.emmax_step_wise(phen_vals, global_k, sd=sd, num_steps=10,
                       file_prefix=file_prefix, save_pvals=True)
def identify_interesting_haplotypes(chrom_pos_list, phenotype_file, pid):
    """
    Collect the SNPs at the given (chromosome, position) pairs and hand them
    to identify_interesting_accessions.

    Parameters:
      chrom_pos_list  iterable of (chromosome, position) entries; each must
                      be present in the list returned by sd.getChrPosList().
      phenotype_file  not used by this function (phenotypes are fetched with
                      pd.get_phenotypes_from_db).
      pid             phenotype id.

    Raises:
      Exception('SNP not found') when an entry of chrom_pos_list has no
      exact match in the loaded data.
    """
    import dataParsers as dp
    import bisect

    sd = dp.load_snps_call_method(76)  # Full sequence data.
    phed = pd.get_phenotypes_from_db([pid])
    phed.convert_to_averages()
    sd.coordinate_w_phenotype_data(phed, pid)

    known_chrom_pos = sd.getChrPosList()
    snp_pool = sd.getSnps()

    snps, snp_chromosomes, snp_positions = [], [], []
    for wanted in chrom_pos_list:
        # Rightmost index whose entry is <= wanted; it is a hit only when
        # that entry equals wanted exactly.
        idx = bisect.bisect(known_chrom_pos, wanted) - 1
        if known_chrom_pos[idx] != wanted:
            raise Exception('SNP not found')
        snps.append(snp_pool[idx])
        snp_chromosomes.append(wanted[0])
        snp_positions.append(wanted[1])

    # The data set is loaded a second time before the hand-off, so the callee
    # receives a copy that has not been coordinated with the phenotype data.
    sd = dp.load_snps_call_method(76)
    identify_interesting_accessions(sd, snps, snp_chromosomes, snp_positions,
                                    phed.get_ecotypes(pid))
def create_diploid_dataset(call_method_id=76, file_name='/tmp/test.csv', coding_type='normal'):
    """
    Build an artificial diploid SNP data set by combining parental genotypes
    and write it to file_name.

    Parent pairs are read from 'heterozygous_genotypes.csv' in
    env.env['data_dir']: the first line is skipped as a header, and the first
    two comma-separated fields of every following line name the two parents.
    Pairs for which either parent is missing from the loaded accessions are
    silently left out.

    Parameters:
      call_method_id  call method id of the parental SNP data.
      file_name       output file passed to SNPsDataSet.writeToFile.
      coding_type     'normal'   -> offspring genotype = sum of parents,
                      'dominant' -> offspring genotype = bitwise XOR of parents.

    Raises:
      ValueError  if coding_type is neither 'normal' nor 'dominant'
                  (previously this left the offspring genotype unbound).
    """
    if coding_type not in ('normal', 'dominant'):
        raise ValueError("Unsupported coding_type %r (expected 'normal' or 'dominant')"
                         % (coding_type,))

    #Load parent list
    parents = []
    with open(env.env['data_dir'] + 'heterozygous_genotypes.csv') as parent_file:
        next(parent_file)  # skip the header line
        for line in parent_file:
            fields = [field.strip() for field in line.split(',')]
            if len(fields) < 2:
                # Blank/short rows cannot name a parent pair.
                continue
            parents.append(fields)

    snpsd = dp.load_snps_call_method(call_method_id)

    # Accession name -> column index in the SNP matrix.  For a duplicated
    # accession the last column wins, which matches the result of the former
    # bisect lookup on the sorted (accession, index) pairs.
    acc_column = dict((acc, col) for col, acc in enumerate(snpsd.accessions))

    sds = []
    for i, sd in enumerate(snpsd.snpsDataList):
        snps = sp.array(sd.snps, dtype='int8')
        offspring_genotypes = []
        offspring_names = []
        for ps in parents:
            father, mother = ps[0], ps[1]
            if father not in acc_column or mother not in acc_column:
                continue
            f_gt = snps[:, acc_column[father]].flatten()
            m_gt = snps[:, acc_column[mother]].flatten()
            if coding_type == 'normal':
                o_gt = f_gt + m_gt
            else:  # 'dominant' -- validated above
                o_gt = sp.bitwise_xor(f_gt, m_gt)
            offspring_genotypes.append(o_gt)
            offspring_names.append('%s_%s' % (father, mother))

        # Offspring were collected one per row; transpose back to one row
        # per SNP and hand over a plain list of rows.
        snp_rows = list(sp.transpose(sp.array(offspring_genotypes, dtype='int8')))
        sds.append(snpsdata.SNPsData(snp_rows, sd.positions, accessions=offspring_names,
                                     chromosome=i + 1))

    diploid_data = snpsdata.SNPsDataSet(sds, [1, 2, 3, 4, 5])
    diploid_data.writeToFile(file_name)
def telomere_example_plots(debug_filter=1.0, pid=1365, call_method_id=78, radius=20000, kinship_method='ibs'):
    """
    Run the gene-centred local-vs-global scan for a fixed list of eleven
    genes (AT1G21390 ... AT1G21490) and plot a phenotype histogram.

    Parameters:
      debug_filter    values below 1 are passed to sd.sample_snps.
      pid             phenotype id.
      call_method_id  SNP call method id.
      radius          radius handed to lm.local_vs_global_gene_mm_scan.
      kinship_method  kinship method handed to the scan.

    Output goes to env.env['results_dir']; the scan's return value is not
    used.
    """
    genes_of_interest = [
        'AT1G21390', 'AT1G21400', 'AT1G21410', 'AT1G21420',
        'AT1G21430', 'AT1G21440', 'AT1G21450', 'AT1G21460',
        'AT1G21470', 'AT1G21480', 'AT1G21490',
    ]

    # Genotypes (optionally sub-sampled).
    sd = dp.load_snps_call_method(call_method_id)
    if debug_filter < 1:
        sd.sample_snps(debug_filter)

    # Phenotypes: averaged, matched to the genotypes, then transformed.
    phed = pd.parse_phenotype_file(env.env['phen_dir'] + 'phen_with_swedish_082211.csv')
    phed.convert_to_averages()
    phen_name = phed.get_name(pid)
    sd.coordinate_w_phenotype_data(phed, pid)
    phed.transform(pid, 'most_normal')

    histogram_name = 'histogram_%s_hist.png' % phed.get_name(pid)
    phed.plot_histogram(pid, png_file=env.env['results_dir'] + histogram_name)

    phen_vals = phed.get_values(pid)
    scan_name = 'loc_v_glob_gene_%d_%d_%d_%s' % (call_method_id, radius, pid, phen_name)
    file_prefix = env.env['results_dir'] + scan_name
    scan_options = dict(tair_ids=genes_of_interest, plot_gene_trees=True, ets=sd.accessions)
    lm.local_vs_global_gene_mm_scan(phen_vals, sd, file_prefix, radius, kinship_method,
                                    **scan_options)
def _perform_gwas_(phen_id, phenData, analysis_method, transformation, genotype, kinship_type, kinshipFile=None, messenger=None, outputfile=None):
    """
    Run one GWAS for a phenotype and save the result as an HDF5 p-value file.

    Parameters:
      phen_id          phenotype id; also used for the default output name.
      phenData         phenotype data object (get_values is called on it).
      analysis_method  'kw', 'emma', 'emmax', 'amm' or 'lm'.
      transformation   passed through to _save_hdf5_pval_file.
      genotype         call method id of the genotype data.
      kinship_type     kinship method used when no kinship file is supplied.
      kinshipFile      optional path of a kinship file to load instead.
      messenger        optional object with an update_status(**kwargs)
                       method; when None no status updates are sent
                       (previously a None messenger raised AttributeError).
      outputfile       output path; defaults to "<phen_id>.hdf5".

    Raises:
      NotImplementedError  for analysis_method 'loc_glob_mm'.
      Exception            for any other unsupported analysis_method
                           (raised after the genotype data has been loaded).
    """
    def _notify(**status):
        # Status reporting is optional: `messenger` defaults to None.
        if messenger is not None:
            messenger.update_status(**status)

    additional_columns = {}
    _notify(progress=0.0, task_status='Loading genotype data')
    genotypeData = dataParsers.load_snps_call_method(genotype)
    K = None
    _notify(step=0.05, task_status='Preparing data')
    n_filtered_snps = _prepare_data_(genotypeData, phenData, phen_id)
    phen_vals = phenData.get_values(phen_id)

    # Methods in this list get a kinship matrix.
    if analysis_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm', 'amm']:
        sys.stdout.write("Retrieving the Kinship matrix K.\n")
        sys.stdout.flush()
        if kinshipFile:  #Kinship file was supplied..
            _notify(progress=0.15,
                    task_status='Loading supplied kinship file: %s' % kinshipFile)
            print('Loading supplied kinship file: %s' % kinshipFile)
            K = kinship.load_kinship_from_file(kinshipFile, genotypeData.accessions)
        else:
            _notify(progress=0.15, task_status='Loading kinship file')
            print('Loading kinship file.')
            K = kinship.get_kinship(call_method_id=genotype, method=kinship_type,
                                    n_removed_snps=n_filtered_snps,
                                    remain_accessions=genotypeData.accessions)
        # Write first, then flush, so the message is actually pushed out.
        sys.stdout.write("Done!\n")
        sys.stdout.flush()

    snps = genotypeData.getSnps()
    positions = genotypeData.getPositions()
    # One chromosome label per SNP, in the same order as `snps`.
    chromosomes = []
    for snps_data, chrom in zip(genotypeData.snpsDataList, genotypeData.chromosomes):
        chromosomes.extend([chrom] * len(snps_data.snps))
    maf_dict = genotypeData.get_mafs()

    if analysis_method == 'kw':
        _notify(progress=0.7, task_status='Performing KW')
        res = util.kruskal_wallis(snps, phen_vals)
    elif analysis_method == 'loc_glob_mm':
        raise NotImplementedError
    elif analysis_method == 'emma':
        res = lm.emma(snps, phen_vals, K)
    elif analysis_method in ['emmax', 'amm']:
        res = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100)['res']
    elif analysis_method == 'lm':
        res = lm.lin_reg_step(phen_vals, genotypeData, [])['res']
    else:
        raise Exception('analysis method %s not supported' % analysis_method)

    pvals = res['ps']

    #Calculate Benjamini-Hochberg threshold
    bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05)

    #Calculate Median p-value
    med_pval = agr.calc_median(res['ps'])

    #Calculate the Kolmogorov-Smirnov statistic
    ks_res = agr.calc_ks_stats(res['ps'])

    quantiles_dict = _calculate_qqplot_data_(pvals)
    # NOTE(review): math.log10 raises ValueError for a p-value of exactly 0.
    scores = [-math.log10(p) for p in pvals]

    if analysis_method in ['lm', 'emma', 'emmax', 'amm']:
        additional_columns['genotype_var_perc'] = res['var_perc']
        if 'betas' in res:
            # Transpose per-SNP beta vectors into one column per coefficient.
            betas = [list(column) for column in zip(*res['betas'])]
            if len(betas) > 0:
                additional_columns['beta0'] = betas[0]
            if len(betas) > 1:
                additional_columns['beta1'] = betas[1]

    if outputfile is None:
        outputfile = "%s.hdf5" % phen_id
    _notify(progress=0.8, task_status='Processing and saving results')
    _save_hdf5_pval_file(outputfile, analysis_method, transformation, chromosomes, positions,
                         scores, maf_dict['marfs'], maf_dict['mafs'], quantiles_dict, ks_res,
                         bh_thres_d['thes_pval'], med_pval, additional_columns)
def load_and_plot_info_files(call_method_id=75, temperature=10, mac_threshold=15, debug_filter=1, near_const_filter=20, data_format='binary'):
    """
    Summarise per-gene RNA-seq GWAS result files and plot the summaries.

    For every phenotype id returned by phed.get_pids() the function looks
    for a pickled info file plus three pickled result files (mapping
    methods 'EX', 'LM' and 'KW') below the hard-coded directory
    '/srv/lab/data/rna_seq_083011/<temperature>C/cm_<call_method_id>/'.
    Genes for which all four files exist contribute to the summary
    statistics.  Afterwards one pickled dictionary per mapping method is
    written into that directory and a series of PNG figures is saved with
    the prefix env['results_dir'] + 'RNAseq_summary_<temperature>C_cm<id>'.

    Parameters:
      call_method_id     SNP call method id; used for loading and in paths.
      temperature        used in the phenotype file name and in paths.
      mac_threshold      only used to build file names here.
      debug_filter       a gene is skipped when random.random() > debug_filter.
      near_const_filter  passed to phed.filter_near_const_phens.
      data_format        passed to dp.load_snps_call_method.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source, so the nesting shown here is inferred from syntax and data
    flow -- confirm it against version control before relying on it.
    """
    import random

    phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature)
    phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
    phed.filter_near_const_phens(near_const_filter)
    phed.convert_to_averages()
    num_traits = phed.num_traits()
    pids = phed.phen_ids
    # The SNP data is loaded with a fixed debug_filter of 0.01; below it is
    # only used to coordinate accessions with the phenotype data.
    sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=0.01)
    indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)  #All phenotypes are ordered the same way, so we pick the first one.
    phed.filter_ecotypes(indices_to_keep, pids=pids)

    print 'Loading the gene annotation dictionary'
    gene_dict = dp.parse_tair_gff_file()
    run_id = 'd081511'
    #run_id = 'rs_%d' % call_method_id
    file_prefix = '/srv/lab/data/rna_seq_083011/%dC/cm_%d/' % (temperature, call_method_id)

    num_genes = 0  # number of genes for which all input files were found

    # Distances are indexed by position in these lists throughout.
    radii = [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]
    tss_dists = [200000, 100000, 50000, 25000, 10000, 5000, 1000]
    # Running sums (turned into averages after the main loop) plus the number
    # of contributing genes per distance.
    cvt_summary_dict = {'radius':{'avg_cis_trans_var_ratio':[0.0 for r in radii], 'avg_cis_herit':[0.0 for r in radii], 'avg_trans_herit':[0.0 for r in radii], 'counts':[0.0 for td in radii]},
                'radius_herit':{'avg_cis_trans_var_ratio':[0.0 for r in radii], 'avg_cis_herit':[0.0 for r in radii], 'avg_trans_herit':[0.0 for r in radii], 'counts':[0.0 for td in radii]},
                'tss_dist':{'avg_cis_trans_var_ratio':[0.0 for td in tss_dists], 'avg_cis_herit':[0.0 for td in tss_dists], 'avg_trans_herit':[0.0 for td in tss_dists], 'counts':[0.0 for td in tss_dists]}}

    heritabilities = []
    transformations = []
    shapiro_wilk_pvals = []
    tair_ids = []
    pval_infl_dict = {}
    dist_min_pval_dict = {}
    # Bins are tuples compared against tuple(info_dict[mm]['dist_to_min_pval']).
    distance_bins = [(0, 5000), (0, 10000), (0, 25000), (0, 50000), (0, 100000), (1, -1), (6, -1)]
    radius_bins = [0, 1000, 5000, 10000, 25000, 50000, 100000]
    bonf_sign_bin_dict = {}
    res_dict = {}
    sign_count = {}
    for mm in ['EX', 'LM', 'KW']:
        pval_infl_dict[mm] = {'kolmogorov_smirnov':[], 'median_pvals':[]}
        dist_min_pval_dict[mm] = {}
        for bin in distance_bins:
            dist_min_pval_dict[mm][bin] = 0
        bonf_sign_bin_dict[mm] = {}
        for bin in radius_bins:
            bonf_sign_bin_dict[mm][bin] = {'count':0.0, 'total':0.0}
        sign_count[mm] = 0

    cofactor_count_dict = {}
    for criteria in ['ebics', 'mbonf', 'min_cof_ppa']:
        cofactor_count_dict[criteria] = {'num_cofactor_list':[], 'bin_counts':sp.zeros(9), 'num_cis_cofactor_list':[], 'num_found':0}

    # One output pickle per mapping method.
    pickle_file_dict = {}
    for mm in ['EX', 'LM', 'KW']:
        pickle_file_dict[mm] = {}
        pickle_file_dict[mm]['file_name'] = '%sresults_%s_mac%d.pickled' % (file_prefix, mm, mac_threshold)
        pickle_file_dict[mm]['res_dict'] = {}

    pids = phed.get_pids()
    for i, pid in enumerate(pids):
        tair_id = phed.get_name(pid)
        # The third character of the phenotype name is used as the chromosome.
        chrom = int(tair_id[2])
        curr_file_prefix = '%schr_%d/rna_seq_%s_%dC_mac%d_pid%d_%s' % \
            (file_prefix, chrom, run_id, temperature, mac_threshold, pid, tair_id)
        info_file_name = '%s_info.pickled' % curr_file_prefix
        # res_dict[mm] first holds the result file name; it is replaced by the
        # loaded gr.Result object once the files are known to exist.
        for mm in ['EX', 'LM', 'KW']:
            res_dict[mm] = '%s_%s_.pvals' % (curr_file_prefix, mm)
        if random.random() > debug_filter:
            continue
        if os.path.isfile(info_file_name) and os.path.isfile(res_dict['EX'] + ".pickled") \
            and os.path.isfile(res_dict['LM'] + ".pickled") and os.path.isfile(res_dict['KW'] + ".pickled"):
            print 'Loading info file: %s' % info_file_name
            num_genes += 1
            # NOTE(review): the file object is never closed explicitly and is
            # opened in text mode.
            info_dict = cPickle.load(open(info_file_name))  #Loading the info dict
            for mm in ['EX', 'LM', 'KW']:
                res_dict[mm] = gr.Result(res_dict[mm])  #Loading the result

            #Saving some basic statistics
            transformations.append(info_dict['transformation_type'])
            shapiro_wilk_pvals.append(info_dict['transformation_shapiro_pval'])
            heritabilities.append(info_dict['pseudo_heritability'])

            #cis vs. trans stuff
            cvt_dict = info_dict['CVT']
            # All genes with an entry for this radius.
            for r_i, r in enumerate(radii):
                if cvt_dict['radius'][r] != None:
                    pvg = cvt_dict['radius'][r]['perc_var1']
                    pvl = cvt_dict['radius'][r]['perc_var2']
                    herit = cvt_dict['radius'][r]['pseudo_heritability1']
                    cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] += pvl / (pvl + pvg)
                    cvt_summary_dict['radius']['avg_cis_herit'][r_i] += pvl * herit
                    cvt_summary_dict['radius']['avg_trans_herit'][r_i] += pvg * herit
                    cvt_summary_dict['radius']['counts'][r_i] += 1.0

            # Same sums, restricted to entries with pseudo-heritability > 0.05.
            for r_i, r in enumerate(radii):
                if cvt_dict['radius'][r] != None:
                    herit = cvt_dict['radius'][r]['pseudo_heritability1']
                    if herit > 0.05:
                        pvg = cvt_dict['radius'][r]['perc_var1']
                        pvl = cvt_dict['radius'][r]['perc_var2']
                        cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] += pvl / (pvl + pvg)
                        cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] += pvl * herit
                        cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] += pvg * herit
                        cvt_summary_dict['radius_herit']['counts'][r_i] += 1.0

            # Same sums for the 'tss_upstream' entries.
            for td_i, td in enumerate(tss_dists):
                if cvt_dict['tss_upstream'][td] != None:
                    pvg = cvt_dict['tss_upstream'][td]['perc_var1']
                    pvl = cvt_dict['tss_upstream'][td]['perc_var2']
                    herit = cvt_dict['tss_upstream'][td]['pseudo_heritability1']
                    cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] += pvl / (pvl + pvg)
                    cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] += pvl * herit
                    cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] += pvg * herit
                    cvt_summary_dict['tss_dist']['counts'][td_i] += 1.0

            tair_ids.append(tair_id)
            for mm in ['EX', 'LM', 'KW']:
                pval_infl_dict[mm]['kolmogorov_smirnov'].append(info_dict[mm]['kolmogorov_smirnov']['D'])
                pval_infl_dict[mm]['median_pvals'].append(info_dict[mm]['pval_median'])
                dist_min_pval = tuple(info_dict[mm]['dist_to_min_pval'])
                # Threshold is 1 / (20 * number of scores), i.e. 0.05 / n.
                if res_dict[mm].min_score() < 1 / (20.0 * res_dict[mm].num_scores()):
                    sign_count[mm] += 1
                # Count the gene in the first bin that is >= its distance tuple.
                for bin in distance_bins:
                    if dist_min_pval <= bin:
                        dist_min_pval_dict[mm][bin] += 1
                        break

                for bin in radius_bins:
                    pval = info_dict[mm]['bin_dict'][bin]['min_pval']
                    num_snps = info_dict[mm]['bin_dict'][bin]['num_snps']
                    if num_snps > 0:
                        bonf_sign_bin_dict[mm][bin]['total'] += 1
                        if pval < 1.0 / (20 * num_snps):
                            bonf_sign_bin_dict[mm][bin]['count'] += 1

            #Stepwise stuff
            for criteria in ['ebics', 'mbonf', 'min_cof_ppa']:
                num_cofactors = len(info_dict['SW'][criteria]['cofactors'])
                cofactor_count_dict[criteria]['num_cofactor_list'].append(num_cofactors)
                # NOTE(review): whether the last two statements belong inside
                # this `if` could not be determined from the collapsed source.
                if num_cofactors > 0:
                    cofactor_count_dict[criteria]['num_found'] += 1
                    cofactor_count_dict[criteria]['bin_counts'] += sp.array(info_dict['SW'][criteria]['bin_counts'])
                    cofactor_count_dict[criteria]['num_cis_cofactor_list'].append(info_dict['SW'][criteria]['bin_counts'][2])

            #Pre-process the results..
            for mm in ['EX', 'LM', 'KW']:
                res = res_dict[mm]
                #Trim results
                res.neg_log_trans()
                if mm == 'EX':
                    res.filter_attr('scores', 3)  #Filter everything below 10^-2.5
                else:
                    res.filter_attr('scores', 4)  #Filter everything below 10^-4
                if res.num_scores() == 0:
                    print "Skipping file since nothing is below 10^-5"
                    continue

                gene_d = gene_dict[tair_id]
                avg_g_pos = (gene_d['start_pos'] + gene_d['end_pos']) / 2.0
                chrom = int(gene_d['chromosome'])  #Current gene chromosome

                #Prepare for plotting results.. x,y style, where gene is x, and y is p-values
                chrom_pos_score_dict = res.get_chrom_score_pos_dict()

                # Values stored per threshold: -2 = no scores left,
                # -1 = no positions on the gene's chromosome,
                # otherwise the smallest distance to the gene midpoint.
                dist_dict = {}
                for score_threshold in [5, 6, 7]:  #negative log10 thresholds.
                    if len(res.snp_results['scores']) == 0:
                        dist_dict[score_threshold] = -2  #No results
                    else:
                        res.filter_attr('scores', score_threshold)
                        if len(res.snp_results['scores']) == 0:
                            dist_dict[score_threshold] = -2  #No results
                        else:
                            cps_dict = res.get_chrom_score_pos_dict()
                            pos_list = cps_dict[chrom]['positions']
                            if len(pos_list) > 0:
                                distances = sp.absolute(sp.array(pos_list) - avg_g_pos)
                                d_i = sp.argmin(distances)
                                dist_dict[score_threshold] = distances[d_i]  #Min distance.
                            else:
                                dist_dict[score_threshold] = -1  #Different chromosome

                # Keyed by (gene chromosome, gene midpoint).
                pickle_file_dict[mm]['res_dict'][(chrom, avg_g_pos)] = {'tair_id':tair_id, 'chrom_pos_score':chrom_pos_score_dict, 'dist_dict':dist_dict, 'pid':pid}
                print dist_dict
        else:
            print "Didn't find file: %s or %s" % (info_file_name, res_dict['EX'] + ".pickled")

    # NOTE(review): the output file objects are never closed explicitly.
    for mm in ['EX', 'LM', 'KW']:
        cPickle.dump(pickle_file_dict[mm]['res_dict'], open(pickle_file_dict[mm]['file_name'], 'wb'), protocol=2)

    # Turn the running sums into averages.
    # NOTE(review): a count that is still 0.0 makes these divisions raise
    # ZeroDivisionError (e.g. when no input files were found).
    for r_i, r in enumerate(radii):
        r_counts = cvt_summary_dict['radius']['counts'][r_i]
        cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] = \
            cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] / r_counts
        cvt_summary_dict['radius']['avg_cis_herit'][r_i] = \
            cvt_summary_dict['radius']['avg_cis_herit'][r_i] / r_counts
        cvt_summary_dict['radius']['avg_trans_herit'][r_i] = \
            cvt_summary_dict['radius']['avg_trans_herit'][r_i] / r_counts

    for r_i, r in enumerate(radii):
        r_counts = cvt_summary_dict['radius_herit']['counts'][r_i]
        cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] = \
            cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] / r_counts
        cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] = \
            cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] / r_counts
        cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] = \
            cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] / r_counts

    for td_i, td in enumerate(tss_dists):
        td_counts = cvt_summary_dict['tss_dist']['counts'][td_i]
        cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] = \
            cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] / td_counts
        cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] = \
            cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] / td_counts
        cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] = \
            cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] / td_counts

    # NOTE(review): other functions index `env.env[...]`; here `env` itself is
    # indexed -- presumably a different import style, confirm.
    results_prefix = env['results_dir'] + 'RNAseq_summary_%dC_cm%d' % (temperature, call_method_id)

    # Average cis fraction of genetic variance vs. radius.
    pylab.figure()
    pylab.plot(cvt_summary_dict['radius']['avg_cis_trans_var_ratio'])
    pylab.ylabel('Avg. perc. of cis genetic var.')
    pylab.xlabel('Dist. from gene (kb)')
    pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
    pylab.savefig(results_prefix + '_avg_perc_cis_gen_var_rad.png')
    pylab.clf()

    # Average cis fraction of genetic variance vs. upstream TSS distance.
    pylab.figure()
    pylab.plot(cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'])
    pylab.ylabel('Avg. perc. of cis genetic var.')
    pylab.xlabel('Dist. upstream from gene TSS (kb)')
    pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
    pylab.savefig(results_prefix + '_avg_perc_cis_gen_var_td.png')
    pylab.clf()

#    pylab.figure()
#    pylab.plot(cvt_summary_dict['tss_dist']['avg_cis_herit'])
#    pylab.ylabel('Avg. cis heritability')
#    pylab.xlabel('Dist. upstream from gene TSS (kb)')
#    pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
#    pylab.savefig(results_prefix + 'avg_cis_herit_td.png')
#    pylab.clf()
#
#
#    pylab.figure()
#    pylab.plot(cvt_summary_dict['tss_dist']['avg_trans_herit'])
#    pylab.ylabel('Avg. remaining heritability')
#    pylab.xlabel('Dist. upstream from gene TSS (kb)')
#    pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
#    pylab.savefig(results_prefix + 'avg_trans_herit_td.png')
#    pylab.clf()

#    pylab.figure()
#    pylab.plot(cvt_summary_dict['radius']['avg_trans_herit'])
#    pylab.ylabel('Avg. remaining heritability')
#    pylab.xlabel('Dist. from gene (kb)')
#    pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
#    pylab.savefig(results_prefix + 'avg_trans_herit_rad.png')
#    pylab.clf()
#
#    pylab.figure()
#    pylab.plot(cvt_summary_dict['radius']['avg_cis_herit'])
#    pylab.ylabel('Avg. cis heritability')
#    pylab.xlabel('Dist. from gene (kb)')
#    pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
#    pylab.savefig(results_prefix + 'avg_cis_herit_rad.png')
#    pylab.clf()

    # Stacked partition of variance vs. radius (all genes).
    tot_herit = sp.array(cvt_summary_dict['radius']['avg_cis_herit']) + \
        sp.array(cvt_summary_dict['radius']['avg_trans_herit'])
    cis_herit = sp.array(cvt_summary_dict['radius']['avg_cis_herit'])
    pylab.figure(figsize=(10, 6))
    pylab.axes([0.06, 0.08, 0.92, 0.90])
    pylab.fill_between([0, 7], 0, 1, color='#DD3333', alpha=0.8, label='Error')
    pylab.fill_between(sp.arange(8), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance')
    pylab.fill_between(sp.arange(8), 0, cis_herit, color='#2255AA', \
                alpha=0.8, label='Heritable variance (cis)')
    pylab.ylabel('Average partition of variance')
    pylab.xlabel('Dist. from gene (kb)')
    pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
    pylab.legend(loc=1, ncol=3, shadow=True)
    pylab.axis([0, 7, 0, 1])
    pylab.savefig(results_prefix + 'avg_herit_rad.png')

    # Same figure for the 'radius_herit' sums (pseudo-heritability > 0.05).
    tot_herit = sp.array(cvt_summary_dict['radius_herit']['avg_cis_herit']) + \
        sp.array(cvt_summary_dict['radius_herit']['avg_trans_herit'])
    cis_herit = sp.array(cvt_summary_dict['radius_herit']['avg_cis_herit'])
    pylab.figure(figsize=(10, 6))
    pylab.axes([0.06, 0.08, 0.92, 0.90])
    pylab.fill_between([0, 7], 0, 1, color='#DD3333', alpha=0.8, label='Error')
    pylab.fill_between(sp.arange(8), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance')
    pylab.fill_between(sp.arange(8), 0, cis_herit, color='#2255AA', \
                alpha=0.8, label='Heritable variance (cis)')
    pylab.ylabel('Average partition of variance')
    pylab.xlabel('Dist. from gene (kb)')
    pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0])
    pylab.legend(loc=1, ncol=3, shadow=True)
    pylab.axis([0, 7, 0, 1])
    pylab.savefig(results_prefix + 'avg_herit_2_rad.png')

    # Same figure for the upstream TSS distances.
    tot_herit = sp.array(cvt_summary_dict['tss_dist']['avg_cis_herit']) + \
        sp.array(cvt_summary_dict['tss_dist']['avg_trans_herit'])
    cis_herit = sp.array(cvt_summary_dict['tss_dist']['avg_cis_herit'])
    pylab.figure(figsize=(10, 6))
    pylab.axes([0.06, 0.08, 0.92, 0.90])
    pylab.fill_between([0, 6], 0, 1, color='#DD3333', alpha=0.8, label='Error')
    pylab.fill_between(sp.arange(7), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance')
    pylab.fill_between(sp.arange(7), 0, cis_herit, color='#2255AA', \
                alpha=0.8, label='Heritable variance (cis)')
    pylab.ylabel('Average partition of variance')
    pylab.xlabel('Dist. upstream from gene TSS (kb)')
    pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1])
    pylab.legend(loc=1, ncol=3, shadow=True)
    pylab.axis([0, 6, 0, 1])
    pylab.savefig(results_prefix + 'avg_herit_td.png')

    # Histogram of the collected pseudo-heritabilities.
    pylab.figure()
    pylab.hist(heritabilities, bins=20, alpha=0.7)
    pylab.xlabel('Pseudo-heritability')
    pylab.xlim((-0.025, 1.025))
    pylab.savefig(results_prefix + '_herits_hist.png')
    pylab.clf()

    # Box plots of the collected KS statistics and median p-values,
    # one box per mapping method.
    ks_list = []
    pm_list = []
    for mm in ['EX', 'LM', 'KW']:
        ks_list.append(pval_infl_dict[mm]['kolmogorov_smirnov'])
        pm_list.append(pval_infl_dict[mm]['median_pvals'])

    png_file_name = results_prefix + '_kolmogorov_smirnov_boxplot.png'
    pylab.figure()
    pylab.boxplot(ks_list)
    pylab.axhline(0, color='k', alpha=0.6, ls='-.')
    pylab.xticks(range(1, 4), ['EX', 'LM', 'KW'])
    pylab.ylabel('Kolmogorov-Smirnov statistic D.')
    pylab.savefig(png_file_name)
    pylab.clf()

    png_file_name = results_prefix + '_median_pvals_boxplot.png'
    pylab.figure()
    pylab.boxplot(pm_list)
    pylab.axhline(0, color='k', alpha=0.6, ls='-.')
    pylab.xticks(range(1, 4), ['EX', 'LM', 'KW'])
    pylab.ylabel('Median p-value bias')
    pylab.savefig(png_file_name)
    pylab.clf()

    # Grouped bars: x_positions is shifted by `width` after each method so the
    # three methods are drawn side by side.
    x_positions = sp.arange(len(distance_bins), dtype='d64')
    width = 0.25
    png_file_name = results_prefix + '_dist_min_pval_hist.png'
    pylab.axes([0.08, 0.2, 0.91, 0.75])
    for mm, color in zip(['EX', 'LM', 'KW'], ['b', 'c', 'g']):
        l = [dist_min_pval_dict[mm][bin] for bin in distance_bins]
        tot_sum = sum(l)
        # NOTE(review): raises ZeroDivisionError when tot_sum is 0.
        l = map(lambda x: x / float(tot_sum), l)
        pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=mm)
        x_positions += width

    pylab.ylabel('Frequency')
    pylab.xticks(x_positions - 3 * width / 2.0, (r'$d \leq 5$', r'$5< d \leq 10$', r'$10< d \leq 25$', \
                        r'$25< d \leq 50$', r'$50< d \leq 100$', r'$d>100$', \
                        'Other chrom.'), rotation='45')
    pylab.xlabel('Distance $d$ (kb) to the smallest p-value from the gene.')
    pylab.xlim((-0.25, len(distance_bins)))
    pylab.legend(loc=2)
    pylab.savefig(png_file_name)
    pylab.clf()

    # Fraction of significant results per radius bin, plus a whole-genome bar.
    x_positions = sp.arange(len(radius_bins) + 1, dtype='d64')
    width = 0.25
    png_file_name = results_prefix + 'bonf_sign_bin_hist.png'
    pylab.axes([0.08, 0.22, 0.91, 0.73])
    for mm, color in zip(['EX', 'LM', 'KW'], ['b', 'c', 'g']):
        # NOTE(review): divides by 'total' and by num_genes, both of which can
        # still be zero.
        l = [bonf_sign_bin_dict[mm][bin]['count'] / bonf_sign_bin_dict[mm][bin]['total'] for bin in radius_bins]
        l.append(sign_count[mm] / float(num_genes))
        pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=mm)
        x_positions += width

    pylab.ylabel('Fraction of sign. results')
    pylab.xticks(x_positions - 3 * width / 2.0, ('Within gene', r'$d \leq 1$', r'$d \leq 5$', \
                        r'$d \leq 10$', r'$d \leq 25$', r'$d \leq 50$', \
                        r'$d \leq 100$', 'Whole genome'), rotation='45')
    pylab.xlabel(r'Among SNPs with distance $d$ (kb) from gene.')
    pylab.xlim((-0.25, len(radius_bins) + 1))
    pylab.legend(loc=2)
    pylab.savefig(png_file_name)
    pylab.clf()

    # Histogram of the number of cofactors per gene, one series per criterion.
    png_file_name = results_prefix + 'cofactor_count_hist.png'
    x_positions = sp.arange(6, dtype='d64')
    width = 0.25
    for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']):
        bin_counts = list(sp.bincount(cofactor_count_dict[criteria]['num_cofactor_list']))
        # Pad to at least six bars (0..5 cofactors).
        while len(bin_counts) < 6:
            bin_counts.append(0)
        pylab.bar(x_positions, bin_counts, width, color=color, alpha=0.7, label=criteria)
        x_positions += width
    pylab.xlabel('Number of cofactor SNPs')
    pylab.ylabel('Number of genes')
    pylab.xticks(x_positions - 3 * width / 2.0, ('0', '1', '2', '3', '4', '5'))
    pylab.legend(loc=1)
    pylab.xlim((-0.2, 6))
    pylab.savefig(png_file_name)
    pylab.clf()

    # Same histogram for the collected 'num_cis_cofactor_list' values.
    png_file_name = results_prefix + 'cis_cofactor_count_hist.png'
    x_positions = sp.arange(6, dtype='d64')
    for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']):
        bin_counts = list(sp.bincount(cofactor_count_dict[criteria]['num_cis_cofactor_list']))
        while len(bin_counts) < 6:
            bin_counts.append(0)
        pylab.bar(x_positions, bin_counts, width, color=color, alpha=0.7, label=criteria)
        x_positions += width
    pylab.xlabel('Number of cis cofactor SNPs')
    pylab.ylabel('Number of genes')
    pylab.xticks(x_positions - 3 * width / 2.0, ('0', '1', '2', '3', '4', '5'))
    pylab.legend(loc=1)
    pylab.xlim((-0.2, 6))
    pylab.savefig(png_file_name)
    pylab.clf()

    # Cofactor distance-bin counts, normalised by the number of genes that had
    # at least one cofactor and plotted in reversed bin order.
    png_file_name = results_prefix + 'cofactor_bin_count_hist.png'
    x_positions = sp.arange(9, dtype='d64')
    width = 0.25
    pylab.axes([0.08, 0.2, 0.91, 0.75])
    for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']):
        cofactor_count_dict[criteria]['bin_counts'] = \
            cofactor_count_dict[criteria]['bin_counts'] / cofactor_count_dict[criteria]['num_found']
        l = list(cofactor_count_dict[criteria]['bin_counts'])
        l.reverse()
        pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=criteria)
        x_positions += width
    pylab.ylabel('Fraction all genes with cofactors.')
    pylab.xlabel(r'Distance $d$ (kb) to cofactor from gene.')
    pylab.xticks(x_positions - 3 * width / 2.0, ('Within gene', r'$1\geq d$', r'$5\geq d$', r'$10\geq d$', \
                        r'$25\geq d$', r'$50\geq d$', r'$100\geq d$', \
                        r'$d>100$', 'Other chrom.'), rotation='45')
    pylab.xlim((-0.2, 9))
    pylab.legend(loc=2)
    pylab.savefig(png_file_name)
    pylab.clf()
def plot(temperature=10, call_method_id=75, mapping_method='EX', mac_threshold=15, min_score=5, near_const_filter=20, data_format='binary', plot_data=True):
    """
    Plot (or dump as text) gene position vs. associated-SNP position.

    Reads the pickled summary
    '/srv/lab/data/rna_seq_083011/<temperature>C/cm_<call_method_id>/
    results_<mapping_method>_mac<mac_threshold>.pickled', whose keys are
    (gene chromosome, gene position) tuples, and keeps every score above
    min_score (scores are capped at 25).

    Parameters:
      temperature, call_method_id, mapping_method, mac_threshold
                         select the summary pickle and the output names.
      min_score          only scores strictly above this value are kept.
      near_const_filter  passed to phed.filter_near_const_phens.
      data_format        passed to dp.load_snps_call_method.
      plot_data          True:  save a 5x5 grid of scatter plots as one PNG
                                in '<prefix>result_plots/'.
                         False: write one CSV-like text file per chromosome
                                pair into '<prefix>result_plots/' instead.

    When plot_data is true and no score exceeds min_score, a message is
    printed and nothing is plotted (previously max() raised ValueError).
    """
    #Load in chromosome dict..
    file_prefix = '/srv/lab/data/rna_seq_083011/%dC/cm_%d/' % (temperature, call_method_id)
    results_dict_file = '%sresults_%s_mac%d.pickled' % (file_prefix, mapping_method, mac_threshold)
    # Binary mode plus a context manager, so the handle is closed promptly.
    # NOTE(review): unpickling is only safe for trusted files.
    with open(results_dict_file, 'rb') as results_file:
        res_dict = cPickle.load(results_file)

    # NOTE(review): the phenotype and genotype objects loaded here are not
    # used further down; the calls are kept in case they are relied on for
    # their side effects -- confirm and remove if not.
    phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature)
    phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
    phed.filter_near_const_phens(near_const_filter)
    phed.convert_to_averages()
    num_traits = phed.num_traits()
    pids = phed.phen_ids
    sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format,
                                  debug_filter=0.01)
    #All phenotypes are ordered the same way, so we pick the first one.
    indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)
    phed.filter_ecotypes(indices_to_keep, pids=pids)

    chromosomes = [1, 2, 3, 4, 5]

    # One bucket per (gene chromosome, SNP chromosome) pair.
    chrom_dict = {}
    for x_chrom in chromosomes:
        for y_chrom in chromosomes:
            chrom_dict[(x_chrom, y_chrom)] = {'scores':[], 'x_positions':[], 'y_positions':[],
                                              'tair_ids':[], 'r2':[], 'mac':[]}

    scores = []
    for x_chrom, x_pos in res_dict:
        d = res_dict[(x_chrom, x_pos)]
        tair_id = d['tair_id']
        for y_chrom in chromosomes:
            cps_d = d['chrom_pos_score'][y_chrom]
            bucket = chrom_dict[(x_chrom, y_chrom)]
            for s, y_pos in zip(cps_d['scores'], cps_d['positions']):
                if s > min_score:
                    if s > 25:
                        s = 25  # cap the score
                    scores.append(s)
                    bucket['scores'].append(s)
                    bucket['tair_ids'].append(tair_id)
                    bucket['x_positions'].append(x_pos)
                    bucket['y_positions'].append(y_pos)

    #Write chrom_dict to file..
    if not plot_data:
        for x_chrom in chromosomes:
            for y_chrom in chromosomes:
                file_name = file_prefix + 'result_plots/pvalues_chrom%d_chrom%d_%s_min%d.txt' % \
                    (x_chrom, y_chrom, mapping_method, min_score)
                print('Writing to file: %s' % file_name)
                with open(file_name, 'w') as out_file:
                    d = chrom_dict[(x_chrom, y_chrom)]
                    out_file.write('x_position, y_position, score, tair_id\n')
                    rows = sorted(zip(d['x_positions'], d['y_positions'], d['scores'],
                                      d['tair_ids']))
                    for row in rows:
                        out_file.write('%d,%d,%f,%s\n' % row)

    # Relative axes sizes/offsets so each panel is proportional to the
    # chromosome length, with a small gap between panels.
    chrom_sizes = [30425061, 19694800, 23456476, 18578714, 26974904]
    cum_chrom_sizes = [sum(chrom_sizes[:i]) for i in range(5)]
    tot_num_bases = float(sum(chrom_sizes))
    rel_chrom_sizes = [0.925 * (size / tot_num_bases) for size in chrom_sizes]
    rel_cum_chrom_sizes = [0.925 * (size / tot_num_bases) + 0.02 + 0.01 * i
                           for i, size in enumerate(cum_chrom_sizes)]
    chromosome_ends = {1:30.425061, 2:19.694800, 3:23.456476, 4:18.578714, 5:26.974904}
    print('%s %s' % (rel_chrom_sizes, rel_cum_chrom_sizes))

    #Now plot data!!
    if plot_data:
        if not scores:
            # Nothing passed the threshold: there is no colour scale to
            # compute and no scatter plot to attach a colour bar to.
            print('No scores above min_score=%s; nothing to plot.' % min_score)
            return

        alpha = 0.8
        linewidths = 0
        vmin = min_score
        fig = pylab.figure(figsize=(40, 35))
        plot_file_name = file_prefix + 'result_plots/pvalues_%s_min%d.png' % \
            (mapping_method, min_score)
        label = '$-log_{10}$(p-value)'
        vmax = max(scores)

        for yi, chr2 in enumerate(chromosomes):
            for xi, chr1 in enumerate(chromosomes):
                bucket = chrom_dict[(chr1, chr2)]
                if len(bucket['scores']) == 0:
                    continue
                ax = fig.add_axes([0.96 * (rel_cum_chrom_sizes[xi] + 0.01),
                                   rel_cum_chrom_sizes[yi] - 0.02,
                                   0.96 * (rel_chrom_sizes[xi]),
                                   rel_chrom_sizes[yi]])
                ax.spines['right'].set_visible(False)
                ax.spines['bottom'].set_visible(False)

                # Only the first column carries a y axis ...
                if xi > 0:
                    ax.spines['left'].set_visible(False)
                    ax.yaxis.set_visible(False)
                else:
                    ax.yaxis.set_ticks_position('left')
                    ax.set_ylabel('Chromosome %d (Mb)' % chr2, fontsize='x-large')
                # ... and only the last row carries an x axis (drawn on top).
                if yi < 4:
                    ax.spines['top'].set_visible(False)
                    ax.xaxis.set_visible(False)
                else:
                    ax.xaxis.set_ticks_position('top')
                    ax.xaxis.set_label_position('top')
                    ax.set_xlabel('Chromosome %d (Mb)' % chr1, fontsize='x-large')

                # Sort by score so the highest scores are drawn last.
                l_zxy = sorted(zip(bucket['scores'], bucket['x_positions'],
                                   bucket['y_positions']))
                zs, raw_xs, raw_ys = [list(column) for column in zip(*l_zxy)]
                xs = [x / 1000000.0 for x in raw_xs]  # bases -> Mb
                ys = [y / 1000000.0 for y in raw_ys]

                scatter_plot = ax.scatter(xs, ys, c=zs, alpha=alpha, linewidths=linewidths,
                                          vmin=vmin, vmax=vmax)
                ax.axis([-0.025 * chromosome_ends[chr1], 1.025 * chromosome_ends[chr1],
                         - 0.025 * chromosome_ends[chr2], 1.025 * chromosome_ends[chr2]])

        # The colour bar is attached to the last panel drawn; all panels share
        # the same vmin/vmax.
        cax = fig.add_axes([0.965, 0.7, 0.01, 0.2])
        cb = pylab.colorbar(scatter_plot, cax=cax)
        cb.set_label(label, fontsize='xx-large')
        fig.text(0.005, 0.47, 'Associated SNP position', size='xx-large', rotation='vertical')
        fig.text(0.47, 0.988, 'Expressed gene position', size='xx-large')
        print('Saving figure: %s' % plot_file_name)
        fig.savefig(plot_file_name, format='png')
def run_gwas(file_prefix, phen_file, start_i, stop_i, temperature, mac_threshold=15, filter_threshold=0.02, call_method_id=79, data_format='diploid_int', debug_filter=1.0, near_const_filter=20): """ GWAS """ phed = pd.parse_phenotype_file(phen_file, with_db_ids=False) #load phenotype file phed.filter_near_const_phens(near_const_filter) phed.convert_to_averages() num_traits = phed.num_traits() pids = phed.phen_ids[start_i :stop_i] sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=debug_filter) indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False) #All phenotypes are ordered the same way, so we pick the first one. phed.filter_ecotypes(indices_to_keep, pids=pids) print len(sd.accessions) K = sd.get_ibs_kinship_matrix() #K = dp.load_kinship(call_method_id=call_method_id, data_format=data_format, sd=sd, method='ibs') sd.filter_mac_snps(mac_threshold) snps = sd.getSnps() positions = sd.getPositions() chromosomes = sd.get_chr_list() r = sd.get_mafs() macs = r['mafs'] mafs = r['marfs'] print 'In total there are %d SNPs to be mapped.' 
% len(snps) gene_dict = dp.parse_tair_gff_file()#_load_genes_list_('rna_seq_031311_%sC' % temperature) for i, pid in enumerate(pids): if not pid in phed.phen_ids: continue gene_tair_id = phed.get_name(pid) # exons = [] # for isoform in d: # for exon in isoform['exons']: # exons.append((d['chromosome'], exon['start_pos'], exon['end_pos'])) d = gene_dict[gene_tair_id] gene_strand = d['strand'] try: chrom = int(d['chromosome']) except Exception: raise gene = gwaResults.Gene(chromosome=int(d['chromosome']), startPos=d['start_pos'], endPos=d['end_pos'], name=gene_tair_id, description=None, dbRef=gene_tair_id, tairID=gene_tair_id) print i, pid, gene curr_file_prefix = '%s_mac%d_pid%d_%s' % (file_prefix, mac_threshold, pid, gene_tair_id) trans_type, shapiro_pval = phed.most_normal_transformation(pid) print 'Most normal transformation was: %s' % trans_type #trans_type = 'None' summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':shapiro_pval} #summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':0} print'Applying Kruskal-Wallis' phen_vals = phed.get_values(pid) res = util.kruskal_wallis(snps, phen_vals) pvals = res['ps'].tolist() kw_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes) print 'Summarizing KW' summary_dict['KW'] = kw_res.get_gene_analysis(gene) summary_dict['KW']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps']) summary_dict['KW']['pval_median'] = agr.calc_median(res['ps']) print 'Applying LM' res = lm.linear_model(snps, phen_vals) pvals = res['ps'].tolist() perc_var_expl = res['var_perc'].tolist() lm_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes, perc_var_expl=perc_var_expl) print 'Summarizing LM' summary_dict['LM'] = lm_res.get_gene_analysis(gene) summary_dict['LM']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps']) summary_dict['LM']['pval_median'] = agr.calc_median(res['ps']) print 'Applying EX 
Stepwise' snp_priors = sd.get_cand_genes_snp_priors([gene]) ex_sw_res = lm.emmax_step_wise(phen_vals, K, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes, snps=snps, num_steps=5, cand_gene_list=[gene], with_qq_plots=False, log_qq_max_val=6.0, save_pvals=True, snp_priors=snp_priors) print 'Summarizing the step-wise mixed model' pvals = ex_sw_res['first_emmax_res']['ps'].tolist() perc_var_expl = ex_sw_res['first_emmax_res']['var_perc'].tolist() ex_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes, perc_var_expl=perc_var_expl) summary_dict['EX'] = ex_res.get_gene_analysis(gene) summary_dict['pseudo_heritability'] = ex_sw_res['step_info_list'][0]['pseudo_heritability'] summary_dict['EX']['kolmogorov_smirnov'] = agr.calc_ks_stats(ex_sw_res['first_emmax_res']['ps']) summary_dict['EX']['pval_median'] = agr.calc_median(ex_sw_res['first_emmax_res']['ps']) #Does the linear mixed model fit the data better? summary_dict['MM_LRT'] = lm.mm_lrt_test(phen_vals, K) #FINISH summarizing the stepwise!!! summarize_stepwise(summary_dict, gene, ex_sw_res['step_info_list'], ex_sw_res['opt_dict']) cvt_dict = {'radius':{}, 'tss_upstream':{}} print 'Comparing cis vs. 
trans kinship' #Check 1 mb, 200kb, 100kb, 50kb, 20kb, 10kb, 2kb, 0kb for radius in [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]: print radius r_start_pos = max(gene.startPos - radius, 0) r_end_pos = gene.endPos + radius d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)], kinship_method='ibs', global_kinship=K) reg_k = d['regional_k'] glob_k = d['global_k'] if reg_k != None: cvt_dict['radius'][radius] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K) else: cvt_dict['radius'][radius] = None print cvt_dict['radius'][radius] #Check TSS, 100kb, 50kb,25kb, 10kb,5kb,0kb, (all upstream) for dist in [200000, 100000, 50000, 25000, 10000, 5000, 1000]: print dist, gene_strand if gene_strand == '+': r_start_pos = max(gene.startPos - dist, 0) r_end_pos = gene.startPos else: r_start_pos = gene.endPos r_end_pos = gene.endPos + dist d = sd.get_region_split_kinships([(chrom, r_start_pos, r_end_pos)], kinship_method='ibs', global_kinship=K) reg_k = d['regional_k'] glob_k = d['global_k'] if reg_k != None: cvt_dict['tss_upstream'][dist] = lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K) else: cvt_dict['tss_upstream'][dist] = None print cvt_dict['tss_upstream'][dist] summary_dict['CVT'] = cvt_dict #Write info to file.. cPickle.dump(summary_dict, open(curr_file_prefix + '_info.pickled', 'w'), protocol=2) f_prefix = curr_file_prefix + '_hist' phed.plot_histogram(pid, title='Gene expressions for %s' % gene_tair_id, png_file=f_prefix + '.png', p_her=summary_dict['pseudo_heritability'], x_label='RNA seq expression levels (%s transformed)' % trans_type) #Plot GWAs... 
for res, method_name in [(kw_res, 'KW'), (lm_res, 'LM'), (ex_res, 'EX')]: res.filter_percentile(filter_threshold, reversed=True) res.write_to_file('%s_%s_.pvals' % (curr_file_prefix, method_name), only_pickled=True) if ex_res.min_score() < 10e-10: #print [cg.tairID for cg in cgs] f_prefix = '%s_%s_manhattan' % (curr_file_prefix, method_name) res.plot_manhattan(png_file=f_prefix + '.png', percentile=0, cand_genes=[gene], plot_bonferroni=True, neg_log_transform=True)
def plot_gw_r2_decay(file_prefix, num_random_xs=200, max_dist=1000000, call_method_id=78, mac_filter=15, debug_filter=1): """ Plots r2 decay on the genome-wide scale """ dtype = 'single' #To increase matrix multiplication speed... using 32 bits. sd = dp.load_snps_call_method(call_method_id=call_method_id, debug_filter=debug_filter, min_mac=mac_filter) #sd.filter_mac_snps(mac_filter) h_inverse_matrix_file = env[ 'data_dir'] + 'snp_cov_mat_h_inv_cm%d.pickled' % (call_method_id) if not os.path.isfile(h_inverse_matrix_file): K = sd.get_snp_cov_matrix() H_sqrt = lm.cholesky(K) H_sqrt_inv = (H_sqrt).I with file(h_inverse_matrix_file, 'wb') as f: cPickle.dump(H_sqrt_inv, f, protocol=2) else: with file(h_inverse_matrix_file) as f: H_sqrt_inv = cPickle.load(f) cps_list = sd.getChrPosSNPList() x_cps = random.sample(cps_list, num_random_xs) y_cps = cps_list result_dict = {} n = len(sd.accessions) print 'Starting calculation' sys.stdout.flush() dists = [] r2s = [] t_r2s = [] x_macs = [] y_macs = [] n_saved = 0 s1 = time.time() for i, (x_c, x_p, x_snp) in enumerate(x_cps): print '%d: chromosome=%d, position=%d' % (i, x_c, x_p) #Normalize SNP.. xs = sp.array(x_snp) x_mac = sum(xs) t_x_snp = sp.dot(((xs - sp.mean(xs)) / sp.std(xs)), H_sqrt_inv).T for (y_c, y_p, y_snp) in reversed(y_cps): if x_c != y_c: continue if abs(x_p - y_p) > max_dist: continue ys = sp.array(y_snp) x_macs.append(x_mac) y_macs.append(sum(ys)) (r, pearson_pval) = st.pearsonr(xs, ys) r2 = r * r t_y_snp = sp.dot(((ys - sp.mean(ys)) / sp.std(ys)), H_sqrt_inv).T (t_r, t_pearson_pval) = st.pearsonr( t_x_snp, t_y_snp) #Done twice, but this is fast.. t_r, t_pearson_pval = float(t_r), float(t_pearson_pval) t_r2 = t_r * t_r dists.append(abs(x_p - y_p)) r2s.append(r2) t_r2s.append(t_r2) n_saved += 1 time_secs = time.time() - s1 print 'It took %d minutes and %d seconds to finish.' % (time_secs / 60, time_secs % 60) print '%d values were saved.' % n_saved sys.stdout.flush() #Now plotting and binning.. 
for m_dist in [50000, 100000, 200000, 500000, 1000000]: kbs = m_dist / 1000 bin_ids = sp.digitize(dists, sp.arange(0, m_dist, m_dist / 100)) - 1 bin_dict = {} for bid in range(100): bin_dict[bid] = {'r2s': [], 't_r2s': []} filtered_r2s = [] filtered_t_r2s = [] filtered_dists = [] for bid, r2, t_r2, dist in izip(bin_ids, r2s, t_r2s, dists): if dist > m_dist: continue bin_dict[bid]['r2s'].append(r2) filtered_r2s.append(r2) bin_dict[bid]['t_r2s'].append(t_r2) filtered_t_r2s.append(t_r2) filtered_dists.append(dist) pylab.figure() pylab.plot(filtered_dists, filtered_r2s, alpha=0.3, color='k', marker='.', ls='None') pylab.xlabel('Distance (bases)') pylab.ylabel(r'$r^2$') pylab.savefig(file_prefix + '_%dkb_r2s.png' % (kbs)) pylab.figure() pylab.plot(filtered_dists, filtered_t_r2s, alpha=0.3, color='k', marker='.', ls='None') pylab.xlabel('Distance (bases)') pylab.ylabel(r'$r^2$') pylab.savefig(file_prefix + '_%dkb_t_r2s.png' % (kbs)) r2_avgs = [] t_r2_avgs = [] xs = [] l = sp.arange(0, m_dist, m_dist / 100) + (m_dist / 200) for bid in range(100): n = len(bin_dict[bid]['r2s']) if n > 0: r2_avgs.append(sp.sum(bin_dict[bid]['r2s']) / n) t_r2_avgs.append(sp.sum(bin_dict[bid]['t_r2s']) / n) xs.append(l[bid]) pylab.figure() pylab.plot(xs, r2_avgs, alpha=0.7, color='b', lw=1.8, label=r'standard $r^2$') pylab.plot(xs, t_r2_avgs, alpha=0.7, color='m', lw=1.8, label=r'transformed $r^2$') pylab.legend(loc=1) pylab.xlabel('Distance (bases)') pylab.ylabel(r'$r^2$') pylab.savefig(file_prefix + '_%dkb_r2s_avgs.png' % (kbs))
def perform_gwas(self, phen_name, dataset,transformation='raw', analysis_method='kw', call_method_id=75, kinship_method='ibs', progress_file_writer=None): """ Performs GWAS and updates the datastructure. """ import bisect import gwa step_wise = False if analysis_method not in ['lm', 'emmax', 'kw']: raise Exception('analysis method %s not supported' % analysis_method) progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data') phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}}) phend.convert_to_averages() progress_file_writer.update_progress_bar(task_status='Loading genotype data') sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data') sd.coordinate_w_phenotype_data(phend, 1) progress_file_writer.update_progress_bar(progress=0.1,task_status='Filtering monomorphic SNPs') sd.filter_monomorphic_snps() phen_vals = phend.get_values(1) snps = sd.getSnps() positions = sd.getPositions() chromosomes = [] progress_file_writer.set_step(0.03) for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)): progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes))) chromosomes.extend([c] * len(s.snps)) maf_dict = sd.get_mafs() kwargs = {} if analysis_method == 'emmax': progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix') k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions, scaled=True, min_mac=5, sd=sd) progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing EMMAX') d = lm.emmax_step(phen_vals, sd, k, [], 
progress_file_writer=progress_file_writer) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') res = d['res'] stats_dict = d['stats'] elif analysis_method == 'lm': progress_file_writer.update_progress_bar(progress=0.3, task_status='Performing LM') res = lm.linear_model(snps, phen_vals) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') elif analysis_method == 'kw': progress_file_writer.update_progress_bar(progress=0.7, task_status='Performing KW') kw_res = util.kruskal_wallis(snps, phen_vals) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') scores = map(lambda x:-math.log10(x), kw_res['ps']) self.add_results(phen_name, dataset,analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, statistics=kw_res['ds']) else: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method in ['lm', 'emmax']: if 'betas' in res: betas = map(list, zip(*res['betas'])) else: betas = [None, None] scores = map(lambda x:-math.log10(x), res['ps']) stats_dict['step'] = 0 cofactors = [stats_dict] self.add_results(phen_name, dataset, analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1], cofactors=cofactors) progress_file_writer.update_progress_bar(progress=1.0, task_status='Done') print 'Done!' return analysis_method
def _run_otu_wperm(self, file_prefix, phenotype_file, delimiter=',', covariate_file=None, phenotype_id=1, call_method_id=1307, maf_threshold=5, number_of_permutations=10): ## # phenotype_file = "/home/GMI/matt.horton/meta/metagenomics/gwas/leaf/16S/min800_cca/phenotypes/leaf.16S.800.2sampPerOTU.rare.cca.abd.2reps.n100.cca.txt" # call_method_id = 1308 # maf_threshold = 5 # phenotype_id = 1 # delimiter = ',' print "Opening snp and phenotype files." sys.stdout.flush() if '/' in phenotype_file: print "Opening phenotype-file: " + phenotype_file phenotype = pd.parse_phenotype_file( phenotype_file, delim=delimiter) #load phenotype file results_directory = phenotype_file.partition( "phenotypes" ) # parse this off of the phenotypeFileName and sub the phenotypes dir for the results dir (which needs to be at the same level!!!) results_directory = results_directory[0] + 'results/' print "Outputing results to: " + results_directory else: phenotype = pd.parse_phenotype_file( env['phen_dir'] + phenotype_file, delim=delimiter) #load phenotype file results_directory = env['results_dir'] sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary') indices_to_keep = sd.coordinate_w_phenotype_data( phenotype, phenotype_id) #truncate to the phenotype of interest. indices_to_keep = indices_to_keep.get('pd_indices_to_keep') # determine whether to use mac or maf (I might have to use the mac code after determining what the mac should be from the maf) if maf_threshold > 0: sd.filter_mac_snps(10) # mac_threshold = int(math.ceil(len(sd.accessions) * (float(maf_threshold) / 100))) # print "Applying maf threshold: " + str(maf_threshold) + "% to " + str(len(sd.accessions)) + " accessions (mac < " + str(mac_threshold) + ")" # sd.filter_mac_snps(mac_threshold) phenotype_name = phenotype.get_name(phenotype_id) phenotype_values = phenotype.get_values(phenotype_id) Z = phenotype.get_incidence_matrix(phenotype_id) print "There are: " + str(sd.num_snps()) + " SNPs." 
print "in: " + str(len(sd.accessions)) + " accessions" print "and " + str(len(indices_to_keep)) + " observations." print "The average number of observations per genotype is " + str( float(len(indices_to_keep)) / float(len(sd.accessions))) sys.stdout.flush() K = sd.get_ibs_kinship_matrix() K = sp.matrix(K) Z = sp.matrix(Z) print "Examining phenotype: '" + phenotype_name + "' (phenotype_id: " + str( phenotype_id) + ")." print 'Applying Permutation tests.' snps = sd.get_snps() print "Running %d EMMAX-permutations (writes %d dots)" % ( number_of_permutations, number_of_permutations) s1 = time.time() res_perm = self._emmax_permutations(snps, phenotype_values, number_of_permutations, K=K, Z=Z) p_f_list = zip(res_perm['min_ps'], res_perm['max_f_stats']) p_f_list.sort() print p_f_list[:10] threshold = p_f_list[len(p_f_list) / 20] res_perm['threshold_05'] = threshold print 'Threshold should be:', threshold secs = time.time() - s1 if secs > 60: mins = int(secs) / 60 secs = secs - mins * 60 print 'Took %d mins and %f seconds.' % (mins, secs) else: print 'Took %f seconds.' % (secs) print "Permutation tests done for phenotype: " + phenotype_name results = {} results['perm_pval'] = res_perm['min_ps'].tolist() results['perm_fstat'] = res_perm['max_f_stats'].tolist() output_file = '%s/%s_perm.pvals_pid_%d_%s' % ( results_directory, file_prefix, phenotype_id, phenotype_name) columns = ['perm_pval', 'perm_fstat'] with open(output_file, "w") as f: f.write(','.join(columns) + "\n") for i in range(1, (number_of_permutations + 1)): l = [results[c][i - 1] for c in columns] l = map(str, l) f.write(",".join(l) + "\n") print "Permutation p-values written."
def _perform_gwas_(phen_id,phenData,analysis_method,transformation,genotype,kinship_type,kinshipFile=None,messenger=None,outputfile=None): additional_columns = {} messenger.update_status(progress=0.0, task_status='Loading genotype data') genotypeData = dataParsers.load_snps_call_method(genotype) #genotypeData = dataParsers.load_hdf5_snps_call_method(genotype) K = None messenger.update_status(step=0.05, task_status='Preparing data') n_filtered_snps = _prepare_data_(genotypeData,phenData,phen_id) phen_vals = phenData.get_values(phen_id) if analysis_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm','amm']: #Load genotype file (in binary format) sys.stdout.write("Retrieving the Kinship matrix K.\n") sys.stdout.flush() if kinshipFile: #Kinship file was supplied.. messenger.update_status(progress=0.15, task_status='Loading supplied kinship file: %s' % kinshipFile) print 'Loading supplied kinship file: %s' % kinshipFile K = kinship.load_kinship_from_file(kinshipFile, genotypeData.accessions) else: messenger.update_status(progress=0.15, task_status='Loading kinship file') print 'Loading kinship file.' 
K = kinship.get_kinship(call_method_id=genotype, method=kinship_type, n_removed_snps=n_filtered_snps, remain_accessions=genotypeData.accessions) sys.stdout.flush() sys.stdout.write("Done!\n") snps = genotypeData.getSnps() positions = genotypeData.getPositions() chromosomes = [] for i, (s, c) in enumerate(itertools.izip(genotypeData.snpsDataList, genotypeData.chromosomes)): chromosomes.extend([c] * len(s.snps)) maf_dict = genotypeData.get_mafs() if analysis_method in ['kw']: messenger.update_status(progress=0.7, task_status='Performing KW') res = util.kruskal_wallis(snps, phen_vals) elif analysis_method in ['loc_glob_mm']: raise NotImplementedError elif analysis_method in ['emma']: res = lm.emma(snps, phen_vals, K) elif analysis_method in ['emmax','amm']: d = lm.emmax_step(phen_vals, genotypeData, K, [], emma_num=100) res = d['res'] #additional_columns['stats'] = d['stats'] elif analysis_method in ['lm']: d = lm.lin_reg_step(phen_vals, genotypeData, []) res = d['res'] #additional_columns['stats'] = d['stats'] else: raise Exception('analysis method %s not supported' % analysis_method) pvals = res['ps'] #Calculate Benjamini-Hochberg threshold bh_thres_d = mtcorr.get_bhy_thres(res['ps'], fdr_thres=0.05) #Calculate Median p-value med_pval = agr.calc_median(res['ps']) #Calculate the Kolmogorov-Smirnov statistic ks_res = agr.calc_ks_stats(res['ps']) quantiles_dict = _calculate_qqplot_data_(pvals) scores = map(lambda x:-math.log10(x), pvals) if analysis_method in ['lm', 'emma', 'emmax','amm']: additional_columns['genotype_var_perc'] = res['var_perc'] if 'betas' in res: betas = map(list, zip(*res['betas'])) additional_columns['beta0'] = betas[0] if len(betas) > 1: additional_columns['beta1'] = betas[1] #calculate ld if outputfile is None: outputfile = "%s.hdf5" % phen_id messenger.update_status(progress=0.8, task_status='Processing and saving results') _save_hdf5_pval_file(outputfile, analysis_method, transformation,chromosomes, positions, scores, maf_dict['marfs'], 
maf_dict['mafs'], quantiles_dict,ks_res,bh_thres_d['thes_pval'],med_pval,additional_columns)
def perform_stepwise_gwas(self, phen_name, dataset, transformation, analysis_method, result_name, chromosome, position, call_method_id=75, kinship_method='ibs',progress_file_writer=None): """ Performs GWAS and updates the datastructure. """ #if analysis_method not in ['emmax','lm']: # raise Exception("Step-Wise GWAS only possible with emmax or LM") snp = ((int(chromosome), int(position))) result_group = self.h5file.getNode('/phenotypes/%s/%s/%s/%s' % (phen_name, dataset, transformation, analysis_method)) result = result_group._f_getChild(result_name) cofactors = result._v_attrs.cofactors[:] co_var_snps = [(int(factors['chr']), int(factors['pos'])) for factors in cofactors if 'chr' in factors and 'pos' in factors] if snp in co_var_snps: raise Exception('The SNP %s,%s is already in the result' % chromosome, position) co_var_snps.append(snp) co_var_snps = set(co_var_snps) #for avail_result in result_group._f_iterNodes(classname='Table'): # if set(avail_result._v_attrs.cofactors) == co_var_snps: # raise Exception("There is already a result with the selected snps") new_result_name = "SW_%s" % result_group._v_nchildren name = "%s_%s" % (analysis_method, new_result_name) import bisect import gwa if analysis_method not in ['lm', 'emmax', 'kw']: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method == 'kw': analysis_method = 'emmax' progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data') phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}}) phend.convert_to_averages() progress_file_writer.update_progress_bar(task_status='Loading genotype data') sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and 
phenotype data') sd.coordinate_w_phenotype_data(phend, 1) sd.filter_monomorphic_snps() phen_vals = phend.get_values(1) snps = sd.getSnps() positions = sd.getPositions() chromosomes = [] progress_file_writer.set_step(0.03) for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)): chromosomes.extend([c] * len(s.snps)) progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes))) maf_dict = sd.get_mafs() kwargs = {} if analysis_method == 'emmax': progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix') k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions, scaled=True, min_mac=5, sd=sd) progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing Step-Wise EMMAX') d = lm.emmax_step(phen_vals, sd, k, co_var_snps,progress_file_writer=progress_file_writer) progress_file_writer.update_progress_bar(0.95, 'Processing and saving results') res = d['res'] stats_dict = d['stats'] elif analysis_method == 'lm': res = lm.linear_model(snps, phen_vals) else: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method in ['lm', 'emmax']: if 'betas' in res: betas = map(list, zip(*res['betas'])) else: betas = [None, None] scores = map(lambda x:-math.log10(x), res['ps']) stats_dict['chr'] = snp[0] stats_dict['pos'] = snp[1] stats_dict['step'] = len(cofactors) cofactors.append(stats_dict) self.add_results(phen_name,dataset, analysis_method, name, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1], cofactors=cofactors, result_name=new_result_name) print 'Done!' progress_file_writer.update_progress_bar(1.0, 'Done') return name
def calc_r2_levels(file_prefix, x_start_i, x_stop_i, call_method_id=78, data_format='diploid_int', mac_filter=15, save_threshold=0.2, save_threshold2=0.3, debug_filter=1): """ Returns statistics on LD levels, and plot them. """ dtype = 'single' #To increase matrix multiplication speed... using 32 bits. sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=debug_filter, min_mac=mac_filter) #sd.filter_mac_snps(mac_filter) h_inverse_matrix_file = env[ 'data_dir'] + 'snp_cov_mat_h_inv_cm%d.pickled' % (call_method_id) if not os.path.isfile(h_inverse_matrix_file): K = sd.get_snp_cov_matrix() H_sqrt = lm.cholesky(K) H_sqrt_inv = (H_sqrt).I with file(h_inverse_matrix_file, 'wb') as f: cPickle.dump(H_sqrt_inv, f, protocol=2) else: with file(h_inverse_matrix_file) as f: H_sqrt_inv = cPickle.load(f) cps_list = sd.getChrPosSNPList() x_cps = cps_list[x_start_i:x_stop_i] y_cps = cps_list result_dict = {} n = len(sd.accessions) print 'Starting calculation' sys.stdout.flush() hdf5_file_name = file_prefix + '_x_' + str(x_start_i) + '_' + str( x_stop_i) + ".hdf5" h5_file = h5py.File(hdf5_file_name, 'w') for i, (x_c, x_p, x_snp) in enumerate(x_cps): print '%d: chromosome=%d, position=%d' % (i, x_c, x_p) #Normalize SNP.. xs = sp.array(x_snp) t_x_snp = sp.dot(((xs - sp.mean(xs)) / sp.std(xs)), H_sqrt_inv).T s1 = time.time() y_cs = [] y_ps = [] r2s = [] t_r2s = [] ps = [] t_ps = [] n_saved = 0 for (y_c, y_p, y_snp) in reversed(y_cps): if (x_c, x_p) < (y_c, y_p): ys = sp.array(y_snp) mac = ys.sum() (r, pearson_pval) = st.pearsonr(xs, ys) r2 = r * r if x_c == y_c and y_p - x_p <= 50000 and r2 > save_threshold2: t_y_snp = sp.dot(((ys - sp.mean(ys)) / sp.std(ys)), H_sqrt_inv).T (t_r, t_pearson_pval) = st.pearsonr( t_x_snp, t_y_snp) #Done twice, but this is fast.. 
t_r, t_pearson_pval = float(t_r), float(t_pearson_pval) t_r2 = t_r * t_r y_cs.append(y_c) y_ps.append(y_p) r2s.append(r2) t_r2s.append(t_r2) ps.append(pearson_pval) t_ps.append(t_pearson_pval) n_saved += 1 elif ((x_c == y_c and y_p - x_p > 50000) or x_c != y_c) and r2 > save_threshold: t_y_snp = sp.dot(((ys - sp.mean(ys)) / sp.std(ys)), H_sqrt_inv).T (t_r, t_pearson_pval) = st.pearsonr( t_x_snp, t_y_snp) #Done twice, but this is fast.. t_r, t_pearson_pval = float(t_r), float(t_pearson_pval) t_r2 = t_r * t_r y_cs.append(y_c) y_ps.append(y_p) r2s.append(r2) t_r2s.append(t_r2) ps.append(pearson_pval) t_ps.append(t_pearson_pval) n_saved += 1 else: break if n_saved > 0: grp = h5_file.create_group('x%d' % i) grp.create_dataset("n_saved", data=n_saved) grp.create_dataset("x_c", data=x_c) grp.create_dataset("x_p", data=x_p) grp.create_dataset("x_snp", compression='gzip', data=x_snp) grp.create_dataset("y_cs", compression='gzip', data=y_cs) grp.create_dataset("y_ps", compression='gzip', data=y_ps) grp.create_dataset("r2s", compression='gzip', data=r2s) grp.create_dataset("t_r2s", compression='gzip', data=t_r2s) grp.create_dataset("ps", compression='gzip', data=ps) grp.create_dataset("t_ps", compression='gzip', data=t_ps) time_secs = time.time() - s1 print 'It took %d minutes and %d seconds to finish.' % (time_secs / 60, time_secs % 60) print '%d values were saved.' % n_saved sys.stdout.flush() h5_file.close()
def get_kinship(call_method_id=75, data_format='binary', method='ibs', n_removed_snps=None,
                remain_accessions=None, scaled=True, min_mac=5, sd=None, debug_filter=1,
                return_accessions=False):
    """
    Loads and processes the kinship matrix

    method='ibd': The matrix is calculated from sd; without sd a NotImplementedError is raised.
    method='ibs': The matrix is loaded from the kinship file of the call method if that file
        exists.  Otherwise it is calculated from the SNP data of the call method and saved to
        that file.  If the SNP data cannot be loaded, the matrix is calculated from sd instead
        (if given) and returned directly, without saving it.
    Any other method raises a NotImplementedError.

    If both n_removed_snps and remain_accessions are given, the matrix is updated with
    update_k_monomorphic before it is returned.  The matrix is scaled with scale_k if scaled
    is True.  return_accessions is only honoured when no such update is done; then
    (k, k_accessions) is returned instead of k.

    Returns None for method='ibs' if call_method_id is not set.
    """
    import dataParsers as dp
    import env

    if method == 'ibd':
        if sd is None:
            raise NotImplementedError('Currently only IBS kinship matrices are supported')
        k = sd.get_ibd_kinship_matrix()
        if scaled:
            k = scale_k(k)
        return k

    if method != 'ibs':
        print('Method {} is not implemented'.format(method))
        raise NotImplementedError

    if not call_method_id:
        #Without a call method there is no kinship file to load or generate.
        return None

    file_prefix = '%s%d/kinship_%s_%s' % (env.env['cm_dir'], call_method_id, method, data_format)
    kinship_file = file_prefix + '_mac%d.h5py' % min_mac
    if os.path.isfile(kinship_file):
        print('Found kinship file: {}'.format(kinship_file))
        d = load_kinship_from_file(kinship_file, scaled=False)
        k = d['k']
        k_accessions = d['accessions']
        n_snps = d['n_snps']
    else:
        print("Didn't find kinship file: {}, now generating one..".format(kinship_file))
        try:
            sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format,
                                          min_mac=min_mac, debug_filter=debug_filter)
        except Exception:
            if sd is None:
                #No SNP data to fall back on: report the real loading error instead of
                #failing later with an AttributeError on None.
                raise
            #Fall back on the supplied SNP data.
            k = sd.get_ibs_kinship_matrix()
            if scaled:
                k = scale_k(k)
            return k
        k = sd.get_ibs_kinship_matrix()
        k_accessions = sd.accessions
        n_snps = sd.num_snps()
        save_kinship_to_file(kinship_file, k, sd.accessions, n_snps)

    if n_removed_snps is not None and remain_accessions is not None:
        k = update_k_monomorphic(n_removed_snps, k, k_accessions, n_snps, remain_accessions,
                                 kinship_type='ibs', dtype='single')
        if scaled:
            k = scale_k(k)
        return k

    if scaled:
        k = scale_k(k)
    if return_accessions:
        return k, k_accessions
    return k
def test_genotype_data():
    """
    Loads the SNP data of call method 75 and creates a genotype_data object
    for '/tmp/test.h5py'.
    """
    import dataParsers as dp
    snps_data = dp.load_snps_call_method(75)
    geno_data = genotype_data('/tmp/test.h5py')