def execute_gw_regions(args):
    """Apply a significance cutoff to genome-wide data to identify regions.

    For each chromosome 1-22 the normalized empirical CMS file is located via
    ``get_emp_cms_file``; chromosomes whose file is absent are reported and
    contribute no positions. For every position index, ``get_window`` and
    ``check_outliers`` are called, and the position is kept when the returned
    value exceeds ``args.thresshold``. Kept positions are passed per
    chromosome to ``merge_windows`` and, when ``args.saveLog`` is set, written
    as ``chrN<TAB>start<TAB>end`` lines.

    Reads from ``args``: writedir, emppop, thresshold, cutoff, regionlen,
    suffix, saveLog.
    """
    basedir = args.writedir
    pop = args.emppop
    thresshold = args.thresshold
    cutoff = args.cutoff
    windowlen = args.regionlen
    suffix = args.suffix
    chroms = range(1, 23)

    ####################
    ## LOOP OVER CHRS ##
    ####################
    signif_windows = []
    for chrom in chroms:
        chrom_signif = []
        normedempfilename = get_emp_cms_file(pop, chrom, normed=True, suffix=suffix, basedir=basedir)
        if not os.path.isfile(normedempfilename):
            print("missing: " + normedempfilename)
        else:
            (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
             xpehh_normed, fst, deldaf, cms_unnormed,
             cms_normed) = read_cms_repfile(normedempfilename)
            ##################
            ## CHECK REGION ##
            ##################
            for iPos, position in enumerate(physpos):
                window_scores = get_window(iPos, physpos, cms_normed, windowlen)
                percentage = check_outliers(window_scores, cutoff)
                if percentage > thresshold:
                    chrom_signif.append(position)
        # Appended for every chromosome (even a missing one) so that the
        # per-chromosome lists stay aligned with `chroms`.
        signif_windows.append(chrom_signif)

    ##############################
    ## MERGE CONTIGUOUS WINDOWS ##
    ##############################
    final_starts = []
    final_ends = []
    print('merging regions')
    for chrom_signif in signif_windows:
        starts, ends = merge_windows(chrom_signif, windowlen)
        final_starts.append(starts)
        final_ends.append(ends)

    ###################
    ## WRITE TO FILE ##
    ###################
    if args.saveLog is not None:
        writefilename = args.saveLog
        # `with` guarantees the handle is closed even if a write fails.
        with open(writefilename, 'w') as writefile:
            for chromnum, starts, ends in zip(chroms, final_starts, final_ends):
                # NOTE(review): this stops one short of len(starts), so the
                # final entry returned by merge_windows is never written.
                # Kept as-is; confirm against merge_windows whether that last
                # entry is a sentinel or a real region being dropped.
                for iregion in range(len(starts) - 1):
                    writeline = "chr" + str(chromnum) + "\t" + str(starts[iregion]) + "\t" + str(ends[iregion]) + '\n'
                    writefile.write(writeline)
        print('wrote to ' + writefilename)
    return
def execute_tpr(args):
    """Estimate the true positive rate for region detection.

    Collects the existing selection-replicate files (named by
    ``get_sel_repfile_name``) for the selected derived-allele-frequency bin,
    scores each replicate with ``check_rep_windows`` and passes the collected
    results to ``calc_pr``. The rate is printed and, when ``args.saveLog`` is
    set, written to ``<saveLog>_<binlabel>`` followed by a line recording the
    model, region length, threshold and cutoff.

    Reads from ``args``: model, regionlen, thresshold, cutoff, nrep, simpop,
    suffix, writedir, score, saveLog.
    """
    model = args.model
    regionlen = args.regionlen
    thresshold = args.thresshold
    cutoff = args.cutoff
    numReps = args.nrep
    pop = args.simpop
    suffix = args.suffix
    writedir = args.writedir
    takeScore = args.score

    # per seldaf
    dafbins = [['0.10', '0.20', '0.30', '0.40', '0.50', '0.60', '0.70', '0.80', '0.90'],
               ['0.10', '0.20', '0.30'],
               ['0.40', '0.50', '0.60'],
               ['0.70', '0.80', '0.90'],
               ['0.90']]
    daflabels = ['all', 'lo', 'mid', 'hi', 'highest']

    # Only the 'hi' bin (index 3) is currently evaluated.
    for ibin in [3]:
        thesebins, thislabel = dafbins[ibin], daflabels[ibin]

        # Reset per bin so that, if more bins are enabled, each label's rate
        # is computed from that bin's replicates only.
        all_scores = []
        all_percentages = []

        allrepfilenames = []
        for selbin in thesebins:
            for irep in range(1, numReps + 1):
                repfilename = get_sel_repfile_name(model, irep, pop, selbin, normed=True, suffix=suffix, basedir=writedir)
                if irep == 1:
                    print(repfilename)
                if os.path.isfile(repfilename):
                    allrepfilenames.append(repfilename)
        print('loaded ' + str(len(allrepfilenames)) + " replicates...")

        for repfilename in allrepfilenames:
            (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
             xpehh_normed, fst, deldaf, cms_unnormed,
             cms_normed) = read_cms_repfile(repfilename)
            # NOTE(security): args.score is evaluated as a Python expression
            # against the local names unpacked above (e.g. "cms_normed").
            # Only safe while the value comes from a trusted command line.
            these_scores = eval(takeScore)
            if len(these_scores) > 0:
                all_scores.append(these_scores)
                rep_percentages = check_rep_windows(physpos, these_scores, regionlen, cutoff=cutoff)
                all_percentages.append(rep_percentages)

        print('loaded ' + str(len(all_scores)) + " replicates populations for model " + model + "...")
        tpr = calc_pr(all_percentages, thresshold)
        print('true positive rate: ' + str(tpr) + "\n")

        if args.saveLog is not None:
            writefilename = args.saveLog + "_" + thislabel
            # `with` guarantees the handle is closed even if a write fails.
            with open(writefilename, 'w') as writefile:
                writefile.write(str(tpr) + '\n')
                writefile.write(model + "\t" + str(regionlen) + "\t" + str(thresshold) + '\t' + str(cutoff) + '\n')
            print('wrote to : ' + str(writefilename))
    return
def execute_fpr(args):
    """Estimate the false positive rate for region identification.

    Iterates over the neutral replicate files named by
    ``get_neut_repfile_name``, scores each with ``check_rep_windows`` and
    passes the collected results to ``calc_pr``. Replicates with any value
    above ``args.thresshold`` are reported as false positives. The rate is
    printed and, when ``args.saveLog`` is set, written to that file followed
    by a line recording the model, region length, threshold and cutoff.

    Reads from ``args``: model, regionlen, thresshold, cutoff, nrep, simpop,
    suffix, writedir, score, saveLog.
    """
    model = args.model
    regionlen = args.regionlen
    thresshold = args.thresshold
    cutoff = args.cutoff
    numReps = args.nrep
    pop = args.simpop
    suffix = args.suffix
    writedir = args.writedir
    takeScore = args.score

    all_scores = []
    all_percentages = []
    for irep in range(1, numReps + 1):
        repfilename = get_neut_repfile_name(model, irep, pop, normed=True, suffix=suffix, basedir=writedir)
        if irep == 1:
            print(repfilename)
        # Skip absent replicates (as execute_tpr does) instead of handing a
        # non-existent path to the reader.
        if not os.path.isfile(repfilename):
            print("missing: " + repfilename)
            continue

        (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
         xpehh_normed, fst, deldaf, cms_unnormed,
         cms_normed) = read_cms_repfile(repfilename)
        # NOTE(security): args.score is evaluated as a Python expression
        # against the local names unpacked above (e.g. "cms_normed").
        # Only safe while the value comes from a trusted command line.
        these_scores = eval(takeScore)
        if len(these_scores) == 0:
            continue

        all_scores.append(these_scores)
        rep_percentages = check_rep_windows(physpos, these_scores, regionlen, cutoff=cutoff)
        all_percentages.append(rep_percentages)
        if len(rep_percentages) > 0 and max(rep_percentages) > thresshold:
            print("false positive: " + repfilename)

    print('loaded ' + str(len(all_scores)) + " replicates populations for model " + model + "...")
    fpr = calc_pr(all_percentages, thresshold)
    print('false positive rate: ' + str(fpr) + "\n")

    if args.saveLog is not None:
        writefilename = args.saveLog
        # `with` guarantees the handle is closed even if a write fails.
        with open(writefilename, 'w') as writefile:
            writefile.write(str(fpr) + '\n')
            writefile.write(model + "\t" + str(regionlen) + "\t" + str(thresshold) + '\t' + str(cutoff) + '\n')
        print('wrote to : ' + str(writefilename))
    return
def execute_cdf(args):
    """Visualize power to localize variants.

    Estimates p(causal variant captured | significance threshold includes the
    top x SNPs) from simulated selection replicates and plots it as a
    cumulative density function, one curve per population (1-4).

    For every population, scenario and replicate, the unnormalized CMS file
    is loaded if it exists; when ``args.selPos`` is among the positions, the
    rank of its score is obtained from ``get_causal_rank`` and kept unless it
    is NaN. The figure is saved to ``args.savefilename``.

    Reads from ``args``: nrep, savefilename, writedir, model, selPos, suffix.
    """
    reps = args.nrep
    savefilename = args.savefilename
    writedir = args.writedir
    scenars = ['0.70', '0.80', '0.90']
    model = args.model
    causalPos = args.selPos
    suffix = args.suffix

    # Population id -> curve colour, in plotting order.
    pop_colors = [(1, "yellow"), (2, "blue"), (3, "green"), (4, "purple")]
    # One list of causal ranks per population (replaces eval on a
    # constructed variable name).
    causal_ranks = dict((pop, []) for pop, _ in pop_colors)

    for pop, _ in pop_colors:
        for scenar in scenars:
            for irep in range(1, reps + 1):
                cmsfilename = get_sel_repfile_name(model, irep, pop, scenar, normed=False, basedir=writedir, suffix=suffix)
                if not os.path.isfile(cmsfilename):
                    print("missing; " + cmsfilename)
                    continue
                (physpos, genpos, seldaf, ihs_normed, delihh_normed,
                 nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed,
                 cms_normed) = read_cms_repfile(cmsfilename)
                try:
                    causal_index = physpos.index(causalPos)
                except ValueError:
                    # Causal position absent from this replicate: nothing to rank.
                    continue
                causal_unnormed = cms_unnormed[causal_index]
                causal_rank = get_causal_rank(cms_unnormed, causal_unnormed)
                if not np.isnan(causal_rank):
                    causal_ranks[pop].append(causal_rank)

    for pop, _ in pop_colors:
        print("for pop " + str(pop) + ", loaded " + str(len(causal_ranks[pop])) + " replicates.")

    cdf_fig, cdf_ax = plt.subplots()
    for pop, color in pop_colors:
        ranks = causal_ranks[pop]
        if len(ranks) > 0:
            cdf_bins, cdf = get_cdf_from_causal_ranks(ranks)
            # The first bin entry is dropped when plotting against cdf.
            cdf_ax.plot(cdf_bins[1:], cdf, color=color)
    cdf_ax.set_xlim([0, 50])
    plt.title(model)
    plt.ylabel('probability that the causal variant is captured')
    plt.xlabel('significance thresshold (i.e., examining the top x variants)')
    plt.savefig(savefilename)
    plt.close()
    print('plotted to ' + savefilename)
    return
def execute_extended_manhattan(args):
    """Generate a genome-wide plot of CMS scores, optionally highlighting outlier regions.

    One subplot row is drawn per chromosome (1-22) from the normalized
    empirical CMS files. Loading stops at the first missing file; the
    chromosomes loaded up to that point are still plotted. When
    ``args.regionsfile`` is given, loci from each listed region are re-drawn
    in the population colour. When ``args.percentile`` is given, a dotted red
    cutoff line is drawn at that percentile of all loaded non-NaN scores and
    the loci at or above it are re-drawn in the population colour. The figure
    is saved to ``args.savefilename``.

    Reads from ``args``: plotscore, emppop, writedir, suffix, savefilename,
    dpi, titlestring, regionsfile, percentile.
    """
    plotscore = args.plotscore
    selpop = args.emppop
    basedir = args.writedir
    suffix = args.suffix
    savename = args.savefilename
    dpi = args.dpi
    numChr = 22
    titlestring = args.titlestring

    # Empirical population code -> model population id.
    modelpops = {'YRI': 1, 'GWD': 1, 'LWK': 1, 'MSL': 1, 'ESN': 1,
                 'CEU': 2, 'FIN': 2, 'IBS': 2, 'TSI': 2, 'GBR': 2, 'IRN': 2,
                 'CHB': 3, 'JPT': 3, 'KHV': 3, 'CDX': 3, 'CHS': 3,
                 'BEB': 4, 'STU': 4, 'ITU': 4, 'PJL': 4, 'GIH': 4}
    pop = modelpops[selpop]
    colorDict = {1: '#cec627', 2: '#0EBFF0', 3: '#65ff00', 4: '#8B08B0'}
    highlight_color = colorDict[pop]

    f, axarr = plt.subplots(numChr, 1, sharex=True, sharey=True, dpi=dpi, figsize=(7, 10))
    plt.suptitle(titlestring, fontsize=10)
    plt.xlabel('position')
    plt.ylabel('cms_gw normed score')

    all_emp_pos, all_emp_scores = [], []
    for chrom in range(1, numChr + 1):
        emp_cms_filename = get_emp_cms_file(selpop, chrom, normed=True, suffix=suffix, basedir=basedir)
        print('loading chr ' + str(chrom) + ": " + emp_cms_filename)
        if not os.path.isfile(emp_cms_filename):
            print("missing: " + emp_cms_filename)
            break
        (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
         xpehh_normed, fst, deldaf, cms_unnormed,
         cms_normed) = read_cms_repfile(emp_cms_filename)
        ax = axarr[chrom - 1]
        # NOTE(security): args.plotscore is evaluated as a Python expression
        # against the local names unpacked above (e.g. "cms_normed").
        # Only safe while the value comes from a trusted command line.
        plot_data = eval(plotscore)
        plotManhattan_extended(ax, plot_data, physpos, chrom)
        all_emp_pos.append(physpos)
        all_emp_scores.append(plot_data)

    # Because loading stops at the first missing file, fewer than numChr
    # chromosomes may be available; later steps must not index past this.
    numLoaded = len(all_emp_pos)

    ################################
    ## HILITE SIGNIFICANT REGIONS ##
    ################################
    if args.regionsfile is not None:
        regionchrs, regionstarts, regionends = load_regions(args.regionsfile)
        print('loaded ' + str(len(regionchrs)) + ' significant regions from ' + args.regionsfile)
        for regionchr, regionstart, regionend in zip(regionchrs, regionstarts, regionends):
            ichrom = int(regionchr.strip('chr')) - 1
            if not 0 <= ichrom < numLoaded:
                # No data was loaded for this chromosome; nothing to highlight.
                continue
            plotpos, plotvals = [], []
            for locuspos, locusval in zip(all_emp_pos[ichrom], all_emp_scores[ichrom]):
                if locuspos >= regionstart:
                    plotpos.append(locuspos)
                    plotvals.append(locusval)
                # Checked after the append, so the first locus past the
                # region end is included before the scan stops.
                if locuspos > regionend:
                    break
            axarr[ichrom].plot(plotpos, plotvals, color=highlight_color, markersize=1)

    if args.percentile is not None:
        percentile = float(args.percentile)
        print('plotting data with heuristic cutoff for ' + str(percentile) + " percentile...")
        flat_emp_scores = [item for sublist in all_emp_scores for item in sublist if not np.isnan(item)]
        score_cutoff = float(np.percentile(flat_emp_scores, percentile))
        print("score cutoff: " + str(score_cutoff))
        for iax in range(numChr):
            ax = axarr[iax]
            maximumVal = ax.get_xlim()[1]
            xpoints = np.array([0, maximumVal])
            ypoints = np.array([score_cutoff, score_cutoff])
            ax.plot(xpoints, ypoints, linestyle="dotted", color="red", markersize=.3)
            if iax >= numLoaded:
                # Chromosome was never loaded: draw the cutoff line only.
                continue
            # Re-plot, in colour, the loci that pass the cutoff.
            significant = [(score, position)
                           for score, position in zip(all_emp_scores[iax], all_emp_pos[iax])
                           if score >= score_cutoff]
            signif_vals = [score for score, _ in significant]
            signif_pos = [position for _, position in significant]
            ax.plot(signif_pos, signif_vals, color=highlight_color, linestyle='None', marker=".", markersize=.3)

    plt.savefig(savename)
    print('saved to: ' + savename)
    # Release the figure, as the other plotting entry points do.
    plt.close(f)
    return
def execute_regionviz(args):
    """Visualize component and composite scores for a region.

    Loads the CMS file at ``args.cmsInfile`` and draws seven stacked panels
    sharing the x axis (ihs, delihh, nsl, xpehh, fst, deldaf and unnormalized
    cms) via ``quick_plot``, then saves the figure to ``args.savefilename``.
    If ``args.hilitePos`` is given and present among the positions, its index
    is passed to ``quick_plot``; otherwise -1 is passed. If the input file
    does not exist, this is reported and nothing is plotted.

    Reads from ``args``: savefilename, cmsInfile, hilitePos.
    """
    savefilename = args.savefilename
    cmsfilename = args.cmsInfile
    if not os.path.isfile(cmsfilename):
        # Report the problem instead of silently doing nothing.
        print("missing: " + cmsfilename)
        return

    print('loading from... ' + cmsfilename)
    # TODO: make this flexible to regional input vs gw. (vs. likes)
    (physpos, genpos, daf, ihs_normed, delihh_normed, nsl_normed,
     xpehh_normed, fst, deldaf, cms_unnormed,
     cms_normed) = read_cms_repfile(cmsfilename)

    # -1 tells quick_plot that no position is highlighted.
    causal_index = -1
    hilitePos = args.hilitePos
    if hilitePos is not None and hilitePos in physpos:
        causal_index = physpos.index(hilitePos)

    # (values, label) for each panel, top to bottom.
    panels = [
        (ihs_normed, "ihs_normed"),
        (delihh_normed, "delihh_normed"),
        (nsl_normed, "nsl_normed"),
        (xpehh_normed, "xpehh_normed"),
        (fst, "fst"),
        (deldaf, "deldaf"),
        (cms_unnormed, "cms"),
    ]
    f, axes = plt.subplots(len(panels), sharex=True)
    for ax, (values, label) in zip(axes, panels):
        quick_plot(ax, physpos, values, label, causal_index)

    plt.savefig(savefilename)
    print("plotted to " + savefilename)
    plt.close()
    return