def execute_tpr(args):
    """Estimate the true positive rate for region detection.

    For each evaluated selected-allele-frequency bin, collects the normed
    selection replicate files that exist on disk, reads each one with
    ``read_cms_repfile``, scores it with ``check_rep_windows`` and passes the
    per-replicate results to ``calc_pr``. The resulting rate is printed and,
    when ``args.saveLog`` is set, written to ``<saveLog>_<bin label>``
    together with the run parameters.

    Args:
        args: parsed command-line namespace. Reads ``model``, ``regionlen``,
            ``thresshold``, ``cutoff``, ``nrep``, ``simpop``, ``suffix``,
            ``writedir``, ``score`` and ``saveLog``.

    Returns:
        None.
    """
    model = args.model
    regionlen = args.regionlen
    thresshold = args.thresshold
    cutoff = args.cutoff
    numReps = args.nrep
    pop = args.simpop
    suffix = args.suffix
    writedir = args.writedir
    takeScore = args.score

    # Selected-allele frequency bins; dafbins[i] is labelled daflabels[i].
    dafbins = [
        ['0.10', '0.20', '0.30', '0.40', '0.50', '0.60', '0.70', '0.80', '0.90'],
        ['0.10', '0.20', '0.30'],
        ['0.40', '0.50', '0.60'],
        ['0.70', '0.80', '0.90'],
        ['0.90'],
    ]
    daflabels = ['all', 'lo', 'mid', 'hi', 'highest']

    # Only the 'hi' bin is evaluated at present; widen this list to cover
    # other bins (each bin gets its own log file suffix).
    for ibin in [3]:
        thesebins, thislabel = dafbins[ibin], daflabels[ibin]

        # Accumulators are reset per bin so that a bin's rate is never
        # contaminated by replicates from a previously processed bin.
        all_scores = []
        all_percentages = []

        allrepfilenames = []
        for selbin in thesebins:
            for irep in range(1, numReps + 1):
                repfilename = get_sel_repfile_name(
                    model, irep, pop, selbin,
                    normed=True, suffix=suffix, basedir=writedir)
                if irep == 1:
                    # Echo one example path per frequency bin.
                    print(repfilename)
                if os.path.isfile(repfilename):
                    allrepfilenames.append(repfilename)
        print('loaded ' + str(len(allrepfilenames)) + " replicates...")

        for repfilename in allrepfilenames:
            # These local names are deliberately kept as-is: the eval() below
            # resolves args.score against them (e.g. "cms_normed").
            (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
             xpehh_normed, fst, deldaf, cms_unnormed,
             cms_normed) = read_cms_repfile(repfilename)
            # NOTE(review): args.score is evaluated as a Python expression.
            # This is only safe while the value comes from a trusted command
            # line; do not feed it untrusted input.
            these_scores = eval(takeScore)
            if len(these_scores) > 0:
                all_scores.append(these_scores)
                rep_percentages = check_rep_windows(
                    physpos, these_scores, regionlen, cutoff=cutoff)
                all_percentages.append(rep_percentages)

        print('loaded ' + str(len(all_scores))
              + " replicates populations for model " + model + "...")
        tpr = calc_pr(all_percentages, thresshold)
        print('true positive rate: ' + str(tpr) + "\n")

        if args.saveLog is not None:
            writefilename = args.saveLog + "_" + thislabel
            # Context manager guarantees the log is closed even if a write fails.
            with open(writefilename, 'w') as writefile:
                writefile.write(str(tpr) + '\n')
                writefile.write(model + "\t" + str(regionlen) + "\t"
                                + str(thresshold) + '\t' + str(cutoff) + '\n')
            print('wrote to : ' + str(writefilename))
    return
def execute_cdf(args):
    """Plot the power to localize the causal variant as a CDF.

    For populations 1-4 and each selection scenario, reads every un-normed
    selection replicate file that exists, looks up the causal position
    (``args.selPos``) in the replicate, and records the rank returned by
    ``get_causal_rank`` for that variant's un-normed score. One curve per
    population (built by ``get_cdf_from_causal_ranks``) is drawn and the
    figure is saved to ``args.savefilename``.

    Args:
        args: parsed command-line namespace. Reads ``nrep``, ``savefilename``,
            ``writedir``, ``model``, ``selPos`` and ``suffix``.

    Returns:
        None.
    """
    reps = args.nrep
    savefilename = args.savefilename
    writedir = args.writedir
    scenars = ['0.70', '0.80', '0.90']
    model = args.model
    causalPos = args.selPos
    suffix = args.suffix

    # Ordered (population, line colour) pairs; a tuple keeps the iteration
    # order fixed regardless of interpreter version.
    pop_colors = ((1, "yellow"), (2, "blue"), (3, "green"), (4, "purple"))

    # One list of causal-variant ranks per population, keyed by population id.
    causal_ranks = dict((pop, []) for pop, _ in pop_colors)

    for pop, _ in pop_colors:
        for scenar in scenars:
            for irep in range(1, reps + 1):
                cmsfilename = get_sel_repfile_name(
                    model, irep, pop, scenar,
                    normed=False, basedir=writedir, suffix=suffix)
                if not os.path.isfile(cmsfilename):
                    print("missing; " + cmsfilename)
                    continue

                (physpos, genpos, seldaf, ihs_normed, delihh_normed,
                 nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed,
                 cms_normed) = read_cms_repfile(cmsfilename)

                # Replicates that do not contain the causal site are skipped.
                if causalPos not in physpos:
                    continue
                causal_index = physpos.index(causalPos)
                causal_unnormed = cms_unnormed[causal_index]
                causal_rank = get_causal_rank(cms_unnormed, causal_unnormed)
                if not np.isnan(causal_rank):
                    causal_ranks[pop].append(causal_rank)

    for pop, _ in pop_colors:
        print("for pop " + str(pop) + ", loaded "
              + str(len(causal_ranks[pop])) + " replicates.")

    cdf_fig, cdf_ax = plt.subplots()
    for pop, color in pop_colors:
        ranks = causal_ranks[pop]
        if len(ranks) > 0:
            cdf_bins, cdf = get_cdf_from_causal_ranks(ranks)
            # The first element of the returned bins is dropped so that the
            # x values line up with the cdf values.
            cdf_ax.plot(cdf_bins[1:], cdf, color=color)
    cdf_ax.set_xlim([0, 50])
    plt.title(model)
    plt.ylabel('probability that the causal variant is captured')
    plt.xlabel('significance thresshold (i.e., examining the top x variants)')
    plt.savefig(savefilename)
    plt.close()
    print('plotted to ' + savefilename)
    return
def _load_log_scores(path):
    """Return the natural log of the last column of every data row in *path*.

    The first line of the file is treated as a header and skipped.
    """
    scores = []
    with open(path, 'r') as openfile:
        openfile.readline()  # discard header
        for line in openfile:
            entries = line.split()
            scores.append(np.log(float(entries[-1])))
    return scores


def _write_normed_repfile(rawfile, normedfile, mean, sd):
    """Copy *rawfile* to *normedfile*, appending a normalized-score column.

    The header line is copied unchanged. For every data row the log of the
    last column is passed to ``normalize`` with *mean* and *sd*, and the
    result is appended as a new tab-separated field.
    """
    with open(rawfile, 'r') as openfile, open(normedfile, 'w') as writefile:
        writefile.write(openfile.readline())
        for line in openfile:
            entries = line.split()
            rawscore = np.log(float(entries[-1]))
            normalized = normalize(rawscore, mean, sd)
            writefile.write(line.strip('\n') + "\t" + str(normalized) + "\n")


def execute_normsims_genomewide(args):
    """Normalize all simulated replicates to neutral parameters.

    Given output from composite_sims: loads the log of the last column from
    every existing un-normed neutral replicate file, drops NaN and infinite
    values, and computes their mean and standard deviation. Every existing
    neutral and selection replicate file is then rewritten to
    ``<file>.norm`` with an extra column holding the normalized score.

    Args:
        args: parsed command-line namespace. Reads ``model``, ``simpop``,
            ``nrep_sel``, ``nrep_neut``, ``writedir`` and ``runSuffix``.

    Returns:
        None.

    Raises:
        ValueError: if no finite neutral score could be loaded, since the
            normalization parameters would be undefined.
    """
    sel_freq_bins = ['0.10', '0.20', '0.30', '0.40', '0.50',
                     '0.60', '0.70', '0.80', '0.90']
    model = args.model
    selpop = args.simpop
    numPerBin_sel = args.nrep_sel
    numPerBin_neut = args.nrep_neut
    writedir = args.writedir
    suffix = args.runSuffix

    neut_files = [
        get_neut_repfile_name(model, irep, selpop, suffix=suffix,
                              normed=False, basedir=writedir)
        for irep in range(1, numPerBin_neut + 1)
    ]

    ##############################
    ## LOAD STATS FROM NEUT SIMS #
    ##############################
    values = []
    for outfile in neut_files:
        if os.path.isfile(outfile):
            values.extend(_load_log_scores(outfile))
        else:
            print('missing: ' + outfile)
    print('loaded ' + str(len(values)) + ' values from neutral sims...')

    # Drop NaNs and infinities (e.g. log of a zero score) in a single pass.
    values = np.array(values)
    values = values[np.isfinite(values)]
    if values.size == 0:
        raise ValueError(
            "no finite scores loaded from neutral sims; "
            "cannot estimate normalization parameters")

    mean = np.mean(values)
    var = np.var(values)
    sd = np.sqrt(var)
    print("max: " + str(values.max()))
    print("min: " + str(values.min()))
    print("mean: " + str(mean))
    print("var: " + str(var))

    ############################
    ## NORMALIZE NEUTRAL SIMS ##
    ############################
    normedfile = None
    for outfile in neut_files:
        if os.path.isfile(outfile):
            normedfile = outfile + ".norm"
            # Existing .norm files are always overwritten.
            _write_normed_repfile(outfile, normedfile, mean, sd)
    # Report one example output path, but only if something was written.
    if normedfile is not None:
        print("wrote to eg: " + normedfile)

    ########################
    ## NORMALIZE SEL SIMS ##
    ########################
    normedfile = None
    for sel_freq_bin in sel_freq_bins:
        for irep in range(1, numPerBin_sel + 1):
            rawfile = get_sel_repfile_name(
                model, irep, selpop, sel_freq_bin,
                suffix=suffix, normed=False, basedir=writedir)
            if os.path.isfile(rawfile):
                normedfile = rawfile + ".norm"
                _write_normed_repfile(rawfile, normedfile, mean, sd)
    if normedfile is not None:
        print("wrote to eg: " + normedfile)
    return