예제 #1
0
def execute_gw_regions(args):
	''' apply significance cutoff to genome-wide data to identify regions

	For each chromosome 1-22 the normalized empirical CMS file is located with
	get_emp_cms_file(); chromosomes whose file does not exist are reported and
	contribute no windows. For every position in a loaded file, the scores
	returned by get_window() are passed to check_outliers(), and the position
	is kept when the returned value exceeds args.thresshold. Kept positions are
	merged per chromosome with merge_windows().

	Reads from args: writedir, emppop, thresshold, cutoff, regionlen, suffix,
	saveLog. When args.saveLog is not None, one tab-separated line
	"chr<N>  start  end" per region is written to that path.

	Returns None.
	'''
	basedir = args.writedir
	pop = args.emppop
	thresshold = args.thresshold
	cutoff = args.cutoff
	windowlen = args.regionlen
	suffix = args.suffix

	chroms = range(1, 23)
	signif_windows = []
	####################
	## LOOP OVER CHRS ##
	####################
	for chrom in chroms:
		chrom_signif = []
		normedempfilename = get_emp_cms_file(pop, chrom, normed=True, suffix=suffix, basedir=basedir)
		if not os.path.isfile(normedempfilename):
			print("missing: " + normedempfilename)
		else:
			physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(normedempfilename)
			for iPos, pos in enumerate(physpos):
				##################
				## CHECK REGION ##
				##################
				window_scores = get_window(iPos, physpos, cms_normed, windowlen)
				percentage = check_outliers(window_scores, cutoff)
				if percentage > thresshold:
					chrom_signif.append(pos)
		# appended even when the file is missing, so list index + 1 == chromosome number
		signif_windows.append(chrom_signif)

	##############################
	## MERGE CONTIGUOUS WINDOWS ##
	##############################
	final_starts = []
	final_ends = []
	print('merging regions')
	for chrom_signif in signif_windows:
		starts, ends = merge_windows(chrom_signif, windowlen)
		final_starts.append(starts)
		final_ends.append(ends)

	###################
	## WRITE TO FILE ##
	###################
	if args.saveLog is not None:
		writefilename = args.saveLog
		# context manager closes the file even if a write fails part-way
		with open(writefilename, 'w') as writefile:
			for ichrom, (starts, ends) in enumerate(zip(final_starts, final_ends)):
				chromnum = ichrom + 1
				# NOTE(review): the last entry of `starts` is never written; whether that is
				# intended depends on what merge_windows returns -- confirm before changing.
				for iregion in range(len(starts) - 1):
					writeline = "chr" + str(chromnum) + "\t" + str(starts[iregion]) + "\t" + str(ends[iregion]) + '\n'
					writefile.write(writeline)
		print('wrote to ' + writefilename)
	return
예제 #2
0
def execute_tpr(args):
	''' estimate true positive rate for region detection

	Gathers the existing normalized selection-replicate files (named by
	get_sel_repfile_name) for every selected-allele-frequency bin in the active
	bin group, scores each replicate with check_rep_windows(), and passes the
	collected per-replicate results to calc_pr() together with args.thresshold.

	Reads from args: model, regionlen, thresshold, cutoff, nrep, simpop,
	suffix, writedir, score, saveLog. When args.saveLog is not None the rate
	and the run parameters are written to "<saveLog>_<bin label>".

	Returns None.
	'''
	model = args.model
	regionlen = args.regionlen
	thresshold = args.thresshold
	cutoff = args.cutoff
	numReps = args.nrep
	pop = args.simpop
	suffix = args.suffix
	writedir = args.writedir
	takeScore = args.score

	# NOTE: these accumulators are not reset per bin group; that only matters
	# if more than one index is enabled in the loop below.
	all_scores = []
	all_percentages = []

	#per seldaf
	dafbins = [['0.10', '0.20', '0.30', '0.40', '0.50', '0.60', '0.70', '0.80', '0.90'], ['0.10', '0.20', '0.30'], ['0.40', '0.50', '0.60'], ['0.70', '0.80', '0.90'], ['0.90']]
	daflabels = ['all', 'lo', 'mid', 'hi','highest']
	for ibin in [3]: # only the 'hi' group is evaluated; use other indices to select other groups
		thesebins, thislabel = dafbins[ibin], daflabels[ibin]
		allrepfilenames = []
		for selbin in thesebins:
			for irep in range(1, numReps + 1):
				repfilename = get_sel_repfile_name(model, irep, pop, selbin, normed=True, suffix=suffix, basedir=writedir)
				if (irep==1):
					print(repfilename)
				if os.path.isfile(repfilename):
					allrepfilenames.append(repfilename)
		print('loaded ' + str(len(allrepfilenames)) + " replicates...")
		chosen = allrepfilenames # every available replicate is used (no subsampling)
		for repfilename in chosen:
			# these local names must stay as they are: eval(takeScore) below looks one of them up
			physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(repfilename)
			# SECURITY: eval() runs args.score as Python code. It is presumably meant to
			# name one of the arrays unpacked above (e.g. cms_normed); never feed it untrusted input.
			these_scores = eval(takeScore)
			if len(these_scores) > 0:
				all_scores.append(these_scores)
				rep_percentages = check_rep_windows(physpos, these_scores, regionlen, cutoff = cutoff)
				all_percentages.append(rep_percentages)

		print('loaded ' + str(len(all_scores)) + " replicates populations for model " + model + "...")
		tpr = calc_pr(all_percentages, thresshold)
		print('true positive rate: ' + str(tpr) + "\n")

		if args.saveLog is not None:
			writefilename = args.saveLog +"_" + thislabel
			# context manager closes the file even if a write fails part-way
			with open(writefilename, 'w') as writefile:
				writefile.write(str(tpr)+'\n')
				writefile.write(model + "\t" + str(regionlen) + "\t" + str(thresshold) + '\t' + str(cutoff) + '\n')
			print('wrote to :  ' + str(writefilename))
	return
예제 #3
0
def execute_fpr(args):
	''' estimate false positive rate for region identification

	Reads neutral replicates 1..args.nrep (file names from
	get_neut_repfile_name), scores each with check_rep_windows(), and passes
	the collected per-replicate results to calc_pr() together with
	args.thresshold. Replicates whose largest value exceeds args.thresshold
	are reported on stdout as false positives.

	Reads from args: model, regionlen, thresshold, cutoff, nrep, simpop,
	suffix, writedir, score, saveLog. When args.saveLog is not None the rate
	and the run parameters are written to that path.

	Returns None.
	'''
	model = args.model
	regionlen = args.regionlen
	thresshold = args.thresshold
	cutoff = args.cutoff
	numReps = args.nrep
	pop = args.simpop
	suffix = args.suffix
	writedir = args.writedir
	takeScore = args.score

	all_scores = []
	all_percentages = []

	for irep in range(1, numReps + 1):
		repfilename = get_neut_repfile_name(model, irep, pop, normed=True, suffix=suffix, basedir=writedir)
		if (irep==1):
			print(repfilename)
		# NOTE(review): no existence check here; a missing replicate is handled
		# (or not) by read_cms_repfile itself.
		# these local names must stay as they are: eval(takeScore) below looks one of them up
		physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(repfilename)
		# SECURITY: eval() runs args.score as Python code. It is presumably meant to
		# name one of the arrays unpacked above (e.g. cms_normed); never feed it untrusted input.
		these_scores = eval(takeScore)
		if len(these_scores) > 0:
			all_scores.append(these_scores)
			rep_percentages = check_rep_windows(physpos, these_scores, regionlen, cutoff = cutoff)
			all_percentages.append(rep_percentages)
			if len(rep_percentages) > 0:
				if max(rep_percentages) > thresshold:
					print("false positive: " + repfilename)

	print('loaded ' + str(len(all_scores)) + " replicates populations for model " + model + "...")
	fpr = calc_pr(all_percentages, thresshold)
	print('false positive rate: ' + str(fpr) + "\n")

	if args.saveLog is not None:
		writefilename = args.saveLog
		# context manager closes the file even if a write fails part-way
		with open(writefilename, 'w') as writefile:
			writefile.write(str(fpr)+'\n')
			writefile.write(model + "\t" + str(regionlen) + "\t" + str(thresshold) + '\t' + str(cutoff) + '\n')
		print('wrote to :  ' + str(writefilename))
	return
예제 #4
0
def execute_cdf(args):
	""" visualize power to localize variants: estimate p(causal variant captured | signif thresshold includes x top SNPs) from simulates. plot as cumulative density function

	For populations 1-4 and each scenario in the hard-coded list, every existing
	un-normalized selection replicate (file names from get_sel_repfile_name) is
	read; when args.selPos is among its positions, the rank returned by
	get_causal_rank() for that position's cms_unnormed value is recorded
	(NaN ranks are dropped). One curve per population with at least one rank is
	drawn from get_cdf_from_causal_ranks() and the figure is saved to
	args.savefilename.

	Reads from args: nrep, savefilename, writedir, model, selPos, suffix.
	Returns None.
	"""
	reps = args.nrep
	savefilename = args.savefilename
	writedir = args.writedir
	scenars = ['0.70', '0.80', '0.90']
	model = args.model
	causalPos = args.selPos
	suffix = args.suffix

	pops = [1, 2, 3, 4]
	pop_colors = {1: "yellow", 2: "blue", 3: "green", 4: "purple"}
	# one rank list per population, looked up by key instead of eval() on a variable name
	causal_ranks = {pop: [] for pop in pops}

	for pop in pops:
		for scenar in scenars:
			for irep in range(1, reps+1):
				cmsfilename = get_sel_repfile_name(model, irep, pop, scenar, normed = False, basedir=writedir, suffix=suffix)
				if not os.path.isfile(cmsfilename):
					print("missing; " + cmsfilename)
					continue
				physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(cmsfilename)
				if causalPos in physpos:
					causal_index = physpos.index(causalPos)
					causal_unnormed = cms_unnormed[causal_index]
					causal_rank = get_causal_rank(cms_unnormed, causal_unnormed)
					if not np.isnan(causal_rank):
						causal_ranks[pop].append(causal_rank)

	for pop in pops:
		print("for pop " + str(pop) + ", loaded " + str(len(causal_ranks[pop])) + " replicates.")

	cdf_fig, cdf_ax = plt.subplots()
	for pop in pops:
		if len(causal_ranks[pop]) > 0:
			cdf_bins, cdf = get_cdf_from_causal_ranks(causal_ranks[pop])
			# the first bin entry is skipped so the x values line up with cdf
			cdf_ax.plot(cdf_bins[1:], cdf, color=pop_colors[pop])
	cdf_ax.set_xlim([0, 50])
	plt.title(model)
	plt.ylabel('probability that the causal variant is captured')
	plt.xlabel('significance thresshold (i.e., examining the top x variants)')
	plt.savefig(savefilename)
	plt.close()
	print('plotted to ' + savefilename)
	return
예제 #5
0
def execute_extended_manhattan(args):
	""" generate a genome-wide plot of CMS scores with option to hilight outlier regions

	Draws one panel per chromosome (22 stacked panels sharing both axes) from
	the normalized empirical CMS files located with get_emp_cms_file().
	Loading stops at the first chromosome whose file is missing, so later
	panels stay empty. Optionally overlays:
	  * regions loaded from args.regionsfile, re-plotted in the colour of the
	    population's model group;
	  * a dotted red line at the args.percentile score cutoff (computed over
	    all loaded non-NaN scores), with points at or above it re-plotted in
	    the group colour.

	Reads from args: plotscore, emppop, writedir, suffix, savefilename, dpi,
	titlestring, regionsfile, percentile. Raises KeyError when args.emppop is
	not one of the known population codes. Saves to args.savefilename and
	returns None.
	"""
	plotscore = args.plotscore
	selpop = args.emppop
	basedir = args.writedir
	suffix = args.suffix
	savename = args.savefilename
	dpi = args.dpi
	numChr = 22
	titlestring = args.titlestring

	# population code -> model group (used only to pick the highlight colour)
	modelpops = {'YRI':1, 'GWD':1, 'LWK':1, 'MSL':1, 'ESN':1, 
				'CEU':2, 'FIN':2, 'IBS':2, 'TSI':2, 'GBR':2, 'IRN':2,
				'CHB':3, 'JPT':3, 'KHV':3, 'CDX':3, 'CHS':3, 
				'BEB':4, 'STU':4, 'ITU':4, 'PJL':4, 'GIH':4}
	pop = modelpops[selpop]
	#colorDict = {1:'#FFB933', 2:'#0EBFF0', 3:'#ADCD00', 4:'#8B08B0'} #1000 Genomes group color scheme
	colorDict = {1:'#cec627', 2:'#0EBFF0', 3:'#65ff00', 4:'#8B08B0'} #make it pop-!

	f, axarr = plt.subplots(numChr, 1, sharex = True, sharey=True, dpi=dpi, figsize=(7, 10))
	plt.suptitle(titlestring, fontsize=10)

	plt.xlabel('position')
	plt.ylabel('cms_gw normed score')

	all_emp_pos, all_emp_scores = [], []
	for chrom in range(1, numChr + 1):
		emp_cms_filename = get_emp_cms_file(selpop, chrom, normed=True, suffix=suffix, basedir=basedir)
		print('loading chr ' + str(chrom) + ": " + emp_cms_filename)
		if not os.path.isfile(emp_cms_filename):
			print("missing: " + emp_cms_filename)
			# Stopping here keeps list index == chrom - 1 for everything loaded so far,
			# but the lists may end up shorter than numChr; the overlays below guard for that.
			break
		# these local names must stay as they are: eval(plotscore) below looks one of them up
		physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed, xpehh_normed, fst, deldaf, cms_unnormed, cms_normed = read_cms_repfile(emp_cms_filename)

		ax = axarr[chrom - 1]
		# SECURITY: eval() runs args.plotscore as Python code. It is presumably meant to
		# name one of the arrays unpacked above (e.g. cms_normed); never feed it untrusted input.
		plot_data = eval(plotscore)
		plotManhattan_extended(ax, plot_data, physpos, chrom)
		all_emp_pos.append(physpos)
		all_emp_scores.append(plot_data)
	numLoaded = len(all_emp_pos)

	################################
	## HILITE SIGNIFICANT REGIONS ##
	################################

	if args.regionsfile is not None:
		regionchrs, regionstarts, regionends = load_regions(args.regionsfile)
		print('loaded ' + str(len(regionchrs)) + ' significant regions from ' + args.regionsfile)
		for regionchr, regionstart, regionend in zip(regionchrs, regionstarts, regionends):
			this_chrom = int(regionchr.strip('chr'))
			ichrom = this_chrom - 1
			if not 0 <= ichrom < numLoaded:
				# no data for this chromosome (out of range, or loading stopped earlier)
				print('no data loaded for chr' + str(this_chrom) + '; skipping region')
				continue
			plotpos, plotvals = [], []
			# NOTE(review): the first locus past regionend is appended before the loop
			# stops, so the highlight extends one point beyond the region -- confirm intent.
			for locuspos, locusval in zip(all_emp_pos[ichrom], all_emp_scores[ichrom]):
				if locuspos >= regionstart:
					plotpos.append(locuspos)
					plotvals.append(locusval)
				if locuspos > regionend:
					break
			axarr[ichrom].plot(plotpos, plotvals, color=colorDict[pop], markersize=1)

	if args.percentile is not None:
		percentile = float(args.percentile)
		print('plotting data with heuristic cutoff for ' + str(percentile) + " percentile...")
		flat_emp_scores = [item for sublist in all_emp_scores for item in sublist if not np.isnan(item)]
		if flat_emp_scores:
			score_cutoff = float(np.percentile(flat_emp_scores, percentile))
			print("score cutoff: " + str(score_cutoff))
			for iax in range(numChr):
				ax = axarr[iax]
				maximumVal = ax.get_xlim()[1]
				xpoints = np.array([0, maximumVal])
				ypoints = np.array([score_cutoff, score_cutoff])
				ax.plot(xpoints, ypoints ,linestyle = "dotted", color="red", markersize=.3)

				if iax >= numLoaded:
					continue # cutoff line only; nothing was loaded for this chromosome
				# re-plot the loci that reach the cutoff in the population colour
				significant = [(score, position) for score, position in zip(all_emp_scores[iax], all_emp_pos[iax]) if score >= score_cutoff]
				signif_vals = [item[0] for item in significant]
				signif_pos = [item[1] for item in significant]
				ax.plot(signif_pos, signif_vals, color=colorDict[pop], linestyle='None', marker=".", markersize=.3)
		else:
			print('no scores loaded; skipping percentile cutoff')

	plt.savefig(savename)
	# release the 22-panel figure once it is on disk
	plt.close(f)
	print('saved to: ' + savename)
	return
예제 #6
0
def execute_regionviz(args):
	''' plot the component scores and the composite score of one region as stacked panels

	Reads args.cmsInfile with read_cms_repfile() and draws seven panels that
	share an x axis, one per score, via quick_plot(). When args.hilitePos is
	given and present among the positions, its index is handed to quick_plot();
	otherwise -1 is passed. The figure is saved to args.savefilename.
	Nothing is done (and nothing is printed) when the input file does not exist.

	Returns None.
	'''
	out_path = args.savefilename
	in_path = args.cmsInfile
	if not os.path.isfile(in_path):
		return

	print('loading from... ' + in_path)
	(positions, gen_positions, freqs, ihs, delihh, nsl, xpehh,
		fst_vals, deldaf_vals, cms_raw, cms_norm) = read_cms_repfile(in_path)

	# -1 tells quick_plot there is no position to highlight
	marker_index = -1
	wanted = args.hilitePos
	if wanted is not None and wanted in positions:
		marker_index = positions.index(wanted)

	# (values, panel label) in top-to-bottom drawing order
	panels = [
		(ihs, "ihs_normed"),
		(delihh, "delihh_normed"),
		(nsl, "nsl_normed"),
		(xpehh, "xpehh_normed"),
		(fst_vals, "fst"),
		(deldaf_vals, "deldaf"),
		(cms_raw, "cms"),
	]
	figure, (p1, p2, p3, p4, p5, p6, p7) = plt.subplots(7, sharex = True)
	for axis, (values, label) in zip((p1, p2, p3, p4, p5, p6, p7), panels):
		quick_plot(axis, positions, values, label, marker_index)

	plt.savefig(out_path)
	print("plotted to " + out_path)
	plt.close()
	return