def main():
	"""
	Collect the permutation counts of every (eQTL threshold, dataset,
	cutoff) combination into one summary file.

	For each combination the file
	<mr_folder>/<numfolder>/<dataset>/permutate_eqtl_<eQTL>_<dataset>_co<cutoff>.txt
	is read through read_data. The integer behind each of the labels
	lower_F1, higher_F1, lower_recall, higher_recall, lower_precision and
	higher_precision is extracted and one record per combination is
	appended to <mr_folder>/<numfolder>/summary_validation_numerics.txt.

	A label that does not occur in an input file is reported as None.
	A failure to write the summary is reported on stderr and does not
	stop the remaining combinations.
	"""
	import sys

	#-----------------------------------------------------------
	exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Snoek_2012','Keurentjes_2007']
	cutoff_list = [3, 4.3, 6.7]
	eQTL_threshold = [0,1,2,3]
	#-----------------------------------------------------------

	#Labels to extract, in the order in which they are written
	labels = (
		"lower_F1", "higher_F1",
		"lower_recall", "higher_recall",
		"lower_precision", "higher_precision",
	)

	outputpath = "%s/%s/summary_validation_numerics.txt"%(
										 fa.mr_folder, fa.numfolder
											)

	for eQTL in eQTL_threshold:

		for dataset in exp_list:

			substorage = "%s/%s/%s"%(fa.mr_folder, fa.numfolder, dataset)

			for cutoff in cutoff_list:

				inputpath = "%s/permutate_eqtl_%s_%s_co%s.txt"%(
													 substorage, eQTL,
													 dataset, cutoff
													)
				data = read_data(inputpath)

				#Start from a clean slate for every file, so a missing label
				#can never report the value found in a previously read file
				counts = dict.fromkeys(labels)

				for line in data:
					for label in labels:
						prefix = label + ":"
						if line.startswith(prefix):
							counts[label] = int(line[len(prefix):].strip())
							break

				record = [
					"dataset %s\n"%dataset,
					"cutoff %s\n"%cutoff,
					"eQTLs %s\n"%eQTL,
				]
				record.extend("%s %s\n"%(label, counts[label]) for label in labels)
				record.append("\n")

				#The record is built completely before the file is opened, so
				#a record is either appended as a whole or not at all
				try:
					with open(outputpath, 'a') as fo:
						fo.writelines(record)
				except (IOError, OSError) as err:
					sys.stderr.write(
						"Could not write to %s: %s\n"%(outputpath, err)
					)
def read_predicted_confusion_data(fn):
	"""
	Return (recall, precision, F1) parsed from the lines of fn.

	The lines are obtained through read_data. The scores are taken from
	the lines starting with "recall", "precis" and "F1"; the text "None"
	becomes None, anything else is converted to float. The tuple is
	returned at the first "F1" line that holds a truthy value. If no such
	line is found the function ends without a return statement, so the
	caller receives None.
	"""
	def as_score(text):
		#"None" is stored in the file for a score without a value
		text = text.strip()
		return None if text == "None" else float(text)

	for entry in read_data(fn):
		if entry.startswith("dataset:"):
			#parsed, but not part of the returned value
			dataset = entry[9:].strip()
		elif entry.startswith("cutoff:"):
			#parsed, but not part of the returned value
			cutoff = float(entry[8:].strip())
		elif entry.startswith("recall"):
			recall = as_score(entry[7:])
		elif entry.startswith("precis"):
			precision = as_score(entry[7:])
		elif entry.startswith("F1"):
			F1 = as_score(entry[3:])
			#a F1 of None or 0 does not end the search
			if F1:
				return recall, precision, F1
def main():
	"""
	Write reduced trait lists for every dataset / cutoff combination.

	Two independent steps are controlled by the local switches
	reduce_traits and reduce_traits_even_more:
	- reduce_traits: get the traits through get_trait_with_genelist and
	  store them as reduced_traitlist_<dataset>_co<cutoff>.txt
	- reduce_traits_even_more: take characters 7 up to 16 (stripped) of
	  every "trait:" line of the stored genelist file and store them as
	  emr_traitlist_<dataset>_co<cutoff>.txt
	"""
	#Datasets
	exp_list = ['Ligterink_2014', 'Ligterink_2014_gxe', 'Snoek_2012','Keurentjes_2007']

	#Variables
	chromosome = [1,2,3,4,5]
	cutoff_list = [6.7, 4.3, 3]

	#Switches
	reduce_traits = False
	reduce_traits_even_more = True

	for dataset in exp_list:
		storage_folder = "%s/%s/genelist_%s"%(fa.mr_folder, fa.gfolder, dataset)

		for cutoff in cutoff_list:

			if reduce_traits:
				print("retrieving traits for %s %s"%(dataset, cutoff))
				traits = get_trait_with_genelist(dataset, cutoff, chromosome)

				fileloc = "%s/%s/reduced_traitlist_%s_co%s.txt"%(
					fa.mr_folder, fa.trait_folder, dataset, cutoff
				)
				print("writing file to %s"%fileloc)
				write_trait_to_file(fileloc, traits)

			if reduce_traits_even_more:
				fname = "%s/genelist_%s_co%s.txt"%(storage_folder, dataset, cutoff)

				#emr = even more reduced
				emr_traits = [
					entry[7:16].strip()
					for entry in read_data(fname)
					if entry.startswith("trait:")
				]

				#store the traits under a new emr_traitfilename
				emr_fileloc = "%s/%s/emr_traitlist_%s_co%s.txt"%(
					fa.mr_folder, fa.trait_folder, dataset, cutoff
				)
				print("writing file to %s"%emr_fileloc)
				write_trait_to_file(emr_fileloc, emr_traits)
def get_truetraits(fn):
	"""
	Return the stripped lines of fn that start with "AT", in file order.

	The lines are obtained through read_data.
	"""
	return [entry.strip() for entry in read_data(fn) if entry.startswith("AT")]
def get_genelist(fn):
	"""
	Return a list of [trait, genelist] pairs read from fn.

	A "trait:" line sets the current trait (the text from position 7 on,
	stripped). Every line starting with "AT" is split on whitespace and
	stored together with the current trait.
	"""
	pairs = []
	for entry in read_data(fn):
		if entry.startswith("trait:"):
			trait = entry[7:].strip()
		elif entry.startswith("AT"):
			pairs.append([trait, entry.split()])

	return pairs
def get_enriched(fn):
	"""
	Return a dict that maps every trait in fn to its gene list.

	A "trait:" line sets the current trait (the text from position 7 on,
	stripped). Every line starting with "AT" is split on whitespace and
	stored under the current trait; a later gene line for the same trait
	replaces the earlier one.
	"""
	genes_per_trait = {}
	for entry in read_data(fn):
		if entry.startswith("trait:"):
			trait = entry[7:].strip()
		elif entry.startswith("AT"):
			genes_per_trait[trait] = entry.split()

	return genes_per_trait
def process_data(fn):
	"""
	Return the non-zero eQTL sizes found in fn, in file order.

	The size is the integer that follows position 11 of every line
	starting with "eQTL_size:".
	"""
	found = (
		int(entry[11:].strip())
		for entry in read_data(fn)
		if entry.startswith("eQTL_size:")
	)
	return [size for size in found if size != 0]
def get_info(fn):
    """
    Return a list of [trait, eqtl, genelist] entries read from fn.

    A "trait:" line sets the current trait (characters 7 up to 16) and an
    "eqtl:" line sets the current eQTL number (integer from position 6
    on). Every line starting with "AT" is split on whitespace and stored
    together with the current trait and eQTL number.
    """
    collected = []
    for entry in read_data(fn):
        if entry.startswith("trait:"):
            trait = entry[7:16]
        elif entry.startswith("eqtl:"):
            eqtl = int(entry[6:].strip())
        elif entry.startswith("AT"):
            collected.append([trait, eqtl, entry.split()])

    return collected
def get_enriched(fn):
	"""
	Return ([trait, eqtl] list, {trait: genelist} dict) read from fn.

	A "trait:" line sets the current trait (the text from position 7 on,
	stripped) and an "eqtl:" line sets the current eQTL number (integer
	from position 6 on). Every line starting with "AT" is split on
	whitespace; it adds a [trait, eqtl] entry to the list and stores the
	genes under the current trait in the dict.
	"""
	trait_eqtl_pairs = []
	genes_per_trait = {}
	for entry in read_data(fn):
		if entry.startswith("trait:"):
			trait = entry[7:].strip()
		elif entry.startswith("eqtl:"):
			eqtl = int(entry[6:].strip())
		elif entry.startswith("AT"):
			trait_eqtl_pairs.append([trait, eqtl])
			genes_per_trait[trait] = entry.split()

	return trait_eqtl_pairs, genes_per_trait
def main():
	"""
	Print, for every (eQTL threshold, dataset, cutoff) combination that
	has a stored reference F1 score, the number of true traits with more
	eQTLs than the threshold.

	A true trait is a TG that occurs in the true relations returned by
	get_TGTF_from_genelist. The number of eQTLs per trait is read from the
	enriched file of the combination through get_enriched. Combinations
	for which read_predicted_confusion_data yields no F1 are skipped
	without output.
	"""
	#Get test data from AtRegNet.txt
	AtReg_data = read_data(fa.filename_atreg)
	AtRegNet_parse = parse_AtReg_data(AtReg_data)

	#Reference relations, stored as [TG, TF]
	TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
	TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
	TG_list_ref = [info[0] for info in TG_TF_ref]
	TF_list_ref = [info[1] for info in TG_TF_ref]

	#-----------------------------------------------------------
	exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Snoek_2012','Keurentjes_2007']
	cutoff_list = [3,4.3,6.7]
	eQTL_threshold_list = [0,1,2,3]
	#-----------------------------------------------------------

	#get the premade 1000 distinct seeds of 8 digits each
	#NOTE(review): the seeds are not used below; the read is kept so a
	#missing seed file still fails here - confirm before removing
	seedfile = "%s/%s/random_seeds.txt"%(fa.mr_folder, fa.numfolder)
	seeds = read_seeds(seedfile)

	print("start:")
	print("----------------------------")

	for eQTL_threshold in eQTL_threshold_list:

		for dataset in exp_list:

			for cutoff in cutoff_list:

				############################################################
				####Retrieve original confusion matrix results
				subfolder_F1 = "/eqtl_%s/valnum_%s"%(eQTL_threshold, dataset)
				F1_fn = "%s/%s/%s/valnum_results_%s_co%s"%(
												fa.mr_folder, fa.numfolder,
												subfolder_F1, dataset, cutoff
												)
				#A result file without a usable F1 makes the unpacking fail;
				#that is the regular "nothing to do" case. Exception (instead
				#of a bare except) keeps Ctrl-C and sys.exit working.
				try:
					ref_recall, ref_precision, ref_F1 = read_predicted_confusion_data(F1_fn)
				except Exception:
					ref_recall = ref_precision = ref_F1 = None

				if ref_F1 is None:
					continue

				############################################################
				####Retrieve genelist
				subfolder_genelist = "genelist_%s"%dataset
				genelist_fn = "%s/%s/%s/genelist_%s_co%s.txt"%(
												fa.mr_folder, fa.gfolder,
												subfolder_genelist, dataset,
												cutoff
												)
				#NOTE(review): result not used below; call kept as in the
				#original - confirm before removing
				trait_genelist_list = get_genelist(genelist_fn)
				true_rel, total_rel = get_TGTF_from_genelist(
														genelist_fn, TG_TF_ref,
														TG_list_ref, TF_list_ref
														)
				#a set, because it is only used for membership tests
				tt_genes = set(info[0] for info in true_rel)

				############################################################
				####Retrieve enriched list
				subfolder_enriched = "enriched_%s"%dataset
				enriched_fn = "%s/%s/%s/enriched_%s_co%s.txt"%(
												fa.mr_folder, fa.enriched_folder,
												subfolder_enriched, dataset,
												cutoff
												)
				trait_eqtl_genelist, dict_trait_enriched = get_enriched(enriched_fn)

				############################################################
				#get all true traits that have more than X eQTLs,
				#where X = eQTL_threshold
				trait_with_eqtl = [
					t[0] for t in trait_eqtl_genelist
					if t[0] in tt_genes and t[1] > eQTL_threshold
				]

				print("dataset: %s"%dataset)
				print("cutoff: %s"%cutoff)
				print("eQTL_threshold: %s"%eQTL_threshold)
				print("traits with eQTL: %s"%len(trait_with_eqtl))
# Example #11
# 0
def main():
	"""
	Run the confusion matrix steps of confusion_matrix.py on artificial
	data.

	The reference TG-TF relations from AtRegNet are copied and used as the
	artificial prediction. Depending on the flags true_P, false_P and
	false_N (not set inside this function, so they have to exist at module
	level) the copy is evaluated as is, with added false positives, with
	added false negatives, or with both. Noise is added for 20, 40, 50 and
	60 percent; the percentage is passed to print_results in the place of
	the cutoff.

	Every evaluation runs the same four steps:
	identify_true_false_positives -> count_false_negatives ->
	calculate_confusion -> print_results
	"""
	#Get test data from AtRegNet.txt
	AtReg_data = read_data(fa.filename_atreg)
	AtRegNet_parse = parse_AtReg_data(AtReg_data)

	#Reference relations, stored as [TG, TF]
	TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
	TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]

	TG_list_ref = [info[0] for info in TG_TF_ref]
	sh_TG_list_ref = list(set(TG_list_ref))

	TF_list_ref = [info[1] for info in TG_TF_ref]
	sh_TF_list_ref = list(set(TF_list_ref))

	###########################################################
	artificial_data = copy.deepcopy(TG_TF_ref)
	dataset = "artificial dataset"
	percentages = [20, 40, 50, 60]

	def report(cutoff, predicted_rel, total_rel, tt_genes):
		#Evaluate predicted_rel against the reference and print the result
		true_pred_rel, false_pred_rel = identify_true_false_positives(
															predicted_rel,
															TG_TF_ref
															)
		unpredicted_rel = count_false_negatives(
												TG_TF_ref, true_pred_rel,
												tt_genes
												)
		TP, FP, FN, TN, recall, specif, precis = calculate_confusion(
									total_rel, true_pred_rel,
									false_pred_rel, unpredicted_rel
									)
		print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)

	###########################################################
	if true_P:
		#No noise: the prediction equals the reference
		tt_genes = [info[0] for info in artificial_data]
		report(0, artificial_data, artificial_data, tt_genes)

	###########################################################
	if not false_N and false_P:
		#Add some noise in the form of False Positives
		for perc in percentages:
			noised_artificial_data = add_false_positives(
										artificial_data, sh_TG_list_ref,
										sh_TF_list_ref, perc
										)
			tt_genes = [info[0] for info in artificial_data]
			report(perc, noised_artificial_data, noised_artificial_data, tt_genes)

	###########################################################
	if false_N and not false_P:
		#Add some noise in the form of False Negatives
		for perc in percentages:
			noised_artificial_data = add_false_negatives(artificial_data, TG_TF_ref, perc)
			tt_genes = [info[0] for info in artificial_data]
			report(perc, noised_artificial_data, artificial_data, tt_genes)

	###########################################################
	if false_P and false_N:
		#Add some noise in the form of False Positives
		#And then add some False_Negatives
		for perc in percentages:
			print("ad: %s"%len(artificial_data))
			noised_artificial_data = add_false_positives(
													artificial_data,
													sh_TG_list_ref,
													sh_TF_list_ref, perc
													)
			print("nad: %s"%len(noised_artificial_data))
			more_noised_artificial_data = add_false_negatives(
												noised_artificial_data,
												TG_TF_ref, perc
												)
			print("mnad: %s"%len(more_noised_artificial_data))

			#In this branch the true traits and the total relations are
			#taken from the data with false positives, as in the original
			tt_genes = [info[0] for info in noised_artificial_data]
			report(perc, more_noised_artificial_data, noised_artificial_data, tt_genes)
def main():
	"""
	Permutation test of the enrichment based TG-TF predictions.

	For every (dataset, cutoff, eQTL threshold) combination with a stored
	reference F1 score, every seed from random_seeds.txt is used to draw a
	random gene sample per trait. Only traits are sampled that are a true
	trait with more eQTLs than the threshold and that have an enriched
	gene list; the sample has the size of that enriched gene list. The
	samples are passed through process_enrichment,
	identify_true_false_positives, count_false_negatives and
	calculate_confusion. It is counted how often recall, precision and F1
	of the random samples are lower than, or at least as high as, the
	stored reference values.

	Depending on the local switches the counts are written per combination
	(write_conf), printed (print_conf) and written as a summary
	(write_summary).

	A combination without a usable reference F1 is skipped completely, so
	counts of an earlier combination are never written or printed under
	the label of a later one.
	"""
	#Get test data from AtRegNet.txt
	AtReg_data = read_data(fa.filename_atreg)
	AtRegNet_parse = parse_AtReg_data(AtReg_data)

	#Reference relations, stored as [TG, TF]
	TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
	TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
	TG_list_ref = [info[0] for info in TG_TF_ref]
	TF_list_ref = [info[1] for info in TG_TF_ref]
	TF_set_ref = set(TF_list_ref)

	#-----------------------------------------------------------
	#all datasets: 'Ligterink_2014','Ligterink_2014_gxe','Snoek_2012','Keurentjes_2007'
	#all cutoffs: 3, 4.3, 6.7
	exp_list = ['Snoek_2012']
	cutoff_list = [3]
	eQTL_threshold_list = [0,1,2,3]
	#-----------------------------------------------------------

	#get the premade 1000 distinct seeds of 8 digits each
	seedfile = "%s/%s/random_seeds.txt"%(fa.mr_folder, fa.numfolder)
	seeds = read_seeds(seedfile)

	write_summary = False
	write_conf = False
	print_conf = True

	separator = "-------------------------"
	summary = []

	for dataset in exp_list:

		for cutoff in cutoff_list:

			for eQTL_threshold in eQTL_threshold_list:

				print("Initializing analysis for dataset %s with cutoff %s"%(dataset, cutoff))

				############################################################
				####Retrieve original confusion matrix results
				subfolder_F1 = "/eqtl_%s/valnum_%s"%(eQTL_threshold, dataset)
				F1_fn = "%s/%s/%s/valnum_results_%s_co%s"%(
												fa.mr_folder, fa.numfolder,
												subfolder_F1, dataset, cutoff
												)
				#A result file without a usable F1 makes the unpacking fail;
				#that is the regular "nothing to compare with" case.
				#Exception (instead of a bare except) keeps Ctrl-C working.
				try:
					ref_recall, ref_precision, ref_F1 = read_predicted_confusion_data(F1_fn)
				except Exception:
					ref_recall = ref_precision = ref_F1 = None

				if ref_F1 is None:
					print("No reference F1 found in %s, skipping"%F1_fn)
					continue

				############################################################
				####Retrieve genelist
				subfolder_genelist = "genelist_%s"%dataset
				genelist_fn = "%s/%s/%s/genelist_%s_co%s.txt"%(
												fa.mr_folder, fa.gfolder,
												subfolder_genelist, dataset,
												cutoff
												)
				trait_genelist_list = get_genelist(genelist_fn)
				true_rel, total_rel = get_TGTF_from_genelist(
														genelist_fn, TG_TF_ref,
														TG_list_ref, TF_list_ref
														)
				#list for count_false_negatives, set for membership tests
				tt_genes = list(set([info[0] for info in true_rel]))
				tt_gene_set = set(tt_genes)

				############################################################
				####Retrieve enriched list
				subfolder_enriched = "enriched_%s"%dataset
				enriched_fn = "%s/%s/%s/enriched_%s_co%s.txt"%(
												fa.mr_folder, fa.enriched_folder,
												subfolder_enriched, dataset,
												cutoff
												)
				trait_eqtl_genelist, dict_trait_enriched = get_enriched(enriched_fn)

				############################################################
				#get all traits that have more than X eQTLs, where X = eQTL_threshold
				trait_with_eqtl = [
					t[0] for t in trait_eqtl_genelist
					if t[0] in tt_gene_set and t[1] > eQTL_threshold
				]
				print("traits with eQTL %s"%len(trait_with_eqtl))

				############################################################
				#Which trait is sampled, and with which sample size, does not
				#depend on the seed; work it out once instead of per seed
				eligible = set(trait_with_eqtl)
				sampling_plan = [
					(g_trait, g_genelist, len(dict_trait_enriched[g_trait]))
					for g_trait, g_genelist in trait_genelist_list
					if g_trait in eligible and g_trait in dict_trait_enriched
				]

				higher_recall=lower_recall=higher_precision=lower_precision=0
				higher_F1=lower_F1=0

				permutated_confusion = []
				#permutate!
				for seedling in seeds:

					#create [trait - 0 - sample gene list]
					trait_randomsample = [
						[g_trait, 0, select_random_sample(g_genelist, sample_size, seedling)]
						for g_trait, g_genelist, sample_size in sampling_plan
					]

					#TG_TF_pred is summed over all traits in a (dataset, cutoff) combination
					TG_TF_pred = process_enrichment(trait_randomsample, TF_set_ref)

					#proceed with the random sample to the confusion matrix
					true_pred_rel, false_pred_rel = identify_true_false_positives(
																		TG_TF_pred,
																		TG_TF_ref
																		)
					unpredicted_rel = count_false_negatives(
															TG_TF_ref, true_pred_rel,
															tt_genes
															)
					TP, FP, FN, TN, recall, specif, precision, F1 = calculate_confusion(
												total_rel, true_pred_rel,
												false_pred_rel, unpredicted_rel
												)

					permutated_confusion.append([TP, FP, FN, TN, recall, specif, precision, F1])

					#NOTE(review): ref_recall / ref_precision may be None; the
					#comparisons are kept exactly as in the original
					if recall is not None:
						if recall < ref_recall:
							lower_recall += 1
						if recall >= ref_recall:
							higher_recall += 1

					if precision is not None:
						if precision < ref_precision:
							lower_precision += 1
						if precision >= ref_precision:
							higher_precision += 1

					if F1 is not None:
						if F1 < ref_F1:
							lower_F1 += 1
						if F1 >= ref_F1:
							higher_F1 += 1

				counts = [
					lower_recall, higher_recall,
					lower_precision, higher_precision,
					lower_F1, higher_F1,
				]
				summary.append([dataset, cutoff] + counts)

				if write_conf:
					substorage = "%s/%s/%s"%(fa.mr_folder, fa.numfolder, dataset)
					if not os.path.exists(substorage):
						os.mkdir(substorage)

					resultsfolder_conf = "%s/permutate_eqtl_%s_%s_co%s.txt"%(
														 substorage, eQTL_threshold,
														 dataset, cutoff
														)
					print("Writing to file %s"%resultsfolder_conf)
					_write_permutation_results(
						resultsfolder_conf, dataset, cutoff,
						counts, permutated_confusion
					)

				if print_conf:
					print(separator)
					if permutated_confusion:
						#confusion matrix of the last permutation
						TP, FP, FN, TN = permutated_confusion[-1][:4]
						print("TP\t%s\tFN\t%s"%(TP, FN))
						print("FP\t%s\tTN\t%s"%(FP, TN))
					print(separator)
					print("dataset: %s"%dataset)
					print("cutoff: %s"%cutoff)
					print("eQTL: %s"%eQTL_threshold)
					print(separator)
					print("lower_F1:\t%s"%lower_F1)
					print("higher_F1:\t%s"%higher_F1)
					print(separator)
					print("lower_recall: %s"%lower_recall)
					print("higher_recall: %s"%higher_recall)
					print("lower_precision: %s"%lower_precision)
					print("higher_precision: %s"%higher_precision)
					print(separator)

			if write_summary:
				#NOTE(review): kept as in the original - the file name holds
				#the last eQTL threshold only, while summary holds the entries
				#of every combination processed so far and the entries do not
				#store their threshold. Confirm that this is intended.
				summfolder_conf = "%s/%s/permutate_summary_eqtl_%s.txt"%(
												fa.mr_folder, fa.numfolder,
												eQTL_threshold
												)
				_write_permutation_summary(summfolder_conf, summary)


def _write_permutation_results(path, dataset, cutoff, counts, permutated_confusion):
	"""
	Write the counts of one combination, followed by the confusion matrix
	of every permutation, to path. A write failure is reported and does
	not stop the analysis.

	counts = [lower_recall, higher_recall, lower_precision,
	          higher_precision, lower_F1, higher_F1]
	"""
	(lower_recall, higher_recall,
	 lower_precision, higher_precision,
	 lower_F1, higher_F1) = counts

	separator = "-------------------------"
	lines = [
		separator,
		"dataset: %s"%dataset,
		"cutoff: %s"%cutoff,
		separator,
		"lower_F1: \t%s"%lower_F1,
		"higher_F1: \t%s"%higher_F1,
		separator,
		"lower_recall: %s"%lower_recall,
		"higher_recall: %s"%higher_recall,
		"lower_precision: %s"%lower_precision,
		"higher_precision: %s"%higher_precision,
		separator,
	]
	for TP, FP, FN, TN, recall, specif, precision, F1 in permutated_confusion:
		lines.extend([
			separator,
			"TP\t%s\tFN\t%s"%(TP, FN),
			"FP\t%s\tTN\t%s"%(FP, TN),
			separator,
			"recall\t%s"%recall,
			"specificity\t%s"%specif,
			"precision\t%s"%precision,
			"F1\t%s"%F1,
			separator,
		])

	try:
		with open(path, 'w') as fo:
			fo.write("\n".join(lines) + "\n")
	except (IOError, OSError) as err:
		print("Could not write %s: %s"%(path, err))


def _write_permutation_summary(path, summary):
	"""
	Write the counts of every entry in summary to path. A write failure is
	reported and does not stop the analysis.

	Every entry is [dataset, cutoff, lower_recall, higher_recall,
	lower_precision, higher_precision, lower_F1, higher_F1].
	"""
	separator = "-------------------------"
	lines = []
	for (s_dataset, s_cutoff,
		 s_lower_recall, s_higher_recall,
		 s_lower_precision, s_higher_precision,
		 s_lower_F1, s_higher_F1) in summary:
		lines.extend([
			separator,
			"dataset: %s"%s_dataset,
			"cutoff: %s"%s_cutoff,
			separator,
			"lower_F1: \t%s"%s_lower_F1,
			"higher_F1: \t%s"%s_higher_F1,
			separator,
			"recall:",
			"lower: %s"%s_lower_recall,
			"higher: %s"%s_higher_recall,
			"precision:",
			"lower: %s"%s_lower_precision,
			"higher: %s"%s_higher_precision,
			separator,
		])

	try:
		with open(path, 'w') as fo:
			if lines:
				fo.write("\n".join(lines) + "\n")
	except (IOError, OSError) as err:
		print("Could not write %s: %s"%(path, err))
def main():
    """
    Print the confusion matrix of the enrichment based TG-TF predictions
    for every dataset / cutoff combination.

    Steps per combination:
    get_TGTF_from_genelist -> get_info -> process_enrichment ->
    identify_true_false_positives -> count_false_negatives ->
    calculate_confusion -> print_results
    """
    # Get test data from AtRegNet.txt
    AtRegNet_parse = parse_AtReg_data(read_data(fa.filename_atreg))

    # Reference relations, turned around to [TG, TF]
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ["all"])
    TG_TF_ref = [[pair[1], pair[0]] for pair in TF_TG_ref]

    TG_list_ref = [pair[0] for pair in TG_TF_ref]
    TF_list_ref = [pair[1] for pair in TG_TF_ref]
    TG_set_ref = set(TG_list_ref)
    TF_set_ref = set(TF_list_ref)

    # -----------------------------------------------------------
    # exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Keurentjes_2007','Snoek_2012']
    exp_list = ["Ligterink_2014"]
    cutoff_list = [3]  # , 4.3, 6.7]
    chromo = [1, 2, 3, 4, 5]
    # -----------------------------------------------------------

    for dataset in exp_list:
        for cutoff in cutoff_list:
            print("Analysing %s %s" % (dataset, cutoff))

            # Both input files are named after the dataset and the cutoff
            file_key = (dataset, dataset, cutoff)
            filelocation = "%s/%s/genelist_%s/genelist_%s_co%s.txt" % (
                (fa.mr_folder, fa.gfolder) + file_key
            )

            # True TG-TF relations and total possible relations, taken
            # from the stored genelist file
            true_rel, total_rel = get_TGTF_from_genelist(
                filelocation, TG_TF_ref, TG_list_ref, TF_list_ref
            )

            # The TGs of the true relations are the true traits (tt)
            tt_genes = list(set([rel[0] for rel in true_rel]))

            # Number of eQTLs per trait, taken from the enriched file
            enriched_fn = "%s/%s/enriched_%s/enriched_%s_co%s.txt" % (
                (fa.mr_folder, fa.enriched_folder) + file_key
            )
            trait_eqtl_genelist = get_info(enriched_fn)

            # Keep the true traits that have at least one eQTL
            tt_trait_eqtl_genelist = []
            trait_with_eqtl = []
            for rec in trait_eqtl_genelist:
                if rec[0] in tt_genes and rec[1] > 0:
                    tt_trait_eqtl_genelist.append([rec[0], rec[1], rec[2]])
                    trait_with_eqtl.append(rec[0])

            # Predictions and their confusion matrix
            TG_TF_pred = process_enrichment(tt_trait_eqtl_genelist, TF_set_ref)
            true_pred_rel, false_pred_rel = identify_true_false_positives(TG_TF_pred, TG_TF_ref)
            unpredicted_rel = count_false_negatives(TG_TF_ref, true_pred_rel, trait_with_eqtl)
            TP, FP, FN, TN, recall, specif, precis = calculate_confusion(
                total_rel, true_pred_rel, false_pred_rel, unpredicted_rel
            )

            print("true_traits: %s" % len(set(tt_genes)))
            print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)
# Example #14
# 0
#Module level setup: select one dataset / cutoff combination and load the
#TF family data and the AtRegNet reference data.
#NOTE(review): exp_list is used below but is not defined in the visible
#part of this script - it has to be defined before this point.
chromosome = [1,2,3,4,5]
#Exactly one cutoff is active; the alternatives are kept as comments
#cutoff_list = [3]
#cutoff_list = [4.3]
cutoff_list = [6.7]

#Only the first dataset and the first cutoff are analysed
dataset = exp_list[0]
cutoff = cutoff_list[0]		

#----------------------------------------------------------------------
#Set all data files and lists
#----------------------------------------------------------------------

#read data from files, parse and process

#TF families: sorted list and set of the second field of every parsed entry
TF_family_data = read_data(fa.filename_fam)
TF_fam_data = parse_family_data(TF_family_data)
TF_fam_list = sorted([info[1] for info in TF_fam_data])
TF_fam_set = set(TF_fam_list)

#AtRegNet: list and set of the third field of every parsed entry, plus
#the dict made by make_AtReg_dict
AtReg_data = read_data(fa.filename_atreg)
AtRegNet_parse = parse_AtReg_data(AtReg_data)
AtRegNet_list = [info[2] for info in AtRegNet_parse]
AtRegNet_set = set(AtRegNet_list)
AR_dict = make_AtReg_dict(AtRegNet_parse)

fam_selection = ["all"]#use "all" for all regulators
AtRegNet_pairs = TFloc_pairs_AtRegNet(AtRegNet_parse, fam_selection)

#Same call as for AtRegNet_pairs (the selection is "all" in both cases);
#TG_TF_ref holds the first two fields of every pair in swapped order
TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
# Example #15
# 0
"""
Read traits with a LOD score > 20
dataset is Ligterink_2014
"""

import folder_assignments as fa
from data_handlers import read_data


# File with the traits that have a LOD score > 20
filename = "%s/%s/lod_of_20.txt" % (fa.mr_folder, fa.raw_folder)
data = read_data(filename)
genelist = []

for line in data:
    # Only lines of the form "| AT..." hold a gene
    if not line.startswith("| AT"):
        continue
    # The gene is taken from characters 2 up to 12 of the line
    gene = line[2:12]
    genelist.append(gene)

# Remove duplicates and print the genes in sorted order, one per line
genelist = sorted(set(genelist))
for g in genelist:
    print(g)
def main():
	"""
	Draw, for the true traits of every dataset / cutoff combination, the
	eQTL size per trait and/or the number of eQTLs per trait.

	The true traits of a combination are the first items of the entries
	stored under (dataset, cutoff) in the dict returned by
	get_trait_samplesize_data for tt_te_combi.txt. A combination without
	such an entry is skipped with a message, instead of failing on (or
	silently reusing) the true traits of an earlier combination.

	The local switches draw_trait_vs_eqtlsize and draw_trait_vs_nreqtls
	select the plots. Every plot gets its own x and y values, so the two
	plots do not influence each other when both switches are on. The
	traits are ordered from the highest to the lowest value; x is the
	rank of the trait and y the plotted value.
	"""
	#-----------------------------------------------------------
	#available datasets: 'Ligterink_2014', 'Ligterink_2014_gxe', 'Keurentjes_2007', 'Snoek_2012'
	#available cutoffs: 3, 4.3, 6.7
	exp_list = ['Snoek_2012']
	cutoff_list = [3]
	chromo = [1,2,3,4,5]
	#-----------------------------------------------------------

	fileloc = "%s/%s/tt_te_combi.txt"%(fa.mr_folder, fa.numfolder)
	szdata = read_data(fileloc)
	sample_size_dict = get_trait_samplesize_data(szdata)

	draw_trait_vs_eqtlsize = False
	draw_trait_vs_nreqtls = True

	for dataset in exp_list:

		for cutoff in cutoff_list:

			key = (dataset, cutoff)
			if key not in sample_size_dict:
				print("No sample sizes found for %s %s, skipping"%(dataset, cutoff))
				continue

			#sample_size_list = [trait, sample_size]
			sample_size_list = sample_size_dict[key]
			#list for get_info, set for membership tests
			tt_genes = [item[0] for item in sample_size_list]
			tt_gene_set = set(tt_genes)

			if draw_trait_vs_eqtlsize:

				#trait_genelist_list = [trait, genelist]
				trait_genelist_list = get_genelist(dataset, cutoff, chromo)

				#[eQTL size, trait] of every true trait, largest first
				t_gls_list = [
					[len(t[1]), t[0]]
					for t in trait_genelist_list if t[0] in tt_gene_set
				]
				sort_t_gls_list = sorted(t_gls_list, reverse=True)
				gls = [info[0] for info in sort_t_gls_list]

				x = list(range(len(gls)))
				y = list(gls)

				filename_plot = "%s/plots/tt_eQTLvsTrait_%s_co_%s.png"%(fa.mr_folder, dataset, cutoff)
				filename_text = "%s/plots/tt_eQTLvsTrait_%s_co_%s.txt"%(fa.mr_folder, dataset, cutoff)
				title = "eQTL size vs Traits for %s with cutoff %s"%(dataset, cutoff)

				write_plot(sort_t_gls_list, filename_text, title)
				draw_plot(x, y, filename_plot, title)

				if gls:
					avg = float(sum(gls))/float(len(gls))

					print(key)
					print("Average eQTL size expressed in nr of genes: %s"%avg)

			if draw_trait_vs_nreqtls:
				tr_eqtl_list = get_info(dataset, cutoff, chromo, tt_genes)

				#[nr of eQTLs, trait] of every true trait with more than
				#2 eQTLs, highest number first
				t_eqtls_list = [
					[t[1], t[0]]
					for t in tr_eqtl_list if t[0] in tt_gene_set and t[1]>2
				]
				sort_t_eqtls_list = sorted(t_eqtls_list, reverse=True)
				eqtls = [info[0] for info in sort_t_eqtls_list]

				x = list(range(len(eqtls)))
				y = list(eqtls)

				filename_plot = "%s/plots/tt_nrofeQTLsvsTrait_%s_co_%s.png"%(fa.mr_folder, dataset, cutoff)
				title = "nr of eQTLs vs Traits for %s with cutoff %s"%(dataset, cutoff)

				#No text file is written for this plot
				draw_plot(x, y, filename_plot, title)