def main():
    """Score eQTL-based TG-TF predictions against the AtRegNet reference.

    For every (dataset, cutoff) combination this:

    1. loads the stored gene list and derives the true TG-TF relations and
       the total set of possible relations,
    2. loads the enrichment results and keeps the traits that are true
       target genes and whose second field (the eQTL count, going by the
       original comments) is greater than zero,
    3. turns those traits into predicted TG-TF relations, and
    4. computes and prints the confusion matrix with recall, specificity
       and precision.

    Takes no arguments and returns nothing; all results are printed.
    """
    # Reference data from AtRegNet.txt: TF-TG pairs, flipped to [TG, TF].
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ["all"])
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
    TG_list_ref = [info[0] for info in TG_TF_ref]
    TF_list_ref = [info[1] for info in TG_TF_ref]
    TF_set_ref = set(TF_list_ref)

    # Run configuration. Datasets previously listed here:
    # 'Ligterink_2014', 'Ligterink_2014_gxe', 'Keurentjes_2007', 'Snoek_2012'
    exp_list = ["Ligterink_2014"]
    cutoff_list = [3]  # alternatives previously listed: 4.3, 6.7

    for dataset in exp_list:
        for cutoff in cutoff_list:
            print("Analysing %s %s" % (dataset, cutoff))

            # True TG-TF relations and the total possible relations, taken
            # from the stored gene list of this dataset/cutoff.
            filelocation = "%s/%s/genelist_%s/genelist_%s_co%s.txt" % (
                fa.mr_folder,
                fa.gfolder,
                dataset,
                dataset,
                cutoff,
            )
            true_rel, total_rel = get_TGTF_from_genelist(
                filelocation, TG_TF_ref, TG_list_ref, TF_list_ref
            )

            # The target genes of the true relations are the "true traits".
            # Kept as a set: it is only used for membership tests and for
            # counting distinct genes.
            tt_genes = set(info[0] for info in true_rel)

            # Enrichment results per trait.
            enriched_fn = "%s/%s/enriched_%s/enriched_%s_co%s.txt" % (
                fa.mr_folder,
                fa.enriched_folder,
                dataset,
                dataset,
                cutoff,
            )
            trait_eqtl_genelist = get_info(enriched_fn)

            # Keep only the true traits that have at least one eQTL.
            tt_trait_eqtl_genelist = [
                [t[0], t[1], t[2]]
                for t in trait_eqtl_genelist
                if t[0] in tt_genes and t[1] > 0
            ]
            trait_with_eqtl = [info[0] for info in tt_trait_eqtl_genelist]

            # Predictions, then classification against the reference.
            TG_TF_pred = process_enrichment(tt_trait_eqtl_genelist, TF_set_ref)
            true_pred_rel, false_pred_rel = identify_true_false_positives(
                TG_TF_pred, TG_TF_ref
            )
            unpredicted_rel = count_false_negatives(
                TG_TF_ref, true_pred_rel, trait_with_eqtl
            )
            TP, FP, FN, TN, recall, specif, precis = calculate_confusion(
                total_rel, true_pred_rel, false_pred_rel, unpredicted_rel
            )

            print("true_traits: %s" % len(tt_genes))
            print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)
_SEPARATOR = "-------------------------"


def _write_report_lines(path, lines):
    """Write every entry of *lines* to *path*, each followed by a newline."""
    with open(path, 'w') as handle:
        handle.write("".join("%s\n" % line for line in lines))


def _tally_against_reference(value, reference, tally):
    """Count *value* as lower or higher-or-equal than *reference*.

    *tally* is a two-item list ``[lower, higher_or_equal]`` that is updated
    in place. Nothing is counted when either number is ``None``.
    """
    if value is None or reference is None:
        return
    # Two independent tests (not if/else) so that a NaN is counted in
    # neither bucket, as in the original code.
    if value < reference:
        tally[0] += 1
    if value >= reference:
        tally[1] += 1


def _permutation_report_lines(dataset, cutoff, counts, permutated_confusion):
    """Return the text lines of the detailed report of one permutation run.

    *counts* is ``(lower_recall, higher_recall, lower_precision,
    higher_precision, lower_F1, higher_F1)``; *permutated_confusion* holds
    one ``[TP, FP, FN, TN, recall, specif, precision, F1]`` row per seed.
    """
    (lower_recall, higher_recall, lower_precision, higher_precision,
     lower_F1, higher_F1) = counts
    lines = [
        _SEPARATOR,
        "dataset: %s" % dataset,
        "cutoff: %s" % cutoff,
        _SEPARATOR,
        "lower_F1: \t%s" % lower_F1,
        "higher_F1: \t%s" % higher_F1,
        _SEPARATOR,
        "lower_recall: %s" % lower_recall,
        "higher_recall: %s" % higher_recall,
        "lower_precision: %s" % lower_precision,
        "higher_precision: %s" % higher_precision,
        _SEPARATOR,
    ]
    for TP, FP, FN, TN, recall, specif, precision, F1 in permutated_confusion:
        lines.extend([
            _SEPARATOR,
            "TP\t%s\tFN\t%s" % (TP, FN),
            "FP\t%s\tTN\t%s" % (FP, TN),
            _SEPARATOR,
            "recall\t%s" % recall,
            "specificity\t%s" % specif,
            "precision\t%s" % precision,
            "F1\t%s" % F1,
            _SEPARATOR,
        ])
    return lines


def _summary_lines(rows):
    """Return the text lines of a summary file.

    Each row is ``[dataset, cutoff, lower_recall, higher_recall,
    lower_precision, higher_precision, lower_F1, higher_F1]``.
    """
    lines = []
    for (dataset, cutoff, lower_recall, higher_recall, lower_precision,
         higher_precision, lower_F1, higher_F1) in rows:
        lines.extend([
            _SEPARATOR,
            "dataset: %s" % dataset,
            "cutoff: %s" % cutoff,
            _SEPARATOR,
            "lower_F1: \t%s" % lower_F1,
            "higher_F1: \t%s" % higher_F1,
            _SEPARATOR,
            "recall:",
            "lower: %s" % lower_recall,
            "higher: %s" % higher_recall,
            "precision:",
            "lower: %s" % lower_precision,
            "higher: %s" % higher_precision,
            _SEPARATOR,
        ])
    return lines


def main():
    """Permutation test of the eQTL-based TG-TF predictions.

    For every (dataset, cutoff, eQTL threshold) combination the stored
    reference recall, precision and F1 are read. Then, once per seed, the
    enriched gene set of every selected trait is replaced by an equally
    sized random sample drawn from that trait's gene list, and the confusion
    matrix is recomputed. The function counts how often the permuted recall,
    precision and F1 are lower than, or at least as high as, the reference
    values.

    Output is controlled by three local flags: ``print_conf`` prints the
    counts (and the confusion matrix of the last permutation),
    ``write_conf`` writes a detailed report per combination and
    ``write_summary`` writes one summary file per eQTL threshold.

    Takes no arguments and returns nothing.
    """
    # Reference data from AtRegNet.txt: TF-TG pairs, flipped to [TG, TF].
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
    TG_list_ref = [info[0] for info in TG_TF_ref]
    TF_list_ref = [info[1] for info in TG_TF_ref]
    TF_set_ref = set(TF_list_ref)

    # Run configuration. Datasets previously listed here:
    # 'Ligterink_2014', 'Ligterink_2014_gxe', 'Snoek_2012', 'Keurentjes_2007'
    exp_list = ['Snoek_2012']
    cutoff_list = [3]  # alternatives previously listed: 4.3, 6.7
    eQTL_threshold_list = [0, 1, 2, 3]

    # Pre-generated random seeds; one permutation is run per seed.
    seedfile = "%s/%s/random_seeds.txt" % (fa.mr_folder, fa.numfolder)
    seeds = read_seeds(seedfile)

    write_summary = False
    write_conf = False
    print_conf = True

    # Summary rows grouped by eQTL threshold, because the summary file name
    # is threshold-specific.
    summary = {}

    for dataset in exp_list:
        for cutoff in cutoff_list:
            for eQTL_threshold in eQTL_threshold_list:
                print("Initializing analysis for dataset %s with cutoff %s"
                      % (dataset, cutoff))

                # Reference (non-permuted) results for this combination.
                subfolder_F1 = "/eqtl_%s/valnum_%s" % (eQTL_threshold, dataset)
                F1_fn = "%s/%s/%s/valnum_results_%s_co%s" % (
                    fa.mr_folder, fa.numfolder, subfolder_F1, dataset, cutoff
                )
                try:
                    ref_recall, ref_precision, ref_F1 = \
                        read_predicted_confusion_data(F1_fn)
                except Exception as err:
                    # The reader is defined elsewhere, so the exact failure
                    # modes are unknown here; report and skip instead of
                    # silently ignoring the problem.
                    print("Skipping eQTL threshold %s: cannot read %s (%s)"
                          % (eQTL_threshold, F1_fn, err))
                    continue
                if ref_F1 is None:
                    continue

                # Gene list per trait plus the true/total relations.
                genelist_fn = "%s/%s/%s/genelist_%s_co%s.txt" % (
                    fa.mr_folder, fa.gfolder, "genelist_%s" % dataset,
                    dataset, cutoff
                )
                trait_genelist_list = get_genelist(genelist_fn)
                true_rel, total_rel = get_TGTF_from_genelist(
                    genelist_fn, TG_TF_ref, TG_list_ref, TF_list_ref
                )
                tt_genes = list(set(info[0] for info in true_rel))
                tt_gene_set = set(tt_genes)

                # Enrichment results per trait.
                enriched_fn = "%s/%s/%s/enriched_%s_co%s.txt" % (
                    fa.mr_folder, fa.enriched_folder, "enriched_%s" % dataset,
                    dataset, cutoff
                )
                trait_eqtl_genelist, dict_trait_enriched = \
                    get_enriched(enriched_fn)

                # True traits that have more than eQTL_threshold eQTLs.
                trait_with_eqtl = [
                    t[0] for t in trait_eqtl_genelist
                    if t[0] in tt_gene_set and t[1] > eQTL_threshold
                ]
                print("traits with eQTL %s" % len(trait_with_eqtl))

                # Which traits are sampled, and how many genes each sample
                # holds, does not depend on the seed: work it out once.
                eqtl_traits = set(trait_with_eqtl)
                sampling_plan = [
                    (g_trait, g_genelist, len(dict_trait_enriched[g_trait]))
                    for g_trait, g_genelist in trait_genelist_list
                    if g_trait in eqtl_traits
                    and g_trait in dict_trait_enriched
                ]

                recall_tally = [0, 0]     # [lower, higher or equal]
                precision_tally = [0, 0]
                F1_tally = [0, 0]
                permutated_confusion = []

                for seedling in seeds:
                    # [trait, 0, random gene sample]; the 0 fills the slot
                    # between trait and gene list that the enrichment rows
                    # use (presumably the eQTL count -- TODO confirm).
                    trait_randomsample = [
                        [g_trait, 0, select_random_sample(
                            g_genelist, sample_size, seedling)]
                        for g_trait, g_genelist, sample_size in sampling_plan
                    ]
                    TG_TF_pred = process_enrichment(
                        trait_randomsample, TF_set_ref
                    )
                    true_pred_rel, false_pred_rel = \
                        identify_true_false_positives(TG_TF_pred, TG_TF_ref)
                    # NOTE(review): false negatives are counted over all true
                    # traits (tt_genes), not only those passing the eQTL
                    # threshold -- confirm this matches how the reference
                    # metrics were computed.
                    unpredicted_rel = count_false_negatives(
                        TG_TF_ref, true_pred_rel, tt_genes
                    )
                    (TP, FP, FN, TN, recall, specif, precision,
                     F1) = calculate_confusion(
                        total_rel, true_pred_rel, false_pred_rel,
                        unpredicted_rel
                    )
                    permutated_confusion.append(
                        [TP, FP, FN, TN, recall, specif, precision, F1]
                    )

                    _tally_against_reference(recall, ref_recall, recall_tally)
                    _tally_against_reference(
                        precision, ref_precision, precision_tally
                    )
                    _tally_against_reference(F1, ref_F1, F1_tally)

                lower_recall, higher_recall = recall_tally
                lower_precision, higher_precision = precision_tally
                lower_F1, higher_F1 = F1_tally
                counts = (lower_recall, higher_recall, lower_precision,
                          higher_precision, lower_F1, higher_F1)
                summary.setdefault(eQTL_threshold, []).append(
                    [dataset, cutoff] + list(counts)
                )

                if write_conf:
                    substorage = "%s/%s/%s" % (
                        fa.mr_folder, fa.numfolder, dataset
                    )
                    if not os.path.exists(substorage):
                        os.mkdir(substorage)
                    resultsfolder_conf = "%s/permutate_eqtl_%s_%s_co%s.txt" % (
                        substorage, eQTL_threshold, dataset, cutoff
                    )
                    print("Writing to file %s" % resultsfolder_conf)
                    try:
                        _write_report_lines(
                            resultsfolder_conf,
                            _permutation_report_lines(
                                dataset, cutoff, counts, permutated_confusion
                            ),
                        )
                    except (IOError, OSError) as err:
                        # Writing stays best-effort, but no longer silent.
                        print("Could not write %s: %s"
                              % (resultsfolder_conf, err))

                if print_conf:
                    if permutated_confusion:
                        # Confusion matrix of the last permutation.
                        TP, FP, FN, TN = permutated_confusion[-1][:4]
                        print("-------------------------")
                        print("TP\t%s\tFN\t%s" % (TP, FN))
                        print("FP\t%s\tTN\t%s" % (FP, TN))
                    print("-------------------------")
                    print("dataset: %s" % dataset)
                    print("cutoff: %s" % cutoff)
                    print("eQTL: %s" % eQTL_threshold)
                    print("-------------------------")
                    print("lower_F1:\t%s" % lower_F1)
                    print("higher_F1:\t%s" % higher_F1)
                    print("-------------------------")
                    print("lower_recall: %s" % lower_recall)
                    print("higher_recall: %s" % higher_recall)
                    print("lower_precision: %s" % lower_precision)
                    print("higher_precision: %s" % higher_precision)
                    print("-------------------------")

    if write_summary:
        # One summary file per eQTL threshold, holding only the rows that
        # were computed with that threshold.
        for eQTL_threshold in eQTL_threshold_list:
            rows = summary.get(eQTL_threshold)
            if not rows:
                continue
            summfolder_conf = "%s/%s/permutate_summary_eqtl_%s.txt" % (
                fa.mr_folder, fa.numfolder, eQTL_threshold
            )
            try:
                _write_report_lines(summfolder_conf, _summary_lines(rows))
            except (IOError, OSError) as err:
                print("Could not write %s: %s" % (summfolder_conf, err))
# Validation steps for one (dataset, cutoff, chromosome) combination. The
# names dataset, cutoff, chromosome, TG_TF_ref, TF_set_ref, sh_TG_list_ref
# and sh_TF_list_ref must already be defined when this code runs.
print("Retrieving data from AtRegNet.txt...")
print("------------------------------------")

# Settings that are not read within this fragment.
label_lever = False
color = 'blue'

# Relations found for this combination, checked against the reference
# TG-TF pairs and the distinct reference TG and TF lists.
true_relations, total_rel_major = get_TGTF_from_genelist(
    dataset,
    cutoff,
    chromosome,
    TG_TF_ref,
    sh_TG_list_ref,
    sh_TF_list_ref,
)

# Distinct first elements of the true relations.
tt_genes = list(
    set([info[0] for info in true_relations])
)

# Predicted relations for those genes.
TG_TF_pred = process_enrichment(
    dataset,
    cutoff,
    chromosome,
    TF_set_ref,
    tt_genes,
)

# Split the predictions into true and false positives, then collect the
# reference relations that were not predicted.
true_pred_rel, false_pred_rel = identify_true_false_positives(
    TG_TF_pred,
    TG_TF_ref,
)
unpredicted_rel = count_false_negatives(
    TG_TF_ref,
    true_pred_rel,
    tt_genes,
)

# TFloc_list contains predicted datapoints (disabled):
# TFloc_list = [[info[1], info[0]] for info in total_rel_major]