def evaluate_inference(result, distance_name=None,
                       hitlist_file='/media/lalil0u/New/workspace2/Xb_screen/data/mitocheck_exp_hitlist_perPheno.pkl',
                       print_=False,
                       folder='/media/lalil0u/New/projects/drug_screen/results/',
                       threshold=0.0001):
    '''
    Summarise, for every condition in `result`, where the known targets show up
    and which hit-list entries the selected genes carry; also write the selected
    gene lists to a file.

    - result: dict {condition: sequence of entries}. For each entry, element [0] is
      looked up in the siRNA->gene mapping and element [-1] is the value compared
      to `threshold` (presumably a p-value or rank-product score - confirm with caller).
    - distance_name: inserted in the output file name 'GO_{}.txt'
    - hitlist_file: pickle file holding a dict {gene: iterable of labels}
    - print_: if True, print the condition name and the rows matching known targets
    - folder: directory in which the 'GO_{}.txt' file is written
    - threshold: if an integer, the first `threshold` rows of each condition are
      selected; otherwise every row whose value is <= threshold is selected

    Returns a dict {condition: defaultdict(list)} with keys
    - 'genes': index of the first row matching each known target that was found
      (a rank only if `result[condition]` is already sorted - TODO confirm)
    - 'gene_list': the known targets that were found, in the same order
    - 'type': Counter of the hit-list labels of the selected genes
    'genes' and 'gene_list' are absent when no known target was found.
    '''
    # NOTE: pickle.load can execute arbitrary code - only use trusted hit-list files.
    # Binary mode + context manager: the handle is closed even if loading fails.
    with open(hitlist_file, 'rb') as f:
        hitlist = pickle.load(f)

    mapping_file = '../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt'
    dictSiEntrez = siEntrez(mapping_file)
    trad = EnsemblEntrezTrad(mapping_file)

    # An integer threshold means "take the top N rows"; anything else is a cut-off value.
    use_top_n = isinstance(threshold, (int, np.integer))

    to_plot_info = defaultdict(dict)
    to_GO_info = defaultdict(list)

    for cond in sorted(result):
        if print_:
            # Single-argument print: same output with the Python 2 statement
            # and the Python 3 function.
            print('----------- %s' % (cond,))

        # Two columns: gene of each entry, value of each entry (order of `result[cond]`).
        l = np.array([(dictSiEntrez[el[0]], el[-1]) for el in result[cond]])
        gene_col = l[:, 0]

        cond_info = defaultdict(list)
        to_plot_info[cond] = cond_info

        # Known targets are keyed by the part of the condition name before '--'.
        for el in sorted(KNOWN_TARGETS[cond.split('--')[0]]):
            hits = np.where(gene_col == el)[0]
            if hits.size:
                rank_ = hits[0]
                cond_info['genes'].append(rank_)
                cond_info['gene_list'].append(el)
                if print_:
                    print('%s %s %s' % (el, l[hits], rank_))

        if use_top_n:
            gene_lim = l[:threshold][:, 0]
        else:
            values = np.array(l[:, 1], dtype=float)
            # <= rather than ==: exact float equality would only keep rows whose
            # value sits precisely on the threshold.
            gene_lim = l[np.where(values <= threshold)[0]][:, 0]

        to_GO_info[cond] = [trad[el] for el in gene_lim]

        label_counts = Counter()
        for gene in gene_lim:
            label_counts.update(hitlist[gene])
        cond_info['type'] = label_counts

    # One gene list per condition, all written to a single file.
    conditions = list(to_GO_info)
    multipleGeneListsToFile([to_GO_info[c] for c in conditions], conditions,
                            name=os.path.join(folder, 'GO_{}.txt'.format(distance_name)))
    return to_plot_info
def condition_cluster_inference(M, clusters, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, filename, taking_siRNAs=True, gsea=False):
    '''
    Run the rank-product inference once per cluster of conditions and collect,
    for each cluster, the selected genes and the sorted result.

    - M: distance matrix of size (hits, mitocheck)
    - clusters: dict {cluster_num: iterable of conditions}; each condition is
      matched against `exposure_hits` to find the rows of M to use
    - who_hits: array of batch names, indexed like the rows of M
    - exposure_hits: array of conditions, indexed like the rows of M
    - who_Mitocheck: Mitocheck experiment identifiers, one per column of M
    - num_permutations: no calculation of p-values if None, else number of permutations
    - threshold: p-value cut-off (value <= threshold) when p-values are returned,
      otherwise the number of top-ranked entries to keep (must then be an integer)
    - random_result: previously computed random rank products, handed to
      computeRPpvalues for reuse
    - filename: gene-list output file; when GSEA ranking files are written it is
      used as a template, formatted with the cluster number
    - taking_siRNAs: if True each Mitocheck column is labelled by its gene, if False
      by its experiment identifier (original note: consider different values of the
      same siRNAs independently (False) or as replicates of the same condition)
    - gsea: if you want to write gsea ranking files (only used when
      num_permutations is None)

    Returns a dict {cluster_num: {'conditions': ..., 'genes': ..., 'result': ...}}.
    In GSEA ranking-file mode only 'conditions' is set and no gene-list file is written.
    '''
    r = {}
    yqualdict = expSi('../data/mapping_2014/qc_export.txt')
    yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False))
    mapping_file = '../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt'
    dictSiEntrez = siEntrez(mapping_file)
    trad = EnsemblEntrezTrad(mapping_file)

    siRNAs = [yqualdict[e] for e in who_Mitocheck]
    genes = [dictSiEntrez[e] for e in siRNAs]

    # Label of every Mitocheck column, repeated below once per selected hit row.
    base_conditions = list(genes) if taking_siRNAs else list(who_Mitocheck)
    n_mitocheck = M.shape[1]

    # In this mode only ranking files are produced: there is no gene selection.
    ranking_only = num_permutations is None and gsea

    past_cluster_num = 0
    for cluster_num in clusters:
        print(cluster_num)
        r[cluster_num] = {'conditions': clusters[cluster_num]}

        # Lists, not generators: np.hstack requires a real sequence.
        where_ = np.hstack([np.where(exposure_hits == el)[0] for el in clusters[cluster_num]])
        batch_names = who_hits[where_]
        curr_dist = np.hstack([M[j] for j in where_])

        # One (batch, '') pair and one condition label per concatenated distance value.
        curr_who = [(name, '') for name in batch_names for _ in range(n_mitocheck)]
        curr_conditions = base_conditions * len(batch_names)
        print(curr_dist.shape)

        if num_permutations is not None and past_cluster_num != len(batch_names):
            # The random rank products are only reused when this cluster has as many
            # experiments as the previous one; otherwise they are recomputed.
            # NOTE(review): past_cluster_num starts at 0, so a random_result passed
            # by the caller is discarded for the first cluster - confirm intended.
            random_result = None

        curr_res = computeRPpvalues(curr_dist, np.array(curr_who), conditions=np.array(curr_conditions),
                                    technical_replicates_key=np.median, xb_screen=False,
                                    num_permutations=num_permutations, reverse=False,
                                    batch_names=batch_names, random_result=random_result, signed=False)
        # so curr_res is [(gene, rank value)]

        if ranking_only:
            writeGSEARankingFile(curr_res, filename.format(cluster_num))
            # No gene list exists for this cluster; skip the selection below.
            continue

        if len(curr_res) == 2:
            # this means that we have the p-values
            # NOTE(review): a plain result list of exactly two entries would also
            # land here - confirm against computeRPpvalues.
            curr_res, random_result = curr_res
            curr_res = sorted(curr_res, key=itemgetter(-1))
            res_array = np.array(curr_res)
            pval = np.array(res_array[:, -1], dtype=float)
            selected = res_array[np.where(pval <= threshold)][:, 0]
        else:
            # this means that we're working with the rank product values
            curr_res = sorted(curr_res, key=itemgetter(-1))
            selected = np.array(curr_res)[:threshold][:, 0]

        if taking_siRNAs:
            currG = list(selected)
        else:
            # Entries are experiment identifiers: map experiment -> siRNA -> gene.
            currG = [dictSiEntrez[yqualdict[e]] for e in selected]
        print(sorted(currG))

        past_cluster_num = len(batch_names)
        r[cluster_num]['genes'] = [trad[el] for el in currG]
        r[cluster_num]['result'] = [(el[0], el[-1]) for el in curr_res]

    if not ranking_only:
        cluster_ids = list(r)
        multipleGeneListsToFile([r[k]['genes'] for k in cluster_ids], cluster_ids, name=filename)
    return r