def mito_PHENOSCORE(file_='MITO_pheno_scores.pkl', folder='/media/lalil0u/New/projects/drug_screen/results/'): f=open('../data/mitocheck_exp_hitlist_perPheno.pkl') hitlistperpheno=pickle.load(f) f.close() f=open(os.path.join(folder, file_), 'r') scores, who=pickle.load(f); f.close() norm = mpl.colors.Normalize(-0.2,0.6) yqualdict=expSi('../data/mapping_2014/qc_export.txt') dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') bigGenesL=np.array([dictSiEntrez[yqualdict[e]] for e in who]) for pheno in hitlistperpheno: expL=hitlistperpheno[pheno] genesL=np.array([dictSiEntrez[yqualdict[e]] for e in filter(lambda x: yqualdict[x] in dictSiEntrez, expL)]) distinct_genes = sorted(list(set(genesL))) ind=np.hstack((np.where(bigGenesL==gene)[0] for gene in distinct_genes)) print pheno, currscores=scores[ind] print currscores.shape f=p.figure() ax=f.add_subplot(111) ax.matshow(currscores.T, cmap=mpl.cm.YlOrRd, norm=norm, aspect='auto') ax.set_title(pheno) ax.set_yticks(range(15)) ax.set_yticklabels(CLASSES) # p.show() p.savefig(os.path.join(folder, 'pheno_score_MITO_{}.png'.format(pheno[:10]))) return
def load_pheno_seq_results_MITO(self,exp_list, time_aggregated=False): ''' Here we're loading results from per frame files (pheno_count) on a per experiment basis. This will be interesting to look at distances between experiments based on phenotypes, aggregated on time ''' missed=[] yqualDict=expSi(self.settings.mitocheck_qc_file) dictSiEntrez=siEntrez(self.settings.mitocheck_mapping_file) result = None; i=0; who=[] for pl,w in exp_list: print i, try: f=open(os.path.join(self.settings.outputFolder,pl, self.settings.outputFile.format(pl[:9], w)), 'r') pheno_seq_per_frame= pickle.load(f) f.close() except: print "Loading error for ", pl, w self.plate=pl; self.well=w if self.MITO_usable(): missed.append((pl,w)) continue else: #15 and 16 are respectively out of focus and artefact objects. We don't want them if time_aggregated: pheno_seq_per_frame=np.sum(pheno_seq_per_frame, 0) pheno_seq_list=pheno_seq_per_frame/float(np.sum(pheno_seq_per_frame)) result = np.vstack((result, pheno_seq_list)) if result is not None else pheno_seq_list else: pheno_seq_per_frame=np.vstack((np.sum(pheno_seq_per_frame[self.settings.time_agg*k:self.settings.time_agg*(k+1)],0) for k in range(pheno_seq_per_frame.shape[0]/self.settings.time_agg))) if result is not None: shape_=min(result.shape[1], pheno_seq_per_frame.shape[0]) result = np.vstack((result[:,:shape_], pheno_seq_per_frame[np.newaxis][:,:shape_])) else: result= pheno_seq_per_frame[np.newaxis] #ATTENTION gros bug avec LTValidMItosis si on limite a la taille et non pas au split de -- le nom de la plaque... who.append('{}--{:>03}'.format(pl.split('--')[0], w)) finally: i+=1 _name='time' if not time_aggregated else '' print "Saving to ",self.settings.outputFile.format("ALL", "MITO_{}".format(_name)) f=open(os.path.join(self.settings.outputFolder,self.settings.outputFile.format("ALL", "MITO_{}".format(_name))), 'w') pickle.dump((result, who),f); f.close() return missed
def condition_inference(M, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, taking_siRNAs=False): ''' - M: distance matrix of size (hits, mitocheck) - taking_siRNAs: indicates if you want to consider different values of the same siRNAs independently (False) or as replicates of the same condition ''' r={} yqualdict=expSi('../data/mapping_2014/qc_export.txt') yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False)) dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') siRNAs=[yqualdict[e] for e in who_Mitocheck] genes=[dictSiEntrez[e] for e in siRNAs] count=Counter(exposure_hits) #This way I look at exposures that are hits at least 50% of the times/plates for el in filter(lambda x: count[x]/float(PASSED_QC_COND[x])>0.5, count): print el where_=np.where(exposure_hits==el)[0] batch_names = who_hits[where_] curr_dist=np.hstack((M[j] for j in where_)) curr_who=[(batch_names[0], '') for k in range(M.shape[1])] curr_conditions=list(who_Mitocheck) if not taking_siRNAs else list(siRNAs) for name in batch_names[1:]: curr_who.extend([(name, '') for k in range(M.shape[1])]) if not taking_siRNAs: curr_conditions.extend(who_Mitocheck) else: curr_conditions.extend(siRNAs) print curr_dist.shape r[el], random_result=computeRPpvalues(curr_dist, np.array(curr_who), conditions=np.array(curr_conditions), technical_replicates_key=np.median, num_permutations=num_permutations, reverse=False, batch_names=batch_names, random_result=random_result, signed=False) r[el]=sorted(r[el], key=itemgetter(2)) pval=np.array(np.array(r[el])[:,-1], dtype=float) if not taking_siRNAs: currG=[dictSiEntrez[yqualdict[e]] for e in np.array(r[el])[np.where(pval<=threshold)][:,0]] else: currG=[dictSiEntrez[e] for e in np.array(r[el])[np.where(pval<=threshold)][:,0]] print sorted(Counter(currG).keys()) return r
def from_geneL_to_phenoHit(geneL,hitFile='../data/mitocheck_exp_hitlist_perPheno.pkl'):
    '''
    Build the mapping gene -> sorted list of distinct phenotypes for which at least one
    experiment targeting that gene is in the per-phenotype hit list.

    - geneL: not used by the function
    - hitFile: pickle containing a mapping phenotype -> list of hit experiments

    Experiments whose siRNA has no gene in the siRNA/gene mapping are ignored.
    Returns a defaultdict(list).
    '''
    handle=open(hitFile)
    hits_per_pheno=pickle.load(handle)
    handle.close()

    exp_to_si=expSi('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/qc_export.txt')
    si_to_gene=siEntrez('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')

    phenos_of_gene=defaultdict(list)
    for pheno in hits_per_pheno:
        for experiment in hits_per_pheno[pheno]:
            si=exp_to_si[experiment]
            if si not in si_to_gene:
                continue
            phenos_of_gene[si_to_gene[si]].append(pheno)

    # Remove duplicates and order the phenotypes of each gene
    for gene in phenos_of_gene:
        phenos_of_gene[gene]=sorted(set(phenos_of_gene[gene]))
    return phenos_of_gene
def evaluate_inference(result, distance_name=None, hitlist_file='/media/lalil0u/New/workspace2/Xb_screen/data/mitocheck_exp_hitlist_perPheno.pkl', print_=False,folder='/media/lalil0u/New/projects/drug_screen/results/', threshold=0.0001): f=open(hitlist_file, 'r') hitlist=pickle.load(f); f.close() yqualdict=expSi('../data/mapping_2014/qc_export.txt') dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') to_plot_info=defaultdict(dict) to_GO_info=defaultdict(list) for cond in sorted(result): if print_: print '-----------', cond l=np.array([(dictSiEntrez[el[0]], el[-1]) for el in result[cond]]) to_plot_info[cond]=defaultdict(list) for el in sorted(KNOWN_TARGETS[cond.split('--')[0]]): if np.where(l[:,0]==el)[0].shape[0]>0: rank_=np.where(l[:,0]==el)[0][0] to_plot_info[cond]['genes'].append(rank_) to_plot_info[cond]['gene_list'].append(el) if print_: print el, l[np.where(l[:,0]==el)], rank_ if type(threshold)==int: gene_lim=l[:threshold][:,0] else: lim=np.where(np.array(l[:,1], dtype=float)==threshold)[0] gene_lim=l[lim][:,0] to_GO_info[cond]=[trad[el] for el in gene_lim] res=[] for gene in gene_lim: res.extend(hitlist[gene]) #STEP2 here : we add writing gene list files to_plot_info[cond]['type']= Counter(res) multipleGeneListsToFile([to_GO_info[el] for el in to_GO_info], [el for el in to_GO_info], name=os.path.join(folder, 'GO_{}.txt'.format(distance_name))) return to_plot_info
def loadPredictions(loadingFolder = '../resultData/thrivisions/predictions', outputFilename = "thripred_{}_{}.pkl", sh=False, load=False,write=False,
                    mitocheck = '/cbio/donnees/aschoenauer/workspace2/Xb_screen/data/mitocheck_siRNAs_target_genes_Ens75.txt',
                    qc = '/cbio/donnees/aschoenauer/workspace2/Xb_screen/data/qc_export.txt',
                    ensembl="../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt",
                    threshold=0.05, qval=None
                    ):
    '''
    Gather all the 'thripred' prediction files of loadingFolder in a single pickle.

    Nothing is done unless load is True. In that case each file of loadingFolder whose name
    contains 'thripred' is unpickled, its first three items (percent, initial and final number
    of objects) are collected together with the plate/well, siRNA and gene deduced from the file
    name, and the tuple
    (nb_objects_init, nb_objects_final, percent_thrivision, who, genes, siRNA)
    is pickled to loadingFolder/all_predictions.pkl.

    - loadingFolder: folder that is read and where all_predictions.pkl is written
    - load: switch described above
    - mitocheck, qc: siRNA/gene mapping file and quality control file
    - outputFilename, sh, write, ensembl, threshold, qval: not used by the function

    Returns None.
    '''
    if not load:
        return

    yqualDict=expSi(qc)
    dictSiEntrez=siEntrez(mitocheck, yqualDict.values())

    results = [name for name in os.listdir(loadingFolder) if 'thripred' in name]
    who=[]; siRNA=[]; genes=[]
    nb_objects_init=[];nb_objects_final=[]; percent_thrivision=[]

    for result in results:
        try:
            # NOTE: pickle.load must only be used on trusted project files.
            with open(os.path.join(loadingFolder, result)) as f:
                r = pickle.load(f)
        except (OSError, IOError):
            # Both exceptions are now really caught: "except OSError, IOError" was only catching
            # OSError and binding it to the name IOError. The interactive debugging hook is kept
            # as it was; the file is then skipped.
            pdb.set_trace()
            continue

        percent, nb_ob_init, nb_obj_final=r[:3]
        # Plate and well are read at fixed positions of the file name
        who.append((result[9:18], result[19:27]))
        nb_objects_init.append(nb_ob_init)
        nb_objects_final.append(nb_obj_final)
        percent_thrivision.append(percent*nb_ob_init)

        siCourant = yqualDict[result[9:18]+'--'+result[21:24]]
        siRNA.append(siCourant)
        try:
            genes.append(dictSiEntrez[siCourant])
        except KeyError:
            # siRNAs without gene are recorded as controls; the debugger is opened first
            # when the siRNA is not one of the known control ids
            if siCourant not in ["scramble", '103860', '251283']:
                pdb.set_trace()
            genes.append('ctrl')

    with open(os.path.join(loadingFolder, "all_predictions.pkl"), 'w') as f:
        pickle.dump((nb_objects_init, nb_objects_final, percent_thrivision, who, genes, siRNA),f)
    return
def MITO_usable(self, yqualDict=None, dictSiEntrez=None):
    '''
    Tell if the experiment (self.plate, self.well) can be used.

    Two checks are done, a message being written on stderr when one fails:
    i. the experiment must be in the quality control dictionary;
    ii. its siRNA must be in the siRNA/gene mapping, unless is_ctrl_mitocheck says that a
        primary screen experiment is a control.
    Plates whose name contains 'LTValidMitosis' are validation plates: their id is the part of
    the name before '--' and the validation quality control file is used. For the other plates
    the id is the first nine characters of the name.

    - yqualDict: experiment -> siRNA dictionary; loaded from self.settings if None
    - dictSiEntrez: siRNA -> gene dictionary; loaded from self.settings if None

    Returns True if both checks pass, False otherwise.
    '''
    is_validation='LTValidMitosis' in self.plate
    plate_id=self.plate.split('--')[0] if is_validation else self.plate[:9]
    exp_key='{}--{:>03}'.format(plate_id, self.well)

    # Each dictionary is loaded only when the caller did not provide it
    if dictSiEntrez is None:
        dictSiEntrez=siEntrez(self.settings.mitocheck_mapping_file)
    if yqualDict is None:
        if is_validation:
            yqualDict=expSi(self.settings.valid_qc_file, primary_screen=False)
        else:
            yqualDict=expSi(self.settings.mitocheck_qc_file)

    if exp_key not in yqualDict:
        #i. checking if quality control passed
        # Done before any yqualDict[exp_key] lookup, which would otherwise raise KeyError
        sys.stderr.write("Quality control not passed {} {} \n".format(self.plate, self.well))
        return False

    unmapped=yqualDict[exp_key] not in dictSiEntrez
    if not is_validation:
        # Controls of the primary screen are usable even without gene in the mapping
        unmapped=not is_ctrl_mitocheck((plate_id, '{:>05}'.format(self.well))) and unmapped
    if unmapped:
        #ii.checking if siRNA corresponds to a single target in the current state of knowledge
        sys.stderr.write( "SiRNA having no target or multiple target {} {}\n".format(self.plate, self.well))
        return False
    return True
def selecting_right_Mito_exp(folder='/media/lalil0u/New/projects/drug_screen/results/'): f=open('../data/ANCIENTmitocheck_exp_hitlist.pkl') mito_hitexp=list(set(pickle.load(f))) f.close() f=open(os.path.join(folder, 'MITO_pheno_scores_NOTVAL.pkl')) r=pickle.load(f);f.close() big_phenoscore=dict(zip(r[1], r[0])) f=open(os.path.join(folder, 'MITO_pheno_scores_VAL.pkl')) r=pickle.load(f);f.close() val_phenoscore=dict(zip(r[1], r[0])) yqualdict=expSi('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/qc_export.txt') dictSiEntrez=siEntrez('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') genes_big=[] gene_si=defaultdict(list) res_siRNA=defaultdict(list) si_exp=defaultdict(list) unmapped_si=[] for exp in mito_hitexp: try: currSi=yqualdict[exp]; currGene = dictSiEntrez[currSi] except KeyError: #print "{} siRNA not in mapping file anymore".format(yqualdict[exp]), unmapped_si.append(yqualdict[exp]) else: if exp in big_phenoscore: genes_big.append(currGene) gene_si[currGene].append(currSi) res_siRNA[currSi].append(big_phenoscore[exp][0]) si_exp[currSi].append(exp) else: print "{} no data".format(exp) continue print "------------------------------------------------------Youpi next step looking at validation experiments" genes_big=sorted(list(set(genes_big))) yqualdict2=expSi('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/qc_validation_exp.txt', primary_screen=False) genes_big2=[] for exp in yqualdict2: try: currSi=yqualdict2[exp]; currGene = dictSiEntrez[currSi] except KeyError: if yqualdict2[exp]!='empty': # print "{} siRNA not in mapping file anymore".format(yqualdict2[exp]), unmapped_si.append(yqualdict2[exp]) else: if exp in val_phenoscore: genes_big2.append(currGene) gene_si[currGene].append(currSi) res_siRNA[currSi].append(val_phenoscore[exp][0]) si_exp[currSi].append(exp) else: print "{} no data".format(exp) continue genes_big2=sorted(list(set(genes_big2))) print "Unmapped siRNAs ", len(unmapped_si) 
print "How many genes do we gain by using validation experiments?" print len([el for el in genes_big if el not in genes_big2]) genes_big.extend(genes_big2) genes_big=sorted(list(set(genes_big))) print "How many genes finally ", len(genes_big) final_siRNA_list=[] final_exp_list=[] genesL=[] count_siRNA_total=0 for gene in genes_big: currSiL=gene_si[gene] counts=Counter(currSiL) count_siRNA_total+=len(counts) currRes=[]; siL=[] for siRNA in filter(lambda x: counts[x]>=2, counts): currRes.append(np.median(res_siRNA[siRNA])) siL.append(siRNA) if currRes!=[]: choice=np.array(siL)[np.argmin(np.array(currRes))] final_exp_list.extend(si_exp[choice]) final_siRNA_list.extend([choice for k in range(counts[choice])]) genesL.extend([gene for k in range(counts[choice])]) print "How many siRNAs in total ", count_siRNA_total return genesL, final_exp_list, final_siRNA_list
def condition_cluster_inference(M, clusters, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, filename, taking_siRNAs=True, gsea=False): ''' - M: distance matrix of size (hits, mitocheck) - taking_siRNAs: indicates if you want to consider different values of the same siRNAs independently (False) or as replicates of the same condition - num_permutations: no calculation of p-values if None, else number of permutations - filename if we want to write GSEA ranking files - gsea : if you want to write gsea ranking files ''' r={} yqualdict=expSi('../data/mapping_2014/qc_export.txt') yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False)) dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt') siRNAs=[yqualdict[e] for e in who_Mitocheck] genes=[dictSiEntrez[e] for e in siRNAs] past_cluster_num=0 for cluster_num in clusters: print cluster_num r[cluster_num]={'conditions':clusters[cluster_num]} where_=np.hstack((np.where(exposure_hits==el)[0] for el in clusters[cluster_num])) batch_names = who_hits[where_] curr_dist=np.hstack((M[j] for j in where_)) curr_who=[(batch_names[0], '') for k in range(M.shape[1])] curr_conditions=list(who_Mitocheck) if not taking_siRNAs else list(genes) for name in batch_names[1:]: curr_who.extend([(name, '') for k in range(M.shape[1])]) if not taking_siRNAs: curr_conditions.extend(who_Mitocheck) else: curr_conditions.extend(genes) print curr_dist.shape if num_permutations is not None and past_cluster_num!=len(batch_names): #If there are only five experiments that are different between the two cluster length, then no need to redo the random rank product computation random_result=None curr_res=computeRPpvalues(curr_dist, np.array(curr_who), conditions=np.array(curr_conditions), technical_replicates_key=np.median, xb_screen=False, num_permutations=num_permutations, 
reverse=False, batch_names=batch_names, random_result=random_result, signed=False) #donc curr_res est [(gene, rank value)] if num_permutations is None and gsea: writeGSEARankingFile(curr_res, filename.format(cluster_num)) else: if len(curr_res)==2: #this means that we have the p-values curr_res, random_result=curr_res curr_res=sorted(curr_res, key=itemgetter(-1)) pval=np.array(np.array(curr_res)[:,-1], dtype=float) if not taking_siRNAs: currG=[dictSiEntrez[yqualdict[e]] for e in np.array(curr_res)[np.where(pval<=threshold)][:,0]] else: currG=[e for e in np.array(curr_res)[np.where(pval<=threshold)][:,0]] print sorted(currG) else: #this means that we're working with the rank product values curr_res=sorted(curr_res, key=itemgetter(-1)) if not taking_siRNAs: currG=[dictSiEntrez[yqualdict[e]] for e in np.array(curr_res)[:threshold][:,0]] else: currG=[e for e in np.array(curr_res)[:threshold][:,0]] print sorted(currG) past_cluster_num=len(batch_names) r[cluster_num]['genes']=[trad[el] for el in currG] r[cluster_num]['result']=[(el[0],el[-1]) for el in curr_res] multipleGeneListsToFile([r[k]['genes'] for k in r], [k for k in r], name=filename) return r