def mito_PHENOSCORE(file_='MITO_pheno_scores.pkl', 
                    folder='/media/lalil0u/New/projects/drug_screen/results/'):
    f=open('../data/mitocheck_exp_hitlist_perPheno.pkl')
    hitlistperpheno=pickle.load(f)
    f.close()
    
    f=open(os.path.join(folder, file_), 'r')
    scores, who=pickle.load(f); f.close()
    norm = mpl.colors.Normalize(-0.2,0.6)
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    
    bigGenesL=np.array([dictSiEntrez[yqualdict[e]] for e in who])
    
    for pheno in hitlistperpheno:
        expL=hitlistperpheno[pheno]
        genesL=np.array([dictSiEntrez[yqualdict[e]] for e in filter(lambda x: yqualdict[x] in dictSiEntrez, expL)])
        distinct_genes = sorted(list(set(genesL)))
        ind=np.hstack((np.where(bigGenesL==gene)[0] for gene in distinct_genes))
        print pheno, 
        
        currscores=scores[ind]
        print currscores.shape
        f=p.figure()
        ax=f.add_subplot(111)
        ax.matshow(currscores.T, cmap=mpl.cm.YlOrRd, norm=norm, aspect='auto')
        ax.set_title(pheno)
        ax.set_yticks(range(15))
        ax.set_yticklabels(CLASSES)
        
#        p.show()
        p.savefig(os.path.join(folder, 'pheno_score_MITO_{}.png'.format(pheno[:10])))
    return
Пример #2
0
 def load_pheno_seq_results_MITO(self,exp_list, time_aggregated=False):
     '''
     Here we're loading results from per frame files (pheno_count) on a per experiment basis. This will be interesting to look at distances between experiments
     based on phenotypes, aggregated on time
     '''
     missed=[]
     yqualDict=expSi(self.settings.mitocheck_qc_file)
     dictSiEntrez=siEntrez(self.settings.mitocheck_mapping_file)
     result = None; i=0; who=[]
     for pl,w in exp_list:
         print i,
         
         try:
             f=open(os.path.join(self.settings.outputFolder,pl, self.settings.outputFile.format(pl[:9], w)), 'r')
             pheno_seq_per_frame= pickle.load(f)
             f.close()
         except:
             print "Loading error for ", pl, w
             self.plate=pl; self.well=w
             if self.MITO_usable():
                 missed.append((pl,w))
             continue
         else:
         #15 and 16 are respectively out of focus and artefact objects. We don't want them
             if time_aggregated:
                 pheno_seq_per_frame=np.sum(pheno_seq_per_frame, 0)
                 pheno_seq_list=pheno_seq_per_frame/float(np.sum(pheno_seq_per_frame))
                 result = np.vstack((result, pheno_seq_list)) if result is not None else pheno_seq_list
             else:
                 pheno_seq_per_frame=np.vstack((np.sum(pheno_seq_per_frame[self.settings.time_agg*k:self.settings.time_agg*(k+1)],0) 
                                                for k in range(pheno_seq_per_frame.shape[0]/self.settings.time_agg)))
                 if result is not None:
                     shape_=min(result.shape[1], pheno_seq_per_frame.shape[0])
                     result = np.vstack((result[:,:shape_], pheno_seq_per_frame[np.newaxis][:,:shape_]))
                 else:
                     result= pheno_seq_per_frame[np.newaxis]
                 
     #ATTENTION gros bug avec LTValidMItosis si on limite a la taille et non pas au split de -- le nom de la plaque...
             who.append('{}--{:>03}'.format(pl.split('--')[0], w))
         finally:
             i+=1
     
     _name='time' if not time_aggregated else ''
             
     print "Saving to ",self.settings.outputFile.format("ALL", "MITO_{}".format(_name))
     
     f=open(os.path.join(self.settings.outputFolder,self.settings.outputFile.format("ALL", "MITO_{}".format(_name))), 'w')
     pickle.dump((result, who),f); f.close()
     return missed
Пример #3
0
def condition_inference(M, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, 
                         taking_siRNAs=False):
    '''
    - M: distance matrix of size (hits, mitocheck)
    - taking_siRNAs: indicates if you want to consider different values of the same siRNAs independently (False) or as replicates of the same condition
'''
    r={}
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False))
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    
    siRNAs=[yqualdict[e] for e in who_Mitocheck]
    genes=[dictSiEntrez[e] for e in siRNAs]
    
    count=Counter(exposure_hits)
    #This way I look at exposures that are hits at least 50% of the times/plates
    for el in filter(lambda x: count[x]/float(PASSED_QC_COND[x])>0.5, count):
        print el
        where_=np.where(exposure_hits==el)[0]
        
        batch_names = who_hits[where_]
        
        curr_dist=np.hstack((M[j] for j in where_))
        curr_who=[(batch_names[0], '') for k in range(M.shape[1])]
        
        curr_conditions=list(who_Mitocheck) if not taking_siRNAs else list(siRNAs)
        for name in batch_names[1:]:
            curr_who.extend([(name, '') for k in range(M.shape[1])])
            if not taking_siRNAs:
                curr_conditions.extend(who_Mitocheck)
            else:
                curr_conditions.extend(siRNAs)
        print curr_dist.shape
        r[el], random_result=computeRPpvalues(curr_dist, np.array(curr_who), conditions=np.array(curr_conditions), technical_replicates_key=np.median, 
                     num_permutations=num_permutations, reverse=False, 
                     batch_names=batch_names, random_result=random_result,
                     signed=False)
        r[el]=sorted(r[el], key=itemgetter(2))
        pval=np.array(np.array(r[el])[:,-1], dtype=float)
        if not taking_siRNAs:
            currG=[dictSiEntrez[yqualdict[e]] for e in np.array(r[el])[np.where(pval<=threshold)][:,0]]
        else:
            currG=[dictSiEntrez[e] for e in np.array(r[el])[np.where(pval<=threshold)][:,0]]
        print sorted(Counter(currG).keys())
        
        
    return r
def from_geneL_to_phenoHit(geneL,hitFile='../data/mitocheck_exp_hitlist_perPheno.pkl'):
    '''
    Build the mapping gene -> sorted list of the phenotypes for which at least one
    experiment targeting this gene is in the hit list.

    - geneL: NOTE(review) this argument is not used, the mapping covers every gene found in
      the hit file -- confirm whether the result was meant to be restricted to geneL.
    - hitFile: pickle of a dict phenotype -> list of hit experiment ids.

    Experiments whose siRNA has no entry in the siRNA -> gene mapping are ignored.
    Returns a defaultdict(list).
    '''
    # The context manager closes the file even when unpickling fails
    with open(hitFile) as f:
        expPerPheno=pickle.load(f)
    yqualdict=expSi('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/qc_export.txt')
    dictSiEntrez=siEntrez('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')

    res=defaultdict(list)

    for pheno in expPerPheno:
        for exp in expPerPheno[pheno]:
            siRNA=yqualdict[exp]
            if siRNA in dictSiEntrez:
                res[dictSiEntrez[siRNA]].append(pheno)

    # A gene hit by several experiments of one phenotype lists that phenotype once
    for gene in res:
        res[gene]=sorted(set(res[gene]))

    return res
Пример #5
0
def evaluate_inference(result, distance_name=None, hitlist_file='/media/lalil0u/New/workspace2/Xb_screen/data/mitocheck_exp_hitlist_perPheno.pkl', 
                       print_=False,folder='/media/lalil0u/New/projects/drug_screen/results/',
                       threshold=0.0001):
    f=open(hitlist_file, 'r')
    hitlist=pickle.load(f); f.close()
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    
    to_plot_info=defaultdict(dict)
    to_GO_info=defaultdict(list)
    
    for cond in sorted(result):
        if print_:
            print '-----------', cond
        l=np.array([(dictSiEntrez[el[0]], el[-1]) for el in result[cond]])
        to_plot_info[cond]=defaultdict(list)
        for el in sorted(KNOWN_TARGETS[cond.split('--')[0]]):
            if np.where(l[:,0]==el)[0].shape[0]>0:
                rank_=np.where(l[:,0]==el)[0][0]
                to_plot_info[cond]['genes'].append(rank_)
                to_plot_info[cond]['gene_list'].append(el)
                if print_:
                    print el, l[np.where(l[:,0]==el)], rank_
        
        if type(threshold)==int:
            gene_lim=l[:threshold][:,0]
        else:
            lim=np.where(np.array(l[:,1], dtype=float)==threshold)[0]
            gene_lim=l[lim][:,0]
        to_GO_info[cond]=[trad[el] for el in gene_lim]
        res=[]
        for gene in gene_lim:
            res.extend(hitlist[gene])
    #STEP2 here : we add writing gene list files
            
        to_plot_info[cond]['type']= Counter(res)
    
    multipleGeneListsToFile([to_GO_info[el] for el in to_GO_info], [el for el in to_GO_info], name=os.path.join(folder, 'GO_{}.txt'.format(distance_name)))
    
    return to_plot_info
Пример #6
0
def loadPredictions(loadingFolder = '../resultData/thrivisions/predictions', outputFilename = "thripred_{}_{}.pkl", sh=False, load=False,write=False,
                    mitocheck = '/cbio/donnees/aschoenauer/workspace2/Xb_screen/data/mitocheck_siRNAs_target_genes_Ens75.txt',
                    qc = '/cbio/donnees/aschoenauer/workspace2/Xb_screen/data/qc_export.txt',
                    ensembl="../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt",
                    threshold=0.05, qval=None
                    ):
    '''
    Gather the per-experiment prediction files of `loadingFolder` (files whose name contains
    'thripred') into a single pickle, <loadingFolder>/all_predictions.pkl.

    - load: nothing is done unless it is True
    - mitocheck, qc: siRNA -> gene mapping file and quality control file given to siEntrez
      and expSi
    - outputFilename, sh, write, ensembl, threshold, qval: not used in the visible code

    The pickle holds (nb_objects_init, nb_objects_final, percent_thrivision, who, genes, siRNA),
    one entry per loaded file. Plate, well and experiment key are read from fixed positions
    of the file name. siRNAs missing from the mapping are recorded with the gene 'ctrl'.

    Returns None.
    '''
    if not load:
        return

    yqualDict=expSi(qc)
    dictSiEntrez=siEntrez(mitocheck, yqualDict.values())

    results = [name for name in os.listdir(loadingFolder) if 'thripred' in name]
    who=[]; siRNA=[]; genes=[]
    nb_objects_init=[];nb_objects_final=[];
    percent_thrivision=[]
    for result in results:
        try:
            # The context manager closes the file even when unpickling fails
            with open(os.path.join(loadingFolder, result)) as f:
                r = pickle.load(f)
        except (OSError, IOError):
            # The parenthesized tuple catches both types; `except OSError, IOError:` only
            # caught OSError and rebound the name IOError to the exception instance.
            # NOTE(review): debugger breakpoint left in by the author; the file is then skipped
            pdb.set_trace()
            continue

        percent, nb_ob_init, nb_obj_final=r[:3]
        who.append((result[9:18], result[19:27]))
        nb_objects_init.append(nb_ob_init)
        nb_objects_final.append(nb_obj_final)
        percent_thrivision.append(percent*nb_ob_init)
        siCourant = yqualDict[result[9:18]+'--'+result[21:24]]
        siRNA.append(siCourant)
        try:
            genes.append(dictSiEntrez[siCourant])
        except KeyError:
            if siCourant not in ["scramble", '103860', '251283']:
                # Unexpected unmapped siRNA: stop in the debugger, then treat it as a control
                pdb.set_trace()
            genes.append('ctrl')

    with open(os.path.join(loadingFolder, "all_predictions.pkl"), 'w') as f:
        pickle.dump((nb_objects_init, nb_objects_final, percent_thrivision, who, genes, siRNA),f)
    return
Пример #7
0
    def MITO_usable(self, yqualDict=None, dictSiEntrez=None):
        '''
        Tell whether the experiment (self.plate, self.well) can be used: it must be present in
        the quality control mapping and, unless it is a Mitocheck control, its siRNA must be
        present in the siRNA -> gene mapping.

        - yqualDict: experiment id -> siRNA mapping. If None it is loaded from
          self.settings.valid_qc_file for plates whose name contains 'LTValidMitosis', from
          self.settings.mitocheck_qc_file otherwise.
        - dictSiEntrez: siRNA -> gene mapping. If None it is loaded from
          self.settings.mitocheck_mapping_file.

        Returns False, after writing the reason on stderr, when one of the checks fails;
        True otherwise.
        '''
        is_validation='LTValidMitosis' in self.plate

        if dictSiEntrez is None:
            dictSiEntrez=siEntrez(self.settings.mitocheck_mapping_file)
        if yqualDict is None:
            if is_validation:
                yqualDict=expSi(self.settings.valid_qc_file, primary_screen=False)
            else:
                yqualDict=expSi(self.settings.mitocheck_qc_file)

        # Validation plates are identified by what precedes '--' in their name, primary
        # screen plates by the first nine characters of their name
        plate_id=self.plate.split('--')[0] if is_validation else self.plate[:9]
        exp_id='{}--{:>03}'.format(plate_id, self.well)

        if exp_id not in yqualDict:
    #i. checking if quality control passed
            sys.stderr.write("Quality control not passed {} {} \n".format(self.plate, self.well))
            return False

        # Only evaluated once the experiment is known to be in yqualDict, so that a failed
        # quality control is reported instead of raising KeyError
        if is_validation:
            test=yqualDict[exp_id] not in dictSiEntrez
        else:
            test=not is_ctrl_mitocheck((self.plate[:9], '{:>05}'.format(self.well))) and yqualDict[exp_id] not in dictSiEntrez

        if test:
    #ii.checking if siRNA corresponds to a single target in the current state of knowledge
            sys.stderr.write( "SiRNA having no target or multiple target {} {}\n".format(self.plate, self.well))
            return False
        return True
def selecting_right_Mito_exp(folder='/media/lalil0u/New/projects/drug_screen/results/'):
    f=open('../data/ANCIENTmitocheck_exp_hitlist.pkl')
    mito_hitexp=list(set(pickle.load(f)))
    f.close()
    
    f=open(os.path.join(folder, 'MITO_pheno_scores_NOTVAL.pkl'))
    r=pickle.load(f);f.close()
    big_phenoscore=dict(zip(r[1], r[0]))

    f=open(os.path.join(folder, 'MITO_pheno_scores_VAL.pkl'))
    r=pickle.load(f);f.close()
    val_phenoscore=dict(zip(r[1], r[0]))

    yqualdict=expSi('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/qc_export.txt')
    dictSiEntrez=siEntrez('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    genes_big=[]
    gene_si=defaultdict(list)
    res_siRNA=defaultdict(list)
    si_exp=defaultdict(list)
    
    unmapped_si=[]
    
    for exp in mito_hitexp:
        try:
            currSi=yqualdict[exp]; currGene = dictSiEntrez[currSi]
            
        except KeyError:
            #print "{} siRNA not in mapping file anymore".format(yqualdict[exp]),
            unmapped_si.append(yqualdict[exp])
        else:
            if exp in big_phenoscore:
                genes_big.append(currGene)
                gene_si[currGene].append(currSi)
                res_siRNA[currSi].append(big_phenoscore[exp][0])
                si_exp[currSi].append(exp)
            else:
                print "{} no data".format(exp)
                continue
    print "------------------------------------------------------Youpi next step looking at validation experiments"
    genes_big=sorted(list(set(genes_big)))
            
    yqualdict2=expSi('/media/lalil0u/New/workspace2/Xb_screen/data/mapping_2014/qc_validation_exp.txt', primary_screen=False)
    genes_big2=[]
    for exp in yqualdict2:
        try:
            currSi=yqualdict2[exp]; currGene = dictSiEntrez[currSi]
        except KeyError:
            if yqualdict2[exp]!='empty':
#                print "{} siRNA not in mapping file anymore".format(yqualdict2[exp]),
                unmapped_si.append(yqualdict2[exp])
        else:
            if exp in val_phenoscore:
                genes_big2.append(currGene)
                gene_si[currGene].append(currSi)
                res_siRNA[currSi].append(val_phenoscore[exp][0])
                si_exp[currSi].append(exp)
            else:
                print "{} no data".format(exp)
                continue
            
    genes_big2=sorted(list(set(genes_big2)))
    
    print "Unmapped siRNAs ", len(unmapped_si)
    
    print "How many genes do we gain by using validation experiments?"
    print len([el for el in genes_big if el not in genes_big2])
    
    genes_big.extend(genes_big2)
    genes_big=sorted(list(set(genes_big)))
    print "How many genes finally ", len(genes_big)
    final_siRNA_list=[]
    final_exp_list=[]
    genesL=[]
    count_siRNA_total=0
    for gene in genes_big:
        currSiL=gene_si[gene]
        counts=Counter(currSiL)
        count_siRNA_total+=len(counts)
        currRes=[]; siL=[]
        for siRNA in filter(lambda x: counts[x]>=2, counts):
            currRes.append(np.median(res_siRNA[siRNA]))
            siL.append(siRNA)

        if currRes!=[]: 
            choice=np.array(siL)[np.argmin(np.array(currRes))]
            final_exp_list.extend(si_exp[choice])
            final_siRNA_list.extend([choice for k in range(counts[choice])])
            genesL.extend([gene for k in range(counts[choice])])
    print "How many siRNAs in total ", count_siRNA_total
    return genesL, final_exp_list, final_siRNA_list
Пример #9
0
def condition_cluster_inference(M, clusters, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, filename, 
                         taking_siRNAs=True, gsea=False):
    '''
    - M: distance matrix of size (hits, mitocheck)
    - taking_siRNAs: indicates if you want to consider different values of the same siRNAs independently (False) or as replicates of the same condition
    - num_permutations: no calculation of p-values if None, else number of permutations
    - filename if we want to write GSEA ranking files
    - gsea : if you want to write gsea ranking files
'''
    r={}
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False))
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    
    siRNAs=[yqualdict[e] for e in who_Mitocheck]
    genes=[dictSiEntrez[e] for e in siRNAs]
    past_cluster_num=0
    for cluster_num in clusters:
        print cluster_num
        r[cluster_num]={'conditions':clusters[cluster_num]}
        where_=np.hstack((np.where(exposure_hits==el)[0] for el in clusters[cluster_num]))
        
        batch_names = who_hits[where_]
        
        curr_dist=np.hstack((M[j] for j in where_))
        curr_who=[(batch_names[0], '') for k in range(M.shape[1])]
        
        curr_conditions=list(who_Mitocheck) if not taking_siRNAs else list(genes)
        for name in batch_names[1:]:
            curr_who.extend([(name, '') for k in range(M.shape[1])])
            if not taking_siRNAs:
                curr_conditions.extend(who_Mitocheck)
            else:
                curr_conditions.extend(genes)
        print curr_dist.shape
        if num_permutations is not None and past_cluster_num!=len(batch_names):
    #If there are only five experiments that are different between the two cluster length, then no need to redo the random rank product computation
            random_result=None
        curr_res=computeRPpvalues(curr_dist, np.array(curr_who), 
                                                       conditions=np.array(curr_conditions), 
                                                       technical_replicates_key=np.median,
                                                       xb_screen=False, 
                                                         num_permutations=num_permutations, reverse=False, 
                                                         batch_names=batch_names, random_result=random_result,
                                                         signed=False)
        #donc curr_res est [(gene, rank value)]
        if num_permutations is None and gsea:
            writeGSEARankingFile(curr_res, filename.format(cluster_num))
            
        else:
            if len(curr_res)==2: 
    #this means that we have the p-values
                curr_res, random_result=curr_res
                curr_res=sorted(curr_res, key=itemgetter(-1))
                pval=np.array(np.array(curr_res)[:,-1], dtype=float)
                if not taking_siRNAs:
                    currG=[dictSiEntrez[yqualdict[e]] for e in np.array(curr_res)[np.where(pval<=threshold)][:,0]]
                else:
                    currG=[e for e in np.array(curr_res)[np.where(pval<=threshold)][:,0]]
                print sorted(currG)
            else:
    #this means that we're working with the rank product values
                curr_res=sorted(curr_res, key=itemgetter(-1))
                if not taking_siRNAs:
                    currG=[dictSiEntrez[yqualdict[e]] for e in np.array(curr_res)[:threshold][:,0]]
                else:
                    currG=[e for e in np.array(curr_res)[:threshold][:,0]]
                print sorted(currG)
                
            past_cluster_num=len(batch_names)
        
        r[cluster_num]['genes']=[trad[el] for el in currG]
        
        r[cluster_num]['result']=[(el[0],el[-1]) for el in curr_res]
    multipleGeneListsToFile([r[k]['genes'] for k in r], [k for k in r], name=filename)
    
    return r