示例#1
0
def condition_inference(M, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result,
                         taking_siRNAs=False):
    '''
    For every exposure that is a hit on more than half of its QC-passed
    plates, run computeRPpvalues on the distances between its hit plates and
    the Mitocheck experiments, then print the genes whose p-value is at most
    `threshold`.

    - M: distance matrix of size (hits, mitocheck)
    - who_hits: identifiers of the hit plates, one per row of M; indexed with an
      integer array, so it must be a numpy array
    - exposure_hits: numpy array giving the exposure of each row of M
    - who_Mitocheck: Mitocheck experiment identifiers, one per column of M; each
      must be a key of the mapping returned by expSi
    - num_permutations: forwarded to computeRPpvalues
    - threshold: p-value cut-off used to select the printed genes
    - random_result: forwarded to computeRPpvalues; the value it returns is
      passed on to the call made for the next exposure
    - taking_siRNAs: if False, every Mitocheck experiment is its own condition;
      if True, experiments sharing an siRNA are given the same condition label

    Returns a dict mapping each selected exposure to its result rows, sorted on
    their third field.
    '''
    r={}
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False))
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')

    siRNAs=[yqualdict[e] for e in who_Mitocheck]
    # Condition labels for one pass over the columns of M.
    base_conditions=list(siRNAs) if taking_siRNAs else list(who_Mitocheck)
    n_mitocheck=M.shape[1]

    count=Counter(exposure_hits)
    # Keep exposures that are hits on strictly more than 50% of their QC-passed plates.
    frequent=[x for x in count if count[x]/float(PASSED_QC_COND[x])>0.5]
    for el in frequent:
        print(el)
        where_=np.where(exposure_hits==el)[0]
        batch_names=who_hits[where_]

        # Concatenate the selected rows of M end to end. np.hstack needs a real
        # sequence: generators are deprecated since NumPy 1.16 and later rejected.
        curr_dist=np.hstack([M[j] for j in where_])
        # One (plate, '') pair and one condition label per concatenated distance.
        curr_who=[(name, '') for name in batch_names for _ in range(n_mitocheck)]
        curr_conditions=base_conditions*len(batch_names)
        print(curr_dist.shape)

        r[el], random_result=computeRPpvalues(curr_dist, np.array(curr_who), conditions=np.array(curr_conditions), technical_replicates_key=np.median,
                     num_permutations=num_permutations, reverse=False,
                     batch_names=batch_names, random_result=random_result,
                     signed=False)
        r[el]=sorted(r[el], key=itemgetter(2))

        res_arr=np.array(r[el])
        # The p-value is read from the last field of each result row.
        pval=np.array(res_arr[:,-1], dtype=float)
        selected=res_arr[np.where(pval<=threshold)][:,0]
        if not taking_siRNAs:
            # Conditions are experiment ids: map experiment -> siRNA -> gene.
            currG=[dictSiEntrez[yqualdict[e]] for e in selected]
        else:
            # Conditions are already siRNAs: map siRNA -> gene.
            currG=[dictSiEntrez[e] for e in selected]
        print(sorted(Counter(currG).keys()))

    return r
示例#2
0
def condition_cluster_inference(M, clusters, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, filename,
                         taking_siRNAs=True, gsea=False):
    '''
    For every cluster of exposures, pool the hit plates of all its exposures,
    run computeRPpvalues against the Mitocheck experiments, select genes and
    write the per-cluster gene lists with multipleGeneListsToFile.

    - M: distance matrix of size (hits, mitocheck)
    - clusters: dict mapping a cluster id to the exposures it contains
    - who_hits: identifiers of the hit plates, one per row of M; indexed with an
      integer array, so it must be a numpy array
    - exposure_hits: numpy array giving the exposure of each row of M
    - who_Mitocheck: Mitocheck experiment identifiers, one per column of M; each
      must be a key of the mapping returned by expSi
    - num_permutations: no calculation of p-values if None, else number of permutations
    - threshold: p-value cut-off when p-values are available, otherwise the
      number of top-ranked rows to keep (it is used as a slice bound)
    - random_result: forwarded to computeRPpvalues. NOTE(review): when
      num_permutations is not None the value supplied by the caller is replaced
      by None before the first call, because past_cluster_num starts at 0.
    - filename: name passed to multipleGeneListsToFile; in GSEA mode it is also
      used as a format template, filename.format(cluster_num), for ranking files
    - taking_siRNAs: if False, every Mitocheck experiment is its own condition;
      if True, experiments are labelled with the gene looked up through siEntrez
    - gsea: if True and num_permutations is None, write a GSEA ranking file per
      cluster instead of selecting genes (the 'genes' entry is then empty)

    Returns a dict mapping each cluster id to a dict with keys 'conditions',
    'genes' and 'result'.
    '''
    r={}
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False))
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')

    siRNAs=[yqualdict[e] for e in who_Mitocheck]
    genes=[dictSiEntrez[e] for e in siRNAs]
    # Condition labels for one pass over the columns of M.
    base_conditions=list(genes) if taking_siRNAs else list(who_Mitocheck)
    n_mitocheck=M.shape[1]

    past_cluster_num=0
    for cluster_num in clusters:
        print(cluster_num)
        r[cluster_num]={'conditions':clusters[cluster_num]}
        # Row indices of M for every exposure of the cluster. np.hstack needs a
        # real sequence: generators are deprecated since NumPy 1.16 and later rejected.
        where_=np.hstack([np.where(exposure_hits==cond)[0] for cond in clusters[cluster_num]])
        batch_names=who_hits[where_]

        curr_dist=np.hstack([M[j] for j in where_])
        # One (plate, '') pair and one condition label per concatenated distance.
        curr_who=[(name, '') for name in batch_names for _ in range(n_mitocheck)]
        curr_conditions=base_conditions*len(batch_names)
        print(curr_dist.shape)

        if num_permutations is not None and past_cluster_num!=len(batch_names):
            # The random rank product result is only reused when this cluster has
            # the same number of plates as the previously processed one.
            random_result=None
        curr_res=computeRPpvalues(curr_dist, np.array(curr_who),
                                  conditions=np.array(curr_conditions),
                                  technical_replicates_key=np.median,
                                  xb_screen=False,
                                  num_permutations=num_permutations, reverse=False,
                                  batch_names=batch_names, random_result=random_result,
                                  signed=False)
        # so curr_res is [(gene, rank value)]

        # Reset per cluster: the GSEA branch selects no genes, and leaving currG
        # unbound made the 'genes' assignment below fail with NameError.
        currG=[]
        if num_permutations is None and gsea:
            writeGSEARankingFile(curr_res, filename.format(cluster_num))
        else:
            # NOTE(review): a length of 2 is taken to mean (results, random_result);
            # a bare result list with exactly two rows would be misread - confirm
            # against computeRPpvalues.
            if len(curr_res)==2:
                # this means that we have the p-values
                curr_res, random_result=curr_res
                curr_res=sorted(curr_res, key=itemgetter(-1))
                res_arr=np.array(curr_res)
                pval=np.array(res_arr[:,-1], dtype=float)
                selected=res_arr[np.where(pval<=threshold)][:,0]
            else:
                # this means that we're working with the rank product values
                curr_res=sorted(curr_res, key=itemgetter(-1))
                selected=np.array(curr_res)[:threshold][:,0]

            if not taking_siRNAs:
                # Conditions are experiment ids: map experiment -> siRNA -> gene.
                currG=[dictSiEntrez[yqualdict[e]] for e in selected]
            else:
                # Conditions are already gene identifiers.
                currG=list(selected)
            print(sorted(currG))

            past_cluster_num=len(batch_names)

        r[cluster_num]['genes']=[trad[g] for g in currG]
        # Keep the first and last field of every result row.
        r[cluster_num]['result']=[(row[0],row[-1]) for row in curr_res]
    multipleGeneListsToFile([r[k]['genes'] for k in r], [k for k in r], name=filename)

    return r