예제 #1
0
def evaluate_inference(result, distance_name=None, hitlist_file='/media/lalil0u/New/workspace2/Xb_screen/data/mitocheck_exp_hitlist_perPheno.pkl', 
                       print_=False,folder='/media/lalil0u/New/projects/drug_screen/results/',
                       threshold=0.0001):
    '''
    For each condition of an inference result, locate the known target genes in the
    result list, select the best-scoring genes and count the hit-list labels of the
    selected genes. The selected gene lists of all conditions are also written to
    <folder>/GO_<distance_name>.txt through multipleGeneListsToFile.

    - result: dict condition -> sequence of records. The first item of a record is a
      siRNA identifier (a key of the siRNA -> gene mapping), the last item is its score.
      Records are presumably already sorted from best to worst score - TODO confirm with callers.
    - distance_name: inserted in the name of the output gene-list file
    - hitlist_file: pickle file containing a mapping gene -> iterable of labels
      (presumably phenotypes, judging by the default file name - verify)
    - print_: if True, print the position of each known target that is found
    - folder: folder where the gene-list file is written
    - threshold: if an integer, the first `threshold` records of each condition are selected;
      otherwise the records whose score is lower than or equal to `threshold` are selected

    Returns a dict condition -> {'genes': positions of the known targets in the result list,
    'gene_list': the known targets that were found, 'type': Counter of the hit-list labels
    of the selected genes}.
    '''
    # NOTE: pickle can execute arbitrary code when loading, only use trusted hit-list files.
    # The file is opened in binary mode and closed even if loading fails.
    with open(hitlist_file, 'rb') as f:
        hitlist=pickle.load(f)
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    
    to_plot_info=defaultdict(dict)
    to_GO_info=defaultdict(list)
    # An integer threshold is a number of records, anything else is a cutoff on the score
    threshold_is_count=isinstance(threshold, (int, np.integer))
    
    for cond in sorted(result):
        if print_:
            print('----------- %s' % (cond,))
        # One row per record: column 0 is the gene, column 1 the score
        l=np.array([(dictSiEntrez[el[0]], el[-1]) for el in result[cond]])
        to_plot_info[cond]=defaultdict(list)
        # The known targets are indexed by the part of the condition name located before '--'
        for el in sorted(KNOWN_TARGETS[cond.split('--')[0]]):
            matches=np.where(l[:,0]==el)[0]
            if matches.shape[0]>0:
                # Position of the first record targeting this gene
                rank_=matches[0]
                to_plot_info[cond]['genes'].append(rank_)
                to_plot_info[cond]['gene_list'].append(el)
                if print_:
                    print('%s %s %s' % (el, l[matches], rank_))
        
        if threshold_is_count:
            gene_lim=l[:threshold][:,0]
        else:
            # '<=' and not '==': scores that are strictly below the threshold must be selected as well,
            # as is done in condition_cluster_inference
            lim=np.where(np.array(l[:,1], dtype=float)<=threshold)[0]
            gene_lim=l[lim][:,0]
        to_GO_info[cond]=[trad[el] for el in gene_lim]
        res=[]
        for gene in gene_lim:
            res.extend(hitlist[gene])
        #STEP2 here : we add writing gene list files
        
        to_plot_info[cond]['type']= Counter(res)
    
    conditions=list(to_GO_info)
    multipleGeneListsToFile([to_GO_info[el] for el in conditions], conditions, name=os.path.join(folder, 'GO_{}.txt'.format(distance_name)))
    
    return to_plot_info
예제 #2
0
def condition_cluster_inference(M, clusters, who_hits, exposure_hits, who_Mitocheck, num_permutations, threshold, random_result, filename, 
                         taking_siRNAs=True, gsea=False):
    '''
    Run computeRPpvalues on the rows of M belonging to each cluster of conditions and
    collect, for each cluster, the selected genes and the full ranking.

    - M: distance matrix of size (hits, mitocheck)
    - clusters: dict cluster id -> iterable of conditions, which are matched against exposure_hits
    - who_hits: array indexed with the row numbers of M to obtain the batch names of a cluster
    - exposure_hits: array compared with each condition to find the rows of M of a cluster
    - who_Mitocheck: Mitocheck experiment identifiers (keys of the experiment -> siRNA mapping),
      presumably one per column of M - verify against callers
    - num_permutations: no calculation of p-values if None, else number of permutations
    - threshold: maximal p-value of the selected genes when p-values are returned, else the
      number of top-ranked entries to select (it is used as a slice bound in that case)
    - random_result: previously computed random rank products passed on to computeRPpvalues.
      It is reset to None whenever the number of experiments of the cluster differs from that of
      the previous cluster
    - filename: output file for the gene lists. If gsea is True and num_permutations is None it is
      used as a template instead: filename.format(cluster id) is the GSEA ranking file of a cluster
    - taking_siRNAs: if True the Mitocheck columns are labelled by their target gene, if False by
      their experiment identifier (which is translated to a gene after the ranking)
    - gsea : if you want to write gsea ranking files (only done when num_permutations is None)

    Returns a dict cluster id -> {'conditions': the conditions of the cluster,
    'genes': translated selected genes (empty when only GSEA ranking files are written),
    'result': list of (first item, last item) of each entry of the ranking}.
    '''
    r={}
    yqualdict=expSi('../data/mapping_2014/qc_export.txt')
    yqualdict.update(expSi('../data/mapping_2014/qc_validation_exp.txt', primary_screen=False))
    dictSiEntrez=siEntrez('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    trad=EnsemblEntrezTrad('../data/mapping_2014/mitocheck_siRNAs_target_genes_Ens75.txt')
    
    siRNAs=[yqualdict[e] for e in who_Mitocheck]
    genes=[dictSiEntrez[e] for e in siRNAs]
    # Labels of the columns of M, repeated below once per experiment of the cluster
    column_labels=list(genes) if taking_siRNAs else list(who_Mitocheck)
    n_columns=M.shape[1]
    # In this mode only the GSEA ranking files are written, no gene is selected
    gsea_only=num_permutations is None and gsea
    past_cluster_num=0
    for cluster_num in clusters:
        print(cluster_num)
        r[cluster_num]={'conditions':clusters[cluster_num]}
        # np.hstack is given lists: recent NumPy versions refuse generators
        where_=np.hstack([np.where(exposure_hits==el)[0] for el in clusters[cluster_num]])
        
        batch_names = who_hits[where_]
        
        # Rows of the cluster put end to end, with the matching (batch, '') and label of each value
        curr_dist=np.hstack([M[j] for j in where_])
        curr_who=[(name, '') for name in batch_names for _ in range(n_columns)]
        curr_conditions=column_labels*len(batch_names)
        print(curr_dist.shape)
        if num_permutations is not None and past_cluster_num!=len(batch_names):
            # The random rank products are only reused when the cluster has exactly as many
            # experiments as the previous one.
            # NOTE(review): the original comment mentioned a tolerance of five experiments,
            # which is not what this test does - confirm which one is intended.
            random_result=None
        curr_res=computeRPpvalues(curr_dist, np.array(curr_who), 
                                                       conditions=np.array(curr_conditions), 
                                                       technical_replicates_key=np.median,
                                                       xb_screen=False, 
                                                         num_permutations=num_permutations, reverse=False, 
                                                         batch_names=batch_names, random_result=random_result,
                                                         signed=False)
        # so curr_res is [(gene, rank value)]
        # Default value, so that the GSEA-only mode does not fail on an undefined gene list
        currG=[]
        if gsea_only:
            writeGSEARankingFile(curr_res, filename.format(cluster_num))
            
        else:
            if len(curr_res)==2: 
                # this means that we have the p-values, returned together with the random results
                # NOTE(review): a ranking of exactly two entries would also end up here
                curr_res, random_result=curr_res
                curr_res=sorted(curr_res, key=itemgetter(-1))
                ranked=np.array(curr_res)
                pval=np.array(ranked[:,-1], dtype=float)
                selected=ranked[np.where(pval<=threshold)][:,0]
            else:
                # this means that we're working with the rank product values
                curr_res=sorted(curr_res, key=itemgetter(-1))
                selected=np.array(curr_res)[:threshold][:,0]
            
            if taking_siRNAs:
                currG=list(selected)
            else:
                # Entries are experiment identifiers here: go through the siRNA to get the gene
                currG=[dictSiEntrez[yqualdict[e]] for e in selected]
            print(sorted(currG))
                
            past_cluster_num=len(batch_names)
        
        r[cluster_num]['genes']=[trad[el] for el in currG]
        
        r[cluster_num]['result']=[(el[0],el[-1]) for el in curr_res]
    if not gsea_only:
        # In GSEA-only mode filename is a template and there is no gene list to write
        cluster_ids=list(r)
        multipleGeneListsToFile([r[k]['genes'] for k in cluster_ids], cluster_ids, name=filename)
    
    return r