def distance_matrix(self, squareform=True):
    """Return the pairwise Euclidean distances between points in scaled space.

    Distances are computed between every pair of rows of ``self.scaled``.

    Parameters
    ----------
    squareform : bool (default: True)
        If True, return the full symmetric distance matrix. Otherwise
        return the flattened upper triangle without the main diagonal.

    Returns
    -------
    numpy.ndarray
        Either the square matrix or the condensed 1-D vector of distances.
    """
    # The former `scipy.sqrt` / `scipy.triu_indices` spellings were only
    # deprecated aliases of NumPy functions in the SciPy namespace and are
    # not available in current SciPy releases; call NumPy directly.
    import numpy as np

    # Broadcasting yields every pairwise difference vector.
    # NOTE(review): assumes self.scaled is a 2-D (points x dimensions)
    # array -- confirm against the class definition.
    diff = self.scaled[:, None] - self.scaled[None, :]
    mat = np.sqrt((diff ** 2).sum(axis=2))
    if squareform:
        return mat
    return mat[np.triu_indices(len(self), 1)]
def adjacency(self, min_snp2gene_obs=2, fdr_cutoff=0.3, return_genes=False):
    """Return a matrix showing the number of shared HPO genes by Term.

    The diagonal of the matrix is the number of genes discovered by that
    term. The upper triangle shows the size of the overlap between the row
    and column terms and the lower triangle shows the hypergeometric
    p-value for that overlap. The universe used is the number of unique
    genes in the overlap results (``self.results.gene``).

    Parameters
    ----------
    min_snp2gene_obs : int (default: 2)
        The minimum number of SNP2gene mapping observations needed to be
        HPO.
    fdr_cutoff : float (default: 0.3)
        The FDR cutoff to be considered HPO.
    return_genes : bool (default: False)
        Return the candidate gene list instead of the overlap table.

    Returns
    -------
    pandas.DataFrame
        If ``return_genes`` is set: one row per pair of distinct terms
        sharing at least one gene, with columns ``Term1``, ``Term2``,
        ``num_common``, ``pval`` and ``common`` (comma-joined gene ids).
        Otherwise: a square float table indexed by term on both axes, laid
        out as described above. Both results are empty when no term has
        HPO genes.
    """
    hpo = self.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True,
    )
    genes_by_term = {term: set(group.gene) for term, group in hpo.groupby("Term")}
    num_universe = len(self.results.gene.unique())

    terms = list(genes_by_term)
    records = []
    for i, a in enumerate(terms):
        genes_a = genes_by_term[a]
        # The comparison is symmetric, so only visit each unordered pair
        # (and the diagonal) once.
        for b in terms[i:]:
            genes_b = genes_by_term[b]
            common = genes_a & genes_b
            num_common = len(common)
            if a != b:
                pval = hypergeom.sf(
                    num_common - 1, num_universe, len(genes_a), len(genes_b)
                )
            else:
                # This makes the diagonal of the matrix the number of HPO
                # genes for the term.
                pval = len(genes_a)
            # Sort so the joined string does not depend on set iteration order.
            records.append((a, b, num_common, pval, ",".join(sorted(common))))

    # Passing the column names up front keeps the frame well-formed even
    # when there are no records at all.
    adj = pd.DataFrame(
        records, columns=["Term1", "Term2", "num_common", "pval", "common"]
    )

    # Stop early if we just want to return the lists
    if return_genes:
        if adj.empty:
            return adj
        adj = adj[adj.num_common > 0]
        adj = adj[adj.Term1 != adj.Term2]
        return adj.drop_duplicates()

    if adj.empty:
        return pd.DataFrame(
            index=pd.Index([], name="Term1"),
            columns=pd.Index([], name="Term2"),
            dtype=float,
        )

    overlap = pd.pivot_table(
        adj, index="Term1", columns="Term2", values="num_common"
    )
    # Pivoting with the axes swapped places the p-values (and the diagonal
    # counts) in the lower triangle.
    pvals = pd.pivot_table(adj, index="Term2", columns="Term1", values="pval")

    # Build the masked halves as fresh arrays: writing through
    # ``DataFrame.values`` is not guaranteed to reach the frame and fails
    # under pandas Copy-on-Write.
    upper_counts = np.triu(overlap.to_numpy(dtype=float), k=1)
    lower_pvals = np.tril(pvals.to_numpy(dtype=float))
    return pd.DataFrame(
        upper_counts + lower_pvals, index=overlap.index, columns=overlap.columns
    )
def adjacency(
    self,
    min_snp2gene_obs=2,
    fdr_cutoff=0.3,
    return_genes=False,
    second_overlap=None,
):
    """Return a matrix showing the number of shared HPO genes by Term.

    When comparing an overlap with itself (``second_overlap`` is None or
    is ``self``) the result is square: the diagonal is the number of genes
    discovered by that term, the upper triangle shows the size of the
    overlap between the row and column terms and the lower triangle shows
    the hypergeometric p-value for that overlap.

    When a different ``second_overlap`` is given, every term of this
    overlap is compared with every term of the second one and the result
    is a (possibly rectangular) table of shared gene counts.

    The universe used is the number of unique genes in the overlap results
    of both objects.

    Parameters
    ----------
    min_snp2gene_obs : int (default: 2)
        The minimum number of SNP2gene mapping observations needed to be
        HPO.
    fdr_cutoff : float (default: 0.3)
        The FDR cutoff to be considered HPO.
    return_genes : bool (default: False)
        Return the candidate gene list instead of the overlap table.
    second_overlap : Overlap object (default: None)
        If specified, overlap between terms is calculated between this
        overlap's HPO genes and the second overlap's HPO genes. Rows are
        this overlap's terms (``Term1``) and columns are the second
        overlap's terms (``Term2``).

    Returns
    -------
    pandas.DataFrame
        If ``return_genes`` is set: one row per compared pair of terms
        sharing at least one gene, with columns ``Term1``, ``Term2``,
        ``num_term1``, ``num_term2``, ``num_common``, ``pval``, ``common``
        and ``bonferoni`` (whether ``pval`` passes 0.05 divided by the
        number of term combinations). Otherwise the float table described
        above. Both results are empty when either side has no HPO terms.
    """
    # A self-comparison is symmetric, so only one triangle needs computing
    # and the diagonal can carry the per-term gene count. A comparison of
    # two different overlaps has neither property.
    symmetric = second_overlap is None or second_overlap is self
    if second_overlap is None:
        second_overlap = self

    hpo1 = self.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True,
    )
    hpo2 = second_overlap.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True,
    )
    genes1 = {term: set(group.gene) for term, group in hpo1.groupby("Term")}
    genes2 = {term: set(group.gene) for term, group in hpo2.groupby("Term")}
    num_universe = len(
        set(self.results.gene.unique()).union(second_overlap.results.gene.unique())
    )

    records = []
    for i, (a, genes_a) in enumerate(genes1.items()):
        num_a = len(genes_a)
        for j, (b, genes_b) in enumerate(genes2.items()):
            # Skipping the lower triangle is only valid when both axes
            # enumerate the same terms in the same order.
            if symmetric and j < i:
                continue
            num_b = len(genes_b)
            common = genes_a & genes_b
            num_common = len(common)
            if symmetric and a == b:
                # This makes the diagonal of the matrix the number of HPO
                # genes for the term.
                pval = num_a
            else:
                pval = hypergeom.sf(num_common - 1, num_universe, num_a, num_b)
            # Sort so the joined string does not depend on set iteration order.
            records.append(
                (a, b, num_a, num_b, num_common, pval, ",".join(sorted(common)))
            )

    # Passing the column names up front keeps the frame well-formed even
    # when there are no records at all.
    adj = pd.DataFrame(
        records,
        columns=[
            "Term1",
            "Term2",
            "num_term1",
            "num_term2",
            "num_common",
            "pval",
            "common",
        ],
    )

    # Stop early if we just want to return the lists
    if return_genes:
        if adj.empty:
            return adj.assign(bonferoni=pd.Series(dtype=bool))
        adj = adj[adj.num_common > 0]
        if symmetric:
            # A term compared with itself is not an overlap of interest.
            adj = adj[adj.Term1 != adj.Term2]
        adj = adj.drop_duplicates()
        threshold = 0.05 / (len(genes1) * len(genes2))
        # ``assign`` returns a new frame, avoiding a chained-assignment
        # write into the filtered selection.
        return adj.assign(bonferoni=adj.pval <= threshold)

    if adj.empty:
        return pd.DataFrame(
            index=pd.Index([], name="Term1"),
            columns=pd.Index([], name="Term2"),
            dtype=float,
        )

    overlap = pd.pivot_table(
        adj, index="Term1", columns="Term2", values="num_common"
    )
    if not symmetric:
        # Every pair was computed, so the table is already complete.
        return overlap.astype(float)

    # Pivoting with the axes swapped places the p-values (and the diagonal
    # counts) in the lower triangle, where the docstring promises them.
    pvals = pd.pivot_table(adj, index="Term2", columns="Term1", values="pval")

    # Build the masked halves as fresh arrays: writing through
    # ``DataFrame.values`` is not guaranteed to reach the frame and fails
    # under pandas Copy-on-Write.
    upper_counts = np.triu(overlap.to_numpy(dtype=float), k=1)
    lower_pvals = np.tril(pvals.to_numpy(dtype=float))
    return pd.DataFrame(
        upper_counts + lower_pvals, index=overlap.index, columns=overlap.columns
    )
def adjacency(self, min_snp2gene_obs=2, fdr_cutoff=0.3, return_genes=False,
              second_overlap=None):
    '''
        Return a matrix showing the number of shared HPO genes by Term.

        When comparing an overlap with itself (second_overlap is None or
        is self) the result is square: the diagonal is the number of genes
        discovered by that term, the upper triangle shows the size of the
        overlap between the row and column terms and the lower triangle
        shows the hypergeometric p-value for that overlap.

        When a different second_overlap is given, every term of this
        overlap is compared with every term of the second one and the
        result is a (possibly rectangular) table of shared gene counts.

        The universe used is the number of unique genes in the overlap
        results of both objects.

        min_snp2gene_obs : int (default: 2)
            The minimum number of SNP2gene mapping observations needed to
            be HPO
        fdr_cutoff : float (default: 0.3)
            The FDR cutoff to be considered HPO
        return_genes : bool (default: False)
            Return the candidate gene list instead of the overlap table.
            The list has the columns Term1, Term2, num_term1, num_term2,
            num_common, pval, common and bonferoni (whether pval passes
            0.05 divided by the number of term combinations).
        second_overlap : Overlap Object (default: None)
            If specified, overlap between terms is calculated between this
            overlap's HPO genes and the second overlap's HPO genes. Rows
            are this overlap's terms (Term1) and columns are the second
            overlap's terms (Term2).

        Both kinds of result are empty when either side has no HPO terms.
    '''
    # A self-comparison is symmetric, so only one triangle needs computing
    # and the diagonal can carry the per-term gene count. A comparison of
    # two different overlaps has neither property.
    symmetric = second_overlap is None or second_overlap is self
    if second_overlap is None:
        second_overlap = self

    hpo1 = self.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True
    )
    hpo2 = second_overlap.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True
    )
    x = {term: set(group.gene) for term, group in hpo1.groupby('Term')}
    y = {term: set(group.gene) for term, group in hpo2.groupby('Term')}
    num_universe = len(
        set(self.results.gene.unique()).union(second_overlap.results.gene.unique())
    )

    adj = []
    for i, (a, genes_a) in enumerate(x.items()):
        num_a = len(genes_a)
        for j, (b, genes_b) in enumerate(y.items()):
            # Skipping the lower triangle is only valid when both axes
            # enumerate the same terms in the same order.
            if symmetric and j < i:
                continue
            num_b = len(genes_b)
            common = genes_a & genes_b
            num_common = len(common)
            if symmetric and a == b:
                # This will make the diagonal of the matrix be the number
                # of HPO genes for the element
                pval = num_a
            else:
                pval = hypergeom.sf(num_common - 1, num_universe, num_a, num_b)
            # Sort so the joined string does not depend on set iteration order.
            adj.append((a, b, num_a, num_b, num_common, pval, ','.join(sorted(common))))

    # Passing the column names up front keeps the frame well-formed even
    # when there are no records at all.
    adj = pd.DataFrame(
        adj,
        columns=['Term1', 'Term2', 'num_term1', 'num_term2', 'num_common', 'pval', 'common']
    )

    # Stop early if we just want to return the lists
    if return_genes:
        if adj.empty:
            return adj.assign(bonferoni=pd.Series(dtype=bool))
        adj = adj[adj.num_common > 0]
        if symmetric:
            # A term compared with itself is not an overlap of interest.
            adj = adj[adj.Term1 != adj.Term2]
        adj = adj.drop_duplicates()
        # ``assign`` returns a new frame, avoiding a chained-assignment
        # write into the filtered selection.
        return adj.assign(bonferoni=adj.pval <= (0.05 / (len(x) * len(y))))

    if adj.empty:
        return pd.DataFrame(
            index=pd.Index([], name='Term1'),
            columns=pd.Index([], name='Term2'),
            dtype=float
        )

    overlap = pd.pivot_table(adj, index='Term1', columns='Term2', values='num_common')
    if not symmetric:
        # Every pair was computed, so the table is already complete.
        return overlap.astype(float)

    # Pivoting with the axes swapped places the p-values (and the diagonal
    # counts) in the lower triangle, where the docstring promises them.
    pvals = pd.pivot_table(adj, index='Term2', columns='Term1', values='pval')

    # Build the masked halves as fresh arrays: writing through
    # ``DataFrame.values`` is not guaranteed to reach the frame and fails
    # under pandas Copy-on-Write.
    upper_counts = np.triu(overlap.to_numpy(dtype=float), k=1)
    lower_pvals = np.tril(pvals.to_numpy(dtype=float))
    return pd.DataFrame(
        upper_counts + lower_pvals, index=overlap.index, columns=overlap.columns
    )