def all_genes_in_submatrices(network, idx_ppi, idx_mut, idx_ppi_only,
                             idx_mut_only, mutation_profile):
    """Build the block adjacency matrix and the matching mutation matrix.

    Extract the sub-matrices of reference genes and zero-pad the adjacency
    matrix so that it covers every gene (common, PPI-only, mutation-only)::

        +----+----+----+
        | AA | AB | AC |
        +----+----+----+
        | BA | BB | BC |
        +----+----+----+
        | CA | CB | CC |
        +----+----+----+

    AA, BB and CC are adjacency matrices (0/1 matrices with 0 on diagonal):
        AA : common genes between PPI and patients' mutation profiles
        BB : genes found only in PPI
        CC : genes found only in patients' mutation profiles = zero matrix

    Every block that involves a mutation-only gene (AC, BC, CA, CB, CC) is
    an all-zero float32 matrix.

    Parameters
    ----------
    network : sparse matrix, shape (len(gene_id_ppi),len(gene_id_ppi))
        PPI data matrix, called also 'adjacency matrix'.

    idx_ppi : list
        List of common genes' indexes in PPI.

    idx_mut : list
        Column indexes into ``mutation_profile`` that are placed in front of
        the AA block (presumably the common genes, in the same order as
        ``idx_ppi`` -- to be confirmed against the caller).

    idx_ppi_only : list
        List of genes' indexes only in PPI.

    idx_mut_only : list
        List of genes' indexes only in patients' mutation profiles (used to
        select columns of ``mutation_profile``).

    mutation_profile : sparse matrix
        Patients' mutation profiles; its columns are selected with
        ``idx_mut`` and ``idx_mut_only``.

    Returns
    -------
    ppi_total : sparse matrix (CSC)
        Built from all sparse sub-matrices (AA, ... , CC).

    mut_total : sparse matrix (CSC)
        Patients' mutation profiles of all genes (rows: patients,
        columns: genes of AA, BB and CC).

    ppi_filt : sparse matrix
        Filtration from ppi_total : only genes with a degree > 0 are kept
        (rows and columns), i.e. the AA/AB/BA/BB part of the matrix.

    mut_filt : sparse matrix
        Filtration from mut_total : only the columns of genes with a
        degree > 0 in ppi_total are kept.
    """
    print(' ==== all_genes_in_submatrices ')
    n_patients = mutation_profile.shape[0]
    n_common = len(idx_ppi)
    n_ppi_only = len(idx_ppi_only)
    n_mut_only = len(idx_mut_only)

    # Slice the rows once per gene group, then pick the columns.
    rows_common = network[idx_ppi]
    AA = rows_common[:, idx_ppi]
    if AA.shape[0] == 0:
        warnings.warn("There are no common genes between PPI network and patients' mutation profile")
    AB = rows_common[:, idx_ppi_only]
    AC = sp.csc_matrix((n_common, n_mut_only), dtype=np.float32)

    rows_ppi_only = network[idx_ppi_only]
    BA = rows_ppi_only[:, idx_ppi]
    BB = rows_ppi_only[:, idx_ppi_only]
    BC = sp.csc_matrix((n_ppi_only, n_mut_only), dtype=np.float32)

    # TODO condition: if mutOnly = 0
    CA = sp.csc_matrix((n_mut_only, n_common), dtype=np.float32)
    CB = sp.csc_matrix((n_mut_only, n_ppi_only), dtype=np.float32)
    CC = sp.csc_matrix((n_mut_only, n_mut_only), dtype=np.float32)

    print(' ==== ABC ')
    # bmat may return a COO matrix by default; CSC is requested explicitly
    # because the result is indexed below.
    ppi_total = sp.bmat([[AA, AB, AC],
                         [BA, BB, BC],
                         [CA, CB, CC]], format='csc')

    print(' ==== mut_total ')
    # Same reason as above: without an explicit format the result can be COO,
    # which cannot be column-indexed when computing mut_filt.
    mut_total = sp.bmat(
        [[mutation_profile[:, idx_mut],
          sp.csc_matrix((n_patients, n_ppi_only), dtype=np.float32),
          mutation_profile[:, idx_mut_only]]],
        format='csc')

    # filter only genes in PPI
    print(' ==== filter only genes in PPI ')
    degree = Ppi(ppi_total).deg
    in_ppi = degree > 0
    ppi_filt = ppi_total[in_ppi, :][:, in_ppi]
    mut_filt = mut_total[:, in_ppi]

    print(' ==== all_genes_in_submatrices finish ')
    return ppi_total, mut_total, ppi_filt, mut_filt
def filter_ppi_patients(ppi_total, mut_total, ppi_filt, final_influence,
                        ngh_max, keep_singletons=False, min_mutation=0,
                        max_mutation=2000):
    """Keep only the best-influencer connections and filter patients.

    Keeping only the connections with the best influencers and
    filtering some patients based on their number of mutations.

    'the 11 most influential neighbors of each gene in the network as
    determined by network influence distance were used'
    'Only mutation data generated using the Illumina GAIIx platform were
    retained for subsequent analysis, and patients with fewer than 10
    mutations were discarded.'

    NOTE(review): this module defines another ``filter_ppi_patients`` further
    down with a different signature; being defined later, that one is the
    definition bound to the name once the module is loaded.

    Parameters
    ----------
    ppi_total : sparse matrix
        Built from all sparse sub-matrices (AA, ... , CC).

    mut_total : sparse matrix
        Patients' mutation profiles of all genes (rows: patients,
        columns: genes of AA, BB and CC).

    ppi_filt : sparse matrix
        Filtration from ppi_total : only genes in PPI are considered.

    final_influence :
        Smoothed PPI influence matrices based on minimum or maximum weight.

    ngh_max : int
        Number of best influencers in PPI.

    keep_singletons : boolean, default: False
        If True, proteins not annotated in PPI (genes found only in
        patients' mutation profiles) will be also considered.
        If False, only annotated proteins in PPI will be considered.

    min_mutation, max_mutation : int
        Lowest and highest number of mutations per patient; both bounds are
        inclusive (a patient is removed only with fewer than ``min_mutation``
        or more than ``max_mutation`` mutations).

    Returns
    -------
    ppi_final, mut_final : sparse matrix
        PPI and mutation profiles after filtering.
    """
    ppi_ngh = best_neighboors(ppi_filt, final_influence, ngh_max)
    degree = Ppi(ppi_total).deg

    if keep_singletons:
        # Number of proteins whose degree is 0 in the full PPI.
        n_singletons = sum(degree == 0)
        n_ngh = ppi_ngh.shape[0]
        # Pad the neighbour matrix with zero blocks for the singletons.
        ppi_final = sp.bmat([
            [ppi_ngh, sp.csc_matrix((n_ngh, n_singletons))],
            [sp.csc_matrix((n_singletons, n_ngh)),
             sp.csc_matrix((n_singletons, n_singletons))]
        ])  # -> COO matrix
        mut_final = mut_total
    else:
        ppi_final = ppi_ngh
        mut_final = mut_total[:, degree > 0]

    # Inclusive bounds, matching the message printed below; dtype=bool keeps
    # the mask boolean even when there is no patient at all.
    keep_patient = np.array(
        [min_mutation <= k <= max_mutation
         for k in Patient(mut_final).mut_per_patient],
        dtype=bool)
    mut_final = mut_final[keep_patient]

    print(" Removing %i patients with less than %i or more than %i mutations" %
          (mut_total.shape[0] - mut_final.shape[0], min_mutation,
           max_mutation))
    return ppi_final, mut_final
def filter_ppi_patients(result_folder, influence_weight, simplification,
                        alpha, tol, ppi_total, mut_total, ppi_filt,
                        final_influence, ngh_max, keep_singletons=False,
                        min_mutation=0, max_mutation=2000):
    """Keep only the best-influencer connections and filter patients.

    Keeping only the connections with the best influencers and
    filtering some patients based on their number of mutations. The final
    PPI matrix is cached on disk: if a file built with the same parameters
    already exists it is loaded, otherwise the matrix is computed and saved.

    'the 11 most influential neighbors of each gene in the network as
    determined by network influence distance were used'
    'Only mutation data generated using the Illumina GAIIx platform were
    retained for subsequent analysis, and patients with fewer than 10
    mutations were discarded.'

    Parameters
    ----------
    result_folder : str
        Path prefix of the results; 'final_influence/' is appended to it by
        plain string concatenation, so it is expected to end with a path
        separator.

    influence_weight, simplification, alpha, tol :
        Only used here to build the name of the cache file
        (formatted with ``str.format``).

    ppi_total : sparse matrix
        Built from all sparse sub-matrices (AA, ... , CC).

    mut_total : sparse matrix
        Patients' mutation profiles of all genes (rows: patients,
        columns: genes of AA, BB and CC).

    ppi_filt : sparse matrix
        Filtration from ppi_total : only genes in PPI are considered.

    final_influence :
        Smoothed PPI influence matrices based on minimum or maximum weight.

    ngh_max : int
        Number of best influencers in PPI.

    keep_singletons : boolean, default: False
        If True, proteins not annotated in PPI (genes found only in
        patients' mutation profiles) will be also considered.
        If False, only annotated proteins in PPI will be considered.

    min_mutation, max_mutation : int
        Lowest and highest number of mutations per patient (both bounds
        inclusive).

    Returns
    -------
    ppi_final, mut_final : sparse matrix
        PPI and mutation profiles after filtering.
    """
    ppi_final_directory = result_folder + 'final_influence/'
    ppi_final_file = (
        ppi_final_directory +
        'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat'
        .format(influence_weight, simplification, alpha, tol,
                keep_singletons, ngh_max))

    if os.path.exists(ppi_final_file):
        ppi_final = loadmat(ppi_final_file)['ppi_final']
        print(' **** Same parameters file of PPI FINAL already exists')
    else:
        ppi_ngh = best_neighboors(ppi_filt, final_influence, ngh_max)
        if keep_singletons:
            # Number of proteins whose degree is 0 in the full PPI.
            n_singletons = sum(Ppi(ppi_total).deg == 0)
            n_ngh = ppi_ngh.shape[0]
            # Pad the neighbour matrix with zero blocks for the singletons.
            ppi_final = sp.bmat([
                [ppi_ngh, sp.csc_matrix((n_ngh, n_singletons))],
                [sp.csc_matrix((n_singletons, n_ngh)),
                 sp.csc_matrix((n_singletons, n_singletons))]
            ])  # -> COO matrix
        else:
            ppi_final = ppi_ngh

        # savemat does not create missing directories.
        if not os.path.isdir(ppi_final_directory):
            os.makedirs(ppi_final_directory)
        savemat(ppi_final_file, {'ppi_final': ppi_final}, do_compression=True)

    # The mutation matrix does not depend on whether the PPI was cached.
    if keep_singletons:
        mut_final = mut_total
    else:
        mut_final = mut_total[:, Ppi(ppi_total).deg > 0]

    # dtype=bool keeps the mask boolean even when there is no patient at all.
    keep_patient = np.array(
        [min_mutation <= k <= max_mutation
         for k in Patient(mut_final).mut_per_patient],
        dtype=bool)
    mut_final = mut_final[keep_patient]

    print(
        " Removing %i patients with less than %i or more than %i mutations" %
        (mut_total.shape[0] - mut_final.shape[0], min_mutation, max_mutation))
    return ppi_final, mut_final