示例#1
0
def all_genes_in_submatrices(network, idx_ppi, idx_mut, idx_ppi_only,
                             idx_mut_only, mutation_profile):
    """Assemble the full adjacency matrix and the aligned mutation matrix.

    Extracts the sub-matrices of the reference genes from ``network`` and
    zero-pads them so that every gene (common, PPI-only, mutation-only) gets
    one row/column in the resulting adjacency matrix.
    ┌ - - - - - - - ┐
    |       |  |    |
    |   AA  |AB| AC |
    |       |  |    |
    |- - - - - - - -|
    |   BA  |BB| BC |
    |- - - - - - - -|
    |   CA  |CB| CC |
    └ - - - - - - - ┘
    AA, BB and CC are all adjacency matrices (0/1 matrices with 0 on the
    diagonal):
        AA : genes common to the PPI and the patients' mutation profiles
        BB : genes found only in the PPI
        CC : genes found only in the patients' mutation profiles
             (always a zero matrix)
    AC, BC, CA and CB are zero matrices as well; AB and BA are taken from
    ``network``.

    Parameters
    ----------
    network : sparse matrix, shape (len(gene_id_ppi), len(gene_id_ppi))
        PPI data matrix, also called 'adjacency matrix'. It must support
        row and column indexing with index lists.

    idx_ppi : list
        Indexes, in ``network``, of the genes common to both data sets.

    idx_mut : list
        Indexes, in the columns of ``mutation_profile``, of the genes common
        to both data sets (presumably in the same gene order as ``idx_ppi``;
        confirm against the caller).

    idx_ppi_only : list
        Indexes, in ``network``, of the genes present only in the PPI.

    idx_mut_only : list
        Indexes, in the columns of ``mutation_profile``, of the genes present
        only in the patients' mutation profiles.

    mutation_profile : sparse matrix
        Patients' mutation profiles (rows: patients, columns: genes).

    Returns
    -------
    ppi_total : sparse matrix (CSC)
        Built from all sparse sub-matrices (AA, ... , CC).

    mut_total : sparse matrix (CSC)
        Patients' mutation profiles of all genes (rows: patients,
        columns: genes of AA, BB and CC, in that order). The BB columns are
        all zero.

    ppi_filt : sparse matrix
        ``ppi_total`` restricted to the genes whose degree, as reported by
        ``Ppi(ppi_total).deg``, is strictly positive.
        ┌ - - - - -┐
        |       |  |
        |   AA  |AB|
        |       |  |
        |- - - - - |
        |   BA  |BB|
        └ - - - - -┘

    mut_filt : sparse matrix
        ``mut_total`` restricted to the same genes (columns) as ``ppi_filt``.

    Warns
    -----
    UserWarning
        If there is no gene common to the PPI and the mutation profiles.
    """
    print(' ==== all_genes_in_submatrices ')
    n_common = len(idx_ppi)
    n_ppi_only = len(idx_ppi_only)
    n_mut_only = len(idx_mut_only)

    # Slice the rows once per gene group, then pick the columns.
    rows_common = network[idx_ppi]
    rows_ppi_only = network[idx_ppi_only]

    AA = rows_common[:, idx_ppi]
    if AA.shape[0] == 0:
        warnings.warn("There are no common genes between PPI network and patients' mutation profile")
    AB = rows_common[:, idx_ppi_only]
    AC = sp.csc_matrix((n_common, n_mut_only), dtype=np.float32)
    BA = rows_ppi_only[:, idx_ppi]
    BB = rows_ppi_only[:, idx_ppi_only]
    BC = sp.csc_matrix((n_ppi_only, n_mut_only), dtype=np.float32)

    # TODO condition: if mutOnly = 0
    CA = sp.csc_matrix((n_mut_only, n_common), dtype=np.float32)
    CB = sp.csc_matrix((n_mut_only, n_ppi_only), dtype=np.float32)
    CC = sp.csc_matrix((n_mut_only, n_mut_only), dtype=np.float32)

    print(' ==== ABC  ')
    ppi_total = sp.bmat([[AA, AB, AC], [BA, BB, BC], [CA, CB, CC]],
                        format='csc')

    print(' ==== mut_total  ')
    # Request CSC explicitly: without ``format`` bmat may return a COO matrix
    # (depending on the block formats), and COO cannot be indexed with the
    # boolean column mask used below.
    no_mutation_ppi_only = sp.csc_matrix(
        (mutation_profile.shape[0], n_ppi_only), dtype=np.float32)
    mut_total = sp.bmat([[mutation_profile[:, idx_mut],
                          no_mutation_ppi_only,
                          mutation_profile[:, idx_mut_only]]],
                        format='csc')

    # filter only genes in PPI
    print(' ==== filter only genes in PPI  ')
    in_ppi = Ppi(ppi_total).deg > 0
    ppi_filt = ppi_total[in_ppi, :][:, in_ppi]
    mut_filt = mut_total[:, in_ppi]
    print(' ==== all_genes_in_submatrices finish  ')
    return ppi_total, mut_total, ppi_filt, mut_filt
示例#2
0
def filter_ppi_patients(ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
                        keep_singletons=False,
                        min_mutation=0, max_mutation=2000):
    """Keep only the best-influencer connections and filter patients by
    mutation count.

    'the 11 most influential neighbors of each gene in the network as
    determined by network influence distance were used'
    'Only mutation data generated using the Illumina GAIIx platform were
    retained for subsequent analysis, and patients with fewer than 10
    mutations were discarded.'

    Parameters
    ----------
    ppi_total : sparse matrix
        Built from all sparse sub-matrices (AA, ... , CC).

    mut_total : sparse matrix
        Patients' mutation profiles of all genes (rows: patients,
        columns: genes of AA, BB and CC).

    ppi_filt : sparse matrix
        Filtration from ppi_total : only genes in PPI are considered.

    final_influence :
        Smoothed PPI influence matrices based on minimum or maximum weight.

    ngh_max : int
        Number of best influencers in PPI.

    keep_singletons : boolean, default: False
        If True, proteins not annotated in PPI (genes found only in patients'
        mutation profiles) are also considered: the neighbour matrix is
        zero-padded with one row/column per zero-degree gene and all columns
        of ``mut_total`` are kept.
        If False, only annotated proteins in PPI are considered.

    min_mutation, max_mutation : int
        Lowest and highest number of mutations a patient may have to be
        kept. Both bounds are inclusive.

    Returns
    -------
    ppi_final, mut_final : sparse matrix
        PPI and mutation profiles after filtering.
    """
    ppi_ngh = best_neighboors(ppi_filt, final_influence, ngh_max)
    degree = Ppi(ppi_total).deg

    if keep_singletons:
        n_ngh = ppi_ngh.shape[0]
        n_singletons = int(np.count_nonzero(degree == 0))
        # Zero-pad the neighbour matrix so that the zero-degree genes get
        # (empty) rows and columns after the connected ones.
        # NOTE(review): mut_total is used as is, i.e. its columns are not
        # reordered to put zero-degree genes last - confirm the gene order of
        # ppi_final and mut_final really matches.
        ppi_final = sp.bmat([
            [ppi_ngh, sp.csc_matrix((n_ngh, n_singletons))],
            [sp.csc_matrix((n_singletons, n_ngh)),
             sp.csc_matrix((n_singletons, n_singletons))]
            ])
        mut_final = mut_total
    else:
        ppi_final = ppi_ngh
        mut_final = mut_total[:, degree > 0]

    # Inclusive bounds: a patient is removed only when it has strictly less
    # than min_mutation or strictly more than max_mutation mutations, which is
    # what the message below reports.
    kept_patients = np.array(
        [min_mutation <= k <= max_mutation
         for k in Patient(mut_final).mut_per_patient],
        dtype=bool)
    mut_final = mut_final[kept_patients]

    print(" Removing %i patients with less than %i or more than %i mutations" %
          (mut_total.shape[0]-mut_final.shape[0], min_mutation, max_mutation))

    return ppi_final, mut_final
示例#3
0
def filter_ppi_patients(result_folder,
                        influence_weight,
                        simplification,
                        alpha,
                        tol,
                        ppi_total,
                        mut_total,
                        ppi_filt,
                        final_influence,
                        ngh_max,
                        keep_singletons=False,
                        min_mutation=0,
                        max_mutation=2000):
    """Keep only the best-influencer connections and filter patients by
    mutation count, caching the resulting PPI matrix on disk.

    'the 11 most influential neighbors of each gene in the network as
    determined by network influence distance were used'
    'Only mutation data generated using the Illumina GAIIx platform were
    retained for subsequent analysis, and patients with fewer than 10
    mutations were discarded.'

    The final PPI matrix is stored in
    ``result_folder + 'final_influence/'`` as a ``.mat`` file whose name
    encodes ``influence_weight``, ``simplification``, ``alpha``, ``tol``,
    ``keep_singletons`` and ``ngh_max``. If that file already exists it is
    loaded instead of being recomputed.

    Parameters
    ----------
    result_folder : str
        Prefix of the output location; it is concatenated as is with
        ``'final_influence/'``, so it should end with a path separator.

    influence_weight, simplification, alpha, tol :
        Parameters of the upstream influence computation. Here they are only
        used to build the cache file name.

    ppi_total : sparse matrix
        Built from all sparse sub-matrices (AA, ... , CC).

    mut_total : sparse matrix
        Patients' mutation profiles of all genes (rows: patients,
        columns: genes of AA, BB and CC).

    ppi_filt : sparse matrix
        Filtration from ppi_total : only genes in PPI are considered.

    final_influence :
        Smoothed PPI influence matrices based on minimum or maximum weight.

    ngh_max : int
        Number of best influencers in PPI.

    keep_singletons : boolean, default: False
        If True, proteins not annotated in PPI (genes found only in patients'
        mutation profiles) are also considered: the neighbour matrix is
        zero-padded with one row/column per zero-degree gene and all columns
        of ``mut_total`` are kept.
        If False, only annotated proteins in PPI are considered.

    min_mutation, max_mutation : int
        Lowest and highest number of mutations a patient may have to be
        kept. Both bounds are inclusive.

    Returns
    -------
    ppi_final, mut_final : sparse matrix
        PPI and mutation profiles after filtering.
    """
    ppi_final_directory = result_folder + 'final_influence/'
    ppi_final_file = (
        ppi_final_directory +
        'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat'.
        format(influence_weight, simplification, alpha, tol, keep_singletons,
               ngh_max))

    degree = Ppi(ppi_total).deg

    if os.path.exists(ppi_final_file):
        ppi_final = loadmat(ppi_final_file)['ppi_final']
        print(' **** Same parameters file of PPI FINAL already exists')
    else:
        ppi_ngh = best_neighboors(ppi_filt, final_influence, ngh_max)

        if keep_singletons:
            n_ngh = ppi_ngh.shape[0]
            n_singletons = int(np.count_nonzero(degree == 0))
            # Zero-pad the neighbour matrix so that the zero-degree genes get
            # (empty) rows and columns after the connected ones.
            ppi_final = sp.bmat(
                [[ppi_ngh, sp.csc_matrix((n_ngh, n_singletons))],
                 [sp.csc_matrix((n_singletons, n_ngh)),
                  sp.csc_matrix((n_singletons, n_singletons))]])
        else:
            ppi_final = ppi_ngh

        # The cache directory may not exist yet on a fresh result folder;
        # without this savemat fails after the expensive computation.
        os.makedirs(ppi_final_directory, exist_ok=True)
        savemat(ppi_final_file, {'ppi_final': ppi_final}, do_compression=True)

    # The mutation matrix is never cached: it is derived from mut_total the
    # same way whether ppi_final was loaded or computed.
    # NOTE(review): with keep_singletons the columns of mut_total are not
    # reordered to put zero-degree genes last - confirm the gene order of
    # ppi_final and mut_final really matches.
    if keep_singletons:
        mut_final = mut_total
    else:
        mut_final = mut_total[:, degree > 0]

    # Keep the patients whose mutation count lies in
    # [min_mutation, max_mutation].
    kept_patients = np.array(
        [min_mutation <= k <= max_mutation
         for k in Patient(mut_final).mut_per_patient],
        dtype=bool)
    mut_final = mut_final[kept_patients]

    print(
        " Removing %i patients with less than %i or more than %i mutations" %
        (mut_total.shape[0] - mut_final.shape[0], min_mutation, max_mutation))

    return ppi_final, mut_final