Exemplo n.º 1
0
def all_functions(params):
    """Run every pipeline stage once for the current module-level settings.

    The stages are executed in the order announced by the banners printed
    below: load_data, formatting_data, filtering_diffusion, clustering and
    hierarchical_clustering.

    Args:
        params: Accepted for interface compatibility but never read here.
            Every setting used below (``alpha``, ``qn``, ``patient_data``,
            ``ppi_data``, ``data_folder``, ``n_components``, ``lambd``, ...)
            is a free name resolved from the enclosing module scope.

    Returns:
        None. The stage functions receive ``result_folder``; presumably they
        persist their outputs there -- confirm against those modules.

    Raises:
        ValueError: If ``patient_data`` is not 'TCGA_UCEC', 'Faroe' or 'SSC'.
    """
    # This parameter combination is skipped outright.
    # NOTE(review): presumably because quantile normalisation (qn) is
    # meaningless without propagation (alpha == 0) -- confirm.
    if alpha == 0 and qn is not None:
        print('############ PASS ############')
        return

    if patient_data == 'SSC':
        result_folder = (data_folder + 'result_' + ssc_mutation_data + '_' +
                         ssc_subgroups + '_' + gene_data + '_' + ppi_data +
                         '/')
    else:
        result_folder = (data_folder + 'result_' + patient_data + '_' +
                         ppi_data + '/')
    print(result_folder, flush=True)
    print("alpha =", alpha, flush=True)
    print("QN =", qn, flush=True)
    print("k =", n_components, flush=True)
    print("lambda =", lambd, flush=True)
    print("PPI network =", ppi_data, flush=True)

    # ------------ load_data.py ------------
    print("------------ load_data.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    if patient_data == 'TCGA_UCEC':
        (patient_id, mutation_profile, gene_id_patient,
         gene_symbol_profile) = load_data.load_TCGA_UCEC_patient_data(
             data_folder)
    elif patient_data == 'Faroe':
        mutation_profile, gene_id_patient = (
            load_data.load_Faroe_Islands_data(data_folder))
    elif patient_data == 'SSC':
        mutation_profile, gene_id_patient, patient_id = (
            load_data.load_specific_SSC_mutation_profile(
                data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
    else:
        # Fail before any loading instead of hitting an UnboundLocalError on
        # mutation_profile further down.
        raise ValueError(
            "Unknown patient_data {!r}: expected 'TCGA_UCEC', 'Faroe' or "
            "'SSC'".format(patient_data))

    if ppi_data == 'Hofree_STRING':
        gene_id_ppi, network = load_data.load_Hofree_PPI_String(
            data_folder, ppi_data)
    else:
        gene_id_ppi, network = load_data.load_PPI_network(
            data_folder, ppi_data)

    # ------------ formatting_data.py ------------
    print("------------ formatting_data.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
     idx_mut_only) = formatting_data.classify_gene_index(
         network, mutation_profile, gene_id_ppi, gene_id_patient)

    ppi_total, mut_total, ppi_filt, mut_filt = (
        formatting_data.all_genes_in_submatrices(
            network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only,
            mutation_profile))

    # ------------ filtering_diffusion.py ------------
    print("------------ filtering_diffusion.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    # The first argument is a float32 sparse identity matrix sized like
    # ppi_filt.
    final_influence = filtering_diffusion.calcul_final_influence(
        sp.eye(ppi_filt.shape[0], dtype=np.float32), ppi_filt, result_folder,
        influence_weight, simplification, compute, overwrite, alpha, tol)

    ppi_final, mut_final = filtering_diffusion.filter_ppi_patients(
        ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
        keep_singletons, min_mutation, max_mutation)

    mut_type, mut_propag = filtering_diffusion.propagation_profile(
        mut_final, ppi_filt, result_folder, alpha, tol, qn)

    # ------------ clustering.py ------------
    print("------------ clustering.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    genes_clustering, patients_clustering = clustering.bootstrap(
        result_folder, mut_type, mut_propag, ppi_final, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, run_bootstrap, lambd,
        tol_nmf, compute_gene_clustering)

    distance_genes, distance_patients = clustering.consensus_clustering(
        result_folder, genes_clustering, patients_clustering,
        influence_weight, simplification, mut_type, alpha, tol,
        keep_singletons, ngh_max, min_mutation, max_mutation, n_components,
        n_permutations, run_consensus, lambd, tol_nmf,
        compute_gene_clustering)

    # ------------ hierarchical_clustering.py ------------
    print("------------ hierarchical_clustering.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    hierarchical_clustering.distance_patients_from_consensus_file(
        result_folder, distance_patients, ppi_data, mut_type,
        influence_weight, simplification, alpha, tol, keep_singletons,
        ngh_max, min_mutation, max_mutation, n_components, n_permutations,
        lambd, tol_nmf, linkage_method, patient_data, data_folder,
        ssc_subgroups, ssc_mutation_data, gene_data)

    (total_cluster_list, probands_cluster_list, siblings_cluster_list,
     male_cluster_list, female_cluster_list, iq_cluster_list, distCEU_list,
     mutation_nb_cluster_list,
     text_file) = hierarchical_clustering.get_lists_from_clusters(
         data_folder, patient_data, ssc_mutation_data, ssc_subgroups,
         ppi_data, gene_data, result_folder, mut_type, influence_weight,
         simplification, alpha, tol, keep_singletons, ngh_max,
         min_mutation, max_mutation, n_components, n_permutations, lambd,
         tol_nmf, linkage_method)

    hierarchical_clustering.bio_statistics(
        n_components, total_cluster_list, probands_cluster_list,
        siblings_cluster_list, male_cluster_list, female_cluster_list,
        iq_cluster_list, distCEU_list, mutation_nb_cluster_list, text_file)
Exemplo n.º 2
0
def all_functions(params):
    """Run every pipeline stage once for the current module-level settings.

    The stages are executed in the order announced by the banners printed
    below: load_data, formatting_data, filtering_diffusion, clustering and
    hierarchical_clustering (including the per-cluster gene extraction at
    the end).

    Args:
        params: Accepted for interface compatibility but never read here.
            Every setting used below (``alpha``, ``qn``, ``patient_data``,
            ``ppi_data``, ``data_folder``, ``n_components``, ``lambd``, ...)
            is a free name resolved from the enclosing module scope.

    Returns:
        None. The stage functions receive ``result_folder``; presumably they
        persist their outputs there -- confirm against those modules.

    Raises:
        ValueError: If ``patient_data`` is not 'TCGA_UCEC', 'Faroe' or 'SSC'.
    """
    # This parameter combination is skipped outright.
    # NOTE(review): presumably because quantile normalisation (qn) is
    # meaningless without propagation (alpha == 0) -- confirm.
    if alpha == 0 and qn is not None:
        print('############ PASS ############')
        return

    if patient_data == 'SSC':
        result_folder = (data_folder + 'result_' + ssc_mutation_data + '_' +
                         ssc_subgroups + '_' + gene_data + '_' + ppi_data +
                         '/')
    else:
        result_folder = (data_folder + 'result_' + patient_data + '_' +
                         ppi_data + '/')
    print(result_folder, flush=True)
    print("alpha =", alpha, flush=True)
    print("QN =", qn, flush=True)
    print("k =", n_components, flush=True)
    print("lambda =", lambd, flush=True)
    print("PPI network =", ppi_data, flush=True)

    # ------------ load_data.py ------------
    print("------------ load_data.py ------------ {}".format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), flush=True)
    if patient_data == 'TCGA_UCEC':
        (patient_id, mutation_profile, gene_id_patient,
         gene_symbol_profile) = load_data.load_TCGA_UCEC_patient_data(
             data_folder)
    elif patient_data == 'Faroe':
        mutation_profile, gene_id_patient = (
            load_data.load_Faroe_Islands_data(data_folder))
    elif patient_data == 'SSC':
        mutation_profile, gene_id_patient, patient_id = (
            load_data.load_specific_SSC_mutation_profile(
                data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
    else:
        # Fail before any loading instead of hitting an UnboundLocalError on
        # mutation_profile further down.
        raise ValueError(
            "Unknown patient_data {!r}: expected 'TCGA_UCEC', 'Faroe' or "
            "'SSC'".format(patient_data))

    if ppi_data == 'Hofree_STRING':
        gene_id_ppi, network = load_data.load_Hofree_PPI_String(
            data_folder, ppi_data)
    else:
        gene_id_ppi, network = load_data.load_PPI_network(
            data_folder, ppi_data)

    # ------------ formatting_data.py ------------
    print("------------ formatting_data.py ------------ {}".format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), flush=True)
    (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
     idx_mut_only) = formatting_data.classify_gene_index(
         network, mutation_profile, gene_id_ppi, gene_id_patient)

    ppi_total, mut_total, ppi_filt, mut_filt = (
        formatting_data.all_genes_in_submatrices(
            network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only,
            mutation_profile))

    # ------------ filtering_diffusion.py ------------
    print("------------ filtering_diffusion.py ------------ {}".format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), flush=True)
    # The first argument is a float32 sparse identity matrix sized like
    # ppi_filt.
    final_influence = filtering_diffusion.calcul_final_influence(
        sp.eye(ppi_filt.shape[0], dtype=np.float32), ppi_filt, result_folder,
        influence_weight, simplification, compute, overwrite, alpha, tol)

    ppi_final, mut_final = filtering_diffusion.filter_ppi_patients(
        ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
        keep_singletons, min_mutation, max_mutation)

    mut_type, mut_propag = filtering_diffusion.propagation_profile(
        mut_final, ppi_filt, result_folder, alpha, tol, qn)

    # ------------ clustering.py ------------
    print("------------ clustering.py ------------ {}".format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), flush=True)
    genes_clustering, patients_clustering = clustering.bootstrap(
        result_folder, mut_type, mut_propag, ppi_final, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, run_bootstrap, lambd,
        tol_nmf, compute_gene_clustering)

    distance_genes, distance_patients = clustering.consensus_clustering(
        result_folder, genes_clustering, patients_clustering,
        influence_weight, simplification, mut_type, alpha, tol,
        keep_singletons, ngh_max, min_mutation, max_mutation, n_components,
        n_permutations, run_consensus, lambd, tol_nmf,
        compute_gene_clustering)

    # ------------ hierarchical_clustering.py ------------
    print("------------ hierarchical_clustering.py ------------ {}".format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), flush=True)

    # The distance matrices are handed over in memory, straight from the
    # consensus step above.
    hierarchical_clustering.distances_from_consensus_file(
        result_folder, distance_genes, distance_patients, ppi_data, mut_type,
        influence_weight, simplification, alpha, tol, keep_singletons,
        ngh_max, min_mutation, max_mutation, n_components, n_permutations,
        lambd, tol_nmf, linkage_method, patient_data, data_folder,
        ssc_subgroups, ssc_mutation_data, gene_data)

    (total_cluster_list, probands_cluster_list, siblings_cluster_list,
     male_cluster_list, female_cluster_list, iq_cluster_list, distCEU_list,
     mutation_nb_cluster_list,
     text_file) = hierarchical_clustering.get_lists_from_clusters(
         data_folder, patient_data, ssc_mutation_data, ssc_subgroups,
         ppi_data, gene_data, result_folder, mut_type, influence_weight,
         simplification, alpha, tol, keep_singletons, ngh_max,
         min_mutation, max_mutation, n_components, n_permutations, lambd,
         tol_nmf, linkage_method)

    hierarchical_clustering.bio_statistics(
        n_components, total_cluster_list, probands_cluster_list,
        siblings_cluster_list, male_cluster_list, female_cluster_list,
        iq_cluster_list, distCEU_list, mutation_nb_cluster_list, text_file)

    hierarchical_clustering.get_entrezgene_from_cluster(
        data_folder, result_folder, ssc_mutation_data, patient_data,
        ssc_subgroups, alpha, n_components, ngh_max, n_permutations, lambd,
        influence_weight, simplification, tol, keep_singletons, min_mutation,
        max_mutation, tol_nmf, linkage_method, gene_data, ppi_data,
        gene_id_ppi, idx_ppi, idx_ppi_only, mut_type)
Exemplo n.º 3
0
def merge_2_subgroups(data_folder, ssc_mutation_data, ssc_subgroups, gene_data,
                      ppi_data, n_components, mut_type, alpha, ngh_max,
                      n_permutations, lambd, O, gene2go, min_category_size,
                      max_category_size, max_category_depth):
    """Build the SSC1 and SSC2 result tables, combine them and pickle both.

    For each of the two hard-coded subgroups the mutation profile and the PPI
    network are loaded, a background is generated from the genes indexed by
    ``idx_mut`` and passed to ``goenrich.enrich.propagate`` on ``O``, and
    ``p_df_by_subgroup`` returns an enrichment frame and a p-count frame.
    The enrichment frames are concatenated (with 'ind_group' and 'k' moved to
    the front); the p-count frames are inner-joined on 'k'.

    Note that the ``ssc_subgroups`` argument is never read: the labels 'SSC1'
    and 'SSC2' are used regardless of what the caller passes.

    Output files (pickles) are written under
    ``<data_folder>result_biostat_genes_GOslim/<ssc_mutation_data>_<gene_data>/``,
    which is created if missing.

    Returns:
        None.
    """
    enrich_frames = []
    pcount_frames = []

    for subgroup in ('SSC1', 'SSC2'):
        print(subgroup)
        mutation_profile, gene_id_patient, patient_id = (
            load_data.load_specific_SSC_mutation_profile(
                data_folder, ssc_mutation_data, subgroup, gene_data))

        # The network is reloaded for every subgroup, as classify_gene_index
        # hands back its own version of it.
        gene_id_ppi, network = load_data.load_PPI_network(
            data_folder, ppi_data)

        (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
         idx_mut_only) = formatting_data.classify_gene_index(
             network, mutation_profile, gene_id_ppi, gene_id_patient)

        # idx_mut: indexes of the common genes in the patients' mutation
        # profiles.
        common_gene_ids = [gene_id_patient[i] for i in idx_mut]
        reference = pd.DataFrame({'GeneID': common_gene_ids})
        background = tools.generate_background(
            gene2go, reference, 'GO_ID', 'GeneID')
        # Return value is discarded; presumably O is annotated in place --
        # confirm against goenrich.
        goenrich.enrich.propagate(O, background, 'reference')

        subgroup_enrich, subgroup_pcount = p_df_by_subgroup(
            data_folder, ssc_mutation_data, subgroup, gene_data, ppi_data,
            n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O,
            min_category_size, max_category_size, max_category_depth)
        enrich_frames.append(subgroup_enrich)
        pcount_frames.append(subgroup_pcount)

    # ---------------- merge ----------------
    df_enrich = pd.concat(enrich_frames, ignore_index=True)
    # Move 'ind_group' and 'k' to the front, keeping the other columns in
    # their existing order.
    remaining = list(df_enrich)
    remaining.remove('k')
    remaining.remove('ind_group')
    df_enrich = df_enrich.loc[:, ['ind_group', 'k'] + remaining]

    first_pcount, second_pcount = pcount_frames
    df_pcount = first_pcount.merge(second_pcount, how='inner', left_on='k',
                                   right_on='k', suffixes=[1, 2])

    directory = (data_folder + 'result_biostat_genes_GOslim/' +
                 ssc_mutation_data + '_' + gene_data + '/')
    os.makedirs(directory, exist_ok=True)

    # save
    file_name_enrich = 'GOslim_{}_lambd={}_{}.pkl'.format(
        mut_type, lambd, ppi_data)
    file_name_pcount = 'p_count_{}_lambd={}_{}.pkl'.format(
        mut_type, lambd, ppi_data)
    df_enrich.to_pickle(directory + file_name_enrich)
    df_pcount.to_pickle(directory + file_name_pcount)
Exemplo n.º 4
0
def all_functions(params):
    """Run every pipeline stage once for the current module-level settings.

    The stages are executed in the order announced by the banners printed
    below: load_data, formatting_data, filtering_diffusion, clustering and
    hierarchical_clustering.

    Args:
        params: Accepted for interface compatibility but never read here.
            Every setting used below (``alpha``, ``qn``, ``patient_data``,
            ``ppi_data``, ``data_folder``, ``n_components``, ``lambd``, ...)
            is a free name resolved from the enclosing module scope.

    Returns:
        None. The stage functions receive ``result_folder``; presumably they
        persist their outputs there -- confirm against those modules.

    Raises:
        ValueError: If ``patient_data`` is not 'TCGA_UCEC' or 'Faroe', or if
            ``ppi_data`` is not 'STRING' or 'Y2H'.
    """
    # This parameter combination is skipped outright.
    # NOTE(review): presumably because quantile normalisation (qn) is
    # meaningless without propagation (alpha == 0) -- confirm.
    if alpha == 0 and qn is not None:
        print('############ PASS ############')
        return

    result_folder = (data_folder + 'result_' + patient_data + '_' +
                     ppi_data + '/')
    print(result_folder)
    print("alpha =", alpha)
    print("QN =", qn)
    print("k =", n_components)
    print("lambda =", lambd)
    print("PPI network =", ppi_data)

    # ------------ load_data.py ------------
    print("------------ load_data.py ------------")
    # Validate both selectors before loading anything: neither chain below
    # has a fallback loader, so an unknown value would otherwise only show
    # up later as an UnboundLocalError.
    if patient_data not in ('TCGA_UCEC', 'Faroe'):
        raise ValueError(
            "Unknown patient_data {!r}: expected 'TCGA_UCEC' or "
            "'Faroe'".format(patient_data))
    if ppi_data not in ('STRING', 'Y2H'):
        raise ValueError(
            "Unknown ppi_data {!r}: expected 'STRING' or 'Y2H'".format(
                ppi_data))

    if patient_data == 'TCGA_UCEC':
        (patient_id, mutation_profile, gene_id_patient,
         gene_symbol_profile) = load_data.load_TCGA_UCEC_patient_data(
             data_folder)
    else:  # 'Faroe'
        mutation_profile, gene_id_patient = (
            load_data.load_Faroe_Islands_data(data_folder))

    if ppi_data == 'STRING':
        gene_id_ppi, network = load_data.load_PPI_String(
            data_folder, ppi_data)
    else:  # 'Y2H'
        gene_id_ppi, network = load_data.load_PPI_Y2H(
            data_folder, ppi_data)

    # ------------ formatting_data.py ------------
    print("------------ formatting_data.py ------------")
    (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
     idx_mut_only) = formatting_data.classify_gene_index(
         network, mutation_profile, gene_id_ppi, gene_id_patient)

    ppi_total, mut_total, ppi_filt, mut_filt = (
        formatting_data.all_genes_in_submatrices(
            network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only,
            mutation_profile))

    # ------------ filtering_diffusion.py ------------
    print("------------ filtering_diffusion.py ------------")
    # The first argument is a float32 sparse identity matrix sized like
    # ppi_filt.
    final_influence = filtering_diffusion.calcul_final_influence(
        sp.eye(ppi_filt.shape[0], dtype=np.float32), ppi_filt, result_folder,
        influence_weight, simplification, compute, overwrite, alpha, tol)

    ppi_final, mut_final = filtering_diffusion.filter_ppi_patients(
        ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
        keep_singletons, min_mutation, max_mutation)

    mut_type, mut_propag = filtering_diffusion.propagation_profile(
        mut_final, ppi_filt, alpha, tol, qn)

    # ------------ clustering.py ------------
    print("------------ clustering.py ------------")
    genes_clustering, patients_clustering = clustering.bootstrap(
        result_folder, mut_type, mut_propag, ppi_final, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, run_bootstrap, lambd,
        tol_nmf)

    distance_genes, distance_patients = clustering.consensus_clustering(
        result_folder, genes_clustering, patients_clustering,
        influence_weight, simplification, mut_type, alpha, tol,
        keep_singletons, ngh_max, min_mutation, max_mutation, n_components,
        n_permutations, run_consensus, lambd, tol_nmf)

    # ------------ hierarchical_clustering.py ------------
    print("------------ hierarchical_clustering.py ------------")
    # The patient distance matrix is handed over in memory, straight from the
    # consensus step above.
    hierarchical_clustering.distance_patients_from_consensus_file(
        result_folder, distance_patients, ppi_data, mut_type,
        influence_weight, simplification, alpha, tol, keep_singletons,
        ngh_max, min_mutation, max_mutation, n_components, n_permutations,
        lambd, tol_nmf, linkage_method)
Exemplo n.º 5
0
def all_functions(params):
    """Run every pipeline stage once, then the biostat analyses.

    The stages are executed in the order announced by the banners printed
    below: load_data, formatting_data, filtering_diffusion, clustering,
    hierarchical_clustering and biostat.

    Args:
        params: Mapping of run settings. ``params['muttype']`` is required
            and seeds ``mut_type``. ``params['alpha']`` is used when present;
            otherwise the module-level ``alpha`` is read. All remaining
            settings (``patient_data``, ``ppi_data``, ``data_folder``,
            ``n_components``, ``lambd``, ``qn``, ...) are free names resolved
            from the enclosing module scope.

    Returns:
        None. The stage functions receive ``result_folder``; presumably they
        persist their outputs there -- confirm against those modules.

    Raises:
        KeyError: If ``params`` has no 'muttype' entry, or if ``alpha`` is
            defined neither in ``params`` nor at module level.
        ValueError: If ``patient_data`` is not 'TCGA_UCEC', 'Faroe' or 'SSC'.
    """
    # Imported locally so the timestamps below work whether the surrounding
    # file does ``import datetime`` or ``from datetime import datetime``.
    from datetime import datetime

    # mut_type and alpha are both rebound further down, which makes them
    # local names for the whole function; they must therefore be bound here
    # before initiation() can read them.
    mut_type = params['muttype']
    # NOTE(review): the 'alpha' key is an assumption about the params schema;
    # the fallback keeps the module-level setting the other drivers rely on.
    if 'alpha' in params:
        alpha = params['alpha']
    else:
        alpha = globals()['alpha']

    alpha, result_folder = initiation(mut_type, alpha, patient_data,
                                      data_folder, ssc_mutation_data,
                                      ssc_subgroups, gene_data, ppi_data,
                                      lambd, n_components)

    print(result_folder, flush=True)
    print("mutation type =", mut_type, flush=True)
    print("alpha =", alpha, flush=True)
    print("k =", n_components, flush=True)
    print("lambda =", lambd, flush=True)
    print("PPI network =", ppi_data, flush=True)

    # ------------ load_data.py ------------
    print("------------ load_data.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    if patient_data == 'TCGA_UCEC':
        (patient_id, mutation_profile, gene_id_patient,
         gene_symbol_profile) = load_data.load_TCGA_UCEC_patient_data(
             data_folder)
    elif patient_data == 'Faroe':
        mutation_profile, gene_id_patient = (
            load_data.load_Faroe_Islands_data(data_folder))
    elif patient_data == 'SSC':
        mutation_profile, gene_id_patient, patient_id = (
            load_data.load_specific_SSC_mutation_profile(
                data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
    else:
        # Fail before any loading instead of hitting an UnboundLocalError on
        # mutation_profile further down.
        raise ValueError(
            "Unknown patient_data {!r}: expected 'TCGA_UCEC', 'Faroe' or "
            "'SSC'".format(patient_data))

    if ppi_data == 'Hofree_STRING':
        gene_id_ppi, network = load_data.load_Hofree_PPI_String(
            data_folder, ppi_data)
    else:
        gene_id_ppi, network = load_data.load_PPI_network(
            data_folder, ppi_data)

    # ------------ formatting_data.py ------------
    print("------------ formatting_data.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
     idx_mut_only) = formatting_data.classify_gene_index(
         network, mutation_profile, gene_id_ppi, gene_id_patient)

    ppi_total, mut_total, ppi_filt, mut_filt = (
        formatting_data.all_genes_in_submatrices(
            network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only,
            mutation_profile))

    # ------------ filtering_diffusion.py ------------
    print("------------ filtering_diffusion.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    # The first argument is a float32 sparse identity matrix sized like
    # ppi_filt.
    final_influence = filtering_diffusion.calcul_final_influence(
        sp.eye(ppi_filt.shape[0], dtype=np.float32), ppi_filt, result_folder,
        influence_weight, simplification, compute, overwrite, alpha, tol)

    ppi_final, mut_final = filtering_diffusion.filter_ppi_patients(
        ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
        keep_singletons, min_mutation, max_mutation)

    # From here on mut_type is whatever propagation_profile reports, not the
    # value taken from params.
    mut_type, mut_propag = filtering_diffusion.propagation_profile(
        mut_final, ppi_filt, result_folder, alpha, tol, qn)

    # ------------ clustering.py ------------
    print("------------ clustering.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    genes_clustering, patients_clustering = clustering.bootstrap(
        result_folder, mut_type, mut_propag, ppi_final, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, run_bootstrap, lambd,
        tol_nmf, compute_gene_clustering)

    distance_genes, distance_patients = clustering.consensus_clustering(
        result_folder, genes_clustering, patients_clustering, influence_weight,
        simplification, mut_type, alpha, tol, keep_singletons, ngh_max,
        min_mutation, max_mutation, n_components, n_permutations,
        run_consensus, lambd, tol_nmf, compute_gene_clustering)

    # ------------ hierarchical_clustering.py ------------
    print("------------ hierarchical_clustering.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    hierarchical_clustering.distance_patients_from_consensus_file(
        result_folder, distance_patients, ppi_data, mut_type, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, lambd, tol_nmf,
        linkage_method, patient_data, data_folder, ssc_subgroups,
        ssc_mutation_data, gene_data)

    (total_cluster_list, probands_cluster_list, siblings_cluster_list,
     male_cluster_list, female_cluster_list, iq_cluster_list, distCEU_list,
     mutation_nb_cluster_list,
     text_file) = hierarchical_clustering.get_lists_from_clusters(
         data_folder, patient_data, ssc_mutation_data, ssc_subgroups, ppi_data,
         gene_data, result_folder, mut_type, influence_weight, simplification,
         alpha, tol, keep_singletons, ngh_max, min_mutation, max_mutation,
         n_components, n_permutations, lambd, tol_nmf, linkage_method)

    hierarchical_clustering.bio_statistics(
        n_components, total_cluster_list, probands_cluster_list,
        siblings_cluster_list, male_cluster_list, female_cluster_list,
        iq_cluster_list, distCEU_list, mutation_nb_cluster_list, text_file)

    # ------------ biostat.py ------------
    print("\n------------ biostat.py ------------ {}".format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
          flush=True)
    # Kept as a function-level import, exactly where the original had it.
    import biostat

    gene_id_ppi, idx_ppi, idx_ppi_only = preprocessing(
        data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
        ppi_data, result_folder, influence_weight, simplification, compute,
        overwrite, alpha, tol, ngh_max, keep_singletons, min_mutation,
        max_mutation, mut_type)

    biostat.biostat_analysis(
        data_folder, result_folder, patient_data, ssc_mutation_data,
        ssc_subgroups, ppi_data, gene_data, mut_type, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, lambd, tol_nmf,
        linkage_method, p_val_threshold, gene_id_ppi, idx_ppi, idx_ppi_only)

    biostat_go.biostat_go_enrichment(alpha, result_folder, mut_type,
                                     patient_data, data_folder,
                                     ssc_mutation_data, ssc_subgroups,
                                     gene_data, ppi_data, lambd, n_components,
                                     ngh_max, n_permutations)
Exemplo n.º 6
0
def all_functions(params):
    """Run the analysis pipeline once, from data loading to clustering.

    Stages run in this order: load_data, formatting_data,
    filtering_diffusion, clustering (bootstrap then consensus) and
    hierarchical_clustering.

    Every setting used here (``alpha``, ``qn``, ``n_components``, ``lambd``,
    ``data_folder``, ``patient_data``, ``ppi_data``, ...) is read from names
    defined outside this function; ``params`` itself is never read in this
    body.

    Args:
        params: Accepted for the caller's benefit; not used directly here.

    Returns:
        None.

    Raises:
        ValueError: If ``patient_data`` is neither ``'TCGA_UCEC'`` nor
            ``'Faroe'``, or ``ppi_data`` is neither ``'STRING'`` nor
            ``'Y2H'``. Without this check the function would fail later
            with an UnboundLocalError on the variables the loaders were
            supposed to fill in.
    """
    # This combination is skipped outright.
    # NOTE(review): presumably normalisation (qn) makes no sense without
    # diffusion (alpha == 0) -- confirm with the pipeline authors.
    if alpha == 0 and qn is not None:
        print('############ PASS ############')
        return

    result_folder = (data_folder + 'result_' + patient_data + '_' +
                     ppi_data + '/')
    print(result_folder)
    print("alpha =", alpha)
    print("QN =", qn)
    print("k =", n_components)
    print("lambda =", lambd)
    print("PPI network =", ppi_data)

    # ------------ load_data.py ------------
    print("------------ load_data.py ------------")
    if patient_data == 'TCGA_UCEC':
        # The loader also returns patient ids and gene symbols, which this
        # function does not need.
        (_, mutation_profile, gene_id_patient,
         _) = load_data.load_TCGA_UCEC_patient_data(data_folder)
    elif patient_data == 'Faroe':
        mutation_profile, gene_id_patient = (
            load_data.load_Faroe_Islands_data(data_folder))
    else:
        raise ValueError(
            "Unsupported patient_data {!r}: expected 'TCGA_UCEC' or "
            "'Faroe'".format(patient_data))

    if ppi_data == 'STRING':
        gene_id_ppi, network = load_data.load_PPI_String(
            data_folder, ppi_data)
    elif ppi_data == 'Y2H':
        gene_id_ppi, network = load_data.load_PPI_Y2H(
            data_folder, ppi_data)
    else:
        raise ValueError(
            "Unsupported ppi_data {!r}: expected 'STRING' or "
            "'Y2H'".format(ppi_data))

    # ------------ formatting_data.py ------------
    print("------------ formatting_data.py ------------")
    (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
     idx_mut_only) = formatting_data.classify_gene_index(
         network, mutation_profile, gene_id_ppi, gene_id_patient)

    # The fourth returned matrix is not used further down.
    ppi_total, mut_total, ppi_filt, _ = (
        formatting_data.all_genes_in_submatrices(
            network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only,
            mutation_profile))

    # ------------ filtering_diffusion.py ------------
    print("------------ filtering_diffusion.py ------------")
    # An identity matrix sized like ppi_filt is the first argument of the
    # influence computation.
    identity = sp.eye(ppi_filt.shape[0], dtype=np.float32)
    final_influence = filtering_diffusion.calcul_final_influence(
        identity, ppi_filt, result_folder, influence_weight, simplification,
        compute, overwrite, alpha, tol)

    ppi_final, mut_final = filtering_diffusion.filter_ppi_patients(
        ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
        keep_singletons, min_mutation, max_mutation)

    mut_type, mut_propag = filtering_diffusion.propagation_profile(
        mut_final, ppi_filt, alpha, tol, qn)

    # ------------ clustering.py ------------
    print("------------ clustering.py ------------")
    genes_clustering, patients_clustering = clustering.bootstrap(
        result_folder, mut_type, mut_propag, ppi_final, influence_weight,
        simplification, alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, run_bootstrap, lambd,
        tol_nmf)

    # Only the patient distances are passed on to the last stage.
    _, distance_patients = clustering.consensus_clustering(
        result_folder, genes_clustering, patients_clustering,
        influence_weight, simplification, mut_type, alpha, tol,
        keep_singletons, ngh_max, min_mutation, max_mutation, n_components,
        n_permutations, run_consensus, lambd, tol_nmf)

    # ------------ hierarchical_clustering.py ------------
    print("------------ hierarchical_clustering.py ------------")
    hierarchical_clustering.distance_patients_from_consensus_file(
        result_folder, distance_patients, ppi_data, mut_type,
        influence_weight, simplification, alpha, tol, keep_singletons,
        ngh_max, min_mutation, max_mutation, n_components, n_permutations,
        lambd, tol_nmf, linkage_method)
Exemplo n.º 7
0
def all_functions(data_folder, patient_data, ppi_data, influence_weight, simplification,
                 compute, overwrite, alpha, tol, ngh_max, keep_singletons, min_mutation,
                 max_mutation, qn, n_components, n_permutations, run_bootstrap, run_consensus,
                 lambd, tol_nmf, linkage_method):
    """Run one step of StratiPy and print a timestamped banner per stage.

    Stages run in this order: load_data, formatting_data,
    filtering_diffusion, clustering (bootstrap then consensus) and
    hierarchical_clustering. The elapsed wall-clock time is printed at the
    end.

    Notes grounded in the body below:
      * Results go under ``'reproducibility_output/'``; ``data_folder`` is
        only handed to the two loaders.
      * ``patient_data`` and ``ppi_data`` only name the result folder (and
        are forwarded to helpers); the loaders called are always
        ``load_TCGA_UCEC_patient_data`` and ``load_PPI_String``.
      * When ``alpha == 0`` and ``qn is not None`` nothing is computed: a
        PASS line is printed and the function returns.

    Args:
        data_folder: Passed to the patient-data and PPI loaders.
        patient_data, ppi_data: Used to build the result folder name;
            ``ppi_data`` is also forwarded to the PPI loader and to the
            hierarchical clustering step.
        influence_weight, simplification, compute, overwrite, alpha, tol,
        ngh_max, keep_singletons, min_mutation, max_mutation, qn,
        n_components, n_permutations, run_bootstrap, run_consensus, lambd,
        tol_nmf, linkage_method: Forwarded unchanged to the stage helpers.

    Returns:
        None.

    Raises:
        RuntimeError: If the interpreter is older than Python 3.2. (The
            previous code raised a plain string, which is itself a
            TypeError and hid the intended message.)
    """
    if sys.version_info < (3, 2):
        raise RuntimeError("Must be using Python ≥ 3.2")

    start = time.time()

    if alpha == 0 and qn is not None:
        print('######################## PASS ########################')
        return

    def timestamp():
        # Current local time, formatted the same way for every banner.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def banner(stage):
        # Announce the stage that is about to run.
        print("\n------------ {} ------------ {}".format(stage, timestamp()))

    result_folder = ('reproducibility_output/' + 'result_' + patient_data +
                     '_' + ppi_data + '/')
    print('\n######################## Starting StratiPy ########################')
    print("\nGraph regulator factor (lambda) =", lambd)
    print("Permutation number of bootstrap =", n_permutations)

    banner("load_data.py")
    # Patient ids and gene symbols are returned too but not needed here.
    (_, mutation_profile, gene_id_patient,
     _) = load_data.load_TCGA_UCEC_patient_data(data_folder)
    gene_id_ppi, network = load_data.load_PPI_String(data_folder, ppi_data)

    banner("formatting_data.py")
    (network, mutation_profile, idx_ppi, idx_mut, idx_ppi_only,
     idx_mut_only) = formatting_data.classify_gene_index(
         network, mutation_profile, gene_id_ppi, gene_id_patient)

    # The fourth returned matrix is not used further down.
    ppi_total, mut_total, ppi_filt, _ = (
        formatting_data.all_genes_in_submatrices(
            network, idx_ppi, idx_mut, idx_ppi_only, idx_mut_only,
            mutation_profile))

    banner("filtering_diffusion.py")
    # An identity matrix sized like ppi_filt is the first argument of the
    # influence computation.
    identity = sp.eye(ppi_filt.shape[0], dtype=np.float32)
    final_influence = filtering_diffusion.calcul_final_influence(
        identity, ppi_filt, result_folder, influence_weight, simplification,
        compute, overwrite, alpha, tol)

    ppi_final, mut_final = filtering_diffusion.filter_ppi_patients(
        ppi_total, mut_total, ppi_filt, final_influence, ngh_max,
        keep_singletons, min_mutation, max_mutation)

    mut_type, mut_propag = filtering_diffusion.propagation_profile(
        mut_final, ppi_filt, result_folder, alpha, tol, qn)

    banner("clustering.py")
    genes_clustering, patients_clustering = clustering.bootstrap(
        result_folder, mut_type, mut_propag, ppi_final,
        influence_weight, simplification,
        alpha, tol, keep_singletons, ngh_max, min_mutation,
        max_mutation, n_components, n_permutations, run_bootstrap,
        lambd, tol_nmf)

    # Only the patient distances are passed on to the last stage.
    _, distance_patients = clustering.consensus_clustering(
        result_folder, genes_clustering, patients_clustering,
        influence_weight, simplification, mut_type, alpha, tol,
        keep_singletons, ngh_max, min_mutation, max_mutation,
        n_components, n_permutations, run_consensus, lambd, tol_nmf)

    banner("hierarchical_clustering.py")
    hierarchical_clustering.distance_patients_from_consensus_file(
        result_folder, distance_patients, ppi_data, mut_type,
        influence_weight, simplification, alpha, tol, keep_singletons,
        ngh_max, min_mutation, max_mutation, n_components,
        n_permutations, lambd, tol_nmf, linkage_method)

    end = time.time()
    print('\n------------ END: One Step of StratiPy = {} ------------ {}\n\n'
          .format(datetime.timedelta(seconds=end - start), timestamp()))