def find_and_save_cc_net_nmf_clusters_parallel(network_mat, spreadsheet_mat,
                                               lap_diag, lap_pos,
                                               run_parameters,
                                               local_parallelism):
    """ central loop: compute components for the consensus matrix from the input
        network and spreadsheet matrices and save them to temp files.

    Args:
        network_mat: genes x genes symmetric matrix.
        spreadsheet_mat: genes x samples matrix.
        lap_diag: laplacian matrix component, L = lap_diag - lap_pos.
        lap_pos: laplacian matrix component, L = lap_diag - lap_pos.
        run_parameters: dictionary of run-time parameters.
        local_parallelism: number of processes to run in parallel
    """

    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(network_mat, spreadsheet_mat,
                                              lap_diag, lap_pos,
                                              run_parameters, jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(
            local_parallelism, run_parameters['parallelism'])

    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_net_nmf_clusters_worker,
                                          zipped_arguments, parallelism)
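
These examples fan work out through helpers in knpackage.distributed_computing_utils (dstutil), whose internals are not part of this listing. The following is a minimal sketch of what the three calls used above might plausibly do, assuming a plain multiprocessing pool; it is an illustration of the call pattern, not the library's actual implementation.

import multiprocessing

def zip_parameters(*args):
    # Assumed behavior: the last argument is an iterable of job ids; pair
    # each id with one copy of every other argument, yielding one argument
    # tuple per worker invocation.
    *shared, jobs_id = args
    return [tuple(shared) + (job_id,) for job_id in jobs_id]

def determine_parallelism_locally(requested, cap=None):
    # Assumed behavior: clamp the requested process count to the available
    # CPUs, or to an explicit cap such as run_parameters['parallelism'].
    limit = multiprocessing.cpu_count() if cap is None else int(cap)
    return max(1, min(int(requested), limit))

def parallelize_processes_locally(worker, zipped_arguments, parallelism):
    # Assumed behavior: run the worker over the argument tuples on a local
    # process pool.
    with multiprocessing.Pool(processes=parallelism) as pool:
        pool.starmap(worker, zipped_arguments)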
Example 2
def run_correlation(run_parameters):
    """ perform feature prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  jobs_id)
        dstutil.parallelize_processes_locally(run_correlation_worker,
                                              zipped_arguments, number_of_jobs)
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
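
For reference, a hypothetical call to run_correlation. The keys shown are exactly the ones this excerpt reads (the worker functions may read additional keys not shown here); the file paths are illustrative.

run_parameters = {
    "max_cpu": 4,                                            # jobs per parallel chunk
    "results_directory": "./results",                        # 'tmp' is created beneath this
    "phenotype_name_full_path": "./data/phenotype.tsv",      # phenotype spreadsheet (transposed above)
    "spreadsheet_name_full_path": "./data/spreadsheet.tsv",  # genes x samples
}
run_correlation(run_parameters)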
Example 3
def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                              spreadsheet_genes_as_input, baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
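
zscore_dataframe is called above but not defined in this excerpt. A minimal sketch, assuming it standardizes each gene (row) across its samples:

def zscore_dataframe(gxs_df):
    # Assumed behavior: z-score each row (gene) across samples, guarding
    # against zero-variance rows.
    row_means = gxs_df.mean(axis=1)
    row_stds = gxs_df.std(axis=1).replace(0, 1)
    return gxs_df.sub(row_means, axis=0).div(row_stds, axis=0)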
Example 4
def run_correlation(run_parameters):
    """ perform gene prioritization

    Args:
        run_parameters: parameter set dictionary.
    """

    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    results_tmp_directory = run_parameters["results_tmp_directory"]
    phenotype_name_full_path = run_parameters["phenotype_name_full_path"]
    spreadsheet_name_full_path = run_parameters["spreadsheet_name_full_path"]

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    phenotype_df = kn.get_spreadsheet_df(phenotype_name_full_path)
    phenotype_df = phenotype_df.T

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df,
                                              phenotype_df, jobs_id)

    dstutil.parallelize_processes_locally(run_correlation_worker,
                                          zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(results_tmp_directory)
Example 5
def find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                         run_parameters, local_parallelism):
    """ central loop: compute components for the similarity matrix by

    Args:
        expression_df    : genes x samples
        signature_df     : genes x samples
        run_parameters   : dictionary of run-time parameters
        local_parallelism: parallelism option
    """
    import knpackage.distributed_computing_utils as dstutil

    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(expression_df, signature_df,
                                              run_parameters, jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(
            local_parallelism, run_parameters['parallelism'])

    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_similarity_signature_worker,
                                          zipped_arguments, parallelism)
Example 6
def find_and_save_cc_link_hclust_clusters_parallel(spreadsheet_mat,
                                                   run_parameters,
                                                   local_parallelism):
    """ central loop: compute components for the consensus matrix by hclust.

    Args:
        spreadsheet_mat: genes x samples matrix.
        run_parameters: dictionary of run-time parameters.
        local_parallelism: number of processes to run in parallel
    """

    import knpackage.distributed_computing_utils as dstutil

    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(spreadsheet_mat, run_parameters,
                                              jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(
            local_parallelism, run_parameters['parallelism'])
    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_link_hclust_clusters_worker,
                                          zipped_arguments, parallelism)

Example 7
def run_bootstrap_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """

    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    results_tmp_directory = run_parameters["results_tmp_directory"]
    n_bootstraps = run_parameters["number_of_bootstraps"]
    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])

    phenotype_df = phenotype_df.T

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = max_cpu)
    #-----------------------------------------------------------------------------------------
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if (len_phenotype <= max_cpu):
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)
        #-----------------------------------------------------------------------------------------
        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  n_bootstraps, jobs_id)

        dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                              zipped_arguments, number_of_jobs)

        #-----------------------------------------------------------------------------------------

    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            #-----------------------------------------------------------------------------------------
            zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                      spreadsheet_df,
                                                      phenotype_df,
                                                      n_bootstraps, jobs_id)

            dstutil.parallelize_processes_locally(
                run_bootstrap_correlation_worker, zipped_arguments,
                number_of_jobs)
    write_phenotype_data_all(run_parameters)
    #-----------------------------------------------------------------------------------------

    kn.remove_dir(results_tmp_directory)
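
The bootstrap draw itself happens inside run_bootstrap_correlation_worker, which is not shown here. As a generic illustration of the idea (not knpackage's own sampler), one draw might subsample the spreadsheet's columns like this:

import numpy as np

def one_bootstrap_sample(spreadsheet_mat, cols_sampling_fraction=0.8, rng=None):
    # Illustrative only: keep a random fraction of the sample columns so each
    # of the n_bootstraps runs correlates a perturbed spreadsheet with the
    # phenotype.
    rng = np.random.default_rng() if rng is None else rng
    n_cols = spreadsheet_mat.shape[1]
    n_keep = max(1, int(round(n_cols * cols_sampling_fraction)))
    picked = rng.choice(n_cols, size=n_keep, replace=False)
    return spreadsheet_mat[:, picked], picked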
Example 8
def run_bootstrap_correlation(run_parameters):
    """ perform feature prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df        = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df      = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df        = phenotype_df.T
    n_bootstraps        = run_parameters["number_of_bootstraps"]
    number_of_jobs      = len(phenotype_df.index)
    jobs_id             = range(0, number_of_jobs)
    zipped_arguments    = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, n_bootstraps, jobs_id)

    dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker, zipped_arguments, number_of_jobs)
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
def find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                           local_parallelism):
    """ central loop: compute components for the consensus matrix by
        non-negative matrix factorization.

    Args:
        spreadsheet_mat: genes x samples matrix.
        run_parameters: dictionary of run-time parameters.
        local_parallelism: number of processes to run in parallel
    """

    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(spreadsheet_mat, run_parameters,
                                              jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(
            local_parallelism, run_parameters['parallelism'])

    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_nmf_clusters_worker,
                                          zipped_arguments, parallelism)
def run_net_correlation(run_parameters):
    """ perform gene prioritization with network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(
        gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                              unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)

    sample_smooth, iterations = kn.smooth_matrix_with_rwr(
        spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth,
                                  index=spreadsheet_df.index,
                                  columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat,
                                               run_parameters)[0]

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = max_cpu)
    #-----------------------------------------------------------------------------------------

    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if (len_phenotype <= max_cpu):
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  network_mat,
                                                  spreadsheet_genes_as_input,
                                                  baseline_array, jobs_id)
        dstutil.parallelize_processes_locally(run_net_correlation_worker,
                                              zipped_arguments, number_of_jobs)

        #-----------------------------------------------------------------------------------------
    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            #-----------------------------------------------------------------------------------------
            zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                      spreadsheet_df,
                                                      phenotype_df,
                                                      network_mat,
                                                      spreadsheet_genes_as_input,
                                                      baseline_array, jobs_id)
            dstutil.parallelize_processes_locally(run_net_correlation_worker,
                                                  zipped_arguments,
                                                  number_of_jobs)

    write_phenotype_data_all(run_parameters)
    #-----------------------------------------------------------------------------------------

    kn.remove_dir(run_parameters["results_tmp_directory"])
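
kn.smooth_matrix_with_rwr, used throughout the network examples, performs random walk with restart. A generic sketch of that iteration, assuming the restart matrix is repeatedly diffused over the column-normalized network and blended back with itself until the update is small (parameter names here are illustrative, not the library's):

import numpy as np

def rwr_smooth_sketch(restart, network_mat, restart_probability=0.5,
                      max_iterations=100, tol=1e-4):
    # Iterate F <- (1 - r) * W @ F + r * F0 until successive iterates differ
    # by less than tol, then return the smoothed matrix and the step count.
    smooth = restart.copy()
    step = 0
    for step in range(max_iterations):
        previous = smooth
        smooth = ((1 - restart_probability) * network_mat.dot(previous)
                  + restart_probability * restart)
        if np.linalg.norm(smooth - previous) < tol:
            break
    return smooth, step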