def find_and_save_cc_net_nmf_clusters_parallel(network_mat, spreadsheet_mat, lap_diag, lap_pos,
                                               run_parameters, local_parallelism):
    """ central loop: compute components for the consensus matrix from the input
        network and spreadsheet matrices and save them to temp files.

    Args:
        network_mat:       genes x genes symmetric matrix.
        spreadsheet_mat:   genes x samples matrix.
        lap_diag:          laplacian matrix component, L = lap_diag - lap_pos.
        lap_pos:           laplacian matrix component, L = lap_diag - lap_pos.
        run_parameters:    dictionary of run-time parameters.
        local_parallelism: number of processes to run in parallel.
    """
    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(network_mat, spreadsheet_mat, lap_diag, lap_pos,
                                              run_parameters, jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism,
                                                            run_parameters['parallelism'])
    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_net_nmf_clusters_worker, zipped_arguments, parallelism)

def run_correlation(run_parameters):
    """ perform feature prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    # process the phenotypes in batches of at most max_cpu parallel jobs
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, jobs_id)
        dstutil.parallelize_processes_locally(run_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    # .values replaces the deprecated DataFrame.as_matrix(), matching run_net_correlation
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                              spreadsheet_genes_as_input, baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

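# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the pipeline: normalize(network_mat,
# norm="l1", axis=0) above makes each column of the adjacency matrix sum to 1,
# i.e. column-stochastic, as a random-walk transition matrix requires.
# zscore_dataframe is assumed (from its usage here) to z-score each gene (row)
# across samples; a minimal version under that assumption:
def zscore_dataframe_sketch(spreadsheet_df):
    """ z-score each row of a genes x samples dataframe. """
    row_mean = spreadsheet_df.mean(axis=1)
    row_std = spreadsheet_df.std(axis=1).clip(lower=1e-12)  # guard constant rows
    return spreadsheet_df.sub(row_mean, axis=0).div(row_std, axis=0)
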
def run_correlation(run_parameters):
    """ perform gene prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    results_tmp_directory = run_parameters["results_tmp_directory"]
    phenotype_name_full_path = run_parameters["phenotype_name_full_path"]
    spreadsheet_name_full_path = run_parameters["spreadsheet_name_full_path"]

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    phenotype_df = kn.get_spreadsheet_df(phenotype_name_full_path)
    phenotype_df = phenotype_df.T

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)

    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, jobs_id)
    dstutil.parallelize_processes_locally(run_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(results_tmp_directory)

def find_and_save_cc_similarity_parallel(expression_df, signature_df, run_parameters, local_parallelism):
    """ central loop: compute components for the similarity matrix in parallel.

    Args:
        expression_df:     genes x samples dataframe.
        signature_df:      genes x samples dataframe.
        run_parameters:    dictionary of run-time parameters.
        local_parallelism: parallelism option.
    """
    import knpackage.distributed_computing_utils as dstutil

    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(expression_df, signature_df, run_parameters, jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism,
                                                            run_parameters['parallelism'])
    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_similarity_signature_worker, zipped_arguments, parallelism)

def find_and_save_cc_link_hclust_clusters_parallel(spreadsheet_mat, run_parameters, local_parallelism):
    """ central loop: compute components for the consensus matrix by hclust.

    Args:
        spreadsheet_mat:   genes x samples matrix.
        run_parameters:    dictionary of run-time parameters.
        local_parallelism: number of processes to run in parallel.
    """
    import knpackage.distributed_computing_utils as dstutil

    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(spreadsheet_mat, run_parameters, jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism,
                                                            run_parameters['parallelism'])
    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_link_hclust_clusters_worker, zipped_arguments, parallelism)

def run_bootstrap_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    results_tmp_directory = run_parameters["results_tmp_directory"]
    n_bootstraps = run_parameters["number_of_bootstraps"]

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    # Partition the phenotype dataframe into batches of at most max_cpu jobs;
    # a single batch covers the case len_phenotype <= max_cpu.
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                                  n_bootstraps, jobs_id)
        dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(results_tmp_directory)

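# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not part of the pipeline): the
# batching used above reduces to yielding job-id slices of at most max_cpu ids.
def chunked_job_ids(len_phenotype, max_cpu):
    """ yield job-id batches of at most max_cpu ids covering range(len_phenotype). """
    array_of_jobs = range(0, len_phenotype)
    for i in range(0, len_phenotype, max_cpu):
        yield array_of_jobs[i:i + max_cpu]

# Example: 7 phenotypes on 3 CPUs -> batches [0, 1, 2], [3, 4, 5], [6].
# for jobs_id in chunked_job_ids(7, 3):
#     print(list(jobs_id))
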
def run_bootstrap_correlation(run_parameters):
    """ perform feature prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T
    n_bootstraps = run_parameters["number_of_bootstraps"]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)

    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                              n_bootstraps, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

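# ---------------------------------------------------------------------------
# Hedged sketch, an assumption rather than the package's implementation: each
# bootstrap iteration in the worker is assumed to resample columns (samples)
# of the spreadsheet with replacement before correlating against the
# phenotype. A minimal stand-in under that assumption:
import numpy as np

def bootstrap_sample_sketch(spreadsheet_mat, cols_fraction=0.8, rng=None):
    """ return a column-resampled copy of a genes x samples matrix and the picked indices. """
    rng = np.random.default_rng() if rng is None else rng
    n_cols = spreadsheet_mat.shape[1]
    picked = rng.choice(n_cols, size=max(1, int(cols_fraction * n_cols)), replace=True)
    return spreadsheet_mat[:, picked], picked
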
def find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters, local_parallelism):
    """ central loop: compute components for the consensus matrix by
        non-negative matrix factorization.

    Args:
        spreadsheet_mat:   genes x samples matrix.
        run_parameters:    dictionary of run-time parameters.
        local_parallelism: number of processes to run in parallel.
    """
    jobs_id = range(0, local_parallelism)
    zipped_arguments = dstutil.zip_parameters(spreadsheet_mat, run_parameters, jobs_id)

    if 'parallelism' in run_parameters:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism,
                                                            run_parameters['parallelism'])
    else:
        parallelism = dstutil.determine_parallelism_locally(local_parallelism)

    dstutil.parallelize_processes_locally(run_cc_nmf_clusters_worker, zipped_arguments, parallelism)

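# ---------------------------------------------------------------------------
# Conceptual sketch of what the zip-and-dispatch pattern above amounts to,
# using only the standard library in place of dstutil (a simplification: the
# real helpers also cap the process count via determine_parallelism_locally):
import multiprocessing

def _demo_worker(args):
    """ stand-in for one clustering run; the job id is zipped in last. """
    *_, job_id = args
    return job_id

if __name__ == '__main__':
    local_parallelism = 4
    zipped_arguments = [('spreadsheet_mat', {'run': 'parameters'}, job_id)
                        for job_id in range(local_parallelism)]
    with multiprocessing.Pool(processes=local_parallelism) as pool:
        print(pool.map(_demo_worker, zipped_arguments))  # -> [0, 1, 2, 3]
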
def run_net_correlation(run_parameters):
    """ perform gene prioritization with network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    # Partition the phenotype dataframe into batches of at most max_cpu jobs;
    # a single batch covers the case len_phenotype <= max_cpu.
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                                  spreadsheet_genes_as_input, baseline_array, jobs_id)
        dstutil.parallelize_processes_locally(run_net_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

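# ---------------------------------------------------------------------------
# Hedged sketch of random-walk-with-restart smoothing, which
# kn.smooth_matrix_with_rwr is assumed to implement; alpha, tol, and max_iter
# here are illustrative names, not the package's actual parameter keys:
#     F <- alpha * W @ F + (1 - alpha) * F0, iterated until convergence.
import numpy as np

def rwr_smooth_sketch(f0, network_mat, alpha=0.5, tol=1e-4, max_iter=100):
    """ return (smoothed matrix, iterations used), mirroring the tuple unpacked above. """
    f = f0.copy()
    for iteration in range(1, max_iter + 1):
        f_next = alpha * network_mat.dot(f) + (1 - alpha) * f0
        if np.linalg.norm(f_next - f) < tol:
            return f_next, iteration
        f = f_next
    return f, max_iter
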