def run_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(
        gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                              unique_gene_names)
    sample_names = spreadsheet_df.columns
    spreadsheet_mat = spreadsheet_df.values  # DataFrame.as_matrix() was removed in newer pandas

    spreadsheet_mat, iterations = kn.smooth_matrix_with_rwr(
        spreadsheet_mat, network_mat, run_parameters)
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    h_mat = kn.perform_net_nmf(spreadsheet_mat, lap_pos, lap_diag,
                               run_parameters)

    linkage_matrix = np.zeros(
        (spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm,
                                              linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    save_consensus_clustering(linkage_matrix, sample_names, labels,
                              run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters, network_mat)
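
# Illustrative usage sketch (not part of the pipeline): the keys below are the
# ones run_net_nmf reads directly; the kn.* helpers consume further keys (e.g.
# the RWR restart probability and net-NMF iteration limits) that are omitted
# here, and the file paths are hypothetical.
def _example_run_net_nmf():
    run_parameters = {
        'number_of_clusters':         3,
        'gg_network_name_full_path':  'data/networks/gene_gene_network.edge',
        'spreadsheet_name_full_path': 'data/spreadsheets/expression.tsv',
        'results_directory':          'results',
    }
    run_net_nmf(run_parameters)
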
def run_net_path(run_parameters):
    ''' wrapper: call sequence to perform net path

    Args:
        run_parameters: dictionary of run parameters
    '''
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']
    results_directory          = run_parameters['results_directory'         ]

    network_sparse,            \
    unique_gene_names,         \
    pg_network_n1_names        = build_hybrid_sparse_matrix(run_parameters, False, False)

    spreadsheet_df             = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    new_spreadsheet_df         = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)

    hetero_network             = normalize(network_sparse, norm='l1', axis=0)

    identity_mat               = np.eye(hetero_network.shape[0])

    final_rwr_matrix,          \
    step                       = kn.smooth_matrix_with_rwr( identity_mat, hetero_network, run_parameters)
    smooth_rwr_matrix          = smooth_final_spreadsheet_matrix(final_rwr_matrix, len(unique_gene_names))

    cosine_matrix              = get_net_path_results(len(unique_gene_names), smooth_rwr_matrix, run_parameters)

    cosine_matrix_df = pd.DataFrame(cosine_matrix, index=unique_gene_names, columns=pg_network_n1_names)
    #----------------------
    # save_cosine_matrix_df(cosine_matrix_df, run_parameters)
    #----------------------
    property_rank_df = rank_netpath_property      (new_spreadsheet_df, cosine_matrix_df)
    prop_result_df   = construct_netpath_result_df(new_spreadsheet_df, cosine_matrix_df)

    save_timestamped_df  (property_rank_df, results_directory, 'net_path_ranked_by_property')
    save_timestamped_df  (prop_result_df,   results_directory, 'net_path_sorted_by_property_score')
    map_and_save_droplist(spreadsheet_df,   unique_gene_names, 'net_path_droplist', run_parameters)

    return property_rank_df
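
# Sketch of the normalize(..., norm='l1', axis=0) step used above: L1-scaling
# each column of the hybrid adjacency matrix makes it column-stochastic, so
# the random walk can read column j as the outgoing transition probabilities
# of node j. Minimal dense illustration; the pipeline applies it to a sparse
# matrix.
def _example_column_stochastic():
    import numpy as np
    from sklearn.preprocessing import normalize

    adjacency  = np.array([[0., 1., 1.],
                           [1., 0., 0.],
                           [1., 1., 0.]])
    transition = normalize(adjacency, norm='l1', axis=0)
    assert np.allclose(transition.sum(axis=0), 1.0)   # every column sums to 1
    return transition
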
def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    max_cpu = run_parameters["max_cpu"]

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(
        gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                              unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(
        spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth,
                                  index=spreadsheet_df.index,
                                  columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat,
                                               run_parameters)[0]

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = MaxCPU)
    #-----------------------------------------------------------------------------------------

    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if (len_phenotype <= max_cpu):
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)
        # -----------------------------------------------------------------------------------------
        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  network_mat,
                                                  spreadsheet_genes_as_input,
                                                  baseline_array, jobs_id)
        dstutil.parallelize_processes_locally(
            run_bootstrap_net_correlation_worker, zipped_arguments,
            number_of_jobs)
        # -----------------------------------------------------------------------------------------
    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            #-----------------------------------------------------------------------------------------
            zipped_arguments = dstutil.zip_parameters(
                run_parameters, spreadsheet_df, phenotype_df, network_mat,
                spreadsheet_genes_as_input, baseline_array, jobs_id)
            dstutil.parallelize_processes_locally(
                run_bootstrap_net_correlation_worker, zipped_arguments,
                number_of_jobs)
    write_phenotype_data_all(run_parameters)
    #-----------------------------------------------------------------------------------------

    kn.remove_dir(run_parameters["results_tmp_directory"])
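
# Sketch of the partitioning logic above: when there are more phenotypes than
# max_cpu, the job ids are processed in blocks of at most max_cpu so that no
# more than max_cpu worker processes are spawned per round.
def _example_job_partition(len_phenotype=10, max_cpu=4):
    array_of_jobs = range(0, len_phenotype)
    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        print(list(jobs_id))   # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]
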
def run_cc_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification with consensus clustering
        and write results.

    Args:
        run_parameters: parameter set dictionary.
    """

    tmp_dir = 'tmp_cc_net_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_clusters = run_parameters['number_of_clusters']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat,               \
    unique_gene_names          = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                              unique_gene_names)

    spreadsheet_mat = spreadsheet_df.values
    number_of_samples = spreadsheet_mat.shape[1]
    sample_names = spreadsheet_df.columns

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_net_nmf_clusters_worker(network_mat, spreadsheet_mat,
                                           lap_diag, lap_pos, run_parameters,
                                           sample)

    elif processing_method == 'parallel':
        find_and_save_cc_net_nmf_clusters_parallel(network_mat,
                                                   spreadsheet_mat, lap_diag,
                                                   lap_pos, run_parameters,
                                                   number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [
            network_mat, spreadsheet_mat, lap_diag, lap_pos, run_parameters
        ]
        dependency_list = [
            run_cc_net_nmf_clusters_worker, save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally
        ]
        cluster_ip_address = run_parameters['cluster_ip_address']
        dstutil.execute_distribute_computing_job(
            cluster_ip_address, number_of_bootstraps, func_args,
            find_and_save_cc_net_nmf_clusters_parallel, dependency_list)
    else:
        raise ValueError("processing_method must be 'serial', 'parallel' or 'distribute'.")

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    distance_matrix = pairwise_distances(
        consensus_matrix,
        n_jobs=-1)  # [n_samples, n_samples] use all available cores
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    save_consensus_clustering(consensus_matrix, sample_names, labels,
                              run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels,
                                         run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters, network_mat)

    kn.remove_dir(run_parameters["tmp_directory"])
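
# Simplified sketch of consensus-matrix assembly (an assumption about what
# form_consensus_matrix computes from the tmp files): count how often each
# pair of samples is clustered together across bootstraps. The real routine
# also normalizes by how often each pair was co-sampled, which is omitted
# here.
def _example_consensus_matrix(label_runs, number_of_samples):
    import numpy as np
    consensus = np.zeros((number_of_samples, number_of_samples))
    for labels in label_runs:
        labels = np.asarray(labels)
        consensus += (labels[:, None] == labels[None, :]).astype(float)
    return consensus / len(label_runs)
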
def run_DRaWR(run_parameters):
    ''' wrapper: call sequence to perform random walk with restart

    Args:
        run_parameters: dictionary of run parameters
    '''

    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']
    results_directory          = run_parameters['results_directory'         ]

    network_sparse,            \
    unique_gene_names,         \
    pg_network_n1_names        = build_hybrid_sparse_matrix(run_parameters, True, True)

    unique_all_node_names      = unique_gene_names + pg_network_n1_names

    spreadsheet_df             = kn.get_spreadsheet_df  (spreadsheet_name_full_path)
    new_spreadsheet_df         = kn.update_spreadsheet_df(spreadsheet_df, unique_all_node_names)

    unique_genes_length        = len( unique_gene_names        )
    property_length            = len( set(pg_network_n1_names) )

    base_col                   = np.append( np.ones(unique_genes_length, dtype=int),
                                           np.zeros(property_length,     dtype=int)  )  # np.int was removed in NumPy 1.24

    new_spreadsheet_df         = kn.append_column_to_spreadsheet(new_spreadsheet_df, base_col, 'base')

    hetero_network             = normalize(network_sparse, norm='l1', axis=0)

    normalize_new_spreadsheet  = normalize(new_spreadsheet_df, norm='l1', axis=0)

    final_spreadsheet_matrix,  \
    step                       = kn.smooth_matrix_with_rwr( normalize_new_spreadsheet
                                                          , hetero_network
                                                          , run_parameters     )

    final_spreadsheet_df         = pd.DataFrame(final_spreadsheet_matrix)
    final_spreadsheet_df.index   = new_spreadsheet_df.index.values
    final_spreadsheet_df.columns = new_spreadsheet_df.columns.values
    prop_spreadsheet_df          = rank_drawr_property( final_spreadsheet_df
                                                      , pg_network_n1_names  )

    tmp_index                    = final_spreadsheet_df.index.isin(spreadsheet_df.index)
    spreadsheet_df_mask          = final_spreadsheet_df.loc[tmp_index]

    gene_result_df = construct_drawr_result_df( spreadsheet_df_mask
                                              , 0
                                              , spreadsheet_df_mask.shape[0]
                                              , True
                                              , run_parameters  )

    prop_result_df = construct_drawr_result_df( final_spreadsheet_df
                                              , unique_genes_length
                                              , final_spreadsheet_df.shape[0]
                                              , False
                                              , run_parameters  )

    save_timestamped_df  ( prop_spreadsheet_df, results_directory, 'DRaWR_ranked_by_property'      )
    save_timestamped_df  ( gene_result_df,      results_directory, 'DRaWR_sorted_by_gene_score'    )
    save_timestamped_df  ( prop_result_df,      results_directory, 'DRaWR_sorted_by_property_score')
    map_and_save_droplist( spreadsheet_df,      unique_gene_names, 'DRaWR_droplist', run_parameters)

    return prop_spreadsheet_df
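
# Minimal random-walk-with-restart sketch (an assumption about the fixed point
# that kn.smooth_matrix_with_rwr approaches): starting from the restart matrix
# F0, iterate F <- (1 - r) * W @ F + r * F0 on a column-stochastic W until F
# stops changing, and return the smoothed matrix with the step count.
def _example_rwr(transition_mat, restart_mat, restart_prob=0.5,
                 tol=1.0e-4, max_iter=100):
    import numpy as np
    f_mat = restart_mat.copy()
    for step in range(max_iter):
        f_next = (1.0 - restart_prob) * transition_mat.dot(f_mat) \
               + restart_prob * restart_mat
        if np.abs(f_next - f_mat).max() < tol:
            return f_next, step
        f_mat = f_next
    return f_mat, max_iter
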
def run_fisher(run_parameters):
    ''' wrapper: call sequence to perform fisher gene-set characterization

    Args:
        run_parameters: dictionary of run parameters
    '''

    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']
    pg_network_name_full_path  = run_parameters['pg_network_name_full_path' ]

    # -----------------------------------
    # - Data read and extraction Section -
    # -----------------------------------
    spreadsheet_df       = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    prop_gene_network_df = kn.get_network_df    ( pg_network_name_full_path)

    spreadsheet_gene_names      = kn.extract_spreadsheet_gene_names(spreadsheet_df)
    prop_gene_network_n1_names, \
    prop_gene_network_n2_names = kn.extract_network_node_names     (prop_gene_network_df)

    # -----------------------------------------------------------------------
    # - limit the gene set to the intersection of network and user gene set -
    # -----------------------------------------------------------------------
    common_gene_names      = kn.find_common_node_names( prop_gene_network_n2_names
                                                      , spreadsheet_gene_names )
    common_gene_names_dict = kn.create_node_names_dict( common_gene_names )

    prop_gene_network_n1_names_dict = kn.create_node_names_dict        (prop_gene_network_n1_names     )
    reverse_prop_dict               = kn.create_reverse_node_names_dict(prop_gene_network_n1_names_dict)

    # ----------------------------------------------------------------------------
    # - restrict spreadsheet and network to common genes and drop everything else -
    # ----------------------------------------------------------------------------
    new_spreadsheet_df         = kn.update_spreadsheet_df(spreadsheet_df,       common_gene_names          )
    prop_gene_network_df       = kn.update_network_df    (prop_gene_network_df, common_gene_names, "node_2")
    prop_gene_network_df['wt'] = 1

    # ----------------------------------------------------------------------------
    # - map every gene name to an integer index in sequential order starting at 0 -
    # ----------------------------------------------------------------------------
    prop_gene_network_df = kn.map_node_names_to_index( prop_gene_network_df
                                                     , prop_gene_network_n1_names_dict
                                                     , "node_1")

    prop_gene_network_df = kn.map_node_names_to_index( prop_gene_network_df
                                                     , common_gene_names_dict
                                                     , "node_2")

    # --------------------------------------------
    # - store the network in a csr sparse format -
    # --------------------------------------------
    universe_count           = len(common_gene_names)

    prop_gene_network_sparse = kn.convert_network_df_to_sparse( prop_gene_network_df
                                                              , universe_count
                                                              , len(prop_gene_network_n1_names))

    fisher_contingency_pval = get_fisher_exact_test           ( prop_gene_network_sparse
                                                              , reverse_prop_dict
                                                              , new_spreadsheet_df
                                                              , run_parameters['max_cpu']  )

    fisher_final_result     = save_fisher_test_result         ( fisher_contingency_pval
                                                              , run_parameters['results_directory']
                                                              , spreadsheet_df.columns.values  )

    map_and_save_droplist   ( spreadsheet_df
                            , common_gene_names
                            , 'fisher_droplist'
                            , run_parameters  )

    return fisher_final_result
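
# Sketch of the per-(property, user gene set) test behind
# get_fisher_exact_test: build the 2x2 contingency table over the common-gene
# universe and run scipy.stats.fisher_exact. The one-sided 'greater'
# alternative (testing for enrichment) is an assumption about the pipeline's
# choice.
def _example_fisher_pvalue(overlap, user_set_size, property_set_size, universe_count):
    from scipy.stats import fisher_exact
    table = [[overlap,                      user_set_size - overlap],
             [property_set_size - overlap,
              universe_count - user_set_size - property_set_size + overlap]]
    odds_ratio, p_value = fisher_exact(table, alternative='greater')
    return p_value
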
def run_cc_net_similarity(run_parameters):
    """ wrapper: call sequence to perform signature analysis with
        random walk smoothing and bootstrapped similarity and save results.

    Args:
        run_parameters: parameter set dictionary.
    """

    tmp_dir               = 'tmp_cc_similarity'
    run_parameters        = update_tmp_directory(run_parameters, tmp_dir)

    expression_name       = run_parameters["spreadsheet_name_full_path"]
    signature_name        = run_parameters["signature_name_full_path"  ]

    gg_network_name       = run_parameters['gg_network_name_full_path' ]

    similarity_measure    = run_parameters["similarity_measure"        ]
    number_of_bootstraps  = run_parameters['number_of_bootstraps'      ]
    processing_method     = run_parameters['processing_method'         ]

    expression_df         = kn.get_spreadsheet_df(expression_name)
    signature_df          = kn.get_spreadsheet_df(signature_name )

    expression_col_names  = expression_df.columns
    signature_col_names   =  signature_df.columns

    #---------------------
    network_mat,          \
    unique_gene_names     = kn.get_sparse_network_matrix(gg_network_name)
    
    expression_df         = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df          = kn.update_spreadsheet_df(signature_df, unique_gene_names)
    #---------------------

    expression_mat        = expression_df.values
    signature_mat         =  signature_df.values

    expression_mat,       \
    iterations            = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)
    signature_mat,        \
    iterations            = kn.smooth_matrix_with_rwr( signature_mat, network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc [:] = signature_mat

    # --------------
    if   processing_method == 'serial':
         for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df, run_parameters, sample              )
    elif processing_method == 'parallel':
         find_and_save_cc_similarity_parallel (expression_df, signature_df, run_parameters, number_of_bootstraps)
    else:
        raise ValueError("processing_method must be 'serial' or 'parallel'.")

    similarity_df         = assemble_similarity_df(expression_df, signature_df, run_parameters)
    similarity_mat        = similarity_df.values
    # --------------

    similarity_df  = pd.DataFrame( similarity_mat, index = expression_col_names, columns = signature_col_names )
    save_final_expression_signature( similarity_df,  run_parameters                                            )
    save_best_match_signature      ( similarity_df,  run_parameters                                            )

    kn.remove_dir(run_parameters["tmp_directory"])
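
# Illustrative similarity step, assuming similarity_measure == 'cosine': after
# both spreadsheets are RWR-smoothed over the same network, each expression
# column is compared against each signature column.
def _example_cosine_similarity(expression_df, signature_df):
    from sklearn.metrics.pairwise import cosine_similarity
    # rows become samples/signatures, so transpose before comparing columns
    return cosine_similarity(expression_df.values.T, signature_df.values.T)
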
# unittest.TestCase method excerpt (class definition not shown in this listing)
def test_update_spreadsheet_df(self):
    ret = kn.update_spreadsheet_df(self.spreadsheet, self.gene_list)
    self.assertEqual(True, ret.equals(self.spreadsheet_result),
                     'test_update_spreadsheet_df failed')