Exemplo n.º 1
0
def cluster_genes_heatprop(seed_genes, cluster_x_y_z, num_top_genes=500):
    '''
    Function to extract the hottest part of a cluster after heat propagation from a set of seed genes

    inputs:
        - seed_genes:  genes from which to initiate heat propagation simulation
        - cluster_x_y_z:  mapping whose 'cluster' entry holds the cluster graph
                          (only cluster_x_y_z['cluster'] is read here)
        - num_top_genes: number of highest-heat genes to keep in the returned subgraph. Default 500

    returns:
        - subgraph of cluster_x_y_z['cluster'] induced by the num_top_genes hottest genes

    '''

    cluster_graph = cluster_x_y_z['cluster']

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(cluster_graph, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    # (assumes the result is a pandas Series of heat values indexed by gene -- the
    #  sort_values/head/keys calls below rely on that; confirm against network_prop)
    Fnew = network_prop.network_propagation(cluster_graph, Wprime, seed_genes)

    # sort heat vector Fnew, hottest genes first.
    # Series.sort() no longer exists in current pandas; sort_values is the replacement.
    Fnew = Fnew.sort_values(ascending=False)

    # keep only the nodes carrying the most heat
    return cluster_graph.subgraph(Fnew.head(num_top_genes).keys())
def drug_gene_heatprop(seed_genes,path_to_DB_file,path_to_cluster_file,plot_flag=False):

    '''
    Function to establish drugs potentially related to an input gene list, using network propagation methods

    inputs:
        - seed_genes:  genes from which to initiate heat propagation simulation
        - path_to_DB_file:  path to drug bank file, including filename
        - path_to_cluster_file: path to cluster file, including filename
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - the result of find_drugs_from_hot_genes for the propagated heat vector
          (presumably a gene/drug DataFrame -- confirm against that helper)

    '''

    # load and parse the drug-bank file into a dict
    DBdict = load_DB_data(path_to_DB_file)

    # make a network out of drug-gene interactions: one edge per (drug id, node name) pair
    DB_el = [(drug['drugbank_id'], node['name'])
             for drug in DBdict.values()
             for node in drug['node_list']]

    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = load_cluster_data(path_to_cluster_file)

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster,weighted=True)

    # run the network_propagation simulation starting from the seed genes
    # (assumes the result is a pandas Series of heat values indexed by gene -- confirm)
    Fnew = network_prop.network_propagation(G_cluster,Wprime,seed_genes)

    # sort heat vector Fnew, hottest genes first.
    # Series.sort() no longer exists in current pandas; sort_values is the replacement.
    Fnew = Fnew.sort_values(ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)
        # materialise the nodes once so the colour vector is looked up by label
        # in exactly the order the nodes are drawn
        cluster_nodes = list(G_cluster.nodes())

        plt.figure(figsize=(10,10))
        nx.draw_networkx_edges(G_cluster,pos=pos,alpha=.03)
        # colour scale is capped at a tenth of the maximum heat so that the
        # (much hotter) seed genes do not wash out the rest of the network
        nx.draw_networkx_nodes(G_cluster,pos=pos,nodelist=cluster_nodes,node_size=20,alpha=.8,
                               node_color=Fnew.loc[cluster_nodes],cmap='jet',
                               vmin=0,vmax=np.max(Fnew)/10)
        # seed genes are drawn again on top, larger and in red
        nx.draw_networkx_nodes(G_cluster,pos=pos,nodelist=seed_genes,node_size=50,alpha=.7,node_color='red',linewidths=2)

        # pass an explicit boolean: the string 'off' is truthy and is not a
        # reliable way to hide the grid
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation',fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew,G_DB,seed_genes,keep_seed_genes =True)

    return gene_drug_df
    
    
Exemplo n.º 3
0
def drug_gene_heatprop(seed_genes, cluster_x_y_z, plot_flag=False):
    '''
    Function to establish drugs potentially related to an input gene list, using network propagation methods

    inputs:
        - seed_genes:  genes from which to initiate heat propagation simulation
        - cluster_x_y_z:  already-loaded cluster graph to propagate heat over
        - plot_flag: should we plot the subnetwork with heat overlaid? Default False

    returns:
        - the result of find_drugs_from_hot_genes for the propagated heat vector
          (presumably a gene/drug DataFrame -- confirm against that helper)

    NOTE(review): DB_el is not defined in this function; it is expected to be a
    module-level list of (drug id, gene name) edges built elsewhere -- confirm.
    '''

    # make a network out of the pre-computed drug-gene interactions
    G_DB = nx.Graph()
    G_DB.add_edges_from(DB_el)

    G_cluster = cluster_x_y_z

    # calculate the degree-normalized adjacency matrix
    Wprime = network_prop.normalized_adj_matrix(G_cluster, weighted=True)

    # run the network_propagation simulation starting from the seed genes
    # (assumes the result is a pandas Series of heat values indexed by gene -- confirm)
    Fnew = network_prop.network_propagation(G_cluster, Wprime, seed_genes)

    # sort heat vector Fnew, hottest genes first
    Fnew.sort_values(inplace=True, ascending=False)

    # if plot_flag is on plot the cluster genes with heat overlaid
    if plot_flag:
        pos = nx.spring_layout(G_cluster)
        # materialise the nodes once so the colour vector is looked up by label
        # in exactly the order the nodes are drawn
        cluster_nodes = list(G_cluster.nodes())

        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G_cluster, pos=pos, alpha=.03)
        # colour scale is capped at a tenth of the maximum heat so that the
        # (much hotter) seed genes do not wash out the rest of the network
        nx.draw_networkx_nodes(G_cluster,
                               pos=pos,
                               nodelist=cluster_nodes,
                               node_size=20,
                               alpha=.8,
                               node_color=Fnew.loc[cluster_nodes],
                               cmap='jet',
                               vmin=0,
                               vmax=np.max(Fnew) / 10)
        # seed genes are drawn again on top, larger and in red
        nx.draw_networkx_nodes(G_cluster,
                               pos=pos,
                               nodelist=seed_genes,
                               node_size=50,
                               alpha=.7,
                               node_color='red',
                               linewidths=2)

        # pass an explicit boolean: the string 'off' is truthy and is not a
        # reliable way to hide the grid
        plt.grid(False)
        plt.title('Sample subnetwork: post-heat propagation', fontsize=16)

    # find the drugs related to hot genes
    gene_drug_df = find_drugs_from_hot_genes(Fnew,
                                             G_DB,
                                             seed_genes,
                                             keep_seed_genes=True)

    return gene_drug_df
def main(num_reps=10,
         seed_gene_file='HC_genes/ASD_HC_no_shared_200114.tsv',
         int_file='../interactomes/G_PCnet.gpickle',
         out_name='ASD',
         rand_method='degree_binning',
         single_or_double='single',
         save_fnew_rand=False):
    '''

    Calculate z-scores for heat propagation

    Inputs:
    num_reps: number of randomizations (int, or a string that int() accepts)
    seed_gene_file: location of the tab-separated file containing the seed genes
    int_file: location of interactome to use (gpickle format)
    out_name: identifier for output files (written to the current directory)
    rand_method: type of randomization, forwarded to calc_zscore_heat
    single_or_double: 'single' or 'double' network propagation ('double' is not currently functional)
    save_fnew_rand: True (or the string 'True', as passed on the command line) to also
                    save the vector of randomized heats -- this can be a large file

    python netprop_zscore.py 10 HC_genes/ASD_HC_no_shared_200114.tsv ../interactomes/G_PCnet.gpickle ASD degree_binning single False


    '''

    # str() keeps these working for both the Python defaults (int / bool) and
    # the all-string arguments of the command-line call shown above
    print('number of randomizations = ' + str(num_reps))
    print('background interactome = ' + int_file)
    print('randomization method = ' + rand_method)
    print('single or double = ' + single_or_double)
    print('save Fnew rand = ' + str(save_fnew_rand))

    num_reps = int(num_reps)
    # accept the boolean True as well as the command-line string 'True'
    save_fnew_rand = str(save_fnew_rand) == 'True'

    # load interactome and select focal interactome
    # NOTE(review): read_gpickle unpickles the file, so only load interactomes from a
    # trusted source; it is also no longer available in networkx 3.x -- confirm the
    # networkx version this script is pinned to.
    Gint = nx.read_gpickle(int_file)
    if 'None' in Gint.nodes():
        Gint.remove_node('None')

    # load HC genes
    HC_genes_temp = pd.read_csv(seed_gene_file,
                                sep='\t',
                                index_col='Unnamed: 0')
    # the first 'seed_genes' cell is parsed as a stringified list: drop the outer
    # brackets, split on ', ', then drop the quotes around each gene name
    seed_cell = HC_genes_temp['seed_genes'].tolist()[0]
    seed_HC = [
        str(g[1:-1]).strip("'") for g in seed_cell[1:-1].split(', ')
    ]

    # report how many seed genes were read, then how many are in the interactome
    print(seed_gene_file + ':')
    print(len(seed_HC))
    # list() so numpy receives a plain sequence whether nodes() returns a list or a view
    seed_HC = list(np.intersect1d(list(Gint.nodes()), seed_HC))
    print(len(seed_HC))

    # calculate the z-score
    # calc Wprime from Gint
    Wprime = network_prop.normalized_adj_matrix(Gint, conserve_heat=True)

    # shared tail of every output file name
    file_suffix = str(num_reps) + '_reps_' + rand_method + '.tsv'

    if single_or_double == 'single':  # calculate z-scores from a single set of seed genes

        print('calculating z-scores: ' + seed_gene_file)
        z_seed, Fnew_rand_seed = calc_zscore_heat(Gint,
                                                  Wprime,
                                                  seed_HC,
                                                  num_reps=num_reps,
                                                  rand_method=rand_method)
        z_seed.to_csv('z_' + out_name + '_' + file_suffix, sep='\t')
        if save_fnew_rand:  # if true, save out the vector of randoms (this can be a large file)
            pd.DataFrame(Fnew_rand_seed).to_csv('Fnew_' + out_name + '_rand' +
                                                file_suffix,
                                                sep='\t')

    elif single_or_double == 'double':  # calculate z-scores from two sets of seed genes:

        # --- not currently functional ----
        # NOTE(review): ASD_HC and CHD_HC are not defined in this function; unless they
        # exist at module level this branch fails with a NameError.

        print('calculating ASD-CHD z-scores')
        z_ASD_CHD, Fnew_rand_ASD_CHD = calc_zscore_heat_double(
            Gint,
            Wprime,
            ASD_HC,
            CHD_HC,
            num_reps=num_reps,
            rand_method=rand_method)
        z_ASD_CHD.to_csv('z_' + out_name + '_' + file_suffix, sep='\t')
Exemplo n.º 5
0
def main(num_reps=10,
         seed_gene_file='HC_genes/example_seed.tsv',
         int_file='../interactomes/G_PCnet.gpickle',
         out_name='ASD',
         rand_method='degree_binning',
         single_or_double='single'):
    '''

    Calculate z-scores for heat propagation

    Inputs:
    num_reps: number of randomizations
    seed_gene_file: location of file containing seed genes (see example for format... clunky format due to historical reasons... need to improve)
    int_file: location of interactome to use (gpickle format)
    out_name: identifier for output files (currently saves in current directory... need to update to allow setting of save location)
    rand_method: type of randomization (default = 'degree_binning', alternate method 'degree_ks_test' deprecated)
    single_or_double: single network prop or double network prop. (default = 'single'. 'double' is deprecated)

    python netprop_zscore.py 10 HC_genes/example_seed.tsv ../interactomes/G_PCnet.gpickle ASD degree_binning single


    '''

    print('number of randomizations = ' + str(num_reps))
    print('background interactome = ' + int_file)
    print('randomization method = ' + rand_method)
    print('single or double = ' + single_or_double)

    # command-line arguments arrive as strings
    num_reps = int(num_reps)

    # load interactome and select focal interactome
    # NOTE(review): read_gpickle unpickles the file, so only load interactomes from a
    # trusted source; it is also no longer available in networkx 3.x -- confirm the
    # networkx version this script is pinned to.
    Gint = nx.read_gpickle(int_file)
    if 'None' in Gint.nodes():
        Gint.remove_node('None')

    # load HC genes
    HC_genes_temp = pd.read_csv(seed_gene_file,
                                sep='\t',
                                index_col='Unnamed: 0')
    # the first 'seed_genes' cell is parsed as a stringified list: drop the outer
    # brackets, split on ', ', then drop the quotes around each gene name
    seed_cell = HC_genes_temp['seed_genes'].tolist()[0]
    seed_HC = [
        str(g[1:-1]).strip("'") for g in seed_cell[1:-1].split(', ')
    ]

    # report how many seed genes were read, then how many are in the interactome
    print(seed_gene_file + ':')
    print(len(seed_HC))
    # list() so numpy receives a plain sequence whether nodes() returns a list or a view
    seed_HC = list(np.intersect1d(list(Gint.nodes()), seed_HC))
    print(len(seed_HC))

    # calculate the z-score
    # calc Wprime from Gint
    Wprime = network_prop.normalized_adj_matrix(Gint, conserve_heat=True)

    # both branches write to the same output file name
    z_file = ('z_' + out_name + '_' + str(num_reps) + '_reps_' + rand_method +
              '.tsv')

    if single_or_double == 'single':  # calculate z-scores from a single set of seed genes

        print('calculating z-scores: ' + seed_gene_file)
        # the randomized heat vectors (second return value) are not saved by this version
        z_seed, _ = calc_zscore_heat(Gint,
                                     Wprime,
                                     seed_HC,
                                     num_reps=num_reps,
                                     rand_method=rand_method)
        z_seed.to_csv(z_file, sep='\t')

    elif single_or_double == 'double':  # calculate z-scores from two sets of seed genes:

        # --- keeping for completeness, but currently not functional ----
        # NOTE(review): ASD_HC and CHD_HC are not defined in this function; unless they
        # exist at module level this branch fails with a NameError.

        print('calculating ASD-CHD z-scores')
        z_ASD_CHD, _ = calc_zscore_heat_double(Gint,
                                               Wprime,
                                               ASD_HC,
                                               CHD_HC,
                                               num_reps=num_reps,
                                               rand_method=rand_method)
        z_ASD_CHD.to_csv(z_file, sep='\t')