def main(net, inst_dm, filter_sim):
    '''
    Build one clustered similarity network per axis.

    For 'row' and 'col' in turn, inst_dm[axis] is converted with dm_to_sim
    (make_squareform=True, filter_sim=filter_sim). Each result becomes the
    'mat' of a fresh Network whose row AND column nodes / node_info are both
    taken from that same axis of net. Every network is then passed to
    calc_clust.cluster_row_and_col.

    Returns a dict with keys 'row' and 'col' mapping to those networks.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust

    axes = ['row', 'col']

    # all conversions happen before any network is created
    sim_dict = {
        axis: dm_to_sim(inst_dm[axis], make_squareform=True,
                        filter_sim=filter_sim)
        for axis in axes
    }

    sim_net = {}
    for axis in axes:
        axis_net = deepcopy(Network())
        sim_net[axis] = axis_net

        axis_net.dat['mat'] = sim_dict[axis]

        # both sides of the matrix describe the same set of nodes
        for side in ('row', 'col'):
            axis_net.dat['nodes'][side] = net.dat['nodes'][axis]
        for side in ('row', 'col'):
            axis_net.dat['node_info'][side] = net.dat['node_info'][axis]

        calc_clust.cluster_row_and_col(axis_net)

    return sim_net
def main(net, inst_dm, filter_sim):
    '''
    Convert the row and column distance matrices into clustered similarity
    networks.

    net        -- source network; its dat['nodes'] and dat['node_info'] are
                  read for each axis
    inst_dm    -- mapping with 'row' and 'col' entries, each handed to
                  dm_to_sim
    filter_sim -- forwarded to dm_to_sim as filter_sim

    Returns {'row': Network, 'col': Network}. In each network the rows and
    the columns both use the nodes of the corresponding axis of net.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust

    # step 1: similarity matrix for each axis
    sim_dict = {}
    for inst_rc in ('row', 'col'):
        sim_dict[inst_rc] = dm_to_sim(
            inst_dm[inst_rc], make_squareform=True, filter_sim=filter_sim)

    # step 2: wrap each matrix in its own network and cluster it
    sim_net = {}
    for inst_rc in ('row', 'col'):
        new_net = deepcopy(Network())
        sim_net[inst_rc] = new_net
        new_net.dat['mat'] = sim_dict[inst_rc]

        axis_nodes = net.dat['nodes'][inst_rc]
        new_net.dat['nodes']['row'] = axis_nodes
        new_net.dat['nodes']['col'] = axis_nodes

        axis_info = net.dat['node_info'][inst_rc]
        new_net.dat['node_info']['row'] = axis_info
        new_net.dat['node_info']['col'] = axis_info

        calc_clust.cluster_row_and_col(new_net)

    return sim_net
def N_rows(net, df, all_views, dist_type='cosine', rank_type='sum'):
    '''
    Append "keep the top N rows" views to all_views and return it.

    Rows are ranked by run_filter.get_sorted_rows(df['mat'], rank_type). For
    'all' and for every N in keep_top that is smaller than the number of
    ranked rows, a copy of df is reduced to the first N ranked rows, columns
    are filtered with run_filter.df_filter_col_sum, and the result is
    clustered in a fresh Network.

    net       -- accepted for interface compatibility; not used here
    df        -- dict of DataFrames; 'mat' is required, 'mat_up'/'mat_dn'
                 and 'mat_orig' are subset too when present
    all_views -- list that receives one dict per successfully built view
    dist_type -- passed positionally to calc_clust.cluster_row_and_col
    rank_type -- ranking criterion; also used in the view key
                 'N_row_<rank_type>'

    A view that cannot be clustered is skipped (best effort).
    '''
    from copy import deepcopy
    from __init__ import Network
    import calc_clust, run_filter

    keep_top = ['all', 500, 250, 100, 50, 20, 10]

    rows_sorted = run_filter.get_sorted_rows(df['mat'], rank_type)
    num_rows = len(rows_sorted)

    for inst_keep in keep_top:

        # Check for 'all' before the numeric comparison: ordering a str
        # against an int raises TypeError on Python 3.
        if inst_keep != 'all' and inst_keep >= num_rows:
            continue

        # only copy once we know the view will actually be built
        tmp_df = deepcopy(df)
        tmp_net = deepcopy(Network())

        if inst_keep != 'all':
            keep_rows = rows_sorted[0:inst_keep]

            # label-based selection; .ix has been removed from pandas
            tmp_df['mat'] = tmp_df['mat'].loc[keep_rows]
            if 'mat_up' in tmp_df:
                tmp_df['mat_up'] = tmp_df['mat_up'].loc[keep_rows]
                tmp_df['mat_dn'] = tmp_df['mat_dn'].loc[keep_rows]
            if 'mat_orig' in tmp_df:
                tmp_df['mat_orig'] = tmp_df['mat_orig'].loc[keep_rows]

            tmp_df = run_filter.df_filter_col_sum(tmp_df, 0.001)

        tmp_net.df_to_dat(tmp_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=True)
            except Exception:
                # fall back to computing the view without clustering
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=False)

            # add view
            inst_view = {}
            inst_view['N_row_' + rank_type] = inst_keep
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']
            all_views.append(inst_view)

        except Exception:
            # best effort: a view that cannot be built is simply omitted
            pass

    return all_views
def pct_rows(net, df, all_views, dist_type, rank_type):
    '''
    Append "rows above a fraction of the maximum row sum" views to all_views
    and return it.

    The cutoff fractions are 0.0, 0.1, ..., 0.9. For each one a copy of df
    is filtered with run_filter.df_filter_row(copy, fraction * max_row_sum,
    take_abs=False), loaded into a fresh Network and clustered.

    net       -- accepted for interface compatibility; not used here
    df        -- dict of DataFrames; df['mat'] supplies the row sums
    all_views -- list that receives one dict per successfully built view
    dist_type -- forwarded to calc_clust.cluster_row_and_col
    rank_type -- only used to name the view key 'pct_row_<rank_type>'
                 NOTE(review): the cutoff is always derived from row sums,
                 whatever rank_type says -- confirm this is intended for
                 rank_type='var'.

    A view that cannot be clustered is skipped (best effort).
    '''
    from __init__ import Network
    from copy import deepcopy
    import numpy as np
    import calc_clust, run_filter

    # Earlier revisions deep-copied net on every iteration and built a
    # column-category lookup here; neither result was ever read, so both
    # were dropped.

    all_filt = [i / float(10) for i in range(10)]

    # the matrix is only read, so no defensive copy is needed
    sum_row = np.sum(df['mat'], axis=1)
    max_sum = max(sum_row)

    for inst_filt in all_filt:

        cutoff = inst_filt * max_sum

        inst_df = deepcopy(df)
        inst_df = run_filter.df_filter_row(inst_df, cutoff, take_abs=False)

        tmp_net = deepcopy(Network())
        tmp_net.df_to_dat(inst_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net, dist_type=dist_type,
                                               run_clustering=True)
            except Exception:
                # fall back to computing the view without clustering
                calc_clust.cluster_row_and_col(tmp_net, dist_type=dist_type,
                                               run_clustering=False)

            inst_view = {}
            inst_view['pct_row_' + rank_type] = inst_filt
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']
            all_views.append(inst_view)

        except Exception:
            # best effort: a view that cannot be built is simply omitted
            pass

    return all_views
def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True,
               requested_views=['pct_row_sum', 'N_row_sum'],
               linkage_type='average', sim_mat=False, filter_sim=0.1,
               calc_cat_pval=False):
    '''
    Cluster net and attach additional filtered views of the clustergram.

    The network's data is filtered (row and column sums against a small
    threshold), written back to net and clustered. Each view named in
    requested_views ('N_row_sum', 'N_row_var', 'pct_row_sum', 'pct_row_var')
    is then generated through make_views and the collected list is stored in
    net.viz['views']. When sim_mat is True, row and column similarity
    networks are built with make_sim_mat.main and their viz data is stored
    in net.sim.
    '''
    from copy import deepcopy
    import calc_clust, run_filter, make_views, make_sim_mat, cat_pval
    import scipy

    threshold = 0.0001

    filtered_df = net.dat_to_df()
    filtered_df = run_filter.df_filter_row_sum(filtered_df, threshold)
    filtered_df = run_filter.df_filter_col_sum(filtered_df, threshold)

    # initial view: no row filtering beyond the threshold above
    net.df_to_dat(filtered_df)

    inst_dm = calc_clust.cluster_row_and_col(
        net,
        dist_type=dist_type,
        linkage_type=linkage_type,
        run_clustering=run_clustering,
        dendro=dendro,
        ignore_cat=False,
        calc_cat_pval=calc_cat_pval)

    all_views = []
    send_df = deepcopy(filtered_df)

    # evaluation order: N_row_sum, N_row_var, pct_row_sum, pct_row_var
    for view_prefix, builder_name in (('N_row_', 'N_rows'),
                                      ('pct_row_', 'pct_rows')):
        for rank in ('sum', 'var'):
            if view_prefix + rank in requested_views:
                build_views = getattr(make_views, builder_name)
                all_views = build_views(net, send_df, all_views,
                                        dist_type=dist_type, rank_type=rank)

    if sim_mat is True:
        print('make similarity matrices of rows and columns, add to viz data structure')
        sim_net = make_sim_mat.main(net, inst_dm, filter_sim)

        net.sim = {}
        for axis in ('row', 'col'):
            net.sim[axis] = sim_net[axis].viz

    net.viz['views'] = all_views
def main(net, inst_dm, filter_sim, sim_mat_views=['N_row_sum']):
    '''
    Build a clustered similarity network, with optional filtered views, for
    the rows and for the columns of net.

    For each axis, inst_dm[axis] is converted with dm_to_sim
    (make_squareform=True, filter_sim=filter_sim) and stored as the 'mat' of
    a fresh Network. That network's row and column nodes / node_info are
    both taken from the same axis of net. After clustering, 'N_row_sum'
    views are generated through make_views.N_rows when requested in
    sim_mat_views, and the list of views is stored in the network's
    viz['views'].

    Returns {'row': Network, 'col': Network}.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust, make_views

    axes = ['row', 'col']

    # all similarity matrices are computed before any network is created
    sim_dict = {}
    for axis in axes:
        sim_dict[axis] = dm_to_sim(inst_dm[axis], make_squareform=True,
                                   filter_sim=filter_sim)

    sim_net = {}
    for axis in axes:
        axis_net = deepcopy(Network())
        sim_net[axis] = axis_net

        axis_net.dat['mat'] = sim_dict[axis]

        # a similarity matrix has the same nodes on both sides
        for side in ('row', 'col'):
            axis_net.dat['nodes'][side] = net.dat['nodes'][axis]
        for side in ('row', 'col'):
            axis_net.dat['node_info'][side] = net.dat['node_info'][axis]

        calc_clust.cluster_row_and_col(axis_net)

        # filtered views of this axis' similarity matrix
        views = []
        send_df = deepcopy(axis_net.dat_to_df())

        if 'N_row_sum' in sim_mat_views:
            views = make_views.N_rows(net, send_df, views, dist_type='cos',
                                      rank_type='sum')

        axis_net.viz['views'] = views

    return sim_net
def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True,
               requested_views=['pct_row_sum', 'N_row_sum'],
               linkage_type='average', sim_mat=False):
    '''
    Cluster net and attach additional filtered views of the clustergram.

    The network's data is filtered with run_filter.df_filter_row and
    run_filter.df_filter_col against a small threshold, written back to net
    and clustered. Each view named in requested_views ('N_row_sum',
    'N_row_var', 'pct_row_sum', 'pct_row_var') is generated through
    make_views and the collected list is stored in net.viz['views'].

    sim_mat only triggers a console message in this version; no similarity
    matrices are computed here.
    '''
    from copy import deepcopy
    import calc_clust
    import run_filter
    import make_views
    import scipy

    threshold = 0.0001

    filtered_df = run_filter.df_filter_row(net.dat_to_df(), threshold)
    filtered_df = run_filter.df_filter_col(filtered_df, threshold)

    # initial view: no row filtering beyond the threshold above
    net.df_to_dat(filtered_df)

    calc_clust.cluster_row_and_col(
        net,
        dist_type=dist_type,
        linkage_type=linkage_type,
        run_clustering=run_clustering,
        dendro=dendro,
        ignore_cat=False)

    all_views = []
    send_df = deepcopy(filtered_df)

    # evaluation order: N_row_sum, N_row_var, pct_row_sum, pct_row_var
    for view_prefix, builder_name in (('N_row_', 'N_rows'),
                                      ('pct_row_', 'pct_rows')):
        for rank in ('sum', 'var'):
            if view_prefix + rank in requested_views:
                build_views = getattr(make_views, builder_name)
                all_views = build_views(net, send_df, all_views,
                                        dist_type=dist_type, rank_type=rank)

    if sim_mat is True:
        print('make similarity matrices of rows and columns, add to viz data structure')

    net.viz['views'] = all_views
def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True,
               requested_views=['pct_row_sum', 'N_row_sum'],
               linkage_type='average', sim_mat=False, filter_sim=0.1,
               calc_cat_pval=False, sim_mat_views=['N_row_sum'],
               run_enrichr=None):
    '''
    Cluster net and attach additional filtered views of the clustergram.

    Steps, in order:
      1. export net to DataFrames and filter rows and columns by sum
         against a small threshold
      2. when run_enrichr is not None, pass the data through
         enrichr_functions.add_enrichr_cats for the 'row' axis
      3. write the data back to net and cluster it
      4. build every view named in requested_views ('N_row_sum',
         'N_row_var', 'pct_row_sum', 'pct_row_var') through make_views
      5. when sim_mat is True, build row/column similarity networks with
         make_sim_mat.main and store their viz data in net.sim
      6. store the list of views in net.viz['views']
    '''
    from copy import deepcopy
    import scipy
    import calc_clust, run_filter, make_views, make_sim_mat, cat_pval
    import enrichr_functions as enr_fun

    threshold = 0.0001

    filtered_df = net.dat_to_df()
    filtered_df = run_filter.df_filter_row_sum(filtered_df, threshold)
    filtered_df = run_filter.df_filter_col_sum(filtered_df, threshold)

    if run_enrichr is not None:
        filtered_df = enr_fun.add_enrichr_cats(filtered_df, 'row', run_enrichr)

    # initial view: no row filtering beyond the threshold above
    net.df_to_dat(filtered_df)

    inst_dm = calc_clust.cluster_row_and_col(
        net,
        dist_type=dist_type,
        linkage_type=linkage_type,
        run_clustering=run_clustering,
        dendro=dendro,
        ignore_cat=False,
        calc_cat_pval=calc_cat_pval)

    all_views = []
    send_df = deepcopy(filtered_df)

    # evaluation order: N_row_sum, N_row_var, pct_row_sum, pct_row_var
    for view_prefix, builder_name in (('N_row_', 'N_rows'),
                                      ('pct_row_', 'pct_rows')):
        for rank in ('sum', 'var'):
            if view_prefix + rank in requested_views:
                build_views = getattr(make_views, builder_name)
                all_views = build_views(net, send_df, all_views,
                                        dist_type=dist_type, rank_type=rank)

    if sim_mat is True:
        print('make similarity matrices of rows and columns, add to viz data structure')
        sim_net = make_sim_mat.main(net, inst_dm, filter_sim, sim_mat_views)

        net.sim = {}
        for axis in ('row', 'col'):
            net.sim[axis] = sim_net[axis].viz

    net.viz['views'] = all_views
def calc_cat_clust_order(net, inst_rc):
    '''
    For every 'cat-*' key in net.dat['node_info'][inst_rc], compute an
    ordering of the nodes on that axis grouped by category and store it
    under '<cat_N>_index' (the key with '-' replaced by '_').

    Categories are visited in sorted order. For each category a temporary
    Network restricted to that category's nodes is built and passed to
    calc_clust.cluster_row_and_col. The clustering result is not read back:
    nodes keep their positional order inside each category, offset by the
    number of nodes already placed.

    inst_rc is 'row' or 'col'. Nothing is returned; net is updated in place.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust

    # snapshot of the category keys; new '*_index' keys are added below
    cat_titles = [key for key in net.dat['node_info'][inst_rc].keys()
                  if 'cat-' in key]

    for inst_name_cat in cat_titles:

        underscored = inst_name_cat.replace('-', '_')
        dict_cat = net.dat['node_info'][inst_rc]['dict_' + underscored]

        # positions and names are filled in parallel, category by category
        # (category order only, not the clustering order within a category)
        all_cat_orders = []
        tmp_names_list = []

        for inst_cat in sorted(dict_cat.keys()):

            inst_nodes = dict_cat[inst_cat]
            tmp_names_list.extend(inst_nodes)

            # temporary network holding only this category's nodes
            cat_net = deepcopy(Network())
            cat_net.dat['mat'] = deepcopy(net.dat['mat'])
            cat_net.dat['nodes'] = deepcopy(net.dat['nodes'])

            cat_df = cat_net.dat_to_df()

            sub_df = {}
            if inst_rc == 'col':
                sub_df['mat'] = cat_df['mat'][inst_nodes]
            elif inst_rc == 'row':
                # select rows by transposing, selecting columns, and
                # transposing back
                cat_df['mat'] = cat_df['mat'].transpose()
                sub_df['mat'] = cat_df['mat'][inst_nodes].transpose()

            # load back to dat
            cat_net.df_to_dat(sub_df)

            try:
                calc_clust.cluster_row_and_col(cat_net, 'cos')
            except:
                # a failed clustering is ignored; the positional order
                # below is used either way
                pass
            inst_cat_order = range(len(cat_net.dat['nodes'][inst_rc]))

            # shift this category's positions past the ones already placed
            offset = len(all_cat_orders)
            all_cat_orders.extend([pos + offset for pos in inst_cat_order])

        names_clust_list = [
            name for (_, name) in sorted(zip(all_cat_orders, tmp_names_list))
        ]

        # position of each node of net within the category-grouped order
        final_order = [names_clust_list.index(node_name)
                       for node_name in net.dat['nodes'][inst_rc]]

        net.dat['node_info'][inst_rc][underscored + '_index'] = final_order
def make_clust(net, dist_type='cosine', run_clustering=True, dendro=True,
               requested_views=['pct_row_sum', 'N_row_sum'],
               linkage_type='average', sim_mat=False):
    '''
    Compute the main clustering of net plus the requested filtered views.

    net.dat is exported to DataFrames, filtered with
    run_filter.df_filter_row / run_filter.df_filter_col (threshold 0.0001),
    loaded back and clustered with calc_clust.cluster_row_and_col. The views
    named in requested_views are then produced by make_views.N_rows and
    make_views.pct_rows (rank types 'sum' and 'var') and saved as
    net.viz['views'].

    In this version sim_mat=True only prints a message; similarity matrices
    are not generated.
    '''
    from copy import deepcopy
    import calc_clust
    import run_filter
    import make_views
    import scipy

    threshold = 0.0001

    df = net.dat_to_df()
    df = run_filter.df_filter_row(df, threshold)
    df = run_filter.df_filter_col(df, threshold)

    # calculate initial view with no row filtering
    net.df_to_dat(df)

    cluster_options = dict(dist_type=dist_type, linkage_type=linkage_type,
                           run_clustering=run_clustering, dendro=dendro,
                           ignore_cat=False)
    calc_clust.cluster_row_and_col(net, **cluster_options)

    all_views = []
    send_df = deepcopy(df)

    # (view name, make_views function name, rank_type), in evaluation order
    view_specs = (
        ('N_row_sum', 'N_rows', 'sum'),
        ('N_row_var', 'N_rows', 'var'),
        ('pct_row_sum', 'pct_rows', 'sum'),
        ('pct_row_var', 'pct_rows', 'var'),
    )
    for view_name, builder_name, rank in view_specs:
        if view_name in requested_views:
            builder = getattr(make_views, builder_name)
            all_views = builder(net, send_df, all_views, dist_type=dist_type,
                                rank_type=rank)

    if sim_mat is True:
        print('make similarity matrices of rows and columns, add to viz data structure')

    net.viz['views'] = all_views
def N_rows(net, df, all_views, dist_type='cosine', rank_type='sum'):
    '''
    Append "keep the top N rows" views to all_views and return it.

    Rows of df['mat'] are ranked by the absolute value of their sum
    (rank_type='sum') or variance (rank_type='var') across columns, largest
    first. For 'all' and for every N in keep_top that is smaller than the
    number of rows, a copy of df is reduced to the N top-ranked rows,
    columns are filtered with run_filter.df_filter_col, and the result is
    clustered in a fresh Network.

    net       -- accepted for interface compatibility; not used here
    df        -- dict of DataFrames; 'mat' is required, 'mat_up'/'mat_dn'
                 are subset too when 'mat_up' is present
    all_views -- list that receives one dict per successfully built view
    dist_type -- passed positionally to calc_clust.cluster_row_and_col
    rank_type -- 'sum' or 'var'; also used in the view key
                 'N_row_<rank_type>'

    Raises ValueError for any other rank_type. A view that cannot be
    clustered is skipped (best effort).
    '''
    from copy import deepcopy
    from __init__ import Network
    import calc_clust, run_filter

    keep_top = ['all', 500, 400, 300, 200, 100, 80, 60, 40, 20, 10]

    # After the transpose each column is one original row, so reducing
    # along axis 0 gives one score per row. The matrix is only read here,
    # so no defensive copy is needed.
    by_row = df['mat'].transpose()
    if rank_type == 'sum':
        row_score = by_row.sum(axis=0)
    elif rank_type == 'var':
        row_score = by_row.var(axis=0)
    else:
        # previously this fell through and failed on an unbound local
        raise ValueError(
            "rank_type must be 'sum' or 'var', got %r" % (rank_type,))

    row_score = row_score.abs().sort_values(ascending=False)
    rows_sorted = row_score.index.values.tolist()
    num_rows = len(rows_sorted)

    for inst_keep in keep_top:

        # Check for 'all' before the numeric comparison: ordering a str
        # against an int raises TypeError on Python 3.
        if inst_keep != 'all' and inst_keep >= num_rows:
            continue

        # only copy once we know the view will actually be built
        tmp_df = deepcopy(df)
        tmp_net = deepcopy(Network())

        if inst_keep != 'all':
            keep_rows = rows_sorted[0:inst_keep]

            # label-based selection; .ix has been removed from pandas
            tmp_df['mat'] = tmp_df['mat'].loc[keep_rows]
            if 'mat_up' in tmp_df:
                tmp_df['mat_up'] = tmp_df['mat_up'].loc[keep_rows]
                tmp_df['mat_dn'] = tmp_df['mat_dn'].loc[keep_rows]

            tmp_df = run_filter.df_filter_col(tmp_df, 0.001)

        tmp_net.df_to_dat(tmp_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=True)
            except Exception:
                # fall back to computing the view without clustering
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=False)

            # add view
            inst_view = {}
            inst_view['N_row_' + rank_type] = inst_keep
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']
            all_views.append(inst_view)

        except Exception:
            # best effort: a view that cannot be built is simply omitted
            pass

    return all_views