def df_filter_row_sum(df, threshold, take_abs=True):
    ''' Keep only the rows of df['mat'] whose sum exceeds threshold.

    Args:
      df: dict holding the DataFrame 'mat' and, optionally, 'mat_up' and
        'mat_dn' (both are subset whenever 'mat_up' is present).
      threshold: a row is kept when the absolute value of its sum is
        strictly greater than this number.
      take_abs: when True the sum is taken over absolute values; otherwise
        the signed values are summed and the absolute value of that sum is
        compared against threshold.

    Returns:
      The same dict object, modified in place. When no row is dropped the
      matrices are left untouched; otherwise they are replaced by the result
      of grab_df_subset(..., keep_rows=...) with the kept labels sorted.
    '''
    mat = df['mat']
    values = mat.abs() if take_abs is True else mat

    # compare |row sum| so that strongly negative rows are kept as well
    row_sums = values.sum(axis=1).abs()
    keep_rows = sorted(row_sums[row_sums > threshold].index.values.tolist())

    # only rebuild the matrices when at least one row is actually dropped
    if len(keep_rows) < len(mat.index):
        df['mat'] = grab_df_subset(mat, keep_rows=keep_rows)

        if 'mat_up' in df:
            df['mat_up'] = grab_df_subset(df['mat_up'], keep_rows=keep_rows)
            df['mat_dn'] = grab_df_subset(df['mat_dn'], keep_rows=keep_rows)

    return df
def df_filter_col_sum(df, threshold, take_abs=True):
    ''' Drop low-sum columns of df['mat'], then rows whose sum is not positive.

    Args:
      df: dict holding the DataFrame 'mat' and, optionally, 'mat_up' and
        'mat_dn' (both are subset whenever 'mat_up' is present).
      threshold: a column is kept when its sum is strictly greater than
        this number.
      take_abs: when True the sums are taken over absolute values and the
        original (signed) matrices are subset with grab_df_subset. When
        False the signed values are summed directly.

    Returns:
      The same dict object, modified in place.

    Rows are dropped when their sum over the kept columns is not greater
    than zero. With take_abs=True that means all-zero rows; with
    take_abs=False it also removes rows whose signed sum is zero or negative.
    '''
    mat = df['mat']
    values = mat.abs() if take_abs is True else mat

    # columns first, then rows evaluated on the surviving columns only
    col_sums = values.sum(axis=0)
    values = values.loc[:, (col_sums > threshold).values]
    row_sums = values.sum(axis=1)
    values = values.loc[(row_sums > 0).values]

    inst_rows = values.index.tolist()
    inst_cols = values.columns.tolist()

    if take_abs is True:
        # `values` holds absolute values, so subset the signed original
        df['mat'] = grab_df_subset(mat, inst_rows, inst_cols)
    else:
        df['mat'] = values

    # keep the up/down matrices aligned with 'mat' in both modes
    if 'mat_up' in df:
        df['mat_up'] = grab_df_subset(df['mat_up'], inst_rows, inst_cols)
        df['mat_dn'] = grab_df_subset(df['mat_dn'], inst_rows, inst_cols)

    return df
def main(net, inst_dm, filter_sim):
    ''' Build one clustered similarity network per axis.

    For 'row' and 'col', inst_dm[axis] is converted with dm_to_sim
    (make_squareform=True, filter_sim passed through). The result becomes
    the 'mat' of a fresh Network whose row AND column nodes / node_info are
    both taken from that axis of net. Each network is then passed to
    calc_clust.cluster_row_and_col.

    Returns:
      dict with keys 'row' and 'col' mapping to the new networks.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust

    axes = ['row', 'col']

    # convert both distance matrices before any network is created
    sim_dict = {
        axis: dm_to_sim(inst_dm[axis], make_squareform=True,
                        filter_sim=filter_sim)
        for axis in axes
    }

    sim_net = {}
    for axis in axes:
        axis_net = deepcopy(Network())
        sim_net[axis] = axis_net

        axis_net.dat['mat'] = sim_dict[axis]

        # a similarity matrix has the same labels on both of its axes
        for side in ('row', 'col'):
            axis_net.dat['nodes'][side] = net.dat['nodes'][axis]
        for side in ('row', 'col'):
            axis_net.dat['node_info'][side] = net.dat['node_info'][axis]

        calc_clust.cluster_row_and_col(axis_net)

    return sim_net
def N_rows(net, df, all_views, dist_type='cosine', rank_type='sum'):
    ''' Append one clustered view per top-N row cutoff to all_views.

    Rows are ranked with run_filter.get_sorted_rows(df['mat'], rank_type).
    For 'all' and for every numeric cutoff smaller than the number of ranked
    rows, a copy of df is reduced to the top rows, passed through
    run_filter.df_filter_col_sum(..., 0.001), loaded into a fresh Network
    and clustered.

    Args:
      net: not used by this function; kept for interface compatibility.
      df: dict with DataFrame 'mat' and optionally 'mat_up'/'mat_dn' and
        'mat_orig'. It is deep-copied, never modified.
      all_views: list that receives the new view dicts.
      dist_type: forwarded to calc_clust.cluster_row_and_col.
      rank_type: ranking passed to get_sorted_rows; also used in the view
        key 'N_row_<rank_type>'.

    Returns:
      all_views (the same list object).
    '''
    from copy import deepcopy
    from __init__ import Network
    import calc_clust
    import run_filter

    keep_top = ['all', 500, 250, 100, 50, 20, 10]

    rows_sorted = run_filter.get_sorted_rows(df['mat'], rank_type)
    num_rows = len(rows_sorted)

    for inst_keep in keep_top:
        # Check the 'all' sentinel before any numeric comparison: ordering a
        # str against an int raises TypeError on Python 3.
        if inst_keep != 'all' and inst_keep >= num_rows:
            continue

        tmp_df = deepcopy(df)
        tmp_net = deepcopy(Network())

        if inst_keep != 'all':
            keep_rows = rows_sorted[0:inst_keep]

            # label-based selection (.ix has been removed from pandas)
            tmp_df['mat'] = tmp_df['mat'].loc[keep_rows]
            if 'mat_up' in tmp_df:
                tmp_df['mat_up'] = tmp_df['mat_up'].loc[keep_rows]
                tmp_df['mat_dn'] = tmp_df['mat_dn'].loc[keep_rows]
            if 'mat_orig' in tmp_df:
                tmp_df['mat_orig'] = tmp_df['mat_orig'].loc[keep_rows]

            tmp_df = run_filter.df_filter_col_sum(tmp_df, 0.001)

        tmp_net.df_to_dat(tmp_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=True)
            except Exception:
                # retry with clustering disabled
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=False)

            # add view
            # NOTE(review): 'dist' is always 'cos', whatever dist_type is
            inst_view = {
                'N_row_' + rank_type: inst_keep,
                'dist': 'cos',
                'nodes': {
                    'row_nodes': tmp_net.viz['row_nodes'],
                    'col_nodes': tmp_net.viz['col_nodes'],
                },
            }
            all_views.append(inst_view)
        except Exception:
            # best effort: a cutoff that cannot be processed is skipped
            pass

    return all_views
def pct_rows(net, df, all_views, dist_type, rank_type):
    ''' Append one clustered view per row-sum percentage cutoff to all_views.

    The cutoffs are 0.0, 0.1, ..., 0.9 times the largest row sum of
    df['mat']. For each one a deep copy of df is filtered with
    run_filter.df_filter_row(..., cutoff, take_abs=False), loaded into a
    fresh Network and clustered.

    Args:
      net: not used by this function; kept for interface compatibility.
      df: dict with DataFrame 'mat'. It is deep-copied, never modified here.
      all_views: list that receives the new view dicts.
      dist_type: forwarded to calc_clust.cluster_row_and_col.
      rank_type: only used in the view key 'pct_row_<rank_type>'; the
        cutoff itself is always derived from row sums.

    Returns:
      all_views (the same list object).
    '''
    from __init__ import Network
    from copy import deepcopy
    import numpy as np
    import calc_clust
    import run_filter

    all_filt = [i / float(10) for i in range(10)]

    sum_row = np.sum(df['mat'], axis=1)
    max_sum = max(sum_row)

    for inst_filt in all_filt:
        cutoff = inst_filt * max_sum

        inst_df = run_filter.df_filter_row(deepcopy(df), cutoff,
                                           take_abs=False)

        tmp_net = deepcopy(Network())
        tmp_net.df_to_dat(inst_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net, dist_type=dist_type,
                                               run_clustering=True)
            except Exception:
                # retry with clustering disabled
                calc_clust.cluster_row_and_col(tmp_net, dist_type=dist_type,
                                               run_clustering=False)

            # NOTE(review): 'dist' is always 'cos', whatever dist_type is
            inst_view = {
                'pct_row_' + rank_type: inst_filt,
                'dist': 'cos',
                'nodes': {
                    'row_nodes': tmp_net.viz['row_nodes'],
                    'col_nodes': tmp_net.viz['col_nodes'],
                },
            }
            all_views.append(inst_view)
        except Exception:
            # best effort: a cutoff that cannot be processed is skipped
            pass

    return all_views
def calc_cat_clust_order(net, inst_rc):
    ''' Store a category-grouped node order for every category of one axis.

    For each key of net.dat['node_info'][inst_rc] that contains 'cat-', the
    matching 'dict_<key with - replaced by _>' entry is read as a mapping
    from category value to node names. Nodes are listed category by category
    (category values sorted). For every node of the axis, its position in
    that list is written to
    net.dat['node_info'][inst_rc]['<key with - replaced by _>_index'].

    Args:
      net: network whose dat['mat'], dat['nodes'] and dat['node_info'] are
        read; dat['node_info'][inst_rc] is updated in place.
      inst_rc: 'row' or 'col'.

    Returns:
      None.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust

    node_info = net.dat['node_info'][inst_rc]
    cat_names = [x for x in node_info.keys() if 'cat-' in x]

    for inst_name_cat in cat_names:
        cat_key = inst_name_cat.replace('-', '_')
        dict_cat = node_info['dict_' + cat_key]

        # ordering of the nodes based on their category, one category after
        # the other
        all_cat_orders = []
        tmp_names_list = []

        for inst_cat in sorted(dict_cat.keys()):
            inst_nodes = dict_cat[inst_cat]
            tmp_names_list.extend(inst_nodes)

            # sub-network restricted to the nodes of this category
            cat_net = deepcopy(Network())
            cat_net.dat['mat'] = deepcopy(net.dat['mat'])
            cat_net.dat['nodes'] = deepcopy(net.dat['nodes'])
            cat_df = cat_net.dat_to_df()

            sub_df = {}
            if inst_rc == 'col':
                sub_df['mat'] = cat_df['mat'][inst_nodes]
            elif inst_rc == 'row':
                # select rows through the transposed frame
                sub_df['mat'] = \
                    cat_df['mat'].transpose()[inst_nodes].transpose()

            # load back to dat
            cat_net.df_to_dat(sub_df)

            try:
                calc_clust.cluster_row_and_col(cat_net, 'cos')
            except Exception:
                # clustering is best effort here
                pass

            # NOTE(review): the clustering result is not read; nodes keep
            # their existing order within the category - confirm intended
            inst_cat_order = range(len(cat_net.dat['nodes'][inst_rc]))

            # shift by the number of positions already assigned
            prev_order_len = len(all_cat_orders)
            all_cat_orders.extend(i + prev_order_len for i in inst_cat_order)

        names_clust_list = [
            name for (_, name) in sorted(zip(all_cat_orders, tmp_names_list))
        ]

        # calc category-cluster order
        final_order = [
            names_clust_list.index(inst_node_name)
            for inst_node_name in net.dat['nodes'][inst_rc]
        ]

        net.dat['node_info'][inst_rc][cat_key + '_index'] = final_order
def main(real_net, vect_post):
    ''' Load a vector-post payload into real_net.

    vect_post['columns'] is a list of signatures, each with a 'col_name' and
    a 'data' list of entries carrying 'row_name' and 'val' (plus 'val_up'
    and 'val_dn' when vect_post['is_up_down'] is True). The sorted unique
    row and column names define a matrix that starts as NaN and is filled
    from the entries. The result is converted to a DataFrame dict, passed
    through proc_df_labels.main and loaded with real_net.df_to_dat.

    Args:
      real_net: network that receives the data through df_to_dat.
      vect_post: dict with 'columns' and optionally 'is_up_down'.

    Returns:
      None.
    '''
    import numpy as np
    from copy import deepcopy
    from __init__ import Network
    import proc_df_labels

    net = deepcopy(Network())

    sigs = vect_post['columns']

    row_names = set()
    sig_names = set()
    for inst_sig in sigs:
        sig_names.add(inst_sig['col_name'])
        row_names.update(entry['row_name'] for entry in inst_sig['data'])

    all_rows = sorted(row_names)
    all_sigs = sorted(sig_names)

    net.dat['nodes']['row'] = all_rows
    net.dat['nodes']['col'] = all_sigs

    shape = (len(all_rows), len(all_sigs))

    net.dat['mat'] = np.empty(shape)
    net.dat['mat'][:] = np.nan

    is_up_down = ('is_up_down' in vect_post
                  and vect_post['is_up_down'] is True)

    if is_up_down:
        net.dat['mat_up'] = np.empty(shape)
        net.dat['mat_up'][:] = np.nan

        net.dat['mat_dn'] = np.empty(shape)
        net.dat['mat_dn'][:] = np.nan

    # name -> position lookups avoid a list scan for every matrix cell
    row_pos = {name: i for i, name in enumerate(all_rows)}
    col_pos = {name: i for i, name in enumerate(all_sigs)}

    for inst_sig in sigs:
        col_index = col_pos[inst_sig['col_name']]

        for inst_row_data in inst_sig['data']:
            row_index = row_pos[inst_row_data['row_name']]

            net.dat['mat'][row_index, col_index] = inst_row_data['val']

            if is_up_down:
                net.dat['mat_up'][row_index, col_index] = \
                    inst_row_data['val_up']
                net.dat['mat_dn'][row_index, col_index] = \
                    inst_row_data['val_dn']

    tmp_df = net.dat_to_df()
    tmp_df = proc_df_labels.main(tmp_df)

    real_net.df_to_dat(tmp_df)
def N_rows(net, df, all_views, dist_type='cosine', rank_type='sum'):
    ''' Append one clustered view per top-N row cutoff to all_views.

    Rows of df['mat'] are ranked by the absolute value of their sum
    (rank_type='sum') or of their variance (rank_type='var'), largest first.
    For 'all' and for every numeric cutoff smaller than the number of rows,
    a copy of df is reduced to the top rows, passed through
    run_filter.df_filter_col(..., 0.001), loaded into a fresh Network and
    clustered.

    Args:
      net: not used by this function; kept for interface compatibility.
      df: dict with DataFrame 'mat' and optionally 'mat_up'/'mat_dn'. It is
        deep-copied, never modified.
      all_views: list that receives the new view dicts.
      dist_type: forwarded to calc_clust.cluster_row_and_col.
      rank_type: 'sum' or 'var'; also used in the view key
        'N_row_<rank_type>'.

    Returns:
      all_views (the same list object).

    Raises:
      ValueError: if rank_type is neither 'sum' nor 'var'.
    '''
    from copy import deepcopy
    from __init__ import Network
    import calc_clust
    import run_filter

    keep_top = ['all', 500, 400, 300, 200, 100, 80, 60, 40, 20, 10]

    if rank_type == 'sum':
        row_rank = df['mat'].sum(axis=1)
    elif rank_type == 'var':
        row_rank = df['mat'].var(axis=1)
    else:
        raise ValueError(
            "rank_type must be 'sum' or 'var', got %r" % (rank_type,))

    # absolute value keeps the most positive and the most negative rows
    row_rank = row_rank.abs().sort_values(ascending=False)
    rows_sorted = row_rank.index.values.tolist()
    num_rows = len(rows_sorted)

    for inst_keep in keep_top:
        # Check the 'all' sentinel before any numeric comparison: ordering a
        # str against an int raises TypeError on Python 3.
        if inst_keep != 'all' and inst_keep >= num_rows:
            continue

        tmp_df = deepcopy(df)
        tmp_net = deepcopy(Network())

        if inst_keep != 'all':
            keep_rows = rows_sorted[0:inst_keep]

            # label-based selection (.ix has been removed from pandas)
            tmp_df['mat'] = tmp_df['mat'].loc[keep_rows]
            if 'mat_up' in tmp_df:
                tmp_df['mat_up'] = tmp_df['mat_up'].loc[keep_rows]
                tmp_df['mat_dn'] = tmp_df['mat_dn'].loc[keep_rows]

            tmp_df = run_filter.df_filter_col(tmp_df, 0.001)

        tmp_net.df_to_dat(tmp_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=True)
            except Exception:
                # retry with clustering disabled
                calc_clust.cluster_row_and_col(tmp_net, dist_type,
                                               run_clustering=False)

            # add view
            # NOTE(review): 'dist' is always 'cos', whatever dist_type is
            inst_view = {
                'N_row_' + rank_type: inst_keep,
                'dist': 'cos',
                'nodes': {
                    'row_nodes': tmp_net.viz['row_nodes'],
                    'col_nodes': tmp_net.viz['col_nodes'],
                },
            }
            all_views.append(inst_view)
        except Exception:
            # best effort: a cutoff that cannot be processed is skipped
            pass

    return all_views