示例#1
0
def df_filter_row_sum(df, threshold, take_abs=True):
    ''' Keep only the rows of df['mat'] whose sum exceeds threshold.

    Each row is scored by the absolute value of its sum; when take_abs is
    True the sum is taken over the absolute values of the entries. Rows whose
    score is strictly greater than threshold are kept, in sorted label order.
    Columns are never removed by this function.

    Args:
        df: dict holding a DataFrame under 'mat' and, optionally, DataFrames
            under 'mat_up' and 'mat_dn'. It is modified in place.
        threshold: rows with a score <= threshold are dropped.
        take_abs: score rows on absolute values when True.

    Returns:
        The same df dict. 'mat' (and 'mat_up'/'mat_dn' when 'mat_up' is
        present) are replaced by row subsets only if at least one row was
        dropped; otherwise the dict is returned untouched.
    '''
    mat = df['mat']

    # .abs() already returns a new frame, so no defensive copy is needed
    scored = mat.abs() if take_abs is True else mat

    row_score = scored.sum(axis=1).abs()
    keep_rows = sorted(row_score[row_score > threshold].index.values.tolist())

    # only rebuild the matrices when something was actually filtered out
    if len(keep_rows) < len(mat.index):
        df['mat'] = grab_df_subset(df['mat'], keep_rows=keep_rows)

        if 'mat_up' in df:
            df['mat_up'] = grab_df_subset(df['mat_up'], keep_rows=keep_rows)
            df['mat_dn'] = grab_df_subset(df['mat_dn'], keep_rows=keep_rows)

    return df
示例#2
0
def df_filter_col_sum(df, threshold, take_abs=True):
    ''' Keep columns of df['mat'] whose sum exceeds threshold, then drop
    rows whose remaining sum is not positive.

    When take_abs is True both sums are taken over absolute values, so the
    row step removes rows that are entirely zero in the kept columns. When
    take_abs is not True the raw sums are used, so a row with a zero or
    negative sum is dropped as well.

    Args:
        df: dict holding a DataFrame under 'mat' and, optionally, DataFrames
            under 'mat_up' and 'mat_dn'. It is modified in place.
        threshold: columns with a sum <= threshold are dropped.
        take_abs: sum absolute values when True.

    Returns:
        The same df dict. With take_abs True, 'mat' (and 'mat_up'/'mat_dn'
        when 'mat_up' is present) are subset to the kept rows and columns.
        Otherwise only 'mat' is replaced by the filtered frame; 'mat_up' and
        'mat_dn' are left as they were.
    '''
    mat = df['mat']

    # .abs() and the boolean selections below all return new frames, so the
    # caller's matrix is never mutated
    scored = mat.abs() if take_abs is True else mat

    scored = scored.loc[:, scored.sum(axis=0) > threshold]
    scored = scored[scored.sum(axis=1) > 0]

    if take_abs is True:
        # scoring was done on absolute values; pull the signed values back
        # out of the original matrices
        inst_rows = scored.index.tolist()
        inst_cols = scored.columns.tolist()
        df['mat'] = grab_df_subset(df['mat'], inst_rows, inst_cols)

        if 'mat_up' in df:
            df['mat_up'] = grab_df_subset(df['mat_up'], inst_rows, inst_cols)
            df['mat_dn'] = grab_df_subset(df['mat_dn'], inst_rows, inst_cols)

    else:
        df['mat'] = scored

    return df
示例#3
0
def main(net, inst_dm, filter_sim):
    ''' Build and cluster one similarity network per axis.

    For each of 'row' and 'col', inst_dm[axis] is converted with dm_to_sim
    (square form, using filter_sim) and loaded into a fresh Network whose
    row and column nodes/node_info are both copied from that axis of net.
    Each network is then passed to calc_clust.cluster_row_and_col.

    Returns a dict with keys 'row' and 'col' mapping to those networks.
    '''
    from __init__ import Network
    from copy import deepcopy
    import calc_clust

    axes = ['row', 'col']

    # convert both axes before any network is built
    sim_mats = {
        axis: dm_to_sim(inst_dm[axis],
                        make_squareform=True,
                        filter_sim=filter_sim)
        for axis in axes
    }

    sim_net = {}

    for axis in axes:
        axis_net = deepcopy(Network())
        sim_net[axis] = axis_net

        axis_net.dat['mat'] = sim_mats[axis]

        # a similarity matrix is square: both sides carry the same labels
        for side in ('row', 'col'):
            axis_net.dat['nodes'][side] = net.dat['nodes'][axis]

        for side in ('row', 'col'):
            axis_net.dat['node_info'][side] = net.dat['node_info'][axis]

        calc_clust.cluster_row_and_col(axis_net)

    return sim_net
示例#4
0
def N_rows(net, df, all_views, dist_type='cosine', rank_type='sum'):
    ''' Append one clustered view per top-N row cutoff to all_views.

    Rows are ranked by run_filter.get_sorted_rows(df['mat'], rank_type). A
    view is built for 'all' rows and for each numeric cutoff that is strictly
    smaller than the number of rows. For numeric cutoffs the top rows are
    selected from 'mat' (and 'mat_up'/'mat_dn', 'mat_orig' when present) and
    the result is passed through run_filter.df_filter_col_sum with a
    threshold of 0.001 before clustering.

    Args:
        net: not used by this function; kept for interface compatibility.
        df: dict of DataFrames; it is deep-copied for every view and never
            modified.
        all_views: list the new view dicts are appended to.
        dist_type: forwarded to calc_clust.cluster_row_and_col.
        rank_type: forwarded to get_sorted_rows and used in the view key
            'N_row_<rank_type>'.

    Returns:
        all_views, with the new views appended. Each view records 'dist' as
        'cos' regardless of dist_type. A view that cannot be built is skipped
        silently.
    '''
    from copy import deepcopy
    from __init__ import Network
    import calc_clust
    import run_filter

    keep_top = ['all', 500, 250, 100, 50, 20, 10]

    rows_sorted = run_filter.get_sorted_rows(df['mat'], rank_type)

    for inst_keep in keep_top:

        keep_all = inst_keep == 'all'

        # check for 'all' before comparing: str < int raises TypeError on
        # Python 3
        if not keep_all and not inst_keep < len(rows_sorted):
            continue

        # copy only for cutoffs that are actually used
        tmp_df = deepcopy(df)
        tmp_net = deepcopy(Network())

        if not keep_all:

            keep_rows = rows_sorted[0:inst_keep]

            # label-based selection; .ix was removed from pandas
            tmp_df['mat'] = tmp_df['mat'].loc[keep_rows]
            if 'mat_up' in tmp_df:
                tmp_df['mat_up'] = tmp_df['mat_up'].loc[keep_rows]
                tmp_df['mat_dn'] = tmp_df['mat_dn'].loc[keep_rows]
            if 'mat_orig' in tmp_df:
                tmp_df['mat_orig'] = tmp_df['mat_orig'].loc[keep_rows]

            tmp_df = run_filter.df_filter_col_sum(tmp_df, 0.001)

        tmp_net.df_to_dat(tmp_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type,
                                               run_clustering=True)
            except Exception:
                # fall back to building the view without clustering
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type,
                                               run_clustering=False)

            # add view
            inst_view = {}
            inst_view['N_row_' + rank_type] = inst_keep
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']
            all_views.append(inst_view)

        except Exception:
            # best effort: a view that cannot be built is left out, but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            pass

    return all_views
示例#5
0
def pct_rows(net, df, all_views, dist_type, rank_type):
    ''' Append one clustered view per row-sum percentage cutoff to all_views.

    Ten cutoffs are used: 0.0, 0.1, ..., 0.9 times the largest row sum of
    df['mat']. For each cutoff a deep copy of df is filtered with
    run_filter.df_filter_row(copy, cutoff, take_abs=False), loaded into a
    fresh Network and clustered.

    Args:
        net: not used by this function; kept for interface compatibility.
        df: dict of DataFrames; it is deep-copied for every view and never
            modified.
        all_views: list the new view dicts are appended to.
        dist_type: forwarded to calc_clust.cluster_row_and_col.
        rank_type: only used to build the view key 'pct_row_<rank_type>'.

    Returns:
        all_views, with the new views appended. Each view records 'dist' as
        'cos' regardless of dist_type. A view that cannot be built is skipped
        silently.
    '''
    from __init__ import Network
    from copy import deepcopy
    import numpy as np
    import calc_clust
    import run_filter

    all_filt = [i / float(10) for i in range(10)]

    # summing does not modify the matrix, so no copy is needed here
    sum_row = np.sum(df['mat'], axis=1)
    max_sum = max(sum_row)

    for inst_filt in all_filt:

        cutoff = inst_filt * max_sum
        inst_df = deepcopy(df)
        inst_df = run_filter.df_filter_row(inst_df, cutoff, take_abs=False)

        tmp_net = deepcopy(Network())
        tmp_net.df_to_dat(inst_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type=dist_type,
                                               run_clustering=True)

            except Exception:
                # fall back to building the view without clustering
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type=dist_type,
                                               run_clustering=False)

            inst_view = {}
            inst_view['pct_row_' + rank_type] = inst_filt
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']

            all_views.append(inst_view)

        except Exception:
            # best effort: a view that cannot be built is left out, but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            pass

    return all_views
示例#6
0
def calc_cat_clust_order(net, inst_rc):
  ''' Store a category-grouped node order for every 'cat-' key of an axis.

  For each key of net.dat['node_info'][inst_rc] containing 'cat-', the
  matching 'dict_cat_N' entry (category value -> list of node names) is
  walked in sorted category order. Nodes are numbered consecutively in that
  walk, and the position of every node of net.dat['nodes'][inst_rc] in that
  numbering is written to node_info under '<cat_N>_index'.

  A sub-network is built and clustered for each category, but the clustering
  result is not read back: the order inside a category is the node order of
  the sub-network after df_to_dat.

  Args:
    net: network whose dat['mat'], dat['nodes'] and dat['node_info'] are
      read; dat['node_info'][inst_rc] is updated in place.
    inst_rc: 'row' or 'col'.

  Returns:
    None.
  '''
  from __init__ import Network
  from copy import deepcopy
  import calc_clust

  node_info = net.dat['node_info'][inst_rc]

  # materialise the key list first: node_info gains keys inside the loop
  cat_titles = [x for x in node_info.keys() if 'cat-' in x]

  for inst_name_cat in cat_titles:

    tmp_name = 'dict_' + inst_name_cat.replace('-', '_')
    dict_cat = node_info[tmp_name]

    # this is the ordering of the nodes based on their category, not
    # including their clustering ordering within category
    all_cat_orders = []
    tmp_names_list = []
    for inst_cat in sorted(dict_cat.keys()):

      inst_nodes = dict_cat[inst_cat]

      tmp_names_list.extend(inst_nodes)

      cat_net = deepcopy(Network())

      cat_net.dat['mat'] = deepcopy(net.dat['mat'])
      cat_net.dat['nodes'] = deepcopy(net.dat['nodes'])

      cat_df = cat_net.dat_to_df()

      sub_df = {}
      if inst_rc == 'col':
        sub_df['mat'] = cat_df['mat'][inst_nodes]
      elif inst_rc == 'row':
        # select rows by transposing, taking columns, and transposing back
        cat_df['mat'] = cat_df['mat'].transpose()
        sub_df['mat'] = cat_df['mat'][inst_nodes]
        sub_df['mat'] = sub_df['mat'].transpose()

      # load back to dat
      cat_net.df_to_dat(sub_df)

      try:
        calc_clust.cluster_row_and_col(cat_net, 'cos')
      except Exception:
        # best effort: the order below does not depend on the clustering
        pass

      inst_cat_order = range(len(cat_net.dat['nodes'][inst_rc]))

      prev_order_len = len(all_cat_orders)

      # add prev order length to the current order number
      inst_cat_order = [i + prev_order_len for i in inst_cat_order]
      all_cat_orders.extend(inst_cat_order)

    names_clust_list = [x for (y, x) in sorted(zip(all_cat_orders,
                        tmp_names_list))]

    # calc category-cluster order
    final_order = []

    for inst_node_name in net.dat['nodes'][inst_rc]:
      final_order.append(names_clust_list.index(inst_node_name))

    node_info[inst_name_cat.replace('-', '_') + '_index'] = final_order
示例#7
0
def main(real_net, vect_post):
    ''' Load a vector-post payload into real_net.

    vect_post['columns'] is a list of column records, each with a 'col_name'
    and a 'data' list of row records ('row_name', 'val', and 'val_up' /
    'val_dn' when vect_post['is_up_down'] is True). The sorted unique row and
    column names become the matrix axes; cells with no record stay NaN.

    The matrices are assembled in a temporary Network, converted with
    dat_to_df, passed through proc_df_labels.main and finally loaded into
    real_net with df_to_dat.

    Args:
        real_net: network that receives the data via df_to_dat.
        vect_post: payload described above.

    Returns:
        None.
    '''
    import numpy as np
    from copy import deepcopy
    from __init__ import Network
    import proc_df_labels

    net = deepcopy(Network())

    sigs = vect_post['columns']

    all_rows = []
    all_sigs = []
    for inst_sig in sigs:
        all_sigs.append(inst_sig['col_name'])
        all_rows.extend(inst_row_data['row_name']
                        for inst_row_data in inst_sig['data'])

    all_rows = sorted(set(all_rows))
    all_sigs = sorted(set(all_sigs))

    # name -> position tables replace list.index() in the fill loop, which
    # made filling quadratic in the number of labels
    row_pos = dict((name, i) for i, name in enumerate(all_rows))
    col_pos = dict((name, i) for i, name in enumerate(all_sigs))

    net.dat['nodes']['row'] = all_rows
    net.dat['nodes']['col'] = all_sigs

    mat_shape = (len(all_rows), len(all_sigs))
    net.dat['mat'] = np.full(mat_shape, np.nan)

    is_up_down = 'is_up_down' in vect_post and \
        vect_post['is_up_down'] is True

    if is_up_down:
        net.dat['mat_up'] = np.full(mat_shape, np.nan)
        net.dat['mat_dn'] = np.full(mat_shape, np.nan)

    for inst_sig in sigs:
        col_index = col_pos[inst_sig['col_name']]

        for inst_row_data in inst_sig['data']:
            row_index = row_pos[inst_row_data['row_name']]

            net.dat['mat'][row_index, col_index] = inst_row_data['val']

            if is_up_down:
                net.dat['mat_up'][row_index,
                                  col_index] = inst_row_data['val_up']
                net.dat['mat_dn'][row_index,
                                  col_index] = inst_row_data['val_dn']

    tmp_df = net.dat_to_df()
    tmp_df = proc_df_labels.main(tmp_df)

    real_net.df_to_dat(tmp_df)
示例#8
0
def N_rows(net, df, all_views, dist_type='cosine', rank_type='sum'):
    ''' Append one clustered view per top-N row cutoff to all_views.

    Rows of df['mat'] are ranked, largest first, by the absolute value of
    their sum (rank_type 'sum') or of their variance (rank_type 'var'). A
    view is built for 'all' rows and for each numeric cutoff that is strictly
    smaller than the number of rows. For numeric cutoffs the top rows are
    selected from 'mat' (and 'mat_up'/'mat_dn' when present) and the result
    is passed through run_filter.df_filter_col with a threshold of 0.001
    before clustering.

    Args:
        net: not used by this function; kept for interface compatibility.
        df: dict of DataFrames; it is deep-copied for every view and never
            modified.
        all_views: list the new view dicts are appended to.
        dist_type: forwarded to calc_clust.cluster_row_and_col.
        rank_type: 'sum' or 'var'; also used in the view key
            'N_row_<rank_type>'.

    Returns:
        all_views, with the new views appended. Each view records 'dist' as
        'cos' regardless of dist_type. A view that cannot be built is skipped
        silently.

    Raises:
        ValueError: if rank_type is neither 'sum' nor 'var'.
    '''
    from copy import deepcopy
    from __init__ import Network
    import calc_clust
    import run_filter

    keep_top = ['all', 500, 400, 300, 200, 100, 80, 60, 40, 20, 10]

    mat = df['mat']

    if rank_type == 'sum':
        row_rank = mat.sum(axis=1)
    elif rank_type == 'var':
        row_rank = mat.var(axis=1)
    else:
        # previously this fell through and failed on an unbound local
        raise ValueError(
            "rank_type must be 'sum' or 'var', got %r" % (rank_type,))

    row_rank = row_rank.abs().sort_values(ascending=False)
    rows_sorted = row_rank.index.values.tolist()

    for inst_keep in keep_top:

        keep_all = inst_keep == 'all'

        # check for 'all' before comparing: str < int raises TypeError on
        # Python 3
        if not keep_all and not inst_keep < len(rows_sorted):
            continue

        # copy only for cutoffs that are actually used
        tmp_df = deepcopy(df)
        tmp_net = deepcopy(Network())

        if not keep_all:

            keep_rows = rows_sorted[0:inst_keep]

            # label-based selection; .ix was removed from pandas
            tmp_df['mat'] = tmp_df['mat'].loc[keep_rows]

            if 'mat_up' in tmp_df:
                tmp_df['mat_up'] = tmp_df['mat_up'].loc[keep_rows]
                tmp_df['mat_dn'] = tmp_df['mat_dn'].loc[keep_rows]

            tmp_df = run_filter.df_filter_col(tmp_df, 0.001)

        tmp_net.df_to_dat(tmp_df)

        try:
            try:
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type,
                                               run_clustering=True)
            except Exception:
                # fall back to building the view without clustering
                calc_clust.cluster_row_and_col(tmp_net,
                                               dist_type,
                                               run_clustering=False)

            # add view
            inst_view = {}
            inst_view['N_row_' + rank_type] = inst_keep
            inst_view['dist'] = 'cos'
            inst_view['nodes'] = {}
            inst_view['nodes']['row_nodes'] = tmp_net.viz['row_nodes']
            inst_view['nodes']['col_nodes'] = tmp_net.viz['col_nodes']
            all_views.append(inst_view)

        except Exception:
            # best effort: a view that cannot be built is left out, but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            pass

    return all_views