def df_filter_row(df, threshold, take_abs=True):
    ''' filter rows in the matrix at some threshold
    and remove rows whose (absolute) sum falls below that threshold '''

    from copy import deepcopy
    from clustergrammer import Network
    net = Network()

    if take_abs is True:
      df_copy = deepcopy(df['mat'].abs())
    else:
      df_copy = deepcopy(df['mat'])

    ini_rows = df_copy.index.values.tolist()
    df_copy = df_copy.transpose()
    tmp_sum = df_copy.sum(axis=0)
    tmp_sum = tmp_sum.abs()
    tmp_sum.sort_values(inplace=True, ascending=False)

    tmp_sum = tmp_sum[tmp_sum > threshold]
    keep_rows = sorted(tmp_sum.index.values.tolist())

    if len(keep_rows) < len(ini_rows):
      df['mat'] = net.grab_df_subset(df['mat'], keep_rows=keep_rows)

      if 'mat_up' in df:
        df['mat_up'] = net.grab_df_subset(df['mat_up'], keep_rows=keep_rows)
        df['mat_dn'] = net.grab_df_subset(df['mat_dn'], keep_rows=keep_rows)

    return df
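# A usage sketch with hypothetical data; the function expects the matrix
# wrapped in a dict under 'mat', matching the net.dat_to_df() output shape:
import pandas as pd

toy = {'mat': pd.DataFrame([[1.0, -5.0], [0.1, 0.2]],
                           index=['row-a', 'row-b'],
                           columns=['col-x', 'col-y'])}
toy = df_filter_row(toy, threshold=2)  # keeps only row-a (abs-sum 6 > 2)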
def add_mutations(cl_info):
  print('add mutations\n')

  from clustergrammer import Network
  net = Network()
  old_cl_info = net.load_json_to_dict('cell_line_muts.json')

  cl_muts = old_cl_info['muts']

  for inst_cl in cl_info:

    # remove plex name if necessary
    if '_plex_' in inst_cl:
      simple_cl = inst_cl.split('_')[0]
    else:
      simple_cl = inst_cl

    for inst_mut in cl_muts:
      mutated_cls = cl_muts[inst_mut]

      if simple_cl in mutated_cls:
        has_mut = 'true'
      else:
        has_mut = 'false'

      mutation_title = 'mut-'+inst_mut

      # use the original long cell line name (with possible plex)
      cl_info[inst_cl][mutation_title] = has_mut

  return cl_info
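# For reference, minimal sketches of the input shapes this expects (all names
# hypothetical):
# cl_info = {'H1975_plex_1': {}, 'A549': {}}
# cell_line_muts.json -> {'muts': {'TP53': ['H1975', 'A549']}}
# cl_info = add_mutations(cl_info)  # adds cl_info['A549']['mut-TP53'] = 'true'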
def df_filter_col(df, threshold, take_abs=True):
  ''' filter columns in matrix at some threshold
  and remove rows that have all zero values '''

  from copy import deepcopy
  from clustergrammer import Network
  net = Network()

  if take_abs is True:
    df_copy = deepcopy(df['mat'].abs())
  else:
    df_copy = deepcopy(df['mat'])

  df_copy = df_copy.transpose()
  df_copy = df_copy[df_copy.sum(axis=1) > threshold]
  df_copy = df_copy.transpose()
  df_copy = df_copy[df_copy.sum(axis=1) > 0]

  if take_abs is True:
    inst_rows = df_copy.index.tolist()
    inst_cols = df_copy.columns.tolist()
    df['mat'] = net.grab_df_subset(df['mat'], inst_rows, inst_cols)

  else:
    df['mat'] = df_copy

  return df
def calc_treatment_ratios():

  from clustergrammer import Network

  net = Network()

  net.load_tsv_to_net('treated_cell_12_1_2015/treated_cl_phospho.tsv')
def make_enr_vect_clust():
  import enrichr_functions as enr_fun 
  from clustergrammer import Network

  net = Network()

  g2e_post = net.load_json_to_dict('json/g2e_enr_vect.json')

  net = enr_fun.make_enr_vect_clust(g2e_post, 0.001, 1)

  net.write_json_to_file('viz','json/enr_vect_example.json')
def main():
  from clustergrammer import Network

  net = Network()

  net.load_file('txt/rc_two_cats.txt')

  tmp_size = 50

  inst_dm = make_distance_matrix(net, tmp_size)

  randomly_sample_rows(net, inst_dm, tmp_size)
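# make_distance_matrix and randomly_sample_rows are not defined in this
# snippet; a minimal sketch of the first, assuming it computes pairwise cosine
# distances over the first tmp_size rows (name and behavior are assumptions):
from scipy.spatial.distance import pdist, squareform

def make_distance_matrix(net, tmp_size):
  # square cosine-distance matrix over the first tmp_size rows
  df = net.dat_to_df()['mat']
  sub = df.iloc[:tmp_size]
  return squareform(pdist(sub.values, metric='cosine'))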
def main():
  from clustergrammer import Network

  net = Network()

  gene_list = ['EGFR', 'TP53', 'SMARCA4', 'CLASP1']
  list_id = net.enrichr('post', gene_list)

  print(list_id)

  enr, response_list = net.enrichr('get', lib='ChEA_2015', list_id=list_id,
    max_terms=10)

  print(response_list)
def make_viz_json(inst_df, name):
  from clustergrammer import Network
  net = Network()

  filename = 'json/'+name
  load_df = {}
  load_df['mat'] = inst_df
  net.df_to_dat(load_df)
  net.swap_nan_for_zero()
  net.make_clust(views=[])
  net.write_json_to_file('viz', filename, 'no-indent')
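# A usage sketch with made-up data (the DataFrame and filename are
# hypothetical):
import numpy as np
import pandas as pd

toy_df = pd.DataFrame(np.random.randn(4, 3),
                      index=['r1', 'r2', 'r3', 'r4'],
                      columns=['c1', 'c2', 'c3'])
make_viz_json(toy_df, 'toy_example.json')  # writes json/toy_example.json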
def cluster():
  from clustergrammer import Network

  net = Network()

  vect_post = net.load_json_to_dict('fake_vect_post.json')  

  net.load_vect_post_to_net(vect_post)

  net.swap_nan_for_zero()
  
  # net.N_top_views()
  net.make_clust(dist_type='cos', views=['N_row_sum','N_row_var'], dendro=True)

  net.write_json_to_file('viz','json/large_vect_post_example.json','indent')  
def make_plex_matrix():
  '''
  Make a cell line matrix with plex rows and cell line columns.
  This will be used as a negative control that should show worsening correlation
  as data is normalized/filtered.
  '''
  import numpy as np
  import pandas as pd
  from clustergrammer import Network

  # load cl_info
  net = Network()
  cl_info = net.load_json_to_dict('../cell_line_info/cell_line_info_dict.json')

  # load cell line expression
  net.load_file('../CCLE_gene_expression/CCLE_NSCLC_all_genes.txt')
  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  cols = df.columns.tolist()

  # plex numbers 1-9 label the rows
  rows = list(range(1, 10))
  print(rows)

  mat = np.zeros((len(rows), len(cols)))

  for inst_col in cols:

    for inst_cl in cl_info:

      if inst_col in inst_cl:
        inst_plex = int(cl_info[inst_cl]['Plex'])

        if inst_plex != -1:
          # print(inst_col + ' in ' + inst_cl + ': ' + str(inst_plex))

          row_index = rows.index(inst_plex)
          col_index = cols.index(inst_col)

          mat[row_index, col_index] = 1


  df_plex = pd.DataFrame(data=mat, columns=cols, index=rows)

  filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
            'exp-plex.txt'
  df_plex.to_csv(filename, sep='\t')
def main():
  import numpy as np
  import pandas as pd
  from clustergrammer import Network

  rtk_list = load_rtks()

  net = Network()
  net.load_file('txt/tmp_cst_drug_treat_cl.txt')
  df_dict = net.dat_to_df()

  inst_df = df_dict['mat']

  # .ix was removed from pandas; .loc does label-based selection
  inst_df = inst_df.loc[rtk_list]

  inst_df.to_csv('txt/RTK_exp_in_drug_treat_cl.txt', sep='\t')
def post_to_clustergrammer():

  from clustergrammer import Network
  import requests 
  import json

  upload_url = 'http://localhost:9000/clustergrammer/vector_upload/'
  # upload_url = 'http://amp.pharm.mssm.edu/clustergrammer/vector_upload/'

  net = Network()
  vect_post = net.load_json_to_dict('test_vector_upload.json')
  # vect_post = net.load_json_to_dict('fake_vect_post.json')

  r = requests.post(upload_url, data=json.dumps(vect_post) )

  link = r.text

  print(link)
def main( buff, inst_filename, mongo_address, viz_id):
  from bson.objectid import ObjectId
  from pymongo import MongoClient
  from clustergrammer import Network

  client = MongoClient(mongo_address)
  db = client.clustergrammer

  viz_id = ObjectId(viz_id)
  found_viz = db.networks.find_one({'_id':viz_id})

  try:

    net = Network()
    net.load_tsv_to_net(buff)

    net.swap_nan_for_zero()

    views = ['N_row_sum', 'N_row_var']

    net.make_clust(dist_type='cosine', dendro=True, views=views, \
                   linkage_type='average')

    export_dat = {}
    export_dat['name'] = inst_filename
    export_dat['dat'] = net.export_net_json('dat')
    export_dat['source'] = 'user_upload'

    dat_id = db.network_data.insert_one(export_dat).inserted_id

    update_viz = net.viz 
    update_dat = dat_id

  except Exception:
    print('\n-----------------------')
    print('error in clustering')
    print('-----------------------\n')
    update_viz = 'error'
    update_dat = 'error'

  found_viz['viz'] = update_viz
  found_viz['dat'] = update_dat

  # $set cannot modify the immutable _id field, so drop it before updating
  found_viz.pop('_id', None)
  db.networks.update_one({'_id': viz_id}, {'$set': found_viz})

  client.close()


  
def mock_g2e_json(gl):
  '''
  A json of signatures from g2e, for enrichment vectoring, should look like this:

  {
    "signature_ids":[
      {"col_title":"title 1", "enr_id_up":###, "enr_id_dn":###},
      {"col_title":"title 2", "enr_id_up":###, "enr_id_dn":###}
    ],
    "background_type":"ChEA_2015"
  }
  '''
  import enrichr_functions as enr_fun
  from clustergrammer import Network

  net = Network()

  g2e_post = {}
  sig_ids = []

  # I have to get user_list_ids from Enrichr 
  tmp = 1
  for inst_gl in gl:

    inst_sig = {}
    inst_sig['col_title'] = 'Sig-'+str(tmp)
    tmp = tmp+1

    # submit to enrichr and get user_list_ids
    for inst_updn in inst_gl:
      inst_list = inst_gl[inst_updn]
      inst_id = enr_fun.enrichr_post_request(inst_list)
      inst_sig['enr_id_'+inst_updn] = inst_id

    sig_ids.append(inst_sig)

  g2e_post['signature_ids'] = sig_ids

  g2e_post['background_type'] = 'ChEA_2015'

  net.save_dict_to_json(g2e_post,'json/g2e_enr_vect.json','indent')
def main():
  '''
  This will add cell line category information (including plexes and
  gene-expression groups) to the gene expression data from CCLE
  '''
  from clustergrammer import Network
  net = Network()

  # load original CCLE gene expression data for CST lung cancer cell lines
  filename = 'CCLE_gene_expression/CCLE_NSCLC_all_genes.txt'
  f = open(filename, 'r')
  lines = f.readlines()
  f.close()

  # load cell line info
  cl_info = net.load_json_to_dict('cell_line_info/cell_line_muts.json')

  # write to new file
  new_file = 'CCLE_gene_expression/CCLE_NSCLC_cats_all_genes.txt'
  fw = open(new_file, 'w')

  fw.close()
def clustergrammer_load():
  # import network class from Network.py
  from clustergrammer import Network

  net = Network()

  net.pandas_load_file('mat_cats.tsv')  

  net.make_clust(dist_type='cos', views=['N_row_sum','N_row_var'])

  net.write_json_to_file('viz','json/mult_cats.json','indent')  

  print('\n**********************')
  print(net.dat['node_info']['row'].keys())

  print('\n\n')
def main():

  import time
  start_time = time.time()
  import pandas as pd
  from io import StringIO

  # import network class from Network.py
  from clustergrammer import Network

  net = Network()

  # load data to dataframe 
  # net.load_tsv_to_net('txt/example_tsv_network.txt')
  # net.load_tsv_to_net('txt/mat_1mb.txt')

  # choose file
  ################
  # file_buffer = open('txt/col_categories.txt')
  with open('txt/example_tsv_network.txt') as file_buffer:
    buff = StringIO(file_buffer.read())
  net.pandas_load_tsv_to_net(buff)

  # filter rows 
  views = ['filter_row_sum','N_row_sum']

  # distance metric 
  dist_type = 'cosine'

  # linkage type 
  linkage_type = 'average'


  net.make_clust(dist_type=dist_type, views=views, calc_col_cats=True,
                 linkage_type=linkage_type)

  net.write_json_to_file('viz', 'json/mult_view.json', 'no-indent')

  elapsed_time = time.time() - start_time
  print('\n\n\nelapsed time: '+str(elapsed_time))
def proc_locally():
  from clustergrammer import Network
  # import run_g2e_background

  net = Network()

  vect_post = net.load_json_to_dict('large_vect_post.json')

  print(vect_post.keys())

  # mongo_address = '10.125.161.139'


  net.load_vect_post_to_net(vect_post)

  net.swap_nan_for_zero()

  net.N_top_views()  

  print(net.viz.keys())
def reproduce_Mark_correlation_matrix():
  import pandas as pd
  from scipy.spatial.distance import squareform
  from clustergrammer import Network

  dist_vect = calc_custom_dist(data_type='ptm_none', dist_metric='correlation',
                               pairwise=True)


  dist_mat = squareform(dist_vect)

  # make similarity matrix
  dist_mat = 1 - dist_mat

  data_type = 'ptm_none'

  filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
             data_type + '.txt'

  # load file and export dataframe
  net = Network()
  net.load_file(filename)
  net.swap_nan_for_zero()
  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  cols = df.columns.tolist()
  rows = cols

  mark_df = pd.DataFrame(data=dist_mat, columns=cols, index=rows)

  save_filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' \
             + 'Mark_corr_sim_mat' + '.txt'
  mark_df.to_csv(save_filename, sep='\t')
def main(mongo_address, viz_id, vect_post):
  from bson.objectid import ObjectId
  from pymongo import MongoClient
  from clustergrammer import Network

  # set up database connection 
  client = MongoClient(mongo_address)
  db = client.clustergrammer 
  viz_id = ObjectId(viz_id)
  # get placeholder viz data 
  found_viz = db.networks.find_one({'_id': viz_id })

  # initialize export_dat 
  export_dat = {}
  export_viz = {}

  # try to make clustergram using vect_post
  try:

    # ini network obj 
    net = Network()
    
    # vector endpoint 
    net.load_vect_post_to_net(vect_post)

    # swap nans for zeros
    net.swap_nan_for_zero()

    # deprecated clustering modules 
    ####################################
    # cluster g2e using pandas
    # net.fast_mult_views()

    # # calculate top views rather than percentage views
    # net.N_top_views()
    ####################################

    net.make_filtered_views(dist_type='cosine', dendro=True, \
      views=['N_row_sum'], linkage_type='average')

    # export dat 
    try:

      # convert data to list 
      net.dat['mat'] = net.dat['mat'].tolist()
      net.dat['mat_up'] = net.dat['mat_up'].tolist()
      net.dat['mat_dn'] = net.dat['mat_dn'].tolist()

      export_dat['dat'] = net.export_net_json('dat')
      export_dat['source'] = 'g2e_enr_vect'
      dat_id = db.network_data.insert_one(export_dat).inserted_id
      print('G2E: network data successfully uploaded')
    
    except Exception:
      export_dat['dat'] = 'data-too-large'
      export_dat['source'] = 'g2e_enr_vect'
      dat_id = db.network_data.insert_one(export_dat).inserted_id
      print('G2E: network data too large to be uploaded')

    update_viz = net.viz 
    update_dat = dat_id

  # if there is an error update json with error 
  except Exception:

    print('\n--------------------------------')
    print('G2E clustering error')
    print('----------------------------------\n')
    update_viz = 'error'
    update_dat = 'error'


  # export viz to database

  found_viz['viz'] = update_viz
  found_viz['dat'] = update_dat

  # $set cannot modify the immutable _id field, so drop it before updating
  found_viz.pop('_id', None)

  # update the viz data
  try:
    db.networks.update_one( {"_id":viz_id}, {"$set": found_viz} )
    print('\n\n---------------------------------------------------')
    print( 'G2E Successfully made and uploaded clustergram')
    print('---------------------------------------------------\n\n')
  except Exception:
    print('\n--------------------------------')
    print('G2E error in loading viz into database')
    print('----------------------------------\n')

  # close database connection 
  client.close() 
import time
start_time = time.time()

from clustergrammer import Network
net = Network()

# choose tsv file
####################
inst_name = 'Tyrosine'
# net.load_file('txt/phos_ratios_all_treat_no_geld_ST.txt')
net.load_file('txt/phos_ratios_all_treat_no_geld_Tyrosine.txt')


net.swap_nan_for_zero()

# net.normalize(axis='row', norm_type='zscore', keep_orig=True)

print(net.dat.keys())

views = ['N_row_sum', 'N_row_var']

net.make_clust(dist_type='cos', views=views, dendro=True,
               sim_mat=True, filter_sim=0.1, calc_cat_pval=False)
               # run_enrichr=['KEA_2015'])
               # run_enrichr=['ENCODE_TF_ChIP-seq_2014'])
               # run_enrichr=['GO_Biological_Process_2015'])

net.write_json_to_file('viz', 'json/'+inst_name+'.json', 'no-indent')
net.write_json_to_file('sim_row', 'json/'+inst_name+'_sim_row.json', 'no-indent')
net.write_json_to_file('sim_col', 'json/'+inst_name+'_sim_col.json', 'no-indent')
def clust_from_response(response_list):
  from clustergrammer import Network
  import numpy as np
  import json
  import pandas as pd
  import math
  from copy import deepcopy

  print('----------------------')
  print('enrichr_clust_from_response')
  print('----------------------')

  ini_enr = transfer_to_enr_dict( response_list )

  enr = []
  scores = {}
  score_types = ['combined_score','pval','zscore']

  for score_type in score_types:
    scores[score_type] = pd.Series(dtype=float)

  for inst_enr in ini_enr:
    if inst_enr['combined_score'] > 0:

      # make series of enriched terms with scores
      for score_type in score_types:

        # collect the scores of the enriched terms
        if score_type == 'combined_score':
          scores[score_type][inst_enr['name']] = inst_enr[score_type]
        if score_type == 'pval':
          scores[score_type][inst_enr['name']] = -math.log(inst_enr[score_type])
        if score_type == 'zscore':
          scores[score_type][inst_enr['name']] = -inst_enr[score_type]

      # keep enrichment values
      enr.append(inst_enr)

  # sort and normalize the scores
  for score_type in score_types:
    scores[score_type] = scores[score_type]/scores[score_type].max()
    scores[score_type] = scores[score_type].sort_values(ascending=False)

  number_of_enriched_terms = len(scores['combined_score'])

  enr_score_types = ['combined_score','pval','zscore']

  if number_of_enriched_terms < 10:
    num_dict = {'ten':10}
  elif number_of_enriched_terms < 20:
    num_dict = {'ten':10, 'twenty':20}
  else:
    num_dict = {'ten':10, 'twenty':20, 'thirty':30}

  # gather lists of top scores
  top_terms = {}
  for enr_type in enr_score_types:
    top_terms[enr_type] = {}
    for num_terms in num_dict.keys():
      inst_num = num_dict[num_terms]
      top_terms[enr_type][num_terms] = scores[enr_type].index.tolist()[: inst_num]

  # gather the terms that should be kept - they are at the top of the score list
  keep_terms = []
  for inst_enr_score in top_terms:
    for tmp_num in num_dict.keys():
      keep_terms.extend( top_terms[inst_enr_score][tmp_num] )

  keep_terms = list(set(keep_terms))

  # keep enriched terms that are at the top 10 based on at least one score
  keep_enr = []
  for inst_enr in enr:
    if inst_enr['name'] in keep_terms:
      keep_enr.append(inst_enr)


  # fill in full matrix
  #######################

  # genes
  row_node_names = []
  # enriched terms
  col_node_names = []

  # gather information from the list of enriched terms
  for inst_enr in keep_enr:
    col_node_names.append(inst_enr['name'])
    row_node_names.extend(inst_enr['int_genes'])

  row_node_names = sorted(list(set(row_node_names)))

  net = Network()
  net.dat['nodes']['row'] = row_node_names
  net.dat['nodes']['col'] = col_node_names
  net.dat['mat'] = np.zeros([len(row_node_names), len(col_node_names)])

  for inst_enr in keep_enr:

    inst_term = inst_enr['name']
    col_index = col_node_names.index(inst_term)

    # use combined score for full matrix - will not be seen in viz
    tmp_score = scores['combined_score'][inst_term]
    net.dat['node_info']['col']['value'].append(tmp_score)

    for inst_gene in inst_enr['int_genes']:
      row_index = row_node_names.index(inst_gene)

      # save association
      net.dat['mat'][row_index, col_index] = 1

  # cluster full matrix
  #############################
  # do not make multiple views
  views = ['']

  if len(net.dat['nodes']['row']) > 1:
    net.make_clust(dist_type='jaccard', views=views, dendro=False)
  else:
    net.make_clust(dist_type='jaccard', views=views, dendro=False, run_clustering=False)

  # get dataframe from full matrix
  df = net.dat_to_df()

  for score_type in score_types:

    for num_terms in num_dict:

      inst_df = deepcopy(df)
      inst_net = Network()

      inst_df['mat'] = inst_df['mat'][top_terms[score_type][num_terms]]

      # load back into net
      inst_net.df_to_dat(inst_df)

      # make views
      if len(net.dat['nodes']['row']) > 1:
        inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'], dendro=False)
      else:
        inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'], dendro=False,
                            run_clustering=False)

      inst_views = inst_net.viz['views']

      # add score_type to views
      for inst_view in inst_views:

        inst_view['N_col_sum'] = num_dict[num_terms]

        inst_view['enr_score_type'] = score_type

        # add values to col_nodes and order according to rank
        for inst_col in inst_view['nodes']['col_nodes']:

          inst_col['rank'] = len(top_terms[score_type][num_terms]) - \
              top_terms[score_type][num_terms].index(inst_col['name'])

          inst_name = inst_col['name']
          inst_col['value'] = scores[score_type][inst_name]

      # add views to main network
      net.viz['views'].extend(inst_views)

  return net
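# transfer_to_enr_dict is not defined in this snippet; a minimal sketch,
# assuming Enrichr's documented list-of-lists response format
# (rank, term name, p-value, z-score, combined score, overlapping genes, ...):
def transfer_to_enr_dict(response_list):
  # map each Enrichr response row onto the dict fields used above
  enr = []
  for row in response_list:
    enr.append({
      'name': row[1],
      'pval': row[2],
      'zscore': row[3],
      'combined_score': row[4],
      'int_genes': row[5],
    })
  return enr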
'''
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network
net = Network()

# load matrix tsv file
net.load_stdin()

# optional filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()

net.make_clust(dist_type='cos',
               views=['N_row_sum', 'N_row_var'],
               dendro=True,
               sim_mat=True,
               filter_sim=0.1,
               calc_cat_pval=False)

# output jsons for front-end visualizations
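# The export calls themselves are missing from this snippet; a minimal sketch,
# reusing the filename pattern from the other examples in this collection:
net.write_json_to_file('viz', 'json/mult_view.json', 'no-indent')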
def make_phos_homepage_viz():

    from clustergrammer import Network
    net = Network()

    filename = 'lung_cellline_3_1_16/lung_cellline_phospho/' + \
      'lung_cellline_TMT_phospho_combined_ratios.tsv'

    net.load_file(filename)

    # quantile normalize to normalize cell lines
    net.normalize(axis='col', norm_type='qn')

    # only keep most differentially regulated PTMs
    net.filter_N_top('row', 250, 'sum')

    # take zscore of rows
    net.normalize(axis='row', norm_type='zscore', keep_orig=True)

    net.swap_nan_for_zero()

    # threshold filter PTMs
    net.filter_threshold('row', threshold=1.75, num_occur=3)

    views = ['N_row_sum', 'N_row_var']
    net.make_clust(dist_type='cos',
                   views=views,
                   dendro=True,
                   sim_mat=True,
                   calc_cat_pval=True)

    net.write_json_to_file('viz', 'json/homepage_phos.json', 'indent')
def process_GCT_and_export_tsv():
  from clustergrammer import Network

  filename = 'gcts/LDS-1003.gct'
  print('exporting processed GCT as tsv file')

  df = load_file(filename)

  net = Network()

  net.df_to_dat(df)
  net.swap_nan_for_zero()

  # zscore first to make the column distributions similar
  net.normalize(axis='col', norm_type='zscore', keep_orig=True)

  # filter the rows to keep the perts with the largest normalized values
  net.filter_N_top('row', 200)

  net.write_matrix_to_tsv('txt/example_gct_export.txt')
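# load_file here is a local helper (distinct from net.load_file) and is not
# shown; a minimal sketch, assuming the standard GCT layout (two header lines,
# then a matrix with Name/Description columns) and the {'mat': df} dict shape
# that net.df_to_dat() takes:
import pandas as pd

def load_file(filename):
  # skip the '#1.2' version line and the dimensions line; index by Name
  df = pd.read_csv(filename, sep='\t', skiprows=2, index_col=0)
  # drop the Description annotation column if present
  df = df.drop(columns=['Description'], errors='ignore')
  return {'mat': df}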
#from sys import argv
from clustergrammer import Network

net = Network()
net.load_file('mat.txt')
#argv[1]
# calculate clustering using default parameters
net.cluster()

# save visualization JSON to file for use by front end
net.write_json_to_file('viz', 'kbio_mhcii_view.json')

net2 = Network()
net2.load_file('mat2.txt')
#argv[1]
# calculate clustering using default parameters
net2.cluster()

# save visualization JSON to file for use by front end
net2.write_json_to_file('viz', 'kbio_mhcii_view_summary.json')
def make_json():
  import random
  from clustergrammer import Network
  net = Network()

  row_num = 200
  num_columns = 20

  # make up all names for all data 
  row_names = make_up_names(row_num)

  # initialize vect_post 
  vect_post = {}

  vect_post['title'] = 'Some-Clustergram'
  vect_post['link'] = 'some-link'
  vect_post['filter'] = 'N_row_sum'
  vect_post['is_up_down'] = False
  vect_post['columns'] = []


  split = True

  # fraction of rows in each column - 1 means all columns have all rows 
  inst_prob = 1


  # make column data 
  for col_num in range(num_columns):

    inst_col = {}

    col_name = 'Col-' + str( col_num+1 ) + ' make name longer'

    inst_col['col_name'] = col_name
    inst_col['link'] = 'col-link'

    if col_num < 5:
      inst_col['cat'] = 'brain'
    else:
      inst_col['cat'] = 'lung'

    # save to columns 
    inst_col['data'] = [] #vector

    # get random subset of row_names 
    vect_rows = get_subset_rows(row_names, inst_prob)

    # generate vectors 
    for inst_row in vect_rows:

      # generate values
      ##################

      # add positive/negative values 
      if random.random() > 0.5:
        value_up = 10*random.random()
      else: 
        value_up = 0

      if random.random() > 0.5:
        value_dn = -10*random.random()
      else: 
        value_dn = 0

      value = value_up + value_dn

      # # generate vector component 
      # #############################
      # vector.append([ inst_row, value ])
      # vector_up.append([ inst_row, value_up ])
      # vector_dn.append([ inst_row, value_dn ])

      # define row object - within column 
      row_obj = {}
      row_obj['row_name'] = inst_row
      row_obj['val'] = value
      row_obj['val_up'] = value_up
      row_obj['val_dn'] = value_dn

      inst_col['data'].append(row_obj)


    # if split:
    #   inst_col['vector_up'] = vector_up
    #   inst_col['vector_dn'] = vector_dn


    # save columns to vect_post
    vect_post['columns'].append(inst_col)

  net.save_dict_to_json(vect_post, 'fake_vect_post.json', indent='indent')
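# make_up_names and get_subset_rows are not defined in this snippet; minimal
# sketches under assumed behavior (placeholder names, Bernoulli subsampling):
import random

def make_up_names(num_rows):
  # placeholder row names Row-1 .. Row-N
  return ['Row-' + str(i + 1) for i in range(num_rows)]

def get_subset_rows(row_names, prob):
  # keep each row independently with probability prob
  return [name for name in row_names if random.random() <= prob]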
import time
start_time = time.time()

from clustergrammer import Network
net = Network()

# choose tsv file
#######################
net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/tuple_cats.txt')
# net.load_file('txt/tuple_names.txt')
# net.load_file('txt/missing_values.txt')
# net.load_file('txt/example_tsv.txt')
# net.load_file('txt/col_categories.txt')
# net.load_file('txt/mat_cats.tsv')
# net.load_file('txt/mat_1mb.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/sim_mat_4_cats.txt')
# net.load_file('txt/number_names.txt')

# link = net.Iframe_web_app('txt/rc_two_cats.txt', width=1000, height=800)
# link = net.Iframe_web_app( width=1000, height=800)

# print(link)

# possible filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.filter_sum('col', threshold=30)

# net.normalize(axis='row', norm_type='qn')
'''
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network
net = Network()

# load matrix tsv file
net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/rc_val_cats.txt')

# optional filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()
# net.downsample(ds_type='kmeans', axis='col', num_samples=10)
# net.random_sample(random_state=100, num_samples=10, axis='col')
# net.clip(-6,6)
# net.filter_cat('row', 1, 'Gene Type: Interesting')
# net.set_cat_color('col', 1, 'Category: one', 'blue')

net.cluster(dist_type='cos',
            views=['N_row_sum', 'N_row_var'])
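# The snippet is cut off here; a typical next step in the parallel examples
# writes the visualization JSON (filename assumed):
net.write_json_to_file('viz', 'json/rc_two_cats.json')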
import time
# import StringIO

start_time = time.time()

# import network class from Network.py

from clustergrammer import Network
net = Network()

net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/example_tsv.txt')
# net.load_file('txt/col_categories.txt')
# net.load_file('txt/mat_cats.tsv')
# net.load_file('txt/mat_1mb.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/sim_mat_4_cats.txt')

views = ['N_row_sum','N_row_var']

# # filtering rows and cols by sum 
# net.filter_sum('row', threshold=20)
# net.filter_sum('col', threshold=30)
  
# # keep top rows based on sum 
# net.filter_N_top('row', 10, 'sum')

net.make_clust(dist_type='cos', views=views, dendro=True,
               sim_mat=True, filter_sim=0.1)

# net.produce_view({'N_row_sum':10,'dist':'euclidean'})
def prepare_heatmap(matrix_input, html_file, html_dir, tools_dir, categories, distance, linkage):
    import os
    from jinja2 import Environment, FileSystemLoader
    from clustergrammer import Network

    # prepare directory and html
    os.mkdir(html_dir)

    env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
    template = env.get_template("clustergrammer.template")
    overview = template.render()
    with open(html_file, "w") as outf:
        outf.write(overview)

    json_output = html_dir + "/mult_view.json"

    net = Network()
    net.load_file(matrix_input)
    if categories['row']:
        net.add_cats('row', categories['row'])
    if categories['col']:
        net.add_cats('col', categories['col'])
    net.cluster(dist_type=distance, linkage_type=linkage)
    net.write_json_to_file('viz', json_output)
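# A usage sketch with hypothetical paths and settings:
prepare_heatmap(matrix_input='mat.tsv',
                html_file='out/index.html',
                html_dir='out',
                tools_dir='.',
                categories={'row': [], 'col': []},
                distance='cosine',
                linkage='average')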
def make_viz_from_df(df, filename):
  from clustergrammer import Network

  net = Network()

  net.df_to_dat(df)
  net.swap_nan_for_zero()

  # zscore first to make the column distributions similar
  net.normalize(axis='col', norm_type='zscore', keep_orig=True)

  # filter the rows to keep the perts with the largest normalized values
  net.filter_N_top('row', 2000)

  num_columns = net.dat['mat'].shape[1]

  if num_columns < 50:
    # views = ['N_row_sum', 'N_row_var']
    views = ['N_row_sum']
    net.make_clust(dist_type='cos', views=views)

    filename = 'json/' + filename.split('/')[1].replace('.gct','') + '.json'

    net.write_json_to_file('viz', filename)
'''
Python 2.7
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network
net = Network()

# load matrix tsv file
net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/ccle_example.txt')
# net.load_file('txt/rc_val_cats.txt')
# net.load_file('txt/number_labels.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/tuple_cats.txt')
# net.load_file('txt/example_tsv.txt')

# net.enrichrgram('KEA_2015')

# optional filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()
# net.set_cat_color('col', 1, 'Category: one', 'blue')
# make network object and load file
from clustergrammer import Network
net = Network()
net.load_file('mult_view.tsv')

# Z-score normalize the rows
#net.normalize(axis='row', norm_type='zscore', keep_orig=True)

# calculate clustering using default parameters
net.cluster()

# save visualization JSON to file for use by front end
net.write_json_to_file('viz', 'mult_view.json')

# needs pandas and sklearn as well
# pip install --user --upgrade clustergrammer pandas sklearn
'''
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''
import os
from clustergrammer import Network

for filename in os.listdir("tsv"):
    name = filename.split(".")[0]
    net = Network()
    # load matrix tsv file
    print(name)
    net.load_file('tsv/' + name + '.tsv')

    # optional filtering and normalization
    ##########################################
    net.swap_nan_for_zero()

    net.make_clust(dist_type='cos',
                   views=['N_row_sum', 'N_row_var'],
                   dendro=True,
                   sim_mat=True,
                   filter_sim=0.1,
                   calc_cat_pval=False)

    # write jsons for front-end visualizations
    net.write_json_to_file('viz', 'output/' + name + '.json', 'indent')