def load_hgram_matrix():
    '''
    Load the latest harmonogram matrix as a pandas DataFrame.

    Reads 'hgram_data_latest/hgram_latest.json'. The returned DataFrame is
    indexed by the row node names, and every column label is a
    ('Resource: <name>', 'Resource Type: <category>') tuple built from the
    column names and the column info list stored in the JSON.
    '''
    hgram = json_scripts.load_to_dict('hgram_data_latest/hgram_latest.json')

    # the array is also stored back on the loaded dict, as before
    matrix = np.asarray(hgram['mat'])
    hgram['mat'] = matrix

    gene_names = hgram['nodes']['row']
    resource_names = hgram['nodes']['col']
    resource_cats = hgram['node_info']['col']['info']

    # attach the resource category to each column name; the category is
    # looked up by position so a too-short category list still raises
    labelled_cols = [
        ('Resource: ' + res_name, 'Resource Type: ' + resource_cats[pos])
        for pos, res_name in enumerate(resource_names)
    ]

    return pd.DataFrame(data=matrix, columns=labelled_cols, index=gene_names)
def json_2_gmt(filename):
    '''
    Convert a JSON file of gene lists into a GMT file written next to it.

    The JSON is expected to map each term name to a list of gene strings.
    One line is written per term, in sorted term order:

        <term>\tna\t<gene>\t<gene>\t...\t\n

    (every gene is followed by a tab, so each line ends with a trailing tab,
    which matches the format this function has always produced).

    filename: path of the JSON file; the output path is the same path with
        its final extension replaced by '.gmt'.
    '''
    import os

    import json_scripts

    gene_lists = json_scripts.load_to_dict(filename)

    # Replace only the final extension. Splitting on the first '.' breaks any
    # path that contains a dot earlier on (e.g. '../data/sigs.json' would
    # become '.gmt').
    gmt_path = os.path.splitext(filename)[0] + '.gmt'

    # the context manager closes the file even if a write fails
    with open(gmt_path, 'w') as fw:
        for term in sorted(gene_lists):
            genes = gene_lists[term]
            fw.write(term + '\tna\t' + ''.join(gene + '\t' for gene in genes) + '\n')
def main():
    '''
    Write each perturbation signature as an up/down gene list.

    For every JSON file matched by 'Pert_sigs/*.json', the 'upGenes' and
    'dnGenes' lists are written to 'files_2-17-2017/<signature name>.txt',
    one '<gene>,1' line per up gene followed by one '<gene>,-1' line per
    down gene. The signature name is the JSON file name without its
    extension.
    '''
    import os

    for sig_path in glob.glob('Pert_sigs/*.json'):

        inst_pert = json_scripts.load_to_dict(sig_path)

        # take the name from the last path component so this does not depend
        # on the path separator or on how deep the glob pattern is
        pert_name = os.path.splitext(os.path.basename(sig_path))[0]

        up_genes = inst_pert['upGenes']
        dn_genes = inst_pert['dnGenes']

        bin_sig = [gene + ',1' for gene in up_genes]
        bin_sig.extend(gene + ',-1' for gene in dn_genes)

        # the context manager closes the file even if a write fails
        with open('files_2-17-2017/' + pert_name + '.txt', 'w') as fw:
            fw.writelines(inst_val + '\n' for inst_val in bin_sig)
def main():
    '''
    Merge the harmonogram gene lists with the grant 'dark' gene lists.

    For each of the KIN, IC and GPCR gene types, the genes from
    '../harmonogram_classes/gene_classes_harmonogram.json' are combined with
    the genes listed for that type in
    '../grant_pois/proteins_of_interest.json'. The sorted, de-duplicated
    'all' and 'dark' lists are printed as counts and saved to
    '../grant_pois/gene_info_with_dark.json' for later use (e.g. building
    similarity matrices).
    '''
    import json_scripts

    hgram_info = json_scripts.load_to_dict(
        '../harmonogram_classes/gene_classes_harmonogram.json')

    grant_poi = json_scripts.load_to_dict(
        '../grant_pois/proteins_of_interest.json')

    gene_info = {}

    for gene_type in ('KIN', 'IC', 'GPCR'):

        listed_dark = grant_poi[gene_type]

        # the dark genes are added to the harmonogram genes before de-duplicating
        unique_all = sorted(set(hgram_info[gene_type] + listed_dark))
        unique_dark = sorted(set(listed_dark))

        print(gene_type)
        print('all: ' + str(len(unique_all)))
        print('dark: ' + str(len(unique_dark)))

        # sanity check: number of dark genes missing from the merged list
        print(len(set(unique_dark) - set(unique_all)))

        gene_info[gene_type] = {'all': unique_all, 'dark': unique_dark}

        print('\n\n')

    json_scripts.save_to_json(
        gene_info, '../grant_pois/gene_info_with_dark.json', indent='indent')
def load_gene_classes():
    '''
    Load and return the gene-class dictionary from
    'gene_classes_harmonogram.json'.

    The dictionary is returned exactly as loaded (all classes, unfiltered).

    NOTE(review): this function used to collect the genes of the TF, GPCR,
    IC and KIN classes into a local list that was never returned or used.
    That dead code has been removed; the return value is unchanged. If a
    filtered gene list was the intended result, it needs its own function
    so that existing callers keep receiving the full dictionary.
    '''
    return json_scripts.load_to_dict('gene_classes_harmonogram.json')
def extract_nodes():
    '''
    Collect the unique node names found in 'LDR/LDR_api.json'.

    Returns a dict with four sorted, de-duplicated lists:
        'as' - assay names (each dataset's 'datasetName', stripped)
        'cl' - cell line names (stripped; entries without a 'name' are skipped)
        'pt' - perturbagen names (stripped)
        'ct' - center names (each dataset's group name)
    The size of every list is printed.
    '''
    import json_scripts

    print('extracting nodes: as, cl, pt')

    # the LDR data is a list of dataset records
    ldr = json_scripts.load_to_dict('LDR/LDR_api.json')

    collected = {'as': [], 'cl': [], 'pt': [], 'ct': []}

    for dataset in ldr:

        collected['as'].append(dataset['datasetName'].strip())
        collected['ct'].append(dataset['group']['name'])

        metadata = dataset['metadata']

        collected['cl'].extend(
            cell_line['name'].strip()
            for cell_line in metadata['cellLines']
            if 'name' in cell_line
        )

        collected['pt'].extend(
            pert['name'].strip() for pert in metadata['perturbagens']
        )

    # reduce every list to its sorted unique values
    for node_type in collected:
        collected[node_type] = sorted(set(collected[node_type]))
        print('there are ' + str(len(collected[node_type])) + ' ' + node_type)

    return collected
def construct_array():
    '''
    Build the assay-by-cell-line perturbation count matrices and save them.

    Inputs:
        'LDR/LDR_api.json' - list of LDR dataset records
        'as_cl_dict.json'  - maps raw assay ('as') and cell line ('cl')
                             names to the short names used as node labels

    For every dataset, every (assay, cell line) pair and every perturbagen,
    the pair's cell in the matrices is incremented. A perturbagen whose name
    contains 'compounds' but not 'among' is read as '<N> compounds ...' and
    counts N times (or once if N is 0); every other perturbagen counts once.
    KINOMEscan datasets additionally get the 'cell-free' cell line, and cell
    lines whose name contains 'TBD among' are ignored.

    Output, written to 'ldr_mat.json':
        'nodes' - sorted assay ('as') and cell line ('cl') labels
        'mat'   - total counts
        'rl'    - counts split by released status ('t' / 'f')
        'perts' - per str((assay, cell line)) key, the list of perturbagen
                  records (name, release status, dataset _id)
    '''
    import json_scripts
    # numpy.zeros replaces scipy.zeros, which was only a deprecated alias of
    # the NumPy function
    import numpy as np

    print('\nconstructing array\n')

    ldr = json_scripts.load_to_dict('LDR/LDR_api.json')
    as_cl_dict = json_scripts.load_to_dict('as_cl_dict.json')

    # node labels are the 'short name' values of the dictionary
    nodes = {}
    nodes['as'] = sorted(set(as_cl_dict['as'].values()))
    # 'cell-free' is always added to the cell lines (used by KINOMEscan)
    nodes['cl'] = sorted(list(set(as_cl_dict['cl'].values())) + ['cell-free'])

    # label -> first position lookup tables, instead of scanning the label
    # lists with list.index for every perturbagen
    as_position = {}
    for pos, label in enumerate(nodes['as']):
        as_position.setdefault(label, pos)
    cl_position = {}
    for pos, label in enumerate(nodes['cl']):
        cl_position.setdefault(label, pos)

    shape = [len(nodes['as']), len(nodes['cl'])]
    mat = np.zeros(shape)
    rl = {'t': np.zeros(shape), 'f': np.zeros(shape)}

    # perturbation records per assay / cell line combination
    perts = {}

    for inst_ldr in ldr:

        # assay: put the dataset name through the dictionary
        inst_as = as_cl_dict['as'][inst_ldr['datasetName'].strip()]

        # cell line(s), skipping the 'TBD among cell ...' placeholders
        inst_cls = []
        for inst_cl in inst_ldr['metadata']['cellLines']:
            if 'name' in inst_cl:
                cl_name = inst_cl['name'].strip()
                if 'TBD among' not in cl_name:
                    inst_cls.append(as_cl_dict['cl'][cl_name])

        inst_pts = [
            inst_pt['name'].strip()
            for inst_pt in inst_ldr['metadata']['perturbagens']
        ]

        # KINOMEscan is a cell-free assay
        if inst_as == 'KINOMEscan':
            inst_cls.append('cell-free')

        index_as = as_position[inst_as]

        for inst_cl in inst_cls:

            index_cl = cl_position[inst_cl]

            for inst_pt in inst_pts:

                # a name such as '<N> compounds ...' stands for N perturbations
                if 'compounds' in inst_pt and 'among' not in inst_pt:
                    mult_pts = int(inst_pt.split(' ')[0])
                else:
                    mult_pts = 0
                weight = mult_pts if mult_pts != 0 else 1

                released = inst_ldr['released']

                mat[index_as, index_cl] += weight

                # Strict comparison kept on purpose: only values equal to
                # True count as released.
                released_counts = rl['t'] if released == True else rl['f']
                released_counts[index_as, index_cl] += weight

                # keep the perturbation details for this as/cl combination
                perts.setdefault(str((inst_as, inst_cl)), []).append({
                    'name': inst_pt,
                    'release': released,
                    '_id': inst_ldr['_id'],
                })

    print('perts dictionary - the number of found as/cl combinations')
    print(len(perts))

    # convert the arrays to nested lists so they can be written as JSON
    rl['t'] = rl['t'].tolist()
    rl['f'] = rl['f'].tolist()

    ldr_mat = {
        'nodes': nodes,
        'mat': mat.tolist(),
        'rl': rl,
        'perts': perts,
    }

    json_scripts.save_to_json(ldr_mat, 'ldr_mat.json', 'no-indent')
def make_ldr_clust():
    '''
    Cluster the released LDR data and export the visualization JSON.

    Loads 'ldr_mat.json' (matrix, released-status matrices, nodes and
    perturbation records), prints summary statistics, builds a Network whose
    matrix is the released-only counts with the perturbation records attached
    as per-cell 'mat_info', filters and clusters it, and writes the result to
    'static/networks/LDR_as_cl_released_only.json'.
    '''
    import json_scripts
    import numpy as np
    import d3_clustergram
    from d3_clustergram_class import Network
    from ast import literal_eval

    def print_sums():
        # totals of the full matrix and of the released / unreleased parts
        print('sum all \t' + str(np.sum(ldr['mat'])))
        print('sum yes \t' + str(np.sum(ldr['rl']['t'])))
        print('sum no \t' + str(np.sum(ldr['rl']['f'])))

    ldr = json_scripts.load_to_dict('ldr_mat.json')
    print('\nload ldr_mat.json with perts')
    print(ldr.keys())

    # the JSON stores nested lists; turn them back into arrays
    ldr['mat'] = np.asarray(ldr['mat'])
    for status in ('t', 'f'):
        ldr['rl'][status] = np.asarray(ldr['rl'][status])

    print_sums()

    print(len(ldr['nodes']['as']))
    print(len(ldr['nodes']['cl']))
    print(ldr['mat'].shape)
    print('\n')

    print('size all \t' + str(ldr['mat'].shape))
    print('size yes \t' + str(ldr['rl']['t'].shape))
    print('size no \t' + str(ldr['rl']['f'].shape))
    print('\n')

    print_sums()
    released_total = np.sum(ldr['rl']['t']) + np.sum(ldr['rl']['f'])
    print('total yes/no:\t' + str(released_total))

    # rows are assays, columns are cell lines (unfiltered)
    net = Network()
    net.dat['nodes']['row'] = ldr['nodes']['as']
    net.dat['nodes']['col'] = ldr['nodes']['cl']

    # only released data is included in the visualization
    net.dat['mat'] = ldr['rl']['t']

    print('\nperts')

    row_names = net.dat['nodes']['row']
    col_names = net.dat['nodes']['col']

    # every cell starts with an empty info dict, keyed by str((row, col))
    net.dat['mat_info'] = {
        str((row_pos, col_pos)): {}
        for row_pos in range(len(row_names))
        for col_pos in range(len(col_names))
    }

    # attach the perturbation records to their assay / cell line cell
    for pert_key in ldr['perts']:

        pert_data = ldr['perts'][pert_key]

        # keys are stringified (assay, cell line) tuples
        as_cl = literal_eval(pert_key)

        cell_key = str((row_names.index(as_cl[0]), col_names.index(as_cl[1])))

        net.dat['mat_info'][str(cell_key)] = pert_data

    # filter the matrix using cutoff and min_num_meet
    cutoff_meet = 1
    min_num_meet = 1
    net.filter_network_thresh(cutoff_meet, min_num_meet)

    # cluster rows and columns
    cutoff_comp = 3
    min_num_comp = 4
    net.cluster_row_and_col('cos', cutoff_comp, min_num_comp, dendro=False)

    # export the visualization
    net.write_json_to_file(
        'viz', 'static/networks/LDR_as_cl_released_only.json', 'indent')
def merge_sigs_to_mat():
    '''
    Merge the experimental perturbation signatures into one gene-by-signature
    matrix and save it as a tab-separated file.

    Reads 'proc_data/exp-pert_sigs.json' (signature name -> {gene: value}),
    drops every signature whose name contains 'CD34', and writes a matrix
    with one row per unique gene and one column per signature to
    'proc_data/exp-pert_sigs.txt'. Genes missing from a signature are 0.

    Gene names of the form '<N>-SEP...' are rewritten to 'SEPT<N>', and the
    placeholder name '-' is dropped. The same rewriting is applied when the
    values are filled in; previously it was only applied when collecting the
    row names, so the rewritten SEPT rows were never found again and stayed
    all zero.
    '''

    def fix_gene_name(gene):
        # rewrite '<N>-SEP...' to 'SEPT<N>'; other names are unchanged
        if '-SEP' in gene:
            return 'SEPT' + gene.split('-')[0]
        return gene

    tmp_exp_sigs = json_scripts.load_to_dict('proc_data/exp-pert_sigs.json')

    exp_sigs = {
        sig_name: sig
        for sig_name, sig in tmp_exp_sigs.items()
        if 'CD34' not in sig_name
    }

    all_sigs = sorted(exp_sigs)
    num_sigs = len(all_sigs)
    print('num_sigs: ' + str(num_sigs))

    # collect all genes across all experimental signatures
    all_genes = []
    for inst_sig in exp_sigs.values():
        for inst_gene in inst_sig:
            gene_name = fix_gene_name(inst_gene)
            if gene_name != '-':
                all_genes.append(gene_name)

    print(len(all_genes))
    all_genes = sorted(set(all_genes))
    print(len(all_genes))

    num_genes = len(all_genes)
    print('there are ' + str(num_genes) + ' unique genes')

    mat = np.zeros([num_genes, num_sigs])

    # name -> position tables replace the repeated list scans
    row_of = {gene: pos for pos, gene in enumerate(all_genes)}
    col_of = {sig_name: pos for pos, sig_name in enumerate(all_sigs)}

    # fill in the matrix
    for sig_name, inst_sig in exp_sigs.items():
        col_index = col_of[sig_name]

        for inst_gene, inst_value in inst_sig.items():

            # look the gene up under its fixed name; '-' has no row
            row_index = row_of.get(fix_gene_name(inst_gene))
            if row_index is None:
                continue

            # zero / False values are left at the initial 0
            if inst_value != 0:
                mat[row_index, col_index] = inst_value

    # save as dataframe
    df = pd.DataFrame(data=mat, columns=all_sigs, index=all_genes)
    df.to_csv('proc_data/exp-pert_sigs.txt', sep='\t')
def make_ccle_matrix_subset():
    '''
    Save the subset of the downsampled CCLE matrix that covers the proteins
    of interest.

    Loads 'CCLE/CCLE_kmeans_ds_col_100.txt' through a clustergrammer Network
    and 'proteins_of_interest/proteins_of_interest.json' (protein type ->
    list of genes), keeps the rows whose gene is a protein of interest, and
    writes two tab-separated files:

        'CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt' - plain gene names
        'CCLE/CCLE_kmeans_ds_col_100_poi.txt' - rows labelled with
            ('gene: <name>', 'type: <protein type>') tuples

    Rows are written in sorted gene order so the output is identical from
    run to run (the order used to come from set iteration). A gene listed
    under several types is labelled with the last type that lists it.
    '''
    from clustergrammer import Network
    import json_scripts

    print('-- load CCLE downsampled data')

    # load downsampled CCLE data
    net = Network()
    net.load_file('CCLE/CCLE_kmeans_ds_col_100.txt')
    df = net.export_df()

    # load proteins of interest
    filename = 'proteins_of_interest/proteins_of_interest.json'
    poi = json_scripts.load_to_dict(filename)

    # gene -> protein type; later types overwrite earlier ones, which is the
    # same "last match wins" rule the per-gene search applied
    type_of_gene = {}
    for inst_type, type_genes in poi.items():
        for inst_gene in type_genes:
            type_of_gene[inst_gene] = inst_type

    # only keep pois that are found in the CCLE
    found_poi = sorted(set(df.index.tolist()) & set(type_of_gene))

    print(str(len(found_poi)) + ' proteins of interest were found in the CCLE data')

    # select the rows directly by label
    df = df.loc[found_poi]

    # save version without protein categories (e.g. kinase)
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt', sep='\t')

    # add the protein type to the gene names
    df.index = [
        ('gene: ' + inst_gene, 'type: ' + type_of_gene[inst_gene])
        for inst_gene in found_poi
    ]

    print('-- save matrix with proteins_of_interest subset')
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi.txt', sep='\t')
def make_ldr_clust():
    '''
    Cluster the LDR assay-by-cell-line matrix and write the clustergram JSON.

    Loads 'ldr_mat.json', filters the count matrix with
    d3_clustergram.filter_sim_mat, cherry-picks the released / unreleased
    matrices down to the surviving nodes, clusters rows and columns with the
    'cosine' setting and writes the result to
    'static/networks/LDR_as_cl.json'. Summary statistics are printed along
    the way.
    '''
    import json_scripts
    import numpy as np
    import d3_clustergram

    def print_sums():
        # totals of the full matrix and of the released / unreleased parts
        print('sum all \t' + str(np.sum(ldr['mat'])))
        print('sum yes \t' + str(np.sum(ldr['rl']['t'])))
        print('sum no \t' + str(np.sum(ldr['rl']['f'])))

    # load LDR data
    ldr = json_scripts.load_to_dict('ldr_mat.json')
    print(ldr.keys())

    # the JSON stores nested lists; turn them back into arrays
    ldr['mat'] = np.asarray(ldr['mat'])
    for status in ('t', 'f'):
        ldr['rl'][status] = np.asarray(ldr['rl'][status])

    print_sums()

    print(len(ldr['nodes']['as']))
    print(len(ldr['nodes']['cl']))
    print(ldr['mat'].shape)

    # unfiltered nodes: assays are rows, cell lines are columns
    nodes_uf = {'row': ldr['nodes']['as'], 'col': ldr['nodes']['cl']}

    # clustering parameters
    compare_cutoff = 0.05
    min_num_compare = 2

    # filter to remove nodes with no values
    ldr['mat'], nodes = d3_clustergram.filter_sim_mat(ldr['mat'], nodes_uf, 1, 1)

    # reduce the released / unreleased matrices to the surviving nodes
    for status in ('t', 'f'):
        ldr['rl'][status] = d3_clustergram.cherrypick_mat_from_nodes(
            nodes_uf, nodes, ldr['rl'][status])

    print('size all \t' + str(ldr['mat'].shape))
    print('size yes \t' + str(ldr['rl']['t'].shape))
    print('size no \t' + str(ldr['rl']['f'].shape))
    print('\n')

    print_sums()
    print('total yes/no:\t' + str(np.sum(ldr['rl']['t']) + np.sum(ldr['rl']['f'])))
    print('\n\n\n')

    # the row nodes are printed twice, each time followed by blank lines
    for _ in range(2):
        for row_name in nodes['row']:
            print(row_name)
        print('\n\n\n')

    # cluster rows and columns
    print('calculating clustering')
    clust_order = d3_clustergram.cluster_row_and_column(
        nodes, ldr['mat'], 'cosine', compare_cutoff, min_num_compare)
    print('finished calculating clustering')

    # output location of the clustergram
    base_path = 'static/networks/'
    full_path = base_path + 'LDR_as_cl.json'

    # no class information is attached to rows or columns
    row_class = {}
    col_class = {}

    print(len(nodes['row']))
    print(len(nodes['col']))

    # write the clustergram
    d3_clustergram.write_json_single_value(
        nodes, clust_order, ldr, full_path, row_class, col_class)
def add_grant_num_to_clust():
    '''
    Append a grants column to the matrix stored in
    'andrew_data/cumul_probs.json' and save the file in place.

    A column of zeros is appended to 'data_mat'. Then, for every gene of
    'andrew_data/grants_per_gene.json' that is also a row of the matrix, the
    value in column 82 of that row is set to 1.

    NOTE(review): the column index 82 and the value 1 are placeholder
    ("fake") data carried over unchanged; the gene's real
    'CumulProbWeightSum' is not written. Confirm before relying on this
    column.

    The column names in data_json['nodes']['col'] are not modified here.
    '''
    import json_scripts
    # numpy.zeros replaces scipy.zeros, which was only a deprecated alias of
    # the NumPy function
    import numpy as np

    # hard-coded target column of the placeholder grant data (see docstring)
    grant_col = 82

    print('\n-----------------\nadding grant numbers\n-----------------\n')

    data_json = json_scripts.load_to_dict('andrew_data/cumul_probs.json')
    row_names = data_json['nodes']['row']

    print('\nthere are ' + str(len(row_names)) + ' genes in total')
    print('there are ' + str(len(data_json['nodes']['col'])) + ' resources in total\n')

    data_mat = np.asarray(data_json['data_mat'])

    print('data_mat shape')
    print(data_mat.shape)

    print('\ngoing to add grants per gene as a column into the harmonogram\n')

    # column of zeros that is added to the matrix as a new last column
    extra_col = np.zeros([len(row_names), 1])

    print('extra col shape')
    print(extra_col.shape)
    print(extra_col)

    data_mat = np.hstack((data_mat, extra_col))

    print('data_mat shape after adding in extra column')
    print(data_mat.shape)

    print('there are ' + str(len(data_json['nodes']['col'])) +
          ' resources in total after adding grants per gene\n')

    grants_gene = json_scripts.load_to_dict('andrew_data/grants_per_gene.json')

    # gene -> first row position, instead of one list.index scan per gene
    row_position = {}
    for pos, gene in enumerate(row_names):
        row_position.setdefault(gene, pos)

    # row of the last gene that was found; stays None if no gene matches
    last_index = None

    for inst_gene in grants_gene:
        inst_index = row_position.get(inst_gene)
        if inst_index is not None:
            # placeholder value, see the NOTE in the docstring
            data_mat[inst_index, grant_col] = 1
            last_index = inst_index

    print('\n-------------\nchecking data_mat\n----------------\n')
    print(len(data_mat[:, -1]))

    # the spot checks are skipped when there is nothing to show; the first
    # one used to fail with a NameError when no gene was found
    if last_index is not None:
        print(data_mat[last_index, grant_col])
    if data_mat.shape[0] > 2:
        print(data_mat[2, grant_col])

    # convert data_mat to a nested list and put it back into the json
    data_json['data_mat'] = data_mat.tolist()

    json_scripts.save_to_json(data_json, 'andrew_data/cumul_probs.json', 'no_indent')
def load_andrew_data():
    '''
    Convert the raw gene/resource data into a nodes + matrix JSON.

    Input 'andrew_data/gene_dataset_cumulprobs_20150609.json' is a list of
    dictionaries with the keys 'label' and 'entries'. The first element
    describes the columns (its entries are the resource names); every other
    element is one gene (label) with its value in each resource.

    Only resources that have a real name in 'resource_real_names.json' are
    kept; the columns of the output are those real names. The value of
    'CumulProbWeightSum' from 'andrew_data/grants_per_gene.json' is written
    into the 'Grants_Per_Gene' column for every gene that is a row.

    Output 'andrew_data/cumul_probs.json':
        'nodes'    - {'row': gene names, 'col': real resource names}
        'data_mat' - nested list, genes x resources

    Raises ValueError if 'Grants_Per_Gene' is not one of the real names.
    '''
    import json_scripts
    # numpy.zeros replaces scipy.zeros, which was only a deprecated alias of
    # the NumPy function
    import numpy as np

    # load resource classes
    load_resource_classes()

    # load resource mapping names
    load_resource_real_names()

    matrix = json_scripts.load_to_dict(
        'andrew_data/gene_dataset_cumulprobs_20150609.json')

    grants_gene = json_scripts.load_to_dict('andrew_data/grants_per_gene.json')

    # only the resources with real names are kept
    rn = json_scripts.load_to_dict('resource_real_names.json')

    print('\nstarting to process data')

    nodes = {}
    nodes['row'] = []
    # A real list is needed: on Python 3 a dict view has no .index() and
    # cannot be written to JSON.
    nodes['col'] = list(rn.values())

    # column index of grants per gene
    col_index_grant = nodes['col'].index('Grants_Per_Gene')

    # real name -> first column position
    col_position = {}
    for pos, real_name in enumerate(nodes['col']):
        col_position.setdefault(real_name, pos)

    # the first element of the list holds the column information
    all_res = matrix[0]['entries']
    gene_rows = matrix[1:]
    num_rows = len(matrix) - 1

    # rows - genes, cols - good resources
    data_mat = np.zeros([num_rows, len(rn)])

    print('\n---------------\nadding original data to matrix\n----------------')

    # For every input column: the output column it maps to, or None when the
    # resource has no real name. Computed once instead of per matrix cell.
    target_cols = [
        col_position[rn[res_name]] if res_name in rn else None
        for res_name in all_res
    ]

    for matrix_index, inst_row in enumerate(gene_rows):

        nodes['row'].append(inst_row['label'])

        for j, inst_data_point in enumerate(inst_row['entries']):
            inst_index = target_cols[j]
            if inst_index is not None:
                data_mat[matrix_index, inst_index] = inst_data_point

    print('\n---------------\nadding grants to matrix\n----------------')

    # gene -> first row position
    row_position = {}
    for pos, gene in enumerate(nodes['row']):
        row_position.setdefault(gene, pos)

    for inst_gene in grants_gene:
        inst_index = row_position.get(inst_gene)
        if inst_index is not None:
            data_mat[inst_index, col_index_grant] = \
                grants_gene[inst_gene]['CumulProbWeightSum']

    # convert the array to a nested list so it can be written as JSON
    inst_dict = {}
    inst_dict['nodes'] = nodes
    inst_dict['data_mat'] = data_mat.tolist()

    json_scripts.save_to_json(inst_dict, 'andrew_data/cumul_probs.json', 'no_indent')
def generate_d3_json():
    '''
    Write one clustered visualization JSON per gene class.

    Loads the matrix saved in 'andrew_data/cumul_probs.json' and, for every
    class in 'gene_classes_harmonogram.json', takes the rows whose gene
    belongs to the class, clusters them with d3_clustergram ('cosine') and
    saves the result to 'static/networks/<class>_cumul_probs.json'.

    Before saving, every column node gets a 'data_group' (its resource
    class with spaces replaced by underscores) and every link gets an
    'info' flag that is 1 when the link targets the 'Grants_Per_Gene'
    column and 0 otherwise.

    The class matrix is built with a single row selection, so it is always
    2-D (a class with one gene gives a 1 x n matrix) and no longer grows by
    repeated stacking.
    '''
    import json_scripts
    import d3_clustergram
    import numpy as np

    print('loading json in generate_d3_json')

    data_json = json_scripts.load_to_dict('andrew_data/cumul_probs.json')

    nodes = data_json['nodes']
    data_mat = np.asarray(data_json['data_mat'])

    print(nodes['col'])
    print(data_mat.shape)

    print('calculating clustering orders')

    # gene classes
    gc = json_scripts.load_to_dict('gene_classes_harmonogram.json')
    # resource classes
    # NOTE(review): the file name is spelled 'harminogram' here; kept as is,
    # presumably matching the file on disk - confirm.
    rc = json_scripts.load_to_dict('resource_classes_harminogram.json')

    for inst_class in gc:

        print(inst_class + '\n')

        # positions of the rows whose gene is in this class
        class_genes = set(gc[inst_class])
        class_rows = [
            pos for pos, inst_gs in enumerate(nodes['row'])
            if inst_gs in class_genes
        ]

        class_nodes = {}
        class_nodes['col'] = nodes['col']
        class_nodes['row'] = [nodes['row'][pos] for pos in class_rows]

        # subset of data_mat that only has genes of this class, e.g. kinases
        class_mat = data_mat[class_rows, :]

        # cluster the matrix
        clust_order = d3_clustergram.cluster_row_and_column(
            class_nodes, class_mat, 'cosine')

        print('generating d3 json')

        d3_json = d3_clustergram.d3_clust_single_value(
            class_nodes, clust_order, class_mat)

        # add the resource class (data_group) to every column node
        for inst_col in d3_json['col_nodes']:
            inst_res = inst_col['name']
            inst_col['data_group'] = rc[inst_res]['data_group'].replace(' ', '_')

        # flag the links that point at the grants column; this is used to
        # color the grant links outside of the d3_clustergram code
        for inst_link in d3_json['links']:
            target_name = d3_json['col_nodes'][inst_link['target']]['name']
            inst_link['info'] = 1 if target_name == 'Grants_Per_Gene' else 0

        print('saving to disk')

        json_scripts.save_to_json(
            d3_json, 'static/networks/' + inst_class + '_cumul_probs.json', 'no_indent')
def make_ldr_clust():
    '''
    Filter, cluster and export the LDR assay-by-cell-line matrix.

    Steps: load 'ldr_mat.json'; filter the count matrix with
    d3_clustergram.filter_sim_mat; cherry-pick the released and unreleased
    matrices to the nodes that remain; cluster rows and columns ('cosine');
    write the clustergram to 'static/networks/LDR_as_cl.json'. Sizes and
    sums of the matrices are printed before and after filtering.
    '''
    import json_scripts
    import numpy as np
    import d3_clustergram

    ldr = json_scripts.load_to_dict('ldr_mat.json')
    print(ldr.keys())

    # nested lists from the JSON become arrays again
    ldr['mat'] = np.asarray(ldr['mat'])
    released = ldr['rl']
    released['t'] = np.asarray(released['t'])
    released['f'] = np.asarray(released['f'])

    def show_sums():
        # one line per matrix: all, released, not released
        for label, values in (
            ('sum all \t', ldr['mat']),
            ('sum yes \t', released['t']),
            ('sum no \t', released['f']),
        ):
            print(label + str(np.sum(values)))

    show_sums()

    assays = ldr['nodes']['as']
    cell_lines = ldr['nodes']['cl']
    print(len(assays))
    print(len(cell_lines))
    print(ldr['mat'].shape)

    # unfiltered nodes
    nodes_uf = {}
    nodes_uf['row'] = assays
    nodes_uf['col'] = cell_lines

    # filter to remove nodes with no values
    filtered_mat, nodes = d3_clustergram.filter_sim_mat(ldr['mat'], nodes_uf, 1, 1)
    ldr['mat'] = filtered_mat

    # cherrypick the released matrices using the filtered nodes
    released['t'] = d3_clustergram.cherrypick_mat_from_nodes(
        nodes_uf, nodes, released['t'])
    released['f'] = d3_clustergram.cherrypick_mat_from_nodes(
        nodes_uf, nodes, released['f'])

    for label, values in (
        ('size all \t', ldr['mat']),
        ('size yes \t', released['t']),
        ('size no \t', released['f']),
    ):
        print(label + str(values.shape))
    print('\n')

    show_sums()
    yes_plus_no = np.sum(released['t']) + np.sum(released['f'])
    print('total yes/no:\t' + str(yes_plus_no))
    print('\n\n\n')

    # print out the row nodes (done twice, as before)
    passes_left = 2
    while passes_left > 0:
        for row_label in nodes['row']:
            print(row_label)
        print('\n\n\n')
        passes_left -= 1

    # cluster rows and columns
    compare_cutoff = 0.05
    min_num_compare = 2
    print('calculating clustering')
    clust_order = d3_clustergram.cluster_row_and_column(
        nodes, ldr['mat'], 'cosine', compare_cutoff, min_num_compare)
    print('finished calculating clustering')

    # where the clustergram is written
    base_path = 'static/networks/'
    full_path = base_path + 'LDR_as_cl.json'

    print(len(nodes['row']))
    print(len(nodes['col']))

    # write the clustergram; rows and columns carry no class information
    row_class = {}
    col_class = {}
    d3_clustergram.write_json_single_value(
        nodes, clust_order, ldr, full_path, row_class, col_class)