Пример #1
0
def convert_enr_dict_to_array(enr, pval_cutoff):
	"""Convert per-cell-line enrichment results into term x cell-line matrices.

	Args:
		enr: dict keyed by cell line. Each value is a dict with the keys
			'up_genes' and 'dn_genes', each holding a list of enrichment
			dicts that provide at least 'name' and 'pval_bh'.
		pval_cutoff: largest 'pval_bh' that still counts as significant.
			Entries above the cutoff contribute nothing to the matrices.

	Returns:
		(nodes, data_mat) where
		nodes['row'] is the sorted list of unique term names (every term is
		kept, significant or not), nodes['col'] is the sorted list of cell
		lines, and data_mat is a dict of three arrays of shape
		(len(rows), len(cols)):
			'up'    -log2(pval_bh) of significant up-gene enrichment
			'dn'     log2(pval_bh) of significant dn-gene enrichment (<= 0)
			'merge' mean of the two when both are significant, otherwise
			        whichever one is significant
		Cells without significant enrichment stay 0.
	"""
	import numpy as np

	updn = ['up_genes', 'dn_genes']

	# the columns are the cell lines
	all_col = sorted(enr.keys())

	# Index every enrichment list by term name once, so filling the matrices
	# does not rescan the lists for every cell. When a name occurs more than
	# once in a list the first entry wins.
	enr_by_name = {}
	all_names = set()
	for inst_cl in all_col:
		enr_by_name[inst_cl] = {}
		for inst_updn in updn:
			by_name = {}
			for inst_term in enr[inst_cl][inst_updn]:
				by_name.setdefault(inst_term['name'], inst_term)
			enr_by_name[inst_cl][inst_updn] = by_name
			all_names.update(by_name)

	# the rows are the enriched terms: unique and sorted
	all_row = sorted(all_names)

	# save row and column data to nodes
	nodes = {'row': all_row, 'col': all_col}

	# np.zeros replaces scipy.zeros, a NumPy alias that newer SciPy releases
	# no longer expose
	mat_shape = [len(all_row), len(all_col)]
	data_mat = {
		'merge': np.zeros(mat_shape),
		'up': np.zeros(mat_shape),
		'dn': np.zeros(mat_shape),
	}

	def neg_log_pval(inst_dict):
		# nan marks "term absent or not significant"
		if inst_dict is None:
			return np.nan
		if inst_dict['pval_bh'] <= pval_cutoff:
			return -np.log2(inst_dict['pval_bh'])
		return np.nan

	for i, inst_name in enumerate(all_row):
		for j, inst_cl in enumerate(all_col):

			up_nl = neg_log_pval(enr_by_name[inst_cl]['up_genes'].get(inst_name))
			dn_nl = neg_log_pval(enr_by_name[inst_cl]['dn_genes'].get(inst_name))

			has_up = not np.isnan(up_nl)
			has_dn = not np.isnan(dn_nl)

			# down enrichment is stored with a negative sign
			if has_up:
				data_mat['up'][i, j] = up_nl
			if has_dn:
				data_mat['dn'][i, j] = -dn_nl

			if has_up and has_dn:
				# both directions enriched: merge is the mean of the signed scores
				data_mat['merge'][i, j] = np.mean([up_nl, -dn_nl])
			elif has_up:
				data_mat['merge'][i, j] = up_nl
			elif has_dn:
				data_mat['merge'][i, j] = -dn_nl

	# return nodes, and data_mat
	return nodes, data_mat
Пример #2
0
def convert_enr_to_nodes_mat(enr):
	"""Build a binary gene x term membership matrix from enrichment results.

	Args:
		enr: list of enrichment dicts, each providing at least 'name' (the
			enriched term) and 'int_genes' (the intersecting input genes).

	Returns:
		(nodes, data_mat) where nodes['row'] is the sorted list of unique
		genes found in any 'int_genes' list, nodes['col'] is the list of term
		names in the order they appear in enr, and data_mat is an array of
		shape (len(rows), len(cols)) holding 1 where the gene is listed in
		the term's 'int_genes' and 0 elsewhere.
	"""
	import numpy as np

	# the columns are the enriched terms, in input order
	all_col = [inst_enr['name'] for inst_enr in enr]

	# the rows are the input genes: unique and sorted
	all_row = sorted({
		inst_gene
		for inst_enr in enr
		for inst_gene in inst_enr['int_genes']
	})

	# save row and column data to nodes
	nodes = {'row': all_row, 'col': all_col}

	# np.zeros replaces scipy.zeros, a NumPy alias that newer SciPy releases
	# no longer expose
	data_mat = np.zeros([len(all_row), len(all_col)])

	# gene -> row position, so the fill below avoids a list scan per gene
	row_index = {inst_gene: i for i, inst_gene in enumerate(all_row)}

	# column j belongs to enr[j], so fill it straight from that entry
	for j, inst_enr in enumerate(enr):
		for inst_gene in inst_enr['int_genes']:
			data_mat[row_index[inst_gene], j] = 1

	# return nodes, and data_mat
	return nodes, data_mat
Пример #3
0
def cluster_row_and_column( nodes, data_mat, dist_type, enr ):
	"""Compute cluster and rank orderings for the rows and columns of data_mat.

	Args:
		nodes: dict with 'row' and 'col' lists naming the rows and columns
			of data_mat.
		data_mat: 2-D array indexed as data_mat[row, col].
		dist_type: passed through unchanged to calc_dist_vectors.
		enr: list of enrichment dicts; the entry for each name in
			nodes['col'] must provide 'pval' and 'combined_score'.

	Returns:
		clust_order, a dict of dicts:
			['clust']['row'], ['clust']['col']  dendrogram leaf order
			['rank']['col']     len(cols), len(cols)-1, ..., 1
			['rank']['row']     position of each row when rows are sorted
			                    by ascending row sum of data_mat
			['pval']['col']     'pval' of each column term
			['pval_bh']['col']  'combined_score' of each column term (the
			                    key name is historical)
			['pval_bh']['row']  always an empty list
			['nl_pval']['col']  combined score, negatives clipped to 0
			['nl_pval']['row']  dot product of each row with
			                    ['nl_pval']['col']

	Relies on calc_dist_vectors being available at module scope and on the
	project module find_dict_in_list.
	"""
	import find_dict_in_list
	import scipy.cluster.hierarchy as hier
	import numpy as np

	num_row = len(nodes['row'])
	num_col = len(nodes['col'])

	# Generate Row and Column Distance Matrices
	############################################
	# define the minimum number of intersecting measurements
	min_num_intersect = 1

	# np.zeros replaces scipy.zeros, a NumPy alias that newer SciPy releases
	# no longer expose
	row_dm = np.zeros([num_row, num_row])
	col_dm = np.zeros([num_col, num_col])

	# every ordered pair is evaluated; symmetry of calc_dist_vectors is not assumed
	for i in range(num_row):
		for j in range(num_row):
			row_dm[i, j] = calc_dist_vectors( data_mat[i, :], data_mat[j, :], dist_type, min_num_intersect )

	for i in range(num_col):
		for j in range(num_col):
			col_dm[i, j] = calc_dist_vectors( data_mat[:, i], data_mat[:, j], dist_type, min_num_intersect )

	# initialize the result container
	clust_order = {}
	clust_order['clust'] = {}
	clust_order['rank'] = {}
	clust_order['pval'] = {}
	clust_order['pval_bh'] = {}
	clust_order['nl_pval'] = {}

	# Cluster Rows and Columns
	###########################
	cluster_method = 'centroid'
	# NOTE(review): hier.linkage treats a 2-D input as observation vectors,
	# not as a distance matrix, so the square matrices below are clustered
	# as if every row were a data point. Passing the condensed form
	# (scipy.spatial.distance.squareform) would cluster on the distances
	# themselves but changes the resulting order - confirm before switching.
	row_linkage = hier.linkage( row_dm, method=cluster_method )
	clust_order['clust']['row'] = hier.dendrogram( row_linkage, no_plot=True )['leaves']

	col_linkage = hier.linkage( col_dm, method=cluster_method )
	clust_order['clust']['col'] = hier.dendrogram( col_linkage, no_plot=True )['leaves']

	# rank terms by pval
	#####################
	# presumably enr is already ordered by p-value, so the column rank simply
	# counts down from len(cols) to 1 - verify against the caller
	clust_order['nl_pval']['row'] = []
	clust_order['nl_pval']['col'] = []
	clust_order['pval']['col'] = []
	clust_order['pval_bh']['col'] = []
	clust_order['pval_bh']['row'] = []

	tmp_col_order = []
	for i, inst_term in enumerate(nodes['col']):
		tmp_col_order.append( num_col - i )

		# get enrichment dict
		inst_dict = find_dict_in_list.main( enr, 'name', inst_term )

		clust_order['pval']['col'].append( inst_dict['pval'] )

		# the combined score is stored in place of pval_bh
		combined_score = inst_dict['combined_score']
		clust_order['pval_bh']['col'].append( combined_score )

		# negative combined scores are clipped to zero
		clust_order['nl_pval']['col'].append( 0 if combined_score < 0 else combined_score )

	# save rank order
	clust_order['rank']['col'] = tmp_col_order

	# rank genes by number of terms
	################################
	num_term = []
	for i in range(num_row):
		# row sum: for a binary data_mat this is the number of terms the gene is in
		num_term.append( np.sum(data_mat[i, :]) )

		# weight each row by the clipped combined scores of its columns
		clust_order['nl_pval']['row'].append( np.dot( data_mat[i, :], clust_order['nl_pval']['col'] ) )

	# stable ascending sort of the row positions by their sum
	sorted_rows = sorted( range(num_row), key=lambda i: num_term[i] )

	# name -> position in the sorted order; the first occurrence wins, which
	# matches a list.index lookup without rescanning the list for every row
	sorted_position = {}
	for inst_pos, i in enumerate(sorted_rows):
		sorted_position.setdefault( nodes['row'][i], inst_pos )

	# save the sorted indexes
	clust_order['rank']['row'] = [ sorted_position[inst_gene] for inst_gene in nodes['row'] ]

	return clust_order