예제 #1
0
파일: stats.py 프로젝트: mdrasmus/spimap
def rhyper(m, n, M, N, report=0):
    '''
    Return cumulative hypergeometric tail probabilities (uses R through RPy).

    N = total balls in urn
    M = total white balls in urn
    n = drawn balls from urn
    m = drawn white balls from urn

    report selects what is returned:
        0 -> p-value for over-representation  (phyper(m - 1, ..., upper tail))
        1 -> p-value for under-representation (phyper(m, ..., lower tail))
        2 -> tuple (over, under)

    Raises ValueError for any other report value, and AssertionError when
    the counts are not plain ints or are inconsistent (m > n, m > M or
    n > N).
    '''

    assert ((type(m) == type(n) == type(M) == type(N) == int) and m <= n
            and m <= M and n <= N)

    # Reject a bad option before touching R, so the error does not depend on
    # RPy being importable.  (The original raised a bare string, which is
    # not a valid exception object.)
    if report not in (0, 1, 2):
        raise ValueError("unknown option")

    # Imported lazily so that merely importing this module does not need R.
    from rpy import r

    if report == 1:
        #p-val for under-repr.
        return r.phyper(m, M, N - M, n)

    # Upper tail starting at m - 1, i.e. m or more white balls drawn.
    over = r.phyper(m - 1, M, N - M, n, lower_tail=False)
    if report == 0:
        #p-val for over-repr.
        return over

    #tuple (over, under)
    return over, r.phyper(m, M, N - M, n)
예제 #2
0
파일: stats.py 프로젝트: jeffhsu3/argweaver
def rhyper(m, n, M, N, report=0):
    '''
    Return cumulative hypergeometric tail probabilities (uses R through RPy).

    N = total balls in urn
    M = total white balls in urn
    n = drawn balls from urn
    m = drawn white balls from urn

    report selects what is returned:
        0 -> p-value for over-representation  (phyper(m-1, ..., upper tail))
        1 -> p-value for under-representation (phyper(m, ..., lower tail))
        2 -> tuple (over, under)

    Raises ValueError for any other report value, and AssertionError when
    the counts are not plain ints or are inconsistent (m > n, m > M or
    n > N).
    '''

    assert ((type(m) == type(n) == type(M) == type(N) == int)
            and m <= n and m <= M and n <= N)

    # Validate the option up front: a string cannot be raised as an
    # exception, and checking here avoids importing R for a doomed call.
    if report not in (0, 1, 2):
        raise ValueError("unknown option")

    # Imported lazily so that merely importing this module does not need R.
    from rpy import r

    if report == 0:
        #p-val for over-repr.
        return r.phyper(m-1, M, N-M, n, lower_tail=False)
    if report == 1:
        #p-val for under-repr.
        return r.phyper(m, M, N-M, n)

    #tuple (over, under)
    over = r.phyper(m-1, M, N-M, n, lower_tail=False)
    under = r.phyper(m, M, N-M, n)
    return over, under
예제 #3
0
	def _cluster_stat(self, mcl_id, vertex_set):
		"""
		Score one cluster against every GO function found in it and queue the
		function(s) with the smallest hypergeometric p-value.

		mcl_id: cluster identifier; written to the log and stored with the result.
		vertex_set: string of comma-separated integer gene numbers with one
			delimiter character on each side (the first and last characters
			are dropped before splitting). It doubles as the cache key.

		On success appends [p_value_min, go_no_vector, unknown_gene_ratio, mcl_id]
		to self.to_db, increments self.no_of_records and stores the first three
		values in self.cluster_memory so a repeated vertex_set skips the R calls.
		Returns None. A cluster whose local_go_no_dict is empty is only logged.
		"""
		if vertex_set in self.cluster_memory:
			# Same vertex set seen before: reuse the cached statistics.
			entry = self.cluster_memory[vertex_set]
			self.to_db.append([entry[0], entry[1], entry[2], mcl_id])
			self.no_of_records += 1
			return

		# A list (not a lazy map object) is needed because its length is taken.
		vertex_list = [int(vertex) for vertex in vertex_set[1:-1].split(',')]
		cluster_size = len(vertex_list)
		self.local_go_no_dict_construct(vertex_list)
		# presumably go_no 0 is the "unknown function" class -- confirm
		if 0 in self.local_go_no_dict:
			unknown_gene_ratio = self.local_go_no_dict[0]/float(cluster_size)
		else:
			unknown_gene_ratio = 0
		if not self.local_go_no_dict:
			self.logfile.write('%d %s: local_go_no_dict empty\n'%(mcl_id, repr(vertex_set)))
			return

		# p-value -> list of go_nos that reached exactly that p-value
		go_nos_by_p_value = {}
		for go_no in self.local_go_no_dict:
			if self.wu:
				x = self.local_go_no_dict[go_no]
				m = self.global_go_no_to_size_dict[go_no]
				n = self.no_of_genes - m
				k = cluster_size
			# NOTE(review): nothing assigns x, m, n, k when self.wu is false, so
			# the R call below fails with an unbound-local error in that mode.
			# The intended non-wu computation is not visible here -- confirm.
			p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)
			if self.bonferroni:
				# Bonferroni correction: scale by the number of functions tested.
				p_value = p_value*len(self.local_go_no_dict)
			self.logfile.write('%d %d %d %d %d %d %f %f\n'%(mcl_id,go_no,x,m,n,k,p_value, unknown_gene_ratio))
			go_nos_by_p_value.setdefault(p_value, []).append(go_no)

		if not go_nos_by_p_value:
			self.logfile.write('%d %s: all vertices belong to population singleton classes\n'%(mcl_id, repr(vertex_set)))
			return
		p_value_min = min(go_nos_by_p_value)
		go_no_vector = go_nos_by_p_value[p_value_min]

		self.no_of_records += 1
		self.cluster_memory[vertex_set] =[p_value_min, go_no_vector, unknown_gene_ratio]
		self.to_db.append([p_value_min, go_no_vector, unknown_gene_ratio, mcl_id])
예제 #4
0
	def _cluster_stat(self, mcl_id, vertex_set, connectivity):
		"""
		Leave-one-out GO enrichment for one cluster: for every gene in the
		cluster, emit one vector of hypergeometric p-values indexed by go_no.

		mcl_id: cluster identifier, written with every output row.
		vertex_set: string of comma-separated integer gene numbers with one
			delimiter character on each side (first and last characters are
			dropped before splitting).
		connectivity: number passed through unchanged to the output row.

		For each gene, self.go_no_dict_adjust(gene_no) is called first and the
		counts are then read from self._local_go_no_dict / self._global_go_no_dict
		(presumably adjusted to exclude that gene -- confirm). Slot 0 of the
		vector holds a ratio, not a p-value. A row is written to self.outf when
		self.output is set, otherwise inserted through self.curs when
		self.needcommit is set. self.no_of_records is incremented per gene.
		Returns None.
		"""
		vertex_list_all = [int(vertex) for vertex in vertex_set[1:-1].split(',')]
		#this filter will only be useful when Jasmine's strategy is applied to whole gene-set(unknown included)
		vertex_list = [gene_no for gene_no in vertex_list_all if gene_no in self.global_gene_to_go_dict]
		cluster_size = len(vertex_list)
		self.local_go_no_dict_construct(vertex_list)
		for gene_no in vertex_list_all:
			# Start every gene from a clean vector; reusing one list across genes
			# let p-values of go_nos that are absent for this gene leak in from
			# the previous gene's row.
			p_value_vector = [1] * self.no_of_functions
			self.go_no_dict_adjust(gene_no)
			# The gene is removed from the population and the sample only when
			# wu's strategy is off and the gene is in global_gene_to_go_dict.
			# code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included)
			if self.wu or (gene_no not in self.global_gene_to_go_dict):
				left_out = 0
			else:
				left_out = 1
			k = cluster_size - left_out
			for go_no in self._local_go_no_dict:
				x = self._local_go_no_dict[go_no]
				m = self._global_go_no_dict[go_no]
				n = self.no_of_genes - left_out - m
				p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)
				if self.bonferroni:
					# Bonferroni correction: scale by the number of functions tested.
					p_value = p_value*len(self._local_go_no_dict)
				if self.log:
					self.logfile.write('%d %d %d %d %d %d %d %f\n'%
						(mcl_id,gene_no,go_no,x,m,n,k,p_value))
				p_value_vector[go_no] = p_value
			#for the unknown class, use the ratio instead of p_value, in accordance with mcl_result_stat.py
			if self.wu:
				# NOTE(review): raises KeyError if key 0 is missing in wu mode -- confirm it is always present.
				p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size)
			elif 0 in self._local_go_no_dict:
				#not wu's strategy, throw away the gene, the cluster_size is down by 1.
				#after leave_one_out, still unknown genes present
				p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size-1)
			else:
				#no unknown genes
				p_value_vector[0] = 1
			if self.output:
				self.outf.write('%d\t%d\t%s\t%f\n'%(mcl_id, gene_no, repr(p_value_vector), connectivity))
			elif self.needcommit:
				self.curs.execute("insert into %s(mcl_id, leave_one_out, p_value_vector, connectivity)\
				values(%d, %d, ARRAY%s, %8.6f)"%(self.target_table, mcl_id, gene_no, repr(p_value_vector), connectivity))
			self.no_of_records += 1
예제 #5
0
	def get_information_of_go_functions(self, curs, go_no2association_genes, cluster_size, \
		no_of_total_genes, p_value_cut_off=0, go_table='go'):
		"""
		04-06-05
			Look up each go_no in go_table and attach a hypergeometric p-value.

			input:
				curs: database cursor used for the lookups
				go_no2association_genes: mapping go_no -> collection of the
					genes associated with it (only its length is used)
				cluster_size: number of draws passed to phyper
				no_of_total_genes: population size passed to phyper
				p_value_cut_off: when non-zero, entries whose p-value is
					greater than it are dropped
				go_table: name of the table to query
			output: go_no2information, mapping go_no ->
				[go_no, go_id, name, depth, population size, local size, p_value]

			Progress messages go to sys.stderr.
		"""
		sys.stderr.write("Getting information about a list of go_nos...")
		go_no2information = {}
		# .items() instead of .iteritems(): the latter exists only on Python 2 dicts.
		for go_no, association_genes in go_no2association_genes.items():
			no_of_associated_genes = len(association_genes)
			curs.execute("select go_no, go_id, name, depth, array_upper(gene_array,1) from %s \
				where go_no=%s"%(go_table, go_no))
			for row in curs.fetchall():
				# Last selected column is array_upper(gene_array,1), used as the population size.
				population_size = row[-1]
				p_value = r.phyper(no_of_associated_genes-1, population_size, no_of_total_genes-population_size, cluster_size, lower_tail = r.FALSE)
				if p_value_cut_off and p_value>p_value_cut_off:
					#non zero cut-off, this p-value is too large to keep
					continue
				go_no2information[go_no] = list(row) + [no_of_associated_genes, p_value]	#go_no, go_id, name, depth, population size, local size, p_value

		sys.stderr.write("Done.\n")
		return go_no2information
예제 #6
0
	def _cluster_stat(self, mcl_id, vertex_set):
		"""
		Log one line per GO function that has more than one gene in the cluster.

		mcl_id: cluster identifier, first field of every log line.
		vertex_set: string of comma-separated integer gene numbers with one
			delimiter character on each side (first and last characters are
			dropped before splitting).

		Each log line is tab-separated: mcl_id, function name, x, k, m,
		self.no_of_genes, p-value, '|'-joined gene ids of the function, repr of
		all gene ids of the cluster, repr of the transfac dict.
		Increments self.no_of_records once per cluster that has a non-empty
		local go_no dict. Returns None.
		"""
		# A list (not a lazy map object) is needed because its length is taken.
		vertex_list = [int(vertex) for vertex in vertex_set[1:-1].split(',')]
		vertex_list_gene_symbol = [self.gene_no2gene_id[vertex] for vertex in vertex_list]
		cluster_size = len(vertex_list)
		local_go_no_dict = self.local_go_no_dict_construct(vertex_list)

		if not local_go_no_dict:
			self.logfile.write('%d %s: local_go_no_dict empty\n'%(mcl_id, repr(vertex_set)))
			return
		for go_no in local_go_no_dict:
			if self.wu:
				x = len(local_go_no_dict[go_no])
				m = self.global_go_no_to_size_dict[go_no]
				n = self.no_of_genes - m
				k = cluster_size
			# NOTE(review): nothing assigns x, m, n, k when self.wu is false, so
			# the test below fails with an unbound-local error in that mode.
			# The intended non-wu computation is not visible here -- confirm.
			if x <= 1:
				#this function must have more than one gene associated.
				# Skip before calling R: the p-value would never be reported.
				continue
			p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)
			if self.bonferroni:
				# Bonferroni correction: scale by the number of functions tested.
				p_value = p_value*len(local_go_no_dict)
			transfac_dict = self.transfac_dict_construct(local_go_no_dict[go_no])
			gene_id_list = [self.gene_no2gene_id[gene_no] for gene_no in local_go_no_dict[go_no]]
			self.logfile.write('%d\t%s\t%d\t%d\t%d\t%d\t%f\t%s\t%s\t%s\n'%(mcl_id,
				self.global_go_no2go_name[go_no], x, k, m, self.no_of_genes,
				p_value, '|'.join(gene_id_list), repr(vertex_list_gene_symbol), repr(transfac_dict)))

		self.no_of_records += 1
예제 #7
0
	def _cluster_stat(self, mcl_id, vertex_set, connectivity):
		"""
		Leave-one-out GO enrichment for one cluster: for every gene in the
		cluster, emit one vector of hypergeometric p-values indexed by go_no.
		go_nos rejected by the size, depth, uniformity or count filters keep
		the default value 1. Slot 0 of the vector holds a ratio, not a p-value.

		mcl_id: cluster identifier, written with every output row.
		vertex_set: string of comma-separated integer gene numbers with one
			delimiter character on each side (first and last characters are
			dropped before splitting).
		connectivity: number passed through unchanged to the output row.

		self.no_of_records is incremented per gene and used as cluster_stat_id
		for the database insert. Returns None.

		04-18-05
			add two important criteria to avoid the situation that hypergeometric test
			is powerless (the population size of the go-no is too small).
			1. percentage of associated-genes over total known genes >= uniformity (0.5 default)
			2. apart from the percentage, the absolute number is also needed in case the cluster is too small.
		04-19-05
			fix a bug. self._no_of_known_genes_of_the_cluster could be 0.
		"""
		vertex_list_all = [int(vertex) for vertex in vertex_set[1:-1].split(',')]
		#this filter will only be useful when Jasmine's strategy is applied to whole gene-set(unknown included)
		vertex_list = [gene_no for gene_no in vertex_list_all if gene_no in self.global_gene_to_go_dict]
		cluster_size = len(vertex_list)
		self.local_go_no_dict_construct(vertex_list)
		for gene_no in vertex_list_all:
			# Start every gene from a clean vector; reusing one list across genes
			# let p-values of go_nos that are filtered out or absent for this
			# gene leak in from the previous gene's row.
			p_value_vector = [1] * self.no_of_functions
			self.go_no_dict_adjust(gene_no)
			# The gene is removed from the population and the sample only when
			# wu's strategy is off and the gene is in global_gene_to_go_dict.
			# code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included)
			if self.wu or (gene_no not in self.global_gene_to_go_dict):
				left_out = 0
			else:
				left_out = 1
			k = cluster_size - left_out
			for go_no in self._local_go_no_dict:
				if self.global_go_no_to_size_dict[go_no]<self.min_node_size:	#06-11-05
					continue
				if self.go_no2depth[go_no]>self.max_node_depth:	#06-11-05
					continue
				x = self._local_go_no_dict[go_no]
				m = self._global_go_no_dict[go_no]
				n = self.no_of_genes - left_out - m
				if self._no_of_known_genes_of_the_cluster == 0:
					go_no_ratio = 0
				else:
					go_no_ratio = float(x)/self._no_of_known_genes_of_the_cluster	#NOTE: it's different from no_of_known_genes_of_the_cluster
				if go_no != 0:	#Neither filter applies to the 0(unknown) category.
					if go_no_ratio < self.uniformity:
						#ignore the function category if its percentage is < uniformity
						continue
					if x < 3:	#apart from the percentage, the absolute number is also needed in case the cluster is too small.
						continue
				p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)
				if self.bonferroni:
					# Bonferroni correction: scale by the number of functions tested.
					p_value = p_value*len(self._local_go_no_dict)
				if self.log:
					self.logfile.write('%d %d %d %d %d %d %d %f\n'%
						(mcl_id,gene_no,go_no,x,m,n,k,p_value))
				p_value_vector[go_no] = p_value
			#for the unknown class, use the ratio instead of p_value, in accordance with mcl_result_stat.py
			if self.wu:
				# NOTE(review): raises KeyError if key 0 is missing in wu mode -- confirm it is always present.
				p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size)
			elif 0 in self._local_go_no_dict:
				#not wu's strategy, throw away the gene, the cluster_size is down by 1.
				#after leave_one_out, still unknown genes present
				p_value_vector[0] = self._local_go_no_dict[0]/float(cluster_size-1)
			else:
				#no unknown genes
				p_value_vector[0] = 1
			#03-18-05increment before inserted into table, cluster_stat_id starting from 1
			self.no_of_records += 1
			if self.output:
				self.outf.write('%d\t%d\t%s\t%f\n'%(mcl_id, gene_no, repr(p_value_vector), connectivity))
			elif self.needcommit:
				self.curs.execute("insert into %s(cluster_stat_id, mcl_id, leave_one_out, p_value_vector, connectivity)\
				values(%d, %d, %d, ARRAY%s, %8.6f)"%(self.target_table, self.no_of_records, mcl_id, gene_no, repr(p_value_vector), connectivity))
예제 #8
0
	def _cluster_stat(self, mcl_id, vertex_set, connectivity):
		"""
		Leave-one-out GO enrichment for one cluster: for every gene in the
		cluster, emit the [p_value, go_no] pairs kept by
		self.retain_min_p_value_pairs, plus the unknown-gene ratio.

		mcl_id: cluster identifier, written with every output row.
		vertex_set: string of comma-separated integer gene numbers with one
			delimiter character on each side (first and last characters are
			dropped before splitting).
		connectivity: number passed through unchanged to the output row.

		Genes for which no go_no passes the filters produce no row and do not
		increment self.no_of_records. Returns None.

		04-18-05
			add two important criteria to avoid the situation that hypergeometric test
			is powerless (the population size of the go-no is too small).
			1. percentage of associated-genes over total known genes >= uniformity (0.5 default)
			2. apart from the percentage, the absolute number is also needed in case the cluster is too small.
		04-19-05
			fix a bug. self._no_of_known_genes_of_the_cluster could be 0.
		08-13-05
			remove the redundant information in the p_value_vector to save space
				p_value_vector only keeps (p_value,go_no) pairs with lowest p_value and go_no>=min_node_depth
			unknown_gene_ratio is split from p_value_vector
			a bug found, p_value_vector is not grounded after each gene cycle
			table submit() is not supported anymore
		08-15-05
			if go_no == 0, skip p-value calculation
			self.local_go_no_dict[go_no]<3, skip the go_no. previously it's using self._local_go_no_dict[go_no] to do filter,
			known genes and unknown genes get different treatment.
		"""
		vertex_list_all = [int(vertex) for vertex in vertex_set[1:-1].split(',')]
		#this filter will only be useful when Jasmine's strategy is applied to whole gene-set(unknown included)
		vertex_list = [gene_no for gene_no in vertex_list_all if gene_no in self.global_gene_to_go_dict]
		cluster_size = len(vertex_list)
		self.local_go_no_dict_construct(vertex_list)
		for gene_no in vertex_list_all:
			p_value_vector = []
			self.go_no_dict_adjust(gene_no)
			#for the unknown class, use the ratio instead of p_value, in accordance with mcl_result_stat.py
			if self.wu:
				# NOTE(review): raises KeyError if key 0 is missing in wu mode -- confirm it is always present.
				unknown_gene_ratio = self._local_go_no_dict[0]/float(cluster_size)	#08-13-05	unknown_gene_ratio is split from p_value_vector
			elif 0 in self._local_go_no_dict:
				#not wu's strategy, throw away the gene, the cluster_size is down by 1.
				#after leave_one_out, still unknown genes present
				unknown_gene_ratio = self._local_go_no_dict[0]/float(cluster_size-1)
			else:
				#no unknown genes
				unknown_gene_ratio = 0
			# The gene is removed from the population and the sample only when
			# wu's strategy is off and the gene is in global_gene_to_go_dict.
			# code after 'or' deals with the situation that Jasmine's strategy is applied to whole gene-set(unknown included)
			if self.wu or (gene_no not in self.global_gene_to_go_dict):
				left_out = 0
			else:
				left_out = 1
			k = cluster_size - left_out
			for go_no in self._local_go_no_dict:
				if go_no == 0:	#unknown function category doesn't need to do calculation
					continue
				if self.local_go_no_dict[go_no]<3:	#apart from the percentage, the absolute number is also needed in case the cluster is too small.
					#08-15-05 fix a bug here, previously it's using self._local_go_no_dict[go_no] to do filter. known genes and unknown genes get different treatment.
					continue
				if self.global_go_no_to_size_dict[go_no]<self.min_node_size:	#06-11-05
					continue
				if self.go_no2depth[go_no]<self.min_node_depth:	#06-11-05, 08-13-05 max_node_depth changed to min_node_depth
					continue
				x = self._local_go_no_dict[go_no]
				m = self._global_go_no_dict[go_no]
				n = self.no_of_genes - left_out - m
				if self._no_of_known_genes_of_the_cluster == 0:
					go_no_ratio = 0
				else:
					go_no_ratio = float(x)/self._no_of_known_genes_of_the_cluster	#NOTE: it's different from no_of_known_genes_of_the_cluster
				if go_no_ratio < self.uniformity:
					#ignore the function category if its percentage is < uniformity
					#(go_no 0 never reaches this point, it is skipped at the top of the loop)
					continue
				p_value = r.phyper(x-1,m,n,k,lower_tail = r.FALSE)
				if self.bonferroni:
					# Bonferroni correction: scale by the number of functions tested.
					p_value = p_value*len(self._local_go_no_dict)
				if self.log:
					self.logfile.write('%d %d %d %d %d %d %d %f\n'%
						(mcl_id,gene_no,go_no,x,m,n,k,p_value))
				p_value_vector.append([p_value, go_no])	#08-13-05
			if not p_value_vector:	#08-13-05	no qualified go_no for this gene_no
				continue
			p_value_vector = self.retain_min_p_value_pairs(p_value_vector)	#08-13-05	get the pairs with minimum p_value

			#03-18-05increment before inserted into table, cluster_stat_id starting from 1
			self.no_of_records += 1
			if self.output:
				self.outf.write('%d\t%d\t%s\t%f\t%f\n'%(mcl_id, gene_no, repr(p_value_vector), connectivity, unknown_gene_ratio))	#08-13-05
			elif self.needcommit:
				# NOTE(review): unknown_gene_ratio is not part of this insert; only the file output carries it.
				self.curs.execute("insert into %s(cluster_stat_id, mcl_id, leave_one_out, p_value_vector, connectivity)\
				values(%d, %d, %d, ARRAY%s, %8.6f)"%(self.target_table, self.no_of_records, mcl_id, gene_no, repr(p_value_vector), connectivity))