示例#1
0
	def get_recurrence_go_no_rec_array_cluster_id_ls(self, curs, pattern_table, mcl_id2go_no_set, occurrence_cutoff=0.8):
		"""
		Collect, for every go_no, the combined recurrence and the patterns that carry it.

		2006-09-26
			rows come from pattern_table; only patterns present in mcl_id2go_no_set are used
			mcl_id2enc_recurrence is for get_recurrence_rec_array_bs_no_list()

		Arguments:
			curs: database cursor; a server-side cursor named 'crs' is declared,
				fetched in batches and closed here
			pattern_table: name of the table holding (id, recurrence_array). It is
				interpolated into the SQL text (a table name cannot be sent as a
				bound parameter), so it has to come from a trusted source.
			mcl_id2go_no_set: mcl_id -> collection of go_no's predicted for that pattern
			occurrence_cutoff: a recurrence value >= this counts as 1, otherwise 0.
				The default 0.8 is the (arbitrary) value that used to be hard-coded.

		Returns a 3-tuple:
			recurrence_go_no_rec_array_cluster_id_ls: list of
				[recurrence, go_no, recurrence_array, mcl_id_set], sorted ascending.
				recurrence_array is decodeOccurrence() of the OR-ed encodings of all
				patterns of that go_no (not a binary vector); recurrence is its length.
			no_of_datasets: length of the first recurrence array that was parsed,
				0 if no row matched
			mcl_id2enc_recurrence: mcl_id -> encodeOccurrenceBv() of its binarized
				recurrence array
		"""
		sys.stderr.write("Getting recurrence_go_no_rec_array_cluster_id_ls...\n")
		no_of_datasets = 0
		go_no2recurrence_cluster_id = {}
		mcl_id2enc_recurrence = {}
		fetch_sql = "fetch 5000 from crs"
		curs.execute("DECLARE crs CURSOR FOR SELECT id, recurrence_array from %s"\
			%pattern_table)
		curs.execute(fetch_sql)
		rows = curs.fetchall()
		counter = 0	#all rows seen
		real_counter = 0	#rows whose pattern has functions predicted
		while rows:
			for mcl_id, recurrence_array in rows:
				counter += 1
				if mcl_id not in mcl_id2go_no_set:
					continue
				#recurrence_array arrives as text like '{0.9,0.1,...}'; strip the braces,
				#then turn the float values into a 0/1 vector.
				#A list (not map()) is built because len() is taken below.
				binary_recurrence = [int(float(value)>=occurrence_cutoff) \
					for value in recurrence_array[1:-1].split(',')]
				if no_of_datasets == 0:
					no_of_datasets = len(binary_recurrence)
				encoded_recurrence = encodeOccurrenceBv(binary_recurrence)
				mcl_id2enc_recurrence[mcl_id] = encoded_recurrence	#2006-09-26
				for go_no in mcl_id2go_no_set[mcl_id]:
					entry = go_no2recurrence_cluster_id.get(go_no)
					if entry is None:
						#use Set() because mcl_id has duplicates due to different p-values
						go_no2recurrence_cluster_id[go_no] = [encoded_recurrence, Set([mcl_id])]
					else:
						#union of the datasets of all patterns of this go_no
						entry[0] = entry[0] | encoded_recurrence
						entry[1].add(mcl_id)
				real_counter += 1
			if self.report:
				#20 backspaces so the new counters overwrite the previous ones
				sys.stderr.write("%s%s\t%s"%('\x08'*20, counter, real_counter))
			curs.execute(fetch_sql)
			rows = curs.fetchall()
		curs.execute("close crs")
		
		recurrence_go_no_rec_array_cluster_id_ls = []
		for go_no in go_no2recurrence_cluster_id:
			encoded_recurrence, mcl_id_set = go_no2recurrence_cluster_id[go_no]
			recurrence_array = decodeOccurrence(encoded_recurrence)	#not binary vector
			recurrence_go_no_rec_array_cluster_id_ls.append(\
				[len(recurrence_array), go_no, recurrence_array, mcl_id_set])
		
		recurrence_go_no_rec_array_cluster_id_ls.sort()
		sys.stderr.write("End getting recurrence_go_no_rec_array_cluster_id_ls.\n")
		return recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence
示例#2
0
	def get_recurrence_rec_array_bs_no_list(self, curs, cluster_bs_table, mcl_id2enc_recurrence):
		"""
		Collect, for every bs_no, the combined recurrence of the patterns listing it.

		11-01-05

		Arguments:
			curs: database cursor; a server-side cursor named 'crs' is declared,
				fetched in batches and closed here
			cluster_bs_table: name of the table holding (mcl_id, bs_no_list). It is
				interpolated into the SQL text (a table name cannot be sent as a
				bound parameter), so it has to come from a trusted source.
			mcl_id2enc_recurrence: mcl_id -> encoded recurrence; rows whose mcl_id
				is not a key are skipped

		Returns:
			list of [recurrence, recurrence_array, bs_no], sorted ascending.
			recurrence_array is decodeOccurrence() of the OR-ed encodings of all
			patterns that list the bs_no (not a binary vector); recurrence is its length.
		"""
		sys.stderr.write("Getting recurrence_rec_array_bs_no_list...\n")
		bs_no2enc_recurrence = {}
		#same batch size for the first and all following fetches
		fetch_sql = "fetch 5000 from crs"
		curs.execute("DECLARE crs CURSOR FOR select c.mcl_id, c.bs_no_list from %s c"%(cluster_bs_table))
		curs.execute(fetch_sql)
		rows = curs.fetchall()
		counter = 0	#all rows seen
		real_counter = 0	#rows whose mcl_id has an encoded recurrence
		while rows:
			for mcl_id, bs_no_list in rows:
				counter += 1
				if mcl_id not in mcl_id2enc_recurrence:
					continue
				encoded_recurrence = mcl_id2enc_recurrence[mcl_id]
				#bs_no_list arrives as text like '{1,2,3}'; strip the braces
				bs_no_text = bs_no_list[1:-1]
				if bs_no_text:
					bs_no_ls = [int(bs_no) for bs_no in bs_no_text.split(',')]
				else:
					#an empty array '{}' would otherwise end in int('') -> ValueError
					bs_no_ls = []
				for bs_no in bs_no_ls:
					if bs_no in bs_no2enc_recurrence:
						bs_no2enc_recurrence[bs_no] |= encoded_recurrence
					else:
						bs_no2enc_recurrence[bs_no] = encoded_recurrence
				real_counter += 1
			if self.report:
				#20 backspaces so the new counters overwrite the previous ones
				sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
			curs.execute(fetch_sql)
			rows = curs.fetchall()
		curs.execute("close crs")
		recurrence_rec_array_bs_no_list = []
		#items() instead of iteritems(): works whichever dict flavour is in use
		for bs_no, enc_recurrence in bs_no2enc_recurrence.items():
			recurrence_array = decodeOccurrence(enc_recurrence)	#not binary vector
			recurrence_rec_array_bs_no_list.append([len(recurrence_array), recurrence_array, bs_no])
		recurrence_rec_array_bs_no_list.sort()
		sys.stderr.write("End getting recurrence_rec_array_bs_no_list.\n")
		return recurrence_rec_array_bs_no_list
示例#3
0
	def get_core_vertex_set(self, vertex_list, recurrence_array, degree_cut_off):
		"""
		Find the vertices shared by all recurrent 'on' datasets and the datasets holding them.

		12-16-05
			global structures used:
				--self.edge2encodedOccurrence
		12-18-05
			expand to all datasets
			
			--init_graph_from_vertex_set()
			--decodeOccurrence()
			--remove_loose_part_of_graph()
				--remove_singleton_vertices()
				--get_vertex_min_degree()

		Arguments:
			vertex_list: vertex ids, presorted (edges are looked up as
				(vertex_list[i], vertex_list[j]) with i<j)
			recurrence_array: one entry per dataset, 1 = recurrent, 0 = not
			degree_cut_off: passed through to remove_loose_part_of_graph()

		A dataset is 'on' if its graph has more than one edge and, after
		remove_loose_part_of_graph(), still has at least 4 vertices.

		Returns:
			(core_vertex_ls, recurrent_and_on_datasets_ls), both sorted.
			core_vertex_ls: vertex ids present in every recurrent 'on' dataset.
			recurrent_and_on_datasets_ls: 0-based indices of the recurrent 'on'
				datasets plus the non-recurrent 'on' datasets whose remaining
				vertices include the whole core.
		"""
		no_of_datasets = len(recurrence_array)
		#initialize all graphs; anti_vertex_id_list[i] maps a vertex id to its
		#descriptor in graph_list[i]
		graph_list = [None]*no_of_datasets
		anti_vertex_id_list = [None]*no_of_datasets
		for i in range(no_of_datasets):
			graph_list[i], anti_vertex_id_list[i] = self.init_graph_from_vertex_set(vertex_list)
		
		no_of_vertices = len(vertex_list)
		#vertex_list.sort()	#presorted
		#add every known edge to the graph of each dataset it occurs in
		for i in range(no_of_vertices):
			vertex1 = vertex_list[i]
			for j in range(i+1, no_of_vertices):
				vertex2 = vertex_list[j]
				edge_tuple = (vertex1, vertex2)
				if edge_tuple not in self.edge2encodedOccurrence:
					continue
				#decodeOccurrence() gives dataset numbers starting from 1
				for k in decodeOccurrence(self.edge2encodedOccurrence[edge_tuple]):
					index = k-1
					v_descriptor1 = anti_vertex_id_list[index][vertex1]
					v_descriptor2 = anti_vertex_id_list[index][vertex2]
					graph_list[index].add_edge(v_descriptor1, v_descriptor2)
		
		#remove loose part for each graph and decide which datasets are 'on'
		on_dataset_index_ls = [0]*no_of_datasets
		for i in range(no_of_datasets):
			graph = graph_list[i]
			if graph.num_edges()<=1:	#at least the graph has two edges
				continue
			#called for its effect on the graph; the return value is not needed
			self.remove_loose_part_of_graph(graph, degree_cut_off)
			if graph.num_vertices()>=4:	#min graph size
				on_dataset_index_ls[i] = 1	#this dataset should be counted as 'on'
		
		#find core vertex_list only in those recurrent 'on' datasets
		vertex_id2occurrence = {}
		recurrent_and_on_datasets_ls = []
		on_but_not_recurrent_dataset2vertex_set = {}
		for i in range(no_of_datasets):
			if on_dataset_index_ls[i] != 1:
				continue
			graph = graph_list[i]
			if recurrence_array[i] == 1:
				recurrent_and_on_datasets_ls.append(i)
				for v in graph.vertices:
					vertex_id = graph.vertex_properties['vertex_id'][v]
					vertex_id2occurrence[vertex_id] = vertex_id2occurrence.get(vertex_id, 0) + 1
			elif recurrence_array[i] == 0:
				remaining_vertex_set = Set()
				for v in graph.vertices:
					remaining_vertex_set.add(graph.vertex_properties['vertex_id'][v])
				on_but_not_recurrent_dataset2vertex_set[i] = remaining_vertex_set
		
		#only vertices in recurrent and 'on' datasets go into core_vertex_set
		no_of_core_datasets = len(recurrent_and_on_datasets_ls)
		core_vertex_set = Set()
		for vertex_id in vertex_id2occurrence:
			if vertex_id2occurrence[vertex_id] == no_of_core_datasets:
				core_vertex_set.add(vertex_id)
		
		#find other on datasets from on_but_not_recurrent_dataset2vertex_set
		#NOTE(review): if no dataset is both recurrent and 'on', core_vertex_set is
		#empty and every dataset here passes the subset test -- confirm this is intended.
		for dataset_no, vertex_set in on_but_not_recurrent_dataset2vertex_set.items():
			if core_vertex_set.issubset(vertex_set):
				recurrent_and_on_datasets_ls.append(dataset_no)
		
		core_vertex_ls = list(core_vertex_set)
		core_vertex_ls.sort()
		recurrent_and_on_datasets_ls.sort()
		
		return core_vertex_ls, recurrent_and_on_datasets_ls