# ===== Example 1 (scraped snippet separator) =====
	def __init__(self, file_list, outputdir, delimiter, debug=0):
		"""
		08-28-05
		Keep a sorted reference to the dataset file list, record the output
		settings, and prime graph_modeling's internal correlation cut-off
		vector (a prerequisite for later correlation computations).
		"""
		self.files = file_list
		self.files.sort()	# note: sorts the caller's list in place
		self.outputdir = outputdir
		self.delimiter = delimiter
		self.debug = int(debug)
		#02-24-06 graph_modeling must have its cut-off vector built before use;
		#p_value_cut_off=0, cor_cut_off=0.6
		graph_modeling.cor_cut_off_vector_construct(0, 0.6)
		#02-24-06 temporary structure showing how correlations between probes
		#pointing to the same gene are distributed
		self.cor_list = []
# ===== Example 2 (scraped snippet separator) =====
	def cor_vector_from_files(self, communicator, dir, gph_dir, cor_fname, sig_fname, p_value_cut_off, cor_cut_off):
		"""
		MPI master/worker driver: compute an edge-correlation vector for every
		dataset file under `dir`, then merge the per-file outputs.

		Node 0 is the scheduler: it writes the edge-tuple header into the 0th
		output file of both cor_fname and sig_fname, hands file indices to the
		workers, and re-feeds a worker each time it reports "finished".
		Worker nodes (rank >= 1) build graph_modeling's cut-off vector and
		process one file per message via node_fire().

		History:
		05-14-05  modified to MPI form, feed mode (see MpiBiclustering.py and
			MpiGraphModeling.py)
		05-16-05  output the edge tuple into the 0th file
		06-30-05  if p_value_cut_off and cor_cut_off are both 0, get the
			corCut from the top 1% graph files in gph_dir

		Call graph:
			--files_sort
			if node_rank==0:
				--edge_tuple_list_output()
				--edge_tuple_list_output()
			else:
				--graph_modeling.cor_cut_off_vector_construct()
				--get_corCut_list()

			if node_rank==0:
				(send signal to other nodes)
			else:
				--node_fire()
					--gene_index2expr_array_setup()
					--cor_calculate()
			if node_rank==0:
				--collect_and_merge_output()
		"""
		files = os.listdir(dir)
		#sort all the files based on the dataset number, to order the columns of the outputed edge correlation vector
		files = self.files_sort(files)
		
		file_index_list = range(len(files))	# job queue: one index per dataset file
		node_rank = communicator.rank
		
		# both cutoffs 0 means "derive corCut from gph_dir", so gph_dir is required then
		if p_value_cut_off ==0 and cor_cut_off == 0 and gph_dir==None:
			sys.stderr.write("p_value_cut_off and cor_cut_off both are 0, but no gph_dir. Aborted.\n")
			sys.exit(3)
		if node_rank == 0:
			#output the name first
			self.edge_tuple_list_output("%s_0"%cor_fname)	#05-16-05 output the edge tuple into 0th file.
			self.edge_tuple_list_output("%s_0"%sig_fname)
		else:
			#set the cor_cut_off_vector, internal structure of graph_modeling
			graph_modeling.cor_cut_off_vector_construct(p_value_cut_off, cor_cut_off)
			if p_value_cut_off ==0 and cor_cut_off == 0:	#06-30-05	if both 0, get the corCut from the top 1% graph files
				corCut_list = self.get_corCut_list(gph_dir)
			else:
				corCut_list = []
		
		self.mpi_synchronize(communicator)
		
		if node_rank == 0:
			# --- master: initial dispatch to every worker, then demand-driven scheduling ---
			sys.stderr.write("\tTotally, %d files to be processed.\n"%len(files))

			seed_utilized = Set()	# NOTE(review): presumably sets.Set from the deprecated `sets` module — confirm the file's imports
			for node in range(1, communicator.size):
				if len(file_index_list)==0:	#if #nodes > #jobs, tell those nodes to break their listening loop.
					stop_signal = "-1"
					communicator.send(stop_signal, node, 0)	#no more jobs, stop that node,
					if self.debug:
						sys.stderr.write("node %s stopped.\n"%node)
				else:
					input_file_index = file_index_list.pop(0)	#the first item poped first.
					communicator.send(repr(input_file_index), node, 0)	#string format
					if self.debug:
						sys.stderr.write("Node %s schedule a job to %s\n"%(node_rank, node))
					seed_utilized.add(node)
			
			received_value, source, tag = communicator.receiveString(None, None)	#listen for a worker's "finished" report
			while received_value:		#each received message frees one worker for the next job
				if len(file_index_list) == 0:	#first check if there're still files left, otherwise pop(0) raises error.
					stop_signal = "-1"
					communicator.send(stop_signal, source, 0)	#no more jobs, stop that node,
					if self.debug:
						sys.stderr.write("node %s stopped.\n"%source)
					seed_utilized.remove(source)
					if len(seed_utilized) == 0:	#all seed used have finished their jobs
						break
				else:
					input_file_index = file_index_list.pop(0)
					# NOTE(review): index 0 is falsy — if it were popped here the worker
					# would get no reply and block; in practice index 0 goes out in the
					# initial dispatch loop above, but this guard looks suspicious. TODO confirm.
					if input_file_index:
						communicator.send(repr(input_file_index), source, 0)	#string format,
						if self.debug:
							sys.stderr.write("Node %s get one more job\n"%source)
				received_value, source, tag = communicator.receiveString(None, None)	#listen
		else:
			# --- worker: loop on messages from node 0 until the "-1" stop signal ---
			received_data, source, tag = communicator.receiveString(0, None)	#get data from node 0,
				#04-24-05 the array is one-dimension no matter what dimension the original array is
			while received_data:
				if received_data=="-1":	#stop signal
					if self.debug:
						sys.stderr.write("node %s breaked.\n"%node_rank)
					break
				else:
					input_file_index = int(received_data)	#convert it to integer
					sys.stderr.write("node %s working on %s...\n"%(node_rank, received_data))
					self.node_fire(dir, files, input_file_index, cor_fname, sig_fname, corCut_list)
					sys.stderr.write("node %s work on %s finished.\n"%(node_rank, received_data))
					communicator.send("finished", 0, node_rank)	#report back so the master sends another index
					
				received_data, source, tag = communicator.receiveString(0, None)	#get data from node 0
		
		self.mpi_synchronize(communicator)
		
		# merge the per-file outputs: node 0 handles cor_fname, node 1 handles sig_fname
		if node_rank==0:
			self.collect_and_merge_output(cor_fname, len(files))
		elif node_rank==1:
			self.collect_and_merge_output(sig_fname, len(files))
# ===== Example 3 (scraped snippet separator) =====
	def run(self):
		"""
		Parse clusters out of self.infname and submit each one to the database.

		Pipeline (per input row): parser_dict[parser_type]() turns a csv row
		into a list of clusters; clusters smaller than min_cluster_size are
		skipped (except for parser_type 5); calculate_unknown_gene_ratio() is
		attached to each cluster; db_submit() writes it to pattern_table.
		Commits only when self.needcommit is set.

		History:
		03-18-05  mapping_dict all changed to haiyan_no2gene_no
		04-12-05  use min_cluster_size to cut off some small clusters
		07-03-05  construct graph_modeling's cor_cut_off vector first
		10-14-05  add calculate_unknown_gene_ratio()
		12-06-05  add gene_no2incidence_array to parser_type ==4
		05-31-06  add type 5 (haifeng's output)

		Call graph:
			--db_connect()
			--get_haiyan_no2gene_no()
			--get_known_genes_dict()
			--get_gene_id2gene_no()
			--create_tables()
			--graph_modeling.cor_cut_off_vector_construct()
			(loop over inf)
				--parser_dict[parser_type]() (codense_parser(), copath_parser() )
					--get_combined_cor_vector
					--parse_recurrence
					--parse_connectivity
					--get_vertex_set_gim_array() (parser_type=4 only)
				--calculate_unknown_gene_ratio()
				--db_submit()

		FIX: the input file handle used to be opened inline and never closed
		(leaked on normal return and on the sys.exit(3) path); it is now
		managed with a `with` block so it is closed on every exit path.
		"""
		with open(self.infname, 'r') as input_file:
			inf = csv.reader(input_file, delimiter=self.delimiter)
			(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
			
			#setup the haiyan_no2gene_no (blank dict when no mapping file was given)
			if self.mapping_file != None:
				haiyan_no2gene_no = get_haiyan_no2gene_no(self.mapping_file)
			else:
				haiyan_no2gene_no = {}
			known_gene_no2go_no_set = get_known_genes_dict(curs)	#10-14-05	used to get unknown_gene_ratio
			
			if self.parser_type == 4 or self.parser_type==5:	#12-06-05 these parser types need the gene incidence matrix
				if self.gim_inputfname == None:
					sys.stderr.write("\n parser_type = 4 needs gim_inputfname.\n")
					sys.exit(3)
				gene_id2gene_no = get_gene_id2gene_no(curs)
				gene_no2incidence_array = get_gene_no2incidence_array(self.gim_inputfname, gene_id2gene_no)
			else:
				gene_no2incidence_array = None
			
			# per-parser-type auxiliary mapping handed to the parser callback
			mapping_dict = {1:haiyan_no2gene_no,
				2:haiyan_no2gene_no,
				3:None,
				4:gene_no2incidence_array,
				5:gene_no2incidence_array}
			self.create_tables(curs, self.table, self.mcl_table, self.pattern_table)
			no = 0	# number of patterns submitted so far
			
			graph_modeling.cor_cut_off_vector_construct(0, 0.8)	#07-03-05 compute the cor cutoff vector for graph_modeling, use 0.8 as cutoff
				#graph_modeling.ind_min_cor() requires the cor_cut_off vector to be constructed ahead.
			graph_modeling.set_jk_cut_off(6)	#07-03-05 haiyan's cutoff is 6, different from my default value, 7.
			for row in inf:
				cluster_list = self.parser_dict[self.parser_type](row, mapping_dict[self.parser_type], curs)
				for cluster in cluster_list:
					if self.parser_type!=5 and len(cluster.vertex_set)<self.min_cluster_size:
						#too small, ignore, 2006-08-29 if it's haifeng_output_parser, no restriction for cluster size, haifeng imposes 4
						continue
					#10-14-05 unknown_gene_ratio to submit to pattern_table
					cluster.unknown_gene_ratio = self.calculate_unknown_gene_ratio(cluster.vertex_set, known_gene_no2go_no_set)
					self.db_submit(curs, cluster, self.pattern_table)
					no+=1
					if self.report and no%1000==0:
						sys.stderr.write('%s%d'%('\x08'*20, no))	# \x08 backspaces redraw the counter in place
		if self.report:
			sys.stderr.write('%s%d'%('\x08'*20, no))
		if self.needcommit:
			conn.commit()
		sys.stderr.write('\n\tTotal patterns: %d\n'%no)