Пример #1
0
	def submit_predictions(self, curs, schema_instance, prediction_pair2instance, cluster_id2properties):
		"""
		Complete the attributes of every prediction and store it in the p_gene table.

		schema_instance.p_gene_table is created first through
		MpiPredictionFilter.createGeneTable(). Then, for each value of
		prediction_pair2instance:
			- the cluster properties are looked up in cluster_id2properties by the
			  prediction's mcl_id; index 0 becomes connectivity_cut_off, index 1
			  becomes unknown_cut_off and index 2 is the vertex set, whose length
			  becomes cluster_size_cut_off
			- p_value_cut_off and avg_p_value are both set to the result of
			  cal_hg_p_value()
			- the record is handed to submit_to_p_gene_table()

		When self.report is true, the running count is written to stderr after
		every 2000 records and once more at the end.
		"""
		sys.stderr.write("Submitting predictions...\n")
		table_writer = MpiPredictionFilter()
		table_writer.createGeneTable(curs, schema_instance.p_gene_table)

		total_gene_count = get_no_of_total_genes(curs)
		go_no2genes = get_go_no2gene_no_set(curs)
		# 20 backspaces put in front of the count on every progress write
		erase_prefix = '\x08'*20
		counter = 0
		for _unused_pair, prediction in prediction_pair2instance.iteritems():
			cluster_properties = cluster_id2properties[prediction.mcl_id]
			cluster_vertices = cluster_properties[2]
			# NOTE(review): `r` is not bound in this method, it must be provided by the
			# enclosing module (presumably the R interface object) -- TODO confirm
			hg_p_value = cal_hg_p_value(prediction.gene_no, prediction.go_no, cluster_vertices, \
				total_gene_count, go_no2genes, r)
			prediction.p_value_cut_off = hg_p_value
			prediction.avg_p_value = prediction.p_value_cut_off
			prediction.connectivity_cut_off = cluster_properties[0]
			prediction.cluster_size_cut_off = len(cluster_vertices)
			prediction.unknown_cut_off = cluster_properties[1]
			table_writer.submit_to_p_gene_table(curs, schema_instance.p_gene_table, prediction)
			counter += 1
			if self.report and counter%2000==0:
				sys.stderr.write("%s%s"%(erase_prefix, counter))
		if self.report:
			sys.stderr.write("%s%s"%(erase_prefix, counter))
		sys.stderr.write("Done.\n")
Пример #2
0
	def get_known_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
		"""
		Return only the known_data part of what self.data_fetch() delivers.

		The schema tables are derived from fname, the total gene count and the
		go_no -> gene_no set mapping are read through curs, and everything is
		passed on to self.data_fetch(). data_fetch() must return exactly three
		items here; the first two (prediction list and all_data) are discarded.
		"""
		# arguments are evaluated left to right, so the helper calls run in the
		# order form_schema_tables, get_no_of_total_genes, get_go_no2gene_no_set
		_prediction_ls, _all_data, known_data = self.data_fetch(
			curs,
			form_schema_tables(fname),
			filter_type,
			is_correct_type,
			get_no_of_total_genes(curs),
			get_go_no2gene_no_set(curs),
			need_cal_hg_p_value)
		return known_data
Пример #3
0
	def get_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
		"""
		Return (unknown_data, known_data) as delivered by self.data_fetch().

		11-19-05
			data_fetch() of rpart_prediction.py changed, unknown_data is returned too

		The schema tables are derived from fname, the total gene count and the
		go_no -> gene_no set mapping are read through curs, and everything is
		passed on to self.data_fetch(). data_fetch() must return exactly four
		items here; the two prediction lists that come first are discarded.
		"""
		# arguments are evaluated left to right, so the helper calls run in the
		# order form_schema_tables, get_no_of_total_genes, get_go_no2gene_no_set
		_unknown_ls, _known_ls, unknown_data, known_data = self.data_fetch(
			curs,
			form_schema_tables(fname),
			filter_type,
			is_correct_type,
			get_no_of_total_genes(curs),
			get_go_no2gene_no_set(curs),
			need_cal_hg_p_value)
		return unknown_data, known_data
Пример #4
0
	def run(self):
		"""
		Fetch the predictions of the fname1 schema, fit and predict with rpart,
		and record the result under the fname2 schema.

		History
			11-09-05
			11-09-05 add rpart_cp
			11-10-05 add need_cal_hg_p_value

		Call outline
			--db_connect()
			--form_schema_tables()	(once for fname1, once for fname2)
			--get_no_of_total_genes()
			--get_go_no2gene_no_set()
			--data_fetch()
				--get_vertex_list()
				--cal_hg_p_value()
			--rpart_fit_and_predict()
			--MpiPredictionFilter_instance....()
			--record_data()

		The splat, mcl and pattern tables of the new schema are set up from the
		old ones through view_from_table(), and a new p_gene_table is created
		before record_data() is called. "end" is executed on the cursor only
		when self.commit is true.
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		source_tables = form_schema_tables(self.fname1)
		target_tables = form_schema_tables(self.fname2)

		total_gene_count = get_no_of_total_genes(curs)
		go_no2genes = get_go_no2gene_no_set(curs)

		# NOTE(review): need_cal_hg_p_value is neither a parameter nor a local of this
		# method, so it has to exist as a module-level name when run() is called
		# (possibly self.need_cal_hg_p_value was intended) -- TODO confirm
		prediction_ls, all_data, known_data = self.data_fetch(
			curs, source_tables, self.filter_type, self.is_correct_type,
			total_gene_count, go_no2genes, need_cal_hg_p_value)

		# disabled validation run, kept for reference:
		#	testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.no_of_buckets, self.rpart_cp, \
		#		self.loss_matrix, self.prior_prob)
		#	print testing_acc_ls
		#	print training_acc_ls
		pred, pred_training = self.rpart_fit_and_predict(
			all_data, known_data, self.rpart_cp, self.loss_matrix, self.prior_prob)

		table_filter = MpiPredictionFilter()
		# same table attribute is read from the old and the new schema, in this order
		for table_attr in ('splat_table', 'mcl_table', 'pattern_table'):
			table_filter.view_from_table(curs, getattr(source_tables, table_attr), \
				getattr(target_tables, table_attr))
		table_filter.createGeneTable(curs, target_tables.p_gene_table)
		self.record_data(curs, table_filter, prediction_ls, pred, target_tables)
		if self.commit:
			curs.execute("end")
Пример #5
0
    def run(self):
        """
        Fit the model selected by self.type on the known data, predict the
        known and unknown data, and (only when self.commit is true) record
        both sets of predictions under the fname2 schema.

        History
            11-09-05
            11-09-05 add rpart_cp
            11-10-05 add need_cal_hg_p_value
            11-23-05 rpart_fit_and_predict() is split
            2006-12-05 add need_output_data_for_R flag

        Call outline
            --db_connect()
            --form_schema_tables()  (once for fname1, once for fname2)
            --get_no_of_total_genes()
            --get_go_no2gene_no_set()
            --data_fetch()
                --get_vertex_list()
                --cal_hg_p_value()
            --output_data_for_R()

            --rpart_fit()
            --rpart_predict()  (known data, then unknown data)
            --MpiPredictionFilter_instance....()
            --record_data()
        """
        conn, curs = db_connect(self.hostname, self.dbname, self.schema)
        source_tables = form_schema_tables(self.fname1)
        target_tables = form_schema_tables(self.fname2)

        total_gene_count = get_no_of_total_genes(curs)
        go_no2genes = get_go_no2gene_no_set(curs)

        # NOTE(review): need_cal_hg_p_value is neither a parameter nor a local of this
        # method, so it has to exist as a module-level name when run() is called
        # (possibly self.need_cal_hg_p_value was intended) -- TODO confirm
        fetched = self.data_fetch(
            curs, source_tables, self.filter_type, self.is_correct_type,
            total_gene_count, go_no2genes, need_cal_hg_p_value)
        unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = fetched

        # 2006-12-05 optional dump of both data sets, file names derived from fname1
        if self.need_output_data_for_R:
            self.output_data_for_R(known_data, "%s.known" % self.fname1)
            self.output_data_for_R(unknown_data, "%s.unknown" % self.fname1)

        # disabled validation run, kept for reference:
        #   testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.training_perc, self.rpart_cp, \
        #       self.loss_matrix, self.prior_prob)
        #   print testing_acc_ls
        #   print training_acc_ls
        fit = self.fit_function_dict[self.type]
        fit_model = fit(known_data, self.parameter_list_dict[self.type], self.bit_string)
        predict = self.predict_function_dict[self.type]
        known_pred = predict(fit_model, known_data)
        unknown_pred = predict(fit_model, unknown_data)

        if self.debug:
            if self.type == 2:
                # randomForest's model has its own oob prediction
                oob_pred = fit_model.as_py(BASIC_CONVERSION)["predicted"]
                print(self.cal_accuracy(known_data, oob_pred, pred_type=1))
            print(self.cal_accuracy(known_data, known_pred, pred_type=self.type))
            print(self.cal_accuracy(unknown_data, unknown_pred, pred_type=self.type))

        # nothing is written to the database unless commit is requested
        if not self.commit:
            return

        table_filter = MpiPredictionFilter()
        # same table attribute is read from the old and the new schema, in this order
        for table_attr in ("splat_table", "mcl_table", "pattern_table"):
            table_filter.view_from_table(
                curs, getattr(source_tables, table_attr), getattr(target_tables, table_attr))
        table_filter.createGeneTable(curs, target_tables.p_gene_table)

        self.record_data(
            curs, table_filter, unknown_prediction_ls, unknown_pred, target_tables,
            pred_type=self.type)

        if self.type == 2:
            # 2006-10-31 randomForest's model has its own oob prediction,
            # but use rpart's way of storing prediction
            known_pred = fit_model.as_py(BASIC_CONVERSION)["predicted"]
            known_pred_type = 1
        else:
            known_pred_type = self.type
        self.record_data(
            curs, table_filter, known_prediction_ls, known_pred, target_tables,
            pred_type=known_pred_type)
        curs.execute("end")
Пример #6
0
	def run(self):
		"""
		Run the MPI prediction pipeline; every rank executes this method and
		picks its role from its rank.

		Roles
			rank 0			input node, reads self.input_fname (tab-delimited)
			rank 1..size-3		computing nodes
			rank size-2		judge node
			rank size-1		output node, writes self.jnput_fname (tab-delimited)

		History
			09-05-05
			10-23-05
				create views from old schema
				result goes to the new schema's p_gene_table
			01-02-06
				input and output are directed to plain files, counter is used as id

		Call outline
			(input_node)
				--db_connect()
				--get_gene_no2go_no_set()
				--get_go_no2depth()
				--get_go_no2gene_no_set()
				(pass data to computing_node)
			(computing_node)
				(take data from other nodes, 0 and size-1)
			(judge_node)
				--gene_stat()
				--db_connect()
				--gene_p_map_redundancy()
			(output_node)
				--db_connect()
				--get_go_no2edge_counter_list()(if necessary)
				(pass go_no2edge_counter_list to computing_node)

			(input_node)
				--input_node()
			(computing_node)
				--get_no_of_unknown_genes()
				--node_fire_handler()
				--cleanup_handler()
			--judge_node()
				--gene_stat_instance.(match functions)
			--output_node()
				--output_node_handler()

		Every file opened here (the optional go_no2edge_counter_list pickle, the
		input file and the output file) is closed explicitly once it is no longer
		needed, also when the code working on it raises.
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			# 01-02-06 disabled, kept for reference:
			#	old_schema_instance = form_schema_tables(self.input_fname)
			#	new_schema_instance = form_schema_tables(self.jnput_fname)
			gene_no2go_no = get_gene_no2go_no_set(curs)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)	#-1 means use the highest protocol
			go_no2depth = get_go_no2depth(curs)
			go_no2depth_pickle = cPickle.dumps(go_no2depth, -1)
			go_no2gene_no_set = get_go_no2gene_no_set(curs)
			go_no2gene_no_set_pickle = cPickle.dumps(go_no2gene_no_set, -1)
			for node in range(1, communicator.size-2):	#send it to the computing_node
				communicator.send(gene_no2go_no_pickle, node, 0)
				communicator.send(go_no2depth_pickle, node, 0)
				communicator.send(go_no2gene_no_set_pickle, node, 0)
		elif node_rank<=communicator.size-3:	#WATCH: last 2 nodes are not here.
			#the three structures arrive from node 0 in the order they were sent
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)	#take the data
			data, source, tag = communicator.receiveString(0, 0)
			go_no2depth = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			go_no2gene_no_set = cPickle.loads(data)
			data, source, tag = communicator.receiveString(communicator.size-1, 0)	#from the last node
			go_no2edge_counter_list = cPickle.loads(data)
			#choose a functor for recurrence_array
			functor_dict = {0: None,
				1: lambda x: int(x>=self.recurrence_x),
				2: lambda x: math.pow(x, self.recurrence_x)}
			functor = functor_dict[self.recurrence_x_type]
		elif node_rank == communicator.size-2:	#judge node
			gene_stat_instance = gene_stat(depth_cut_off=self.depth)
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			gene_stat_instance.dstruc_loadin(curs)
			from gene_p_map_redundancy import gene_p_map_redundancy
			node_distance_class = gene_p_map_redundancy()
		elif node_rank==communicator.size-1:	#establish connection before pursuing
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			# 01-02-06, input and output are all directed to files; disabled, kept for reference:
			#	old_schema_instance = form_schema_tables(self.input_fname)
			#	new_schema_instance = form_schema_tables(self.jnput_fname)
			#	MpiPredictionFilter_instance = MpiPredictionFilter()
			#	MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
			#	MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
			#	MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
			#	if self.new_table:
			#		MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
			if self.go_no2edge_counter_list_fname:
				#close the pickle file explicitly instead of leaving the handle to the garbage collector
				#NOTE(review): unpickling can execute arbitrary code, the file must come from a trusted source
				pickle_file = open(self.go_no2edge_counter_list_fname,'r')
				try:
					go_no2edge_counter_list = cPickle.load(pickle_file)
				finally:
					pickle_file.close()
			else:
				if self.eg_d_type==2:
					go_no2edge_counter_list = None
				else:
					gene_no2go_no = get_gene_no2go_no_set(curs)
					go_no2edge_counter_list = get_go_no2edge_counter_list(curs, gene_no2go_no, self.edge_type2index)
			go_no2edge_counter_list_pickle = cPickle.dumps(go_no2edge_counter_list, -1)
			for node in range(1, communicator.size-2):	#send it to the computing_node
				communicator.send(go_no2edge_counter_list_pickle, node, 0)

		mpi_synchronize(communicator)

		free_computing_nodes = range(1,communicator.size-2)	#exclude the last node
		if node_rank == 0:
			# disabled database cursor, kept for reference:
			#	curs.execute("DECLARE crs CURSOR FOR SELECT id, vertex_set, edge_set, no_of_edges,\
			#	connectivity, unknown_gene_ratio, recurrence_array, d_matrix from %s"%(old_schema_instance.pattern_table))
			self.counter = 0	#01-02-06 counter is used as id
			#keep the file object so that it can be closed; deleting the reader alone
			#does not release it because parameter_list still refers to the reader
			input_file = open(self.input_fname, 'r')
			try:
				reader = csv.reader(input_file, delimiter='\t')
				parameter_list = [reader]
				input_node(communicator, parameter_list, free_computing_nodes, self.message_size, \
					self.report, input_handler=self.input_handler)
			finally:
				input_file.close()
		elif node_rank in free_computing_nodes:
			no_of_unknown_genes = get_no_of_unknown_genes(gene_no2go_no)
			GradientScorePrediction_instance = GradientScorePrediction(gene_no2go_no, go_no2gene_no_set, go_no2depth, \
				go_no2edge_counter_list, no_of_unknown_genes, self.depth, self.min_layer1_associated_genes, \
				self.min_layer1_ratio, self.min_layer2_associated_genes, self.min_layer2_ratio, self.exponent, \
				self.score_list, self.max_layer, self.norm_exp, self.eg_d_type, self.debug)
			parameter_list = [GradientScorePrediction_instance, functor]
			computing_node(communicator, parameter_list, self.node_fire_handler, self.cleanup_handler, self.report)
		elif node_rank == communicator.size-2:
			self.judge_node(communicator, curs, gene_stat_instance, node_distance_class)
		elif node_rank==communicator.size-1:
			#01-02-06 output goes to plain file, not database
			#closing the file explicitly guarantees that buffered rows are flushed to disk;
			#deleting the writer alone does not, parameter_list still refers to it
			output_file = open(self.jnput_fname, 'w')
			try:
				writer = csv.writer(output_file, delimiter='\t')
				parameter_list = [writer]
				output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			finally:
				output_file.close()