示例#1
0
	def data_fetch(self, curs, splat_table, mcl_table, crs_no=0, output_fname=None):
		"""
		Write every cluster joined from mcl_table and splat_table to output_fname.

		Rows are read through a database cursor named crs<crs_no>, 5000 at a
		time. Each row is turned into a cluster_dstructure, annotated with GO
		information and edge correlation/significance lists, converted to a
		string by return_string_form_of_cluster_dstructure() and appended to
		the output file. The file content has the form
		"r:=[<cluster>,<cluster>,...,[]]:".

		curs: database cursor used for all queries.
		splat_table, mcl_table: table names interpolated into the SQL text.
		crs_no: suffix appended to the declared cursor name.
		output_fname: path of the file to write. Although it defaults to
			None, a real path is required because it is passed to open().

		Returns mcl_id2cluster_dstructure, which is always an empty dictionary
		here because clusters are written out instead of being kept in memory.

		History:
		04-17-05 fetch cluster_dstructures for all clusters (Jasmine's request)
		04-19-05 return a mcl_id2cluster_dstructure; add crs_no
		08-31-05 output clusters directly to output_fname
		09-01-05 add the last []
		"""
		gene_no2gene_id = get_gene_no2gene_id(curs)	#08-31-05
		mcl_id2cluster_dstructure = {}
		# 06-20-05 connectivity_original is faked to be 0 (the literal 0 in the select list).
		declare_sql = "DECLARE crs%s CURSOR FOR select m.mcl_id, m.vertex_set, m.connectivity, 0, " \
			"m.recurrence_array, s.edge_set, s.connectivity, m.cooccurrent_cluster_id from %s m, %s s " \
			"where m.splat_id=s.splat_id" % (crs_no, mcl_table, splat_table)
		fetch_sql = "fetch 5000 from crs%s" % crs_no

		outf = open(output_fname, 'w')	#08-31-05
		# try/finally guarantees the file is flushed and closed even when a
		# query or a row conversion raises half-way through.
		try:
			outf.write("r:=[")	#08-31-05
			no_of_total_genes = get_no_of_total_genes(curs)
			sys.stderr.write("Getting the basic information for all clusters...\n")
			curs.execute(declare_sql)
			curs.execute(fetch_sql)
			rows = curs.fetchall()
			while rows:
				for row in rows:
					unit = cluster_dstructure()
					unit.cluster_id = row[0]
					# row[1] and row[4] are comma-separated lists wrapped in one
					# leading and one trailing character (presumably '{' and '}' of
					# a database array literal -- TODO confirm); strip and split.
					unit.vertex_set = [int(vertex) for vertex in row[1][1:-1].split(',')]
					unit.connectivity = row[2]
					unit.connectivity_original = row[3]
					unit.recurrence_array = [float(value) for value in row[4][1:-1].split(',')]
					unit.edge_set = parse_splat_table_edge_set(row[5])
					unit.splat_connectivity = row[6]
					unit.cooccurrent_cluster_id = row[7]
					unit.go_no2association_genes = self.get_go_functions_of_this_gene_set(curs, unit.vertex_set)
					# jasmine wants to cut some go-nos, hence the explicit p_value_cut_off.
					unit.go_no2information = self.get_information_of_go_functions(curs, \
						unit.go_no2association_genes, len(unit.vertex_set), no_of_total_genes, p_value_cut_off=0.05)
					unit.edge_cor_2d_list, unit.edge_sig_2d_list = self.get_cor_sig_2d_list(curs, unit.edge_set)

					str_tmp = self.return_string_form_of_cluster_dstructure(unit, gene_no2gene_id)	#08-31-05
					outf.write("%s," % str_tmp)
				curs.execute(fetch_sql)
				rows = curs.fetchall()
			# 08-31-05, 09-01-05: the blank [] absorbs the trailing comma left by the last cluster.
			outf.write("[]]:")
		finally:
			outf.close()
		sys.stderr.write("Done.\n")
		return mcl_id2cluster_dstructure
	def submit_predictions(self, curs, schema_instance, prediction_pair2instance, cluster_id2properties):
		"""
		Complete each prediction's attributes and submit it to the p_gene table.

		The p_gene table named by schema_instance.p_gene_table is created first.
		For every value of prediction_pair2instance, the matching entry of
		cluster_id2properties (looked up by mcl_id) supplies:
			index 0 -> connectivity_cut_off
			index 1 -> unknown_cut_off
			index 2 -> the vertex set (its length becomes cluster_size_cut_off)
		p_value_cut_off and avg_p_value are both set to the result of
		cal_hg_p_value(). When self.report is true, a running count is written
		to stderr every 2000 submissions and once more at the end.
		"""
		sys.stderr.write("Submitting predictions...\n")
		prediction_filter = MpiPredictionFilter()
		prediction_filter.createGeneTable(curs, schema_instance.p_gene_table)

		total_gene_count = get_no_of_total_genes(curs)
		go_no2genes = get_go_no2gene_no_set(curs)
		submitted = 0
		for _pair, prediction in prediction_pair2instance.iteritems():
			# fill in the items that are still empty on the prediction
			cluster_properties = cluster_id2properties[prediction.mcl_id]
			cluster_vertices = cluster_properties[2]
			hg_p_value = cal_hg_p_value(prediction.gene_no, prediction.go_no, cluster_vertices, \
				total_gene_count, go_no2genes, r)
			prediction.p_value_cut_off = hg_p_value
			prediction.avg_p_value = prediction.p_value_cut_off
			prediction.connectivity_cut_off = cluster_properties[0]
			prediction.cluster_size_cut_off = len(cluster_vertices)
			prediction.unknown_cut_off = cluster_properties[1]
			prediction_filter.submit_to_p_gene_table(curs, schema_instance.p_gene_table, prediction)
			submitted += 1
			if self.report and submitted%2000==0:
				# 20 backspaces move back over the previously printed count
				sys.stderr.write("%s%s"%('\x08'*20, submitted))
		if self.report:
			sys.stderr.write("%s%s"%('\x08'*20, submitted))
		sys.stderr.write("Done.\n")
示例#3
0
	def get_known_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
		"""
		Return only the known_data part of what self.data_fetch() produces.

		The schema tables are built from fname, the total gene count and the
		go_no -> gene_no set mapping are read through curs, and everything is
		handed to self.data_fetch(), which must return exactly three items:
		(prediction_ls, all_data, known_data). The first two are discarded.
		"""
		schema_instance = form_schema_tables(fname)
		total_gene_count = get_no_of_total_genes(curs)
		go_no2genes = get_go_no2gene_no_set(curs)

		fetched = self.data_fetch(curs, schema_instance, filter_type, is_correct_type, \
			total_gene_count, go_no2genes, need_cal_hg_p_value)
		# unpack strictly into three names so a differently-shaped result still fails loudly
		_prediction_ls, _all_data, known_data = fetched
		return known_data
示例#4
0
	def get_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
		"""
		Return (unknown_data, known_data) as produced by self.data_fetch().

		The schema tables are built from fname, the total gene count and the
		go_no -> gene_no set mapping are read through curs, and everything is
		handed to self.data_fetch(), which must return exactly four items:
		(unknown_prediction_ls, known_prediction_ls, unknown_data, known_data).
		The two prediction lists are discarded.

		History:
		11-19-05 data_fetch() of rpart_prediction.py changed; unknown_data is
			returned as well.
		"""
		schema_instance = form_schema_tables(fname)
		total_gene_count = get_no_of_total_genes(curs)
		go_no2genes = get_go_no2gene_no_set(curs)

		fetched = self.data_fetch(curs, schema_instance, filter_type, is_correct_type, \
			total_gene_count, go_no2genes, need_cal_hg_p_value)
		# unpack strictly into four names so a differently-shaped result still fails loudly
		_unknown_prediction_ls, _known_prediction_ls, unknown_data, known_data = fetched
		return unknown_data, known_data
示例#5
0
	def run(self):
		"""
		Fetch predictions for the schema built from self.fname1, run
		rpart_fit_and_predict() on them and record the result under the schema
		built from self.fname2.

		Call sequence:
			--db_connect()
			--form_schema_tables()
			--form_schema_tables()
			--get_no_of_total_genes()
			--get_go_no2gene_no_set()
			--data_fetch()
				--get_vertex_list()
				--cal_hg_p_value()
			--rpart_fit_and_predict()
			--MpiPredictionFilter_instance....()
			--record_data()

		The views, the p_gene table and the recorded data are always issued on
		the cursor; the transaction is ended with "end" only when self.commit
		is true.

		History:
		11-09-05
		11-09-05 add rpart_cp
		11-10-05 add need_cal_hg_p_value
		"""
		(_conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		old_schema_instance = form_schema_tables(self.fname1)
		new_schema_instance = form_schema_tables(self.fname2)

		no_of_total_genes = get_no_of_total_genes(curs)
		go_no2gene_no_set = get_go_no2gene_no_set(curs)

		# The flag used to be read only as a bare name, which resolves solely when
		# a module-level variable of that name exists (NameError otherwise).
		# Prefer the instance attribute when the object carries one and keep the
		# module-level name as a fallback so existing script usage is unchanged.
		# NOTE(review): confirm that __init__ stores need_cal_hg_p_value on self.
		try:
			need_hg_p_value = self.need_cal_hg_p_value
		except AttributeError:
			need_hg_p_value = need_cal_hg_p_value

		prediction_ls, all_data, known_data = self.data_fetch(curs, old_schema_instance, self.filter_type, \
			self.is_correct_type, no_of_total_genes, go_no2gene_no_set, need_hg_p_value)

		# only the prediction for all_data is recorded; the training prediction is unused
		pred, _pred_training = self.rpart_fit_and_predict(all_data, known_data, self.rpart_cp, \
			self.loss_matrix, self.prior_prob)

		MpiPredictionFilter_instance = MpiPredictionFilter()
		# expose the old splat/mcl/pattern tables under the new schema's table names
		for table_attr in ('splat_table', 'mcl_table', 'pattern_table'):
			MpiPredictionFilter_instance.view_from_table(curs, getattr(old_schema_instance, table_attr), \
				getattr(new_schema_instance, table_attr))
		MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
		self.record_data(curs, MpiPredictionFilter_instance, prediction_ls, pred, new_schema_instance)
		if self.commit:
			curs.execute("end")
示例#6
0
	def get_cluster_dstructure(self, curs, mcl_id, splat_table, mcl_table):
		"""
		Build the fully annotated cluster structure for one mcl_id.

		04-18-05
			called by GuiAnalyzer.py

			--get_basic_cluster_dstructure()
			--get_go_functions_of_this_gene_set()
			--get_information_of_go_functions()
			--get_cor_sig_2d_list()

		Returns whatever get_basic_cluster_dstructure() returned. When that
		value is truthy it has first been given go_no2association_genes,
		go_no2information, edge_cor_2d_list and edge_sig_2d_list; a falsy
		value (e.g. None) is handed back untouched.
		"""
		total_gene_count = get_no_of_total_genes(curs)
		cluster = self.get_basic_cluster_dstructure(curs, mcl_id, splat_table, mcl_table)
		if not cluster:
			# nothing to annotate
			return cluster

		cluster.go_no2association_genes = self.get_go_functions_of_this_gene_set(curs, cluster.vertex_set)
		cluster_size = len(cluster.vertex_set)
		cluster.go_no2information = self.get_information_of_go_functions(curs, \
			cluster.go_no2association_genes, cluster_size, total_gene_count)
		cor_and_sig = self.get_cor_sig_2d_list(curs, cluster.edge_set)
		cluster.edge_cor_2d_list, cluster.edge_sig_2d_list = cor_and_sig
		return cluster
		
		"""
示例#7
0
    def run(self):
        """
        Fetch known and unknown predictions for the schema built from
        self.fname1, fit the model selected by self.type on the known data,
        predict both data sets and, when self.commit is true, record the
        predictions under the schema built from self.fname2.

        Call sequence:
            --db_connect()
            --form_schema_tables()
            --form_schema_tables()
            --get_no_of_total_genes()
            --get_go_no2gene_no_set()
            --data_fetch()
                --get_vertex_list()
                --cal_hg_p_value()
            --output_data_for_R()

            --rpart_fit()
            --rpart_predict()
            --rpart_predict()
            --MpiPredictionFilter_instance....()
            --record_data()

        History:
        11-09-05
        11-09-05 add rpart_cp
        11-10-05 add need_cal_hg_p_value
        11-23-05 rpart_fit_and_predict() is split
        2006-12-05 add need_output_data_for_R flag
        """
        (_conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        old_schema_instance = form_schema_tables(self.fname1)
        new_schema_instance = form_schema_tables(self.fname2)

        no_of_total_genes = get_no_of_total_genes(curs)
        go_no2gene_no_set = get_go_no2gene_no_set(curs)

        # The flag used to be read only as a bare name, which resolves solely
        # when a module-level variable of that name exists (NameError
        # otherwise). Prefer the instance attribute when the object carries one
        # and keep the module-level name as a fallback so existing script usage
        # is unchanged.
        # NOTE(review): confirm that __init__ stores need_cal_hg_p_value on self.
        try:
            need_hg_p_value = self.need_cal_hg_p_value
        except AttributeError:
            need_hg_p_value = need_cal_hg_p_value

        unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = self.data_fetch(
            curs,
            old_schema_instance,
            self.filter_type,
            self.is_correct_type,
            no_of_total_genes,
            go_no2gene_no_set,
            need_hg_p_value,
        )

        if self.need_output_data_for_R:  # 2006-12-05
            self.output_data_for_R(known_data, "%s.known" % self.fname1)
            self.output_data_for_R(unknown_data, "%s.unknown" % self.fname1)

        # self.type selects the fit/predict pair and its parameter list.
        fit_function = self.fit_function_dict[self.type]
        predict_function = self.predict_function_dict[self.type]
        fit_model = fit_function(known_data, self.parameter_list_dict[self.type], self.bit_string)
        known_pred = predict_function(fit_model, known_data)
        unknown_pred = predict_function(fit_model, unknown_data)

        if self.debug:
            if self.type == 2:
                # randomForest's model has its own oob prediction
                fit_model_py = fit_model.as_py(BASIC_CONVERSION)
                print(self.cal_accuracy(known_data, fit_model_py["predicted"], pred_type=1))
            print(self.cal_accuracy(known_data, known_pred, pred_type=self.type))
            print(self.cal_accuracy(unknown_data, unknown_pred, pred_type=self.type))

        if self.commit:
            MpiPredictionFilter_instance = MpiPredictionFilter()
            # expose the old splat/mcl/pattern tables under the new schema's table names
            for table_attr in ("splat_table", "mcl_table", "pattern_table"):
                MpiPredictionFilter_instance.view_from_table(
                    curs,
                    getattr(old_schema_instance, table_attr),
                    getattr(new_schema_instance, table_attr),
                )
            MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
            self.record_data(
                curs,
                MpiPredictionFilter_instance,
                unknown_prediction_ls,
                unknown_pred,
                new_schema_instance,
                pred_type=self.type,
            )
            if self.type == 2:
                # 2006-10-31 randomForest's model has its own oob prediction,
                # but use rpart's way of storing prediction (pred_type=1)
                known_pred = fit_model.as_py(BASIC_CONVERSION)["predicted"]
                known_pred_type = 1
            else:
                known_pred_type = self.type
            self.record_data(
                curs,
                MpiPredictionFilter_instance,
                known_prediction_ls,
                known_pred,
                new_schema_instance,
                pred_type=known_pred_type,
            )
            curs.execute("end")