def data_fetch(self, curs, splat_table, mcl_table, crs_no=0, output_fname=None):
    """
    Write every cluster joined from mcl_table and splat_table to output_fname.

    04-17-05
        fetch cluster_dstructures for all clusters(Jasmine's request)
    04-19-05
        1. return a mcl_id2cluster_dstructure
        2. crs_no
    08-31-05
        output clusters directly to output_fname
    09-01-05
        add the last []

    A database cursor named crs<crs_no> is declared over the join of the two
    tables and read 5000 rows at a time. Every row is turned into a
    cluster_dstructure, annotated with its GO information and its edge
    correlation/significance lists, converted to text by
    return_string_form_of_cluster_dstructure() and written to the file, so
    that the file content has the form "r:=[<cluster>,<cluster>,...,[]]:".

    curs -- database cursor used for every query
    splat_table, mcl_table -- table names interpolated into the SQL text
    crs_no -- suffix of the cursor name
    output_fname -- path handed straight to open(); the default None is
        therefore not usable and a real path has to be given

    Returns mcl_id2cluster_dstructure. It is always an empty dictionary,
    because the statement that used to fill it is commented out.
    """
    gene_no2gene_id = get_gene_no2gene_id(curs)    #08-31-05
    mcl_id2cluster_dstructure = {}
    outf = open(output_fname, 'w')    #08-31-05
    try:
        outf.write("r:=[")    #08-31-05
        no_of_total_genes = get_no_of_total_genes(curs)
        sys.stderr.write("Getting the basic information for all clusters...\n")
        #06-20-05 connectivity_original faked to be 0
        curs.execute(
            "DECLARE crs%s CURSOR FOR select m.mcl_id, m.vertex_set, m.connectivity, 0, "
            "m.recurrence_array, s.edge_set, s.connectivity, m.cooccurrent_cluster_id from %s m, %s s where "
            "m.splat_id=s.splat_id" % (crs_no, mcl_table, splat_table))
        fetch_sql = "fetch 5000 from crs%s" % crs_no
        curs.execute(fetch_sql)
        rows = curs.fetchall()
        while rows:
            for row in rows:
                unit = cluster_dstructure()
                unit.cluster_id = row[0]
                # vertex_set and recurrence_array arrive as bracketed, comma separated
                # text; real lists are built because len() is taken of vertex_set below
                unit.vertex_set = [int(vertex) for vertex in row[1][1:-1].split(',')]
                unit.connectivity = row[2]
                unit.connectivity_original = row[3]
                unit.recurrence_array = [float(value) for value in row[4][1:-1].split(',')]
                unit.edge_set = parse_splat_table_edge_set(row[5])
                unit.splat_connectivity = row[6]
                unit.cooccurrent_cluster_id = row[7]
                unit.go_no2association_genes = self.get_go_functions_of_this_gene_set(curs, unit.vertex_set)
                #jasmine wants to cut some go-nos.
                unit.go_no2information = self.get_information_of_go_functions(
                    curs, unit.go_no2association_genes, len(unit.vertex_set),
                    no_of_total_genes, p_value_cut_off=0.05)
                unit.edge_cor_2d_list, unit.edge_sig_2d_list = self.get_cor_sig_2d_list(curs, unit.edge_set)

                str_tmp = self.return_string_form_of_cluster_dstructure(unit, gene_no2gene_id)    #08-31-05
                outf.write("%s," % str_tmp)
                #mcl_id2cluster_dstructure[unit.cluster_id] = unit
                #
                # disabled grouping of the clusters by cooccurrent_cluster_id:
                #order_1st_id, order_2nd_id = map(int, unit.cooccurrent_cluster_id.split('.'))
                #if order_1st_id not in self.order_1st_id2all_clusters:
                #    self.order_1st_id2all_clusters[order_1st_id] = {}
                #if order_2nd_id not in self.order_1st_id2all_clusters[order_1st_id]:
                #    self.order_1st_id2all_clusters[order_1st_id][order_2nd_id] = []
                #self.order_1st_id2all_clusters[order_1st_id][order_2nd_id].append(unit)
            curs.execute(fetch_sql)
            rows = curs.fetchall()
        outf.write("[]]:")    #08-31-05, 09-01-05 add the last blank []
    finally:
        # close explicitly so the file is flushed and released even when a
        # query or a parsing step above raises
        outf.close()
    sys.stderr.write("Done.\n")
    return mcl_id2cluster_dstructure
def submit_predictions(self, curs, schema_instance, prediction_pair2instance, cluster_id2properties):
    """
    Complete the cut-off fields of every prediction and submit it to the p_gene table.

    The table named by schema_instance.p_gene_table is created first. Then,
    for each instance in prediction_pair2instance, the properties of its
    cluster are looked up in cluster_id2properties by mcl_id and used as
    follows:
        properties[2] (vertex_set) -> hypergeometric p-value from
            cal_hg_p_value(), stored as both p_value_cut_off and avg_p_value,
            and its length stored as cluster_size_cut_off
        properties[0] -> connectivity_cut_off
        properties[1] -> unknown_cut_off
    The completed instance is handed to submit_to_p_gene_table().

    When self.report is set, the running count is written to stderr every
    2000 submissions and once more at the end.
    """
    sys.stderr.write("Submitting predictions...\n")
    prediction_filter = MpiPredictionFilter()
    p_gene_table = schema_instance.p_gene_table
    prediction_filter.createGeneTable(curs, p_gene_table)
    no_of_total_genes = get_no_of_total_genes(curs)
    go_no2gene_no_set = get_go_no2gene_no_set(curs)

    no_of_submitted = 0
    for _pair, prediction in prediction_pair2instance.iteritems():
        #1st fill those empty items
        cluster_properties = cluster_id2properties[prediction.mcl_id]
        cluster_vertices = cluster_properties[2]
        hg_p_value = cal_hg_p_value(
            prediction.gene_no, prediction.go_no, cluster_vertices,
            no_of_total_genes, go_no2gene_no_set, r)
        prediction.p_value_cut_off = hg_p_value
        prediction.avg_p_value = prediction.p_value_cut_off
        prediction.connectivity_cut_off = cluster_properties[0]
        prediction.cluster_size_cut_off = len(cluster_vertices)
        prediction.unknown_cut_off = cluster_properties[1]
        prediction_filter.submit_to_p_gene_table(curs, p_gene_table, prediction)

        no_of_submitted += 1
        if self.report and not no_of_submitted % 2000:
            # 20 backspaces in front of the number
            sys.stderr.write("%s%s" % ('\x08'*20, no_of_submitted))

    if self.report:
        sys.stderr.write("%s%s" % ('\x08'*20, no_of_submitted))
    sys.stderr.write("Done.\n")
def get_known_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
    """
    Return only the known_data part of what data_fetch() delivers for fname.

    The schema tables are derived from fname with form_schema_tables(), the
    total gene count and the go_no -> gene_no set mapping are read through
    curs, and everything is passed on to self.data_fetch(). data_fetch() is
    expected to return exactly three values; the first two are discarded.
    """
    tables = form_schema_tables(fname)
    total_genes = get_no_of_total_genes(curs)
    go_no2genes = get_go_no2gene_no_set(curs)
    fetched = self.data_fetch(
        curs, tables, filter_type, is_correct_type,
        total_genes, go_no2genes, need_cal_hg_p_value)
    _prediction_ls, _all_data, known_data = fetched
    return known_data
def get_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
    """
    Return (unknown_data, known_data) as delivered by data_fetch() for fname.

    11-19-05
        data_fetch() of rpart_prediction.py changed
        return unknown_data

    The schema tables are derived from fname with form_schema_tables(), the
    total gene count and the go_no -> gene_no set mapping are read through
    curs, and everything is passed on to self.data_fetch(). data_fetch() is
    expected to return exactly four values; the two prediction lists are
    discarded.
    """
    tables = form_schema_tables(fname)
    total_genes = get_no_of_total_genes(curs)
    go_no2genes = get_go_no2gene_no_set(curs)
    fetched = self.data_fetch(
        curs, tables, filter_type, is_correct_type,
        total_genes, go_no2genes, need_cal_hg_p_value)
    _unknown_predictions, _known_predictions, unknown_data, known_data = fetched
    return unknown_data, known_data
def run(self):
    """
    Fetch the predictions of the old schema, fit and predict, record into the new schema.

    11-09-05
    11-09-05 add rpart_cp
    11-10-05 add need_cal_hg_p_value

    --db_connect()
    --form_schema_tables()
    --form_schema_tables()
    --get_no_of_total_genes()
    --get_go_no2gene_no_set()
    --data_fetch()
        --get_vertex_list()
        --cal_hg_p_value()
    --rpart_fit_and_predict()
    --MpiPredictionFilter_instance....()
    --record_data()

    The old schema is derived from self.fname1 and the new one from
    self.fname2. The splat, mcl and pattern tables of the new schema are set
    up from the old ones through view_from_table(), the new p_gene table is
    created, and the predictions are stored by record_data(). "end" is sent
    to the database only when self.commit is set.
    """
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    source_schema = form_schema_tables(self.fname1)
    target_schema = form_schema_tables(self.fname2)
    total_genes = get_no_of_total_genes(curs)
    go_no2genes = get_go_no2gene_no_set(curs)
    # NOTE(review): need_cal_hg_p_value is a bare name here, neither a local nor
    # a self attribute; it only resolves if a module-level variable of that name
    # exists when run() is called -- confirm that this is intended.
    prediction_ls, all_data, known_data = self.data_fetch(
        curs, source_schema, self.filter_type, self.is_correct_type,
        total_genes, go_no2genes, need_cal_hg_p_value)

    # disabled validation run:
    #testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.no_of_buckets, self.rpart_cp, \
    #    self.loss_matrix, self.prior_prob)
    #print testing_acc_ls
    #print training_acc_ls

    pred, pred_training = self.rpart_fit_and_predict(
        all_data, known_data, self.rpart_cp, self.loss_matrix, self.prior_prob)

    prediction_filter = MpiPredictionFilter()
    for table_attr in ("splat_table", "mcl_table", "pattern_table"):
        prediction_filter.view_from_table(
            curs, getattr(source_schema, table_attr), getattr(target_schema, table_attr))
    prediction_filter.createGeneTable(curs, target_schema.p_gene_table)
    self.record_data(curs, prediction_filter, prediction_ls, pred, target_schema)

    if self.commit:
        curs.execute("end")
def get_cluster_dstructure(self, curs, mcl_id, splat_table, mcl_table):
    """
    Return the cluster identified by mcl_id, with its annotations filled in.

    04-18-05
        called by GuiAnalyzer.py

    --get_basic_cluster_dstructure()
    --get_go_functions_of_this_gene_set()
    --get_information_of_go_functions()
    --get_cor_sig_2d_list()
    --graph_from_node_edge_set()
    --column_output()

    The basic structure comes from get_basic_cluster_dstructure(). When that
    result is true, go_no2association_genes, go_no2information,
    edge_cor_2d_list and edge_sig_2d_list are set on it. Otherwise the result
    is handed back unchanged (the original comment suggests this means None).
    """
    no_of_total_genes = get_no_of_total_genes(curs)
    cluster = self.get_basic_cluster_dstructure(curs, mcl_id, splat_table, mcl_table)
    if cluster:    #not None
        cluster.go_no2association_genes = self.get_go_functions_of_this_gene_set(curs, cluster.vertex_set)
        cluster.go_no2information = self.get_information_of_go_functions(
            curs, cluster.go_no2association_genes, len(cluster.vertex_set), no_of_total_genes)
        cluster.edge_cor_2d_list, cluster.edge_sig_2d_list = self.get_cor_sig_2d_list(curs, cluster.edge_set)
        #graph = self.graph_from_node_edge_set(cluster.vertex_set, cluster.edge_set)
    return cluster
def run(self):
    """
    Fetch known and unknown predictions, fit a model on the known ones, predict and record.

    11-09-05
    11-09-05 add rpart_cp
    11-10-05 add need_cal_hg_p_value
    11-23-05 rpart_fit_and_predict() is split
    2006-12-05 add need_output_data_for_R flag

    --db_connect()
    --form_schema_tables()
    --form_schema_tables()
    --get_no_of_total_genes()
    --get_go_no2gene_no_set()
    --data_fetch()
        --get_vertex_list()
        --cal_hg_p_value()
    --output_data_for_R()
    --rpart_fit()
    --rpart_predict()
    --rpart_predict()
    --MpiPredictionFilter_instance....()
    --record_data()

    The fit and predict functions are picked from self.fit_function_dict and
    self.predict_function_dict by self.type. Nothing is written to the new
    schema (derived from self.fname2) unless self.commit is set; in that case
    the tables are set up, unknown and known predictions are recorded and
    "end" is sent to the database.
    """
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    source_schema = form_schema_tables(self.fname1)
    target_schema = form_schema_tables(self.fname2)
    total_genes = get_no_of_total_genes(curs)
    go_no2genes = get_go_no2gene_no_set(curs)
    # NOTE(review): need_cal_hg_p_value is a bare name here, neither a local nor
    # a self attribute; it only resolves if a module-level variable of that name
    # exists when run() is called -- confirm that this is intended.
    fetched = self.data_fetch(
        curs, source_schema, self.filter_type, self.is_correct_type,
        total_genes, go_no2genes, need_cal_hg_p_value)
    unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = fetched

    if self.need_output_data_for_R:    # 2006-12-05
        self.output_data_for_R(known_data, "%s.known" % self.fname1)
        self.output_data_for_R(unknown_data, "%s.unknown" % self.fname1)

    # disabled validation run:
    #testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.training_perc, self.rpart_cp, \
    #    self.loss_matrix, self.prior_prob)
    #print testing_acc_ls
    #print training_acc_ls

    fit = self.fit_function_dict[self.type]
    fit_model = fit(known_data, self.parameter_list_dict[self.type], self.bit_string)
    predict = self.predict_function_dict[self.type]
    known_pred = predict(fit_model, known_data)
    unknown_pred = predict(fit_model, unknown_data)

    if self.debug:
        if self.type == 2:
            # randomForest's model has its own oob prediction
            oob_model = fit_model.as_py(BASIC_CONVERSION)
            print(self.cal_accuracy(known_data, oob_model["predicted"], pred_type=1))
        print(self.cal_accuracy(known_data, known_pred, pred_type=self.type))
        print(self.cal_accuracy(unknown_data, unknown_pred, pred_type=self.type))

    if not self.commit:
        return

    prediction_filter = MpiPredictionFilter()
    for table_attr in ("splat_table", "mcl_table", "pattern_table"):
        prediction_filter.view_from_table(
            curs, getattr(source_schema, table_attr), getattr(target_schema, table_attr))
    prediction_filter.createGeneTable(curs, target_schema.p_gene_table)
    self.record_data(
        curs, prediction_filter, unknown_prediction_ls, unknown_pred,
        target_schema, pred_type=self.type)

    if self.type == 2:
        # 2006-10-31 randomForest's model has its own oob prediction, but use
        # rpart's way of storing prediction
        known_pred = fit_model.as_py(BASIC_CONVERSION)["predicted"]
        known_pred_type = 1
    else:
        known_pred_type = self.type
    self.record_data(
        curs, prediction_filter, known_prediction_ls, known_pred,
        target_schema, pred_type=known_pred_type)
    curs.execute("end")