def dstruc_loadin(self, curs):
    """
    Load the lookup dictionaries used by this object through curs.

    curs is a database cursor. It is passed to each codense.common getter
    and is also used directly for the queries issued below.

    Attributes set on self:
        go_no2go_id, go_no2go_name, gene_no2gene_id, gene_id2gene_no,
        global_gene_to_go_dict
            results of the corresponding codense.common getters
        label_dict
            {1: gene_no2gene_id, 2: a dictionary mapping every gene_no to itself}
        no_of_datasets
            only when self.type is 3: array_upper(recurrence_array,1) of a
            single row (limit 1) of self.table, converted to int
    """
    sys.stderr.write("Loading Data STructure...\n")
    from codense.common import get_go_no2go_id, get_gene_no2gene_id, \
        get_go_no2name, get_gene_id2gene_no, get_gene_no2go_no

    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)
    self.gene_id2gene_no = get_gene_id2gene_no(curs)
    self.global_gene_to_go_dict = get_gene_no2go_no(curs)

    # 04-01-05 the second kind in label_dict: each gene_no labelled by itself
    identity_labels = dict([(no, no) for no in self.gene_no2gene_id])
    self.label_dict = {1: self.gene_no2gene_id, 2: identity_labels}

    # NOTE(review): the result of this query is never fetched in this method.
    curs.execute("select gene_no,go_functions from gene")
    if self.type == 3:
        curs.execute("select array_upper(recurrence_array,1) from %s limit 1"%self.table)
        first_row = curs.fetchall()[0]
        self.no_of_datasets = int(first_row[0])
    sys.stderr.write("Done\n")
def dstruc_loadin(self, curs):
    """
    03-09-05
        get the context from mcl_table via linking through mcl_id of p_gene_table
        context_dict is set

    Load the lookup dictionaries through curs and fill
    self.gene_prediction_dict from the joined prediction/cluster tables.

    curs is a database cursor. It is passed to the codense.common getters
    and used for the join over self.gene_table, self.table and
    self.mcl_table.

    Row filtering by self.type:
        2 -- rows whose gene_no is not in self.known_genes_dict are skipped
        3 -- rows whose gene_no is in self.known_genes_dict are skipped
        any other value -- every row is kept

    For each kept row a function_struc is stored at
    self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no]; a
    gene_prediction is created for a gene_no seen for the first time.
    self.gene_prediction_dict itself must already exist on the instance.
    """
    from codense.common import get_known_genes_dict, get_go_no2go_id, \
        get_go_no2name, get_gene_no2gene_id

    self.known_genes_dict = get_known_genes_dict(curs)
    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)

    sys.stderr.write("Setting up gene_prediction_dict...")
    # setup self.gene_prediction_dict
    curs.execute(
        "select p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, p.is_correct_lca, m.vertex_set\
        from %s p, %s g, %s m where g.p_gene_id=p.p_gene_id and m.mcl_id=p.mcl_id"
        % (self.gene_table, self.table, self.mcl_table)
    )
    for row in curs.fetchall():
        gene_no = row[0]
        if self.type == 2 and gene_no not in self.known_genes_dict:
            # I only want the known genes, but this gene is unknown
            continue
        elif self.type == 3 and gene_no in self.known_genes_dict:
            # i only want the unknown genes, but this gene is known
            continue
        go_no = row[1]

        # row[5] is text: its first and last characters are dropped and the
        # remainder is a comma-separated list of integers (presumably a
        # PostgreSQL array literal such as "{1,2,3}" -- confirm). An empty
        # list ("{}") splits into [''], which int() rejects, so empty tokens
        # are skipped and the context becomes an empty set instead of raising.
        vertex_set = [
            int(vertex) for vertex in row[5][1:-1].split(",") if vertex.strip()
        ]

        item = function_struc()
        item.is_correct = row[2]
        item.is_correct_l1 = row[3]
        item.is_correct_lca = row[4]
        # context_dict is a set
        item.context_dict = Set(vertex_set)

        if gene_no not in self.gene_prediction_dict:
            self.gene_prediction_dict[gene_no] = gene_prediction()
        self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no] = item
    sys.stderr.write("Done\n")
def run(self):
    """
    09-28-05
    12-19-05
        use class_list and output_fname_list to ease program writing
    12-30-05
        fix a bug in indexing darwin_instance_list
    2006-09-25
    2007-02-08
        add context_prediction_csv_format

    Build the output file names, load the shared lookup dictionaries and
    run one formatter for every '1' in self.running_bit.

    Position i of self.running_bit selects class_list[i] together with
    output_fname_list[i]. Each selected class is instantiated, start() is
    called on it immediately, and once all of them have been started join()
    is called on each in the same order.

    self.output_dir is created when it does not exist yet.

    The context csv file name is derived from the basename of
    self.input_fname, like the cluster and prediction file names, so that
    the file always lands in self.output_dir: os.path.join() would
    otherwise drop self.output_dir for an absolute input path.
    """
    input_basename = os.path.basename(self.input_fname)
    tf_darwin_ofname = os.path.join(self.output_dir, '%s.tf.darwin'%self.cluster_bs_table)
    cluster_darwin_ofname = os.path.join(self.output_dir, '%s.cluster.darwin'%input_basename)
    prediction_darwin_ofname = os.path.join(self.output_dir, '%s.prediction.darwin'%input_basename)
    pattern_darwin_ofname = os.path.join(self.output_dir, '%s.pattern.darwin'%self.pattern_table)
    context_prediction_csv_fname = os.path.join(self.output_dir, '%s.context.csv'%input_basename)

    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)

    conn, curs = db_connect(self.hostname, self.dbname, self.schema)
    tax_id = org2tax_id(self.organism)
    #gene_no2id = get_gene_no2gene_id(curs)    #Watch, if unigene, should use this.
    gene_id2symbol = get_gene_id2gene_symbol(curs, tax_id)
    gene_id2symbol = self.replace_prime_in_gene_id2symbol(gene_id2symbol)    #01-26-06
    #gene_no2symbol = dict_transfer(gene_no2id, gene_id2symbol)
    #Jasmine wants the gene symbol 09-28-05
    #gene_id is integer in gene.gene table and same as gene_no, so just use it.
    go_no2name = get_go_no2name(curs)    #09-28-05 Jasmine wants the go_name, not go_id

    #2006-09-25 use gene_id2symbol to replace mt_no2tf_name
    #mt_no2tf_name = get_mt_no2tf_name()
    mt_no2tf_name = gene_id2symbol

    class_list = [tf_darwin_format, cluster_darwin_format, prediction_darwin_format,
        pattern_darwin_format, context_prediction_csv_format]
    output_fname_list = [tf_darwin_ofname, cluster_darwin_ofname, prediction_darwin_ofname,
        pattern_darwin_ofname, context_prediction_csv_fname]

    darwin_instance_list = []
    for i, bit in enumerate(self.running_bit):
        if bit != '1':
            continue
        # NOTE(review): debug and report are bare names here, not attributes
        # of self; they have to be resolvable from the enclosing scope.
        instance = class_list[i](self.hostname, self.dbname, self.schema, self.pattern_table,
            self.cluster_bs_table, self.input_fname, self.lm_bit, self.acc_cut_off,
            output_fname_list[i], gene_id2symbol, go_no2name, mt_no2tf_name, debug, report)    #2006-09-25
        darwin_instance_list.append(instance)
        # 12-30-05 start the instance just appended, not the one at index i
        instance.start()

    for instance in darwin_instance_list:
        instance.join()
def output(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
    """
    03-03-05
        loop over gene_no2p_gene_id_src and p_gene_id_src_map
    03-13-05
        add a column, #clusters in the output file

    Write the prediction table to outf as tab-delimited rows.

    All entries of known_gene_no2p_gene_id_src are written first, followed
    by all entries of unknown_gene_no2p_gene_id_src. Both map a gene_no to
    a list of keys of p_gene_id_src_map. Per gene the output is: whatever
    output_one_gene() writes, one column-header row, whatever
    output_function_group() writes for each listed key, and one empty row.

    --output_one_gene()
    --output_function_group()
    """
    # lookup dictionaries handed down to the per-gene helpers
    gene_no2gene_id = get_gene_no2gene_id(curs)
    gene_no2direct_go = get_gene_no2direct_go(curs)
    go_no2go_id = get_go_no2go_id(curs)
    go_no2name = get_go_no2name(curs)
    go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)

    sys.stderr.write("Outputing prediction table...")
    writer = csv.writer(outf, delimiter='\t')
    header = ['go_no', 'go_id', 'go_name', 'is_correct', 'is_correct_L1', 'is_correct_lca',
        'p_value_list', '#clusters', 'mcl_id_list', 'e_acc', 'e_acc_pair', 'cluster_context']

    # first the known genes, second the unknown genes; identical layout
    for gene_no2src_list in (known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
        for gene_no, src_list in gene_no2src_list.iteritems():
            self.output_one_gene(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
            writer.writerow(header)
            for src in src_list:
                self.output_function_group(curs, writer, p_gene_id_src_map[src], gene_no2gene_id,
                    go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
            writer.writerow([])

    del writer
    sys.stderr.write("Done\n")
def output1(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
    """
    03-15-05
        copied from output()

    Write the prediction table to outf as tab-delimited rows, using the
    output_one_gene1() / output_function_group1() helpers.

    Known genes are written first, unknown genes second. Both mappings go
    from a gene_no to a list of keys of p_gene_id_src_map. Per gene the
    output is: whatever output_one_gene1() writes, one column-header row,
    whatever output_function_group1() writes for each listed key, and one
    empty row.

    The two passes differ: known genes get the 'is_correct_lca' and
    'lca_list' columns, and their output_function_group1() call receives
    prediction_pair2lca_list and gene_no as two extra arguments.
    """
    # lookup dictionaries handed down to the per-gene helpers
    gene_no2gene_id = get_gene_no2gene_id(curs)
    gene_no2direct_go = get_gene_no2direct_go(curs)
    go_no2go_id = get_go_no2go_id(curs)
    go_no2name = get_go_no2name(curs)
    go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)
    from codense.common import get_prediction_pair2lca_list
    prediction_pair2lca_list = get_prediction_pair2lca_list(curs,p_gene_table=self.p_gene_table)

    sys.stderr.write("Outputing prediction table...")
    writer = csv.writer(outf, delimiter='\t')
    known_header = ['go_id', 'go_name', 'is_correct_lca', 'lca_list', 'p_value_list', '#clusters',
        'e_acc']
    unknown_header = ['go_id', 'go_name', 'p_value_list', '#clusters', 'e_acc']

    # first output the known genes
    for gene_no, src_list in known_gene_no2p_gene_id_src.iteritems():
        self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
        writer.writerow(known_header)
        for src in src_list:
            # NOTE: the arguments passed to this function is different between known and unknown genes.
            self.output_function_group1(curs, writer, p_gene_id_src_map[src], gene_no2gene_id,
                go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair,
                prediction_pair2lca_list, gene_no)
        writer.writerow([])

    # second output the unknown genes
    for gene_no, src_list in unknown_gene_no2p_gene_id_src.iteritems():
        self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
        writer.writerow(unknown_header)
        for src in src_list:
            self.output_function_group1(curs, writer, p_gene_id_src_map[src], gene_no2gene_id,
                go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
        writer.writerow([])

    del writer
    sys.stderr.write("Done\n")