Пример #1
0
	def dstruc_loadin(self, curs):
		'''
		Load the lookup dictionaries used by this object from the database.

		Sets self.go_no2go_id, self.go_no2go_name, self.gene_no2gene_id,
		self.gene_id2gene_no and self.global_gene_to_go_dict via the
		codense.common helpers, then builds self.label_dict. When self.type
		is 3, self.no_of_datasets is additionally read from self.table.

		curs: database cursor passed to every helper and used for the
			queries issued directly in this method.
		'''
		sys.stderr.write("Loading Data STructure...\n")
		
		from codense.common import get_go_no2go_id, get_gene_no2gene_id, get_go_no2name, get_gene_id2gene_no, get_gene_no2go_no
		self.go_no2go_id = get_go_no2go_id(curs)
		self.go_no2go_name = get_go_no2name(curs)
		self.gene_no2gene_id = get_gene_no2gene_id(curs)
		self.gene_id2gene_no = get_gene_id2gene_no(curs)
		self.global_gene_to_go_dict = get_gene_no2go_no(curs)
		
		#04-01-05 the second kind in label_dict: every gene_no labelled by itself
		identity_labels = dict((gene_no, gene_no) for gene_no in self.gene_no2gene_id)
		self.label_dict = {1: self.gene_no2gene_id, 2: identity_labels}
		
		#NOTE(review): the result of this query is never fetched in this method and
		#is superseded by the next execute() when self.type is 3 -- confirm whether
		#a caller reads it from curs before removing it.
		curs.execute("select gene_no,go_functions from gene")
		
		if self.type == 3:
			#array_upper(..., 1) of the first row's recurrence_array is taken as the dataset count
			curs.execute("select array_upper(recurrence_array,1) from %s limit 1"%self.table)
			first_row = curs.fetchall()[0]
			self.no_of_datasets = int(first_row[0])
		
		sys.stderr.write("Done\n")
Пример #2
0
    def dstruc_loadin(self, curs):
        """
        Load the lookup dictionaries and populate self.gene_prediction_dict.

        03-09-05
            get the context from mcl_table via linking through mcl_id of p_gene_table
            context_dict is set

        Every fetched row becomes a function_struc stored under
        self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no].
        When self.type is 2, rows whose gene is not in self.known_genes_dict
        are skipped; when self.type is 3, rows whose gene is in it are skipped.

        curs: database cursor passed to the codense.common helpers and used
            for the prediction query.
        """
        from codense.common import get_known_genes_dict, get_go_no2go_id, get_go_no2name, get_gene_no2gene_id

        self.known_genes_dict = get_known_genes_dict(curs)
        self.go_no2go_id = get_go_no2go_id(curs)
        self.go_no2go_name = get_go_no2name(curs)
        self.gene_no2gene_id = get_gene_no2gene_id(curs)

        sys.stderr.write("Setting up gene_prediction_dict...")
        # one row per prediction, joined to its mcl cluster's vertex_set
        query = (
            "select p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, p.is_correct_lca, m.vertex_set"
            "\t\t\tfrom %s p, %s g, %s m where g.p_gene_id=p.p_gene_id and m.mcl_id=p.mcl_id"
        ) % (self.gene_table, self.table, self.mcl_table)
        curs.execute(query)

        for record in curs.fetchall():
            gene_no = record[0]
            # type 2: only known genes are wanted
            if self.type == 2 and gene_no not in self.known_genes_dict:
                continue
            # type 3: only unknown genes are wanted
            if self.type == 3 and gene_no in self.known_genes_dict:
                continue

            go_no, is_correct, is_correct_l1, is_correct_lca, raw_vertex_set = record[1:6]
            # drop the first and last character of the text, then split the comma-separated integers
            vertex_list = [int(vertex) for vertex in raw_vertex_set[1:-1].split(",")]

            prediction = function_struc()
            prediction.is_correct = is_correct
            prediction.is_correct_l1 = is_correct_l1
            prediction.is_correct_lca = is_correct_lca
            # context_dict is a set
            prediction.context_dict = Set(vertex_list)

            # create the per-gene container on first sight, then register the prediction
            if gene_no not in self.gene_prediction_dict:
                self.gene_prediction_dict[gene_no] = gene_prediction()
            self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no] = prediction

        sys.stderr.write("Done\n")

        """
Пример #3
0
	def run(self):
		"""
		Start one output-format worker per enabled bit of self.running_bit and
		wait for all of them to finish.

		Position i of self.running_bit selects class_list[i] / output_fname_list[i];
		a worker is created and started only where the bit is '1'. All output
		files are placed in self.output_dir, which is created if missing.

		09-28-05
		12-19-05
			use class_list and output_fname_list to ease program writing
		12-30-05
			fix a bug in indexing darwin_instance_list
		2006-09-25
		2007-02-08
			add context_prediction_csv_format

		Raises IndexError if self.running_bit has a '1' at a position beyond
		the end of class_list.
		"""
		#Only the file name of input_fname is used for output names; joining a
		#directory-qualified or absolute name would place the file outside output_dir.
		input_basename = os.path.basename(self.input_fname)
		tf_darwin_ofname = os.path.join(self.output_dir, '%s.tf.darwin'%self.cluster_bs_table)
		cluster_darwin_ofname = os.path.join(self.output_dir, '%s.cluster.darwin'%input_basename)
		prediction_darwin_ofname = os.path.join(self.output_dir, '%s.prediction.darwin'%input_basename)
		pattern_darwin_ofname = os.path.join(self.output_dir, '%s.pattern.darwin'%self.pattern_table)
		context_prediction_csv_fname = os.path.join(self.output_dir, '%s.context.csv'%input_basename)
		
		if not os.path.isdir(self.output_dir):
			os.makedirs(self.output_dir)
		#conn is kept in a local so the connection object stays referenced while curs is used
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		
		tax_id = org2tax_id(self.organism)
		#gene_no2id = get_gene_no2gene_id(curs)	#Watch, if unigene, should use this.
		gene_id2symbol = get_gene_id2gene_symbol(curs, tax_id)
		gene_id2symbol = self.replace_prime_in_gene_id2symbol(gene_id2symbol)	#01-26-06
		
		#Jasmine wants the gene symbol 09-28-05
		#gene_id is integer in gene.gene table and same as gene_no, so just use it.
		go_no2name = get_go_no2name(curs)	#09-28-05 Jasmine wants the go_name, not go_id
		
		#2006-09-25 use gene_id2symbol to replace mt_no2tf_name
		mt_no2tf_name = gene_id2symbol
		
		#class_list and output_fname_list are parallel: same index, same output format
		class_list = [tf_darwin_format, cluster_darwin_format, prediction_darwin_format, pattern_darwin_format, context_prediction_csv_format]
		output_fname_list = [tf_darwin_ofname, cluster_darwin_ofname, prediction_darwin_ofname, pattern_darwin_ofname, context_prediction_csv_fname]
		
		darwin_instance_list = []
		for i, bit in enumerate(self.running_bit):
			if bit != '1':
				continue
			#NOTE(review): debug and report are not locals or attributes here; they are
			#looked up in the enclosing module scope -- confirm they are defined there.
			instance = class_list[i](self.hostname, self.dbname, self.schema, self.pattern_table,
				self.cluster_bs_table, self.input_fname, self.lm_bit, self.acc_cut_off,
				output_fname_list[i], gene_id2symbol, go_no2name, mt_no2tf_name, debug, report)	#2006-09-25
			darwin_instance_list.append(instance)
			instance.start()
		
		#wait for every started worker
		for instance in darwin_instance_list:
			instance.join()
Пример #4
0
	def output(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
		"""
		Write the prediction table to outf as tab-delimited rows: all known
		genes first, then all unknown genes.

		03-03-05
			loop over gene_no2p_gene_id_src and p_gene_id_src_map
		03-13-05
			add a column, #clusters in the output file
			
			--output_one_gene()
			--output_function_group()

		For each gene: output_one_gene() is called, a column-header row is
		written, output_function_group() is called once per p_gene_id_src of
		that gene, and an empty row closes the gene's section.

		curs: database cursor passed on to the lookup helpers and output methods.
		outf: file object wrapped by csv.writer.
		known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src: dictionaries
			mapping gene_no to a list of p_gene_id_src.
		p_gene_id_src_map: indexed by p_gene_id_src; the value is handed to
			output_function_group().
		"""
		#lookup dictionaries shared by every gene
		gene_no2gene_id = get_gene_no2gene_id(curs)
		gene_no2direct_go = get_gene_no2direct_go(curs)
		go_no2go_id = get_go_no2go_id(curs)
		go_no2name = get_go_no2name(curs)
		go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)
		
		sys.stderr.write("Outputing prediction table...")
		writer = csv.writer(outf, delimiter='\t')
		header = ['go_no', 'go_id', 'go_name', 'is_correct', 'is_correct_L1', 'is_correct_lca',
			'p_value_list', '#clusters', 'mcl_id_list', 'e_acc', 'e_acc_pair', 'cluster_context']
		#first the known genes, second the unknown genes; both are written the same way
		for gene_group in (known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
			for gene_no, p_gene_id_src_list in gene_group.iteritems():
				self.output_one_gene(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
				writer.writerow(header)
				for p_gene_id_src in p_gene_id_src_list:
					unit = p_gene_id_src_map[p_gene_id_src]
					self.output_function_group(curs, writer, unit, gene_no2gene_id,
						go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
				#blank row separates consecutive genes
				writer.writerow([])
		del writer
		sys.stderr.write("Done\n")
Пример #5
0
	def output1(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
		"""
		Write the prediction table to outf as tab-delimited rows, using the
		output_one_gene1()/output_function_group1() variants.

		03-15-05
			copied from output()

		Known genes are written first, then unknown genes. The two groups use
		different column headers, and output_function_group1() receives
		prediction_pair2lca_list and gene_no only for known genes.

		curs: database cursor passed on to the lookup helpers and output methods.
		outf: file object wrapped by csv.writer.
		known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src: dictionaries
			mapping gene_no to a list of p_gene_id_src.
		p_gene_id_src_map: indexed by p_gene_id_src; the value is handed to
			output_function_group1().
		"""
		#lookup dictionaries shared by every gene
		gene_no2gene_id = get_gene_no2gene_id(curs)
		gene_no2direct_go = get_gene_no2direct_go(curs)
		go_no2go_id = get_go_no2go_id(curs)
		go_no2name = get_go_no2name(curs)
		go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)
		from codense.common import get_prediction_pair2lca_list
		prediction_pair2lca_list = get_prediction_pair2lca_list(curs,p_gene_table=self.p_gene_table)
		
		sys.stderr.write("Outputing prediction table...")
		writer = csv.writer(outf, delimiter='\t')
		known_header = ['go_id', 'go_name', 'is_correct_lca', 'lca_list', 'p_value_list', '#clusters', 'e_acc']
		unknown_header = ['go_id', 'go_name', 'p_value_list', '#clusters', 'e_acc']
		#(genes, header row, is_known) -- known genes first, unknown genes second
		sections = (
			(known_gene_no2p_gene_id_src, known_header, True),
			(unknown_gene_no2p_gene_id_src, unknown_header, False),
		)
		for gene_group, header, is_known in sections:
			for gene_no, p_gene_id_src_list in gene_group.iteritems():
				self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
				writer.writerow(header)
				for p_gene_id_src in p_gene_id_src_list:
					unit = p_gene_id_src_map[p_gene_id_src]
					#NOTE: the arguments passed to this function is different between known and unknown genes.
					if is_known:
						self.output_function_group1(curs, writer, unit, gene_no2gene_id,
							go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair, prediction_pair2lca_list, gene_no)
					else:
						self.output_function_group1(curs, writer, unit, gene_no2gene_id,
							go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
				#blank row separates consecutive genes
				writer.writerow([])
		del writer
		sys.stderr.write("Done\n")