示例#1
0
def compare_gene_condition_vertex_set(curs, p_gene_table, gene_p_table, good_cluster_table, output_fname):
    """Write, for every gene, the comparison of its clusters against cluster vertex sets.

    The work happens in three stages:

    1. Join ``p_gene_table`` and ``gene_p_table`` on ``p_gene_id`` and group the
       ``[mcl_id, go_no]`` pairs by ``gene_no``.
    2. Read ``mcl_id``, ``vertex_set`` and ``recurrence_array`` from
       ``good_cluster_table`` into a dict
       ``mcl_id -> [recurrence_array, Set(vertex_set)]``.
    3. Pass each gene's pair list together with that dict to
       ``get_mcl_id_sharing`` and write every returned row, prefixed with the
       gene number, to ``output_fname`` as tab-delimited text.

    Args:
        curs: cursor object exposing ``execute()`` and ``fetchall()``; the
            database must understand ``DECLARE ... CURSOR``, ``FETCH`` and
            ``CLOSE``.
        p_gene_table: name of the table aliased ``p`` in the join.
        gene_p_table: name of the table aliased ``g`` in the join.
        good_cluster_table: name of the table holding the cluster rows.
        output_fname: path of the tab-delimited file to create (overwritten).

    Progress is reported on stderr. Nothing is returned.
    """
    import csv
    import sys
    from codense.common import pg_1d_array2python_ls

    try:
        from sets import Set
    except ImportError:
        # The ``sets`` module only exists on Python 2; the builtin type is its
        # replacement. On Python 2 the original ``Set`` class is still used so
        # that the objects given to get_mcl_id_sharing() keep their type.
        Set = set

    # NOTE: table names are interpolated straight into the SQL text, so they
    # must come from a trusted source.
    sys.stderr.write("Getting gene_no2mcl_id_go_no_list ...\n")
    gene_no2mcl_id_go_no_list = {}
    curs.execute(
        "DECLARE crs1 CURSOR for select p.gene_no, p.mcl_id, p.go_no from %s p, %s g\
		where p.p_gene_id=g.p_gene_id"
        % (p_gene_table, gene_p_table)
    )
    counter = 0
    curs.execute("fetch 1000 from crs1")
    rows = curs.fetchall()
    while rows:
        for gene_no, mcl_id, go_no in rows:
            gene_no2mcl_id_go_no_list.setdefault(gene_no, []).append([mcl_id, go_no])
            counter += 1
        # Backspaces rewind the terminal so the running count overwrites itself.
        sys.stderr.write("%s%s" % ("\x08" * 30, counter))
        curs.execute("fetch 1000 from crs1")
        rows = curs.fetchall()
    curs.execute("close crs1")
    sys.stderr.write("Done.\n")

    sys.stderr.write("Getting mcl_id2recurrence_array_vertex_set ...\n")
    mcl_id2recurrence_array_vertex_set = {}
    curs.execute("DECLARE crs0 CURSOR for select mcl_id, vertex_set, recurrence_array from %s" % good_cluster_table)
    counter = 0
    curs.execute("fetch 1000 from crs0")
    rows = curs.fetchall()
    while rows:
        for mcl_id, vertex_set, recurrence_array in rows:
            # Both columns arrive as database array values and are converted
            # to Python lists by the project helper.
            vertex_set = pg_1d_array2python_ls(vertex_set)
            recurrence_array = pg_1d_array2python_ls(recurrence_array)
            mcl_id2recurrence_array_vertex_set[mcl_id] = [recurrence_array, Set(vertex_set)]
            counter += 1
        sys.stderr.write("%s%s" % ("\x08" * 30, counter))
        curs.execute("fetch 1000 from crs0")
        rows = curs.fetchall()
    curs.execute("close crs0")
    sys.stderr.write("Done.\n")

    sys.stderr.write("Comparing gene condition and vertex_set ...\n")
    # The context manager guarantees the file is flushed and closed, even when
    # get_mcl_id_sharing() raises part-way through.
    with open(output_fname, "w") as outf:
        writer = csv.writer(outf, delimiter="\t")
        for gene_no, mcl_id_go_no_list in gene_no2mcl_id_go_no_list.items():
            cmp_condition_vertex_result = get_mcl_id_sharing(mcl_id_go_no_list, mcl_id2recurrence_array_vertex_set)
            for row in cmp_condition_vertex_result:
                writer.writerow([gene_no] + row)
    sys.stderr.write("Done.\n")
	def get_prot_interaction_graph(self, curs, prot_interaction_table, tax_id):
		"""
		Build the protein interaction graph for one tax_id.

		Rows (gene_id_array, interaction_type_id) are read from
		prot_interaction_table through a server-side cursor, 3000 at a time.
		Every unordered pair of genes within one gene_id_array becomes an edge
		carrying interaction_type_id. The numbers of nodes, edges and connected
		components are written to stderr before the graph is returned.

		History:
		2006-11-20
		2006-01-05
			print the number of connected components in the end
		"""
		sys.stderr.write("Getting protein interaction graph ...\n")
		declare_sql = "DECLARE crs0 CURSOR for select gene_id_array, interaction_type_id\
			from %s where tax_id=%s"%(prot_interaction_table, tax_id)
		curs.execute(declare_sql)
		graph = nx.XGraph()
		no_of_rows = 0
		while True:
			curs.execute("fetch 3000 from crs0")
			batch = curs.fetchall()
			if not batch:
				#cursor exhausted
				break
			for gene_id_array, interaction_type_id in batch:
				gene_id_ls = pg_1d_array2python_ls(gene_id_array)
				#arrays with fewer than two genes yield no pair, hence no edge
				for idx, gene_a in enumerate(gene_id_ls):
					for gene_b in gene_id_ls[idx+1:]:
						graph.add_edge(gene_a, gene_b, interaction_type_id)
				no_of_rows += 1
			if self.report:
				sys.stderr.write("%s%s"%('\x08'*20, no_of_rows))
		curs.execute("close crs0")
		no_of_nodes = nx.number_of_nodes(graph)
		no_of_edges = nx.number_of_edges(graph)
		no_of_components = nx.number_connected_components(graph)
		sys.stderr.write("%s nodes, %s edges and %s components, Done.\n"%(no_of_nodes,\
			no_of_edges, no_of_components))
		return graph
示例#3
0
	def get_prom_seq_from_entrezgene_mapping_table(self, curs, prom_seq_table, entrezgene_mapping_table='entrezgene_mapping', \
		annot_assembly_table = 'annot_assembly'):
		"""
		Derive the upstream region and the first intron of every gene and submit them.

		Genes are read from entrezgene_mapping_table joined with annot_assembly_table
		(e.genomic_gi=a.gi) through a server-side cursor, 10000 rows per fetch.
		For each gene whose strand is '1' or '-1':
			- the segment list comes from the cds arrays if both are present, else
			  from the mrna arrays, else it is the single [start, stop] pair;
			- the upstream window is the (at most) 10000 positions before the first
			  segment (plus strand) or after the last segment (minus strand); its far
			  end is then replaced by the value of self.return_closest_anchor().
			  It is submitted with prom_type_id=1;
			- the first intron is the gap between the first two (plus strand) or the
			  last two (minus strand) segments. It is submitted with prom_type_id=5.
		Genes with any other strand value are skipped and not counted.
		Both regions are handed to self.submit_to_prom_seq() together with the
		sequence returned by get_sequence_segment().

		When self.debug is set, only the first fetched batch is processed.
		The cursor is closed before returning so that the method can be called
		again within the same transaction. Nothing is returned.
		"""
		sys.stderr.write("Getting prom_seq from entrezgene_mapping_table...\n")
		curs.execute("DECLARE crs CURSOR FOR SELECT e.gene_id, e.genomic_gi, e.tax_id, a.chromosome, e.strand,\
			e.start, e.stop, e.mrna_start, e.mrna_stop, e.cds_start, e.cds_stop from %s e, %s a \
			where e.genomic_gi=a.gi"%(entrezgene_mapping_table, annot_assembly_table))
		curs.execute("fetch 10000 from crs")
		rows = curs.fetchall()
		counter = 0
		while rows:
			for row in rows:
				gene_id, genomic_gi, tax_id, chromosome, strand, start, stop, mrna_start, mrna_stop, cds_start, cds_stop = row
				#segment boundaries: prefer cds, then mrna, then the whole gene span
				seg_loc_ls = []
				if cds_start and cds_stop:
					cds_start = pg_1d_array2python_ls(cds_start, int)
					cds_stop = pg_1d_array2python_ls(cds_stop, int)
					for i in range(len(cds_start)):
						seg_loc_ls.append([cds_start[i],cds_stop[i]])
				elif mrna_start and mrna_stop:
					mrna_start = pg_1d_array2python_ls(mrna_start, int)
					mrna_stop = pg_1d_array2python_ls(mrna_stop, int)
					for i in range(len(mrna_start)):
						seg_loc_ls.append([mrna_start[i],mrna_stop[i]])
				else:
					seg_loc_ls.append([start, stop])
				seg_loc_ls.sort()	#some genes have reversed cds order
				#one attribute object per gene, reused for both submissions below
				ps_attr_instance = prom_seq_attr()
				ps_attr_instance.prom_acc = gene_id
				ps_attr_instance.chromosome = chromosome
				ps_attr_instance.organism = tax_id2org(tax_id)
				upstream_loc_ls = [0,0]
				instron_1st_loc_ls = []
				if strand=='1':	#plus strand
					ps_attr_instance.strand = '+'
					upstream_loc_ls[1] = seg_loc_ls[0][0]-1
					upstream_loc_ls[0] = upstream_loc_ls[1] - 9999
					if upstream_loc_ls[0]<1:	#in case exceed the chromosome boundary
						upstream_loc_ls[0] = 1
					#check whether there's gene upstream
					upstream_loc_ls[0] = self.return_closest_anchor(curs, 'stop', upstream_loc_ls, gene_id, tax_id, genomic_gi, \
						entrezgene_mapping_table)
					if upstream_loc_ls[0]>upstream_loc_ls[1]:	#No upstream
						if self.debug:
							sys.stderr.write("\tgene_id: %s no upstream\n"%gene_id)
						upstream_loc_ls = []
					if len(seg_loc_ls)>1:	#the first intron
						instron_1st_loc_ls.append(seg_loc_ls[0][1]+1)
						instron_1st_loc_ls.append(seg_loc_ls[1][0]-1)
				elif strand=='-1':	#minus strand
					ps_attr_instance.strand = '-'
					upstream_loc_ls[0] = seg_loc_ls[-1][1]+1
					upstream_loc_ls[1] = upstream_loc_ls[0] + 9999
					#NOTE: exceeding the chromosome boundary is taken care of by get_sequence_segment()
					#check whether there's gene upstream
					upstream_loc_ls[1] = self.return_closest_anchor(curs, 'start', upstream_loc_ls, gene_id, tax_id, genomic_gi, \
						entrezgene_mapping_table)
					if upstream_loc_ls[0]>upstream_loc_ls[1]:	#No upstream
						#NOTE(review): unlike the plus-strand branch this message is not guarded by self.debug -- confirm which is intended
						sys.stderr.write("\tgene_id: %s no upstream\n"%gene_id)
						upstream_loc_ls = []
					if len(seg_loc_ls)>1:	#the first intron
						instron_1st_loc_ls.append(seg_loc_ls[-2][1]+1)
						instron_1st_loc_ls.append(seg_loc_ls[-1][0]-1)
				else:	#ignore genes with no strand info, some are not real genes
					continue
				
				#1st deal with upstream_loc_ls
				if upstream_loc_ls:
					ps_attr_instance.prom_genome_start = upstream_loc_ls[0]
					ps_attr_instance.prom_genome_end = upstream_loc_ls[1]
					ps_attr_instance.prom_type_id = 1
					ps_attr_instance.sequence = get_sequence_segment(curs, genomic_gi, upstream_loc_ls[0], upstream_loc_ls[1])
					self.submit_to_prom_seq(curs, prom_seq_table, ps_attr_instance)
				#2nd handle instron_1st_loc_ls, might not exist
				if instron_1st_loc_ls:
					if instron_1st_loc_ls[0]>instron_1st_loc_ls[1]:
						#only a warning; the region is still submitted below
						sys.stderr.write("\tgene_id: %s weird 1st intron %s.\n"%(gene_id, instron_1st_loc_ls))
					ps_attr_instance.prom_genome_start = instron_1st_loc_ls[0]
					ps_attr_instance.prom_genome_end = instron_1st_loc_ls[1]
					ps_attr_instance.prom_type_id = 5
					ps_attr_instance.sequence = get_sequence_segment(curs, genomic_gi, instron_1st_loc_ls[0], instron_1st_loc_ls[1])
					self.submit_to_prom_seq(curs, prom_seq_table, ps_attr_instance)
				counter += 1
			if self.report:
				sys.stderr.write("%s\t%s"%('\x08'*20, counter))
			if self.debug:	#enough
				break
			curs.execute("fetch 10000 from crs")
			rows = curs.fetchall()
		#release the server-side cursor (also reached after the debug break);
		#otherwise a second DECLARE of 'crs' in the same transaction fails
		curs.execute("close crs")
		sys.stderr.write("Done getting prom_seq from entrezgene_mapping_table.\n")