Exemplo n.º 1
0
	def run(self):
		"""
		2007-03-20
		2007-04-03

		Obtain the data matrix and display its heterozygous and coarse views.

		When self.draw_only is set, the matrix is read back from
		self.output_fname and converted to a Numeric array. Otherwise the
		strain and SNP accessions are taken from self.input_fname, the matrix
		is fetched through dbSNP2data.get_data_matrix() and written to
		self.output_fname before it is displayed.
		"""
		from FilterStrainSNPMatrix import FilterStrainSNPMatrix
		matrix_io = FilterStrainSNPMatrix()
		if not self.draw_only:
			conn, curs = db_connect(self.hostname, self.dbname, self.schema)
			# the matrix stored in the input file is replaced below; only its labels are reused
			header, strain_acc_list, category_list, data_matrix = matrix_io.read_data(self.input_fname)

			# header[2:] skips the first two header fields (presumably strain and category labels)
			strain_id2index = self.get_id2index(curs, self.strain_info_table, strain_acc_list)
			snp_id2index = self.get_id2index(curs, self.snp_locus_table, header[2:])

			from dbSNP2data import dbSNP2data
			matrix_fetcher = dbSNP2data(report=self.report)
			data_matrix = matrix_fetcher.get_data_matrix(
				curs, strain_id2index, snp_id2index, nt2number,
				self.data_table, need_heterozygous_call=1)

			matrix_io.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list)
		else:
			header, strain_acc_list, category_list, data_matrix = matrix_io.read_data(self.output_fname)
			data_matrix = Numeric.array(data_matrix)

		derived_matrices = self.get_heterozygous_and_coarse_data_matrix(data_matrix)
		heterozygous_data_matrix, coarse_data_matrix = derived_matrices
		self.displayDataMatrix(heterozygous_data_matrix, title='heterozygous_data_matrix, 5-10=hetero, else=0')
		self.displayDataMatrix(coarse_data_matrix, title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero')
		# block until the user presses Enter (presumably to keep the displays visible)
		raw_input("enter")
Exemplo n.º 2
0
	def run(self):
		"""
		2007-07-12
		2007-07-17

		Collect SNP and strain data from MySQL and pass them to the output
		routine registered in self.OutputPop_dict under self.output_type.

		Steps:
			1. SNP ids, their column indices and accessions come from
			   dbSNP2data.get_snp_id2index_m().
			2. Strains are chosen per population from self.population_table and
			   self.popid2snpid_table, subject to self.min_no_of_strains_per_pop.
			3. The data matrix is fetched with heterozygous calls enabled.
			4. Everything is handed to the selected output function.

		The database connection is closed when the method finishes, whether or
		not an error occurred.
		"""
		from dbSNP2data import dbSNP2data
		# 'whatever' is a dummy output_fname; only dbSNP2data's helper methods are called here
		dbSNP2data_instance = dbSNP2data(user=self.user, passwd=self.passwd, output_fname='whatever')
		
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname, user = self.user, passwd = self.passwd)
		try:
			curs = conn.cursor()
			
			snp_id2index, snp_id_list, snp_id2info = dbSNP2data_instance.get_snp_id2index_m(curs, self.input_table, self.snp_locus_table)
			# the first field of each snp_id2info entry is used as the SNP accession
			snp_acc_list = [snp_id2info[snp_id][0] for snp_id in snp_id_list]
			
			popid2strain_id_snp_id_ls = self.get_popid2strain_id_snp_id_ls(curs, self.population_table, self.popid2snpid_table)
			strain_id2index, strain_id_list = self.get_strain_id2index(popid2strain_id_snp_id_ls, self.min_no_of_strains_per_pop)
			
			strain_id2acc, strain_id2category = dbSNP2data_instance.get_strain_id_info_m(curs, strain_id_list, self.strain_info_table)
			
			data_matrix = dbSNP2data_instance.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number, self.input_table, need_heterozygous_call=1)
			
			output_func = self.OutputPop_dict[self.output_type]
			output_func(data_matrix, popid2strain_id_snp_id_ls, strain_id2index, self.output_fname, snp_id2index, strain_id2acc,
				strain_id2category, snp_acc_list, self.with_header_line, self.nt_alphabet)
		finally:
			# release the connection even if a query or the output step fails
			conn.close()
	def remove_identity_strains(self, data_matrix, rows_to_be_checked, cols_to_be_checked):
		"""
		2009-2-18
			class "dbSNP2data" has a few non-null arguments. feed it during initialization
		2007-04-16
			the similarity graph structure complicated the issue
			bug found by Chris Toomajian
			Now use the greedy graph algorithm to remove identity strains.
		2007-09-13
			remove parameter strain_index2no_of_NAs
		"""
		sys.stderr.write("Searching for identity strains ...")
		rows_to_be_checked_ls = list(rows_to_be_checked)
		rows_to_be_checked_ls.sort()	#from small to big
		if self.debug:
			import pdb
			pdb.set_trace()
		no_of_total_cols_to_be_checked = len(cols_to_be_checked)
		identity_pair_ls = []	#2007-04-16
		for i in range(len(rows_to_be_checked_ls)):
			row1_index = rows_to_be_checked_ls[i]	#watch this
			for j in rows_to_be_checked_ls[i+1:]:
				no_of_same_cols = 0
				for k in cols_to_be_checked:
					if data_matrix[row1_index][k] == data_matrix[j][k] or data_matrix[row1_index][k]==0 or data_matrix[j][k]==0:
						no_of_same_cols += 1
				if no_of_same_cols == no_of_total_cols_to_be_checked:
					identity_pair_ls.append([row1_index, j])
		if self.debug:
			import pdb
			pdb.set_trace()
		sys.stderr.write("done.\n")
		sys.stderr.write("Removing identity strains ...")
		import networkx as nx
		g = nx.Graph()
		g.add_edges_from(identity_pair_ls)
		from dbSNP2data import dbSNP2data
		dbSNP2data_instance = dbSNP2data(user='******', passwd='secret', output_fname='/tmp/nothing')	#dbSNP2data has a few non-null arguments.
		vertex_list_to_be_deleted = dbSNP2data_instance.find_smallest_vertex_set_to_remove_all_edges(g)
		identity_strains_to_be_removed = set(vertex_list_to_be_deleted)
		"""
		#2007-04-16 useless
		identity_strains_to_be_removed = set()
		for src, tg_list in src2tg_list.iteritems():
			strain_with_least_NA = src
			least_no_of_NAs = strain_index2no_of_NAs[src]
			identity_strains_to_be_removed.add(src)	#add in src
			for tg in tg_list:
				identity_strains_to_be_removed.add(tg)	#add in tg
				if strain_index2no_of_NAs[tg] < least_no_of_NAs:
					strain_with_least_NA = tg
					least_no_of_NAs = strain_index2no_of_NAs[tg]
			identity_strains_to_be_removed.remove(strain_with_least_NA)	#remove the one with least NAs
		"""
		if self.debug:
			print
			print 'identity_strains_to_be_removed'
			print identity_strains_to_be_removed
		sys.stderr.write("%s identity strains, done.\n"%(len(identity_strains_to_be_removed)))
		return identity_strains_to_be_removed
Exemplo n.º 4
0
	def test_find_smallest_vertex_set_to_remove_all_edges(self):
		from dbSNP2data import dbSNP2data
		identity_pair_ls = [[1,2],[2,3],[2,4],[4,5]]
		import networkx as nx
		g = nx.Graph()
		g.add_edges_from(identity_pair_ls)
		from dbSNP2data import dbSNP2data
		dbSNP2data_instance = dbSNP2data()
		#import pdb
		#pdb.set_trace()
		vertex_list_to_be_deleted = dbSNP2data_instance.find_smallest_vertex_set_to_remove_all_edges(g)
		print 'graph'
		print identity_pair_ls
		print 'vertex_list_to_be_deleted'
		print vertex_list_to_be_deleted
Exemplo n.º 5
0
 def test_find_smallest_vertex_set_to_remove_all_edges(self):
     from dbSNP2data import dbSNP2data
     identity_pair_ls = [[1, 2], [2, 3], [2, 4], [4, 5]]
     import networkx as nx
     g = nx.Graph()
     g.add_edges_from(identity_pair_ls)
     from dbSNP2data import dbSNP2data
     dbSNP2data_instance = dbSNP2data()
     #import pdb
     #pdb.set_trace()
     vertex_list_to_be_deleted = dbSNP2data_instance.find_smallest_vertex_set_to_remove_all_edges(
         g)
     print 'graph'
     print identity_pair_ls
     print 'vertex_list_to_be_deleted'
     print vertex_list_to_be_deleted
Exemplo n.º 6
0
    def run(self):
        """
        2007-07-12
        2007-07-17

        Gather SNP and strain information from MySQL and call the output
        function stored in self.OutputPop_dict for self.output_type.

        The SNP index and accessions come from
        dbSNP2data.get_snp_id2index_m(); the strains come from the
        per-population selection in self.population_table and
        self.popid2snpid_table, filtered by self.min_no_of_strains_per_pop.
        The data matrix is fetched with heterozygous calls enabled.

        The database connection is always closed before returning, including
        when an exception propagates.
        """
        from dbSNP2data import dbSNP2data
        # 'whatever' is a dummy output_fname; only helper methods of dbSNP2data are used here
        dbSNP2data_instance = dbSNP2data(user=self.user,
                                         passwd=self.passwd,
                                         output_fname='whatever')

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        try:
            curs = conn.cursor()

            snp_id2index, snp_id_list, snp_id2info = dbSNP2data_instance.get_snp_id2index_m(
                curs, self.input_table, self.snp_locus_table)
            # the first field of each snp_id2info entry is used as the SNP accession
            snp_acc_list = [snp_id2info[snp_id][0] for snp_id in snp_id_list]

            popid2strain_id_snp_id_ls = self.get_popid2strain_id_snp_id_ls(
                curs, self.population_table, self.popid2snpid_table)
            strain_id2index, strain_id_list = self.get_strain_id2index(
                popid2strain_id_snp_id_ls, self.min_no_of_strains_per_pop)

            strain_id2acc, strain_id2category = dbSNP2data_instance.get_strain_id_info_m(
                curs, strain_id_list, self.strain_info_table)

            data_matrix = dbSNP2data_instance.get_data_matrix_m(
                curs,
                strain_id2index,
                snp_id2index,
                nt2number,
                self.input_table,
                need_heterozygous_call=1)

            output_func = self.OutputPop_dict[self.output_type]
            output_func(data_matrix, popid2strain_id_snp_id_ls,
                        strain_id2index, self.output_fname, snp_id2index,
                        strain_id2acc, strain_id2category, snp_acc_list,
                        self.with_header_line, self.nt_alphabet)
        finally:
            # do not leak the connection if a query or the output step raises
            conn.close()
Exemplo n.º 7
0
	def run(self):
		"""
		Select strains and SNPs population by population and optionally store
		the selection in the database.

		For every population returned by OutputPopulation.get_popid2ecotypeid_ls()
		that has at least self.min_no_of_strains_per_pop ecotypes, a sub-matrix
		is built with self.create_sub_data_matrix() and passed to
		self.cleanup_one_population(). Populations for which both a strain
		selection and a SNP selection come back are recorded.

		When self.commit is set, the results are written via
		create_popid2snpid_table(), mark_strain_id_selected() and
		submit_popid2snpid_list(), and the transaction is committed.

		The database connection is closed before returning, on success and on
		error alike.
		"""
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname, user = self.user, passwd = self.passwd)
		try:
			curs = conn.cursor()
			
			from dbSNP2data import dbSNP2data
			# 'whatever' is a dummy output_fname; only dbSNP2data's helper methods are called here
			dbSNP2data_instance = dbSNP2data(user=self.user, passwd=self.passwd, output_fname='whatever')
			
			snp_id2index, snp_id_list, snp_id2info = dbSNP2data_instance.get_snp_id2index_m(curs, self.input_table, self.snp_locus_table)
			strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category  = dbSNP2data_instance.get_strain_id2index_m(curs, self.input_table, self.strain_info_table)
			#2008-06-02 stuff returned by get_strain_id2index_m is totally changed.
			# map each strain's accession (used as the ecotype id here) to its matrix row
			ecotype_id2row_index = {}
			for strain_id, acc in strain_id2acc.iteritems():
				row_index = strain_id2index[strain_id]
				ecotype_id2row_index[acc] = row_index
			
			# NOTE(review): snp_id2acc is not used below; the call is kept in case it matters -- confirm and remove
			snp_id2acc = dbSNP2data_instance.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table)
			data_matrix = dbSNP2data_instance.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number, self.input_table, need_heterozygous_call=1)
			
			from OutputPopulation import OutputPopulation
			
			popid2ecotypeid_ls = OutputPopulation.get_popid2ecotypeid_ls(curs, self.population_table)
			from FilterStrainSNPMatrix import FilterStrainSNPMatrix
			FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
			
			from RemoveBadSNPs import RemoveBadSNPs
			RemoveBadSNPs_instance = RemoveBadSNPs()
			popid2strain_id_snp_id_ls = {}
			for popid, ecotypeid_ls in popid2ecotypeid_ls.iteritems():
				if len(ecotypeid_ls)>=self.min_no_of_strains_per_pop:
					sys.stderr.write("Population %s\n"%popid)
					sub_data_matrix, new_ecotypeid_ls = self.create_sub_data_matrix(popid, data_matrix, ecotypeid_ls, ecotype_id2row_index)
					# re-check the size: create_sub_data_matrix may return fewer ecotypes than it was given
					if len(new_ecotypeid_ls)>=self.min_no_of_strains_per_pop:
						sys.stderr.write("\tPopulation %s has %s strains\n"%(popid, len(new_ecotypeid_ls)))
						strain_id_selected, snp_id_selected = self.cleanup_one_population(FilterStrainSNPMatrix_instance, RemoveBadSNPs_instance, sub_data_matrix, new_ecotypeid_ls, snp_id_list, self.min_no_of_strains_per_pop, self.row_cutoff, self.col_cutoff, self.min_log_prob)
						if strain_id_selected and snp_id_selected:
							popid2strain_id_snp_id_ls[popid] = [strain_id_selected, snp_id_selected]
			
			if self.commit:
				self.create_popid2snpid_table(curs, self.output_table)
				self.mark_strain_id_selected(curs, popid2strain_id_snp_id_ls, self.population_table)
				self.submit_popid2snpid_list(curs, popid2strain_id_snp_id_ls, self.population_table, self.output_table)
				conn.commit()
		finally:
			# release the connection on every path, including errors before the commit
			conn.close()
    def run(self):
        """
        2007-03-20
        2007-04-03

        Get the data matrix, then display the heterozygous and coarse
        matrices derived from it.

        With self.draw_only set, the matrix is reloaded from
        self.output_fname and turned into a Numeric array. Without it, the
        strain and SNP accessions are read from self.input_fname, the matrix
        is fetched with dbSNP2data.get_data_matrix() and saved to
        self.output_fname before being displayed.
        """
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        matrix_file = FilterStrainSNPMatrix()
        if not self.draw_only:
            conn, curs = db_connect(self.hostname, self.dbname, self.schema)
            # the matrix read here is overwritten below; only the labels are kept
            header, strain_acc_list, category_list, data_matrix = matrix_file.read_data(
                self.input_fname)

            # header[2:] drops the first two header fields (presumably strain and category labels)
            strain_id2index = self.get_id2index(
                curs, self.strain_info_table, strain_acc_list)
            snp_id2index = self.get_id2index(
                curs, self.snp_locus_table, header[2:])

            from dbSNP2data import dbSNP2data
            db_reader = dbSNP2data(report=self.report)
            data_matrix = db_reader.get_data_matrix(
                curs, strain_id2index, snp_id2index, nt2number,
                self.data_table, need_heterozygous_call=1)

            matrix_file.write_data_matrix(
                data_matrix, self.output_fname, header, strain_acc_list,
                category_list)
        else:
            header, strain_acc_list, category_list, data_matrix = matrix_file.read_data(
                self.output_fname)
            data_matrix = Numeric.array(data_matrix)

        matrix_pair = self.get_heterozygous_and_coarse_data_matrix(data_matrix)
        heterozygous_data_matrix, coarse_data_matrix = matrix_pair
        self.displayDataMatrix(
            heterozygous_data_matrix,
            title='heterozygous_data_matrix, 5-10=hetero, else=0')
        self.displayDataMatrix(
            coarse_data_matrix,
            title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero')
        # wait for Enter before returning (presumably so the displays stay visible)
        raw_input("enter")
Exemplo n.º 9
0
    def remove_identity_strains(self, data_matrix, rows_to_be_checked, cols_to_be_checked):
        """
		2009-2-18
			class "dbSNP2data" has a few non-null arguments. feed it during initialization
		2007-04-16
			the similarity graph structure complicated the issue
			bug found by Chris Toomajian
			Now use the greedy graph algorithm to remove identity strains.
		2007-09-13
			remove parameter strain_index2no_of_NAs
		"""
        sys.stderr.write("Searching for identity strains ...")
        rows_to_be_checked_ls = list(rows_to_be_checked)
        rows_to_be_checked_ls.sort()  # from small to big
        if self.debug:
            import pdb

            pdb.set_trace()
        no_of_total_cols_to_be_checked = len(cols_to_be_checked)
        identity_pair_ls = []  # 2007-04-16
        for i in range(len(rows_to_be_checked_ls)):
            row1_index = rows_to_be_checked_ls[i]  # watch this
            for j in rows_to_be_checked_ls[i + 1 :]:
                no_of_same_cols = 0
                for k in cols_to_be_checked:
                    if (
                        data_matrix[row1_index][k] == data_matrix[j][k]
                        or data_matrix[row1_index][k] == 0
                        or data_matrix[j][k] == 0
                    ):
                        no_of_same_cols += 1
                if no_of_same_cols == no_of_total_cols_to_be_checked:
                    identity_pair_ls.append([row1_index, j])
        if self.debug:
            import pdb

            pdb.set_trace()
        sys.stderr.write("done.\n")
        sys.stderr.write("Removing identity strains ...")
        import networkx as nx

        g = nx.Graph()
        g.add_edges_from(identity_pair_ls)
        from dbSNP2data import dbSNP2data

        dbSNP2data_instance = dbSNP2data(
            user="******", passwd="secret", output_fname="/tmp/nothing"
        )  # dbSNP2data has a few non-null arguments.
        vertex_list_to_be_deleted = dbSNP2data_instance.find_smallest_vertex_set_to_remove_all_edges(g)
        identity_strains_to_be_removed = Set(vertex_list_to_be_deleted)
        """
		#2007-04-16 useless
		identity_strains_to_be_removed = Set()
		for src, tg_list in src2tg_list.iteritems():
			strain_with_least_NA = src
			least_no_of_NAs = strain_index2no_of_NAs[src]
			identity_strains_to_be_removed.add(src)	#add in src
			for tg in tg_list:
				identity_strains_to_be_removed.add(tg)	#add in tg
				if strain_index2no_of_NAs[tg] < least_no_of_NAs:
					strain_with_least_NA = tg
					least_no_of_NAs = strain_index2no_of_NAs[tg]
			identity_strains_to_be_removed.remove(strain_with_least_NA)	#remove the one with least NAs
		"""
        if self.debug:
            print
            print "identity_strains_to_be_removed"
            print identity_strains_to_be_removed
        sys.stderr.write("%s identity strains, done.\n" % (len(identity_strains_to_be_removed)))
        return identity_strains_to_be_removed