Пример #1
0
    def submit2db(self, curs, experiment_id2accession_id2FT,
                  experiment_id2data, method_table, phenotype_table,
                  phenotype_avg_table):
        """
        Store methods, per-replicate values and per-accession averages.

        2008-04-20
            check if a method is already in method_table before insertion.
        2008-03-01
            submit to method_table and phenotype_table
            ... need the accession_id2ecotype_id ...

        Arguments:
            curs -- DB-API cursor. Values are bound with the "%s"
                placeholder style (the style MySQLdb uses).
            experiment_id2accession_id2FT -- mapping
                experiment id -> accession id -> list of (FT, replicate).
            experiment_id2data -- mapping experiment id -> sequence whose
                first item is stored as the method's short_name.
            method_table, phenotype_table, phenotype_avg_table -- names of
                the tables written to. They are interpolated into the SQL
                text and therefore must come from trusted configuration.

        Raises KeyError when an accession id with measurements has no entry
        in the accession id -> ecotype id map.
        """
        # Table names cannot be bound as parameters, so they are the only
        # thing interpolated here; "%%s" survives as the "%s" placeholder
        # that the driver fills in (and escapes) at execute() time.
        method_select = (
            "select id, short_name from %s where id=%%s and short_name=%%s"
            % method_table)
        method_insert = (
            "insert into %s(id, short_name) values (%%s, %%s)" % method_table)
        phenotype_insert = (
            "insert into %s(ecotype_id, value, replicate, method_id) values (%%s, %%s, %%s, %%s)"
            % phenotype_table)
        phenotype_avg_insert = (
            "insert into %s(ecotype_id, value, stdev, sample_size, method_id) values (%%s, %%s, %%s, %%s, %%s)"
            % phenotype_avg_table)

        sys.stderr.write("Submitting experiment_id2data to %s ... " %
                         method_table)
        no_of_methods_submitted = 0
        for experiment_id, data in experiment_id2data.items():
            method_row = (experiment_id, data[0])
            curs.execute(method_select, method_row)
            if not curs.fetchall():
                # only insert if the method_table doesn't have these data.
                curs.execute(method_insert, method_row)
                no_of_methods_submitted += 1
        sys.stderr.write("%s methods inserted. Done.\n" %
                         (no_of_methods_submitted))

        sys.stderr.write(
            "Submitting experiment_id2accession_id2FT to %s and %s ... " %
            (phenotype_table, phenotype_avg_table))
        from variation.src.common import map_accession_id2ecotype_id
        accession_id2ecotype_id = map_accession_id2ecotype_id(curs)
        for expt_id, accession_id2FT in experiment_id2accession_id2FT.items():
            for accession_id, FT_rep_ls in accession_id2FT.items():
                if not FT_rep_ls:
                    # Nothing measured: there is no replicate row to insert
                    # and an average over zero values would be nan.
                    continue
                ecotype_id = accession_id2ecotype_id[accession_id]
                for FT, replicate in FT_rep_ls:
                    curs.execute(phenotype_insert,
                                 (ecotype_id, FT, replicate, expt_id))

                FT_ls = [FT for FT, replicate in FT_rep_ls]
                sample_size = len(FT_ls)
                # Plain floats so the driver does not have to know about
                # numpy scalar types.
                avg_FT = float(numpy.average(FT_ls))
                if sample_size > 1:
                    # NOTE: numpy.std is used with its defaults.
                    std_FT = float(numpy.std(FT_ls))
                else:
                    # None is sent to the database as NULL.
                    std_FT = None
                curs.execute(
                    phenotype_avg_insert,
                    (ecotype_id, avg_FT, std_FT, sample_size, expt_id))
        sys.stderr.write("Done.\n")
Пример #2
0
	def submit2db(self, curs, experiment_id2accession_id2FT, experiment_id2data, method_table, phenotype_table, phenotype_avg_table):
		"""
		Store methods, per-replicate values and per-accession averages.

		2008-04-20
			check if a method is already in method_table before insertion.
		2008-03-01
			submit to method_table and phenotype_table
			... need the accession_id2ecotype_id ...

		Arguments:
			curs -- DB-API cursor. Values are bound with the "%s" placeholder style (the style MySQLdb uses).
			experiment_id2accession_id2FT -- mapping experiment id -> accession id -> list of (FT, replicate).
			experiment_id2data -- mapping experiment id -> sequence whose first item is stored as the method's short_name.
			method_table, phenotype_table, phenotype_avg_table -- names of the tables written to.
				They are interpolated into the SQL text and therefore must come from trusted configuration.

		Raises KeyError when an accession id with measurements has no entry in the accession id -> ecotype id map.
		"""
		# Table names cannot be bound as parameters, so they are the only thing interpolated here;
		# "%%s" survives as the "%s" placeholder that the driver fills in (and escapes) at execute() time.
		method_select = "select id, short_name from %s where id=%%s and short_name=%%s"%method_table
		method_insert = "insert into %s(id, short_name) values (%%s, %%s)"%method_table
		phenotype_insert = "insert into %s(ecotype_id, value, replicate, method_id) values (%%s, %%s, %%s, %%s)"%phenotype_table
		phenotype_avg_insert = "insert into %s(ecotype_id, value, stdev, sample_size, method_id) values (%%s, %%s, %%s, %%s, %%s)"%phenotype_avg_table

		sys.stderr.write("Submitting experiment_id2data to %s ... "%method_table)
		no_of_methods_submitted = 0
		for experiment_id, data in experiment_id2data.items():
			method_row = (experiment_id, data[0])
			curs.execute(method_select, method_row)
			if not curs.fetchall():	#only insert if the method_table doesn't have these data.
				curs.execute(method_insert, method_row)
				no_of_methods_submitted += 1
		sys.stderr.write("%s methods inserted. Done.\n"%(no_of_methods_submitted))

		sys.stderr.write("Submitting experiment_id2accession_id2FT to %s and %s ... "%(phenotype_table, phenotype_avg_table))
		from variation.src.common import map_accession_id2ecotype_id
		accession_id2ecotype_id = map_accession_id2ecotype_id(curs)
		for expt_id, accession_id2FT in experiment_id2accession_id2FT.items():
			for accession_id, FT_rep_ls in accession_id2FT.items():
				if not FT_rep_ls:
					# Nothing measured: there is no replicate row to insert and an average over zero values would be nan.
					continue
				ecotype_id = accession_id2ecotype_id[accession_id]
				for FT, replicate in FT_rep_ls:
					curs.execute(phenotype_insert, (ecotype_id, FT, replicate, expt_id))

				FT_ls = [FT for FT, replicate in FT_rep_ls]
				sample_size = len(FT_ls)
				# Plain floats so the driver does not have to know about numpy scalar types.
				avg_FT = float(numpy.average(FT_ls))
				if sample_size>1:
					# NOTE: numpy.std is used with its defaults.
					std_FT = float(numpy.std(FT_ls))
				else:
					std_FT = None	#None is sent to the database as NULL
				curs.execute(phenotype_avg_insert, (ecotype_id, avg_FT, std_FT, sample_size, expt_id))
		sys.stderr.write("Done.\n")
Пример #3
0
	def run(self):
		"""
		Build a strain X SNP data matrix from the database and write it to self.output_fname.

		Connects to MySQL with self.dbname, self.hostname, self.user and self.passwd;
		the connection is closed again on every exit path.

		self.processing_bits selects what is produced:
			[0]: 0 -> 1st column is ecotype_id (rows whose accession has no ecotype id are tossed out),
				anything else -> 1st column is accession_id.
			[1]: data type handed to setup_row_dstruc. 0 and 1 are supported; anything else exits with status 2.
			[2]: 0 -> 2nd column is 'duplicate' (always 1), anything else -> 2nd column is accession_name.
			[3]: 0, 1 or 2 selects the table(s) setup_SNP_dstruc2 reads; anything else exits with status 3.
		"""
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd = self.passwd)
		try:
			curs = conn.cursor()
			if self.debug:
				import pdb
				pdb.set_trace()
			#2008-02-08 the old way to get 2010 data was from raw alignments
			#(get_alignment_id2positions_to_be_checked_ls, get_SNPpos_snpacc_ls, setup_SNP_dstruc,
			#setup_accession_ecotype_dstruc, get_accession_X_snp_matrix). Dropped because all SNPs are put into db.
			snp_source_bit = self.processing_bits[3]
			data_type = self.processing_bits[1]
			if snp_source_bit==0:
				#2009-2-12 will be a problem if snp_locus_table doesn't have field offset
				SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(curs, self.snp_locus_table, offset=self.offset)
			elif snp_source_bit==1:
				SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(curs, self.snp_locus_table, \
					cross_linking_table=self.data_type2data_table[data_type], \
					offset=self.offset)
			elif snp_source_bit==2:
				SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(curs, self.data_type2data_table[data_type], \
					offset=self.offset)
			else:
				sys.stderr.write("Error: unsupported 3rd bit in processing_bits %s.\n"%self.processing_bits[3])
				sys.exit(3)
			from variation.src.common import get_accession_id2name
			accession_id2name = get_accession_id2name(curs)
			if data_type==0:
				row_id2dstruc = self.setup_row_dstruc(curs, SNPpos2col_index, accession_id2name, offset=self.offset, version=self.version)
			elif data_type==1:
				from variation.src.common import map_perlegen_ecotype_name2accession_id
				ecotype_name2accession_id = map_perlegen_ecotype_name2accession_id(curs)
				row_id2dstruc = self.setup_row_dstruc(curs, SNPpos2col_index, accession_id2name, data_type=data_type, \
					ecotype_name2accession_id=ecotype_name2accession_id,\
					offset=self.offset, version=self.version)
			else:
				#sys.stderr is a file object, not a callable; it has to be written to.
				sys.stderr.write("Unsupported data type: %s or no ecotype_name2accession_id specified.\n"%self.processing_bits[1])
				sys.exit(2)
			accession_id_ls, accession_name_ls, data_matrix = self.transform_row_id2dstruc_2_matrix(row_id2dstruc)

			from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
			FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()

			#2008-02-08 which type of row id/1st column
			rows_to_be_tossed_out = set()
			if self.processing_bits[0]==0:
				from variation.src.common import map_accession_id2ecotype_id
				accession_id2ecotype_id = map_accession_id2ecotype_id(curs, accession2ecotype_table=self.accession2ecotype_table)
				accession_id2ecotype_id[99] = 6909	#accession 99 is the reference genome, which col-0 (ecotype_id=6909)
				ecotype_id_ls = []
				for row_index, accession_id in enumerate(accession_id_ls):
					ecotype_id = accession_id2ecotype_id.get(accession_id)
					if not ecotype_id:	#mapping failed
						rows_to_be_tossed_out.add(row_index)
					#appended even when the mapping failed so the list stays aligned with the matrix rows
					ecotype_id_ls.append(ecotype_id)
				strain_acc_list = ecotype_id_ls
				header = ['ecotype_id']	#1st column in the header
			else:
				strain_acc_list = accession_id_ls
				header = ['accession_id']
			#2008-02-08 which type of 2nd column
			if self.processing_bits[2]==0:
				category_list = [1]*len(accession_name_ls)
				header.append('duplicate')	#2nd column in the header
			else:
				#value 1 and every other value are treated alike: the accession name is the 2nd column
				category_list = accession_name_ls
				header.append('accession_name')

			header += snp_acc_ls
			FilterStrainSNPMatrix_instance.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out=rows_to_be_tossed_out)
		finally:
			#also runs when sys.exit() is raised above, so the connection never leaks
			conn.close()