def submit2db(self, curs, experiment_id2accession_id2FT, experiment_id2data, method_table, phenotype_table, phenotype_avg_table):
    """
    Submit phenotype experiment data to the database.

    2008-04-20
        check if a method is already in method_table before insertion.
    2008-03-01
        submit to method_table and phenotype_table
        need the accession_id2ecotype_id

    Parameters:
        curs: DB-API cursor (MySQLdb).
        experiment_id2accession_id2FT: {experiment_id: {accession_id: [(FT, replicate), ...]}}
        experiment_id2data: {experiment_id: (short_name, ...)} -- only data[0] is used.
        method_table, phenotype_table, phenotype_avg_table: destination table names.

    FIX: values are now passed as DB-API parameters instead of being interpolated
    into the SQL string.  The old "short_name='%s'" form produced broken SQL (and
    an injection hole) whenever short_name contained a quote.  Table names cannot
    be parameterized, so they remain interpolated; they come from code, not users.
    """
    sys.stderr.write("Submitting experiment_id2data to %s ... " % method_table)
    no_of_methods_submitted = 0
    for experiment_id, data in experiment_id2data.iteritems():
        curs.execute("select id, short_name from %s where id=%%s and short_name=%%s" % method_table,
                     (experiment_id, data[0]))
        rows = curs.fetchall()
        if not rows:  # only insert if the method_table doesn't have these data.
            curs.execute("insert into %s(id, short_name) values (%%s, %%s)" % method_table,
                         (experiment_id, data[0]))
            no_of_methods_submitted += 1
    sys.stderr.write("%s methods inserted. Done.\n" % (no_of_methods_submitted))

    sys.stderr.write("Submitting experiment_id2accession_id2FT to %s and %s ... " % (phenotype_table, phenotype_avg_table))
    from variation.src.common import map_accession_id2ecotype_id
    accession_id2ecotype_id = map_accession_id2ecotype_id(curs)
    for expt_id in experiment_id2accession_id2FT:
        for accession_id in experiment_id2accession_id2FT[expt_id]:
            FT_rep_ls = experiment_id2accession_id2FT[expt_id][accession_id]
            ecotype_id = accession_id2ecotype_id[accession_id]
            # one row per replicate measurement
            for FT, replicate in FT_rep_ls:
                curs.execute("insert into %s(ecotype_id, value, replicate, method_id) values (%%s, %%s, %%s, %%s)" % phenotype_table,
                             (ecotype_id, FT, replicate, expt_id))
            FT_ls = [row[0] for row in FT_rep_ls]
            avg_FT = numpy.average(FT_ls)
            if len(FT_ls) > 1:
                std_FT = numpy.std(FT_ls)
            else:
                # stdev of a single measurement is undefined; None -> SQL NULL
                # via the driver (replaces the old raw-'NULL' string hack).
                std_FT = None
            curs.execute("insert into %s(ecotype_id, value, stdev, sample_size, method_id) values (%%s, %%s, %%s, %%s, %%s)" % phenotype_avg_table,
                         (ecotype_id, avg_FT, std_FT, len(FT_ls), expt_id))
    sys.stderr.write("Done.\n")
def submit2db(self, curs, experiment_id2accession_id2FT, experiment_id2data, method_table, phenotype_table, phenotype_avg_table):
    """
    Push experiment metadata and per-accession phenotype measurements into the db.

    2008-04-20
        check if a method is already in method_table before insertion.
    2008-03-01
        submit to method_table and phenotype_table
        need the accession_id2ecotype_id
    """
    sys.stderr.write("Submitting experiment_id2data to %s ... "%method_table)
    method_count = 0
    for expt_id, expt_data in experiment_id2data.iteritems():
        curs.execute("select id, short_name from %s where id=%s and short_name='%s'"%(method_table, expt_id, expt_data[0]))
        if not curs.fetchall():
            # method not present yet -> insert it
            curs.execute("insert into %s(id, short_name) values (%s, '%s')"%(method_table, expt_id, expt_data[0]))
            method_count += 1
    sys.stderr.write("%s methods inserted. Done.\n"%(method_count))

    sys.stderr.write("Submitting experiment_id2accession_id2FT to %s and %s ... "%(phenotype_table, phenotype_avg_table))
    from variation.src.common import map_accession_id2ecotype_id
    accession_id2ecotype_id = map_accession_id2ecotype_id(curs)
    for method_id, accession_id2FT in experiment_id2accession_id2FT.iteritems():
        for accession_id, FT_rep_ls in accession_id2FT.iteritems():
            ecotype_id = accession_id2ecotype_id[accession_id]
            FT_ls = []
            # one row per replicate, collecting the raw values as we go
            for FT, replicate in FT_rep_ls:
                curs.execute("insert into %s(ecotype_id, value, replicate, method_id) values (%s, %s, %s, %s)"%\
                    (phenotype_table, ecotype_id, FT, replicate, method_id))
                FT_ls.append(FT)
            avg_FT = numpy.average(FT_ls)
            if len(FT_ls) > 1:
                std_FT = numpy.std(FT_ls)
            else:
                std_FT = 'NULL'    #for mySQL db submission
            curs.execute("insert into %s(ecotype_id, value, stdev, sample_size, method_id) values (%s, %s, %s, %s, %s)"%\
                (phenotype_avg_table, ecotype_id, avg_FT, std_FT, len(FT_ls), method_id))
    sys.stderr.write("Done.\n")
def run(self):
    """
    Main entry point: connect to MySQL, build the SNP column structure according
    to processing_bits[3], build the row structure according to processing_bits[1],
    convert to a matrix and write it out via FilterStrainSNPMatrix.

    FIX: the unsupported-data-type branch called ``sys.stderr(...)`` -- a stream
    object is not callable, so it raised TypeError instead of printing the error
    before exiting.  Now uses ``sys.stderr.write``.
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    if self.debug:
        import pdb
        pdb.set_trace()
    """
    #2008-02-08 old way to get 2010 data is from raw alignments. didn't realize all SNPs are put into db.
    alignment_id2positions_to_be_checked_ls, alignment_id2chr_start_end = self.get_alignment_id2positions_to_be_checked_ls(curs, self.alignment_table)
    SNPpos_snpacc_ls = self.get_SNPpos_snpacc_ls(curs, self.snp_locus_table)
    SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc(SNPpos_snpacc_ls, alignment_id2chr_start_end)
    ecotype_id2accession_id, ecotype_id2row_index, ecotype_id2info_ls, ecotype_id_ls, accession_id2row_index, accession_id_ls, nativename_ls = self.setup_accession_ecotype_dstruc(curs, self.accession2ecotype_table, self.ecotype_table)
    accession_X_snp_matrix, accession_X_snp_matrix_touched, snp_index2alignment_id = self.get_accession_X_snp_matrix(curs, accession_id2row_index, SNPpos2col_index, self.sequence_table, self.alignment_table, alignment_id2positions_to_be_checked_ls)
    """
    # --- SNP (column) structure, selected by the 3rd processing bit ---
    if self.processing_bits[3] == 0:
        #2009-2-12 will be a problem if snp_locus_table doesn't have field offset
        SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(curs, self.snp_locus_table, offset=self.offset)
    elif self.processing_bits[3] == 1:
        SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(curs, self.snp_locus_table, \
            cross_linking_table=self.data_type2data_table[self.processing_bits[1]], \
            offset=self.offset)
    elif self.processing_bits[3] == 2:
        SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(curs, self.data_type2data_table[self.processing_bits[1]], \
            offset=self.offset)
    else:
        sys.stderr.write("Error: unsupported 3rd bit in processing_bits %s.\n" % self.processing_bits[3])
        sys.exit(3)

    # --- row structure, selected by the 2nd processing bit ---
    from variation.src.common import get_accession_id2name
    accession_id2name = get_accession_id2name(curs)
    if self.processing_bits[1] == 0:
        row_id2dstruc = self.setup_row_dstruc(curs, SNPpos2col_index, accession_id2name,
            offset=self.offset, version=self.version)
    elif self.processing_bits[1] == 1:
        from variation.src.common import map_perlegen_ecotype_name2accession_id
        ecotype_name2accession_id = map_perlegen_ecotype_name2accession_id(curs)
        row_id2dstruc = self.setup_row_dstruc(curs, SNPpos2col_index, accession_id2name, data_type=self.processing_bits[1], \
            ecotype_name2accession_id=ecotype_name2accession_id, \
            offset=self.offset, version=self.version)
    else:
        # bug fix: was sys.stderr(...), which is not callable
        sys.stderr.write("Unsupported data type: %s or no ecotype_name2accession_id specified.\n" % self.processing_bits[1])
        sys.exit(2)
    accession_id_ls, accession_name_ls, data_matrix = self.transform_row_id2dstruc_2_matrix(row_id2dstruc)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
    #2008-02-08 which type of row id/1st column
    if self.processing_bits[0] == 0:
        from variation.src.common import map_accession_id2ecotype_id
        accession_id2ecotype_id = map_accession_id2ecotype_id(curs, accession2ecotype_table=self.accession2ecotype_table)
        accession_id2ecotype_id[99] = 6909    #accession 99 is the reference genome, which col-0 (ecotype_id=6909)
        ecotype_id_ls = []
        rows_to_be_tossed_out = Set()
        for i in range(len(accession_id_ls)):
            ecotype_id = accession_id2ecotype_id.get(accession_id_ls[i])
            if not ecotype_id:    #mapping failed
                rows_to_be_tossed_out.add(i)
            ecotype_id_ls.append(ecotype_id)
        strain_acc_list = ecotype_id_ls
        header = ['ecotype_id']    #1st column in the header
    else:
        rows_to_be_tossed_out = Set()
        strain_acc_list = accession_id_ls
        header = ['accession_id']

    #2008-02-08 which type of 2nd column
    if self.processing_bits[2] == 0:
        category_list = [1]*len(accession_name_ls)
        header.append('duplicate')    #2nd column in the header
    elif self.processing_bits[2] == 1:
        category_list = accession_name_ls
        header.append('accession_name')
    else:
        category_list = accession_name_ls
        header.append('accession_name')
    header += snp_acc_ls
    FilterStrainSNPMatrix_instance.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
        rows_to_be_tossed_out=rows_to_be_tossed_out)