def run(self):
    """
    Remove SNP columns with a low locus log probability and plot a histogram.

    Reads the matrix from self.input_fname, scores every SNP column with
    cal_snp_locus_log_prob(), reports and drops the columns whose score is
    <= min_log_prob, writes the remaining matrix to self.output_fname and
    finally shows a 20-bin histogram of all the scores.

    History:
        2007-04-30
        2007-05-14 add nt_alphabet_bits
    """
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]))
    data_matrix = num.array(data_matrix)
    homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
    snp_locus_log_prob = self.cal_snp_locus_log_prob(data_matrix,
                                                     homo_perc_vector)

    from sets import Set
    # NOTE(review): min_log_prob is read as a bare name rather than an
    # attribute of self; it has to exist at module level -- confirm.
    cols_to_be_tossed_out_set = Set(
        col_index
        for col_index, log_prob in enumerate(snp_locus_log_prob)
        if log_prob <= min_log_prob)

    # A single parenthesised expression prints identically under the
    # Python 2 print statement.
    print("%sSNPs removed:" % len(cols_to_be_tossed_out_set))
    for col_index in cols_to_be_tossed_out_set:
        # The first two header fields are not SNP columns, hence the +2.
        print('\t%s\t%s' % (col_index, header[2 + col_index]))

    write_data_matrix(data_matrix, self.output_fname, header,
                      strain_acc_list, category_list,
                      cols_to_be_tossed_out=cols_to_be_tossed_out_set,
                      nt_alphabet=int(self.nt_alphabet_bits[1]))

    import pylab
    pylab.title("histogram of snp locus log probability")
    pylab.hist(snp_locus_log_prob, 20)
    pylab.show()
def run(self):
    """
    Combine the call files of one call method into a matrix and write it.

    Looks up the call files through QC_250k.get_call_info_id2fname(),
    optionally restricts the SNPs via get_snps_name_set_given_criteria(),
    reads everything with QC_250k.read_call_matrix() and writes the result
    to self.outputFname.

    If no call file is found, a message is written to stderr and the method
    returns without writing anything: the SNP-id-to-index map is derived
    from the first call file and cannot be built without one.

    History:
        2008-05-20 read_call_matrix returns PassingData object
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = self.db_250k
    session = db.session
    QC_method_id = 0  # just for QC_250k.get_call_info_id2fname()
    call_data = QC_250k.get_call_info_id2fname(
        db, QC_method_id, self.call_method_id, filter_calls_QCed=0,
        max_call_info_mismatch_rate=self.max_array_mismatch_rate,
        input_dir=self.input_dir,
        take_unique_ecotype=self.take_unique_ecotype)

    # 2008-05-18 only query the SNP set when a cutoff actually filters.
    if self.max_snp_mismatch_rate < 1 or self.max_snp_NA_rate < 1:
        snps_name_set = self.get_snps_name_set_given_criteria(
            db, self.call_method_id, self.max_snp_mismatch_rate,
            self.max_snp_NA_rate)
    else:
        snps_name_set = None

    db_id2chr_pos = db.getSNPID2ChrPos()

    call_info_id2fname = call_data.call_info_id2fname
    if not call_info_id2fname:
        # db_id2index was previously left unbound in this case while still
        # being passed to read_call_matrix() below.
        import sys
        sys.stderr.write("No call file found. Nothing to output.\n")
        return

    # list() keeps the "first value" lookup working where values() is a view.
    first_call_entry = list(call_info_id2fname.values())[0]
    db_id2index = self.getSNPID2index(first_call_entry[1], db_id2chr_pos)

    # 2008-05-20 read_call_matrix returns PassingData object
    pdata = QC_250k.read_call_matrix(call_info_id2fname,
                                     self.min_probability, snps_name_set,
                                     db_id2chr_pos=db_id2chr_pos,
                                     db_id2index=db_id2index)
    strain_acc_list, category_list = pdata.ecotype_id_ls, pdata.array_id_ls
    write_data_matrix(pdata.data_matrix, self.outputFname, pdata.header,
                      strain_acc_list, category_list)
def run(self):
    """
    Export the polymorphic columns of one alignment as a data matrix file.

    The header is 'id', 'name' plus one '%s_%s_%s' label per entry of
    passingdata.snp_pos_ls. Row ids depend on self.strain_id_type:
        1 -> ecotype ids looked up in table accession2tg_ecotypeid
        2 -> the accession ids themselves
    Any other value writes an error to stderr and exits with status 2.
    """
    db = AtDB(drivername=self.drivername, username=self.db_user,
              password=self.db_passwd, hostname=self.hostname,
              database=self.dbname, schema=self.schema)
    passingdata = self.getAlignmentMatrix(self.alignment_id)
    self.pickPolymorphicColumns(passingdata)

    header = ['id', 'name'] + ['%s_%s_%s' % snp_pos
                               for snp_pos in passingdata.snp_pos_ls]

    if self.strain_id_type == 2:
        strain_acc_list = passingdata.accession_id_ls
    elif self.strain_id_type == 1:
        strain_acc_list = []
        for accession_id in passingdata.accession_id_ls:
            query = "select * from %s where accession_id=%s" % (
                'accession2tg_ecotypeid', accession_id)
            # Only the first matching row is used.
            first_row = db.metadata.bind.execute(query).fetchone()
            strain_acc_list.append(first_row.ecotype_id)
    else:
        sys.stderr.write("strain_id_type %s not supported.\n"
                         % (self.strain_id_type))
        sys.exit(2)

    write_data_matrix(passingdata.data_matrix, self.output_fname, header,
                      strain_acc_list, passingdata.name_ls)
def output_data(self, data_to_output_label_ls, data_to_output_ls, min_distance, output_fname):
    """
    Write several per-score-cutoff data series into one matrix file.

    2008-11-11
        data_to_output_ls is a list of (score_cutoff_ls, data_ls). each
        score_cutoff_ls might be a bit different from each other.

        1. get score_cutoff2index out of all score_cutoffs (via
           get_score_cutoff2index(); descending order per the original note)
        2. each row is same score_cutoff. column is data_ls of one result
           from analysis_method on phenotype.
        3. first column is score cutoffs.
        4. 2nd column is min_distance. 3rd and so forth columns are data.

    Arguments:
        data_to_output_label_ls: one column label per data series.
        data_to_output_ls: list of (score_cutoff_ls, data_ls) pairs, indexed
            in parallel with data_to_output_label_ls.
        min_distance: value repeated in the 2nd column of every row.
        output_fname: file to write.

    Cells for which a series has no value at that score cutoff stay at -1.
    """
    sys.stderr.write("Outputting data matrix ...")
    score_cutoff_ls, score_cutoff2index = self.get_score_cutoff2index(data_to_output_ls)
    header = ['score_cutoff', 'min_distance'] + data_to_output_label_ls
    no_of_rows = len(score_cutoff2index)
    no_of_cols = len(data_to_output_label_ls)

    # Builtin float yields the same float64 dtype as the numpy.float alias,
    # which newer NumPy releases no longer provide.
    data_matrix = numpy.zeros([no_of_rows, no_of_cols], float)
    data_matrix[:] = -1  # -1 marks "no data for this cutoff"

    for j in range(no_of_cols):
        sub_score_cutoff_ls, data_ls = data_to_output_ls[j]
        for i, score_cutoff in enumerate(sub_score_cutoff_ls):
            # Index data_ls explicitly so a too-short data_ls still raises.
            data_matrix[score_cutoff2index[score_cutoff], j] = data_ls[i]

    category_list = [min_distance] * no_of_rows
    write_data_matrix(data_matrix, output_fname, header, score_cutoff_ls, category_list)
    sys.stderr.write("Done.\n")
def run(self):
    """
    Build the results-method matrix and output it as a file and/or a figure.

    The matrix comes from getDataMatrix(). Rows and columns of the figure
    are both labelled with results_method_id_label_ls. If the matrix is
    empty, an error is written to stderr and the process exits with
    status 3. Output to self.output_fname and self.fig_fname happens only
    when the respective attribute is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB(drivername=self.drivername, username=self.db_user,
                      password=self.db_passwd, hostname=self.hostname,
                      database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    results_method_id_info = self.getResultsMethodIDInfo(
        db, self.call_method_id_ls, self.min_distance, self.get_closest,
        self.min_MAF)
    results_method_id2gene_set = self.getResultsMethodID2GeneSet(
        db, results_method_id_info, self.results_directory, self.max_rank)
    rdata = self.getDataMatrix(results_method_id2gene_set,
                               results_method_id_info)

    label_ls = results_method_id_info.results_method_id_label_ls
    header = ['', ''] + label_ls
    strain_acc_list = label_ls
    category_list = results_method_id_info.results_method_id_ls

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          strain_acc_list, category_list)

    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            # Colour scale spans the matrix's own min/max.
            return Value2Color.value2HSLcolor(x, rdata.min_value,
                                              rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func,
                                        font)
        im = drawMatrix(rdata.data_matrix, value2color_func,
                        strain_acc_list, strain_acc_list, with_grid=1,
                        font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def output_data(self, data_to_output_label_ls, data_to_output_ls, min_distance, output_fname):
    """
    Lay out several (score_cutoff_ls, data_ls) series as one matrix file.

    2008-11-11
        data_to_output_ls is a list of (score_cutoff_ls, data_ls). each
        score_cutoff_ls might be a bit different from each other.

        1. get score_cutoff2index out of all score_cutoffs (via
           get_score_cutoff2index(); descending order per the original note)
        2. each row is same score_cutoff. column is data_ls of one result
           from analysis_method on phenotype.
        3. first column is score cutoffs.
        4. 2nd column is min_distance. 3rd and so forth columns are data.

    Arguments:
        data_to_output_label_ls: column labels, one per series.
        data_to_output_ls: the series, in the same order as the labels.
        min_distance: constant written into the 2nd column of each row.
        output_fname: destination file name.

    A cell keeps the filler value -1 when its series has no entry for that
    row's score cutoff.
    """
    sys.stderr.write("Outputting data matrix ...")
    score_cutoff_ls, score_cutoff2index = self.get_score_cutoff2index(
        data_to_output_ls)
    header = ['score_cutoff', 'min_distance'] + data_to_output_label_ls
    no_of_cols = len(data_to_output_label_ls)
    no_of_rows = len(score_cutoff2index)

    # numpy.float was only an alias of the builtin float and is gone from
    # newer NumPy releases; the builtin gives the identical float64 dtype.
    data_matrix = numpy.zeros([no_of_rows, no_of_cols], float)
    data_matrix[:] = -1  # filler for cutoffs a series does not cover

    for col_index in range(no_of_cols):
        sub_score_cutoff_ls, data_ls = data_to_output_ls[col_index]
        for i, score_cutoff in enumerate(sub_score_cutoff_ls):
            row_index = score_cutoff2index[score_cutoff]
            # data_ls is indexed, not zipped, so a length mismatch raises.
            data_matrix[row_index, col_index] = data_ls[i]

    category_list = [min_distance] * no_of_rows
    write_data_matrix(data_matrix, output_fname, header, score_cutoff_ls,
                      category_list)
    sys.stderr.write("Done.\n")
def run(self):
    """
    Merge duplicated ecotype rows of a matrix into one row per target ecotype.

    Steps:
        1. optionally load the duplicate -> target ecotype id mapping from
           self.ecotype_duplicate2tg_ecotypeid_table (None when unset)
        2. read self.input_fname using the auto-detected delimiter
        3. group row indices per target ecotype id and merge them with
           get_merged_matrix() (which also receives self.stat_output_fname)
        4. write the merged matrix to self.output_fname; the 2nd header
           column is renamed to 'nativename'

    The MySQL cursor and connection are closed when the method finishes,
    whether it succeeds or raises.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.user, passwd=self.passwd)
    try:
        curs = conn.cursor()
        try:
            if self.ecotype_duplicate2tg_ecotypeid_table:
                ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
                    curs, self.ecotype_duplicate2tg_ecotypeid_table)
            else:
                ecotype_duplicate2tg_ecotypeid = None

            from pymodule import figureOutDelimiter
            delimiter = figureOutDelimiter(self.input_fname)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname, delimiter=delimiter)

            tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
                strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
            ecotypeid2nativename = get_ecotypeid2nativename(
                curs, ecotype_table=self.ecotype_table)
            tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(
                tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix,
                ecotypeid2nativename, self.stat_output_fname)

            tg_nativename_ls = [ecotypeid2nativename[ecotypeid]
                                for ecotypeid in tg_ecotypeid_ls]
            header[1] = 'nativename'
            write_data_matrix(merge_matrix, self.output_fname, header,
                              tg_ecotypeid_ls, tg_nativename_ls,
                              delimiter=delimiter)
        finally:
            curs.close()
    finally:
        conn.close()
def run(self):
    """
    Write the polymorphic part of an alignment to self.output_fname.

    Header: 'id', 'name', then '%s_%s_%s' formatted from each item of
    passingdata.snp_pos_ls. The id column is chosen by self.strain_id_type:
    1 maps every accession id to an ecotype id through the
    accession2tg_ecotypeid table, 2 uses the accession ids unchanged, and
    anything else reports an error on stderr and exits with status 2.
    """
    db = AtDB(drivername=self.drivername, username=self.db_user,
              password=self.db_passwd, hostname=self.hostname,
              database=self.dbname, schema=self.schema)
    passingdata = self.getAlignmentMatrix(self.alignment_id)
    self.pickPolymorphicColumns(passingdata)

    header = ['id', 'name']
    header.extend(['%s_%s_%s' % snp_pos
                   for snp_pos in passingdata.snp_pos_ls])

    def fetch_ecotype_id(accession_id):
        # One query per accession; only the first returned row is read.
        rows = db.metadata.bind.execute(
            "select * from %s where accession_id=%s" %
            ('accession2tg_ecotypeid', accession_id))
        return rows.fetchone().ecotype_id

    if self.strain_id_type == 1:
        strain_acc_list = [fetch_ecotype_id(accession_id)
                           for accession_id in passingdata.accession_id_ls]
    elif self.strain_id_type == 2:
        strain_acc_list = passingdata.accession_id_ls
    else:
        sys.stderr.write("strain_id_type %s not supported.\n" %
                         (self.strain_id_type))
        sys.exit(2)

    write_data_matrix(passingdata.data_matrix, self.output_fname, header,
                      strain_acc_list, passingdata.name_ls)
def run(self):
    """
    Filter a strain x SNP matrix and write what is left.

    self.filtering_bits switches the three filters on ("1") or off:
        bit 0: rows with too many NAs    (remove_rows_with_too_many_NAs)
        bit 1: columns with too many NAs (remove_cols_with_too_many_NAs)
        bit 2: identity strains          (remove_identity_strains)

    Call sequence:
        -read_data()
        -remove_rows_with_too_many_NAs()
        -remove_cols_with_too_many_NAs()
        -remove_identity_strains()
        -write_data_matrix()

    History:
        2007-02-27
        2007-09-14 filtering_bits
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
    data_matrix = num.array(data_matrix)

    # --- rows with too many NAs ---
    if self.filtering_bits[0] != "1":
        rows_with_too_many_NAs_set = Set()
    else:
        remove_rows_data = self.remove_rows_with_too_many_NAs(
            data_matrix, self.row_cutoff)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs

    # --- columns with too many NAs (rows dropped above are passed along) ---
    if self.filtering_bits[1] != "1":
        cols_with_too_many_NAs_set = Set()
    else:
        # NOTE(review): col_cutoff is a bare name while the row filter uses
        # self.row_cutoff -- confirm it is defined at module level.
        remove_cols_data = self.remove_cols_with_too_many_NAs(
            data_matrix, col_cutoff, rows_with_too_many_NAs_set)
        cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set

    # --- identity strains, checked only on surviving rows/columns ---
    if self.filtering_bits[2] != "1":
        identity_strains_to_be_removed = Set()
    else:
        no_of_rows, no_of_cols = data_matrix.shape
        rows_to_be_checked = Set(range(no_of_rows)) - rows_with_too_many_NAs_set
        cols_to_be_checked = Set(range(no_of_cols)) - cols_with_too_many_NAs_set
        identity_strains_to_be_removed = self.remove_identity_strains(
            data_matrix, rows_to_be_checked, cols_to_be_checked)

    rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
    write_data_matrix(
        data_matrix, self.output_fname, header, strain_acc_list,
        category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set,
        nt_alphabet=int(self.nt_alphabet_bits[1]), delimiter=delimiter)
def run(self):
    """
    Dump phenotype data from MySQL into a matrix file.

    getPhenotypeData() is queried through a MySQLdb cursor. The result is
    written to self.output_fname with the header 'ecotype id', 'nativename'
    followed by pheno_data.col_label_ls, with transform_to_numpy=False.

    The cursor and the connection are closed when the method finishes,
    whether it succeeds or raises.
    """
    if self.debug == 1:
        import pdb
        pdb.set_trace()
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.db_user, passwd=self.db_passwd)
    try:
        curs = conn.cursor()
        try:
            pheno_data = self.getPhenotypeData(
                curs, self.phenotype_avg_table, self.phenotype_method_table,
                self.ecotype_table, get_raw_data=self.get_raw_data)
            header = ['ecotype id', 'nativename'] + pheno_data.col_label_ls
            write_data_matrix(pheno_data.data_matrix, self.output_fname,
                              header, pheno_data.row_id_ls,
                              pheno_data.row_label_ls,
                              transform_to_numpy=False)
        finally:
            curs.close()
    finally:
        conn.close()
def run(self):
    """
    Dump phenotype data into a matrix file using the db_250k connection.

    getPhenotypeData() is given self.db_250k.metadata.bind instead of a
    MySQLdb cursor. Output goes to self.outputFname with the header
    'ecotype id', 'nativename' plus pheno_data.col_label_ls.
    """
    if self.debug == 1:
        import pdb
        pdb.set_trace()

    db_connection = self.db_250k.metadata.bind
    pheno_data = self.getPhenotypeData(
        db_connection,
        self.phenotype_avg_table,
        self.phenotype_method_table,
        self.ecotype_table,
        get_raw_data=self.get_raw_data,
        getPublicPhenotype=self.getPublicPhenotype)

    fixed_columns = ['ecotype id', 'nativename']
    header = fixed_columns + pheno_data.col_label_ls
    write_data_matrix(pheno_data.data_matrix, self.outputFname, header,
                      pheno_data.row_id_ls, pheno_data.row_label_ls,
                      transform_to_numpy=False)
def run(self):
    """
    Collapse duplicated ecotype rows into one row per target ecotype id.

    The input matrix (self.input_fname, delimiter auto-detected) is grouped
    by target ecotype id. When self.ecotype_duplicate2tg_ecotypeid_table is
    set, the duplicate -> target mapping is loaded from it, otherwise None
    is passed on. Rows are merged by get_merged_matrix() and the result is
    written to self.output_fname with header[1] set to 'nativename'.

    The database cursor and connection are released in all cases, including
    when a step raises.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.user, passwd=self.passwd)
    try:
        curs = conn.cursor()
        try:
            ecotype_duplicate2tg_ecotypeid = None
            if self.ecotype_duplicate2tg_ecotypeid_table:
                ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
                    curs, self.ecotype_duplicate2tg_ecotypeid_table)

            from pymodule import figureOutDelimiter
            delimiter = figureOutDelimiter(self.input_fname)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname, delimiter=delimiter)

            tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
                strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
            ecotypeid2nativename = get_ecotypeid2nativename(
                curs, ecotype_table=self.ecotype_table)
            tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(
                tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix,
                ecotypeid2nativename, self.stat_output_fname)

            # The second output column carries the native name of each
            # target ecotype.
            tg_nativename_ls = [ecotypeid2nativename[ecotypeid]
                                for ecotypeid in tg_ecotypeid_ls]
            header[1] = 'nativename'
            write_data_matrix(merge_matrix, self.output_fname, header,
                              tg_ecotypeid_ls, tg_nativename_ls,
                              delimiter=delimiter)
        finally:
            curs.close()
    finally:
        conn.close()
def run(self):
    """
    Apply the NA and identity filters selected by self.filtering_bits.

    Each of the first three characters of self.filtering_bits enables one
    filter when it equals '1': rows with too many NAs, columns with too
    many NAs, identity strains. Rows/columns flagged by an earlier filter
    are excluded from the later ones. The filtered matrix is written to
    self.output_fname.

    Call sequence:
        -read_data()
        -remove_rows_with_too_many_NAs()
        -remove_cols_with_too_many_NAs()
        -remove_identity_strains()
        -write_data_matrix()

    History:
        2007-02-27
        2007-09-14 filtering_bits
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
    data_matrix = numpy.array(data_matrix)

    # Row filter.
    if self.filtering_bits[0] != '1':
        rows_with_too_many_NAs_set = set()
    else:
        remove_rows_data = self.remove_rows_with_too_many_NAs(
            data_matrix, self.row_cutoff)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs

    # Column filter.
    if self.filtering_bits[1] != '1':
        cols_with_too_many_NAs_set = set()
    else:
        # NOTE(review): col_cutoff is not read from self (compare
        # self.row_cutoff above) -- confirm a module-level name exists.
        remove_cols_data = self.remove_cols_with_too_many_NAs(
            data_matrix, col_cutoff, rows_with_too_many_NAs_set)
        cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set

    # Identity-strain filter on whatever survived the two filters above.
    if self.filtering_bits[2] != '1':
        identity_strains_to_be_removed = set()
    else:
        no_of_rows, no_of_cols = data_matrix.shape
        rows_to_be_checked = set(range(no_of_rows)) - rows_with_too_many_NAs_set
        cols_to_be_checked = set(range(no_of_cols)) - cols_with_too_many_NAs_set
        identity_strains_to_be_removed = self.remove_identity_strains(
            data_matrix, rows_to_be_checked, cols_to_be_checked)

    rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
    write_data_matrix(data_matrix, self.output_fname, header,
                      strain_acc_list, category_list, rows_to_be_tossed_out,
                      cols_with_too_many_NAs_set,
                      nt_alphabet=int(self.nt_alphabet_bits[1]),
                      delimiter=delimiter)
def run(self):
    """
    Filter out SNP columns by locus log probability, then plot the scores.

    Every column whose value in cal_snp_locus_log_prob()'s result is
    <= min_log_prob is listed on stdout and left out of the matrix written
    to self.output_fname. A 20-bin histogram of all scores is shown last.

    History:
        2007-04-30
        2007-05-14 add nt_alphabet_bits
    """
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]))
    data_matrix = num.array(data_matrix)
    strain_homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
    snp_locus_log_prob = self.cal_snp_locus_log_prob(
        data_matrix, strain_homo_perc_vector)

    from sets import Set
    # NOTE(review): min_log_prob is looked up as a plain (module-level)
    # name, not on self -- confirm where it is defined.
    low_prob_col_index_ls = [i for i in range(len(snp_locus_log_prob))
                             if snp_locus_log_prob[i] <= min_log_prob]
    cols_to_be_tossed_out_set = Set(low_prob_col_index_ls)

    no_of_cols_removed = len(cols_to_be_tossed_out_set)
    # Parenthesised single argument: same output with the print statement.
    print("%sSNPs removed:" % (no_of_cols_removed))
    for col_index in cols_to_be_tossed_out_set:
        snp_label = header[2 + col_index]  # header has 2 leading non-SNP fields
        print('\t%s\t%s' % (col_index, snp_label))

    write_data_matrix(data_matrix, self.output_fname, header,
                      strain_acc_list, category_list,
                      cols_to_be_tossed_out=cols_to_be_tossed_out_set,
                      nt_alphabet=int(self.nt_alphabet_bits[1]))

    import pylab
    pylab.title("histogram of snp locus log probability")
    pylab.hist(snp_locus_log_prob, 20)
    pylab.show()
def run(self):
    """
    Read all call files of one call method and write them as one matrix.

    Opens its own Stock_250kDB connection (without creating tables), looks
    up the call files, optionally restricts the SNPs through
    get_snps_name_set_given_criteria() and writes the matrix returned by
    QC_250k.read_call_matrix() to self.output_fname.

    History:
        2008-05-20 read_call_matrix returns PassingData object
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.user,
                                   password=self.passwd,
                                   hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session

    QC_method_id = 0  # just for QC_250k.get_call_info_id2fname()
    call_data = QC_250k.get_call_info_id2fname(
        db, QC_method_id, self.call_method_id, filter_calls_QCed=0,
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
        input_dir=self.input_dir,
        take_unique_ecotype=self.take_unique_ecotype)

    # 2008-05-18 the SNP set is only fetched when a cutoff below 1 is given.
    snps_name_set = None
    if self.max_snp_mismatch_rate < 1 or self.max_snp_NA_rate < 1:
        snps_name_set = self.get_snps_name_set_given_criteria(
            db, self.call_method_id, self.max_snp_mismatch_rate,
            self.max_snp_NA_rate)

    # 2008-05-20 read_call_matrix returns PassingData object
    pdata = QC_250k.read_call_matrix(call_data.call_info_id2fname,
                                     self.min_probability, snps_name_set)
    strain_acc_list = pdata.ecotype_id_ls
    category_list = pdata.array_id_ls
    write_data_matrix(pdata.data_matrix, self.output_fname, pdata.header,
                      strain_acc_list, category_list)
def run(self):
    """
    Build a (list type x analysis method) by phenotype matrix of candidate
    gene test results and output it as a file and/or a figure.

    self.test_result_type selects the result table and the join:
        1 -> rank-sum test results (hard-coded 2008_09_15 table)
        2 -> top-SNP test results, joined through ResultsByGene
        3 -> CandidateGeneRankSumTestResultMethod
    Any other value reports an error on stderr and exits with status 2.
    An empty matrix exits with status 3.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup()
    session = db.session

    if self.test_result_type == 1:
        test_result_class_table = CandidateGeneRankSumTestResult.table.name
        # The mapped name is immediately overridden by a fixed table name.
        test_result_class_table = 'candidate_gene_rank_sum_test_result_2008_09_15'
    elif self.test_result_type == 2:
        test_result_class_table = CandidateGeneTopSNPTest.table.name
    elif self.test_result_type == 3:
        test_result_class_table = Stock_250kDB.CandidateGeneRankSumTestResultMethod.table.name
    else:
        sys.stderr.write(" test_result_type %s not supported.\n" %
                         (self.test_result_type))
        sys.exit(2)

    # the condition for min_MAF is tricky because of the floating precision.
    # The original literals were continued with a backslash inside the
    # string; adjacent literals are used here, so only the amount of
    # whitespace between two SQL tokens can differ.
    common_filter_values = (self.get_closest, self.min_distance,
                            self.min_MAF)
    if self.test_result_type == 2:
        where_template = (
            "%s r, %s rg, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null and r.id=rg.results_method_id "
            "and c.results_id=rg.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001")
        table_names = (ResultsMethod.table.name, ResultsByGene.table.name,
                       test_result_class_table, GeneListType.table.name)
    else:
        # Types 1 and 3 use the very same join; only the table differs.
        where_template = (
            "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null "
            "and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001")
        table_names = (ResultsMethod.table.name, test_result_class_table,
                       GeneListType.table.name)
    where_condition = where_template % (table_names + common_filter_values)

    # Optional filters: appended only when the attribute is truthy.
    if self.call_method_id_ls:
        where_condition += " and r.call_method_id in (%s)" % self.call_method_id_ls
    if self.analysis_method_id_ls:
        where_condition += " and r.analysis_method_id in (%s)" % self.analysis_method_id_ls
    if self.super_type_id:
        where_condition += " and g.super_type_id=%s" % self.super_type_id
    if self.test_type:
        where_condition += " and c.test_type=%s" % self.test_type

    # Type-specific filter.
    if self.test_result_type == 1:
        where_condition += " and c.max_pvalue_per_gene=%s" % (
            self.max_pvalue_per_gene)
    elif self.test_result_type == 2:
        where_condition += " and c.no_of_top_snps=%s" % (
            self.no_of_top_snps)

    list_type_id_ls = self.getListTypeInfo(db, where_condition)
    analysis_method_id_ls = self.getAnalysisMethodInfo(db, where_condition)
    list_type_analysis_method_info = self.orderListTypeAnalysisMethodID(
        list_type_id_ls, analysis_method_id_ls)
    phenotype_info = self.getPhenotypeInfo(db, where_condition)
    rdata = self.get_data_matrix(db, phenotype_info,
                                 list_type_analysis_method_info,
                                 where_condition)
    rdata.data_matrix = self.markDataMatrixBoundary(
        rdata.data_matrix, phenotype_info, list_type_analysis_method_info)

    header = ['list_type_analysis_method', ''] + phenotype_info.phenotype_method_label_ls
    strain_acc_list = list_type_analysis_method_info.list_type_analysis_method_label_ls
    category_list = list_type_analysis_method_info.list_type_id_analysis_method_id_ls

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          strain_acc_list, category_list)

    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            return Value2Color.value2HSLcolor(x, rdata.min_value,
                                              rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func,
                                        font)
        im = drawMatrix(
            rdata.data_matrix, value2color_func,
            list_type_analysis_method_info.list_type_analysis_method_label_ls,
            phenotype_info.phenotype_method_label_ls, with_grid=1, font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def run(self):
    """
    Output candidate-gene test results as a matrix file and/or a figure.

    Rows are list-type/analysis-method combinations, columns are
    phenotypes. self.test_result_type picks the result table
    (1: rank-sum results from the fixed 2008_09_15 table, 2: top-SNP test
    joined via ResultsByGene, 3: CandidateGeneRankSumTestResultMethod).
    An unsupported type exits with status 2, an empty matrix with status 3.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname,
        database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    if self.test_result_type == 1:
        test_result_class_table = CandidateGeneRankSumTestResult.table.name
        # Overridden right away: results are read from this fixed table.
        test_result_class_table = 'candidate_gene_rank_sum_test_result_2008_09_15'
    elif self.test_result_type == 2:
        test_result_class_table = CandidateGeneTopSNPTest.table.name
    elif self.test_result_type == 3:
        test_result_class_table = Stock_250kDB.CandidateGeneRankSumTestResultMethod.table.name
    else:
        sys.stderr.write(" test_result_type %s not supported.\n" % (self.test_result_type))
        sys.exit(2)

    # the condition for min_MAF is tricky because of the floating precision.
    # The two halves of each query were one backslash-continued literal in
    # the original; joined here as adjacent literals, which can only change
    # the width of the whitespace between two SQL tokens.
    direct_join = ("%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null "
                   "and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001")
    results_by_gene_join = ("%s r, %s rg, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null and r.id=rg.results_method_id "
                            "and c.results_id=rg.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001")
    if self.test_result_type == 2:
        where_condition = results_by_gene_join % (
            ResultsMethod.table.name, ResultsByGene.table.name,
            test_result_class_table, GeneListType.table.name,
            self.get_closest, self.min_distance, self.min_MAF)
    else:
        # Reached for types 1 and 3 only, which share one query.
        where_condition = direct_join % (
            ResultsMethod.table.name, test_result_class_table,
            GeneListType.table.name,
            self.get_closest, self.min_distance, self.min_MAF)

    # Collect the optional clauses, then append them in one go.
    extra_clause_ls = []
    if self.call_method_id_ls:
        extra_clause_ls.append(" and r.call_method_id in (%s)" % self.call_method_id_ls)
    if self.analysis_method_id_ls:
        extra_clause_ls.append(" and r.analysis_method_id in (%s)" % self.analysis_method_id_ls)
    if self.super_type_id:
        extra_clause_ls.append(" and g.super_type_id=%s" % self.super_type_id)
    if self.test_type:
        extra_clause_ls.append(" and c.test_type=%s" % self.test_type)
    if self.test_result_type == 1:
        extra_clause_ls.append(" and c.max_pvalue_per_gene=%s" % (self.max_pvalue_per_gene))
    elif self.test_result_type == 2:
        extra_clause_ls.append(" and c.no_of_top_snps=%s" % (self.no_of_top_snps))
    where_condition += ''.join(extra_clause_ls)

    list_type_id_ls = self.getListTypeInfo(db, where_condition)
    analysis_method_id_ls = self.getAnalysisMethodInfo(db, where_condition)
    list_type_analysis_method_info = self.orderListTypeAnalysisMethodID(
        list_type_id_ls, analysis_method_id_ls)
    phenotype_info = self.getPhenotypeInfo(db, where_condition)
    rdata = self.get_data_matrix(db, phenotype_info,
                                 list_type_analysis_method_info,
                                 where_condition)
    rdata.data_matrix = self.markDataMatrixBoundary(
        rdata.data_matrix, phenotype_info, list_type_analysis_method_info)

    header = ['list_type_analysis_method', ''] + phenotype_info.phenotype_method_label_ls
    strain_acc_list = list_type_analysis_method_info.list_type_analysis_method_label_ls
    category_list = list_type_analysis_method_info.list_type_id_analysis_method_id_ls

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          strain_acc_list, category_list)

    if not self.fig_fname:
        return
    font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

    def value2color_func(x):
        # Colours are scaled to the min/max of the fetched matrix.
        return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

    im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                    self.no_of_ticks, value2color_func, font)
    im = drawMatrix(rdata.data_matrix, value2color_func,
                    list_type_analysis_method_info.list_type_analysis_method_label_ls,
                    phenotype_info.phenotype_method_label_ls,
                    with_grid=1, font=font)
    im = combineTwoImages(im, im_legend, font=font)
    im.save(self.fig_fname)
def run(self):
    """
    Build a strain x target QC matrix and output it as file and/or figure.

    Query construction depends on self.QC_method_id: for 4 only the
    ecotype/site/address/country tables are joined and the targets are the
    strains themselves (re-queried when how_to_group_strains is 3); for
    anything else the QCCrossMatch table is joined as well, filtered by
    QC method, minimum non-NA pairs and maximum mismatch rate, and targets
    are fetched with their own query.

    self.how_to_group_strains of 2 or 3 orders strains by sequencing plate
    (alignStrainsAccordingToSeqPlate). self.input_fname, when set, makes
    the matrix come from that file instead of the database. An empty
    matrix exits with status 3.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                         password=self.db_passwd, hostname=self.hostname,
                         database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    # how to order strains.
    order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "
    strain_query_template = "select distinct %s, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"

    if self.QC_method_id == 4:
        sql_table_str = "from %s e, %s s, %s a, %s c" % (
            StockDB.Ecotype.table.name, StockDB.Site.table.name,
            StockDB.Address.table.name, StockDB.Country.table.name)
        # The remaining %s is filled with the strain/target specific join.
        common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s " + order_by_sentence
        strain_where_condition = common_where_condition % (" and e.id=st.ecotypeid")
        strain_id_column = "st.id as strainid"
    else:
        sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % (
            StockDB.QCCrossMatch.table.name, StockDB.Ecotype.table.name,
            StockDB.Site.table.name, StockDB.Address.table.name,
            StockDB.Country.table.name)
        qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % (
            self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
        # Only the QC filter is formatted here; the first %s stays open.
        common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s" + qc_filter + order_by_sentence
        strain_where_condition = common_where_condition % (" and e.id=st.ecotypeid and st.id=q.strainid")
        strain_id_column = "q.strainid"
    strain_id_info_query = strain_query_template % (
        strain_id_column, sql_table_str, StockDB.Strain.table.name,
        strain_where_condition)

    group_by_plate = self.how_to_group_strains == 2 or self.how_to_group_strains == 3

    if group_by_plate:
        plate_info = self.alignStrainsAccordingToSeqPlate(db)
        id_set_data = PassingData()
        id_set_data.strain_id_set = None
        id_set_data.target_id_set = None
    elif self.input_fname:
        id_set_data = self.getStrainidTargetidFromFile(
            db, self.QC_method_id, self.input_fname,
            self.max_mismatch_rate, self.min_no_of_non_NAs)
    else:
        id_set_data = PassingData()
        id_set_data.strain_id_set = None
        id_set_data.target_id_set = None

    if group_by_plate:
        strain_id_info = self.getStrainInfoGivenPlateInfo(
            db, plate_info, strain_id_info_query, strain_id_set=None)
    else:
        strain_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)

    if self.QC_method_id != 4:
        target_where_condition = common_where_condition % (" and e.id=q.target_id")
        target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
            sql_table_str, target_where_condition)
        target_id_info = self.getStrainIDInfo(db, target_id_info_query)
    elif self.how_to_group_strains == 3:
        # 2008-09-15 column strain id is in country, strain-longitude order
        target_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)
    else:
        target_id_info = strain_id_info

    if self.input_fname:
        rdata = self.get_data_matrixFromFile(
            db, strain_id_info, target_id_info, self.QC_method_id,
            self.input_fname, self.max_mismatch_rate,
            self.min_no_of_non_NAs)
    else:
        rdata = self.get_data_matrix(
            db, strain_id_info, target_id_info, self.QC_method_id,
            self.max_mismatch_rate, self.min_no_of_non_NAs)

    rdata.data_matrix = self.markDataMatrixBoundary(
        rdata.data_matrix, strain_id_info, target_id_info)

    header = ['strain info', ''] + target_id_info.strain_label_ls
    strain_acc_list = strain_id_info.strain_label_ls
    category_list = [1] * len(strain_acc_list)

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          strain_acc_list, category_list)

    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            return Value2Color.value2HSLcolor(x, rdata.min_value,
                                              rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func,
                                        font)
        im = drawMatrix(rdata.data_matrix, value2color_func,
                        strain_id_info.strain_label_ls,
                        target_id_info.strain_label_ls, with_grid=1,
                        font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def run(self):
    """
    Plot candidate-vs-non-candidate ratio curves for every (results_id, list_type_id) pair.

    2008-11-08
        generate combinations of results_id, list_type_id and generate plots one after another
        save the plots into database if commit=1

    For each pair returned by MpiGeneListRankTest.generate_params() the method
    builds the data matrix via self.get_data_matrix(), optionally writes it to
    self.output_fname, calls self.plotCurve() and, when self.commit is set and
    PNG data came back, stores a Stock_250kDB.CandidateVsNonRatioPlot row.

    Exits with status 3 when self.results_type yields no result classes or
    when no TopSNPTestType matches the requested parameters. Pairs that are
    already stored (commit mode) or that return an empty matrix are skipped.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    param_obj = PassingData(call_method_id=self.call_method_id,
                            analysis_method_id=getattr(self, 'analysis_method_id', None),
                            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                            list_type_id_ls=self.list_type_id_ls,
                            results_type=self.results_type)
    params_ls = MpiGeneListRankTest.generate_params(param_obj)

    ResultsClass, TestResultClass = db.getResultsAndTestResultsClass(
        results_type=self.results_type)
    if ResultsClass is None or TestResultClass is None:
        # The original formatted `pd.results_type`, but no `pd` exists in this
        # method; the value being validated is self.results_type.
        sys.stderr.write("Invalid results type : %s.\n" % self.results_type)
        sys.exit(3)

    def plot_already_in_db(type_id, results_id, list_type_id, title):
        """Report on stderr and return True if a plot with this key is already stored."""
        rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=type_id).\
            filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
        no_of_rows = rows.count()
        if no_of_rows > 0:
            row = rows.first()
            sys.stderr.write('%s already in db (%s of them) with first id=%s.\n' %
                             (title, no_of_rows, row.id))
            return True
        return False

    for results_id, list_type_id in params_ls:
        rm = ResultsClass.get(results_id)
        list_type = Stock_250kDB.GeneListType.get(list_type_id)
        title = 'result(%s) of %s on %s with %s(%s) list' % \
            (results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name,
             list_type.short_name, list_type.id)

        TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(
            self.get_closest, self.min_MAF, self.allow_two_sample_overlapping,
            self.results_type, self.test_type_id, self.null_distribution_type_id)
        # Must be checked before TopSNPTestType_id_ls[0] is used below; the
        # original indexed first and raised IndexError in commit mode.
        if not TopSNPTestType_id_ls:
            sys.stderr.write("No TopSNPTestType matches the input requirements. Exit.\n")
            sys.exit(3)
        first_type_id = TopSNPTestType_id_ls[0]

        # Cheap early exit: do not recompute a plot that is already stored.
        if self.commit and plot_already_in_db(first_type_id, results_id, list_type_id, title):
            continue

        from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)" % \
            (TestResultClass.table.name, Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,
             results_id, list_type_id, ','.join(map(str, TopSNPTestType_id_ls)))
        no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
        min_distance_info = self.get_min_distance_info(db, from_where_clause)
        rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info, from_where_clause,
                                     need_other_values=True,
                                     null_distribution_type_id=self.null_distribution_type_id)
        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            # Not fatal: move on to the next (results_id, list_type_id) pair.
            sys.stderr.write("Nothing fetched from database.\n")
            continue

        if self.output_fname:
            # NOTE(review): every iteration writes to the same self.output_fname;
            # presumably later pairs overwrite earlier ones -- confirm.
            header = ['no_of_top_snps', ''] + min_distance_info.label_ls
            strain_acc_list = no_of_top_snps_info.label_ls
            category_list = no_of_top_snps_info.label_ls
            write_data_matrix(rdata.data_matrix, self.output_fname, header,
                              strain_acc_list, category_list)

        # The drawMatrix/drawContinousLegend heat-map rendering that used to sit
        # here was disabled in the original (kept as a bare string literal);
        # only the curve plot below is produced.

        if self.commit:
            output_fname_prefix = None
        else:
            title_cp = title.replace('/', '_')  # '/' would be taken as a path separator
            output_fname_prefix = '%s_%s_type_%s.png' % (
                os.path.splitext(self.fig_fname)[0], title_cp, first_type_id)

        # The original set [0, 8] for analysis methods 1 and 7 and then
        # immediately reset it to None, so no x-limit is ever forced here.
        preset_xlim = None

        return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info,
                                     output_fname_prefix, title=title, commit=self.commit,
                                     preset_xlim=preset_xlim)
        if self.commit and return_data.png_data:
            # Re-check right before saving, exactly as the original did.
            if plot_already_in_db(first_type_id, results_id, list_type_id, title):
                continue
            plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=first_type_id,
                                                        results_id=results_id,
                                                        list_type_id=list_type_id)
            plot.png_thumbnail = return_data.png_thumbnail.getvalue()
            plot.png_data = return_data.png_data.getvalue()
            plot.svg_data = return_data.svg_data.getvalue()
            session.save(plot)
            session.flush()
def run(self):
    """
    Fetch SNP calls from the database and write them out as a strain x SNP matrix.

    2008-05-08
        transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
    2007-02-19
        --db_connect
        --get_snp_id2index()
        --get_strain_id2index()
        --get_strain_id_info()
        --get_snp_id_info()
        --get_data_matrix()
        if self.toss_out_rows:
            --toss_rows_to_make_distance_matrix_NA_free()
                --find_smallest_vertex_set_to_remove_all_edges()
        --write_data_matrix()
        #--sort_file()
    2007-09-22
        for mysql_connection
            add get_nativename_snpid2call_m()
            add fill_in_resolved_duplicated_calls()

    self.db_connection_type selects the backend: 1 connects through MySQLdb,
    2 goes through db_connect(). Any other value raises ValueError.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.db_connection_type == 1:
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                               user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
            self.get_strain_id2index_m(curs, self.input_table, self.strain_info_table,
                                       self.only_include_strains_with_GPS,
                                       self.resolve_duplicated_calls,
                                       toss_contaminants=self.toss_contaminants)
        if self.input_table == 'dbsnp.calls':
            from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
            snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp',
                                                  user=self.user, passwd=self.passwd)
        else:
            snps_id2mapping = None
        data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number,
                                             self.input_table, self.need_heterozygous_call,
                                             snps_id2mapping)
        # The get_nativename_snpid2call_m()/fill_in_resolved_duplicated_calls()
        # post-processing is disabled; resolve_duplicated_calls is only
        # forwarded to get_strain_id2index_m() above.
        if self.include_other_strain_info:
            strain_id2other_info = self.get_strain_id2other_info(
                curs, strain_id_list, self.strain_info_table, self.input_table)
        else:
            strain_id2other_info = {}
    elif self.db_connection_type == 2:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_id2index, snp_id_list = self.get_snp_id2index(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)
        strain_id2acc, strain_id2category = self.get_strain_id_info(
            curs, strain_id_list, self.strain_info_table)
        snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
        data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number,
                                           self.input_table, self.need_heterozygous_call)
        strain_id2other_info = {}
    else:
        # Without this branch none of the locals used below would be bound and
        # the method would fail later with an opaque UnboundLocalError.
        raise ValueError("Unsupported db_connection_type %r: expected 1 (MySQLdb) or 2 (db_connect)."
                         % (self.db_connection_type,))

    if self.toss_out_rows:
        rows_to_be_tossed_out = Set(self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
    else:
        rows_to_be_tossed_out = Set()

    # 05/08/08
    if self.discard_all_NA_strain:
        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(
            data_matrix, row_cutoff=1)
        rows_to_be_tossed_out.update(remove_rows_data.rows_with_too_many_NAs_set)

    strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
    category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

    # Re-key the extra strain info by accession instead of strain id.
    strain_acc2other_info = {}
    for strain_id in strain_id2other_info:
        strain_acc2other_info[strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

    if self.output_matrix_type == 1:
        # Transpose everything: SNPs become rows, strains become columns.
        data_matrix = num.transpose(num.array(data_matrix))
        header = ['Chromosomes', 'Positions'] + strain_acc_list
        chromosome_ls = []
        position_ls = []
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            chromosome_ls.append(chromosome)
            position_ls.append(position)
        # The two leading label columns now carry chromosome and position.
        strain_acc_list = chromosome_ls
        category_list = position_ls
        # Strain rows scheduled for removal are columns after the transpose.
        cols_to_be_tossed_out = rows_to_be_tossed_out
        rows_to_be_tossed_out = None
        # NOTE(review): the original reset strain_id2other_info to None here
        # ("make up one"), a dead store since only strain_acc2other_info is
        # passed on. It may have been meant to clear strain_acc2other_info,
        # whose keys are strain accessions and no longer label rows -- confirm.
    else:
        header = ['strain', 'category']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
        cols_to_be_tossed_out = None

    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      rows_to_be_tossed_out=rows_to_be_tossed_out,
                      cols_to_be_tossed_out=cols_to_be_tossed_out,
                      nt_alphabet=self.nt_alphabet,
                      strain_acc2other_info=strain_acc2other_info,
                      delimiter=self.delimiter)
def run(self):
    """
    Plot candidate-vs-non-candidate ratio curves for every (results_id, list_type_id) pair.

    2008-11-08
        generate combinations of results_id, list_type_id and generate plots one after another
        save the plots into database if commit=1

    For each pair returned by MpiGeneListRankTest.generate_params() the method
    loads the Stock_250kDB.ResultsMethod row, builds the data matrix via
    self.get_data_matrix(), optionally writes it to self.output_fname, calls
    self.plotCurve() and, when self.commit is set and PNG data came back,
    stores a Stock_250kDB.CandidateVsNonRatioPlot row.

    Exits with status 3 when no TopSNPTestType matches the requested
    parameters. Pairs that are already stored (commit mode) or that return an
    empty matrix are skipped.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    param_obj = PassingData(call_method_id=self.call_method_id,
                            analysis_method_id=getattr(self, 'analysis_method_id', None),
                            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                            list_type_id_ls=self.list_type_id_ls,
                            results_type=self.results_type)
    params_ls = MpiGeneListRankTest.generate_params(param_obj)

    def plot_already_in_db(type_id, results_id, list_type_id, title):
        """Report on stderr and return True if a plot with this key is already stored."""
        rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=type_id).\
            filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
        no_of_rows = rows.count()
        if no_of_rows > 0:
            row = rows.first()
            sys.stderr.write('%s already in db (%s of them) with first id=%s.\n' %
                             (title, no_of_rows, row.id))
            return True
        return False

    for results_id, list_type_id in params_ls:
        rm = Stock_250kDB.ResultsMethod.get(results_id)
        list_type = Stock_250kDB.GeneListType.get(list_type_id)
        title = 'result(%s) of %s on %s with %s(%s) list' % \
            (results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name,
             list_type.short_name, list_type.id)

        TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(
            self.get_closest, self.min_MAF, self.allow_two_sample_overlapping,
            self.results_type, self.test_type_id, self.null_distribution_type_id)
        # Must be checked before TopSNPTestType_id_ls[0] is used below; the
        # original indexed first and raised IndexError in commit mode.
        if not TopSNPTestType_id_ls:
            sys.stderr.write("No TopSNPTestType matches the input requirements. Exit.\n")
            sys.exit(3)
        first_type_id = TopSNPTestType_id_ls[0]

        # Cheap early exit: do not recompute a plot that is already stored.
        if self.commit and plot_already_in_db(first_type_id, results_id, list_type_id, title):
            continue

        from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)" % \
            (Stock_250kDB.CandidateGeneTopSNPTestRM.table.name,
             Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,
             results_id, list_type_id, ','.join(map(str, TopSNPTestType_id_ls)))
        no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
        min_distance_info = self.get_min_distance_info(db, from_where_clause)
        rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info, from_where_clause,
                                     need_other_values=True,
                                     null_distribution_type_id=self.null_distribution_type_id)
        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            # Not fatal: move on to the next (results_id, list_type_id) pair.
            sys.stderr.write("Nothing fetched from database.\n")
            continue

        if self.output_fname:
            # NOTE(review): every iteration writes to the same self.output_fname;
            # presumably later pairs overwrite earlier ones -- confirm.
            header = ['no_of_top_snps', ''] + min_distance_info.label_ls
            strain_acc_list = no_of_top_snps_info.label_ls
            category_list = no_of_top_snps_info.label_ls
            write_data_matrix(rdata.data_matrix, self.output_fname, header,
                              strain_acc_list, category_list)

        # The drawMatrix/drawContinousLegend heat-map rendering that used to sit
        # here was disabled in the original (kept as a bare string literal);
        # only the curve plot below is produced.

        if self.commit:
            output_fname_prefix = None
        else:
            title_cp = title.replace('/', '_')  # '/' would be taken as a path separator
            output_fname_prefix = '%s_%s_type_%s.png' % (
                os.path.splitext(self.fig_fname)[0], title_cp, first_type_id)

        # Analysis methods 1 and 7 get a fixed x-range; all others auto-scale.
        if rm.analysis_method_id == 1 or rm.analysis_method_id == 7:
            preset_xlim = [0, 8]
        else:
            preset_xlim = None

        return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info,
                                     output_fname_prefix, title=title, commit=self.commit,
                                     preset_xlim=preset_xlim)
        if self.commit and return_data.png_data:
            # Re-check right before saving, exactly as the original did.
            if plot_already_in_db(first_type_id, results_id, list_type_id, title):
                continue
            plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=first_type_id,
                                                        results_id=results_id,
                                                        list_type_id=list_type_id)
            plot.png_thumbnail = return_data.png_thumbnail.getvalue()
            plot.png_data = return_data.png_data.getvalue()
            plot.svg_data = return_data.svg_data.getvalue()
            session.save(plot)
            session.flush()
def run(self):
    """
    Build the strain-vs-target QC matrix and emit it as a table and/or an image.

    Steps, in order:
        - connect to StockDB (tables are not created)
        - assemble the SQL that lists row strains (and, unless QC_method_id is 4,
          a separate query for the column targets)
        - fetch strain/target info, optionally grouped by sequencing plate
          (how_to_group_strains 2 or 3) or restricted by ids read from input_fname
        - fill the data matrix from input_fname when given, otherwise from the database
        - mark boundaries in the matrix; exit with status 3 if it is empty
        - write the matrix to output_fname and/or draw it with a legend to fig_fname
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                         password=self.db_passwd, hostname=self.hostname,
                         database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    # how to order strains.
    order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "

    # Tables shared by both query flavours, in the order their aliases (e, s, a, c) appear.
    geo_table_names = (StockDB.Ecotype.table.name, StockDB.Site.table.name,
                       StockDB.Address.table.name, StockDB.Country.table.name)
    qc_is_method_4 = (self.QC_method_id == 4)

    if qc_is_method_4:
        # No QC cross-match table is joined for method 4.
        sql_table_str = "from %s e, %s s, %s a, %s c" % geo_table_names
        common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s " + order_by_sentence
        strain_join_condition = " and e.id=st.ecotypeid"
        strain_select_template = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
    else:
        sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % \
            ((StockDB.QCCrossMatch.table.name,) + geo_table_names)
        qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % \
            (self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
        # The single %s left in this template is filled with the join condition later.
        common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s" + qc_filter + order_by_sentence
        strain_join_condition = " and e.id=st.ecotypeid and st.id=q.strainid"
        strain_select_template = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"

    strain_where_condition = common_where_condition % strain_join_condition
    strain_id_info_query = strain_select_template % (
        sql_table_str, StockDB.Strain.table.name, strain_where_condition)

    def make_unrestricted_id_sets():
        """Return a PassingData whose strain/target id sets are both None."""
        unrestricted = PassingData()
        unrestricted.strain_id_set = None
        unrestricted.target_id_set = None
        return unrestricted

    if self.how_to_group_strains in (2, 3):
        plate_info = self.alignStrainsAccordingToSeqPlate(db)
        id_set_data = make_unrestricted_id_sets()
        strain_id_info = self.getStrainInfoGivenPlateInfo(
            db, plate_info, strain_id_info_query, strain_id_set=None)
    else:
        if self.input_fname:
            id_set_data = self.getStrainidTargetidFromFile(
                db, self.QC_method_id, self.input_fname,
                self.max_mismatch_rate, self.min_no_of_non_NAs)
        else:
            id_set_data = make_unrestricted_id_sets()
        strain_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)

    if not qc_is_method_4:
        target_where_condition = common_where_condition % " and e.id=q.target_id"
        target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
            sql_table_str, target_where_condition)
        target_id_info = self.getStrainIDInfo(db, target_id_info_query)
    elif self.how_to_group_strains == 3:
        # 2008-09-15 column strain id is in country, strain-longitude order
        target_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)
    else:
        # Columns reuse the row strain info.
        target_id_info = strain_id_info

    if self.input_fname:
        rdata = self.get_data_matrixFromFile(db, strain_id_info, target_id_info,
                                             self.QC_method_id, self.input_fname,
                                             self.max_mismatch_rate, self.min_no_of_non_NAs)
    else:
        rdata = self.get_data_matrix(db, strain_id_info, target_id_info,
                                     self.QC_method_id, self.max_mismatch_rate,
                                     self.min_no_of_non_NAs)

    rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix,
                                                    strain_id_info, target_id_info)

    row_labels = strain_id_info.strain_label_ls
    column_labels = target_id_info.strain_label_ls
    header = ['strain info', ''] + column_labels
    strain_acc_list = row_labels
    category_list = [1] * len(strain_acc_list)

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          strain_acc_list, category_list)

    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            """Map a matrix value to an HSL colour within rdata's min/max range."""
            return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func, font)
        im = drawMatrix(rdata.data_matrix, value2color_func, row_labels,
                        column_labels, with_grid=1, font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def run(self):
    """
    Fetch SNP calls from the database and write them out as a strain x SNP matrix.

    2008-05-08
        transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
    2007-02-19
        --db_connect
        --get_snp_id2index()
        --get_strain_id2index()
        --get_strain_id_info()
        --get_snp_id_info()
        --get_data_matrix()
        if self.toss_out_rows:
            --toss_rows_to_make_distance_matrix_NA_free()
                --find_smallest_vertex_set_to_remove_all_edges()
        --write_data_matrix()
        #--sort_file()
    2007-09-22
        for mysql_connection
            add get_nativename_snpid2call_m()
            add fill_in_resolved_duplicated_calls()

    self.db_connection_type selects the backend: 1 connects through MySQLdb,
    2 goes through db_connect(). Any other value raises ValueError.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.db_connection_type == 1:
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                               user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
            self.get_strain_id2index_m(curs, self.input_table, self.strain_info_table,
                                       self.only_include_strains_with_GPS,
                                       self.resolve_duplicated_calls,
                                       toss_contaminants=self.toss_contaminants)
        if self.input_table == 'dbsnp.calls':
            from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
            snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp',
                                                  user=self.user, passwd=self.passwd)
        else:
            snps_id2mapping = None
        data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number,
                                             self.input_table, self.need_heterozygous_call,
                                             snps_id2mapping)
        # The get_nativename_snpid2call_m()/fill_in_resolved_duplicated_calls()
        # post-processing is disabled; resolve_duplicated_calls is only
        # forwarded to get_strain_id2index_m() above.
        if self.include_other_strain_info:
            strain_id2other_info = self.get_strain_id2other_info(
                curs, strain_id_list, self.strain_info_table, self.input_table)
        else:
            strain_id2other_info = {}
    elif self.db_connection_type == 2:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_id2index, snp_id_list = self.get_snp_id2index(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)
        strain_id2acc, strain_id2category = self.get_strain_id_info(
            curs, strain_id_list, self.strain_info_table)
        snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
        data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number,
                                           self.input_table, self.need_heterozygous_call)
        strain_id2other_info = {}
    else:
        # Without this branch none of the locals used below would be bound and
        # the method would fail later with an opaque UnboundLocalError.
        raise ValueError("Unsupported db_connection_type %r: expected 1 (MySQLdb) or 2 (db_connect)."
                         % (self.db_connection_type,))

    if self.toss_out_rows:
        rows_to_be_tossed_out = Set(self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
    else:
        rows_to_be_tossed_out = Set()

    # 05/08/08
    if self.discard_all_NA_strain:
        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(
            data_matrix, row_cutoff=1)
        rows_to_be_tossed_out.update(remove_rows_data.rows_with_too_many_NAs_set)

    strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
    category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

    # Re-key the extra strain info by accession instead of strain id.
    strain_acc2other_info = {}
    for strain_id in strain_id2other_info:
        strain_acc2other_info[strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

    if self.output_matrix_type == 1:
        # Transpose everything: SNPs become rows, strains become columns.
        data_matrix = num.transpose(num.array(data_matrix))
        header = ['Chromosomes', 'Positions'] + strain_acc_list
        chromosome_ls = []
        position_ls = []
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            chromosome_ls.append(chromosome)
            position_ls.append(position)
        # The two leading label columns now carry chromosome and position.
        strain_acc_list = chromosome_ls
        category_list = position_ls
        # Strain rows scheduled for removal are columns after the transpose.
        cols_to_be_tossed_out = rows_to_be_tossed_out
        rows_to_be_tossed_out = None
        # NOTE(review): the original reset strain_id2other_info to None here
        # ("make up one"), a dead store since only strain_acc2other_info is
        # passed on. It may have been meant to clear strain_acc2other_info,
        # whose keys are strain accessions and no longer label rows -- confirm.
    else:
        header = ['strain', 'category']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
        cols_to_be_tossed_out = None

    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      rows_to_be_tossed_out=rows_to_be_tossed_out,
                      cols_to_be_tossed_out=cols_to_be_tossed_out,
                      nt_alphabet=self.nt_alphabet,
                      strain_acc2other_info=strain_acc2other_info,
                      delimiter=self.delimiter)