def get_data_matrix(self, db, phenotype_info, list_type_analysis_method_info, where_condition):
    sys.stderr.write("Getting data matrix ...")
    data_matrix = num.zeros([len(list_type_analysis_method_info.list_type_id_analysis_method_id2index),
                             len(phenotype_info.phenotype_method_id2index)], num.float)
    data_matrix[:] = -1
    rows = db.metadata.bind.execute("select r.analysis_method_id, r.phenotype_method_id, c.* from %s order by analysis_method_id" \
                                    % (where_condition))
    min_value = None
    max_value = None
    for row in rows:
        tup = (row.list_type_id, row.analysis_method_id)
        row_index = list_type_analysis_method_info.list_type_id_analysis_method_id2index[tup]
        col_index = phenotype_info.phenotype_method_id2index[row.phenotype_method_id]
        if row.pvalue > 0:
            data_value = -math.log10(row.pvalue)
            if min_value is None or data_value < min_value:
                min_value = data_value
            if max_value is None or data_value > max_value:
                max_value = data_value
        else:
            data_value = -2    #0 pvalue
        data_matrix[row_index, col_index] = data_value
    sys.stderr.write("Done.\n")
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.min_value = min_value
    return_data.max_value = max_value
    return return_data
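#A minimal, standalone sketch (not part of the original class) of the cell encoding used by
#get_data_matrix() above: -1 marks cells with no result, -2 marks a p-value of exactly 0, and
#every other cell stores -log10(pvalue).  encode_pvalue() is a hypothetical helper for illustration.
import math

def encode_pvalue(pvalue):
    if pvalue is None:    #no result for this cell
        return -1
    if pvalue > 0:
        return -math.log10(pvalue)
    return -2    #0 pvalue

if __name__ == '__main__':
    print [encode_pvalue(p) for p in (None, 1e-5, 0.01, 0)]    #roughly [-1, 5.0, 2.0, -2]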
def get_data_matrix(self, db, strain_id_info, target_id_info, QC_method_id, max_mismatch_rate, min_no_of_non_NAs=20):
    """
    2008-08-29
    """
    sys.stderr.write("Getting data matrix ... \n")
    data_matrix = num.zeros([len(strain_id_info.strain_id_ls), len(target_id_info.strain_id_ls)], num.float)
    data_matrix[:] = -1
    i = 0
    block_size = 10000
    query = StockDB.QCCrossMatch.query.filter_by(qc_method_id=QC_method_id).\
        filter(StockDB.QCCrossMatch.no_of_non_NA_pairs > min_no_of_non_NAs).\
        filter(StockDB.QCCrossMatch.mismatch_rate <= max_mismatch_rate)
    rows = query.offset(i).limit(block_size)
    min_value = None
    max_value = None
    while rows.count() != 0:
        for row in rows:
            row_index = strain_id_info.strain_id2index[row.strainid]
            col_index = target_id_info.strain_id2index[row.target_id]
            data_value = row.mismatch_rate
            if data_value >= 0:
                if min_value is None or data_value < min_value:
                    min_value = data_value
                if max_value is None or data_value > max_value:
                    max_value = data_value
            data_matrix[row_index, col_index] = data_value
            i += 1
            if self.report:
                sys.stderr.write("%s\t%s" % ('\x08' * 40, i))
        rows = query.offset(i).limit(block_size)
    sys.stderr.write("Done.\n")
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.min_value = min_value
    return_data.max_value = max_value
    return return_data
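#Sketch of the offset/limit paging pattern used in get_data_matrix() above, written against a
#generic SQLAlchemy-style query object; an illustration under that assumption, not the original
#StockDB code.  Each pass fetches block_size rows and stops once a page comes back empty.
def iter_query_in_blocks(query, block_size=10000):
    offset = 0
    rows = query.offset(offset).limit(block_size)
    while rows.count() != 0:
        for row in rows:
            yield row
            offset += 1
        rows = query.offset(offset).limit(block_size)

#hypothetical usage:
#for row in iter_query_in_blocks(StockDB.QCCrossMatch.query.filter_by(qc_method_id=1)):
#    process(row)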
def getDataMatrix(self, results_method_id2gene_set, results_method_id_info):
    sys.stderr.write("Getting data matrix ...")
    data_matrix = num.zeros([len(results_method_id_info.results_method_id_ls),
                             len(results_method_id_info.results_method_id_ls)], num.float)
    data_matrix[:] = -1
    min_value = None
    max_value = None
    no_of_results = len(results_method_id_info.results_method_id_ls)
    for i in range(no_of_results):
        results_method_id1 = results_method_id_info.results_method_id_ls[i]
        for j in range(i, no_of_results):
            results_method_id2 = results_method_id_info.results_method_id_ls[j]
            row_index = results_method_id_info.results_method_id2index.get(results_method_id1)
            col_index = results_method_id_info.results_method_id2index.get(results_method_id2)
            if results_method_id1 < 0 or results_method_id2 < 0:
                data_value = -3
            elif results_method_id1 in results_method_id2gene_set and results_method_id2 in results_method_id2gene_set:
                data_value = len(results_method_id2gene_set[results_method_id1] & results_method_id2gene_set[results_method_id2])
                if min_value is None or data_value < min_value:
                    min_value = data_value
                if max_value is None or data_value > max_value:
                    max_value = data_value
            else:
                continue    #data_value = -1
            data_matrix[row_index, col_index] = data_value
            data_matrix[col_index, row_index] = data_value
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.min_value = min_value
    return_data.max_value = max_value
    sys.stderr.write("Done.\n")
    return return_data
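#Tiny illustration (made-up data, not from the original module) of the symmetric overlap count
#computed by getDataMatrix() above: each cell holds the size of the intersection of two
#candidate-gene sets, mirrored across the diagonal.
example_id2gene_set = {1: set([10, 11, 12]), 2: set([11, 12, 13])}
overlap = len(example_id2gene_set[1] & example_id2gene_set[2])
print overlap    #2 shared genes; the same value would go into both [i, j] and [j, i]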
def run(self):
    """
    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate
    """
    #database connection and etc
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.user,
                                   password=self.passwd, hostname=self.hostname, database=self.dbname)
    db.setup()
    session = db.session
    session.begin()
    #transaction = session.create_transaction()
    self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
    qm = self.QCMethod_class.query.get(self.QC_method_id)    #2009-5-20
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs
    if self.debug:
        import pdb
        pdb.set_trace()
    readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
    session.save(readme)
    QC_method_id2snps_table = self.QC_method_id2snps_table
    if self.QC_method_id == 0:
        self.cal_independent_NA_rate(db, self.min_probability, readme)
        row_id2NA_mismatch_rate = None
    else:
        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename, ignore_het=qm.ignore_het)
        strain_acc_list = map(int, strain_acc_list)    #it's ecotypeid; cast to integer to be compatible with the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix,
                           snps_table=QC_method_id2snps_table.get(self.QC_method_id),
                           ignore_het=qm.ignore_het)    #category_list is not used. 05/20/09 ignore_het is useless because data_matrix is provided.
        """
        if self.input_dir and os.path.isdir(self.input_dir):
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        """
        if self.input_dir and os.path.isfile(self.input_dir):    #it's a file
            call_info_id2fname = None
        else:
            if self.run_type == 2:    #no filtering on call_info entries that have been QCed
                filter_calls_QCed = 0
            elif self.run_type == 1:
                filter_calls_QCed = 1
                self.max_call_info_mismatch_rate = 1    #don't use this when doing accession-wise QC
            else:
                sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
                sys.exit(5)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id,
                                                    filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,
                                                    min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
            call_info_id2fname = call_data.call_info_id2fname
            call_info_ls_to_return = call_data.call_info_ls_to_return
        if self.run_type == 2:
            snps_name2snps_id = self.get_snps_name2snps_id(db)
        else:
            snps_name2snps_id = None
        if call_info_id2fname:
            if self.one_by_one and self.run_type == 1:    #one_by_one only for QC by accession
                row_id2NA_mismatch_rate = {}
                row_id12row_id2 = {}
                counter = 0
                for call_info_id, value in call_info_id2fname.iteritems():
                    counter += 1
                    print "No", counter
                    tmp_dict = {}
                    tmp_dict[call_info_id] = value
                    pdata = self.read_call_matrix(tmp_dict, self.min_probability)    #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                    row_id2NA_mismatch_rate.update(passingdata.row_id2NA_mismatch_rate)
                    row_id12row_id2.update(passingdata.row_id12row_id2)
                    del pdata
                    if self.debug and counter == 10:
                        break
            else:
                pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)    #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata
        else:    #input file is SNP-by-strain format with a double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = read_data(self.input_dir, double_header=1, ignore_het=qm.ignore_het)
            pdata = PassingData()
            pdata.ecotype_id_ls = header[0][2:]
            pdata.call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            pdata.data_matrix = data_matrix.transpose()
            pdata.header = ['', ''] + snps_name_ls    #fake a header for SNPData
            passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
            row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
            row_id12row_id2 = passingdata.row_id12row_id2
            del pdata
    if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)
    if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
        #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        #row_id2NA_mismatch_rate might be None if it's method 0.
        self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user,
                               self.min_probability, row_id12row_id2, self.call_method_id, readme)
    if self.commit:
        curs.execute("commit")
        session.commit()
    else:
        session.rollback()
    self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate    #for plone to get the data structure
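#Small sketch (made-up values) of the "fake header" step in the SNP-by-strain branch of run()
#above: the matrix is transposed to strain-by-SNP and prefixed with two dummy header columns so
#SNPData accepts it like a regular call matrix.
import numpy
snps_name_ls = ['1_657', '1_3102']
snp_by_strain = numpy.array([[0, 1, 2],
                             [2, 2, 0]])    #2 SNPs x 3 strains
strain_by_snp = snp_by_strain.transpose()    #3 strains x 2 SNPs
fake_header = ['', ''] + snps_name_ls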
def get_data_matrixFromFile(self, db, strain_id_info, target_id_info, QC_method_id, input_fname, max_mismatch_rate, min_no_of_non_NAs=20):
    """
    2008-09-10
        column in input_fname is determined on the fly
    """
    sys.stderr.write("Getting data matrix from %s ... \n" % input_fname)
    data_matrix = num.zeros([len(strain_id_info.strain_id_ls), len(target_id_info.strain_id_ls)], num.float)
    data_matrix[:] = -1
    reader = csv.reader(open(input_fname), delimiter='\t')
    #figure out which variable is in which column
    header = reader.next()
    col_name2index = {}
    for i in range(len(header)):
        column_name = header[i]
        col_name2index[column_name] = i
    min_value = None
    max_value = None
    i = 0
    #columns used to be unpacked in a fixed order (id, strainid, target_id, qc_method_id, mismatch_rate,
    #no_of_mismatches, no_of_non_NA_pairs, readme_id); they are now looked up by name via col_name2index.
    for row in reader:
        strainid = int(row[col_name2index['strainid']])
        target_id = int(row[col_name2index['target_id']])
        qc_method_id = int(row[col_name2index['qc_method_id']])
        mismatch_rate = float(row[col_name2index['mismatch_rate']])
        no_of_mismatches = int(row[col_name2index['no_of_mismatches']])
        no_of_non_NA_pairs = int(row[col_name2index['no_of_non_NA_pairs']])
        if qc_method_id == QC_method_id and no_of_non_NA_pairs >= min_no_of_non_NAs and mismatch_rate <= max_mismatch_rate:
            row_index = strain_id_info.strain_id2index.get(strainid)
            col_index = target_id_info.strain_id2index.get(target_id)
            if row_index is None or col_index is None:
                continue
            data_value = mismatch_rate
            if data_value >= 0:
                if min_value is None or data_value < min_value:
                    min_value = data_value
                if max_value is None or data_value > max_value:
                    max_value = data_value
            data_matrix[row_index, col_index] = data_value
            if QC_method_id == 4:    #149 self-cross-match
                row_index = strain_id_info.strain_id2index.get(target_id)
                col_index = target_id_info.strain_id2index.get(strainid)
                if row_index is not None and col_index is not None:    #guard against ids absent from the index maps
                    data_matrix[row_index, col_index] = data_value
        i += 1
        if self.report and i % 100000 == 0:
            sys.stderr.write("%s\t%s" % ('\x08' * 40, i))
        if self.debug and i > 1000000:
            break
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.min_value = min_value
    return_data.max_value = max_value
    del reader
    sys.stderr.write("Done.\n")
    return return_data
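#Standalone sketch of the header-driven column lookup used by get_data_matrixFromFile() above,
#so rows are parsed by column name rather than fixed position.  File name and column names here
#are examples only.
import csv

def read_named_columns(input_fname, wanted=('strainid', 'target_id', 'mismatch_rate')):
    reader = csv.reader(open(input_fname), delimiter='\t')
    header = reader.next()    #first line holds the column names
    col_name2index = dict((name, index) for index, name in enumerate(header))
    for row in reader:
        yield dict((name, row[col_name2index[name]]) for name in wanted)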
def get_data_matrix(cls, db, row_info, col_info, from_where_clause, need_other_values=False,
                    null_distribution_type_id=2):
    """
    2008-11-04
        get data_matrix_candidate_sample_size_null & data_matrix_candidate_gw_size_null also if need_other_values=True
    """
    sys.stderr.write("Getting data matrix ...")
    data_matrix = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
    data_matrix[:] = -1
    max_no_of_null_data = 200
    if need_other_values:
        data_matrix_candidate_sample_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_candidate_sample_size[:] = -1
        data_matrix_non_candidate_sample_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_non_candidate_sample_size[:] = -1
        data_matrix_candidate_gw_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_candidate_gw_size[:] = -1
        data_matrix_non_candidate_gw_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_non_candidate_gw_size[:] = -1
        data_matrix_candidate_sample_size_null = num.zeros([len(row_info.id2index), len(col_info.id2index), max_no_of_null_data], num.float)
        data_matrix_candidate_sample_size_null[:] = -1
        data_matrix_candidate_gw_size_null = num.zeros([len(row_info.id2index), len(col_info.id2index), max_no_of_null_data], num.float)
        data_matrix_candidate_gw_size_null[:] = -1
    else:
        data_matrix_candidate_sample_size = None
        data_matrix_non_candidate_sample_size = None
        data_matrix_candidate_gw_size = None
        data_matrix_non_candidate_gw_size = None
        data_matrix_candidate_sample_size_null = None
        data_matrix_candidate_gw_size_null = None
    rows = db.metadata.bind.execute("select t.id, t.no_of_top_snps, t.min_distance, t.pvalue, t.candidate_sample_size, \
        t.non_candidate_sample_size, t.candidate_gw_size, t.non_candidate_gw_size %s" % from_where_clause)
    min_value = None
    max_value = None
    for row in rows:
        row_index = row_info.id2index[row.no_of_top_snps]
        col_index = col_info.id2index[row.min_distance]
        if row.pvalue > 0:
            data_value = -math.log10(row.pvalue)
            if min_value is None or data_value < min_value:
                min_value = data_value
            if max_value is None or data_value > max_value:
                max_value = data_value
        else:
            data_value = -2    #0 pvalue
        data_matrix[row_index, col_index] = data_value
        if need_other_values:
            data_matrix_candidate_sample_size[row_index, col_index] = row.candidate_sample_size
            data_matrix_non_candidate_sample_size[row_index, col_index] = row.non_candidate_sample_size
            data_matrix_candidate_gw_size[row_index, col_index] = row.candidate_gw_size
            data_matrix_non_candidate_gw_size[row_index, col_index] = row.non_candidate_gw_size
            null_datas = db.metadata.bind.execute("select candidate_sample_size, candidate_gw_size from %s where observed_id=%s and null_distribution_type_id=%s" % \
                (Stock_250kDB.TopSNPTestRMNullData.table.name, row.id, null_distribution_type_id))
            i = 0
            for null_data in null_datas:
                data_matrix_candidate_sample_size_null[row_index, col_index, i] = null_data.candidate_sample_size
                data_matrix_candidate_gw_size_null[row_index, col_index, i] = null_data.candidate_gw_size
                i += 1
                if i >= max_no_of_null_data:    #no more than this
                    break
    sys.stderr.write("Done.\n")
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.data_matrix_candidate_sample_size = data_matrix_candidate_sample_size
    return_data.data_matrix_non_candidate_sample_size = data_matrix_non_candidate_sample_size
    return_data.data_matrix_candidate_gw_size = data_matrix_candidate_gw_size
    return_data.data_matrix_non_candidate_gw_size = data_matrix_non_candidate_gw_size
    return_data.min_value = min_value
    return_data.max_value = max_value
    return_data.data_matrix_candidate_sample_size_null = data_matrix_candidate_sample_size_null
    return_data.data_matrix_candidate_gw_size_null = data_matrix_candidate_gw_size_null
    return return_data
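#Sketch (hypothetical 3x4 grid) of how the null permutations are stored in get_data_matrix()
#above: each cell gets a fixed max_no_of_null_data slots along a third axis, pre-filled with -1
#so unused slots stay distinguishable from real draws.
import numpy as num
max_no_of_null_data = 200
null_array = num.zeros([3, 4, max_no_of_null_data])
null_array[:] = -1
null_draws = [12.0, 15.0, 9.0]    #e.g. candidate_sample_size values for one (row, col) cell
for k, value in enumerate(null_draws[:max_no_of_null_data]):
    null_array[0, 0, k] = value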
def get_data_matrix(cls, db, row_info, col_info, from_where_clause, need_other_values=False,
                    null_distribution_type_id=2):
    """
    2008-11-04
        get data_matrix_candidate_sample_size_null & data_matrix_candidate_gw_size_null also if need_other_values=True
    """
    sys.stderr.write("Getting data matrix ...")
    data_matrix = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
    data_matrix[:] = -1
    max_no_of_null_data = 100
    if need_other_values:
        data_matrix_candidate_sample_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_candidate_sample_size[:] = -1
        data_matrix_non_candidate_sample_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_non_candidate_sample_size[:] = -1
        data_matrix_candidate_gw_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_candidate_gw_size[:] = -1
        data_matrix_non_candidate_gw_size = num.zeros([len(row_info.id2index), len(col_info.id2index)], num.float)
        data_matrix_non_candidate_gw_size[:] = -1
        data_matrix_candidate_sample_size_null = num.zeros([len(row_info.id2index), len(col_info.id2index), max_no_of_null_data], num.float)
        data_matrix_candidate_sample_size_null[:] = -1
        data_matrix_candidate_gw_size_null = num.zeros([len(row_info.id2index), len(col_info.id2index), max_no_of_null_data], num.float)
        data_matrix_candidate_gw_size_null[:] = -1
    else:
        data_matrix_candidate_sample_size = None
        data_matrix_non_candidate_sample_size = None
        data_matrix_candidate_gw_size = None
        data_matrix_non_candidate_gw_size = None
        data_matrix_candidate_sample_size_null = None
        data_matrix_candidate_gw_size_null = None
    rows = db.metadata.bind.execute("select t.id, t.no_of_top_snps, t.min_distance, t.pvalue, t.candidate_sample_size, \
        t.non_candidate_sample_size, t.candidate_gw_size, t.non_candidate_gw_size %s" % from_where_clause)
    min_value = None
    max_value = None
    for row in rows:
        row_index = row_info.id2index[row.no_of_top_snps]
        col_index = col_info.id2index[row.min_distance]
        if row.pvalue > 0:
            data_value = -math.log10(row.pvalue)
            if min_value is None or data_value < min_value:
                min_value = data_value
            if max_value is None or data_value > max_value:
                max_value = data_value
        else:
            data_value = -2    #0 pvalue
        data_matrix[row_index, col_index] = data_value
        if need_other_values:
            data_matrix_candidate_sample_size[row_index, col_index] = row.candidate_sample_size
            data_matrix_non_candidate_sample_size[row_index, col_index] = row.non_candidate_sample_size
            data_matrix_candidate_gw_size[row_index, col_index] = row.candidate_gw_size
            data_matrix_non_candidate_gw_size[row_index, col_index] = row.non_candidate_gw_size
            """
            null_datas = db.metadata.bind.execute("select candidate_sample_size, candidate_gw_size from %s where observed_id=%s and null_distribution_type_id=%s" % \
                (Stock_250kDB.TopSNPTestRMNullData.table.name, row.id, null_distribution_type_id))
            i = 0
            for null_data in null_datas:
                data_matrix_candidate_sample_size_null[row_index, col_index, i] = null_data.candidate_sample_size
                data_matrix_candidate_gw_size_null[row_index, col_index, i] = null_data.candidate_gw_size
                i += 1
                if i >= max_no_of_null_data:    #no more than this
                    break
            """
    sys.stderr.write("Done.\n")
    return_data = PassingData()
    return_data.data_matrix = data_matrix
    return_data.data_matrix_candidate_sample_size = data_matrix_candidate_sample_size
    return_data.data_matrix_non_candidate_sample_size = data_matrix_non_candidate_sample_size
    return_data.data_matrix_candidate_gw_size = data_matrix_candidate_gw_size
    return_data.data_matrix_non_candidate_gw_size = data_matrix_non_candidate_gw_size
    return_data.min_value = min_value
    return_data.max_value = max_value
    return_data.data_matrix_candidate_sample_size_null = data_matrix_candidate_sample_size_null
    return_data.data_matrix_candidate_gw_size_null = data_matrix_candidate_gw_size_null
    return return_data
def run(self):
    """
    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate
    """
    #database connection and etc
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.user,
                                   password=self.passwd, hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()
    self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
    qm = self.QCMethod_class.query.get(self.QC_method_id)    #2009-5-20
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs
    if self.debug:
        import pdb
        pdb.set_trace()
    readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
    session.add(readme)
    QC_method_id2snps_table = self.QC_method_id2snps_table
    if self.QC_method_id == 0:
        self.cal_independent_NA_rate(db, self.min_probability, readme)
        row_id2NA_mismatch_rate = None
    else:
        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename, ignore_het=qm.ignore_het)
        strain_acc_list = map(int, strain_acc_list)    #it's ecotypeid; cast to integer to be compatible with the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
                           data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id), \
                           ignore_het=qm.ignore_het)    #category_list is not used. 05/20/09 ignore_het is useless because data_matrix is provided.
        """
        if self.input_dir and os.path.isdir(self.input_dir):
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        """
        if self.input_dir and os.path.isfile(self.input_dir):    #it's a file
            call_info_id2fname = None
        else:
            if self.run_type == 2:    #no filtering on call_info entries that have been QCed
                filter_calls_QCed = 0
            elif self.run_type == 1:
                filter_calls_QCed = 1
                self.max_call_info_mismatch_rate = 1    #don't use this when doing accession-wise QC
            else:
                sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
                sys.exit(5)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                                                    filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug, \
                                                    min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
            call_info_id2fname = call_data.call_info_id2fname
            call_info_ls_to_return = call_data.call_info_ls_to_return
        if self.run_type == 2:
            snps_name2snps_id = self.get_snps_name2snps_id(db)
        else:
            snps_name2snps_id = None
        if call_info_id2fname:
            db_id2chr_pos = db.getSNPID2ChrPos()    #2011-22
            from DB_250k2data import DB_250k2Data
            db_id2index = DB_250k2Data.getSNPID2index(call_info_id2fname.values()[0][1], db_id2chr_pos)
            if self.one_by_one and self.run_type == 1:    #one_by_one only for QC by accession
                row_id2NA_mismatch_rate = {}
                row_id12row_id2 = {}
                counter = 0
                for call_info_id, value in call_info_id2fname.iteritems():
                    counter += 1
                    print "No", counter
                    tmp_dict = {}
                    tmp_dict[call_info_id] = value
                    pdata = self.read_call_matrix(tmp_dict, self.min_probability,
                                                  db_id2chr_pos=db_id2chr_pos, db_id2index=db_id2index)
                    #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                    row_id2NA_mismatch_rate.update(passingdata.row_id2NA_mismatch_rate)
                    row_id12row_id2.update(passingdata.row_id12row_id2)
                    del pdata
                    if self.debug and counter == 10:
                        break
            else:
                pdata = self.read_call_matrix(call_info_id2fname, self.min_probability,
                                              db_id2chr_pos=db_id2chr_pos, db_id2index=db_id2index)
                #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata
        else:    #input file is SNP-by-strain format with a double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = read_data(self.input_dir, double_header=1, ignore_het=qm.ignore_het)
            pdata = PassingData()
            pdata.ecotype_id_ls = header[0][2:]
            pdata.call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            pdata.data_matrix = data_matrix.transpose()
            pdata.header = ['', ''] + snps_name_ls    #fake a header for SNPData
            passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
            row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
            row_id12row_id2 = passingdata.row_id12row_id2
            del pdata
    if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)
    if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
        #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        #row_id2NA_mismatch_rate might be None if it's method 0.
        self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
                               row_id12row_id2, self.call_method_id, readme)
    if self.commit:
        curs.execute("commit")
        session.commit()
    else:
        session.rollback()
    self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate    #for plone to get the data structure