def run(self): if self.debug: import pdb pdb.set_trace() db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, hostname=self.hostname, database=self.dbname) db.setup(create_tables=False) session = db.session session.begin() self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod) header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename) strain_acc_list = map(int, strain_acc_list) #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \ data_matrix=data_matrix) #category_list is not used. readme = formReadmeObj(sys.argv, self.ad, StockDB.README) session.save(readme) import MySQLdb conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.db_user, passwd = self.db_passwd) curs = conn.cursor() from dbSNP2data import dbSNP2data snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(curs, StockDB.Calls.table.name, StockDB.SNPs.table.name) strain_info_data = self.get_strain_id_info(self.QC_method_id) data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name) strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list] category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list] header = ['ecotypeid', 'strainid'] for snp_id in snp_id_list: snp_name, chromosome, position = snp_id2info[snp_id] header.append(snp_name) snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \ snps_table='stock.snps') #snps_table is set to the stock_250k snps_table twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \ QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug) if self.run_type==1: row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise() elif self.run_type==2: #twoSNPData.save_col_wise(session, readme) #2008-08-18 need to implement a new one for 149SNP row_id2NA_mismatch_rate = {} else: sys.stderr.write("run_type=%s is not supported.\n"%self.run_type) sys.exit(5) if self.output_fname and self.run_type==1 and row_id2NA_mismatch_rate: self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname) if self.run_type==1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate: #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid) #row_id2NA_mismatch_rate might be None if it's method 0. self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user, \ twoSNPData.row_id12row_id2, readme) if self.commit: session.commit() else: session.rollback()
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0): """ 2009-9-23 add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData. However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db. 2008-09-10 if self.input_fname is given, get 149SNP data from it , instead of database 2008-8-28 split out of run() so that MpiQC149CrossMatch could call this easily """ import MySQLdb conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.db_user, passwd = self.db_passwd) curs = conn.cursor() if self.input_fname: header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname) else: from dbSNP2data import dbSNP2data snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(curs, StockDB.Calls.table.name, StockDB.SNPs.table.name) strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False) data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name) strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list] #tg_ecotypeid category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list] #strainid header = ['ecotypeid', 'strainid'] for snp_id in snp_id_list: snp_name, chromosome, position = snp_id2info[snp_id] header.append(snp_name) snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \ snps_table='stock.snps') #snps_table is set to the stock_250k snps_table if self.QC_method_id==4: snpData2 = snpData1 else: self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod) header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename) strain_acc_list = map(int, strain_acc_list) #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \ data_matrix=data_matrix) #category_list is not used to facilitate row-id matching twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, \ QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug,\ max_mismatch_rate=max_mismatch_rate, min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report) return twoSNPData
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0): """ 2009-9-23 add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData. However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db. 2008-09-10 if self.input_fname is given, get 149SNP data from it , instead of database 2008-8-28 split out of run() so that MpiQC149CrossMatch could call this easily """ import MySQLdb conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd) curs = conn.cursor() if self.input_fname: header, strain_acc_list, category_list, data_matrix = read_data( self.input_fname) else: from dbSNP2data import dbSNP2data snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m( curs, StockDB.Calls.table.name, StockDB.SNPs.table.name) strain_info_data = self.get_strain_id_info( self.QC_method_id, ignore_strains_with_qc=False) data_matrix = self.get_data_matrix( db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name) strain_acc_list = [ strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list ] #tg_ecotypeid category_list = [ strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list ] #strainid header = ['ecotypeid', 'strainid'] for snp_id in snp_id_list: snp_name, chromosome, position = snp_id2info[snp_id] header.append(snp_name) snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \ snps_table='stock.snps') #snps_table is set to the stock_250k snps_table if self.QC_method_id == 4: snpData2 = snpData1 else: self.cmp_data_filename = self.findOutCmpDataFilename( self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod) header, strain_acc_list, category_list, data_matrix = read_data( self.cmp_data_filename) strain_acc_list = map( int, strain_acc_list ) #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \ data_matrix=data_matrix) #category_list is not used to facilitate row-id matching twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, \ QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug,\ max_mismatch_rate=max_mismatch_rate, min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report) return twoSNPData
def run(self): if self.debug: import pdb pdb.set_trace() db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, hostname=self.hostname, database=self.dbname) db.setup(create_tables=False) session = db.session session.begin() self.cmp_data_filename = self.findOutCmpDataFilename( self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod) header, strain_acc_list, category_list, data_matrix = read_data( self.cmp_data_filename) strain_acc_list = map( int, strain_acc_list ) #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \ data_matrix=data_matrix) #category_list is not used. readme = formReadmeObj(sys.argv, self.ad, StockDB.README) session.save(readme) import MySQLdb conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd) curs = conn.cursor() from dbSNP2data import dbSNP2data snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m( curs, StockDB.Calls.table.name, StockDB.SNPs.table.name) strain_info_data = self.get_strain_id_info(self.QC_method_id) data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name) strain_acc_list = [ strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list ] category_list = [ strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list ] header = ['ecotypeid', 'strainid'] for snp_id in snp_id_list: snp_name, chromosome, position = snp_id2info[snp_id] header.append(snp_name) snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \ snps_table='stock.snps') #snps_table is set to the stock_250k snps_table twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \ QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug) if self.run_type == 1: row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise() elif self.run_type == 2: #twoSNPData.save_col_wise(session, readme) #2008-08-18 need to implement a new one for 149SNP row_id2NA_mismatch_rate = {} else: sys.stderr.write("run_type=%s is not supported.\n" % self.run_type) sys.exit(5) if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate: self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname) if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate: #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid) #row_id2NA_mismatch_rate might be None if it's method 0. self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user, \ twoSNPData.row_id12row_id2, readme) if self.commit: session.commit() else: session.rollback()