def run(self):
    """
    2008-06-02
        Load the two input files as SNPData, reorder the rows of the 2nd
        dataset so that they follow the 1st one and write the reordered
        dataset to self.output_fname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    matching_mode = self.row_matching_by_which_value

    # the 1st dataset is read with ignore_2nd_column=1 only in mode 0
    first_kwargs = dict(input_fname=self.input_fname1, turn_into_array=1)
    if matching_mode == 0:
        first_kwargs['ignore_2nd_column'] = 1
    first_dataset = SNPData(**first_kwargs)
    second_dataset = SNPData(input_fname=self.input_fname2, turn_into_array=1)

    # modes 1 and 2 are shifted down by one for TwoSNPData; any other mode is passed as None
    row_matching = matching_mode - 1 if matching_mode in (1, 2) else None

    pair = TwoSNPData(
        SNPData1=first_dataset,
        SNPData2=second_dataset,
        debug=self.debug,
        row_matching_by_which_value=row_matching,
    )
    pair.order2ndSNPDataRowsSameAs1stSNPData().tofile(self.output_fname)
def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs, session, readme):
    """
    2008-08-16
        split from run() to enable one_by_one option

    Wrap pdata into a SNPData, pair it with snpData2 in a TwoSNPData and run
    the comparison selected by self.run_type:
        1: twoSNPData.cmp_row_wise()
        2: twoSNPData.save_col_wise(session, readme); the mismatch dictionary stays empty
        anything else: a message is written to stderr and the program exits with status 5

    Returns a PassingData with the attributes row_id2NA_mismatch_rate and
    row_id12row_id2.
    """
    # ecotype_id_ls and call_info_id_ls are swapped on purpose: strain_acc_list=ecotype_id_ls
    array_kwargs = dict(
        header=pdata.header,
        strain_acc_list=pdata.ecotype_id_ls,
        category_list=pdata.call_info_id_ls,
        data_matrix=pdata.data_matrix,
        min_probability=self.min_probability,
        call_method_id=self.call_method_id,
        col_id2id=snps_name2snps_id,
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
        snps_table='stock_250k.snps',  # snps_table is set to the stock_250k snps_table
    )
    array_data = SNPData(**array_kwargs)
    pair = TwoSNPData(
        SNPData1=array_data,
        SNPData2=snpData2,
        curs=curs,
        QC_method_id=self.QC_method_id,
        user=self.user,
        row_matching_by_which_value=0,
        debug=self.debug,
    )

    # the run_type check happens only after both objects have been built, as before
    if self.run_type not in (1, 2):
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)

    if self.run_type == 1:
        mismatch_by_row = pair.cmp_row_wise()
    else:
        pair.save_col_wise(session, readme)
        mismatch_by_row = {}

    outcome = PassingData()
    outcome.row_id2NA_mismatch_rate = mismatch_by_row
    outcome.row_id12row_id2 = pair.row_id12row_id2
    return outcome
def run(self):
    """
    Compare the SNP calls stored in the stock database (snpData1) against the
    dataset read from self.cmp_data_filename (snpData2).

    Steps:
        1. set up the StockDB session and resolve self.cmp_data_filename
           through findOutCmpDataFilename()
        2. load the comparison file and the database calls into SNPData objects
        3. run_type 1: compute row-wise NA/mismatch rates via cmp_row_wise();
           run_type 2: nothing is computed (see 2008-08-18 note below);
           any other run_type writes a message to stderr and exits with status 5
        4. for run_type 1, write the rates to self.output_fname (if given) and
           submit them through submit_to_call_QC() when self.commit is set and
           self.input_dir is empty
        5. commit the session if self.commit is set, otherwise roll it back

    The raw MySQLdb connection opened here is closed before the method
    returns (also when it exits through sys.exit()).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                         hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id,
                                                         StockDB.QCMethod)
    # category_list of the comparison file is not used.
    header, strain_acc_list, _category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
    # A list comprehension is used instead of map() so that a real list (not a
    # one-shot iterator, as map() returns under Python 3) is handed to SNPData.
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)

    readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
    session.save(readme)

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
    try:
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index,
                                           StockDB.Calls.table.name)
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id]
                           for strain_id in strain_info_data.strain_id_list]
        category_list = [strain_info_data.strain_id2category[strain_id]
                         for strain_id in strain_info_data.strain_id_list]

        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            # only the name goes into the header; chromosome and position are not needed here
            snp_name, _chromosome, _position = snp_id2info[snp_id]
            header.append(snp_name)

        # snps_table is set to 'stock.snps' for the database-derived dataset
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                           data_matrix=data_matrix, snps_table='stock.snps')

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                                QC_method_id=self.QC_method_id, user=self.db_user,
                                row_matching_by_which_value=0, debug=self.debug)

        if self.run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        elif self.run_type == 2:
            # 2008-08-18 twoSNPData.save_col_wise(session, readme) is not called here:
            # a new one needs to be implemented for 149SNP
            row_id2NA_mismatch_rate = {}
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            # if self.input_dir is given, no db submission.
            # row_id2NA_mismatch_rate might be None/empty, in which case nothing is submitted.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user,
                                   twoSNPData.row_id12row_id2, readme)

        if self.commit:
            session.commit()
        else:
            session.rollback()
    finally:
        # neither the connection nor its cursor is referenced outside this method
        conn.close()
def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs, session, readme):
    """
    2008-08-16
        split from run() to enable one_by_one option

    Build a SNPData out of pdata, compare it against snpData2 through a
    TwoSNPData and return the outcome in a PassingData
    (attributes: row_id2NA_mismatch_rate, row_id12row_id2).

    self.run_type selects the comparison:
        1: cmp_row_wise()
        2: save_col_wise(session, readme), leaving the mismatch dictionary empty
        otherwise: an error goes to stderr and the program exits with status 5
    """
    # swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData:
    # strain_acc_list=ecotype_id_ls, category_list=call_info_id_ls
    array_kwargs = dict(
        header=pdata.header,
        strain_acc_list=pdata.ecotype_id_ls,
        category_list=pdata.call_info_id_ls,
        data_matrix=pdata.data_matrix,
        min_probability=self.min_probability,
        call_method_id=self.call_method_id,
        col_id2id=snps_name2snps_id,
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
        snps_table="stock_250k.snps",  # snps_table is set to the stock_250k snps_table
    )
    array_data = SNPData(**array_kwargs)
    pair = TwoSNPData(
        SNPData1=array_data,
        SNPData2=snpData2,
        curs=curs,
        QC_method_id=self.QC_method_id,
        user=self.user,
        row_matching_by_which_value=0,
        debug=self.debug,
    )

    # validation of run_type stays after the construction of both objects
    if self.run_type not in (1, 2):
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)

    if self.run_type == 1:
        mismatch_by_row = pair.cmp_row_wise()
    else:
        pair.save_col_wise(session, readme)
        mismatch_by_row = {}

    outcome = PassingData()
    outcome.row_id2NA_mismatch_rate = mismatch_by_row
    outcome.row_id12row_id2 = pair.row_id12row_id2
    return outcome
def plone_run(self, min_call_info_mismatch_rate=0.1):
    """
    Cross-match the arrays listed in self.call_info_id_ls against the
    comparison dataset; results are submitted to the database by
    TwoSNPData.cal_row_id2pairwise_dist() along the way.

    Argument:
        min_call_info_mismatch_rate: forwarded to get_call_info_id2fname()

    Returns None in every case (also when no usable array is found, after
    writing "No good arrays." to stderr). Exits with status 3 when no
    comparison filename can be determined for a non-zero QC_method_id.

    2009-6-9
        pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
    2009-4-13
        add min_call_info_mismatch_rate
    2009-2-5
        add "create_tables=False" to db.setup()
    2008-07-02
        fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
    2008-07-01
        adjust to the newest functions in QC_250k.py
    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    # the cursor is kept on the instance, so the connection is intentionally left open
    self.curs = curs

    # database connection and etc
    db = Stock_250kDB.Stock_250kDB(username=self.user, password=self.passwd, hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    # if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
    qm = QCMethod.query.get(self.QC_method_id)
    if not self.cmp_data_filename and self.QC_method_id != 0:
        if qm.data_description:
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls) > 1:
                # reuse the split result; the text after the first '=' is the filename
                self.cmp_data_filename = data_description_ls[1].strip()

    # after db query, cmp_data_filename is still nothing, exit program.
    if not self.cmp_data_filename and self.QC_method_id != 0:
        sys.stderr.write("cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n")
        sys.exit(3)

    # category_list of the comparison file is not used.
    header, strain_acc_list, _category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
    # A list comprehension replaces map() so that SNPData receives a real list
    # even under Python 3, where map() returns a one-shot iterator.
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix,
                       snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
                       ignore_het=qm.ignore_het)

    if self.input_dir:
        # 04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        # no submission to db
        call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
    else:
        call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id,
                                                filter_calls_QCed=0, max_call_info_mismatch_rate=1,
                                                min_call_info_mismatch_rate=min_call_info_mismatch_rate,
                                                debug=self.debug)
        call_info_id2fname = call_data.call_info_id2fname

    # 2008-07-01 pick the call_info_ids to be handled
    # NOTE(review): the indentation of this section was not recoverable from the
    # file; it is assumed to apply to both branches above -- confirm.
    new_call_info_id2fname = {}
    for call_info_id_wanted in self.call_info_id_ls:
        if call_info_id_wanted in call_info_id2fname:
            new_call_info_id2fname[call_info_id_wanted] = call_info_id2fname[call_info_id_wanted]
        elif self.report:
            sys.stderr.write("%s not in call_info_id2fname.\n" % (call_info_id_wanted))
    call_info_id2fname = new_call_info_id2fname

    if call_info_id2fname:
        pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
        header = pdata.header
        call_info_id_ls = pdata.call_info_id_ls
        ecotype_id_ls = pdata.ecotype_id_ls
        data_matrix = pdata.data_matrix
    elif self.input_dir:
        # 2008-07-02 input file is SNP by strain format. double header (1st two lines)
        header, snps_name_ls, _category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_dir, double_header=1)
        ecotype_id_ls = header[0][2:]
        call_info_id_ls = header[1][2:]
        # transpose the matrix read from the SNP-by-strain file before it goes into SNPData
        data_matrix = numpy.array(data_matrix).transpose()
        header = ['', ''] + snps_name_ls  # fake a header for SNPData
    else:
        # 2008-07-02
        sys.stderr.write("No good arrays.\n")
        return None

    # swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
    # snps_table is set to the stock_250k snps_table; no column-id translation (col_id2id=None)
    snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list=call_info_id_ls,
                       data_matrix=data_matrix, min_probability=self.min_probability,
                       call_method_id=self.call_method_id, col_id2id=None,
                       max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
                       snps_table='stock_250k.snps')

    # 2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db.
    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                            QC_method_id=self.QC_method_id, user=self.user,
                            row_matching_by_which_value=0, debug=self.debug,
                            max_mismatch_rate=self.max_mismatch_rate,
                            min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)

    # 2008-05-01 create a cross match table temporarily
    twoSNPData.qc_cross_match_table = 'qc_cross_match'
    twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
    twoSNPData.cal_row_id2pairwise_dist()  # database submission is done along.

    # no row_id2NA_mismatch_rate is computed by this method
    return None
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
    """
    Build and return the TwoSNPData used for cross-matching.

    Arguments:
        db: database object handed to self.get_data_matrix()
        max_mismatch_rate, min_no_of_non_NA_pairs, report: passed to TwoSNPData unchanged

    snpData1 comes from self.input_fname when it is given, otherwise from the
    database. snpData2 is snpData1 itself when self.QC_method_id is 4,
    otherwise it is read from the file found by findOutCmpDataFilename().

    A MySQLdb connection is opened only when the data has to be fetched from
    the database, and it is closed as soon as the SNP index has been read.

    2009-9-23
        add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
        However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is not defined
        and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
    2008-09-10
        if self.input_fname is given, get 149SNP data from it, instead of database
    2008-8-28
        split out of run() so that MpiQC149CrossMatch could call this easily
    """
    if self.input_fname:
        # NOTE(review): ids read here are not cast to int, unlike the ones of
        # snpData2 below -- confirm row matching works when QC_method_id != 4.
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    else:
        import MySQLdb
        from dbSNP2data import dbSNP2data
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
        try:
            # the cursor is only needed for this single call
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        finally:
            conn.close()

        strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False)
        data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index,
                                           StockDB.Calls.table.name)
        # tg_ecotypeid
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id]
                           for strain_id in strain_info_data.strain_id_list]
        # strainid
        category_list = [strain_info_data.strain_id2category[strain_id]
                         for strain_id in strain_info_data.strain_id_list]

        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            # only the name goes into the header
            snp_name, _chromosome, _position = snp_id2info[snp_id]
            header.append(snp_name)

    # snps_table is set to 'stock.snps'
    snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                       data_matrix=data_matrix, snps_table='stock.snps')

    if self.QC_method_id == 4:
        # QC method 4 compares the dataset against itself
        snpData2 = snpData1
    else:
        self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id,
                                                             StockDB.QCMethod)
        header, strain_acc_list, _category_list, data_matrix = read_data(self.cmp_data_filename)
        # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
        # A list comprehension replaces map(), which returns a one-shot iterator under Python 3.
        strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
        # category_list is not used to facilitate row-id matching
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)

    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                            QC_method_id=self.QC_method_id, user=self.db_user,
                            row_matching_by_which_value=0, debug=self.debug,
                            max_mismatch_rate=max_mismatch_rate,
                            min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report)
    return twoSNPData
def doFilter(self, snpData, snpData_qc_strain, snpData_qc_snp, min_call_probability,
             max_call_mismatch_rate, max_call_NA_rate, max_snp_mismatch_rate, max_snp_NA_rate,
             npute_window_size, output_dir=None):
    """
    Filter snpData by row and column quality, merge it with snpData_qc_snp,
    impute the result and collect QC statistics before and after imputation.

    Arguments:
        snpData: the SNPData to be filtered
        snpData_qc_strain: SNPData used for the row-wise comparisons (cmp_row_wise)
            and for the before/after-imputation statistics
        snpData_qc_snp: SNPData used for the column-wise comparison and the merge
        min_call_probability: only recorded in the result and in the output filename
        max_call_mismatch_rate: threshold for SNPData.removeRowsByMismatchRate()
        max_call_NA_rate: threshold for SNPData.removeRowsByNARate()
        max_snp_mismatch_rate: threshold for SNPData.removeColsByMismatchRate()
        max_snp_NA_rate: threshold for SNPData.removeColsByNARate()
        npute_window_size: cast to int and passed to NPUTE.samplingImpute()
        output_dir: if given, the matrix is written there before and after imputation

    Returns a list holding one PassingData with the filter counts, the
    parameters used and the NA/mismatch rates before imputation
    (row_id2NA_mismatch_rate0, col_id2NA_mismatch_rate0). The rates after
    imputation (row_id2NA_mismatch_rate1, col_id2NA_mismatch_rate1) are only
    set when more than 5 rows are left, i.e. when imputation was carried out.

    2009-10-11
        replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup.
    2008-12-22
        replace '=' and ',' with '_' in the output filename
    2008-05-19
        matrix_ls has to be of length >0 before concatenation
    2008-05-19
        use SNPData structure
    2008-05-18
        add onlyCommon=True to FilterAccessions.filterByError()
    2008-05-17
        add argument output_dir. if it's available, output data matrix before and after imputation
    2008-05-12
        add
        qcdata.no_of_accessions_filtered_by_mismatch
        qcdata.no_of_accessions_filtered_by_na
        qcdata.no_of_snps_filtered_by_mismatch
        qcdata.no_of_snps_filtered_by_na
        qcdata.no_of_monomorphic_snps_removed
    2008-05-11
        split up from computing_node_handler
    """
    qcdata = PassingData()

    # --- row filtering, against snpData_qc_strain ---
    twoSNPData = TwoSNPData(SNPData1=snpData, SNPData2=snpData_qc_strain,
                            row_matching_by_which_value=0, debug=self.debug)
    row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
    del twoSNPData

    newSnpData = SNPData.removeRowsByMismatchRate(snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
    # Each row counter is read from its own attribute, mirroring the column
    # counters below (no_of_cols_filtered_by_mismatch / no_of_cols_filtered_by_na).
    # NOTE(review): previously both counters read `no_of_rows_removed`; confirm
    # the attribute names against SNPData.
    qcdata.no_of_accessions_filtered_by_mismatch = newSnpData.no_of_rows_filtered_by_mismatch

    newSnpData = SNPData.removeRowsByNARate(newSnpData, max_call_NA_rate)
    qcdata.no_of_accessions_filtered_by_na = newSnpData.no_of_rows_filtered_by_na

    # --- column filtering, against snpData_qc_snp ---
    twoSNPData = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_snp,
                            row_matching_by_which_value=0, debug=self.debug)
    col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
    del twoSNPData

    newSnpData = SNPData.removeColsByMismatchRate(newSnpData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
    qcdata.no_of_snps_filtered_by_mismatch = newSnpData.no_of_cols_filtered_by_mismatch

    newSnpData = SNPData.removeColsByNARate(newSnpData, max_snp_NA_rate)
    qcdata.no_of_snps_filtered_by_na = newSnpData.no_of_cols_filtered_by_na

    # --- merge with snpData_qc_snp, then drop monomorphic columns ---
    twoSNPData = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_snp,
                            row_matching_by_which_value=0, debug=self.debug)
    newSnpData = twoSNPData.mergeTwoSNPData(priority=2)
    del twoSNPData

    newSnpData = SNPData.removeMonomorphicCols(newSnpData)
    qcdata.no_of_monomorphic_snps_removed = newSnpData.no_of_monomorphic_cols

    if output_dir:
        # output data here
        if not os.path.isdir(output_dir):
            try:
                os.makedirs(output_dir)
            except OSError:
                # tolerate the directory having appeared in the meantime
                # (e.g. created by another process); re-raise anything else
                if not os.path.isdir(output_dir):
                    raise
        output_fname_prefix_ls = ['min_oligo_call_probability_%s' % min_call_probability,
                                  'max_array_mismatch_rate_%s' % max_call_mismatch_rate,
                                  'max_array_NA_rate_%s' % max_call_NA_rate,
                                  'max_snp_mismatch_rate_%s' % max_snp_mismatch_rate,
                                  'max_snp_NA_rate_%s' % max_snp_NA_rate,
                                  'npute_window_size_%s' % npute_window_size]
        output_fname = os.path.join(output_dir, '_'.join(output_fname_prefix_ls + ['before_imputation.tsv']))
        newSnpData.tofile(output_fname)

    # --- statistics before imputation ---
    twoSNPData0 = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_strain,
                             row_matching_by_which_value=0)
    row_id2NA_mismatch_rate0 = twoSNPData0.cmp_row_wise()
    col_id2NA_mismatch_rate0 = twoSNPData0.cmp_col_wise()
    del twoSNPData0

    result = []
    # imputation is only attempted when more than 5 rows are left
    if len(newSnpData.row_id_ls) > 5:
        snps_name_ls = newSnpData.col_id_ls
        # 2009-10-8 use NPUTE.samplingImpute() (the per-chromosome imputeData() loop was replaced by it)
        imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(
            snps_name_ls, newSnpData.data_matrix,
            input_file_format=1, input_NA_char=0, lower_case_for_imputation=False,
            npute_window_size=int(npute_window_size),
            no_of_accessions_per_sampling=300, coverage=3)
        snpData_imputed = SNPData(row_id_ls=newSnpData.row_id_ls, col_id_ls=new_snps_name_ls,
                                  data_matrix=imputed_matrix)

        if output_dir:
            # 2008-05-16 write the data out; the filename prefix was built for the before_imputation output
            output_fname = os.path.join(output_dir, '_'.join(output_fname_prefix_ls + ['after_imputation.tsv']))
            snpData_imputed.tofile(output_fname)

        # --- statistics after imputation ---
        twoSNPData1 = TwoSNPData(SNPData1=snpData_imputed, SNPData2=snpData_qc_strain,
                                 row_matching_by_which_value=0)
        qcdata.row_id2NA_mismatch_rate1 = twoSNPData1.cmp_row_wise()
        qcdata.col_id2NA_mismatch_rate1 = twoSNPData1.cmp_col_wise()
        del twoSNPData1, snpData_imputed
    # otherwise qcdata.row_id2NA_mismatch_rate1 / col_id2NA_mismatch_rate1 are left unset
    del newSnpData

    qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
    qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0

    # record the parameters this result was produced with
    qcdata.min_call_probability = min_call_probability
    qcdata.max_call_mismatch_rate = max_call_mismatch_rate
    qcdata.max_call_NA_rate = max_call_NA_rate
    qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
    qcdata.max_snp_NA_rate = max_snp_NA_rate
    qcdata.npute_window_size = npute_window_size
    result.append(qcdata)
    return result
def run(self):
    """
    Compare the SNP calls stored in the stock database (snpData1) against the
    dataset read from self.cmp_data_filename (snpData2).

    Steps:
        1. set up the StockDB session and resolve self.cmp_data_filename
           through findOutCmpDataFilename()
        2. load the comparison file and the database calls into SNPData objects
        3. run_type 1: compute row-wise NA/mismatch rates via cmp_row_wise();
           run_type 2: nothing is computed (see 2008-08-18 note below);
           any other run_type writes a message to stderr and exits with status 5
        4. for run_type 1, write the rates to self.output_fname (if given) and
           submit them through submit_to_call_QC() when self.commit is set and
           self.input_dir is empty
        5. commit the session if self.commit is set, otherwise roll it back

    The raw MySQLdb connection opened here is closed before the method
    returns (also when it exits through sys.exit()).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                         hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id,
                                                         StockDB.QCMethod)
    # category_list of the comparison file is not used.
    header, strain_acc_list, _category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
    # A list comprehension is used instead of map() so that a real list (not a
    # one-shot iterator, as map() returns under Python 3) is handed to SNPData.
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)

    readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
    session.save(readme)

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
    try:
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index,
                                           StockDB.Calls.table.name)
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id]
                           for strain_id in strain_info_data.strain_id_list]
        category_list = [strain_info_data.strain_id2category[strain_id]
                         for strain_id in strain_info_data.strain_id_list]

        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            # only the name goes into the header; chromosome and position are not needed here
            snp_name, _chromosome, _position = snp_id2info[snp_id]
            header.append(snp_name)

        # snps_table is set to 'stock.snps' for the database-derived dataset
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                           data_matrix=data_matrix, snps_table='stock.snps')

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                                QC_method_id=self.QC_method_id, user=self.db_user,
                                row_matching_by_which_value=0, debug=self.debug)

        if self.run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        elif self.run_type == 2:
            # 2008-08-18 twoSNPData.save_col_wise(session, readme) is not called here:
            # a new one needs to be implemented for 149SNP
            row_id2NA_mismatch_rate = {}
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            # if self.input_dir is given, no db submission.
            # row_id2NA_mismatch_rate might be None/empty, in which case nothing is submitted.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user,
                                   twoSNPData.row_id12row_id2, readme)

        if self.commit:
            session.commit()
        else:
            session.rollback()
    finally:
        # neither the connection nor its cursor is referenced outside this method
        conn.close()
def plone_run(self, min_call_info_mismatch_rate=0.1):
    """
    Cross-match the arrays listed in self.call_info_id_ls against the
    comparison dataset; results are submitted to the database by
    TwoSNPData.cal_row_id2pairwise_dist() along the way.

    Argument:
        min_call_info_mismatch_rate: forwarded to get_call_info_id2fname()

    Returns None in every case (also when no usable array is found, after
    writing "No good arrays." to stderr). Exits with status 3 when no
    comparison filename can be determined for a non-zero QC_method_id.

    2009-6-9
        pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
    2009-4-13
        add min_call_info_mismatch_rate
    2009-2-5
        add "create_tables=False" to db.setup()
    2008-07-02
        fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
    2008-07-01
        adjust to the newest functions in QC_250k.py
    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    # the cursor is kept on the instance, so the connection is intentionally left open
    self.curs = curs

    # database connection and etc
    db = Stock_250kDB.Stock_250kDB(username=self.user, password=self.passwd, hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    # if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
    qm = QCMethod.query.get(self.QC_method_id)
    if not self.cmp_data_filename and self.QC_method_id != 0:
        if qm.data_description:
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls) > 1:
                # reuse the split result; the text after the first '=' is the filename
                self.cmp_data_filename = data_description_ls[1].strip()

    # after db query, cmp_data_filename is still nothing, exit program.
    if not self.cmp_data_filename and self.QC_method_id != 0:
        sys.stderr.write("cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n")
        sys.exit(3)

    # category_list of the comparison file is not used.
    header, strain_acc_list, _category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
    # A list comprehension replaces map() so that SNPData receives a real list
    # even under Python 3, where map() returns a one-shot iterator.
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix,
                       snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
                       ignore_het=qm.ignore_het)

    if self.input_dir:
        # 04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        # no submission to db
        call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
    else:
        call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id,
                                                filter_calls_QCed=0, max_call_info_mismatch_rate=1,
                                                min_call_info_mismatch_rate=min_call_info_mismatch_rate,
                                                debug=self.debug)
        call_info_id2fname = call_data.call_info_id2fname

    # 2008-07-01 pick the call_info_ids to be handled
    # NOTE(review): the indentation of this section was not recoverable from the
    # file; it is assumed to apply to both branches above -- confirm.
    new_call_info_id2fname = {}
    for call_info_id_wanted in self.call_info_id_ls:
        if call_info_id_wanted in call_info_id2fname:
            new_call_info_id2fname[call_info_id_wanted] = call_info_id2fname[call_info_id_wanted]
        elif self.report:
            sys.stderr.write("%s not in call_info_id2fname.\n" % (call_info_id_wanted))
    call_info_id2fname = new_call_info_id2fname

    if call_info_id2fname:
        pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
        header = pdata.header
        call_info_id_ls = pdata.call_info_id_ls
        ecotype_id_ls = pdata.ecotype_id_ls
        data_matrix = pdata.data_matrix
    elif self.input_dir:
        # 2008-07-02 input file is SNP by strain format. double header (1st two lines)
        header, snps_name_ls, _category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_dir, double_header=1)
        ecotype_id_ls = header[0][2:]
        call_info_id_ls = header[1][2:]
        # transpose the matrix read from the SNP-by-strain file before it goes into SNPData
        data_matrix = numpy.array(data_matrix).transpose()
        header = ['', ''] + snps_name_ls  # fake a header for SNPData
    else:
        # 2008-07-02
        sys.stderr.write("No good arrays.\n")
        return None

    # swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
    # snps_table is set to the stock_250k snps_table; no column-id translation (col_id2id=None)
    snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list=call_info_id_ls,
                       data_matrix=data_matrix, min_probability=self.min_probability,
                       call_method_id=self.call_method_id, col_id2id=None,
                       max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
                       snps_table='stock_250k.snps')

    # 2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db.
    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                            QC_method_id=self.QC_method_id, user=self.user,
                            row_matching_by_which_value=0, debug=self.debug,
                            max_mismatch_rate=self.max_mismatch_rate,
                            min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)

    # 2008-05-01 create a cross match table temporarily
    twoSNPData.qc_cross_match_table = 'qc_cross_match'
    twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
    twoSNPData.cal_row_id2pairwise_dist()  # database submission is done along.

    # no row_id2NA_mismatch_rate is computed by this method
    return None
def doFilter(
    self,
    snpData,
    snpData_qc_strain,
    snpData_qc_snp,
    min_call_probability,
    max_call_mismatch_rate,
    max_call_NA_rate,
    max_snp_mismatch_rate,
    max_snp_NA_rate,
    npute_window_size,
    output_dir=None,
):
    """
    Filter a SNP matrix against two QC datasets, impute it, and collect QC statistics.

    Processing order:
        1. Compare snpData row-wise with snpData_qc_strain, then drop rows by
           mismatch rate (max_call_mismatch_rate) and by NA rate (max_call_NA_rate).
        2. Compare the result column-wise with snpData_qc_snp, then drop columns by
           mismatch rate (max_snp_mismatch_rate) and by NA rate (max_snp_NA_rate).
        3. Merge the result with snpData_qc_snp (mergeTwoSNPData(priority=2)) and
           remove monomorphic columns.
        4. If output_dir is given, write the matrix before imputation.
        5. Compare the filtered matrix row- and column-wise with snpData_qc_strain
           (stored as the "*_rate0" attributes).
        6. If more than 5 rows are left, impute via NPUTE.samplingImpute(), write the
           imputed matrix if output_dir is given, and compare it with
           snpData_qc_strain again (stored as the "*_rate1" attributes).

    Args:
        snpData: SNPData to be filtered.
        snpData_qc_strain: SNPData used for row-wise comparison (steps 1, 5, 6).
        snpData_qc_snp: SNPData used for column-wise comparison and merging (steps 2, 3).
        min_call_probability: not used for filtering here; only embedded in the
            output filename and recorded on the returned object.
        max_call_mismatch_rate: threshold passed to SNPData.removeRowsByMismatchRate().
        max_call_NA_rate: threshold passed to SNPData.removeRowsByNARate().
        max_snp_mismatch_rate: threshold passed to SNPData.removeColsByMismatchRate().
        max_snp_NA_rate: threshold passed to SNPData.removeColsByNARate().
        npute_window_size: cast to int and passed to NPUTE.samplingImpute().
        output_dir: optional directory for the before/after-imputation matrices.
            It is created if missing.

    Returns:
        A one-element list holding a PassingData object with the filter counts,
        the mismatch-rate results and the parameters used.
        NOTE: row_id2NA_mismatch_rate1 and col_id2NA_mismatch_rate1 are set only
        when imputation was run (more than 5 rows left after filtering).

    2009-10-11
        replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup.
    2008-12-22
        replace '=' and ',' with '_' in the output filename
    2008-05-19
        matrix_ls has to be of length >0 before concatenation
    2008-05-19
        use SNPData structure
    2008-05-18
        add onlyCommon=True to FilterAccessions.filterByError()
    2008-05-17
        add argument output_dir. if it's available, output data matrix before and after imputation
    2008-05-12
        add
        qcdata.no_of_accessions_filtered_by_mismatch
        qcdata.no_of_accessions_filtered_by_na
        qcdata.no_of_snps_filtered_by_mismatch
        qcdata.no_of_snps_filtered_by_na
        qcdata.no_of_monomorphic_snps_removed
    2008-05-11
        split up from computing_node_handler
    """
    qcdata = PassingData()

    # Step 1: row filtering against the strain QC data.
    # Each TwoSNPData is deleted as soon as its result is extracted.
    twoSNPData = TwoSNPData(
        SNPData1=snpData, SNPData2=snpData_qc_strain, row_matching_by_which_value=0, debug=self.debug
    )
    row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
    del twoSNPData

    newSnpData = SNPData.removeRowsByMismatchRate(snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
    qcdata.no_of_accessions_filtered_by_mismatch = newSnpData.no_of_rows_filtered_by_mismatch

    newSnpData = SNPData.removeRowsByNARate(newSnpData, max_call_NA_rate)
    qcdata.no_of_accessions_filtered_by_na = newSnpData.no_of_rows_filtered_by_na

    # Step 2: column filtering against the SNP QC data.
    twoSNPData = TwoSNPData(
        SNPData1=newSnpData, SNPData2=snpData_qc_snp, row_matching_by_which_value=0, debug=self.debug
    )
    col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
    del twoSNPData

    newSnpData = SNPData.removeColsByMismatchRate(newSnpData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
    qcdata.no_of_snps_filtered_by_mismatch = newSnpData.no_of_cols_filtered_by_mismatch

    newSnpData = SNPData.removeColsByNARate(newSnpData, max_snp_NA_rate)
    qcdata.no_of_snps_filtered_by_na = newSnpData.no_of_cols_filtered_by_na

    # Step 3: merge with the SNP QC data, then drop monomorphic columns.
    twoSNPData = TwoSNPData(
        SNPData1=newSnpData, SNPData2=snpData_qc_snp, row_matching_by_which_value=0, debug=self.debug
    )
    newSnpData = twoSNPData.mergeTwoSNPData(priority=2)
    del twoSNPData

    newSnpData = SNPData.removeMonomorphicCols(newSnpData)
    qcdata.no_of_monomorphic_snps_removed = newSnpData.no_of_monomorphic_cols

    # Step 4: optional dump of the matrix before imputation.
    if output_dir:
        if not os.path.isdir(output_dir):
            try:
                os.makedirs(output_dir)
            except OSError:
                # Another process may have created the directory between the
                # isdir() check and makedirs(); only a genuine failure is re-raised.
                if not os.path.isdir(output_dir):
                    raise
        # The parameter values are encoded in the filename; reused for the
        # after-imputation output below.
        output_fname_prefix_ls = [
            "min_oligo_call_probability_%s" % min_call_probability,
            "max_array_mismatch_rate_%s" % max_call_mismatch_rate,
            "max_array_NA_rate_%s" % max_call_NA_rate,
            "max_snp_mismatch_rate_%s" % max_snp_mismatch_rate,
            "max_snp_NA_rate_%s" % max_snp_NA_rate,
            "npute_window_size_%s" % npute_window_size,
        ]
        output_fname = os.path.join(output_dir, "_".join(output_fname_prefix_ls + ["before_imputation.tsv"]))
        newSnpData.tofile(output_fname)

    # Step 5: mismatch rates of the filtered (not yet imputed) matrix.
    twoSNPData0 = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_strain, row_matching_by_which_value=0)
    row_id2NA_mismatch_rate0 = twoSNPData0.cmp_row_wise()
    col_id2NA_mismatch_rate0 = twoSNPData0.cmp_col_wise()
    del twoSNPData0

    result = []

    # Step 6: imputation is attempted only when more than 5 rows are left.
    if len(newSnpData.row_id_ls) > 5:
        snps_name_ls = newSnpData.col_id_ls
        # 2009-10-8 use NPUTE.samplingImpute()
        imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(
            snps_name_ls,
            newSnpData.data_matrix,
            input_file_format=1,
            input_NA_char=0,
            lower_case_for_imputation=False,
            npute_window_size=int(npute_window_size),
            no_of_accessions_per_sampling=300,
            coverage=3,
        )
        snpData_imputed = SNPData(
            row_id_ls=newSnpData.row_id_ls, col_id_ls=new_snps_name_ls, data_matrix=imputed_matrix
        )
        if output_dir:
            # 2008-05-16 write the imputed data out; output_fname_prefix_ls was
            # built in the before-imputation branch under the same condition.
            output_fname = os.path.join(output_dir, "_".join(output_fname_prefix_ls + ["after_imputation.tsv"]))
            snpData_imputed.tofile(output_fname)

        twoSNPData1 = TwoSNPData(
            SNPData1=snpData_imputed, SNPData2=snpData_qc_strain, row_matching_by_which_value=0
        )
        qcdata.row_id2NA_mismatch_rate1 = twoSNPData1.cmp_row_wise()
        qcdata.col_id2NA_mismatch_rate1 = twoSNPData1.cmp_col_wise()
        del twoSNPData1, snpData_imputed
    # When imputation is skipped, the "*_rate1" attributes are deliberately
    # left unset on qcdata (same as before).
    del newSnpData

    # Record the pre-imputation rates and the parameters of this run.
    qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
    qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0

    qcdata.min_call_probability = min_call_probability
    qcdata.max_call_mismatch_rate = max_call_mismatch_rate
    qcdata.max_call_NA_rate = max_call_NA_rate
    qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
    qcdata.max_snp_NA_rate = max_snp_NA_rate
    qcdata.npute_window_size = npute_window_size
    result.append(qcdata)
    return result